nicoscraper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,297 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'ruby-debug'
4
+ require 'damerau-levenshtein'
5
+ require 'kconv'
6
+
7
+ require 'parser'
8
+ require 'mylist'
9
+ require 'connector'
10
+
11
# A single video, identified by its video id, together with whatever
# metadata has been loaded for it so far.
#
# Metadata arrives through #set as a Hash keyed by strings. The recognised
# keys are the union of the field names used by the three data sources this
# file refers to (the mylist page JSON, the mylist Atom feed and the
# getthumbinfo API).
class Movie
  # Top-level #set keys whose value is stored unchanged.
  PLAIN_FIELDS = {
    "available"          => :@available,
    "video_id"           => :@video_id,
    "mylist_id"          => :@mylist_id,
    "item_id"            => :@item_id,
    "description"        => :@description,
    "watch"              => :@watch,
    "create_time"        => :@create_time,
    "update_time"        => :@update_time,
    "memo"               => :@memo,
    "published"          => :@published,
    "updated"            => :@updated,
    "thumbnail_url"      => :@thumbnail_url,
    "length"             => :@length,
    "first_retrieve"     => :@first_retrieve,
    "title"              => :@title,
    "movie_type"         => :@movie_type,
    "size_high"          => :@size_high,
    "size_low"           => :@size_low,
    "view_counter"       => :@view_counter,
    "mylist_counter"     => :@mylist_counter,
    "comment_num"        => :@comment_num,
    "last_res_body"      => :@last_res_body,
    "watch_url"          => :@watch_url,
    "thumb_type"         => :@thumb_type,
    "embeddable"         => :@embeddable,
    "movieNum_live_play" => :@movieNum_live_play,
    "tags_jp"            => :@tags_jp,
    "tags_tw"            => :@tags_tw,
    "tags_de"            => :@tags_de,
    "tags_sp"            => :@tags_sp,
    "user_id"            => :@user_id
  }.freeze

  # Top-level #set keys whose value is converted with #to_i before storing.
  INTEGER_FIELDS = {
    "view"   => :@view_counter,
    "mylist" => :@mylist_counter,
    "res"    => :@comment_num
  }.freeze

  # Keys of the nested "item_data" Hash whose value is stored unchanged.
  ITEM_DATA_PLAIN_FIELDS = {
    "video_id"       => :@video_id,
    "title"          => :@title,
    "thumbnail_url"  => :@thumbnail_url,
    "first_retrieve" => :@first_retrieve,
    "update_time"    => :@update_time,
    "length_seconds" => :@length,
    "last_res_body"  => :@last_res_body
  }.freeze

  # Keys of the nested "item_data" Hash whose value is converted with #to_i.
  ITEM_DATA_INTEGER_FIELDS = {
    "view_counter"   => :@view_counter,
    "mylist_counter" => :@mylist_counter,
    "num_res"        => :@comment_num,
    "deleted"        => :@deleted
  }.freeze

  attr_accessor :available

  # Fields filled from the mylist page JSON.
  attr_accessor :video_id
  attr_accessor :mylist_id
  attr_accessor :item_id
  attr_accessor :description

  attr_accessor :title
  attr_accessor :thumbnail_url
  attr_accessor :first_retrieve
  attr_accessor :update_time
  attr_accessor :view_counter
  attr_accessor :mylist_counter
  attr_accessor :comment_num
  attr_accessor :length
  attr_accessor :deleted
  attr_accessor :last_res_body

  attr_accessor :watch
  attr_accessor :create_time

  # Fields filled from the mylist Atom feed.
  attr_accessor :memo
  attr_accessor :published
  attr_accessor :updated

  # Fields filled from getthumbinfo.
  attr_accessor :movie_type
  attr_accessor :size_high
  attr_accessor :size_low
  attr_accessor :watch_url
  attr_accessor :thumb_type
  attr_accessor :embeddable
  attr_accessor :movieNum_live_play
  attr_accessor :tags_jp
  attr_accessor :tags_tw
  attr_accessor :tags_de
  attr_accessor :tags_sp
  attr_accessor :user_id

  # video_id - the id of the video this object describes.
  #
  # The object starts out unavailable; #getInfo (or #set with an
  # "available" key) flips the flag.
  def initialize(video_id)
    @video_id = video_id
    @available = false
  end

  # Returns true when this video is contained in the given mylist.
  #
  # _mylistId - id of the mylist to look in (String or Integer).
  # block     - optional; called with the loaded Mylist object so that the
  #             caller can do extra work on the mylist's movies without
  #             fetching it a second time.
  def isBelongsTo(_mylistId, &block)
    thisMl = Mylist.new(_mylistId)
    thisMl.getInfoLt

    isBelongs = thisMl.movies.any? { |movie| movie.video_id == @video_id }

    if isBelongs
      puts "\sThis movie is found in mylist/#{_mylistId}"
    else
      puts "\sThis movie is not found in mylist/#{_mylistId}"
    end

    # The block is optional; calling it unconditionally would raise on nil.
    block.call(thisMl) if block

    isBelongs
  end

  # Returns Mylist objects for the mylists that contain this video and that
  # look like a series assembled by the uploader.
  #
  # Candidate ids are taken from the video description (#extrMylist). A
  # candidate is kept when the video belongs to it and Mylist#isSeries
  # reports a similarity above 0.7. Because the candidates come from a
  # different source, the result is not necessarily a subset or superset of
  # what #isBelongsTo reports.
  #
  # Returns the String "failed" when the movie is not available.
  def isSeriesOf
    unless @available
      puts "This movie object is not available."
      return "failed"
    end

    puts
    puts "Start to discern the seriality of..."
    puts "\svideo_id:\s\s#{@video_id}"
    puts "\stitle:\s\s\s\s\s#{@title}"

    seriesIds = extrMylist.select do |_mylistId|
      similarity = 0.0
      belongsTo = isBelongsTo(_mylistId) do |mylistObj|
        similarity = mylistObj.isSeries
        puts "\sSimilarity:\t#{similarity}"
      end
      puts belongsTo

      isSeries = belongsTo && similarity > 0.7
      puts "\s#{_mylistId}\tis perecieved as series mylist." if isSeries
      isSeries
    end

    mlObjAry = seriesIds.map do |mylistId|
      puts mylistId
      Mylist.new(mylistId)
    end

    puts "\sDiscern logic terminated."
    mlObjAry
  end

  # Extracts the mylist ids ("mylist/<1-8 digits>") mentioned in the video
  # description.
  #
  # Returns an Array of id Strings (empty when none are found, or when no
  # description has been loaded), or nil when the movie is not available.
  def extrMylist
    return unless @available
    puts "Extracting mylistId from the description..."

    # to_s: a movie can be flagged available before a description is set.
    mylistIdAry = @description.to_s.scan(/mylist\/([0-9]{1,8})/).flatten

    if mylistIdAry.empty?
      puts "\sMylistId is not found."
    else
      mylistIdAry.each { |id| puts "\sID:\t#{id} is extracted." }
    end

    mylistIdAry
  end

  # Loads this video's metadata from the getthumbinfo API and stores it via
  # #set.
  #
  # Returns true on success. When the request yields "failed" or the
  # response carries status="fail", marks the movie unavailable and returns
  # the String "failed".
  def getInfo
    con = Connector.new('xml')
    host = 'ext.nicovideo.jp'
    entity = "/api/getthumbinfo/#{@video_id}"
    con.setWait(nil)
    xml = con.xmlGet(host, entity)

    if xml == "failed" || xml =~ /<nicovideo_thumb_response\sstatus=\"fail\">/
      @available = false
      return "failed"
    end

    set(NicoParser.getThumbInfo(xml))
    @available = true
  end

  # Copies the recognised entries of paramObj into instance variables.
  #
  # paramObj - Hash with String keys. Unknown keys are ignored. The value of
  #            "item_data" is itself a Hash and is unpacked using the
  #            item_data field tables. Entries are applied in the Hash's
  #            iteration order, so when two keys target the same attribute
  #            the later one wins.
  #
  # Returns paramObj.
  def set(paramObj)
    paramObj.each do |key, value|
      if key == "item_data"
        (value || {}).each do |itemKey, itemValue|
          assign_field(itemKey, itemValue,
                       ITEM_DATA_PLAIN_FIELDS, ITEM_DATA_INTEGER_FIELDS)
        end
      else
        assign_field(key, value, PLAIN_FIELDS, INTEGER_FIELDS)
      end
    end
  end

  private

  # Stores value in the instance variable that the given tables associate
  # with key; does nothing for a key neither table knows.
  def assign_field(key, value, plainFields, integerFields)
    if plainFields.key?(key)
      instance_variable_set(plainFields[key], value)
    elsif integerFields.key?(key)
      instance_variable_set(integerFields[key], value.to_i)
    end
  end
end
@@ -0,0 +1,258 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'ruby-debug'
4
+ require 'kconv'
5
+
6
+ require 'parser'
7
+ require 'movie'
8
+ require 'connector'
9
+
10
+
11
# A mylist (a user-curated list of videos) together with the Movie objects
# it contains and the list's own metadata.
class Mylist
  # Share of movies that must come from one uploader for #userId to be true.
  SAME_PUBLISHER_THRESHOLD = 0.9

  # #set keys and the instance variable each one is stored in.
  FIELDS = {
    "mylist_id"    => :@mylist_id,
    "id"           => :@mylist_id,
    "user_id"      => :@user_id,
    "title"        => :@title,
    # #getInfo scrapes the list's title under the key "name".
    "name"         => :@title,
    "description"  => :@description,
    "public"       => :@public,
    "default_sort" => :@default_sort,
    "create_time"  => :@create_time,
    "update_time"  => :@update_time,
    "icon_id"      => :@icon_id,
    "sort_order"   => :@sort_order,
    "movies"       => :@movies,
    "updated"      => :@update_time,
    "author"       => :@author
  }.freeze

  attr_accessor :available

  attr_accessor :mylist_id
  attr_accessor :user_id
  attr_accessor :title
  attr_accessor :description
  attr_accessor :public
  attr_accessor :default_sort
  attr_accessor :create_time
  attr_accessor :update_time
  attr_accessor :icon_id
  attr_accessor :sort_order
  attr_accessor :author

  attr_accessor :movies

  # mylist_id - id of the mylist this object describes.
  def initialize(mylist_id)
    @mylist_id = mylist_id
    @movies = []
    @available = false
  end

  # Computes a title-similarity score from the Damerau-Levenshtein distance
  # between every ordered pair of movie titles (including each movie with
  # itself), normalised by the number of movies and by the length of this
  # mylist's own title.
  #
  # Returns 0.0 when there are no movies or the mylist has no title, since
  # the score is undefined (division by zero) in those cases.
  #
  # NOTE(review): the normalisation is kept as originally written; #isSeries
  # is the method Movie#isSeriesOf actually relies on.
  def simOfTitle
    puts "matching..."

    ownTitle = title.to_s
    if @movies.empty? || ownTitle.empty?
      similarity = 0.0
    else
      # O(n^2) pairs; the original author judged this acceptable for the
      # list sizes seen in practice (at most about 500 entries).
      total = 0.0
      @movies.each do |myself|
        @movies.each do |another|
          total += DamerauLevenshtein.distance(myself.title, another.title)
        end
      end
      similarity = 1 - ((total / @movies.length) / ownTitle.length)
    end

    puts "Similarity: " + similarity.to_s
    similarity
  end

  # Returns true when at least SAME_PUBLISHER_THRESHOLD of the movies share
  # one user_id, false otherwise (including for an empty mylist).
  def userId
    return false if @movies.empty?

    idGroup = Hash.new(0)
    @movies.each { |movie| idGroup[movie.user_id] += 1 }

    required = @movies.length * SAME_PUBLISHER_THRESHOLD
    idGroup.values.any? { |count| count >= required }
  end

  # Scores how much this mylist looks like a series, based on how similar
  # the movie titles are to one another.
  #
  # For every unordered pair of movies the score is
  # 1.0 - distance / (length of the later movie's title); the result is the
  # mean over all pairs. A pair whose later title is empty scores 1.0 when
  # both titles are empty and 0.0 otherwise, instead of dividing by zero.
  #
  # Returns the mean as a Float, or 0 when there are fewer than two movies.
  def isSeries
    dlAry = []

    @movies.each_with_index do |earlier, index|
      @movies.drop(index + 1).each do |later|
        distance = DamerauLevenshtein.distance(later.title, earlier.title)
        length = later.title.length

        dlAry.push(
          if length.zero?
            distance.zero? ? 1.0 : 0.0
          else
            1.0 - distance.fdiv(length)
          end
        )
      end
    end

    return 0 if dlAry.empty?

    dlAry.inject(0) { |total, score| total + score } / dlAry.length
  end

  # Loads the mylist's metadata and its movies by scraping the mylist's
  # HTML page.
  #
  # Returns true on success. When the page or the expected script content
  # cannot be found, marks the mylist unavailable and returns the String
  # "failed".
  def getInfo
    con = Connector.new('mech')
    reqUrl = 'http://www.nicovideo.jp' +
             '/mylist/' + @mylist_id.to_s
    con.setWait(nil)
    mechPage = con.mechGet(reqUrl)
    return mark_failed unless mechPage.respond_to?(:search)

    # The script element that carries both the mylist's own data and the
    # data of the movies it contains.
    jsonStr = mechPage.search(
      "/html/body/div[2]" +
      "/div/div[2]/script[7]"
    ).to_html

    # Information about the mylist itself.
    mlJson = jsonStr[/MylistGroup\.preloadSingle.{1,}?Mylist\.preload\(/m]
    return mark_failed if mlJson.nil?

    # An empty description is stored as nil.
    desc = scrape_string(mlJson, "description")
    desc = nil if desc && desc.empty?

    paramObj = {
      "id"           => scrape_number(mlJson, "id"),
      "user_id"      => scrape_number(mlJson, "user_id"),
      "name"         => scrape_string(mlJson, "name"),
      "description"  => desc,
      "public"       => scrape_number(mlJson, "public"),
      "default_sort" => scrape_number(mlJson, "default_sort"),
      "create_time"  => scrape_number(mlJson, "create_time"),
      "update_time"  => scrape_number(mlJson, "update_time"),
      "icon_id"      => scrape_number(mlJson, "icon_id")
    }
    # Fields that could not be scraped must not overwrite existing values.
    paramObj.delete_if { |_key, value| value.nil? }
    set(paramObj)

    # Information about the movies contained in the mylist.
    mvJson = jsonStr[/Mylist.preload.+/]
    mvJson = mvJson && mvJson[/\".{1,}/]
    unless mvJson.nil?
      # NOTE(review): the fixed 5-character trim presumably drops the
      # closing of the preload call; confirm against a live page.
      mvJson = mvJson.slice(0, mvJson.length - 5)

      Unicode.unescape(mvJson).split('},{').each do |e|
        param = JSON.parse("{" + e + "}")
        movie = Movie.new(param['item_data']['video_id'])
        movie.set(param)

        @movies.push(movie)
      end
    end

    @available = true
  end

  # Loads the mylist's metadata and movies from its Atom feed.
  #
  # Returns true on success, nil when the request yields "failed" (the
  # object is left untouched in that case).
  def getInfoLt
    con = Connector.new('xml')
    host = 'www.nicovideo.jp'
    puts @mylist_id
    entity = '/mylist/' + @mylist_id.to_s + '?rss=atom&numbers=1'
    con.setWait(nil)
    xml = con.xmlGet(host, entity)
    return if xml == "failed"

    parsed = NicoParser.mylistRss(xml)

    parsed["entry"].each do |e|
      movie = Movie.new(e["video_id"])
      # Entries taken from the feed are flagged available.
      e["available"] = true
      movie.set(e)
      @movies.push(movie)
    end

    set(parsed["mylist"])
    @available = true
  end

  # Copies the recognised entries of paramObj (see FIELDS) into instance
  # variables; unknown keys are ignored.
  #
  # Returns paramObj.
  def set(paramObj)
    paramObj.each do |key, value|
      instance_variable_set(FIELDS[key], value) if FIELDS.key?(key)
    end
  end

  private

  # Marks the mylist unavailable and returns the failure sentinel.
  def mark_failed
    @available = false
    "failed"
  end

  # Returns the text from " <field>:" to the end of that line, or nil.
  def scrape_line(source, field)
    source[/\s#{Regexp.escape(field)}:[^\n]{1,}/]
  end

  # Returns the first run of digits on the field's line, or nil.
  def scrape_number(source, field)
    line = scrape_line(source, field)
    line && line[/[0-9]{1,}/]
  end

  # Returns the text between the first and last double quote on the field's
  # line, or nil when the line or the quotes are missing.
  def scrape_string(source, field)
    line = scrape_line(source, field)
    line && line[/"(.*)"/, 1]
  end
end