nicoscraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,297 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'ruby-debug'
4
+ require 'damerau-levenshtein'
5
+ require 'kconv'
6
+
7
+ require 'parser'
8
+ require 'mylist'
9
+ require 'connector'
10
+
11
# A single Nico Nico Douga video, identified by its video ID.
#
# Attributes are filled in either by #getInfo (which queries the getthumbinfo
# API through Connector/NicoParser) or by a Mylist handing parsed entries
# to #set.
class Movie
  # Top-level keys understood by #set, mapped to
  # [instance variable, optional conversion method].
  #
  # The same key may arrive from several sources (mylist page JSON, mylist
  # Atom feed, getthumbinfo); each key is listed once here. Values are
  # stored as received unless a conversion is given.
  TOP_LEVEL_ATTRIBUTES = {
    "available"          => [:@available],

    # Mylist page JSON
    "video_id"           => [:@video_id],
    "mylist_id"          => [:@mylist_id],
    "item_id"            => [:@item_id],
    "description"        => [:@description],
    "watch"              => [:@watch],
    "create_time"        => [:@create_time],
    "update_time"        => [:@update_time],

    # Mylist Atom feed
    "memo"               => [:@memo],
    "published"          => [:@published],
    "updated"            => [:@updated],
    "thumbnail_url"      => [:@thumbnail_url],
    "length"             => [:@length],
    "view"               => [:@view_counter, :to_i],
    "mylist"             => [:@mylist_counter, :to_i],
    "res"                => [:@comment_num, :to_i],
    "first_retrieve"     => [:@first_retrieve],

    # getthumbinfo
    "title"              => [:@title],
    "movie_type"         => [:@movie_type],
    "size_high"          => [:@size_high],
    "size_low"           => [:@size_low],
    "view_counter"       => [:@view_counter],
    "mylist_counter"     => [:@mylist_counter],
    "comment_num"        => [:@comment_num],
    "last_res_body"      => [:@last_res_body],
    "watch_url"          => [:@watch_url],
    "thumb_type"         => [:@thumb_type],
    "embeddable"         => [:@embeddable],
    "movieNum_live_play" => [:@movieNum_live_play],
    # NOTE(review): "movieNum_live_play" looks like a search/replace casualty
    # of the API's "no_live_play" field, so that key is accepted as well --
    # confirm against NicoParser.getThumbInfo.
    "no_live_play"       => [:@movieNum_live_play],
    "tags_jp"            => [:@tags_jp],
    "tags_tw"            => [:@tags_tw],
    "tags_de"            => [:@tags_de],
    "tags_sp"            => [:@tags_sp],
    "user_id"            => [:@user_id]
  }.freeze

  # Keys understood inside the nested "item_data" hash of a mylist page item.
  ITEM_DATA_ATTRIBUTES = {
    "video_id"       => [:@video_id],
    "title"          => [:@title],
    "thumbnail_url"  => [:@thumbnail_url],
    "first_retrieve" => [:@first_retrieve],
    "update_time"    => [:@update_time],
    "view_counter"   => [:@view_counter, :to_i],
    "mylist_counter" => [:@mylist_counter, :to_i],
    "num_res"        => [:@comment_num, :to_i],
    "length_seconds" => [:@length],
    "deleted"        => [:@deleted, :to_i],
    "last_res_body"  => [:@last_res_body]
  }.freeze

  attr_accessor :available

  # Mylist page JSON
  attr_accessor :video_id
  attr_accessor :mylist_id
  attr_accessor :item_id
  attr_accessor :description

  attr_accessor :title
  attr_accessor :thumbnail_url
  attr_accessor :first_retrieve
  attr_accessor :update_time
  attr_accessor :view_counter
  attr_accessor :mylist_counter
  attr_accessor :comment_num
  attr_accessor :length
  attr_accessor :deleted
  attr_accessor :last_res_body

  attr_accessor :watch
  attr_accessor :create_time

  # Mylist Atom feed
  attr_accessor :memo
  attr_accessor :published
  attr_accessor :updated

  # getthumbinfo
  attr_accessor :movie_type
  attr_accessor :size_high
  attr_accessor :size_low
  attr_accessor :watch_url
  attr_accessor :thumb_type
  attr_accessor :embeddable
  attr_accessor :movieNum_live_play
  attr_accessor :tags_jp
  attr_accessor :tags_tw
  attr_accessor :tags_de
  attr_accessor :tags_sp
  attr_accessor :user_id

  # video_id: the video's ID. The object starts out as not available.
  def initialize(video_id)
    @video_id = video_id
    @available = false
  end

  # Returns true when this video is contained in the given mylist.
  #
  # The mylist is fetched with Mylist#getInfoLt. To avoid fetching it a
  # second time, an optional block receives the loaded Mylist object so the
  # caller can do further work on it; the block is called whether or not
  # the video was found.
  def isBelongsTo(_mylistId, &block)
    thisMl = Mylist.new(_mylistId)
    thisMl.getInfoLt

    isBelongs = thisMl.movies.any? { |movie| movie.video_id == @video_id }

    # Interpolation (not String#+) so Integer IDs are accepted as well.
    if isBelongs
      puts "\sThis movie is found in mylist/#{_mylistId}"
    else
      puts "\sThis movie is not found in mylist/#{_mylistId}"
    end

    block.call(thisMl) if block

    isBelongs
  end

  # Finds the mylists that look like a series containing this video.
  #
  # Candidate mylist IDs come from the video description (#extrMylist).
  # A candidate is kept when this video belongs to it and Mylist#isSeries
  # is above 0.7. Because the sources differ, the result is not necessarily
  # a subset or superset of what #isBelongsTo reports elsewhere.
  #
  # Returns an Array of fresh (not yet loaded) Mylist objects, or the string
  # "failed" when this movie is not available.
  def isSeriesOf
    unless @available
      puts "This movie object is not available."
      return "failed"
    end

    puts
    puts "Start to discern the seriality of..."
    puts "\svideo_id:\s\s#{@video_id}"
    puts "\stitle:\s\s\s\s\s#{@title}"

    seriesIds = []
    (extrMylist || []).each { |candidateId|
      similarity = 0.0
      belongsTo = isBelongsTo(candidateId) { |mylistObj|
        similarity = mylistObj.isSeries
        puts "\sSimilarity:\t" + similarity.to_s
      }
      puts belongsTo
      if belongsTo && similarity > 0.7
        puts "\s" + candidateId.to_s + "\tis perecieved as series mylist."
        seriesIds.push(candidateId)
      end
    }

    seriesMylists = seriesIds.map { |seriesId|
      puts seriesId
      Mylist.new(seriesId)
    }

    puts "\sDiscern logic terminated."
    seriesMylists
  end

  # Extracts the mylist IDs ("mylist/<1-8 digits>") mentioned in the video
  # description.
  #
  # Returns an Array of ID strings (empty when none are found or there is no
  # description), or nil when this movie is not available.
  def extrMylist
    return if !@available
    puts "Extracting mylistId from the description..."

    # to_s: a movie can be available without a description having been set.
    mylistIdAry = @description.to_s.scan(/mylist\/([0-9]{1,8})/).flatten

    if mylistIdAry.empty?
      puts "\sMylistId is not found."
    else
      mylistIdAry.each { |id| puts "\sID:\t" + id + " is extracted." }
    end

    mylistIdAry
  end

  # Loads this video's attributes from the getthumbinfo API.
  #
  # Returns true on success; on failure marks the movie as not available and
  # returns the string "failed".
  def getInfo
    con = Connector.new('xml')
    host = 'ext.nicovideo.jp'
    entity = '/api/getthumbinfo/' + @video_id.to_s
    con.setWait(nil)
    xml = con.xmlGet(host, entity)

    failed = xml.nil? ||
             xml == "failed" ||
             xml =~ /<nicovideo_thumb_response\sstatus=\"fail\">/
    if failed
      @available = false
      return "failed"
    end

    set(NicoParser.getThumbInfo(xml))
    @available = true
  end

  # Copies the recognised entries of paramObj into this object's attributes.
  #
  # paramObj is a Hash with String keys; unknown keys are ignored. The
  # nested Hash under "item_data" is applied with ITEM_DATA_ATTRIBUTES,
  # every other key with TOP_LEVEL_ATTRIBUTES. Returns paramObj.
  def set(paramObj)
    paramObj.each_key { |key|
      if key == "item_data"
        assignAttributes(paramObj[key], ITEM_DATA_ATTRIBUTES)
      else
        assignAttribute(key, paramObj[key], TOP_LEVEL_ATTRIBUTES)
      end
    }
  end

  private

  # Applies every recognised key of source using the given attribute table.
  def assignAttributes(source, table)
    source.each_key { |key| assignAttribute(key, source[key], table) }
  end

  # Stores value in the instance variable that table assigns to key, after
  # the optional conversion; does nothing for keys missing from table.
  def assignAttribute(key, value, table)
    ivar, conversion = table[key]
    return if ivar.nil?

    value = value.send(conversion) if conversion
    instance_variable_set(ivar, value)
  end
end
@@ -0,0 +1,258 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'ruby-debug'
4
+ require 'kconv'
5
+
6
+ require 'parser'
7
+ require 'movie'
8
+ require 'connector'
9
+
10
+
11
# A Nico Nico Douga mylist (a user-made list of videos).
#
# The list's own attributes and its movies are loaded either from the mylist
# web page (#getInfo) or from the mylist Atom feed (#getInfoLt). Both loaders
# append to #movies, so calling more than one of them on the same object
# yields duplicate entries.
class Mylist
  # Keys understood by #set, mapped to the instance variable they fill.
  ATTRIBUTE_IVARS = {
    "mylist_id"    => :@mylist_id,
    "id"           => :@mylist_id,
    "user_id"      => :@user_id,
    "title"        => :@title,
    "description"  => :@description,
    "public"       => :@public,
    "default_sort" => :@default_sort,
    "create_time"  => :@create_time,
    "update_time"  => :@update_time,
    "icon_id"      => :@icon_id,
    "sort_order"   => :@sort_order,
    "movies"       => :@movies,

    # Atom feed
    "updated"      => :@update_time,
    "author"       => :@author
  }.freeze

  attr_accessor :available

  attr_accessor :mylist_id
  attr_accessor :user_id
  attr_accessor :title
  attr_accessor :description
  attr_accessor :public
  attr_accessor :default_sort
  attr_accessor :create_time
  attr_accessor :update_time
  attr_accessor :icon_id
  attr_accessor :sort_order
  attr_accessor :author

  attr_accessor :movies

  # mylist_id: the mylist's ID. The object starts empty and not available.
  def initialize(mylist_id)
    @mylist_id = mylist_id
    @movies = []
    @available = false
  end

  # Computes a title-based similarity figure for the list:
  #
  #   1 - ((sum of distances over all ordered movie pairs / movie count)
  #        / length of this mylist's own title)
  #
  # Quadratic in the number of movies; the original author judged that
  # acceptable for mylists of at most 500 entries.
  #
  # Returns 0.0 when the list has no movies or no title, where the formula
  # would otherwise raise or produce NaN.
  # NOTE(review): normalising by the mylist's own title length is kept from
  # the original formula -- confirm that this is the intended measure.
  def simOfTitle
    puts "matching..."

    ownTitle = title.to_s
    similarity = 0.0

    unless @movies.empty? || ownTitle.empty?
      total = 0.0
      @movies.each { |myself|
        @movies.each { |another|
          total += DamerauLevenshtein.distance(myself.title.to_s, another.title.to_s)
        }
      }
      similarity = 1 - ((total / @movies.length) / ownTitle.length)
    end

    puts "Similarity: " + similarity.to_s
    similarity
  end

  # Returns true when at least 90% of the movies in this list share one
  # user_id, false otherwise (including for an empty list).
  # Movies whose user_id is nil count towards the total but cannot form
  # the majority.
  def userId
    return false if @movies.empty?

    threshold = 0.9
    perUser = Hash.new(0)
    @movies.each { |movie|
      perUser[movie.user_id] += 1 unless movie.user_id.nil?
    }

    perUser.values.any? { |count| count.fdiv(@movies.length) >= threshold }
  end

  # Scores how much this mylist looks like a series, from title similarity.
  #
  # For every unordered pair of movies the Damerau-Levenshtein distance of
  # the titles is turned into 1 - distance / (length of the longer title);
  # the result is the mean over all pairs. Dividing by the longer title
  # keeps each pair's score independent of list order and avoids a division
  # by zero for an empty title (two empty titles score 1.0).
  #
  # Returns 0 when the list has fewer than two movies.
  def isSeries
    titles = @movies.map { |movie| movie.title.to_s }

    scores = titles.combination(2).map { |earlier, later|
      longest = [earlier.length, later.length].max
      if longest.zero?
        1.0
      else
        1.0 - DamerauLevenshtein.distance(later, earlier).fdiv(longest)
      end
    }

    return 0 if scores.empty?

    scores.inject(0.0) { |sum, score| sum + score } / scores.length
  end

  # Loads this mylist and its movies from the mylist web page.
  #
  # The page embeds two script calls: MylistGroup.preloadSingle(...) with the
  # list's own attributes and Mylist.preload(...) with a JSON array of items.
  # Attributes that cannot be found on the page are left untouched; a blank
  # description is treated as absent. The list name is stored as #title.
  #
  # Returns true on success, or the string "failed" when the page could not
  # be fetched or does not contain the expected script.
  def getInfo
    con = Connector.new('mech')
    reqUrl = 'http://www.nicovideo.jp' +
             '/mylist/' + @mylist_id.to_s
    con.setWait(nil)
    mechPage = con.mechGet(reqUrl)
    # presumably mechGet yields a non-page value on failure -- TODO confirm
    return "failed" unless mechPage.respond_to?(:search)

    jsonStr = mechPage.search(
      "/html/body/div[2]" +
      "/div/div[2]/script[7]"
    ).to_html

    # The mylist's own attributes.
    mlJson = jsonStr[/MylistGroup\.preloadSingle.{1,}?Mylist\.preload\(/m]
    return "failed" if mlJson.nil?

    desc = pageString(mlJson, "description")
    desc = nil if desc && desc.empty?

    paramObj = {
      "id"           => pageNumber(mlJson, "id"),
      "user_id"      => pageNumber(mlJson, "user_id"),
      "title"        => pageString(mlJson, "name"),
      "description"  => desc,
      "public"       => pageNumber(mlJson, "public"),
      "default_sort" => pageNumber(mlJson, "default_sort"),
      "create_time"  => pageNumber(mlJson, "create_time"),
      "update_time"  => pageNumber(mlJson, "update_time"),
      "icon_id"      => pageNumber(mlJson, "icon_id")
    }
    # Do not overwrite known attributes (e.g. the ID) with nil.
    paramObj.delete_if { |_key, value| value.nil? }
    set(paramObj)

    # The movies contained in this mylist.
    pageMovieParams(jsonStr).each { |param|
      movie = Movie.new(param['item_data']['video_id'])
      movie.set(param)
      @movies.push(movie)
    }

    @available = true
  end

  # Loads this mylist and its movies from the mylist Atom feed.
  #
  # Every movie built from a feed entry is marked as available.
  # Returns true on success and nil when the feed could not be fetched.
  def getInfoLt
    con = Connector.new('xml')
    host = 'www.nicovideo.jp'
    puts @mylist_id
    entity = '/mylist/' + @mylist_id.to_s + '?rss=atom&numbers=1'
    con.setWait(nil)
    xml = con.xmlGet(host, entity)
    return if xml == "failed"

    parsed = NicoParser.mylistRss(xml)

    (parsed["entry"] || []).each { |entry|
      movie = Movie.new(entry["video_id"])
      entry["available"] = true
      movie.set(entry)
      @movies.push(movie)
    }

    set(parsed["mylist"]) if parsed["mylist"]
    @available = true
  end

  # Copies the recognised entries of paramObj (a Hash with String keys) into
  # this object's attributes; unknown keys are ignored. Returns paramObj.
  def set(paramObj)
    paramObj.each_key { |key|
      ivar = ATTRIBUTE_IVARS[key]
      instance_variable_set(ivar, paramObj[key]) if ivar
    }
  end

  private

  # Returns the first run of digits on the "<field>: ..." line of source,
  # or nil when there is no such line.
  def pageNumber(source, field)
    line = source[/\s#{Regexp.escape(field)}:[^\n]{1,}/]
    line && line[/[0-9]{1,}/]
  end

  # Returns the text between the outermost double quotes on the
  # "<field>: "..."" line of source, or nil when there is no such line.
  # Works for both LF and CRLF pages.
  def pageString(source, field)
    source[/\s#{Regexp.escape(field)}:\s*"(.*)"/, 1]
  end

  # Returns one parsed Hash per item of the Mylist.preload(...) call found
  # in jsonStr (an empty Array when the call has no items).
  def pageMovieParams(jsonStr)
    line = jsonStr[/Mylist\.preload\(.+/]
    body = line && line[/\".{1,}/]
    return [] if body.nil?

    # Drop the closing "}]);" plus any trailing whitespace; fall back to the
    # original fixed five-character cut when the line ends differently.
    trimmed = body.sub(/\}\s*\]\s*\)\s*;?\s*\z/, '')
    trimmed = body.slice(0, [body.length - 5, 0].max) if trimmed == body

    Unicode.unescape(trimmed).split('},{').map { |fragment|
      JSON.parse("{" + fragment + "}")
    }
  end
end