nicoscraper 0.2.12 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,306 +4,385 @@ $:.unshift File.dirname(__FILE__)
4
4
  require 'rubygems'
5
5
  require 'xml'
6
6
  require 'time'
7
+ require 'json'
7
8
 
8
9
  require 'converter.rb'
9
10
 
10
11
  module Nicos
11
12
  module Parser
12
- def parseRow(symbol, type, doc)
13
- hash = {}
14
-
15
- value = case type
16
- when :Fixnum then
17
- doc.read
18
- doc.value.to_i
19
- when :String then
20
- doc.read
21
- p doc.value
22
- doc.value
23
- when :ISO8601 then
24
- doc.read
25
- Nicos::Converter.iso8601ToUnix(doc.value)
26
- when :JapDate then
27
- doc.read
28
- Nicos::Converter.japToUnix(doc.value)
29
- when :Time then
30
- doc.read
31
- Nicos::Converter.toSeconds(doc.value)
32
-
33
- # for Mylist Atom
34
- when :mylistId then
35
- doc.read
36
- Nicos::Extractor.mylistId(doc.value)
37
- when :videoId then
38
- doc.move_to_attribute("href")
39
- Nicos::Extractor.videoId(doc.value)
40
-
41
- # for getThumbInfo
42
- when :Tags then
43
- doc.move_to_attribute("domain")
44
- symbol = case doc.value
45
- when "jp" then :tags_jp
46
- when "tw" then :tags_tw
47
- when "de" then :tags_de
48
- when "es" then :tags_es
49
- end
50
-
51
- tags = []
52
- lockedTags = []
53
- category = nil
54
- lock = nil
13
+ module Xml
14
+ def parseRow(symbol, type, doc)
15
+ hash = {}
16
+
17
+ value = case type
18
+ # common
19
+ when :Fixnum then
20
+ doc.read
21
+ doc.value.to_i
22
+ when :String then
23
+ doc.read
24
+ doc.value
25
+ when :ISO8601 then
26
+ doc.read
27
+ Nicos::Converter.iso8601ToUnix(doc.value)
28
+ when :JapDate then
29
+ doc.read
30
+ Nicos::Converter.japToUnix(doc.value)
31
+ when :Time then
32
+ doc.read
33
+ Nicos::Converter.toSeconds(doc.value)
34
+
35
+ # for Mylist Atom
36
+ when :mylistId then
37
+ doc.read
38
+ Nicos::Extractor.mylistId(doc.value)
39
+ when :itemId then
40
+ doc.read
41
+ Nicos::Extractor.itemId(doc.value)
42
+ when :videoId then
43
+ doc.move_to_attribute("href")
44
+ Nicos::Extractor.videoId(doc.value)
45
+
46
+ # for getThumbInfo
47
+ when :Tags then
48
+ doc.move_to_attribute("domain")
49
+ symbol = case doc.value
50
+ when "jp" then :tags_jp
51
+ when "tw" then :tags_tw
52
+ when "de" then :tags_de
53
+ when "es" then :tags_es
54
+ end
55
55
 
56
- while doc.read
57
- unless doc.node_type == XML::Reader::TYPE_END_ENTITY
58
- break if doc.name === "tags"
56
+ tags = []
57
+ lockedTags = []
58
+ category = nil
59
+ lock = nil
60
+
61
+ while doc.read
62
+ unless doc.node_type == XML::Reader::TYPE_END_ENTITY
63
+ break if doc.name === "tags"
64
+
65
+ if category == nil
66
+ doc.move_to_attribute("category")
67
+ if doc.name === "category"
68
+ doc.read
69
+ category = doc.value
70
+ doc.read
71
+ end
72
+ end
59
73
 
60
- if category == nil
61
- doc.move_to_attribute("category")
62
- if doc.name === "category"
74
+ doc.move_to_attribute("lock")
75
+ if doc.name === "lock"
76
+ lock = true
63
77
  doc.read
64
- category = doc.value
65
78
  doc.read
79
+ else lock = false
66
80
  end
67
- end
68
81
 
69
- doc.move_to_attribute("lock")
70
- if doc.name === "lock"
71
- lock = true
72
- doc.read
73
- doc.read
74
- else lock = false
82
+ doc.read_inner_xml
83
+ if doc.value != nil
84
+ if lock then lockedTags.push(doc.value)
85
+ else tags.push(doc.value) end
86
+ end
75
87
  end
76
-
77
- doc.read_inner_xml
78
- if doc.value != nil
79
- if lock then lockedTags.push(doc.value)
80
- else tags.push(doc.value) end
81
- end
82
88
  end
83
- end
84
89
 
85
- {
86
- :category => category,
87
- :tags => tags,
88
- :lockedTags => lockedTags
89
- }
90
- end
90
+ {
91
+ :category => category,
92
+ :tags => tags,
93
+ :lockedTags => lockedTags
94
+ }
95
+ end
91
96
 
92
- hash[symbol] = value
93
- hash
94
- end
95
- module_function :parseRow
97
+ hash[symbol] = value
98
+ hash
99
+ end
100
+ module_function :parseRow
96
101
 
97
- def parseTag
98
- end
102
+ def parseTag
103
+ end
99
104
 
100
- # getThumbInfoが返すXMLを解析し、ハッシュオブジェクトにして返します。
101
- #
102
- # @return [HashObj]
103
- def getThumbInfo(xml)
104
- doc = XML::Reader.string(
105
- xml,
106
- :options => XML::Parser::Options::NOBLANKS |
107
- XML::Parser::Options::NOENT
108
- )
109
-
110
- n = -1
111
- parsed = {}
112
-
113
- while doc.read
114
- unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
115
- row = case doc.name
116
- when "video_id" then parseRow(:video_id, :String, doc)
117
- when "title" then parseRow(:title, :String, doc)
118
- when "description" then parseRow(:description, :String, doc)
119
- when "thumbnail_url" then parseRow(:thumbnail_url, :String, doc)
120
- when "movie_type" then parseRow(:movie_type, :String, doc)
121
- when "last_res_body" then parseRow(:last_res_body, :String, doc)
122
- when "watch_url" then parseRow(:watch_url, :String, doc)
123
- when "thumb_type" then parseRow(:thumb_type, :String, doc)
124
-
125
- when "size_high" then parseRow(:size_high, :Fixnum, doc)
126
- when "size_low" then parseRow(:size_low, :Fixnum, doc)
127
- when "view_counter" then parseRow(:view_counter, :Fixnum, doc)
128
- when "comment_num" then parseRow(:comment_num, :Fixnum, doc)
129
- when "mylist_counter" then parseRow(:mylist_counter,:Fixnum, doc)
130
- when "embeddable" then parseRow(:embeddable, :Fixnum, doc)
131
- when "no_live_play" then parseRow(:no_live_play, :Fixnum, doc)
132
- when "user_id" then parseRow(:user_id, :Fixnum, doc)
133
- when "first_retrieve" then parseRow(:first_retrieve,:ISO8601, doc)
134
- when "length" then parseRow(:length, :Time, doc)
135
- when "tags" then parseRow(:tags, :Tags, doc)
136
- when "tag" then parseRow(:tag, :Tag, doc)
137
- end
105
+ # getThumbInfoが返すXMLを解析し、ハッシュオブジェクトにして返します。
106
+ #
107
+ # @return [HashObj]
108
+ def getThumbInfo(xml)
109
+ doc = XML::Reader.string(
110
+ xml,
111
+ :options => XML::Parser::Options::NOBLANKS |
112
+ XML::Parser::Options::NOENT
113
+ )
114
+
115
+ n = -1
116
+ parsed = {}
138
117
 
139
- parsed.update(row) if row != nil
118
+ while doc.read
119
+ unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
120
+ row = case doc.name
121
+ when "video_id" then parseRow(:video_id, :String, doc)
122
+ when "title" then parseRow(:title, :String, doc)
123
+ when "description" then parseRow(:description, :String, doc)
124
+ when "thumbnail_url" then parseRow(:thumbnail_url, :String, doc)
125
+ when "movie_type" then parseRow(:movie_type, :String, doc)
126
+ when "last_res_body" then parseRow(:last_res_body, :String, doc)
127
+ when "watch_url" then parseRow(:watch_url, :String, doc)
128
+ when "thumb_type" then parseRow(:thumb_type, :String, doc)
129
+
130
+ when "size_high" then parseRow(:size_high, :Fixnum, doc)
131
+ when "size_low" then parseRow(:size_low, :Fixnum, doc)
132
+ when "view_counter" then parseRow(:view_counter, :Fixnum, doc)
133
+ when "comment_num" then parseRow(:comment_num, :Fixnum, doc)
134
+ when "mylist_counter" then parseRow(:mylist_counter,:Fixnum, doc)
135
+ when "embeddable" then parseRow(:embeddable, :Fixnum, doc)
136
+ when "no_live_play" then parseRow(:no_live_play, :Fixnum, doc)
137
+ when "user_id" then parseRow(:user_id, :Fixnum, doc)
138
+ when "first_retrieve" then parseRow(:first_retrieve,:ISO8601, doc)
139
+ when "length" then parseRow(:length, :Time, doc)
140
+ when "tags" then parseRow(:tags, :Tags, doc)
141
+ when "tag" then parseRow(:tag, :Tag, doc)
142
+ end
143
+
144
+ parsed.update(row) if row != nil
145
+ end
140
146
  end
147
+
148
+ doc.close
149
+ parsed
141
150
  end
142
151
 
143
- doc.close
144
- parsed
145
- end
152
+ # タグ検索のAtomフィードが返すXMLを解析し、ハッシュオブジェクトにして返します。
153
+ #
154
+ # @return [HashObj]
155
+ def tagAtom(xml)
156
+ doc = XML::Reader.string(
157
+ xml,
158
+ :options => XML::Parser::Options::NOBLANKS |
159
+ XML::Parser::Options::NOENT
160
+ )
161
+
162
+ n = -1
163
+ parsed = [{}]
146
164
 
147
- # タグ検索のAtomフィードが返すXMLを解析し、ハッシュオブジェクトにして返します。
148
- #
149
- # @return [HashObj]
150
- def tagAtom(xml)
151
- doc = XML::Reader.string(
152
- xml,
153
- :options => XML::Parser::Options::NOBLANKS |
154
- XML::Parser::Options::NOENT
155
- )
156
-
157
- n = -1
158
- parsed = [{}]
159
-
160
- while doc.read
161
- unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
162
- case doc.name
163
- when "entry"
164
- n += 1
165
- parsed[n] = {}
166
- when "title"
167
- doc.read
168
- parsed[n][:title] = doc.value
169
- when "link"
170
- doc.move_to_attribute("href")
171
- parsed[n][:video_id] = doc.value.split('/')[4]
172
- when "published", "updated"
173
- label = doc.name
174
- doc.read
175
- parsed[n][label] = Nicos::Converter.iso8601ToUnix(doc.value)
176
- when "p"
177
- doc.move_to_attribute("class")
178
- case doc.value
179
- when "nico-thumbnail"
180
- doc.read
181
- doc.move_to_attribute("src")
182
- parsed[n][:thumbnail_url] = doc.value
183
- when "nico-description"
184
- doc.read
185
- parsed[n][:description] = doc.value
186
- end
187
- when "strong"
188
- doc.move_to_attribute("class")
189
- case doc.value
190
- when "nico-info-length"
165
+ while doc.read
166
+ unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
167
+ case doc.name
168
+ when "entry"
169
+ n += 1
170
+ parsed[n] = {}
171
+ when "title"
191
172
  doc.read
192
- parsed[n][:length] = Nicos::Converter.toSeconds(doc.value)
193
- when "nico-info-date"
173
+ parsed[n][:title] = doc.value
174
+ when "link"
175
+ doc.move_to_attribute("href")
176
+ parsed[n][:video_id] = doc.value.split('/')[4]
177
+ when "published", "updated"
194
178
  label = doc.name
195
179
  doc.read
196
- parsed[n][:first_retrieve] = Nicos::Converter.japToUnix(doc.value)
197
- when "nico-numbers-view", "nico-numbers-res",
198
- "nico-numbers-mylist"
199
- label = doc.value
200
- doc.read
201
- parsed[n][label.slice(13,99)] = Nicos::Converter::commaRemover(doc.value)
202
- end
203
- end
180
+ parsed[n][label] = Nicos::Converter.iso8601ToUnix(doc.value)
181
+ when "p"
182
+ doc.move_to_attribute("class")
183
+ case doc.value
184
+ when "nico-thumbnail"
185
+ doc.read
186
+ doc.move_to_attribute("src")
187
+ parsed[n][:thumbnail_url] = doc.value
188
+ when "nico-description"
189
+ doc.read
190
+ parsed[n][:description] = doc.value
191
+ end
192
+ when "strong"
193
+ doc.move_to_attribute("class")
194
+ case doc.value
195
+ when "nico-info-length"
196
+ doc.read
197
+ parsed[n][:length] = Nicos::Converter.toSeconds(doc.value)
198
+ when "nico-info-date"
199
+ label = doc.name
200
+ doc.read
201
+ parsed[n][:first_retrieve] = Nicos::Converter.japToUnix(doc.value)
202
+ when "nico-numbers-view", "nico-numbers-res",
203
+ "nico-numbers-mylist"
204
+ label = doc.value
205
+ doc.read
206
+ parsed[n][label.slice(13,99)] = Nicos::Converter::commaRemover(doc.value)
207
+ end
208
+ end
209
+ end
204
210
  end
211
+
212
+ doc.close
213
+ parsed
205
214
  end
215
+
216
+ # マイリストのAtomフィードが返すXMLを解析し、ハッシュオブジェクトにして返します。
217
+ #
218
+ # @return [HashObj]
219
+ def mylistAtom(xml)
220
+ doc = XML::Reader.string(
221
+ xml,
222
+ :options => XML::Parser::Options::NOBLANKS |
223
+ XML::Parser::Options::NOENT
224
+ )
225
+
226
+ n = 0
227
+ parsed = { :mylist => {}, :entry => [{}] }
206
228
 
207
- doc.close
208
- parsed
209
- end
210
-
211
- # マイリストのAtomフィードが返すXMLを解析し、ハッシュオブジェクトにして返します。
212
- #
213
- # @return [HashObj]
214
- def mylistAtom(xml)
215
- doc = XML::Reader.string(
216
- xml,
217
- :options => XML::Parser::Options::NOBLANKS |
218
- XML::Parser::Options::NOENT
219
- )
220
-
221
- n = 0
222
- parsed = { :mylist => {}, :entry => [{}] }
223
-
224
- while doc.read
225
- break if doc.name === "entry"
226
-
227
- unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
228
- row = case doc.name
229
- when "title" then
230
- /(マイリスト )(.+)(‐ニコニコ動画)/ =~ parseRow(:title, :String, doc)[:title]
231
- { :title => $2 }
232
- when "id" then parseRow(:mylist_id, :mylistId,doc)
233
- when "subtitle" then parseRow(:description, :String, doc)
234
- when "updated" then parseRow(:updated, :ISO8601, doc)
235
- when "name" then parseRow(:author, :String, doc)
236
- end
229
+ while doc.read
230
+ break if doc.name === "entry"
231
+
232
+ unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
233
+ row = case doc.name
234
+ when "title" then
235
+ /(マイリスト )(.+)(‐ニコニコ動画)/ =~ parseRow(:title, :String, doc)[:title]
236
+ { :title => $2 }
237
+ when "id" then parseRow(:mylist_id, :mylistId,doc)
238
+ when "subtitle" then parseRow(:description, :String, doc)
239
+ when "updated" then parseRow(:updated, :ISO8601, doc)
240
+ when "name" then parseRow(:author, :String, doc)
241
+ end
237
242
 
238
- parsed[:mylist].update(row) if row != nil
243
+ parsed[:mylist].update(row) if row != nil
244
+ end
239
245
  end
240
- end
241
246
 
242
- while doc.read
243
- unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
244
- # bump up the page number
245
- if doc.name === "entry"
246
- n += 1
247
- parsed[:entry][n] = {}
247
+ while doc.read
248
+ unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
249
+ # bump up the page number
250
+ if doc.name === "entry"
251
+ n += 1
252
+ parsed[:entry][n] = {}
253
+ end
254
+
255
+ row = case doc.name
256
+ # <title> and <id> are marked up both in mylist and
257
+ # each entry's node. So we need to assign the value to the
258
+ # appropriate variable in accordance with node's location.
259
+ when "title" then parseRow(:title, :String, doc)
260
+ when "link" then parseRow(:video_id, :videoId, doc)
261
+ when "id" then parseRow(:item_id, :itemId, doc)
262
+ when "content"
263
+ doc.read
264
+ html = doc.value
265
+
266
+ /(<p\sclass=\"nico-memo\"\>)([^\<]{1,})/ =~ html
267
+ memo = $2
268
+
269
+ /(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/ =~ html
270
+ thumbnail_url = $2
271
+
272
+ /(<p\sclass=\"nico-description\"\>)([^\<]{1,})/ =~ html
273
+ description = $2
274
+
275
+ /(<strong\sclass\=\"nico-info-length\"\>)([^\<]{1,})/ =~ html
276
+ length = Nicos::Converter.toSeconds($2)
277
+
278
+ /(<strong\sclass\=\"nico-info-date\"\>)([^\<]{1,})/ =~ html
279
+ first_retrieve = Nicos::Converter.japToUnix($2)
280
+
281
+ /(<strong\sclass\=\"nico-numbers-view\"\>)([^\<]{1,})/ =~ html
282
+ view = Nicos::Converter.commaRemover($2)
283
+
284
+ /(<strong\sclass\=\"nico-numbers-res\"\>)([^\<]{1,})/ =~ html
285
+ res = Nicos::Converter.commaRemover($2)
286
+
287
+ /(<strong\sclass\=\"nico-numbers-mylist\"\>)([^\<]{1,})/ =~ html
288
+ mylist = Nicos::Converter.commaRemover($2)
289
+
290
+ {
291
+ :memo => memo,
292
+ :thumbnail_url => thumbnail_url,
293
+ :description => description,
294
+ :length => length,
295
+ :first_retrieve => first_retrieve,
296
+ :view => view,
297
+ :res => res,
298
+ :mylist => mylist
299
+ }
300
+ end
301
+
302
+ parsed[:entry][n].update(row) if row != nil
248
303
  end
304
+ end
305
+
306
+ doc.close
307
+ parsed
308
+ end
309
+
310
+ module_function :tagAtom
311
+ module_function :mylistAtom
312
+ module_function :getThumbInfo
313
+ end
249
314
 
250
- row = case doc.name
251
- # <title> and <id> are marked up both in mylist and
252
- # each entry's node. So we need to assign the value to the
253
- # appropriate variable in accordance with node's location.
254
- when "title" then parseRow(:title, :String, doc)
255
- when "link" then parseRow(:video_id, :videoId, doc)
256
- when "id" then parseRow(:item_id, :itemId, doc)
257
- when "content"
258
- doc.read
259
- html = doc.value
260
-
261
- /(<p\sclass=\"nico-memo\"\>)([^\<]{1,})/ =~ html
262
- memo = $2
263
-
264
- /(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/ =~ html
265
- thumbnail_url = $2
266
-
267
- /(<p\sclass=\"nico-description\"\>)([^\<]{1,})/ =~ html
268
- description = $2
269
-
270
- /(<strong\sclass\=\"nico-info-length\"\>)([^\<]{1,})/ =~ html
271
- length = Nicos::Converter.toSeconds($2)
272
-
273
- /(<strong\sclass\=\"nico-info-date\"\>)([^\<]{1,})/ =~ html
274
- first_retrieve = Nicos::Converter.japToUnix($2)
275
-
276
- /(<strong\sclass\=\"nico-numbers-view\"\>)([^\<]{1,})/ =~ html
277
- view = Nicos::Converter.commaRemover($2)
278
-
279
- /(<strong\sclass\=\"nico-numbers-res\"\>)([^\<]{1,})/ =~ html
280
- res = Nicos::Converter.commaRemover($2)
281
-
282
- /(<strong\sclass\=\"nico-numbers-mylist\"\>)([^\<]{1,})/ =~ html
283
- mylist = Nicos::Converter.commaRemover($2)
284
-
285
- {
286
- :memo => memo,
287
- :thumbnail_url => thumbnail_url,
288
- :description => description,
289
- :length => length,
290
- :first_retrieve => first_retrieve,
291
- :view => view,
292
- :res => res,
293
- :mylist => mylist
294
- }
295
- end
296
-
297
- parsed[:entry][n].update(row) if row != nil
315
+ module Html
316
+ def mylist(html)
317
+ rawScript = html.scan(
318
+ /\<script\stype\=\"text\/javascript\">.[^<]{1,}/
319
+ )[6]
320
+
321
+ /(Jarty\.globals\()(\{([^}]|\}[^)])+)/ =~ rawScript
322
+ s = $2
323
+
324
+ /(user_id:\s)([0-9]{1,})/ =~ s
325
+ user_id = $2
326
+
327
+ /(nickname:\s\")([^"]{1,})/ =~ s
328
+ author = $2
329
+
330
+ /(MylistGroup\.preload)(([^;]|[^)]\;)+)/ =~ rawScript
331
+ s = $2
332
+
333
+ /(name:\s\")([^"]{1,})/ =~ s
334
+ title = $2
335
+
336
+ /(description:\s\")([^"]{1,})/ =~ s
337
+ description = $2
338
+
339
+ /(id:\s)([0-9]{1,})/ =~ s
340
+ mylist_id = $2
341
+
342
+ /(public:\s)([0-9]{1,})/ =~ s
343
+ public = $2
344
+
345
+ /(default_sort:\s)([0-9]{1,})/ =~ s
346
+ default_sort = $2
347
+
348
+ /(create_time:\s)([0-9]{1,})/ =~ s
349
+ create_time = $2
350
+
351
+ /(update_time:\s)([0-9]{1,})/ =~ s
352
+ update_time = $2
353
+
354
+ /(icon_id:\s)([0-9]{1,})/ =~ s
355
+ icon_id = $2
356
+
357
+
358
+ /(Mylist\.preload\([0-9]{1,}\,)(.+(?=\]\)\;))/ =~ rawScript
359
+ if $2 != nil
360
+ s = $2 + "]"
361
+ entry = JSON.parse(s)
362
+ else
363
+ entry = nil
298
364
  end
365
+
366
+ parse = {
367
+ :mylist => {
368
+ :user_id => user_id,
369
+ :author => author,
370
+ :title => title,
371
+ :description => description,
372
+ :mylist_id => mylist_id,
373
+ :public => public,
374
+ :default_sort => default_sort,
375
+ :create_time => create_time,
376
+ :update_time => update_time,
377
+ :icon_id => icon_id
378
+ },
379
+ :entry => entry
380
+ }
381
+
382
+ parse
299
383
  end
300
384
 
301
- doc.close
302
- parsed
385
+ module_function :mylist
303
386
  end
304
-
305
- module_function :tagAtom
306
- module_function :mylistAtom
307
- module_function :getThumbInfo
308
387
  end
309
388
  end
@@ -47,12 +47,12 @@ module Nicos
47
47
  paramAry.push("page=#{@page}") if @page != 1
48
48
  paramAry.push(sortStr)
49
49
  paramAry.push("rss=atom&numbers=1") if method == :atom
50
- param = tag + "?" + paramAry.join('&')
50
+ param = "#{tag}?" + paramAry.join('&')
51
51
 
52
52
  host = 'www.nicovideo.jp'
53
- entity = '/tag/' + param
53
+ entity = '/tag/'
54
54
 
55
- @connector.get(host, entity)
55
+ @connector.get(host, entity, param)
56
56
  end
57
57
 
58
58
  def loop(tag, sort, method, &block)
@@ -160,7 +160,7 @@ module Nicos
160
160
  private
161
161
 
162
162
  def parse(xml)
163
- Nicos::Parser.tagAtom(xml)
163
+ Nicos::Parser::Xml.tagAtom(xml)
164
164
  end
165
165
 
166
166
  public