nicoscraper 0.2.12 → 0.2.13

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,306 +4,385 @@ $:.unshift File.dirname(__FILE__)
4
4
  require 'rubygems'
5
5
  require 'xml'
6
6
  require 'time'
7
+ require 'json'
7
8
 
8
9
  require 'converter.rb'
9
10
 
10
11
  module Nicos
11
12
  module Parser
12
- def parseRow(symbol, type, doc)
13
- hash = {}
14
-
15
- value = case type
16
- when :Fixnum then
17
- doc.read
18
- doc.value.to_i
19
- when :String then
20
- doc.read
21
- p doc.value
22
- doc.value
23
- when :ISO8601 then
24
- doc.read
25
- Nicos::Converter.iso8601ToUnix(doc.value)
26
- when :JapDate then
27
- doc.read
28
- Nicos::Converter.japToUnix(doc.value)
29
- when :Time then
30
- doc.read
31
- Nicos::Converter.toSeconds(doc.value)
32
-
33
- # for Mylist Atom
34
- when :mylistId then
35
- doc.read
36
- Nicos::Extractor.mylistId(doc.value)
37
- when :videoId then
38
- doc.move_to_attribute("href")
39
- Nicos::Extractor.videoId(doc.value)
40
-
41
- # for getThumbInfo
42
- when :Tags then
43
- doc.move_to_attribute("domain")
44
- symbol = case doc.value
45
- when "jp" then :tags_jp
46
- when "tw" then :tags_tw
47
- when "de" then :tags_de
48
- when "es" then :tags_es
49
- end
50
-
51
- tags = []
52
- lockedTags = []
53
- category = nil
54
- lock = nil
13
+ module Xml
14
+ def parseRow(symbol, type, doc)
15
+ hash = {}
16
+
17
+ value = case type
18
+ # common
19
+ when :Fixnum then
20
+ doc.read
21
+ doc.value.to_i
22
+ when :String then
23
+ doc.read
24
+ doc.value
25
+ when :ISO8601 then
26
+ doc.read
27
+ Nicos::Converter.iso8601ToUnix(doc.value)
28
+ when :JapDate then
29
+ doc.read
30
+ Nicos::Converter.japToUnix(doc.value)
31
+ when :Time then
32
+ doc.read
33
+ Nicos::Converter.toSeconds(doc.value)
34
+
35
+ # for Mylist Atom
36
+ when :mylistId then
37
+ doc.read
38
+ Nicos::Extractor.mylistId(doc.value)
39
+ when :itemId then
40
+ doc.read
41
+ Nicos::Extractor.itemId(doc.value)
42
+ when :videoId then
43
+ doc.move_to_attribute("href")
44
+ Nicos::Extractor.videoId(doc.value)
45
+
46
+ # for getThumbInfo
47
+ when :Tags then
48
+ doc.move_to_attribute("domain")
49
+ symbol = case doc.value
50
+ when "jp" then :tags_jp
51
+ when "tw" then :tags_tw
52
+ when "de" then :tags_de
53
+ when "es" then :tags_es
54
+ end
55
55
 
56
- while doc.read
57
- unless doc.node_type == XML::Reader::TYPE_END_ENTITY
58
- break if doc.name === "tags"
56
+ tags = []
57
+ lockedTags = []
58
+ category = nil
59
+ lock = nil
60
+
61
+ while doc.read
62
+ unless doc.node_type == XML::Reader::TYPE_END_ENTITY
63
+ break if doc.name === "tags"
64
+
65
+ if category == nil
66
+ doc.move_to_attribute("category")
67
+ if doc.name === "category"
68
+ doc.read
69
+ category = doc.value
70
+ doc.read
71
+ end
72
+ end
59
73
 
60
- if category == nil
61
- doc.move_to_attribute("category")
62
- if doc.name === "category"
74
+ doc.move_to_attribute("lock")
75
+ if doc.name === "lock"
76
+ lock = true
63
77
  doc.read
64
- category = doc.value
65
78
  doc.read
79
+ else lock = false
66
80
  end
67
- end
68
81
 
69
- doc.move_to_attribute("lock")
70
- if doc.name === "lock"
71
- lock = true
72
- doc.read
73
- doc.read
74
- else lock = false
82
+ doc.read_inner_xml
83
+ if doc.value != nil
84
+ if lock then lockedTags.push(doc.value)
85
+ else tags.push(doc.value) end
86
+ end
75
87
  end
76
-
77
- doc.read_inner_xml
78
- if doc.value != nil
79
- if lock then lockedTags.push(doc.value)
80
- else tags.push(doc.value) end
81
- end
82
88
  end
83
- end
84
89
 
85
- {
86
- :category => category,
87
- :tags => tags,
88
- :lockedTags => lockedTags
89
- }
90
- end
90
+ {
91
+ :category => category,
92
+ :tags => tags,
93
+ :lockedTags => lockedTags
94
+ }
95
+ end
91
96
 
92
- hash[symbol] = value
93
- hash
94
- end
95
- module_function :parseRow
97
+ hash[symbol] = value
98
+ hash
99
+ end
100
+ module_function :parseRow
96
101
 
97
- def parseTag
98
- end
102
+ def parseTag
103
+ end
99
104
 
100
- # getThumbInfoが返すXMLを解析し、ハッシュオブジェクトにして返します。
101
- #
102
- # @return [HashObj]
103
- def getThumbInfo(xml)
104
- doc = XML::Reader.string(
105
- xml,
106
- :options => XML::Parser::Options::NOBLANKS |
107
- XML::Parser::Options::NOENT
108
- )
109
-
110
- n = -1
111
- parsed = {}
112
-
113
- while doc.read
114
- unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
115
- row = case doc.name
116
- when "video_id" then parseRow(:video_id, :String, doc)
117
- when "title" then parseRow(:title, :String, doc)
118
- when "description" then parseRow(:description, :String, doc)
119
- when "thumbnail_url" then parseRow(:thumbnail_url, :String, doc)
120
- when "movie_type" then parseRow(:movie_type, :String, doc)
121
- when "last_res_body" then parseRow(:last_res_body, :String, doc)
122
- when "watch_url" then parseRow(:watch_url, :String, doc)
123
- when "thumb_type" then parseRow(:thumb_type, :String, doc)
124
-
125
- when "size_high" then parseRow(:size_high, :Fixnum, doc)
126
- when "size_low" then parseRow(:size_low, :Fixnum, doc)
127
- when "view_counter" then parseRow(:view_counter, :Fixnum, doc)
128
- when "comment_num" then parseRow(:comment_num, :Fixnum, doc)
129
- when "mylist_counter" then parseRow(:mylist_counter,:Fixnum, doc)
130
- when "embeddable" then parseRow(:embeddable, :Fixnum, doc)
131
- when "no_live_play" then parseRow(:no_live_play, :Fixnum, doc)
132
- when "user_id" then parseRow(:user_id, :Fixnum, doc)
133
- when "first_retrieve" then parseRow(:first_retrieve,:ISO8601, doc)
134
- when "length" then parseRow(:length, :Time, doc)
135
- when "tags" then parseRow(:tags, :Tags, doc)
136
- when "tag" then parseRow(:tag, :Tag, doc)
137
- end
105
+ # getThumbInfoが返すXMLを解析し、ハッシュオブジェクトにして返します。
106
+ #
107
+ # @return [HashObj]
108
+ def getThumbInfo(xml)
109
+ doc = XML::Reader.string(
110
+ xml,
111
+ :options => XML::Parser::Options::NOBLANKS |
112
+ XML::Parser::Options::NOENT
113
+ )
114
+
115
+ n = -1
116
+ parsed = {}
138
117
 
139
- parsed.update(row) if row != nil
118
+ while doc.read
119
+ unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
120
+ row = case doc.name
121
+ when "video_id" then parseRow(:video_id, :String, doc)
122
+ when "title" then parseRow(:title, :String, doc)
123
+ when "description" then parseRow(:description, :String, doc)
124
+ when "thumbnail_url" then parseRow(:thumbnail_url, :String, doc)
125
+ when "movie_type" then parseRow(:movie_type, :String, doc)
126
+ when "last_res_body" then parseRow(:last_res_body, :String, doc)
127
+ when "watch_url" then parseRow(:watch_url, :String, doc)
128
+ when "thumb_type" then parseRow(:thumb_type, :String, doc)
129
+
130
+ when "size_high" then parseRow(:size_high, :Fixnum, doc)
131
+ when "size_low" then parseRow(:size_low, :Fixnum, doc)
132
+ when "view_counter" then parseRow(:view_counter, :Fixnum, doc)
133
+ when "comment_num" then parseRow(:comment_num, :Fixnum, doc)
134
+ when "mylist_counter" then parseRow(:mylist_counter,:Fixnum, doc)
135
+ when "embeddable" then parseRow(:embeddable, :Fixnum, doc)
136
+ when "no_live_play" then parseRow(:no_live_play, :Fixnum, doc)
137
+ when "user_id" then parseRow(:user_id, :Fixnum, doc)
138
+ when "first_retrieve" then parseRow(:first_retrieve,:ISO8601, doc)
139
+ when "length" then parseRow(:length, :Time, doc)
140
+ when "tags" then parseRow(:tags, :Tags, doc)
141
+ when "tag" then parseRow(:tag, :Tag, doc)
142
+ end
143
+
144
+ parsed.update(row) if row != nil
145
+ end
140
146
  end
147
+
148
+ doc.close
149
+ parsed
141
150
  end
142
151
 
143
- doc.close
144
- parsed
145
- end
152
+ # タグ検索のAtomフィードが返すXMLを解析し、ハッシュオブジェクトにして返します。
153
+ #
154
+ # @return [HashObj]
155
+ def tagAtom(xml)
156
+ doc = XML::Reader.string(
157
+ xml,
158
+ :options => XML::Parser::Options::NOBLANKS |
159
+ XML::Parser::Options::NOENT
160
+ )
161
+
162
+ n = -1
163
+ parsed = [{}]
146
164
 
147
- # タグ検索のAtomフィードが返すXMLを解析し、ハッシュオブジェクトにして返します。
148
- #
149
- # @return [HashObj]
150
- def tagAtom(xml)
151
- doc = XML::Reader.string(
152
- xml,
153
- :options => XML::Parser::Options::NOBLANKS |
154
- XML::Parser::Options::NOENT
155
- )
156
-
157
- n = -1
158
- parsed = [{}]
159
-
160
- while doc.read
161
- unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
162
- case doc.name
163
- when "entry"
164
- n += 1
165
- parsed[n] = {}
166
- when "title"
167
- doc.read
168
- parsed[n][:title] = doc.value
169
- when "link"
170
- doc.move_to_attribute("href")
171
- parsed[n][:video_id] = doc.value.split('/')[4]
172
- when "published", "updated"
173
- label = doc.name
174
- doc.read
175
- parsed[n][label] = Nicos::Converter.iso8601ToUnix(doc.value)
176
- when "p"
177
- doc.move_to_attribute("class")
178
- case doc.value
179
- when "nico-thumbnail"
180
- doc.read
181
- doc.move_to_attribute("src")
182
- parsed[n][:thumbnail_url] = doc.value
183
- when "nico-description"
184
- doc.read
185
- parsed[n][:description] = doc.value
186
- end
187
- when "strong"
188
- doc.move_to_attribute("class")
189
- case doc.value
190
- when "nico-info-length"
165
+ while doc.read
166
+ unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
167
+ case doc.name
168
+ when "entry"
169
+ n += 1
170
+ parsed[n] = {}
171
+ when "title"
191
172
  doc.read
192
- parsed[n][:length] = Nicos::Converter.toSeconds(doc.value)
193
- when "nico-info-date"
173
+ parsed[n][:title] = doc.value
174
+ when "link"
175
+ doc.move_to_attribute("href")
176
+ parsed[n][:video_id] = doc.value.split('/')[4]
177
+ when "published", "updated"
194
178
  label = doc.name
195
179
  doc.read
196
- parsed[n][:first_retrieve] = Nicos::Converter.japToUnix(doc.value)
197
- when "nico-numbers-view", "nico-numbers-res",
198
- "nico-numbers-mylist"
199
- label = doc.value
200
- doc.read
201
- parsed[n][label.slice(13,99)] = Nicos::Converter::commaRemover(doc.value)
202
- end
203
- end
180
+ parsed[n][label] = Nicos::Converter.iso8601ToUnix(doc.value)
181
+ when "p"
182
+ doc.move_to_attribute("class")
183
+ case doc.value
184
+ when "nico-thumbnail"
185
+ doc.read
186
+ doc.move_to_attribute("src")
187
+ parsed[n][:thumbnail_url] = doc.value
188
+ when "nico-description"
189
+ doc.read
190
+ parsed[n][:description] = doc.value
191
+ end
192
+ when "strong"
193
+ doc.move_to_attribute("class")
194
+ case doc.value
195
+ when "nico-info-length"
196
+ doc.read
197
+ parsed[n][:length] = Nicos::Converter.toSeconds(doc.value)
198
+ when "nico-info-date"
199
+ label = doc.name
200
+ doc.read
201
+ parsed[n][:first_retrieve] = Nicos::Converter.japToUnix(doc.value)
202
+ when "nico-numbers-view", "nico-numbers-res",
203
+ "nico-numbers-mylist"
204
+ label = doc.value
205
+ doc.read
206
+ parsed[n][label.slice(13,99)] = Nicos::Converter::commaRemover(doc.value)
207
+ end
208
+ end
209
+ end
204
210
  end
211
+
212
+ doc.close
213
+ parsed
205
214
  end
215
+
216
+ # マイリストのAtomフィードが返すXMLを解析し、ハッシュオブジェクトにして返します。
217
+ #
218
+ # @return [HashObj]
219
+ def mylistAtom(xml)
220
+ doc = XML::Reader.string(
221
+ xml,
222
+ :options => XML::Parser::Options::NOBLANKS |
223
+ XML::Parser::Options::NOENT
224
+ )
225
+
226
+ n = 0
227
+ parsed = { :mylist => {}, :entry => [{}] }
206
228
 
207
- doc.close
208
- parsed
209
- end
210
-
211
- # マイリストのAtomフィードが返すXMLを解析し、ハッシュオブジェクトにして返します。
212
- #
213
- # @return [HashObj]
214
- def mylistAtom(xml)
215
- doc = XML::Reader.string(
216
- xml,
217
- :options => XML::Parser::Options::NOBLANKS |
218
- XML::Parser::Options::NOENT
219
- )
220
-
221
- n = 0
222
- parsed = { :mylist => {}, :entry => [{}] }
223
-
224
- while doc.read
225
- break if doc.name === "entry"
226
-
227
- unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
228
- row = case doc.name
229
- when "title" then
230
- /(マイリスト )(.+)(‐ニコニコ動画)/ =~ parseRow(:title, :String, doc)[:title]
231
- { :title => $2 }
232
- when "id" then parseRow(:mylist_id, :mylistId,doc)
233
- when "subtitle" then parseRow(:description, :String, doc)
234
- when "updated" then parseRow(:updated, :ISO8601, doc)
235
- when "name" then parseRow(:author, :String, doc)
236
- end
229
+ while doc.read
230
+ break if doc.name === "entry"
231
+
232
+ unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
233
+ row = case doc.name
234
+ when "title" then
235
+ /(マイリスト )(.+)(‐ニコニコ動画)/ =~ parseRow(:title, :String, doc)[:title]
236
+ { :title => $2 }
237
+ when "id" then parseRow(:mylist_id, :mylistId,doc)
238
+ when "subtitle" then parseRow(:description, :String, doc)
239
+ when "updated" then parseRow(:updated, :ISO8601, doc)
240
+ when "name" then parseRow(:author, :String, doc)
241
+ end
237
242
 
238
- parsed[:mylist].update(row) if row != nil
243
+ parsed[:mylist].update(row) if row != nil
244
+ end
239
245
  end
240
- end
241
246
 
242
- while doc.read
243
- unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
244
- # bump up the page number
245
- if doc.name === "entry"
246
- n += 1
247
- parsed[:entry][n] = {}
247
+ while doc.read
248
+ unless doc.node_type == XML::Reader::TYPE_END_ELEMENT
249
+ # bump up the page number
250
+ if doc.name === "entry"
251
+ n += 1
252
+ parsed[:entry][n] = {}
253
+ end
254
+
255
+ row = case doc.name
256
+ # <title> and <id> are marked up both in mylist and
257
+ # each entry's node. So we need to assign the value to the
258
+ # appropriate variable in accordance with node's location.
259
+ when "title" then parseRow(:title, :String, doc)
260
+ when "link" then parseRow(:video_id, :videoId, doc)
261
+ when "id" then parseRow(:item_id, :itemId, doc)
262
+ when "content"
263
+ doc.read
264
+ html = doc.value
265
+
266
+ /(<p\sclass=\"nico-memo\"\>)([^\<]{1,})/ =~ html
267
+ memo = $2
268
+
269
+ /(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/ =~ html
270
+ thumbnail_url = $2
271
+
272
+ /(<p\sclass=\"nico-description\"\>)([^\<]{1,})/ =~ html
273
+ description = $2
274
+
275
+ /(<strong\sclass\=\"nico-info-length\"\>)([^\<]{1,})/ =~ html
276
+ length = Nicos::Converter.toSeconds($2)
277
+
278
+ /(<strong\sclass\=\"nico-info-date\"\>)([^\<]{1,})/ =~ html
279
+ first_retrieve = Nicos::Converter.japToUnix($2)
280
+
281
+ /(<strong\sclass\=\"nico-numbers-view\"\>)([^\<]{1,})/ =~ html
282
+ view = Nicos::Converter.commaRemover($2)
283
+
284
+ /(<strong\sclass\=\"nico-numbers-res\"\>)([^\<]{1,})/ =~ html
285
+ res = Nicos::Converter.commaRemover($2)
286
+
287
+ /(<strong\sclass\=\"nico-numbers-mylist\"\>)([^\<]{1,})/ =~ html
288
+ mylist = Nicos::Converter.commaRemover($2)
289
+
290
+ {
291
+ :memo => memo,
292
+ :thumbnail_url => thumbnail_url,
293
+ :description => description,
294
+ :length => length,
295
+ :first_retrieve => first_retrieve,
296
+ :view => view,
297
+ :res => res,
298
+ :mylist => mylist
299
+ }
300
+ end
301
+
302
+ parsed[:entry][n].update(row) if row != nil
248
303
  end
304
+ end
305
+
306
+ doc.close
307
+ parsed
308
+ end
309
+
310
+ module_function :tagAtom
311
+ module_function :mylistAtom
312
+ module_function :getThumbInfo
313
+ end
249
314
 
250
- row = case doc.name
251
- # <title> and <id> are marked up both in mylist and
252
- # each entry's node. So we need to assign the value to the
253
- # appropriate variable in accordance with node's location.
254
- when "title" then parseRow(:title, :String, doc)
255
- when "link" then parseRow(:video_id, :videoId, doc)
256
- when "id" then parseRow(:item_id, :itemId, doc)
257
- when "content"
258
- doc.read
259
- html = doc.value
260
-
261
- /(<p\sclass=\"nico-memo\"\>)([^\<]{1,})/ =~ html
262
- memo = $2
263
-
264
- /(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/ =~ html
265
- thumbnail_url = $2
266
-
267
- /(<p\sclass=\"nico-description\"\>)([^\<]{1,})/ =~ html
268
- description = $2
269
-
270
- /(<strong\sclass\=\"nico-info-length\"\>)([^\<]{1,})/ =~ html
271
- length = Nicos::Converter.toSeconds($2)
272
-
273
- /(<strong\sclass\=\"nico-info-date\"\>)([^\<]{1,})/ =~ html
274
- first_retrieve = Nicos::Converter.japToUnix($2)
275
-
276
- /(<strong\sclass\=\"nico-numbers-view\"\>)([^\<]{1,})/ =~ html
277
- view = Nicos::Converter.commaRemover($2)
278
-
279
- /(<strong\sclass\=\"nico-numbers-res\"\>)([^\<]{1,})/ =~ html
280
- res = Nicos::Converter.commaRemover($2)
281
-
282
- /(<strong\sclass\=\"nico-numbers-mylist\"\>)([^\<]{1,})/ =~ html
283
- mylist = Nicos::Converter.commaRemover($2)
284
-
285
- {
286
- :memo => memo,
287
- :thumbnail_url => thumbnail_url,
288
- :description => description,
289
- :length => length,
290
- :first_retrieve => first_retrieve,
291
- :view => view,
292
- :res => res,
293
- :mylist => mylist
294
- }
295
- end
296
-
297
- parsed[:entry][n].update(row) if row != nil
315
+ module Html
316
+ def mylist(html)
317
+ rawScript = html.scan(
318
+ /\<script\stype\=\"text\/javascript\">.[^<]{1,}/
319
+ )[6]
320
+
321
+ /(Jarty\.globals\()(\{([^}]|\}[^)])+)/ =~ rawScript
322
+ s = $2
323
+
324
+ /(user_id:\s)([0-9]{1,})/ =~ s
325
+ user_id = $2
326
+
327
+ /(nickname:\s\")([^"]{1,})/ =~ s
328
+ author = $2
329
+
330
+ /(MylistGroup\.preload)(([^;]|[^)]\;)+)/ =~ rawScript
331
+ s = $2
332
+
333
+ /(name:\s\")([^"]{1,})/ =~ s
334
+ title = $2
335
+
336
+ /(description:\s\")([^"]{1,})/ =~ s
337
+ description = $2
338
+
339
+ /(id:\s)([0-9]{1,})/ =~ s
340
+ mylist_id = $2
341
+
342
+ /(public:\s)([0-9]{1,})/ =~ s
343
+ public = $2
344
+
345
+ /(default_sort:\s)([0-9]{1,})/ =~ s
346
+ default_sort = $2
347
+
348
+ /(create_time:\s)([0-9]{1,})/ =~ s
349
+ create_time = $2
350
+
351
+ /(update_time:\s)([0-9]{1,})/ =~ s
352
+ update_time = $2
353
+
354
+ /(icon_id:\s)([0-9]{1,})/ =~ s
355
+ icon_id = $2
356
+
357
+
358
+ /(Mylist\.preload\([0-9]{1,}\,)(.+(?=\]\)\;))/ =~ rawScript
359
+ if $2 != nil
360
+ s = $2 + "]"
361
+ entry = JSON.parse(s)
362
+ else
363
+ entry = nil
298
364
  end
365
+
366
+ parse = {
367
+ :mylist => {
368
+ :user_id => user_id,
369
+ :author => author,
370
+ :title => title,
371
+ :description => description,
372
+ :mylist_id => mylist_id,
373
+ :public => public,
374
+ :default_sort => default_sort,
375
+ :create_time => create_time,
376
+ :update_time => update_time,
377
+ :icon_id => icon_id
378
+ },
379
+ :entry => entry
380
+ }
381
+
382
+ parse
299
383
  end
300
384
 
301
- doc.close
302
- parsed
385
+ module_function :mylist
303
386
  end
304
-
305
- module_function :tagAtom
306
- module_function :mylistAtom
307
- module_function :getThumbInfo
308
387
  end
309
388
  end
@@ -47,12 +47,12 @@ module Nicos
47
47
  paramAry.push("page=#{@page}") if @page != 1
48
48
  paramAry.push(sortStr)
49
49
  paramAry.push("rss=atom&numbers=1") if method == :atom
50
- param = tag + "?" + paramAry.join('&')
50
+ param = "#{tag}?" + paramAry.join('&')
51
51
 
52
52
  host = 'www.nicovideo.jp'
53
- entity = '/tag/' + param
53
+ entity = '/tag/'
54
54
 
55
- @connector.get(host, entity)
55
+ @connector.get(host, entity, param)
56
56
  end
57
57
 
58
58
  def loop(tag, sort, method, &block)
@@ -160,7 +160,7 @@ module Nicos
160
160
  private
161
161
 
162
162
  def parse(xml)
163
- Nicos::Parser.tagAtom(xml)
163
+ Nicos::Parser::Xml.tagAtom(xml)
164
164
  end
165
165
 
166
166
  public