nicoscraper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,6 @@
1
+ require 'rubygems'
2
+ require 'ruby-debug'
3
+
4
+ require 'movie'
5
+ require 'mylist'
6
+ require 'getmovie'
@@ -0,0 +1,247 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'xml'
4
+ require 'time'
5
+ require 'converter'
6
+
7
+ module NicoParser
8
+ public
9
+
10
# Parses a thumbnail-info XML document into a flat Hash keyed by element name.
#
# * Text elements (video_id, title, description, ...) are stored as strings.
# * Counter-like elements (view_counter, comment_num, ...) go through
#   String#to_i.
# * "first_retrieve" is converted with Convert.iso8601ToUnix.
# * "length" is read as "MM:SS" and stored as a number of seconds.
# * Each <tag> is appended to the array stored under "tags_<domain>", where
#   <domain> is the "domain" attribute of the most recent <tags> element.
#
# @param xml [String] the XML document text
# @return [Hash] the parsed values
def getThumbInfo(xml)
  doc = XML::Reader.string(
    xml,
    :options => XML::Parser::Options::NOBLANKS |
                XML::Parser::Options::NOENT
  )

  parsed   = {}
  category = ""

  # Returns the text content of the element the reader is positioned on.
  # A self-closing element has no text child, so reading ahead would consume
  # (and silently drop) whatever element follows it; report nil instead.
  text_of = lambda do
    if doc.empty_element?
      nil
    else
      doc.read
      doc.value
    end
  end

  begin
    while doc.read
      next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

      case doc.name
      when "video_id", "title", "description", "thumbnail_url",
           "movie_type", "last_res_body", "watch_url", "thumb_type"
        label = doc.name
        parsed[label] = text_of.call
      when "size_high", "size_low", "view_counter", "comment_num",
           "mylist_counter", "embeddable", "no_live_play",
           "user_id"
        label = doc.name
        parsed[label] = text_of.call.to_i
      when "first_retrieve"
        parsed["first_retrieve"] = Convert.iso8601ToUnix(text_of.call)
      when "length"
        minutes, seconds = text_of.call.to_s.split(":")
        parsed["length"] = minutes.to_i * 60 + seconds.to_i
      when "tags"
        doc.move_to_attribute("domain")
        category = doc.value.to_s
        # Create the list only once per domain so that a repeated <tags>
        # element does not wipe tags that were already collected.
        parsed["tags_" + category] ||= []
      when "tag"
        (parsed["tags_" + category] ||= []).push(text_of.call)
      end
    end
  ensure
    # Release the reader even when a conversion above raises.
    doc.close
  end

  parsed
end
60
+
61
# Parses a tag-search feed (<entry> elements with title, link, published,
# updated and nico-* annotated <p>/<strong> elements) into an Array of Hashes,
# one per <entry>.
#
# Keys written per entry: "title", "video_id", "published", "updated",
# "thumbnail_url", "description", "length" (seconds) and "view" / "res" /
# "mylist" (integers).
#
# The result array starts out holding one placeholder Hash. Elements that
# appear before the first <entry> are written to it (index -1 addresses the
# last slot), and the placeholder is replaced when the first <entry> is seen.
# A feed without any <entry> therefore still yields a one-element array.
#
# @param xml [String] the feed document text
# @return [Array<Hash>] parsed entries
def tagRss(xml)
  doc = XML::Reader.string(
    xml,
    :options => XML::Parser::Options::NOBLANKS |
                XML::Parser::Options::NOENT
  )

  n      = -1
  parsed = [{}]

  begin
    while doc.read
      next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

      case doc.name
      when "entry"
        n += 1
        parsed[n] = {}
      when "title"
        doc.read
        parsed[n]["title"] = doc.value
      when "link"
        doc.move_to_attribute("href")
        # The fifth "/"-separated piece of the href is kept as the video id.
        parsed[n]["video_id"] = doc.value.to_s.split('/')[4]
      when "published", "updated"
        label = doc.name
        doc.read
        parsed[n][label] = Convert.iso8601ToUnix(doc.value)
      when "p"
        doc.move_to_attribute("class")
        case doc.value
        when "nico-thumbnail"
          # Step onto the child element and take its "src" attribute.
          doc.read
          doc.move_to_attribute("src")
          parsed[n]["thumbnail_url"] = doc.value
        when "nico-description"
          doc.read
          parsed[n]["description"] = doc.value
        end
      when "strong"
        doc.move_to_attribute("class")
        case doc.value
        when "nico-info-length"
          doc.read
          minutes, seconds = doc.value.to_s.split(":")
          parsed[n]["length"] = minutes.to_i * 60 + seconds.to_i
        when "nico-numbers-view", "nico-numbers-res",
             "nico-numbers-mylist"
          # "nico-numbers-view" -> "view", and so on.
          key = doc.value.sub("nico-numbers-", "")
          doc.read
          # Counts may carry thousands separators (mylistRss strips commas
          # for the same class names); a bare to_i would stop at the first
          # comma and turn "1,234" into 1.
          parsed[n][key] = doc.value.to_s.delete(",").to_i
        end
      end
    end
  ensure
    # Release the reader even when a conversion above raises.
    doc.close
  end

  parsed
end
119
+
120
# Parses a mylist feed into
#   { "mylist" => { ...feed-level values... }, "entry" => [ {...}, ... ] }.
#
# Feed-level keys: "title", "mylist_id", "updated", "author".
# Per-entry keys: "title", "video_id", "description", "item_id", "memo",
# "thumbnail_url", "length", "first_retrieve", "view", "res", "mylist".
#
# The "entry" array starts with one placeholder Hash that is replaced when
# the first <entry> element is encountered.
#
# @param xml [String] the feed document text
# @return [Hash] parsed mylist information and entries
def mylistRss(xml)
  reader_options = XML::Parser::Options::NOBLANKS |
                   XML::Parser::Options::NOENT
  doc = XML::Reader.string(xml, :options => reader_options)

  n = -1
  parsed = { "mylist" => {}, "entry" => [{}] }
  mylist_info = parsed["mylist"]
  entries     = parsed["entry"]

  # Finds +pattern+ (an opening tag followed by its text) in +markup+ and
  # returns what comes after the first +offset+ characters of the match,
  # i.e. the text behind the opening tag, capped at 999 characters.
  # Yields nil when the pattern does not match.
  text_behind = lambda do |markup, pattern, offset|
    markup.slice(pattern).to_s.slice(offset, 999)
  end

  while doc.read
    next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

    case doc.name

    # <title> and <id> occur both at feed level and inside every entry, so
    # the position (before or after the first <entry>) decides where the
    # value is stored.
    when "title"
      doc.read
      if n == -1
        # Drop the first 6 and the last 7 characters of the feed title.
        feed_title = doc.value.slice(6, 99)
        mylist_info["title"] = feed_title.slice(0, feed_title.length - 7)
      else
        entries[n]["title"] = doc.value
      end
    when "link"
      unless n == -1
        doc.move_to_attribute("href")
        entries[n]["video_id"] = Extract.videoId(doc.value)
      end
    when "subtitle"
      doc.read
      entries[n]["description"] = doc.value
    when "id"
      doc.read
      if n == -1
        mylist_info["mylist_id"] = Extract.mylistId(doc.value)
      else
        entries[n]["item_id"] = Extract.itemId(doc.value)
      end
    when "updated"
      doc.read
      mylist_info["updated"] = Convert.iso8601ToUnix(doc.value)
    when "name"
      doc.read
      mylist_info["author"] = doc.value
    when "entry"
      n += 1
      entries[n] = {}
    when "content"
      # The element's text is an HTML fragment; the individual fields are
      # cut out of it with regular expressions.
      doc.read
      html = doc.value

      memo = text_behind.call(
        html, /<p\sclass\=\"nico-memo\"\>[^\<]{1,}/, 21
      )

      thumb_match =
        /(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/.match(html)
      thumbnail_url = thumb_match ? thumb_match[2] : nil

      description = text_behind.call(
        html, /<p\sclass\=\"nico-description\"\>[^\<]{1,}/, 31
      )

      length = Convert.toSeconds(
        text_behind.call(
          html, /<strong\sclass\=\"nico-info-length\"\>[^\<]{1,}/, 33
        )
      )

      first_retrieve = Convert.japToUnix(
        text_behind.call(
          html, /<strong\sclass\=\"nico-info-date\"\>[^\<]{1,}/, 31
        )
      )

      view = Convert.commaRemover(
        text_behind.call(
          html, /<strong\sclass\=\"nico-numbers-view\"\>[^\<]{1,}/, 34
        )
      )

      res = Convert.commaRemover(
        text_behind.call(
          html, /<strong\sclass\=\"nico-numbers-res\"\>[^\<]{1,}/, 33
        )
      )

      mylist = Convert.commaRemover(
        text_behind.call(
          html, /<strong\sclass\=\"nico-numbers-mylist\"\>[^\<]{1,}/, 36
        )
      )

      entry = entries[n]
      entry["memo"]           = memo
      entry["thumbnail_url"]  = thumbnail_url
      entry["description"]    = description
      entry["length"]         = length
      entry["first_retrieve"] = first_retrieve
      entry["view"]           = view
      entry["res"]            = res
      entry["mylist"]         = mylist
    end
  end

  doc.close
  parsed
end
243
+
244
+ module_function :tagRss
245
+ module_function :mylistRss
246
+ module_function :getThumbInfo
247
+ end
@@ -0,0 +1,205 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'ruby-debug'
4
+
5
+ require 'time'
6
+ require 'mechanize'
7
+ require 'kconv'
8
+
9
+ require 'parser'
10
+
11
+
12
# Request pacing and retry settings handed to the connector for tag searches.
# NOTE(review): the unit of the wait values is not stated here; presumably
# seconds — confirm against the connector implementation.
$wait_byTag = {
  # Number of requests issued back to back.
  'consec_count'   => 10,
  # Wait after a run of consecutive requests.
  'consec_wait'    => 10,
  # Wait between the individual requests of a consecutive run.
  'each'           => 10,

  # Wait before retrying after access was rejected
  # (the "too many consecutive accesses in a short time" response).
  'rejected'       => 120,
  # Wait before retrying after a "403".
  '403'            => 600,
  # Amount added to the per-request wait from then on once access was
  # rejected.
  'increment'      => 1,

  # Wait before retrying after a timeout.
  'timeout'        => 5,
  # Wait before retrying after a "500".
  '500'            => 600,
  # Wait before retrying after a "503".
  '503'            => 600,

  # Upper limit on the number of retries.
  'allowance_time' => 5
}

# Same settings as above, kept as a separate table (the name suggests it is
# meant for the lightweight mylist path — verify against its users).
$wait_byMylistLt = {
  'consec_count'   => 10,
  'consec_wait'    => 10,
  'each'           => 10,

  'rejected'       => 120,
  '403'            => 600,
  'increment'      => 1,
  'timeout'        => 5,
  '500'            => 600,
  '503'            => 600,
  'allowance_time' => 5
}
42
+
43
# Module-level entry points for searching movies by tag.
#
# Both functions build a fresh searcher object, run it, and hand every
# (result, page) pair the searcher yields on to the caller's block. The
# value of the caller's block is passed back to the searcher.
module GetMovie
  module_function

  # Runs a tag search through GetMovieByTag.
  #
  # @param tag [String] tag to search for
  # @param sort [String] sort name understood by the searcher
  # @param waitObj [Hash] wait settings forwarded to the searcher
  # @yield [result, page] each page's parsed result and its page number
  def byTag(tag, sort, waitObj, &block)
    searcher = GetMovieByTag.new
    searcher.execute(tag, sort, waitObj) do |result, page|
      block.call(result, page)
    end
  end

  # Runs a tag search through GetMovieByTagLt.
  #
  # @param tag [String] tag to search for
  # @param sort [String] sort name understood by the searcher
  # @param waitObj [Hash] wait settings forwarded to the searcher
  # @yield [result, page] each page's parsed result and its page number
  def byTagLt(tag, sort, waitObj, &block)
    searcher = GetMovieByTagLt.new
    searcher.execute(tag, sort, waitObj) do |result, page|
      block.call(result, page)
    end
  end
end
63
+
64
# Shared paging logic for tag searches.
#
# Subclasses are expected to set @con (an object responding to #setWait and
# #get) and to implement #parse(response).
class GetMovieByTagSuper
  # Host every tag-search request is sent to.
  HOST = 'www.nicovideo.jp'

  # Query-string fragment for each supported sort name. The empty fragment
  # means that no sort parameter is sent at all.
  SORT_PARAMS = {
    'comment_new'  => '',
    'comment_old'  => 'order=a',
    'view_many'    => 'sort=v',
    'view_few'     => 'sort=v&order=a',
    'comment_many' => 'sort=r',
    'comment_few'  => 'sort=r&order=a',
    'mylist_many'  => 'sort=m',
    'mylist_few'   => 'sort=m&order=a',
    'post_new'     => 'sort=f',
    'post_old'     => 'sort=f&order=a',
    'length_long'  => 'sort=l',
    'length_short' => 'sort=l&order=a'
  }.freeze

  # Fetches result pages one after another, starting with page 1.
  #
  # Each response is run through #parse and yielded together with its page
  # number. Paging stops as soon as the block returns a truthy value or a
  # request yields no response (nil/false).
  #
  # @param tag [String] tag to search for
  # @param sort [String] one of the SORT_PARAMS keys
  # @param method [String] "atom" requests the feed variant; any other value
  #   adds no format parameter
  # @param waitObj [Hash] wait settings handed to the connector
  # @yield [result, page] parsed page and its page number
  # @return [nil]
  def loop(tag, sort, method, waitObj, &block)
    page = 1

    # Kernel#loop cannot be used here: this method shadows it.
    while true
      response = get(tag, sort, page, method, waitObj)
      break unless response
      break if block.call(parse(response), page)

      page += 1
    end

    nil
  end

  private

  # Builds the request path for one result page and fetches it through @con.
  #
  # Only non-empty fragments are put into the query string, so the path never
  # contains a stray "&" or a dangling "?". An unknown sort name is treated
  # like the default ordering (no sort parameter).
  #
  # @return whatever @con.get returns
  def get(tag, sort, page, method, waitObj)
    params = []
    params.push("page=#{page}") if page != 1

    sort_param = SORT_PARAMS[sort].to_s
    params.push(sort_param) unless sort_param.empty?

    params.push('rss=atom&numbers=1') if method == 'atom'

    entity = '/tag/' + tag
    entity += '?' + params.join('&') unless params.empty?

    @con.setWait(waitObj)
    @con.get(HOST, entity)
  end
end
136
+
137
+
138
# Tag search that reads its values out of the HTML result page held by the
# connector's mechanize agent.
class GetMovieByTag < GetMovieByTagSuper
  def initialize
    @NumOfSearched = 32
    @incrAmt       = 0.2

    @con = Connector.new('mech')

    # XPath expressions locating each value inside the result HTML.
    @videoIdXP = "//div[@class='uad_thumbfrm']/table/tr/td/p/a"
    @lengthXP  = "//div[@class='uad_thumbfrm']/table/tr/td/p[2]/span"
    @viewXP    = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[1]/strong"
    @resXP     = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[2]/strong"
    @mylistXP  = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[3]/a/strong"
    @adXP      = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[4]/a/strong"
  end

  # Extracts the values of one movie from the page currently loaded in
  # @con.mech.
  #
  # NOTE(review): GetMovieByTagSuper#loop calls parse with the value returned
  # by the connector's get, while this method uses its argument as an index
  # into the XPath matches — confirm what Connector#get returns for 'mech'.
  #
  # @param movieNum index of the movie among the XPath matches
  # @return [Array<Hash>] a one-element array holding the movie's values
  def parse(movieNum)
    href     = node_at(@videoIdXP, movieNum)['href']
    video_id = /(sm|nm)[0-9]{1,}/.match(href)[0]

    # The length text is split on ":" and converted to seconds.
    minutes, seconds = node_at(@lengthXP, movieNum).text.split(/\:/)
    length = minutes.to_i * 60 + seconds.to_i

    view   = count_at(@viewXP, movieNum)
    res    = count_at(@resXP, movieNum)
    mylist = count_at(@mylistXP, movieNum)
    ad     = count_at(@adXP, movieNum)

    [{
      "video_id" => video_id,
      "length"   => length,
      "view"     => view,
      "res"      => res,
      "mylist"   => mylist,
      "ad"       => ad
    }]
  end

  # Pages through the search results, yielding (result, page) to the block.
  def execute(tag, sort, waitObj, &block)
    loop(tag, sort, "mech", waitObj) do |result, page|
      block.call(result, page)
    end
  end

  private

  # Node at position +index+ among the matches of +xpath+ on the current page.
  def node_at(xpath, index)
    @con.mech.page.search(xpath)[index]
  end

  # Integer value of the node's text with the commas removed.
  def count_at(xpath, index)
    node_at(xpath, index).text.gsub(/\,/, '').to_i
  end
end
185
+
186
# Tag search that requests the "atom" variant of the result pages and parses
# each response with NicoParser.tagRss.
class GetMovieByTagLt < GetMovieByTagSuper
  def initialize
    @NumOfSearched = 32
    @incrAmt       = 0.2
    @con           = Connector.new('atom')
  end

  # Turns one feed document into the array produced by NicoParser.tagRss.
  #
  # @param xml the response handed over by the paging loop
  def parse(xml)
    NicoParser.tagRss(xml)
  end

  # Pages through the search results, yielding (result, page) to the block.
  def execute(tag, sort, waitObj, &block)
    loop(tag, sort, "atom", waitObj) do |result, page|
      block.call(result, page)
    end
  end
end
203
+
204
+
205
+