nicoscraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ require 'rubygems'
2
+ require 'ruby-debug'
3
+
4
+ require 'movie'
5
+ require 'mylist'
6
+ require 'getmovie'
@@ -0,0 +1,247 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'xml'
4
+ require 'time'
5
+ require 'converter'
6
+
7
+ module NicoParser
8
+ public
9
+
10
# Parses a thumbnail-info XML document into a flat Hash keyed by element name.
#
# * text fields (video_id, title, description, ...) are stored unchanged
# * counters, sizes, flags and user_id are converted with +to_i+
# * "first_retrieve" is passed through Convert.iso8601ToUnix
# * "length" ("minutes:seconds") is stored as a number of seconds
# * every <tag> is appended to an Array stored under "tags_<domain>", where
#   <domain> is the "domain" attribute of the enclosing <tags> element
#
# @param xml [String] the XML document to parse
# @return [Hash] the parsed values
def getThumbInfo(xml)
  doc = XML::Reader.string(
    xml,
    :options => XML::Parser::Options::NOBLANKS |
                XML::Parser::Options::NOENT
  )

  parsed   = {}
  category = ""

  # Returns the text content of the element the reader is positioned on.
  # A self-closing element has no child node to advance to; reading anyway
  # would land on (and thereby swallow) the next sibling element.
  read_text = lambda do
    if doc.empty_element?
      nil
    else
      doc.read
      doc.value
    end
  end

  begin
    while doc.read
      next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

      # Capture the name first: read_text moves the reader off the element.
      label = doc.name
      case label
      when "video_id", "title", "description", "thumbnail_url",
           "movie_type", "last_res_body", "watch_url", "thumb_type"
        parsed[label] = read_text.call
      when "size_high", "size_low", "view_counter", "comment_num",
           "mylist_counter", "embeddable", "no_live_play",
           "user_id"
        parsed[label] = read_text.call.to_i
      when "first_retrieve"
        parsed[label] = Convert.iso8601ToUnix(read_text.call)
      when "length"
        minutes, seconds = read_text.call.to_s.split(":")
        parsed["length"] = minutes.to_i * 60 + seconds.to_i
      when "tags"
        doc.move_to_attribute("domain")
        # value is nil when the attribute is absent; fall back to "".
        category = doc.value.to_s
        # Create the list only once so a repeated domain keeps its tags.
        parsed["tags_" + category] ||= []
      when "tag"
        (parsed["tags_" + category] ||= []).push(read_text.call)
      end
    end
  ensure
    # Release the reader even when parsing or conversion raises.
    doc.close
  end

  parsed
end
60
+
61
# Parses a tag-search Atom feed into an Array with one Hash per <entry>.
#
# Each entry Hash may contain "title", "video_id" (fifth "/"-separated
# segment of the link href), "published"/"updated" (via
# Convert.iso8601ToUnix), "thumbnail_url", "description", "length"
# (seconds) and the counters "view", "res" and "mylist".
#
# @param xml [String] the Atom document to parse
# @return [Array<Hash>] the parsed entries; when the feed holds no <entry>
#   the Array still contains the single placeholder Hash described below
def tagRss(xml)
  doc = XML::Reader.string(
    xml,
    :options => XML::Parser::Options::NOBLANKS |
                XML::Parser::Options::NOENT
  )

  n = -1
  # While n is -1, parsed[n] addresses this placeholder, so feed-level
  # <title>/<link>/<updated> nodes have somewhere to go. The first <entry>
  # replaces it with a fresh Hash.
  parsed = [{}]

  begin
    while doc.read
      next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

      case doc.name
      when "entry"
        n += 1
        parsed[n] = {}
      when "title"
        doc.read
        parsed[n]["title"] = doc.value
      when "link"
        doc.move_to_attribute("href")
        # to_s: value is nil when the element has no href attribute.
        parsed[n]["video_id"] = doc.value.to_s.split('/')[4]
      when "published", "updated"
        label = doc.name
        doc.read
        parsed[n][label] = Convert.iso8601ToUnix(doc.value)
      when "p"
        doc.move_to_attribute("class")
        case doc.value
        when "nico-thumbnail"
          # The thumbnail URL is the src attribute of the node that follows.
          doc.read
          doc.move_to_attribute("src")
          parsed[n]["thumbnail_url"] = doc.value
        when "nico-description"
          doc.read
          parsed[n]["description"] = doc.value
        end
      when "strong"
        doc.move_to_attribute("class")
        case doc.value
        when "nico-info-length"
          doc.read
          minutes, seconds = doc.value.to_s.split(":")
          parsed[n]["length"] = minutes.to_i * 60 + seconds.to_i
        when "nico-numbers-view", "nico-numbers-res",
             "nico-numbers-mylist"
          label = doc.value
          doc.read
          # Counters may be written with thousands separators ("1,234");
          # a bare to_i would stop at the first comma.
          count = doc.value.to_s.delete(",").to_i
          parsed[n][label.sub(/\Anico-numbers-/, "")] = count
        end
      end
    end
  ensure
    # Release the reader even when parsing or conversion raises.
    doc.close
  end

  parsed
end
119
+
120
# Parses a mylist Atom feed.
#
# Returns a Hash with two keys:
# * "mylist" - feed-level data: "title", "mylist_id", "updated", "author"
# * "entry"  - Array with one Hash per <entry>: "title", "video_id",
#   "item_id", "updated", and the values pulled out of the entry's HTML
#   <content>: "memo", "thumbnail_url", "description", "length",
#   "first_retrieve", "view", "res", "mylist"
#
# @param xml [String] the Atom document to parse
# @return [Hash] the parsed mylist and its entries; when the feed holds no
#   <entry>, "entry" still contains the single placeholder Hash
def mylistRss(xml)
  doc = XML::Reader.string(
    xml,
    :options => XML::Parser::Options::NOBLANKS |
                XML::Parser::Options::NOENT
  )

  n = -1
  # "entry" starts with a placeholder so parsed["entry"][n] is addressable
  # while n is still -1; the first <entry> replaces it.
  parsed = { "mylist" => {}, "entry" => [{}] }

  begin
    while doc.read
      next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

      # <title>, <id> and <updated> occur both at feed level and inside each
      # entry, so the value goes to the mylist or to the current entry
      # depending on whether an <entry> has been seen yet.
      in_feed = (n == -1)

      case doc.name
      when "title"
        doc.read
        if in_feed
          # Drops the first 6 and the last 7 characters of the feed title.
          # NOTE(review): presumably a fixed site prefix/suffix around the
          # mylist name - confirm against a real feed.
          tmp = doc.value.slice(6, 99)
          tmp = tmp.slice(0, tmp.length - 7)
          parsed["mylist"]["title"] = tmp
        else
          parsed["entry"][n]["title"] = doc.value
        end
      when "link"
        unless in_feed
          doc.move_to_attribute("href")
          parsed["entry"][n]["video_id"] = Extract.videoId(doc.value)
        end
      when "subtitle"
        doc.read
        # NOTE(review): a feed-level <subtitle> ends up in the placeholder
        # entry, which the first <entry> discards - confirm whether it was
        # meant to be stored on the mylist instead.
        parsed["entry"][n]["description"] = doc.value
      when "id"
        doc.read
        if in_feed
          parsed["mylist"]["mylist_id"] = Extract.mylistId(doc.value)
        else
          parsed["entry"][n]["item_id"] = Extract.itemId(doc.value)
        end
      when "updated"
        doc.read
        stamp = Convert.iso8601ToUnix(doc.value)
        if in_feed
          parsed["mylist"]["updated"] = stamp
        else
          # Keep entry timestamps on the entry so they cannot overwrite the
          # mylist's own timestamp.
          parsed["entry"][n]["updated"] = stamp
        end
      when "name"
        doc.read
        parsed["mylist"]["author"] = doc.value
      when "entry"
        n += 1
        parsed["entry"][n] = {}
      when "content"
        doc.read
        html = doc.value.to_s

        # Text that follows the given opening tag, up to the next "<", or
        # nil when the tag is absent. A capture group is used instead of a
        # hand-counted character offset so the result can neither lose
        # leading characters nor be cut off at a fixed length.
        grab = lambda { |pattern| html[pattern, 1] }

        memo = grab.call(/<p\sclass\=\"nico-memo\"\>([^\<]{1,})/)
        thumbnail_url =
          html[/(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/, 2]
        description =
          grab.call(/<p\sclass\=\"nico-description\"\>([^\<]{1,})/)
        length = Convert.toSeconds(
          grab.call(/<strong\sclass\=\"nico-info-length\"\>([^\<]{1,})/)
        )
        first_retrieve = Convert.japToUnix(
          grab.call(/<strong\sclass\=\"nico-info-date\"\>([^\<]{1,})/)
        )
        view = Convert.commaRemover(
          grab.call(/<strong\sclass\=\"nico-numbers-view\"\>([^\<]{1,})/)
        )
        res = Convert.commaRemover(
          grab.call(/<strong\sclass\=\"nico-numbers-res\"\>([^\<]{1,})/)
        )
        mylist = Convert.commaRemover(
          grab.call(/<strong\sclass\=\"nico-numbers-mylist\"\>([^\<]{1,})/)
        )

        entry = parsed["entry"][n]
        entry["memo"]           = memo
        entry["thumbnail_url"]  = thumbnail_url
        entry["description"]    = description
        entry["length"]         = length
        entry["first_retrieve"] = first_retrieve
        entry["view"]           = view
        entry["res"]            = res
        entry["mylist"]         = mylist
      end
    end
  ensure
    # Release the reader even when parsing or conversion raises.
    doc.close
  end

  parsed
end
243
+
244
+ module_function :tagRss
245
+ module_function :mylistRss
246
+ module_function :getThumbInfo
247
+ end
@@ -0,0 +1,205 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'ruby-debug'
4
+
5
+ require 'time'
6
+ require 'mechanize'
7
+ require 'kconv'
8
+
9
+ require 'parser'
10
+
11
+
12
+ $wait_byTag = {
13
+ 'consec_count' => 10, # number of requests issued consecutively
14
+ 'consec_wait' => 10, # wait applied after a run of consecutive requests
15
+ 'each' => 10, # wait applied after each request within a consecutive run
16
+
17
+ 'rejected' => 120, # wait before retrying when access is refused
18
+ # (the "consecutive accesses within a short time ..." response)
19
+ '403' => 600, # wait before retrying after a "403"
20
+ 'increment' => 1, # amount added to the per-request wait from then on after a refusal
21
+
22
+ 'timeout' => 5, # wait before retrying after a timeout
23
+ '500' => 600, # wait before retrying after a "500"
24
+ '503' => 600, # wait before retrying after a "503"
25
+
26
+ 'allowance_time'=> 5 # upper limit on the number of retries
27
+ }
28
+
29
+ $wait_byMylistLt = { # same keys and values as $wait_byTag; NOTE(review): presumably the mylist counterpart - no reader of this global is visible here
30
+ 'consec_count' => 10, # number of requests issued consecutively
31
+ 'consec_wait' => 10, # wait applied after a run of consecutive requests
32
+ 'each' => 10, # wait applied after each request within a consecutive run
33
+
34
+ 'rejected' => 120, # wait before retrying when access is refused
35
+ '403' => 600, # wait before retrying after a "403"
36
+ 'increment' => 1, # amount added to the per-request wait from then on after a refusal
37
+ 'timeout' => 5, # wait before retrying after a timeout
38
+ '500' => 600, # wait before retrying after a "500"
39
+ '503' => 600, # wait before retrying after a "503"
40
+ 'allowance_time'=> 5 # upper limit on the number of retries
41
+ }
42
+
43
# Entry points for tag searches. Both functions build a fresh worker object
# and hand the caller's block straight to its #execute; the block receives
# (result, page) for every page that was fetched.
module GetMovie
  module_function

  # Runs a tag search through GetMovieByTag.
  #
  # @param tag [String] tag to search for
  # @param sort [String] sort keyword understood by the worker
  # @param waitObj [Hash] wait settings passed on to the worker
  # @yield [result, page] parsed result and page number
  def byTag(tag, sort, waitObj, &block)
    GetMovieByTag.new.execute(tag, sort, waitObj, &block)
  end

  # Runs a tag search through GetMovieByTagLt.
  #
  # @param tag [String] tag to search for
  # @param sort [String] sort keyword understood by the worker
  # @param waitObj [Hash] wait settings passed on to the worker
  # @yield [result, page] parsed result and page number
  def byTagLt(tag, sort, waitObj, &block)
    GetMovieByTagLt.new.execute(tag, sort, waitObj, &block)
  end
end
63
+
64
# Shared paging logic for tag searches.
#
# Subclasses are expected to set @con (an object responding to #setWait and
# #get) and to implement #parse(response).
class GetMovieByTagSuper
  # Query-string fragment for each supported sort keyword.
  SORT_PARAMS = {
    'comment_new'  => '',
    'comment_old'  => 'order=a',
    'view_many'    => 'sort=v',
    'view_few'     => 'sort=v&order=a',
    'comment_many' => 'sort=r',
    'comment_few'  => 'sort=r&order=a',
    'mylist_many'  => 'sort=m',
    'mylist_few'   => 'sort=m&order=a',
    'post_new'     => 'sort=f',
    'post_old'     => 'sort=f&order=a',
    'length_long'  => 'sort=l',
    'length_short' => 'sort=l&order=a'
  }.freeze

  public

  # Fetches result pages one after another, starting at page 1, and passes
  # each parsed page to the block as (result, page). Paging stops as soon as
  # the block returns a truthy value or a request yields no response.
  #
  # NOTE: inside this class a bare `loop` resolves to this method rather
  # than Kernel#loop, which is why the body uses `until`.
  #
  # @param tag [String] tag to search for
  # @param sort [String] one of the SORT_PARAMS keys
  # @param method [String] "atom" requests the Atom feed variant
  # @param waitObj [Hash] wait settings handed to the connector
  # @return [nil]
  def loop(tag, sort, method, waitObj, &block)
    page = 1
    finished = false

    until finished
      response = get(tag, sort, page, method, waitObj)
      finished = response ? block.call(parse(response), page) : true
      page += 1
    end
  end

  private

  # Builds the request path for one result page and fetches it through @con.
  #
  # @return whatever @con.get returns for the request
  def get(tag, sort, page, method, waitObj)
    segments = []
    segments.push("page=#{page}") if page != 1
    segments.push(SORT_PARAMS[sort])
    segments.push("rss=atom&numbers=1") if method == "atom"

    # Drop the nil of an unknown sort and the empty default-sort fragment so
    # the query never contains empty segments such as "?&rss=atom".
    query = segments.compact.reject { |segment| segment.empty? }.join('&')

    # NOTE(review): the tag is placed in the path unescaped - confirm the
    # connector encodes it.
    @con.setWait(waitObj)
    @con.get('www.nicovideo.jp', "/tag/#{tag}?#{query}")
  end
end
136
+
137
+
138
# Tag search that scrapes the HTML result page through the 'mech' connector.
class GetMovieByTag < GetMovieByTagSuper
  def initialize
    @NumOfSearched = 32
    @incrAmt = 0.2

    @con = Connector.new('mech')

    # XPath of each value inside the result page's HTML; every path starts
    # at the same table row.
    row = "//div[@class='uad_thumbfrm']/table/tr"
    @videoIdXP = "#{row}/td/p/a"
    @lengthXP  = "#{row}/td/p[2]/span"
    @viewXP    = "#{row}/td[2]/div/nobr[1]/strong"
    @resXP     = "#{row}/td[2]/div/nobr[2]/strong"
    @mylistXP  = "#{row}/td[2]/div/nobr[3]/a/strong"
    @adXP      = "#{row}/td[2]/div/nobr[4]/a/strong"
  end

  # Extracts one movie from the page currently held by the connector.
  #
  # NOTE(review): the argument is used as an index into the XPath matches,
  # while GetMovieByTagSuper#loop calls parse with the value returned by
  # its get - confirm what the connector returns in 'mech' mode.
  #
  # @param movieNum index of the movie among the XPath matches
  # @return [Array<Hash>] one-element Array holding the movie's values
  def parse(movieNum)
    # Node at position movieNum among the matches of the given XPath.
    node_at = lambda { |xpath| @con.mech.page.search(xpath)[movieNum] }
    # Integer value of that node's text once the commas are removed.
    count_at = lambda { |xpath| node_at.call(xpath).text.gsub(/\,/, '').to_i }

    video_id = /(sm|nm)[0-9]{1,}/.match(node_at.call(@videoIdXP)['href'])[0]
    minutes, seconds = node_at.call(@lengthXP).text.split(/\:/)
    length = minutes.to_i * 60 + seconds.to_i

    [
      {
        "video_id" => video_id,
        "length"   => length,
        "view"     => count_at.call(@viewXP),
        "res"      => count_at.call(@resXP),
        "mylist"   => count_at.call(@mylistXP),
        "ad"       => count_at.call(@adXP)
      }
    ]
  end

  # Pages through the search in "mech" mode, handing (result, page) to the
  # caller's block.
  def execute(tag, sort, waitObj, &block)
    loop(tag, sort, "mech", waitObj, &block)
  end
end
185
+
186
# Tag search that reads the Atom feed through the 'atom' connector.
class GetMovieByTagLt < GetMovieByTagSuper
  def initialize
    @NumOfSearched, @incrAmt = 32, 0.2
    @con = Connector.new('atom')
  end

  # Delegates parsing of the feed to NicoParser.tagRss.
  #
  # @param xml the response handed over by the paging loop
  # @return whatever NicoParser.tagRss returns for it
  def parse(xml)
    NicoParser.tagRss(xml)
  end

  # Pages through the search in "atom" mode, handing (result, page) to the
  # caller's block.
  def execute(tag, sort, waitObj, &block)
    loop(tag, sort, "atom", waitObj, &block)
  end
end
203
+
204
+
205
+