nicoscraper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/lib/connector.rb +269 -0
- data/lib/converter.rb +68 -0
- data/lib/movie.rb +297 -0
- data/lib/mylist.rb +258 -0
- data/lib/nicoscraper.rb +6 -0
- data/lib/parser.rb +247 -0
- data/lib/searcher.rb +205 -0
- data/nicoscraper.gemspec +72 -0
- data/test/helper.rb +18 -0
- data/test/test_nicoscraper.rb +7 -0
- metadata +139 -0
data/lib/nicoscraper.rb
ADDED
data/lib/parser.rb
ADDED
@@ -0,0 +1,247 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'rubygems'
|
3
|
+
require 'xml'
|
4
|
+
require 'time'
|
5
|
+
require 'converter'
|
6
|
+
|
7
|
+
# Parsers for the XML documents returned by nicovideo.jp: the thumbnail-info
# response and the Atom feeds of tag searches and mylists.
#
# Every parser walks its document with a streaming XML::Reader and returns
# plain Hashes/Arrays keyed by Strings. All three parsers are exposed as
# module functions (NicoParser.tagRss(xml), ...).
module NicoParser
  # Parses a thumbnail-info document.
  #
  # xml - String containing the XML document.
  #
  # Returns a Hash keyed by element name:
  #   * text elements ("video_id", "title", ...) are stored as Strings,
  #   * counter-like elements ("view_counter", "user_id", ...) as Integers,
  #   * "first_retrieve" as whatever Convert.iso8601ToUnix returns,
  #   * "length" ("mm:ss") as a number of seconds,
  #   * tags as Arrays under "tags_<domain>", one key per <tags domain="...">.
  def getThumbInfo(xml)
    doc = open_reader(xml)
    parsed = {}
    category = ""

    begin
      while doc.read
        next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

        case doc.name
        when "video_id", "title", "description", "thumbnail_url",
             "movie_type", "last_res_body", "watch_url", "thumb_type"
          label = doc.name
          doc.read # step onto the text node holding the value
          parsed[label] = doc.value
        when "size_high", "size_low", "view_counter", "comment_num",
             "mylist_counter", "embeddable", "no_live_play",
             "user_id"
          label = doc.name
          doc.read
          parsed[label] = doc.value.to_i
        when "first_retrieve"
          doc.read
          parsed["first_retrieve"] = Convert.iso8601ToUnix(doc.value)
        when "length"
          doc.read
          parsed["length"] = mmss_to_seconds(doc.value)
        when "tags"
          doc.move_to_attribute("domain")
          category = doc.value.to_s
          # Create the list only once per domain; the previous
          # `if defined? ...` test was always true and wiped the list
          # whenever the same domain showed up again.
          parsed["tags_" + category] ||= []
        when "tag"
          doc.read
          (parsed["tags_" + category] ||= []).push(doc.value)
        end
      end
    ensure
      # Release the reader even when a malformed document raises.
      doc.close
    end

    parsed
  end

  # Parses the Atom feed of a tag search.
  #
  # xml - String containing the feed.
  #
  # Returns an Array with one Hash per <entry>. Keys: "title", "video_id",
  # "published", "updated", "thumbnail_url", "description", "length"
  # (seconds) and "view" / "res" / "mylist" (Integers).
  #
  # The Array starts out as [{}]: that placeholder Hash absorbs the
  # feed-level <title>/<link>/<updated> that precede the first <entry> and is
  # replaced when the first entry starts. A feed without entries therefore
  # still returns that single placeholder Hash (unchanged legacy behaviour).
  def tagRss(xml)
    doc = open_reader(xml)
    n = -1
    parsed = [{}]

    begin
      while doc.read
        next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

        case doc.name
        when "entry"
          n += 1
          parsed[n] = {}
        when "title"
          doc.read
          parsed[n]["title"] = doc.value
        when "link"
          doc.move_to_attribute("href")
          # The video id is the fifth '/'-separated piece of the link URL.
          parsed[n]["video_id"] = doc.value.split('/')[4]
        when "published", "updated"
          label = doc.name
          doc.read
          parsed[n][label] = Convert.iso8601ToUnix(doc.value)
        when "p"
          doc.move_to_attribute("class")
          case doc.value
          when "nico-thumbnail"
            doc.read # step onto the node carrying the "src" attribute
            doc.move_to_attribute("src")
            parsed[n]["thumbnail_url"] = doc.value
          when "nico-description"
            doc.read
            parsed[n]["description"] = doc.value
          end
        when "strong"
          doc.move_to_attribute("class")
          case doc.value
          when "nico-info-length"
            doc.read
            parsed[n]["length"] = mmss_to_seconds(doc.value)
          when "nico-numbers-view", "nico-numbers-res",
               "nico-numbers-mylist"
            label = doc.value
            doc.read
            # "nico-numbers-" is 13 characters long; what follows is the key.
            # Strip thousands separators first: "12,345".to_i would stop at
            # the comma and yield 12.
            parsed[n][label.slice(13, 99)] = doc.value.to_s.delete(',').to_i
          end
        end
      end
    ensure
      doc.close
    end

    parsed
  end

  # Parses the Atom feed of a mylist.
  #
  # xml - String containing the feed.
  #
  # Returns a Hash with two keys:
  #   "mylist" - Hash with "title", "mylist_id", "updated" and "author",
  #   "entry"  - Array with one Hash per <entry> ("title", "video_id",
  #              "item_id", "updated", "memo", "thumbnail_url",
  #              "description", "length", "first_retrieve", "view", "res",
  #              "mylist").
  #
  # As in tagRss, "entry" starts out as [{}]; the placeholder Hash receives
  # anything entry-like seen before the first <entry> (e.g. the feed-level
  # <subtitle>) and is replaced when the first entry starts.
  def mylistRss(xml)
    doc = open_reader(xml)
    n = -1
    parsed = { "mylist" => {}, "entry" => [{}] }

    begin
      while doc.read
        next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

        case doc.name
        # <title>, <id> and <updated> are marked up both on the mylist and
        # on every entry, so the value goes to the mylist while no entry
        # has been opened yet (n == -1) and to the current entry afterwards.
        when "title"
          doc.read
          if n == -1
            # Drop the fixed 6-character prefix and 7-character suffix that
            # wrap the mylist name in the feed title.
            tmp = doc.value.to_s.slice(6, 99).to_s
            parsed["mylist"]["title"] = tmp.slice(0, tmp.length - 7)
          else
            parsed["entry"][n]["title"] = doc.value
          end
        when "link"
          if n != -1
            doc.move_to_attribute("href")
            parsed["entry"][n]["video_id"] = Extract.videoId(doc.value)
          end
        when "subtitle"
          doc.read
          parsed["entry"][n]["description"] = doc.value
        when "id"
          doc.read
          if n == -1
            parsed["mylist"]["mylist_id"] = Extract.mylistId(doc.value)
          else
            parsed["entry"][n]["item_id"] = Extract.itemId(doc.value)
          end
        when "updated"
          doc.read
          stamp = Convert.iso8601ToUnix(doc.value)
          if n == -1
            parsed["mylist"]["updated"] = stamp
          else
            # Previously every entry's <updated> overwrote the mylist value.
            parsed["entry"][n]["updated"] = stamp
          end
        when "name"
          doc.read
          parsed["mylist"]["author"] = doc.value
        when "entry"
          n += 1
          parsed["entry"][n] = {}
        when "content"
          doc.read
          html = doc.value
          entry = parsed["entry"][n]

          # Each value is the text right after a fixed opening tag; the
          # number passed to text_after is the length of that opening tag.
          entry["memo"] =
            text_after(html, /<p\sclass\=\"nico-memo\"\>[^\<]{1,}/, 21)
          entry["thumbnail_url"] =
            html[/(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/, 2]
          # '<p class="nico-description">' is 28 characters long; the old
          # offset of 31 cut off the first three characters.
          entry["description"] =
            text_after(html, /<p\sclass\=\"nico-description\"\>[^\<]{1,}/, 28)
          entry["length"] =
            Convert.toSeconds(
              text_after(html, /<strong\sclass\=\"nico-info-length\"\>[^\<]{1,}/, 33)
            )
          entry["first_retrieve"] =
            Convert.japToUnix(
              text_after(html, /<strong\sclass\=\"nico-info-date\"\>[^\<]{1,}/, 31)
            )
          entry["view"] =
            Convert.commaRemover(
              text_after(html, /<strong\sclass\=\"nico-numbers-view\"\>[^\<]{1,}/, 34)
            )
          entry["res"] =
            Convert.commaRemover(
              text_after(html, /<strong\sclass\=\"nico-numbers-res\"\>[^\<]{1,}/, 33)
            )
          entry["mylist"] =
            Convert.commaRemover(
              text_after(html, /<strong\sclass\=\"nico-numbers-mylist\"\>[^\<]{1,}/, 36)
            )
        end
      end
    ensure
      doc.close
    end

    parsed
  end

  # Builds the streaming reader shared by all parsers.
  def open_reader(xml)
    XML::Reader.string(
      xml,
      :options => XML::Parser::Options::NOBLANKS |
                  XML::Parser::Options::NOENT
    )
  end

  # Converts "mm:ss" text into a number of seconds (nil/"" gives 0).
  def mmss_to_seconds(text)
    minutes, seconds = text.to_s.split(/\:/)
    minutes.to_i * 60 + seconds.to_i
  end

  # Returns the text that follows the first prefix_length characters of the
  # first match of pattern in html (at most 999 characters), or nil when
  # pattern does not match.
  def text_after(html, pattern, prefix_length)
    html.slice(pattern).to_s.slice(prefix_length, 999)
  end

  module_function :tagRss
  module_function :mylistRss
  module_function :getThumbInfo

  # Helpers must be module functions too so the public ones can reach them,
  # but they are not part of the public interface.
  module_function :open_reader, :mmss_to_seconds, :text_after
  private_class_method :open_reader, :mmss_to_seconds, :text_after
end
|
data/lib/searcher.rb
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'rubygems'
|
3
|
+
require 'ruby-debug'
|
4
|
+
|
5
|
+
require 'time'
|
6
|
+
require 'mechanize'
|
7
|
+
require 'kconv'
|
8
|
+
|
9
|
+
require 'parser'
|
10
|
+
|
11
|
+
|
12
|
+
# Wait/retry settings handed to the connector when searching by tag.
# NOTE(review): the wait values are presumably seconds - confirm against
# the Connector implementation.
$wait_byTag = {
  # Pacing of ordinary requests
  'consec_count' => 10,    # number of requests sent in one consecutive run
  'consec_wait' => 10,     # wait after a consecutive run of requests
  'each' => 10,            # wait after every single request within a run

  # Reaction to refusals
  'rejected' => 120,       # wait before retrying after access was refused
                           # (the "too many accesses in a short time" page)
  '403' => 600,            # wait before retrying after a 403
  'increment' => 1,        # how much the per-request wait grows from then on
                           # once access has been refused

  # Reaction to failures
  'timeout' => 5,          # wait before retrying after a timeout
  '500' => 600,            # wait before retrying after a 500
  '503' => 600,            # wait before retrying after a 503

  'allowance_time' => 5    # maximum number of retries
}
|
28
|
+
|
29
|
+
# Wait/retry settings for mylist requests; same keys as the tag-search
# settings (request pacing, reactions to refusals and failures, retry limit).
$wait_byMylistLt = {
  # Pacing of ordinary requests
  'consec_count' => 10,
  'consec_wait' => 10,
  'each' => 10,

  # Reaction to refusals
  'rejected' => 120,
  '403' => 600,
  'increment' => 1,

  # Reaction to failures
  'timeout' => 5,
  '500' => 600,
  '503' => 600,

  # Retry limit
  'allowance_time' => 5
}
|
42
|
+
|
43
|
+
# Entry points for searching movies by tag. Both functions create a fresh
# searcher object, run it, and hand every page of results to the given block
# as (result, page).
module GetMovie
  # Runs a GetMovieByTag search.
  #
  # tag     - tag to search for.
  # sort    - sort key understood by the searcher (e.g. 'view_many').
  # waitObj - wait/retry settings passed through to the searcher.
  # block   - called with (result, page) for every page.
  def byTag(tag, sort, waitObj, &block)
    GetMovieByTag.new.execute(tag, sort, waitObj, &block)
  end

  # Same as byTag, but runs a GetMovieByTagLt search.
  def byTagLt(tag, sort, waitObj, &block)
    GetMovieByTagLt.new.execute(tag, sort, waitObj, &block)
  end

  module_function :byTag, :byTagLt
end
|
63
|
+
|
64
|
+
# Paging logic shared by the tag searchers.
#
# Subclasses are expected to set @con (an object answering setWait(waitObj)
# and get(host, entity)) and to implement parse(response).
class GetMovieByTagSuper
  # Query-string fragment for every supported sort key. An unknown key maps
  # to nil and so contributes nothing but its separator.
  SORT_QUERIES = {
    'comment_new'  => '',
    'comment_old'  => 'order=a',
    'view_many'    => 'sort=v',
    'view_few'     => 'sort=v&order=a',
    'comment_many' => 'sort=r',
    'comment_few'  => 'sort=r&order=a',
    'mylist_many'  => 'sort=m',
    'mylist_few'   => 'sort=m&order=a',
    'post_new'     => 'sort=f',
    'post_old'     => 'sort=f&order=a',
    'length_long'  => 'sort=l',
    'length_short' => 'sort=l&order=a'
  }.freeze

  # Fetches result pages one after another, starting at page 1, and passes
  # every parsed page to the block as (result, page).
  #
  # Paging stops as soon as the block returns a truthy value or a request
  # yields no response. Note that this method shadows Kernel#loop for
  # instances of this class.
  #
  # tag     - tag to search for.
  # sort    - one of the SORT_QUERIES keys.
  # method  - "atom" requests the Atom feed; anything else the plain page.
  # waitObj - wait/retry settings handed to the connector.
  #
  # Returns nil.
  def loop(tag, sort, method, waitObj, &block)
    page = 1
    finished = false

    until finished
      response = get(tag, sort, page, method, waitObj)
      finished = response ? block.call(parse(response), page) : true
      page += 1
    end
  end

  private

  # Requests one page of the tag search through @con and returns whatever
  # the connector's get returns.
  def get(tag, sort, page, method, waitObj)
    query = []
    query.push("page=#{page}") unless page == 1
    query.push(SORT_QUERIES[sort])
    query.push("rss=atom&numbers=1") if method == "atom"

    entity = '/tag/' + (tag + "?" + query.join('&'))

    @con.setWait(waitObj)
    @con.get('www.nicovideo.jp', entity)
  end
end
|
136
|
+
|
137
|
+
|
138
|
+
# Tag searcher that reads the values out of the HTML search-result page
# held by the connector's mechanize agent (@con.mech.page).
class GetMovieByTag < GetMovieByTagSuper
  def initialize
    @NumOfSearched = 32
    @incrAmt = 0.2

    @con = Connector.new('mech')

    # XPath expressions locating each value inside the result HTML
    @videoIdXP = "//div[@class='uad_thumbfrm']/table/tr/td/p/a"
    @lengthXP  = "//div[@class='uad_thumbfrm']/table/tr/td/p[2]/span"
    @viewXP    = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[1]/strong"
    @resXP     = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[2]/strong"
    @mylistXP  = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[3]/a/strong"
    @adXP      = "//div[@class='uad_thumbfrm']/table/tr/td[2]/div/nobr[4]/a/strong"
  end

  # Extracts one movie from the current page.
  #
  # movieNum - index used to pick the movie out of each XPath match list.
  #            NOTE(review): the inherited loop calls parse(response);
  #            confirm what the connector returns for 'mech' requests.
  #
  # Returns a one-element Array holding a Hash with "video_id", "length"
  # (seconds), "view", "res", "mylist" and "ad".
  def parse(movieNum)
    video_id = /(sm|nm)[0-9]{1,}/.match(node_at(@videoIdXP, movieNum)['href'])[0]
    minutes, seconds = node_at(@lengthXP, movieNum).text.split(/\:/)
    length = minutes.to_i * 60 + seconds.to_i
    view   = count_at(@viewXP, movieNum)
    res    = count_at(@resXP, movieNum)
    mylist = count_at(@mylistXP, movieNum)
    ad     = count_at(@adXP, movieNum)

    [
      {
        "video_id" => video_id,
        "length"   => length,
        "view"     => view,
        "res"      => res,
        "mylist"   => mylist,
        "ad"       => ad
      }
    ]
  end

  # Pages through the search with the "mech" method, passing every
  # (result, page) pair to the block.
  def execute(tag, sort, waitObj, &block)
    loop(tag, sort, "mech", waitObj, &block)
  end

  private

  # Node at position index among the matches of xpath on the current page.
  def node_at(xpath, index)
    @con.mech.page.search(xpath)[index]
  end

  # Text of that node as an Integer, thousands separators removed.
  def count_at(xpath, index)
    node_at(xpath, index).text.gsub(/\,/, '').to_i
  end
end
|
185
|
+
|
186
|
+
# Tag searcher that works on the Atom feed of the tag search instead of the
# HTML result page.
class GetMovieByTagLt < GetMovieByTagSuper
  def initialize
    @NumOfSearched = 32
    @incrAmt = 0.2
    @con = Connector.new('atom')
  end

  # Turns one feed document into the Array of entry Hashes produced by
  # NicoParser.tagRss.
  def parse(xml)
    NicoParser.tagRss(xml)
  end

  # Pages through the search with the "atom" method, passing every
  # (result, page) pair to the block.
  def execute(tag, sort, waitObj, &block)
    loop(tag, sort, "atom", waitObj, &block)
  end
end
|
203
|
+
|
204
|
+
|
205
|
+
|