nicoscraper 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/lib/connector.rb +234 -143
- data/lib/movie.rb +21 -21
- data/lib/mylist.rb +20 -55
- data/lib/nicoscraper.rb +1 -1
- data/lib/parser.rb +28 -49
- data/lib/searcher.rb +36 -85
- data/nicoscraper.gemspec +3 -2
- data/test/movie_spec.rb +125 -0
- metadata +4 -3
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/lib/connector.rb
CHANGED
@@ -1,86 +1,264 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
+
$:.unshift File.dirname(__FILE__)
|
3
|
+
|
2
4
|
require 'rubygems'
|
3
5
|
require 'ruby-debug'
|
4
6
|
require 'net/http'
|
5
7
|
|
8
|
+
|
6
9
|
class Connector
|
7
|
-
def initialize
|
8
|
-
@mode = mode
|
10
|
+
def initialize
|
9
11
|
# デフォルトのウェイト設定
|
12
|
+
@seqTime = 0
|
13
|
+
|
10
14
|
@waitConfig = {
|
11
|
-
'
|
12
|
-
'
|
13
|
-
'each'
|
14
|
-
|
15
|
-
'200-abnormal' => 1, # アクセス拒絶時(「短時間での連続アクセスは・・・」)の場合の再試行までの時間
|
16
|
-
'unavailable' => 10,
|
17
|
-
'403' => 1, # "403"時の再試行までのウェイト
|
18
|
-
'404' => 1, # "403"時の再試行までのウェイト
|
19
|
-
'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
|
15
|
+
'seqAccLimit' => 10, # 連続してリクエストする回数
|
16
|
+
'afterSeq' => 10, # 連続リクエスト後のウェイト
|
17
|
+
'each' => 1, # 連続リクエスト時の、1リクエスト毎のウェイト
|
20
18
|
|
21
|
-
'
|
22
|
-
'
|
23
|
-
|
24
|
-
|
25
|
-
|
19
|
+
'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
|
20
|
+
'' => 100,
|
21
|
+
|
22
|
+
'deniedSeqReq'=> {
|
23
|
+
'retryLimit' => 3,
|
24
|
+
'wait' => 120
|
25
|
+
},
|
26
|
+
|
27
|
+
'serverIsBusy'=> {
|
28
|
+
'retryLimit' => 3,
|
29
|
+
'wait' => 120
|
30
|
+
},
|
31
|
+
|
32
|
+
'serviceUnavailable' => {
|
33
|
+
'retryLimit' => 3,
|
34
|
+
'wait' => 120
|
35
|
+
},
|
36
|
+
|
37
|
+
'timedOut' => {
|
38
|
+
'retryLimit' => 3,
|
39
|
+
'wait' => 10
|
40
|
+
}
|
26
41
|
}
|
27
|
-
|
28
|
-
# 1つの検索結果画面に表示される動画の数。現時点では10個。
|
29
|
-
@NumOfSearched = 32
|
30
|
-
|
31
|
-
if @mode == "mech"
|
32
|
-
@mech = Mechanize.new
|
33
|
-
# メモリ節約のため、Mechanizeの履歴機能をオフにする。
|
34
|
-
@mech.max_history = 1
|
35
|
-
end
|
36
42
|
|
37
|
-
@
|
43
|
+
@result = {}
|
38
44
|
end
|
39
45
|
|
40
46
|
private
|
41
47
|
|
42
|
-
def
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
def notPublic
|
49
|
+
# マイリスト非公開のときに403になる。後で専用の処理を入れるべき。
|
50
|
+
puts "This movie/mylist is not public."
|
51
|
+
@result = "notPublic"
|
52
|
+
return { "order" => "terminate" }
|
53
|
+
end
|
54
|
+
|
55
|
+
def limInCommunity
|
56
|
+
puts "This movie/mylist is limited in comunity members."
|
57
|
+
# ex. item_id -> 1294702905
|
58
|
+
@result = "limInCommunity"
|
59
|
+
return { "order" => "terminate" }
|
60
|
+
end
|
61
|
+
|
62
|
+
def notFound
|
63
|
+
puts "This movie/mylist is not found."
|
64
|
+
@result = "notFound"
|
65
|
+
return { "order" => "terminate" }
|
66
|
+
end
|
67
|
+
|
68
|
+
def deleted
|
69
|
+
puts "This movie/mylist is deleted."
|
70
|
+
@result = "deleted"
|
71
|
+
return { "order" => "terminate" }
|
72
|
+
end
|
73
|
+
|
74
|
+
def deniedSeqReq
|
75
|
+
puts "Denied sequential requests."
|
76
|
+
sleep @waitConfig["deniedSeqReq"]
|
77
|
+
@result = "deniedSeqReq"
|
78
|
+
return { "order" => "retry" }
|
79
|
+
end
|
80
|
+
|
81
|
+
def serverIsBusy
|
82
|
+
puts "The server is busy."
|
83
|
+
sleep @waitConfig["serverIsBusy"]
|
84
|
+
@result = "serverIsBusy"
|
85
|
+
return { "order" => "retry" }
|
86
|
+
end
|
87
|
+
|
88
|
+
def serviceUnavailable
|
89
|
+
puts "Service unavailable."
|
90
|
+
sleep @waitConfig["serviceUnavailable"]
|
91
|
+
@result = "serviceUnavailable"
|
92
|
+
return { "order" => "retry" }
|
93
|
+
end
|
94
|
+
|
95
|
+
def timedOut
|
96
|
+
puts "Request timed out."
|
97
|
+
sleep @waitConfig["timedOut"]
|
98
|
+
@result = "timedOut"
|
99
|
+
return { "order" => "retry" }
|
100
|
+
end
|
101
|
+
|
102
|
+
def success(resBody)
|
103
|
+
sleep @waitConfig["each"]
|
104
|
+
@seqTime += 1
|
105
|
+
|
106
|
+
if @seqTime >= @waitConfig["seqAccLimit"]
|
107
|
+
sleep @waitConfig["afterSeq"]
|
108
|
+
@seqTime = 0
|
51
109
|
end
|
52
|
-
return
|
110
|
+
return { "order" => "success", "body" => resBody }
|
53
111
|
end
|
54
|
-
|
55
|
-
|
56
|
-
|
112
|
+
|
113
|
+
def wait(status)
|
114
|
+
puts "Wait for " + waitTime + " second."
|
115
|
+
sleep @waitConfig[status.to_s]
|
116
|
+
end
|
117
|
+
|
118
|
+
public
|
119
|
+
|
57
120
|
def setWait(waitConfig)
|
58
121
|
if waitConfig != nil
|
59
122
|
@waitConfig = mixin(@waitConfig, waitConfig)
|
60
123
|
end
|
61
124
|
end
|
125
|
+
end
|
62
126
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
127
|
+
|
128
|
+
class XmlConnector < Connector
|
129
|
+
def get (host, entity)
|
130
|
+
response = nil
|
131
|
+
|
132
|
+
begin
|
133
|
+
puts "Request to " + host + entity
|
134
|
+
Net::HTTP.start(host, 80) { |http|
|
135
|
+
response = http.get(entity)
|
136
|
+
}
|
137
|
+
|
138
|
+
rescue => e
|
139
|
+
puts e
|
140
|
+
rescue Timeout::Error => e
|
141
|
+
timeOut
|
142
|
+
|
143
|
+
else
|
144
|
+
res = case response
|
145
|
+
when Net::HTTPSuccess
|
146
|
+
reviewRes( response.body.force_encoding("UTF-8") )
|
147
|
+
# return response.body.force_encoding("UTF-8")
|
148
|
+
# when Net::HTTPRedirection
|
149
|
+
# fetch(response['location'], limit - 1)
|
150
|
+
when Net::HTTPForbidden
|
151
|
+
forbidden
|
152
|
+
when Net::HTTPNotFound
|
153
|
+
notFound
|
154
|
+
when Net::HTTPServiceUnavailable
|
155
|
+
serviceUnavailable
|
156
|
+
else
|
157
|
+
unknownError
|
158
|
+
end
|
159
|
+
end until res["order"] == "success" ||
|
160
|
+
res["order"] == "terminate"
|
161
|
+
|
162
|
+
res
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
class MylistAtomConnector < XmlConnector
|
167
|
+
private
|
168
|
+
|
169
|
+
def forbidden
|
170
|
+
# マイリストが非公開の場合、html/Atomのどちらへのリクエストであっても、403が返ってくる。
|
171
|
+
notPublic
|
172
|
+
end
|
173
|
+
|
174
|
+
def reviewRes(resBody)
|
175
|
+
if # アクセス集中時
|
176
|
+
/大変ご迷惑をおかけいたしますが、しばらく時間をあけてから再度検索いただくようご協力をお願いいたします。/ =~
|
177
|
+
resBody.force_encoding("UTF-8")
|
178
|
+
then
|
179
|
+
serverIsBusy
|
180
|
+
else
|
181
|
+
success(resBody)
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
class SearchByTagAtomConnector < XmlConnector
|
187
|
+
private
|
188
|
+
|
189
|
+
def forbidden
|
190
|
+
# マイリストが非公開の場合、html/Atomのどちらへのリクエストであっても、403が返ってくる。
|
191
|
+
notPublic
|
192
|
+
end
|
193
|
+
|
194
|
+
def reviewRes(resBody)
|
195
|
+
if # アクセス集中時
|
196
|
+
/大変ご迷惑をおかけいたしますが、しばらく時間をあけてから再度検索いただくようご協力をお願いいたします。/ =~
|
197
|
+
resBody.force_encoding("UTF-8")
|
198
|
+
then
|
199
|
+
serverIsBusy
|
200
|
+
else
|
201
|
+
success(resBody)
|
202
|
+
end
|
203
|
+
end
|
204
|
+
end
|
205
|
+
|
206
|
+
class GetThumbInfoConnector < XmlConnector
|
207
|
+
private
|
208
|
+
|
209
|
+
def reviewRes(resBody)
|
210
|
+
r = resBody.force_encoding("UTF-8")
|
211
|
+
|
212
|
+
if # getThumbInfoは、該当する動画がない・削除済み・コミュニティ限定でも200が返ってくる。
|
213
|
+
/<nicovideo_thumb_response\sstatus=\"fail\">/ =~ r
|
214
|
+
if /<code>NOT_FOUND<\/code>/ =~ r
|
215
|
+
notFound
|
216
|
+
elsif /<code>DELETED<\/code>/ =~ r
|
217
|
+
deleted
|
218
|
+
elsif /<code>COMMUNITY<\/code>/ =~ r
|
219
|
+
limInCommunity
|
220
|
+
else
|
221
|
+
serverIsBusy
|
73
222
|
end
|
74
|
-
|
75
|
-
|
223
|
+
else
|
224
|
+
success(resBody)
|
225
|
+
end
|
76
226
|
end
|
227
|
+
end
|
77
228
|
|
78
|
-
|
79
|
-
|
80
|
-
@
|
81
|
-
|
82
|
-
|
229
|
+
class HtmlConnector < Connector
|
230
|
+
def initialize(mode)
|
231
|
+
@mode = mode
|
232
|
+
# デフォルトのウェイト設定
|
233
|
+
@waitConfig = {
|
234
|
+
'consec_count' => 10, # 連続してリクエストする回数
|
235
|
+
'consec_wait' => 10, # 連続リクエスト後のウェイト
|
236
|
+
'each' => 10, # 連続リクエスト時の、1リクエスト毎のウェイト
|
237
|
+
|
238
|
+
'200-abnormal' => 300, # アクセス拒絶時(「短時間での連続アクセスは・・・」)の場合の再試行までの時間
|
239
|
+
'unavailable' => 10,
|
240
|
+
'403' => 300, # "403"時の再試行までのウェイト
|
241
|
+
'404' => 300, # "403"時の再試行までのウェイト
|
242
|
+
'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
|
243
|
+
|
244
|
+
'timeout' => 10, # タイムアウト時の、再試行までのウェイト
|
245
|
+
'500' => 10, # "500"時の再試行までのウェイト
|
246
|
+
'503' => 10, # "503"時の再試行までのウェイト
|
247
|
+
|
248
|
+
'retryLimit' => 3 # 再試行回数の限度
|
249
|
+
}
|
250
|
+
|
251
|
+
# 1つの検索結果画面に表示される動画の数。現時点では32個がデフォルトの模様。
|
252
|
+
@NumOfSearched = 32
|
253
|
+
|
254
|
+
@mech = Mechanize.new
|
255
|
+
# メモリ節約のため、Mechanizeの履歴機能を切る。
|
256
|
+
@mech.max_history = 1
|
257
|
+
|
258
|
+
@consec_count = 0
|
83
259
|
end
|
260
|
+
|
261
|
+
public
|
84
262
|
|
85
263
|
def errorStatus(ex)
|
86
264
|
# 再試行回数が
|
@@ -178,92 +356,5 @@ class Connector
|
|
178
356
|
return @mech.page
|
179
357
|
end
|
180
358
|
|
181
|
-
def xmlGet (host, entity)
|
182
|
-
response = nil
|
183
|
-
xmlDoc = nil
|
184
|
-
retryCount = 0
|
185
|
-
terminate = false
|
186
|
-
|
187
|
-
begin
|
188
|
-
puts "Requesting to " + host + entity
|
189
|
-
Net::HTTP.start(host, 80) { |http|
|
190
|
-
response = http.get(entity)
|
191
|
-
}
|
192
|
-
rescue => e
|
193
|
-
puts e
|
194
|
-
rescue Timeout::Error => e
|
195
|
-
puts e
|
196
|
-
puts "Timeout."
|
197
|
-
# マイリスト非公開のときに、403になる。後で専用の処理を入れるべき。
|
198
|
-
wait("timeout")
|
199
|
-
retryCount += 1
|
200
|
-
|
201
|
-
if retryCount >= @waitConfig["retryLimit"]
|
202
|
-
terminate = true
|
203
|
-
return "failed"
|
204
|
-
end
|
205
|
-
else
|
206
|
-
case response
|
207
|
-
when Net::HTTPSuccess
|
208
|
-
unless abnormalRes(response.body)
|
209
|
-
terminate = true
|
210
|
-
return response.body.force_encoding("UTF-8")
|
211
|
-
end
|
212
|
-
wait("200-abnormal")
|
213
|
-
retryCount += 1
|
214
|
-
when Net::HTTPRedirection
|
215
|
-
fetch(response['location'], limit - 1)
|
216
|
-
when Net::HTTPForbidden
|
217
|
-
puts "Access forbidden."
|
218
|
-
# マイリスト非公開のときに、403になる。後で専用の処理を入れるべき。
|
219
|
-
wait("403")
|
220
|
-
retryCount += 1
|
221
|
-
when Net::HTTPNotFound
|
222
|
-
puts "Http not found."
|
223
|
-
wait("404")
|
224
|
-
retryCount += 1
|
225
|
-
when Net::HTTPServiceUnavailable
|
226
|
-
puts "Access rejected or service unavailable."
|
227
|
-
wait("unavailable")
|
228
|
-
retryCount += 1
|
229
|
-
else
|
230
|
-
puts response.force_encoding("UTF-8")
|
231
|
-
puts "Unknown error."
|
232
|
-
wait("other")
|
233
|
-
retryCount += 1
|
234
|
-
end
|
235
|
-
|
236
|
-
if retryCount >= @waitConfig["retryLimit"]
|
237
|
-
terminate = true
|
238
|
-
return "failed"
|
239
|
-
end
|
240
|
-
end until terminate
|
241
|
-
end
|
242
|
-
|
243
|
-
def abnormalRes(resBody)
|
244
|
-
if
|
245
|
-
# mylistRss アクセス集中時
|
246
|
-
/大変ご迷惑をおかけいたしますが、しばらく時間をあけてから再度検索いただくようご協力をお願いいたします。/ =~ resBody.force_encoding("UTF-8") ||
|
247
|
-
# getThumbInfo失敗時
|
248
|
-
/<nicovideo_thumb_response\sstatus=\"fail\">/ =~ resBody
|
249
|
-
then
|
250
|
-
puts "!!!!"
|
251
|
-
true
|
252
|
-
end
|
253
|
-
end
|
254
|
-
|
255
|
-
def wait(status)
|
256
|
-
sleep @waitConfig[status.to_s]
|
257
|
-
end
|
258
|
-
|
259
|
-
def get (host, entity)
|
260
|
-
case @mode
|
261
|
-
when "html"
|
262
|
-
mechGet(host + entity)
|
263
|
-
when "atom"
|
264
|
-
xmlGet(host, entity)
|
265
|
-
end
|
266
|
-
end
|
267
|
-
|
268
359
|
attr_reader :mech
|
269
360
|
end
|
data/lib/movie.rb
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
+
$:.unshift File.dirname(__FILE__)
|
3
|
+
|
2
4
|
require 'rubygems'
|
3
5
|
require 'ruby-debug'
|
4
6
|
require 'damerau-levenshtein'
|
5
7
|
require 'kconv'
|
6
8
|
|
7
|
-
require 'parser'
|
8
|
-
require 'mylist'
|
9
|
-
require 'connector'
|
9
|
+
require 'parser.rb'
|
10
|
+
require 'mylist.rb'
|
11
|
+
require 'connector.rb'
|
10
12
|
|
11
13
|
class Movie
|
12
14
|
def initialize(video_id)
|
@@ -19,9 +21,9 @@ class Movie
|
|
19
21
|
public
|
20
22
|
|
21
23
|
# 指定されたマイリストに自分が入っていれば、真を返す。
|
22
|
-
def isBelongsTo (
|
24
|
+
def isBelongsTo (mylistId, &block)
|
23
25
|
isBelongs = false
|
24
|
-
thisMl = Mylist.new(
|
26
|
+
thisMl = Mylist.new(mylistId)
|
25
27
|
thisMl.getInfoLt
|
26
28
|
|
27
29
|
thisMl.movies.each { |movie|
|
@@ -29,14 +31,14 @@ class Movie
|
|
29
31
|
}
|
30
32
|
|
31
33
|
if isBelongs
|
32
|
-
puts "\sThis movie is found in mylist/" +
|
34
|
+
puts "\sThis movie is found in mylist/" + mylistId.to_s
|
33
35
|
else
|
34
|
-
puts "\sThis movie is not found in mylist/" +
|
36
|
+
puts "\sThis movie is not found in mylist/" + mylistId.to_s
|
35
37
|
end
|
36
38
|
|
37
39
|
# 無駄なアクセスを省くため、マイリスト中の動画に関する追加処理があれば、
|
38
40
|
# ブロックとして実行できる。
|
39
|
-
block.call(thisMl)
|
41
|
+
block.call(thisMl) if block != nil
|
40
42
|
|
41
43
|
return isBelongs
|
42
44
|
end
|
@@ -63,7 +65,7 @@ class Movie
|
|
63
65
|
|
64
66
|
mylistIdAry.each { |_mylistId|
|
65
67
|
belongsTo = isBelongsTo(_mylistId) { |mylistObj|
|
66
|
-
similarity = mylistObj.
|
68
|
+
similarity = mylistObj.getSimilarity
|
67
69
|
puts "\sSimilarity:\t" + similarity.to_s
|
68
70
|
}
|
69
71
|
puts belongsTo
|
@@ -103,22 +105,20 @@ class Movie
|
|
103
105
|
end
|
104
106
|
|
105
107
|
def getInfo
|
106
|
-
con =
|
108
|
+
con = GetThumbInfoConnector.new()
|
107
109
|
host = 'ext.nicovideo.jp'
|
108
110
|
entity = '/api/getthumbinfo/' + @video_id
|
109
111
|
con.setWait(nil)
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
xml == "failed"
|
112
|
+
result = con.get(host, entity)
|
113
|
+
|
114
|
+
if
|
115
|
+
result["order"] == "success"
|
115
116
|
then
|
116
|
-
|
117
|
-
set(
|
117
|
+
parsed = NicoParser.getThumbInfo(result["body"])
|
118
|
+
set(parsed)
|
118
119
|
@available = true
|
119
120
|
else
|
120
121
|
@available = false
|
121
|
-
return "failed"
|
122
122
|
end
|
123
123
|
end
|
124
124
|
|
@@ -239,8 +239,8 @@ class Movie
|
|
239
239
|
@thumb_type = param
|
240
240
|
when "embeddable"
|
241
241
|
@embeddable = param
|
242
|
-
when "
|
243
|
-
@
|
242
|
+
when "no_live_play"
|
243
|
+
@no_live_play = param
|
244
244
|
when "tags_jp"
|
245
245
|
@tags_jp = param
|
246
246
|
when "tags_tw"
|
@@ -290,7 +290,7 @@ class Movie
|
|
290
290
|
attr_accessor :watch_url
|
291
291
|
attr_accessor :thumb_type
|
292
292
|
attr_accessor :embeddable
|
293
|
-
attr_accessor :
|
293
|
+
attr_accessor :no_live_play
|
294
294
|
attr_accessor :tags_jp
|
295
295
|
attr_accessor :tags_tw
|
296
296
|
attr_accessor :user_id
|
data/lib/mylist.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
+
$:.unshift File.dirname(__FILE__)
|
3
|
+
|
2
4
|
require 'rubygems'
|
3
5
|
require 'ruby-debug'
|
4
6
|
require 'kconv'
|
5
7
|
|
6
|
-
require 'parser'
|
7
|
-
require 'movie'
|
8
|
-
require 'connector'
|
8
|
+
require 'parser.rb'
|
9
|
+
require 'movie.rb'
|
10
|
+
require 'connector.rb'
|
9
11
|
|
10
12
|
|
11
13
|
class Mylist
|
@@ -15,49 +17,9 @@ class Mylist
|
|
15
17
|
@available = false
|
16
18
|
end
|
17
19
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
d = 0.0
|
22
|
-
|
23
|
-
puts "matching..."
|
24
|
-
|
25
|
-
# O(n^2)なのでどうにかしたい。しかし、最大でも500C2=125000なので、
|
26
|
-
# 日々の利用については許容できると思う、
|
27
|
-
@movies.each { |myself|
|
28
|
-
@movies.each { |amovieNumther|
|
29
|
-
_d = dl.distance(myself.title, amovieNumther.title)
|
30
|
-
d += _d
|
31
|
-
}
|
32
|
-
}
|
33
|
-
|
34
|
-
similarity = 1 - ( (d / @movies.length) / title.length )
|
35
|
-
puts "Similarity: " + similarity.to_s
|
36
|
-
similarity
|
37
|
-
end
|
38
|
-
|
39
|
-
def userId
|
40
|
-
idGroup = {}
|
41
|
-
samePublisher = false
|
42
|
-
threshold = 0.9
|
43
|
-
|
44
|
-
@movies.each { |movie|
|
45
|
-
idGroup[movie.user_id] += 1
|
46
|
-
}
|
47
|
-
|
48
|
-
idGroup.each { |group|
|
49
|
-
if @movies.length / threshold < group.length
|
50
|
-
samePublicher = true
|
51
|
-
end
|
52
|
-
}
|
53
|
-
|
54
|
-
return samePublisher
|
55
|
-
end
|
56
|
-
|
57
|
-
# 自分がシリーズをまとめたマイリストであるかを判定する。
|
58
|
-
# 判定基準は、1.一定数以上の動画の投稿者が、マイリスト作成者と同じであること。
|
59
|
-
# 2.タイトルの類似度が、定められた基準以上であること。
|
60
|
-
def isSeries
|
20
|
+
# 自分に含まれている動画のタイトルをすべての組み合わせにおいて比較し、
|
21
|
+
# 類似度の平均を返す。
|
22
|
+
def getSimilarity
|
61
23
|
l = @movies.length - 1
|
62
24
|
dlc = DamerauLevenshtein
|
63
25
|
dl = 0.0
|
@@ -180,17 +142,17 @@ class Mylist
|
|
180
142
|
end
|
181
143
|
|
182
144
|
def getInfoLt
|
183
|
-
con =
|
145
|
+
con = MylistAtomConnector.new()
|
184
146
|
host = 'www.nicovideo.jp'
|
185
147
|
puts @mylist_id
|
186
148
|
entity = '/mylist/' + @mylist_id.to_s + '?rss=atom&numbers=1'
|
187
149
|
con.setWait(nil)
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
then
|
193
|
-
parsed = NicoParser.mylistRss(
|
150
|
+
result = con.get(host, entity)
|
151
|
+
|
152
|
+
if
|
153
|
+
result["order"] == "success"
|
154
|
+
then
|
155
|
+
parsed = NicoParser.mylistRss(result["body"])
|
194
156
|
|
195
157
|
parsed["entry"].each { |e|
|
196
158
|
movie = Movie.new(e["video_id"])
|
@@ -201,7 +163,9 @@ class Mylist
|
|
201
163
|
|
202
164
|
set(parsed["mylist"])
|
203
165
|
@available = true
|
204
|
-
|
166
|
+
else
|
167
|
+
@available = false
|
168
|
+
end
|
205
169
|
end
|
206
170
|
|
207
171
|
def set(paramObj)
|
@@ -252,7 +216,8 @@ class Mylist
|
|
252
216
|
attr_accessor :create_time
|
253
217
|
attr_accessor :update_time
|
254
218
|
attr_accessor :icon_id
|
255
|
-
|
219
|
+
attr_accessor :sort_order
|
220
|
+
attr_accessor :author
|
256
221
|
|
257
222
|
attr_accessor :movies
|
258
223
|
end
|
data/lib/nicoscraper.rb
CHANGED
data/lib/parser.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
+
$:.unshift File.dirname(__FILE__)
|
3
|
+
|
2
4
|
require 'rubygems'
|
3
5
|
require 'xml'
|
4
6
|
require 'time'
|
5
|
-
require 'converter'
|
7
|
+
require 'converter.rb'
|
6
8
|
|
7
9
|
module NicoParser
|
8
10
|
public
|
@@ -152,7 +154,7 @@ module NicoParser
|
|
152
154
|
end
|
153
155
|
when "subtitle"
|
154
156
|
doc.read
|
155
|
-
parsed["
|
157
|
+
parsed["mylist"]["description"] = doc.value
|
156
158
|
when "id"
|
157
159
|
if n == -1
|
158
160
|
doc.read
|
@@ -161,7 +163,7 @@ module NicoParser
|
|
161
163
|
else
|
162
164
|
doc.read
|
163
165
|
parsed["entry"][n]["item_id"] =
|
164
|
-
|
166
|
+
Extract.itemId(doc.value)
|
165
167
|
end
|
166
168
|
when "updated"
|
167
169
|
doc.read
|
@@ -176,54 +178,30 @@ module NicoParser
|
|
176
178
|
when "content"
|
177
179
|
doc.read
|
178
180
|
html = doc.value
|
179
|
-
|
180
|
-
memo
|
181
|
-
|
182
|
-
|
183
|
-
).to_s.slice(21, 999)
|
184
|
-
|
181
|
+
|
182
|
+
/(<p\sclass=\"nico-memo\"\>)([^\<]{1,})/ =~ html
|
183
|
+
memo = $2
|
184
|
+
|
185
185
|
/(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/ =~ html
|
186
186
|
thumbnail_url = $2
|
187
|
-
|
188
|
-
description
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
)
|
206
|
-
|
207
|
-
view =
|
208
|
-
Convert.commaRemover(
|
209
|
-
html.slice(
|
210
|
-
/<strong\sclass\=\"nico-numbers-view\"\>[^\<]{1,}/
|
211
|
-
).to_s.slice(34, 999)
|
212
|
-
)
|
213
|
-
|
214
|
-
res =
|
215
|
-
Convert.commaRemover(
|
216
|
-
html.slice(
|
217
|
-
/<strong\sclass\=\"nico-numbers-res\"\>[^\<]{1,}/
|
218
|
-
).to_s.slice(33, 999)
|
219
|
-
)
|
220
|
-
|
221
|
-
mylist =
|
222
|
-
Convert.commaRemover(
|
223
|
-
html.slice(
|
224
|
-
/<strong\sclass\=\"nico-numbers-mylist\"\>[^\<]{1,}/
|
225
|
-
).to_s.slice(36, 999)
|
226
|
-
)
|
187
|
+
|
188
|
+
/(<p\sclass\=\"nico-description\"\>)([^\<]{1,})/ =~ html
|
189
|
+
description = $2
|
190
|
+
|
191
|
+
/(<p\sclass\=\"nico-info-length\"\>)([^\<]{1,})/ =~ html
|
192
|
+
length = $2
|
193
|
+
|
194
|
+
/(<p\sclass\=\"nico-info-date\"\>)([^\<]{1,})/ =~ html
|
195
|
+
first_retrieve = $2
|
196
|
+
|
197
|
+
/(<p\sclass\=\"nico-numbers-view\"\>)([^\<]{1,})/ =~ html
|
198
|
+
view = $2
|
199
|
+
|
200
|
+
/(<p\sclass\=\"nico-numbers-res\"\>)([^\<]{1,})/ =~ html
|
201
|
+
res = $2
|
202
|
+
|
203
|
+
/(<p\sclass\=\"nico-numbers-mylist\"\>)([^\<]{1,})/ =~ html
|
204
|
+
mylist = $2
|
227
205
|
|
228
206
|
parsed["entry"][n]["memo"] = memo
|
229
207
|
parsed["entry"][n]["thumbnail_url"] = thumbnail_url
|
@@ -238,6 +216,7 @@ module NicoParser
|
|
238
216
|
end
|
239
217
|
|
240
218
|
doc.close
|
219
|
+
|
241
220
|
parsed
|
242
221
|
end
|
243
222
|
|
data/lib/searcher.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
1
|
+
# -*- encoding: utf-8 -*-# -*- encoding: utf-8 -*-
|
2
|
+
$:.unshift File.dirname(__FILE__)
|
3
|
+
|
2
4
|
require 'rubygems'
|
3
5
|
require 'ruby-debug'
|
4
6
|
|
@@ -6,65 +8,13 @@ require 'time'
|
|
6
8
|
require 'mechanize'
|
7
9
|
require 'kconv'
|
8
10
|
|
9
|
-
require 'parser'
|
10
|
-
|
11
|
-
|
12
|
-
$wait_byTag = {
|
13
|
-
'consec_count' => 10, # 連続してリクエストする回数
|
14
|
-
'consec_wait' => 10, # 連続リクエスト後のウェイト
|
15
|
-
'each' => 10, # 連続リクエスト時の、1リクエスト毎のウェイト
|
16
|
-
|
17
|
-
'rejected' => 120, # アクセス拒絶時(「短時間での連続アクセスは・・・」)
|
18
|
-
# の場合の再試行までの時間
|
19
|
-
'403' => 600, # "403"時の再試行までのウェイト
|
20
|
-
'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
|
21
|
-
|
22
|
-
'timeout' => 5, # タイムアウト時の、再試行までのウェイト
|
23
|
-
'500' => 600, # "500"時の再試行までのウェイト
|
24
|
-
'503' => 600, # "503"時の再試行までのウェイト
|
25
|
-
|
26
|
-
'allowance_time'=> 5 # 再試行回数の限度
|
27
|
-
}
|
28
|
-
|
29
|
-
$wait_byMylistLt = {
|
30
|
-
'consec_count' => 10,
|
31
|
-
'consec_wait' => 10,
|
32
|
-
'each' => 10,
|
11
|
+
require 'parser.rb'
|
33
12
|
|
34
|
-
'rejected' => 120,
|
35
|
-
'403' => 600,
|
36
|
-
'increment' => 1,
|
37
|
-
'timeout' => 5,
|
38
|
-
'500' => 600,
|
39
|
-
'503' => 600,
|
40
|
-
'allowance_time'=> 5
|
41
|
-
}
|
42
13
|
|
43
|
-
|
44
|
-
public
|
45
|
-
|
46
|
-
def byTag (tag, sort, waitObj, &block)
|
47
|
-
gMByTag = GetMovieByTag.new()
|
48
|
-
gMByTag.execute(tag, sort, waitObj) { |result, page|
|
49
|
-
block.call(result, page)
|
50
|
-
}
|
51
|
-
end
|
52
|
-
|
53
|
-
def byTagLt (tag, sort, waitObj, &block)
|
54
|
-
gMByTagLt = GetMovieByTagLt.new()
|
55
|
-
gMByTagLt.execute(tag, sort, waitObj) { |result, page|
|
56
|
-
block.call(result, page)
|
57
|
-
}
|
58
|
-
end
|
59
|
-
|
60
|
-
module_function :byTag
|
61
|
-
module_function :byTagLt
|
62
|
-
end
|
63
|
-
|
64
|
-
class GetMovieByTagSuper
|
14
|
+
class SearchByTagSuper
|
65
15
|
private
|
66
16
|
|
67
|
-
def get
|
17
|
+
def get(tag, sort, page, method, waitObj)
|
68
18
|
paramAry = []
|
69
19
|
|
70
20
|
case sort
|
@@ -94,7 +44,7 @@ class GetMovieByTagSuper
|
|
94
44
|
sortStr = 'sort=l&order=a'
|
95
45
|
end
|
96
46
|
|
97
|
-
|
47
|
+
paramAry.push("page=#{page}") if page != 1
|
98
48
|
paramAry.push(sortStr)
|
99
49
|
if method == "atom" then paramAry.push("rss=atom&numbers=1") end
|
100
50
|
param = tag + "?" + paramAry.join('&')
|
@@ -102,18 +52,16 @@ class GetMovieByTagSuper
|
|
102
52
|
host = 'www.nicovideo.jp'
|
103
53
|
entity = '/tag/' + param
|
104
54
|
|
105
|
-
@
|
106
|
-
@
|
55
|
+
@connector.setWait(waitObj)
|
56
|
+
@connector.get(host, entity)
|
107
57
|
end
|
108
58
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
page = 1
|
59
|
+
def loop(tag, sort, method, waitObj, &block)
|
60
|
+
termFlag = false
|
61
|
+
page = 1
|
62
|
+
movieObjAry = []
|
114
63
|
|
115
64
|
begin
|
116
|
-
result = []
|
117
65
|
response = get(
|
118
66
|
tag,
|
119
67
|
sort,
|
@@ -122,9 +70,16 @@ class GetMovieByTagSuper
|
|
122
70
|
waitObj
|
123
71
|
)
|
124
72
|
|
125
|
-
if response
|
126
|
-
result = parse(response)
|
127
|
-
|
73
|
+
if response["order"] == "success"
|
74
|
+
result = parse(response["body"])
|
75
|
+
result.each { |each|
|
76
|
+
movie = Movie.new(each["video_id"])
|
77
|
+
each["available"] = true
|
78
|
+
movie.set(each)
|
79
|
+
movieObjAry.push(movie)
|
80
|
+
}
|
81
|
+
|
82
|
+
termFlag = block.call(movieObjAry, page)
|
128
83
|
else
|
129
84
|
termFlag = true
|
130
85
|
end
|
@@ -134,13 +89,12 @@ class GetMovieByTagSuper
|
|
134
89
|
end
|
135
90
|
end
|
136
91
|
|
137
|
-
|
138
|
-
class GetMovieByTag < GetMovieByTagSuper
|
92
|
+
class SearchByTag < SearchByTagSuper
|
139
93
|
def initialize
|
140
|
-
@
|
94
|
+
@numOfSearched = 32
|
141
95
|
@incrAmt = 0.2
|
142
96
|
|
143
|
-
@
|
97
|
+
@connector = Connector.new('mech')
|
144
98
|
|
145
99
|
# HTML中の各パラメータの所在を示すXPath
|
146
100
|
@videoIdXP = "//div[@class='uad_thumbfrm']/table/tr/td/p/a"
|
@@ -154,16 +108,16 @@ class GetMovieByTag < GetMovieByTagSuper
|
|
154
108
|
def parse(movieNum)
|
155
109
|
result = []
|
156
110
|
|
157
|
-
video_id = /(sm|nm)[0-9]{1,}/.match(@
|
158
|
-
lengthStr = @
|
111
|
+
video_id = /(sm|nm)[0-9]{1,}/.match(@connector.mech.page.search(@videoIdXP)[movieNum]['href'])[0]
|
112
|
+
lengthStr = @connector.mech.page.search(@lengthXP)[movieNum].text.split(/\:/)
|
159
113
|
length = lengthStr[0].to_i * 60 + lengthStr[1].to_i
|
160
|
-
view = @
|
114
|
+
view = @connector.mech.page.search(@viewXP)[movieNum]
|
161
115
|
.text.gsub(/\,/, '').to_i
|
162
|
-
res = @
|
116
|
+
res = @connector.mech.page.search(@resXP)[movieNum]
|
163
117
|
.text.gsub(/\,/, '').to_i
|
164
|
-
mylist = @
|
118
|
+
mylist = @connector.mech.page.search(@mylistXP)[movieNum]
|
165
119
|
.text.gsub(/\,/, '').to_i
|
166
|
-
ad = @
|
120
|
+
ad = @connector.mech.page.search(@adXP)[movieNum]
|
167
121
|
.text.gsub(/\,/, '').to_i
|
168
122
|
|
169
123
|
result.push({
|
@@ -183,11 +137,11 @@ class GetMovieByTag < GetMovieByTagSuper
|
|
183
137
|
end
|
184
138
|
end
|
185
139
|
|
186
|
-
class
|
140
|
+
class SearchByTagLt < SearchByTagSuper
|
187
141
|
def initialize
|
188
|
-
@
|
142
|
+
@numOfSearched = 32
|
189
143
|
@incrAmt = 0.2
|
190
|
-
@
|
144
|
+
@connector = SearchByTagAtomConnector.new()
|
191
145
|
end
|
192
146
|
|
193
147
|
def parse(xml)
|
@@ -199,7 +153,4 @@ class GetMovieByTagLt < GetMovieByTagSuper
|
|
199
153
|
block.call(result, page)
|
200
154
|
}
|
201
155
|
end
|
202
|
-
end
|
203
|
-
|
204
|
-
|
205
|
-
|
156
|
+
end
|
data/nicoscraper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{nicoscraper}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = [%q{Masami Yonehara}]
|
12
|
-
s.date = %q{2011-09-
|
12
|
+
s.date = %q{2011-09-22}
|
13
13
|
s.description = %q{It scrape movies and mylists of Niconico douga.
|
14
14
|
}
|
15
15
|
s.email = %q{zeitdiebe@gmail.com}
|
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
|
|
34
34
|
"lib/searcher.rb",
|
35
35
|
"nicoscraper.gemspec",
|
36
36
|
"test/helper.rb",
|
37
|
+
"test/movie_spec.rb",
|
37
38
|
"test/test_nicoscraper.rb"
|
38
39
|
]
|
39
40
|
s.homepage = %q{http://github.com/hdemon/nicoscraper}
|
data/test/movie_spec.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.unshift File.dirname(__FILE__) + "/../lib"
|
3
|
+
|
4
|
+
require 'movie.rb'
|
5
|
+
require 'mylist.rb'
|
6
|
+
require 'searcher.rb'
|
7
|
+
|
8
|
+
describe Movie, "After executiton of 'getInfo' method" do
|
9
|
+
before(:all) do
|
10
|
+
@movie = Movie.new("sm1097445")
|
11
|
+
@movie.getInfo
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should have following values" do
|
15
|
+
@movie.available .should be_true
|
16
|
+
|
17
|
+
@movie.video_id .should == "sm1097445"
|
18
|
+
@movie.title .should == "【初音ミク】みくみくにしてあげる♪【してやんよ】"
|
19
|
+
@movie.description .should_not be_nil
|
20
|
+
@movie.thumbnail_url.should_not be_nil
|
21
|
+
@movie.first_retrieve.should == 1190218922
|
22
|
+
@movie.length .should == 99
|
23
|
+
@movie.movie_type .should == "flv"
|
24
|
+
@movie.size_high .should == 3906547
|
25
|
+
@movie.size_low .should == 1688098
|
26
|
+
|
27
|
+
@movie.view_counter .should_not be_nil
|
28
|
+
@movie.comment_num .should_not be_nil
|
29
|
+
@movie.mylist_counter .should_not be_nil
|
30
|
+
@movie.last_res_body.should_not be_nil
|
31
|
+
|
32
|
+
@movie.watch_url .should == "http://www.nicovideo.jp/watch/sm1097445"
|
33
|
+
@movie.thumb_type .should == "video"
|
34
|
+
@movie.embeddable .should == 1
|
35
|
+
@movie.no_live_play .should == 0
|
36
|
+
@movie.tags_jp .should_not be_nil
|
37
|
+
@movie.tags_tw .should_not be_nil
|
38
|
+
@movie.user_id .should == 70391
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should return true when execute 'isBelongsTo' method." +
|
42
|
+
"passing arguments (1450136)" do
|
43
|
+
result = @movie.isBelongsTo(1450136)
|
44
|
+
result .should_not be_nil
|
45
|
+
result .should be_true
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
describe Movie, "when access with non-existent video_id" do
|
50
|
+
before(:all) do
|
51
|
+
@movie = Movie.new("sm*nonexistent")
|
52
|
+
@movie.getInfo
|
53
|
+
end
|
54
|
+
|
55
|
+
it "should be unavailable" do
|
56
|
+
@movie.available .should be_false
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe Mylist, "After executiton of 'getInfoLt' method" do
|
61
|
+
before(:all) do
|
62
|
+
@mylist = Mylist.new(15196568)
|
63
|
+
@mylist.getInfoLt
|
64
|
+
puts @mylist
|
65
|
+
end
|
66
|
+
|
67
|
+
it "should have following values" do
|
68
|
+
@mylist.available .should be_true
|
69
|
+
|
70
|
+
@mylist.mylist_id .should == 15196568
|
71
|
+
@mylist.user_id .should be_nil
|
72
|
+
@mylist.title .should == "【Oblivion】おっさんの大冒険"
|
73
|
+
@mylist.description .should_not be_nil
|
74
|
+
@mylist.public .should be_nil
|
75
|
+
@mylist.default_sort .should be_nil
|
76
|
+
@mylist.create_time .should be_nil
|
77
|
+
@mylist.update_time .should_not be_nil
|
78
|
+
|
79
|
+
@mylist.icon_id .should be_nil
|
80
|
+
@mylist.movies .should_not be_nil
|
81
|
+
@mylist.movies .should be_kind_of(Array)
|
82
|
+
@mylist.author .should == "おぽこ"
|
83
|
+
end
|
84
|
+
|
85
|
+
it "should return over 0.9 when execute 'getSimilarity' method." +
|
86
|
+
"passing arguments (1450136)" do
|
87
|
+
result = @mylist.getSimilarity
|
88
|
+
result .should_not be_nil
|
89
|
+
result .should >= 0.9
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
describe "When execute 'SearchByTagLt.execute' method " +
|
94
|
+
"passing following argument" do
|
95
|
+
before(:all) do
|
96
|
+
searcher = SearchByTagLt.new()
|
97
|
+
searcher.execute("ゆっくり実況プレイpart1リンク", "post_old", nil) { |result|
|
98
|
+
@result = result
|
99
|
+
}
|
100
|
+
end
|
101
|
+
|
102
|
+
it "should have Array of movie objects." do
|
103
|
+
@result .should be_kind_of(Array)
|
104
|
+
@result[0].should be_instance_of(Movie)
|
105
|
+
end
|
106
|
+
|
107
|
+
it "should contains movie objects that have following structure." do
|
108
|
+
@result[0].available .should be_true
|
109
|
+
|
110
|
+
@result[0].video_id .should_not be_nil
|
111
|
+
@result[0].title .should_not be_nil
|
112
|
+
@result[0].published .should_not be_nil
|
113
|
+
@result[0].updated .should_not be_nil
|
114
|
+
#@result[0].memo .should_not be_nil
|
115
|
+
@result[0].description .should_not be_nil
|
116
|
+
@result[0].thumbnail_url.should_not be_nil
|
117
|
+
@result[0].published .should_not be_nil
|
118
|
+
@result[0].updated .should_not be_nil
|
119
|
+
@result[0].length .should_not be_nil
|
120
|
+
|
121
|
+
@result[0].view_counter .should_not be_nil
|
122
|
+
@result[0].comment_num .should_not be_nil
|
123
|
+
@result[0].mylist_counter.should_not be_nil
|
124
|
+
end
|
125
|
+
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: nicoscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.
|
5
|
+
version: 0.2.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Masami Yonehara
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-09-
|
13
|
+
date: 2011-09-22 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: damerau-levenshtein
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- lib/searcher.rb
|
105
105
|
- nicoscraper.gemspec
|
106
106
|
- test/helper.rb
|
107
|
+
- test/movie_spec.rb
|
107
108
|
- test/test_nicoscraper.rb
|
108
109
|
homepage: http://github.com/hdemon/nicoscraper
|
109
110
|
licenses:
|
@@ -118,7 +119,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
118
119
|
requirements:
|
119
120
|
- - ">="
|
120
121
|
- !ruby/object:Gem::Version
|
121
|
-
hash:
|
122
|
+
hash: -4040185051160948255
|
122
123
|
segments:
|
123
124
|
- 0
|
124
125
|
version: "0"
|