nicoscraper 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: nicoscraper
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.2.4
5
+ version: 0.2.5
6
6
  platform: ruby
7
7
  authors:
8
8
  - Masami Yonehara
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-09-23 00:00:00 Z
13
+ date: 2011-09-25 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: damerau-levenshtein
@@ -19,13 +19,24 @@ dependencies:
19
19
  requirements:
20
20
  - - ">="
21
21
  - !ruby/object:Gem::Version
22
- version: "0"
22
+ version: 0.5.3
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: *id001
26
26
  - !ruby/object:Gem::Dependency
27
- name: rake
27
+ name: libxml-ruby
28
28
  requirement: &id002 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 2.2.2
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: rake
39
+ requirement: &id003 !ruby/object:Gem::Requirement
29
40
  none: false
30
41
  requirements:
31
42
  - - "="
@@ -33,10 +44,10 @@ dependencies:
33
44
  version: 0.8.7
34
45
  type: :development
35
46
  prerelease: false
36
- version_requirements: *id002
47
+ version_requirements: *id003
37
48
  - !ruby/object:Gem::Dependency
38
49
  name: shoulda
39
- requirement: &id003 !ruby/object:Gem::Requirement
50
+ requirement: &id004 !ruby/object:Gem::Requirement
40
51
  none: false
41
52
  requirements:
42
53
  - - ">="
@@ -44,10 +55,10 @@ dependencies:
44
55
  version: "0"
45
56
  type: :development
46
57
  prerelease: false
47
- version_requirements: *id003
58
+ version_requirements: *id004
48
59
  - !ruby/object:Gem::Dependency
49
60
  name: bundler
50
- requirement: &id004 !ruby/object:Gem::Requirement
61
+ requirement: &id005 !ruby/object:Gem::Requirement
51
62
  none: false
52
63
  requirements:
53
64
  - - ~>
@@ -55,10 +66,10 @@ dependencies:
55
66
  version: 1.0.0
56
67
  type: :development
57
68
  prerelease: false
58
- version_requirements: *id004
69
+ version_requirements: *id005
59
70
  - !ruby/object:Gem::Dependency
60
71
  name: jeweler
61
- requirement: &id005 !ruby/object:Gem::Requirement
72
+ requirement: &id006 !ruby/object:Gem::Requirement
62
73
  none: false
63
74
  requirements:
64
75
  - - ~>
@@ -66,10 +77,10 @@ dependencies:
66
77
  version: 1.6.4
67
78
  type: :development
68
79
  prerelease: false
69
- version_requirements: *id005
80
+ version_requirements: *id006
70
81
  - !ruby/object:Gem::Dependency
71
82
  name: rcov
72
- requirement: &id006 !ruby/object:Gem::Requirement
83
+ requirement: &id007 !ruby/object:Gem::Requirement
73
84
  none: false
74
85
  requirements:
75
86
  - - ">="
@@ -77,7 +88,7 @@ dependencies:
77
88
  version: "0"
78
89
  type: :development
79
90
  prerelease: false
80
- version_requirements: *id006
91
+ version_requirements: *id007
81
92
  description: "It scrape movies and mylists of Niconico douga.\n "
82
93
  email: zeitdiebe@gmail.com
83
94
  executables: []
@@ -96,15 +107,18 @@ files:
96
107
  - Rakefile
97
108
  - VERSION
98
109
  - index.html
99
- - lib/connector.rb
100
- - lib/converter.rb
101
- - lib/movie.rb
102
- - lib/mylist.rb
103
- - lib/namespace.rb
104
- - lib/parser.rb
105
- - lib/searcher.rb
106
- - nicoscraper.gemspec
110
+ - lib/classes/connector.rb
111
+ - lib/classes/converter.rb
112
+ - lib/classes/header.rb
113
+ - lib/classes/movie.rb
114
+ - lib/classes/mylist.rb
115
+ - lib/classes/parser.rb
116
+ - lib/classes/searcher.rb
117
+ - lib/classes/tools.rb
118
+ - lib/config/wait.rb
119
+ - lib/nicoscraper.rb
107
120
  - test/movie_spec.rb
121
+ - test/searcher_spec.rb
108
122
  homepage: http://github.com/hdemon/nicoscraper
109
123
  licenses:
110
124
  - MIT
@@ -118,7 +132,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
118
132
  requirements:
119
133
  - - ">="
120
134
  - !ruby/object:Gem::Version
121
- hash: 893829155072320695
135
+ hash: 2919755010107136156
122
136
  segments:
123
137
  - 0
124
138
  version: "0"
data/lib/connector.rb DELETED
@@ -1,364 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
- $:.unshift File.dirname(__FILE__)
3
-
4
- require 'rubygems'
5
- require 'ruby-debug'
6
- require 'net/http'
7
-
8
- require 'namespace.rb'
9
-
10
- module Nicos::Connector
11
- class Connector
12
- def initialize
13
- # デフォルトのウェイト設定
14
- @seqTime = 0
15
-
16
- @waitConfig = {
17
- 'seqAccLimit' => 10, # 連続してリクエストする回数
18
- 'afterSeq' => 10, # 連続リクエスト後のウェイト
19
- 'each' => 1, # 連続リクエスト時の、1リクエスト毎のウェイト
20
-
21
- 'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
22
- '' => 100,
23
-
24
- 'deniedSeqReq'=> {
25
- 'retryLimit' => 3,
26
- 'wait' => 120
27
- },
28
-
29
- 'serverIsBusy'=> {
30
- 'retryLimit' => 3,
31
- 'wait' => 120
32
- },
33
-
34
- 'serviceUnavailable' => {
35
- 'retryLimit' => 3,
36
- 'wait' => 120
37
- },
38
-
39
- 'timedOut' => {
40
- 'retryLimit' => 3,
41
- 'wait' => 10
42
- }
43
- }
44
-
45
- @result = {}
46
- end
47
-
48
- private
49
-
50
- def notPublic
51
- # マイリスト非公開のときに403になる。後で専用の処理を入れるべき。
52
- puts "This movie/mylist is not public."
53
- @result = "notPublic"
54
- return { "order" => "terminate" }
55
- end
56
-
57
- def limInCommunity
58
- puts "This movie/mylist is limited in comunity members."
59
- # ex. item_id -> 1294702905
60
- @result = "limInCommunity"
61
- return { "order" => "terminate" }
62
- end
63
-
64
- def notFound
65
- puts "This movie/mylist is not found."
66
- @result = "notFound"
67
- return { "order" => "terminate" }
68
- end
69
-
70
- def deleted
71
- puts "This movie/mylist is deleted."
72
- @result = "deleted"
73
- return { "order" => "terminate" }
74
- end
75
-
76
- def deniedSeqReq
77
- puts "Denied sequential requests."
78
- sleep @waitConfig["deniedSeqReq"]
79
- @result = "deniedSeqReq"
80
- return { "order" => "retry" }
81
- end
82
-
83
- def serverIsBusy
84
- puts "The server is busy."
85
- sleep @waitConfig["serverIsBusy"]
86
- @result = "serverIsBusy"
87
- return { "order" => "retry" }
88
- end
89
-
90
- def serviceUnavailable
91
- puts "Service unavailable."
92
- sleep @waitConfig["serviceUnavailable"]
93
- @result = "serviceUnavailable"
94
- return { "order" => "retry" }
95
- end
96
-
97
- def timedOut
98
- puts "Request timed out."
99
- sleep @waitConfig["timedOut"]
100
- @result = "timedOut"
101
- return { "order" => "retry" }
102
- end
103
-
104
- def success(resBody)
105
- sleep @waitConfig["each"]
106
- @seqTime += 1
107
-
108
- if @seqTime >= @waitConfig["seqAccLimit"]
109
- sleep @waitConfig["afterSeq"]
110
- @seqTime = 0
111
- end
112
- return { "order" => "success", "body" => resBody }
113
- end
114
-
115
- def wait(status)
116
- puts "Wait for " + waitTime + " second."
117
- sleep @waitConfig[status.to_s]
118
- end
119
-
120
- public
121
-
122
- def setWait(waitConfig)
123
- if waitConfig != nil
124
- @waitConfig = mixin(@waitConfig, waitConfig)
125
- end
126
- end
127
- end
128
-
129
- class Xml < Connector
130
- def get (host, entity)
131
- response = nil
132
-
133
- begin
134
- puts "Request to " + host + entity
135
- Net::HTTP.start(host, 80) { |http|
136
- response = http.get(entity)
137
- }
138
-
139
- rescue => e
140
- puts e
141
- rescue Timeout::Error => e
142
- timeOut
143
-
144
- else
145
- res = case response
146
- when Net::HTTPSuccess
147
- reviewRes( response.body.force_encoding("UTF-8") )
148
- # return response.body.force_encoding("UTF-8")
149
- # when Net::HTTPRedirection
150
- # fetch(response['location'], limit - 1)
151
- when Net::HTTPForbidden
152
- forbidden
153
- when Net::HTTPNotFound
154
- notFound
155
- when Net::HTTPServiceUnavailable
156
- serviceUnavailable
157
- else
158
- unknownError
159
- end
160
- end until res["order"] == "success" ||
161
- res["order"] == "terminate"
162
-
163
- res
164
- end
165
- end
166
-
167
- class MylistAtom < Xml
168
- private
169
-
170
- def forbidden
171
- # マイリストが非公開の場合、html/Atomのどちらへのリクエストであっても、403が返ってくる。
172
- notPublic
173
- end
174
-
175
- def reviewRes(resBody)
176
- if # アクセス集中時
177
- /大変ご迷惑をおかけいたしますが、しばらく時間をあけてから再度検索いただくようご協力をお願いいたします。/ =~
178
- resBody.force_encoding("UTF-8")
179
- then
180
- serverIsBusy
181
- else
182
- success(resBody)
183
- end
184
- end
185
- end
186
-
187
- class TagAtom < Xml
188
- private
189
-
190
- def forbidden
191
- # マイリストが非公開の場合、html/Atomのどちらへのリクエストであっても、403が返ってくる。
192
- notPublic
193
- end
194
-
195
- def reviewRes(resBody)
196
- if # アクセス集中時
197
- /大変ご迷惑をおかけいたしますが、しばらく時間をあけてから再度検索いただくようご協力をお願いいたします。/ =~
198
- resBody.force_encoding("UTF-8")
199
- then
200
- serverIsBusy
201
- else
202
- success(resBody)
203
- end
204
- end
205
- end
206
-
207
- class GetThumbInfo < Xml
208
- private
209
-
210
- def reviewRes(resBody)
211
- r = resBody.force_encoding("UTF-8")
212
-
213
- if # getThumbInfoは、該当する動画がない・削除済み・コミュニティ限定でも200が返ってくる。
214
- /<nicovideo_thumb_response\sstatus=\"fail\">/ =~ r
215
- if /<code>NOT_FOUND<\/code>/ =~ r
216
- notFound
217
- elsif /<code>DELETED<\/code>/ =~ r
218
- deleted
219
- elsif /<code>COMMUNITY<\/code>/ =~ r
220
- limInCommunity
221
- else
222
- serverIsBusy
223
- end
224
- else
225
- success(resBody)
226
- end
227
- end
228
- end
229
-
230
- =begin
231
- class HtmlConnector < Connector
232
- def initialize(mode)
233
- @mode = mode
234
- # デフォルトのウェイト設定
235
- @waitConfig = {
236
- 'consec_count' => 10, # 連続してリクエストする回数
237
- 'consec_wait' => 10, # 連続リクエスト後のウェイト
238
- 'each' => 10, # 連続リクエスト時の、1リクエスト毎のウェイト
239
-
240
- '200-abnormal' => 300, # アクセス拒絶時(「短時間での連続アクセスは・・・」)の場合の再試行までの時間
241
- 'unavailable' => 10,
242
- '403' => 300, # "403"時の再試行までのウェイト
243
- '404' => 300, # "403"時の再試行までのウェイト
244
- 'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
245
-
246
- 'timeout' => 10, # タイムアウト時の、再試行までのウェイト
247
- '500' => 10, # "500"時の再試行までのウェイト
248
- '503' => 10, # "503"時の再試行までのウェイト
249
-
250
- 'retryLimit' => 3 # 再試行回数の限度
251
- }
252
-
253
- # 1つの検索結果画面に表示される動画の数。現時点では32個がデフォルトの模様。
254
- @NumOfSearched = 32
255
-
256
- @mech = Mechanize.new
257
- # メモリ節約のため、Mechanizeの履歴機能を切る。
258
- @mech.max_history = 1
259
-
260
- @consec_count = 0
261
- end
262
-
263
- public
264
-
265
- def errorStatus(ex)
266
- # 再試行回数が
267
- @retryTime += 1
268
- if @retryTime >= @wait['allowance_time']
269
- return false
270
- end
271
-
272
- case ex.response_code
273
- when '403' then
274
- sleep @wait['403']
275
- warn "403"
276
- when '500' then
277
- sleep @wait['500']
278
- warn "500"
279
- when '503' then
280
- sleep @wait['503']
281
- warn "503"
282
- else
283
- warn "Server error: #{ex.code}"
284
- return false
285
- end
286
-
287
- @connection = false
288
- @failed += 1
289
- end
290
-
291
- def htmlReq (url, request, procedure)
292
- @failed = 0
293
-
294
- # 再試行ループ
295
- begin
296
- eachWait
297
- @connection = nil
298
- request.call(url)
299
-
300
- # タイムアウト時処理
301
- rescue TimeoutError
302
- timeOut
303
- retry
304
-
305
- # Mechanizeでアクセスし、200以外のステータスが返ってきた時
306
- # 実際に該当するコードが返ってきたことがないので、正常に動くか不明
307
- rescue Mechanize::ResponseCodeError => ex
308
- if errorStatus(ex) then retry
309
- else break end
310
-
311
- # HTTP Status:200時の処理
312
- else
313
- procedure.call
314
-
315
- # 失敗カウントが指定回数を超えたらループを終わる。
316
- if @failed >= @wait['allowance_time'] then
317
- puts 'Exceeded the limit of retry time.'
318
- @connection = false
319
- break
320
- end
321
- end until @connection
322
-
323
- # 連続アクセスカウント+1
324
- @consec_count += 1
325
- # 成功 = true / 失敗 = false
326
- return @connection
327
- end
328
-
329
- def htmlGet (host, entity)
330
- htmlReq(
331
- host + entity,
332
- lambda { |url|
333
- t = Thread.new do
334
- @mech.get(url)
335
- puts "Requesting for " + url
336
- end
337
- t.join
338
- },
339
- # HTTP Status:200時の処理
340
- lambda {
341
- # 連続アクセス拒絶メッセージが返ってきた時
342
- if /短時間での連続アクセスはご遠慮ください/ =~ @mech.page.search('/html').text then
343
- puts 'Access rejected.'
344
- @connection = false
345
- @failed += 1
346
-
347
- # ウェイトを置いた後、今後のページ毎のウェイトを増やす。
348
- puts 'Waiting for ' + @wait['rejected'] + 's.'
349
- sleep @wait['rejected']
350
- @wait['each'] += @wait['increment']
351
- puts 'Increased each @wait by ' + @wait['increment'] + 'sec.'
352
- else
353
- @connection = true
354
- end
355
- }
356
- )
357
-
358
- return @mech.page
359
- end
360
-
361
- attr_reader :mech
362
- end
363
- =end
364
- end
data/lib/converter.rb DELETED
@@ -1,72 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
- $:.unshift File.dirname(__FILE__)
3
-
4
- require 'rubygems'
5
- require 'xml'
6
- require 'time'
7
- require 'namespace.rb'
8
-
9
- module Nicos::Converter #:nodoc:
10
- def iso8601ToUnix(str)
11
- Time.strptime(str, "%Y-%m-%dT%H:%M:%S").to_i
12
- end
13
- module_function :iso8601ToUnix
14
-
15
- def japToUnix(str)
16
- str.gsub!(/年|月/, '-')
17
- .gsub!(/日/, 'T')
18
- .gsub!(/:/, ':')
19
- .gsub!(/\s/, '')
20
- iso8601ToUnix(str)
21
- end
22
- module_function :japToUnix
23
-
24
- def toSeconds(lengthStr)
25
- # lengthStr = "mm:ss"
26
- lengthStr = lengthStr.split(/\:/)
27
- lengthStr[0].to_i * 60 + lengthStr[1].to_i
28
- end
29
- module_function :toSeconds
30
-
31
- def commaRemover(str)
32
- str.gsub(/\,/, '').to_i
33
- end
34
- module_function :commaRemover
35
- end
36
-
37
- module Nicos::Extractor #:nodoc:
38
- def mylistId(str)
39
- /(mylist\/)([0-9]{1,})/ =~ str
40
- $2.to_i
41
- end
42
- module_function :mylistId
43
-
44
- def itemId(str)
45
- /(watch\/)([0-9]{1,})/ =~ str
46
- $2.to_i
47
- end
48
- module_function :itemId
49
-
50
- def videoId(str)
51
- /(http:\/\/www.nicovideo.jp\/watch\/)((sm|nm)[0-9]{1,})/ =~ str
52
- $2
53
- end
54
- module_function :videoId
55
- end
56
-
57
- module Nicos::Unicode
58
- def escape(str)
59
- ary = str.unpack("U*").map!{|i| "\\u#{i.to_s(16)}"}
60
- ary.join
61
- end
62
-
63
- UNESCAPE_WORKER_ARRAY = []
64
- def unescape(str)
65
- str.gsub(/\\u([0-9a-f]{4})/) {
66
- UNESCAPE_WORKER_ARRAY[0] = $1.hex
67
- UNESCAPE_WORKER_ARRAY.pack("U")
68
- }
69
- end
70
-
71
- module_function :escape, :unescape
72
- end