nicoscraper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ lib/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Runtime dependencies of the gem.
# Example:
#   gem "activesupport", ">= 2.3.5"
gem "damerau-levenshtein", ">= 0"

# Development dependencies: everything needed to run rake, tests,
# features, etc.
group :development do
  gem "rake", "0.8.7"
  gem "shoulda", ">= 0"
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.6.4"
  gem "rcov", ">= 0"
end
@@ -0,0 +1,23 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ damerau-levenshtein (0.5.3)
5
+ git (1.2.5)
6
+ jeweler (1.6.4)
7
+ bundler (~> 1.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ rake (0.8.7)
11
+ rcov (0.9.10)
12
+ shoulda (2.11.3)
13
+
14
+ PLATFORMS
15
+ ruby
16
+
17
+ DEPENDENCIES
18
+ bundler (~> 1.0.0)
19
+ damerau-levenshtein
20
+ jeweler (~> 1.6.4)
21
+ rake (= 0.8.7)
22
+ rcov
23
+ shoulda
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Masami Yonehara
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,19 @@
1
+ = nicoscraper
2
+
3
+ Description goes here.
4
+
5
+ == Contributing to nicoscraper
6
+
7
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
8
+ * Check out the issue tracker to make sure someone hasn't already requested and/or contributed it
9
+ * Fork the project
10
+ * Start a feature/bugfix branch
11
+ * Commit and push until you are happy with your contribution
12
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if a change there is otherwise necessary, that is fine, but please isolate it in its own commit so I can cherry-pick around it.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2011 Masami Yonehara. See LICENSE.txt for
18
+ further details.
19
+
@@ -0,0 +1,54 @@
1
# encoding: utf-8

# Build, test, coverage and documentation tasks for the nicoscraper gem.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Report the missing gems and exit with Bundler's own status code.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "nicoscraper"
  gem.homepage = "http://github.com/hdemon/nicoscraper"
  gem.license = "MIT"
  gem.summary = "The scraper for Niconico douga."
  # Single-line literal: the previous value spanned two lines and so carried
  # a stray newline and trailing whitespace into the gemspec.
  gem.description = "It scrapes movies and mylists of Niconico douga."
  gem.email = "zeitdiebe@gmail.com"
  gem.authors = ["Masami Yonehara"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

require 'rcov/rcovtask'
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
  test.rcov_opts << '--exclude "gems/*"'
end

task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Use the VERSION file for the title when present; otherwise leave it blank.
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "nicoscraper #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,269 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'ruby-debug'
4
+ require 'net/http'
5
+
6
# HTTP client wrapper that adds waits and bounded retries around requests.
#
# In "mech" / "html" mode pages are fetched through Mechanize (see htmlGet);
# in "atom" mode documents are fetched with Net::HTTP (see xmlGet).
#
# NOTE(review): this file never requires Mechanize itself; it is presumably
# loaded by the caller before a Mechanize-mode Connector is built -- confirm.
class Connector
  # Modes that are served through Mechanize.
  MECHANIZE_MODES = %w[mech html].freeze

  attr_reader :mech

  # mode - "mech" or "html" for Mechanize access, "atom" for Net::HTTP access.
  def initialize(mode)
    @mode = mode
    # Default wait settings (all values are seconds unless noted).
    @waitConfig = {
      'consec_count' => 10, # number of consecutive requests before a long pause
      'consec_wait'  => 10, # pause after that many consecutive requests
      'each'         => 10, # pause before each request while requesting consecutively

      '200-abnormal' => 1,  # retry delay when a 200 response carries a rejection message
      'unavailable'  => 10, # retry delay on "service unavailable"
      '403'          => 1,  # retry delay on 403
      '404'          => 1,  # retry delay on 404
      'increment'    => 1,  # how much 'each' grows after an access rejection

      'timeout'      => 5,  # retry delay after a timeout
      '500'          => 1,  # retry delay on 500
      '503'          => 1,  # retry delay on 503
      'other'        => 1,  # retry delay on any other failure

      'retryLimit'   => 5   # maximum number of attempts
    }

    # Number of videos shown on one search-result page.
    # NOTE(review): the original comment said 10 while the value is 32 -- confirm.
    @NumOfSearched = 32

    if mechanizeMode?
      @mech = Mechanize.new
      # Keep Mechanize's history minimal to save memory.
      @mech.max_history = 1
    end

    @consec_count = 0
    @failed = 0
    @retryTime = 0
    @connection = nil
  end

  # Deep-merges the given overrides into the current wait settings.
  # Top-level keys are converted to strings, so symbol keys work as well.
  # Does nothing when waitConfig is nil.
  def setWait(waitConfig)
    return unless waitConfig

    overrides = {}
    waitConfig.each { |key, value| overrides[key.to_s] = value }
    @waitConfig = mixin(@waitConfig, overrides)
  end

  # Sleeps before a request. The very first request is not delayed; after
  # 'consec_count' consecutive requests an extra 'consec_wait' pause is taken
  # and the consecutive-request counter is reset.
  def eachWait
    return if @consec_count == 0

    sleep @waitConfig['each']

    if @consec_count >= @waitConfig['consec_count']
      sleep @waitConfig['consec_wait']
      @consec_count = 0
    end
  end

  # Bookkeeping after a timed-out request: wait, mark the attempt as failed.
  def timeOut
    sleep @waitConfig['timeout']
    @connection = false
    @failed = @failed.to_i + 1
    warn "Timeout"
  end

  # Handles an error that exposes #response_code (e.g. Mechanize::ResponseCodeError).
  # Returns a truthy value when the request should be retried, false when the
  # retry limit is reached or the status code is not one that is retried.
  def errorStatus(ex)
    @retryTime = @retryTime.to_i + 1
    return false if @retryTime >= @waitConfig['retryLimit']

    code = ex.response_code.to_s
    case code
    when '403', '500', '503'
      sleep @waitConfig[code]
      warn code
    else
      warn "Server error: #{code}"
      return false
    end

    @connection = false
    @failed = @failed.to_i + 1
  end

  # Runs request.call(url) inside a bounded retry loop.
  #
  # request   - callable performing the request; receives the URL.
  # procedure - callable run after a request that raised nothing; it is
  #             expected to set @connection to true (accepted) or false (retry).
  #
  # Returns true on success, false when the attempt was given up.
  def htmlReq(url, request, procedure)
    @failed = 0
    @retryTime = 0
    @connection = nil

    until @connection
      begin
        eachWait
        @connection = nil
        request.call(url)
      rescue Timeout::Error
        timeOut
      rescue Mechanize::ResponseCodeError => ex
        # Non-200 status reported by Mechanize.
        unless errorStatus(ex)
          @connection = false
          break
        end
      else
        procedure.call
      end

      # Give up once the failure count reaches the retry limit.
      if !@connection && @failed >= @waitConfig['retryLimit']
        puts 'Exceeded the limit of retry time.'
        @connection = false
        break
      end
    end

    # Count this request towards the consecutive-request pause.
    @consec_count += 1
    @connection == true
  end

  # Fetches host + entity through Mechanize and returns the resulting page.
  # A 200 response containing the site's "please refrain from consecutive
  # access" message is treated as a rejection: it is retried after the
  # '200-abnormal' delay and the per-request wait is increased by 'increment'.
  def htmlGet(host, entity)
    htmlReq(
      host + entity,
      lambda { |url|
        @mech.get(url)
        puts "Requesting for " + url
      },
      lambda {
        if /短時間での連続アクセスはご遠慮ください/ =~ @mech.page.search('/html').text
          puts 'Access rejected.'
          @connection = false
          @failed += 1

          # Back off now, then slow down every following request.
          puts "Waiting for #{@waitConfig['200-abnormal']}s."
          sleep @waitConfig['200-abnormal']
          @waitConfig['each'] += @waitConfig['increment']
          puts "Increased each @wait by #{@waitConfig['increment']}sec."
        else
          @connection = true
        end
      }
    )

    @mech.page
  end

  # Fetches entity from host over plain HTTP (port 80) with bounded retries.
  #
  # Returns the response body forced to UTF-8 on success, or the string
  # "failed" once 'retryLimit' attempts have been used up.
  # Redirects are followed by host and path only; the request is still made
  # over port 80, so a redirect to another scheme or port is not honoured.
  def xmlGet(host, entity)
    retryCount = 0

    loop do
      begin
        puts "Requesting to " + host + entity
        response = Net::HTTP.start(host, 80) { |http| http.get(entity) }
      rescue Timeout::Error => e
        # Must come before StandardError, which Timeout::Error descends from.
        puts e
        puts "Timeout."
        wait("timeout")
      rescue StandardError => e
        puts e
        wait("other")
      else
        case response
        when Net::HTTPSuccess
          body = response.body.to_s.force_encoding("UTF-8")
          return body unless abnormalRes(body)
          wait("200-abnormal")
        when Net::HTTPRedirection
          host, entity = redirectTarget(host, entity, response['location'])
        when Net::HTTPForbidden
          # A private mylist answers 403; it deserves dedicated handling later.
          puts "Access forbidden."
          wait("403")
        when Net::HTTPNotFound
          puts "Http not found."
          wait("404")
        when Net::HTTPServiceUnavailable
          puts "Access rejected or service unavailable."
          wait("unavailable")
        else
          puts "#{response.code} #{response.message}"
          puts "Unknown error."
          wait("other")
        end
      end

      retryCount += 1
      return "failed" if retryCount >= @waitConfig["retryLimit"]
    end
  end

  # Returns true when a 200 response body is actually an error report:
  # the "too much traffic" message of the mylist RSS, or a getthumbinfo
  # response with status="fail". Returns false otherwise.
  # The argument is not modified.
  def abnormalRes(resBody)
    text = resBody.dup.force_encoding("UTF-8")

    if /大変ご迷惑をおかけいたしますが、しばらく時間をあけてから再度検索いただくようご協力をお願いいたします。/ =~ text ||
       /<nicovideo_thumb_response\sstatus=\"fail\">/ =~ text
      puts "!!!!"
      true
    else
      false
    end
  end

  # Sleeps for the delay configured under the given status key.
  # Unknown keys are ignored instead of raising.
  def wait(status)
    seconds = @waitConfig[status.to_s]
    sleep seconds if seconds
  end

  # Fetches host + entity with the access method selected by the mode.
  # Returns nil for an unknown mode.
  def get(host, entity)
    if mechanizeMode?
      htmlGet(host, entity)
    elsif @mode == "atom"
      xmlGet(host, entity)
    end
  end

  private

  # True when this connector goes through Mechanize.
  def mechanizeMode?
    MECHANIZE_MODES.include?(@mode)
  end

  # Recursively merges overWriteObj into a deep copy of targetObj.
  # Anything that is not a Hash on both sides is replaced by the override.
  # Neither argument is modified.
  def mixin(targetObj, overWriteObj)
    return overWriteObj unless targetObj.instance_of?(Hash) && overWriteObj.is_a?(Hash)

    output = Marshal.load(Marshal.dump(targetObj))
    overWriteObj.each_key { |key|
      output[key] = mixin(targetObj[key], overWriteObj[key])
    }
    output
  end

  # Resolves a Location header into the [host, path] pair to request next.
  # Falls back to the current host for relative locations and to the current
  # target when the header cannot be parsed.
  def redirectTarget(host, entity, location)
    uri = URI.parse(location.to_s)
    path = uri.path.to_s
    path = '/' if path.empty?
    path = "#{path}?#{uri.query}" if uri.query
    [uri.host || host, path]
  rescue URI::InvalidURIError
    [host, entity]
  end
end
@@ -0,0 +1,68 @@
1
+ require 'rubygems'
2
+ require 'xml'
3
+ require 'time'
4
+
5
# Conversions from the textual formats found on scraped pages to numbers.
module Convert
  # Converts a "YYYY-MM-DDTHH:MM:SS" timestamp to Unix time (Integer).
  # The format has no zone directive, so the time is read as local time.
  def iso8601ToUnix(str)
    Time.strptime(str, "%Y-%m-%dT%H:%M:%S").to_i
  end
  module_function :iso8601ToUnix

  # Converts a Japanese-formatted timestamp such as
  # "2011年10月05日 12:34:56" to Unix time (Integer).
  #
  # Non-destructive substitutions are used on purpose: the bang variants
  # return nil when nothing is replaced (which broke the chain for input
  # without whitespace) and would modify the caller's string.
  # Full-width colons are normalised to ASCII colons as well.
  def japToUnix(str)
    normalized = str.gsub(/年|月/, '-')
                    .gsub(/日/, 'T')
                    .tr(':', ':')
                    .gsub(/\s/, '')
    iso8601ToUnix(normalized)
  end
  module_function :japToUnix

  # Converts a "mm:ss" length string to a number of seconds.
  def toSeconds(lengthStr)
    minutes, seconds = lengthStr.split(/\:/)
    minutes.to_i * 60 + seconds.to_i
  end
  module_function :toSeconds

  # Parses an integer written with thousands separators, e.g. "1,234" => 1234.
  def commaRemover(str)
    str.delete(',').to_i
  end
  module_function :commaRemover
end
32
+
33
# Extraction of numeric and video ids from Niconico URLs.
module Extract
  MYLIST_ID_PATTERN = %r{mylist/([0-9]+)}
  ITEM_ID_PATTERN   = %r{watch/([0-9]+)}
  # The dots of the host name are escaped so that they only match a literal ".".
  VIDEO_ID_PATTERN  = %r{http://www\.nicovideo\.jp/watch/((?:sm|nm)[0-9]+)}

  # Returns the number following "mylist/" as an Integer, or 0 when absent.
  def mylistId(str)
    found = MYLIST_ID_PATTERN.match(str)
    found ? found[1].to_i : 0
  end
  module_function :mylistId

  # Returns the number following "watch/" as an Integer, or 0 when absent.
  def itemId(str)
    found = ITEM_ID_PATTERN.match(str)
    found ? found[1].to_i : 0
  end
  module_function :itemId

  # Returns the "sm..." / "nm..." id of a www.nicovideo.jp watch URL,
  # or nil when str contains no such URL.
  def videoId(str)
    found = VIDEO_ID_PATTERN.match(str)
    found && found[1]
  end
  module_function :videoId
end
52
+
53
# Conversion between text and "\uXXXX" escape sequences.
module Unicode
  # Retained only for backward compatibility; unescape no longer uses this
  # shared scratch buffer.
  UNESCAPE_WORKER_ARRAY = []

  # A surrogate pair (two consecutive escapes) or a single 4-digit escape.
  # Hex digits are accepted in either case.
  ESCAPE_PATTERN = /\\u(d[89ab][0-9a-f]{2})\\u(d[c-f][0-9a-f]{2})|\\u([0-9a-f]{4})/i

  # Escapes every character of str as "\uXXXX" with exactly four lowercase
  # hex digits. Code points above U+FFFF are written as a UTF-16 surrogate
  # pair so that unescape can restore them.
  def escape(str)
    str.unpack("U*").map { |codepoint|
      if codepoint > 0xFFFF
        offset = codepoint - 0x10000
        format('\\u%04x\\u%04x', 0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF))
      else
        format('\\u%04x', codepoint)
      end
    }.join
  end

  # Replaces every "\uXXXX" escape in str with the character it denotes;
  # a surrogate pair is combined into one character. Other text is kept as is.
  def unescape(str)
    str.gsub(ESCAPE_PATTERN) {
      high, low, single = $1, $2, $3
      codepoint =
        if single
          single.hex
        else
          0x10000 + ((high.hex - 0xD800) << 10) + (low.hex - 0xDC00)
        end
      [codepoint].pack("U")
    }
  end

  module_function :escape, :unescape
end