web_loader 2.2.0 → 2.3.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/web_loader/command.rb +2 -54
- data/lib/web_loader/downloader.rb +8 -0
- data/lib/web_loader/drivers/selenium_driver.rb +1 -1
- data/lib/web_loader/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 03a86b4f610a1575be326e19740897204b75e2dbe41e1eac56ee1c98d9c44558
|
|
4
|
+
data.tar.gz: b83a8f147007e94e4fac89abd4297066727dfe6ebf0a68052569f97cece0f223
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 27395f10236456d0780ee08c28d5b1c9fd447acab50261732eb24c4bb88011eb823381d01005ac6faf4d5e580d85dd1e16c69038c4ca1f4d3ed257613aeb1daf
|
|
7
|
+
data.tar.gz: 24a7b60226b63fcc7c1fda0a44ca11fbb14d10176936f93b0684fc0866289cbae73058fc7c4f485579fa83850520d6f9cc13112cddaff60d733e009b06ded5cc
|
data/Gemfile.lock
CHANGED
data/lib/web_loader/command.rb
CHANGED
|
@@ -36,8 +36,6 @@ module WebLoader
|
|
|
36
36
|
|
|
37
37
|
# ドライバーのセットアップ
|
|
38
38
|
@driver = driver
|
|
39
|
-
@driver.user_agent = @user_agent
|
|
40
|
-
@driver.binary = @binary
|
|
41
39
|
end
|
|
42
40
|
|
|
43
41
|
attr_reader :load_cache_page
|
|
@@ -67,15 +65,9 @@ module WebLoader
|
|
|
67
65
|
|
|
68
66
|
##### サーバーからロード
|
|
69
67
|
log("Load server: #{url}")
|
|
70
|
-
# uri = URI.parse(url)
|
|
71
|
-
# http = Net::HTTP.new(uri.host, uri.port)
|
|
72
|
-
# if uri.scheme == 'https'
|
|
73
|
-
# http.use_ssl = true
|
|
74
|
-
# http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
75
|
-
# end
|
|
76
|
-
# @response = nil
|
|
77
68
|
begin
|
|
78
|
-
|
|
69
|
+
@driver.user_agent = @user_agent
|
|
70
|
+
@driver.binary = @binary
|
|
79
71
|
@response = @driver.fetch(url)
|
|
80
72
|
rescue Net::ReadTimeout
|
|
81
73
|
# タイムアウトした場合リトライ可能ならばsleepした後に再度ロード実行
|
|
@@ -110,51 +102,7 @@ module WebLoader
|
|
|
110
102
|
# それ以外は対応した例外を発生
|
|
111
103
|
log("error #{url}", true)
|
|
112
104
|
end
|
|
113
|
-
|
|
114
105
|
result
|
|
115
|
-
|
|
116
|
-
# ##### レスポンスの処理
|
|
117
|
-
# result = nil
|
|
118
|
-
# case @response
|
|
119
|
-
# when Net::HTTPSuccess
|
|
120
|
-
# # @responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
|
|
121
|
-
# body = @response.body
|
|
122
|
-
# unless @binary
|
|
123
|
-
# # デフォルトでは ASCII-8BITが帰ってくる。
|
|
124
|
-
# # Content-Typeのcharsetとみなす。
|
|
125
|
-
# # https://bugs.ruby-lang.org/issues/2567
|
|
126
|
-
# encoding = @response.type_params['charset']
|
|
127
|
-
# body = toutf8(body, encoding)
|
|
128
|
-
# end
|
|
129
|
-
#
|
|
130
|
-
# if @use_cache || @always_write_cache
|
|
131
|
-
# log("Write cache: #{url}")
|
|
132
|
-
# Cache.write(@cache_dir, url, @response.code, body)
|
|
133
|
-
# end
|
|
134
|
-
# result = body
|
|
135
|
-
# when Net::HTTPRedirection
|
|
136
|
-
# result = load(to_redirect_url(uri, @response['location']), redirect_count - 1)
|
|
137
|
-
# # when Net::HTTPNotFound
|
|
138
|
-
# # result = nil
|
|
139
|
-
# when Net::HTTPTooManyRequests, Net::ReadTimeout
|
|
140
|
-
# # 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
|
|
141
|
-
# if retry_count > 0
|
|
142
|
-
# sleep_for = 10
|
|
143
|
-
# if @response.is_a?(Net::HTTPTooManyRequests)
|
|
144
|
-
# # HTTPTooManyRequestsならばretry-afterで指定された値を取得。
|
|
145
|
-
# sleep_for = @response.header['retry-after'].to_i + 10
|
|
146
|
-
# log("Rate limit: #{uri} #{@response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
|
|
147
|
-
# else
|
|
148
|
-
# log("Unknown response: #{uri} #{@response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
|
|
149
|
-
# end
|
|
150
|
-
# sleep sleep_for
|
|
151
|
-
# result = load(url, redirect_count , retry_count - 1)
|
|
152
|
-
# end
|
|
153
|
-
# else
|
|
154
|
-
# # それ以外は対応した例外を発生
|
|
155
|
-
# log("error #{url}", true)
|
|
156
|
-
# end
|
|
157
|
-
# result
|
|
158
106
|
end
|
|
159
107
|
|
|
160
108
|
private
|
|
@@ -18,6 +18,8 @@ module WebLoader
|
|
|
18
18
|
drivers = ['pureruby', 'selenium']
|
|
19
19
|
opt.on('-d DRIVER', '--driver=DRIVER', drivers, drivers.join("|") + "(default pureruby)") {|v| opts[:d] = v }
|
|
20
20
|
opt.on("--disable-cache", "Disable cache") {|v| opts[:disable_cache] = v }
|
|
21
|
+
opt.on('--user-agent=USERAGENT', 'Set User-Agent header') {|v| opts[:user_agent] = v }
|
|
22
|
+
opt.on('-b', '--binary', 'Download binary files') {|v| opts[:binary] = v }
|
|
21
23
|
opt.parse!(argv)
|
|
22
24
|
if argv.empty?
|
|
23
25
|
puts "Error: URL is required."
|
|
@@ -39,6 +41,12 @@ module WebLoader
|
|
|
39
41
|
if @opts[:disable_cache]
|
|
40
42
|
loader.use_cache = false
|
|
41
43
|
end
|
|
44
|
+
if @opts[:user_agent]
|
|
45
|
+
loader.user_agent = @opts[:user_agent]
|
|
46
|
+
end
|
|
47
|
+
if @opts[:binary]
|
|
48
|
+
loader.binary = true
|
|
49
|
+
end
|
|
42
50
|
loader.load(url)
|
|
43
51
|
end
|
|
44
52
|
|
|
@@ -38,7 +38,7 @@ module WebLoader
|
|
|
38
38
|
|
|
39
39
|
content_type = driver.execute_script("return document.contentType;")
|
|
40
40
|
|
|
41
|
-
body = @binary ? page_source.b : driver.page_source
|
|
41
|
+
body = @binary ? driver.page_source.b : driver.page_source
|
|
42
42
|
response = WebLoader::Response.new(status: 200,
|
|
43
43
|
headers: {
|
|
44
44
|
'Content-Type' => content_type
|
data/lib/web_loader/version.rb
CHANGED