web_loader 1.8.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.idea/copilot.data.migration.ask.xml +6 -0
- data/.idea/copilot.data.migration.ask2agent.xml +6 -0
- data/.idea/copilot.data.migration.edit.xml +6 -0
- data/Gemfile.lock +1 -1
- data/lib/web_loader/command.rb +68 -35
- data/lib/web_loader/drivers/http_driver.rb +30 -0
- data/lib/web_loader/response.rb +41 -0
- data/lib/web_loader/utils.rb +2 -0
- data/lib/web_loader/version.rb +1 -1
- data/lib/web_loader.rb +2 -0
- data/web_loader.iml +74 -0
- metadata +7 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d51a9c53f63ed251de81a4664f36038919309a79298397ea72ae26fabd320734
|
|
4
|
+
data.tar.gz: 69e3294e0c85c07f88241e2249fcbeb512d3f2f446324788b5fbfcb1493b6908
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9ed22698344e0212b05026fbf21afe9b5b77c3e7a88973c909c6aa1fbdb798626bc981bdae1b76490832b11720d2e88f1d89cea4fea7f05c5c99aea46894ea56
|
|
7
|
+
data.tar.gz: 5ee5974c7650ec1153a57e7d3299ec29349ac2797665527174e90d627389a17814bc3ba2c733f9c7e2850b6266caf1e89640944e33f4d36edeeb215a2c04e459
|
data/Gemfile.lock
CHANGED
data/lib/web_loader/command.rb
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
require 'open-uri'
|
|
2
2
|
require 'net/http'
|
|
3
3
|
require 'uri'
|
|
4
|
-
require 'kconv'
|
|
5
4
|
|
|
6
5
|
module WebLoader
|
|
7
6
|
class Command
|
|
@@ -23,7 +22,7 @@ module WebLoader
|
|
|
23
22
|
File.binwrite(file, content)
|
|
24
23
|
end
|
|
25
24
|
|
|
26
|
-
def initialize
|
|
25
|
+
def initialize(driver = ::WebLoader::Drivers::HttpDriver.new)
|
|
27
26
|
@use_cache = true
|
|
28
27
|
@load_cache_page = false #キャッシュを読み込んだかどうか
|
|
29
28
|
@cache_dir = File.expand_path(CACHE_DIR)
|
|
@@ -34,12 +33,18 @@ module WebLoader
|
|
|
34
33
|
@always_write_cache = false
|
|
35
34
|
@response = nil
|
|
36
35
|
@logger = nil
|
|
36
|
+
|
|
37
|
+
# ドライバーのセットアップ
|
|
38
|
+
@driver = driver
|
|
39
|
+
@driver.user_agent = @user_agent
|
|
40
|
+
@driver.binary = @binary
|
|
37
41
|
end
|
|
38
42
|
|
|
39
43
|
attr_reader :load_cache_page
|
|
40
44
|
attr_accessor :use_cache, :cache_dir, :binary, :user_agent, :verbose
|
|
41
45
|
attr_accessor :cache_limit
|
|
42
46
|
attr_accessor :always_write_cache
|
|
47
|
+
attr_accessor :driver
|
|
43
48
|
attr_reader :response
|
|
44
49
|
attr_accessor :logger
|
|
45
50
|
|
|
@@ -62,15 +67,16 @@ module WebLoader
|
|
|
62
67
|
|
|
63
68
|
##### サーバーからロード
|
|
64
69
|
log("Load server: #{url}")
|
|
65
|
-
uri = URI.parse(url)
|
|
66
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
|
67
|
-
if uri.scheme == 'https'
|
|
68
|
-
http.use_ssl = true
|
|
69
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
70
|
-
end
|
|
71
|
-
@response = nil
|
|
70
|
+
# uri = URI.parse(url)
|
|
71
|
+
# http = Net::HTTP.new(uri.host, uri.port)
|
|
72
|
+
# if uri.scheme == 'https'
|
|
73
|
+
# http.use_ssl = true
|
|
74
|
+
# http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
75
|
+
# end
|
|
76
|
+
# @response = nil
|
|
72
77
|
begin
|
|
73
|
-
@response = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
|
|
78
|
+
# @response = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
|
|
79
|
+
@response = @driver.fetch(url)
|
|
74
80
|
rescue Net::ReadTimeout
|
|
75
81
|
# タイムアウトした場合リトライ可能ならばsleepした後に再度ロード実行
|
|
76
82
|
log("Read timeout: #{url}")
|
|
@@ -82,38 +88,21 @@ module WebLoader
|
|
|
82
88
|
|
|
83
89
|
##### レスポンスの処理
|
|
84
90
|
result = nil
|
|
85
|
-
case @response
|
|
86
|
-
when Net::HTTPSuccess
|
|
87
|
-
# @responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
|
|
91
|
+
if response.ok?
|
|
88
92
|
body = @response.body
|
|
89
|
-
unless @binary
|
|
90
|
-
# デフォルトでは ASCII-8BITが帰ってくる。
|
|
91
|
-
# Content-Typeのcharsetとみなす。
|
|
92
|
-
# https://bugs.ruby-lang.org/issues/2567
|
|
93
|
-
encoding = @response.type_params['charset']
|
|
94
|
-
body = toutf8(body, encoding)
|
|
95
|
-
end
|
|
96
|
-
|
|
97
93
|
if @use_cache || @always_write_cache
|
|
98
94
|
log("Write cache: #{url}")
|
|
99
|
-
Cache.write(@cache_dir, url, @response.code, body)
|
|
95
|
+
Cache.write(@cache_dir, url, @response.status, body)
|
|
100
96
|
end
|
|
101
97
|
result = body
|
|
102
|
-
when Net::HTTPRedirection
|
|
103
|
-
result = load(to_redirect_url(uri, @response['location']), redirect_count - 1)
|
|
104
|
-
# when Net::HTTPNotFound
|
|
105
|
-
# result = nil
|
|
106
|
-
when Net::HTTPTooManyRequests, Net::ReadTimeout
|
|
98
|
+
elsif response.redirect?
|
|
99
|
+
result = load(to_redirect_url(URI.parse(url), @response.headers['location']), redirect_count - 1)
|
|
100
|
+
elsif response.rate_limited?
|
|
107
101
|
# 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
|
|
108
102
|
if retry_count > 0
|
|
109
|
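-
sleep_for = 10
|
|
110
|
-
if @response.is_a?(Net::HTTPTooManyRequests)
|
|
111
|
-
# HTTPTooManyRequestsならばretry-afterで指定された値を取得。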
|
|
112
|
-
sleep_for = @response.header['retry-after'].to_i + 10
|
|
113
|
-
log("Rate limit: #{uri} #{@response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
|
|
114
|
-
else
|
|
115
|
-
log("Unknown response: #{uri} #{@response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
|
|
116
|
-
end
|
|
103
|
+
# HTTPTooManyRequestsならばretry-afterで指定された値を取得。
|
|
104
|
+
sleep_for = @response.header['retry-after'].to_i + 10
|
|
105
|
+
log("Rate limit: #{uri} #{@response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
|
|
117
106
|
sleep sleep_for
|
|
118
107
|
result = load(url, redirect_count , retry_count - 1)
|
|
119
108
|
end
|
|
@@ -121,7 +110,51 @@ module WebLoader
|
|
|
121
110
|
# それ以外は対応した例外を発生
|
|
122
111
|
log("error #{url}", true)
|
|
123
112
|
end
|
|
113
|
+
|
|
124
114
|
result
|
|
115
|
+
|
|
116
|
+
# ##### レスポンスの処理
|
|
117
|
+
# result = nil
|
|
118
|
+
# case @response
|
|
119
|
+
# when Net::HTTPSuccess
|
|
120
|
+
# # @responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
|
|
121
|
+
# body = @response.body
|
|
122
|
+
# unless @binary
|
|
123
|
+
# # デフォルトでは ASCII-8BITが帰ってくる。
|
|
124
|
+
# # Content-Typeのcharsetとみなす。
|
|
125
|
+
# # https://bugs.ruby-lang.org/issues/2567
|
|
126
|
+
# encoding = @response.type_params['charset']
|
|
127
|
+
# body = toutf8(body, encoding)
|
|
128
|
+
# end
|
|
129
|
+
#
|
|
130
|
+
# if @use_cache || @always_write_cache
|
|
131
|
+
# log("Write cache: #{url}")
|
|
132
|
+
# Cache.write(@cache_dir, url, @response.code, body)
|
|
133
|
+
# end
|
|
134
|
+
# result = body
|
|
135
|
+
# when Net::HTTPRedirection
|
|
136
|
+
# result = load(to_redirect_url(uri, @response['location']), redirect_count - 1)
|
|
137
|
+
# # when Net::HTTPNotFound
|
|
138
|
+
# # result = nil
|
|
139
|
+
# when Net::HTTPTooManyRequests, Net::ReadTimeout
|
|
140
|
+
# # 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
|
|
141
|
+
# if retry_count > 0
|
|
142
|
+
# sleep_for = 10
|
|
143
|
+
# if @response.is_a?(Net::HTTPTooManyRequests)
|
|
144
|
+
# # HTTPTooManyRequestsならばretry-afterで指定された値を取得。
|
|
145
|
+
# sleep_for = @response.header['retry-after'].to_i + 10
|
|
146
|
+
# log("Rate limit: #{uri} #{@response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
|
|
147
|
+
# else
|
|
148
|
+
# log("Unknown response: #{uri} #{@response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
|
|
149
|
+
# end
|
|
150
|
+
# sleep sleep_for
|
|
151
|
+
# result = load(url, redirect_count , retry_count - 1)
|
|
152
|
+
# end
|
|
153
|
+
# else
|
|
154
|
+
# # それ以外は対応した例外を発生
|
|
155
|
+
# log("error #{url}", true)
|
|
156
|
+
# end
|
|
157
|
+
# result
|
|
125
158
|
end
|
|
126
159
|
|
|
127
160
|
private
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'open-uri'
|
|
2
|
+
require 'net/http'
|
|
3
|
+
require 'uri'
|
|
4
|
+
require 'kconv'
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
module WebLoader
|
|
8
|
+
module Drivers
|
|
9
|
+
class HttpDriver
|
|
10
|
+
|
|
11
|
+
def initialize
|
|
12
|
+
@user_agent = nil
|
|
13
|
+
@binary = false
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
attr_accessor :user_agent, :binary
|
|
17
|
+
|
|
18
|
+
def fetch(url)
|
|
19
|
+
uri = URI.parse(url)
|
|
20
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
21
|
+
if uri.scheme == 'https'
|
|
22
|
+
http.use_ssl = true
|
|
23
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
24
|
+
end
|
|
25
|
+
response = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
|
|
26
|
+
WebLoader::Response.from_net_http(response, @binary)
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
module WebLoader
|
|
2
|
+
class Response
|
|
3
|
+
include WebLoader::Utils
|
|
4
|
+
|
|
5
|
+
def self.from_net_http(response, binary)
|
|
6
|
+
body = response.body
|
|
7
|
+
unless binary
|
|
8
|
+
# デフォルトでは ASCII-8BITが帰ってくる。
|
|
9
|
+
# Content-Typeのcharsetとみなす。
|
|
10
|
+
# https://bugs.ruby-lang.org/issues/2567
|
|
11
|
+
encoding = response.type_params['charset']
|
|
12
|
+
body = ::WebLoader::Utils.toutf8(body, encoding)
|
|
13
|
+
end
|
|
14
|
+
new(
|
|
15
|
+
status: response.code.to_i,
|
|
16
|
+
headers: response.each_header.to_h,
|
|
17
|
+
body: body
|
|
18
|
+
)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def self.from_selenium(driver, original_url)
|
|
22
|
+
# デフォルトは成功200
|
|
23
|
+
status = 200
|
|
24
|
+
# redirected = driver.current_url != original_url
|
|
25
|
+
# status = 300 if redirected # 簡易的にリダイレクト扱い
|
|
26
|
+
new(status: status, headers: {}, body: driver.page_source)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def initialize(status:, headers: {}, body: nil)
|
|
30
|
+
@status = status.to_i
|
|
31
|
+
@headers = headers || {}
|
|
32
|
+
@body = body
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
attr_reader :status, :headers, :body
|
|
36
|
+
|
|
37
|
+
def ok?; (200..299).include?(@status); end
|
|
38
|
+
def redirect?; (300..399).include?(@status); end
|
|
39
|
+
def rate_limited?; @status == 429; end
|
|
40
|
+
end
|
|
41
|
+
end
|
data/lib/web_loader/utils.rb
CHANGED
|
@@ -67,6 +67,7 @@ module WebLoader
|
|
|
67
67
|
end
|
|
68
68
|
result
|
|
69
69
|
end
|
|
70
|
+
module_function :toutf8
|
|
70
71
|
|
|
71
72
|
def to_redirect_url(orig_uri, location)
|
|
72
73
|
redirect_url = location
|
|
@@ -75,5 +76,6 @@ module WebLoader
|
|
|
75
76
|
end
|
|
76
77
|
redirect_url
|
|
77
78
|
end
|
|
79
|
+
module_function :to_redirect_url
|
|
78
80
|
end
|
|
79
81
|
end
|
data/lib/web_loader/version.rb
CHANGED
data/lib/web_loader.rb
CHANGED
|
@@ -4,6 +4,8 @@ require_relative "web_loader/version"
|
|
|
4
4
|
require_relative "web_loader/utils"
|
|
5
5
|
require_relative "web_loader/cache"
|
|
6
6
|
require_relative "web_loader/command"
|
|
7
|
+
require_relative "web_loader/response"
|
|
8
|
+
require_relative "web_loader/drivers/http_driver"
|
|
7
9
|
|
|
8
10
|
module WebLoader
|
|
9
11
|
class Error < StandardError; end
|
data/web_loader.iml
CHANGED
|
@@ -10,6 +10,80 @@
|
|
|
10
10
|
</content>
|
|
11
11
|
<orderEntry type="jdk" jdkName="rbenv: 3.2.6" jdkType="RUBY_SDK" />
|
|
12
12
|
<orderEntry type="sourceFolder" forTests="false" />
|
|
13
|
+
<orderEntry type="module-library">
|
|
14
|
+
<library name="minitest (v5.26.1) [path][gem]" type="rubylib">
|
|
15
|
+
<properties>
|
|
16
|
+
<option name="additionalInfo">
|
|
17
|
+
<AdditionalInfo>
|
|
18
|
+
<option name="authors" value="該当なし" />
|
|
19
|
+
<option name="email" value="該当なし" />
|
|
20
|
+
<option name="homepage" value="該当なし" />
|
|
21
|
+
<option name="summary" value="該当なし" />
|
|
22
|
+
</AdditionalInfo>
|
|
23
|
+
</option>
|
|
24
|
+
<option name="fromPath" value="true" />
|
|
25
|
+
<option name="name" value="minitest" />
|
|
26
|
+
<option name="requirePaths">
|
|
27
|
+
<list>
|
|
28
|
+
<option value="lib" />
|
|
29
|
+
</list>
|
|
30
|
+
</option>
|
|
31
|
+
<option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.1" />
|
|
32
|
+
<option name="version" value="5.26.1" />
|
|
33
|
+
</properties>
|
|
34
|
+
<CLASSES>
|
|
35
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.1/lib" />
|
|
36
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.1/test" />
|
|
37
|
+
</CLASSES>
|
|
38
|
+
<JAVADOC />
|
|
39
|
+
<SOURCES>
|
|
40
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.1/lib" />
|
|
41
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.1/test" />
|
|
42
|
+
</SOURCES>
|
|
43
|
+
<excluded>
|
|
44
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.1/test" />
|
|
45
|
+
</excluded>
|
|
46
|
+
</library>
|
|
47
|
+
</orderEntry>
|
|
48
|
+
<orderEntry type="module-library">
|
|
49
|
+
<library name="rake (v13.3.1) [path][gem]" type="rubylib">
|
|
50
|
+
<properties>
|
|
51
|
+
<option name="additionalInfo">
|
|
52
|
+
<AdditionalInfo>
|
|
53
|
+
<option name="authors" value="該当なし" />
|
|
54
|
+
<option name="email" value="該当なし" />
|
|
55
|
+
<option name="homepage" value="該当なし" />
|
|
56
|
+
<option name="summary" value="該当なし" />
|
|
57
|
+
</AdditionalInfo>
|
|
58
|
+
</option>
|
|
59
|
+
<option name="fromPath" value="true" />
|
|
60
|
+
<option name="name" value="rake" />
|
|
61
|
+
<option name="requirePaths">
|
|
62
|
+
<list>
|
|
63
|
+
<option value="lib" />
|
|
64
|
+
</list>
|
|
65
|
+
</option>
|
|
66
|
+
<option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1" />
|
|
67
|
+
<option name="version" value="13.3.1" />
|
|
68
|
+
</properties>
|
|
69
|
+
<CLASSES>
|
|
70
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/doc" />
|
|
71
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/exe" />
|
|
72
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/lib" />
|
|
73
|
+
</CLASSES>
|
|
74
|
+
<JAVADOC />
|
|
75
|
+
<SOURCES>
|
|
76
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/doc" />
|
|
77
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/exe" />
|
|
78
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/lib" />
|
|
79
|
+
</SOURCES>
|
|
80
|
+
<excluded>
|
|
81
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/doc" />
|
|
82
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/exe" />
|
|
83
|
+
</excluded>
|
|
84
|
+
</library>
|
|
85
|
+
</orderEntry>
|
|
86
|
+
<orderEntry type="library" scope="PROVIDED" name="bundler (v2.6.3, rbenv: 3.2.6) [gem]" level="application" />
|
|
13
87
|
</component>
|
|
14
88
|
<component name="RakeTasksCache">
|
|
15
89
|
<option name="myRootTask">
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: web_loader
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.8.0
|
|
4
|
+
version: 2.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- src
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-11-
|
|
11
|
+
date: 2025-11-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Web loader.
|
|
14
14
|
email:
|
|
@@ -21,6 +21,9 @@ files:
|
|
|
21
21
|
- ".DS_Store"
|
|
22
22
|
- ".idea/.gitignore"
|
|
23
23
|
- ".idea/copilot.data.migration.agent.xml"
|
|
24
|
+
- ".idea/copilot.data.migration.ask.xml"
|
|
25
|
+
- ".idea/copilot.data.migration.ask2agent.xml"
|
|
26
|
+
- ".idea/copilot.data.migration.edit.xml"
|
|
24
27
|
- ".idea/inspectionProfiles/Project_Default.xml"
|
|
25
28
|
- ".idea/misc.xml"
|
|
26
29
|
- ".idea/modules.xml"
|
|
@@ -36,6 +39,8 @@ files:
|
|
|
36
39
|
- lib/web_loader.rb
|
|
37
40
|
- lib/web_loader/cache.rb
|
|
38
41
|
- lib/web_loader/command.rb
|
|
42
|
+
- lib/web_loader/drivers/http_driver.rb
|
|
43
|
+
- lib/web_loader/response.rb
|
|
39
44
|
- lib/web_loader/utils.rb
|
|
40
45
|
- lib/web_loader/version.rb
|
|
41
46
|
- sig/web_loader.rbs
|