web_loader 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.idea/misc.xml +0 -1
- data/Gemfile.lock +4 -4
- data/lib/web_loader/command.rb +45 -6
- data/lib/web_loader/utils.rb +30 -23
- data/lib/web_loader/version.rb +1 -1
- data/web_loader.iml +41 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57806142eb3fb353786c2e1f8d36bddbf4fbdd75a5752466e426e9fb92b33ebe
|
4
|
+
data.tar.gz: 7b3e79220bd13d8dfad067a305e58df2894677ffc9f3b1b8ab248070840fd0cc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7269eb19bf3b1a36e88b66f0af750192681fe72ee8308ddb9fae3faab533543ee3a2dcc678fc18b9647919705b8ae869f0f24d38fcdf51ce4f3dce139caeda05
|
7
|
+
data.tar.gz: 6578e2aff6cbc4dbd6639ad026f87d435dccf62cfdfe8d88014494225ff2ba58f99a23e5e8c7013102feb7119a47644acd9b39c22c0088e31f3a3acaedd252c1
|
data/.idea/misc.xml
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
web_loader (1.
|
4
|
+
web_loader (1.2.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
minitest (5.
|
10
|
-
rake (13.0
|
9
|
+
minitest (5.20.0)
|
10
|
+
rake (13.1.0)
|
11
11
|
|
12
12
|
PLATFORMS
|
13
13
|
x86_64-darwin-22
|
@@ -18,4 +18,4 @@ DEPENDENCIES
|
|
18
18
|
web_loader!
|
19
19
|
|
20
20
|
BUNDLED WITH
|
21
|
-
2.4.
|
21
|
+
2.4.20
|
data/lib/web_loader/command.rb
CHANGED
@@ -9,6 +9,9 @@ module WebLoader
|
|
9
9
|
|
10
10
|
USER_AGENT = "WebLoader"
|
11
11
|
CACHE_DIR = './cache'
|
12
|
+
DEFAULT_RETRY = 3
|
13
|
+
DEFAULT_REDIRECT = 10
|
14
|
+
DEFAULT_SLEEP = 10
|
12
15
|
|
13
16
|
def self.save_image(url, file)
|
14
17
|
# キャッシュせず単に保存する
|
@@ -31,9 +34,15 @@ module WebLoader
|
|
31
34
|
attr_reader :load_cache_page
|
32
35
|
attr_accessor :use_cache, :cache_dir, :binary, :user_agent, :verbose
|
33
36
|
|
34
|
-
def
|
35
|
-
|
37
|
+
def load_retry(url, retry_count = DEFAULT_RETRY)
|
38
|
+
load(url, DEFAULT_REDIRECT, retry_count)
|
39
|
+
end
|
40
|
+
|
41
|
+
def load(url, redirect_count = DEFAULT_REDIRECT, retry_count = 0)
|
42
|
+
raise ArgumentError, 'HTTP redirect too deep' if redirect_count == 0
|
36
43
|
log("Load: #{url}", @verbose)
|
44
|
+
|
45
|
+
##### キャッシュの読み込み
|
37
46
|
@load_cache_page = false
|
38
47
|
content = try_load_cache(url)
|
39
48
|
if content
|
@@ -41,6 +50,8 @@ module WebLoader
|
|
41
50
|
@load_cache_page = true
|
42
51
|
return content
|
43
52
|
end
|
53
|
+
|
54
|
+
##### サーバーからロード
|
44
55
|
log("Load server: #{url}", @verbose)
|
45
56
|
uri = URI.parse(url)
|
46
57
|
http = Net::HTTP.new(uri.host, uri.port)
|
@@ -48,7 +59,20 @@ module WebLoader
|
|
48
59
|
http.use_ssl = true
|
49
60
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
50
61
|
end
|
51
|
-
response
|
62
|
+
response = nil
|
63
|
+
begin
|
64
|
+
response = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
|
65
|
+
rescue Net::ReadTimeout
|
66
|
+
# タイムアウトした場合リトライ可能ならばsleepした後に再度ロード実行
|
67
|
+
log("Read timeout: #{url}", @verbose)
|
68
|
+
if retry_count > 0
|
69
|
+
sleep DEFAULT_SLEEP
|
70
|
+
return load(url, redirect_count , retry_count - 1)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
##### レスポンスの処理
|
75
|
+
result = nil
|
52
76
|
case response
|
53
77
|
when Net::HTTPSuccess
|
54
78
|
# responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
|
@@ -64,14 +88,29 @@ module WebLoader
|
|
64
88
|
log("Write cache: #{url}", @verbose)
|
65
89
|
Cache.write(@cache_dir, url, response.code, body)
|
66
90
|
end
|
67
|
-
|
91
|
+
result = body
|
68
92
|
when Net::HTTPRedirection
|
69
|
-
load(to_redirect_url(uri, response['location']),
|
93
|
+
result = load(to_redirect_url(uri, response['location']), redirect_count - 1)
|
70
94
|
else
|
71
|
-
|
95
|
+
# 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
|
96
|
+
if retry_count > 0
|
97
|
+
sleep_for = 10
|
98
|
+
if response.is_a?(Net::HTTPTooManyRequests)
|
99
|
+
# HTTPTooManyRequestsならばretry-afterで指定された値を取得。
|
100
|
+
sleep_for = response.header['retry-after'].to_i + 10
|
101
|
+
log("Rate limit: #{uri} #{response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).", @verbose)
|
102
|
+
else
|
103
|
+
log("Unknown response: #{uri} #{response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).", @verbose)
|
104
|
+
end
|
105
|
+
sleep sleep_for
|
106
|
+
result = load(url, redirect_count , retry_count - 1)
|
107
|
+
end
|
108
|
+
|
72
109
|
# それ以外は対応した例外を発生
|
110
|
+
log("error #{url}", true)
|
73
111
|
response.value
|
74
112
|
end
|
113
|
+
result
|
75
114
|
end
|
76
115
|
|
77
116
|
private
|
data/lib/web_loader/utils.rb
CHANGED
@@ -1,34 +1,41 @@
|
|
1
1
|
module WebLoader
|
2
2
|
module Utils
|
3
3
|
UTF_8 = 'UTF-8'
|
4
|
-
|
4
|
+
|
5
|
+
def toutf8_charset(str, charset)
|
6
|
+
# charsetが指定されていない場合はnil
|
7
|
+
return nil if charset.to_s.length == 0
|
8
|
+
|
9
|
+
result = nil
|
10
|
+
begin
|
11
|
+
# 文字列のcharsetを変更する
|
12
|
+
str.force_encoding(charset) # 例外が発生する場合あり。例えば"Shift_JIS"ではなく"Shift-JIS"が渡された場合。
|
13
|
+
# force_encodingが失敗した場合はnil
|
14
|
+
return nil unless str.valid_encoding?
|
15
|
+
result = nil
|
16
|
+
if charset =~ /#{UTF_8}/i
|
17
|
+
result = str
|
18
|
+
else
|
19
|
+
# エンコーディングがUTF8じゃない場合変換する
|
20
|
+
result = str.encode(UTF_8, invalid: :replace, undef: :replace)
|
21
|
+
end
|
22
|
+
rescue => ex
|
23
|
+
puts ex.message
|
24
|
+
end
|
25
|
+
result
|
26
|
+
end
|
27
|
+
|
28
|
+
def toutf8(str, charset)
|
5
29
|
# 2022/04/04(月)
|
6
30
|
# GITHUBのアポストロフィ(’ U+2019)が文字化け問題に対処するために新設。
|
7
31
|
# 原因は直接Kconv.toutf8にresponse.bodyをわたしていたことなので(Kconvのguessが失敗していたと思われる)、
|
8
32
|
# response.type_paramsを見てそれにforce_encodingすることで対処する。渡されているcharsetとWebページの文字コードが一致していればこれで問題はないはず。
|
9
|
-
|
10
|
-
result = str
|
33
|
+
result = nil
|
11
34
|
begin
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
# 指定された文字コードとみなせた場合
|
17
|
-
if response_encoding != UTF_8
|
18
|
-
# エンコーディングがUTF8じゃない場合返還する
|
19
|
-
result = str.encode(UTF_8, invalid: :replace, undef: :replace)
|
20
|
-
else
|
21
|
-
# UTF8の場合そのまま
|
22
|
-
result = str
|
23
|
-
end
|
24
|
-
else
|
25
|
-
# 指定された文字コードとみなせない場合元の文字列を返す
|
26
|
-
result = org_str
|
27
|
-
end
|
28
|
-
else
|
29
|
-
# responseで文字コードが指定されていない場合Kconvを使用
|
30
|
-
result = Kconv.toutf8(str)
|
31
|
-
end
|
35
|
+
# 指定されたcharsetで変換する
|
36
|
+
result = toutf8_charset(str.dup, charset)
|
37
|
+
# charsetによる変換が失敗した場合Kconvを使用
|
38
|
+
result = Kconv.toutf8(str) if result.nil?
|
32
39
|
rescue => ex
|
33
40
|
puts ex.message
|
34
41
|
end
|
data/lib/web_loader/version.rb
CHANGED
data/web_loader.iml
CHANGED
@@ -7,7 +7,47 @@
|
|
7
7
|
</content>
|
8
8
|
<orderEntry type="inheritedJdk" />
|
9
9
|
<orderEntry type="sourceFolder" forTests="false" />
|
10
|
-
<orderEntry type="library"
|
10
|
+
<orderEntry type="module-library">
|
11
|
+
<library name="minitest (v5.20.0) [path][gem]" type="rubylib">
|
12
|
+
<properties>
|
13
|
+
<option name="version" value="4" />
|
14
|
+
</properties>
|
15
|
+
<CLASSES>
|
16
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.20.0/lib" />
|
17
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.20.0/test" />
|
18
|
+
</CLASSES>
|
19
|
+
<JAVADOC />
|
20
|
+
<SOURCES>
|
21
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.20.0/lib" />
|
22
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.20.0/test" />
|
23
|
+
</SOURCES>
|
24
|
+
<excluded>
|
25
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.20.0/test" />
|
26
|
+
</excluded>
|
27
|
+
</library>
|
28
|
+
</orderEntry>
|
29
|
+
<orderEntry type="module-library">
|
30
|
+
<library name="rake (v13.1.0) [path][gem]" type="rubylib">
|
31
|
+
<properties>
|
32
|
+
<option name="version" value="4" />
|
33
|
+
</properties>
|
34
|
+
<CLASSES>
|
35
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.1.0/doc" />
|
36
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.1.0/exe" />
|
37
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.1.0/lib" />
|
38
|
+
</CLASSES>
|
39
|
+
<JAVADOC />
|
40
|
+
<SOURCES>
|
41
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.1.0/doc" />
|
42
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.1.0/exe" />
|
43
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.1.0/lib" />
|
44
|
+
</SOURCES>
|
45
|
+
<excluded>
|
46
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.1.0/doc" />
|
47
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.1.0/exe" />
|
48
|
+
</excluded>
|
49
|
+
</library>
|
50
|
+
</orderEntry>
|
11
51
|
</component>
|
12
52
|
<component name="RakeTasksCache">
|
13
53
|
<option name="myRootTask">
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_loader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- src
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-11-23 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Web loader.
|
14
14
|
email:
|