web_loader 0.9.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.idea/misc.xml +0 -1
- data/Gemfile.lock +2 -2
- data/lib/web_loader/command.rb +55 -8
- data/lib/web_loader/utils.rb +25 -23
- data/lib/web_loader/version.rb +1 -1
- data/test.sh +4 -0
- data/web_loader.iml +42 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1055ef0181a8f0e5857172c56783111bff1d102686f006c001e5b4b2627e65cb
|
4
|
+
data.tar.gz: da2499a388951ecc7d0c525442b847da0be5e32ab2b478dd8ae2338e6e0772de
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2e7e1b93c5af2ad6280843558fc89aba4014313c2fbb2999d0148749e20966bf05dcadc1e48bd8e46791feca2c7d78c6c3b99977778da667b36bd32964046ae6
|
7
|
+
data.tar.gz: 319950bb1a184a361e215805aa49b7dfa661f28bf3d98b28d6baa3124627dd2a4c217762409c75bb375f8e12494c117863b4358319fab02c4682349a79eeeaaf
|
data/.idea/misc.xml
CHANGED
data/Gemfile.lock
CHANGED
data/lib/web_loader/command.rb
CHANGED
@@ -9,6 +9,9 @@ module WebLoader
|
|
9
9
|
|
10
10
|
USER_AGENT = "WebLoader"
|
11
11
|
CACHE_DIR = './cache'
|
12
|
+
DEFAULT_RETRY = 3
|
13
|
+
DEFAULT_REDIRECT = 10
|
14
|
+
DEFAULT_SLEEP = 10
|
12
15
|
|
13
16
|
def self.save_image(url, file)
|
14
17
|
# キャッシュせず単に保存する
|
@@ -25,28 +28,51 @@ module WebLoader
|
|
25
28
|
@cache_dir = File.expand_path(CACHE_DIR)
|
26
29
|
@user_agent = "#{USER_AGENT}/#{VERSION}"
|
27
30
|
@binary = false
|
31
|
+
@verbose = false
|
28
32
|
end
|
29
33
|
|
30
34
|
attr_reader :load_cache_page
|
31
|
-
attr_accessor :use_cache, :cache_dir, :binary, :user_agent
|
35
|
+
attr_accessor :use_cache, :cache_dir, :binary, :user_agent, :verbose
|
32
36
|
|
33
|
-
def
|
34
|
-
|
37
|
+
def load_retry(url, retry_count = DEFAULT_RETRY)
|
38
|
+
load(url, DEFAULT_REDIRECT, retry_count)
|
39
|
+
end
|
40
|
+
|
41
|
+
def load(url, redirect_count = DEFAULT_REDIRECT, retry_count = 0)
|
42
|
+
raise ArgumentError, 'HTTP redirect too deep' if redirect_count == 0
|
43
|
+
log("Load: #{url}", @verbose)
|
35
44
|
|
45
|
+
##### キャッシュの読み込み
|
36
46
|
@load_cache_page = false
|
37
47
|
content = try_load_cache(url)
|
38
48
|
if content
|
49
|
+
log("Load cache: #{url}", @verbose)
|
39
50
|
@load_cache_page = true
|
40
51
|
return content
|
41
52
|
end
|
42
53
|
|
54
|
+
##### サーバーからロード
|
55
|
+
log("Load server: #{url}", @verbose)
|
43
56
|
uri = URI.parse(url)
|
44
57
|
http = Net::HTTP.new(uri.host, uri.port)
|
45
58
|
if uri.scheme == 'https'
|
46
59
|
http.use_ssl = true
|
47
60
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
48
61
|
end
|
49
|
-
response
|
62
|
+
response = nil
|
63
|
+
begin
|
64
|
+
response = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
|
65
|
+
rescue Net::ReadTimeout
|
66
|
+
# タイムアウトした場合リトライ可能ならばsleepした後に再度ロード実行
|
67
|
+
log("Read timeout: #{url}", @verbose)
|
68
|
+
if retry_count > 0
|
69
|
+
sleep DEFAULT_SLEEP
|
70
|
+
return load(url, redirect_count , retry_count - 1)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
##### レスポンスの処理
|
75
|
+
result = nil
|
50
76
|
case response
|
51
77
|
when Net::HTTPSuccess
|
52
78
|
# responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
|
@@ -58,15 +84,33 @@ module WebLoader
|
|
58
84
|
encoding = response.type_params['charset']
|
59
85
|
body = toutf8(body, encoding)
|
60
86
|
end
|
61
|
-
|
62
|
-
|
87
|
+
if @use_cache
|
88
|
+
log("Write cache: #{url}", @verbose)
|
89
|
+
Cache.write(@cache_dir, url, response.code, body)
|
90
|
+
end
|
91
|
+
result = body
|
63
92
|
when Net::HTTPRedirection
|
64
|
-
load(to_redirect_url(uri, response['location']),
|
93
|
+
result = load(to_redirect_url(uri, response['location']), redirect_count - 1)
|
65
94
|
else
|
66
|
-
|
95
|
+
# 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
|
96
|
+
if retry_count > 0
|
97
|
+
sleep_for = 10
|
98
|
+
if response.is_a?(Net::HTTPTooManyRequests)
|
99
|
+
# HTTPTooManyRequestsならばretry-afterで指定された値を取得。
|
100
|
+
sleep_for = response.header['retry-after'].to_i + 10
|
101
|
+
log("Rate limit: #{uri} #{response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).", @verbose)
|
102
|
+
else
|
103
|
+
log("Unknown response: #{uri} #{response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).", @verbose)
|
104
|
+
end
|
105
|
+
sleep sleep_for
|
106
|
+
result = load(url, redirect_count , retry_count - 1)
|
107
|
+
end
|
108
|
+
|
67
109
|
# それ以外は対応した例外を発生
|
110
|
+
log("error #{url}", true)
|
68
111
|
response.value
|
69
112
|
end
|
113
|
+
result
|
70
114
|
end
|
71
115
|
|
72
116
|
private
|
@@ -76,5 +120,8 @@ module WebLoader
|
|
76
120
|
Cache.load_content(@cache_dir, url)
|
77
121
|
end
|
78
122
|
|
123
|
+
def log(msg, put_log)
|
124
|
+
puts msg if put_log
|
125
|
+
end
|
79
126
|
end
|
80
127
|
end
|
data/lib/web_loader/utils.rb
CHANGED
@@ -1,34 +1,36 @@
|
|
1
1
|
module WebLoader
|
2
2
|
module Utils
|
3
3
|
UTF_8 = 'UTF-8'
|
4
|
-
|
4
|
+
|
5
|
+
def toutf8_charset(str, charset)
|
6
|
+
# charsetが指定されていない場合はnil
|
7
|
+
return nil if charset.to_s.length == 0
|
8
|
+
# 文字列のcharsetを変更する
|
9
|
+
str.force_encoding(charset)
|
10
|
+
# force_encodingが失敗した場合はnil
|
11
|
+
return nil unless str.valid_encoding?
|
12
|
+
|
13
|
+
result = nil
|
14
|
+
if charset =~ /#{UTF_8}/i
|
15
|
+
result = str
|
16
|
+
else
|
17
|
+
# エンコーディングがUTF8じゃない場合変換する
|
18
|
+
result = str.encode(UTF_8, invalid: :replace, undef: :replace)
|
19
|
+
end
|
20
|
+
result
|
21
|
+
end
|
22
|
+
|
23
|
+
def toutf8(str, charset)
|
5
24
|
# 2022/04/04(月)
|
6
25
|
# GITHUBのアポストロフィ(’ U+2019)が文字化け問題に対処するために新設。
|
7
26
|
# 原因は直接Kconv.toutf8にresponse.bodyをわたしていたことなので(Kconvのguessが失敗していたと思われる)、
|
8
27
|
# response.type_paramsを見てそれにforce_encodingすることで対処する。渡されているcharsetとWebページの文字コードが一致していればこれで問題はないはず。
|
9
|
-
|
10
|
-
result = str
|
28
|
+
result = nil
|
11
29
|
begin
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
# 指定された文字コードとみなせた場合
|
17
|
-
if response_encoding != UTF_8
|
18
|
-
# エンコーディングがUTF8じゃない場合返還する
|
19
|
-
result = str.encode(UTF_8, invalid: :replace, undef: :replace)
|
20
|
-
else
|
21
|
-
# UTF8の場合そのまま
|
22
|
-
result = str
|
23
|
-
end
|
24
|
-
else
|
25
|
-
# 指定された文字コードとみなせない場合元の文字列を返す
|
26
|
-
result = org_str
|
27
|
-
end
|
28
|
-
else
|
29
|
-
# responseで文字コードが指定されていない場合Kconvを使用
|
30
|
-
result = Kconv.toutf8(str)
|
31
|
-
end
|
30
|
+
# 指定されたcharsetで変換する
|
31
|
+
result = toutf8_charset(str.dup, charset)
|
32
|
+
# charsetによる変換が失敗した場合Kconvを使用
|
33
|
+
result = Kconv.toutf8(str) if result.nil?
|
32
34
|
rescue => ex
|
33
35
|
puts ex.message
|
34
36
|
end
|
data/lib/web_loader/version.rb
CHANGED
data/test.sh
ADDED
data/web_loader.iml
CHANGED
@@ -7,7 +7,48 @@
|
|
7
7
|
</content>
|
8
8
|
<orderEntry type="inheritedJdk" />
|
9
9
|
<orderEntry type="sourceFolder" forTests="false" />
|
10
|
-
<orderEntry type="library"
|
10
|
+
<orderEntry type="module-library">
|
11
|
+
<library name="minitest (v5.19.0) [path][gem]" type="rubylib">
|
12
|
+
<properties>
|
13
|
+
<option name="version" value="4" />
|
14
|
+
</properties>
|
15
|
+
<CLASSES>
|
16
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/lib" />
|
17
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/test" />
|
18
|
+
</CLASSES>
|
19
|
+
<JAVADOC />
|
20
|
+
<SOURCES>
|
21
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/lib" />
|
22
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/test" />
|
23
|
+
</SOURCES>
|
24
|
+
<excluded>
|
25
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/test" />
|
26
|
+
</excluded>
|
27
|
+
</library>
|
28
|
+
</orderEntry>
|
29
|
+
<orderEntry type="module-library">
|
30
|
+
<library name="rake (v13.0.6) [path][gem]" type="rubylib">
|
31
|
+
<properties>
|
32
|
+
<option name="version" value="4" />
|
33
|
+
</properties>
|
34
|
+
<CLASSES>
|
35
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/doc" />
|
36
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/exe" />
|
37
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/lib" />
|
38
|
+
</CLASSES>
|
39
|
+
<JAVADOC />
|
40
|
+
<SOURCES>
|
41
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/doc" />
|
42
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/exe" />
|
43
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/lib" />
|
44
|
+
</SOURCES>
|
45
|
+
<excluded>
|
46
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/doc" />
|
47
|
+
<root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/exe" />
|
48
|
+
</excluded>
|
49
|
+
</library>
|
50
|
+
</orderEntry>
|
51
|
+
<orderEntry type="library" scope="PROVIDED" name="bundler (v2.4.19, rbenv: 3.1.4) [gem]" level="application" />
|
11
52
|
</component>
|
12
53
|
<component name="RakeTasksCache">
|
13
54
|
<option name="myRootTask">
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_loader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.2
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- src
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-10-07 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Web loader.
|
14
14
|
email:
|
@@ -36,6 +36,7 @@ files:
|
|
36
36
|
- lib/web_loader/utils.rb
|
37
37
|
- lib/web_loader/version.rb
|
38
38
|
- sig/web_loader.rbs
|
39
|
+
- test.sh
|
39
40
|
- web_loader.iml
|
40
41
|
homepage: https://srcw.net
|
41
42
|
licenses:
|
@@ -59,7 +60,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
59
60
|
- !ruby/object:Gem::Version
|
60
61
|
version: '0'
|
61
62
|
requirements: []
|
62
|
-
rubygems_version: 3.
|
63
|
+
rubygems_version: 3.3.26
|
63
64
|
signing_key:
|
64
65
|
specification_version: 4
|
65
66
|
summary: Web loader.
|