web_loader 0.9.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bedf3023b91edc76a9176df7d866e5598679dfda140a6c118595819b5007ded0
4
- data.tar.gz: 2850c8535b50e8848cfef5d4e6bc764287ddc85e5a1cd22323703d061091c81a
3
+ metadata.gz: 1055ef0181a8f0e5857172c56783111bff1d102686f006c001e5b4b2627e65cb
4
+ data.tar.gz: da2499a388951ecc7d0c525442b847da0be5e32ab2b478dd8ae2338e6e0772de
5
5
  SHA512:
6
- metadata.gz: 36acc87915c495d51c6fae31605afb45c8c7f0fb7faa6be94d81fef48226222936238ea615ea2fed7a8cb89b0f909ab42768838cdf6ff11d568ff8245637f78b
7
- data.tar.gz: cf572416db59dc13ae4f253bd54aa6a51add2be5c3cb3ad401bdccd9e8f04f07df22ba14f34566d345828e0fe7f64a61ee43294b75532b06dffff0842edcde37
6
+ metadata.gz: 2e7e1b93c5af2ad6280843558fc89aba4014313c2fbb2999d0148749e20966bf05dcadc1e48bd8e46791feca2c7d78c6c3b99977778da667b36bd32964046ae6
7
+ data.tar.gz: 319950bb1a184a361e215805aa49b7dfa661f28bf3d98b28d6baa3124627dd2a4c217762409c75bb375f8e12494c117863b4358319fab02c4682349a79eeeaaf
data/.idea/misc.xml CHANGED
@@ -1,4 +1,3 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
1
  <project version="4">
3
2
  <component name="ProjectRootManager" version="2" project-jdk-name="rbenv: 3.1.4" project-jdk-type="RUBY_SDK">
4
3
  <output url="file://$PROJECT_DIR$/out" />
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_loader (0.9.2)
4
+ web_loader (1.1.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -18,4 +18,4 @@ DEPENDENCIES
18
18
  web_loader!
19
19
 
20
20
  BUNDLED WITH
21
- 2.4.9
21
+ 2.4.19
@@ -9,6 +9,9 @@ module WebLoader
9
9
 
10
10
  USER_AGENT = "WebLoader"
11
11
  CACHE_DIR = './cache'
12
+ DEFAULT_RETRY = 3
13
+ DEFAULT_REDIRECT = 10
14
+ DEFAULT_SLEEP = 10
12
15
 
13
16
  def self.save_image(url, file)
14
17
  # キャッシュせず単に保存する
@@ -25,28 +28,51 @@ module WebLoader
25
28
  @cache_dir = File.expand_path(CACHE_DIR)
26
29
  @user_agent = "#{USER_AGENT}/#{VERSION}"
27
30
  @binary = false
31
+ @verbose = false
28
32
  end
29
33
 
30
34
  attr_reader :load_cache_page
31
- attr_accessor :use_cache, :cache_dir, :binary, :user_agent
35
+ attr_accessor :use_cache, :cache_dir, :binary, :user_agent, :verbose
32
36
 
33
- def load(url, limit = 10)
34
- raise ArgumentError, 'HTTP redirect too deep' if limit == 0
37
+ def load_retry(url, retry_count = DEFAULT_RETRY)
38
+ load(url, DEFAULT_REDIRECT, retry_count)
39
+ end
40
+
41
+ def load(url, redirect_count = DEFAULT_REDIRECT, retry_count = 0)
42
+ raise ArgumentError, 'HTTP redirect too deep' if redirect_count == 0
43
+ log("Load: #{url}", @verbose)
35
44
 
45
+ ##### キャッシュの読み込み
36
46
  @load_cache_page = false
37
47
  content = try_load_cache(url)
38
48
  if content
49
+ log("Load cache: #{url}", @verbose)
39
50
  @load_cache_page = true
40
51
  return content
41
52
  end
42
53
 
54
+ ##### サーバーからロード
55
+ log("Load server: #{url}", @verbose)
43
56
  uri = URI.parse(url)
44
57
  http = Net::HTTP.new(uri.host, uri.port)
45
58
  if uri.scheme == 'https'
46
59
  http.use_ssl = true
47
60
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
48
61
  end
49
- response, = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
62
+ response = nil
63
+ begin
64
+ response = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
65
+ rescue Net::ReadTimeout
66
+ # タイムアウトした場合リトライ可能ならばsleepした後に再度ロード実行
67
+ log("Read timeout: #{url}", @verbose)
68
+ if retry_count > 0
69
+ sleep DEFAULT_SLEEP
70
+ return load(url, redirect_count , retry_count - 1)
71
+ end
72
+ end
73
+
74
+ ##### レスポンスの処理
75
+ result = nil
50
76
  case response
51
77
  when Net::HTTPSuccess
52
78
  # responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
@@ -58,15 +84,33 @@ module WebLoader
58
84
  encoding = response.type_params['charset']
59
85
  body = toutf8(body, encoding)
60
86
  end
61
- Cache.write(@cache_dir, url, response.code, body) if @use_cache
62
- return body
87
+ if @use_cache
88
+ log("Write cache: #{url}", @verbose)
89
+ Cache.write(@cache_dir, url, response.code, body)
90
+ end
91
+ result = body
63
92
  when Net::HTTPRedirection
64
- load(to_redirect_url(uri, response['location']), limit - 1)
93
+ result = load(to_redirect_url(uri, response['location']), redirect_count - 1)
65
94
  else
66
- puts "error #{url}"
95
+ # 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
96
+ if retry_count > 0
97
+ sleep_for = 10
98
+ if response.is_a?(Net::HTTPTooManyRequests)
99
+ # HTTPTooManyRequestsならばretry-afterで指定された値を取得。
100
+ sleep_for = response.header['retry-after'].to_i + 10
101
+ log("Rate limit: #{uri} #{response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).", @verbose)
102
+ else
103
+ log("Unknown response: #{uri} #{response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).", @verbose)
104
+ end
105
+ sleep sleep_for
106
+ result = load(url, redirect_count , retry_count - 1)
107
+ end
108
+
67
109
  # それ以外は対応した例外を発生
110
+ log("error #{url}", true)
68
111
  response.value
69
112
  end
113
+ result
70
114
  end
71
115
 
72
116
  private
@@ -76,5 +120,8 @@ module WebLoader
76
120
  Cache.load_content(@cache_dir, url)
77
121
  end
78
122
 
123
+ def log(msg, put_log)
124
+ puts msg if put_log
125
+ end
79
126
  end
80
127
  end
data/lib/web_loader/utils.rb CHANGED
@@ -1,34 +1,36 @@
1
1
  module WebLoader
2
2
  module Utils
3
3
  UTF_8 = 'UTF-8'
4
- def toutf8(str, response_encoding)
4
+
5
+ def toutf8_charset(str, charset)
6
+ # charsetが指定されていない場合はnil
7
+ return nil if charset.to_s.length == 0
8
+ # 文字列のcharsetを変更する
9
+ str.force_encoding(charset)
10
+ # force_encodingが失敗した場合はnil
11
+ return nil unless str.valid_encoding?
12
+
13
+ result = nil
14
+ if charset =~ /#{UTF_8}/i
15
+ result = str
16
+ else
17
+ # エンコーディングがUTF8じゃない場合変換する
18
+ result = str.encode(UTF_8, invalid: :replace, undef: :replace)
19
+ end
20
+ result
21
+ end
22
+
23
+ def toutf8(str, charset)
5
24
  # 2022/04/04(月)
6
25
  # GITHUBのアポストロフィ(&#x2019 U+2019)が文字化け問題に対処するために新設。
7
26
  # 原因は直接Kconv.toutf8にresponse.bodyをわたしていたことなので(Kconvのguessが失敗していたと思われる)、
8
27
  # response.type_paramsを見てそれにforce_encodingすることで対処する。渡されているcharsetとWebページの文字コードが一致していればこれで問題はないはず。
9
- org_str = str.dup
10
- result = str
28
+ result = nil
11
29
  begin
12
- if response_encoding.to_s.length > 0 # nilでないかつ長さが0以上
13
- # responseで指定された文字コードであるとみなす
14
- str.force_encoding(response_encoding)
15
- if str.valid_encoding?
16
- # 指定された文字コードとみなせた場合
17
- if response_encoding != UTF_8
18
- # エンコーディングがUTF8じゃない場合返還する
19
- result = str.encode(UTF_8, invalid: :replace, undef: :replace)
20
- else
21
- # UTF8の場合そのまま
22
- result = str
23
- end
24
- else
25
- # 指定された文字コードとみなせない場合元の文字列を返す
26
- result = org_str
27
- end
28
- else
29
- # responseで文字コードが指定されていない場合Kconvを使用
30
- result = Kconv.toutf8(str)
31
- end
30
+ # 指定されたcharsetで変換する
31
+ result = toutf8_charset(str.dup, charset)
32
+ # charsetによる変換が失敗した場合Kconvを使用
33
+ result = Kconv.toutf8(str) if result.nil?
32
34
  rescue => ex
33
35
  puts ex.message
34
36
  end
data/lib/web_loader/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module WebLoader
4
- VERSION = "0.9.2"
4
+ VERSION = "1.1.0"
5
5
  end
data/test.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/sh
2
+
3
+ bundle exec rake test
4
+
data/web_loader.iml CHANGED
@@ -7,7 +7,48 @@
7
7
  </content>
8
8
  <orderEntry type="inheritedJdk" />
9
9
  <orderEntry type="sourceFolder" forTests="false" />
10
- <orderEntry type="library" scope="PROVIDED" name="rake (v13.0.6, rbenv: 3.1.4) [gem]" level="application" />
10
+ <orderEntry type="module-library">
11
+ <library name="minitest (v5.19.0) [path][gem]" type="rubylib">
12
+ <properties>
13
+ <option name="version" value="4" />
14
+ </properties>
15
+ <CLASSES>
16
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/lib" />
17
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/test" />
18
+ </CLASSES>
19
+ <JAVADOC />
20
+ <SOURCES>
21
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/lib" />
22
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/test" />
23
+ </SOURCES>
24
+ <excluded>
25
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/test" />
26
+ </excluded>
27
+ </library>
28
+ </orderEntry>
29
+ <orderEntry type="module-library">
30
+ <library name="rake (v13.0.6) [path][gem]" type="rubylib">
31
+ <properties>
32
+ <option name="version" value="4" />
33
+ </properties>
34
+ <CLASSES>
35
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/doc" />
36
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/exe" />
37
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/lib" />
38
+ </CLASSES>
39
+ <JAVADOC />
40
+ <SOURCES>
41
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/doc" />
42
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/exe" />
43
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/lib" />
44
+ </SOURCES>
45
+ <excluded>
46
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/doc" />
47
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/exe" />
48
+ </excluded>
49
+ </library>
50
+ </orderEntry>
51
+ <orderEntry type="library" scope="PROVIDED" name="bundler (v2.4.19, rbenv: 3.1.4) [gem]" level="application" />
11
52
  </component>
12
53
  <component name="RakeTasksCache">
13
54
  <option name="myRootTask">
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_loader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.2
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - src
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-08-01 00:00:00.000000000 Z
11
+ date: 2023-10-07 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Web loader.
14
14
  email:
@@ -36,6 +36,7 @@ files:
36
36
  - lib/web_loader/utils.rb
37
37
  - lib/web_loader/version.rb
38
38
  - sig/web_loader.rbs
39
+ - test.sh
39
40
  - web_loader.iml
40
41
  homepage: https://srcw.net
41
42
  licenses:
@@ -59,7 +60,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
59
60
  - !ruby/object:Gem::Version
60
61
  version: '0'
61
62
  requirements: []
62
- rubygems_version: 3.2.33
63
+ rubygems_version: 3.3.26
63
64
  signing_key:
64
65
  specification_version: 4
65
66
  summary: Web loader.