web_loader 0.9.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bedf3023b91edc76a9176df7d866e5598679dfda140a6c118595819b5007ded0
4
- data.tar.gz: 2850c8535b50e8848cfef5d4e6bc764287ddc85e5a1cd22323703d061091c81a
3
+ metadata.gz: 1055ef0181a8f0e5857172c56783111bff1d102686f006c001e5b4b2627e65cb
4
+ data.tar.gz: da2499a388951ecc7d0c525442b847da0be5e32ab2b478dd8ae2338e6e0772de
5
5
  SHA512:
6
- metadata.gz: 36acc87915c495d51c6fae31605afb45c8c7f0fb7faa6be94d81fef48226222936238ea615ea2fed7a8cb89b0f909ab42768838cdf6ff11d568ff8245637f78b
7
- data.tar.gz: cf572416db59dc13ae4f253bd54aa6a51add2be5c3cb3ad401bdccd9e8f04f07df22ba14f34566d345828e0fe7f64a61ee43294b75532b06dffff0842edcde37
6
+ metadata.gz: 2e7e1b93c5af2ad6280843558fc89aba4014313c2fbb2999d0148749e20966bf05dcadc1e48bd8e46791feca2c7d78c6c3b99977778da667b36bd32964046ae6
7
+ data.tar.gz: 319950bb1a184a361e215805aa49b7dfa661f28bf3d98b28d6baa3124627dd2a4c217762409c75bb375f8e12494c117863b4358319fab02c4682349a79eeeaaf
data/.idea/misc.xml CHANGED
@@ -1,4 +1,3 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
1
  <project version="4">
3
2
  <component name="ProjectRootManager" version="2" project-jdk-name="rbenv: 3.1.4" project-jdk-type="RUBY_SDK">
4
3
  <output url="file://$PROJECT_DIR$/out" />
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_loader (0.9.2)
4
+ web_loader (1.1.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -18,4 +18,4 @@ DEPENDENCIES
18
18
  web_loader!
19
19
 
20
20
  BUNDLED WITH
21
- 2.4.9
21
+ 2.4.19
@@ -9,6 +9,9 @@ module WebLoader
9
9
 
10
10
  USER_AGENT = "WebLoader"
11
11
  CACHE_DIR = './cache'
12
+ DEFAULT_RETRY = 3
13
+ DEFAULT_REDIRECT = 10
14
+ DEFAULT_SLEEP = 10
12
15
 
13
16
  def self.save_image(url, file)
14
17
  # キャッシュせず単に保存する
@@ -25,28 +28,51 @@ module WebLoader
25
28
  @cache_dir = File.expand_path(CACHE_DIR)
26
29
  @user_agent = "#{USER_AGENT}/#{VERSION}"
27
30
  @binary = false
31
+ @verbose = false
28
32
  end
29
33
 
30
34
  attr_reader :load_cache_page
31
- attr_accessor :use_cache, :cache_dir, :binary, :user_agent
35
+ attr_accessor :use_cache, :cache_dir, :binary, :user_agent, :verbose
32
36
 
33
- def load(url, limit = 10)
34
- raise ArgumentError, 'HTTP redirect too deep' if limit == 0
37
+ def load_retry(url, retry_count = DEFAULT_RETRY)
38
+ load(url, DEFAULT_REDIRECT, retry_count)
39
+ end
40
+
41
+ def load(url, redirect_count = DEFAULT_REDIRECT, retry_count = 0)
42
+ raise ArgumentError, 'HTTP redirect too deep' if redirect_count == 0
43
+ log("Load: #{url}", @verbose)
35
44
 
45
+ ##### キャッシュの読み込み
36
46
  @load_cache_page = false
37
47
  content = try_load_cache(url)
38
48
  if content
49
+ log("Load cache: #{url}", @verbose)
39
50
  @load_cache_page = true
40
51
  return content
41
52
  end
42
53
 
54
+ ##### サーバーからロード
55
+ log("Load server: #{url}", @verbose)
43
56
  uri = URI.parse(url)
44
57
  http = Net::HTTP.new(uri.host, uri.port)
45
58
  if uri.scheme == 'https'
46
59
  http.use_ssl = true
47
60
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
48
61
  end
49
- response, = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
62
+ response = nil
63
+ begin
64
+ response = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
65
+ rescue Net::ReadTimeout
66
+ # タイムアウトした場合リトライ可能ならばsleepした後に再度ロード実行
67
+ log("Read timeout: #{url}", @verbose)
68
+ if retry_count > 0
69
+ sleep DEFAULT_SLEEP
70
+ return load(url, redirect_count , retry_count - 1)
71
+ end
72
+ end
73
+
74
+ ##### レスポンスの処理
75
+ result = nil
50
76
  case response
51
77
  when Net::HTTPSuccess
52
78
  # responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
@@ -58,15 +84,33 @@ module WebLoader
58
84
  encoding = response.type_params['charset']
59
85
  body = toutf8(body, encoding)
60
86
  end
61
- Cache.write(@cache_dir, url, response.code, body) if @use_cache
62
- return body
87
+ if @use_cache
88
+ log("Write cache: #{url}", @verbose)
89
+ Cache.write(@cache_dir, url, response.code, body)
90
+ end
91
+ result = body
63
92
  when Net::HTTPRedirection
64
- load(to_redirect_url(uri, response['location']), limit - 1)
93
+ result = load(to_redirect_url(uri, response['location']), redirect_count - 1)
65
94
  else
66
- puts "error #{url}"
95
+ # 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
96
+ if retry_count > 0
97
+ sleep_for = 10
98
+ if response.is_a?(Net::HTTPTooManyRequests)
99
+ # HTTPTooManyRequestsならばretry-afterで指定された値を取得。
100
+ sleep_for = response.header['retry-after'].to_i + 10
101
+ log("Rate limit: #{uri} #{response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).", @verbose)
102
+ else
103
+ log("Unknown response: #{uri} #{response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).", @verbose)
104
+ end
105
+ sleep sleep_for
106
+ result = load(url, redirect_count , retry_count - 1)
107
+ end
108
+
67
109
  # それ以外は対応した例外を発生
110
+ log("error #{url}", true)
68
111
  response.value
69
112
  end
113
+ result
70
114
  end
71
115
 
72
116
  private
@@ -76,5 +120,8 @@ module WebLoader
76
120
  Cache.load_content(@cache_dir, url)
77
121
  end
78
122
 
123
+ def log(msg, put_log)
124
+ puts msg if put_log
125
+ end
79
126
  end
80
127
  end
@@ -1,34 +1,36 @@
1
1
  module WebLoader
2
2
  module Utils
3
3
  UTF_8 = 'UTF-8'
4
- def toutf8(str, response_encoding)
4
+
5
+ def toutf8_charset(str, charset)
6
+ # charsetが指定されていない場合はnil
7
+ return nil if charset.to_s.length == 0
8
+ # 文字列のcharsetを変更する
9
+ str.force_encoding(charset)
10
+ # force_encodingが失敗した場合はnil
11
+ return nil unless str.valid_encoding?
12
+
13
+ result = nil
14
+ if charset =~ /#{UTF_8}/i
15
+ result = str
16
+ else
17
+ # エンコーディングがUTF8じゃない場合変換する
18
+ result = str.encode(UTF_8, invalid: :replace, undef: :replace)
19
+ end
20
+ result
21
+ end
22
+
23
+ def toutf8(str, charset)
5
24
  # 2022/04/04(月)
6
25
  # GITHUBのアポストロフィ(&#x2019 U+2019)が文字化け問題に対処するために新設。
7
26
  # 原因は直接Kconv.toutf8にresponse.bodyをわたしていたことなので(Kconvのguessが失敗していたと思われる)、
8
27
  # response.type_paramsを見てそれにforce_encodingすることで対処する。渡されているcharsetとWebページの文字コードが一致していればこれで問題はないはず。
9
- org_str = str.dup
10
- result = str
28
+ result = nil
11
29
  begin
12
- if response_encoding.to_s.length > 0 # nilでないかつ長さが0以上
13
- # responseで指定された文字コードであるとみなす
14
- str.force_encoding(response_encoding)
15
- if str.valid_encoding?
16
- # 指定された文字コードとみなせた場合
17
- if response_encoding != UTF_8
18
- # エンコーディングがUTF8じゃない場合返還する
19
- result = str.encode(UTF_8, invalid: :replace, undef: :replace)
20
- else
21
- # UTF8の場合そのまま
22
- result = str
23
- end
24
- else
25
- # 指定された文字コードとみなせない場合元の文字列を返す
26
- result = org_str
27
- end
28
- else
29
- # responseで文字コードが指定されていない場合Kconvを使用
30
- result = Kconv.toutf8(str)
31
- end
30
+ # 指定されたcharsetで変換する
31
+ result = toutf8_charset(str.dup, charset)
32
+ # charsetによる変換が失敗した場合Kconvを使用
33
+ result = Kconv.toutf8(str) if result.nil?
32
34
  rescue => ex
33
35
  puts ex.message
34
36
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module WebLoader
4
- VERSION = "0.9.2"
4
+ VERSION = "1.1.0"
5
5
  end
data/test.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/sh
2
+
3
+ bundle exec rake test
4
+
data/web_loader.iml CHANGED
@@ -7,7 +7,48 @@
7
7
  </content>
8
8
  <orderEntry type="inheritedJdk" />
9
9
  <orderEntry type="sourceFolder" forTests="false" />
10
- <orderEntry type="library" scope="PROVIDED" name="rake (v13.0.6, rbenv: 3.1.4) [gem]" level="application" />
10
+ <orderEntry type="module-library">
11
+ <library name="minitest (v5.19.0) [path][gem]" type="rubylib">
12
+ <properties>
13
+ <option name="version" value="4" />
14
+ </properties>
15
+ <CLASSES>
16
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/lib" />
17
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/test" />
18
+ </CLASSES>
19
+ <JAVADOC />
20
+ <SOURCES>
21
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/lib" />
22
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/test" />
23
+ </SOURCES>
24
+ <excluded>
25
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/minitest-5.19.0/test" />
26
+ </excluded>
27
+ </library>
28
+ </orderEntry>
29
+ <orderEntry type="module-library">
30
+ <library name="rake (v13.0.6) [path][gem]" type="rubylib">
31
+ <properties>
32
+ <option name="version" value="4" />
33
+ </properties>
34
+ <CLASSES>
35
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/doc" />
36
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/exe" />
37
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/lib" />
38
+ </CLASSES>
39
+ <JAVADOC />
40
+ <SOURCES>
41
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/doc" />
42
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/exe" />
43
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/lib" />
44
+ </SOURCES>
45
+ <excluded>
46
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/doc" />
47
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.1.0/gems/rake-13.0.6/exe" />
48
+ </excluded>
49
+ </library>
50
+ </orderEntry>
51
+ <orderEntry type="library" scope="PROVIDED" name="bundler (v2.4.19, rbenv: 3.1.4) [gem]" level="application" />
11
52
  </component>
12
53
  <component name="RakeTasksCache">
13
54
  <option name="myRootTask">
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_loader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.2
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - src
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-08-01 00:00:00.000000000 Z
11
+ date: 2023-10-07 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Web loader.
14
14
  email:
@@ -36,6 +36,7 @@ files:
36
36
  - lib/web_loader/utils.rb
37
37
  - lib/web_loader/version.rb
38
38
  - sig/web_loader.rbs
39
+ - test.sh
39
40
  - web_loader.iml
40
41
  homepage: https://srcw.net
41
42
  licenses:
@@ -59,7 +60,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
59
60
  - !ruby/object:Gem::Version
60
61
  version: '0'
61
62
  requirements: []
62
- rubygems_version: 3.2.33
63
+ rubygems_version: 3.3.26
63
64
  signing_key:
64
65
  specification_version: 4
65
66
  summary: Web loader.