web_loader 2.2.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 803b7f81240d732a305705cc0fa46610781688ce19203f0b7d8710822b088558
4
- data.tar.gz: ec8ed1467ab473fa5fbae5a370304e3bb2281e525ecfc4f54c752828f49c1d00
3
+ metadata.gz: 1be67fa0e3b136ad7851bca7be3da284c076ccd520841cdceee7e8ff4d9865b8
4
+ data.tar.gz: bf7d0f89c6876c16429b12b1793647ca4e66857d650c0b296c75db3435451ab2
5
5
  SHA512:
6
- metadata.gz: 51c54de50e0faa9a4886cc3c84efbf88887003c12e8cafcc9cfa1ea9c5ae5beeb795a2795d08d48031dd373f6cb56de1e28116c7d0450a18b288c04e81e54e0f
7
- data.tar.gz: 62cd1ecd932a88fb51f919a5565833e5d14bbef94414acdcc55c90ab075fb387d0d98050753d05438bcaf3d8a371b731a2b146ac57cb98d4420a2b9ff97ffccc
6
+ metadata.gz: 8ce1618a7d007b1c58c026569787e1cd8c5a462c78d10a8416cba3886ae6a4e4b66773716837fa6f81710a25e042d6e10f557af39c959a42ea5c8cee7335f857
7
+ data.tar.gz: 17123c7a0df75669a6943e3fd0623cf289a03cc60c75b481a8d9f19458c07dc8bed071267a3a8481a877f15d14060d88b2742217de0913b20272addd0a4f7b07
data/CLAUDE.md ADDED
@@ -0,0 +1,67 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## コマンド
6
+
7
+ ```bash
8
+ # 依存関係のインストール
9
+ bundle install
10
+
11
+ # テスト実行(全テスト)
12
+ bundle exec rake test
13
+
14
+ # 単一テストファイルの実行
15
+ bundle exec ruby -Ilib -Itest test/test_utils.rb
16
+
17
+ # CLIツールの実行
18
+ bundle exec exe/wl [オプション] <URL>
19
+
20
+ # gemのビルド
21
+ bundle exec rake build
22
+
23
+ # gemのリリース(バージョン更新後)
24
+ bundle exec rake release
25
+ ```
26
+
27
+ ### CLIオプション
28
+
29
+ ```
30
+ -d DRIVER, --driver=DRIVER ドライバ指定: pureruby(デフォルト) | selenium
31
+ --disable-cache キャッシュ無効化
32
+ --user-agent=USERAGENT User-Agentヘッダの設定
33
+ -b, --binary バイナリファイルのダウンロード
34
+ -v, --verbose 詳細ログ出力
35
+ ```
36
+
37
+ ## アーキテクチャ
38
+
39
+ ### 処理フロー
40
+
41
+ ```
42
+ exe/wl → Downloader.run(argv) → Command#load(url) → Driver#fetch(url) → Response
43
+
44
+ Cache (./cache/)
45
+ ```
46
+
47
+ ### 主要クラス
48
+
49
+ - **`Downloader`** (`lib/web_loader/downloader.rb`): CLIエントリポイント。optparseでオプション解析し、ドライバとCommandを組み立てる。継承してカスタム`wait_proc`を定義できる。
50
+ - **`Command`** (`lib/web_loader/command.rb`): ロードのコアロジック。キャッシュ管理、リダイレクト追跡(最大10回)、リトライ(タイムアウト・429対応)を担当。
51
+ - **`Cache`** (`lib/web_loader/cache.rb`): URLのMD5ハッシュをファイル名に使い `./cache/` 配下に `.html` + `.yml` ペアで保存。デフォルト有効期限は1時間。
52
+ - **`Response`** (`lib/web_loader/response.rb`): ステータスコードのラッパー。`ok?`(2xx)、`redirect?`(3xx)、`rate_limited?`(429)を提供。
53
+
54
+ ### ドライバ
55
+
56
+ `BaseDriver` を継承して `fetch(url)` を実装する設計。
57
+
58
+ - **`HttpDriver`**: `Net::HTTP` を使用。文字コード変換(`Utils.toutf8`)を行う。SSL証明書検証は無効(`VERIFY_NONE`)。
59
+ - **`SeleniumDriver`**: ヘッドレスChromeを使用。JavaScriptレンダリングが必要なページ向け。`wait_proc` でカスタム待機条件を定義可能(未指定時は`wait_seconds`秒スリープ)。
60
+
61
+ ### 文字コード処理
62
+
63
+ `Utils#toutf8` はレスポンスのCharset指定 → metaタグ検出 → Kconvフォールバックの順で変換。Shift_JISはWindows-31Jとして扱う。
64
+
65
+ ### キャッシュ
66
+
67
+ デフォルトのキャッシュディレクトリは実行カレントディレクトリ直下の `./cache/`。`Command#cache_dir` で変更可能。
data/Gemfile.lock CHANGED
@@ -1,18 +1,18 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_loader (2.2.0)
4
+ web_loader (2.4.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  base64 (0.3.0)
10
10
  logger (1.7.0)
11
- minitest (5.26.2)
12
- rake (13.3.1)
11
+ minitest (5.27.0)
12
+ rake (13.4.2)
13
13
  rexml (3.4.4)
14
14
  rubyzip (3.2.2)
15
- selenium-webdriver (4.38.0)
15
+ selenium-webdriver (4.43.0)
16
16
  base64 (~> 0.2)
17
17
  logger (~> 1.4)
18
18
  rexml (~> 3.2, >= 3.2.5)
@@ -35,10 +35,9 @@ module WebLoader
35
35
  content
36
36
  end
37
37
 
38
- def self.write(dir, url, code, content)
38
+ def self.write(dir, url, code, content, driver: nil)
39
39
  header_path = header_filename(dir, url)
40
- # YAML.dump({"url" => url, "code" => code}, open(header_path, "w"))
41
- File.write(header_path, YAML.dump({ "url" => url, "code" => code }))
40
+ File.write(header_path, YAML.dump({ "url" => url, "code" => code, "driver" => driver }))
42
41
  content_path = content_filename(dir, url)
43
42
  File.write(content_path, content)
44
43
  end
@@ -36,8 +36,6 @@ module WebLoader
36
36
 
37
37
  # ドライバーのセットアップ
38
38
  @driver = driver
39
- @driver.user_agent = @user_agent
40
- @driver.binary = @binary
41
39
  end
42
40
 
43
41
  attr_reader :load_cache_page
@@ -67,15 +65,9 @@ module WebLoader
67
65
 
68
66
  ##### サーバーからロード
69
67
  log("Load server: #{url}")
70
- # uri = URI.parse(url)
71
- # http = Net::HTTP.new(uri.host, uri.port)
72
- # if uri.scheme == 'https'
73
- # http.use_ssl = true
74
- # http.verify_mode = OpenSSL::SSL::VERIFY_NONE
75
- # end
76
- # @response = nil
77
68
  begin
78
- # @response = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
69
+ @driver.user_agent = @user_agent
70
+ @driver.binary = @binary
79
71
  @response = @driver.fetch(url)
80
72
  rescue Net::ReadTimeout
81
73
  # タイムアウトした場合リトライ可能ならばsleepした後に再度ロード実行
@@ -92,7 +84,7 @@ module WebLoader
92
84
  body = @response.body
93
85
  if @use_cache || @always_write_cache
94
86
  log("Write cache: #{url}")
95
- Cache.write(@cache_dir, url, @response.status, body)
87
+ Cache.write(@cache_dir, url, @response.status, body, driver: @driver.driver_name)
96
88
  end
97
89
  result = body
98
90
  elsif response.redirect?
@@ -110,51 +102,7 @@ module WebLoader
110
102
  # それ以外は対応した例外を発生
111
103
  log("error #{url}", true)
112
104
  end
113
-
114
105
  result
115
-
116
- # ##### レスポンスの処理
117
- # result = nil
118
- # case @response
119
- # when Net::HTTPSuccess
120
- # # @responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
121
- # body = @response.body
122
- # unless @binary
123
- # # デフォルトでは ASCII-8BITが帰ってくる。
124
- # # Content-Typeのcharsetとみなす。
125
- # # https://bugs.ruby-lang.org/issues/2567
126
- # encoding = @response.type_params['charset']
127
- # body = toutf8(body, encoding)
128
- # end
129
- #
130
- # if @use_cache || @always_write_cache
131
- # log("Write cache: #{url}")
132
- # Cache.write(@cache_dir, url, @response.code, body)
133
- # end
134
- # result = body
135
- # when Net::HTTPRedirection
136
- # result = load(to_redirect_url(uri, @response['location']), redirect_count - 1)
137
- # # when Net::HTTPNotFound
138
- # # result = nil
139
- # when Net::HTTPTooManyRequests, Net::ReadTimeout
140
- # # 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
141
- # if retry_count > 0
142
- # sleep_for = 10
143
- # if @response.is_a?(Net::HTTPTooManyRequests)
144
- # # HTTPTooManyRequestsならばretry-afterで指定された値を取得。
145
- # sleep_for = @response.header['retry-after'].to_i + 10
146
- # log("Rate limit: #{uri} #{@response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
147
- # else
148
- # log("Unknown response: #{uri} #{@response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
149
- # end
150
- # sleep sleep_for
151
- # result = load(url, redirect_count , retry_count - 1)
152
- # end
153
- # else
154
- # # それ以外は対応した例外を発生
155
- # log("error #{url}", true)
156
- # end
157
- # result
158
106
  end
159
107
 
160
108
  private
@@ -18,6 +18,8 @@ module WebLoader
18
18
  drivers = ['pureruby', 'selenium']
19
19
  opt.on('-d DRIVER', '--driver=DRIVER', drivers, drivers.join("|") + "(default pureruby)") {|v| opts[:d] = v }
20
20
  opt.on("--disable-cache", "Disable cache") {|v| opts[:disable_cache] = v }
21
+ opt.on('--user-agent=USERAGENT', 'Set User-Agent header') {|v| opts[:user_agent] = v }
22
+ opt.on('-b', '--binary', 'Download binary files') {|v| opts[:binary] = v }
21
23
  opt.parse!(argv)
22
24
  if argv.empty?
23
25
  puts "Error: URL is required."
@@ -39,6 +41,12 @@ module WebLoader
39
41
  if @opts[:disable_cache]
40
42
  loader.use_cache = false
41
43
  end
44
+ if @opts[:user_agent]
45
+ loader.user_agent = @opts[:user_agent]
46
+ end
47
+ if @opts[:binary]
48
+ loader.binary = true
49
+ end
42
50
  loader.load(url)
43
51
  end
44
52
 
@@ -12,6 +12,10 @@ module WebLoader
12
12
  raise NotImplementedError, 'Subclasses must implement the fetch method'
13
13
  end
14
14
 
15
+ def driver_name
16
+ raise NotImplementedError, 'Subclasses must implement the driver_name method'
17
+ end
18
+
15
19
  end
16
20
  end
17
21
  end
@@ -8,6 +8,10 @@ module WebLoader
8
8
  module Drivers
9
9
  class HttpDriver < WebLoader::Drivers::BaseDriver
10
10
 
11
+ def driver_name
12
+ "http"
13
+ end
14
+
11
15
  def fetch(url)
12
16
  uri = URI.parse(url)
13
17
  http = Net::HTTP.new(uri.host, uri.port)
@@ -11,6 +11,10 @@ module WebLoader
11
11
  attr_accessor :wait_proc
12
12
  attr_accessor :wait_seconds
13
13
 
14
+ def driver_name
15
+ "selenium"
16
+ end
17
+
14
18
  def fetch(url)
15
19
  require 'selenium-webdriver'
16
20
 
@@ -38,7 +42,7 @@ module WebLoader
38
42
 
39
43
  content_type = driver.execute_script("return document.contentType;")
40
44
 
41
- body = @binary ? page_source.b : driver.page_source
45
+ body = @binary ? driver.page_source.b : driver.page_source
42
46
  response = WebLoader::Response.new(status: 200,
43
47
  headers: {
44
48
  'Content-Type' => content_type
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module WebLoader
4
- VERSION = "2.2.0"
4
+ VERSION = "2.4.0"
5
5
  end
data/web_loader.iml CHANGED
@@ -73,7 +73,7 @@
73
73
  </library>
74
74
  </orderEntry>
75
75
  <orderEntry type="module-library">
76
- <library name="minitest (v5.26.2) [path][gem]" type="rubylib">
76
+ <library name="minitest (v5.27.0) [path][gem]" type="rubylib">
77
77
  <properties>
78
78
  <option name="additionalInfo">
79
79
  <AdditionalInfo>
@@ -90,25 +90,25 @@
90
90
  <option value="lib" />
91
91
  </list>
92
92
  </option>
93
- <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2" />
94
- <option name="version" value="5.26.2" />
93
+ <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0" />
94
+ <option name="version" value="5.27.0" />
95
95
  </properties>
96
96
  <CLASSES>
97
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/lib" />
98
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/test" />
97
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/lib" />
98
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/test" />
99
99
  </CLASSES>
100
100
  <JAVADOC />
101
101
  <SOURCES>
102
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/lib" />
103
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/test" />
102
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/lib" />
103
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/test" />
104
104
  </SOURCES>
105
105
  <excluded>
106
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/test" />
106
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/test" />
107
107
  </excluded>
108
108
  </library>
109
109
  </orderEntry>
110
110
  <orderEntry type="module-library">
111
- <library name="rake (v13.3.1) [path][gem]" type="rubylib">
111
+ <library name="rake (v13.4.2) [path][gem]" type="rubylib">
112
112
  <properties>
113
113
  <option name="additionalInfo">
114
114
  <AdditionalInfo>
@@ -125,23 +125,23 @@
125
125
  <option value="lib" />
126
126
  </list>
127
127
  </option>
128
- <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1" />
129
- <option name="version" value="13.3.1" />
128
+ <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2" />
129
+ <option name="version" value="13.4.2" />
130
130
  </properties>
131
131
  <CLASSES>
132
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/doc" />
133
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/exe" />
134
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/lib" />
132
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/doc" />
133
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/exe" />
134
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/lib" />
135
135
  </CLASSES>
136
136
  <JAVADOC />
137
137
  <SOURCES>
138
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/doc" />
139
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/exe" />
140
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/lib" />
138
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/doc" />
139
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/exe" />
140
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/lib" />
141
141
  </SOURCES>
142
142
  <excluded>
143
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/doc" />
144
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/exe" />
143
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/doc" />
144
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/exe" />
145
145
  </excluded>
146
146
  </library>
147
147
  </orderEntry>
@@ -216,7 +216,7 @@
216
216
  </library>
217
217
  </orderEntry>
218
218
  <orderEntry type="module-library">
219
- <library name="selenium-webdriver (v4.38.0) [path][gem]" type="rubylib">
219
+ <library name="selenium-webdriver (v4.43.0) [path][gem]" type="rubylib">
220
220
  <properties>
221
221
  <option name="additionalInfo">
222
222
  <AdditionalInfo>
@@ -233,20 +233,20 @@
233
233
  <option value="lib" />
234
234
  </list>
235
235
  </option>
236
- <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0" />
237
- <option name="version" value="4.38.0" />
236
+ <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0" />
237
+ <option name="version" value="4.43.0" />
238
238
  </properties>
239
239
  <CLASSES>
240
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/bin" />
241
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/lib" />
240
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/bin" />
241
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/lib" />
242
242
  </CLASSES>
243
243
  <JAVADOC />
244
244
  <SOURCES>
245
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/bin" />
246
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/lib" />
245
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/bin" />
246
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/lib" />
247
247
  </SOURCES>
248
248
  <excluded>
249
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/bin" />
249
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/bin" />
250
250
  </excluded>
251
251
  </library>
252
252
  </orderEntry>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_loader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.0
4
+ version: 2.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - src
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-11-28 00:00:00.000000000 Z
11
+ date: 2026-05-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Web loader.
14
14
  email:
@@ -28,6 +28,7 @@ files:
28
28
  - ".idea/misc.xml"
29
29
  - ".idea/modules.xml"
30
30
  - ".idea/vcs.xml"
31
+ - CLAUDE.md
31
32
  - CODE_OF_CONDUCT.md
32
33
  - Gemfile
33
34
  - Gemfile.lock