web_loader 2.3.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 03a86b4f610a1575be326e19740897204b75e2dbe41e1eac56ee1c98d9c44558
4
- data.tar.gz: b83a8f147007e94e4fac89abd4297066727dfe6ebf0a68052569f97cece0f223
3
+ metadata.gz: b658190fd45458df29b8ada480231f66bd00b3344068872420043e2c6b458160
4
+ data.tar.gz: 31e377c20c88f61003a7d55d24fc7141265bbaff5b5b17fbe7db6b3101c62094
5
5
  SHA512:
6
- metadata.gz: 27395f10236456d0780ee08c28d5b1c9fd447acab50261732eb24c4bb88011eb823381d01005ac6faf4d5e580d85dd1e16c69038c4ca1f4d3ed257613aeb1daf
7
- data.tar.gz: 24a7b60226b63fcc7c1fda0a44ca11fbb14d10176936f93b0684fc0866289cbae73058fc7c4f485579fa83850520d6f9cc13112cddaff60d733e009b06ded5cc
6
+ metadata.gz: f49f0daaac991f65ad9cabde30e85cf5e21cdcef3c603a583d534346ba0aeef21c04e871cb2dfcefa1cc3e2355280f8ff2b30b8a53036f4131d48987474d3b58
7
+ data.tar.gz: 5adb8bb4988825ad6dcb0b91b536dab67e9aecfe26a3bc044e63cb266d5529b086b337f544e4cf63391be9617a5769fb323217cf3a9ab272d9cf7358f9a8aa86
data/CLAUDE.md ADDED
@@ -0,0 +1,67 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## コマンド
6
+
7
+ ```bash
8
+ # 依存関係のインストール
9
+ bundle install
10
+
11
+ # テスト実行(全テスト)
12
+ bundle exec rake test
13
+
14
+ # 単一テストファイルの実行
15
+ bundle exec ruby -Ilib -Itest test/test_utils.rb
16
+
17
+ # CLIツールの実行
18
+ bundle exec exe/wl [オプション] <URL>
19
+
20
+ # gemのビルド
21
+ bundle exec rake build
22
+
23
+ # gemのリリース(バージョン更新後)
24
+ bundle exec rake release
25
+ ```
26
+
27
+ ### CLIオプション
28
+
29
+ ```
30
+ -d DRIVER, --driver=DRIVER ドライバ指定: pureruby(デフォルト) | selenium
31
+ --disable-cache キャッシュ無効化
32
+ --user-agent=USERAGENT User-Agentヘッダの設定
33
+ -b, --binary バイナリファイルのダウンロード
34
+ -v, --verbose 詳細ログ出力
35
+ ```
36
+
37
+ ## アーキテクチャ
38
+
39
+ ### 処理フロー
40
+
41
+ ```
42
+ exe/wl → Downloader.run(argv) → Command#load(url) → Driver#fetch(url) → Response
43
+
44
+ Cache (./cache/)
45
+ ```
46
+
47
+ ### 主要クラス
48
+
49
+ - **`Downloader`** (`lib/web_loader/downloader.rb`): CLIエントリポイント。optparseでオプション解析し、ドライバとCommandを組み立てる。継承してカスタム`wait_proc`を定義できる。
50
+ - **`Command`** (`lib/web_loader/command.rb`): ロードのコアロジック。キャッシュ管理、リダイレクト追跡(最大10回)、リトライ(タイムアウト・429対応)を担当。
51
+ - **`Cache`** (`lib/web_loader/cache.rb`): URLのMD5ハッシュをファイル名に使い `./cache/` 配下に `.html` + `.yml` ペアで保存。デフォルト有効期限は1時間。
52
+ - **`Response`** (`lib/web_loader/response.rb`): ステータスコードのラッパー。`ok?`(2xx)、`redirect?`(3xx)、`rate_limited?`(429)を提供。
53
+
54
+ ### ドライバ
55
+
56
+ `BaseDriver` を継承して `fetch(url)` を実装する設計。
57
+
58
+ - **`HttpDriver`**: `Net::HTTP` を使用。文字コード変換(`Utils.toutf8`)を行う。SSL証明書検証は無効(`VERIFY_NONE`)。
59
+ - **`SeleniumDriver`**: ヘッドレスChromeを使用。JavaScriptレンダリングが必要なページ向け。`wait_proc` でカスタム待機条件を定義可能(未指定時は`wait_seconds`秒スリープ)。
60
+
61
+ ### 文字コード処理
62
+
63
+ `Utils#toutf8` はレスポンスのCharset指定 → metaタグ検出 → Kconvフォールバックの順で変換。Shift_JISはWindows-31Jとして扱う。
64
+
65
+ ### キャッシュ
66
+
67
+ デフォルトのキャッシュディレクトリは実行カレントディレクトリ直下の `./cache/`。`Command#cache_dir` で変更可能。
data/Gemfile.lock CHANGED
@@ -1,18 +1,18 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_loader (2.3.0)
4
+ web_loader (2.5.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  base64 (0.3.0)
10
10
  logger (1.7.0)
11
- minitest (5.26.2)
12
- rake (13.3.1)
11
+ minitest (5.27.0)
12
+ rake (13.4.2)
13
13
  rexml (3.4.4)
14
14
  rubyzip (3.2.2)
15
- selenium-webdriver (4.38.0)
15
+ selenium-webdriver (4.43.0)
16
16
  base64 (~> 0.2)
17
17
  logger (~> 1.4)
18
18
  rexml (~> 3.2, >= 3.2.5)
@@ -35,10 +35,9 @@ module WebLoader
35
35
  content
36
36
  end
37
37
 
38
- def self.write(dir, url, code, content)
38
+ def self.write(dir, url, code, content, driver: nil)
39
39
  header_path = header_filename(dir, url)
40
- # YAML.dump({"url" => url, "code" => code}, open(header_path, "w"))
41
- File.write(header_path, YAML.dump({ "url" => url, "code" => code }))
40
+ File.write(header_path, YAML.dump({ "url" => url, "code" => code, "driver" => driver }))
42
41
  content_path = content_filename(dir, url)
43
42
  File.write(content_path, content)
44
43
  end
@@ -84,7 +84,7 @@ module WebLoader
84
84
  body = @response.body
85
85
  if @use_cache || @always_write_cache
86
86
  log("Write cache: #{url}")
87
- Cache.write(@cache_dir, url, @response.status, body)
87
+ Cache.write(@cache_dir, url, @response.status, body, driver: @driver.driver_name)
88
88
  end
89
89
  result = body
90
90
  elsif response.redirect?
@@ -20,11 +20,18 @@ module WebLoader
20
20
  opt.on("--disable-cache", "Disable cache") {|v| opts[:disable_cache] = v }
21
21
  opt.on('--user-agent=USERAGENT', 'Set User-Agent header') {|v| opts[:user_agent] = v }
22
22
  opt.on('-b', '--binary', 'Download binary files') {|v| opts[:binary] = v }
23
- opt.parse!(argv)
23
+ opt.on('--cache-dir=DIR', 'Cache directory (default ./cache)') {|v| opts[:cache_dir] = v }
24
+ begin
25
+ opt.parse!(argv)
26
+ rescue OptionParser::ParseError => e
27
+ warn "Error: #{e.message}"
28
+ puts opt.help
29
+ exit 1
30
+ end
24
31
  if argv.empty?
25
- puts "Error: URL is required."
32
+ warn "Error: URL is required."
26
33
  puts opt.help
27
- exit
34
+ exit 1
28
35
  end
29
36
  command = self.new(opts)
30
37
  url = argv[0]
@@ -47,6 +54,9 @@ module WebLoader
47
54
  if @opts[:binary]
48
55
  loader.binary = true
49
56
  end
57
+ if @opts[:cache_dir]
58
+ loader.cache_dir = File.expand_path(@opts[:cache_dir])
59
+ end
50
60
  loader.load(url)
51
61
  end
52
62
 
@@ -12,6 +12,10 @@ module WebLoader
12
12
  raise NotImplementedError, 'Subclasses must implement the fetch method'
13
13
  end
14
14
 
15
+ def driver_name
16
+ raise NotImplementedError, 'Subclasses must implement the driver_name method'
17
+ end
18
+
15
19
  end
16
20
  end
17
21
  end
@@ -8,6 +8,10 @@ module WebLoader
8
8
  module Drivers
9
9
  class HttpDriver < WebLoader::Drivers::BaseDriver
10
10
 
11
+ def driver_name
12
+ "pureruby"
13
+ end
14
+
11
15
  def fetch(url)
12
16
  uri = URI.parse(url)
13
17
  http = Net::HTTP.new(uri.host, uri.port)
@@ -11,6 +11,10 @@ module WebLoader
11
11
  attr_accessor :wait_proc
12
12
  attr_accessor :wait_seconds
13
13
 
14
+ def driver_name
15
+ "selenium"
16
+ end
17
+
14
18
  def fetch(url)
15
19
  require 'selenium-webdriver'
16
20
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module WebLoader
4
- VERSION = "2.3.0"
4
+ VERSION = "2.5.0"
5
5
  end
data/web_loader.iml CHANGED
@@ -73,7 +73,7 @@
73
73
  </library>
74
74
  </orderEntry>
75
75
  <orderEntry type="module-library">
76
- <library name="minitest (v5.26.2) [path][gem]" type="rubylib">
76
+ <library name="minitest (v5.27.0) [path][gem]" type="rubylib">
77
77
  <properties>
78
78
  <option name="additionalInfo">
79
79
  <AdditionalInfo>
@@ -90,25 +90,25 @@
90
90
  <option value="lib" />
91
91
  </list>
92
92
  </option>
93
- <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2" />
94
- <option name="version" value="5.26.2" />
93
+ <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0" />
94
+ <option name="version" value="5.27.0" />
95
95
  </properties>
96
96
  <CLASSES>
97
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/lib" />
98
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/test" />
97
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/lib" />
98
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/test" />
99
99
  </CLASSES>
100
100
  <JAVADOC />
101
101
  <SOURCES>
102
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/lib" />
103
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/test" />
102
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/lib" />
103
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/test" />
104
104
  </SOURCES>
105
105
  <excluded>
106
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.26.2/test" />
106
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/minitest-5.27.0/test" />
107
107
  </excluded>
108
108
  </library>
109
109
  </orderEntry>
110
110
  <orderEntry type="module-library">
111
- <library name="rake (v13.3.1) [path][gem]" type="rubylib">
111
+ <library name="rake (v13.4.2) [path][gem]" type="rubylib">
112
112
  <properties>
113
113
  <option name="additionalInfo">
114
114
  <AdditionalInfo>
@@ -125,23 +125,23 @@
125
125
  <option value="lib" />
126
126
  </list>
127
127
  </option>
128
- <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1" />
129
- <option name="version" value="13.3.1" />
128
+ <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2" />
129
+ <option name="version" value="13.4.2" />
130
130
  </properties>
131
131
  <CLASSES>
132
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/doc" />
133
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/exe" />
134
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/lib" />
132
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/doc" />
133
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/exe" />
134
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/lib" />
135
135
  </CLASSES>
136
136
  <JAVADOC />
137
137
  <SOURCES>
138
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/doc" />
139
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/exe" />
140
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/lib" />
138
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/doc" />
139
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/exe" />
140
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/lib" />
141
141
  </SOURCES>
142
142
  <excluded>
143
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/doc" />
144
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.3.1/exe" />
143
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/doc" />
144
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/rake-13.4.2/exe" />
145
145
  </excluded>
146
146
  </library>
147
147
  </orderEntry>
@@ -216,7 +216,7 @@
216
216
  </library>
217
217
  </orderEntry>
218
218
  <orderEntry type="module-library">
219
- <library name="selenium-webdriver (v4.38.0) [path][gem]" type="rubylib">
219
+ <library name="selenium-webdriver (v4.43.0) [path][gem]" type="rubylib">
220
220
  <properties>
221
221
  <option name="additionalInfo">
222
222
  <AdditionalInfo>
@@ -233,20 +233,20 @@
233
233
  <option value="lib" />
234
234
  </list>
235
235
  </option>
236
- <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0" />
237
- <option name="version" value="4.38.0" />
236
+ <option name="url" value="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0" />
237
+ <option name="version" value="4.43.0" />
238
238
  </properties>
239
239
  <CLASSES>
240
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/bin" />
241
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/lib" />
240
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/bin" />
241
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/lib" />
242
242
  </CLASSES>
243
243
  <JAVADOC />
244
244
  <SOURCES>
245
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/bin" />
246
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/lib" />
245
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/bin" />
246
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/lib" />
247
247
  </SOURCES>
248
248
  <excluded>
249
- <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.38.0/bin" />
249
+ <root url="file://$MODULE_DIR$/vendor/bundle/ruby/3.2.0/gems/selenium-webdriver-4.43.0/bin" />
250
250
  </excluded>
251
251
  </library>
252
252
  </orderEntry>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_loader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.0
4
+ version: 2.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - src
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-11-28 00:00:00.000000000 Z
11
+ date: 2026-05-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Web loader.
14
14
  email:
@@ -28,6 +28,7 @@ files:
28
28
  - ".idea/misc.xml"
29
29
  - ".idea/modules.xml"
30
30
  - ".idea/vcs.xml"
31
+ - CLAUDE.md
31
32
  - CODE_OF_CONDUCT.md
32
33
  - Gemfile
33
34
  - Gemfile.lock