web_loader 2.1.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7bd0d0702828cbd7f4150c4dbf5f89144a1b5794a2e393fa9c6c9b0768e0577b
4
- data.tar.gz: ddd8af0a7a5827b9da3d12cb60e7fbb55841c5b7ed29d1668c0ba5507987c4ac
3
+ metadata.gz: 03a86b4f610a1575be326e19740897204b75e2dbe41e1eac56ee1c98d9c44558
4
+ data.tar.gz: b83a8f147007e94e4fac89abd4297066727dfe6ebf0a68052569f97cece0f223
5
5
  SHA512:
6
- metadata.gz: 1b65f8df8b55fe2340cc6a7dd50885c7223aa3246748599b55eb28903ca09c97af854b8861a53973ffb37a2e81b3faaf9b837dc23f77c9ddd543bbe70aa8693c
7
- data.tar.gz: 4464a9fba104605f4658f912c6910a16341410a511dc441e78357e6ab17a892c42286e465dc51e876ed4bb1334fe3a28418fac381afaf1e5d6aaf961a864a4e1
6
+ metadata.gz: 27395f10236456d0780ee08c28d5b1c9fd447acab50261732eb24c4bb88011eb823381d01005ac6faf4d5e580d85dd1e16c69038c4ca1f4d3ed257613aeb1daf
7
+ data.tar.gz: 24a7b60226b63fcc7c1fda0a44ca11fbb14d10176936f93b0684fc0866289cbae73058fc7c4f485579fa83850520d6f9cc13112cddaff60d733e009b06ded5cc
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_loader (2.1.0)
4
+ web_loader (2.3.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/exe/wl CHANGED
@@ -4,58 +4,21 @@ require "web_loader"
4
4
  require 'web_loader/drivers/selenium_driver'
5
5
  require 'optparse'
6
6
 
7
- class Wl
8
- def self.run(argv)
9
- STDOUT.sync = true
10
- opts = {}
11
- opt = OptionParser.new(argv)
12
- opt.banner = "Usage: #{opt.program_name} [-h|--help] [Options] <URL> "
13
- opt.version = WebLoader::VERSION
14
- opt.separator('')
15
- opt.separator("Options:")
16
- opt.on_head('-h', '--help', 'Show this message') do |v|
17
- puts opt.help
18
- exit
19
- end
20
- opt.on('-v', '--verbose', 'Verbose message') {|v| opts[:v] = v}
21
- drivers = ['pureruby', 'selenium']
22
- opt.on('-d DRIVER', '--driver=DRIVER', drivers, drivers.join("|") + "(default pureruby)") {|v| opts[:d] = v }
23
- opt.on("--disable-cache", "Disable cache") {|v| opts[:disable_cache] = v }
24
- opt.parse!(argv)
25
- if argv.empty?
26
- puts "Error: URL is required."
27
- puts opt.help
28
- exit
29
- end
30
- command = Wl.new(opts)
31
- url = argv[0]
32
- command.execute(url)
33
- end
34
7
 
35
- def initialize(opts)
36
- @opts = opts
37
- end
8
+ result = WebLoader::Downloader.run(ARGV)
9
+ puts result
38
10
 
39
- def execute(url)
40
- driver = create_driver
41
- loader = WebLoader::Command.new(driver)
42
- if @opts[:disable_cache]
43
- loader.use_cache = false
44
- end
45
- result = loader.load(url)
46
- puts result
47
- end
48
-
49
- private
50
- def create_driver
51
- case @opts[:d]
52
- when 'selenium'
53
- driver = WebLoader::Drivers::SeleniumDriver.new
54
- else
55
- driver = WebLoader::Drivers::HttpDriver.new
56
- end
57
- driver
58
- end
59
- end
60
-
61
- Wl.run(ARGV)
11
+ # Custom downloader example (Selenium with a custom wait proc)
12
+ # wl --driver=selenium https://www.example.com
13
+ #
14
+ # class MyDownloader < WebLoader::Downloader
15
+ # def create_wait_proc
16
+ # # puts "Using custom wait proc..."
17
+ # proc do |driver|
18
+ # # Example wait condition: wait until the document is fully loaded
19
+ # driver.execute_script("return document.readyState") == "complete"
20
+ # end
21
+ # end
22
+ # end
23
+ # result = MyDownloader.run(ARGV)
24
+ # puts result
@@ -36,8 +36,6 @@ module WebLoader
36
36
 
37
37
  # ドライバーのセットアップ
38
38
  @driver = driver
39
- @driver.user_agent = @user_agent
40
- @driver.binary = @binary
41
39
  end
42
40
 
43
41
  attr_reader :load_cache_page
@@ -67,15 +65,9 @@ module WebLoader
67
65
 
68
66
  ##### サーバーからロード
69
67
  log("Load server: #{url}")
70
- # uri = URI.parse(url)
71
- # http = Net::HTTP.new(uri.host, uri.port)
72
- # if uri.scheme == 'https'
73
- # http.use_ssl = true
74
- # http.verify_mode = OpenSSL::SSL::VERIFY_NONE
75
- # end
76
- # @response = nil
77
68
  begin
78
- # @response = http.get(uri.request_uri, 'User-Agent' => @user_agent) # request_uri=path + '?' + query
69
+ @driver.user_agent = @user_agent
70
+ @driver.binary = @binary
79
71
  @response = @driver.fetch(url)
80
72
  rescue Net::ReadTimeout
81
73
  # タイムアウトした場合リトライ可能ならばsleepした後に再度ロード実行
@@ -110,51 +102,7 @@ module WebLoader
110
102
  # それ以外は対応した例外を発生
111
103
  log("error #{url}", true)
112
104
  end
113
-
114
105
  result
115
-
116
- # ##### レスポンスの処理
117
- # result = nil
118
- # case @response
119
- # when Net::HTTPSuccess
120
- # # @responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
121
- # body = @response.body
122
- # unless @binary
123
- # # デフォルトでは ASCII-8BITが帰ってくる。
124
- # # Content-Typeのcharsetとみなす。
125
- # # https://bugs.ruby-lang.org/issues/2567
126
- # encoding = @response.type_params['charset']
127
- # body = toutf8(body, encoding)
128
- # end
129
- #
130
- # if @use_cache || @always_write_cache
131
- # log("Write cache: #{url}")
132
- # Cache.write(@cache_dir, url, @response.code, body)
133
- # end
134
- # result = body
135
- # when Net::HTTPRedirection
136
- # result = load(to_redirect_url(uri, @response['location']), redirect_count - 1)
137
- # # when Net::HTTPNotFound
138
- # # result = nil
139
- # when Net::HTTPTooManyRequests, Net::ReadTimeout
140
- # # 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
141
- # if retry_count > 0
142
- # sleep_for = 10
143
- # if @response.is_a?(Net::HTTPTooManyRequests)
144
- # # HTTPTooManyRequestsならばretry-afterで指定された値を取得。
145
- # sleep_for = @response.header['retry-after'].to_i + 10
146
- # log("Rate limit: #{uri} #{@response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
147
- # else
148
- # log("Unknown response: #{uri} #{@response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
149
- # end
150
- # sleep sleep_for
151
- # result = load(url, redirect_count , retry_count - 1)
152
- # end
153
- # else
154
- # # それ以外は対応した例外を発生
155
- # log("error #{url}", true)
156
- # end
157
- # result
158
106
  end
159
107
 
160
108
  private
@@ -0,0 +1,74 @@
1
+
2
# OptionParser is used by Downloader.run; require it here so the class works
# when loaded through the library entry point and not only from the `wl` script.
require 'optparse'

module WebLoader
  # Command-line front end: parses an argument vector, builds the selected
  # driver, configures a WebLoader::Command with the parsed options and loads
  # a single URL.
  #
  # Subclasses may override #create_wait_proc to supply a wait condition for
  # the selenium driver; .run instantiates whichever class it is called on.
  class Downloader
    # Driver names accepted by -d/--driver.
    DRIVER_NAMES = %w[pureruby selenium].freeze

    # Parses +argv+ and loads the first remaining positional argument as the URL.
    #
    # Recognised options are removed from +argv+ in place (OptionParser#parse!).
    # Prints the usage text and exits with status 0 for -h/--help; prints an
    # error plus the usage text to stderr and exits with status 1 when no URL
    # is left after option parsing.
    #
    # @param argv [Array<String>] command-line arguments (mutated)
    # @return [Object] whatever WebLoader::Command#load returns for the URL
    # @raise [OptionParser::ParseError] on unknown options or an unsupported driver name
    def self.run(argv)
      STDOUT.sync = true
      opts = {}
      parser = build_option_parser(opts)
      parser.parse!(argv)
      if argv.empty?
        # A missing URL is a usage error: report on stderr and signal failure
        # to the calling shell instead of exiting with status 0.
        warn "Error: URL is required."
        warn parser.help
        exit 1
      end
      new(opts).execute(argv.first)
    end

    # Builds the option parser; each option handler stores its value in +opts+.
    #
    # @param opts [Hash] destination for parsed option values
    # @return [OptionParser]
    def self.build_option_parser(opts)
      # OptionParser.new's first positional parameter is the banner, so the
      # argument vector must not be passed to it.
      OptionParser.new do |opt|
        opt.banner = "Usage: #{opt.program_name} [-h|--help] [Options] <URL> "
        opt.version = WebLoader::VERSION
        opt.separator('')
        opt.separator("Options:")
        opt.on_head('-h', '--help', 'Show this message') do
          puts opt.help
          exit
        end
        # NOTE(review): :v is recorded but nothing in this class reads it.
        opt.on('-v', '--verbose', 'Verbose message') { |v| opts[:v] = v }
        opt.on('-d DRIVER', '--driver=DRIVER', DRIVER_NAMES,
               DRIVER_NAMES.join("|") + "(default pureruby)") { |v| opts[:d] = v }
        opt.on("--disable-cache", "Disable cache") { |v| opts[:disable_cache] = v }
        opt.on('--user-agent=USERAGENT', 'Set User-Agent header') { |v| opts[:user_agent] = v }
        opt.on('-b', '--binary', 'Download binary files') { |v| opts[:binary] = v }
      end
    end
    private_class_method :build_option_parser

    # @param opts [Hash] parsed options; keys read by this class are :d,
    #   :disable_cache, :user_agent and :binary
    def initialize(opts)
      @opts = opts
    end

    # Loads +url+ through a WebLoader::Command configured from the options.
    # Only options that were actually given override the command's own defaults.
    #
    # @param url [String] the URL to load
    # @return [Object] the value returned by WebLoader::Command#load
    def execute(url)
      loader = WebLoader::Command.new(create_driver)
      loader.use_cache = false if @opts[:disable_cache]
      loader.user_agent = @opts[:user_agent] if @opts[:user_agent]
      loader.binary = true if @opts[:binary]
      loader.load(url)
    end

    private

    # Instantiates the driver selected with -d/--driver. Any value other than
    # 'selenium' (including no value) selects the HTTP driver.
    def create_driver
      case @opts[:d]
      when 'selenium'
        WebLoader::Drivers::SeleniumDriver.new.tap do |driver|
          driver.wait_proc = create_wait_proc
        end
      else
        WebLoader::Drivers::HttpDriver.new
      end
    end

    # Hook for subclasses: returns the object assigned to the selenium driver's
    # wait_proc. This base implementation returns nil.
    def create_wait_proc
      # proc do |driver|
      #   # Example wait condition: wait until the document is fully loaded
      #   ready_state = driver.execute_script('return document.readyState')
      #   ready_state == 'complete'
      # end
    end
  end
end
@@ -38,7 +38,7 @@ module WebLoader
38
38
 
39
39
  content_type = driver.execute_script("return document.contentType;")
40
40
 
41
- body = @binary ? page_source.b : driver.page_source
41
+ body = @binary ? driver.page_source.b : driver.page_source
42
42
  response = WebLoader::Response.new(status: 200,
43
43
  headers: {
44
44
  'Content-Type' => content_type
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module WebLoader
4
- VERSION = "2.1.0"
4
+ VERSION = "2.3.0"
5
5
  end
data/lib/web_loader.rb CHANGED
@@ -4,6 +4,7 @@ require_relative "web_loader/version"
4
4
  require_relative "web_loader/utils"
5
5
  require_relative "web_loader/cache"
6
6
  require_relative "web_loader/command"
7
+ require_relative "web_loader/downloader"
7
8
  require_relative "web_loader/response"
8
9
  require_relative "web_loader/drivers/base_driver"
9
10
  require_relative "web_loader/drivers/http_driver"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_loader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - src
@@ -39,6 +39,7 @@ files:
39
39
  - lib/web_loader.rb
40
40
  - lib/web_loader/cache.rb
41
41
  - lib/web_loader/command.rb
42
+ - lib/web_loader/downloader.rb
42
43
  - lib/web_loader/drivers/base_driver.rb
43
44
  - lib/web_loader/drivers/http_driver.rb
44
45
  - lib/web_loader/drivers/selenium_driver.rb