web_loader 2.1.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/exe/wl +16 -53
- data/lib/web_loader/command.rb +2 -54
- data/lib/web_loader/downloader.rb +74 -0
- data/lib/web_loader/drivers/selenium_driver.rb +1 -1
- data/lib/web_loader/version.rb +1 -1
- data/lib/web_loader.rb +1 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 03a86b4f610a1575be326e19740897204b75e2dbe41e1eac56ee1c98d9c44558
|
|
4
|
+
data.tar.gz: b83a8f147007e94e4fac89abd4297066727dfe6ebf0a68052569f97cece0f223
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 27395f10236456d0780ee08c28d5b1c9fd447acab50261732eb24c4bb88011eb823381d01005ac6faf4d5e580d85dd1e16c69038c4ca1f4d3ed257613aeb1daf
|
|
7
|
+
data.tar.gz: 24a7b60226b63fcc7c1fda0a44ca11fbb14d10176936f93b0684fc0866289cbae73058fc7c4f485579fa83850520d6f9cc13112cddaff60d733e009b06ded5cc
|
data/Gemfile.lock
CHANGED
data/exe/wl
CHANGED
|
@@ -4,58 +4,21 @@ require "web_loader"
|
|
|
4
4
|
require 'web_loader/drivers/selenium_driver'
|
|
5
5
|
require 'optparse'
|
|
6
6
|
|
|
7
|
-
class Wl
|
|
8
|
-
def self.run(argv)
|
|
9
|
-
STDOUT.sync = true
|
|
10
|
-
opts = {}
|
|
11
|
-
opt = OptionParser.new(argv)
|
|
12
|
-
opt.banner = "Usage: #{opt.program_name} [-h|--help] [Options] <URL> "
|
|
13
|
-
opt.version = WebLoader::VERSION
|
|
14
|
-
opt.separator('')
|
|
15
|
-
opt.separator("Options:")
|
|
16
|
-
opt.on_head('-h', '--help', 'Show this message') do |v|
|
|
17
|
-
puts opt.help
|
|
18
|
-
exit
|
|
19
|
-
end
|
|
20
|
-
opt.on('-v', '--verbose', 'Verbose message') {|v| opts[:v] = v}
|
|
21
|
-
drivers = ['pureruby', 'selenium']
|
|
22
|
-
opt.on('-d DRIVER', '--driver=DRIVER', drivers, drivers.join("|") + "(default pureruby)") {|v| opts[:d] = v }
|
|
23
|
-
opt.on("--disable-cache", "Disable cache") {|v| opts[:disable_cache] = v }
|
|
24
|
-
opt.parse!(argv)
|
|
25
|
-
if argv.empty?
|
|
26
|
-
puts "Error: URL is required."
|
|
27
|
-
puts opt.help
|
|
28
|
-
exit
|
|
29
|
-
end
|
|
30
|
-
command = Wl.new(opts)
|
|
31
|
-
url = argv[0]
|
|
32
|
-
command.execute(url)
|
|
33
|
-
end
|
|
34
7
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
end
|
|
8
|
+
result = WebLoader::Downloader.run(ARGV)
|
|
9
|
+
puts result
|
|
38
10
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
driver = WebLoader::Drivers::SeleniumDriver.new
|
|
54
|
-
else
|
|
55
|
-
driver = WebLoader::Drivers::HttpDriver.new
|
|
56
|
-
end
|
|
57
|
-
driver
|
|
58
|
-
end
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
Wl.run(ARGV)
|
|
11
|
+
# custom downloader example(selenium with custom wait proc)
|
|
12
|
+
# wl --driver=selenium https://www.example.com
|
|
13
|
+
#
|
|
14
|
+
# class MyDownloader < WebLoader::Downloader
|
|
15
|
+
# def create_wait_proc
|
|
16
|
+
# # puts "Using custom wait proc..."
|
|
17
|
+
# proc do |driver|
|
|
18
|
+
# # Example wait condition: wait until the document is fully loaded
|
|
19
|
+
# driver.execute_script("return document.readyState") == "complete"
|
|
20
|
+
# end
|
|
21
|
+
# end
|
|
22
|
+
# end
|
|
23
|
+
# result = MyDownloader.run(ARGV)
|
|
24
|
+
# puts result
|
data/lib/web_loader/command.rb
CHANGED
|
@@ -36,8 +36,6 @@ module WebLoader
|
|
|
36
36
|
|
|
37
37
|
# ドライバーのセットアップ
|
|
38
38
|
@driver = driver
|
|
39
|
-
@driver.user_agent = @user_agent
|
|
40
|
-
@driver.binary = @binary
|
|
41
39
|
end
|
|
42
40
|
|
|
43
41
|
attr_reader :load_cache_page
|
|
@@ -67,15 +65,9 @@ module WebLoader
|
|
|
67
65
|
|
|
68
66
|
##### サーバーからロード
|
|
69
67
|
log("Load server: #{url}")
|
|
70
|
-
# uri = URI.parse(url)
|
|
71
|
-
# http = Net::HTTP.new(uri.host, uri.port)
|
|
72
|
-
# if uri.scheme == 'https'
|
|
73
|
-
# http.use_ssl = true
|
|
74
|
-
# http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
75
|
-
# end
|
|
76
|
-
# @response = nil
|
|
77
68
|
begin
|
|
78
|
-
|
|
69
|
+
@driver.user_agent = @user_agent
|
|
70
|
+
@driver.binary = @binary
|
|
79
71
|
@response = @driver.fetch(url)
|
|
80
72
|
rescue Net::ReadTimeout
|
|
81
73
|
# タイムアウトした場合リトライ可能ならばsleepした後に再度ロード実行
|
|
@@ -110,51 +102,7 @@ module WebLoader
|
|
|
110
102
|
# それ以外は対応した例外を発生
|
|
111
103
|
log("error #{url}", true)
|
|
112
104
|
end
|
|
113
|
-
|
|
114
105
|
result
|
|
115
|
-
|
|
116
|
-
# ##### レスポンスの処理
|
|
117
|
-
# result = nil
|
|
118
|
-
# case @response
|
|
119
|
-
# when Net::HTTPSuccess
|
|
120
|
-
# # @responseがNet::HTTPSuccessのサブクラスの場合成功とみなし読み込んだ内容を返す
|
|
121
|
-
# body = @response.body
|
|
122
|
-
# unless @binary
|
|
123
|
-
# # デフォルトでは ASCII-8BITが帰ってくる。
|
|
124
|
-
# # Content-Typeのcharsetとみなす。
|
|
125
|
-
# # https://bugs.ruby-lang.org/issues/2567
|
|
126
|
-
# encoding = @response.type_params['charset']
|
|
127
|
-
# body = toutf8(body, encoding)
|
|
128
|
-
# end
|
|
129
|
-
#
|
|
130
|
-
# if @use_cache || @always_write_cache
|
|
131
|
-
# log("Write cache: #{url}")
|
|
132
|
-
# Cache.write(@cache_dir, url, @response.code, body)
|
|
133
|
-
# end
|
|
134
|
-
# result = body
|
|
135
|
-
# when Net::HTTPRedirection
|
|
136
|
-
# result = load(to_redirect_url(uri, @response['location']), redirect_count - 1)
|
|
137
|
-
# # when Net::HTTPNotFound
|
|
138
|
-
# # result = nil
|
|
139
|
-
# when Net::HTTPTooManyRequests, Net::ReadTimeout
|
|
140
|
-
# # 上記以外のレスポンスの場合、リトライ可能ならばsleepした後に再度ロード実行
|
|
141
|
-
# if retry_count > 0
|
|
142
|
-
# sleep_for = 10
|
|
143
|
-
# if @response.is_a?(Net::HTTPTooManyRequests)
|
|
144
|
-
# # HTTPTooManyRequestsならばretry-afterで指定された値を取得。
|
|
145
|
-
# sleep_for = @response.header['retry-after'].to_i + 10
|
|
146
|
-
# log("Rate limit: #{uri} #{@response.header.to_hash} (429 Too Many Requests). Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
|
|
147
|
-
# else
|
|
148
|
-
# log("Unknown response: #{uri} #{@response.inspect}. Sleeping #{sleep_for} seconds and retry (##{retry_count}).")
|
|
149
|
-
# end
|
|
150
|
-
# sleep sleep_for
|
|
151
|
-
# result = load(url, redirect_count , retry_count - 1)
|
|
152
|
-
# end
|
|
153
|
-
# else
|
|
154
|
-
# # それ以外は対応した例外を発生
|
|
155
|
-
# log("error #{url}", true)
|
|
156
|
-
# end
|
|
157
|
-
# result
|
|
158
106
|
end
|
|
159
107
|
|
|
160
108
|
private
|
data/lib/web_loader/downloader.rb
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
|
|
2
|
+
module WebLoader
|
|
3
|
+
class Downloader
|
|
4
|
+
|
|
5
|
+
def self.run(argv)
|
|
6
|
+
STDOUT.sync = true
|
|
7
|
+
opts = {}
|
|
8
|
+
opt = OptionParser.new(argv)
|
|
9
|
+
opt.banner = "Usage: #{opt.program_name} [-h|--help] [Options] <URL> "
|
|
10
|
+
opt.version = WebLoader::VERSION
|
|
11
|
+
opt.separator('')
|
|
12
|
+
opt.separator("Options:")
|
|
13
|
+
opt.on_head('-h', '--help', 'Show this message') do |v|
|
|
14
|
+
puts opt.help
|
|
15
|
+
exit
|
|
16
|
+
end
|
|
17
|
+
opt.on('-v', '--verbose', 'Verbose message') {|v| opts[:v] = v}
|
|
18
|
+
drivers = ['pureruby', 'selenium']
|
|
19
|
+
opt.on('-d DRIVER', '--driver=DRIVER', drivers, drivers.join("|") + "(default pureruby)") {|v| opts[:d] = v }
|
|
20
|
+
opt.on("--disable-cache", "Disable cache") {|v| opts[:disable_cache] = v }
|
|
21
|
+
opt.on('--user-agent=USERAGENT', 'Set User-Agent header') {|v| opts[:user_agent] = v }
|
|
22
|
+
opt.on('-b', '--binary', 'Download binary files') {|v| opts[:binary] = v }
|
|
23
|
+
opt.parse!(argv)
|
|
24
|
+
if argv.empty?
|
|
25
|
+
puts "Error: URL is required."
|
|
26
|
+
puts opt.help
|
|
27
|
+
exit
|
|
28
|
+
end
|
|
29
|
+
command = self.new(opts)
|
|
30
|
+
url = argv[0]
|
|
31
|
+
command.execute(url)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def initialize(opts)
|
|
35
|
+
@opts = opts
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def execute(url)
|
|
39
|
+
driver = create_driver
|
|
40
|
+
loader = WebLoader::Command.new(driver)
|
|
41
|
+
if @opts[:disable_cache]
|
|
42
|
+
loader.use_cache = false
|
|
43
|
+
end
|
|
44
|
+
if @opts[:user_agent]
|
|
45
|
+
loader.user_agent = @opts[:user_agent]
|
|
46
|
+
end
|
|
47
|
+
if @opts[:binary]
|
|
48
|
+
loader.binary = true
|
|
49
|
+
end
|
|
50
|
+
loader.load(url)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
private
|
|
54
|
+
def create_driver
|
|
55
|
+
case @opts[:d]
|
|
56
|
+
when 'selenium'
|
|
57
|
+
driver = WebLoader::Drivers::SeleniumDriver.new
|
|
58
|
+
driver.wait_proc = create_wait_proc
|
|
59
|
+
else
|
|
60
|
+
driver = WebLoader::Drivers::HttpDriver.new
|
|
61
|
+
end
|
|
62
|
+
driver
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def create_wait_proc
|
|
66
|
+
# proc do |driver|
|
|
67
|
+
# # Example wait condition: wait until the document is fully loaded
|
|
68
|
+
# ready_state = driver.execute_script('return document.readyState')
|
|
69
|
+
# ready_state == 'complete'
|
|
70
|
+
# end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
end
|
data/lib/web_loader/drivers/selenium_driver.rb
CHANGED
|
@@ -38,7 +38,7 @@ module WebLoader
|
|
|
38
38
|
|
|
39
39
|
content_type = driver.execute_script("return document.contentType;")
|
|
40
40
|
|
|
41
|
-
body = @binary ? page_source.b : driver.page_source
|
|
41
|
+
body = @binary ? driver.page_source.b : driver.page_source
|
|
42
42
|
response = WebLoader::Response.new(status: 200,
|
|
43
43
|
headers: {
|
|
44
44
|
'Content-Type' => content_type
|
data/lib/web_loader/version.rb
CHANGED
data/lib/web_loader.rb
CHANGED
|
@@ -4,6 +4,7 @@ require_relative "web_loader/version"
|
|
|
4
4
|
require_relative "web_loader/utils"
|
|
5
5
|
require_relative "web_loader/cache"
|
|
6
6
|
require_relative "web_loader/command"
|
|
7
|
+
require_relative "web_loader/downloader"
|
|
7
8
|
require_relative "web_loader/response"
|
|
8
9
|
require_relative "web_loader/drivers/base_driver"
|
|
9
10
|
require_relative "web_loader/drivers/http_driver"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: web_loader
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.1.0
|
|
4
|
+
version: 2.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- src
|
|
@@ -39,6 +39,7 @@ files:
|
|
|
39
39
|
- lib/web_loader.rb
|
|
40
40
|
- lib/web_loader/cache.rb
|
|
41
41
|
- lib/web_loader/command.rb
|
|
42
|
+
- lib/web_loader/downloader.rb
|
|
42
43
|
- lib/web_loader/drivers/base_driver.rb
|
|
43
44
|
- lib/web_loader/drivers/http_driver.rb
|
|
44
45
|
- lib/web_loader/drivers/selenium_driver.rb
|