kimurai 1.3.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +29 -0
- data/Gemfile +2 -2
- data/README.md +478 -649
- data/Rakefile +6 -6
- data/bin/console +3 -4
- data/exe/kimurai +0 -1
- data/kimurai.gemspec +38 -37
- data/lib/kimurai/base/saver.rb +15 -19
- data/lib/kimurai/base/storage.rb +1 -1
- data/lib/kimurai/base.rb +42 -38
- data/lib/kimurai/base_helper.rb +5 -4
- data/lib/kimurai/browser_builder/mechanize_builder.rb +44 -38
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +63 -51
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +61 -55
- data/lib/kimurai/browser_builder.rb +7 -31
- data/lib/kimurai/capybara_configuration.rb +1 -1
- data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
- data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
- data/lib/kimurai/capybara_ext/session/config.rb +1 -1
- data/lib/kimurai/capybara_ext/session.rb +40 -38
- data/lib/kimurai/cli/generator.rb +15 -15
- data/lib/kimurai/cli.rb +52 -85
- data/lib/kimurai/core_ext/array.rb +2 -2
- data/lib/kimurai/core_ext/hash.rb +1 -1
- data/lib/kimurai/core_ext/numeric.rb +4 -4
- data/lib/kimurai/pipeline.rb +2 -1
- data/lib/kimurai/runner.rb +6 -6
- data/lib/kimurai/template/Gemfile +2 -2
- data/lib/kimurai/template/config/boot.rb +4 -4
- data/lib/kimurai/template/config/schedule.rb +15 -15
- data/lib/kimurai/template/spiders/application_spider.rb +14 -14
- data/lib/kimurai/version.rb +1 -1
- data/lib/kimurai.rb +7 -3
- metadata +58 -65
- data/.travis.yml +0 -5
- data/lib/kimurai/automation/deploy.yml +0 -54
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
- data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
- data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
- data/lib/kimurai/automation/setup.yml +0 -44
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -171
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
- data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
- data/lib/kimurai/template/config/automation.yml +0 -13
|
@@ -5,7 +5,7 @@ require_relative '../capybara_ext/selenium/driver'
|
|
|
5
5
|
require_relative '../capybara_ext/session'
|
|
6
6
|
|
|
7
7
|
module Kimurai
|
|
8
|
-
|
|
8
|
+
module BrowserBuilder
|
|
9
9
|
class SeleniumChromeBuilder
|
|
10
10
|
class << self
|
|
11
11
|
attr_accessor :virtual_display
|
|
@@ -23,28 +23,28 @@ module Kimurai
|
|
|
23
23
|
# Register driver
|
|
24
24
|
Capybara.register_driver :selenium_chrome do |app|
|
|
25
25
|
# Create driver options
|
|
26
|
-
|
|
26
|
+
# See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html
|
|
27
|
+
driver_options = Selenium::WebDriver::Chrome::Options.new
|
|
28
|
+
driver_options.args += %w[--disable-gpu --no-sandbox --disable-translate
|
|
29
|
+
--disable-blink-features=AutomationControlled]
|
|
27
30
|
|
|
28
31
|
# Provide custom chrome browser path:
|
|
29
|
-
if chrome_path = Kimurai.configuration.selenium_chrome_path
|
|
30
|
-
|
|
32
|
+
if (chrome_path = Kimurai.configuration.selenium_chrome_path)
|
|
33
|
+
driver_options.binary = chrome_path
|
|
31
34
|
end
|
|
32
35
|
|
|
33
|
-
# See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html
|
|
34
|
-
driver_options = Selenium::WebDriver::Chrome::Options.new(opts)
|
|
35
|
-
|
|
36
36
|
# Window size
|
|
37
|
-
if size = @config[:window_size].presence
|
|
37
|
+
if (size = @config[:window_size].presence)
|
|
38
38
|
driver_options.args << "--window-size=#{size.join(',')}"
|
|
39
|
-
logger.debug
|
|
39
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled window_size'
|
|
40
40
|
end
|
|
41
41
|
|
|
42
42
|
# Proxy
|
|
43
|
-
if proxy = @config[:proxy].presence
|
|
44
|
-
proxy_string = (proxy.
|
|
45
|
-
ip, port, type, user, password = proxy_string.split(
|
|
43
|
+
if (proxy = @config[:proxy].presence)
|
|
44
|
+
proxy_string = (proxy.instance_of?(Proc) ? proxy.call : proxy).strip
|
|
45
|
+
ip, port, type, user, password = proxy_string.split(':')
|
|
46
46
|
|
|
47
|
-
if %w
|
|
47
|
+
if %w[http socks5].include?(type)
|
|
48
48
|
if user.nil? && password.nil?
|
|
49
49
|
driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
|
|
50
50
|
logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}"
|
|
@@ -56,26 +56,26 @@ module Kimurai
|
|
|
56
56
|
end
|
|
57
57
|
end
|
|
58
58
|
|
|
59
|
-
if proxy_bypass_list = @config[:proxy_bypass_list].presence
|
|
59
|
+
if (proxy_bypass_list = @config[:proxy_bypass_list].presence)
|
|
60
60
|
if proxy
|
|
61
61
|
driver_options.args << "--proxy-bypass-list=#{proxy_bypass_list.join(';')}"
|
|
62
|
-
logger.debug
|
|
62
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled proxy_bypass_list'
|
|
63
63
|
else
|
|
64
|
-
logger.error
|
|
64
|
+
logger.error 'BrowserBuilder (selenium_chrome): provide `proxy` to set proxy_bypass_list, skipped'
|
|
65
65
|
end
|
|
66
66
|
end
|
|
67
67
|
|
|
68
68
|
# SSL
|
|
69
69
|
if @config[:ignore_ssl_errors].present?
|
|
70
|
-
driver_options.args <<
|
|
71
|
-
driver_options.args <<
|
|
72
|
-
logger.debug
|
|
70
|
+
driver_options.args << '--ignore-certificate-errors'
|
|
71
|
+
driver_options.args << '--allow-insecure-localhost'
|
|
72
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled ignore_ssl_errors'
|
|
73
73
|
end
|
|
74
74
|
|
|
75
75
|
# Disable images
|
|
76
76
|
if @config[:disable_images].present?
|
|
77
|
-
driver_options.prefs[
|
|
78
|
-
logger.debug
|
|
77
|
+
driver_options.prefs['profile.managed_default_content_settings.images'] = 2
|
|
78
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled disable_images'
|
|
79
79
|
end
|
|
80
80
|
|
|
81
81
|
# Headers
|
|
@@ -83,72 +83,78 @@ module Kimurai
|
|
|
83
83
|
logger.warn "BrowserBuilder: (selenium_chrome): custom headers doesn't supported by selenium, skipped"
|
|
84
84
|
end
|
|
85
85
|
|
|
86
|
-
if user_agent = @config[:user_agent].presence
|
|
87
|
-
user_agent_string = (user_agent.
|
|
86
|
+
if (user_agent = @config[:user_agent].presence)
|
|
87
|
+
user_agent_string = (user_agent.instance_of?(Proc) ? user_agent.call : user_agent).strip
|
|
88
88
|
driver_options.args << "--user-agent='#{user_agent_string}'"
|
|
89
|
-
logger.debug
|
|
89
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled custom user_agent'
|
|
90
90
|
end
|
|
91
91
|
|
|
92
92
|
# Headless mode
|
|
93
|
-
if ENV[
|
|
93
|
+
if ENV['HEADLESS'] != 'false'
|
|
94
94
|
if @config[:headless_mode] == :virtual_display
|
|
95
|
-
if Gem::Platform.local.os ==
|
|
95
|
+
if Gem::Platform.local.os == 'linux'
|
|
96
96
|
unless self.class.virtual_display
|
|
97
97
|
require 'headless'
|
|
98
98
|
self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
|
|
99
99
|
self.class.virtual_display.start
|
|
100
100
|
end
|
|
101
101
|
|
|
102
|
-
logger.debug
|
|
102
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled virtual_display headless_mode'
|
|
103
103
|
else
|
|
104
|
-
logger.error
|
|
105
|
-
|
|
104
|
+
logger.error 'BrowserBuilder (selenium_chrome): virtual_display headless_mode works only ' \
|
|
105
|
+
'on Linux platform. Browser will run in normal mode. Set `native` mode instead.'
|
|
106
106
|
end
|
|
107
107
|
else
|
|
108
|
-
driver_options.args <<
|
|
109
|
-
logger.debug
|
|
108
|
+
driver_options.args << '--headless'
|
|
109
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled native headless_mode'
|
|
110
110
|
end
|
|
111
111
|
end
|
|
112
112
|
|
|
113
|
-
|
|
114
|
-
|
|
113
|
+
# Use Selenium Manager by default (auto-downloads driver), or custom path if configured
|
|
114
|
+
if (chromedriver_path = Kimurai.configuration.chromedriver_path)
|
|
115
|
+
service = Selenium::WebDriver::Service.chrome(path: chromedriver_path)
|
|
116
|
+
Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options, service: service)
|
|
117
|
+
else
|
|
118
|
+
# Let Selenium Manager handle driver automatically
|
|
119
|
+
Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options)
|
|
120
|
+
end
|
|
115
121
|
end
|
|
116
122
|
|
|
117
123
|
# Create browser instance (Capybara session)
|
|
118
124
|
@browser = Capybara::Session.new(:selenium_chrome)
|
|
119
125
|
@browser.spider = spider
|
|
120
|
-
logger.debug
|
|
126
|
+
logger.debug 'BrowserBuilder (selenium_chrome): created browser instance'
|
|
121
127
|
|
|
122
128
|
if @config[:extensions].present?
|
|
123
|
-
logger.error
|
|
129
|
+
logger.error 'BrowserBuilder (selenium_chrome): `extensions` option not supported by Selenium, skipped'
|
|
124
130
|
end
|
|
125
131
|
|
|
126
132
|
# Cookies
|
|
127
|
-
if cookies = @config[:cookies].presence
|
|
133
|
+
if (cookies = @config[:cookies].presence)
|
|
128
134
|
@browser.config.cookies = cookies
|
|
129
|
-
logger.debug
|
|
135
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled custom cookies'
|
|
130
136
|
end
|
|
131
137
|
|
|
132
138
|
# Browser instance options
|
|
133
139
|
# skip_request_errors
|
|
134
|
-
if skip_errors = @config[:skip_request_errors].presence
|
|
140
|
+
if (skip_errors = @config[:skip_request_errors].presence)
|
|
135
141
|
@browser.config.skip_request_errors = skip_errors
|
|
136
|
-
logger.debug
|
|
142
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled skip_request_errors'
|
|
137
143
|
end
|
|
138
144
|
|
|
139
145
|
# retry_request_errors
|
|
140
|
-
if retry_errors = @config[:retry_request_errors].presence
|
|
146
|
+
if (retry_errors = @config[:retry_request_errors].presence)
|
|
141
147
|
@browser.config.retry_request_errors = retry_errors
|
|
142
|
-
logger.debug
|
|
148
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled retry_request_errors'
|
|
143
149
|
end
|
|
144
150
|
|
|
145
151
|
# restart_if
|
|
146
|
-
if requests_limit = @config.dig(:restart_if, :requests_limit).presence
|
|
152
|
+
if (requests_limit = @config.dig(:restart_if, :requests_limit).presence)
|
|
147
153
|
@browser.config.restart_if[:requests_limit] = requests_limit
|
|
148
154
|
logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.requests_limit >= #{requests_limit}"
|
|
149
155
|
end
|
|
150
156
|
|
|
151
|
-
if memory_limit = @config.dig(:restart_if, :memory_limit).presence
|
|
157
|
+
if (memory_limit = @config.dig(:restart_if, :memory_limit).presence)
|
|
152
158
|
@browser.config.restart_if[:memory_limit] = memory_limit
|
|
153
159
|
logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.memory_limit >= #{memory_limit}"
|
|
154
160
|
end
|
|
@@ -156,34 +162,40 @@ module Kimurai
|
|
|
156
162
|
# before_request clear_cookies
|
|
157
163
|
if @config.dig(:before_request, :clear_cookies)
|
|
158
164
|
@browser.config.before_request[:clear_cookies] = true
|
|
159
|
-
logger.debug
|
|
165
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled before_request.clear_cookies'
|
|
160
166
|
end
|
|
161
167
|
|
|
162
168
|
# before_request clear_and_set_cookies
|
|
163
169
|
if @config.dig(:before_request, :clear_and_set_cookies)
|
|
164
|
-
if cookies = @config[:cookies].presence
|
|
170
|
+
if (cookies = @config[:cookies].presence)
|
|
165
171
|
@browser.config.cookies = cookies
|
|
166
172
|
@browser.config.before_request[:clear_and_set_cookies] = true
|
|
167
|
-
logger.debug
|
|
173
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled before_request.clear_and_set_cookies'
|
|
168
174
|
else
|
|
169
|
-
logger.error
|
|
175
|
+
logger.error 'BrowserBuilder (selenium_chrome): cookies should be present to enable before_request.clear_and_set_cookies, skipped'
|
|
170
176
|
end
|
|
171
177
|
end
|
|
172
178
|
|
|
173
179
|
# before_request change_user_agent
|
|
174
180
|
if @config.dig(:before_request, :change_user_agent)
|
|
175
|
-
logger.error
|
|
181
|
+
logger.error 'BrowserBuilder (selenium_chrome): before_request.change_user_agent option not supported by Selenium, skipped'
|
|
176
182
|
end
|
|
177
183
|
|
|
178
184
|
# before_request change_proxy
|
|
179
185
|
if @config.dig(:before_request, :change_proxy)
|
|
180
|
-
logger.error
|
|
186
|
+
logger.error 'BrowserBuilder (selenium_chrome): before_request.change_proxy option not supported by Selenium, skipped'
|
|
181
187
|
end
|
|
182
188
|
|
|
183
189
|
# before_request delay
|
|
184
|
-
if delay = @config.dig(:before_request, :delay).presence
|
|
190
|
+
if (delay = @config.dig(:before_request, :delay).presence)
|
|
185
191
|
@browser.config.before_request[:delay] = delay
|
|
186
|
-
logger.debug
|
|
192
|
+
logger.debug 'BrowserBuilder (selenium_chrome): enabled before_request.delay'
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
# encoding
|
|
196
|
+
if (encoding = @config[:encoding])
|
|
197
|
+
@browser.config.encoding = encoding
|
|
198
|
+
logger.debug "BrowserBuilder (selenium_chrome): enabled encoding: #{encoding}"
|
|
187
199
|
end
|
|
188
200
|
|
|
189
201
|
# return Capybara session instance
|
|
@@ -5,7 +5,7 @@ require_relative '../capybara_ext/selenium/driver'
|
|
|
5
5
|
require_relative '../capybara_ext/session'
|
|
6
6
|
|
|
7
7
|
module Kimurai
|
|
8
|
-
|
|
8
|
+
module BrowserBuilder
|
|
9
9
|
class SeleniumFirefoxBuilder
|
|
10
10
|
class << self
|
|
11
11
|
attr_accessor :virtual_display
|
|
@@ -25,28 +25,28 @@ module Kimurai
|
|
|
25
25
|
# Create driver options
|
|
26
26
|
driver_options = Selenium::WebDriver::Firefox::Options.new
|
|
27
27
|
driver_options.profile = Selenium::WebDriver::Firefox::Profile.new
|
|
28
|
-
driver_options.profile[
|
|
29
|
-
driver_options.profile[
|
|
28
|
+
driver_options.profile['browser.link.open_newwindow'] = 3 # open windows in tabs
|
|
29
|
+
driver_options.profile['media.peerconnection.enabled'] = false # disable web rtc
|
|
30
30
|
|
|
31
31
|
# Proxy
|
|
32
|
-
if proxy = @config[:proxy].presence
|
|
33
|
-
proxy_string = (proxy.
|
|
34
|
-
ip, port, type, user, password = proxy_string.split(
|
|
32
|
+
if (proxy = @config[:proxy].presence)
|
|
33
|
+
proxy_string = (proxy.instance_of?(Proc) ? proxy.call : proxy).strip
|
|
34
|
+
ip, port, type, user, password = proxy_string.split(':')
|
|
35
35
|
|
|
36
36
|
if user.nil? && password.nil?
|
|
37
|
-
driver_options.profile[
|
|
38
|
-
if type ==
|
|
39
|
-
driver_options.profile[
|
|
40
|
-
driver_options.profile[
|
|
41
|
-
driver_options.profile[
|
|
42
|
-
driver_options.profile[
|
|
37
|
+
driver_options.profile['network.proxy.type'] = 1
|
|
38
|
+
if type == 'http'
|
|
39
|
+
driver_options.profile['network.proxy.http'] = ip
|
|
40
|
+
driver_options.profile['network.proxy.http_port'] = port.to_i
|
|
41
|
+
driver_options.profile['network.proxy.ssl'] = ip
|
|
42
|
+
driver_options.profile['network.proxy.ssl_port'] = port.to_i
|
|
43
43
|
|
|
44
44
|
logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}"
|
|
45
|
-
elsif type ==
|
|
46
|
-
driver_options.profile[
|
|
47
|
-
driver_options.profile[
|
|
48
|
-
driver_options.profile[
|
|
49
|
-
driver_options.profile[
|
|
45
|
+
elsif type == 'socks5'
|
|
46
|
+
driver_options.profile['network.proxy.socks'] = ip
|
|
47
|
+
driver_options.profile['network.proxy.socks_port'] = port.to_i
|
|
48
|
+
driver_options.profile['network.proxy.socks_version'] = 5
|
|
49
|
+
driver_options.profile['network.proxy.socks_remote_dns'] = true
|
|
50
50
|
|
|
51
51
|
logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}"
|
|
52
52
|
else
|
|
@@ -57,12 +57,12 @@ module Kimurai
|
|
|
57
57
|
end
|
|
58
58
|
end
|
|
59
59
|
|
|
60
|
-
if proxy_bypass_list = @config[:proxy_bypass_list].presence
|
|
60
|
+
if (proxy_bypass_list = @config[:proxy_bypass_list].presence)
|
|
61
61
|
if proxy
|
|
62
|
-
driver_options.profile[
|
|
63
|
-
logger.debug
|
|
62
|
+
driver_options.profile['network.proxy.no_proxies_on'] = proxy_bypass_list.join(', ')
|
|
63
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled proxy_bypass_list'
|
|
64
64
|
else
|
|
65
|
-
logger.error
|
|
65
|
+
logger.error 'BrowserBuilder (selenium_firefox): provide `proxy` to set proxy_bypass_list, skipped'
|
|
66
66
|
end
|
|
67
67
|
end
|
|
68
68
|
|
|
@@ -70,13 +70,13 @@ module Kimurai
|
|
|
70
70
|
if @config[:ignore_ssl_errors].present?
|
|
71
71
|
driver_options.profile.secure_ssl = false
|
|
72
72
|
driver_options.profile.assume_untrusted_certificate_issuer = true
|
|
73
|
-
logger.debug
|
|
73
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled ignore_ssl_errors'
|
|
74
74
|
end
|
|
75
75
|
|
|
76
76
|
# Disable images
|
|
77
77
|
if @config[:disable_images].present?
|
|
78
|
-
driver_options.profile[
|
|
79
|
-
logger.debug
|
|
78
|
+
driver_options.profile['permissions.default.image'] = 2
|
|
79
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled disable_images'
|
|
80
80
|
end
|
|
81
81
|
|
|
82
82
|
# Headers
|
|
@@ -84,30 +84,30 @@ module Kimurai
|
|
|
84
84
|
logger.warn "BrowserBuilder: (selenium_firefox): custom headers doesn't supported by selenium, skipped"
|
|
85
85
|
end
|
|
86
86
|
|
|
87
|
-
if user_agent = @config[:user_agent].presence
|
|
88
|
-
user_agent_string = (user_agent.
|
|
89
|
-
driver_options.profile[
|
|
90
|
-
logger.debug
|
|
87
|
+
if (user_agent = @config[:user_agent].presence)
|
|
88
|
+
user_agent_string = (user_agent.instance_of?(Proc) ? user_agent.call : user_agent).strip
|
|
89
|
+
driver_options.profile['general.useragent.override'] = user_agent_string
|
|
90
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled custom user_agent'
|
|
91
91
|
end
|
|
92
92
|
|
|
93
93
|
# Headless mode
|
|
94
|
-
if ENV[
|
|
94
|
+
if ENV['HEADLESS'] != 'false'
|
|
95
95
|
if @config[:headless_mode] == :virtual_display
|
|
96
|
-
if Gem::Platform.local.os ==
|
|
96
|
+
if Gem::Platform.local.os == 'linux'
|
|
97
97
|
unless self.class.virtual_display
|
|
98
98
|
require 'headless'
|
|
99
99
|
self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
|
|
100
100
|
self.class.virtual_display.start
|
|
101
101
|
end
|
|
102
102
|
|
|
103
|
-
logger.debug
|
|
103
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled virtual_display headless_mode'
|
|
104
104
|
else
|
|
105
|
-
logger.error
|
|
106
|
-
|
|
105
|
+
logger.error 'BrowserBuilder (selenium_firefox): virtual_display headless_mode works only ' \
|
|
106
|
+
'on Linux platform. Browser will run in normal mode. Set `native` mode instead.'
|
|
107
107
|
end
|
|
108
108
|
else
|
|
109
|
-
driver_options.args <<
|
|
110
|
-
logger.debug
|
|
109
|
+
driver_options.args << '--headless'
|
|
110
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled native headless_mode'
|
|
111
111
|
end
|
|
112
112
|
end
|
|
113
113
|
|
|
@@ -117,44 +117,44 @@ module Kimurai
|
|
|
117
117
|
# Create browser instance (Capybara session)
|
|
118
118
|
@browser = Capybara::Session.new(:selenium_firefox)
|
|
119
119
|
@browser.spider = spider
|
|
120
|
-
logger.debug
|
|
120
|
+
logger.debug 'BrowserBuilder (selenium_firefox): created browser instance'
|
|
121
121
|
|
|
122
122
|
if @config[:extensions].present?
|
|
123
|
-
logger.error
|
|
123
|
+
logger.error 'BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped'
|
|
124
124
|
end
|
|
125
125
|
|
|
126
126
|
# Window size
|
|
127
|
-
if size = @config[:window_size].presence
|
|
127
|
+
if (size = @config[:window_size].presence)
|
|
128
128
|
@browser.current_window.resize_to(*size)
|
|
129
|
-
logger.debug
|
|
129
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled window_size'
|
|
130
130
|
end
|
|
131
131
|
|
|
132
132
|
# Cookies
|
|
133
|
-
if cookies = @config[:cookies].presence
|
|
133
|
+
if (cookies = @config[:cookies].presence)
|
|
134
134
|
@browser.config.cookies = cookies
|
|
135
|
-
logger.debug
|
|
135
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled custom cookies'
|
|
136
136
|
end
|
|
137
137
|
|
|
138
138
|
# Browser instance options
|
|
139
139
|
# skip_request_errors
|
|
140
|
-
if skip_errors = @config[:skip_request_errors].presence
|
|
140
|
+
if (skip_errors = @config[:skip_request_errors].presence)
|
|
141
141
|
@browser.config.skip_request_errors = skip_errors
|
|
142
|
-
logger.debug
|
|
142
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled skip_request_errors'
|
|
143
143
|
end
|
|
144
144
|
|
|
145
145
|
# retry_request_errors
|
|
146
|
-
if retry_errors = @config[:retry_request_errors].presence
|
|
146
|
+
if (retry_errors = @config[:retry_request_errors].presence)
|
|
147
147
|
@browser.config.retry_request_errors = retry_errors
|
|
148
|
-
logger.debug
|
|
148
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled retry_request_errors'
|
|
149
149
|
end
|
|
150
150
|
|
|
151
151
|
# restart_if
|
|
152
|
-
if requests_limit = @config.dig(:restart_if, :requests_limit).presence
|
|
152
|
+
if (requests_limit = @config.dig(:restart_if, :requests_limit).presence)
|
|
153
153
|
@browser.config.restart_if[:requests_limit] = requests_limit
|
|
154
154
|
logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}"
|
|
155
155
|
end
|
|
156
156
|
|
|
157
|
-
if memory_limit = @config.dig(:restart_if, :memory_limit).presence
|
|
157
|
+
if (memory_limit = @config.dig(:restart_if, :memory_limit).presence)
|
|
158
158
|
@browser.config.restart_if[:memory_limit] = memory_limit
|
|
159
159
|
logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}"
|
|
160
160
|
end
|
|
@@ -162,34 +162,40 @@ module Kimurai
|
|
|
162
162
|
# before_request clear_cookies
|
|
163
163
|
if @config.dig(:before_request, :clear_cookies)
|
|
164
164
|
@browser.config.before_request[:clear_cookies] = true
|
|
165
|
-
logger.debug
|
|
165
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies'
|
|
166
166
|
end
|
|
167
167
|
|
|
168
168
|
# before_request clear_and_set_cookies
|
|
169
169
|
if @config.dig(:before_request, :clear_and_set_cookies)
|
|
170
|
-
if cookies = @config[:cookies].presence
|
|
170
|
+
if (cookies = @config[:cookies].presence)
|
|
171
171
|
@browser.config.cookies = cookies
|
|
172
172
|
@browser.config.before_request[:clear_and_set_cookies] = true
|
|
173
|
-
logger.debug
|
|
173
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies'
|
|
174
174
|
else
|
|
175
|
-
logger.error
|
|
175
|
+
logger.error 'BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped'
|
|
176
176
|
end
|
|
177
177
|
end
|
|
178
178
|
|
|
179
179
|
# before_request change_user_agent
|
|
180
180
|
if @config.dig(:before_request, :change_user_agent)
|
|
181
|
-
logger.error
|
|
181
|
+
logger.error 'BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped'
|
|
182
182
|
end
|
|
183
183
|
|
|
184
184
|
# before_request change_proxy
|
|
185
185
|
if @config.dig(:before_request, :change_proxy)
|
|
186
|
-
logger.error
|
|
186
|
+
logger.error 'BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped'
|
|
187
187
|
end
|
|
188
188
|
|
|
189
189
|
# before_request delay
|
|
190
|
-
if delay = @config.dig(:before_request, :delay).presence
|
|
190
|
+
if (delay = @config.dig(:before_request, :delay).presence)
|
|
191
191
|
@browser.config.before_request[:delay] = delay
|
|
192
|
-
logger.debug
|
|
192
|
+
logger.debug 'BrowserBuilder (selenium_firefox): enabled before_request.delay'
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
# encoding
|
|
196
|
+
if (encoding = @config[:encoding])
|
|
197
|
+
@browser.config.encoding = encoding
|
|
198
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled encoding: #{encoding}"
|
|
193
199
|
end
|
|
194
200
|
|
|
195
201
|
# return Capybara session instance
|
|
@@ -1,38 +1,14 @@
|
|
|
1
1
|
module Kimurai
|
|
2
|
-
|
|
3
|
-
AVAILABLE_ENGINES = [
|
|
4
|
-
:mechanize,
|
|
5
|
-
:mechanize_standalone,
|
|
6
|
-
:poltergeist_phantomjs,
|
|
7
|
-
:selenium_firefox,
|
|
8
|
-
:selenium_chrome
|
|
9
|
-
]
|
|
10
|
-
|
|
2
|
+
module BrowserBuilder
|
|
11
3
|
def self.build(engine, config = {}, spider:)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
if config[:browser].present?
|
|
17
|
-
raise "++++++ BrowserBuilder: browser option is depricated. Now all sub-options inside " \
|
|
18
|
-
"`browser` should be placed right into `@config` hash, without `browser` parent key.\n" \
|
|
19
|
-
"See more here: https://github.com/vifreefly/kimuraframework/blob/master/CHANGELOG.md#breaking-changes-110 ++++++"
|
|
4
|
+
begin
|
|
5
|
+
require "kimurai/browser_builder/#{engine}_builder"
|
|
6
|
+
rescue LoadError
|
|
20
7
|
end
|
|
21
8
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
MechanizeBuilder.new(config, spider: spider).build
|
|
26
|
-
when :selenium_chrome
|
|
27
|
-
require_relative 'browser_builder/selenium_chrome_builder'
|
|
28
|
-
SeleniumChromeBuilder.new(config, spider: spider).build
|
|
29
|
-
when :poltergeist_phantomjs
|
|
30
|
-
require_relative 'browser_builder/poltergeist_phantomjs_builder'
|
|
31
|
-
PoltergeistPhantomJSBuilder.new(config, spider: spider).build
|
|
32
|
-
when :selenium_firefox
|
|
33
|
-
require_relative 'browser_builder/selenium_firefox_builder'
|
|
34
|
-
SeleniumFirefoxBuilder.new(config, spider: spider).build
|
|
35
|
-
end
|
|
9
|
+
builder_class_name = "#{engine}_builder".classify
|
|
10
|
+
builder = "Kimurai::BrowserBuilder::#{builder_class_name}".constantize
|
|
11
|
+
builder.new(config, spider: spider).build
|
|
36
12
|
end
|
|
37
13
|
end
|
|
38
14
|
end
|
|
@@ -3,7 +3,7 @@ require 'capybara'
|
|
|
3
3
|
Capybara.configure do |config|
|
|
4
4
|
config.run_server = false
|
|
5
5
|
config.default_selector = :xpath
|
|
6
|
-
config.save_path =
|
|
6
|
+
config.save_path = 'tmp'
|
|
7
7
|
config.default_max_wait_time = 10
|
|
8
8
|
config.ignore_hidden_elements = false
|
|
9
9
|
config.threadsafe = true
|
|
@@ -1,62 +1,66 @@
|
|
|
1
1
|
require 'pathname'
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
module Capybara
|
|
4
|
+
module Driver
|
|
5
|
+
class Base
|
|
6
|
+
attr_accessor :visited
|
|
7
|
+
attr_writer :requests, :responses
|
|
6
8
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
9
|
+
def requests
|
|
10
|
+
@requests ||= 0
|
|
11
|
+
end
|
|
10
12
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
13
|
+
def responses
|
|
14
|
+
@responses ||= 0
|
|
15
|
+
end
|
|
14
16
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
+
def current_memory
|
|
18
|
+
driver_pid = pid
|
|
17
19
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
20
|
+
all = (get_descendant_processes(driver_pid) << driver_pid).uniq
|
|
21
|
+
all.map { |pid| get_process_memory(pid) }.sum
|
|
22
|
+
end
|
|
21
23
|
|
|
22
|
-
|
|
24
|
+
private
|
|
23
25
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
26
|
+
def get_descendant_processes(base)
|
|
27
|
+
descendants = Hash.new { |ht, k| ht[k] = [k] }
|
|
28
|
+
Hash[*`ps -eo pid,ppid`.scan(/\d+/).map(&:to_i)].each do |pid, ppid|
|
|
29
|
+
descendants[ppid] << descendants[pid]
|
|
30
|
+
end
|
|
29
31
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
+
descendants[base].flatten - [base]
|
|
33
|
+
end
|
|
32
34
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
lines = file.each_line.select { |line| line.match(/^Pss/) }
|
|
43
|
-
return 0 if lines.empty?
|
|
44
|
-
|
|
45
|
-
lines.reduce(0) do |sum, line|
|
|
46
|
-
line.match(/(?<value>(\d*\.{0,1}\d+))\s+(?<unit>\w\w)/) do |m|
|
|
47
|
-
sum += m[:value].to_i
|
|
48
|
-
end
|
|
35
|
+
# https://github.com/schneems/get_process_mem
|
|
36
|
+
# Note: for Linux takes PSS (not RSS) memory (I think PSS better fits in this case)
|
|
37
|
+
def get_process_memory(pid)
|
|
38
|
+
case @platform ||= Gem::Platform.local.os
|
|
39
|
+
when 'linux'
|
|
40
|
+
begin
|
|
41
|
+
file = Pathname.new "/proc/#{pid}/smaps"
|
|
42
|
+
return 0 unless file.exist?
|
|
49
43
|
|
|
50
|
-
|
|
44
|
+
lines = file.each_line.select { |line| line.match(/^Pss/) }
|
|
45
|
+
return 0 if lines.empty?
|
|
46
|
+
|
|
47
|
+
lines.reduce(0) do |sum, line|
|
|
48
|
+
line.match(/(?<value>(\d*\.{0,1}\d+))\s+(?<unit>\w\w)/) do |m|
|
|
49
|
+
sum += m[:value].to_i
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
sum
|
|
53
|
+
end
|
|
54
|
+
rescue Errno::EACCES
|
|
55
|
+
0
|
|
56
|
+
end
|
|
57
|
+
when 'darwin'
|
|
58
|
+
mem = `ps -o rss= -p #{pid}`.strip
|
|
59
|
+
mem.empty? ? 0 : mem.to_i
|
|
60
|
+
else
|
|
61
|
+
raise "Can't check process memory, wrong type of platform: #{@platform}"
|
|
51
62
|
end
|
|
52
|
-
rescue Errno::EACCES
|
|
53
|
-
0
|
|
54
63
|
end
|
|
55
|
-
when "darwin"
|
|
56
|
-
mem = `ps -o rss= -p #{pid}`.strip
|
|
57
|
-
mem.empty? ? 0 : mem.to_i
|
|
58
|
-
else
|
|
59
|
-
raise "Can't check process memory, wrong type of platform: #{@platform}"
|
|
60
64
|
end
|
|
61
65
|
end
|
|
62
66
|
end
|