kimurai_dynamic 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +111 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/kimurai +6 -0
- data/kimurai.gemspec +48 -0
- data/lib/kimurai/automation/deploy.yml +54 -0
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
- data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
- data/lib/kimurai/automation/setup.yml +45 -0
- data/lib/kimurai/base/saver.rb +106 -0
- data/lib/kimurai/base/storage.rb +54 -0
- data/lib/kimurai/base.rb +330 -0
- data/lib/kimurai/base_helper.rb +22 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/kimurai/browser_builder.rb +20 -0
- data/lib/kimurai/capybara_configuration.rb +10 -0
- data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/kimurai/capybara_ext/session/config.rb +22 -0
- data/lib/kimurai/capybara_ext/session.rb +249 -0
- data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
- data/lib/kimurai/cli/generator.rb +57 -0
- data/lib/kimurai/cli.rb +183 -0
- data/lib/kimurai/core_ext/array.rb +14 -0
- data/lib/kimurai/core_ext/hash.rb +5 -0
- data/lib/kimurai/core_ext/numeric.rb +19 -0
- data/lib/kimurai/core_ext/string.rb +7 -0
- data/lib/kimurai/pipeline.rb +33 -0
- data/lib/kimurai/runner.rb +60 -0
- data/lib/kimurai/template/.gitignore +18 -0
- data/lib/kimurai/template/Gemfile +28 -0
- data/lib/kimurai/template/README.md +3 -0
- data/lib/kimurai/template/config/application.rb +37 -0
- data/lib/kimurai/template/config/automation.yml +13 -0
- data/lib/kimurai/template/config/boot.rb +22 -0
- data/lib/kimurai/template/config/initializers/.keep +0 -0
- data/lib/kimurai/template/config/schedule.rb +57 -0
- data/lib/kimurai/template/db/.keep +0 -0
- data/lib/kimurai/template/helpers/application_helper.rb +3 -0
- data/lib/kimurai/template/lib/.keep +0 -0
- data/lib/kimurai/template/log/.keep +0 -0
- data/lib/kimurai/template/pipelines/saver.rb +11 -0
- data/lib/kimurai/template/pipelines/validator.rb +24 -0
- data/lib/kimurai/template/spiders/application_spider.rb +143 -0
- data/lib/kimurai/template/tmp/.keep +0 -0
- data/lib/kimurai/version.rb +3 -0
- data/lib/kimurai.rb +54 -0
- metadata +349 -0
@@ -0,0 +1,199 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'selenium-webdriver'
|
3
|
+
require_relative '../capybara_configuration'
|
4
|
+
require_relative '../capybara_ext/selenium/driver'
|
5
|
+
require_relative '../capybara_ext/session'
|
6
|
+
|
7
|
+
module Kimurai::BrowserBuilder
  # Builds a Capybara session driven by Chrome through Selenium.
  #
  # The builder reads the spider's config hash (symbol keys) and turns it into
  # Chrome launch options (inside the registered driver block) and into
  # session-level settings stored on `@browser.config`.
  class SeleniumChromeBuilder
    class << self
      # Holds the Headless instance started for `virtual_display` mode, so it
      # is created and started only once per class.
      attr_accessor :virtual_display
    end

    attr_reader :logger, :spider

    # config - Hash of engine options.
    # spider - object providing `logger`; it is also attached to the session.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :selenium_chrome driver, creates a session for it, applies
    # the session-level options and returns the Capybara::Session.
    def build
      # The driver itself is assembled by Capybara when it invokes this block.
      Capybara.register_driver(:selenium_chrome) { |app| create_driver(app) }

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:selenium_chrome)
      @browser.spider = spider
      logger.debug "BrowserBuilder (selenium_chrome): created browser instance"

      if @config[:extensions].present?
        logger.error "BrowserBuilder (selenium_chrome): `extensions` option not supported by Selenium, skipped"
      end

      apply_cookies
      apply_request_error_rules
      apply_restart_rules
      apply_before_request_rules
      apply_encoding

      # return Capybara session instance
      @browser
    end

    private

    # Assembles the Chrome options from the config and returns the Selenium driver.
    def create_driver(app)
      launch_opts = { args: %w[--disable-gpu --no-sandbox --disable-translate] }

      # Provide custom chrome browser path:
      custom_binary = Kimurai.configuration.selenium_chrome_path
      launch_opts.merge!(binary: custom_binary) if custom_binary

      # See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html
      driver_options = Selenium::WebDriver::Chrome::Options.new(launch_opts)

      add_window_size(driver_options)
      add_proxy(driver_options)
      add_ssl_and_images(driver_options)

      if @config[:headers].present?
        logger.warn "BrowserBuilder: (selenium_chrome): custom headers doesn't supported by selenium, skipped"
      end

      add_user_agent(driver_options)
      add_headless_mode(driver_options)

      driver_path = Kimurai.configuration.chromedriver_path || "/usr/local/bin/chromedriver"
      service = Selenium::WebDriver::Service.chrome(path: driver_path)
      Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options, service: service)
    end

    # Adds the --window-size flag when `window_size` is configured.
    def add_window_size(driver_options)
      size = @config[:window_size].presence
      return unless size

      driver_options.args << "--window-size=#{size.join(',')}"
      logger.debug "BrowserBuilder (selenium_chrome): enabled window_size"
    end

    # Handles `proxy` ("ip:port:type[:user:password]", or a Proc returning such
    # a string) and `proxy_bypass_list`.
    def add_proxy(driver_options)
      proxy = @config[:proxy].presence

      if proxy
        proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
        ip, port, type, user, password = proxy_string.split(":")

        if !%w(http socks5).include?(type)
          logger.error "BrowserBuilder (selenium_chrome): wrong type of proxy: #{type}, skipped"
        elsif user || password
          logger.error "BrowserBuilder (selenium_chrome): proxy with authentication doesn't supported by selenium, skipped"
        else
          driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
          logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}"
        end
      end

      proxy_bypass_list = @config[:proxy_bypass_list].presence
      return unless proxy_bypass_list

      # Only checks that a proxy was configured, not that it was actually applied above.
      if proxy
        driver_options.args << "--proxy-bypass-list=#{proxy_bypass_list.join(';')}"
        logger.debug "BrowserBuilder (selenium_chrome): enabled proxy_bypass_list"
      else
        logger.error "BrowserBuilder (selenium_chrome): provide `proxy` to set proxy_bypass_list, skipped"
      end
    end

    # Handles `ignore_ssl_errors` and `disable_images`.
    def add_ssl_and_images(driver_options)
      if @config[:ignore_ssl_errors].present?
        driver_options.args << "--ignore-certificate-errors"
        driver_options.args << "--allow-insecure-localhost"
        logger.debug "BrowserBuilder (selenium_chrome): enabled ignore_ssl_errors"
      end

      if @config[:disable_images].present?
        driver_options.prefs["profile.managed_default_content_settings.images"] = 2
        logger.debug "BrowserBuilder (selenium_chrome): enabled disable_images"
      end
    end

    # Adds the --user-agent flag; `user_agent` may be a String or a Proc.
    def add_user_agent(driver_options)
      user_agent = @config[:user_agent].presence
      return unless user_agent

      user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
      # NOTE(review): the single quotes are part of the argument text - confirm
      # Chrome does not treat them as part of the user agent value.
      driver_options.args << "--user-agent='#{user_agent_string}'"
      logger.debug "BrowserBuilder (selenium_chrome): enabled custom user_agent"
    end

    # Enables headless mode unless the HEADLESS environment variable is "false".
    # `headless_mode: :virtual_display` uses the `headless` gem (Linux only);
    # any other value adds Chrome's --headless flag.
    def add_headless_mode(driver_options)
      return if ENV["HEADLESS"] == "false"

      unless @config[:headless_mode] == :virtual_display
        driver_options.args << "--headless"
        logger.debug "BrowserBuilder (selenium_chrome): enabled native headless_mode"
        return
      end

      unless Gem::Platform.local.os == "linux"
        logger.error "BrowserBuilder (selenium_chrome): virtual_display headless_mode works only " \
          "on Linux platform. Browser will run in normal mode. Set `native` mode instead."
        return
      end

      unless self.class.virtual_display
        # Loaded lazily: only needed for virtual_display mode.
        require 'headless'
        self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
        self.class.virtual_display.start
      end

      logger.debug "BrowserBuilder (selenium_chrome): enabled virtual_display headless_mode"
    end

    # Stores the configured cookies on the session config.
    def apply_cookies
      cookies = @config[:cookies].presence
      return unless cookies

      @browser.config.cookies = cookies
      logger.debug "BrowserBuilder (selenium_chrome): enabled custom cookies"
    end

    # Copies skip_request_errors / retry_request_errors to the session config.
    def apply_request_error_rules
      skip_errors = @config[:skip_request_errors].presence
      if skip_errors
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (selenium_chrome): enabled skip_request_errors"
      end

      retry_errors = @config[:retry_request_errors].presence
      if retry_errors
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (selenium_chrome): enabled retry_request_errors"
      end
    end

    # Copies restart_if.requests_limit / restart_if.memory_limit to the session config.
    def apply_restart_rules
      requests_limit = @config.dig(:restart_if, :requests_limit).presence
      if requests_limit
        @browser.config.restart_if[:requests_limit] = requests_limit
        logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.requests_limit >= #{requests_limit}"
      end

      memory_limit = @config.dig(:restart_if, :memory_limit).presence
      if memory_limit
        @browser.config.restart_if[:memory_limit] = memory_limit
        logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.memory_limit >= #{memory_limit}"
      end
    end

    # Applies the before_request.* options; those Selenium cannot honour are
    # reported through the logger and skipped.
    def apply_before_request_rules
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_cookies"
      end

      if @config.dig(:before_request, :clear_and_set_cookies)
        cookies = @config[:cookies].presence
        if cookies
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (selenium_chrome): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      if @config.dig(:before_request, :change_user_agent)
        logger.error "BrowserBuilder (selenium_chrome): before_request.change_user_agent option not supported by Selenium, skipped"
      end

      if @config.dig(:before_request, :change_proxy)
        logger.error "BrowserBuilder (selenium_chrome): before_request.change_proxy option not supported by Selenium, skipped"
      end

      delay = @config.dig(:before_request, :delay).presence
      if delay
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.delay"
      end
    end

    # Copies the `encoding` option to the session config.
    def apply_encoding
      encoding = @config[:encoding]
      return unless encoding

      @browser.config.encoding = encoding
      logger.debug "BrowserBuilder (selenium_chrome): enabled encoding: #{encoding}"
    end
  end
end
|
@@ -0,0 +1,204 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'selenium-webdriver'
|
3
|
+
require_relative '../capybara_configuration'
|
4
|
+
require_relative '../capybara_ext/selenium/driver'
|
5
|
+
require_relative '../capybara_ext/session'
|
6
|
+
|
7
|
+
module Kimurai::BrowserBuilder
  # Builds a Capybara session driven by Firefox through Selenium.
  #
  # The builder reads the spider's config hash (symbol keys) and turns it into
  # Firefox profile preferences / launch arguments (inside the registered
  # driver block) and into session-level settings stored on `@browser.config`.
  class SeleniumFirefoxBuilder
    class << self
      # Holds the Headless instance started for `virtual_display` mode, so it
      # is created and started only once per class.
      attr_accessor :virtual_display
    end

    attr_reader :logger, :spider

    # config - Hash of engine options.
    # spider - object providing `logger`; it is also attached to the session.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :selenium_firefox driver, creates a session for it, applies
    # the session-level options and returns the Capybara::Session.
    def build
      # The driver itself is assembled by Capybara when it invokes this block.
      Capybara.register_driver(:selenium_firefox) { |app| create_driver(app) }

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:selenium_firefox)
      @browser.spider = spider
      logger.debug "BrowserBuilder (selenium_firefox): created browser instance"

      if @config[:extensions].present?
        logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped"
      end

      apply_window_size
      apply_cookies
      apply_request_error_rules
      apply_restart_rules
      apply_before_request_rules
      apply_encoding

      # return Capybara session instance
      @browser
    end

    private

    # Assembles the Firefox options from the config and returns the Selenium driver.
    def create_driver(app)
      driver_options = Selenium::WebDriver::Firefox::Options.new
      driver_options.profile = Selenium::WebDriver::Firefox::Profile.new
      driver_options.profile["browser.link.open_newwindow"] = 3 # open windows in tabs
      driver_options.profile["media.peerconnection.enabled"] = false # disable web rtc

      add_proxy(driver_options)
      add_ssl_and_images(driver_options)

      if @config[:headers].present?
        logger.warn "BrowserBuilder: (selenium_firefox): custom headers doesn't supported by selenium, skipped"
      end

      add_user_agent(driver_options)
      add_headless_mode(driver_options)

      Capybara::Selenium::Driver.new(app, browser: :firefox, options: driver_options)
    end

    # Handles `proxy` ("ip:port:type[:user:password]", or a Proc returning such
    # a string) and `proxy_bypass_list`.
    def add_proxy(driver_options)
      proxy = @config[:proxy].presence

      if proxy
        proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
        ip, port, type, user, password = proxy_string.split(":")

        if user || password
          logger.error "BrowserBuilder (selenium_firefox): proxy with authentication doesn't supported by selenium, skipped"
        else
          # Set before the type is validated, so it is applied even for an unknown type.
          driver_options.profile["network.proxy.type"] = 1

          case type
          when "http"
            driver_options.profile["network.proxy.http"] = ip
            driver_options.profile["network.proxy.http_port"] = port.to_i
            driver_options.profile["network.proxy.ssl"] = ip
            driver_options.profile["network.proxy.ssl_port"] = port.to_i

            logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}"
          when "socks5"
            driver_options.profile["network.proxy.socks"] = ip
            driver_options.profile["network.proxy.socks_port"] = port.to_i
            driver_options.profile["network.proxy.socks_version"] = 5
            driver_options.profile["network.proxy.socks_remote_dns"] = true

            logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}"
          else
            logger.error "BrowserBuilder (selenium_firefox): wrong type of proxy: #{type}, skipped"
          end
        end
      end

      proxy_bypass_list = @config[:proxy_bypass_list].presence
      return unless proxy_bypass_list

      # Only checks that a proxy was configured, not that it was actually applied above.
      if proxy
        driver_options.profile["network.proxy.no_proxies_on"] = proxy_bypass_list.join(", ")
        logger.debug "BrowserBuilder (selenium_firefox): enabled proxy_bypass_list"
      else
        logger.error "BrowserBuilder (selenium_firefox): provide `proxy` to set proxy_bypass_list, skipped"
      end
    end

    # Handles `ignore_ssl_errors` and `disable_images`.
    def add_ssl_and_images(driver_options)
      if @config[:ignore_ssl_errors].present?
        driver_options.profile.secure_ssl = false
        driver_options.profile.assume_untrusted_certificate_issuer = true
        logger.debug "BrowserBuilder (selenium_firefox): enabled ignore_ssl_errors"
      end

      if @config[:disable_images].present?
        driver_options.profile["permissions.default.image"] = 2
        logger.debug "BrowserBuilder (selenium_firefox): enabled disable_images"
      end
    end

    # Overrides the user agent through a profile preference; `user_agent` may
    # be a String or a Proc.
    def add_user_agent(driver_options)
      user_agent = @config[:user_agent].presence
      return unless user_agent

      user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
      driver_options.profile["general.useragent.override"] = user_agent_string
      logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent"
    end

    # Enables headless mode unless the HEADLESS environment variable is "false".
    # `headless_mode: :virtual_display` uses the `headless` gem (Linux only);
    # any other value adds the --headless argument.
    def add_headless_mode(driver_options)
      return if ENV["HEADLESS"] == "false"

      unless @config[:headless_mode] == :virtual_display
        driver_options.args << "--headless"
        logger.debug "BrowserBuilder (selenium_firefox): enabled native headless_mode"
        return
      end

      unless Gem::Platform.local.os == "linux"
        logger.error "BrowserBuilder (selenium_firefox): virtual_display headless_mode works only " \
          "on Linux platform. Browser will run in normal mode. Set `native` mode instead."
        return
      end

      unless self.class.virtual_display
        # Loaded lazily: only needed for virtual_display mode.
        require 'headless'
        self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
        self.class.virtual_display.start
      end

      logger.debug "BrowserBuilder (selenium_firefox): enabled virtual_display headless_mode"
    end

    # Resizes the current window when `window_size` is configured.
    def apply_window_size
      size = @config[:window_size].presence
      return unless size

      @browser.current_window.resize_to(*size)
      logger.debug "BrowserBuilder (selenium_firefox): enabled window_size"
    end

    # Stores the configured cookies on the session config.
    def apply_cookies
      cookies = @config[:cookies].presence
      return unless cookies

      @browser.config.cookies = cookies
      logger.debug "BrowserBuilder (selenium_firefox): enabled custom cookies"
    end

    # Copies skip_request_errors / retry_request_errors to the session config.
    def apply_request_error_rules
      skip_errors = @config[:skip_request_errors].presence
      if skip_errors
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (selenium_firefox): enabled skip_request_errors"
      end

      retry_errors = @config[:retry_request_errors].presence
      if retry_errors
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors"
      end
    end

    # Copies restart_if.requests_limit / restart_if.memory_limit to the session config.
    def apply_restart_rules
      requests_limit = @config.dig(:restart_if, :requests_limit).presence
      if requests_limit
        @browser.config.restart_if[:requests_limit] = requests_limit
        logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}"
      end

      memory_limit = @config.dig(:restart_if, :memory_limit).presence
      if memory_limit
        @browser.config.restart_if[:memory_limit] = memory_limit
        logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}"
      end
    end

    # Applies the before_request.* options; those Selenium cannot honour are
    # reported through the logger and skipped.
    def apply_before_request_rules
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies"
      end

      if @config.dig(:before_request, :clear_and_set_cookies)
        cookies = @config[:cookies].presence
        if cookies
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      if @config.dig(:before_request, :change_user_agent)
        logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped"
      end

      if @config.dig(:before_request, :change_proxy)
        logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped"
      end

      delay = @config.dig(:before_request, :delay).presence
      if delay
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay"
      end
    end

    # Copies the `encoding` option to the session config.
    def apply_encoding
      encoding = @config[:encoding]
      return unless encoding

      @browser.config.encoding = encoding
      logger.debug "BrowserBuilder (selenium_firefox): enabled encoding: #{encoding}"
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Kimurai
  module BrowserBuilder
    # Builds a browser for the given engine.
    #
    # engine - engine name (e.g. :selenium_chrome); "<engine>_builder" is
    #          required and the matching class under this module is used.
    # config - Hash of engine options; the legacy `:browser` key is rejected.
    # spider - spider instance handed to the builder.
    #
    # Returns whatever the builder's `build` returns.
    # Raises RuntimeError when `config[:browser]` is present.
    def self.build(engine, config = {}, spider:)
      if config[:browser].present?
        raise "++++++ BrowserBuilder: browser option is depricated. Now all sub-options inside " \
          "`browser` should be placed right into `@config` hash, without `browser` parent key.\n" \
          "See more here: https://github.com/vifreefly/kimuraframework/blob/master/CHANGELOG.md#breaking-changes-110 ++++++"
      end

      builder_file = "#{engine}_builder"

      begin
        require "kimurai/browser_builder/#{builder_file}"
      rescue LoadError
        # Ignored on purpose: the constant lookup below still raises if no
        # matching builder class has been defined.
      end

      class_path = "Kimurai::BrowserBuilder::#{builder_file.classify}"
      class_path.constantize.new(config, spider: spider).build
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
# Kimurai additions to every Capybara driver: request/response counters and
# memory accounting for the driver process and its descendants.
class Capybara::Driver::Base
  attr_accessor :visited
  attr_writer :requests, :responses

  # Request counter, lazily initialised to 0.
  def requests
    @requests ||= 0
  end

  # Response counter, lazily initialised to 0.
  def responses
    @responses ||= 0
  end

  # Sums the memory of the driver process (`pid`, supplied by the concrete
  # driver) and all of its descendant processes.
  # NOTE(review): the unit is presumably kilobytes (smaps Pss / ps rss) - confirm.
  def current_memory
    driver_pid = pid

    process_ids = (get_descendant_processes(driver_pid) << driver_pid).uniq
    process_ids.sum { |process_id| get_process_memory(process_id) }
  end

  private

  # Returns the pids of all descendants of `base`, using `ps -eo pid,ppid`.
  def get_descendant_processes(base)
    descendants = Hash.new { |ht, k| ht[k] = [k] }

    # The output is scanned as a flat list of integers: pid, ppid, pid, ppid, ...
    # each_slice avoids splatting the whole list into one Hash[] call.
    `ps -eo pid,ppid`.scan(/\d+/).map(&:to_i).each_slice(2) do |pid, ppid|
      # Nested on purpose: a child's array is shared, so its own children
      # appear after the final flatten.
      descendants[ppid] << descendants[pid]
    end

    descendants[base].flatten - [base]
  end

  # https://github.com/schneems/get_process_mem
  # Note: for Linux takes PSS (not RSS) memory (I think PSS better fits in this case)
  #
  # Returns 0 when the process information is unavailable: no smaps file, no
  # Pss lines, not permitted to read it, or the process exited while it was
  # being read.
  # Raises RuntimeError on platforms other than linux and darwin.
  def get_process_memory(pid)
    case @platform ||= Gem::Platform.local.os
    when "linux"
      begin
        smaps = Pathname.new "/proc/#{pid}/smaps"
        return 0 unless smaps.exist?

        smaps.each_line.sum do |line|
          next 0 unless line.start_with?("Pss")

          found = line.match(/(?<value>(\d*\.{0,1}\d+))\s+(?<unit>\w\w)/)
          found ? found[:value].to_i : 0
        end
      rescue Errno::EACCES, Errno::ENOENT, Errno::ESRCH
        # ENOENT/ESRCH: the process can exit between the exist? check and the read.
        0
      end
    when "darwin"
      mem = `ps -o rss= -p #{pid}`.strip
      mem.empty? ? 0 : mem.to_i
    else
      raise "Can't check process memory, wrong type of platform: #{@platform}"
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require_relative '../driver/base'
|
3
|
+
|
4
|
+
# Extends capybara-mechanize with Poltergeist-like methods
# (https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver).
class Capybara::Mechanize::Driver
  # Sets the agent's proxy. `type` is accepted but not used here.
  def set_proxy(ip, port, type, user = nil, password = nil)
    # type is always "http", "socks" is not supported (yet)
    browser.agent.set_proxy(ip, port, user, password)
  end

  # --- headers ---

  # Returns the agent's request headers.
  def headers
    browser.agent.request_headers
  end

  # Replaces the agent's request headers.
  def headers=(headers)
    browser.agent.request_headers = headers
  end

  # Sets a single request header.
  def add_header(name, value)
    browser.agent.request_headers[name] = value
  end

  # --- cookies ---

  # Returns the agent's cookies.
  def get_cookies
    browser.agent.cookies
  end

  # Adds a cookie to the agent's cookie jar. `options` is filled in with
  # name/value when those keys are missing, and the path is always "/".
  def set_cookie(name, value, options = {})
    options[:name] ||= name
    options[:value] ||= value

    browser.agent.cookie_jar << Mechanize::Cookie.new(options.merge(path: "/"))
  end

  # Adds every cookie hash (with :name and :value keys) from `cookies`.
  def set_cookies(cookies)
    cookies.each { |attrs| set_cookie(attrs[:name], attrs[:value], attrs) }
  end

  # Empties the agent's cookie jar.
  def clear_cookies
    browser.agent.cookie_jar.clear!
  end

  # --- lifecycle ---

  # Shuts the agent down.
  def quit
    browser.agent.shutdown
  end

  # --- process info ---

  # Reset parent method `current_memory` for mechanize (we can't measure memory of Mechanize driver)
  def current_memory
    nil
  end

  # Always nil for this driver.
  def pid
    nil
  end

  # Always nil for this driver.
  def port
    nil
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative '../driver/base'
|
2
|
+
|
3
|
+
# Cookie helpers and process lookup for the Selenium driver.
class Capybara::Selenium::Driver
  # Returns all cookies known to the browser.
  def get_cookies
    browser.manage.all_cookies
  end

  # Adds a cookie; `options` is filled in with name/value when those keys are
  # missing, then passed to Selenium as-is.
  def set_cookie(name, value, options = {})
    options[:name] ||= name
    options[:value] ||= value

    browser.manage.add_cookie(options)
  end

  # Adds every cookie hash (with :name and :value keys) from `cookies`.
  def set_cookies(cookies)
    cookies.each { |attrs| set_cookie(attrs[:name], attrs[:value], attrs) }
  end

  # Deletes all cookies from the browser.
  def clear_cookies
    browser.manage.delete_all_cookies
  end

  # --- process info ---

  # Pid reported by `lsof` for the driver's TCP port (memoized).
  def pid
    @pid ||= %x(lsof -i tcp:#{port} -t).strip.to_i
  end

  # Port of the driver server URL, read from Selenium's internal bridge
  # state (memoized).
  def port
    @port ||= begin
      http_client = browser.send(:bridge).instance_variable_get("@http")
      http_client.instance_variable_get("@server_url").port
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Capybara
  # Extra per-session settings: plain accessors plus lazily created
  # containers for request-error rules, restart rules and before-request options.
  class SessionConfig
    attr_accessor :cookies, :proxy, :user_agent, :encoding
    attr_writer :retry_request_errors, :skip_request_errors

    # Retry rules; an empty array is created on first read.
    def retry_request_errors
      @retry_request_errors = [] unless @retry_request_errors
      @retry_request_errors
    end

    # Skip rules; an empty array is created on first read.
    def skip_request_errors
      @skip_request_errors = [] unless @skip_request_errors
      @skip_request_errors
    end

    # Restart conditions; an empty hash is created on first read.
    def restart_if
      @restart_if = {} unless @restart_if
      @restart_if
    end

    # Before-request options; an empty hash is created on first read.
    def before_request
      @before_request = {} unless @before_request
      @before_request
    end
  end
end
|