tanakai 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +118 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/tanakai +6 -0
- data/lib/tanakai/automation/deploy.yml +54 -0
- data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
- data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
- data/lib/tanakai/automation/setup.yml +45 -0
- data/lib/tanakai/base/saver.rb +106 -0
- data/lib/tanakai/base/storage.rb +54 -0
- data/lib/tanakai/base.rb +326 -0
- data/lib/tanakai/base_helper.rb +22 -0
- data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
- data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
- data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/tanakai/browser_builder.rb +20 -0
- data/lib/tanakai/capybara_configuration.rb +10 -0
- data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
- data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/tanakai/capybara_ext/session/config.rb +22 -0
- data/lib/tanakai/capybara_ext/session.rb +249 -0
- data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
- data/lib/tanakai/cli/generator.rb +57 -0
- data/lib/tanakai/cli.rb +183 -0
- data/lib/tanakai/core_ext/array.rb +14 -0
- data/lib/tanakai/core_ext/hash.rb +5 -0
- data/lib/tanakai/core_ext/numeric.rb +19 -0
- data/lib/tanakai/core_ext/string.rb +7 -0
- data/lib/tanakai/pipeline.rb +33 -0
- data/lib/tanakai/runner.rb +60 -0
- data/lib/tanakai/template/.gitignore +18 -0
- data/lib/tanakai/template/Gemfile +28 -0
- data/lib/tanakai/template/README.md +3 -0
- data/lib/tanakai/template/config/application.rb +37 -0
- data/lib/tanakai/template/config/automation.yml +13 -0
- data/lib/tanakai/template/config/boot.rb +22 -0
- data/lib/tanakai/template/config/initializers/.keep +0 -0
- data/lib/tanakai/template/config/schedule.rb +57 -0
- data/lib/tanakai/template/db/.keep +0 -0
- data/lib/tanakai/template/helpers/application_helper.rb +3 -0
- data/lib/tanakai/template/lib/.keep +0 -0
- data/lib/tanakai/template/log/.keep +0 -0
- data/lib/tanakai/template/pipelines/saver.rb +11 -0
- data/lib/tanakai/template/pipelines/validator.rb +24 -0
- data/lib/tanakai/template/spiders/application_spider.rb +143 -0
- data/lib/tanakai/template/tmp/.keep +0 -0
- data/lib/tanakai/version.rb +3 -0
- data/lib/tanakai.rb +54 -0
- data/tanakai.gemspec +50 -0
- metadata +382 -0
@@ -0,0 +1,175 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'capybara/poltergeist'
|
3
|
+
require_relative '../capybara_configuration'
|
4
|
+
require_relative '../capybara_ext/poltergeist/driver'
|
5
|
+
require_relative '../capybara_ext/session'
|
6
|
+
|
7
|
+
module Tanakai::BrowserBuilder
  # Builds a Capybara session backed by the Poltergeist driver (PhantomJS),
  # configured from a config hash that is read with symbol keys.
  class PoltergeistPhantomjsBuilder
    attr_reader :logger, :spider

    # @param config [Hash] engine options (:proxy, :headers, :cookies, ...)
    # @param spider [Object] owner of the browser; its #logger is used for all output
    def initialize(config, spider:)
      @config, @spider = config, spider
      @logger = @spider.logger
    end

    # Registers the :poltergeist_phantomjs driver, creates a session for it and
    # applies every supported option from the config.
    #
    # @return [Capybara::Session] the configured session
    def build
      Capybara.register_driver(:poltergeist_phantomjs) do |app|
        Capybara::Poltergeist::Driver.new(app, phantomjs_driver_options)
      end

      @browser = Capybara::Session.new(:poltergeist_phantomjs)
      @browser.spider = spider
      logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance"

      configure_proxy
      configure_headers
      configure_cookies
      configure_request_error_handling
      configure_restart_conditions
      configure_before_request
      configure_encoding

      @browser
    end

    private

    # Options hash handed to Capybara::Poltergeist::Driver.new.
    def phantomjs_driver_options
      phantomjs_flags = []
      options = { js_errors: false, debug: false, inspector: false, phantomjs_options: phantomjs_flags }

      extensions = @config[:extensions].presence
      if extensions
        options[:extensions] = extensions
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
      end

      window_size = @config[:window_size].presence
      if window_size
        options[:window_size] = window_size
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size"
      end

      cert_path = @config[:ssl_cert_path].presence
      if cert_path
        phantomjs_flags << "--ssl-certificates-path=#{cert_path}"
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert"
      end

      if @config[:ignore_ssl_errors].present?
        phantomjs_flags << "--ignore-ssl-errors=yes" << "--ssl-protocol=any"
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors"
      end

      if @config[:disable_images].present?
        phantomjs_flags << "--load-images=no"
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images"
      end

      options
    end

    # Applies the initial proxy. The value is a colon-separated string whose
    # third field is the proxy type; it may also be a Proc returning such a string.
    def configure_proxy
      proxy = @config[:proxy].presence
      return unless proxy

      proxy_string = (proxy.instance_of?(Proc) ? proxy.call : proxy).strip
      fields = proxy_string.split(":")
      ip, port, type = fields

      unless %w(http socks5).include?(type)
        logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
        return
      end

      # All fields are forwarded, not only ip/port/type.
      @browser.driver.set_proxy(*fields)
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
    end

    # Applies custom headers first, then the User-Agent header.
    def configure_headers
      headers = @config[:headers].presence
      if headers
        @browser.driver.headers = headers
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers"
      end

      user_agent = @config[:user_agent].presence
      return unless user_agent

      user_agent_string = (user_agent.instance_of?(Proc) ? user_agent.call : user_agent).strip
      @browser.driver.add_header("User-Agent", user_agent_string)
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
    end

    # Sets each configured cookie on the driver; the whole cookie hash is
    # passed as the options argument.
    def configure_cookies
      cookies = @config[:cookies].presence
      return unless cookies

      cookies.each { |cookie| @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie) }
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies"
    end

    # Copies skip_request_errors / retry_request_errors onto the session config.
    def configure_request_error_handling
      skip_errors = @config[:skip_request_errors].presence
      if skip_errors
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors"
      end

      retry_errors = @config[:retry_request_errors].presence
      return unless retry_errors

      @browser.config.retry_request_errors = retry_errors
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
    end

    # Copies the restart_if limits (requests, memory) onto the session config.
    def configure_restart_conditions
      requests_limit = @config.dig(:restart_if, :requests_limit).presence
      if requests_limit
        @browser.config.restart_if[:requests_limit] = requests_limit
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
      end

      memory_limit = @config.dig(:restart_if, :memory_limit).presence
      return unless memory_limit

      @browser.config.restart_if[:memory_limit] = memory_limit
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
    end

    # Enables the before_request hooks requested in the config.
    def configure_before_request
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
      end

      if @config.dig(:before_request, :clear_and_set_cookies)
        cookies = @config[:cookies].presence
        if cookies
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      if @config.dig(:before_request, :change_user_agent)
        user_agent = @config[:user_agent]
        # Only a Proc qualifies here; a plain string is rejected.
        if user_agent.present? && user_agent.instance_of?(Proc)
          @browser.config.user_agent = user_agent
          @browser.config.before_request[:change_user_agent] = true
          logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
        else
          logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
        end
      end

      if @config.dig(:before_request, :change_proxy)
        proxy = @config[:proxy]
        # Same rule as for the user agent: the proxy must be a Proc.
        if proxy.present? && proxy.instance_of?(Proc)
          @browser.config.proxy = proxy
          @browser.config.before_request[:change_proxy] = true
          logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
        else
          logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
        end
      end

      delay = @config.dig(:before_request, :delay).presence
      return unless delay

      @browser.config.before_request[:delay] = delay
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
    end

    # Copies the encoding option onto the session config when it is set.
    def configure_encoding
      encoding = @config[:encoding]
      return unless encoding

      @browser.config.encoding = encoding
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled encoding: #{encoding}"
    end
  end
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'selenium-webdriver'
|
3
|
+
require_relative '../capybara_configuration'
|
4
|
+
require_relative '../capybara_ext/selenium/driver'
|
5
|
+
require_relative '../capybara_ext/session'
|
6
|
+
|
7
|
+
module Tanakai::BrowserBuilder
  # Builds a Capybara session backed by Selenium + Chrome, configured from a
  # config hash that is read with symbol keys.
  class SeleniumChromeBuilder
    class << self
      # Headless (virtual display) instance stored on the class, so it is
      # started at most once however many builders are created.
      attr_accessor :virtual_display
    end

    attr_reader :logger, :spider

    # @param config [Hash] engine options (:proxy, :user_agent, :cookies, ...)
    # @param spider [Object] owner of the browser; its #logger is used for all output
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :selenium_chrome driver, creates a session for it and
    # applies every supported option from the config. Options Selenium cannot
    # honour are reported through the logger and skipped.
    #
    # @return [Capybara::Session] the configured session
    def build
      # Register driver
      Capybara.register_driver :selenium_chrome do |app|
        # Create driver options
        opts = { args: %w[--disable-gpu --no-sandbox --disable-translate] }

        # Provide custom chrome browser path:
        if chrome_path = Tanakai.configuration.selenium_chrome_path
          opts.merge!(binary: chrome_path)
        end

        # See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html
        driver_options = Selenium::WebDriver::Chrome::Options.new(opts)

        # Window size
        if size = @config[:window_size].presence
          driver_options.args << "--window-size=#{size.join(',')}"
          logger.debug "BrowserBuilder (selenium_chrome): enabled window_size"
        end

        # Proxy: "ip:port:type[:user:password]", or a Proc returning such a string
        if proxy = @config[:proxy].presence
          proxy_string = (proxy.is_a?(Proc) ? proxy.call : proxy).strip
          ip, port, type, user, password = proxy_string.split(":")

          if %w(http socks5).include?(type)
            if user.nil? && password.nil?
              driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
              logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}"
            else
              logger.error "BrowserBuilder (selenium_chrome): proxy with authentication doesn't supported by selenium, skipped"
            end
          else
            logger.error "BrowserBuilder (selenium_chrome): wrong type of proxy: #{type}, skipped"
          end
        end

        if proxy_bypass_list = @config[:proxy_bypass_list].presence
          if proxy
            driver_options.args << "--proxy-bypass-list=#{proxy_bypass_list.join(';')}"
            logger.debug "BrowserBuilder (selenium_chrome): enabled proxy_bypass_list"
          else
            logger.error "BrowserBuilder (selenium_chrome): provide `proxy` to set proxy_bypass_list, skipped"
          end
        end

        # SSL
        if @config[:ignore_ssl_errors].present?
          driver_options.args << "--ignore-certificate-errors"
          driver_options.args << "--allow-insecure-localhost"
          logger.debug "BrowserBuilder (selenium_chrome): enabled ignore_ssl_errors"
        end

        # Disable images
        if @config[:disable_images].present?
          driver_options.prefs["profile.managed_default_content_settings.images"] = 2
          logger.debug "BrowserBuilder (selenium_chrome): enabled disable_images"
        end

        # Headers
        if @config[:headers].present?
          logger.warn "BrowserBuilder: (selenium_chrome): custom headers doesn't supported by selenium, skipped"
        end

        if user_agent = @config[:user_agent].presence
          user_agent_string = (user_agent.is_a?(Proc) ? user_agent.call : user_agent).strip
          # No quotes around the value: each entry of `args` is one complete
          # argument, so no quoting is needed to protect spaces.
          # NOTE(review): literal quotes here presumably end up inside the
          # User-Agent string itself - confirm against a live browser.
          driver_options.args << "--user-agent=#{user_agent_string}"
          logger.debug "BrowserBuilder (selenium_chrome): enabled custom user_agent"
        end

        # Headless mode (on unless the HEADLESS env variable is exactly "false")
        if ENV["HEADLESS"] != "false"
          if @config[:headless_mode] == :virtual_display
            if Gem::Platform.local.os == "linux"
              unless self.class.virtual_display
                # Loaded lazily: the gem is only needed for this mode.
                require 'headless'
                self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
                self.class.virtual_display.start
              end

              logger.debug "BrowserBuilder (selenium_chrome): enabled virtual_display headless_mode"
            else
              logger.error "BrowserBuilder (selenium_chrome): virtual_display headless_mode works only " \
                "on Linux platform. Browser will run in normal mode. Set `native` mode instead."
            end
          else
            driver_options.args << "--headless"
            logger.debug "BrowserBuilder (selenium_chrome): enabled native headless_mode"
          end
        end

        chromedriver_path = Tanakai.configuration.chromedriver_path || "/usr/local/bin/chromedriver"
        service = Selenium::WebDriver::Service.chrome(path: chromedriver_path)
        Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options, service: service)
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:selenium_chrome)
      @browser.spider = spider
      logger.debug "BrowserBuilder (selenium_chrome): created browser instance"

      if @config[:extensions].present?
        logger.error "BrowserBuilder (selenium_chrome): `extensions` option not supported by Selenium, skipped"
      end

      # Cookies (stored on the session config; nothing is set on the driver here)
      if cookies = @config[:cookies].presence
        @browser.config.cookies = cookies
        logger.debug "BrowserBuilder (selenium_chrome): enabled custom cookies"
      end

      # Browser instance options
      # skip_request_errors
      if skip_errors = @config[:skip_request_errors].presence
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (selenium_chrome): enabled skip_request_errors"
      end

      # retry_request_errors
      if retry_errors = @config[:retry_request_errors].presence
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (selenium_chrome): enabled retry_request_errors"
      end

      # restart_if
      if requests_limit = @config.dig(:restart_if, :requests_limit).presence
        @browser.config.restart_if[:requests_limit] = requests_limit
        logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.requests_limit >= #{requests_limit}"
      end

      if memory_limit = @config.dig(:restart_if, :memory_limit).presence
        @browser.config.restart_if[:memory_limit] = memory_limit
        logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.memory_limit >= #{memory_limit}"
      end

      # before_request clear_cookies
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_cookies"
      end

      # before_request clear_and_set_cookies
      if @config.dig(:before_request, :clear_and_set_cookies)
        if cookies = @config[:cookies].presence
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (selenium_chrome): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      # before_request change_user_agent
      if @config.dig(:before_request, :change_user_agent)
        logger.error "BrowserBuilder (selenium_chrome): before_request.change_user_agent option not supported by Selenium, skipped"
      end

      # before_request change_proxy
      if @config.dig(:before_request, :change_proxy)
        logger.error "BrowserBuilder (selenium_chrome): before_request.change_proxy option not supported by Selenium, skipped"
      end

      # before_request delay
      if delay = @config.dig(:before_request, :delay).presence
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.delay"
      end

      # encoding
      if encoding = @config[:encoding]
        @browser.config.encoding = encoding
        logger.debug "BrowserBuilder (selenium_chrome): enabled encoding: #{encoding}"
      end

      # return Capybara session instance
      @browser
    end
  end
end
|
@@ -0,0 +1,204 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'selenium-webdriver'
|
3
|
+
require_relative '../capybara_configuration'
|
4
|
+
require_relative '../capybara_ext/selenium/driver'
|
5
|
+
require_relative '../capybara_ext/session'
|
6
|
+
|
7
|
+
module Tanakai::BrowserBuilder
  # Builds a Capybara session backed by Selenium + Firefox, configured from a
  # config hash that is read with symbol keys.
  class SeleniumFirefoxBuilder
    class << self
      # Headless (virtual display) instance stored on the class, so it is
      # started at most once however many builders are created.
      attr_accessor :virtual_display
    end

    attr_reader :logger, :spider

    # @param config [Hash] engine options (:proxy, :user_agent, :cookies, ...)
    # @param spider [Object] owner of the browser; its #logger is used for all output
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :selenium_firefox driver, creates a session for it and
    # applies every supported option from the config. Options Selenium cannot
    # honour are reported through the logger and skipped.
    #
    # @return [Capybara::Session] the configured session
    def build
      # Register driver
      Capybara.register_driver :selenium_firefox do |app|
        # Create driver options
        driver_options = Selenium::WebDriver::Firefox::Options.new
        driver_options.profile = Selenium::WebDriver::Firefox::Profile.new
        driver_options.profile["browser.link.open_newwindow"] = 3 # open windows in tabs
        driver_options.profile["media.peerconnection.enabled"] = false # disable web rtc

        # Proxy: "ip:port:type[:user:password]", or a Proc returning such a string
        if proxy = @config[:proxy].presence
          proxy_string = (proxy.is_a?(Proc) ? proxy.call : proxy).strip
          ip, port, type, user, password = proxy_string.split(":")

          if user.nil? && password.nil?
            # "network.proxy.type" is switched to 1 only inside the branches that
            # actually configure a proxy, so a proxy reported as "skipped" leaves
            # the profile's proxy settings untouched.
            case type
            when "http"
              driver_options.profile["network.proxy.type"] = 1
              driver_options.profile["network.proxy.http"] = ip
              driver_options.profile["network.proxy.http_port"] = port.to_i
              driver_options.profile["network.proxy.ssl"] = ip
              driver_options.profile["network.proxy.ssl_port"] = port.to_i

              logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}"
            when "socks5"
              driver_options.profile["network.proxy.type"] = 1
              driver_options.profile["network.proxy.socks"] = ip
              driver_options.profile["network.proxy.socks_port"] = port.to_i
              driver_options.profile["network.proxy.socks_version"] = 5
              driver_options.profile["network.proxy.socks_remote_dns"] = true

              logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}"
            else
              logger.error "BrowserBuilder (selenium_firefox): wrong type of proxy: #{type}, skipped"
            end
          else
            logger.error "BrowserBuilder (selenium_firefox): proxy with authentication doesn't supported by selenium, skipped"
          end
        end

        if proxy_bypass_list = @config[:proxy_bypass_list].presence
          if proxy
            driver_options.profile["network.proxy.no_proxies_on"] = proxy_bypass_list.join(", ")
            logger.debug "BrowserBuilder (selenium_firefox): enabled proxy_bypass_list"
          else
            logger.error "BrowserBuilder (selenium_firefox): provide `proxy` to set proxy_bypass_list, skipped"
          end
        end

        # SSL
        if @config[:ignore_ssl_errors].present?
          driver_options.profile.secure_ssl = false
          driver_options.profile.assume_untrusted_certificate_issuer = true
          logger.debug "BrowserBuilder (selenium_firefox): enabled ignore_ssl_errors"
        end

        # Disable images
        if @config[:disable_images].present?
          driver_options.profile["permissions.default.image"] = 2
          logger.debug "BrowserBuilder (selenium_firefox): enabled disable_images"
        end

        # Headers
        if @config[:headers].present?
          logger.warn "BrowserBuilder: (selenium_firefox): custom headers doesn't supported by selenium, skipped"
        end

        if user_agent = @config[:user_agent].presence
          user_agent_string = (user_agent.is_a?(Proc) ? user_agent.call : user_agent).strip
          driver_options.profile["general.useragent.override"] = user_agent_string
          logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent"
        end

        # Headless mode (on unless the HEADLESS env variable is exactly "false")
        if ENV["HEADLESS"] != "false"
          if @config[:headless_mode] == :virtual_display
            if Gem::Platform.local.os == "linux"
              unless self.class.virtual_display
                # Loaded lazily: the gem is only needed for this mode.
                require 'headless'
                self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
                self.class.virtual_display.start
              end

              logger.debug "BrowserBuilder (selenium_firefox): enabled virtual_display headless_mode"
            else
              logger.error "BrowserBuilder (selenium_firefox): virtual_display headless_mode works only " \
                "on Linux platform. Browser will run in normal mode. Set `native` mode instead."
            end
          else
            driver_options.args << "--headless"
            logger.debug "BrowserBuilder (selenium_firefox): enabled native headless_mode"
          end
        end

        Capybara::Selenium::Driver.new(app, browser: :firefox, options: driver_options)
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:selenium_firefox)
      @browser.spider = spider
      logger.debug "BrowserBuilder (selenium_firefox): created browser instance"

      if @config[:extensions].present?
        logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped"
      end

      # Window size (applied to the live window rather than through driver options)
      if size = @config[:window_size].presence
        @browser.current_window.resize_to(*size)
        logger.debug "BrowserBuilder (selenium_firefox): enabled window_size"
      end

      # Cookies (stored on the session config; nothing is set on the driver here)
      if cookies = @config[:cookies].presence
        @browser.config.cookies = cookies
        logger.debug "BrowserBuilder (selenium_firefox): enabled custom cookies"
      end

      # Browser instance options
      # skip_request_errors
      if skip_errors = @config[:skip_request_errors].presence
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (selenium_firefox): enabled skip_request_errors"
      end

      # retry_request_errors
      if retry_errors = @config[:retry_request_errors].presence
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors"
      end

      # restart_if
      if requests_limit = @config.dig(:restart_if, :requests_limit).presence
        @browser.config.restart_if[:requests_limit] = requests_limit
        logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}"
      end

      if memory_limit = @config.dig(:restart_if, :memory_limit).presence
        @browser.config.restart_if[:memory_limit] = memory_limit
        logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}"
      end

      # before_request clear_cookies
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies"
      end

      # before_request clear_and_set_cookies
      if @config.dig(:before_request, :clear_and_set_cookies)
        if cookies = @config[:cookies].presence
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      # before_request change_user_agent
      if @config.dig(:before_request, :change_user_agent)
        logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped"
      end

      # before_request change_proxy
      if @config.dig(:before_request, :change_proxy)
        logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped"
      end

      # before_request delay
      if delay = @config.dig(:before_request, :delay).presence
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay"
      end

      # encoding
      if encoding = @config[:encoding]
        @browser.config.encoding = encoding
        logger.debug "BrowserBuilder (selenium_firefox): enabled encoding: #{encoding}"
      end

      # return Capybara session instance
      @browser
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Tanakai
  # Entry point that resolves an engine name to its builder class and returns
  # the browser built by it.
  module BrowserBuilder
    # Builds a browser for the given engine.
    #
    # @param engine [Symbol, String] engine name, e.g. :selenium_chrome; the builder
    #   class is looked up as Tanakai::BrowserBuilder::<Engine>Builder
    # @param config [Hash] options passed to the builder unchanged
    # @param spider [Object] spider passed to the builder unchanged
    # @return [Object] whatever the builder's #build returns
    # @raise [RuntimeError] if the legacy `browser` key is present in config
    # @raise [LoadError] if the builder file exists but something it requires cannot be loaded
    # @raise [NameError] if no builder class is defined for the engine
    def self.build(engine, config = {}, spider:)
      if config[:browser].present?
        raise "++++++ BrowserBuilder: browser option is depricated. Now all sub-options inside " \
          "`browser` should be placed right into `@config` hash, without `browser` parent key.\n" \
          "See more here: https://github.com/vifreefly/kimuraframework/blob/master/CHANGELOG.md#breaking-changes-110 ++++++"
      end

      builder_feature = "tanakai/browser_builder/#{engine}_builder"
      begin
        require builder_feature
      rescue LoadError => e
        # A missing builder file is tolerated: the class lookup below still
        # succeeds if the class was defined some other way. Any other LoadError
        # (e.g. a gem required inside the builder file is not installed) is
        # re-raised, so the real cause is not hidden behind a NameError.
        raise unless e.path == builder_feature
      end

      builder_class_name = "#{engine}_builder".classify
      builder = "Tanakai::BrowserBuilder::#{builder_class_name}".constantize
      builder.new(config, spider: spider).build
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative '../driver/base'
|
2
|
+
|
3
|
+
module Capybara::Apparition
  class Driver
    # Process id reported by `lsof` for the TCP port returned by #port
    # (presumably the browser process - verify against lsof output).
    # The value is memoized; it is 0 when lsof prints nothing numeric.
    #
    # @return [Integer]
    def pid
      return @pid if @pid

      @pid = `lsof -i tcp:#{port} -t`.strip.to_i
    end

    # Remote port of the socket behind the browser client's websocket,
    # reached through private instance variables of the client. Memoized.
    #
    # @return [String] the last ":"-separated part of the socket's remote address
    def port
      return @port if @port

      io = %w[@ws @driver @socket @io].inject(browser.client) do |object, ivar|
        object.instance_variable_get(ivar)
      end
      @port = io.remote_address.inspect_sockaddr.split(':').last
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative '../driver/base'
|
2
|
+
|
3
|
+
module Capybara::Cuprite
  class Driver
    # Process id reported by `lsof` for the TCP port returned by #port
    # (presumably the browser process - verify against lsof output).
    # The value is memoized; it is 0 when lsof prints nothing numeric.
    #
    # @return [Integer]
    def pid
      return @pid if @pid

      @pid = `lsof -i tcp:#{port} -t`.strip.to_i
    end

    # Remote port of the socket behind the browser client's websocket,
    # reached through private instance variables of the client. Memoized.
    #
    # @return [String] the last ":"-separated part of the socket's remote address
    def port
      return @port if @port

      sock = %w[@ws @driver @socket @sock].inject(browser.client) do |object, ivar|
        object.instance_variable_get(ivar)
      end
      @port = sock.remote_address.inspect_sockaddr.split(':').last
    end
  end
end
|