tanakai 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +118 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/tanakai +6 -0
  12. data/lib/tanakai/automation/deploy.yml +54 -0
  13. data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
  14. data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
  15. data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
  16. data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
  17. data/lib/tanakai/automation/setup.yml +45 -0
  18. data/lib/tanakai/base/saver.rb +106 -0
  19. data/lib/tanakai/base/storage.rb +54 -0
  20. data/lib/tanakai/base.rb +326 -0
  21. data/lib/tanakai/base_helper.rb +22 -0
  22. data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
  23. data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
  24. data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
  25. data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  26. data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
  27. data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
  28. data/lib/tanakai/browser_builder.rb +20 -0
  29. data/lib/tanakai/capybara_configuration.rb +10 -0
  30. data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
  31. data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
  32. data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
  33. data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
  34. data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
  35. data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
  36. data/lib/tanakai/capybara_ext/session/config.rb +22 -0
  37. data/lib/tanakai/capybara_ext/session.rb +249 -0
  38. data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
  39. data/lib/tanakai/cli/generator.rb +57 -0
  40. data/lib/tanakai/cli.rb +183 -0
  41. data/lib/tanakai/core_ext/array.rb +14 -0
  42. data/lib/tanakai/core_ext/hash.rb +5 -0
  43. data/lib/tanakai/core_ext/numeric.rb +19 -0
  44. data/lib/tanakai/core_ext/string.rb +7 -0
  45. data/lib/tanakai/pipeline.rb +33 -0
  46. data/lib/tanakai/runner.rb +60 -0
  47. data/lib/tanakai/template/.gitignore +18 -0
  48. data/lib/tanakai/template/Gemfile +28 -0
  49. data/lib/tanakai/template/README.md +3 -0
  50. data/lib/tanakai/template/config/application.rb +37 -0
  51. data/lib/tanakai/template/config/automation.yml +13 -0
  52. data/lib/tanakai/template/config/boot.rb +22 -0
  53. data/lib/tanakai/template/config/initializers/.keep +0 -0
  54. data/lib/tanakai/template/config/schedule.rb +57 -0
  55. data/lib/tanakai/template/db/.keep +0 -0
  56. data/lib/tanakai/template/helpers/application_helper.rb +3 -0
  57. data/lib/tanakai/template/lib/.keep +0 -0
  58. data/lib/tanakai/template/log/.keep +0 -0
  59. data/lib/tanakai/template/pipelines/saver.rb +11 -0
  60. data/lib/tanakai/template/pipelines/validator.rb +24 -0
  61. data/lib/tanakai/template/spiders/application_spider.rb +143 -0
  62. data/lib/tanakai/template/tmp/.keep +0 -0
  63. data/lib/tanakai/version.rb +3 -0
  64. data/lib/tanakai.rb +54 -0
  65. data/tanakai.gemspec +50 -0
  66. metadata +382 -0
@@ -0,0 +1,175 @@
1
+ require 'capybara'
2
+ require 'capybara/poltergeist'
3
+ require_relative '../capybara_configuration'
4
+ require_relative '../capybara_ext/poltergeist/driver'
5
+ require_relative '../capybara_ext/session'
6
+
7
module Tanakai::BrowserBuilder
  # Builds a Capybara session driven by Poltergeist (PhantomJS) and applies
  # the options found in the given config hash to it.
  class PoltergeistPhantomjsBuilder
    attr_reader :logger, :spider

    # config - Hash of browser options read by #build.
    # spider - object owning the browser; its `logger` receives all output.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :poltergeist_phantomjs driver, creates a session for it,
    # applies every configured option in a fixed order and returns the
    # Capybara session.
    def build
      register_driver
      create_session

      apply_proxy
      apply_headers
      apply_user_agent
      apply_cookies
      apply_request_error_options
      apply_restart_limits
      apply_before_request_options
      apply_encoding

      # return Capybara session instance
      @browser
    end

    private

    # Registers the driver; options are computed each time Capybara invokes
    # the registration block.
    def register_driver
      Capybara.register_driver :poltergeist_phantomjs do |app|
        Capybara::Poltergeist::Driver.new(app, build_driver_options)
      end
    end

    # Assembles the option hash handed to the Poltergeist driver.
    def build_driver_options
      phantomjs_flags = []
      options = { js_errors: false, debug: false, inspector: false, phantomjs_options: phantomjs_flags }

      extensions = @config[:extensions].presence
      if extensions
        options[:extensions] = extensions
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
      end

      size = @config[:window_size].presence
      if size
        options[:window_size] = size
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size"
      end

      ssl_cert_path = @config[:ssl_cert_path].presence
      if ssl_cert_path
        phantomjs_flags << "--ssl-certificates-path=#{ssl_cert_path}"
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert"
      end

      if @config[:ignore_ssl_errors].present?
        phantomjs_flags.push("--ignore-ssl-errors=yes", "--ssl-protocol=any")
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors"
      end

      if @config[:disable_images].present?
        phantomjs_flags << "--load-images=no"
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images"
      end

      options
    end

    # Creates the browser instance (Capybara session) and links it to the spider.
    def create_session
      @browser = Capybara::Session.new(:poltergeist_phantomjs)
      @browser.spider = spider
      logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance"
    end

    # Proxy is given as a colon-separated string (or a Proc returning one);
    # the first three fields are ip, port and type.
    def apply_proxy
      proxy = @config[:proxy].presence
      return unless proxy

      proxy_string = (proxy.instance_of?(Proc) ? proxy.call : proxy).strip
      proxy_parts = proxy_string.split(":")
      ip, port, type = proxy_parts

      if %w(http socks5).include?(type)
        @browser.driver.set_proxy(*proxy_parts)
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
      else
        logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
      end
    end

    def apply_headers
      headers = @config[:headers].presence
      return unless headers

      @browser.driver.headers = headers
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers"
    end

    # The user agent may be a plain string or a Proc producing one.
    def apply_user_agent
      user_agent = @config[:user_agent].presence
      return unless user_agent

      user_agent_string = (user_agent.instance_of?(Proc) ? user_agent.call : user_agent).strip
      @browser.driver.add_header("User-Agent", user_agent_string)
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
    end

    def apply_cookies
      cookies = @config[:cookies].presence
      return unless cookies

      cookies.each { |cookie| @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie) }
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies"
    end

    # skip_request_errors / retry_request_errors
    def apply_request_error_options
      skip_errors = @config[:skip_request_errors].presence
      if skip_errors
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors"
      end

      retry_errors = @config[:retry_request_errors].presence
      if retry_errors
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
      end
    end

    # restart_if.requests_limit / restart_if.memory_limit
    def apply_restart_limits
      requests_limit = @config.dig(:restart_if, :requests_limit).presence
      if requests_limit
        @browser.config.restart_if[:requests_limit] = requests_limit
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
      end

      memory_limit = @config.dig(:restart_if, :memory_limit).presence
      if memory_limit
        @browser.config.restart_if[:memory_limit] = memory_limit
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
      end
    end

    # Copies the before_request.* switches onto the session config.
    def apply_before_request_options
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
      end

      if @config.dig(:before_request, :clear_and_set_cookies)
        cookies = @config[:cookies].presence
        if cookies
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      if @config.dig(:before_request, :change_user_agent)
        user_agent = @config[:user_agent]
        # Rotation needs a callable that is evaluated again for each request.
        if user_agent.present? && user_agent.instance_of?(Proc)
          @browser.config.user_agent = user_agent
          @browser.config.before_request[:change_user_agent] = true
          logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
        else
          logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
        end
      end

      if @config.dig(:before_request, :change_proxy)
        proxy = @config[:proxy]
        if proxy.present? && proxy.instance_of?(Proc)
          @browser.config.proxy = proxy
          @browser.config.before_request[:change_proxy] = true
          logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
        else
          logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
        end
      end

      delay = @config.dig(:before_request, :delay).presence
      if delay
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
      end
    end

    def apply_encoding
      encoding = @config[:encoding]
      return unless encoding

      @browser.config.encoding = encoding
      logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled encoding: #{encoding}"
    end
  end
end
@@ -0,0 +1,199 @@
1
+ require 'capybara'
2
+ require 'selenium-webdriver'
3
+ require_relative '../capybara_configuration'
4
+ require_relative '../capybara_ext/selenium/driver'
5
+ require_relative '../capybara_ext/session'
6
+
7
module Tanakai::BrowserBuilder
  # Builds a Capybara session driven by Selenium + Chrome and applies the
  # options found in the given config hash to it.
  class SeleniumChromeBuilder
    class << self
      # Headless (virtual display) instance shared by all builders of this
      # class; it is created and started at most once (see #build).
      attr_accessor :virtual_display
    end

    attr_reader :logger, :spider

    # config - Hash of browser options read by #build.
    # spider - object owning the browser; its `logger` receives all output.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :selenium_chrome driver, creates a session for it and
    # copies the supported options from @config onto the driver/session.
    # Options Selenium cannot honour are logged and skipped.
    #
    # Returns the Capybara session.
    def build
      # Register driver
      Capybara.register_driver :selenium_chrome do |app|
        # Create driver options
        opts = { args: %w[--disable-gpu --no-sandbox --disable-translate] }

        # Provide custom chrome browser path:
        if chrome_path = Tanakai.configuration.selenium_chrome_path
          opts.merge!(binary: chrome_path)
        end

        # See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html
        # Options.new takes keyword arguments; splat the hash so this also
        # works on Ruby 3, where a positional Hash is no longer converted.
        driver_options = Selenium::WebDriver::Chrome::Options.new(**opts)

        # Window size
        if size = @config[:window_size].presence
          driver_options.args << "--window-size=#{size.join(',')}"
          logger.debug "BrowserBuilder (selenium_chrome): enabled window_size"
        end

        # Proxy: "ip:port:type[:user:password]" string, or a Proc returning one
        if proxy = @config[:proxy].presence
          proxy_string = (proxy.is_a?(Proc) ? proxy.call : proxy).strip
          ip, port, type, user, password = proxy_string.split(":")

          if %w(http socks5).include?(type)
            if user.nil? && password.nil?
              driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
              logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}"
            else
              logger.error "BrowserBuilder (selenium_chrome): proxy with authentication doesn't supported by selenium, skipped"
            end
          else
            logger.error "BrowserBuilder (selenium_chrome): wrong type of proxy: #{type}, skipped"
          end
        end

        if proxy_bypass_list = @config[:proxy_bypass_list].presence
          if proxy
            driver_options.args << "--proxy-bypass-list=#{proxy_bypass_list.join(';')}"
            logger.debug "BrowserBuilder (selenium_chrome): enabled proxy_bypass_list"
          else
            logger.error "BrowserBuilder (selenium_chrome): provide `proxy` to set proxy_bypass_list, skipped"
          end
        end

        # SSL
        if @config[:ignore_ssl_errors].present?
          driver_options.args << "--ignore-certificate-errors"
          driver_options.args << "--allow-insecure-localhost"
          logger.debug "BrowserBuilder (selenium_chrome): enabled ignore_ssl_errors"
        end

        # Disable images
        if @config[:disable_images].present?
          driver_options.prefs["profile.managed_default_content_settings.images"] = 2
          logger.debug "BrowserBuilder (selenium_chrome): enabled disable_images"
        end

        # Headers
        if @config[:headers].present?
          logger.warn "BrowserBuilder: (selenium_chrome): custom headers doesn't supported by selenium, skipped"
        end

        if user_agent = @config[:user_agent].presence
          user_agent_string = (user_agent.is_a?(Proc) ? user_agent.call : user_agent).strip
          # No shell is involved when the switch is handed to Chrome, so
          # quoting the value would make the quotes part of the user agent.
          driver_options.args << "--user-agent=#{user_agent_string}"
          logger.debug "BrowserBuilder (selenium_chrome): enabled custom user_agent"
        end

        # Headless mode (on unless HEADLESS=false is set in the environment)
        if ENV["HEADLESS"] != "false"
          if @config[:headless_mode] == :virtual_display
            if Gem::Platform.local.os == "linux"
              unless self.class.virtual_display
                # Loaded lazily: only needed for virtual_display mode.
                require 'headless'
                self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
                self.class.virtual_display.start
              end

              logger.debug "BrowserBuilder (selenium_chrome): enabled virtual_display headless_mode"
            else
              logger.error "BrowserBuilder (selenium_chrome): virtual_display headless_mode works only " \
                "on Linux platform. Browser will run in normal mode. Set `native` mode instead."
            end
          else
            driver_options.args << "--headless"
            logger.debug "BrowserBuilder (selenium_chrome): enabled native headless_mode"
          end
        end

        chromedriver_path = Tanakai.configuration.chromedriver_path || "/usr/local/bin/chromedriver"
        service = Selenium::WebDriver::Service.chrome(path: chromedriver_path)
        Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options, service: service)
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:selenium_chrome)
      @browser.spider = spider
      logger.debug "BrowserBuilder (selenium_chrome): created browser instance"

      if @config[:extensions].present?
        logger.error "BrowserBuilder (selenium_chrome): `extensions` option not supported by Selenium, skipped"
      end

      # Cookies
      if cookies = @config[:cookies].presence
        @browser.config.cookies = cookies
        logger.debug "BrowserBuilder (selenium_chrome): enabled custom cookies"
      end

      # Browser instance options
      # skip_request_errors
      if skip_errors = @config[:skip_request_errors].presence
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (selenium_chrome): enabled skip_request_errors"
      end

      # retry_request_errors
      if retry_errors = @config[:retry_request_errors].presence
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (selenium_chrome): enabled retry_request_errors"
      end

      # restart_if
      if requests_limit = @config.dig(:restart_if, :requests_limit).presence
        @browser.config.restart_if[:requests_limit] = requests_limit
        logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.requests_limit >= #{requests_limit}"
      end

      if memory_limit = @config.dig(:restart_if, :memory_limit).presence
        @browser.config.restart_if[:memory_limit] = memory_limit
        logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.memory_limit >= #{memory_limit}"
      end

      # before_request clear_cookies
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_cookies"
      end

      # before_request clear_and_set_cookies
      if @config.dig(:before_request, :clear_and_set_cookies)
        if cookies = @config[:cookies].presence
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (selenium_chrome): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      # before_request change_user_agent
      if @config.dig(:before_request, :change_user_agent)
        logger.error "BrowserBuilder (selenium_chrome): before_request.change_user_agent option not supported by Selenium, skipped"
      end

      # before_request change_proxy
      if @config.dig(:before_request, :change_proxy)
        logger.error "BrowserBuilder (selenium_chrome): before_request.change_proxy option not supported by Selenium, skipped"
      end

      # before_request delay
      if delay = @config.dig(:before_request, :delay).presence
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.delay"
      end

      # encoding
      if encoding = @config[:encoding]
        @browser.config.encoding = encoding
        logger.debug "BrowserBuilder (selenium_chrome): enabled encoding: #{encoding}"
      end

      # return Capybara session instance
      @browser
    end
  end
end
@@ -0,0 +1,204 @@
1
+ require 'capybara'
2
+ require 'selenium-webdriver'
3
+ require_relative '../capybara_configuration'
4
+ require_relative '../capybara_ext/selenium/driver'
5
+ require_relative '../capybara_ext/session'
6
+
7
module Tanakai::BrowserBuilder
  # Builds a Capybara session driven by Selenium + Firefox and applies the
  # options found in the given config hash to it.
  class SeleniumFirefoxBuilder
    class << self
      # Headless (virtual display) instance shared by all builders of this
      # class; it is created and started at most once (see #build).
      attr_accessor :virtual_display
    end

    attr_reader :logger, :spider

    # config - Hash of browser options read by #build.
    # spider - object owning the browser; its `logger` receives all output.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :selenium_firefox driver, creates a session for it and
    # copies the supported options from @config onto the profile/session.
    # Options Selenium cannot honour are logged and skipped.
    #
    # Returns the Capybara session.
    def build
      # Register driver
      Capybara.register_driver :selenium_firefox do |app|
        # Create driver options
        driver_options = Selenium::WebDriver::Firefox::Options.new
        driver_options.profile = Selenium::WebDriver::Firefox::Profile.new
        driver_options.profile["browser.link.open_newwindow"] = 3 # open windows in tabs
        driver_options.profile["media.peerconnection.enabled"] = false # disable web rtc

        # Proxy: "ip:port:type[:user:password]" string, or a Proc returning one
        if proxy = @config[:proxy].presence
          proxy_string = (proxy.is_a?(Proc) ? proxy.call : proxy).strip
          ip, port, type, user, password = proxy_string.split(":")

          if user.nil? && password.nil?
            # "network.proxy.type" is switched to manual (1) only inside the
            # supported branches, so a proxy reported as "skipped" leaves the
            # profile's proxy mode untouched.
            case type
            when "http"
              driver_options.profile["network.proxy.type"] = 1
              driver_options.profile["network.proxy.http"] = ip
              driver_options.profile["network.proxy.http_port"] = port.to_i
              driver_options.profile["network.proxy.ssl"] = ip
              driver_options.profile["network.proxy.ssl_port"] = port.to_i

              logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}"
            when "socks5"
              driver_options.profile["network.proxy.type"] = 1
              driver_options.profile["network.proxy.socks"] = ip
              driver_options.profile["network.proxy.socks_port"] = port.to_i
              driver_options.profile["network.proxy.socks_version"] = 5
              driver_options.profile["network.proxy.socks_remote_dns"] = true

              logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}"
            else
              logger.error "BrowserBuilder (selenium_firefox): wrong type of proxy: #{type}, skipped"
            end
          else
            logger.error "BrowserBuilder (selenium_firefox): proxy with authentication doesn't supported by selenium, skipped"
          end
        end

        if proxy_bypass_list = @config[:proxy_bypass_list].presence
          if proxy
            driver_options.profile["network.proxy.no_proxies_on"] = proxy_bypass_list.join(", ")
            logger.debug "BrowserBuilder (selenium_firefox): enabled proxy_bypass_list"
          else
            logger.error "BrowserBuilder (selenium_firefox): provide `proxy` to set proxy_bypass_list, skipped"
          end
        end

        # SSL
        if @config[:ignore_ssl_errors].present?
          driver_options.profile.secure_ssl = false
          driver_options.profile.assume_untrusted_certificate_issuer = true
          logger.debug "BrowserBuilder (selenium_firefox): enabled ignore_ssl_errors"
        end

        # Disable images
        if @config[:disable_images].present?
          driver_options.profile["permissions.default.image"] = 2
          logger.debug "BrowserBuilder (selenium_firefox): enabled disable_images"
        end

        # Headers
        if @config[:headers].present?
          logger.warn "BrowserBuilder: (selenium_firefox): custom headers doesn't supported by selenium, skipped"
        end

        if user_agent = @config[:user_agent].presence
          user_agent_string = (user_agent.is_a?(Proc) ? user_agent.call : user_agent).strip
          driver_options.profile["general.useragent.override"] = user_agent_string
          logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent"
        end

        # Headless mode (on unless HEADLESS=false is set in the environment)
        if ENV["HEADLESS"] != "false"
          if @config[:headless_mode] == :virtual_display
            if Gem::Platform.local.os == "linux"
              unless self.class.virtual_display
                # Loaded lazily: only needed for virtual_display mode.
                require 'headless'
                self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
                self.class.virtual_display.start
              end

              logger.debug "BrowserBuilder (selenium_firefox): enabled virtual_display headless_mode"
            else
              logger.error "BrowserBuilder (selenium_firefox): virtual_display headless_mode works only " \
                "on Linux platform. Browser will run in normal mode. Set `native` mode instead."
            end
          else
            driver_options.args << "--headless"
            logger.debug "BrowserBuilder (selenium_firefox): enabled native headless_mode"
          end
        end

        Capybara::Selenium::Driver.new(app, browser: :firefox, options: driver_options)
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:selenium_firefox)
      @browser.spider = spider
      logger.debug "BrowserBuilder (selenium_firefox): created browser instance"

      if @config[:extensions].present?
        logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped"
      end

      # Window size (applied to the session's current window, not via driver options)
      if size = @config[:window_size].presence
        @browser.current_window.resize_to(*size)
        logger.debug "BrowserBuilder (selenium_firefox): enabled window_size"
      end

      # Cookies
      if cookies = @config[:cookies].presence
        @browser.config.cookies = cookies
        logger.debug "BrowserBuilder (selenium_firefox): enabled custom cookies"
      end

      # Browser instance options
      # skip_request_errors
      if skip_errors = @config[:skip_request_errors].presence
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (selenium_firefox): enabled skip_request_errors"
      end

      # retry_request_errors
      if retry_errors = @config[:retry_request_errors].presence
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors"
      end

      # restart_if
      if requests_limit = @config.dig(:restart_if, :requests_limit).presence
        @browser.config.restart_if[:requests_limit] = requests_limit
        logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}"
      end

      if memory_limit = @config.dig(:restart_if, :memory_limit).presence
        @browser.config.restart_if[:memory_limit] = memory_limit
        logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}"
      end

      # before_request clear_cookies
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies"
      end

      # before_request clear_and_set_cookies
      if @config.dig(:before_request, :clear_and_set_cookies)
        if cookies = @config[:cookies].presence
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      # before_request change_user_agent
      if @config.dig(:before_request, :change_user_agent)
        logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped"
      end

      # before_request change_proxy
      if @config.dig(:before_request, :change_proxy)
        logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped"
      end

      # before_request delay
      if delay = @config.dig(:before_request, :delay).presence
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay"
      end

      # encoding
      if encoding = @config[:encoding]
        @browser.config.encoding = encoding
        logger.debug "BrowserBuilder (selenium_firefox): enabled encoding: #{encoding}"
      end

      # return Capybara session instance
      @browser
    end
  end
end
@@ -0,0 +1,20 @@
1
module Tanakai
  module BrowserBuilder
    # Builds a browser (Capybara session) for the given engine.
    #
    # engine - name of the engine, e.g. :selenium_chrome; it is mapped to the
    #          file "tanakai/browser_builder/<engine>_builder" and the class
    #          Tanakai::BrowserBuilder::<Engine>Builder.
    # config - Hash of browser options passed to the builder (default: {}).
    # spider - object passed to the builder as `spider:`.
    #
    # Returns whatever the builder's #build returns.
    # Raises RuntimeError when the removed `browser` option is still present.
    # Raises LoadError when the builder file exists but fails to load one of
    # its own dependencies.
    # Raises NameError (from constantize) when no builder class is defined
    # for the engine.
    def self.build(engine, config = {}, spider:)
      if config[:browser].present?
        raise "++++++ BrowserBuilder: browser option is depricated. Now all sub-options inside " \
          "`browser` should be placed right into `@config` hash, without `browser` parent key.\n" \
          "See more here: https://github.com/vifreefly/kimuraframework/blob/master/CHANGELOG.md#breaking-changes-110 ++++++"
      end

      builder_path = "tanakai/browser_builder/#{engine}_builder"
      begin
        require builder_path
      rescue LoadError => e
        # A missing builder file is tolerated: the builder class may already
        # be defined elsewhere. Any other load failure (for example a gem the
        # builder itself requires) is a real error and must not be hidden
        # behind the NameError that constantize would raise below.
        raise unless e.path == builder_path
      end

      builder_class_name = "#{engine}_builder".classify
      builder = "Tanakai::BrowserBuilder::#{builder_class_name}".constantize
      builder.new(config, spider: spider).build
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require 'capybara'
2
+
3
# Process-wide Capybara settings applied when this file is loaded.
Capybara.configure do |capybara|
  capybara.run_server = false
  # Selectors are XPath unless a call says otherwise.
  capybara.default_selector = :xpath
  capybara.save_path = "tmp"
  capybara.default_max_wait_time = 10
  capybara.ignore_hidden_elements = false
  capybara.threadsafe = true
end
@@ -0,0 +1,13 @@
1
+ require_relative '../driver/base'
2
+
3
module Capybara::Apparition
  # Reopens the Apparition driver to add lookups for the process and port
  # behind the driver's websocket connection.
  class Driver
    # Integer parsed from the output of `lsof` for the TCP port returned by
    # #port (memoized).
    def pid
      @pid ||= begin
        lsof_output = `lsof -i tcp:#{port} -t`
        lsof_output.strip.to_i
      end
    end

    # Remote port (String) of the socket found by walking the client's
    # private instance variables (memoized).
    def port
      @port ||= begin
        io = %w[@ws @driver @socket @io].reduce(browser.client) do |object, ivar|
          object.instance_variable_get(ivar)
        end
        io.remote_address.inspect_sockaddr.split(':').last
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require_relative '../driver/base'
2
+
3
module Capybara::Cuprite
  # Reopens the Cuprite driver to add lookups for the process and port
  # behind the driver's websocket connection.
  class Driver
    # Integer parsed from the output of `lsof` for the TCP port returned by
    # #port (memoized).
    def pid
      @pid ||= begin
        lsof_output = `lsof -i tcp:#{port} -t`
        lsof_output.strip.to_i
      end
    end

    # Remote port (String) of the socket found by walking the client's
    # private instance variables (memoized).
    def port
      @port ||= begin
        sock = %w[@ws @driver @socket @sock].reduce(browser.client) do |object, ivar|
          object.instance_variable_get(ivar)
        end
        sock.remote_address.inspect_sockaddr.split(':').last
      end
    end
  end
end