tanakai 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +118 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/tanakai +6 -0
  12. data/lib/tanakai/automation/deploy.yml +54 -0
  13. data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
  14. data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
  15. data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
  16. data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
  17. data/lib/tanakai/automation/setup.yml +45 -0
  18. data/lib/tanakai/base/saver.rb +106 -0
  19. data/lib/tanakai/base/storage.rb +54 -0
  20. data/lib/tanakai/base.rb +326 -0
  21. data/lib/tanakai/base_helper.rb +22 -0
  22. data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
  23. data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
  24. data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
  25. data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  26. data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
  27. data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
  28. data/lib/tanakai/browser_builder.rb +20 -0
  29. data/lib/tanakai/capybara_configuration.rb +10 -0
  30. data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
  31. data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
  32. data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
  33. data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
  34. data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
  35. data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
  36. data/lib/tanakai/capybara_ext/session/config.rb +22 -0
  37. data/lib/tanakai/capybara_ext/session.rb +249 -0
  38. data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
  39. data/lib/tanakai/cli/generator.rb +57 -0
  40. data/lib/tanakai/cli.rb +183 -0
  41. data/lib/tanakai/core_ext/array.rb +14 -0
  42. data/lib/tanakai/core_ext/hash.rb +5 -0
  43. data/lib/tanakai/core_ext/numeric.rb +19 -0
  44. data/lib/tanakai/core_ext/string.rb +7 -0
  45. data/lib/tanakai/pipeline.rb +33 -0
  46. data/lib/tanakai/runner.rb +60 -0
  47. data/lib/tanakai/template/.gitignore +18 -0
  48. data/lib/tanakai/template/Gemfile +28 -0
  49. data/lib/tanakai/template/README.md +3 -0
  50. data/lib/tanakai/template/config/application.rb +37 -0
  51. data/lib/tanakai/template/config/automation.yml +13 -0
  52. data/lib/tanakai/template/config/boot.rb +22 -0
  53. data/lib/tanakai/template/config/initializers/.keep +0 -0
  54. data/lib/tanakai/template/config/schedule.rb +57 -0
  55. data/lib/tanakai/template/db/.keep +0 -0
  56. data/lib/tanakai/template/helpers/application_helper.rb +3 -0
  57. data/lib/tanakai/template/lib/.keep +0 -0
  58. data/lib/tanakai/template/log/.keep +0 -0
  59. data/lib/tanakai/template/pipelines/saver.rb +11 -0
  60. data/lib/tanakai/template/pipelines/validator.rb +24 -0
  61. data/lib/tanakai/template/spiders/application_spider.rb +143 -0
  62. data/lib/tanakai/template/tmp/.keep +0 -0
  63. data/lib/tanakai/version.rb +3 -0
  64. data/lib/tanakai.rb +54 -0
  65. data/tanakai.gemspec +50 -0
  66. metadata +382 -0
@@ -0,0 +1,326 @@
1
+ require_relative 'base/saver'
2
+ require_relative 'base/storage'
3
+
4
+ module Tanakai
5
+ class Base
6
+ class InvalidUrlError < StandardError; end
7
+
8
+ # don't deep merge config's headers hash option
9
+ DMERGE_EXCLUDE = [:headers]
10
+
11
+ LoggerFormatter = proc do |severity, datetime, progname, msg|
12
+ current_thread_id = Thread.current.object_id
13
+ thread_type = Thread.main == Thread.current ? "M" : "C"
14
+ output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
15
+ .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]
16
+
17
+ if Tanakai.configuration.colorize_logger != false && Tanakai.env == "development"
18
+ Rbcat.colorize(output, predefined: [:jsonhash, :logger])
19
+ else
20
+ output
21
+ end
22
+ end
23
+
24
+ include BaseHelper
25
+
26
+ ###
27
+
28
+ class << self
29
+ attr_reader :run_info, :savers, :storage
30
+ end
31
+
32
+ def self.running?
33
+ @run_info && @run_info[:status] == :running
34
+ end
35
+
36
+ def self.completed?
37
+ @run_info && @run_info[:status] == :completed
38
+ end
39
+
40
+ def self.failed?
41
+ @run_info && @run_info[:status] == :failed
42
+ end
43
+
44
+ def self.visits
45
+ @run_info && @run_info[:visits]
46
+ end
47
+
48
+ def self.items
49
+ @run_info && @run_info[:items]
50
+ end
51
+
52
+ def self.update(type, subtype)
53
+ return unless @run_info
54
+ @update_mutex.synchronize { @run_info[type][subtype] += 1 }
55
+ end
56
+
57
+ def self.add_event(scope, event)
58
+ return unless @run_info
59
+ @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
60
+ end
61
+
62
+ ###
63
+
64
+ @engine = :mechanize
65
+ @pipelines = []
66
+ @config = {}
67
+
68
+ def self.name
69
+ @name
70
+ end
71
+
72
+ def self.engine
73
+ @engine ||= superclass.engine
74
+ end
75
+
76
+ def self.pipelines
77
+ @pipelines ||= superclass.pipelines
78
+ end
79
+
80
+ def self.start_urls
81
+ @start_urls
82
+ end
83
+
84
+ def self.config
85
+ if superclass.equal?(::Object)
86
+ @config
87
+ else
88
+ superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)
89
+ end
90
+ end
91
+
92
+ ###
93
+
94
+ def self.logger
95
+ @logger ||= Tanakai.configuration.logger || begin
96
+ log_level = (ENV["LOG_LEVEL"] || Tanakai.configuration.log_level || "DEBUG").to_s.upcase
97
+ log_level = "Logger::#{log_level}".constantize
98
+ Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
99
+ end
100
+ end
101
+
102
+ def self.crawl!(exception_on_fail: true)
103
+ logger.error "Spider: already running: #{name}" and return false if running?
104
+
105
+ @storage = Storage.new
106
+ @savers = {}
107
+ @update_mutex = Mutex.new
108
+
109
+ @run_info = {
110
+ spider_name: name, status: :running, error: nil, environment: Tanakai.env,
111
+ start_time: Time.new, stop_time: nil, running_time: nil,
112
+ visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
113
+ events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
114
+ }
115
+
116
+ ###
117
+
118
+ logger.info "Spider: started: #{name}"
119
+ open_spider if self.respond_to? :open_spider
120
+
121
+ spider = self.new
122
+ spider.with_info = true
123
+ if start_urls
124
+ start_urls.each do |start_url|
125
+ if start_url.class == Hash
126
+ spider.request_to(:parse, start_url)
127
+ else
128
+ spider.request_to(:parse, url: start_url)
129
+ end
130
+ end
131
+ else
132
+ spider.parse
133
+ end
134
+ rescue StandardError, SignalException, SystemExit => e
135
+ @run_info.merge!(status: :failed, error: e.inspect)
136
+ exception_on_fail ? raise(e) : [@run_info, e]
137
+ else
138
+ @run_info.merge!(status: :completed)
139
+ ensure
140
+ if spider
141
+ spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
142
+
143
+ stop_time = Time.now
144
+ total_time = (stop_time - @run_info[:start_time]).round(3)
145
+ @run_info.merge!(stop_time: stop_time, running_time: total_time)
146
+
147
+ close_spider if self.respond_to? :close_spider
148
+
149
+ message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
150
+ failed? ? logger.fatal(message) : logger.info(message)
151
+
152
+ @run_info, @storage, @savers, @update_mutex = nil
153
+ end
154
+ end
155
+
156
+ def self.parse!(handler, *args, **request)
157
+ spider = self.new
158
+
159
+ if args.present?
160
+ spider.public_send(handler, *args)
161
+ elsif request.present?
162
+ spider.request_to(handler, request)
163
+ else
164
+ spider.public_send(handler)
165
+ end
166
+ ensure
167
+ spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
168
+ end
169
+
170
+ ###
171
+
172
+ attr_reader :logger
173
+ attr_accessor :with_info
174
+
175
+ def initialize(engine = self.class.engine, config: {})
176
+ @engine = engine || self.class.engine
177
+ @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
178
+ @pipelines = self.class.pipelines.map do |pipeline_name|
179
+ klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
180
+ instance = klass.new
181
+ instance.spider = self
182
+ [pipeline_name, instance]
183
+ end.to_h
184
+
185
+ @logger = self.class.logger
186
+ @savers = {}
187
+ end
188
+
189
+ def browser
190
+ @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
191
+ end
192
+
193
+ def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
194
+ raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)
195
+
196
+ if @config[:skip_duplicate_requests] && !unique_request?(url)
197
+ add_event(:duplicate_requests) if self.with_info
198
+ logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
199
+ end
200
+
201
+ visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
202
+ return unless visited
203
+
204
+ public_send(handler, browser.current_response(response_type), { url: url, data: data })
205
+ end
206
+
207
+ def console(response = nil, url: nil, data: {})
208
+ binding.pry
209
+ end
210
+
211
+ ###
212
+
213
+ def storage
214
+ # Note: for `.crawl!` uses shared thread safe Storage instance,
215
+ # otherwise, each spider instance will have it's own Storage
216
+ @storage ||= self.with_info ? self.class.storage : Storage.new
217
+ end
218
+
219
+ def unique?(scope, value)
220
+ storage.unique?(scope, value)
221
+ end
222
+
223
+ def save_to(path, item, format:, position: true, append: false)
224
+ @savers[path] ||= begin
225
+ options = { format: format, position: position, append: append }
226
+ if self.with_info
227
+ self.class.savers[path] ||= Saver.new(path, options)
228
+ else
229
+ Saver.new(path, options)
230
+ end
231
+ end
232
+
233
+ @savers[path].save(item)
234
+ end
235
+
236
+ ###
237
+
238
+ def add_event(scope = :custom, event)
239
+ if self.with_info
240
+ self.class.add_event(scope, event)
241
+ end
242
+
243
+ logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
244
+ end
245
+
246
+ ###
247
+
248
+ private
249
+
250
+ def create_browser(engine, config = {})
251
+ Tanakai::BrowserBuilder.build(engine, config, spider: self)
252
+ end
253
+
254
+ def unique_request?(url)
255
+ options = @config[:skip_duplicate_requests]
256
+ if options.class == Hash
257
+ scope = options[:scope] || :requests_urls
258
+ if options[:check_only]
259
+ storage.include?(scope, url) ? false : true
260
+ else
261
+ storage.unique?(scope, url) ? true : false
262
+ end
263
+ else
264
+ storage.unique?(:requests_urls, url) ? true : false
265
+ end
266
+ end
267
+
268
+ def send_item(item, options = {})
269
+ logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
270
+ self.class.update(:items, :sent) if self.with_info
271
+
272
+ @pipelines.each do |name, instance|
273
+ item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
274
+ end
275
+ rescue => e
276
+ logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
277
+ add_event(:drop_items_errors, e.inspect) if self.with_info
278
+ false
279
+ else
280
+ self.class.update(:items, :processed) if self.with_info
281
+ logger.info "Pipeline: processed: #{JSON.generate(item)}"
282
+ true
283
+ ensure
284
+ if self.with_info
285
+ logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
286
+ end
287
+ end
288
+
289
+ def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
290
+ parts = urls.in_sorted_groups(threads, false)
291
+ urls_count = urls.size
292
+
293
+ all = []
294
+ start_time = Time.now
295
+ logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"
296
+
297
+ parts.each do |part|
298
+ all << Thread.new(part) do |part|
299
+ Thread.current.abort_on_exception = true
300
+
301
+ spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
302
+ spider.with_info = true if self.with_info
303
+
304
+ part.each do |url_data|
305
+ if url_data.class == Hash
306
+ if url_data[:url].present? && url_data[:data].present?
307
+ spider.request_to(handler, delay, url_data)
308
+ else
309
+ spider.public_send(handler, url_data)
310
+ end
311
+ else
312
+ spider.request_to(handler, delay, url: url_data, data: data)
313
+ end
314
+ end
315
+ ensure
316
+ spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
317
+ end
318
+
319
+ sleep 0.5
320
+ end
321
+
322
+ all.each(&:join)
323
+ logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
324
+ end
325
+ end
326
+ end
@@ -0,0 +1,22 @@
1
+ module Tanakai
2
+ module BaseHelper
3
+ private
4
+
5
+ def absolute_url(url, base:)
6
+ return unless url
7
+ URI.join(base, URI.escape(url)).to_s
8
+ end
9
+
10
+ def escape_url(url)
11
+ uri = URI.parse(url)
12
+ rescue URI::InvalidURIError => e
13
+ URI.parse(URI.escape url).to_s rescue url
14
+ else
15
+ url
16
+ end
17
+
18
+ def normalize_url(url, base:)
19
+ escape_url(absolute_url(url, base: base))
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,58 @@
1
+ require 'capybara/apparition'
2
+ require_relative '../capybara_configuration'
3
+ require_relative '../capybara_ext/session'
4
+ require_relative '../capybara_ext/apparition/driver'
5
+
6
+ module Tanakai::BrowserBuilder
7
+ class ApparitionBuilder
8
+ attr_reader :logger, :spider
9
+
10
+ def initialize(config, spider:)
11
+ @config = config
12
+ @spider = spider
13
+ @logger = spider.logger
14
+ end
15
+
16
+ def build
17
+ # Register driver
18
+ Capybara.register_driver :apparition do |app|
19
+ timeout = ENV.fetch('TIMEOUT', 30).to_i
20
+ driver_options = { js_errors: false, timeout: timeout, debug: ENV['DEBUG'] }
21
+
22
+ driver_options[:headless] = ENV.fetch("HEADLESS", "true") == "true"
23
+ logger.debug "BrowserBuilder (apparition): enabled extensions"
24
+
25
+ Capybara::Apparition::Driver.new(app, driver_options)
26
+ end
27
+
28
+ # Create browser instance (Capybara session)
29
+ @browser = Capybara::Session.new(:apparition)
30
+ @browser.spider = spider
31
+ logger.debug "BrowserBuilder (apparition): created browser instance"
32
+
33
+ # Headers
34
+ if headers = @config[:headers].presence
35
+ @browser.driver.headers = headers
36
+ logger.debug "BrowserBuilder (apparition): enabled custom headers"
37
+ end
38
+
39
+ if user_agent = @config[:user_agent].presence
40
+ user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
41
+
42
+ @browser.driver.add_header("User-Agent", user_agent_string)
43
+ logger.debug "BrowserBuilder (apparition): enabled custom user_agent"
44
+ end
45
+
46
+ # Cookies
47
+ if cookies = @config[:cookies].presence
48
+ cookies.each do |cookie|
49
+ @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
50
+ end
51
+
52
+ logger.debug "BrowserBuilder (apparition): enabled custom cookies"
53
+ end
54
+
55
+ @browser
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,54 @@
1
+ require 'capybara/cuprite'
2
+ require_relative '../capybara_configuration'
3
+ require_relative '../capybara_ext/session'
4
+ require_relative '../capybara_ext/cuprite/driver'
5
+
6
+ module Tanakai::BrowserBuilder
7
+ class CupriteBuilder
8
+ attr_reader :logger, :spider
9
+
10
+ def initialize(config, spider:)
11
+ @config = config
12
+ @spider = spider
13
+ @logger = spider.logger
14
+ end
15
+
16
+ def build
17
+ # Register driver
18
+ Capybara.register_driver :cuprite do |app|
19
+ driver_options = { headless: ENV.fetch("HEADLESS", "true") == "true" }
20
+ logger.debug "BrowserBuilder (cuprite): enabled extensions"
21
+
22
+ Capybara::Cuprite::Driver.new(app, driver_options)
23
+ end
24
+
25
+ # Create browser instance (Capybara session)
26
+ @browser = Capybara::Session.new(:cuprite)
27
+ @browser.spider = spider
28
+ logger.debug "BrowserBuilder (cuprite): created browser instance"
29
+
30
+ # Headers
31
+ if headers = @config[:headers].presence
32
+ @browser.driver.headers = headers
33
+ logger.debug "BrowserBuilder (cuprite): enabled custom headers"
34
+ end
35
+
36
+ if user_agent = @config[:user_agent].presence
37
+ user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
38
+ @browser.driver.headers = {"User-Agent" => user_agent_string}
39
+ logger.debug "BrowserBuilder (cuprite): enabled custom user_agent"
40
+ end
41
+
42
+ # Cookies
43
+ if cookies = @config[:cookies].presence
44
+ cookies.each do |cookie|
45
+ @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
46
+ end
47
+
48
+ logger.debug "BrowserBuilder (cuprite): enabled custom cookies"
49
+ end
50
+
51
+ @browser
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,154 @@
1
+ require 'capybara'
2
+ require 'capybara/mechanize'
3
+ require_relative '../capybara_configuration'
4
+ require_relative '../capybara_ext/mechanize/driver'
5
+ require_relative '../capybara_ext/session'
6
+
7
+ module Tanakai::BrowserBuilder
8
+ class MechanizeBuilder
9
+ attr_reader :logger, :spider
10
+
11
+ def initialize(config, spider:)
12
+ @config = config
13
+ @spider = spider
14
+ @logger = spider.logger
15
+ end
16
+
17
+ def build
18
+ # Register driver
19
+ Capybara.register_driver :mechanize do |app|
20
+ driver = Capybara::Mechanize::Driver.new("app")
21
+ # keep the history as small as possible (by default it's unlimited)
22
+ driver.configure { |a| a.history.max_size = 2 }
23
+ driver
24
+ end
25
+
26
+ # Create browser instance (Capybara session)
27
+ @browser = Capybara::Session.new(:mechanize)
28
+ @browser.spider = spider
29
+ logger.debug "BrowserBuilder (mechanize): created browser instance"
30
+
31
+ if @config[:extensions].present?
32
+ logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
33
+ end
34
+
35
+ # Proxy
36
+ if proxy = @config[:proxy].presence
37
+ proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
38
+ ip, port, type = proxy_string.split(":")
39
+
40
+ if type == "http"
41
+ @browser.driver.set_proxy(*proxy_string.split(":"))
42
+ logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
43
+ else
44
+ logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
45
+ end
46
+ end
47
+
48
+ # SSL
49
+ if ssl_cert_path = @config[:ssl_cert_path].presence
50
+ @browser.driver.browser.agent.http.ca_file = ssl_cert_path
51
+ logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
52
+ end
53
+
54
+ if @config[:ignore_ssl_errors].present?
55
+ @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
56
+ logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
57
+ end
58
+
59
+ # Headers
60
+ if headers = @config[:headers].presence
61
+ @browser.driver.headers = headers
62
+ logger.debug "BrowserBuilder (mechanize): enabled custom headers"
63
+ end
64
+
65
+ if user_agent = @config[:user_agent].presence
66
+ user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
67
+
68
+ @browser.driver.add_header("User-Agent", user_agent_string)
69
+ logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
70
+ end
71
+
72
+ # Cookies
73
+ if cookies = @config[:cookies].presence
74
+ cookies.each do |cookie|
75
+ @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
76
+ end
77
+
78
+ logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
79
+ end
80
+
81
+ # Browser instance options
82
+ # skip_request_errors
83
+ if skip_errors = @config[:skip_request_errors].presence
84
+ @browser.config.skip_request_errors = skip_errors
85
+ logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors"
86
+ end
87
+
88
+ # retry_request_errors
89
+ if retry_errors = @config[:retry_request_errors].presence
90
+ @browser.config.retry_request_errors = retry_errors
91
+ logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
92
+ end
93
+
94
+ # restart_if
95
+ if @config[:restart_if].present?
96
+ logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped"
97
+ end
98
+
99
+ # before_request clear_cookies
100
+ if @config.dig(:before_request, :clear_cookies)
101
+ @browser.config.before_request[:clear_cookies] = true
102
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
103
+ end
104
+
105
+ # before_request clear_and_set_cookies
106
+ if @config.dig(:before_request, :clear_and_set_cookies)
107
+ if cookies = @config[:cookies].presence
108
+ @browser.config.cookies = cookies
109
+ @browser.config.before_request[:clear_and_set_cookies] = true
110
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
111
+ else
112
+ logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
113
+ end
114
+ end
115
+
116
+ # before_request change_user_agent
117
+ if @config.dig(:before_request, :change_user_agent)
118
+ if @config[:user_agent].present? && @config[:user_agent].class == Proc
119
+ @browser.config.user_agent = @config[:user_agent]
120
+ @browser.config.before_request[:change_user_agent] = true
121
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
122
+ else
123
+ logger.error "BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
124
+ end
125
+ end
126
+
127
+ # before_request change_proxy
128
+ if @config.dig(:before_request, :change_proxy)
129
+ if @config[:proxy].present? && @config[:proxy].class == Proc
130
+ @browser.config.proxy = @config[:proxy]
131
+ @browser.config.before_request[:change_proxy] = true
132
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
133
+ else
134
+ logger.error "BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
135
+ end
136
+ end
137
+
138
+ # before_request delay
139
+ if delay = @config.dig(:before_request, :delay).presence
140
+ @browser.config.before_request[:delay] = delay
141
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
142
+ end
143
+
144
+ # encoding
145
+ if encoding = @config[:encoding]
146
+ @browser.config.encoding = encoding
147
+ logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}"
148
+ end
149
+
150
+ # return Capybara session instance
151
+ @browser
152
+ end
153
+ end
154
+ end