tanakai 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +118 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/tanakai +6 -0
  12. data/lib/tanakai/automation/deploy.yml +54 -0
  13. data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
  14. data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
  15. data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
  16. data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
  17. data/lib/tanakai/automation/setup.yml +45 -0
  18. data/lib/tanakai/base/saver.rb +106 -0
  19. data/lib/tanakai/base/storage.rb +54 -0
  20. data/lib/tanakai/base.rb +326 -0
  21. data/lib/tanakai/base_helper.rb +22 -0
  22. data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
  23. data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
  24. data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
  25. data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  26. data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
  27. data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
  28. data/lib/tanakai/browser_builder.rb +20 -0
  29. data/lib/tanakai/capybara_configuration.rb +10 -0
  30. data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
  31. data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
  32. data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
  33. data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
  34. data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
  35. data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
  36. data/lib/tanakai/capybara_ext/session/config.rb +22 -0
  37. data/lib/tanakai/capybara_ext/session.rb +249 -0
  38. data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
  39. data/lib/tanakai/cli/generator.rb +57 -0
  40. data/lib/tanakai/cli.rb +183 -0
  41. data/lib/tanakai/core_ext/array.rb +14 -0
  42. data/lib/tanakai/core_ext/hash.rb +5 -0
  43. data/lib/tanakai/core_ext/numeric.rb +19 -0
  44. data/lib/tanakai/core_ext/string.rb +7 -0
  45. data/lib/tanakai/pipeline.rb +33 -0
  46. data/lib/tanakai/runner.rb +60 -0
  47. data/lib/tanakai/template/.gitignore +18 -0
  48. data/lib/tanakai/template/Gemfile +28 -0
  49. data/lib/tanakai/template/README.md +3 -0
  50. data/lib/tanakai/template/config/application.rb +37 -0
  51. data/lib/tanakai/template/config/automation.yml +13 -0
  52. data/lib/tanakai/template/config/boot.rb +22 -0
  53. data/lib/tanakai/template/config/initializers/.keep +0 -0
  54. data/lib/tanakai/template/config/schedule.rb +57 -0
  55. data/lib/tanakai/template/db/.keep +0 -0
  56. data/lib/tanakai/template/helpers/application_helper.rb +3 -0
  57. data/lib/tanakai/template/lib/.keep +0 -0
  58. data/lib/tanakai/template/log/.keep +0 -0
  59. data/lib/tanakai/template/pipelines/saver.rb +11 -0
  60. data/lib/tanakai/template/pipelines/validator.rb +24 -0
  61. data/lib/tanakai/template/spiders/application_spider.rb +143 -0
  62. data/lib/tanakai/template/tmp/.keep +0 -0
  63. data/lib/tanakai/version.rb +3 -0
  64. data/lib/tanakai.rb +54 -0
  65. data/tanakai.gemspec +50 -0
  66. metadata +382 -0
require_relative 'base/saver'
require_relative 'base/storage'

module Tanakai
  # Base class for all spiders. Holds per-class configuration (engine,
  # pipelines, start_urls, config), class-level run statistics shared across
  # threads during `.crawl!`, and instance helpers for visiting urls,
  # tracking unique values, saving items and sending items through pipelines.
  class Base
    # Raised by #request_to when the given url does not parse as HTTP(S).
    class InvalidUrlError < StandardError; end

    # don't deep merge config's headers hash option
    DMERGE_EXCLUDE = [:headers]

    # Formatter for the default logger. Prefixes each line with the severity
    # letter, timestamp, pid, and whether the logging thread is the main
    # ("M") or a child ("C") thread plus that thread's object_id. Output is
    # colorized via Rbcat in the development env unless colorize_logger is
    # explicitly set to false.
    LoggerFormatter = proc do |severity, datetime, progname, msg|
      current_thread_id = Thread.current.object_id
      thread_type = Thread.main == Thread.current ? "M" : "C"
      output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
        .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]

      if Tanakai.configuration.colorize_logger != false && Tanakai.env == "development"
        Rbcat.colorize(output, predefined: [:jsonhash, :logger])
      else
        output
      end
    end

    include BaseHelper

    ###

    class << self
      # Shared run state; only set for the duration of a `.crawl!` call.
      attr_reader :run_info, :savers, :storage
    end

    # True while a `.crawl!` for this spider class is in progress.
    def self.running?
      @run_info && @run_info[:status] == :running
    end

    # True when the last `.crawl!` finished without raising.
    def self.completed?
      @run_info && @run_info[:status] == :completed
    end

    # True when the last `.crawl!` raised an exception.
    def self.failed?
      @run_info && @run_info[:status] == :failed
    end

    # Requests/responses counters of the current run (nil outside `.crawl!`).
    def self.visits
      @run_info && @run_info[:visits]
    end

    # Sent/processed items counters of the current run (nil outside `.crawl!`).
    def self.items
      @run_info && @run_info[:items]
    end

    # Thread-safely increments a run_info counter,
    # e.g. `update(:visits, :requests)`. No-op outside of `.crawl!`.
    def self.update(type, subtype)
      return unless @run_info
      @update_mutex.synchronize { @run_info[type][subtype] += 1 }
    end

    # Thread-safely increments an event counter under run_info[:events][scope].
    # No-op outside of `.crawl!`.
    def self.add_event(scope, event)
      return unless @run_info
      @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
    end

    ###

    # Defaults on Base itself; subclasses inherit them through the reader
    # methods below (which fall back to the superclass).
    @engine = :mechanize
    @pipelines = []
    @config = {}

    # Spider name, expected to be set by subclasses via @name.
    def self.name
      @name
    end

    # Browser engine symbol; falls back to the superclass engine when unset.
    def self.engine
      @engine ||= superclass.engine
    end

    # Pipeline name list; falls back to the superclass pipelines when unset.
    def self.pipelines
      @pipelines ||= superclass.pipelines
    end

    # Urls that `.crawl!` feeds to the :parse handler.
    def self.start_urls
      @start_urls
    end

    # Effective config: own @config deep-merged over the superclass config,
    # except the keys in DMERGE_EXCLUDE which are replaced, not merged.
    def self.config
      if superclass.equal?(::Object)
        @config
      else
        superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)
      end
    end

    ###

    # Memoized logger: the configured logger, or a STDOUT Logger using
    # LoggerFormatter with level taken from ENV["LOG_LEVEL"] /
    # configuration.log_level / "DEBUG".
    def self.logger
      @logger ||= Tanakai.configuration.logger || begin
        log_level = (ENV["LOG_LEVEL"] || Tanakai.configuration.log_level || "DEBUG").to_s.upcase
        log_level = "Logger::#{log_level}".constantize
        Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
      end
    end

    # Runs the spider: visits all start_urls with the :parse handler (or calls
    # #parse directly when there are none), tracking shared stats in @run_info.
    # Returns false if the spider is already running. On error re-raises when
    # exception_on_fail is true, otherwise returns [@run_info, error]. The
    # ensure block always destroys the browser driver, finalizes timing info
    # and clears the class-level run state.
    def self.crawl!(exception_on_fail: true)
      logger.error "Spider: already running: #{name}" and return false if running?

      @storage = Storage.new
      @savers = {}
      @update_mutex = Mutex.new

      @run_info = {
        spider_name: name, status: :running, error: nil, environment: Tanakai.env,
        start_time: Time.new, stop_time: nil, running_time: nil,
        visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
        events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
      }

      ###

      logger.info "Spider: started: #{name}"
      open_spider if self.respond_to? :open_spider

      spider = self.new
      spider.with_info = true
      if start_urls
        start_urls.each do |start_url|
          # a Hash start_url carries request options (url:, data:, ...)
          if start_url.class == Hash
            spider.request_to(:parse, start_url)
          else
            spider.request_to(:parse, url: start_url)
          end
        end
      else
        spider.parse
      end
    rescue StandardError, SignalException, SystemExit => e
      # SignalException/SystemExit are rescued deliberately so the run is
      # marked :failed and cleaned up even on Ctrl-C or `exit`
      @run_info.merge!(status: :failed, error: e.inspect)
      exception_on_fail ? raise(e) : [@run_info, e]
    else
      @run_info.merge!(status: :completed)
    ensure
      if spider
        # only destroy the driver if a browser was actually created
        spider.browser.destroy_driver! if spider.instance_variable_get("@browser")

        stop_time = Time.now
        total_time = (stop_time - @run_info[:start_time]).round(3)
        @run_info.merge!(stop_time: stop_time, running_time: total_time)

        close_spider if self.respond_to? :close_spider

        message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
        failed? ? logger.fatal(message) : logger.info(message)

        # clear shared run state (multiple assignment sets every var to nil)
        @run_info, @storage, @savers, @update_mutex = nil
      end
    end

    # Runs a single handler once (used by the CLI): with positional args,
    # with request keywords via #request_to, or with no args at all.
    # Always destroys the browser driver afterwards if one was created.
    def self.parse!(handler, *args, **request)
      spider = self.new

      if args.present?
        spider.public_send(handler, *args)
      elsif request.present?
        # NOTE(review): passes the keyword hash positionally into
        # #request_to(handler, delay = nil, url:, ...) — relies on Ruby 2.x
        # hash-to-keywords conversion; under Ruby 3 keyword separation this
        # would bind to `delay` and raise. Verify on the targeted Ruby version.
        spider.request_to(handler, request)
      else
        spider.public_send(handler)
      end
    ensure
      spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
    end

    ###

    attr_reader :logger
    # with_info: when true, this instance reports stats/savers/storage into
    # the shared class-level run state (set by `.crawl!` and in_parallel).
    attr_accessor :with_info

    # engine - browser engine override; config: - instance config deep-merged
    # over the class config (headers replaced, not merged). Instantiates all
    # configured pipelines and wires them back to this spider.
    def initialize(engine = self.class.engine, config: {})
      @engine = engine || self.class.engine
      @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
      @pipelines = self.class.pipelines.map do |pipeline_name|
        klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
        instance = klass.new
        instance.spider = self
        [pipeline_name, instance]
      end.to_h

      @logger = self.class.logger
      @savers = {}
    end

    # Lazily-built Capybara session for the configured engine.
    def browser
      @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
    end

    # Visits url (optionally with a delay) and calls `handler` with the
    # response and {url:, data:}. Skips duplicate urls when
    # config[:skip_duplicate_requests] is enabled. Raises InvalidUrlError
    # for non-HTTP(S) urls.
    def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
      raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)

      if @config[:skip_duplicate_requests] && !unique_request?(url)
        add_event(:duplicate_requests) if self.with_info
        logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
      end

      visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
      return unless visited

      public_send(handler, browser.current_response(response_type), { url: url, data: data })
    end

    # Drops into a Pry session; usable as a handler for interactive debugging.
    def console(response = nil, url: nil, data: {})
      binding.pry
    end

    ###

    def storage
      # Note: for `.crawl!` uses shared thread safe Storage instance,
      # otherwise, each spider instance will have it's own Storage
      @storage ||= self.with_info ? self.class.storage : Storage.new
    end

    # True the first time `value` is seen within `scope` (and records it).
    def unique?(scope, value)
      storage.unique?(scope, value)
    end

    # Saves item to path with the given format. Savers are memoized per path;
    # during `.crawl!` (with_info) they are shared at the class level so all
    # threads append to the same file.
    def save_to(path, item, format:, position: true, append: false)
      @savers[path] ||= begin
        options = { format: format, position: position, append: append }
        if self.with_info
          self.class.savers[path] ||= Saver.new(path, options)
        else
          Saver.new(path, options)
        end
      end

      @savers[path].save(item)
    end

    ###

    # Records an event into the shared run stats (when with_info) and logs
    # custom-scope events.
    def add_event(scope = :custom, event)
      if self.with_info
        self.class.add_event(scope, event)
      end

      logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
    end

    ###

    private

    # Builds an additional browser session (beyond the default #browser).
    def create_browser(engine, config = {})
      Tanakai::BrowserBuilder.build(engine, config, spider: self)
    end

    # True when url was not requested before. skip_duplicate_requests may be
    # a Hash with :scope (storage scope, default :requests_urls) and
    # :check_only (don't record the url, just test membership).
    def unique_request?(url)
      options = @config[:skip_duplicate_requests]
      if options.class == Hash
        scope = options[:scope] || :requests_urls
        if options[:check_only]
          storage.include?(scope, url) ? false : true
        else
          storage.unique?(scope, url) ? true : false
        end
      else
        storage.unique?(:requests_urls, url) ? true : false
      end
    end

    # Passes item through every configured pipeline in order (each pipeline
    # receives the previous pipeline's return value). Returns true when all
    # pipelines processed it, false when any pipeline raised (item dropped).
    def send_item(item, options = {})
      logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
      self.class.update(:items, :sent) if self.with_info

      @pipelines.each do |name, instance|
        item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
      end
    rescue => e
      logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
      add_event(:drop_items_errors, e.inspect) if self.with_info
      false
    else
      self.class.update(:items, :processed) if self.with_info
      logger.info "Pipeline: processed: #{JSON.generate(item)}"
      true
    ensure
      if self.with_info
        logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
      end
    end

    # Processes urls concurrently: splits them into `threads` groups and runs
    # each group in its own thread with its own spider instance (and own
    # browser). Each url is passed to `handler` via #request_to, except Hash
    # entries without url/data which are passed to the handler directly.
    # Blocks until all threads finish.
    def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
      parts = urls.in_sorted_groups(threads, false)
      urls_count = urls.size

      all = []
      start_time = Time.now
      logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"

      parts.each do |part|
        all << Thread.new(part) do |part|
          Thread.current.abort_on_exception = true

          spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
          spider.with_info = true if self.with_info

          part.each do |url_data|
            if url_data.class == Hash
              if url_data[:url].present? && url_data[:data].present?
                spider.request_to(handler, delay, url_data)
              else
                spider.public_send(handler, url_data)
              end
            else
              spider.request_to(handler, delay, url: url_data, data: data)
            end
          end
        ensure
          # each thread cleans up its own browser
          spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
        end

        # stagger thread start-up
        sleep 0.5
      end

      all.each(&:join)
      logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
    end
  end
end
module Tanakai
  # Private url helpers mixed into Tanakai::Base.
  #
  # Fix: the original used Kernel `URI.escape`, which was deprecated in
  # Ruby 2.7 and removed in Ruby 3.0 (calls raise NoMethodError). The
  # stdlib replacement with the same escaping semantics is
  # `URI::DEFAULT_PARSER.escape`.
  module BaseHelper
    private

    # Joins a (possibly relative) url with base into an absolute url string.
    # The url is percent-escaped first so URI.join doesn't raise on unsafe
    # characters (e.g. spaces). Returns nil when url is nil.
    def absolute_url(url, base:)
      return unless url
      URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
    end

    # Returns url unchanged when it's already a valid URI; otherwise tries to
    # percent-escape it into a valid one. Falls back to the original string
    # if it still can't be parsed.
    def escape_url(url)
      URI.parse(url)
      url
    rescue URI::InvalidURIError
      begin
        URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s
      rescue StandardError
        # best effort: hand back the raw url rather than raising
        url
      end
    end

    # Absolute + escaped version of url relative to base.
    def normalize_url(url, base:)
      escape_url(absolute_url(url, base: base))
    end
  end
end
require 'capybara/apparition'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/session'
require_relative '../capybara_ext/apparition/driver'

module Tanakai::BrowserBuilder
  # Builds a Capybara session for the Apparition driver and applies the
  # spider config options (headers, user_agent, cookies) to it.
  class ApparitionBuilder
    attr_reader :logger, :spider

    # config - spider config hash; spider - the spider instance whose logger
    # is reused for build diagnostics.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :apparition driver, creates a session bound to the
    # spider, applies headers / user agent / cookies from the config, and
    # returns the configured Capybara::Session.
    def build
      # Register driver; timeout and headless mode come from the environment
      Capybara.register_driver :apparition do |app|
        wait_limit = ENV.fetch('TIMEOUT', 30).to_i
        options = { js_errors: false, timeout: wait_limit, debug: ENV['DEBUG'] }
        options[:headless] = ENV.fetch("HEADLESS", "true") == "true"
        logger.debug "BrowserBuilder (apparition): enabled extensions"

        Capybara::Apparition::Driver.new(app, options)
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:apparition)
      @browser.spider = spider
      logger.debug "BrowserBuilder (apparition): created browser instance"

      # Custom headers
      custom_headers = @config[:headers].presence
      if custom_headers
        @browser.driver.headers = custom_headers
        logger.debug "BrowserBuilder (apparition): enabled custom headers"
      end

      # Custom user agent (plain string or a Proc returning one)
      agent = @config[:user_agent].presence
      if agent
        agent_string = (agent.class == Proc ? agent.call : agent).strip
        @browser.driver.add_header("User-Agent", agent_string)
        logger.debug "BrowserBuilder (apparition): enabled custom user_agent"
      end

      # Custom cookies (array of hashes with :name and :value keys)
      custom_cookies = @config[:cookies].presence
      if custom_cookies
        custom_cookies.each { |c| @browser.driver.set_cookie(c[:name], c[:value], c) }
        logger.debug "BrowserBuilder (apparition): enabled custom cookies"
      end

      @browser
    end
  end
end
require 'capybara/cuprite'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/session'
require_relative '../capybara_ext/cuprite/driver'

module Tanakai::BrowserBuilder
  # Builds a Capybara session for the Cuprite driver and applies the spider
  # config options (headers, user_agent, cookies) to it.
  class CupriteBuilder
    attr_reader :logger, :spider

    # config - spider config hash; spider - the spider instance whose logger
    # is reused for build diagnostics.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :cuprite driver, creates a session bound to the spider,
    # applies headers / user agent / cookies from the config, and returns
    # the configured Capybara::Session.
    def build
      # Register driver; headless mode comes from the environment
      Capybara.register_driver :cuprite do |app|
        options = { headless: ENV.fetch("HEADLESS", "true") == "true" }
        logger.debug "BrowserBuilder (cuprite): enabled extensions"

        Capybara::Cuprite::Driver.new(app, options)
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:cuprite)
      @browser.spider = spider
      logger.debug "BrowserBuilder (cuprite): created browser instance"

      # Custom headers
      custom_headers = @config[:headers].presence
      if custom_headers
        @browser.driver.headers = custom_headers
        logger.debug "BrowserBuilder (cuprite): enabled custom headers"
      end

      # Custom user agent (plain string or a Proc returning one); Cuprite
      # takes it through the headers writer
      agent = @config[:user_agent].presence
      if agent
        agent_string = (agent.class == Proc ? agent.call : agent).strip
        @browser.driver.headers = { "User-Agent" => agent_string }
        logger.debug "BrowserBuilder (cuprite): enabled custom user_agent"
      end

      # Custom cookies (array of hashes with :name and :value keys)
      custom_cookies = @config[:cookies].presence
      if custom_cookies
        custom_cookies.each { |c| @browser.driver.set_cookie(c[:name], c[:value], c) }
        logger.debug "BrowserBuilder (cuprite): enabled custom cookies"
      end

      @browser
    end
  end
end
require 'capybara'
require 'capybara/mechanize'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/mechanize/driver'
require_relative '../capybara_ext/session'

module Tanakai::BrowserBuilder
  # Builds a Capybara session backed by Mechanize (plain HTTP, no JavaScript)
  # and applies the spider config: proxy, SSL options, headers, user agent,
  # cookies, and the session-level skip/retry/before_request options.
  class MechanizeBuilder
    attr_reader :logger, :spider

    # config - spider config hash; spider - the spider instance whose logger
    # is reused for build diagnostics.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Returns a fully configured Capybara::Session for the :mechanize driver.
    def build
      # Register driver
      Capybara.register_driver :mechanize do |app|
        driver = Capybara::Mechanize::Driver.new("app")
        # keep the history as small as possible (by default it's unlimited)
        driver.configure { |a| a.history.max_size = 2 }
        driver
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:mechanize)
      @browser.spider = spider
      logger.debug "BrowserBuilder (mechanize): created browser instance"

      # Mechanize has no real browser, so extensions can't be injected
      if @config[:extensions].present?
        logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
      end

      # Proxy: "ip:port:type[:user:password]" string or a Proc returning one;
      # only http proxies are supported here
      if proxy = @config[:proxy].presence
        proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
        ip, port, type = proxy_string.split(":")

        if type == "http"
          @browser.driver.set_proxy(*proxy_string.split(":"))
          logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
        else
          logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
        end
      end

      # SSL: custom CA certificate file
      if ssl_cert_path = @config[:ssl_cert_path].presence
        @browser.driver.browser.agent.http.ca_file = ssl_cert_path
        logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
      end

      # SSL: disable certificate verification entirely
      if @config[:ignore_ssl_errors].present?
        @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
        logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
      end

      # Headers
      if headers = @config[:headers].presence
        @browser.driver.headers = headers
        logger.debug "BrowserBuilder (mechanize): enabled custom headers"
      end

      # User agent: a plain string or a Proc returning one
      if user_agent = @config[:user_agent].presence
        user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip

        @browser.driver.add_header("User-Agent", user_agent_string)
        logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
      end

      # Cookies: array of hashes with at least :name and :value keys
      if cookies = @config[:cookies].presence
        cookies.each do |cookie|
          @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end

        logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
      end

      # Browser instance options
      # skip_request_errors
      if skip_errors = @config[:skip_request_errors].presence
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors"
      end

      # retry_request_errors
      if retry_errors = @config[:retry_request_errors].presence
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
      end

      # restart_if: only meaningful for real browser engines
      if @config[:restart_if].present?
        logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped"
      end

      # before_request clear_cookies
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
      end

      # before_request clear_and_set_cookies (requires :cookies to be present)
      if @config.dig(:before_request, :clear_and_set_cookies)
        if cookies = @config[:cookies].presence
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      # before_request change_user_agent (requires :user_agent to be a Proc)
      if @config.dig(:before_request, :change_user_agent)
        if @config[:user_agent].present? && @config[:user_agent].class == Proc
          @browser.config.user_agent = @config[:user_agent]
          @browser.config.before_request[:change_user_agent] = true
          logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
        else
          logger.error "BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
        end
      end

      # before_request change_proxy (requires :proxy to be a Proc)
      if @config.dig(:before_request, :change_proxy)
        if @config[:proxy].present? && @config[:proxy].class == Proc
          @browser.config.proxy = @config[:proxy]
          @browser.config.before_request[:change_proxy] = true
          logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
        else
          logger.error "BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
        end
      end

      # before_request delay
      if delay = @config.dig(:before_request, :delay).presence
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
      end

      # encoding
      if encoding = @config[:encoding]
        @browser.config.encoding = encoding
        logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}"
      end

      # return Capybara session instance
      @browser
    end
  end
end