kimurai_dynamic 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +111 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai/automation/deploy.yml +54 -0
  14. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  15. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  16. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  17. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  18. data/lib/kimurai/automation/setup.yml +45 -0
  19. data/lib/kimurai/base/saver.rb +106 -0
  20. data/lib/kimurai/base/storage.rb +54 -0
  21. data/lib/kimurai/base.rb +330 -0
  22. data/lib/kimurai/base_helper.rb +22 -0
  23. data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
  24. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  25. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
  26. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
  27. data/lib/kimurai/browser_builder.rb +20 -0
  28. data/lib/kimurai/capybara_configuration.rb +10 -0
  29. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  30. data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
  31. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  32. data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
  33. data/lib/kimurai/capybara_ext/session/config.rb +22 -0
  34. data/lib/kimurai/capybara_ext/session.rb +249 -0
  35. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  36. data/lib/kimurai/cli/generator.rb +57 -0
  37. data/lib/kimurai/cli.rb +183 -0
  38. data/lib/kimurai/core_ext/array.rb +14 -0
  39. data/lib/kimurai/core_ext/hash.rb +5 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +33 -0
  43. data/lib/kimurai/runner.rb +60 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/Gemfile +28 -0
  46. data/lib/kimurai/template/README.md +3 -0
  47. data/lib/kimurai/template/config/application.rb +37 -0
  48. data/lib/kimurai/template/config/automation.yml +13 -0
  49. data/lib/kimurai/template/config/boot.rb +22 -0
  50. data/lib/kimurai/template/config/initializers/.keep +0 -0
  51. data/lib/kimurai/template/config/schedule.rb +57 -0
  52. data/lib/kimurai/template/db/.keep +0 -0
  53. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  54. data/lib/kimurai/template/lib/.keep +0 -0
  55. data/lib/kimurai/template/log/.keep +0 -0
  56. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  57. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  58. data/lib/kimurai/template/spiders/application_spider.rb +143 -0
  59. data/lib/kimurai/template/tmp/.keep +0 -0
  60. data/lib/kimurai/version.rb +3 -0
  61. data/lib/kimurai.rb +54 -0
  62. metadata +349 -0
data/lib/kimurai/base.rb
@@ -0,0 +1,330 @@
+ require_relative 'base/saver'
+ require_relative 'base/storage'
+
+ module Kimurai
+   class Base
+     class InvalidUrlError < StandardError; end
+
+     # don't deep merge config's headers hash option
+     DMERGE_EXCLUDE = [:headers]
+
+     LoggerFormatter = proc do |severity, datetime, progname, msg|
+       current_thread_id = Thread.current.object_id
+       thread_type = Thread.main == Thread.current ? "M" : "C"
+       output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
+         .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]
+
+       if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development"
+         Rbcat.colorize(output, predefined: [:jsonhash, :logger])
+       else
+         output
+       end
+     end
+
+     include BaseHelper
+
+     ###
+
+     class << self
+       attr_reader :run_info, :savers, :storage
+     end
+
+     def self.running?
+       @run_info && @run_info[:status] == :running
+     end
+
+     def self.completed?
+       @run_info && @run_info[:status] == :completed
+     end
+
+     def self.failed?
+       @run_info && @run_info[:status] == :failed
+     end
+
+     def self.visits
+       @run_info && @run_info[:visits]
+     end
+
+     def self.items
+       @run_info && @run_info[:items]
+     end
+
+     def self.update(type, subtype)
+       return unless @run_info
+       @update_mutex.synchronize { @run_info[type][subtype] += 1 }
+     end
+
+     def self.add_event(scope, event)
+       return unless @run_info
+       @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
+     end
+
+     ###
+
+     @engine = :mechanize
+     @pipelines = []
+     @config = {}
+
+     def self.name
+       @name
+     end
+
+     def self.engine
+       @engine ||= superclass.engine
+     end
+
+     def self.pipelines
+       @pipelines ||= superclass.pipelines
+     end
+
+     def self.start_urls
+       @start_urls
+     end
+
+     def self.config
+       if superclass.equal?(::Object)
+         @config
+       else
+         superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)
+       end
+     end
+
+     ###
+
+     def self.logger
+       @logger ||= Kimurai.configuration.logger || begin
+         log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
+         log_level = "Logger::#{log_level}".constantize
+         Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
+       end
+     end
+
+     def self.crawl!(exception_on_fail: true, url: nil)
+
+       logger.error "Spider: already running: #{name}" and return false if running?
+       if !url.nil?
+         start_urls = [url]
+       end
+
+       @storage = Storage.new
+       @savers = {}
+       @update_mutex = Mutex.new
+
+       @run_info = {
+         spider_name: name, status: :running, error: nil, environment: Kimurai.env,
+         start_time: Time.new, stop_time: nil, running_time: nil,
+         visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
+         events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
+       }
+
+       ###
+
+       logger.info "Spider: started: #{name}"
+       open_spider if self.respond_to? :open_spider
+
+       spider = self.new
+       spider.with_info = true
+       if start_urls
+         start_urls.each do |start_url|
+           if start_url.class == Hash
+             spider.request_to(:parse, start_url)
+           else
+             spider.request_to(:parse, url: start_url)
+           end
+         end
+       else
+         spider.parse
+       end
+     rescue StandardError, SignalException, SystemExit => e
+       @run_info.merge!(status: :failed, error: e.inspect)
+       exception_on_fail ? raise(e) : [@run_info, e]
+     else
+       @run_info.merge!(status: :completed)
+     ensure
+       if spider
+         spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
+
+         stop_time = Time.now
+         total_time = (stop_time - @run_info[:start_time]).round(3)
+         @run_info.merge!(stop_time: stop_time, running_time: total_time)
+
+         close_spider if self.respond_to? :close_spider
+
+         message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
+         failed? ? logger.fatal(message) : logger.info(message)
+
+         @run_info, @storage, @savers, @update_mutex = nil
+       end
+     end
+
+     def self.parse!(handler, *args, **request)
+       spider = self.new
+
+       if args.present?
+         spider.public_send(handler, *args)
+       elsif request.present?
+         spider.request_to(handler, request)
+       else
+         spider.public_send(handler)
+       end
+     ensure
+       spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
+     end
+
+     ###
+
+     attr_reader :logger
+     attr_accessor :with_info
+
+     def initialize(engine = self.class.engine, config: {})
+       @engine = engine || self.class.engine
+       @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
+       @pipelines = self.class.pipelines.map do |pipeline_name|
+         klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
+         instance = klass.new
+         instance.spider = self
+         [pipeline_name, instance]
+       end.to_h
+
+       @logger = self.class.logger
+       @savers = {}
+     end
+
+     def browser
+       @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
+     end
+
+     def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
+       raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)
+
+       if @config[:skip_duplicate_requests] && !unique_request?(url)
+         add_event(:duplicate_requests) if self.with_info
+         logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
+       end
+
+       visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
+       return unless visited
+
+       public_send(handler, browser.current_response(response_type), { url: url, data: data })
+     end
+
+     def console(response = nil, url: nil, data: {})
+       binding.pry
+     end
+
+     ###
+
+     def storage
+       # Note: `.crawl!` uses a shared thread-safe Storage instance;
+       # otherwise, each spider instance will have its own Storage
+       @storage ||= self.with_info ? self.class.storage : Storage.new
+     end
+
+     def unique?(scope, value)
+       storage.unique?(scope, value)
+     end
+
+     def save_to(path, item, format:, position: true, append: false)
+       @savers[path] ||= begin
+         options = { format: format, position: position, append: append }
+         if self.with_info
+           self.class.savers[path] ||= Saver.new(path, options)
+         else
+           Saver.new(path, options)
+         end
+       end
+
+       @savers[path].save(item)
+     end
+
+     ###
+
+     def add_event(scope = :custom, event)
+       if self.with_info
+         self.class.add_event(scope, event)
+       end
+
+       logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
+     end
+
+     ###
+
+     private
+
+     def create_browser(engine, config = {})
+       Kimurai::BrowserBuilder.build(engine, config, spider: self)
+     end
+
+     def unique_request?(url)
+       options = @config[:skip_duplicate_requests]
+       if options.class == Hash
+         scope = options[:scope] || :requests_urls
+         if options[:check_only]
+           storage.include?(scope, url) ? false : true
+         else
+           storage.unique?(scope, url) ? true : false
+         end
+       else
+         storage.unique?(:requests_urls, url) ? true : false
+       end
+     end
+
+     def send_item(item, options = {})
+       logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
+       self.class.update(:items, :sent) if self.with_info
+
+       @pipelines.each do |name, instance|
+         item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
+       end
+     rescue => e
+       logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
+       add_event(:drop_items_errors, e.inspect) if self.with_info
+       false
+     else
+       self.class.update(:items, :processed) if self.with_info
+       logger.info "Pipeline: processed: #{JSON.generate(item)}"
+       true
+     ensure
+       if self.with_info
+         logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
+       end
+     end
+
+     def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
+       parts = urls.in_sorted_groups(threads, false)
+       urls_count = urls.size
+
+       all = []
+       start_time = Time.now
+       logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"
+
+       parts.each do |part|
+         all << Thread.new(part) do |part|
+           Thread.current.abort_on_exception = true
+
+           spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
+           spider.with_info = true if self.with_info
+
+           part.each do |url_data|
+             if url_data.class == Hash
+               if url_data[:url].present? && url_data[:data].present?
+                 spider.request_to(handler, delay, url_data)
+               else
+                 spider.public_send(handler, url_data)
+               end
+             else
+               spider.request_to(handler, delay, url: url_data, data: data)
+             end
+           end
+         ensure
+           spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
+         end
+
+         sleep 0.5
+       end
+
+       all.each(&:join)
+       logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
+     end
+   end
+ end
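
An illustrative (hypothetical) spider, not part of this diff, showing how the Base API above is typically consumed: subclass Kimurai::Base, set @name, @engine and @config, define parse, and call crawl!. The class name, URL and selector are made up; the parse signature, save_to, send_item and crawl!(url:) follow the code shown above, and the response object is assumed to be a parsed Nokogiri document as in upstream Kimurai.

    require 'kimurai'

    class ExampleSpider < Kimurai::Base
      @name = "example_spider"                      # returned by self.name, used in logging
      @engine = :mechanize                          # resolved through BrowserBuilder
      @config = { skip_duplicate_requests: true }   # checked in #request_to via #unique_request?

      # crawl! calls request_to(:parse, url: url), so the handler receives
      # the parsed response plus url: and data: keyword arguments.
      def parse(response, url:, data: {})
        item = { title: response.css("title").text.strip }
        save_to "results.json", item, format: :json  # Saver is memoized per path in @savers
        send_item item                               # runs the item through configured pipelines
      end
    end

    ExampleSpider.crawl!(url: "https://example.com/")  # this version accepts an explicit url: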
data/lib/kimurai/base_helper.rb
@@ -0,0 +1,22 @@
+ module Kimurai
+   module BaseHelper
+     private
+
+     def absolute_url(url, base:)
+       return unless url
+       URI.join(base, URI.escape(url)).to_s
+     end
+
+     def escape_url(url)
+       uri = URI.parse(url)
+     rescue URI::InvalidURIError => e
+       URI.parse(URI.escape url).to_s rescue url
+     else
+       url
+     end
+
+     def normalize_url(url, base:)
+       escape_url(absolute_url(url, base: base))
+     end
+   end
+ end
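
For reference (not part of the diff), the helpers above behave roughly as sketched below. The example URLs are made up; note that these methods rely on URI.escape, which was deprecated and removed in Ruby 3.0, so the sketch assumes an older Ruby.

    include Kimurai::BaseHelper  # the helpers are private, so include them to experiment

    absolute_url("/catalog?page=2", base: "https://example.com")
    # => "https://example.com/catalog?page=2"

    absolute_url(nil, base: "https://example.com")
    # => nil (guarded by `return unless url`)

    normalize_url("/search?q=kimurai gem", base: "https://example.com")
    # => "https://example.com/search?q=kimurai%20gem" (space escaped, then re-validated by escape_url)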
data/lib/kimurai/browser_builder/mechanize_builder.rb
@@ -0,0 +1,154 @@
+ require 'capybara'
+ require 'capybara/mechanize'
+ require_relative '../capybara_configuration'
+ require_relative '../capybara_ext/mechanize/driver'
+ require_relative '../capybara_ext/session'
+
+ module Kimurai::BrowserBuilder
+   class MechanizeBuilder
+     attr_reader :logger, :spider
+
+     def initialize(config, spider:)
+       @config = config
+       @spider = spider
+       @logger = spider.logger
+     end
+
+     def build
+       # Register driver
+       Capybara.register_driver :mechanize do |app|
+         driver = Capybara::Mechanize::Driver.new("app")
+         # keep the history as small as possible (by default it's unlimited)
+         driver.configure { |a| a.history.max_size = 2 }
+         driver
+       end
+
+       # Create browser instance (Capybara session)
+       @browser = Capybara::Session.new(:mechanize)
+       @browser.spider = spider
+       logger.debug "BrowserBuilder (mechanize): created browser instance"
+
+       if @config[:extensions].present?
+         logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
+       end
+
+       # Proxy
+       if proxy = @config[:proxy].presence
+         proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
+         ip, port, type = proxy_string.split(":")
+
+         if type == "http"
+           @browser.driver.set_proxy(*proxy_string.split(":"))
+           logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
+         else
+           logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
+         end
+       end
+
+       # SSL
+       if ssl_cert_path = @config[:ssl_cert_path].presence
+         @browser.driver.browser.agent.http.ca_file = ssl_cert_path
+         logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
+       end
+
+       if @config[:ignore_ssl_errors].present?
+         @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+         logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
+       end
+
+       # Headers
+       if headers = @config[:headers].presence
+         @browser.driver.headers = headers
+         logger.debug "BrowserBuilder (mechanize): enabled custom headers"
+       end
+
+       if user_agent = @config[:user_agent].presence
+         user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
+
+         @browser.driver.add_header("User-Agent", user_agent_string)
+         logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
+       end
+
+       # Cookies
+       if cookies = @config[:cookies].presence
+         cookies.each do |cookie|
+           @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
+         end
+
+         logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
+       end
+
+       # Browser instance options
+       # skip_request_errors
+       if skip_errors = @config[:skip_request_errors].presence
+         @browser.config.skip_request_errors = skip_errors
+         logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors"
+       end
+
+       # retry_request_errors
+       if retry_errors = @config[:retry_request_errors].presence
+         @browser.config.retry_request_errors = retry_errors
+         logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
+       end
+
+       # restart_if
+       if @config[:restart_if].present?
+         logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped"
+       end
+
+       # before_request clear_cookies
+       if @config.dig(:before_request, :clear_cookies)
+         @browser.config.before_request[:clear_cookies] = true
+         logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
+       end
+
+       # before_request clear_and_set_cookies
+       if @config.dig(:before_request, :clear_and_set_cookies)
+         if cookies = @config[:cookies].presence
+           @browser.config.cookies = cookies
+           @browser.config.before_request[:clear_and_set_cookies] = true
+           logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
+         else
+           logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
+         end
+       end
+
+       # before_request change_user_agent
+       if @config.dig(:before_request, :change_user_agent)
+         if @config[:user_agent].present? && @config[:user_agent].class == Proc
+           @browser.config.user_agent = @config[:user_agent]
+           @browser.config.before_request[:change_user_agent] = true
+           logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
+         else
+           logger.error "BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
+         end
+       end
+
+       # before_request change_proxy
+       if @config.dig(:before_request, :change_proxy)
+         if @config[:proxy].present? && @config[:proxy].class == Proc
+           @browser.config.proxy = @config[:proxy]
+           @browser.config.before_request[:change_proxy] = true
+           logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
+         else
+           logger.error "BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
+         end
+       end
+
+       # before_request delay
+       if delay = @config.dig(:before_request, :delay).presence
+         @browser.config.before_request[:delay] = delay
+         logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
+       end
+
+       # encoding
+       if encoding = @config[:encoding]
+         @browser.config.encoding = encoding
+         logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}"
+       end
+
+       # return Capybara session instance
+       @browser
+     end
+   end
+ end
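
For context (not part of the diff), a spider config hash along the following lines exercises most of the branches handled by MechanizeBuilder#build above. All values are placeholders; the proxy string follows the ip:port:type format the builder splits on ":", and Proc values for user_agent/proxy are what the before_request.change_* options expect.

    # hypothetical config passed to MechanizeBuilder.new(config, spider: spider)
    config = {
      proxy: "1.2.3.4:8080:http",                    # only the "http" type is supported by this builder
      headers: { "Accept-Language" => "en" },
      user_agent: -> { "Mozilla/5.0 (compatible; ExampleBot)" },
      cookies: [{ name: "session", value: "abc", domain: "example.com" }],
      skip_request_errors: [{ error: RuntimeError, message: "404" }],
      retry_request_errors: [Net::ReadTimeout],
      before_request: { change_user_agent: true, delay: 2..5 },
      encoding: "UTF-8"
    }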
data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb
@@ -0,0 +1,175 @@
+ require 'capybara'
+ require 'capybara/poltergeist'
+ require_relative '../capybara_configuration'
+ require_relative '../capybara_ext/poltergeist/driver'
+ require_relative '../capybara_ext/session'
+
+ module Kimurai::BrowserBuilder
+   class PoltergeistPhantomjsBuilder
+     attr_reader :logger, :spider
+
+     def initialize(config, spider:)
+       @config = config
+       @spider = spider
+       @logger = spider.logger
+     end
+
+     def build
+       # Register driver
+       Capybara.register_driver :poltergeist_phantomjs do |app|
+         # Create driver options
+         driver_options = {
+           js_errors: false, debug: false, inspector: false, phantomjs_options: []
+         }
+
+         if extensions = @config[:extensions].presence
+           driver_options[:extensions] = extensions
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
+         end
+
+         # Window size
+         if size = @config[:window_size].presence
+           driver_options[:window_size] = size
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size"
+         end
+
+         # SSL
+         if ssl_cert_path = @config[:ssl_cert_path].presence
+           driver_options[:phantomjs_options] << "--ssl-certificates-path=#{ssl_cert_path}"
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert"
+         end
+
+         if @config[:ignore_ssl_errors].present?
+           driver_options[:phantomjs_options].push("--ignore-ssl-errors=yes", "--ssl-protocol=any")
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors"
+         end
+
+         # Disable images
+         if @config[:disable_images].present?
+           driver_options[:phantomjs_options] << "--load-images=no"
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images"
+         end
+
+         Capybara::Poltergeist::Driver.new(app, driver_options)
+       end
+
+       # Create browser instance (Capybara session)
+       @browser = Capybara::Session.new(:poltergeist_phantomjs)
+       @browser.spider = spider
+       logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance"
+
+       # Proxy
+       if proxy = @config[:proxy].presence
+         proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
+         ip, port, type = proxy_string.split(":")
+
+         if %w(http socks5).include?(type)
+           @browser.driver.set_proxy(*proxy_string.split(":"))
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+         else
+           logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
+         end
+       end
+
+       # Headers
+       if headers = @config[:headers].presence
+         @browser.driver.headers = headers
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers"
+       end
+
+       if user_agent = @config[:user_agent].presence
+         user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
+
+         @browser.driver.add_header("User-Agent", user_agent_string)
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
+       end
+
+       # Cookies
+       if cookies = @config[:cookies].presence
+         cookies.each do |cookie|
+           @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
+         end
+
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies"
+       end
+
+       # Browser instance options
+       # skip_request_errors
+       if skip_errors = @config[:skip_request_errors].presence
+         @browser.config.skip_request_errors = skip_errors
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors"
+       end
+
+       # retry_request_errors
+       if retry_errors = @config[:retry_request_errors].presence
+         @browser.config.retry_request_errors = retry_errors
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
+       end
+
+       # restart_if
+       if requests_limit = @config.dig(:restart_if, :requests_limit).presence
+         @browser.config.restart_if[:requests_limit] = requests_limit
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
+       end
+
+       if memory_limit = @config.dig(:restart_if, :memory_limit).presence
+         @browser.config.restart_if[:memory_limit] = memory_limit
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
+       end
+
+       # before_request clear_cookies
+       if @config.dig(:before_request, :clear_cookies)
+         @browser.config.before_request[:clear_cookies] = true
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
+       end
+
+       # before_request clear_and_set_cookies
+       if @config.dig(:before_request, :clear_and_set_cookies)
+         if cookies = @config[:cookies].presence
+           @browser.config.cookies = cookies
+           @browser.config.before_request[:clear_and_set_cookies] = true
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
+         else
+           logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
+         end
+       end
+
+       # before_request change_user_agent
+       if @config.dig(:before_request, :change_user_agent)
+         if @config[:user_agent].present? && @config[:user_agent].class == Proc
+           @browser.config.user_agent = @config[:user_agent]
+           @browser.config.before_request[:change_user_agent] = true
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
+         else
+           logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
+         end
+       end
+
+       # before_request change_proxy
+       if @config.dig(:before_request, :change_proxy)
+         if @config[:proxy].present? && @config[:proxy].class == Proc
+           @browser.config.proxy = @config[:proxy]
+           @browser.config.before_request[:change_proxy] = true
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
+         else
+           logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
+         end
+       end
+
+       # before_request delay
+       if delay = @config.dig(:before_request, :delay).presence
+         @browser.config.before_request[:delay] = delay
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
+       end
+
+       # encoding
+       if encoding = @config[:encoding]
+         @browser.config.encoding = encoding
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled encoding: #{encoding}"
+       end
+
+       # return Capybara session instance
+       @browser
+     end
+   end
+ end
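
Similarly (not part of the diff), the PhantomJS-specific options above can be sketched like this; all values are placeholders. window_size, disable_images, ssl_cert_path and ignore_ssl_errors are translated into phantomjs_options at driver registration time, while restart_if and before_request are applied to the Capybara session config.

    # hypothetical config passed to PoltergeistPhantomjsBuilder.new(config, spider: spider)
    config = {
      window_size: [1366, 768],
      disable_images: true,                              # adds --load-images=no
      ignore_ssl_errors: true,                           # adds --ignore-ssl-errors=yes --ssl-protocol=any
      proxy: "1.2.3.4:1080:socks5",                      # http and socks5 proxies are both accepted here
      restart_if: { requests_limit: 100, memory_limit: 350_000 },
      before_request: { clear_cookies: true, delay: 1..3 }
    }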