kimurai_dynamic 1.4.1

Files changed (62)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +111 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai/automation/deploy.yml +54 -0
  14. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  15. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  16. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  17. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  18. data/lib/kimurai/automation/setup.yml +45 -0
  19. data/lib/kimurai/base/saver.rb +106 -0
  20. data/lib/kimurai/base/storage.rb +54 -0
  21. data/lib/kimurai/base.rb +330 -0
  22. data/lib/kimurai/base_helper.rb +22 -0
  23. data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
  24. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  25. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
  26. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
  27. data/lib/kimurai/browser_builder.rb +20 -0
  28. data/lib/kimurai/capybara_configuration.rb +10 -0
  29. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  30. data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
  31. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  32. data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
  33. data/lib/kimurai/capybara_ext/session/config.rb +22 -0
  34. data/lib/kimurai/capybara_ext/session.rb +249 -0
  35. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  36. data/lib/kimurai/cli/generator.rb +57 -0
  37. data/lib/kimurai/cli.rb +183 -0
  38. data/lib/kimurai/core_ext/array.rb +14 -0
  39. data/lib/kimurai/core_ext/hash.rb +5 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +33 -0
  43. data/lib/kimurai/runner.rb +60 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/Gemfile +28 -0
  46. data/lib/kimurai/template/README.md +3 -0
  47. data/lib/kimurai/template/config/application.rb +37 -0
  48. data/lib/kimurai/template/config/automation.yml +13 -0
  49. data/lib/kimurai/template/config/boot.rb +22 -0
  50. data/lib/kimurai/template/config/initializers/.keep +0 -0
  51. data/lib/kimurai/template/config/schedule.rb +57 -0
  52. data/lib/kimurai/template/db/.keep +0 -0
  53. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  54. data/lib/kimurai/template/lib/.keep +0 -0
  55. data/lib/kimurai/template/log/.keep +0 -0
  56. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  57. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  58. data/lib/kimurai/template/spiders/application_spider.rb +143 -0
  59. data/lib/kimurai/template/tmp/.keep +0 -0
  60. data/lib/kimurai/version.rb +3 -0
  61. data/lib/kimurai.rb +54 -0
  62. metadata +349 -0
data/lib/kimurai/base.rb
@@ -0,0 +1,330 @@
+ require_relative 'base/saver'
+ require_relative 'base/storage'
+
+ module Kimurai
+   class Base
+     class InvalidUrlError < StandardError; end
+
+     # don't deep merge config's headers hash option
+     DMERGE_EXCLUDE = [:headers]
+
+     LoggerFormatter = proc do |severity, datetime, progname, msg|
+       current_thread_id = Thread.current.object_id
+       thread_type = Thread.main == Thread.current ? "M" : "C"
+       output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n".freeze %
+         [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]
+
+       if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development"
+         Rbcat.colorize(output, predefined: [:jsonhash, :logger])
+       else
+         output
+       end
+     end
+
+     include BaseHelper
+
+     ###
+
+     class << self
+       attr_reader :run_info, :savers, :storage
+     end
+
+     def self.running?
+       @run_info && @run_info[:status] == :running
+     end
+
+     def self.completed?
+       @run_info && @run_info[:status] == :completed
+     end
+
+     def self.failed?
+       @run_info && @run_info[:status] == :failed
+     end
+
+     def self.visits
+       @run_info && @run_info[:visits]
+     end
+
+     def self.items
+       @run_info && @run_info[:items]
+     end
+
+     def self.update(type, subtype)
+       return unless @run_info
+       @update_mutex.synchronize { @run_info[type][subtype] += 1 }
+     end
+
+     def self.add_event(scope, event)
+       return unless @run_info
+       @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
+     end
+
+     ###
+
+     @engine = :mechanize
+     @pipelines = []
+     @config = {}
+
+     def self.name
+       @name
+     end
+
+     def self.engine
+       @engine ||= superclass.engine
+     end
+
+     def self.pipelines
+       @pipelines ||= superclass.pipelines
+     end
+
+     def self.start_urls
+       @start_urls
+     end
+
+     def self.config
+       if superclass.equal?(::Object)
+         @config
+       else
+         superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)
+       end
+     end
+
+     ###
+
+     def self.logger
+       @logger ||= Kimurai.configuration.logger || begin
+         log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
+         log_level = "Logger::#{log_level}".constantize
+         Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
+       end
+     end
+
+     def self.crawl!(exception_on_fail: true, url: nil)
+
+       logger.error "Spider: already running: #{name}" and return false if running?
+       # fall back to the class-level start_urls when no explicit url is given
+       start_urls = [url] if !url.nil?
+       start_urls ||= self.start_urls
+
+       @storage = Storage.new
+       @savers = {}
+       @update_mutex = Mutex.new
+
+       @run_info = {
+         spider_name: name, status: :running, error: nil, environment: Kimurai.env,
+         start_time: Time.new, stop_time: nil, running_time: nil,
+         visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
+         events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
+       }
+
+       ###
+
+       logger.info "Spider: started: #{name}"
+       open_spider if self.respond_to? :open_spider
+
+       spider = self.new
+       spider.with_info = true
+       if start_urls
+         start_urls.each do |start_url|
+           if start_url.class == Hash
+             spider.request_to(:parse, start_url)
+           else
+             spider.request_to(:parse, url: start_url)
+           end
+         end
+       else
+         spider.parse
+       end
+     rescue StandardError, SignalException, SystemExit => e
+       @run_info.merge!(status: :failed, error: e.inspect)
+       exception_on_fail ? raise(e) : [@run_info, e]
+     else
+       @run_info.merge!(status: :completed)
+     ensure
+       if spider
+         spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
+
+         stop_time = Time.now
+         total_time = (stop_time - @run_info[:start_time]).round(3)
+         @run_info.merge!(stop_time: stop_time, running_time: total_time)
+
+         close_spider if self.respond_to? :close_spider
+
+         message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
+         failed? ? logger.fatal(message) : logger.info(message)
+
+         @run_info, @storage, @savers, @update_mutex = nil
+       end
+     end
+
+     def self.parse!(handler, *args, **request)
+       spider = self.new
+
+       if args.present?
+         spider.public_send(handler, *args)
+       elsif request.present?
+         spider.request_to(handler, request)
+       else
+         spider.public_send(handler)
+       end
+     ensure
+       spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
+     end
+
+     ###
+
+     attr_reader :logger
+     attr_accessor :with_info
+
+     def initialize(engine = self.class.engine, config: {})
+       @engine = engine || self.class.engine
+       @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
+       @pipelines = self.class.pipelines.map do |pipeline_name|
+         klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
+         instance = klass.new
+         instance.spider = self
+         [pipeline_name, instance]
+       end.to_h
+
+       @logger = self.class.logger
+       @savers = {}
+     end
+
+     def browser
+       @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
+     end
+
+     def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
+       raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)
+
+       if @config[:skip_duplicate_requests] && !unique_request?(url)
+         add_event(:duplicate_requests) if self.with_info
+         logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
+       end
+
+       visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
+       return unless visited
+
+       public_send(handler, browser.current_response(response_type), { url: url, data: data })
+     end
+
+     def console(response = nil, url: nil, data: {})
+       binding.pry
+     end
+
+     ###
+
+     def storage
+       # Note: `.crawl!` uses a shared, thread-safe Storage instance;
+       # otherwise, each spider instance will have its own Storage
+       @storage ||= self.with_info ? self.class.storage : Storage.new
+     end
+
+     def unique?(scope, value)
+       storage.unique?(scope, value)
+     end
+
+     def save_to(path, item, format:, position: true, append: false)
+       @savers[path] ||= begin
+         options = { format: format, position: position, append: append }
+         if self.with_info
+           self.class.savers[path] ||= Saver.new(path, options)
+         else
+           Saver.new(path, options)
+         end
+       end
+
+       @savers[path].save(item)
+     end
+
+     ###
+
+     def add_event(scope = :custom, event)
+       if self.with_info
+         self.class.add_event(scope, event)
+       end
+
+       logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
+     end
+
+     ###
+
+     private
+
+     def create_browser(engine, config = {})
+       Kimurai::BrowserBuilder.build(engine, config, spider: self)
+     end
+
+     def unique_request?(url)
+       options = @config[:skip_duplicate_requests]
+       if options.class == Hash
+         scope = options[:scope] || :requests_urls
+         if options[:check_only]
+           storage.include?(scope, url) ? false : true
+         else
+           storage.unique?(scope, url) ? true : false
+         end
+       else
+         storage.unique?(:requests_urls, url) ? true : false
+       end
+     end
+
+     def send_item(item, options = {})
+       logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
+       self.class.update(:items, :sent) if self.with_info
+
+       @pipelines.each do |name, instance|
+         item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
+       end
+     rescue => e
+       logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
+       add_event(:drop_items_errors, e.inspect) if self.with_info
+       false
+     else
+       self.class.update(:items, :processed) if self.with_info
+       logger.info "Pipeline: processed: #{JSON.generate(item)}"
+       true
+     ensure
+       if self.with_info
+         logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
+       end
+     end
+
+     def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
+       parts = urls.in_sorted_groups(threads, false)
+       urls_count = urls.size
+
+       all = []
+       start_time = Time.now
+       logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"
+
+       parts.each do |part|
+         all << Thread.new(part) do |part|
+           Thread.current.abort_on_exception = true
+
+           spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
+           spider.with_info = true if self.with_info
+
+           part.each do |url_data|
+             if url_data.class == Hash
+               if url_data[:url].present? && url_data[:data].present?
+                 spider.request_to(handler, delay, url_data)
+               else
+                 spider.public_send(handler, url_data)
+               end
+             else
+               spider.request_to(handler, delay, url: url_data, data: data)
+             end
+           end
+         ensure
+           spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
+         end
+
+         sleep 0.5
+       end
+
+       all.each(&:join)
+       logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
+     end
+   end
+ end
data/lib/kimurai/base_helper.rb
@@ -0,0 +1,22 @@
+ module Kimurai
+   module BaseHelper
+     private
+
+     def absolute_url(url, base:)
+       return unless url
+       URI.join(base, URI.escape(url)).to_s
+     end
+
+     def escape_url(url)
+       uri = URI.parse(url)
+     rescue URI::InvalidURIError => e
+       URI.parse(URI.escape url).to_s rescue url
+     else
+       url
+     end
+
+     def normalize_url(url, base:)
+       escape_url(absolute_url(url, base: base))
+     end
+   end
+ end
data/lib/kimurai/browser_builder/mechanize_builder.rb
@@ -0,0 +1,154 @@
+ require 'capybara'
+ require 'capybara/mechanize'
+ require_relative '../capybara_configuration'
+ require_relative '../capybara_ext/mechanize/driver'
+ require_relative '../capybara_ext/session'
+
+ module Kimurai::BrowserBuilder
+   class MechanizeBuilder
+     attr_reader :logger, :spider
+
+     def initialize(config, spider:)
+       @config = config
+       @spider = spider
+       @logger = spider.logger
+     end
+
+     def build
+       # Register driver
+       Capybara.register_driver :mechanize do |app|
+         driver = Capybara::Mechanize::Driver.new("app")
+         # keep the history as small as possible (by default it's unlimited)
+         driver.configure { |a| a.history.max_size = 2 }
+         driver
+       end
+
+       # Create browser instance (Capybara session)
+       @browser = Capybara::Session.new(:mechanize)
+       @browser.spider = spider
+       logger.debug "BrowserBuilder (mechanize): created browser instance"
+
+       if @config[:extensions].present?
+         logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
+       end
+
+       # Proxy
+       if proxy = @config[:proxy].presence
+         proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
+         ip, port, type = proxy_string.split(":")
+
+         if type == "http"
+           @browser.driver.set_proxy(*proxy_string.split(":"))
+           logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
+         else
+           logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
+         end
+       end
+
+       # SSL
+       if ssl_cert_path = @config[:ssl_cert_path].presence
+         @browser.driver.browser.agent.http.ca_file = ssl_cert_path
+         logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
+       end
+
+       if @config[:ignore_ssl_errors].present?
+         @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+         logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
+       end
+
+       # Headers
+       if headers = @config[:headers].presence
+         @browser.driver.headers = headers
+         logger.debug "BrowserBuilder (mechanize): enabled custom headers"
+       end
+
+       if user_agent = @config[:user_agent].presence
+         user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
+
+         @browser.driver.add_header("User-Agent", user_agent_string)
+         logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
+       end
+
+       # Cookies
+       if cookies = @config[:cookies].presence
+         cookies.each do |cookie|
+           @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
+         end
+
+         logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
+       end
+
+       # Browser instance options
+       # skip_request_errors
+       if skip_errors = @config[:skip_request_errors].presence
+         @browser.config.skip_request_errors = skip_errors
+         logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors"
+       end
+
+       # retry_request_errors
+       if retry_errors = @config[:retry_request_errors].presence
+         @browser.config.retry_request_errors = retry_errors
+         logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
+       end
+
+       # restart_if
+       if @config[:restart_if].present?
+         logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped"
+       end
+
+       # before_request clear_cookies
+       if @config.dig(:before_request, :clear_cookies)
+         @browser.config.before_request[:clear_cookies] = true
+         logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
+       end
+
+       # before_request clear_and_set_cookies
+       if @config.dig(:before_request, :clear_and_set_cookies)
+         if cookies = @config[:cookies].presence
+           @browser.config.cookies = cookies
+           @browser.config.before_request[:clear_and_set_cookies] = true
+           logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
+         else
+           logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
+         end
+       end
+
+       # before_request change_user_agent
+       if @config.dig(:before_request, :change_user_agent)
+         if @config[:user_agent].present? && @config[:user_agent].class == Proc
+           @browser.config.user_agent = @config[:user_agent]
+           @browser.config.before_request[:change_user_agent] = true
+           logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
+         else
+           logger.error "BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
+         end
+       end
+
+       # before_request change_proxy
+       if @config.dig(:before_request, :change_proxy)
+         if @config[:proxy].present? && @config[:proxy].class == Proc
+           @browser.config.proxy = @config[:proxy]
+           @browser.config.before_request[:change_proxy] = true
+           logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
+         else
+           logger.error "BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
+         end
+       end
+
+       # before_request delay
+       if delay = @config.dig(:before_request, :delay).presence
+         @browser.config.before_request[:delay] = delay
+         logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
+       end
+
+       # encoding
+       if encoding = @config[:encoding]
+         @browser.config.encoding = encoding
+         logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}"
+       end
+
+       # return Capybara session instance
+       @browser
+     end
+   end
+ end
data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb
@@ -0,0 +1,175 @@
+ require 'capybara'
+ require 'capybara/poltergeist'
+ require_relative '../capybara_configuration'
+ require_relative '../capybara_ext/poltergeist/driver'
+ require_relative '../capybara_ext/session'
+
+ module Kimurai::BrowserBuilder
+   class PoltergeistPhantomjsBuilder
+     attr_reader :logger, :spider
+
+     def initialize(config, spider:)
+       @config = config
+       @spider = spider
+       @logger = spider.logger
+     end
+
+     def build
+       # Register driver
+       Capybara.register_driver :poltergeist_phantomjs do |app|
+         # Create driver options
+         driver_options = {
+           js_errors: false, debug: false, inspector: false, phantomjs_options: []
+         }
+
+         if extensions = @config[:extensions].presence
+           driver_options[:extensions] = extensions
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
+         end
+
+         # Window size
+         if size = @config[:window_size].presence
+           driver_options[:window_size] = size
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size"
+         end
+
+         # SSL
+         if ssl_cert_path = @config[:ssl_cert_path].presence
+           driver_options[:phantomjs_options] << "--ssl-certificates-path=#{ssl_cert_path}"
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert"
+         end
+
+         if @config[:ignore_ssl_errors].present?
+           driver_options[:phantomjs_options].push("--ignore-ssl-errors=yes", "--ssl-protocol=any")
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors"
+         end
+
+         # Disable images
+         if @config[:disable_images].present?
+           driver_options[:phantomjs_options] << "--load-images=no"
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images"
+         end
+
+         Capybara::Poltergeist::Driver.new(app, driver_options)
+       end
+
+       # Create browser instance (Capybara session)
+       @browser = Capybara::Session.new(:poltergeist_phantomjs)
+       @browser.spider = spider
+       logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance"
+
+       # Proxy
+       if proxy = @config[:proxy].presence
+         proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
+         ip, port, type = proxy_string.split(":")
+
+         if %w(http socks5).include?(type)
+           @browser.driver.set_proxy(*proxy_string.split(":"))
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+         else
+           logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
+         end
+       end
+
+       # Headers
+       if headers = @config[:headers].presence
+         @browser.driver.headers = headers
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers"
+       end
+
+       if user_agent = @config[:user_agent].presence
+         user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
+
+         @browser.driver.add_header("User-Agent", user_agent_string)
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
+       end
+
+       # Cookies
+       if cookies = @config[:cookies].presence
+         cookies.each do |cookie|
+           @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
+         end
+
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies"
+       end
+
+       # Browser instance options
+       # skip_request_errors
+       if skip_errors = @config[:skip_request_errors].presence
+         @browser.config.skip_request_errors = skip_errors
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors"
+       end
+
+       # retry_request_errors
+       if retry_errors = @config[:retry_request_errors].presence
+         @browser.config.retry_request_errors = retry_errors
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
+       end
+
+       # restart_if
+       if requests_limit = @config.dig(:restart_if, :requests_limit).presence
+         @browser.config.restart_if[:requests_limit] = requests_limit
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
+       end
+
+       if memory_limit = @config.dig(:restart_if, :memory_limit).presence
+         @browser.config.restart_if[:memory_limit] = memory_limit
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
+       end
+
+       # before_request clear_cookies
+       if @config.dig(:before_request, :clear_cookies)
+         @browser.config.before_request[:clear_cookies] = true
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
+       end
+
+       # before_request clear_and_set_cookies
+       if @config.dig(:before_request, :clear_and_set_cookies)
+         if cookies = @config[:cookies].presence
+           @browser.config.cookies = cookies
+           @browser.config.before_request[:clear_and_set_cookies] = true
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
+         else
+           logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
+         end
+       end
+
+       # before_request change_user_agent
+       if @config.dig(:before_request, :change_user_agent)
+         if @config[:user_agent].present? && @config[:user_agent].class == Proc
+           @browser.config.user_agent = @config[:user_agent]
+           @browser.config.before_request[:change_user_agent] = true
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
+         else
+           logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
+         end
+       end
+
+       # before_request change_proxy
+       if @config.dig(:before_request, :change_proxy)
+         if @config[:proxy].present? && @config[:proxy].class == Proc
+           @browser.config.proxy = @config[:proxy]
+           @browser.config.before_request[:change_proxy] = true
+           logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
+         else
+           logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
+         end
+       end
+
+       # before_request delay
+       if delay = @config.dig(:before_request, :delay).presence
+         @browser.config.before_request[:delay] = delay
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
+       end
+
+       # encoding
+       if encoding = @config[:encoding]
+         @browser.config.encoding = encoding
+         logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled encoding: #{encoding}"
+       end
+
+       # return Capybara session instance
+       @browser
+     end
+   end
+ end