kimurai 1.3.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +29 -0
  4. data/Gemfile +2 -2
  5. data/README.md +478 -649
  6. data/Rakefile +6 -6
  7. data/bin/console +3 -4
  8. data/exe/kimurai +0 -1
  9. data/kimurai.gemspec +38 -37
  10. data/lib/kimurai/base/saver.rb +15 -19
  11. data/lib/kimurai/base/storage.rb +1 -1
  12. data/lib/kimurai/base.rb +42 -38
  13. data/lib/kimurai/base_helper.rb +5 -4
  14. data/lib/kimurai/browser_builder/mechanize_builder.rb +44 -38
  15. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +63 -51
  16. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +61 -55
  17. data/lib/kimurai/browser_builder.rb +7 -31
  18. data/lib/kimurai/capybara_configuration.rb +1 -1
  19. data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
  20. data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
  21. data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
  22. data/lib/kimurai/capybara_ext/session/config.rb +1 -1
  23. data/lib/kimurai/capybara_ext/session.rb +40 -38
  24. data/lib/kimurai/cli/generator.rb +15 -15
  25. data/lib/kimurai/cli.rb +52 -85
  26. data/lib/kimurai/core_ext/array.rb +2 -2
  27. data/lib/kimurai/core_ext/hash.rb +1 -1
  28. data/lib/kimurai/core_ext/numeric.rb +4 -4
  29. data/lib/kimurai/pipeline.rb +2 -1
  30. data/lib/kimurai/runner.rb +6 -6
  31. data/lib/kimurai/template/Gemfile +2 -2
  32. data/lib/kimurai/template/config/boot.rb +4 -4
  33. data/lib/kimurai/template/config/schedule.rb +15 -15
  34. data/lib/kimurai/template/spiders/application_spider.rb +14 -14
  35. data/lib/kimurai/version.rb +1 -1
  36. data/lib/kimurai.rb +7 -3
  37. metadata +58 -65
  38. data/.travis.yml +0 -5
  39. data/lib/kimurai/automation/deploy.yml +0 -54
  40. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
  41. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
  42. data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
  43. data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
  44. data/lib/kimurai/automation/setup.yml +0 -44
  45. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -171
  46. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
  47. data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
  48. data/lib/kimurai/template/config/automation.yml +0 -13
data/Rakefile CHANGED
@@ -1,10 +1,10 @@
1
- require "bundler/gem_tasks"
2
- require "rake/testtask"
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
3
 
4
4
  Rake::TestTask.new(:test) do |t|
5
- t.libs << "test"
6
- t.libs << "lib"
7
- t.test_files = FileList["test/**/*_test.rb"]
5
+ t.libs << 'test'
6
+ t.libs << 'lib'
7
+ t.test_files = FileList['test/**/*_test.rb']
8
8
  end
9
9
 
10
- task :default => :test
10
+ task default: :test
data/bin/console CHANGED
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
-
3
- require "bundler/setup"
4
- require "kimurai"
2
+ require 'bundler/setup'
3
+ require 'kimurai'
5
4
 
6
5
  # You can add fixtures and/or initialization code here to make experimenting
7
6
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +9,5 @@ require "kimurai"
10
9
  # require "pry"
11
10
  # Pry.start
12
11
 
13
- require "irb"
12
+ require 'irb'
14
13
  IRB.start(__FILE__)
data/exe/kimurai CHANGED
@@ -1,5 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
-
3
2
  require 'kimurai'
4
3
  require 'kimurai/cli'
5
4
 
data/kimurai.gemspec CHANGED
@@ -1,48 +1,49 @@
1
-
2
- lib = File.expand_path("../lib", __FILE__)
1
+ lib = File.expand_path('lib', __dir__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require "kimurai/version"
3
+ require 'kimurai/version'
5
4
 
6
5
  Gem::Specification.new do |spec|
7
- spec.name = "kimurai"
6
+ spec.name = 'kimurai'
8
7
  spec.version = Kimurai::VERSION
9
- spec.authors = ["Victor Afanasev"]
10
- spec.email = ["vicfreefly@gmail.com"]
8
+ spec.authors = ['Victor Afanasev']
9
+ spec.email = ['vicfreefly@gmail.com']
11
10
 
12
- spec.summary = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri"
13
- spec.homepage = "https://github.com/vifreefly/kimuraframework"
14
- spec.license = "MIT"
11
+ spec.summary = 'Modern web scraping framework written in Ruby and based on Capybara/Nokogiri'
12
+ spec.homepage = 'https://github.com/vifreefly/kimuraframework'
13
+ spec.license = 'MIT'
15
14
 
16
15
  # Specify which files should be added to the gem when it is released.
17
16
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
18
- spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
17
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
19
18
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
20
19
  end
21
- spec.bindir = "exe"
22
- spec.executables = "kimurai"
23
- spec.require_paths = ["lib"]
24
- spec.required_ruby_version = ">= 2.5.0"
25
-
26
- spec.add_dependency "thor"
27
- spec.add_dependency "cliver"
28
- spec.add_dependency "activesupport"
29
- spec.add_dependency "murmurhash3"
30
- spec.add_dependency "nokogiri"
31
-
32
- spec.add_dependency "capybara", ">= 2.15", "< 4.0"
33
- spec.add_dependency "capybara-mechanize"
34
- spec.add_dependency "poltergeist"
35
- spec.add_dependency "selenium-webdriver"
36
-
37
- spec.add_dependency "headless"
38
- spec.add_dependency "pmap"
39
-
40
- spec.add_dependency "whenever"
41
-
42
- spec.add_dependency "rbcat", "~> 0.2"
43
- spec.add_dependency "pry"
44
-
45
- spec.add_development_dependency "bundler", "~> 1.16"
46
- spec.add_development_dependency "rake", "~> 10.0"
47
- spec.add_development_dependency "minitest", "~> 5.0"
20
+ spec.bindir = 'exe'
21
+ spec.executables = 'kimurai'
22
+ spec.require_paths = ['lib']
23
+ spec.required_ruby_version = '>= 3.1.0'
24
+
25
+ spec.add_dependency 'activesupport'
26
+ spec.add_dependency 'cliver'
27
+ spec.add_dependency 'csv'
28
+ spec.add_dependency 'murmurhash3'
29
+ spec.add_dependency 'nokogiri'
30
+ spec.add_dependency 'ostruct'
31
+ spec.add_dependency 'thor'
32
+
33
+ # for capybara-mechanize compatibility
34
+ spec.add_dependency 'mutex_m'
35
+ spec.add_dependency 'nkf'
36
+ spec.add_dependency 'reline'
37
+
38
+ spec.add_dependency 'capybara', '~> 3.40'
39
+ spec.add_dependency 'capybara-mechanize', '~> 1.13'
40
+ spec.add_dependency 'selenium-webdriver', '~> 4.27'
41
+
42
+ spec.add_dependency 'headless'
43
+ spec.add_dependency 'pmap'
44
+
45
+ spec.add_dependency 'whenever'
46
+
47
+ spec.add_dependency 'pry'
48
+ spec.add_dependency 'rbcat', '~> 1.0'
48
49
  end
data/lib/kimurai/base/saver.rb CHANGED
@@ -7,9 +7,7 @@ module Kimurai
7
7
  attr_reader :format, :path, :position, :append
8
8
 
9
9
  def initialize(path, format:, position: true, append: false)
10
- unless %i(json pretty_json jsonlines csv).include?(format)
11
- raise "SimpleSaver: wrong type of format: #{format}"
12
- end
10
+ raise "SimpleSaver: wrong type of format: #{format}" unless %i[json pretty_json jsonlines csv].include?(format)
13
11
 
14
12
  @path = path
15
13
  @format = format
@@ -42,48 +40,48 @@ module Kimurai
42
40
  def save_to_json(item)
43
41
  data = JSON.generate([item])
44
42
 
45
- if @index > 1 || append && File.exists?(path)
43
+ if @index > 1 || append && File.exist?(path)
46
44
  file_content = File.read(path).sub(/\}\]\Z/, "\}\,")
47
- File.open(path, "w") do |f|
48
- f.write(file_content + data.sub(/\A\[/, ""))
45
+ File.open(path, 'w') do |f|
46
+ f.write(file_content + data.sub(/\A\[/, ''))
49
47
  end
50
48
  else
51
- File.open(path, "w") { |f| f.write(data) }
49
+ File.open(path, 'w') { |f| f.write(data) }
52
50
  end
53
51
  end
54
52
 
55
53
  def save_to_pretty_json(item)
56
54
  data = JSON.pretty_generate([item])
57
55
 
58
- if @index > 1 || append && File.exists?(path)
56
+ if @index > 1 || append && File.exist?(path)
59
57
  file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n")
60
- File.open(path, "w") do |f|
61
- f.write(file_content + data.sub(/\A\[\n/, ""))
58
+ File.open(path, 'w') do |f|
59
+ f.write(file_content + data.sub(/\A\[\n/, ''))
62
60
  end
63
61
  else
64
- File.open(path, "w") { |f| f.write(data) }
62
+ File.open(path, 'w') { |f| f.write(data) }
65
63
  end
66
64
  end
67
65
 
68
66
  def save_to_jsonlines(item)
69
67
  data = JSON.generate(item)
70
68
 
71
- if @index > 1 || append && File.exists?(path)
72
- File.open(path, "a") { |file| file.write("\n" + data) }
69
+ if @index > 1 || append && File.exist?(path)
70
+ File.open(path, 'a') { |file| file.write("\n#{data}") }
73
71
  else
74
- File.open(path, "w") { |file| file.write(data) }
72
+ File.open(path, 'w') { |file| file.write(data) }
75
73
  end
76
74
  end
77
75
 
78
76
  def save_to_csv(item)
79
77
  data = flatten_hash(item)
80
78
 
81
- if @index > 1 || append && File.exists?(path)
82
- CSV.open(path, "a+", force_quotes: true) do |csv|
79
+ if @index > 1 || append && File.exist?(path)
80
+ CSV.open(path, 'a+', force_quotes: true) do |csv|
83
81
  csv << data.values
84
82
  end
85
83
  else
86
- CSV.open(path, "w", force_quotes: true) do |csv|
84
+ CSV.open(path, 'w', force_quotes: true) do |csv|
87
85
  csv << data.keys
88
86
  csv << data.values
89
87
  end
@@ -102,5 +100,3 @@ module Kimurai
102
100
  end
103
101
  end
104
102
  end
105
-
106
-
data/lib/kimurai/base/storage.rb CHANGED
@@ -24,7 +24,7 @@ module Kimurai
24
24
  def add(scope, value)
25
25
  @mutex.synchronize do
26
26
  database[scope] ||= []
27
- if value.kind_of?(Array)
27
+ if value.is_a?(Array)
28
28
  database[scope] += value
29
29
  database[scope].uniq!
30
30
  else
data/lib/kimurai/base.rb CHANGED
@@ -1,19 +1,22 @@
1
+ require 'English'
1
2
  require_relative 'base/saver'
2
3
  require_relative 'base/storage'
3
4
 
4
5
  module Kimurai
5
6
  class Base
7
+ class InvalidUrlError < StandardError; end
8
+
6
9
  # don't deep merge config's headers hash option
7
- DMERGE_EXCLUDE = [:headers]
10
+ DMERGE_EXCLUDE = [:headers].freeze
8
11
 
9
12
  LoggerFormatter = proc do |severity, datetime, progname, msg|
10
13
  current_thread_id = Thread.current.object_id
11
- thread_type = Thread.main == Thread.current ? "M" : "C"
12
- output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
13
- .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]
14
+ thread_type = Thread.main == Thread.current ? 'M' : 'C'
15
+ output = format("%s, [%s#%d] [%s: %s] %5s -- %s: %s\n", severity[0..0], datetime, $PROCESS_ID, thread_type,
16
+ current_thread_id, severity, progname, msg)
14
17
 
15
- if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development"
16
- Rbcat.colorize(output, predefined: [:jsonhash, :logger])
18
+ if Kimurai.configuration.colorize_logger != false && Kimurai.env == 'development'
19
+ Rbcat.colorize(output, predefined: %i[jsonhash logger])
17
20
  else
18
21
  output
19
22
  end
@@ -49,11 +52,13 @@ module Kimurai
49
52
 
50
53
  def self.update(type, subtype)
51
54
  return unless @run_info
55
+
52
56
  @update_mutex.synchronize { @run_info[type][subtype] += 1 }
53
57
  end
54
58
 
55
59
  def self.add_event(scope, event)
56
60
  return unless @run_info
61
+
57
62
  @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
58
63
  end
59
64
 
@@ -91,9 +96,9 @@ module Kimurai
91
96
 
92
97
  def self.logger
93
98
  @logger ||= Kimurai.configuration.logger || begin
94
- log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
99
+ log_level = (ENV['LOG_LEVEL'] || Kimurai.configuration.log_level || 'DEBUG').to_s.upcase
95
100
  log_level = "Logger::#{log_level}".constantize
96
- Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
101
+ Logger.new($stdout, formatter: LoggerFormatter, level: log_level, progname: name)
97
102
  end
98
103
  end
99
104
 
@@ -114,13 +119,13 @@ module Kimurai
114
119
  ###
115
120
 
116
121
  logger.info "Spider: started: #{name}"
117
- open_spider if self.respond_to? :open_spider
122
+ open_spider if respond_to? :open_spider
118
123
 
119
- spider = self.new
124
+ spider = new
120
125
  spider.with_info = true
121
126
  if start_urls
122
127
  start_urls.each do |start_url|
123
- if start_url.class == Hash
128
+ if start_url.instance_of?(Hash)
124
129
  spider.request_to(:parse, start_url)
125
130
  else
126
131
  spider.request_to(:parse, url: start_url)
@@ -136,13 +141,13 @@ module Kimurai
136
141
  @run_info.merge!(status: :completed)
137
142
  ensure
138
143
  if spider
139
- spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
144
+ spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
140
145
 
141
146
  stop_time = Time.now
142
147
  total_time = (stop_time - @run_info[:start_time]).round(3)
143
148
  @run_info.merge!(stop_time: stop_time, running_time: total_time)
144
149
 
145
- close_spider if self.respond_to? :close_spider
150
+ close_spider if respond_to? :close_spider
146
151
 
147
152
  message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
148
153
  failed? ? logger.fatal(message) : logger.info(message)
@@ -152,7 +157,7 @@ module Kimurai
152
157
  end
153
158
 
154
159
  def self.parse!(handler, *args, **request)
155
- spider = self.new
160
+ spider = new
156
161
 
157
162
  if args.present?
158
163
  spider.public_send(handler, *args)
@@ -162,7 +167,7 @@ module Kimurai
162
167
  spider.public_send(handler)
163
168
  end
164
169
  ensure
165
- spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
170
+ spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
166
171
  end
167
172
 
168
173
  ###
@@ -171,7 +176,7 @@ module Kimurai
171
176
  attr_accessor :with_info
172
177
 
173
178
  def initialize(engine = self.class.engine, config: {})
174
- @engine = engine
179
+ @engine = engine || self.class.engine
175
180
  @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
176
181
  @pipelines = self.class.pipelines.map do |pipeline_name|
177
182
  klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
@@ -189,15 +194,17 @@ module Kimurai
189
194
  end
190
195
 
191
196
  def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
197
+ raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).is_a?(URI::HTTP)
198
+
192
199
  if @config[:skip_duplicate_requests] && !unique_request?(url)
193
- add_event(:duplicate_requests) if self.with_info
200
+ add_event(:duplicate_requests) if with_info
194
201
  logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
195
202
  end
196
203
 
197
204
  visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
198
205
  return unless visited
199
206
 
200
- public_send(handler, browser.current_response(response_type), { url: url, data: data })
207
+ public_send(handler, browser.current_response(response_type), **{ url: url, data: data })
201
208
  end
202
209
 
203
210
  def console(response = nil, url: nil, data: {})
@@ -207,9 +214,9 @@ module Kimurai
207
214
  ###
208
215
 
209
216
  def storage
210
- # Note: for `.crawl!` uses shared thread safe Storage instance,
217
+ # NOTE: for `.crawl!` uses shared thread safe Storage instance,
211
218
  # otherwise, each spider instance will have it's own Storage
212
- @storage ||= self.with_info ? self.class.storage : Storage.new
219
+ @storage ||= with_info ? self.class.storage : Storage.new
213
220
  end
214
221
 
215
222
  def unique?(scope, value)
@@ -219,10 +226,10 @@ module Kimurai
219
226
  def save_to(path, item, format:, position: true, append: false)
220
227
  @savers[path] ||= begin
221
228
  options = { format: format, position: position, append: append }
222
- if self.with_info
223
- self.class.savers[path] ||= Saver.new(path, options)
229
+ if with_info
230
+ self.class.savers[path] ||= Saver.new(path, **options)
224
231
  else
225
- Saver.new(path, options)
232
+ Saver.new(path, **options)
226
233
  end
227
234
  end
228
235
 
@@ -232,11 +239,8 @@ module Kimurai
232
239
  ###
233
240
 
234
241
  def add_event(scope = :custom, event)
235
- unless self.with_info
236
- raise "It's allowed to use `add_event` only while performing a full run (`.crawl!` method)"
237
- end
242
+ self.class.add_event(scope, event) if with_info
238
243
 
239
- self.class.add_event(scope, event)
240
244
  logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
241
245
  end
242
246
 
@@ -250,35 +254,35 @@ module Kimurai
250
254
 
251
255
  def unique_request?(url)
252
256
  options = @config[:skip_duplicate_requests]
253
- if options.class == Hash
257
+ if options.instance_of?(Hash)
254
258
  scope = options[:scope] || :requests_urls
255
259
  if options[:check_only]
256
260
  storage.include?(scope, url) ? false : true
257
261
  else
258
- storage.unique?(scope, url) ? true : false
262
+ storage.unique?(scope, url) || false
259
263
  end
260
264
  else
261
- storage.unique?(:requests_urls, url) ? true : false
265
+ storage.unique?(:requests_urls, url) || false
262
266
  end
263
267
  end
264
268
 
265
269
  def send_item(item, options = {})
266
270
  logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
267
- self.class.update(:items, :sent) if self.with_info
271
+ self.class.update(:items, :sent) if with_info
268
272
 
269
273
  @pipelines.each do |name, instance|
270
274
  item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
271
275
  end
272
- rescue => e
276
+ rescue StandardError => e
273
277
  logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
274
- add_event(:drop_items_errors, e.inspect) if self.with_info
278
+ add_event(:drop_items_errors, e.inspect) if with_info
275
279
  false
276
280
  else
277
- self.class.update(:items, :processed) if self.with_info
281
+ self.class.update(:items, :processed) if with_info
278
282
  logger.info "Pipeline: processed: #{JSON.generate(item)}"
279
283
  true
280
284
  ensure
281
- if self.with_info
285
+ if with_info
282
286
  logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
283
287
  end
284
288
  end
@@ -296,10 +300,10 @@ module Kimurai
296
300
  Thread.current.abort_on_exception = true
297
301
 
298
302
  spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
299
- spider.with_info = true if self.with_info
303
+ spider.with_info = true if with_info
300
304
 
301
305
  part.each do |url_data|
302
- if url_data.class == Hash
306
+ if url_data.instance_of?(Hash)
303
307
  if url_data[:url].present? && url_data[:data].present?
304
308
  spider.request_to(handler, delay, url_data)
305
309
  else
@@ -310,7 +314,7 @@ module Kimurai
310
314
  end
311
315
  end
312
316
  ensure
313
- spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
317
+ spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
314
318
  end
315
319
 
316
320
  sleep 0.5
data/lib/kimurai/base_helper.rb CHANGED
@@ -4,13 +4,14 @@ module Kimurai
4
4
 
5
5
  def absolute_url(url, base:)
6
6
  return unless url
7
- URI.join(base, URI.escape(url)).to_s
7
+
8
+ URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
8
9
  end
9
10
 
10
11
  def escape_url(url)
11
- uri = URI.parse(url)
12
- rescue URI::InvalidURIError => e
13
- URI.parse(URI.escape url).to_s rescue url
12
+ URI.parse(url)
13
+ rescue URI::InvalidURIError
14
+ URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s rescue url
14
15
  else
15
16
  url
16
17
  end
data/lib/kimurai/browser_builder/mechanize_builder.rb CHANGED
@@ -5,7 +5,7 @@ require_relative '../capybara_ext/mechanize/driver'
5
5
  require_relative '../capybara_ext/session'
6
6
 
7
7
  module Kimurai
8
- class BrowserBuilder
8
+ module BrowserBuilder
9
9
  class MechanizeBuilder
10
10
  attr_reader :logger, :spider
11
11
 
@@ -17,8 +17,8 @@ module Kimurai
17
17
 
18
18
  def build
19
19
  # Register driver
20
- Capybara.register_driver :mechanize do |app|
21
- driver = Capybara::Mechanize::Driver.new("app")
20
+ Capybara.register_driver :mechanize do |_app|
21
+ driver = Capybara::Mechanize::Driver.new('app')
22
22
  # keep the history as small as possible (by default it's unlimited)
23
23
  driver.configure { |a| a.history.max_size = 2 }
24
24
  driver
@@ -27,19 +27,19 @@ module Kimurai
27
27
  # Create browser instance (Capybara session)
28
28
  @browser = Capybara::Session.new(:mechanize)
29
29
  @browser.spider = spider
30
- logger.debug "BrowserBuilder (mechanize): created browser instance"
30
+ logger.debug 'BrowserBuilder (mechanize): created browser instance'
31
31
 
32
32
  if @config[:extensions].present?
33
- logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
33
+ logger.error 'BrowserBuilder (mechanize): `extensions` option not supported, skipped'
34
34
  end
35
35
 
36
36
  # Proxy
37
- if proxy = @config[:proxy].presence
38
- proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
39
- ip, port, type = proxy_string.split(":")
37
+ if (proxy = @config[:proxy].presence)
38
+ proxy_string = (proxy.instance_of?(Proc) ? proxy.call : proxy).strip
39
+ ip, port, type = proxy_string.split(':')
40
40
 
41
- if type == "http"
42
- @browser.driver.set_proxy(*proxy_string.split(":"))
41
+ if type == 'http'
42
+ @browser.driver.set_proxy(*proxy_string.split(':'))
43
43
  logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
44
44
  else
45
45
  logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
@@ -47,99 +47,105 @@ module Kimurai
47
47
  end
48
48
 
49
49
  # SSL
50
- if ssl_cert_path = @config[:ssl_cert_path].presence
50
+ if (ssl_cert_path = @config[:ssl_cert_path].presence)
51
51
  @browser.driver.browser.agent.http.ca_file = ssl_cert_path
52
- logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
52
+ logger.debug 'BrowserBuilder (mechanize): enabled custom ssl_cert'
53
53
  end
54
54
 
55
55
  if @config[:ignore_ssl_errors].present?
56
56
  @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
57
- logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
57
+ logger.debug 'BrowserBuilder (mechanize): enabled ignore_ssl_errors'
58
58
  end
59
59
 
60
60
  # Headers
61
- if headers = @config[:headers].presence
61
+ if (headers = @config[:headers].presence)
62
62
  @browser.driver.headers = headers
63
- logger.debug "BrowserBuilder (mechanize): enabled custom headers"
63
+ logger.debug 'BrowserBuilder (mechanize): enabled custom headers'
64
64
  end
65
65
 
66
- if user_agent = @config[:user_agent].presence
67
- user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
66
+ if (user_agent = @config[:user_agent].presence)
67
+ user_agent_string = (user_agent.instance_of?(Proc) ? user_agent.call : user_agent).strip
68
68
 
69
- @browser.driver.add_header("User-Agent", user_agent_string)
70
- logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
69
+ @browser.driver.add_header('User-Agent', user_agent_string)
70
+ logger.debug 'BrowserBuilder (mechanize): enabled custom user_agent'
71
71
  end
72
72
 
73
73
  # Cookies
74
- if cookies = @config[:cookies].presence
74
+ if (cookies = @config[:cookies].presence)
75
75
  cookies.each do |cookie|
76
76
  @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
77
77
  end
78
78
 
79
- logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
79
+ logger.debug 'BrowserBuilder (mechanize): enabled custom cookies'
80
80
  end
81
81
 
82
82
  # Browser instance options
83
83
  # skip_request_errors
84
- if skip_errors = @config[:skip_request_errors].presence
84
+ if (skip_errors = @config[:skip_request_errors].presence)
85
85
  @browser.config.skip_request_errors = skip_errors
86
- logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors"
86
+ logger.debug 'BrowserBuilder (mechanize): enabled skip_request_errors'
87
87
  end
88
88
 
89
89
  # retry_request_errors
90
- if retry_errors = @config[:retry_request_errors].presence
90
+ if (retry_errors = @config[:retry_request_errors].presence)
91
91
  @browser.config.retry_request_errors = retry_errors
92
- logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
92
+ logger.debug 'BrowserBuilder (mechanize): enabled retry_request_errors'
93
93
  end
94
94
 
95
95
  # restart_if
96
96
  if @config[:restart_if].present?
97
- logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped"
97
+ logger.warn 'BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped'
98
98
  end
99
99
 
100
100
  # before_request clear_cookies
101
101
  if @config.dig(:before_request, :clear_cookies)
102
102
  @browser.config.before_request[:clear_cookies] = true
103
- logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
103
+ logger.debug 'BrowserBuilder (mechanize): enabled before_request.clear_cookies'
104
104
  end
105
105
 
106
106
  # before_request clear_and_set_cookies
107
107
  if @config.dig(:before_request, :clear_and_set_cookies)
108
- if cookies = @config[:cookies].presence
108
+ if (cookies = @config[:cookies].presence)
109
109
  @browser.config.cookies = cookies
110
110
  @browser.config.before_request[:clear_and_set_cookies] = true
111
- logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
111
+ logger.debug 'BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies'
112
112
  else
113
- logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
113
+ logger.error 'BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped'
114
114
  end
115
115
  end
116
116
 
117
117
  # before_request change_user_agent
118
118
  if @config.dig(:before_request, :change_user_agent)
119
- if @config[:user_agent].present? && @config[:user_agent].class == Proc
119
+ if @config[:user_agent].present? && @config[:user_agent].instance_of?(Proc)
120
120
  @browser.config.user_agent = @config[:user_agent]
121
121
  @browser.config.before_request[:change_user_agent] = true
122
- logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
122
+ logger.debug 'BrowserBuilder (mechanize): enabled before_request.change_user_agent'
123
123
  else
124
- logger.error "BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
124
+ logger.error 'BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped'
125
125
  end
126
126
  end
127
127
 
128
128
  # before_request change_proxy
129
129
  if @config.dig(:before_request, :change_proxy)
130
- if @config[:proxy].present? && @config[:proxy].class == Proc
130
+ if @config[:proxy].present? && @config[:proxy].instance_of?(Proc)
131
131
  @browser.config.proxy = @config[:proxy]
132
132
  @browser.config.before_request[:change_proxy] = true
133
- logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
133
+ logger.debug 'BrowserBuilder (mechanize): enabled before_request.change_proxy'
134
134
  else
135
- logger.error "BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
135
+ logger.error 'BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped'
136
136
  end
137
137
  end
138
138
 
139
139
  # before_request delay
140
- if delay = @config.dig(:before_request, :delay).presence
140
+ if (delay = @config.dig(:before_request, :delay).presence)
141
141
  @browser.config.before_request[:delay] = delay
142
- logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
142
+ logger.debug 'BrowserBuilder (mechanize): enabled before_request.delay'
143
+ end
144
+
145
+ # encoding
146
+ if (encoding = @config[:encoding])
147
+ @browser.config.encoding = encoding
148
+ logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}"
143
149
  end
144
150
 
145
151
  # return Capybara session instance