kimurai 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +21 -0
  4. data/Gemfile +2 -2
  5. data/README.md +476 -648
  6. data/Rakefile +6 -6
  7. data/bin/console +3 -4
  8. data/exe/kimurai +0 -1
  9. data/kimurai.gemspec +38 -37
  10. data/lib/kimurai/base/saver.rb +15 -19
  11. data/lib/kimurai/base/storage.rb +1 -1
  12. data/lib/kimurai/base.rb +38 -38
  13. data/lib/kimurai/base_helper.rb +5 -4
  14. data/lib/kimurai/browser_builder/mechanize_builder.rb +121 -119
  15. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +160 -152
  16. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +162 -160
  17. data/lib/kimurai/browser_builder.rb +1 -7
  18. data/lib/kimurai/capybara_configuration.rb +1 -1
  19. data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
  20. data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
  21. data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
  22. data/lib/kimurai/capybara_ext/session.rb +31 -38
  23. data/lib/kimurai/cli/generator.rb +15 -15
  24. data/lib/kimurai/cli.rb +49 -86
  25. data/lib/kimurai/core_ext/array.rb +2 -2
  26. data/lib/kimurai/core_ext/hash.rb +1 -1
  27. data/lib/kimurai/core_ext/numeric.rb +4 -4
  28. data/lib/kimurai/pipeline.rb +2 -1
  29. data/lib/kimurai/runner.rb +6 -6
  30. data/lib/kimurai/template/Gemfile +2 -2
  31. data/lib/kimurai/template/config/boot.rb +4 -4
  32. data/lib/kimurai/template/config/schedule.rb +15 -15
  33. data/lib/kimurai/template/spiders/application_spider.rb +8 -14
  34. data/lib/kimurai/version.rb +1 -1
  35. data/lib/kimurai.rb +7 -3
  36. metadata +58 -65
  37. data/.travis.yml +0 -5
  38. data/lib/kimurai/automation/deploy.yml +0 -54
  39. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
  40. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
  41. data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
  42. data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
  43. data/lib/kimurai/automation/setup.yml +0 -44
  44. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -175
  45. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
  46. data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
  47. data/lib/kimurai/template/config/automation.yml +0 -13
data/Rakefile CHANGED
@@ -1,10 +1,10 @@
1
- require "bundler/gem_tasks"
2
- require "rake/testtask"
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
3
 
4
4
  Rake::TestTask.new(:test) do |t|
5
- t.libs << "test"
6
- t.libs << "lib"
7
- t.test_files = FileList["test/**/*_test.rb"]
5
+ t.libs << 'test'
6
+ t.libs << 'lib'
7
+ t.test_files = FileList['test/**/*_test.rb']
8
8
  end
9
9
 
10
- task :default => :test
10
+ task default: :test
data/bin/console CHANGED
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
-
3
- require "bundler/setup"
4
- require "kimurai"
2
+ require 'bundler/setup'
3
+ require 'kimurai'
5
4
 
6
5
  # You can add fixtures and/or initialization code here to make experimenting
7
6
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +9,5 @@ require "kimurai"
10
9
  # require "pry"
11
10
  # Pry.start
12
11
 
13
- require "irb"
12
+ require 'irb'
14
13
  IRB.start(__FILE__)
data/exe/kimurai CHANGED
@@ -1,5 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
-
3
2
  require 'kimurai'
4
3
  require 'kimurai/cli'
5
4
 
data/kimurai.gemspec CHANGED
@@ -1,48 +1,49 @@
1
-
2
- lib = File.expand_path("../lib", __FILE__)
1
+ lib = File.expand_path('lib', __dir__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require "kimurai/version"
3
+ require 'kimurai/version'
5
4
 
6
5
  Gem::Specification.new do |spec|
7
- spec.name = "kimurai"
6
+ spec.name = 'kimurai'
8
7
  spec.version = Kimurai::VERSION
9
- spec.authors = ["Victor Afanasev"]
10
- spec.email = ["vicfreefly@gmail.com"]
8
+ spec.authors = ['Victor Afanasev']
9
+ spec.email = ['vicfreefly@gmail.com']
11
10
 
12
- spec.summary = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri"
13
- spec.homepage = "https://github.com/vifreefly/kimuraframework"
14
- spec.license = "MIT"
11
+ spec.summary = 'Modern web scraping framework written in Ruby and based on Capybara/Nokogiri'
12
+ spec.homepage = 'https://github.com/vifreefly/kimuraframework'
13
+ spec.license = 'MIT'
15
14
 
16
15
  # Specify which files should be added to the gem when it is released.
17
16
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
18
- spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
17
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
19
18
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
20
19
  end
21
- spec.bindir = "exe"
22
- spec.executables = "kimurai"
23
- spec.require_paths = ["lib"]
24
- spec.required_ruby_version = ">= 2.5.0"
25
-
26
- spec.add_dependency "thor"
27
- spec.add_dependency "cliver"
28
- spec.add_dependency "activesupport"
29
- spec.add_dependency "murmurhash3"
30
- spec.add_dependency "nokogiri"
31
-
32
- spec.add_dependency "capybara", ">= 2.15", "< 4.0"
33
- spec.add_dependency "capybara-mechanize"
34
- spec.add_dependency "poltergeist"
35
- spec.add_dependency "selenium-webdriver"
36
-
37
- spec.add_dependency "headless"
38
- spec.add_dependency "pmap"
39
-
40
- spec.add_dependency "whenever"
41
-
42
- spec.add_dependency "rbcat", "~> 0.2"
43
- spec.add_dependency "pry"
44
-
45
- spec.add_development_dependency "bundler", "~> 1.16"
46
- spec.add_development_dependency "rake", "~> 10.0"
47
- spec.add_development_dependency "minitest", "~> 5.0"
20
+ spec.bindir = 'exe'
21
+ spec.executables = 'kimurai'
22
+ spec.require_paths = ['lib']
23
+ spec.required_ruby_version = '>= 3.1.0'
24
+
25
+ spec.add_dependency 'activesupport'
26
+ spec.add_dependency 'cliver'
27
+ spec.add_dependency 'csv'
28
+ spec.add_dependency 'murmurhash3'
29
+ spec.add_dependency 'nokogiri'
30
+ spec.add_dependency 'ostruct'
31
+ spec.add_dependency 'thor'
32
+
33
+ # for capybara-mechanize compatibility
34
+ spec.add_dependency 'mutex_m'
35
+ spec.add_dependency 'nkf'
36
+ spec.add_dependency 'reline'
37
+
38
+ spec.add_dependency 'capybara', '~> 3.40'
39
+ spec.add_dependency 'capybara-mechanize', '~> 1.13'
40
+ spec.add_dependency 'selenium-webdriver', '~> 4.27'
41
+
42
+ spec.add_dependency 'headless'
43
+ spec.add_dependency 'pmap'
44
+
45
+ spec.add_dependency 'whenever'
46
+
47
+ spec.add_dependency 'pry'
48
+ spec.add_dependency 'rbcat', '~> 1.0'
48
49
  end
@@ -7,9 +7,7 @@ module Kimurai
7
7
  attr_reader :format, :path, :position, :append
8
8
 
9
9
  def initialize(path, format:, position: true, append: false)
10
- unless %i(json pretty_json jsonlines csv).include?(format)
11
- raise "SimpleSaver: wrong type of format: #{format}"
12
- end
10
+ raise "SimpleSaver: wrong type of format: #{format}" unless %i[json pretty_json jsonlines csv].include?(format)
13
11
 
14
12
  @path = path
15
13
  @format = format
@@ -42,48 +40,48 @@ module Kimurai
42
40
  def save_to_json(item)
43
41
  data = JSON.generate([item])
44
42
 
45
- if @index > 1 || append && File.exists?(path)
43
+ if @index > 1 || append && File.exist?(path)
46
44
  file_content = File.read(path).sub(/\}\]\Z/, "\}\,")
47
- File.open(path, "w") do |f|
48
- f.write(file_content + data.sub(/\A\[/, ""))
45
+ File.open(path, 'w') do |f|
46
+ f.write(file_content + data.sub(/\A\[/, ''))
49
47
  end
50
48
  else
51
- File.open(path, "w") { |f| f.write(data) }
49
+ File.open(path, 'w') { |f| f.write(data) }
52
50
  end
53
51
  end
54
52
 
55
53
  def save_to_pretty_json(item)
56
54
  data = JSON.pretty_generate([item])
57
55
 
58
- if @index > 1 || append && File.exists?(path)
56
+ if @index > 1 || append && File.exist?(path)
59
57
  file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n")
60
- File.open(path, "w") do |f|
61
- f.write(file_content + data.sub(/\A\[\n/, ""))
58
+ File.open(path, 'w') do |f|
59
+ f.write(file_content + data.sub(/\A\[\n/, ''))
62
60
  end
63
61
  else
64
- File.open(path, "w") { |f| f.write(data) }
62
+ File.open(path, 'w') { |f| f.write(data) }
65
63
  end
66
64
  end
67
65
 
68
66
  def save_to_jsonlines(item)
69
67
  data = JSON.generate(item)
70
68
 
71
- if @index > 1 || append && File.exists?(path)
72
- File.open(path, "a") { |file| file.write("\n" + data) }
69
+ if @index > 1 || append && File.exist?(path)
70
+ File.open(path, 'a') { |file| file.write("\n#{data}") }
73
71
  else
74
- File.open(path, "w") { |file| file.write(data) }
72
+ File.open(path, 'w') { |file| file.write(data) }
75
73
  end
76
74
  end
77
75
 
78
76
  def save_to_csv(item)
79
77
  data = flatten_hash(item)
80
78
 
81
- if @index > 1 || append && File.exists?(path)
82
- CSV.open(path, "a+", force_quotes: true) do |csv|
79
+ if @index > 1 || append && File.exist?(path)
80
+ CSV.open(path, 'a+', force_quotes: true) do |csv|
83
81
  csv << data.values
84
82
  end
85
83
  else
86
- CSV.open(path, "w", force_quotes: true) do |csv|
84
+ CSV.open(path, 'w', force_quotes: true) do |csv|
87
85
  csv << data.keys
88
86
  csv << data.values
89
87
  end
@@ -102,5 +100,3 @@ module Kimurai
102
100
  end
103
101
  end
104
102
  end
105
-
106
-
@@ -24,7 +24,7 @@ module Kimurai
24
24
  def add(scope, value)
25
25
  @mutex.synchronize do
26
26
  database[scope] ||= []
27
- if value.kind_of?(Array)
27
+ if value.is_a?(Array)
28
28
  database[scope] += value
29
29
  database[scope].uniq!
30
30
  else
data/lib/kimurai/base.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'English'
1
2
  require_relative 'base/saver'
2
3
  require_relative 'base/storage'
3
4
 
@@ -6,16 +7,16 @@ module Kimurai
6
7
  class InvalidUrlError < StandardError; end
7
8
 
8
9
  # don't deep merge config's headers hash option
9
- DMERGE_EXCLUDE = [:headers]
10
+ DMERGE_EXCLUDE = [:headers].freeze
10
11
 
11
12
  LoggerFormatter = proc do |severity, datetime, progname, msg|
12
13
  current_thread_id = Thread.current.object_id
13
- thread_type = Thread.main == Thread.current ? "M" : "C"
14
- output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
15
- .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]
14
+ thread_type = Thread.main == Thread.current ? 'M' : 'C'
15
+ output = format("%s, [%s#%d] [%s: %s] %5s -- %s: %s\n", severity[0..0], datetime, $PROCESS_ID, thread_type,
16
+ current_thread_id, severity, progname, msg)
16
17
 
17
- if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development"
18
- Rbcat.colorize(output, predefined: [:jsonhash, :logger])
18
+ if Kimurai.configuration.colorize_logger != false && Kimurai.env == 'development'
19
+ Rbcat.colorize(output, predefined: %i[jsonhash logger])
19
20
  else
20
21
  output
21
22
  end
@@ -51,11 +52,13 @@ module Kimurai
51
52
 
52
53
  def self.update(type, subtype)
53
54
  return unless @run_info
55
+
54
56
  @update_mutex.synchronize { @run_info[type][subtype] += 1 }
55
57
  end
56
58
 
57
59
  def self.add_event(scope, event)
58
60
  return unless @run_info
61
+
59
62
  @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
60
63
  end
61
64
 
@@ -93,9 +96,9 @@ module Kimurai
93
96
 
94
97
  def self.logger
95
98
  @logger ||= Kimurai.configuration.logger || begin
96
- log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
99
+ log_level = (ENV['LOG_LEVEL'] || Kimurai.configuration.log_level || 'DEBUG').to_s.upcase
97
100
  log_level = "Logger::#{log_level}".constantize
98
- Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
101
+ Logger.new($stdout, formatter: LoggerFormatter, level: log_level, progname: name)
99
102
  end
100
103
  end
101
104
 
@@ -116,13 +119,13 @@ module Kimurai
116
119
  ###
117
120
 
118
121
  logger.info "Spider: started: #{name}"
119
- open_spider if self.respond_to? :open_spider
122
+ open_spider if respond_to? :open_spider
120
123
 
121
- spider = self.new
124
+ spider = new
122
125
  spider.with_info = true
123
126
  if start_urls
124
127
  start_urls.each do |start_url|
125
- if start_url.class == Hash
128
+ if start_url.instance_of?(Hash)
126
129
  spider.request_to(:parse, start_url)
127
130
  else
128
131
  spider.request_to(:parse, url: start_url)
@@ -138,13 +141,13 @@ module Kimurai
138
141
  @run_info.merge!(status: :completed)
139
142
  ensure
140
143
  if spider
141
- spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
144
+ spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
142
145
 
143
146
  stop_time = Time.now
144
147
  total_time = (stop_time - @run_info[:start_time]).round(3)
145
148
  @run_info.merge!(stop_time: stop_time, running_time: total_time)
146
149
 
147
- close_spider if self.respond_to? :close_spider
150
+ close_spider if respond_to? :close_spider
148
151
 
149
152
  message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
150
153
  failed? ? logger.fatal(message) : logger.info(message)
@@ -154,7 +157,7 @@ module Kimurai
154
157
  end
155
158
 
156
159
  def self.parse!(handler, *args, **request)
157
- spider = self.new
160
+ spider = new
158
161
 
159
162
  if args.present?
160
163
  spider.public_send(handler, *args)
@@ -164,7 +167,7 @@ module Kimurai
164
167
  spider.public_send(handler)
165
168
  end
166
169
  ensure
167
- spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
170
+ spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
168
171
  end
169
172
 
170
173
  ###
@@ -191,17 +194,17 @@ module Kimurai
191
194
  end
192
195
 
193
196
  def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
194
- raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)
197
+ raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).is_a?(URI::HTTP)
195
198
 
196
199
  if @config[:skip_duplicate_requests] && !unique_request?(url)
197
- add_event(:duplicate_requests) if self.with_info
200
+ add_event(:duplicate_requests) if with_info
198
201
  logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
199
202
  end
200
203
 
201
204
  visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
202
205
  return unless visited
203
206
 
204
- public_send(handler, browser.current_response(response_type), { url: url, data: data })
207
+ public_send(handler, browser.current_response(response_type), **{ url: url, data: data })
205
208
  end
206
209
 
207
210
  def console(response = nil, url: nil, data: {})
@@ -211,9 +214,9 @@ module Kimurai
211
214
  ###
212
215
 
213
216
  def storage
214
- # Note: for `.crawl!` uses shared thread safe Storage instance,
217
+ # NOTE: for `.crawl!` uses shared thread safe Storage instance,
215
218
  # otherwise, each spider instance will have it's own Storage
216
- @storage ||= self.with_info ? self.class.storage : Storage.new
219
+ @storage ||= with_info ? self.class.storage : Storage.new
217
220
  end
218
221
 
219
222
  def unique?(scope, value)
@@ -223,10 +226,10 @@ module Kimurai
223
226
  def save_to(path, item, format:, position: true, append: false)
224
227
  @savers[path] ||= begin
225
228
  options = { format: format, position: position, append: append }
226
- if self.with_info
227
- self.class.savers[path] ||= Saver.new(path, options)
229
+ if with_info
230
+ self.class.savers[path] ||= Saver.new(path, **options)
228
231
  else
229
- Saver.new(path, options)
232
+ Saver.new(path, **options)
230
233
  end
231
234
  end
232
235
 
@@ -236,11 +239,8 @@ module Kimurai
236
239
  ###
237
240
 
238
241
  def add_event(scope = :custom, event)
239
- unless self.with_info
240
- raise "It's allowed to use `add_event` only while performing a full run (`.crawl!` method)"
241
- end
242
+ self.class.add_event(scope, event) if with_info
242
243
 
243
- self.class.add_event(scope, event)
244
244
  logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
245
245
  end
246
246
 
@@ -254,35 +254,35 @@ module Kimurai
254
254
 
255
255
  def unique_request?(url)
256
256
  options = @config[:skip_duplicate_requests]
257
- if options.class == Hash
257
+ if options.instance_of?(Hash)
258
258
  scope = options[:scope] || :requests_urls
259
259
  if options[:check_only]
260
260
  storage.include?(scope, url) ? false : true
261
261
  else
262
- storage.unique?(scope, url) ? true : false
262
+ storage.unique?(scope, url) || false
263
263
  end
264
264
  else
265
- storage.unique?(:requests_urls, url) ? true : false
265
+ storage.unique?(:requests_urls, url) || false
266
266
  end
267
267
  end
268
268
 
269
269
  def send_item(item, options = {})
270
270
  logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
271
- self.class.update(:items, :sent) if self.with_info
271
+ self.class.update(:items, :sent) if with_info
272
272
 
273
273
  @pipelines.each do |name, instance|
274
274
  item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
275
275
  end
276
- rescue => e
276
+ rescue StandardError => e
277
277
  logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
278
- add_event(:drop_items_errors, e.inspect) if self.with_info
278
+ add_event(:drop_items_errors, e.inspect) if with_info
279
279
  false
280
280
  else
281
- self.class.update(:items, :processed) if self.with_info
281
+ self.class.update(:items, :processed) if with_info
282
282
  logger.info "Pipeline: processed: #{JSON.generate(item)}"
283
283
  true
284
284
  ensure
285
- if self.with_info
285
+ if with_info
286
286
  logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
287
287
  end
288
288
  end
@@ -300,10 +300,10 @@ module Kimurai
300
300
  Thread.current.abort_on_exception = true
301
301
 
302
302
  spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
303
- spider.with_info = true if self.with_info
303
+ spider.with_info = true if with_info
304
304
 
305
305
  part.each do |url_data|
306
- if url_data.class == Hash
306
+ if url_data.instance_of?(Hash)
307
307
  if url_data[:url].present? && url_data[:data].present?
308
308
  spider.request_to(handler, delay, url_data)
309
309
  else
@@ -314,7 +314,7 @@ module Kimurai
314
314
  end
315
315
  end
316
316
  ensure
317
- spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
317
+ spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
318
318
  end
319
319
 
320
320
  sleep 0.5
@@ -4,13 +4,14 @@ module Kimurai
4
4
 
5
5
  def absolute_url(url, base:)
6
6
  return unless url
7
- URI.join(base, URI.escape(url)).to_s
7
+
8
+ URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
8
9
  end
9
10
 
10
11
  def escape_url(url)
11
- uri = URI.parse(url)
12
- rescue URI::InvalidURIError => e
13
- URI.parse(URI.escape url).to_s rescue url
12
+ URI.parse(url)
13
+ rescue URI::InvalidURIError
14
+ URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s rescue url
14
15
  else
15
16
  url
16
17
  end