kimurai 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +21 -0
  4. data/Gemfile +2 -2
  5. data/README.md +476 -648
  6. data/Rakefile +6 -6
  7. data/bin/console +3 -4
  8. data/exe/kimurai +0 -1
  9. data/kimurai.gemspec +38 -37
  10. data/lib/kimurai/base/saver.rb +15 -19
  11. data/lib/kimurai/base/storage.rb +1 -1
  12. data/lib/kimurai/base.rb +38 -38
  13. data/lib/kimurai/base_helper.rb +5 -4
  14. data/lib/kimurai/browser_builder/mechanize_builder.rb +121 -119
  15. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +160 -152
  16. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +162 -160
  17. data/lib/kimurai/browser_builder.rb +1 -7
  18. data/lib/kimurai/capybara_configuration.rb +1 -1
  19. data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
  20. data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
  21. data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
  22. data/lib/kimurai/capybara_ext/session.rb +31 -38
  23. data/lib/kimurai/cli/generator.rb +15 -15
  24. data/lib/kimurai/cli.rb +49 -86
  25. data/lib/kimurai/core_ext/array.rb +2 -2
  26. data/lib/kimurai/core_ext/hash.rb +1 -1
  27. data/lib/kimurai/core_ext/numeric.rb +4 -4
  28. data/lib/kimurai/pipeline.rb +2 -1
  29. data/lib/kimurai/runner.rb +6 -6
  30. data/lib/kimurai/template/Gemfile +2 -2
  31. data/lib/kimurai/template/config/boot.rb +4 -4
  32. data/lib/kimurai/template/config/schedule.rb +15 -15
  33. data/lib/kimurai/template/spiders/application_spider.rb +8 -14
  34. data/lib/kimurai/version.rb +1 -1
  35. data/lib/kimurai.rb +7 -3
  36. metadata +58 -65
  37. data/.travis.yml +0 -5
  38. data/lib/kimurai/automation/deploy.yml +0 -54
  39. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
  40. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
  41. data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
  42. data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
  43. data/lib/kimurai/automation/setup.yml +0 -44
  44. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -175
  45. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
  46. data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
  47. data/lib/kimurai/template/config/automation.yml +0 -13
@@ -1,34 +1,38 @@
1
1
  require_relative '../driver/base'
2
2
 
3
- class Capybara::Selenium::Driver
4
- def get_cookies
5
- browser.manage.all_cookies
6
- end
7
-
8
- def set_cookie(name, value, options = {})
9
- options[:name] ||= name
10
- options[:value] ||= value
11
-
12
- browser.manage.add_cookie(options)
13
- end
14
-
15
- def set_cookies(cookies)
16
- cookies.each do |cookie|
17
- set_cookie(cookie[:name], cookie[:value], cookie)
3
+ module Capybara
4
+ module Selenium
5
+ class Driver
6
+ def get_cookies
7
+ browser.manage.all_cookies
8
+ end
9
+
10
+ def set_cookie(name, value, options = {})
11
+ options[:name] ||= name
12
+ options[:value] ||= value
13
+
14
+ browser.manage.add_cookie(options)
15
+ end
16
+
17
+ def set_cookies(cookies)
18
+ cookies.each do |cookie|
19
+ set_cookie(cookie[:name], cookie[:value], cookie)
20
+ end
21
+ end
22
+
23
+ def clear_cookies
24
+ browser.manage.delete_all_cookies
25
+ end
26
+
27
+ ###
28
+
29
+ def pid
30
+ @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
31
+ end
32
+
33
+ def port
34
+ @port ||= browser.send(:bridge).instance_variable_get('@http').instance_variable_get('@server_url').port
35
+ end
18
36
  end
19
37
  end
20
-
21
- def clear_cookies
22
- browser.manage.delete_all_cookies
23
- end
24
-
25
- ###
26
-
27
- def pid
28
- @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
29
- end
30
-
31
- def port
32
- @port ||= browser.send(:bridge).instance_variable_get("@http").instance_variable_get("@server_url").port
33
- end
34
38
  end
@@ -7,11 +7,12 @@ module Capybara
7
7
  class Session
8
8
  attr_accessor :spider
9
9
 
10
- alias_method :original_visit, :visit
10
+ alias original_visit visit
11
11
  def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
12
12
  if spider
13
13
  process_delay(delay) if delay
14
- retries, sleep_interval = 0, 0
14
+ retries = 0
15
+ sleep_interval = 0
15
16
 
16
17
  begin
17
18
  check_request_options(visit_uri) unless skip_request_options
@@ -19,7 +20,7 @@ module Capybara
19
20
  spider.class.update(:visits, :requests) if spider.with_info
20
21
 
21
22
  original_visit(visit_uri)
22
- rescue => e
23
+ rescue StandardError => e
23
24
  if match_error?(e, type: :to_skip)
24
25
  logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
25
26
  spider.add_event(:requests_errors, e.inspect) if spider.with_info
@@ -29,7 +30,7 @@ module Capybara
29
30
  spider.add_event(:requests_errors, e.inspect) if spider.with_info
30
31
 
31
32
  if (retries += 1) <= max_retries
32
- logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
33
+ logger.info "Browser: sleep #{sleep_interval += 15} seconds and process retry № #{retries} to the url: #{visit_uri}"
33
34
  sleep sleep_interval and retry
34
35
  else
35
36
  logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
@@ -48,7 +49,7 @@ module Capybara
48
49
  logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
49
50
  end
50
51
 
51
- if memory = driver.current_memory
52
+ if (memory = driver.current_memory)
52
53
  logger.debug "Browser: driver.current_memory: #{memory}"
53
54
  end
54
55
  end
@@ -62,7 +63,7 @@ module Capybara
62
63
  begin
63
64
  @driver.quit
64
65
  # handle Net::ReadTimeout error for Selenium like drivers
65
- rescue Net::ReadTimeout => e
66
+ rescue Net::ReadTimeout
66
67
  @driver.quit
67
68
  end
68
69
 
@@ -74,13 +75,8 @@ module Capybara
74
75
  end
75
76
 
76
77
  def restart!
77
- if mode.match?(/poltergeist/)
78
- @driver.browser.restart
79
- @driver.requests, @driver.responses = 0, 0
80
- else
81
- destroy_driver!
82
- driver
83
- end
78
+ destroy_driver!
79
+ driver
84
80
 
85
81
  logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
86
82
  end
@@ -90,7 +86,7 @@ module Capybara
90
86
  when :html
91
87
  if config.encoding
92
88
  if config.encoding == :auto
93
- charset = body.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["]?([\w+\d+\-]*)/i, 1]
89
+ charset = body.force_encoding('ISO-8859-1').encode('UTF-8')[/<meta.*?charset="?([\w+\d+-]*)/i, 1]
94
90
  Nokogiri::HTML(body, nil, charset)
95
91
  else
96
92
  Nokogiri::HTML(body, nil, config.encoding)
@@ -108,23 +104,22 @@ module Capybara
108
104
  # Handy method to perform some processing in the new tab within block and then automatically close this tab:
109
105
  # Usage (url):
110
106
  # browser.within_new_window_by(url: "https://google.com") do
111
- # do some stuff and then automatically close this tab and return back to the first tab
107
+ # do some stuff and then automatically close this tab and return back to the first tab
112
108
  # end
113
109
  # Usage (action) (when new tab opening by some action, for example by clicking
114
110
  # on a particular element):
115
111
  # action = -> { browser.find("//some/element/path").click }
116
112
  # browser.within_new_window_by(action: action) do
117
- # do some stuff and then automatically close this tab and return back to the first tab
113
+ # do some stuff and then automatically close this tab and return back to the first tab
118
114
  # end
119
115
  def within_new_window_by(action: nil, url: nil)
120
- case
121
- when action
116
+ if action
122
117
  opened_window = window_opened_by { action.call }
123
118
  within_window(opened_window) do
124
119
  yield
125
120
  current_window.close
126
121
  end
127
- when url
122
+ elsif url
128
123
  within_window(open_new_window) do
129
124
  visit(url)
130
125
 
@@ -137,14 +132,14 @@ module Capybara
137
132
  ###
138
133
 
139
134
  def scroll_to_bottom
140
- execute_script("window.scrollBy(0,10000)")
135
+ execute_script('window.scrollBy(0,10000)')
141
136
  end
142
137
 
143
138
  private
144
139
 
145
140
  def skip_error_on_failure?(e)
146
141
  config.retry_request_errors.any? do |error|
147
- error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash)
142
+ error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.is_a?(Hash)
148
143
  end
149
144
  end
150
145
 
@@ -156,10 +151,10 @@ module Capybara
156
151
  end
157
152
 
158
153
  errors.any? do |error|
159
- if error.kind_of?(Hash)
154
+ if error.is_a?(Hash)
160
155
  match_class = e.class.ancestors.include?(error[:error])
161
156
  if error[:message].present?
162
- if error[:message].kind_of?(Regexp)
157
+ if error[:message].is_a?(Regexp)
163
158
  e.message&.match?(error[:message])
164
159
  else
165
160
  e.message&.include?(error[:message])
@@ -174,14 +169,14 @@ module Capybara
174
169
  end
175
170
 
176
171
  def process_delay(delay)
177
- interval = (delay.class == Range ? rand(delay) : delay)
172
+ interval = (delay.instance_of?(Range) ? rand(delay) : delay)
178
173
  logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
179
174
  sleep interval
180
175
  end
181
176
 
182
177
  def check_request_options(url_to_visit)
183
178
  # restart_if
184
- if memory_limit = config.restart_if[:memory_limit]
179
+ if (memory_limit = config.restart_if[:memory_limit])
185
180
  memory = driver.current_memory
186
181
  if memory && memory >= memory_limit
187
182
  logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
@@ -189,7 +184,7 @@ module Capybara
189
184
  end
190
185
  end
191
186
 
192
- if requests_limit = config.restart_if[:requests_limit]
187
+ if (requests_limit = config.restart_if[:requests_limit])
193
188
  requests = driver.requests
194
189
  if requests >= requests_limit
195
190
  logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
@@ -209,7 +204,7 @@ module Capybara
209
204
 
210
205
  if config.before_request[:clear_cookies]
211
206
  driver.clear_cookies
212
- logger.debug "Browser: cleared cookies before request"
207
+ logger.debug 'Browser: cleared cookies before request'
213
208
  end
214
209
 
215
210
  if config.before_request[:clear_and_set_cookies]
@@ -217,29 +212,27 @@ module Capybara
217
212
 
218
213
  # (Selenium only) if browser is not visited yet any page, visit url_to_visit
219
214
  # first and then set cookies (needs after browser restart):
220
- if driver.visited.nil? && mode.match?(/selenium/)
221
- visit(url_to_visit, skip_request_options: true)
222
- end
215
+ visit(url_to_visit, skip_request_options: true) if driver.visited.nil? && mode.match?(/selenium/)
223
216
 
224
217
  config.cookies.each do |cookie|
225
218
  driver.set_cookie(cookie[:name], cookie[:value], cookie)
226
219
  end
227
220
 
228
- logger.debug "Browser: cleared and set cookies before request"
221
+ logger.debug 'Browser: cleared and set cookies before request'
229
222
  end
230
223
 
231
224
  # user_agent
232
225
  if config.before_request[:change_user_agent]
233
- driver.add_header("User-Agent", config.user_agent.call)
234
- logger.debug "Browser: changed user_agent before request"
226
+ driver.add_header('User-Agent', config.user_agent.call)
227
+ logger.debug 'Browser: changed user_agent before request'
235
228
  end
236
229
 
237
230
  # proxy
238
- if config.before_request[:change_proxy]
239
- proxy_string = config.proxy.call
240
- driver.set_proxy(*proxy_string.split(":"))
241
- logger.debug "Browser: changed proxy before request"
242
- end
231
+ return unless config.before_request[:change_proxy]
232
+
233
+ proxy_string = config.proxy.call
234
+ driver.set_proxy(*proxy_string.split(':'))
235
+ logger.debug 'Browser: changed proxy before request'
243
236
  end
244
237
 
245
238
  def logger
@@ -4,20 +4,20 @@ module Kimurai
4
4
  include Thor::Actions
5
5
 
6
6
  def self.source_root
7
- File.dirname(File.expand_path('..', __FILE__))
7
+ File.dirname(File.expand_path(__dir__))
8
8
  end
9
9
 
10
10
  def generate_project(project_name)
11
- directory "template", project_name
11
+ directory 'template', project_name
12
12
  inside(project_name) do
13
- run "bundle install"
14
- run "git init"
13
+ run 'bundle install'
14
+ run 'git init'
15
15
  end
16
16
  end
17
17
 
18
18
  def generate_spider(spider_name, in_project:)
19
19
  spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
20
- raise "Spider #{spider_path} already exists" if File.exists? spider_path
20
+ raise "Spider #{spider_path} already exists" if File.exist? spider_path
21
21
 
22
22
  spider_class = to_spider_class(spider_name)
23
23
  create_file spider_path do
@@ -33,24 +33,24 @@ module Kimurai
33
33
  RUBY
34
34
  end
35
35
 
36
- unless in_project
37
- insert_into_file spider_path, " @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
38
- prepend_to_file spider_path, "require 'kimurai'\n\n"
39
- append_to_file spider_path, "\n#{spider_class}.crawl!"
40
- end
36
+ return if in_project
37
+
38
+ insert_into_file spider_path, " @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
39
+ prepend_to_file spider_path, "require 'kimurai'\n\n"
40
+ append_to_file spider_path, "\n#{spider_class}.crawl!"
41
41
  end
42
42
 
43
43
  def generate_schedule
44
- copy_file "template/config/schedule.rb", "./schedule.rb"
44
+ copy_file 'template/config/schedule.rb', './schedule.rb'
45
45
  end
46
46
 
47
47
  private
48
48
 
49
49
  def to_spider_class(string)
50
- string.sub(/^./) { $&.capitalize }
51
- .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
52
- .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
53
- .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
50
+ string.sub(/^./) { ::Regexp.last_match(0).capitalize }
51
+ .gsub(%r{(?:_|(/))([a-z\d]*)}) { "#{::Regexp.last_match(1)}#{::Regexp.last_match(2).capitalize}" }
52
+ .gsub(%r{(?:-|(/))([a-z\d]*)}) { "Dash#{::Regexp.last_match(2).capitalize}" }
53
+ .gsub(%r{(?:\.|(/))([a-z\d]*)}) { "#{::Regexp.last_match(1)}#{::Regexp.last_match(2).capitalize}" }
54
54
  end
55
55
  end
56
56
  end
data/lib/kimurai/cli.rb CHANGED
@@ -4,18 +4,22 @@ module Kimurai
4
4
  class CLI < Thor
5
5
  map %w[--version -v] => :__print_version
6
6
 
7
- desc "generate", "Generator, available types: project, spider, schedule"
7
+ desc 'new PROJECT_NAME', 'Create a new Kimurai project'
8
+ def new(project_name)
9
+ raise 'Provide project name to generate a new project' unless project_name.present?
10
+
11
+ Generator.new.generate_project(project_name)
12
+ end
13
+
14
+ desc 'generate', 'Generator, available types: spider, schedule'
8
15
  def generate(generator_type, *args)
9
16
  case generator_type
10
- when "project"
11
- project_name = args.shift
12
- raise "Provide project name to generate a new project" unless project_name.present?
13
- Generator.new.generate_project(project_name)
14
- when "spider"
17
+ when 'spider'
15
18
  spider_name = args.shift
16
- raise "Provide spider name to generate a spider" unless spider_name.present?
19
+ raise 'Provide spider name to generate a spider' unless spider_name.present?
20
+
17
21
  Generator.new.generate_spider(spider_name, in_project: inside_project?)
18
- when "schedule"
22
+ when 'schedule'
19
23
  Generator.new.generate_schedule
20
24
  else
21
25
  raise "Don't know this generator type: #{generator_type}"
@@ -24,82 +28,43 @@ module Kimurai
24
28
 
25
29
  ###
26
30
 
27
- desc "setup", "Setup server"
28
- option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
29
- option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
30
- option "ask-auth-pass", type: :boolean, banner: "Auth using password"
31
- option "ssh-key-path", type: :string, banner: "Auth using ssh key"
32
- option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
33
- def setup(user_host)
34
- command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get
35
-
36
- pid = spawn *command
37
- Process.wait pid
38
- end
39
-
40
- desc "deploy", "Deploy project to the server and update cron schedule"
41
- option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
42
- option "ask-auth-pass", type: :boolean, banner: "Auth using password"
43
- option "ssh-key-path", type: :string, banner: "Auth using ssh key"
44
- option "repo-url", type: :string, banner: "Repo url"
45
- option "repo-key-path", type: :string, banner: "SSH key for a git repo"
46
- def deploy(user_host)
47
- if !`git status --short`.empty?
48
- raise "Deploy: Please commit your changes first"
49
- elsif `git remote`.empty?
50
- raise "Deploy: Please add remote origin repository to your repo first"
51
- elsif !`git rev-list master...origin/master`.empty?
52
- raise "Deploy: Please push your commits to the remote origin repo first"
53
- end
54
-
55
- repo_url = options["repo-url"] ? options["repo-url"] : `git remote get-url origin`.strip
56
- repo_name = repo_url[/\/([^\/]*)\.git/i, 1]
57
-
58
- command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy",
59
- vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
60
- ).get
61
-
62
- pid = spawn *command
63
- Process.wait pid
64
- end
65
-
66
- ###
67
-
68
- desc "crawl", "Run a particular spider by it's name"
31
+ desc 'crawl', "Run a particular spider by it's name"
69
32
  def crawl(spider_name)
70
33
  raise "Can't find Kimurai project" unless inside_project?
34
+
71
35
  require './config/boot'
72
36
 
73
- unless klass = Kimurai.find_by_name(spider_name)
37
+ unless (klass = Kimurai.find_by_name(spider_name))
74
38
  raise "Can't find spider with name `#{spider_name}` in the project. " \
75
- "To list all available spiders, run: `$ bundle exec kimurai list`"
39
+ 'To list all available spiders, run: `$ bundle exec kimurai list`'
76
40
  end
77
41
 
78
42
  # Set time_zone if exists
79
- if time_zone = Kimurai.configuration.time_zone
43
+ if (time_zone = Kimurai.configuration.time_zone)
80
44
  Kimurai.time_zone = time_zone
81
45
  end
82
46
 
83
47
  klass.crawl!
84
48
  end
85
49
 
86
- desc "parse", "Parse url in the particular spider method"
87
- option :url, type: :string, required: true, banner: "Url to pass to the method"
50
+ desc 'parse', 'Parse url in the particular spider method'
51
+ option :url, type: :string, required: true, banner: 'Url to pass to the method'
88
52
  def parse(spider_name, method_name)
89
53
  raise "Can't find Kimurai project" unless inside_project?
54
+
90
55
  require './config/boot'
91
56
 
92
- unless klass = Kimurai.find_by_name(spider_name)
57
+ unless (klass = Kimurai.find_by_name(spider_name))
93
58
  raise "Can't find spider with name `#{spider_name}` in the project. " \
94
- "To list all available spiders, run: `$ bundle exec kimurai list`"
59
+ 'To list all available spiders, run: `$ bundle exec kimurai list`'
95
60
  end
96
61
 
97
- klass.parse!(method_name, url: options["url"])
62
+ klass.parse!(method_name, url: options['url'])
98
63
  end
99
64
 
100
- desc "console", "Start Kimurai console"
101
- option :engine, type: :string, banner: "Engine to use"
102
- option :url, type: :string, banner: "Url to process"
65
+ desc 'console', 'Start Kimurai console'
66
+ option :engine, type: :string, banner: 'Engine to use'
67
+ option :url, type: :string, banner: 'Url to process'
103
68
  def console(spider_name = nil)
104
69
  require 'pry'
105
70
  require './config/boot' if inside_project?
@@ -107,74 +72,72 @@ module Kimurai
107
72
  if spider_name
108
73
  raise "Can't find Kimurai project" unless inside_project?
109
74
 
110
- unless klass = Kimurai.find_by_name(spider_name)
75
+ unless (klass = Kimurai.find_by_name(spider_name))
111
76
  raise "Can't find spider with name `#{spider_name}` in the project. " \
112
- "To list all available spiders, run: `$ bundle exec kimurai list`"
77
+ 'To list all available spiders, run: `$ bundle exec kimurai list`'
113
78
  end
114
79
  else
115
80
  klass = inside_project? ? ApplicationSpider : ::Kimurai::Base
116
81
  end
117
82
 
118
- engine = options["engine"]&.delete(":")&.to_sym
119
- if url = options["url"]
120
- klass.new(engine).request_to(:console, url: options["url"])
83
+ engine = options['engine']&.delete(':')&.to_sym
84
+ if options['url']
85
+ klass.new(engine).request_to(:console, url: options['url'])
121
86
  else
122
87
  klass.new(engine).public_send(:console)
123
88
  end
124
89
  end
125
90
 
126
- desc "list", "List all available spiders in the current project"
91
+ desc 'list', 'List all available spiders in the current project'
127
92
  def list
128
93
  raise "Can't find Kimurai project" unless inside_project?
94
+
129
95
  require './config/boot'
130
96
 
131
- Kimurai.list.keys.each { |name| puts name }
97
+ Kimurai.list.keys.sort.each { |name| puts name }
132
98
  end
133
99
 
134
- desc "runner", "Run all spiders in the project in queue"
135
- option :include, type: :array, default: [], banner: "List of spiders to run"
136
- option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
137
- option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
100
+ desc 'runner', 'Run all spiders in the project in queue'
101
+ option :include, type: :array, default: [], banner: 'List of spiders to run'
102
+ option :exclude, type: :array, default: [], banner: 'List of spiders to exclude from run'
103
+ option :jobs, aliases: :j, type: :numeric, default: 1, banner: 'The number of concurrent jobs'
138
104
  def runner
139
105
  raise "Can't find Kimurai project" unless inside_project?
140
106
 
141
- jobs = options["jobs"]
142
- raise "Jobs count can't be 0" if jobs == 0
107
+ jobs = options['jobs']
108
+ raise "Jobs count can't be 0" if jobs.zero?
143
109
 
144
110
  require './config/boot'
145
111
  require 'kimurai/runner'
146
112
 
147
- spiders = options["include"].presence || Kimurai.list.keys
148
- spiders -= options["exclude"]
113
+ spiders = options['include'].presence || Kimurai.list.keys
114
+ spiders -= options['exclude']
149
115
 
150
116
  Runner.new(spiders, jobs).run!
151
117
  end
152
118
 
153
- desc "--version, -v", "Print the version"
119
+ desc '--version, -v', 'Print the version'
154
120
  def __print_version
155
121
  puts VERSION
156
122
  end
157
123
 
158
- desc "dashboard", "Run dashboard"
124
+ desc 'dashboard', 'Run dashboard'
159
125
  def dashboard
160
126
  raise "Can't find Kimurai project" unless inside_project?
161
127
 
162
128
  require './config/boot'
163
- if Object.const_defined?("Kimurai::Dashboard")
164
- require 'kimurai/dashboard/app'
165
- Kimurai::Dashboard::App.run!
166
- else
167
- raise "Kimurai::Dashboard is not defined"
168
- end
129
+ raise 'Kimurai::Dashboard is not defined' unless Object.const_defined?('Kimurai::Dashboard')
130
+
131
+ require 'kimurai/dashboard/app'
132
+ Kimurai::Dashboard::App.run!
169
133
  end
170
134
 
171
135
  private
172
136
 
173
137
  def inside_project?
174
- Dir.exists?("spiders") && File.exists?("./config/boot.rb")
138
+ Dir.exist?('spiders') && File.exist?('./config/boot.rb')
175
139
  end
176
140
  end
177
141
  end
178
142
 
179
143
  require_relative 'cli/generator'
180
- require_relative 'cli/ansible_command_builder'
@@ -1,8 +1,8 @@
1
1
  class Array
2
2
  def in_sorted_groups(number, fill_width = nil)
3
- sorted_groups = Array.new(number) { |a| a = [] }
3
+ sorted_groups = Array.new(number) { |_a| [] }
4
4
 
5
- self.in_groups_of(number, fill_width).each do |group|
5
+ in_groups_of(number, fill_width).each do |group|
6
6
  number.times do |i|
7
7
  group.fetch(i) rescue next
8
8
  sorted_groups[i] << group[i]
@@ -1,5 +1,5 @@
1
1
  class Hash
2
2
  def deep_merge_excl(second, exclude)
3
- self.merge(second.slice(*exclude)).deep_merge(second.except(*exclude))
3
+ merge(second.slice(*exclude)).deep_merge(second.except(*exclude))
4
4
  end
5
5
  end
@@ -1,16 +1,16 @@
1
1
  class Numeric
2
2
  # https://stackoverflow.com/a/1679963
3
3
  def duration
4
- secs = self.to_int
4
+ secs = to_int
5
5
  mins = secs / 60
6
6
  hours = mins / 60
7
7
  days = hours / 24
8
8
 
9
- if days > 0
9
+ if days.positive?
10
10
  "#{days}d, #{hours % 24}h"
11
- elsif hours > 0
11
+ elsif hours.positive?
12
12
  "#{hours}h, #{mins % 60}m"
13
- elsif mins > 0
13
+ elsif mins.positive?
14
14
  "#{mins}m, #{secs % 60}s"
15
15
  elsif secs >= 0
16
16
  "#{secs}s"
@@ -1,8 +1,9 @@
1
1
  module Kimurai
2
2
  class Pipeline
3
3
  class DropItemError < StandardError; end
4
+
4
5
  def self.name
5
- self.to_s.sub(/.*?::/, "").underscore.to_sym
6
+ to_s.sub(/.*?::/, '').underscore.to_sym
6
7
  end
7
8
 
8
9
  include BaseHelper
@@ -19,17 +19,17 @@ module Kimurai
19
19
  spiders: @spiders
20
20
  }
21
21
 
22
- if time_zone = Kimurai.configuration.time_zone
22
+ if (time_zone = Kimurai.configuration.time_zone)
23
23
  Kimurai.time_zone = time_zone
24
24
  end
25
25
 
26
- ENV.store("SESSION_ID", @start_time.to_i.to_s)
27
- ENV.store("RBCAT_COLORIZER", "false")
26
+ ENV.store('SESSION_ID', @start_time.to_i.to_s)
27
+ ENV.store('RBCAT_COLORIZER', 'false')
28
28
  end
29
29
 
30
30
  def run!(exception_on_fail: true)
31
31
  puts ">>> Runner: started: #{session_info}"
32
- if at_start_callback = Kimurai.configuration.runner_at_start_callback
32
+ if (at_start_callback = Kimurai.configuration.runner_at_start_callback)
33
33
  at_start_callback.call(session_info)
34
34
  end
35
35
 
@@ -38,7 +38,7 @@ module Kimurai
38
38
  next unless running
39
39
 
40
40
  puts "> Runner: started spider: #{spider}, index: #{i}"
41
- pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
41
+ pid = spawn('bundle', 'exec', 'kimurai', 'crawl', spider, %i[out err] => "log/#{spider}.log")
42
42
  Process.wait pid
43
43
 
44
44
  puts "< Runner: stopped spider: #{spider}, index: #{i}"
@@ -51,7 +51,7 @@ module Kimurai
51
51
  else
52
52
  session_info.merge!(status: :completed, stop_time: Time.now)
53
53
  ensure
54
- if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
54
+ if (at_stop_callback = Kimurai.configuration.runner_at_stop_callback)
55
55
  at_stop_callback.call(session_info)
56
56
  end
57
57
  puts "<<< Runner: stopped: #{session_info}"
@@ -1,10 +1,10 @@
1
1
  source 'https://rubygems.org'
2
2
  git_source(:github) { |repo| "https://github.com/#{repo}.git" }
3
3
 
4
- ruby '>= 2.5'
4
+ ruby '>= 3.1'
5
5
 
6
6
  # Framework
7
- gem 'kimurai', '~> 1.4'
7
+ gem 'kimurai', '~> 2.0'
8
8
 
9
9
  # Require files in directory and child directories recursively
10
10
  gem 'require_all'