kimurai 1.4.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +21 -0
- data/Gemfile +2 -2
- data/README.md +476 -648
- data/Rakefile +6 -6
- data/bin/console +3 -4
- data/exe/kimurai +0 -1
- data/kimurai.gemspec +38 -37
- data/lib/kimurai/base/saver.rb +15 -19
- data/lib/kimurai/base/storage.rb +1 -1
- data/lib/kimurai/base.rb +38 -38
- data/lib/kimurai/base_helper.rb +5 -4
- data/lib/kimurai/browser_builder/mechanize_builder.rb +121 -119
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +160 -152
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +162 -160
- data/lib/kimurai/browser_builder.rb +1 -7
- data/lib/kimurai/capybara_configuration.rb +1 -1
- data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
- data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
- data/lib/kimurai/capybara_ext/session.rb +31 -38
- data/lib/kimurai/cli/generator.rb +15 -15
- data/lib/kimurai/cli.rb +49 -86
- data/lib/kimurai/core_ext/array.rb +2 -2
- data/lib/kimurai/core_ext/hash.rb +1 -1
- data/lib/kimurai/core_ext/numeric.rb +4 -4
- data/lib/kimurai/pipeline.rb +2 -1
- data/lib/kimurai/runner.rb +6 -6
- data/lib/kimurai/template/Gemfile +2 -2
- data/lib/kimurai/template/config/boot.rb +4 -4
- data/lib/kimurai/template/config/schedule.rb +15 -15
- data/lib/kimurai/template/spiders/application_spider.rb +8 -14
- data/lib/kimurai/version.rb +1 -1
- data/lib/kimurai.rb +7 -3
- metadata +58 -65
- data/.travis.yml +0 -5
- data/lib/kimurai/automation/deploy.yml +0 -54
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
- data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
- data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
- data/lib/kimurai/automation/setup.yml +0 -44
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -175
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
- data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
- data/lib/kimurai/template/config/automation.yml +0 -13
|
@@ -1,34 +1,38 @@
|
|
|
1
1
|
require_relative '../driver/base'
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
3
|
+
module Capybara
|
|
4
|
+
module Selenium
|
|
5
|
+
class Driver
|
|
6
|
+
def get_cookies
|
|
7
|
+
browser.manage.all_cookies
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def set_cookie(name, value, options = {})
|
|
11
|
+
options[:name] ||= name
|
|
12
|
+
options[:value] ||= value
|
|
13
|
+
|
|
14
|
+
browser.manage.add_cookie(options)
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def set_cookies(cookies)
|
|
18
|
+
cookies.each do |cookie|
|
|
19
|
+
set_cookie(cookie[:name], cookie[:value], cookie)
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def clear_cookies
|
|
24
|
+
browser.manage.delete_all_cookies
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
###
|
|
28
|
+
|
|
29
|
+
def pid
|
|
30
|
+
@pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def port
|
|
34
|
+
@port ||= browser.send(:bridge).instance_variable_get('@http').instance_variable_get('@server_url').port
|
|
35
|
+
end
|
|
18
36
|
end
|
|
19
37
|
end
|
|
20
|
-
|
|
21
|
-
def clear_cookies
|
|
22
|
-
browser.manage.delete_all_cookies
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
###
|
|
26
|
-
|
|
27
|
-
def pid
|
|
28
|
-
@pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
def port
|
|
32
|
-
@port ||= browser.send(:bridge).instance_variable_get("@http").instance_variable_get("@server_url").port
|
|
33
|
-
end
|
|
34
38
|
end
|
|
@@ -7,11 +7,12 @@ module Capybara
|
|
|
7
7
|
class Session
|
|
8
8
|
attr_accessor :spider
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
alias original_visit visit
|
|
11
11
|
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
|
|
12
12
|
if spider
|
|
13
13
|
process_delay(delay) if delay
|
|
14
|
-
retries
|
|
14
|
+
retries = 0
|
|
15
|
+
sleep_interval = 0
|
|
15
16
|
|
|
16
17
|
begin
|
|
17
18
|
check_request_options(visit_uri) unless skip_request_options
|
|
@@ -19,7 +20,7 @@ module Capybara
|
|
|
19
20
|
spider.class.update(:visits, :requests) if spider.with_info
|
|
20
21
|
|
|
21
22
|
original_visit(visit_uri)
|
|
22
|
-
rescue => e
|
|
23
|
+
rescue StandardError => e
|
|
23
24
|
if match_error?(e, type: :to_skip)
|
|
24
25
|
logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
|
|
25
26
|
spider.add_event(:requests_errors, e.inspect) if spider.with_info
|
|
@@ -29,7 +30,7 @@ module Capybara
|
|
|
29
30
|
spider.add_event(:requests_errors, e.inspect) if spider.with_info
|
|
30
31
|
|
|
31
32
|
if (retries += 1) <= max_retries
|
|
32
|
-
logger.info "Browser: sleep #{
|
|
33
|
+
logger.info "Browser: sleep #{sleep_interval += 15} seconds and process retry № #{retries} to the url: #{visit_uri}"
|
|
33
34
|
sleep sleep_interval and retry
|
|
34
35
|
else
|
|
35
36
|
logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
|
|
@@ -48,7 +49,7 @@ module Capybara
|
|
|
48
49
|
logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
|
|
49
50
|
end
|
|
50
51
|
|
|
51
|
-
if memory = driver.current_memory
|
|
52
|
+
if (memory = driver.current_memory)
|
|
52
53
|
logger.debug "Browser: driver.current_memory: #{memory}"
|
|
53
54
|
end
|
|
54
55
|
end
|
|
@@ -62,7 +63,7 @@ module Capybara
|
|
|
62
63
|
begin
|
|
63
64
|
@driver.quit
|
|
64
65
|
# handle Net::ReadTimeout error for Selenium like drivers
|
|
65
|
-
rescue Net::ReadTimeout
|
|
66
|
+
rescue Net::ReadTimeout
|
|
66
67
|
@driver.quit
|
|
67
68
|
end
|
|
68
69
|
|
|
@@ -74,13 +75,8 @@ module Capybara
|
|
|
74
75
|
end
|
|
75
76
|
|
|
76
77
|
def restart!
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
@driver.requests, @driver.responses = 0, 0
|
|
80
|
-
else
|
|
81
|
-
destroy_driver!
|
|
82
|
-
driver
|
|
83
|
-
end
|
|
78
|
+
destroy_driver!
|
|
79
|
+
driver
|
|
84
80
|
|
|
85
81
|
logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
|
|
86
82
|
end
|
|
@@ -90,7 +86,7 @@ module Capybara
|
|
|
90
86
|
when :html
|
|
91
87
|
if config.encoding
|
|
92
88
|
if config.encoding == :auto
|
|
93
|
-
charset = body.force_encoding(
|
|
89
|
+
charset = body.force_encoding('ISO-8859-1').encode('UTF-8')[/<meta.*?charset="?([\w+\d+-]*)/i, 1]
|
|
94
90
|
Nokogiri::HTML(body, nil, charset)
|
|
95
91
|
else
|
|
96
92
|
Nokogiri::HTML(body, nil, config.encoding)
|
|
@@ -108,23 +104,22 @@ module Capybara
|
|
|
108
104
|
# Handy method to perform some processing in the new tab within block and then automatically close this tab:
|
|
109
105
|
# Usage (url):
|
|
110
106
|
# browser.within_new_window_by(url: "https://google.com") do
|
|
111
|
-
|
|
107
|
+
# do some stuff and then automatically close this tab and return back to the first tab
|
|
112
108
|
# end
|
|
113
109
|
# Usage (action) (when new tab opening by some action, for example by clicking
|
|
114
110
|
# on a particular element):
|
|
115
111
|
# action = -> { browser.find("//some/element/path").click }
|
|
116
112
|
# browser.within_new_window_by(action: action) do
|
|
117
|
-
|
|
113
|
+
# do some stuff and then automatically close this tab and return back to the first tab
|
|
118
114
|
# end
|
|
119
115
|
def within_new_window_by(action: nil, url: nil)
|
|
120
|
-
|
|
121
|
-
when action
|
|
116
|
+
if action
|
|
122
117
|
opened_window = window_opened_by { action.call }
|
|
123
118
|
within_window(opened_window) do
|
|
124
119
|
yield
|
|
125
120
|
current_window.close
|
|
126
121
|
end
|
|
127
|
-
|
|
122
|
+
elsif url
|
|
128
123
|
within_window(open_new_window) do
|
|
129
124
|
visit(url)
|
|
130
125
|
|
|
@@ -137,14 +132,14 @@ module Capybara
|
|
|
137
132
|
###
|
|
138
133
|
|
|
139
134
|
def scroll_to_bottom
|
|
140
|
-
execute_script(
|
|
135
|
+
execute_script('window.scrollBy(0,10000)')
|
|
141
136
|
end
|
|
142
137
|
|
|
143
138
|
private
|
|
144
139
|
|
|
145
140
|
def skip_error_on_failure?(e)
|
|
146
141
|
config.retry_request_errors.any? do |error|
|
|
147
|
-
error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.
|
|
142
|
+
error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.is_a?(Hash)
|
|
148
143
|
end
|
|
149
144
|
end
|
|
150
145
|
|
|
@@ -156,10 +151,10 @@ module Capybara
|
|
|
156
151
|
end
|
|
157
152
|
|
|
158
153
|
errors.any? do |error|
|
|
159
|
-
if error.
|
|
154
|
+
if error.is_a?(Hash)
|
|
160
155
|
match_class = e.class.ancestors.include?(error[:error])
|
|
161
156
|
if error[:message].present?
|
|
162
|
-
if error[:message].
|
|
157
|
+
if error[:message].is_a?(Regexp)
|
|
163
158
|
e.message&.match?(error[:message])
|
|
164
159
|
else
|
|
165
160
|
e.message&.include?(error[:message])
|
|
@@ -174,14 +169,14 @@ module Capybara
|
|
|
174
169
|
end
|
|
175
170
|
|
|
176
171
|
def process_delay(delay)
|
|
177
|
-
interval = (delay.
|
|
172
|
+
interval = (delay.instance_of?(Range) ? rand(delay) : delay)
|
|
178
173
|
logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
|
|
179
174
|
sleep interval
|
|
180
175
|
end
|
|
181
176
|
|
|
182
177
|
def check_request_options(url_to_visit)
|
|
183
178
|
# restart_if
|
|
184
|
-
if memory_limit = config.restart_if[:memory_limit]
|
|
179
|
+
if (memory_limit = config.restart_if[:memory_limit])
|
|
185
180
|
memory = driver.current_memory
|
|
186
181
|
if memory && memory >= memory_limit
|
|
187
182
|
logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
|
|
@@ -189,7 +184,7 @@ module Capybara
|
|
|
189
184
|
end
|
|
190
185
|
end
|
|
191
186
|
|
|
192
|
-
if requests_limit = config.restart_if[:requests_limit]
|
|
187
|
+
if (requests_limit = config.restart_if[:requests_limit])
|
|
193
188
|
requests = driver.requests
|
|
194
189
|
if requests >= requests_limit
|
|
195
190
|
logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
|
|
@@ -209,7 +204,7 @@ module Capybara
|
|
|
209
204
|
|
|
210
205
|
if config.before_request[:clear_cookies]
|
|
211
206
|
driver.clear_cookies
|
|
212
|
-
logger.debug
|
|
207
|
+
logger.debug 'Browser: cleared cookies before request'
|
|
213
208
|
end
|
|
214
209
|
|
|
215
210
|
if config.before_request[:clear_and_set_cookies]
|
|
@@ -217,29 +212,27 @@ module Capybara
|
|
|
217
212
|
|
|
218
213
|
# (Selenium only) if browser is not visited yet any page, visit url_to_visit
|
|
219
214
|
# first and then set cookies (needs after browser restart):
|
|
220
|
-
if driver.visited.nil? && mode.match?(/selenium/)
|
|
221
|
-
visit(url_to_visit, skip_request_options: true)
|
|
222
|
-
end
|
|
215
|
+
visit(url_to_visit, skip_request_options: true) if driver.visited.nil? && mode.match?(/selenium/)
|
|
223
216
|
|
|
224
217
|
config.cookies.each do |cookie|
|
|
225
218
|
driver.set_cookie(cookie[:name], cookie[:value], cookie)
|
|
226
219
|
end
|
|
227
220
|
|
|
228
|
-
logger.debug
|
|
221
|
+
logger.debug 'Browser: cleared and set cookies before request'
|
|
229
222
|
end
|
|
230
223
|
|
|
231
224
|
# user_agent
|
|
232
225
|
if config.before_request[:change_user_agent]
|
|
233
|
-
driver.add_header(
|
|
234
|
-
logger.debug
|
|
226
|
+
driver.add_header('User-Agent', config.user_agent.call)
|
|
227
|
+
logger.debug 'Browser: changed user_agent before request'
|
|
235
228
|
end
|
|
236
229
|
|
|
237
230
|
# proxy
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
231
|
+
return unless config.before_request[:change_proxy]
|
|
232
|
+
|
|
233
|
+
proxy_string = config.proxy.call
|
|
234
|
+
driver.set_proxy(*proxy_string.split(':'))
|
|
235
|
+
logger.debug 'Browser: changed proxy before request'
|
|
243
236
|
end
|
|
244
237
|
|
|
245
238
|
def logger
|
|
@@ -4,20 +4,20 @@ module Kimurai
|
|
|
4
4
|
include Thor::Actions
|
|
5
5
|
|
|
6
6
|
def self.source_root
|
|
7
|
-
File.dirname(File.expand_path(
|
|
7
|
+
File.dirname(File.expand_path(__dir__))
|
|
8
8
|
end
|
|
9
9
|
|
|
10
10
|
def generate_project(project_name)
|
|
11
|
-
directory
|
|
11
|
+
directory 'template', project_name
|
|
12
12
|
inside(project_name) do
|
|
13
|
-
run
|
|
14
|
-
run
|
|
13
|
+
run 'bundle install'
|
|
14
|
+
run 'git init'
|
|
15
15
|
end
|
|
16
16
|
end
|
|
17
17
|
|
|
18
18
|
def generate_spider(spider_name, in_project:)
|
|
19
19
|
spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
|
|
20
|
-
raise "Spider #{spider_path} already exists" if File.
|
|
20
|
+
raise "Spider #{spider_path} already exists" if File.exist? spider_path
|
|
21
21
|
|
|
22
22
|
spider_class = to_spider_class(spider_name)
|
|
23
23
|
create_file spider_path do
|
|
@@ -33,24 +33,24 @@ module Kimurai
|
|
|
33
33
|
RUBY
|
|
34
34
|
end
|
|
35
35
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
36
|
+
return if in_project
|
|
37
|
+
|
|
38
|
+
insert_into_file spider_path, " @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
|
|
39
|
+
prepend_to_file spider_path, "require 'kimurai'\n\n"
|
|
40
|
+
append_to_file spider_path, "\n#{spider_class}.crawl!"
|
|
41
41
|
end
|
|
42
42
|
|
|
43
43
|
def generate_schedule
|
|
44
|
-
copy_file
|
|
44
|
+
copy_file 'template/config/schedule.rb', './schedule.rb'
|
|
45
45
|
end
|
|
46
46
|
|
|
47
47
|
private
|
|
48
48
|
|
|
49
49
|
def to_spider_class(string)
|
|
50
|
-
string.sub(/^./) {
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
50
|
+
string.sub(/^./) { ::Regexp.last_match(0).capitalize }
|
|
51
|
+
.gsub(%r{(?:_|(/))([a-z\d]*)}) { "#{::Regexp.last_match(1)}#{::Regexp.last_match(2).capitalize}" }
|
|
52
|
+
.gsub(%r{(?:-|(/))([a-z\d]*)}) { "Dash#{::Regexp.last_match(2).capitalize}" }
|
|
53
|
+
.gsub(%r{(?:\.|(/))([a-z\d]*)}) { "#{::Regexp.last_match(1)}#{::Regexp.last_match(2).capitalize}" }
|
|
54
54
|
end
|
|
55
55
|
end
|
|
56
56
|
end
|
data/lib/kimurai/cli.rb
CHANGED
|
@@ -4,18 +4,22 @@ module Kimurai
|
|
|
4
4
|
class CLI < Thor
|
|
5
5
|
map %w[--version -v] => :__print_version
|
|
6
6
|
|
|
7
|
-
desc
|
|
7
|
+
desc 'new PROJECT_NAME', 'Create a new Kimurai project'
|
|
8
|
+
def new(project_name)
|
|
9
|
+
raise 'Provide project name to generate a new project' unless project_name.present?
|
|
10
|
+
|
|
11
|
+
Generator.new.generate_project(project_name)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
desc 'generate', 'Generator, available types: spider, schedule'
|
|
8
15
|
def generate(generator_type, *args)
|
|
9
16
|
case generator_type
|
|
10
|
-
when
|
|
11
|
-
project_name = args.shift
|
|
12
|
-
raise "Provide project name to generate a new project" unless project_name.present?
|
|
13
|
-
Generator.new.generate_project(project_name)
|
|
14
|
-
when "spider"
|
|
17
|
+
when 'spider'
|
|
15
18
|
spider_name = args.shift
|
|
16
|
-
raise
|
|
19
|
+
raise 'Provide spider name to generate a spider' unless spider_name.present?
|
|
20
|
+
|
|
17
21
|
Generator.new.generate_spider(spider_name, in_project: inside_project?)
|
|
18
|
-
when
|
|
22
|
+
when 'schedule'
|
|
19
23
|
Generator.new.generate_schedule
|
|
20
24
|
else
|
|
21
25
|
raise "Don't know this generator type: #{generator_type}"
|
|
@@ -24,82 +28,43 @@ module Kimurai
|
|
|
24
28
|
|
|
25
29
|
###
|
|
26
30
|
|
|
27
|
-
desc
|
|
28
|
-
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
|
|
29
|
-
option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
|
|
30
|
-
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
|
|
31
|
-
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
|
|
32
|
-
option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
|
|
33
|
-
def setup(user_host)
|
|
34
|
-
command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get
|
|
35
|
-
|
|
36
|
-
pid = spawn *command
|
|
37
|
-
Process.wait pid
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
desc "deploy", "Deploy project to the server and update cron schedule"
|
|
41
|
-
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
|
|
42
|
-
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
|
|
43
|
-
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
|
|
44
|
-
option "repo-url", type: :string, banner: "Repo url"
|
|
45
|
-
option "repo-key-path", type: :string, banner: "SSH key for a git repo"
|
|
46
|
-
def deploy(user_host)
|
|
47
|
-
if !`git status --short`.empty?
|
|
48
|
-
raise "Deploy: Please commit your changes first"
|
|
49
|
-
elsif `git remote`.empty?
|
|
50
|
-
raise "Deploy: Please add remote origin repository to your repo first"
|
|
51
|
-
elsif !`git rev-list master...origin/master`.empty?
|
|
52
|
-
raise "Deploy: Please push your commits to the remote origin repo first"
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
repo_url = options["repo-url"] ? options["repo-url"] : `git remote get-url origin`.strip
|
|
56
|
-
repo_name = repo_url[/\/([^\/]*)\.git/i, 1]
|
|
57
|
-
|
|
58
|
-
command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy",
|
|
59
|
-
vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
|
|
60
|
-
).get
|
|
61
|
-
|
|
62
|
-
pid = spawn *command
|
|
63
|
-
Process.wait pid
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
###
|
|
67
|
-
|
|
68
|
-
desc "crawl", "Run a particular spider by it's name"
|
|
31
|
+
desc 'crawl', "Run a particular spider by it's name"
|
|
69
32
|
def crawl(spider_name)
|
|
70
33
|
raise "Can't find Kimurai project" unless inside_project?
|
|
34
|
+
|
|
71
35
|
require './config/boot'
|
|
72
36
|
|
|
73
|
-
unless klass = Kimurai.find_by_name(spider_name)
|
|
37
|
+
unless (klass = Kimurai.find_by_name(spider_name))
|
|
74
38
|
raise "Can't find spider with name `#{spider_name}` in the project. " \
|
|
75
|
-
|
|
39
|
+
'To list all available spiders, run: `$ bundle exec kimurai list`'
|
|
76
40
|
end
|
|
77
41
|
|
|
78
42
|
# Set time_zone if exists
|
|
79
|
-
if time_zone = Kimurai.configuration.time_zone
|
|
43
|
+
if (time_zone = Kimurai.configuration.time_zone)
|
|
80
44
|
Kimurai.time_zone = time_zone
|
|
81
45
|
end
|
|
82
46
|
|
|
83
47
|
klass.crawl!
|
|
84
48
|
end
|
|
85
49
|
|
|
86
|
-
desc
|
|
87
|
-
option :url, type: :string, required: true, banner:
|
|
50
|
+
desc 'parse', 'Parse url in the particular spider method'
|
|
51
|
+
option :url, type: :string, required: true, banner: 'Url to pass to the method'
|
|
88
52
|
def parse(spider_name, method_name)
|
|
89
53
|
raise "Can't find Kimurai project" unless inside_project?
|
|
54
|
+
|
|
90
55
|
require './config/boot'
|
|
91
56
|
|
|
92
|
-
unless klass = Kimurai.find_by_name(spider_name)
|
|
57
|
+
unless (klass = Kimurai.find_by_name(spider_name))
|
|
93
58
|
raise "Can't find spider with name `#{spider_name}` in the project. " \
|
|
94
|
-
|
|
59
|
+
'To list all available spiders, run: `$ bundle exec kimurai list`'
|
|
95
60
|
end
|
|
96
61
|
|
|
97
|
-
klass.parse!(method_name, url: options[
|
|
62
|
+
klass.parse!(method_name, url: options['url'])
|
|
98
63
|
end
|
|
99
64
|
|
|
100
|
-
desc
|
|
101
|
-
option :engine, type: :string, banner:
|
|
102
|
-
option :url, type: :string, banner:
|
|
65
|
+
desc 'console', 'Start Kimurai console'
|
|
66
|
+
option :engine, type: :string, banner: 'Engine to use'
|
|
67
|
+
option :url, type: :string, banner: 'Url to process'
|
|
103
68
|
def console(spider_name = nil)
|
|
104
69
|
require 'pry'
|
|
105
70
|
require './config/boot' if inside_project?
|
|
@@ -107,74 +72,72 @@ module Kimurai
|
|
|
107
72
|
if spider_name
|
|
108
73
|
raise "Can't find Kimurai project" unless inside_project?
|
|
109
74
|
|
|
110
|
-
unless klass = Kimurai.find_by_name(spider_name)
|
|
75
|
+
unless (klass = Kimurai.find_by_name(spider_name))
|
|
111
76
|
raise "Can't find spider with name `#{spider_name}` in the project. " \
|
|
112
|
-
|
|
77
|
+
'To list all available spiders, run: `$ bundle exec kimurai list`'
|
|
113
78
|
end
|
|
114
79
|
else
|
|
115
80
|
klass = inside_project? ? ApplicationSpider : ::Kimurai::Base
|
|
116
81
|
end
|
|
117
82
|
|
|
118
|
-
engine = options[
|
|
119
|
-
if
|
|
120
|
-
klass.new(engine).request_to(:console, url: options[
|
|
83
|
+
engine = options['engine']&.delete(':')&.to_sym
|
|
84
|
+
if options['url']
|
|
85
|
+
klass.new(engine).request_to(:console, url: options['url'])
|
|
121
86
|
else
|
|
122
87
|
klass.new(engine).public_send(:console)
|
|
123
88
|
end
|
|
124
89
|
end
|
|
125
90
|
|
|
126
|
-
desc
|
|
91
|
+
desc 'list', 'List all available spiders in the current project'
|
|
127
92
|
def list
|
|
128
93
|
raise "Can't find Kimurai project" unless inside_project?
|
|
94
|
+
|
|
129
95
|
require './config/boot'
|
|
130
96
|
|
|
131
|
-
Kimurai.list.keys.each { |name| puts name }
|
|
97
|
+
Kimurai.list.keys.sort.each { |name| puts name }
|
|
132
98
|
end
|
|
133
99
|
|
|
134
|
-
desc
|
|
135
|
-
option :include, type: :array, default: [], banner:
|
|
136
|
-
option :exclude, type: :array, default: [], banner:
|
|
137
|
-
option :jobs, aliases: :j, type: :numeric, default: 1, banner:
|
|
100
|
+
desc 'runner', 'Run all spiders in the project in queue'
|
|
101
|
+
option :include, type: :array, default: [], banner: 'List of spiders to run'
|
|
102
|
+
option :exclude, type: :array, default: [], banner: 'List of spiders to exclude from run'
|
|
103
|
+
option :jobs, aliases: :j, type: :numeric, default: 1, banner: 'The number of concurrent jobs'
|
|
138
104
|
def runner
|
|
139
105
|
raise "Can't find Kimurai project" unless inside_project?
|
|
140
106
|
|
|
141
|
-
jobs = options[
|
|
142
|
-
raise "Jobs count can't be 0" if jobs
|
|
107
|
+
jobs = options['jobs']
|
|
108
|
+
raise "Jobs count can't be 0" if jobs.zero?
|
|
143
109
|
|
|
144
110
|
require './config/boot'
|
|
145
111
|
require 'kimurai/runner'
|
|
146
112
|
|
|
147
|
-
spiders = options[
|
|
148
|
-
spiders -= options[
|
|
113
|
+
spiders = options['include'].presence || Kimurai.list.keys
|
|
114
|
+
spiders -= options['exclude']
|
|
149
115
|
|
|
150
116
|
Runner.new(spiders, jobs).run!
|
|
151
117
|
end
|
|
152
118
|
|
|
153
|
-
desc
|
|
119
|
+
desc '--version, -v', 'Print the version'
|
|
154
120
|
def __print_version
|
|
155
121
|
puts VERSION
|
|
156
122
|
end
|
|
157
123
|
|
|
158
|
-
desc
|
|
124
|
+
desc 'dashboard', 'Run dashboard'
|
|
159
125
|
def dashboard
|
|
160
126
|
raise "Can't find Kimurai project" unless inside_project?
|
|
161
127
|
|
|
162
128
|
require './config/boot'
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
raise "Kimurai::Dashboard is not defined"
|
|
168
|
-
end
|
|
129
|
+
raise 'Kimurai::Dashboard is not defined' unless Object.const_defined?('Kimurai::Dashboard')
|
|
130
|
+
|
|
131
|
+
require 'kimurai/dashboard/app'
|
|
132
|
+
Kimurai::Dashboard::App.run!
|
|
169
133
|
end
|
|
170
134
|
|
|
171
135
|
private
|
|
172
136
|
|
|
173
137
|
def inside_project?
|
|
174
|
-
Dir.
|
|
138
|
+
Dir.exist?('spiders') && File.exist?('./config/boot.rb')
|
|
175
139
|
end
|
|
176
140
|
end
|
|
177
141
|
end
|
|
178
142
|
|
|
179
143
|
require_relative 'cli/generator'
|
|
180
|
-
require_relative 'cli/ansible_command_builder'
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
class Array
|
|
2
2
|
def in_sorted_groups(number, fill_width = nil)
|
|
3
|
-
sorted_groups = Array.new(number) { |
|
|
3
|
+
sorted_groups = Array.new(number) { |_a| [] }
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
in_groups_of(number, fill_width).each do |group|
|
|
6
6
|
number.times do |i|
|
|
7
7
|
group.fetch(i) rescue next
|
|
8
8
|
sorted_groups[i] << group[i]
|
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
class Numeric
|
|
2
2
|
# https://stackoverflow.com/a/1679963
|
|
3
3
|
def duration
|
|
4
|
-
secs =
|
|
4
|
+
secs = to_int
|
|
5
5
|
mins = secs / 60
|
|
6
6
|
hours = mins / 60
|
|
7
7
|
days = hours / 24
|
|
8
8
|
|
|
9
|
-
if days
|
|
9
|
+
if days.positive?
|
|
10
10
|
"#{days}d, #{hours % 24}h"
|
|
11
|
-
elsif hours
|
|
11
|
+
elsif hours.positive?
|
|
12
12
|
"#{hours}h, #{mins % 60}m"
|
|
13
|
-
elsif mins
|
|
13
|
+
elsif mins.positive?
|
|
14
14
|
"#{mins}m, #{secs % 60}s"
|
|
15
15
|
elsif secs >= 0
|
|
16
16
|
"#{secs}s"
|
data/lib/kimurai/pipeline.rb
CHANGED
data/lib/kimurai/runner.rb
CHANGED
|
@@ -19,17 +19,17 @@ module Kimurai
|
|
|
19
19
|
spiders: @spiders
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
-
if time_zone = Kimurai.configuration.time_zone
|
|
22
|
+
if (time_zone = Kimurai.configuration.time_zone)
|
|
23
23
|
Kimurai.time_zone = time_zone
|
|
24
24
|
end
|
|
25
25
|
|
|
26
|
-
ENV.store(
|
|
27
|
-
ENV.store(
|
|
26
|
+
ENV.store('SESSION_ID', @start_time.to_i.to_s)
|
|
27
|
+
ENV.store('RBCAT_COLORIZER', 'false')
|
|
28
28
|
end
|
|
29
29
|
|
|
30
30
|
def run!(exception_on_fail: true)
|
|
31
31
|
puts ">>> Runner: started: #{session_info}"
|
|
32
|
-
if at_start_callback = Kimurai.configuration.runner_at_start_callback
|
|
32
|
+
if (at_start_callback = Kimurai.configuration.runner_at_start_callback)
|
|
33
33
|
at_start_callback.call(session_info)
|
|
34
34
|
end
|
|
35
35
|
|
|
@@ -38,7 +38,7 @@ module Kimurai
|
|
|
38
38
|
next unless running
|
|
39
39
|
|
|
40
40
|
puts "> Runner: started spider: #{spider}, index: #{i}"
|
|
41
|
-
pid = spawn(
|
|
41
|
+
pid = spawn('bundle', 'exec', 'kimurai', 'crawl', spider, %i[out err] => "log/#{spider}.log")
|
|
42
42
|
Process.wait pid
|
|
43
43
|
|
|
44
44
|
puts "< Runner: stopped spider: #{spider}, index: #{i}"
|
|
@@ -51,7 +51,7 @@ module Kimurai
|
|
|
51
51
|
else
|
|
52
52
|
session_info.merge!(status: :completed, stop_time: Time.now)
|
|
53
53
|
ensure
|
|
54
|
-
if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
|
|
54
|
+
if (at_stop_callback = Kimurai.configuration.runner_at_stop_callback)
|
|
55
55
|
at_stop_callback.call(session_info)
|
|
56
56
|
end
|
|
57
57
|
puts "<<< Runner: stopped: #{session_info}"
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
source 'https://rubygems.org'
|
|
2
2
|
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
|
|
3
3
|
|
|
4
|
-
ruby '>=
|
|
4
|
+
ruby '>= 3.1'
|
|
5
5
|
|
|
6
6
|
# Framework
|
|
7
|
-
gem 'kimurai', '~>
|
|
7
|
+
gem 'kimurai', '~> 2.0'
|
|
8
8
|
|
|
9
9
|
# Require files in directory and child directories recursively
|
|
10
10
|
gem 'require_all'
|