kimurai_dynamic 1.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +111 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/kimurai +6 -0
- data/kimurai.gemspec +48 -0
- data/lib/kimurai/automation/deploy.yml +54 -0
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
- data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
- data/lib/kimurai/automation/setup.yml +45 -0
- data/lib/kimurai/base/saver.rb +106 -0
- data/lib/kimurai/base/storage.rb +54 -0
- data/lib/kimurai/base.rb +330 -0
- data/lib/kimurai/base_helper.rb +22 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/kimurai/browser_builder.rb +20 -0
- data/lib/kimurai/capybara_configuration.rb +10 -0
- data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/kimurai/capybara_ext/session/config.rb +22 -0
- data/lib/kimurai/capybara_ext/session.rb +249 -0
- data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
- data/lib/kimurai/cli/generator.rb +57 -0
- data/lib/kimurai/cli.rb +183 -0
- data/lib/kimurai/core_ext/array.rb +14 -0
- data/lib/kimurai/core_ext/hash.rb +5 -0
- data/lib/kimurai/core_ext/numeric.rb +19 -0
- data/lib/kimurai/core_ext/string.rb +7 -0
- data/lib/kimurai/pipeline.rb +33 -0
- data/lib/kimurai/runner.rb +60 -0
- data/lib/kimurai/template/.gitignore +18 -0
- data/lib/kimurai/template/Gemfile +28 -0
- data/lib/kimurai/template/README.md +3 -0
- data/lib/kimurai/template/config/application.rb +37 -0
- data/lib/kimurai/template/config/automation.yml +13 -0
- data/lib/kimurai/template/config/boot.rb +22 -0
- data/lib/kimurai/template/config/initializers/.keep +0 -0
- data/lib/kimurai/template/config/schedule.rb +57 -0
- data/lib/kimurai/template/db/.keep +0 -0
- data/lib/kimurai/template/helpers/application_helper.rb +3 -0
- data/lib/kimurai/template/lib/.keep +0 -0
- data/lib/kimurai/template/log/.keep +0 -0
- data/lib/kimurai/template/pipelines/saver.rb +11 -0
- data/lib/kimurai/template/pipelines/validator.rb +24 -0
- data/lib/kimurai/template/spiders/application_spider.rb +143 -0
- data/lib/kimurai/template/tmp/.keep +0 -0
- data/lib/kimurai/version.rb +3 -0
- data/lib/kimurai.rb +54 -0
- metadata +349 -0
@@ -0,0 +1,249 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'json'
|
4
|
+
require_relative 'session/config'
|
5
|
+
|
6
|
+
module Capybara
|
7
|
+
class Session
|
8
|
+
attr_accessor :spider
|
9
|
+
|
10
|
+
alias_method :original_visit, :visit
|
11
|
+
# Visits +visit_uri+. Without an attached spider this is the plain Capybara
# visit. With a spider it wraps the original visit with a pre-request delay,
# request options, request/response counters, logging and skip/retry handling.
#
# @param visit_uri [String] address handed to the original visit
# @param delay [Numeric, Range, nil] passed to process_delay when truthy;
#   defaults to config.before_request[:delay]
# @param skip_request_options [Boolean] when true, check_request_options is not run
# @param max_retries [Integer] how many times a retryable error is retried
# @return [true] when the request finished without raising (spider attached)
# @return [false] when the error matched config.skip_request_errors
# @return [nil] when retries ran out and the error is flagged :skip_on_failure
# @raise [StandardError] an error matching neither list, or a retryable error
#   that ran out of retries and is not flagged :skip_on_failure
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
  if spider
    process_delay(delay) if delay
    retries, sleep_interval = 0, 0

    begin
      check_request_options(visit_uri) unless skip_request_options
      driver.requests += 1 and logger.info "Browser: started get request to: #{visit_uri}"
      spider.class.update(:visits, :requests) if spider.with_info

      original_visit(visit_uri)
    rescue => e
      if match_error?(e, type: :to_skip)
        # Skippable error: log it, record the event and report failure to the caller.
        logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
        spider.add_event(:requests_errors, e.inspect) if spider.with_info
        false
      elsif match_error?(e, type: :to_retry)
        logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}"
        spider.add_event(:requests_errors, e.inspect) if spider.with_info

        if (retries += 1) <= max_retries
          # The pause grows by 15 seconds on every attempt (15, 30, 45, ...).
          logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
          # `retry` re-enters the begin block, so request options and the
          # requests counter are processed again for the new attempt.
          sleep sleep_interval and retry
        else
          logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
          # Evaluates to nil (no raise) when the error is flagged :skip_on_failure.
          raise e unless skip_error_on_failure?(e)
        end
      else
        raise e
      end
    else
      # Reached only when the begin block finished without raising.
      driver.responses += 1 and logger.info "Browser: finished get request to: #{visit_uri}"
      spider.class.update(:visits, :responses) if spider.with_info
      driver.visited = true unless driver.visited
      true
    ensure
      if spider.with_info
        logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
      end

      if memory = driver.current_memory
        logger.debug "Browser: driver.current_memory: #{memory}"
      end
    end
  else
    original_visit(visit_uri)
  end
end
|
59
|
+
|
60
|
+
# Quits the current driver and clears @driver. When no driver is present only
# a warning is logged.
#
# @return [Object] whatever the logger call returns
def destroy_driver!
  return logger.warn("Browser: driver #{mode} is not present") unless @driver

  begin
    @driver.quit
  rescue Net::ReadTimeout
    # handle Net::ReadTimeout error for Selenium like drivers:
    # give the driver exactly one more chance to quit
    @driver.quit
  end

  @driver = nil
  logger.info("Browser: driver #{mode} has been destroyed")
end
|
75
|
+
|
76
|
+
# Restarts the browser. Poltergeist engines restart the browser in place and
# reset the request/response counters; every other engine has its driver
# destroyed and then accessed again through `driver`.
#
# @return [Object] whatever the logger call returns
def restart!
  in_place = mode.match?(/poltergeist/)

  if in_place
    @driver.browser.restart
    @driver.requests = 0
    @driver.responses = 0
  else
    destroy_driver!
    # NOTE(review): presumably the `driver` accessor lazily builds a new driver — confirm.
    driver
  end

  current = driver
  logger.info("Browser: driver has been restarted: name: #{mode}, pid: #{current.pid}, port: #{current.port}")
end
|
87
|
+
|
88
|
+
# Parses the current page body.
#
# @param response_type [Symbol] :html (default) or :json
# @return [Nokogiri::HTML::Document] for :html, honouring config.encoding
#   (:auto takes the charset from the page's <meta> tag)
# @return [Object] the JSON.parse result for :json
# @return [nil] for any other response_type
def current_response(response_type = :html)
  case response_type
  when :json
    JSON.parse(body)
  when :html
    declared = config.encoding
    return Nokogiri::HTML(body) unless declared
    return Nokogiri::HTML(body, nil, declared) unless declared == :auto

    # Read the markup as ISO-8859-1 so the charset lookup cannot fail on invalid bytes.
    sniffed = body.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["]?([\w+\d+\-]*)/i, 1]
    Nokogiri::HTML(body, nil, sniffed)
  end
end
|
105
|
+
|
106
|
+
###
|
107
|
+
|
108
|
+
# Runs the given block inside a newly opened window (tab) and always closes
# that window afterwards, even when the block (or the initial visit) raises.
#
# Usage (url):
#   browser.within_new_window_by(url: "https://google.com") do
#     # do some stuff; the tab is closed automatically afterwards
#   end
#
# Usage (action), for when the new tab is opened by some action, for example
# by clicking on a particular element:
#   action = -> { browser.find("//some/element/path").click }
#   browser.within_new_window_by(action: action) do
#     # do some stuff; the tab is closed automatically afterwards
#   end
#
# @param action [#call, nil] callable that causes a new window to open;
#   takes precedence over +url+ when both are given
# @param url [String, nil] address to visit in a window opened via open_new_window
# @return [Object, nil] the block's value, or nil when neither +action+ nor
#   +url+ is given (the block is not run in that case)
def within_new_window_by(action: nil, url: nil)
  return unless action || url

  opened_by_action = action ? true : false
  window = opened_by_action ? window_opened_by { action.call } : open_new_window

  within_window(window) do
    begin
      visit(url) unless opened_by_action
      yield
    ensure
      # Close the tab on every exit path so a failing block cannot leak windows.
      current_window.close
    end
  end
end
|
136
|
+
|
137
|
+
###
|
138
|
+
|
139
|
+
# Scrolls the current window down by 10000 pixels via JavaScript.
#
# @return [Object] whatever execute_script returns
def scroll_to_bottom
  script = "window.scrollBy(0,10000)"
  execute_script(script)
end
|
142
|
+
|
143
|
+
private
|
144
|
+
|
145
|
+
# Tells whether +e+ may be swallowed once all retries are used up: true when a
# Hash entry of config.retry_request_errors has a truthy :skip_on_failure and
# its :error appears among the ancestors of e's class.
#
# @param e [Exception]
# @return [Boolean]
def skip_error_on_failure?(e)
  ancestry = e.class.ancestors

  config.retry_request_errors.any? do |entry|
    # Entries that are not hashes carry no :skip_on_failure flag.
    next false unless entry.kind_of?(Hash)

    entry[:skip_on_failure] && ancestry.include?(entry[:error])
  end
end
|
150
|
+
|
151
|
+
# Checks +e+ against one of the configured error lists.
#
# A list entry is either an error class, or a Hash with :error (class) and an
# optional :message (String or Regexp) that must also match e.message.
#
# @param e [Exception]
# @param type [Symbol] :to_retry (config.retry_request_errors) or
#   :to_skip (config.skip_request_errors)
# @return [Boolean]
def match_error?(e, type:)
  candidates =
    case type
    when :to_skip then config.skip_request_errors
    when :to_retry then config.retry_request_errors
    end

  ancestry = e.class.ancestors

  candidates.any? do |candidate|
    next ancestry.include?(candidate) unless candidate.kind_of?(Hash)

    class_matches = ancestry.include?(candidate[:error])
    expected = candidate[:message]
    next class_matches unless expected.present?

    message_matches =
      if expected.kind_of?(Regexp)
        e.message&.match?(expected)
      else
        e.message&.include?(expected)
      end

    message_matches && class_matches
  end
end
|
175
|
+
|
176
|
+
# Sleeps before a request and logs the chosen pause.
#
# @param delay [Numeric, Range] a fixed pause in seconds, or a Range from
#   which a random value is drawn
# @return [Object] whatever sleep returns
def process_delay(delay)
  interval = delay
  interval = rand(delay) if delay.class == Range

  unit = 'second'.pluralize(interval)
  logger.debug "Browser: sleep #{interval.round(2)} #{unit} before request..."
  sleep interval
end
|
181
|
+
|
182
|
+
# Applies the per-request options from the session config before a request:
# restarts the browser when a memory or requests limit is reached, then
# handles cookies, user agent and proxy according to config.before_request.
#
# @param url_to_visit [String] the address about to be requested; Selenium
#   engines visit it first (with request options skipped) when cookies have to
#   be set on a browser that has not visited any page yet
def check_request_options(url_to_visit)
  # restart_if
  if memory_limit = config.restart_if[:memory_limit]
    memory = driver.current_memory
    # current_memory may be nil, in which case the limit is not checked
    if memory && memory >= memory_limit
      logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  if requests_limit = config.restart_if[:requests_limit]
    requests = driver.requests
    if requests >= requests_limit
      logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  # cookies
  # (Selenium only) if config.cookies present and browser was just created,
  # visit url_to_visit first and only then set cookies:
  if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
    # skip_request_options: true keeps this nested visit from re-entering this method
    visit(url_to_visit, skip_request_options: true)
    config.cookies.each do |cookie|
      driver.set_cookie(cookie[:name], cookie[:value], cookie)
    end
  end

  if config.before_request[:clear_cookies]
    driver.clear_cookies
    logger.debug "Browser: cleared cookies before request"
  end

  if config.before_request[:clear_and_set_cookies]
    driver.clear_cookies

    # (Selenium only) if browser is not visited yet any page, visit url_to_visit
    # first and then set cookies (needs after browser restart):
    if driver.visited.nil? && mode.match?(/selenium/)
      visit(url_to_visit, skip_request_options: true)
    end

    # NOTE(review): unlike the branch above, config.cookies is not checked for
    # nil here — confirm it is always set when clear_and_set_cookies is enabled.
    config.cookies.each do |cookie|
      driver.set_cookie(cookie[:name], cookie[:value], cookie)
    end

    logger.debug "Browser: cleared and set cookies before request"
  end

  # user_agent
  if config.before_request[:change_user_agent]
    # config.user_agent is called for every request to get the value to send
    driver.add_header("User-Agent", config.user_agent.call)
    logger.debug "Browser: changed user_agent before request"
  end

  # proxy
  if config.before_request[:change_proxy]
    proxy_string = config.proxy.call
    # the proxy string is split on ":" and its parts become set_proxy arguments
    driver.set_proxy(*proxy_string.split(":"))
    logger.debug "Browser: changed proxy before request"
  end
end
|
244
|
+
|
245
|
+
# Logger of the spider this session is attached to.
#
# @return [Object] spider.logger
def logger
  owner = spider
  owner.logger
end
|
248
|
+
end
|
249
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'cliver'
|
2
|
+
|
3
|
+
module Kimurai
|
4
|
+
class CLI
|
5
|
+
# Builds the `ansible-playbook` command line (as an argument array) for one of
# the playbooks shipped with the gem.
class AnsibleCommandBuilder
  # Optional project file; its section named after the playbook supplies
  # default extra vars.
  AUTOMATION_CONFIG_PATH = "config/automation.yml"

  # @param user_host [String] "user@host" or just "host"
  # @param options [Hash] CLI options, read with string keys ("port", "local",
  #   "ask-sudo", "ask-auth-pass", "ssh-key-path")
  # @param playbook [String] playbook name without the .yml extension
  # @param vars [Hash] extra vars passed to the playbook; blank values are skipped
  def initialize(user_host, options, playbook:, vars: {})
    @user_host = user_host
    @options = options
    @playbook = playbook
    @vars = vars
  end

  # @return [Array<String>] the command and its arguments
  # @raise [RuntimeError] when `ansible-playbook` is not found, or `sshpass` is
  #   not found while password authentication is requested
  def get
    unless Cliver.detect("ansible-playbook")
      raise "Can't find `ansible-playbook` executable, to install: " \
        "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
    end

    user = @user_host[/(.*?)\@/, 1]
    host = @user_host[/\@(.+)/, 1] || @user_host
    inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

    gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
    playbook_path = "#{gem_dir}/lib/kimurai/automation/#{@playbook}.yml"

    command = [
      "ansible-playbook", playbook_path,
      "--inventory", inventory,
      "--ssh-extra-args", "-oForwardAgent=yes",
      "--connection", @options["local"] ? "local" : "smart",
      "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
    ]

    extra_vars.each do |key, value|
      next unless value.present?
      command.push "--extra-vars", "#{key}=#{value}"
    end

    command.push "--user", user if user
    command.push "--ask-become-pass" if @options["ask-sudo"]

    if @options["ask-auth-pass"]
      unless Cliver.detect("sshpass")
        raise "Can't find `sshpass` executable for password authentication, to install: " \
          "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
      end

      command.push "--ask-pass"
    end

    ssh_key_path = @options["ssh-key-path"]
    command.push "--private-key", ssh_key_path if ssh_key_path

    command
  end

  private

  # Explicit vars merged with the defaults from the automation config file.
  # Works on a copy so the hash given to the constructor is not modified.
  def extra_vars
    merged = @vars.dup

    config_defaults.each do |key, value|
      # YAML keys are strings while callers pass symbol keys: check both forms
      # so a config default never overrides an explicitly given value.
      next if merged[key] || merged[key.to_s] || (key.respond_to?(:to_sym) && merged[key.to_sym])

      merged[key] = value
    end

    merged
  end

  # Section of the automation config file for the current playbook, or an
  # empty hash when the file, the section, or a hash structure is missing.
  def config_defaults
    return {} unless File.exist?(AUTOMATION_CONFIG_PATH)

    require 'yaml'
    loaded = YAML.load_file(AUTOMATION_CONFIG_PATH)
    section = loaded.is_a?(Hash) ? loaded[@playbook] : nil
    section.is_a?(Hash) ? section : {}
  end
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module Kimurai
|
2
|
+
class CLI
|
3
|
+
class Generator < Thor::Group
|
4
|
+
include Thor::Actions
|
5
|
+
|
6
|
+
# Root directory for Thor source paths: the directory two levels above this
# file (the parent of the directory that contains it).
#
# @return [String] absolute path
def self.source_root
  File.expand_path('../..', __FILE__)
end
|
9
|
+
|
10
|
+
# Copies the "template" directory to +project_name+ and then, inside the new
# directory, runs `bundle install` followed by `git init`.
#
# @param project_name [String] destination directory
def generate_project(project_name)
  directory("template", project_name)

  inside(project_name) do
    ["bundle install", "git init"].map { |shell_command| run(shell_command) }.last
  end
end
|
17
|
+
|
18
|
+
# Creates a spider file from an inline template.
#
# Inside a project the file goes to spiders/<name>.rb and the class inherits
# from ApplicationSpider. Outside a project the file goes to ./<name>.rb,
# inherits from Kimurai::Base and is made self-contained (requires kimurai,
# sets the mechanize engine, calls crawl! at the end).
#
# @param spider_name [String] spider name, also used for the file name
# @param in_project [Boolean] whether the spider is generated inside a project
# @raise [RuntimeError] when the target file already exists
def generate_spider(spider_name, in_project:)
  spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
  raise "Spider #{spider_path} already exists" if File.exist?(spider_path)

  spider_class = to_spider_class(spider_name)
  create_file spider_path do
    <<~RUBY
      class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Kimurai::Base'}
        @name = "#{spider_name}"
        @start_urls = []
        @config = {}

        def parse(response, url:, data: {})
        end
      end
    RUBY
  end

  unless in_project
    # The inserted line is indented to line up with the @name line above it.
    insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
    prepend_to_file spider_path, "require 'kimurai'\n\n"
    append_to_file spider_path, "\n#{spider_class}.crawl!"
  end
end
|
42
|
+
|
43
|
+
# Copies the schedule template into the current directory as ./schedule.rb.
def generate_schedule
  source, destination = "template/config/schedule.rb", "./schedule.rb"
  copy_file(source, destination)
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
# Turns a spider name into a class name: the first character is capitalized,
# "_" and "." separators are dropped with the following word capitalized, and
# "-" becomes "Dash" followed by the capitalized word.
#
# @param string [String] spider name, e.g. "example_spider"
# @return [String] class name, e.g. "ExampleSpider"
def to_spider_class(string)
  name = string.sub(/^./) { |first| first.capitalize }
  name = name.gsub(/(?:_|(\/))([a-z\d]*)/) { "#{Regexp.last_match(1)}#{Regexp.last_match(2).capitalize}" }
  name = name.gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{Regexp.last_match(2).capitalize}" }
  name.gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{Regexp.last_match(1)}#{Regexp.last_match(2).capitalize}" }
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
data/lib/kimurai/cli.rb
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
require 'thor'
|
2
|
+
|
3
|
+
module Kimurai
|
4
|
+
class CLI < Thor
|
5
|
+
map %w[--version -v] => :__print_version
|
6
|
+
|
7
|
+
desc "generate", "Generator, available types: project, spider, schedule"
|
8
|
+
# Dispatches to the generator for the requested type.
#
# @param generator_type [String] "project", "spider" or "schedule"
# @param args [Array<String>] first element is the project/spider name
# @raise [RuntimeError] when the name is missing or the type is unknown
def generate(generator_type, *args)
  target_name = args.first

  if generator_type == "project"
    raise "Provide project name to generate a new project" unless target_name.present?
    Generator.new.generate_project(target_name)
  elsif generator_type == "spider"
    raise "Provide spider name to generate a spider" unless target_name.present?
    Generator.new.generate_spider(target_name, in_project: inside_project?)
  elsif generator_type == "schedule"
    Generator.new.generate_schedule
  else
    raise "Don't know this generator type: #{generator_type}"
  end
end
|
24
|
+
|
25
|
+
###
|
26
|
+
|
27
|
+
desc "setup", "Setup server"
|
28
|
+
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
|
29
|
+
option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
|
30
|
+
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
|
31
|
+
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
|
32
|
+
option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
|
33
|
+
# Runs the "setup" playbook against +user_host+ and waits for it to finish.
#
# @param user_host [String] "user@host" or "host"
def setup(user_host)
  builder = AnsibleCommandBuilder.new(user_host, options, playbook: "setup")
  Process.wait(spawn(*builder.get))
end
|
39
|
+
|
40
|
+
desc "deploy", "Deploy project to the server and update cron schedule"
|
41
|
+
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
|
42
|
+
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
|
43
|
+
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
|
44
|
+
option "repo-url", type: :string, banner: "Repo url"
|
45
|
+
option "repo-key-path", type: :string, banner: "SSH key for a git repo"
|
46
|
+
option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks"
|
47
|
+
# Runs the "deploy" playbook against +user_host+ and waits for it to finish.
#
# Unless the "skip-check" option is set, the local git repository must have no
# uncommitted changes, at least one remote, and no difference between master
# and origin/master.
#
# @param user_host [String] "user@host" or "host"
# @raise [RuntimeError] when one of the repository checks fails
def deploy(user_host)
  unless options["skip-check"]
    raise "Deploy: Please commit your changes first" unless `git status --short`.empty?
    raise "Deploy: Please add remote origin repository to your repo first" if `git remote`.empty?
    raise "Deploy: Please push your commits to the remote origin repo first" unless `git rev-list master...origin/master`.empty?
  end

  repo_url = options["repo-url"] || `git remote get-url origin`.strip
  # Last path segment of the url without the ".git" suffix (nil when the url has no such suffix).
  repo_name = repo_url[/\/([^\/]*)\.git/i, 1]

  deploy_vars = { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
  command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy", vars: deploy_vars).get

  Process.wait(spawn(*command))
end
|
68
|
+
|
69
|
+
###
|
70
|
+
|
71
|
+
# Runs a single spider of the current project.
#
# Loads ./config/boot, looks the spider up by name, applies the configured
# time zone (when Kimurai.configuration.time_zone is set) and starts the crawl.
#
# @param spider_name [String] name of the spider to run
# @raise [RuntimeError] outside of a project, or when no spider has that name
desc "crawl", "Run a particular spider by its name"
def crawl(spider_name)
  raise "Can't find Kimurai project" unless inside_project?
  require './config/boot'

  unless klass = Kimurai.find_by_name(spider_name)
    raise "Can't find spider with name `#{spider_name}` in the project. " \
      "To list all available spiders, run: `$ bundle exec kimurai list`"
  end

  # Set time_zone if exists
  if time_zone = Kimurai.configuration.time_zone
    Kimurai.time_zone = time_zone
  end

  klass.crawl!
end
|
88
|
+
|
89
|
+
desc "parse", "Parse url in the particular spider method"
|
90
|
+
option :url, type: :string, required: true, banner: "Url to pass to the method"
|
91
|
+
# Runs one method of a spider against the url given in the "url" option.
#
# @param spider_name [String] name of the spider
# @param method_name [String] spider method to call
# @raise [RuntimeError] outside of a project, or when no spider has that name
def parse(spider_name, method_name)
  raise "Can't find Kimurai project" unless inside_project?
  require './config/boot'

  klass = Kimurai.find_by_name(spider_name)
  unless klass
    raise "Can't find spider with name `#{spider_name}` in the project. " \
      "To list all available spiders, run: `$ bundle exec kimurai list`"
  end

  target_url = options["url"]
  klass.parse!(method_name, url: target_url)
end
|
102
|
+
|
103
|
+
desc "console", "Start Kimurai console"
|
104
|
+
option :engine, type: :string, banner: "Engine to use"
|
105
|
+
option :url, type: :string, banner: "Url to process"
|
106
|
+
# Opens a console on a spider instance.
#
# With +spider_name+ the spider is looked up in the current project; without
# it ApplicationSpider (inside a project) or Kimurai::Base is used. The
# "engine" option selects the engine (a leading ":" is tolerated), and the
# "url" option makes the console start from a request to that url.
#
# @param spider_name [String, nil]
# @raise [RuntimeError] when a name is given outside of a project or no
#   spider has that name
def console(spider_name = nil)
  require 'pry'
  require './config/boot' if inside_project?

  klass =
    if spider_name
      raise "Can't find Kimurai project" unless inside_project?

      found = Kimurai.find_by_name(spider_name)
      unless found
        raise "Can't find spider with name `#{spider_name}` in the project. " \
          "To list all available spiders, run: `$ bundle exec kimurai list`"
      end

      found
    else
      inside_project? ? ApplicationSpider : ::Kimurai::Base
    end

  engine = options["engine"]&.delete(":")&.to_sym
  instance = klass.new(engine)

  if options["url"]
    instance.request_to(:console, url: options["url"])
  else
    instance.public_send(:console)
  end
end
|
128
|
+
|
129
|
+
desc "list", "List all available spiders in the current project"
|
130
|
+
# Prints the names of all spiders of the current project, sorted, one per line.
#
# @raise [RuntimeError] outside of a project
def list
  raise "Can't find Kimurai project" unless inside_project?
  require './config/boot'

  spider_names = Kimurai.list.keys.sort
  spider_names.each { |spider_name| puts spider_name }
end
|
136
|
+
|
137
|
+
desc "runner", "Run all spiders in the project in queue"
|
138
|
+
option :include, type: :array, default: [], banner: "List of spiders to run"
|
139
|
+
option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
|
140
|
+
option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
|
141
|
+
# Runs the project's spiders through Runner.
#
# The "include" option narrows the run to the listed spiders (all spiders when
# blank), "exclude" removes spiders from it, and "jobs" is the number of
# concurrent jobs.
#
# @raise [RuntimeError] outside of a project, or when "jobs" is 0
def runner
  raise "Can't find Kimurai project" unless inside_project?

  jobs = options["jobs"]
  if jobs == 0
    raise "Jobs count can't be 0"
  end

  require './config/boot'
  require 'kimurai/runner'

  selected = options["include"].presence || Kimurai.list.keys
  remaining = selected - options["exclude"]

  Runner.new(remaining, jobs).run!
end
|
155
|
+
|
156
|
+
desc "--version, -v", "Print the version"
|
157
|
+
# Prints VERSION to standard output.
def __print_version
  $stdout.puts(VERSION)
end
|
160
|
+
|
161
|
+
desc "dashboard", "Run dashboard"
|
162
|
+
# Starts the dashboard application of the current project.
#
# @raise [RuntimeError] outside of a project, or when Kimurai::Dashboard is
#   not defined after loading ./config/boot
def dashboard
  raise "Can't find Kimurai project" unless inside_project?

  require './config/boot'
  raise "Kimurai::Dashboard is not defined" unless Object.const_defined?("Kimurai::Dashboard")

  require 'kimurai/dashboard/app'
  Kimurai::Dashboard::App.run!
end
|
173
|
+
|
174
|
+
private
|
175
|
+
|
176
|
+
# Tells whether the current working directory looks like a Kimurai project:
# it must contain a "spiders" directory and a "config/boot.rb" file.
#
# Uses Dir.exist?/File.exist?; the `exists?` aliases were removed in Ruby 3.2.
#
# @return [Boolean]
def inside_project?
  Dir.exist?("spiders") && File.exist?("./config/boot.rb")
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
require_relative 'cli/generator'
|
183
|
+
require_relative 'cli/ansible_command_builder'
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class Array
  # Distributes the elements round-robin into +number+ groups: element 0 goes
  # to group 0, element 1 to group 1, ..., element +number+ to group 0 again.
  #
  # Relies on Array#in_groups_of, which is not defined in this file
  # (presumably ActiveSupport's — confirm).
  #
  # @param number [Integer] how many groups to build
  # @param fill_width [Object] forwarded to in_groups_of as its fill argument
  # @return [Array<Array>] +number+ arrays
  def in_sorted_groups(number, fill_width = nil)
    sorted_groups = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |group|
      # A short last group simply contributes fewer elements; no exception
      # handling is needed to detect its end.
      group.each_with_index { |item, index| sorted_groups[index] << item }
    end

    sorted_groups
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class Numeric
  # Formats a number of seconds using its two largest non-zero-leading units:
  # "1d, 1h", "1h, 2m", "1m, 5s" or "59s". The receiver is truncated with
  # to_int first.
  #
  # Based on https://stackoverflow.com/a/1679963
  #
  # @return [String, nil] nil for negative values
  def duration
    total_seconds = to_int
    return if total_seconds < 0

    total_minutes, seconds = total_seconds.divmod(60)
    total_hours, minutes = total_minutes.divmod(60)
    days, hours = total_hours.divmod(24)

    if days > 0
      "#{days}d, #{hours}h"
    elsif total_hours > 0
      "#{total_hours}h, #{minutes}m"
    elsif total_minutes > 0
      "#{total_minutes}m, #{seconds}s"
    else
      "#{total_seconds}s"
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Kimurai
  # Base class for item pipelines. A pipeline holds a reference to the spider
  # it belongs to and forwards storage, uniqueness checks, saving and logging
  # to that spider.
  class Pipeline
    # Error class for dropping an item.
    DropItemError = Class.new(StandardError)

    class << self
      # @return [Symbol] underscored class name with its leading namespace removed
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    include BaseHelper
    attr_accessor :spider

    # @return [Symbol] same as the class-level name
    def name
      self.class.name
    end

    ###

    # @return [Object] the spider's storage
    def storage
      spider.storage
    end

    # Forwards to the spider's unique? check.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Forwards to the spider's save_to with the same arguments.
    def save_to(path, item, format:, position: true, append: false)
      save_options = { format: format, position: position, append: append }
      spider.save_to(path, item, **save_options)
    end

    # @return [Object] the spider's logger
    def logger
      spider.logger
    end
  end
end
|