kimurai_dynamic 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +111 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/kimurai +6 -0
- data/kimurai.gemspec +48 -0
- data/lib/kimurai/automation/deploy.yml +54 -0
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
- data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
- data/lib/kimurai/automation/setup.yml +45 -0
- data/lib/kimurai/base/saver.rb +106 -0
- data/lib/kimurai/base/storage.rb +54 -0
- data/lib/kimurai/base.rb +330 -0
- data/lib/kimurai/base_helper.rb +22 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/kimurai/browser_builder.rb +20 -0
- data/lib/kimurai/capybara_configuration.rb +10 -0
- data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/kimurai/capybara_ext/session/config.rb +22 -0
- data/lib/kimurai/capybara_ext/session.rb +249 -0
- data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
- data/lib/kimurai/cli/generator.rb +57 -0
- data/lib/kimurai/cli.rb +183 -0
- data/lib/kimurai/core_ext/array.rb +14 -0
- data/lib/kimurai/core_ext/hash.rb +5 -0
- data/lib/kimurai/core_ext/numeric.rb +19 -0
- data/lib/kimurai/core_ext/string.rb +7 -0
- data/lib/kimurai/pipeline.rb +33 -0
- data/lib/kimurai/runner.rb +60 -0
- data/lib/kimurai/template/.gitignore +18 -0
- data/lib/kimurai/template/Gemfile +28 -0
- data/lib/kimurai/template/README.md +3 -0
- data/lib/kimurai/template/config/application.rb +37 -0
- data/lib/kimurai/template/config/automation.yml +13 -0
- data/lib/kimurai/template/config/boot.rb +22 -0
- data/lib/kimurai/template/config/initializers/.keep +0 -0
- data/lib/kimurai/template/config/schedule.rb +57 -0
- data/lib/kimurai/template/db/.keep +0 -0
- data/lib/kimurai/template/helpers/application_helper.rb +3 -0
- data/lib/kimurai/template/lib/.keep +0 -0
- data/lib/kimurai/template/log/.keep +0 -0
- data/lib/kimurai/template/pipelines/saver.rb +11 -0
- data/lib/kimurai/template/pipelines/validator.rb +24 -0
- data/lib/kimurai/template/spiders/application_spider.rb +143 -0
- data/lib/kimurai/template/tmp/.keep +0 -0
- data/lib/kimurai/version.rb +3 -0
- data/lib/kimurai.rb +54 -0
- metadata +349 -0
@@ -0,0 +1,249 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'json'
|
4
|
+
require_relative 'session/config'
|
5
|
+
|
6
|
+
module Capybara
|
7
|
+
class Session
|
8
|
+
attr_accessor :spider
|
9
|
+
|
10
|
+
alias_method :original_visit, :visit
|
11
|
+
# Spider-aware replacement for Capybara's visit (the stock implementation is
# kept as +original_visit+).
#
# Without an attached spider this simply delegates to +original_visit+.
# With a spider it:
#   * sleeps for +delay+ (when given) before the request;
#   * runs check_request_options unless +skip_request_options+ is true;
#   * counts requests/responses on the driver (and on the spider class when
#     spider.with_info is set);
#   * handles errors listed in config.skip_request_errors (logged, returns
#     false) and config.retry_request_errors (retried up to +max_retries+
#     times, sleeping 15s longer before each new attempt).
#
# Returns true on success, false when the error was skipped, nil when all
# retries are used up and the error is marked skip_on_failure. Any other
# error is re-raised.
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
  return original_visit(visit_uri) unless spider

  process_delay(delay) if delay
  retries = 0
  sleep_interval = 0

  begin
    check_request_options(visit_uri) unless skip_request_options
    driver.requests += 1
    logger.info "Browser: started get request to: #{visit_uri}"
    spider.class.update(:visits, :requests) if spider.with_info

    original_visit(visit_uri)
  rescue => e
    if match_error?(e, type: :to_skip)
      logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, e.inspect) if spider.with_info
      false
    elsif match_error?(e, type: :to_retry)
      logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, e.inspect) if spider.with_info

      retries += 1
      if retries > max_retries
        logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
        # Evaluates to nil (and so the method returns nil) when the error is skippable.
        raise e unless skip_error_on_failure?(e)
      else
        # Note: the interpolation below also bumps sleep_interval by 15 seconds.
        logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
        sleep sleep_interval
        retry
      end
    else
      raise e
    end
  else
    driver.responses += 1
    logger.info "Browser: finished get request to: #{visit_uri}"
    spider.class.update(:visits, :responses) if spider.with_info
    driver.visited = true unless driver.visited
    true
  ensure
    # Runs after every attempt, successful or not.
    if spider.with_info
      logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
    end

    memory = driver.current_memory
    logger.debug "Browser: driver.current_memory: #{memory}" if memory
  end
end
|
59
|
+
|
60
|
+
# Quits the current driver (if any) and forgets it, so that the next call to
# +driver+ has no cached instance. Logs a warning when there is no driver.
def destroy_driver!
  return logger.warn("Browser: driver #{mode} is not present") unless @driver

  begin
    @driver.quit
  rescue Net::ReadTimeout
    # handle Net::ReadTimeout error for Selenium like drivers:
    # try to quit one more time
    @driver.quit
  end

  @driver = nil
  logger.info "Browser: driver #{mode} has been destroyed"
end
|
75
|
+
|
76
|
+
# Restarts the browser behind this session.
#
# For poltergeist modes the underlying browser is restarted in place and the
# driver's request/response counters are reset to zero. For every other mode
# the driver is destroyed and +driver+ is called again to obtain a new one.
def restart!
  if mode.match?(/poltergeist/)
    @driver.browser.restart
    @driver.requests = 0
    @driver.responses = 0
  else
    destroy_driver!
    driver
  end

  logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
end
|
87
|
+
|
88
|
+
# Parses the current page body.
#
# response_type - :html (default) returns a Nokogiri::HTML document,
#                 :json returns the result of JSON.parse(body).
#                 Any other value returns nil.
#
# For :html the encoding passed to Nokogiri depends on config.encoding:
#   * not set -> Nokogiri is called with the body only;
#   * :auto   -> the charset is read from the page's <meta ... charset=...> tag;
#   * other   -> the configured value is passed as is.
def current_response(response_type = :html)
  return JSON.parse(body) if response_type == :json
  return unless response_type == :html

  return Nokogiri::HTML(body) unless config.encoding

  if config.encoding == :auto
    charset = body.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["]?([\w+\d+\-]*)/i, 1]
    Nokogiri::HTML(body, nil, charset)
  else
    Nokogiri::HTML(body, nil, config.encoding)
  end
end
|
105
|
+
|
106
|
+
###
|
107
|
+
|
108
|
+
# Handy method to perform some processing in a new tab within a block and
# then automatically close this tab.
#
# Usage (url):
#   browser.within_new_window_by(url: "https://google.com") do
#     # do some stuff and then automatically close this tab and return back to the first tab
#   end
#
# Usage (action) (when the new tab is opened by some action, for example by
# clicking on a particular element):
#   action = -> { browser.find("//some/element/path").click }
#   browser.within_new_window_by(action: action) do
#     # do some stuff and then automatically close this tab and return back to the first tab
#   end
#
# action - callable that causes a new window to open (takes priority over url).
# url    - address to visit in a freshly opened window.
#
# The new window is closed even when the block (or the visit) raises, so a
# failed step does not leave a stray tab behind. Returns the value of the
# block, or nil when neither +action+ nor +url+ is given.
def within_new_window_by(action: nil, url: nil)
  return unless action || url

  window = action ? window_opened_by { action.call } : open_new_window

  within_window(window) do
    begin
      visit(url) unless action
      yield
    ensure
      current_window.close
    end
  end
end
|
136
|
+
|
137
|
+
###
|
138
|
+
|
139
|
+
# Scrolls the current page down by 10000 pixels by executing JavaScript in
# the browser.
def scroll_to_bottom
  script = "window.scrollBy(0,10000)"
  execute_script(script)
end
|
142
|
+
|
143
|
+
private
|
144
|
+
|
145
|
+
# True when +e+ matches a Hash entry of config.retry_request_errors that has
# a truthy :skip_on_failure flag. Entries that are not hashes never match.
def skip_error_on_failure?(e)
  config.retry_request_errors.any? do |rule|
    next false unless rule.kind_of?(Hash)

    rule[:skip_on_failure] && e.class.ancestors.include?(rule[:error])
  end
end
|
150
|
+
|
151
|
+
# Checks whether exception +e+ matches one of the configured error rules.
#
# type - :to_retry checks config.retry_request_errors,
#        :to_skip  checks config.skip_request_errors.
#
# A rule is either an error class/module (matches when it is among the
# ancestors of e.class) or a Hash with :error (class) and an optional
# :message (Regexp or String) that must also match e.message.
def match_error?(e, type:)
  rules =
    case type
    when :to_retry then config.retry_request_errors
    when :to_skip then config.skip_request_errors
    end

  rules.any? do |rule|
    next e.class.ancestors.include?(rule) unless rule.kind_of?(Hash)

    class_matches = e.class.ancestors.include?(rule[:error])
    pattern = rule[:message]
    next class_matches unless pattern.present?

    message_matches =
      if pattern.kind_of?(Regexp)
        e.message&.match?(pattern)
      else
        e.message&.include?(pattern)
      end

    message_matches && class_matches
  end
end
|
175
|
+
|
176
|
+
# Sleeps before a request.
#
# delay - a number of seconds, or a Range from which a random value is
#         picked with rand.
def process_delay(delay)
  interval =
    if delay.class == Range
      rand(delay)
    else
      delay
    end

  logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
  sleep interval
end
|
181
|
+
|
182
|
+
# Applies the per-request options from the session config before
# +url_to_visit+ is requested. In order:
#   1. restart the browser when config.restart_if limits (memory, requests)
#      are reached;
#   2. set / clear / reset cookies;
#   3. change the User-Agent header;
#   4. change the proxy.
def check_request_options(url_to_visit)
  # restart_if: memory limit
  memory_limit = config.restart_if[:memory_limit]
  if memory_limit
    memory = driver.current_memory
    if memory && memory >= memory_limit
      logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  # restart_if: requests limit
  requests_limit = config.restart_if[:requests_limit]
  if requests_limit
    requests = driver.requests
    if requests >= requests_limit
      logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  # cookies
  # (Selenium only) if config.cookies present and browser was just created,
  # visit url_to_visit first and only then set cookies:
  if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
    visit(url_to_visit, skip_request_options: true)
    config.cookies.each { |cookie| driver.set_cookie(cookie[:name], cookie[:value], cookie) }
  end

  if config.before_request[:clear_cookies]
    driver.clear_cookies
    logger.debug "Browser: cleared cookies before request"
  end

  if config.before_request[:clear_and_set_cookies]
    driver.clear_cookies

    # (Selenium only) if the browser has not visited any page yet, visit
    # url_to_visit first and then set cookies (needed after a browser restart):
    visit(url_to_visit, skip_request_options: true) if driver.visited.nil? && mode.match?(/selenium/)

    config.cookies.each { |cookie| driver.set_cookie(cookie[:name], cookie[:value], cookie) }

    logger.debug "Browser: cleared and set cookies before request"
  end

  # user_agent
  if config.before_request[:change_user_agent]
    driver.add_header("User-Agent", config.user_agent.call)
    logger.debug "Browser: changed user_agent before request"
  end

  # proxy: config.proxy.call returns a colon-separated string whose parts
  # are passed to driver.set_proxy as separate arguments
  if config.before_request[:change_proxy]
    proxy_parts = config.proxy.call.split(":")
    driver.set_proxy(*proxy_parts)
    logger.debug "Browser: changed proxy before request"
  end
end
|
244
|
+
|
245
|
+
# Logger used for the "Browser: ..." messages of this session; it is the
# logger of the attached spider.
def logger
  owner = spider
  owner.logger
end
|
248
|
+
end
|
249
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'cliver'
|
2
|
+
|
3
|
+
module Kimurai
  class CLI
    # Builds the argument list for an `ansible-playbook` run of one of the
    # playbooks shipped inside the kimurai gem (lib/kimurai/automation).
    class AnsibleCommandBuilder
      # user_host - "user@host" or just "host".
      # options   - Hash-like CLI options, read with string keys ("port",
      #             "local", "ask-sudo", "ask-auth-pass", "ssh-key-path").
      # playbook  - playbook name without extension (e.g. "setup").
      # vars      - extra variables passed to the playbook via --extra-vars.
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Returns the command as an Array of strings (program name first),
      # suitable for splatting into Kernel#spawn.
      #
      # Raises RuntimeError when `ansible-playbook` is not installed, or when
      # password authentication is requested and `sshpass` is not installed.
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
            "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        user = @user_host[/(.*?)\@/, 1]
        host = @user_host[/\@(.+)/, 1] || @user_host
        # The trailing comma makes ansible treat the value as a host list
        # rather than a path to an inventory file.
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
        playbook_path = "#{gem_dir}/lib/kimurai/automation/#{@playbook}.yml"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        # Values from config/automation.yml only fill in keys that were not
        # given explicitly; work on a copy so the caller's hash is untouched.
        extra_vars = @vars.dup
        playbook_config.each { |key, value| extra_vars[key] = value unless extra_vars[key] }

        extra_vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        command.push "--user", user if user
        command.push "--ask-become-pass" if @options["ask-sudo"]

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
              "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        ssh_key_path = @options["ssh-key-path"]
        command.push "--private-key", ssh_key_path if ssh_key_path

        command
      end

      private

      # Settings for the current playbook from config/automation.yml in the
      # working directory. Returns an empty Hash when the file is missing,
      # empty, or has no Hash section for this playbook.
      def playbook_config
        return {} unless File.exist?("config/automation.yml")

        require 'yaml'
        settings = YAML.load_file("config/automation.yml")
        section = settings.is_a?(Hash) ? settings[@playbook] : nil
        section.is_a?(Hash) ? section : {}
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module Kimurai
  class CLI
    # Thor generator behind `kimurai generate`: creates a new project from
    # the bundled template, a single spider file, or a schedule file.
    class Generator < Thor::Group
      include Thor::Actions

      # Directory Thor resolves relative source paths against: two levels up
      # from this file, i.e. the directory that contains "template".
      def self.source_root
        File.expand_path('../..', __FILE__)
      end

      # Copies the "template" directory to +project_name+ and then runs
      # `bundle install` and `git init` inside the new directory.
      def generate_project(project_name)
        directory "template", project_name
        inside(project_name) do
          run "bundle install"
          run "git init"
        end
      end

      # Creates a spider file named after +spider_name+.
      #
      # in_project - when truthy the file goes to spiders/ and the class
      #              inherits from ApplicationSpider; otherwise a standalone
      #              script is written to the current directory (inherits from
      #              Kimurai::Base, requires kimurai, uses the :mechanize
      #              engine and calls crawl! at the end).
      #
      # Raises RuntimeError when the target file already exists.
      def generate_spider(spider_name, in_project:)
        spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        raise "Spider #{spider_path} already exists" if File.exist?(spider_path)

        spider_class = to_spider_class(spider_name)
        create_file spider_path do
          <<~RUBY
            class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Kimurai::Base'}
              @name = "#{spider_name}"
              @start_urls = []
              @config = {}

              def parse(response, url:, data: {})
              end
            end
          RUBY
        end

        return if in_project

        insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
        prepend_to_file spider_path, "require 'kimurai'\n\n"
        append_to_file spider_path, "\n#{spider_class}.crawl!"
      end

      # Copies the template schedule file to ./schedule.rb.
      def generate_schedule
        copy_file "template/config/schedule.rb", "./schedule.rb"
      end

      private

      # Turns a spider name into a class name: the first character is
      # capitalized, then "_x" and ".x" segments are capitalized with the
      # separator removed, and "-x" segments become "DashX".
      def to_spider_class(string)
        string.sub(/^./, &:capitalize)
          .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{Regexp.last_match(1)}#{Regexp.last_match(2).capitalize}" }
          .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{Regexp.last_match(2).capitalize}" }
          .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{Regexp.last_match(1)}#{Regexp.last_match(2).capitalize}" }
      end
    end
  end
end
|
data/lib/kimurai/cli.rb
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
require 'thor'
|
2
|
+
|
3
|
+
module Kimurai
|
4
|
+
class CLI < Thor
|
5
|
+
map %w[--version -v] => :__print_version
|
6
|
+
|
7
|
+
desc "generate", "Generator, available types: project, spider, schedule"
|
8
|
+
# Entry point of `kimurai generate TYPE [NAME]`.
#
# generator_type - "project", "spider" or "schedule".
# args           - the first extra argument is the project/spider name.
#
# Raises RuntimeError for an unknown type or a missing name.
def generate(generator_type, *args)
  if generator_type == "project"
    project_name = args.shift
    raise "Provide project name to generate a new project" unless project_name.present?

    Generator.new.generate_project(project_name)
  elsif generator_type == "spider"
    spider_name = args.shift
    raise "Provide spider name to generate a spider" unless spider_name.present?

    Generator.new.generate_spider(spider_name, in_project: inside_project?)
  elsif generator_type == "schedule"
    Generator.new.generate_schedule
  else
    raise "Don't know this generator type: #{generator_type}"
  end
end
|
24
|
+
|
25
|
+
###
|
26
|
+
|
27
|
+
desc "setup", "Setup server"
|
28
|
+
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
|
29
|
+
option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
|
30
|
+
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
|
31
|
+
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
|
32
|
+
option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
|
33
|
+
# Runs the "setup" ansible playbook against +user_host+ and waits for the
# spawned process to finish.
def setup(user_host)
  builder = AnsibleCommandBuilder.new(user_host, options, playbook: "setup")
  ansible_command = builder.get

  Process.wait(spawn(*ansible_command))
end
|
39
|
+
|
40
|
+
desc "deploy", "Deploy project to the server and update cron schedule"
|
41
|
+
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
|
42
|
+
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
|
43
|
+
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
|
44
|
+
option "repo-url", type: :string, banner: "Repo url"
|
45
|
+
option "repo-key-path", type: :string, banner: "SSH key for a git repo"
|
46
|
+
option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks"
|
47
|
+
# Runs the "deploy" ansible playbook against +user_host+ and waits for it.
#
# Unless --skip-check is given, the local git repository is verified first:
# no uncommitted changes, at least one remote, and nothing in
# `git rev-list master...origin/master` (master and origin/master in sync).
# The repository url comes from --repo-url or, failing that, from the
# `origin` remote.
def deploy(user_host)
  unless options["skip-check"]
    raise "Deploy: Please commit your changes first" unless `git status --short`.empty?
    raise "Deploy: Please add remote origin repository to your repo first" if `git remote`.empty?
    unless `git rev-list master...origin/master`.empty?
      raise "Deploy: Please push your commits to the remote origin repo first"
    end
  end

  repo_url = options["repo-url"] || `git remote get-url origin`.strip
  # Last path segment of the url without the ".git" suffix
  # (nil when the url does not contain ".git").
  repo_name = repo_url[/\/([^\/]*)\.git/i, 1]

  playbook_vars = { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
  ansible_command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy", vars: playbook_vars).get

  Process.wait(spawn(*ansible_command))
end
|
68
|
+
|
69
|
+
###
|
70
|
+
|
71
|
+
desc "crawl", "Run a particular spider by it's name"
|
72
|
+
# Boots the project in the current directory and runs the spider registered
# under +spider_name+.
#
# Raises RuntimeError outside of a project or when the spider is unknown.
def crawl(spider_name)
  raise "Can't find Kimurai project" unless inside_project?
  require './config/boot'

  klass = Kimurai.find_by_name(spider_name)
  unless klass
    raise "Can't find spider with name `#{spider_name}` in the project. " \
      "To list all available spiders, run: `$ bundle exec kimurai list`"
  end

  # Set time_zone if exists
  time_zone = Kimurai.configuration.time_zone
  Kimurai.time_zone = time_zone if time_zone

  klass.crawl!
end
|
88
|
+
|
89
|
+
desc "parse", "Parse url in the particular spider method"
|
90
|
+
option :url, type: :string, required: true, banner: "Url to pass to the method"
|
91
|
+
# Boots the project and calls +method_name+ of the spider registered under
# +spider_name+ for the url given with --url.
#
# Raises RuntimeError outside of a project or when the spider is unknown.
def parse(spider_name, method_name)
  raise "Can't find Kimurai project" unless inside_project?
  require './config/boot'

  klass = Kimurai.find_by_name(spider_name)
  unless klass
    raise "Can't find spider with name `#{spider_name}` in the project. " \
      "To list all available spiders, run: `$ bundle exec kimurai list`"
  end

  klass.parse!(method_name, url: options["url"])
end
|
102
|
+
|
103
|
+
desc "console", "Start Kimurai console"
|
104
|
+
option :engine, type: :string, banner: "Engine to use"
|
105
|
+
option :url, type: :string, banner: "Url to process"
|
106
|
+
# Starts an interactive console (pry) in the context of a spider.
#
# spider_name - optional; requires a project and selects that spider.
#               Without it ApplicationSpider is used inside a project and
#               Kimurai::Base outside of one.
#
# --engine selects the engine (a leading ":" is accepted), --url makes the
# console start from a request to that url.
def console(spider_name = nil)
  require 'pry'
  require './config/boot' if inside_project?

  if spider_name
    raise "Can't find Kimurai project" unless inside_project?

    klass = Kimurai.find_by_name(spider_name)
    unless klass
      raise "Can't find spider with name `#{spider_name}` in the project. " \
        "To list all available spiders, run: `$ bundle exec kimurai list`"
    end
  else
    klass = inside_project? ? ApplicationSpider : ::Kimurai::Base
  end

  engine = options["engine"]&.delete(":")&.to_sym
  url = options["url"]

  if url
    klass.new(engine).request_to(:console, url: url)
  else
    klass.new(engine).public_send(:console)
  end
end
|
128
|
+
|
129
|
+
desc "list", "List all available spiders in the current project"
|
130
|
+
# Prints the names of all spiders of the current project, sorted, one per line.
#
# Raises RuntimeError outside of a project.
def list
  raise "Can't find Kimurai project" unless inside_project?
  require './config/boot'

  spider_names = Kimurai.list.keys.sort
  spider_names.each do |spider_name|
    puts spider_name
  end
end
|
136
|
+
|
137
|
+
desc "runner", "Run all spiders in the project in queue"
|
138
|
+
option :include, type: :array, default: [], banner: "List of spiders to run"
|
139
|
+
option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
|
140
|
+
option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
|
141
|
+
# Runs the project's spiders through Runner.
#
# --include limits the run to the listed spiders (all spiders when empty),
# --exclude removes spiders from that list, --jobs is passed to Runner as the
# number of concurrent jobs.
#
# Raises RuntimeError outside of a project or when --jobs is 0.
def runner
  raise "Can't find Kimurai project" unless inside_project?

  jobs = options["jobs"]
  raise "Jobs count can't be 0" if jobs == 0

  require './config/boot'
  require 'kimurai/runner'

  selected = options["include"].presence || Kimurai.list.keys
  selected = selected - options["exclude"]

  Runner.new(selected, jobs).run!
end
|
155
|
+
|
156
|
+
desc "--version, -v", "Print the version"
|
157
|
+
# Prints the VERSION constant; mapped to the --version / -v switches.
def __print_version
  $stdout.puts VERSION
end
|
160
|
+
|
161
|
+
desc "dashboard", "Run dashboard"
|
162
|
+
# Boots the project and starts the dashboard application.
#
# Raises RuntimeError outside of a project, or when Kimurai::Dashboard is
# not defined after the project has been booted.
def dashboard
  raise "Can't find Kimurai project" unless inside_project?

  require './config/boot'
  raise "Kimurai::Dashboard is not defined" unless Object.const_defined?("Kimurai::Dashboard")

  require 'kimurai/dashboard/app'
  Kimurai::Dashboard::App.run!
end
|
173
|
+
|
174
|
+
private
|
175
|
+
|
176
|
+
# True when the current working directory looks like a Kimurai project:
# it has a "spiders" directory and a "config/boot.rb" file.
#
# Uses Dir.exist?/File.exist?; the old `exists?` aliases were removed in
# Ruby 3.2 and raise NoMethodError there.
def inside_project?
  Dir.exist?("spiders") && File.exist?("./config/boot.rb")
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
require_relative 'cli/generator'
|
183
|
+
require_relative 'cli/ansible_command_builder'
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class Array
  # Distributes the elements round-robin into +number+ groups: element 0 goes
  # to group 0, element 1 to group 1, ... element +number+ to group 0 again.
  #
  # number     - how many groups to build.
  # fill_width - forwarded to in_groups_of as its second argument.
  #
  # Relies on Array#in_groups_of, which is not defined in this file
  # (presumably ActiveSupport's core extension - confirm).
  #
  # Returns an Array of +number+ Arrays.
  def in_sorted_groups(number, fill_width = nil)
    sorted_groups = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |group|
      # A group never has more than +number+ items, so each index maps
      # straight onto a target group; short groups just stop early.
      group.each_with_index { |item, index| sorted_groups[index] << item }
    end

    sorted_groups
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class Numeric
  # Formats a number of seconds as a short human-readable duration using the
  # two most significant units: "3d, 4h", "2h, 15m", "1m, 5s" or "42s".
  # The value is truncated to an integer first.
  #
  # Negative values are formatted by magnitude with a leading "-"
  # (previously they fell through every branch and produced nil).
  #
  # Based on https://stackoverflow.com/a/1679963
  def duration
    total = to_int
    return "-#{(-total).duration}" if total < 0

    mins, secs = total.divmod(60)
    hours, mins_rest = mins.divmod(60)
    days, hours_rest = hours.divmod(24)

    if days > 0
      "#{days}d, #{hours_rest}h"
    elsif hours > 0
      "#{hours}h, #{mins_rest}m"
    elsif mins > 0
      "#{mins}m, #{secs}s"
    else
      "#{secs}s"
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Kimurai
  # Base class for pipelines. A pipeline holds a reference to a spider and
  # forwards storage, uniqueness checks, saving and logging to it.
  class Pipeline
    # Error class exposed to pipelines; nothing in this class raises it
    # (presumably pipelines raise it to drop an item - confirm against the caller).
    class DropItemError < StandardError; end

    include BaseHelper
    attr_accessor :spider

    class << self
      # Pipeline name as a Symbol: the class name with everything up to the
      # first "::" removed, underscored.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    # Same as the class-level name.
    def name
      self.class.name
    end

    ###

    # The spider's storage.
    def storage
      spider.storage
    end

    # Forwards to spider.unique?(scope, value).
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Forwards to spider.save_to with the same arguments.
    def save_to(path, item, format:, position: true, append: false)
      spider.save_to(path, item, format: format, position: position, append: append)
    end

    # The spider's logger.
    def logger
      spider.logger
    end
  end
end
|