tanakai 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +118 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/tanakai +6 -0
- data/lib/tanakai/automation/deploy.yml +54 -0
- data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
- data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
- data/lib/tanakai/automation/setup.yml +45 -0
- data/lib/tanakai/base/saver.rb +106 -0
- data/lib/tanakai/base/storage.rb +54 -0
- data/lib/tanakai/base.rb +326 -0
- data/lib/tanakai/base_helper.rb +22 -0
- data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
- data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
- data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/tanakai/browser_builder.rb +20 -0
- data/lib/tanakai/capybara_configuration.rb +10 -0
- data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
- data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/tanakai/capybara_ext/session/config.rb +22 -0
- data/lib/tanakai/capybara_ext/session.rb +249 -0
- data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
- data/lib/tanakai/cli/generator.rb +57 -0
- data/lib/tanakai/cli.rb +183 -0
- data/lib/tanakai/core_ext/array.rb +14 -0
- data/lib/tanakai/core_ext/hash.rb +5 -0
- data/lib/tanakai/core_ext/numeric.rb +19 -0
- data/lib/tanakai/core_ext/string.rb +7 -0
- data/lib/tanakai/pipeline.rb +33 -0
- data/lib/tanakai/runner.rb +60 -0
- data/lib/tanakai/template/.gitignore +18 -0
- data/lib/tanakai/template/Gemfile +28 -0
- data/lib/tanakai/template/README.md +3 -0
- data/lib/tanakai/template/config/application.rb +37 -0
- data/lib/tanakai/template/config/automation.yml +13 -0
- data/lib/tanakai/template/config/boot.rb +22 -0
- data/lib/tanakai/template/config/initializers/.keep +0 -0
- data/lib/tanakai/template/config/schedule.rb +57 -0
- data/lib/tanakai/template/db/.keep +0 -0
- data/lib/tanakai/template/helpers/application_helper.rb +3 -0
- data/lib/tanakai/template/lib/.keep +0 -0
- data/lib/tanakai/template/log/.keep +0 -0
- data/lib/tanakai/template/pipelines/saver.rb +11 -0
- data/lib/tanakai/template/pipelines/validator.rb +24 -0
- data/lib/tanakai/template/spiders/application_spider.rb +143 -0
- data/lib/tanakai/template/tmp/.keep +0 -0
- data/lib/tanakai/version.rb +3 -0
- data/lib/tanakai.rb +54 -0
- data/tanakai.gemspec +50 -0
- metadata +382 -0
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
class Capybara::Driver::Base
|
4
|
+
attr_accessor :visited
|
5
|
+
attr_writer :requests, :responses
|
6
|
+
|
7
|
+
# Request counter for this driver. Reads as 0 until a value has been
# assigned through the writer.
def requests
  @requests = 0 unless @requests
  @requests
end
|
10
|
+
|
11
|
+
# Response counter for this driver. Reads as 0 until a value has been
# assigned through the writer.
def responses
  @responses = 0 unless @responses
  @responses
end
|
14
|
+
|
15
|
+
# Total memory of the driver process plus all of its descendant processes,
# as the sum of get_process_memory over each of them.
#
# Returns nil when the driver has no usable pid (nil or 0), so callers can
# treat the value as "unknown".
def current_memory
  driver_pid = pid
  # A pid of 0 typically comes from `"".to_i` when a lookup command printed
  # nothing. In `ps` output 0 is the parent of the root processes, so walking
  # its descendants would add up the memory of the whole machine.
  return nil if driver_pid.nil? || driver_pid == 0

  process_ids = (get_descendant_processes(driver_pid) << driver_pid).uniq
  process_ids.sum { |process_id| get_process_memory(process_id) }
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
# Pids of every process that descends from +base+ (children, grandchildren,
# and so on), derived from the pid/ppid pairs printed by `ps`.
# +base+ itself is excluded from the result.
def get_descendant_processes(base)
  # Every entry starts out as [pid]; a child's entry is nested inside its
  # parent's entry, so flattening a parent's entry yields its whole subtree.
  subtree = Hash.new { |table, key| table[key] = [key] }
  numbers = `ps -eo pid,ppid`.scan(/\d+/).map(&:to_i)
  parent_of = Hash[*numbers]
  parent_of.each { |child, parent| subtree[parent] << subtree[child] }

  subtree[base].flatten - [base]
end
|
32
|
+
|
33
|
+
# Memory used by a single process, in the kilobyte figures printed by the OS.
# Approach taken from https://github.com/schneems/get_process_mem
#
# * Linux: sum of the `Pss:` entries in /proc/<pid>/smaps (PSS rather than
#   RSS, so pages shared between processes are not counted in full for each).
# * macOS: the RSS value printed by `ps -o rss=`.
#
# Returns 0 when the process cannot be inspected (already gone, or its smaps
# file is not readable). Raises RuntimeError on any other platform.
def get_process_memory(pid)
  case @platform ||= Gem::Platform.local.os
  when "linux"
    begin
      # Match the "Pss:" field exactly: newer kernels also print fields such
      # as "Pss_Dirty:", which a bare /^Pss/ match would add on top.
      File.foreach("/proc/#{pid}/smaps").sum do |line|
        line.start_with?("Pss:") ? line[/\d+/].to_i : 0
      end
    rescue Errno::EACCES, Errno::ENOENT, Errno::ESRCH
      # ENOENT/ESRCH: the process exited before or while its file was read.
      0
    end
  when "darwin"
    mem = `ps -o rss= -p #{pid}`.strip
    mem.empty? ? 0 : mem.to_i
  else
    raise "Can't check process memory, wrong type of platform: #{@platform}"
  end
end
|
62
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require_relative '../driver/base'
|
3
|
+
|
4
|
+
class Capybara::Mechanize::Driver
  # Extends capybara-mechanize with Poltergeist-like driver methods:
  # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver

  # Routes requests through the given proxy.
  # +type+ is accepted but not used: type is always "http", "socks" is not
  # supported (yet).
  def set_proxy(ip, port, type, user = nil, password = nil)
    browser.agent.set_proxy(ip, port, user, password)
  end

  ### Headers

  # Request headers currently configured on the Mechanize agent.
  def headers
    browser.agent.request_headers
  end

  # Replaces the agent's request headers.
  def headers=(headers)
    browser.agent.request_headers = headers
  end

  # Sets a single request header on the agent.
  def add_header(name, value)
    browser.agent.request_headers[name] = value
  end

  ### Cookies

  # All cookies held by the agent.
  def get_cookies
    browser.agent.cookies
  end

  # Adds a cookie to the agent's cookie jar. +options+ may carry further
  # cookie attributes; its :name / :value win over the positional arguments
  # when present, and the path is always set to "/".
  def set_cookie(name, value, options = {})
    # Build a new hash instead of writing into +options+: a hash reused by the
    # caller would otherwise keep this cookie's name and value and pass them
    # on to the next cookie.
    attributes = options.merge(path: "/")
    attributes[:name] ||= name
    attributes[:value] ||= value

    cookie = Mechanize::Cookie.new(attributes)
    browser.agent.cookie_jar << cookie
  end

  # Adds every cookie of +cookies+ (hashes with :name and :value).
  def set_cookies(cookies)
    cookies.each do |cookie|
      set_cookie(cookie[:name], cookie[:value], cookie)
    end
  end

  # Empties the agent's cookie jar.
  def clear_cookies
    browser.agent.cookie_jar.clear!
  end

  ### Lifecycle

  # Shuts the Mechanize agent down.
  def quit
    browser.agent.shutdown
  end

  ### Process information

  # Reset parent method `current_memory` for mechanize (we can't measure
  # memory of Mechanize driver).
  def current_memory
    nil
  end

  # This driver reports no process id.
  def pid
    nil
  end

  # This driver reports no port.
  def port
    nil
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative '../driver/base'
|
2
|
+
|
3
|
+
class Capybara::Selenium::Driver
  # All cookies known to the browser.
  def get_cookies
    browser.manage.all_cookies
  end

  # Adds a cookie to the browser. +options+ may carry further cookie
  # attributes; its :name / :value win over the positional arguments when
  # present.
  def set_cookie(name, value, options = {})
    # Work on a copy: writing into +options+ would leave this cookie's name
    # and value in a hash the caller may reuse for the next cookie.
    cookie = options.dup
    cookie[:name] ||= name
    cookie[:value] ||= value

    browser.manage.add_cookie(cookie)
  end

  # Adds every cookie of +cookies+ (hashes with :name and :value).
  def set_cookies(cookies)
    cookies.each do |cookie|
      set_cookie(cookie[:name], cookie[:value], cookie)
    end
  end

  # Deletes all cookies from the browser.
  def clear_cookies
    browser.manage.delete_all_cookies
  end

  ###

  # Process id reported by `lsof` for the driver port (memoized).
  # NOTE(review): an empty `lsof` result turns into 0 here — confirm callers
  # treat 0 as "unknown".
  def pid
    @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
  end

  # Port of the WebDriver server URL (memoized). Read from instance variables
  # of the bridge's HTTP client, i.e. from Selenium internals.
  def port
    @port ||= browser.send(:bridge).instance_variable_get("@http").instance_variable_get("@server_url").port
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Capybara
  # Adds per-session scraping settings to the session config.
  class SessionConfig
    attr_accessor :cookies, :proxy, :user_agent, :encoding
    attr_writer :retry_request_errors, :skip_request_errors

    # Error rules for requests that should be retried; [] until assigned.
    def retry_request_errors
      @retry_request_errors = [] unless @retry_request_errors
      @retry_request_errors
    end

    # Error rules for requests that should be skipped; [] until assigned.
    def skip_request_errors
      @skip_request_errors = [] unless @skip_request_errors
      @skip_request_errors
    end

    # Hash of restart conditions. There is no writer: the hash is created on
    # first access and filled in place.
    def restart_if
      @restart_if = {} unless @restart_if
      @restart_if
    end

    # Hash of actions to run before each request. There is no writer: the hash
    # is created on first access and filled in place.
    def before_request
      @before_request = {} unless @before_request
      @before_request
    end
  end
end
|
@@ -0,0 +1,249 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'json'
|
4
|
+
require_relative 'session/config'
|
5
|
+
|
6
|
+
module Capybara
|
7
|
+
class Session
|
8
|
+
attr_accessor :spider
|
9
|
+
|
10
|
+
alias_method :original_visit, :visit
|
11
|
+
# Visits +visit_uri+. Without an attached spider this is the plain
# `original_visit`. With a spider it also applies the before-request delay and
# request options, keeps the driver's request/response counters, logs, and
# handles errors according to the configured skip/retry rules.
#
# delay:                seconds (or a Range of seconds) to wait first
# skip_request_options: when true, check_request_options is not run
# max_retries:          retry attempts for errors matching a retry rule; the
#                       pause grows by 15 seconds on every attempt
#
# Returns true when the page was visited and false when the error was
# skipped (directly, or after the retries ran out and the rule allows
# skipping). Any other error is re-raised.
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
  return original_visit(visit_uri) unless spider

  process_delay(delay) if delay
  retries = 0
  sleep_interval = 0

  begin
    check_request_options(visit_uri) unless skip_request_options
    driver.requests += 1
    logger.info "Browser: started get request to: #{visit_uri}"
    spider.class.update(:visits, :requests) if spider.with_info

    original_visit(visit_uri)
  rescue => e
    if match_error?(e, type: :to_skip)
      logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, e.inspect) if spider.with_info
      false
    elsif match_error?(e, type: :to_retry)
      logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, e.inspect) if spider.with_info

      if (retries += 1) <= max_retries
        sleep_interval += 15
        logger.info "Browser: sleep #{sleep_interval} seconds and process retry № #{retries} to the url: #{visit_uri}"
        sleep sleep_interval
        retry
      end

      logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
      raise e unless skip_error_on_failure?(e)

      # Same result as a directly skipped error.
      false
    else
      raise e
    end
  else
    driver.responses += 1
    logger.info "Browser: finished get request to: #{visit_uri}"
    spider.class.update(:visits, :responses) if spider.with_info
    driver.visited = true unless driver.visited
    true
  ensure
    if spider.with_info
      logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
    end

    if memory = driver.current_memory
      logger.debug "Browser: driver.current_memory: #{memory}"
    end
  end
end
|
59
|
+
|
60
|
+
# Quits the current driver and forgets it. Logs a warning when there is no
# driver to destroy.
def destroy_driver!
  return logger.warn("Browser: driver #{mode} is not present") unless @driver

  begin
    @driver.quit
  rescue Net::ReadTimeout
    # handle Net::ReadTimeout error for Selenium like drivers: quit once more
    @driver.quit
  end

  @driver = nil
  logger.info "Browser: driver #{mode} has been destroyed"
end
|
75
|
+
|
76
|
+
# Restarts the browser. Poltergeist engines restart the existing browser and
# reset the driver counters; every other engine gets its driver destroyed and
# built again by calling `driver`.
def restart!
  poltergeist = mode.match?(/poltergeist/)

  if poltergeist
    @driver.browser.restart
    @driver.requests = 0
    @driver.responses = 0
  else
    destroy_driver!
    driver
  end

  logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
end
|
87
|
+
|
88
|
+
# Parses the current page body.
#
# response_type:
#   :html (default) - a Nokogiri HTML document. With `config.encoding` set to
#                     :auto the charset is taken from the page's <meta> tag
#                     (nil when there is none); any other truthy value is
#                     passed to Nokogiri as the encoding.
#   :json           - the body parsed with JSON.parse.
#
# Any other response_type returns nil.
def current_response(response_type = :html)
  # Read the body once and reuse it for both the charset scan and the parse.
  content = body

  case response_type
  when :html
    encoding = config.encoding
    return Nokogiri::HTML(content) unless encoding
    return Nokogiri::HTML(content, nil, encoding) unless encoding == :auto

    # Scan a re-tagged copy: force_encoding on the body itself would change
    # the driver's string in place (and raise on a frozen one).
    charset = content.dup.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["]?([\w+\d+\-]*)/i, 1]
    Nokogiri::HTML(content, nil, charset)
  when :json
    JSON.parse(content)
  end
end
|
105
|
+
|
106
|
+
###
|
107
|
+
|
108
|
+
# Runs the given block inside a new tab, then closes that tab and returns to
# the original one. The tab is closed even when the block (or the initial
# visit) raises. Returns the block's value; does nothing and returns nil when
# neither +action+ nor +url+ is given.
#
# Usage (url):
#   browser.within_new_window_by(url: "https://google.com") do
#     # do some stuff; the tab is closed afterwards
#   end
#
# Usage (action), for a tab that is opened by some action, for example by
# clicking on a particular element:
#   action = -> { browser.find("//some/element/path").click }
#   browser.within_new_window_by(action: action) do
#     # do some stuff; the tab is closed afterwards
#   end
#
# When both are given, +action+ wins and +url+ is ignored.
def within_new_window_by(action: nil, url: nil)
  if action
    opened_window = window_opened_by { action.call }
  elsif url
    opened_window = open_new_window
  else
    return
  end

  within_window(opened_window) do
    begin
      visit(url) unless action
      yield
    ensure
      # Without this, an error inside the block would leave the tab open.
      current_window.close
    end
  end
end
|
136
|
+
|
137
|
+
###
|
138
|
+
|
139
|
+
# Scrolls the window down by a fixed 10000 pixels via JavaScript.
def scroll_to_bottom
  script = "window.scrollBy(0,10000)"
  execute_script(script)
end
|
142
|
+
|
143
|
+
private
|
144
|
+
|
145
|
+
# True when a retry rule given as a Hash matches the class of +e+ and has
# :skip_on_failure set. Rules that are not Hashes never count.
def skip_error_on_failure?(e)
  lineage = e.class.ancestors

  config.retry_request_errors.any? do |rule|
    next false unless rule.kind_of?(Hash)

    rule[:skip_on_failure] && lineage.include?(rule[:error])
  end
end
|
150
|
+
|
151
|
+
# Checks +e+ against the configured error rules.
#
# type: :to_retry uses config.retry_request_errors,
#       :to_skip  uses config.skip_request_errors.
#
# A rule is either an error class, or a Hash with :error (the class) and an
# optional :message (String or Regexp) that the error message must
# contain/match as well.
def match_error?(e, type:)
  rules =
    if type == :to_retry
      config.retry_request_errors
    elsif type == :to_skip
      config.skip_request_errors
    end

  lineage = e.class.ancestors

  rules.any? do |rule|
    next lineage.include?(rule) unless rule.kind_of?(Hash)

    class_matches = lineage.include?(rule[:error])
    expected = rule[:message]
    next class_matches unless expected.present?

    message_matches =
      if expected.kind_of?(Regexp)
        e.message&.match?(expected)
      else
        e.message&.include?(expected)
      end

    message_matches && class_matches
  end
end
|
175
|
+
|
176
|
+
# Sleeps before a request. +delay+ is either a number of seconds or a Range,
# in which case a random value from that range is used.
def process_delay(delay)
  interval = delay.instance_of?(Range) ? rand(delay) : delay
  rounded = interval.round(2)
  unit = 'second'.pluralize(interval)

  logger.debug "Browser: sleep #{rounded} #{unit} before request..."
  sleep interval
end
|
181
|
+
|
182
|
+
# Applies the configured `restart_if` and `before_request` options ahead of a
# request to +url_to_visit+: restarts the browser when a limit is reached,
# then handles cookies, the User-Agent header and the proxy.
def check_request_options(url_to_visit)
  limits = config.restart_if
  hooks = config.before_request

  # restart_if: memory limit
  memory_limit = limits[:memory_limit]
  if memory_limit
    memory = driver.current_memory
    if memory && memory >= memory_limit
      logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  # restart_if: requests limit
  requests_limit = limits[:requests_limit]
  if requests_limit
    requests = driver.requests
    if requests >= requests_limit
      logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  # cookies
  # (Selenium only) if config.cookies present and browser was just created,
  # visit url_to_visit first and only then set cookies:
  if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
    visit(url_to_visit, skip_request_options: true)
    config.cookies.each { |cookie| driver.set_cookie(cookie[:name], cookie[:value], cookie) }
  end

  if hooks[:clear_cookies]
    driver.clear_cookies
    logger.debug "Browser: cleared cookies before request"
  end

  if hooks[:clear_and_set_cookies]
    driver.clear_cookies

    # (Selenium only) if the browser has not visited any page yet, visit
    # url_to_visit first and then set cookies (needed after browser restart):
    visit(url_to_visit, skip_request_options: true) if driver.visited.nil? && mode.match?(/selenium/)

    config.cookies.each { |cookie| driver.set_cookie(cookie[:name], cookie[:value], cookie) }

    logger.debug "Browser: cleared and set cookies before request"
  end

  # user_agent
  if hooks[:change_user_agent]
    driver.add_header("User-Agent", config.user_agent.call)
    logger.debug "Browser: changed user_agent before request"
  end

  # proxy
  if hooks[:change_proxy]
    proxy_string = config.proxy.call
    driver.set_proxy(*proxy_string.split(":"))
    logger.debug "Browser: changed proxy before request"
  end
end
|
244
|
+
|
245
|
+
# Logger of the spider attached to this session.
def logger
  owner = spider
  owner.logger
end
|
248
|
+
end
|
249
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'cliver'
|
2
|
+
|
3
|
+
module Tanakai
  class CLI
    # Builds the `ansible-playbook` command line (as an array of arguments)
    # for one of the playbooks shipped in the gem's automation directory.
    class AnsibleCommandBuilder
      # user_host: "user@host" or just "host"
      # options:   CLI options hash with string keys ("port", "local",
      #            "ask-sudo", "ask-auth-pass", "ssh-key-path")
      # playbook:  playbook name without the .yml extension
      # vars:      extra variables handed to the playbook
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Returns the command as an array of strings.
      #
      # Variables from config/automation.yml (the section named after the
      # playbook) fill in any variable not already given. Raises RuntimeError
      # when `ansible-playbook` (or `sshpass`, for password authentication)
      # is not installed.
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
            "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        user = @user_host[/(.*?)\@/, 1]
        host = @user_host[/\@(.+)/, 1] || @user_host
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        gem_dir = Gem::Specification.find_by_name("tanakai").gem_dir
        playbook_path = gem_dir + "/lib/tanakai/automation/" + "#{@playbook}.yml"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        # Merge into a copy so the hash handed to #initialize stays untouched.
        vars = @vars.dup

        # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
        if File.exist? "config/automation.yml"
          require 'yaml'
          if config = YAML.load_file("config/automation.yml").dig(@playbook)
            config.each { |key, value| vars[key] = value unless vars[key] }
          end
        end

        vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        command.push "--user", user if user
        command.push "--ask-become-pass" if @options["ask-sudo"]

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
              "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        if ssh_key_path = @options["ssh-key-path"]
          command.push "--private-key", ssh_key_path
        end

        command
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module Tanakai
|
2
|
+
class CLI
|
3
|
+
class Generator < Thor::Group
|
4
|
+
include Thor::Actions
|
5
|
+
|
6
|
+
# Root for Thor's file actions: the directory one level above the directory
# that contains this file.
def self.source_root
  File.expand_path('../..', __FILE__)
end
|
9
|
+
|
10
|
+
# Copies the "template" directory to +project_name+, then runs
# `bundle install` and `git init` inside the new directory.
def generate_project(project_name)
  directory "template", project_name

  inside(project_name) do
    ["bundle install", "git init"].each { |command| run command }
  end
end
|
17
|
+
|
18
|
+
# Creates a spider file from a built-in skeleton.
#
# in_project: true  -> spiders/<name>.rb, class inherits ApplicationSpider
#             false -> ./<name>.rb as a standalone script: inherits
#                      Tanakai::Base, uses the :mechanize engine, requires
#                      the gem and ends with a `crawl!` call
#
# Raises RuntimeError when the target file already exists.
def generate_spider(spider_name, in_project:)
  spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
  # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
  raise "Spider #{spider_path} already exists" if File.exist? spider_path

  spider_class = to_spider_class(spider_name)
  create_file spider_path do
    <<~RUBY
      class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Tanakai::Base'}
        @name = "#{spider_name}"
        @start_urls = []
        @config = {}

        def parse(response, url:, data: {})
        end
      end
    RUBY
  end

  unless in_project
    # Two-space indent so the inserted line lines up with the skeleton body.
    insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
    prepend_to_file spider_path, "require 'tanakai'\n\n"
    append_to_file spider_path, "\n#{spider_class}.crawl!"
  end
end
|
42
|
+
|
43
|
+
# Copies the template schedule file to ./schedule.rb.
def generate_schedule
  template = "template/config/schedule.rb"
  destination = "./schedule.rb"
  copy_file template, destination
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
# Turns a spider name into a class name: the first character and every
# segment after "_" or "." are capitalized with the separator dropped, and
# each "-" becomes "Dash" followed by the capitalized segment
# ("example_spider" -> "ExampleSpider", "github-com" -> "GithubDashCom").
def to_spider_class(string)
  capitalized = string.sub(/^./) { |first| first.capitalize }

  underscores_done = capitalized.gsub(/(?:_|(\/))([a-z\d]*)/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2).capitalize}"
  end

  dashes_done = underscores_done.gsub(/(?:-|(\/))([a-z\d]*)/) do
    "Dash#{Regexp.last_match(2).capitalize}"
  end

  dashes_done.gsub(/(?:\.|(\/))([a-z\d]*)/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2).capitalize}"
  end
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|