tanakai 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +118 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/tanakai +6 -0
- data/lib/tanakai/automation/deploy.yml +54 -0
- data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
- data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
- data/lib/tanakai/automation/setup.yml +45 -0
- data/lib/tanakai/base/saver.rb +106 -0
- data/lib/tanakai/base/storage.rb +54 -0
- data/lib/tanakai/base.rb +326 -0
- data/lib/tanakai/base_helper.rb +22 -0
- data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
- data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
- data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/tanakai/browser_builder.rb +20 -0
- data/lib/tanakai/capybara_configuration.rb +10 -0
- data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
- data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/tanakai/capybara_ext/session/config.rb +22 -0
- data/lib/tanakai/capybara_ext/session.rb +249 -0
- data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
- data/lib/tanakai/cli/generator.rb +57 -0
- data/lib/tanakai/cli.rb +183 -0
- data/lib/tanakai/core_ext/array.rb +14 -0
- data/lib/tanakai/core_ext/hash.rb +5 -0
- data/lib/tanakai/core_ext/numeric.rb +19 -0
- data/lib/tanakai/core_ext/string.rb +7 -0
- data/lib/tanakai/pipeline.rb +33 -0
- data/lib/tanakai/runner.rb +60 -0
- data/lib/tanakai/template/.gitignore +18 -0
- data/lib/tanakai/template/Gemfile +28 -0
- data/lib/tanakai/template/README.md +3 -0
- data/lib/tanakai/template/config/application.rb +37 -0
- data/lib/tanakai/template/config/automation.yml +13 -0
- data/lib/tanakai/template/config/boot.rb +22 -0
- data/lib/tanakai/template/config/initializers/.keep +0 -0
- data/lib/tanakai/template/config/schedule.rb +57 -0
- data/lib/tanakai/template/db/.keep +0 -0
- data/lib/tanakai/template/helpers/application_helper.rb +3 -0
- data/lib/tanakai/template/lib/.keep +0 -0
- data/lib/tanakai/template/log/.keep +0 -0
- data/lib/tanakai/template/pipelines/saver.rb +11 -0
- data/lib/tanakai/template/pipelines/validator.rb +24 -0
- data/lib/tanakai/template/spiders/application_spider.rb +143 -0
- data/lib/tanakai/template/tmp/.keep +0 -0
- data/lib/tanakai/version.rb +3 -0
- data/lib/tanakai.rb +54 -0
- data/tanakai.gemspec +50 -0
- metadata +382 -0
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
# Extends every Capybara driver with request/response counters, a `visited`
# flag, and helpers that measure the memory used by the driver's process tree.
class Capybara::Driver::Base
  attr_accessor :visited
  attr_writer :requests, :responses

  # Number of requests counted for this driver; starts at 0.
  def requests
    @requests ||= 0
  end

  # Number of responses counted for this driver; starts at 0.
  def responses
    @responses ||= 0
  end

  # Sums the memory of the driver process (identified by `pid`, which this
  # class does not define itself) and all of its descendant processes.
  #
  # @return [Integer] total of the per-process values from get_process_memory
  def current_memory
    driver_pid = pid

    all = (get_descendant_processes(driver_pid) << driver_pid).uniq
    all.sum { |process_id| get_process_memory(process_id) }
  end

  private

  # Lists the pids of every descendant of `base`, using `ps -eo pid,ppid`.
  #
  # Each pid gets a list that starts with itself; every child list is nested
  # into its parent's list, so flattening the list of `base` yields the whole
  # subtree (minus `base` itself).
  def get_descendant_processes(base)
    descendants = Hash.new { |ht, k| ht[k] = [k] }
    Hash[*`ps -eo pid,ppid`.scan(/\d+/).map(&:to_i)].each do |pid, ppid|
      descendants[ppid] << descendants[pid]
    end

    descendants[base].flatten - [base]
  end

  # https://github.com/schneems/get_process_mem
  # Note: on Linux this takes PSS (not RSS) memory, summed over all mappings.
  #
  # @param pid [Integer] process id to inspect
  # @return [Integer] memory figure as printed by smaps (Linux) or
  #   `ps -o rss=` (macOS); 0 when the process cannot be inspected
  # @raise [RuntimeError] on any platform other than linux/darwin
  def get_process_memory(pid)
    case @platform ||= Gem::Platform.local.os
    when "linux"
      begin
        file = Pathname.new "/proc/#{pid}/smaps"
        return 0 unless file.exist?

        # Match the "Pss:" field exactly: newer kernels also print fields
        # such as "Pss_Dirty:", which a bare /^Pss/ prefix match would count
        # a second time.
        lines = file.each_line.select { |line| line.start_with?("Pss:") }
        return 0 if lines.empty?

        lines.reduce(0) do |sum, line|
          # The unit suffix is captured but not used; only the number is summed.
          line.match(/(?<value>(\d*\.{0,1}\d+))\s+(?<unit>\w\w)/) do |m|
            sum += m[:value].to_i
          end

          sum
        end
      rescue Errno::EACCES, Errno::ENOENT, Errno::ESRCH
        # Not readable, or the process exited between the exist? check and
        # the read: treat it as using no memory instead of failing.
        0
      end
    when "darwin"
      mem = `ps -o rss= -p #{pid}`.strip
      mem.empty? ? 0 : mem.to_i
    else
      raise "Can't check process memory, wrong type of platform: #{@platform}"
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require_relative '../driver/base'
|
3
|
+
|
4
|
+
class Capybara::Mechanize::Driver
  # Extends capybara-mechanize to support Poltergeist-like methods
  # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver

  # Points the Mechanize agent at a proxy.
  # `type` is accepted but not used: only "http" works, "socks" is not
  # supported (yet).
  def set_proxy(ip, port, type, user = nil, password = nil)
    agent = browser.agent
    agent.set_proxy(ip, port, user, password)
  end

  ###

  # Request headers currently configured on the agent.
  def headers
    browser.agent.request_headers
  end

  # Replaces the agent's request headers with `headers`.
  def headers=(headers)
    browser.agent.request_headers = headers
  end

  # Sets a single request header on the agent.
  def add_header(name, value)
    request_headers = browser.agent.request_headers
    request_headers[name] = value
  end

  ###

  # Cookies as reported by the agent.
  def get_cookies
    browser.agent.cookies
  end

  # Adds a cookie to the agent's cookie jar.
  # `options` is updated in place: :name and :value are filled in from the
  # arguments when missing. The cookie itself is always built with path "/".
  def set_cookie(name, value, options = {})
    options[:name] ||= name
    options[:value] ||= value

    attributes = options.merge(path: "/")
    browser.agent.cookie_jar << Mechanize::Cookie.new(attributes)
  end

  # Adds every cookie hash in `cookies` via set_cookie.
  def set_cookies(cookies)
    cookies.each { |attrs| set_cookie(attrs[:name], attrs[:value], attrs) }
  end

  # Empties the agent's cookie jar.
  def clear_cookies
    browser.agent.cookie_jar.clear!
  end

  ###

  # Shuts the Mechanize agent down.
  def quit
    browser.agent.shutdown
  end

  ###

  # Reset parent method `current_memory` for mechanize (we can't measure memory of Mechanize driver)
  def current_memory
    nil
  end

  # This driver reports no process id.
  def pid
    nil
  end

  # This driver reports no port.
  def port
    nil
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative '../driver/base'
|
2
|
+
|
3
|
+
# Adds cookie helpers plus pid/port lookup to the Selenium driver.
class Capybara::Selenium::Driver
  # All cookies known to the browser.
  def get_cookies
    browser.manage.all_cookies
  end

  # Adds a cookie to the browser.
  # `options` is updated in place: :name and :value are filled in from the
  # arguments when missing, then the hash is handed to the browser as is.
  def set_cookie(name, value, options = {})
    options[:name] ||= name
    options[:value] ||= value

    manager = browser.manage
    manager.add_cookie(options)
  end

  # Adds every cookie hash in `cookies` via set_cookie.
  def set_cookies(cookies)
    cookies.each { |attrs| set_cookie(attrs[:name], attrs[:value], attrs) }
  end

  # Deletes all cookies from the browser.
  def clear_cookies
    browser.manage.delete_all_cookies
  end

  ###

  # Process id found by asking `lsof` which process uses the TCP port
  # returned by #port. Memoized; the value is 0 when lsof prints nothing,
  # because the empty output is converted with to_i.
  def pid
    @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
  end

  # Port of the server URL, read from the bridge's internal "@http" and
  # "@server_url" instance variables. Memoized.
  def port
    @port ||= begin
      http_client = browser.send(:bridge).instance_variable_get("@http")
      http_client.instance_variable_get("@server_url").port
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Capybara
  # Adds per-session crawling options to Capybara's session configuration.
  class SessionConfig
    attr_accessor :cookies
    attr_accessor :proxy
    attr_accessor :user_agent
    attr_accessor :encoding
    attr_writer :retry_request_errors
    attr_writer :skip_request_errors

    # Errors that should trigger a retry; an empty array until assigned.
    def retry_request_errors
      @retry_request_errors = [] unless @retry_request_errors
      @retry_request_errors
    end

    # Errors that should be skipped; an empty array until assigned.
    def skip_request_errors
      @skip_request_errors = [] unless @skip_request_errors
      @skip_request_errors
    end

    # Hash of restart conditions; created empty on first access.
    def restart_if
      @restart_if || (@restart_if = {})
    end

    # Hash of before-request options; created empty on first access.
    def before_request
      @before_request || (@before_request = {})
    end
  end
end
|
@@ -0,0 +1,249 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'json'
|
4
|
+
require_relative 'session/config'
|
5
|
+
|
6
|
+
module Capybara
  # Extends Capybara::Session with spider-aware request handling: delays,
  # retry/skip rules for request errors, driver restarts, and cookie,
  # user-agent and proxy rotation before each request.
  class Session
    attr_accessor :spider

    alias_method :original_visit, :visit

    # Visits `visit_uri`. Without a spider attached this is the plain
    # Capybara visit. With a spider it additionally applies the configured
    # before-request options, counts requests/responses and handles errors.
    #
    # @param visit_uri [String] url to open
    # @param delay [Numeric, Range, nil] pause before the request; a Range
    #   picks a random value from it
    # @param skip_request_options [Boolean] when true, check_request_options
    #   is not run for this visit
    # @param max_retries [Integer] retry attempts for errors listed in
    #   config.retry_request_errors
    # @return [Boolean] true when the page was loaded, false when the error
    #   was skipped (only when a spider is attached)
    # @raise the original error when it matches no skip/retry rule, or when
    #   retries are exhausted and the rule has no :skip_on_failure
    def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
      return original_visit(visit_uri) unless spider

      process_delay(delay) if delay
      retries, sleep_interval = 0, 0

      begin
        check_request_options(visit_uri) unless skip_request_options
        driver.requests += 1
        logger.info "Browser: started get request to: #{visit_uri}"
        spider.class.update(:visits, :requests) if spider.with_info

        original_visit(visit_uri)
      rescue => e
        if match_error?(e, type: :to_skip)
          logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
          spider.add_event(:requests_errors, e.inspect) if spider.with_info
          false
        elsif match_error?(e, type: :to_retry)
          logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}"
          spider.add_event(:requests_errors, e.inspect) if spider.with_info

          if (retries += 1) <= max_retries
            # The pause grows by 15 seconds with every attempt.
            logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
            sleep sleep_interval
            retry
          else
            logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
            raise e unless skip_error_on_failure?(e)

            # Error is marked :skip_on_failure: report the failed visit the
            # same way as a skipped error does.
            false
          end
        else
          raise e
        end
      else
        driver.responses += 1
        logger.info "Browser: finished get request to: #{visit_uri}"
        spider.class.update(:visits, :responses) if spider.with_info
        driver.visited = true unless driver.visited
        true
      ensure
        if spider.with_info
          logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
        end

        if memory = driver.current_memory
          logger.debug "Browser: driver.current_memory: #{memory}"
        end
      end
    end

    # Quits the current driver (if any) and forgets it.
    def destroy_driver!
      if @driver
        begin
          @driver.quit
        # handle Net::ReadTimeout error for Selenium like drivers
        rescue Net::ReadTimeout
          # One more attempt; a second timeout propagates to the caller.
          @driver.quit
        end

        @driver = nil
        logger.info "Browser: driver #{mode} has been destroyed"
      else
        logger.warn "Browser: driver #{mode} is not present"
      end
    end

    # Restarts the browser. Poltergeist restarts in place and gets its
    # counters reset; any other engine is destroyed and created again by
    # calling `driver`.
    def restart!
      if mode.match?(/poltergeist/)
        @driver.browser.restart
        @driver.requests, @driver.responses = 0, 0
      else
        destroy_driver!
        driver
      end

      logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
    end

    # Parses the current page body.
    #
    # @param response_type [Symbol] :html (Nokogiri document, honouring
    #   config.encoding) or :json (JSON.parse result)
    # @return the parsed body, or nil for any other response_type
    def current_response(response_type = :html)
      case response_type
      when :html
        if config.encoding
          if config.encoding == :auto
            # Read the charset declared in the page's <meta> tag.
            charset = body.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["]?([\w+\d+\-]*)/i, 1]
            Nokogiri::HTML(body, nil, charset)
          else
            Nokogiri::HTML(body, nil, config.encoding)
          end
        else
          Nokogiri::HTML(body)
        end
      when :json
        JSON.parse(body)
      end
    end

    ###

    # Handy method to perform some processing in the new tab within block and then automatically close this tab:
    # Usage (url):
    # browser.within_new_window_by(url: "https://google.com") do
    #   do some stuff and then automatically close this tab and return back to the first tab
    # end
    # Usage (action) (when new tab opening by some action, for example by clicking
    # on a particular element):
    # action = -> { browser.find("//some/element/path").click }
    # browser.within_new_window_by(action: action) do
    #   do some stuff and then automatically close this tab and return back to the first tab
    # end
    #
    # The new tab is closed in an `ensure`, so it does not stay open when
    # the block (or the visit) raises.
    def within_new_window_by(action: nil, url: nil)
      case
      when action
        opened_window = window_opened_by { action.call }
        within_window(opened_window) do
          begin
            yield
          ensure
            current_window.close
          end
        end
      when url
        within_window(open_new_window) do
          begin
            visit(url)

            yield
          ensure
            current_window.close
          end
        end
      end
    end

    ###

    # Scrolls the page down by 10000 pixels.
    def scroll_to_bottom
      execute_script("window.scrollBy(0,10000)")
    end

    private

    # True when a Hash rule in config.retry_request_errors matches the class
    # of `e` and has :skip_on_failure set.
    def skip_error_on_failure?(e)
      config.retry_request_errors.any? do |error|
        error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash)
      end
    end

    # Checks `e` against the retry (:to_retry) or skip (:to_skip) rules.
    # A rule is either an error class, or a Hash with :error and an optional
    # :message (String or Regexp) that must also match e.message.
    def match_error?(e, type:)
      errors =
        case type
        when :to_retry then config.retry_request_errors
        when :to_skip then config.skip_request_errors
        end

      errors.any? do |error|
        if error.kind_of?(Hash)
          match_class = e.class.ancestors.include?(error[:error])
          if error[:message].present?
            if error[:message].kind_of?(Regexp)
              e.message&.match?(error[:message])
            else
              e.message&.include?(error[:message])
            end && match_class
          else
            match_class
          end
        else
          e.class.ancestors.include?(error)
        end
      end
    end

    # Sleeps for `delay` seconds, or for a random value when it is a Range.
    def process_delay(delay)
      interval = (delay.class == Range ? rand(delay) : delay)
      logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
      sleep interval
    end

    # Applies config.restart_if and config.before_request before a request.
    def check_request_options(url_to_visit)
      # restart_if
      if memory_limit = config.restart_if[:memory_limit]
        memory = driver.current_memory
        if memory && memory >= memory_limit
          logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      if requests_limit = config.restart_if[:requests_limit]
        requests = driver.requests
        if requests >= requests_limit
          logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      # cookies
      # (Selenium only) if config.cookies present and browser was just created,
      # visit url_to_visit first and only then set cookies:
      if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
        visit(url_to_visit, skip_request_options: true)
        config.cookies.each do |cookie|
          driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end
      end

      if config.before_request[:clear_cookies]
        driver.clear_cookies
        logger.debug "Browser: cleared cookies before request"
      end

      if config.before_request[:clear_and_set_cookies]
        driver.clear_cookies

        # (Selenium only) if browser is not visited yet any page, visit url_to_visit
        # first and then set cookies (needs after browser restart):
        if driver.visited.nil? && mode.match?(/selenium/)
          visit(url_to_visit, skip_request_options: true)
        end

        config.cookies.each do |cookie|
          driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end

        logger.debug "Browser: cleared and set cookies before request"
      end

      # user_agent
      if config.before_request[:change_user_agent]
        driver.add_header("User-Agent", config.user_agent.call)
        logger.debug "Browser: changed user_agent before request"
      end

      # proxy
      if config.before_request[:change_proxy]
        proxy_string = config.proxy.call
        # The string is split on ":" and passed positionally to set_proxy.
        driver.set_proxy(*proxy_string.split(":"))
        logger.debug "Browser: changed proxy before request"
      end
    end

    # Logger of the attached spider.
    def logger
      spider.logger
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'cliver'
|
2
|
+
|
3
|
+
module Tanakai
  class CLI
    # Builds the argument list for running one of the gem's bundled Ansible
    # playbooks against a remote (or local) host.
    class AnsibleCommandBuilder
      # @param user_host [String] "user@host" or just "host"
      # @param options [Hash] CLI options with String keys ("port", "local",
      #   "ask-sudo", "ask-auth-pass", "ssh-key-path")
      # @param playbook [String] playbook name, without the ".yml" suffix
      # @param vars [Hash] extra variables to pass to the playbook
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Assembles the `ansible-playbook` command.
      #
      # Values from the playbook's section of `config/automation.yml` (when
      # that file exists in the current directory) fill in variables that
      # were not given to the constructor.
      #
      # @return [Array<String>] command and arguments
      # @raise [RuntimeError] when `ansible-playbook` is not found, or when
      #   "ask-auth-pass" is set and `sshpass` is not found
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
            "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        user = @user_host[/(.*?)\@/, 1]
        host = @user_host[/\@(.+)/, 1] || @user_host
        # The trailing comma makes the value an inline host list.
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        gem_dir = Gem::Specification.find_by_name("tanakai").gem_dir
        playbook_path = gem_dir + "/lib/tanakai/automation/" + "#{@playbook}.yml"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        # Work on a copy so the hash passed to the constructor is not changed.
        extra_vars = @vars.dup

        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        if File.exist? "config/automation.yml"
          require 'yaml'
          settings = YAML.load_file("config/automation.yml")
          # An empty file does not load as a Hash; treat it as "no defaults".
          defaults = settings.is_a?(Hash) ? settings[@playbook] : nil
          if defaults
            defaults.each { |key, value| extra_vars[key] = value unless extra_vars[key] }
          end
        end

        extra_vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        if user
          command.push "--user", user
        end

        if @options["ask-sudo"]
          command.push "--ask-become-pass"
        end

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
              "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        if ssh_key_path = @options["ssh-key-path"]
          command.push "--private-key", ssh_key_path
        end

        command
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module Tanakai
  class CLI
    # Thor generator that scaffolds projects, spiders and schedule files.
    class Generator < Thor::Group
      include Thor::Actions

      # Directory that template paths are resolved against: the parent of
      # the directory containing this file.
      def self.source_root
        File.dirname(File.expand_path('..', __FILE__))
      end

      # Copies the "template" directory into `project_name`, then runs
      # `bundle install` and `git init` inside it.
      def generate_project(project_name)
        directory "template", project_name
        inside(project_name) do
          run "bundle install"
          run "git init"
        end
      end

      # Creates a spider file.
      #
      # @param spider_name [String] spider name, also used for the file name
      # @param in_project [Boolean] when true the file goes to spiders/ and
      #   the class inherits from ApplicationSpider; otherwise a standalone
      #   script based on Tanakai::Base is written to the current directory
      # @raise [RuntimeError] when the target file already exists
      def generate_spider(spider_name, in_project:)
        spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        raise "Spider #{spider_path} already exists" if File.exist? spider_path

        spider_class = to_spider_class(spider_name)
        create_file spider_path do
          <<~RUBY
            class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Tanakai::Base'}
              @name = "#{spider_name}"
              @start_urls = []
              @config = {}

              def parse(response, url:, data: {})
              end
            end
          RUBY
        end

        unless in_project
          # Turn the file into a self-contained script: pick an engine,
          # require the gem and start the crawl at the end.
          insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
          prepend_to_file spider_path, "require 'tanakai'\n\n"
          append_to_file spider_path, "\n#{spider_class}.crawl!"
        end
      end

      # Copies the schedule template to ./schedule.rb.
      def generate_schedule
        copy_file "template/config/schedule.rb", "./schedule.rb"
      end

      private

      # Derives a class name from a spider name: the first character is
      # capitalized, "_x" and ".x" become "X", and "-x" becomes "DashX"
      # (e.g. "my_spider" => "MySpider", "example-com" => "ExampleDashCom").
      def to_spider_class(string)
        string.sub(/^./) { $&.capitalize }
          .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
          .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
          .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
      end
    end
  end
end
|