tanakai 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +118 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/tanakai +6 -0
  12. data/lib/tanakai/automation/deploy.yml +54 -0
  13. data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
  14. data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
  15. data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
  16. data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
  17. data/lib/tanakai/automation/setup.yml +45 -0
  18. data/lib/tanakai/base/saver.rb +106 -0
  19. data/lib/tanakai/base/storage.rb +54 -0
  20. data/lib/tanakai/base.rb +326 -0
  21. data/lib/tanakai/base_helper.rb +22 -0
  22. data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
  23. data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
  24. data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
  25. data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  26. data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
  27. data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
  28. data/lib/tanakai/browser_builder.rb +20 -0
  29. data/lib/tanakai/capybara_configuration.rb +10 -0
  30. data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
  31. data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
  32. data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
  33. data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
  34. data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
  35. data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
  36. data/lib/tanakai/capybara_ext/session/config.rb +22 -0
  37. data/lib/tanakai/capybara_ext/session.rb +249 -0
  38. data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
  39. data/lib/tanakai/cli/generator.rb +57 -0
  40. data/lib/tanakai/cli.rb +183 -0
  41. data/lib/tanakai/core_ext/array.rb +14 -0
  42. data/lib/tanakai/core_ext/hash.rb +5 -0
  43. data/lib/tanakai/core_ext/numeric.rb +19 -0
  44. data/lib/tanakai/core_ext/string.rb +7 -0
  45. data/lib/tanakai/pipeline.rb +33 -0
  46. data/lib/tanakai/runner.rb +60 -0
  47. data/lib/tanakai/template/.gitignore +18 -0
  48. data/lib/tanakai/template/Gemfile +28 -0
  49. data/lib/tanakai/template/README.md +3 -0
  50. data/lib/tanakai/template/config/application.rb +37 -0
  51. data/lib/tanakai/template/config/automation.yml +13 -0
  52. data/lib/tanakai/template/config/boot.rb +22 -0
  53. data/lib/tanakai/template/config/initializers/.keep +0 -0
  54. data/lib/tanakai/template/config/schedule.rb +57 -0
  55. data/lib/tanakai/template/db/.keep +0 -0
  56. data/lib/tanakai/template/helpers/application_helper.rb +3 -0
  57. data/lib/tanakai/template/lib/.keep +0 -0
  58. data/lib/tanakai/template/log/.keep +0 -0
  59. data/lib/tanakai/template/pipelines/saver.rb +11 -0
  60. data/lib/tanakai/template/pipelines/validator.rb +24 -0
  61. data/lib/tanakai/template/spiders/application_spider.rb +143 -0
  62. data/lib/tanakai/template/tmp/.keep +0 -0
  63. data/lib/tanakai/version.rb +3 -0
  64. data/lib/tanakai.rb +54 -0
  65. data/tanakai.gemspec +50 -0
  66. metadata +382 -0
@@ -0,0 +1,62 @@
1
+ require 'pathname'
2
+
3
+ class Capybara::Driver::Base
4
+ attr_accessor :visited
5
+ attr_writer :requests, :responses
6
+
7
# Counter of requests started through this driver.
# Reads as 0 until a value has been assigned via the writer.
def requests
  @requests = 0 unless @requests
  @requests
end
10
+
11
# Counter of responses received through this driver.
# Reads as 0 until a value has been assigned via the writer.
def responses
  @responses = 0 unless @responses
  @responses
end
14
+
15
# Total memory of the driver process plus all of its descendant processes,
# i.e. the sum of get_process_memory over that set of pids.
# Relies on the concrete driver providing #pid.
def current_memory
  root_pid = pid

  # The driver itself is appended to its descendants; uniq guards against
  # the same pid being counted twice.
  tracked_pids = get_descendant_processes(root_pid).push(root_pid).uniq
  tracked_pids.sum { |process_id| get_process_memory(process_id) }
end
21
+
22
+ private
23
+
24
# Returns the pids of every process descended from +base+ (children,
# grandchildren, ...) according to one `ps -eo pid,ppid` snapshot.
# +base+ itself is not part of the result.
def get_descendant_processes(base)
  # Every pid lazily gets the entry [pid]. A child's entry is nested (by
  # reference) inside its parent's entry, so flattening a parent later on
  # yields its whole subtree regardless of the order rows were processed in.
  tree = Hash.new { |table, key| table[key] = [key] }

  # All numbers in the output, read pairwise as pid => ppid.
  numbers = `ps -eo pid,ppid`.scan(/\d+/).map(&:to_i)
  numbers.each_slice(2).to_h.each do |child, parent|
    tree[parent] << tree[child]
  end

  tree[base].flatten - [base]
end
32
+
33
# Adapted from https://github.com/schneems/get_process_mem
#
# Returns the memory used by the process +pid+ as an Integer, in the unit the
# OS reports (kB for /proc/<pid>/smaps according to proc(5)):
#   * Linux  - PSS (not RSS), summed over every mapping in /proc/<pid>/smaps.
#              PSS fits better here because shared pages are split between
#              the processes sharing them instead of being counted repeatedly.
#   * Darwin - RSS as printed by `ps -o rss=`.
#
# Returns 0 when the process cannot be inspected (it is gone, or its smaps
# file is not readable). Raises RuntimeError on any other platform.
def get_process_memory(pid)
  case @platform ||= Gem::Platform.local.os
  when "linux"
    begin
      smaps = Pathname.new("/proc/#{pid}/smaps")
      return 0 unless smaps.exist?

      # Only the exact "Pss:" field is summed. Matching on a bare /^Pss/
      # prefix would also pick up sibling fields such as "Pss_Dirty:" that
      # newer kernels print, counting the same pages more than once.
      smaps.each_line.sum { |line| line[/\APss:\s+(\d+)/, 1].to_i }
    rescue Errno::EACCES, Errno::ENOENT, Errno::ESRCH
      # EACCES: not allowed to read; ENOENT/ESRCH: the process exited between
      # the existence check and the read (common for short-lived children).
      0
    end
  when "darwin"
    mem = `ps -o rss= -p #{pid}`.strip
    mem.empty? ? 0 : mem.to_i
  else
    raise "Can't check process memory, wrong type of platform: #{@platform}"
  end
end
62
+ end
@@ -0,0 +1,71 @@
1
+ require 'mechanize'
2
+ require_relative '../driver/base'
3
+
4
class Capybara::Mechanize::Driver
  # Extends capybara-mechanize to support Poltergeist-like methods
  # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver

  # Routes requests through a proxy.
  # +type+ is accepted only to keep the same signature as the other drivers
  # and is ignored: the proxy is always "http", "socks" is not supported (yet).
  def set_proxy(ip, port, type, user = nil, password = nil)
    browser.agent.set_proxy(ip, port, user, password)
  end

  ###

  # Request headers of the underlying Mechanize agent.
  def headers
    browser.agent.request_headers
  end

  # Replaces the agent's request headers.
  def headers=(headers)
    browser.agent.request_headers = headers
  end

  # Sets (or overwrites) a single request header.
  def add_header(name, value)
    browser.agent.request_headers[name] = value
  end

  ###

  # Cookies currently known to the agent.
  def get_cookies
    browser.agent.cookies
  end

  # Adds a cookie to the agent's cookie jar.
  #
  # +options+ are extra cookie attributes; :name/:value given there win over
  # the positional arguments, and :path is always forced to "/".
  # The attributes are built on a copy, so the caller's hash (for example a
  # cookie definition from the session config) is never modified.
  def set_cookie(name, value, options = {})
    attributes = options.merge(path: "/")
    attributes[:name] ||= name
    attributes[:value] ||= value

    browser.agent.cookie_jar << Mechanize::Cookie.new(attributes)
  end

  # Adds every cookie hash (with :name and :value keys) from +cookies+.
  def set_cookies(cookies)
    cookies.each do |cookie|
      set_cookie(cookie[:name], cookie[:value], cookie)
    end
  end

  # Empties the agent's cookie jar.
  def clear_cookies
    browser.agent.cookie_jar.clear!
  end

  ###

  # Shuts the Mechanize agent down.
  def quit
    browser.agent.shutdown
  end

  ###

  # Reset parent method `current_memory` for mechanize (we can't measure memory of Mechanize driver)
  def current_memory
    nil
  end

  # Always nil for this driver.
  def pid
    nil
  end

  # Always nil for this driver.
  def port
    nil
  end
end
@@ -0,0 +1,13 @@
1
+ require_relative '../driver/base'
2
+
3
# Gives the Poltergeist driver the same pid/port readers the other drivers expose.
class Capybara::Poltergeist::Driver
  # Process id, as reported by the driver's own client_pid.
  def pid
    client_pid
  end

  # Port of the driver's server object.
  def port
    server.port
  end
end
@@ -0,0 +1,34 @@
1
+ require_relative '../driver/base'
2
+
3
class Capybara::Selenium::Driver
  # All cookies of the current browser session.
  def get_cookies
    browser.manage.all_cookies
  end

  # Adds a cookie to the browser.
  #
  # +options+ are extra cookie attributes; :name/:value given there win over
  # the positional arguments. The attributes are built on a copy, so the
  # caller's hash (for example a cookie definition from the session config)
  # is never modified.
  def set_cookie(name, value, options = {})
    attributes = options.dup
    attributes[:name] ||= name
    attributes[:value] ||= value

    browser.manage.add_cookie(attributes)
  end

  # Adds every cookie hash (with :name and :value keys) from +cookies+.
  def set_cookies(cookies)
    cookies.each do |cookie|
      set_cookie(cookie[:name], cookie[:value], cookie)
    end
  end

  # Deletes all cookies of the current browser session.
  def clear_cookies
    browser.manage.delete_all_cookies
  end

  ###

  # Pid of the process listening on the driver port (memoized).
  # The lookup is limited to the LISTEN socket: without that filter lsof also
  # prints processes that merely hold a connection to the port, and the first
  # pid printed is not necessarily the driver's.
  # Ends up as 0 when lsof prints nothing.
  def pid
    @pid ||= `lsof -i tcp:#{port} -sTCP:LISTEN -t`.strip.to_i
  end

  # Port of the driver server (memoized), read from the bridge's @http / @server_url
  # instance variables because no public accessor is used here.
  def port
    @port ||= browser.send(:bridge).instance_variable_get("@http").instance_variable_get("@server_url").port
  end
end
@@ -0,0 +1,22 @@
1
module Capybara
  # Extra per-session settings read by the request helpers of Capybara::Session.
  class SessionConfig
    attr_accessor :cookies, :proxy, :user_agent, :encoding
    attr_writer :retry_request_errors, :skip_request_errors

    # Error definitions that should trigger a retry; an empty Array until assigned.
    def retry_request_errors
      @retry_request_errors = [] unless @retry_request_errors
      @retry_request_errors
    end

    # Error definitions that should be skipped; an empty Array until assigned.
    def skip_request_errors
      @skip_request_errors = [] unless @skip_request_errors
      @skip_request_errors
    end

    # Restart conditions. There is no writer: callers fill the returned Hash in place.
    def restart_if
      @restart_if = {} unless @restart_if
      @restart_if
    end

    # Before-request options. There is no writer: callers fill the returned Hash in place.
    def before_request
      @before_request = {} unless @before_request
      @before_request
    end
  end
end
@@ -0,0 +1,249 @@
1
+ require 'capybara'
2
+ require 'nokogiri'
3
+ require 'json'
4
+ require_relative 'session/config'
5
+
6
+ module Capybara
7
+ class Session
8
+ attr_accessor :spider
9
+
10
+ alias_method :original_visit, :visit
11
# Visits +visit_uri+. When a spider is attached, the plain Capybara visit is
# wrapped with delays, request options, counters, logging and retry handling.
#
# visit_uri            - the url to open
# delay:               - Numeric or Range to wait before the request
#                        (defaults to config.before_request[:delay])
# skip_request_options - when true, check_request_options is not run
# max_retries:         - how many times a "retry" error is retried; the pause
#                        grows by 15 seconds with every retry
#
# Returns true when the page was visited, false when the error was skipped
# (either a "skip" error, or a "retry" error flagged skip_on_failure once all
# retries are used up). Any other error is re-raised.
# Without a spider it simply returns whatever the original visit returns.
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
  return original_visit(visit_uri) unless spider

  process_delay(delay) if delay
  retries = 0
  sleep_interval = 0

  begin
    check_request_options(visit_uri) unless skip_request_options
    driver.requests += 1
    logger.info "Browser: started get request to: #{visit_uri}"
    spider.class.update(:visits, :requests) if spider.with_info

    original_visit(visit_uri)
  rescue => e
    if match_error?(e, type: :to_skip)
      logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, e.inspect) if spider.with_info
      false
    elsif match_error?(e, type: :to_retry)
      logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, e.inspect) if spider.with_info

      if (retries += 1) <= max_retries
        sleep_interval += 15
        logger.info "Browser: sleep #{sleep_interval} seconds and process retry № #{retries} to the url: #{visit_uri}"
        sleep sleep_interval
        retry
      else
        logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
        raise unless skip_error_on_failure?(e)

        # Same result as for a skipped error, so callers always get a boolean.
        false
      end
    else
      raise
    end
  else
    driver.responses += 1
    logger.info "Browser: finished get request to: #{visit_uri}"
    spider.class.update(:visits, :responses) if spider.with_info
    driver.visited = true unless driver.visited
    true
  ensure
    # Runs after every attempt (including the ones that end in a retry).
    if spider.with_info
      logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
    end

    memory = driver.current_memory
    logger.debug "Browser: driver.current_memory: #{memory}" if memory
  end
end
59
+
60
# Quits the current driver (if any) and forgets it.
# Logs a warning instead when there is no driver.
def destroy_driver!
  unless @driver
    return logger.warn("Browser: driver #{mode} is not present")
  end

  begin
    @driver.quit
  rescue Net::ReadTimeout
    # handle Net::ReadTimeout error for Selenium like drivers: quit once more
    @driver.quit
  end

  @driver = nil
  logger.info "Browser: driver #{mode} has been destroyed"
end
75
+
76
# Restarts the browser behind this session and logs the new pid/port.
# Poltergeist engines restart their browser in place and get their counters
# zeroed; every other engine has its driver destroyed and requested again.
def restart!
  in_place = mode.match?(/poltergeist/)

  if in_place
    @driver.browser.restart
    @driver.requests = 0
    @driver.responses = 0
  else
    destroy_driver!
    driver # presumably builds a new driver on access - confirm against Capybara::Session#driver
  end

  logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
end
87
+
88
# Parses the current page body.
#
# response_type - :html (default) gives the result of Nokogiri::HTML,
#                 :json gives the result of JSON.parse.
#                 Any other value gives nil.
#
# For :html the encoding passed to Nokogiri comes from config.encoding:
#   nil/false - Nokogiri is called without an explicit encoding
#   :auto     - the charset is taken from the page's <meta ... charset=...>
#               tag (nil when there is none)
#   otherwise - the configured value is passed as is
def current_response(response_type = :html)
  case response_type
  when :html
    encoding = config.encoding

    if encoding == :auto
      html = body
      # String#force_encoding changes its receiver in place, so the charset is
      # sniffed on a copy: the body handed to Nokogiri keeps its own encoding.
      charset = html.dup.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["]?([\w+\d+\-]*)/i, 1]
      Nokogiri::HTML(html, nil, charset)
    elsif encoding
      Nokogiri::HTML(body, nil, encoding)
    else
      Nokogiri::HTML(body)
    end
  when :json
    JSON.parse(body)
  end
end
105
+
106
+ ###
107
+
108
# Handy method to perform some processing in a new tab within a block and then
# automatically close this tab.
#
# Usage (url):
#   browser.within_new_window_by(url: "https://google.com") do
#     # do some stuff; the tab is closed afterwards and the first tab is active again
#   end
#
# Usage (action) (when the new tab is opened by some action, for example by
# clicking on a particular element):
#   action = -> { browser.find("//some/element/path").click }
#   browser.within_new_window_by(action: action) do
#     # do some stuff; the tab is closed afterwards and the first tab is active again
#   end
#
# +action+ takes precedence when both are given; with neither, nothing happens
# and nil is returned. The tab is closed even when the block (or the visit)
# raises, so a failing block does not leave a stray tab behind; the error
# itself still propagates. The value of the block is handed to within_window
# as the block result.
def within_new_window_by(action: nil, url: nil)
  if action
    opened_window = window_opened_by { action.call }
    within_window(opened_window) do
      begin
        yield
      ensure
        current_window.close
      end
    end
  elsif url
    within_window(open_new_window) do
      begin
        visit(url)

        yield
      ensure
        current_window.close
      end
    end
  end
end
136
+
137
+ ###
138
+
139
# Scrolls the page down via JavaScript by a fixed 10000 pixels
# (the real page height is not measured).
def scroll_to_bottom
  script = "window.scrollBy(0,10000)"
  execute_script(script)
end
142
+
143
+ private
144
+
145
# True when +e+ matches a retry definition given as a Hash whose
# :skip_on_failure flag is set. Definitions that are not Hashes never match here.
def skip_error_on_failure?(e)
  lineage = e.class.ancestors

  config.retry_request_errors.any? do |rule|
    next false unless rule.kind_of?(Hash)

    rule[:skip_on_failure] && lineage.include?(rule[:error])
  end
end
150
+
151
# Checks +e+ against the configured error definitions.
#
# type: - :to_retry uses config.retry_request_errors,
#         :to_skip  uses config.skip_request_errors
#
# A definition is either an error class/module (matches +e+ and its
# subclasses), or a Hash with :error and an optional :message. With a
# :message, the error message must additionally match it (Regexp) or contain
# it (anything else).
def match_error?(e, type:)
  rules =
    if type == :to_retry
      config.retry_request_errors
    elsif type == :to_skip
      config.skip_request_errors
    end
  lineage = e.class.ancestors

  rules.any? do |rule|
    next lineage.include?(rule) unless rule.kind_of?(Hash)

    class_matches = lineage.include?(rule[:error])
    expected = rule[:message]
    next class_matches unless expected.present?

    message_matches =
      if expected.kind_of?(Regexp)
        e.message&.match?(expected)
      else
        e.message&.include?(expected)
      end
    message_matches && class_matches
  end
end
175
+
176
# Sleeps before a request.
# +delay+ is either a fixed number of seconds or a Range, from which a random
# value is drawn.
def process_delay(delay)
  interval =
    if delay.class == Range
      rand(delay)
    else
      delay
    end

  unit = 'second'.pluralize(interval)
  logger.debug "Browser: sleep #{interval.round(2)} #{unit} before request..."
  sleep interval
end
181
+
182
# Applies the per-request options from the session config right before a
# request to +url_to_visit+, in this order: restart conditions, cookies,
# user agent, proxy.
def check_request_options(url_to_visit)
  # restart_if: restart the browser once it uses too much memory...
  if memory_limit = config.restart_if[:memory_limit]
    memory = driver.current_memory # nil for drivers that can't measure it
    if memory && memory >= memory_limit
      logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  # ...or once it has made too many requests
  if requests_limit = config.restart_if[:requests_limit]
    requests = driver.requests
    if requests >= requests_limit
      logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  # cookies
  # (Selenium only) if config.cookies present and browser was just created,
  # visit url_to_visit first and only then set cookies:
  if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
    visit(url_to_visit, skip_request_options: true)
    config.cookies.each do |cookie|
      driver.set_cookie(cookie[:name], cookie[:value], cookie)
    end
  end

  if config.before_request[:clear_cookies]
    driver.clear_cookies
    logger.debug "Browser: cleared cookies before request"
  end

  if config.before_request[:clear_and_set_cookies]
    driver.clear_cookies

    # (Selenium only) if the browser has not visited any page yet, visit
    # url_to_visit first and then set cookies (needed after a browser restart):
    if driver.visited.nil? && mode.match?(/selenium/)
      visit(url_to_visit, skip_request_options: true)
    end

    # config.cookies may be unset; then there is simply nothing to set again.
    (config.cookies || []).each do |cookie|
      driver.set_cookie(cookie[:name], cookie[:value], cookie)
    end

    logger.debug "Browser: cleared and set cookies before request"
  end

  # user_agent: config.user_agent is called to produce the value for this request
  if config.before_request[:change_user_agent]
    driver.add_header("User-Agent", config.user_agent.call)
    logger.debug "Browser: changed user_agent before request"
  end

  # proxy: config.proxy is called and must return a colon-separated string,
  # whose parts become the positional arguments of driver.set_proxy
  if config.before_request[:change_proxy]
    proxy_string = config.proxy.call
    driver.set_proxy(*proxy_string.split(":"))
    logger.debug "Browser: changed proxy before request"
  end
end
244
+
245
# Logger of the attached spider; only usable while a spider is attached.
def logger
  owner = spider
  owner.logger
end
248
+ end
249
+ end
@@ -0,0 +1,71 @@
1
+ require 'cliver'
2
+
3
module Tanakai
  class CLI
    # Builds the argv Array for running one of the gem's bundled Ansible
    # playbooks against a host.
    class AnsibleCommandBuilder
      # user_host - "user@host" or just "host"
      # options   - Hash of CLI options, read with the String keys "port",
      #             "local", "ask-sudo", "ask-auth-pass" and "ssh-key-path"
      # playbook: - playbook name (file name without ".yml")
      # vars:     - extra variables for the playbook; defaults from
      #             config/automation.yml are merged into this Hash by #get
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Returns the command as an Array of Strings (executable first).
      #
      # Raises RuntimeError when `ansible-playbook` is not installed, or when
      # password authentication is requested and `sshpass` is not installed.
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
            "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        user = @user_host[/(.*?)\@/, 1]
        host = @user_host[/\@(.+)/, 1] || @user_host
        # The trailing comma makes Ansible read the value as a host list
        # instead of the path to an inventory file.
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        gem_dir = Gem::Specification.find_by_name("tanakai").gem_dir
        playbook_path = gem_dir + "/lib/tanakai/automation/" + "#{@playbook}.yml"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        # Project-level defaults (relative to the current directory) only fill
        # in variables that were not given explicitly.
        # File.exist? is used because File.exists? was removed in Ruby 3.2.
        if File.exist?("config/automation.yml")
          require 'yaml'
          automation = YAML.load_file("config/automation.yml")
          # An empty file, or one without a mapping for this playbook, simply
          # contributes no defaults.
          defaults = automation[@playbook] if automation.is_a?(Hash)
          if defaults.is_a?(Hash)
            defaults.each { |key, value| @vars[key] = value unless @vars[key] }
          end
        end

        @vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        if user
          command.push "--user", user
        end

        if @options["ask-sudo"]
          command.push "--ask-become-pass"
        end

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
              "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        if ssh_key_path = @options["ssh-key-path"]
          command.push "--private-key", ssh_key_path
        end

        command
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ module Tanakai
2
+ class CLI
3
+ class Generator < Thor::Group
4
+ include Thor::Actions
5
+
6
# Root directory for Thor's file actions: two levels above this file
# (the parent of the directory this file lives in).
def self.source_root
  File.expand_path('../..', __FILE__)
end
9
+
10
# Creates a new project: copies the "template" directory to +project_name+,
# then runs `bundle install` and `git init` inside the new directory.
def generate_project(project_name)
  directory("template", project_name)

  inside(project_name) do
    run("bundle install")
    run("git init")
  end
end
17
+
18
# Generates a spider skeleton file.
#
# spider_name - spider name; also used for the file name and, converted by
#               to_spider_class, for the class name
# in_project: - true:  creates spiders/<name>.rb with a class inheriting from
#                      ApplicationSpider
#               false: creates ./<name>.rb as a standalone script inheriting
#                      from Tanakai::Base (requires the gem, sets the
#                      :mechanize engine and calls crawl! at the end)
#
# Raises RuntimeError when the target file already exists.
def generate_spider(spider_name, in_project:)
  spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  raise "Spider #{spider_path} already exists" if File.exist?(spider_path)

  spider_class = to_spider_class(spider_name)
  create_file spider_path do
    <<~RUBY
      class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Tanakai::Base'}
        @name = "#{spider_name}"
        @start_urls = []
        @config = {}

        def parse(response, url:, data: {})
        end
      end
    RUBY
  end

  unless in_project
    insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
    prepend_to_file spider_path, "require 'tanakai'\n\n"
    append_to_file spider_path, "\n#{spider_class}.crawl!"
  end
end
42
+
43
# Copies the template schedule file to ./schedule.rb.
def generate_schedule
  template = "template/config/schedule.rb"
  copy_file(template, "./schedule.rb")
end
46
+
47
+ private
48
+
49
# Turns a spider name into a class name:
#   "example_spider" -> "ExampleSpider"
#   "my-site"        -> "MyDashSite"   (a dash is spelled out as "Dash")
#   "example.com"    -> "ExampleCom"
# Only lowercase letters and digits following a separator are capitalized.
def to_spider_class(string)
  capitalized = string.sub(/^./) { |first_char| first_char.capitalize }

  without_underscores = capitalized.gsub(/(?:_|(\/))([a-z\d]*)/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2).capitalize}"
  end

  without_dashes = without_underscores.gsub(/(?:-|(\/))([a-z\d]*)/) do
    "Dash#{Regexp.last_match(2).capitalize}"
  end

  without_dashes.gsub(/(?:\.|(\/))([a-z\d]*)/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2).capitalize}"
  end
end
55
+ end
56
+ end
57
+ end