tanakai 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +118 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/tanakai +6 -0
  12. data/lib/tanakai/automation/deploy.yml +54 -0
  13. data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
  14. data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
  15. data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
  16. data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
  17. data/lib/tanakai/automation/setup.yml +45 -0
  18. data/lib/tanakai/base/saver.rb +106 -0
  19. data/lib/tanakai/base/storage.rb +54 -0
  20. data/lib/tanakai/base.rb +326 -0
  21. data/lib/tanakai/base_helper.rb +22 -0
  22. data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
  23. data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
  24. data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
  25. data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  26. data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
  27. data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
  28. data/lib/tanakai/browser_builder.rb +20 -0
  29. data/lib/tanakai/capybara_configuration.rb +10 -0
  30. data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
  31. data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
  32. data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
  33. data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
  34. data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
  35. data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
  36. data/lib/tanakai/capybara_ext/session/config.rb +22 -0
  37. data/lib/tanakai/capybara_ext/session.rb +249 -0
  38. data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
  39. data/lib/tanakai/cli/generator.rb +57 -0
  40. data/lib/tanakai/cli.rb +183 -0
  41. data/lib/tanakai/core_ext/array.rb +14 -0
  42. data/lib/tanakai/core_ext/hash.rb +5 -0
  43. data/lib/tanakai/core_ext/numeric.rb +19 -0
  44. data/lib/tanakai/core_ext/string.rb +7 -0
  45. data/lib/tanakai/pipeline.rb +33 -0
  46. data/lib/tanakai/runner.rb +60 -0
  47. data/lib/tanakai/template/.gitignore +18 -0
  48. data/lib/tanakai/template/Gemfile +28 -0
  49. data/lib/tanakai/template/README.md +3 -0
  50. data/lib/tanakai/template/config/application.rb +37 -0
  51. data/lib/tanakai/template/config/automation.yml +13 -0
  52. data/lib/tanakai/template/config/boot.rb +22 -0
  53. data/lib/tanakai/template/config/initializers/.keep +0 -0
  54. data/lib/tanakai/template/config/schedule.rb +57 -0
  55. data/lib/tanakai/template/db/.keep +0 -0
  56. data/lib/tanakai/template/helpers/application_helper.rb +3 -0
  57. data/lib/tanakai/template/lib/.keep +0 -0
  58. data/lib/tanakai/template/log/.keep +0 -0
  59. data/lib/tanakai/template/pipelines/saver.rb +11 -0
  60. data/lib/tanakai/template/pipelines/validator.rb +24 -0
  61. data/lib/tanakai/template/spiders/application_spider.rb +143 -0
  62. data/lib/tanakai/template/tmp/.keep +0 -0
  63. data/lib/tanakai/version.rb +3 -0
  64. data/lib/tanakai.rb +54 -0
  65. data/tanakai.gemspec +50 -0
  66. metadata +382 -0
@@ -0,0 +1,62 @@
1
+ require 'pathname'
2
+
3
# Additions to Capybara's base driver used by the session extensions:
# request/response counters, a `visited` flag and a memory probe for the
# driver's process tree. Concrete drivers are expected to provide `pid`.
class Capybara::Driver::Base
  attr_accessor :visited
  attr_writer :requests, :responses

  # Number of requests counted for this driver (starts at 0).
  def requests
    @requests ||= 0
  end

  # Number of responses counted for this driver (starts at 0).
  def responses
    @responses ||= 0
  end

  # Sums the memory of the driver process and all of its descendants.
  #
  # @return [Integer] the sum of `get_process_memory` over the process tree,
  #   or 0 when the driver has no usable pid
  # @raise [RuntimeError] on platforms other than Linux and macOS
  def current_memory
    driver_pid = pid
    # A nil/0 pid is not a real driver process. Walking the tree from pid 0
    # would cover every process on the machine, so report nothing instead.
    return 0 unless driver_pid.to_i.positive?

    process_ids = (get_descendant_processes(driver_pid) << driver_pid).uniq
    process_ids.sum { |process_id| get_process_memory(process_id) }
  end

  private

  # Returns the pids of every descendant of +base+ (children, grandchildren,
  # ...), not including +base+ itself, based on the output of `ps -eo pid,ppid`.
  def get_descendant_processes(base)
    # Each pid maps to an array holding itself plus (by reference) the arrays
    # of its children, so flattening a node yields its whole subtree.
    descendants = Hash.new { |table, key| table[key] = [key] }
    `ps -eo pid,ppid`.scan(/\d+/).map(&:to_i).each_slice(2) do |child, parent|
      descendants[parent] << descendants[child]
    end

    descendants[base].flatten - [base]
  end

  # https://github.com/schneems/get_process_mem
  # Note: on Linux this takes PSS (not RSS) memory, read from /proc/<pid>/smaps;
  # on macOS it takes RSS from `ps`.
  #
  # @param pid [Integer] process id to measure
  # @return [Integer] memory figure for the process, 0 when it can't be read
  # @raise [RuntimeError] when the platform is neither Linux nor macOS
  def get_process_memory(pid)
    case @platform ||= Gem::Platform.local.os
    when "linux"
      begin
        smaps = Pathname.new("/proc/#{pid}/smaps")
        return 0 unless smaps.exist?

        smaps.each_line.sum do |line|
          # Match "Pss:" exactly: newer kernels also emit "Pss_Dirty:" lines,
          # which must not be added on top of the Pss total.
          matched = line.match(/\APss:\s+(?<value>(\d*\.{0,1}\d+))\s+(?<unit>\w\w)/)
          matched ? matched[:value].to_i : 0
        end
      rescue Errno::EACCES, Errno::ENOENT, Errno::ESRCH
        # No permission, or the process exited while it was being inspected.
        0
      end
    when "darwin"
      rss = `ps -o rss= -p #{pid}`.strip
      rss.empty? ? 0 : rss.to_i
    else
      raise "Can't check process memory, wrong type of platform: #{@platform}"
    end
  end
end
@@ -0,0 +1,71 @@
1
+ require 'mechanize'
2
+ require_relative '../driver/base'
3
+
4
class Capybara::Mechanize::Driver
  # Extends capybara-mechanize with Poltergeist-like methods
  # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver

  # Configures the proxy on the underlying Mechanize agent.
  # `type` is accepted for signature compatibility but not forwarded:
  # it is always "http", "socks" is not supported (yet).
  def set_proxy(ip, port, type, user = nil, password = nil)
    mechanize_agent = browser.agent
    mechanize_agent.set_proxy(ip, port, user, password)
  end

  ###

  # Request headers currently set on the Mechanize agent.
  def headers
    mechanize_agent = browser.agent
    mechanize_agent.request_headers
  end

  # Replaces the agent's request headers.
  def headers=(headers)
    mechanize_agent = browser.agent
    mechanize_agent.request_headers = headers
  end

  # Sets a single request header on the agent.
  def add_header(name, value)
    request_headers = browser.agent.request_headers
    request_headers[name] = value
  end

  ###

  # Cookies known to the Mechanize agent.
  def get_cookies
    mechanize_agent = browser.agent
    mechanize_agent.cookies
  end

  # Adds a cookie to the agent's cookie jar. `name`/`value` are written into
  # `options` only when it has no :name/:value of its own; the path is
  # always set to "/".
  def set_cookie(name, value, options = {})
    options[:name] = name unless options[:name]
    options[:value] = value unless options[:value]

    attributes = options.merge(path: "/")
    browser.agent.cookie_jar << Mechanize::Cookie.new(attributes)
  end

  # Adds every cookie hash from `cookies` via #set_cookie.
  def set_cookies(cookies)
    cookies.each { |cookie| set_cookie(cookie[:name], cookie[:value], cookie) }
  end

  # Empties the agent's cookie jar.
  def clear_cookies
    jar = browser.agent.cookie_jar
    jar.clear!
  end

  ###

  # Shuts the Mechanize agent down.
  def quit
    mechanize_agent = browser.agent
    mechanize_agent.shutdown
  end

  ###

  # Reset parent method `current_memory` for mechanize (we can't measure memory of Mechanize driver)
  def current_memory; end

  # Always nil for this driver.
  def pid; end

  # Always nil for this driver.
  def port; end
end
@@ -0,0 +1,13 @@
1
+ require_relative '../driver/base'
2
+
3
module Capybara::Poltergeist
  # Gives the Poltergeist driver the `pid`/`port` readers that the other
  # driver extensions in this gem also provide.
  class Driver
    # Process id of the driver, as reported by `client_pid`.
    def pid
      client_pid
    end

    # Port reported by the driver's `server`.
    def port
      driver_server = server
      driver_server.port
    end
  end
end
@@ -0,0 +1,34 @@
1
+ require_relative '../driver/base'
2
+
3
# Cookie helpers and `pid`/`port` readers for the Selenium driver.
class Capybara::Selenium::Driver
  # All cookies known to the browser.
  def get_cookies
    cookie_manager = browser.manage
    cookie_manager.all_cookies
  end

  # Adds a cookie to the browser. `name`/`value` are written into `options`
  # only when it has no :name/:value of its own.
  def set_cookie(name, value, options = {})
    options[:name] = name unless options[:name]
    options[:value] = value unless options[:value]

    cookie_manager = browser.manage
    cookie_manager.add_cookie(options)
  end

  # Adds every cookie hash from `cookies` via #set_cookie.
  def set_cookies(cookies)
    cookies.each { |cookie| set_cookie(cookie[:name], cookie[:value], cookie) }
  end

  # Deletes all cookies in the browser.
  def clear_cookies
    cookie_manager = browser.manage
    cookie_manager.delete_all_cookies
  end

  ###

  # Pid reported by `lsof` for the driver's TCP port (memoized).
  # `to_i` turns empty lsof output into 0.
  def pid
    return @pid if @pid

    @pid = `lsof -i tcp:#{port} -t`.strip.to_i
  end

  # Port of the driver's server URL, read from the bridge's HTTP client
  # through its instance variables (memoized).
  def port
    @port ||= begin
      http_client = browser.send(:bridge).instance_variable_get("@http")
      http_client.instance_variable_get("@server_url").port
    end
  end
end
@@ -0,0 +1,22 @@
1
module Capybara
  # Extra per-session settings read by the session extensions.
  class SessionConfig
    attr_accessor :cookies, :proxy, :user_agent, :encoding
    attr_writer :retry_request_errors, :skip_request_errors

    # Errors that should trigger a retry; an empty array until assigned.
    def retry_request_errors
      @retry_request_errors || (@retry_request_errors = [])
    end

    # Errors that should be skipped; an empty array until assigned.
    def skip_request_errors
      @skip_request_errors || (@skip_request_errors = [])
    end

    # Restart conditions; a hash that is created on first access and has no writer.
    def restart_if
      @restart_if || (@restart_if = {})
    end

    # Before-request options; a hash that is created on first access and has no writer.
    def before_request
      @before_request || (@before_request = {})
    end
  end
end
@@ -0,0 +1,249 @@
1
+ require 'capybara'
2
+ require 'nokogiri'
3
+ require 'json'
4
+ require_relative 'session/config'
5
+
6
module Capybara
  # Session extensions: a `visit` that counts requests, applies per-request
  # options (delay, cookies, user agent, proxy, restarts) and retries or skips
  # configured errors, plus helpers for driver lifecycle and response parsing.
  # All of the extra behaviour is active only when `spider` is set.
  class Session
    attr_accessor :spider

    alias_method :original_visit, :visit

    # Visits +visit_uri+.
    #
    # Without a spider this is the original Capybara `visit`. With a spider it
    # sleeps for +delay+ (if any), applies the request options, updates the
    # counters and handles errors according to the session config.
    #
    # @param visit_uri [String] url to open
    # @param delay [Numeric, Range, nil] pause before the request
    # @param skip_request_options [Boolean] don't run `check_request_options`
    # @param max_retries [Integer] how many times a "retry" error is retried
    # @return [true] on success, [false] when the error was skipped, [nil]
    #   when retries ran out and the error is marked `skip_on_failure`
    # @raise the original error when it is neither skipped nor recoverable
    def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
      return original_visit(visit_uri) unless spider

      process_delay(delay) if delay
      retries = 0
      sleep_interval = 0

      begin
        check_request_options(visit_uri) unless skip_request_options
        driver.requests += 1
        logger.info "Browser: started get request to: #{visit_uri}"
        spider.class.update(:visits, :requests) if spider.with_info

        original_visit(visit_uri)
      rescue => e
        if match_error?(e, type: :to_skip)
          logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
          spider.add_event(:requests_errors, e.inspect) if spider.with_info
          false
        elsif match_error?(e, type: :to_retry)
          logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}"
          spider.add_event(:requests_errors, e.inspect) if spider.with_info

          if (retries += 1) <= max_retries
            # The pause grows by 15 seconds with every retry.
            sleep_interval += 15
            logger.info "Browser: sleep #{sleep_interval} seconds and process retry № #{retries} to the url: #{visit_uri}"
            sleep sleep_interval
            retry
          else
            logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
            raise e unless skip_error_on_failure?(e)
          end
        else
          raise e
        end
      else
        driver.responses += 1
        logger.info "Browser: finished get request to: #{visit_uri}"
        spider.class.update(:visits, :responses) if spider.with_info
        driver.visited = true unless driver.visited
        true
      ensure
        if spider.with_info
          logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
        end

        if memory = driver.current_memory
          logger.debug "Browser: driver.current_memory: #{memory}"
        end
      end
    end

    # Quits and forgets the current driver, if there is one.
    def destroy_driver!
      if @driver
        begin
          @driver.quit
        # handle Net::ReadTimeout error for Selenium like drivers
        rescue Net::ReadTimeout
          @driver.quit
        end

        @driver = nil
        logger.info "Browser: driver #{mode} has been destroyed"
      else
        logger.warn "Browser: driver #{mode} is not present"
      end
    end

    # Restarts the browser: in place for Poltergeist (resetting the counters),
    # by destroying and re-creating the driver for every other engine.
    def restart!
      if mode.match?(/poltergeist/)
        @driver.browser.restart
        @driver.requests, @driver.responses = 0, 0
      else
        destroy_driver!
        driver
      end

      logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
    end

    # Parses the current page body.
    #
    # @param response_type [Symbol] :html (Nokogiri document) or :json
    # @return [Nokogiri::HTML::Document, Object, nil] nil for any other type
    def current_response(response_type = :html)
      case response_type
      when :html
        if config.encoding
          if config.encoding == :auto
            # Scan a copy: force_encoding changes its receiver in place and the
            # string handed out by the driver must stay untouched.
            charset = body.dup.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["]?([\w+\d+\-]*)/i, 1]
            Nokogiri::HTML(body, nil, charset)
          else
            Nokogiri::HTML(body, nil, config.encoding)
          end
        else
          Nokogiri::HTML(body)
        end
      when :json
        JSON.parse(body)
      end
    end

    ###

    # Handy method to perform some processing in the new tab within block and then automatically close this tab:
    # Usage (url):
    # browser.within_new_window_by(url: "https://google.com") do
    #   do some stuff and then automatically close this tab and return back to the first tab
    # end
    # Usage (action) (when new tab opening by some action, for example by clicking
    # on a particular element):
    # action = -> { browser.find("//some/element/path").click }
    # browser.within_new_window_by(action: action) do
    #   do some stuff and then automatically close this tab and return back to the first tab
    # end
    #
    # The tab is closed even when the block (or the visit) raises.
    def within_new_window_by(action: nil, url: nil)
      case
      when action
        opened_window = window_opened_by { action.call }
        within_window(opened_window) do
          begin
            yield
          ensure
            current_window.close
          end
        end
      when url
        within_window(open_new_window) do
          begin
            visit(url)

            yield
          ensure
            current_window.close
          end
        end
      end
    end

    ###

    # Scrolls the window down by 10000 pixels.
    def scroll_to_bottom
      execute_script("window.scrollBy(0,10000)")
    end

    private

    # True when a hash entry of `retry_request_errors` matches the class of
    # +e+ and is flagged with :skip_on_failure.
    def skip_error_on_failure?(e)
      config.retry_request_errors.any? do |error|
        error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash)
      end
    end

    # Checks +e+ against the configured error list for +type+
    # (:to_retry or :to_skip). List entries are either an error class or a
    # hash with :error and an optional :message (String or Regexp).
    def match_error?(e, type:)
      errors =
        case type
        when :to_retry then config.retry_request_errors
        when :to_skip then config.skip_request_errors
        end

      errors.any? do |error|
        if error.kind_of?(Hash)
          match_class = e.class.ancestors.include?(error[:error])
          if error[:message].present?
            if error[:message].kind_of?(Regexp)
              e.message&.match?(error[:message])
            else
              e.message&.include?(error[:message])
            end && match_class
          else
            match_class
          end
        else
          e.class.ancestors.include?(error)
        end
      end
    end

    # Sleeps for +delay+ seconds; a Range picks a random value from it.
    def process_delay(delay)
      interval = (delay.class == Range ? rand(delay) : delay)
      logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
      sleep interval
    end

    # Applies the `restart_if` and `before_request` options before a request.
    def check_request_options(url_to_visit)
      # restart_if
      if memory_limit = config.restart_if[:memory_limit]
        memory = driver.current_memory
        if memory && memory >= memory_limit
          logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      if requests_limit = config.restart_if[:requests_limit]
        requests = driver.requests
        if requests >= requests_limit
          logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      # cookies
      # (Selenium only) if config.cookies present and browser was just created,
      # visit url_to_visit first and only then set cookies:
      if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
        visit(url_to_visit, skip_request_options: true)
        config.cookies.each do |cookie|
          driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end
      end

      if config.before_request[:clear_cookies]
        driver.clear_cookies
        logger.debug "Browser: cleared cookies before request"
      end

      if config.before_request[:clear_and_set_cookies]
        driver.clear_cookies

        # (Selenium only) if browser is not visited yet any page, visit url_to_visit
        # first and then set cookies (needs after browser restart):
        if driver.visited.nil? && mode.match?(/selenium/)
          visit(url_to_visit, skip_request_options: true)
        end

        # config.cookies may be unset; then there is simply nothing to set.
        (config.cookies || []).each do |cookie|
          driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end

        logger.debug "Browser: cleared and set cookies before request"
      end

      # user_agent
      if config.before_request[:change_user_agent]
        driver.add_header("User-Agent", config.user_agent.call)
        logger.debug "Browser: changed user_agent before request"
      end

      # proxy
      if config.before_request[:change_proxy]
        proxy_string = config.proxy.call
        # At most five fields are passed to set_proxy, so a ":" inside the
        # last one (the password) stays part of it.
        driver.set_proxy(*proxy_string.split(":", 5))
        logger.debug "Browser: changed proxy before request"
      end
    end

    # Logger of the attached spider.
    def logger
      spider.logger
    end
  end
end
@@ -0,0 +1,71 @@
1
+ require 'cliver'
2
+
3
module Tanakai
  class CLI
    # Builds the argument list for an `ansible-playbook` run of one of the
    # playbooks in the gem's lib/tanakai/automation directory.
    class AnsibleCommandBuilder
      # @param user_host [String] "user@host" or just "host"
      # @param options [Hash] CLI options with string keys ("port", "local",
      #   "ask-sudo", "ask-auth-pass", "ssh-key-path")
      # @param playbook [String] playbook name (file name without ".yml")
      # @param vars [Hash] extra variables for the playbook
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Assembles the command.
      #
      # @return [Array<String>] executable name followed by its arguments
      # @raise [RuntimeError] when `ansible-playbook` (or `sshpass`, if
      #   password authentication was requested) can't be found
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
            "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        user = @user_host[/(.*?)\@/, 1]
        host = @user_host[/\@(.+)/, 1] || @user_host
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        gem_dir = Gem::Specification.find_by_name("tanakai").gem_dir
        playbook_path = gem_dir + "/lib/tanakai/automation/" + "#{@playbook}.yml"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        extra_vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        command.push "--user", user if user
        command.push "--ask-become-pass" if @options["ask-sudo"]

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
              "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        if ssh_key_path = @options["ssh-key-path"]
          command.push "--private-key", ssh_key_path
        end

        command
      end

      private

      # The given vars, completed with values from config/automation.yml for
      # keys that are missing (or nil/false). Works on a copy so the hash
      # passed to the constructor is left as it was.
      def extra_vars
        vars = @vars.dup
        automation_config.each { |key, value| vars[key] = value unless vars[key] }
        vars
      end

      # Section of config/automation.yml for the current playbook, or an empty
      # hash when the file is absent, empty or has no mapping for the playbook.
      def automation_config
        return {} unless File.exist?("config/automation.yml")

        # Loaded lazily: only needed when the project has an automation config.
        require 'yaml'
        section = YAML.load_file("config/automation.yml")
        section = section[@playbook] if section.is_a?(Hash)
        section.is_a?(Hash) ? section : {}
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
module Tanakai
  class CLI
    # Thor generator for new projects, spiders and the schedule file.
    class Generator < Thor::Group
      include Thor::Actions

      # Directory the templates are resolved against: the parent of the
      # directory this file lives in.
      def self.source_root
        File.dirname(File.expand_path('..', __FILE__))
      end

      # Copies the "template" directory to +project_name+ and runs
      # `bundle install` and `git init` inside it.
      def generate_project(project_name)
        directory "template", project_name
        inside(project_name) do
          run "bundle install"
          run "git init"
        end
      end

      # Creates a spider file: spiders/<name>.rb inside a project, ./<name>.rb
      # otherwise. A standalone spider additionally requires the gem, uses the
      # :mechanize engine and calls `crawl!` at the end of the file.
      #
      # @param spider_name [String] snake_case spider name
      # @param in_project [Boolean] whether the spider belongs to a project
      # @raise [RuntimeError] when the target file already exists
      def generate_spider(spider_name, in_project:)
        spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
        # File.exist? — the `exists?` alias is gone in current Ruby versions.
        raise "Spider #{spider_path} already exists" if File.exist? spider_path

        spider_class = to_spider_class(spider_name)
        create_file spider_path do
          <<~RUBY
            class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Tanakai::Base'}
              @name = "#{spider_name}"
              @start_urls = []
              @config = {}

              def parse(response, url:, data: {})
              end
            end
          RUBY
        end

        unless in_project
          # The inserted line carries the same two-space indent as the other
          # class-body lines of the template above.
          insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
          prepend_to_file spider_path, "require 'tanakai'\n\n"
          append_to_file spider_path, "\n#{spider_class}.crawl!"
        end
      end

      # Copies the template schedule file to ./schedule.rb.
      def generate_schedule
        copy_file "template/config/schedule.rb", "./schedule.rb"
      end

      private

      # Turns a spider name into a class name: the first character and every
      # segment after "_", "/" or "." are capitalized, and "-" becomes "Dash".
      def to_spider_class(string)
        string.sub(/^./) { $&.capitalize }
          .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
          .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
          .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
      end
    end
  end
end