kimurai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
@@ -0,0 +1,10 @@
1
+ require 'capybara'
2
+
3
# Process-wide Capybara defaults applied when this file is loaded.
Capybara.configure do |capybara|
  # No application server is started by Capybara.
  capybara.run_server = false
  # Selectors passed without an explicit type are treated as XPath.
  capybara.default_selector = :xpath
  capybara.save_path = "tmp"
  capybara.default_max_wait_time = 10
  # Hidden elements are not filtered out of query results.
  capybara.ignore_hidden_elements = false
  capybara.threadsafe = true
end
@@ -0,0 +1,62 @@
1
+ require 'pathname'
2
+
3
# Extends every Capybara driver with request/response counters, a `visited`
# flag and a way to measure the memory used by the driver's process tree.
#
# Concrete drivers are expected to provide `pid` (the driver process id);
# `current_memory` relies on it.
class Capybara::Driver::Base
  attr_accessor :visited
  attr_writer :requests, :responses

  # @return [Integer] number of requests counted so far (0 until assigned)
  def requests
    @requests ||= 0
  end

  # @return [Integer] number of responses counted so far (0 until assigned)
  def responses
    @responses ||= 0
  end

  # Sums the memory of the driver process and all of its descendants.
  #
  # @return [Integer] total of `get_process_memory` over the process tree
  # @raise [RuntimeError] on platforms other than linux/darwin
  def current_memory
    driver_pid = pid

    process_ids = (get_descendant_processes(driver_pid) << driver_pid).uniq
    process_ids.sum { |process_id| get_process_memory(process_id) }
  end

  private

  # Builds the list of all descendant pids of `base` from a `ps` snapshot.
  #
  # @param base [Integer] pid whose descendants are wanted
  # @return [Array<Integer>] descendant pids, excluding `base` itself
  def get_descendant_processes(base)
    # Each entry starts as [pid]; children arrays are appended by reference,
    # so flattening an entry yields the whole subtree.
    descendants = Hash.new { |table, key| table[key] = [key] }
    Hash[*`ps -eo pid,ppid`.scan(/\d+/).map(&:to_i)].each do |child_pid, parent_pid|
      descendants[parent_pid] << descendants[child_pid]
    end

    descendants[base].flatten - [base]
  end

  # https://github.com/schneems/get_process_mem
  # Note: for Linux takes PSS (not RSS) memory (I think PSS better fits in this case)
  #
  # @param pid [Integer] process id to measure
  # @return [Integer] sum of the Pss values from /proc/<pid>/smaps on linux,
  #   the `ps -o rss=` value on darwin, or 0 when the process cannot be read
  # @raise [RuntimeError] on any other platform
  def get_process_memory(pid)
    case @platform ||= Gem::Platform.local.os
    when "linux"
      begin
        file = Pathname.new "/proc/#{pid}/smaps"
        return 0 unless file.exist?

        pss_lines = file.each_line.select { |line| line.match(/^Pss/) }
        # A line that does not match the value pattern contributes 0.
        pss_lines.sum do |line|
          line[/(?<value>(\d*\.{0,1}\d+))\s+(?<unit>\w\w)/, "value"].to_i
        end
      rescue Errno::EACCES, Errno::ENOENT, Errno::ESRCH
        # ENOENT/ESRCH: the process exited between the `ps` snapshot (or the
        # exist? check) and the read; treat it like an unreadable process.
        0
      end
    when "darwin"
      mem = `ps -o rss= -p #{pid}`.strip
      mem.empty? ? 0 : mem.to_i
    else
      raise "Can't check process memory, wrong type of platform: #{@platform}"
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'mechanize'
2
+ require_relative '../driver/base'
3
+
4
class Capybara::Mechanize::Driver
  # Extend capybara-mechanize to support Poltergeist-like methods
  # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver

  # Routes the agent's requests through a proxy.
  #
  # `type`, `user` and `password` are optional so that a proxy string without
  # credentials (e.g. "ip:port:http" split on ":") can be splatted in.
  #
  # @param ip [String] proxy host
  # @param port [String, Integer] proxy port
  # @param type [String, nil] ignored: always "http", "socks" is not supported (yet)
  # @param user [String, nil] proxy user
  # @param password [String, nil] proxy password
  def set_proxy(ip, port, type = nil, user = nil, password = nil)
    browser.agent.set_proxy(ip, port, user, password)
  end

  # @return [Hash] request headers of the underlying agent
  def headers
    browser.agent.request_headers
  end

  # Replaces the agent's request headers.
  def headers=(headers)
    browser.agent.request_headers = headers
  end

  # Sets a single request header on the agent.
  def add_header(name, value)
    browser.agent.request_headers[name] = value
  end

  # Adds a cookie to the agent's cookie jar.
  #
  # The caller's `options` hash is left untouched. `:name`/`:value` in
  # `options` win over the positional arguments when present, and `:path`
  # falls back to "/" only when the caller did not provide one.
  #
  # @param name [String] cookie name
  # @param value [String] cookie value
  # @param options [Hash] extra attributes passed to Mechanize::Cookie.new
  def set_cookie(name, value, options = {})
    attributes = options.merge(
      name: options[:name] || name,
      value: options[:value] || value
    )
    attributes[:path] ||= "/"

    cookie = Mechanize::Cookie.new(attributes)
    browser.agent.cookie_jar << cookie
  end

  # Removes every cookie from the agent's cookie jar.
  def clear_cookies
    browser.agent.cookie_jar.clear!
  end

  # Shuts down the underlying agent.
  def quit
    browser.agent.shutdown
  end

  ###

  # Reset parent method `current_memory` for mechanize (we can't measure memory of Mechanize driver)
  def current_memory
    nil
  end

  # Always nil for this driver.
  def pid
    nil
  end

  # Always nil for this driver.
  def port
    nil
  end
end
@@ -0,0 +1,13 @@
1
+ require_relative '../driver/base'
2
+
3
module Capybara::Poltergeist
  # Adds the `pid` / `port` readers used by the shared driver extensions.
  class Driver
    # @return the value of the driver's `client_pid`
    def pid
      client_pid
    end

    # @return the `port` reported by the driver's `server`
    def port
      server.port
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require_relative '../driver/base'
2
+
3
# Adds cookie helpers and `pid` / `port` readers to the Selenium driver.
class Capybara::Selenium::Driver
  # Adds a cookie to the current browser session.
  #
  # The caller's `options` hash is not modified; `:name`/`:value` already
  # present in `options` win over the positional arguments.
  #
  # @param name [String] cookie name
  # @param value [String] cookie value
  # @param options [Hash] extra attributes passed to `add_cookie`
  def set_cookie(name, value, options = {})
    cookie = options.merge(
      name: options[:name] || name,
      value: options[:value] || value
    )

    browser.manage.add_cookie(cookie)
  end

  # Deletes all cookies of the current browser session.
  def clear_cookies
    browser.manage.delete_all_cookies
  end

  ###

  # Pid of a process that `lsof` reports for the driver's TCP port (memoized).
  #
  # NOTE(review): `lsof -t` may print several pids (one per line) and an empty
  # result becomes 0 via `to_i`; only the first number is used. Confirm this
  # always resolves to the webdriver process.
  #
  # @return [Integer]
  def pid
    @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
  end

  # Port of the webdriver server, read from the bridge's HTTP client (memoized).
  # Relies on Selenium internals (`@http`, `@server_url`), not on a public API.
  def port
    @port ||= begin
      http_client = browser.send(:bridge).instance_variable_get("@http")
      http_client.instance_variable_get("@server_url").port
    end
  end
end
@@ -0,0 +1,150 @@
1
+ require 'capybara'
2
+ require 'nokogiri'
3
+ require_relative 'session/config'
4
+
5
module Capybara
  # Session extensions: request/response accounting, retries, driver restarts
  # and per-request options (cookies, user agent, proxy) driven by `config`.
  class Session
    # Object that owns this session. When it is nil, `visit` behaves exactly
    # like the original Capybara `visit`.
    attr_accessor :spider

    # @return [Nokogiri::HTML::Document] the current page body parsed by Nokogiri
    def current_response
      Nokogiri::HTML(body)
    end

    alias_method :original_visit, :visit

    # Visits `visit_uri`, applying delay, request options, counters and retries
    # when a spider is attached.
    #
    # @param visit_uri [String] url to open
    # @param delay [Numeric, Range, nil] pause before the request; a Range
    #   picks a random value inside it
    # @param skip_request_options [Boolean] skip `check_request_options`
    # @param max_retries [Integer] attempts allowed for errors listed in
    #   `config.retry_request_errors` before the error is re-raised
    # @raise the last rescued error once the attempts are exhausted
    def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
      return original_visit(visit_uri) unless spider

      process_delay(delay) if delay
      retries = 0
      sleep_interval = 0

      begin
        check_request_options(visit_uri) unless skip_request_options
        driver.requests += 1
        logger.info "Browser: started get request to: #{visit_uri}"
        spider.class.update(:visits, :requests) if spider.with_info

        original_visit(visit_uri)
      rescue *config.retry_request_errors => e
        logger.error "Browser: request visit error: #{e.inspect}, url: #{visit_uri}"

        if (retries += 1) < max_retries
          # The pause grows by 15 seconds with every failed attempt.
          logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
          sleep sleep_interval
          retry
        else
          logger.error "Browser: all retries (#{retries}) to the url `#{visit_uri}` are gone"
          raise
        end
      else
        driver.responses += 1
        logger.info "Browser: finished get request to: #{visit_uri}"
        spider.class.update(:visits, :responses) if spider.with_info
        driver.visited = true unless driver.visited
      ensure
        # Runs after every attempt, including the ones that end in a retry.
        if spider.with_info
          logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
        end

        if memory = driver.current_memory
          logger.debug "Browser: driver.current_memory: #{memory}"
        end
      end
    end

    # Quits and forgets the current driver, if there is one.
    def destroy_driver!
      if @driver
        @driver.quit
        @driver = nil
        logger.info "Browser: driver #{mode} has been destroyed"
      else
        logger.warn "Browser: driver #{mode} is not present"
      end
    end

    # Restarts the browser: in place for poltergeist (counters reset to 0),
    # otherwise by destroying the driver and calling `driver` again.
    def restart!
      if mode.match?(/poltergeist/)
        @driver.browser.restart
        @driver.requests, @driver.responses = 0, 0
      else
        destroy_driver!
        driver
      end

      logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
    end

    private

    # Sleeps for `delay` seconds, or for a random value inside `delay` when it
    # is a Range.
    def process_delay(delay)
      interval = delay.is_a?(Range) ? rand(delay) : delay
      logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
      sleep interval
    end

    # Applies the `config.restart_if` and `config.before_request` options
    # before a request to `url_to_visit`.
    def check_request_options(url_to_visit)
      # restart_if
      if memory_limit = config.restart_if[:memory_limit]
        memory = driver.current_memory
        if memory && memory >= memory_limit
          logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      if requests_limit = config.restart_if[:requests_limit]
        requests = driver.requests
        if requests >= requests_limit
          logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      # cookies
      # (Selenium only) if config.cookies present and browser was just created,
      # visit url_to_visit first and only then set cookies:
      if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
        visit(url_to_visit, skip_request_options: true)
        config.cookies.each do |cookie|
          driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end
      end

      if config.before_request[:clear_cookies]
        driver.clear_cookies
        logger.debug "Browser: cleared cookies before request"
      end

      if config.before_request[:clear_and_set_cookies]
        driver.clear_cookies

        # (Selenium only) if browser is not visited yet any page, visit url_to_visit
        # first and then set cookies (needs after browser restart):
        if driver.visited.nil? && mode.match?(/selenium/)
          visit(url_to_visit, skip_request_options: true)
        end

        # config.cookies may be unset; then there is simply nothing to set.
        (config.cookies || []).each do |cookie|
          driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end

        logger.debug "Browser: cleared and set cookies before request"
      end

      # user_agent
      if config.before_request[:change_user_agent]
        driver.add_header("User-Agent", config.user_agent.call)
        logger.debug "Browser: changed user_agent before request"
      end

      # proxy
      if config.before_request[:change_proxy]
        # The proxy string is split on ":" and splatted into driver.set_proxy.
        proxy_string = config.proxy.call
        driver.set_proxy(*proxy_string.split(":"))
        logger.debug "Browser: changed proxy before request"
      end
    end

    # Logger of the attached spider.
    def logger
      spider.logger
    end
  end
end
@@ -0,0 +1,18 @@
1
module Capybara
  # Per-session settings read by the session extensions.
  class SessionConfig
    attr_accessor :cookies, :proxy, :user_agent
    attr_writer :retry_request_errors

    # Error classes for which a visit is retried.
    #
    # @return [Array] the assigned list, or a memoized empty array when unset
    def retry_request_errors
      @retry_request_errors = [] unless @retry_request_errors
      @retry_request_errors
    end

    # Restart conditions, keyed by option name.
    #
    # @return [Hash] memoized, initially empty
    def restart_if
      @restart_if = {} unless @restart_if
      @restart_if
    end

    # Options applied before each request, keyed by option name.
    #
    # @return [Hash] memoized, initially empty
    def before_request
      @before_request = {} unless @before_request
      @before_request
    end
  end
end
@@ -0,0 +1,157 @@
1
+ require 'thor'
2
+
3
module Kimurai
  # Thor-based command line interface: generators, server setup/deploy via
  # Ansible, and commands to run or inspect spiders inside a project.
  class CLI < Thor
    map %w[--version -v] => :__print_version

    desc "generate", "Generator, available types: project, spider, schedule"
    # @param generator_type [String] "project", "spider" or "schedule"
    # @param args [Array<String>] first element is the project/spider name
    # @raise [RuntimeError] for a missing name or an unknown generator type
    def generate(generator_type, *args)
      case generator_type
      when "project"
        project_name = args.shift
        raise "Provide project name to generate a new project" unless project_name.present?
        Generator.new.generate_project(project_name)
      when "spider"
        spider_name = args.shift
        raise "Provide spider name to generate a spider" unless spider_name.present?
        Generator.new.generate_spider(spider_name, in_project: inside_project?)
      when "schedule"
        Generator.new.generate_schedule
      else
        raise "Don't know this generator type: #{generator_type}"
      end
    end

    ###

    desc "setup", "Setup server"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
    # Runs the "setup" playbook command and waits for it to finish.
    def setup(user_host)
      command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get

      pid = spawn(*command)
      Process.wait pid
    end

    desc "deploy", "Deploy project to the server and update cron schedule"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option "repo-url", type: :string, banner: "Repo url"
    option "repo-key-path", type: :string, banner: "SSH key for a git repo"
    # Runs the "deploy" playbook command after checking that the working tree
    # is clean, a remote exists and master is in sync with origin/master.
    #
    # @raise [RuntimeError] when any of those git checks fails
    def deploy(user_host)
      if !`git status --short`.empty?
        raise "Deploy: Please commit your changes first"
      elsif `git remote`.empty?
        raise "Deploy: Please add remote origin repository to your repo first"
      elsif !`git rev-list master...origin/master`.empty?
        raise "Deploy: Please push your commits to the remote origin repo first"
      end

      repo_url = options["repo-url"] || `git remote get-url origin`.strip
      # Last path segment before ".git"; nil when the url has no ".git" suffix.
      repo_name = repo_url[/\/([^\/]*)\.git/i, 1]

      command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy",
        vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
      ).get

      pid = spawn(*command)
      Process.wait pid
    end

    ###

    desc "crawl", "Run a particular spider by it's name"
    # @raise [RuntimeError] outside of a project or for an unknown spider name
    def crawl(spider_name)
      raise "Can't find Kimurai project" unless inside_project?
      require './config/boot'

      klass = find_spider!(spider_name)

      # Set time_zone if exists
      if time_zone = Kimurai.configuration.time_zone
        Kimurai.time_zone = time_zone
      end

      klass.crawl!
    end

    desc "parse", "Parse url in the particular spider method"
    option :url, type: :string, required: true, banner: "Url to pass to the method"
    # @raise [RuntimeError] outside of a project or for an unknown spider name
    def parse(spider_name, method_name)
      raise "Can't find Kimurai project" unless inside_project?
      require './config/boot'

      klass = find_spider!(spider_name)
      klass.parse!(method_name, url: options["url"])
    end

    desc "console", "Start Kimurai console"
    option :engine, type: :string, banner: "Engine to use"
    option :url, type: :string, banner: "Url to process"
    # Starts a console for the named spider, or for ApplicationSpider inside a
    # project / Kimurai::Base outside of one when no name is given.
    def console(spider_name = nil)
      require 'pry'
      require './config/boot' if inside_project?

      if spider_name
        raise "Can't find Kimurai project" unless inside_project?

        klass = find_spider!(spider_name)
      else
        klass = inside_project? ? ApplicationSpider : ::Kimurai::Base
      end

      # Accepts both "engine_name" and ":engine_name".
      engine = options["engine"]&.delete(":")&.to_sym
      klass.parse!(:console, engine, url: options["url"])
    end

    desc "list", "List all available spiders in the current project"
    def list
      raise "Can't find Kimurai project" unless inside_project?
      require './config/boot'

      Kimurai.list.keys.each { |name| puts name }
    end

    desc "runner", "Run all spiders in the project in queue"
    option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
    def runner
      raise "Can't find Kimurai project" unless inside_project?

      jobs = options["jobs"]
      raise "Jobs count can't be 0" if jobs == 0

      require './config/boot'
      require 'kimurai/runner'
      Runner.new(parallel_jobs: jobs).run!
    end

    desc "--version, -v", "Print the version"
    def __print_version
      puts VERSION
    end

    private

    # True when the current directory contains a "spiders" directory.
    # Uses Dir.exist? because Dir.exists? was removed in Ruby 3.2.
    def inside_project?
      Dir.exist? "spiders"
    end

    # Looks a spider class up by name, shared by crawl/parse/console.
    #
    # @raise [RuntimeError] when no spider with that name is registered
    def find_spider!(spider_name)
      klass = Kimurai.find_by_name(spider_name)
      return klass if klass

      raise "Can't find spider with name `#{spider_name}` in the project. " \
        "To list all available spiders, run: `$ bundle exec kimurai list`"
    end
  end
end
155
+
156
+ require_relative 'cli/generator'
157
+ require_relative 'cli/ansible_command_builder'