kimurai 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
@@ -0,0 +1,10 @@
1
+ require 'capybara'
2
+
3
# Global Capybara settings applied when this file is loaded.
Capybara.configure do |capybara|
  # No local Rack application server is started.
  capybara.run_server = false
  # Selectors are XPath by default.
  capybara.default_selector = :xpath
  # Relative directory used as Capybara's save path.
  capybara.save_path = "tmp"
  capybara.default_max_wait_time = 10
  # Hidden elements are not ignored when querying pages.
  capybara.ignore_hidden_elements = false
  capybara.threadsafe = true
end
@@ -0,0 +1,62 @@
1
+ require 'pathname'
2
+
3
# Extensions shared by all Capybara drivers: request/response counters,
# a `visited` flag and memory measurement of the driver's process tree.
#
# `current_memory` relies on a `pid` method, which this class does not define
# itself (NOTE(review): expected to come from the concrete driver).
class Capybara::Driver::Base
  attr_accessor :visited
  attr_writer :requests, :responses

  # @return [Integer] value of the requests counter (0 until assigned)
  def requests
    @requests ||= 0
  end

  # @return [Integer] value of the responses counter (0 until assigned)
  def responses
    @responses ||= 0
  end

  # Sums the memory of the driver process and all of its descendant processes.
  #
  # @return [Integer] total as reported by `get_process_memory`
  #   (presumably kilobytes on both platforms -- TODO confirm)
  def current_memory
    driver_pid = pid

    process_ids = (get_descendant_processes(driver_pid) << driver_pid).uniq
    process_ids.sum { |process_id| get_process_memory(process_id) }
  end

  private

  # Collects pids of all (direct and indirect) children of +base+
  # by parsing the pid/ppid pairs printed by `ps -eo pid,ppid`.
  #
  # @param base [Integer] pid of the root process
  # @return [Array<Integer>] descendant pids, without +base+ itself
  def get_descendant_processes(base)
    # Every pid maps to a (nested) array starting with the pid itself;
    # children arrays are appended by reference, so flattening the entry
    # of +base+ yields its whole subtree.
    descendants = Hash.new { |tree, key| tree[key] = [key] }
    `ps -eo pid,ppid`.scan(/\d+/).map(&:to_i).each_slice(2) do |child_pid, parent_pid|
      descendants[parent_pid] << descendants[child_pid]
    end

    descendants[base].flatten - [base]
  end

  # https://github.com/schneems/get_process_mem
  # Note: for Linux takes PSS (not RSS) memory (I think PSS better fits in this case)
  #
  # @param pid [Integer] process id to measure
  # @return [Integer] memory of the process, 0 if it can't be read
  # @raise [RuntimeError] on platforms other than Linux and macOS
  def get_process_memory(pid)
    case @platform ||= Gem::Platform.local.os
    when "linux"
      begin
        file = Pathname.new "/proc/#{pid}/smaps"
        return 0 unless file.exist?

        # Only the exact "Pss:" field is counted. A bare /^Pss/ would also
        # match sibling fields such as "Pss_Dirty:" and count pages twice.
        file.each_line.sum { |line| line[/\APss:\s+(\d+)/, 1].to_i }
      rescue Errno::EACCES, Errno::ENOENT, Errno::ESRCH
        # The process is not readable or has exited while being inspected
        # (short-lived browser child processes): treat it as using no memory.
        0
      end
    when "darwin"
      mem = `ps -o rss= -p #{pid}`.strip
      mem.empty? ? 0 : mem.to_i
    else
      raise "Can't check process memory, wrong type of platform: #{@platform}"
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'mechanize'
2
+ require_relative '../driver/base'
3
+
4
class Capybara::Mechanize::Driver
  # Extend capybara-mechanize to support Poltergeist-like methods
  # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver

  # Sets the proxy on the underlying Mechanize agent.
  #
  # `type`, `user` and `password` are optional, so a proxy given only as
  # ip and port (no credentials) can be applied as well.
  #
  # @param type [String] accepted for interface compatibility but ignored:
  #   type is always "http", "socks" is not supported (yet)
  def set_proxy(ip, port, type = "http", user = nil, password = nil)
    browser.agent.set_proxy(ip, port, user, password)
  end

  # @return request headers of the Mechanize agent
  def headers
    browser.agent.request_headers
  end

  def headers=(headers)
    browser.agent.request_headers = headers
  end

  # Adds (or overrides) a single request header.
  def add_header(name, value)
    browser.agent.request_headers[name] = value
  end

  # Builds a Mechanize cookie and puts it into the agent's cookie jar.
  # `options[:name]` / `options[:value]` win over the positional arguments
  # when present; the cookie path is always set to "/".
  def set_cookie(name, value, options = {})
    # Work on a copy so the caller's hash is not modified.
    attributes = options.dup
    attributes[:name] ||= name
    attributes[:value] ||= value

    cookie = Mechanize::Cookie.new(attributes.merge(path: "/"))
    browser.agent.cookie_jar << cookie
  end

  def clear_cookies
    browser.agent.cookie_jar.clear!
  end

  def quit
    browser.agent.shutdown
  end

  ###

  # Reset parent method `current_memory` for mechanize (we can't measure memory of Mechanize driver)
  def current_memory
    nil
  end

  # This driver reports no process id.
  def pid
    nil
  end

  # This driver reports no port.
  def port
    nil
  end
end
@@ -0,0 +1,13 @@
1
+ require_relative '../driver/base'
2
+
3
# Adds `pid` and `port` readers to the Poltergeist driver.
class Capybara::Poltergeist::Driver
  # @return whatever the driver's `client_pid` returns
  def pid
    client_pid
  end

  # @return the port of the driver's `server`
  def port
    server.port
  end
end
@@ -0,0 +1,24 @@
1
+ require_relative '../driver/base'
2
+
3
# Adds cookie helpers and `pid` / `port` readers to the Selenium driver.
class Capybara::Selenium::Driver
  # Adds a cookie to the browser. `options[:name]` / `options[:value]`
  # win over the positional arguments when present.
  def set_cookie(name, value, options = {})
    # Work on a copy so the caller's hash is not modified.
    cookie = options.dup
    cookie[:name] ||= name
    cookie[:value] ||= value

    browser.manage.add_cookie(cookie)
  end

  def clear_cookies
    browser.manage.delete_all_cookies
  end

  ###

  # Pid (memoized) of a process reported by `lsof` for the driver port.
  # `to_i` keeps the first pid of the output and yields 0 when lsof prints nothing.
  # NOTE(review): lsof lists every process with a socket on that port, not only
  # the listening one -- confirm the first line is always the webdriver process.
  def pid
    @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
  end

  # Port (memoized) of the server url, read from the internals
  # (`@http` / `@server_url`) of the browser's bridge object.
  def port
    @port ||= begin
      http_client = browser.send(:bridge).instance_variable_get("@http")
      http_client.instance_variable_get("@server_url").port
    end
  end
end
@@ -0,0 +1,150 @@
1
+ require 'capybara'
2
+ require 'nokogiri'
3
+ require_relative 'session/config'
4
+
5
module Capybara
  # Session extensions: a spider-aware `visit` (delay, retries, counters,
  # logging, pre-request options) and driver restart helpers.
  class Session
    attr_accessor :spider

    # @return [Nokogiri::HTML::Document] the current page body parsed by Nokogiri
    def current_response
      Nokogiri::HTML(body)
    end

    alias_method :original_visit, :visit

    # Visits +visit_uri+. Without an attached spider it simply delegates to
    # the original Capybara `visit`.
    #
    # @param visit_uri [String] url to open
    # @param delay [Numeric, Range, nil] pause before the request
    #   (a random value is picked from a Range)
    # @param skip_request_options [Boolean] skip `check_request_options`
    # @param max_retries [Integer] total number of attempts for errors listed
    #   in `config.retry_request_errors`; the error is re-raised afterwards
    def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
      return original_visit(visit_uri) unless spider

      process_delay(delay) if delay
      retries = 0
      sleep_interval = 0

      begin
        check_request_options(visit_uri) unless skip_request_options
        driver.requests += 1
        logger.info "Browser: started get request to: #{visit_uri}"
        spider.class.update(:visits, :requests) if spider.with_info

        original_visit(visit_uri)
      rescue *config.retry_request_errors => e
        logger.error "Browser: request visit error: #{e.inspect}, url: #{visit_uri}"

        retries += 1
        if retries < max_retries
          # Wait 15 seconds longer before every next attempt.
          sleep_interval += 15
          logger.info "Browser: sleep #{sleep_interval} seconds and process retry № #{retries} to the url: #{visit_uri}"
          sleep sleep_interval
          retry
        else
          logger.error "Browser: all retries (#{retries}) to the url `#{visit_uri}` are gone"
          raise
        end
      else
        driver.responses += 1
        logger.info "Browser: finished get request to: #{visit_uri}"
        spider.class.update(:visits, :responses) if spider.with_info
        driver.visited = true unless driver.visited
      ensure
        if spider.with_info
          logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
        end

        if (memory = driver.current_memory)
          logger.debug "Browser: driver.current_memory: #{memory}"
        end
      end
    end

    # Quits the current driver (if any) and forgets it.
    def destroy_driver!
      if @driver
        @driver.quit
        @driver = nil
        logger.info "Browser: driver #{mode} has been destroyed"
      else
        logger.warn "Browser: driver #{mode} is not present"
      end
    end

    # Restarts the browser: Poltergeist restarts in place (counters are
    # reset), any other driver is destroyed and requested again via `driver`.
    def restart!
      if mode.match?(/poltergeist/)
        @driver.browser.restart
        @driver.requests = 0
        @driver.responses = 0
      else
        destroy_driver!
        driver
      end

      logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
    end

    private

    # Sleeps for +delay+ seconds (a random value from +delay+ if it is a Range).
    def process_delay(delay)
      interval = delay.is_a?(Range) ? rand(delay) : delay
      logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
      sleep interval
    end

    # Applies the `restart_if` and `before_request` options of the session
    # config before a request to +url_to_visit+.
    def check_request_options(url_to_visit)
      # restart_if
      if (memory_limit = config.restart_if[:memory_limit])
        memory = driver.current_memory
        if memory && memory >= memory_limit
          logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      if (requests_limit = config.restart_if[:requests_limit])
        requests = driver.requests
        if requests >= requests_limit
          logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      # cookies
      # (Selenium only) if config.cookies present and browser was just created,
      # visit url_to_visit first and only then set cookies:
      if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
        visit(url_to_visit, skip_request_options: true)
        config.cookies.each do |cookie|
          driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end
      end

      if config.before_request[:clear_cookies]
        driver.clear_cookies
        logger.debug "Browser: cleared cookies before request"
      end

      if config.before_request[:clear_and_set_cookies]
        driver.clear_cookies

        # (Selenium only) if browser is not visited yet any page, visit url_to_visit
        # first and then set cookies (needs after browser restart):
        if driver.visited.nil? && mode.match?(/selenium/)
          visit(url_to_visit, skip_request_options: true)
        end

        # config.cookies may be unset: then there is simply nothing to set.
        (config.cookies || []).each do |cookie|
          driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end

        logger.debug "Browser: cleared and set cookies before request"
      end

      # user_agent
      if config.before_request[:change_user_agent]
        driver.add_header("User-Agent", config.user_agent.call)
        logger.debug "Browser: changed user_agent before request"
      end

      # proxy
      if config.before_request[:change_proxy]
        proxy_string = config.proxy.call
        # The proxy string is colon-separated; its parts are passed to
        # driver.set_proxy as positional arguments.
        driver.set_proxy(*proxy_string.split(":"))
        logger.debug "Browser: changed proxy before request"
      end
    end

    # Logger of the attached spider.
    def logger
      spider.logger
    end
  end
end
@@ -0,0 +1,18 @@
1
module Capybara
  # Extra per-session options: cookies, proxy, user agent, the list of
  # errors to retry on and the `restart_if` / `before_request` option hashes.
  class SessionConfig
    attr_accessor :cookies, :proxy, :user_agent

    # Replaces the list of retried errors.
    def retry_request_errors=(error_classes)
      @retry_request_errors = error_classes
    end

    # @return the assigned list of errors, an empty Array when nothing
    #   (or a falsy value) was assigned
    def retry_request_errors
      @retry_request_errors || (@retry_request_errors = [])
    end

    # @return [Hash] lazily created `restart_if` options
    def restart_if
      @restart_if || (@restart_if = {})
    end

    # @return [Hash] lazily created `before_request` options
    def before_request
      @before_request || (@before_request = {})
    end
  end
end
@@ -0,0 +1,157 @@
1
+ require 'thor'
2
+
3
module Kimurai
  # Thor based command line interface of Kimurai.
  class CLI < Thor
    map %w[--version -v] => :__print_version

    desc "generate", "Generator, available types: project, spider, schedule"
    def generate(generator_type, *args)
      case generator_type
      when "project"
        project_name = args.shift
        raise "Provide project name to generate a new project" unless project_name.present?
        Generator.new.generate_project(project_name)
      when "spider"
        spider_name = args.shift
        raise "Provide spider name to generate a spider" unless spider_name.present?
        Generator.new.generate_spider(spider_name, in_project: inside_project?)
      when "schedule"
        Generator.new.generate_schedule
      else
        raise "Don't know this generator type: #{generator_type}"
      end
    end

    ###

    desc "setup", "Setup server"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
    def setup(user_host)
      command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get

      pid = spawn(*command)
      Process.wait pid
    end

    desc "deploy", "Deploy project to the server and update cron schedule"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option "repo-url", type: :string, banner: "Repo url"
    option "repo-key-path", type: :string, banner: "SSH key for a git repo"
    def deploy(user_host)
      # Deploy only a clean working tree whose master branch is in sync with origin/master.
      if !`git status --short`.empty?
        raise "Deploy: Please commit your changes first"
      elsif `git remote`.empty?
        raise "Deploy: Please add remote origin repository to your repo first"
      elsif !`git rev-list master...origin/master`.empty?
        raise "Deploy: Please push your commits to the remote origin repo first"
      end

      repo_url = options["repo-url"] || `git remote get-url origin`.strip
      # Repo name is the last path segment in front of ".git" (nil if the url has no such part).
      repo_name = repo_url[/\/([^\/]*)\.git/i, 1]

      command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy",
        vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
      ).get

      pid = spawn(*command)
      Process.wait pid
    end

    ###

    desc "crawl", "Run a particular spider by it's name"
    def crawl(spider_name)
      raise "Can't find Kimurai project" unless inside_project?
      require './config/boot'

      klass = fetch_spider_class(spider_name)

      # Set time_zone if exists
      if (time_zone = Kimurai.configuration.time_zone)
        Kimurai.time_zone = time_zone
      end

      klass.crawl!
    end

    desc "parse", "Parse url in the particular spider method"
    option :url, type: :string, required: true, banner: "Url to pass to the method"
    def parse(spider_name, method_name)
      raise "Can't find Kimurai project" unless inside_project?
      require './config/boot'

      klass = fetch_spider_class(spider_name)
      klass.parse!(method_name, url: options["url"])
    end

    desc "console", "Start Kimurai console"
    option :engine, type: :string, banner: "Engine to use"
    option :url, type: :string, banner: "Url to process"
    def console(spider_name = nil)
      require 'pry'
      require './config/boot' if inside_project?

      if spider_name
        raise "Can't find Kimurai project" unless inside_project?

        klass = fetch_spider_class(spider_name)
      else
        klass = inside_project? ? ApplicationSpider : ::Kimurai::Base
      end

      # Accept both `--engine mechanize` and `--engine :mechanize`.
      engine = options["engine"]&.delete(":")&.to_sym
      klass.parse!(:console, engine, url: options["url"])
    end

    desc "list", "List all available spiders in the current project"
    def list
      raise "Can't find Kimurai project" unless inside_project?
      require './config/boot'

      Kimurai.list.keys.each { |name| puts name }
    end

    desc "runner", "Run all spiders in the project in queue"
    option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
    def runner
      raise "Can't find Kimurai project" unless inside_project?

      jobs = options["jobs"]
      raise "Jobs count can't be 0" if jobs == 0

      require './config/boot'
      require 'kimurai/runner'
      Runner.new(parallel_jobs: jobs).run!
    end

    desc "--version, -v", "Print the version"
    def __print_version
      puts VERSION
    end

    private

    # The current directory is treated as a Kimurai project when it has a
    # `spiders` directory. `Dir.exist?` is used because the `Dir.exists?`
    # alias is no longer available on Ruby 3.2+.
    def inside_project?
      Dir.exist? "spiders"
    end

    # Looks up a spider class by its name.
    #
    # @param spider_name [String] name passed on the command line
    # @return the value returned by `Kimurai.find_by_name`
    # @raise [RuntimeError] when no spider with this name is found
    def fetch_spider_class(spider_name)
      klass = Kimurai.find_by_name(spider_name)
      return klass if klass

      raise "Can't find spider with name `#{spider_name}` in the project. " \
        "To list all available spiders, run: `$ bundle exec kimurai list`"
    end
  end
end
155
+
156
+ require_relative 'cli/generator'
157
+ require_relative 'cli/ansible_command_builder'