kimurai 1.3.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +29 -0
  4. data/Gemfile +2 -2
  5. data/README.md +478 -649
  6. data/Rakefile +6 -6
  7. data/bin/console +3 -4
  8. data/exe/kimurai +0 -1
  9. data/kimurai.gemspec +38 -37
  10. data/lib/kimurai/base/saver.rb +15 -19
  11. data/lib/kimurai/base/storage.rb +1 -1
  12. data/lib/kimurai/base.rb +42 -38
  13. data/lib/kimurai/base_helper.rb +5 -4
  14. data/lib/kimurai/browser_builder/mechanize_builder.rb +44 -38
  15. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +63 -51
  16. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +61 -55
  17. data/lib/kimurai/browser_builder.rb +7 -31
  18. data/lib/kimurai/capybara_configuration.rb +1 -1
  19. data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
  20. data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
  21. data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
  22. data/lib/kimurai/capybara_ext/session/config.rb +1 -1
  23. data/lib/kimurai/capybara_ext/session.rb +40 -38
  24. data/lib/kimurai/cli/generator.rb +15 -15
  25. data/lib/kimurai/cli.rb +52 -85
  26. data/lib/kimurai/core_ext/array.rb +2 -2
  27. data/lib/kimurai/core_ext/hash.rb +1 -1
  28. data/lib/kimurai/core_ext/numeric.rb +4 -4
  29. data/lib/kimurai/pipeline.rb +2 -1
  30. data/lib/kimurai/runner.rb +6 -6
  31. data/lib/kimurai/template/Gemfile +2 -2
  32. data/lib/kimurai/template/config/boot.rb +4 -4
  33. data/lib/kimurai/template/config/schedule.rb +15 -15
  34. data/lib/kimurai/template/spiders/application_spider.rb +14 -14
  35. data/lib/kimurai/version.rb +1 -1
  36. data/lib/kimurai.rb +7 -3
  37. metadata +58 -65
  38. data/.travis.yml +0 -5
  39. data/lib/kimurai/automation/deploy.yml +0 -54
  40. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
  41. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
  42. data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
  43. data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
  44. data/lib/kimurai/automation/setup.yml +0 -44
  45. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -171
  46. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
  47. data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
  48. data/lib/kimurai/template/config/automation.yml +0 -13
@@ -1,171 +0,0 @@
1
- require 'capybara'
2
- require 'capybara/poltergeist'
3
- require_relative '../capybara_configuration'
4
- require_relative '../capybara_ext/poltergeist/driver'
5
- require_relative '../capybara_ext/session'
6
-
7
- module Kimurai
8
- class BrowserBuilder
9
- class PoltergeistPhantomJSBuilder
10
- attr_reader :logger, :spider
11
-
12
- def initialize(config, spider:)
13
- @config = config
14
- @spider = spider
15
- @logger = spider.logger
16
- end
17
-
18
- def build
19
- # Register driver
20
- Capybara.register_driver :poltergeist_phantomjs do |app|
21
- # Create driver options
22
- driver_options = {
23
- js_errors: false, debug: false, inspector: false, phantomjs_options: []
24
- }
25
-
26
- if extensions = @config[:extensions].presence
27
- driver_options[:extensions] = extensions
28
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
29
- end
30
-
31
- # Window size
32
- if size = @config[:window_size].presence
33
- driver_options[:window_size] = size
34
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size"
35
- end
36
-
37
- # SSL
38
- if ssl_cert_path = @config[:ssl_cert_path].presence
39
- driver_options[:phantomjs_options] << "--ssl-certificates-path=#{ssl_cert_path}"
40
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert"
41
- end
42
-
43
- if @config[:ignore_ssl_errors].present?
44
- driver_options[:phantomjs_options].push("--ignore-ssl-errors=yes", "--ssl-protocol=any")
45
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors"
46
- end
47
-
48
- # Disable images
49
- if @config[:disable_images].present?
50
- driver_options[:phantomjs_options] << "--load-images=no"
51
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images"
52
- end
53
-
54
- Capybara::Poltergeist::Driver.new(app, driver_options)
55
- end
56
-
57
- # Create browser instance (Capybara session)
58
- @browser = Capybara::Session.new(:poltergeist_phantomjs)
59
- @browser.spider = spider
60
- logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance"
61
-
62
- # Proxy
63
- if proxy = @config[:proxy].presence
64
- proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
65
- ip, port, type = proxy_string.split(":")
66
-
67
- if %w(http socks5).include?(type)
68
- @browser.driver.set_proxy(*proxy_string.split(":"))
69
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
70
- else
71
- logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
72
- end
73
- end
74
-
75
- # Headers
76
- if headers = @config[:headers].presence
77
- @browser.driver.headers = headers
78
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers"
79
- end
80
-
81
- if user_agent = @config[:user_agent].presence
82
- user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
83
-
84
- @browser.driver.add_header("User-Agent", user_agent_string)
85
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
86
- end
87
-
88
- # Cookies
89
- if cookies = @config[:cookies].presence
90
- cookies.each do |cookie|
91
- @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
92
- end
93
-
94
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies"
95
- end
96
-
97
- # Browser instance options
98
- # skip_request_errors
99
- if skip_errors = @config[:skip_request_errors].presence
100
- @browser.config.skip_request_errors = skip_errors
101
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors"
102
- end
103
-
104
- # retry_request_errors
105
- if retry_errors = @config[:retry_request_errors].presence
106
- @browser.config.retry_request_errors = retry_errors
107
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
108
- end
109
-
110
- # restart_if
111
- if requests_limit = @config.dig(:restart_if, :requests_limit).presence
112
- @browser.config.restart_if[:requests_limit] = requests_limit
113
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
114
- end
115
-
116
- if memory_limit = @config.dig(:restart_if, :memory_limit).presence
117
- @browser.config.restart_if[:memory_limit] = memory_limit
118
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
119
- end
120
-
121
- # before_request clear_cookies
122
- if @config.dig(:before_request, :clear_cookies)
123
- @browser.config.before_request[:clear_cookies] = true
124
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
125
- end
126
-
127
- # before_request clear_and_set_cookies
128
- if @config.dig(:before_request, :clear_and_set_cookies)
129
- if cookies = @config[:cookies].presence
130
- @browser.config.cookies = cookies
131
- @browser.config.before_request[:clear_and_set_cookies] = true
132
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
133
- else
134
- logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
135
- end
136
- end
137
-
138
- # before_request change_user_agent
139
- if @config.dig(:before_request, :change_user_agent)
140
- if @config[:user_agent].present? && @config[:user_agent].class == Proc
141
- @browser.config.user_agent = @config[:user_agent]
142
- @browser.config.before_request[:change_user_agent] = true
143
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
144
- else
145
- logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
146
- end
147
- end
148
-
149
- # before_request change_proxy
150
- if @config.dig(:before_request, :change_proxy)
151
- if @config[:proxy].present? && @config[:proxy].class == Proc
152
- @browser.config.proxy = @config[:proxy]
153
- @browser.config.before_request[:change_proxy] = true
154
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
155
- else
156
- logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
157
- end
158
- end
159
-
160
- # before_request delay
161
- if delay = @config.dig(:before_request, :delay).presence
162
- @browser.config.before_request[:delay] = delay
163
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
164
- end
165
-
166
- # return Capybara session instance
167
- @browser
168
- end
169
- end
170
- end
171
- end
@@ -1,13 +0,0 @@
1
- require_relative '../driver/base'
2
-
3
- module Capybara::Poltergeist
4
- class Driver
5
- def pid
6
- client_pid
7
- end
8
-
9
- def port
10
- server.port
11
- end
12
- end
13
- end
@@ -1,71 +0,0 @@
1
- require 'cliver'
2
-
3
- module Kimurai
4
- class CLI
5
- class AnsibleCommandBuilder
6
- def initialize(user_host, options, playbook:, vars: {})
7
- @user_host = user_host
8
- @options = options
9
- @playbook = playbook
10
- @vars = vars
11
- end
12
-
13
- def get
14
- unless Cliver.detect("ansible-playbook")
15
- raise "Can't find `ansible-playbook` executable, to install: " \
16
- "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
17
- end
18
-
19
- user = @user_host[/(.*?)\@/, 1]
20
- host = @user_host[/\@(.+)/, 1] || @user_host
21
- inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"
22
-
23
- gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
24
- playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml"
25
-
26
- command = [
27
- "ansible-playbook", playbook_path,
28
- "--inventory", inventory,
29
- "--ssh-extra-args", "-oForwardAgent=yes",
30
- "--connection", @options["local"] ? "local" : "smart",
31
- "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
32
- ]
33
-
34
- if File.exists? "config/automation.yml"
35
- require 'yaml'
36
- if config = YAML.load_file("config/automation.yml").dig(@playbook)
37
- config.each { |key, value| @vars[key] = value unless @vars[key] }
38
- end
39
- end
40
-
41
- @vars.each do |key, value|
42
- next unless value.present?
43
- command.push "--extra-vars", "#{key}=#{value}"
44
- end
45
-
46
- if user
47
- command.push "--user", user
48
- end
49
-
50
- if @options["ask-sudo"]
51
- command.push "--ask-become-pass"
52
- end
53
-
54
- if @options["ask-auth-pass"]
55
- unless Cliver.detect("sshpass")
56
- raise "Can't find `sshpass` executable for password authentication, to install: " \
57
- "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
58
- end
59
-
60
- command.push "--ask-pass"
61
- end
62
-
63
- if ssh_key_path = @options["ssh-key-path"]
64
- command.push "--private-key", ssh_key_path
65
- end
66
-
67
- command
68
- end
69
- end
70
- end
71
- end
@@ -1,13 +0,0 @@
1
- # software versions to install for `setup` command
2
- setup:
3
- ruby: 2.5.1
4
- # check latest here http://phantomjs.org/download.html
5
- phantomjs: 2.1.1
6
- # check latest here https://github.com/mozilla/geckodriver/releases/
7
- geckodriver: 0.21.0
8
- # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
9
- chromedriver: 2.39
10
- # settings for deploy command, you can use cli options as well (--repo-url, --git-key-path)
11
- deploy:
12
- # repo_url: git@bitbucket.org:username/repo_name.git
13
- # repo_key_path: ~/.ssh/id_rsa