kimurai 1.3.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +29 -0
- data/Gemfile +2 -2
- data/README.md +478 -649
- data/Rakefile +6 -6
- data/bin/console +3 -4
- data/exe/kimurai +0 -1
- data/kimurai.gemspec +38 -37
- data/lib/kimurai/base/saver.rb +15 -19
- data/lib/kimurai/base/storage.rb +1 -1
- data/lib/kimurai/base.rb +42 -38
- data/lib/kimurai/base_helper.rb +5 -4
- data/lib/kimurai/browser_builder/mechanize_builder.rb +44 -38
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +63 -51
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +61 -55
- data/lib/kimurai/browser_builder.rb +7 -31
- data/lib/kimurai/capybara_configuration.rb +1 -1
- data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
- data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
- data/lib/kimurai/capybara_ext/session/config.rb +1 -1
- data/lib/kimurai/capybara_ext/session.rb +40 -38
- data/lib/kimurai/cli/generator.rb +15 -15
- data/lib/kimurai/cli.rb +52 -85
- data/lib/kimurai/core_ext/array.rb +2 -2
- data/lib/kimurai/core_ext/hash.rb +1 -1
- data/lib/kimurai/core_ext/numeric.rb +4 -4
- data/lib/kimurai/pipeline.rb +2 -1
- data/lib/kimurai/runner.rb +6 -6
- data/lib/kimurai/template/Gemfile +2 -2
- data/lib/kimurai/template/config/boot.rb +4 -4
- data/lib/kimurai/template/config/schedule.rb +15 -15
- data/lib/kimurai/template/spiders/application_spider.rb +14 -14
- data/lib/kimurai/version.rb +1 -1
- data/lib/kimurai.rb +7 -3
- metadata +58 -65
- data/.travis.yml +0 -5
- data/lib/kimurai/automation/deploy.yml +0 -54
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
- data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
- data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
- data/lib/kimurai/automation/setup.yml +0 -44
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -171
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
- data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
- data/lib/kimurai/template/config/automation.yml +0 -13
|
@@ -1,171 +0,0 @@
|
|
|
1
|
-
require 'capybara'
|
|
2
|
-
require 'capybara/poltergeist'
|
|
3
|
-
require_relative '../capybara_configuration'
|
|
4
|
-
require_relative '../capybara_ext/poltergeist/driver'
|
|
5
|
-
require_relative '../capybara_ext/session'
|
|
6
|
-
|
|
7
|
-
module Kimurai
|
|
8
|
-
class BrowserBuilder
|
|
9
|
-
class PoltergeistPhantomJSBuilder
|
|
10
|
-
attr_reader :logger, :spider
|
|
11
|
-
|
|
12
|
-
def initialize(config, spider:)
|
|
13
|
-
@config = config
|
|
14
|
-
@spider = spider
|
|
15
|
-
@logger = spider.logger
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
def build
|
|
19
|
-
# Register driver
|
|
20
|
-
Capybara.register_driver :poltergeist_phantomjs do |app|
|
|
21
|
-
# Create driver options
|
|
22
|
-
driver_options = {
|
|
23
|
-
js_errors: false, debug: false, inspector: false, phantomjs_options: []
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
if extensions = @config[:extensions].presence
|
|
27
|
-
driver_options[:extensions] = extensions
|
|
28
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
# Window size
|
|
32
|
-
if size = @config[:window_size].presence
|
|
33
|
-
driver_options[:window_size] = size
|
|
34
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size"
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
# SSL
|
|
38
|
-
if ssl_cert_path = @config[:ssl_cert_path].presence
|
|
39
|
-
driver_options[:phantomjs_options] << "--ssl-certificates-path=#{ssl_cert_path}"
|
|
40
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert"
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
if @config[:ignore_ssl_errors].present?
|
|
44
|
-
driver_options[:phantomjs_options].push("--ignore-ssl-errors=yes", "--ssl-protocol=any")
|
|
45
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors"
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
# Disable images
|
|
49
|
-
if @config[:disable_images].present?
|
|
50
|
-
driver_options[:phantomjs_options] << "--load-images=no"
|
|
51
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images"
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
Capybara::Poltergeist::Driver.new(app, driver_options)
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
# Create browser instance (Capybara session)
|
|
58
|
-
@browser = Capybara::Session.new(:poltergeist_phantomjs)
|
|
59
|
-
@browser.spider = spider
|
|
60
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance"
|
|
61
|
-
|
|
62
|
-
# Proxy
|
|
63
|
-
if proxy = @config[:proxy].presence
|
|
64
|
-
proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
|
|
65
|
-
ip, port, type = proxy_string.split(":")
|
|
66
|
-
|
|
67
|
-
if %w(http socks5).include?(type)
|
|
68
|
-
@browser.driver.set_proxy(*proxy_string.split(":"))
|
|
69
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
|
|
70
|
-
else
|
|
71
|
-
logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
|
|
72
|
-
end
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
# Headers
|
|
76
|
-
if headers = @config[:headers].presence
|
|
77
|
-
@browser.driver.headers = headers
|
|
78
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers"
|
|
79
|
-
end
|
|
80
|
-
|
|
81
|
-
if user_agent = @config[:user_agent].presence
|
|
82
|
-
user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
|
|
83
|
-
|
|
84
|
-
@browser.driver.add_header("User-Agent", user_agent_string)
|
|
85
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
|
|
86
|
-
end
|
|
87
|
-
|
|
88
|
-
# Cookies
|
|
89
|
-
if cookies = @config[:cookies].presence
|
|
90
|
-
cookies.each do |cookie|
|
|
91
|
-
@browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
|
|
92
|
-
end
|
|
93
|
-
|
|
94
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies"
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
# Browser instance options
|
|
98
|
-
# skip_request_errors
|
|
99
|
-
if skip_errors = @config[:skip_request_errors].presence
|
|
100
|
-
@browser.config.skip_request_errors = skip_errors
|
|
101
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors"
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
# retry_request_errors
|
|
105
|
-
if retry_errors = @config[:retry_request_errors].presence
|
|
106
|
-
@browser.config.retry_request_errors = retry_errors
|
|
107
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
# restart_if
|
|
111
|
-
if requests_limit = @config.dig(:restart_if, :requests_limit).presence
|
|
112
|
-
@browser.config.restart_if[:requests_limit] = requests_limit
|
|
113
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
if memory_limit = @config.dig(:restart_if, :memory_limit).presence
|
|
117
|
-
@browser.config.restart_if[:memory_limit] = memory_limit
|
|
118
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
|
|
119
|
-
end
|
|
120
|
-
|
|
121
|
-
# before_request clear_cookies
|
|
122
|
-
if @config.dig(:before_request, :clear_cookies)
|
|
123
|
-
@browser.config.before_request[:clear_cookies] = true
|
|
124
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
# before_request clear_and_set_cookies
|
|
128
|
-
if @config.dig(:before_request, :clear_and_set_cookies)
|
|
129
|
-
if cookies = @config[:cookies].presence
|
|
130
|
-
@browser.config.cookies = cookies
|
|
131
|
-
@browser.config.before_request[:clear_and_set_cookies] = true
|
|
132
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
|
|
133
|
-
else
|
|
134
|
-
logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
|
|
135
|
-
end
|
|
136
|
-
end
|
|
137
|
-
|
|
138
|
-
# before_request change_user_agent
|
|
139
|
-
if @config.dig(:before_request, :change_user_agent)
|
|
140
|
-
if @config[:user_agent].present? && @config[:user_agent].class == Proc
|
|
141
|
-
@browser.config.user_agent = @config[:user_agent]
|
|
142
|
-
@browser.config.before_request[:change_user_agent] = true
|
|
143
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
|
|
144
|
-
else
|
|
145
|
-
logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
|
|
146
|
-
end
|
|
147
|
-
end
|
|
148
|
-
|
|
149
|
-
# before_request change_proxy
|
|
150
|
-
if @config.dig(:before_request, :change_proxy)
|
|
151
|
-
if @config[:proxy].present? && @config[:proxy].class == Proc
|
|
152
|
-
@browser.config.proxy = @config[:proxy]
|
|
153
|
-
@browser.config.before_request[:change_proxy] = true
|
|
154
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
|
|
155
|
-
else
|
|
156
|
-
logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
|
|
157
|
-
end
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
# before_request delay
|
|
161
|
-
if delay = @config.dig(:before_request, :delay).presence
|
|
162
|
-
@browser.config.before_request[:delay] = delay
|
|
163
|
-
logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
|
|
164
|
-
end
|
|
165
|
-
|
|
166
|
-
# return Capybara session instance
|
|
167
|
-
@browser
|
|
168
|
-
end
|
|
169
|
-
end
|
|
170
|
-
end
|
|
171
|
-
end
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
require 'cliver'
|
|
2
|
-
|
|
3
|
-
module Kimurai
|
|
4
|
-
class CLI
|
|
5
|
-
class AnsibleCommandBuilder
|
|
6
|
-
def initialize(user_host, options, playbook:, vars: {})
|
|
7
|
-
@user_host = user_host
|
|
8
|
-
@options = options
|
|
9
|
-
@playbook = playbook
|
|
10
|
-
@vars = vars
|
|
11
|
-
end
|
|
12
|
-
|
|
13
|
-
def get
|
|
14
|
-
unless Cliver.detect("ansible-playbook")
|
|
15
|
-
raise "Can't find `ansible-playbook` executable, to install: " \
|
|
16
|
-
"Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
|
|
17
|
-
end
|
|
18
|
-
|
|
19
|
-
user = @user_host[/(.*?)\@/, 1]
|
|
20
|
-
host = @user_host[/\@(.+)/, 1] || @user_host
|
|
21
|
-
inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"
|
|
22
|
-
|
|
23
|
-
gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
|
|
24
|
-
playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml"
|
|
25
|
-
|
|
26
|
-
command = [
|
|
27
|
-
"ansible-playbook", playbook_path,
|
|
28
|
-
"--inventory", inventory,
|
|
29
|
-
"--ssh-extra-args", "-oForwardAgent=yes",
|
|
30
|
-
"--connection", @options["local"] ? "local" : "smart",
|
|
31
|
-
"--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
|
|
32
|
-
]
|
|
33
|
-
|
|
34
|
-
if File.exists? "config/automation.yml"
|
|
35
|
-
require 'yaml'
|
|
36
|
-
if config = YAML.load_file("config/automation.yml").dig(@playbook)
|
|
37
|
-
config.each { |key, value| @vars[key] = value unless @vars[key] }
|
|
38
|
-
end
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
@vars.each do |key, value|
|
|
42
|
-
next unless value.present?
|
|
43
|
-
command.push "--extra-vars", "#{key}=#{value}"
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
if user
|
|
47
|
-
command.push "--user", user
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
if @options["ask-sudo"]
|
|
51
|
-
command.push "--ask-become-pass"
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
if @options["ask-auth-pass"]
|
|
55
|
-
unless Cliver.detect("sshpass")
|
|
56
|
-
raise "Can't find `sshpass` executable for password authentication, to install: " \
|
|
57
|
-
"Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
command.push "--ask-pass"
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
if ssh_key_path = @options["ssh-key-path"]
|
|
64
|
-
command.push "--private-key", ssh_key_path
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
command
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
|
-
end
|
|
71
|
-
end
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
# software versions to install for `setup` command
|
|
2
|
-
setup:
|
|
3
|
-
ruby: 2.5.1
|
|
4
|
-
# check latest here http://phantomjs.org/download.html
|
|
5
|
-
phantomjs: 2.1.1
|
|
6
|
-
# check latest here https://github.com/mozilla/geckodriver/releases/
|
|
7
|
-
geckodriver: 0.21.0
|
|
8
|
-
# check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
|
|
9
|
-
chromedriver: 2.39
|
|
10
|
-
# settings for deploy command, you can use cli options as well (--repo-url, --git-key-path)
|
|
11
|
-
deploy:
|
|
12
|
-
# repo_url: git@bitbucket.org:username/repo_name.git
|
|
13
|
-
# repo_key_path: ~/.ssh/id_rsa
|