kimurai 1.3.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +29 -0
- data/Gemfile +2 -2
- data/README.md +478 -649
- data/Rakefile +6 -6
- data/bin/console +3 -4
- data/exe/kimurai +0 -1
- data/kimurai.gemspec +38 -37
- data/lib/kimurai/base/saver.rb +15 -19
- data/lib/kimurai/base/storage.rb +1 -1
- data/lib/kimurai/base.rb +42 -38
- data/lib/kimurai/base_helper.rb +5 -4
- data/lib/kimurai/browser_builder/mechanize_builder.rb +44 -38
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +63 -51
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +61 -55
- data/lib/kimurai/browser_builder.rb +7 -31
- data/lib/kimurai/capybara_configuration.rb +1 -1
- data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
- data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
- data/lib/kimurai/capybara_ext/session/config.rb +1 -1
- data/lib/kimurai/capybara_ext/session.rb +40 -38
- data/lib/kimurai/cli/generator.rb +15 -15
- data/lib/kimurai/cli.rb +52 -85
- data/lib/kimurai/core_ext/array.rb +2 -2
- data/lib/kimurai/core_ext/hash.rb +1 -1
- data/lib/kimurai/core_ext/numeric.rb +4 -4
- data/lib/kimurai/pipeline.rb +2 -1
- data/lib/kimurai/runner.rb +6 -6
- data/lib/kimurai/template/Gemfile +2 -2
- data/lib/kimurai/template/config/boot.rb +4 -4
- data/lib/kimurai/template/config/schedule.rb +15 -15
- data/lib/kimurai/template/spiders/application_spider.rb +14 -14
- data/lib/kimurai/version.rb +1 -1
- data/lib/kimurai.rb +7 -3
- metadata +58 -65
- data/.travis.yml +0 -5
- data/lib/kimurai/automation/deploy.yml +0 -54
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
- data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
- data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
- data/lib/kimurai/automation/setup.yml +0 -44
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -171
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
- data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
- data/lib/kimurai/template/config/automation.yml +0 -13
|
@@ -1,71 +1,72 @@
|
|
|
1
1
|
require 'mechanize'
|
|
2
2
|
require_relative '../driver/base'
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
4
|
+
module Capybara
|
|
5
|
+
module Mechanize
|
|
6
|
+
class Driver
|
|
7
|
+
def set_proxy(ip, port, _type, user = nil, password = nil)
|
|
8
|
+
# type is always "http", "socks" is not supported (yet)
|
|
9
|
+
browser.agent.set_proxy(ip, port, user, password)
|
|
10
|
+
end
|
|
7
11
|
|
|
8
|
-
|
|
9
|
-
# type is always "http", "socks" is not supported (yet)
|
|
10
|
-
browser.agent.set_proxy(ip, port, user, password)
|
|
11
|
-
end
|
|
12
|
+
###
|
|
12
13
|
|
|
13
|
-
|
|
14
|
+
def headers
|
|
15
|
+
browser.agent.request_headers
|
|
16
|
+
end
|
|
14
17
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
+
def headers=(headers)
|
|
19
|
+
browser.agent.request_headers = headers
|
|
20
|
+
end
|
|
18
21
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
+
def add_header(name, value)
|
|
23
|
+
browser.agent.request_headers[name] = value
|
|
24
|
+
end
|
|
22
25
|
|
|
23
|
-
|
|
24
|
-
browser.agent.request_headers[name] = value
|
|
25
|
-
end
|
|
26
|
+
###
|
|
26
27
|
|
|
27
|
-
|
|
28
|
+
def get_cookies
|
|
29
|
+
browser.agent.cookies
|
|
30
|
+
end
|
|
28
31
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def set_cookie(name, value, options = {})
|
|
34
|
-
options[:name] ||= name
|
|
35
|
-
options[:value] ||= value
|
|
32
|
+
def set_cookie(name, value, options = {})
|
|
33
|
+
options[:name] ||= name
|
|
34
|
+
options[:value] ||= value
|
|
36
35
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
36
|
+
cookie = Mechanize::Cookie.new(options.merge(path: '/'))
|
|
37
|
+
browser.agent.cookie_jar << cookie
|
|
38
|
+
end
|
|
40
39
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
40
|
+
def set_cookies(cookies)
|
|
41
|
+
cookies.each do |cookie|
|
|
42
|
+
set_cookie(cookie[:name], cookie[:value], cookie)
|
|
43
|
+
end
|
|
44
|
+
end
|
|
46
45
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
46
|
+
def clear_cookies
|
|
47
|
+
browser.agent.cookie_jar.clear!
|
|
48
|
+
end
|
|
50
49
|
|
|
51
|
-
|
|
50
|
+
###
|
|
52
51
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
52
|
+
def quit
|
|
53
|
+
browser.agent.shutdown
|
|
54
|
+
end
|
|
56
55
|
|
|
57
|
-
|
|
56
|
+
###
|
|
58
57
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
58
|
+
# Reset parent method `current_memory` for mechanize (we can't measure memory of Mechanize driver)
|
|
59
|
+
def current_memory
|
|
60
|
+
nil
|
|
61
|
+
end
|
|
63
62
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
63
|
+
def pid
|
|
64
|
+
nil
|
|
65
|
+
end
|
|
67
66
|
|
|
68
|
-
|
|
69
|
-
|
|
67
|
+
def port
|
|
68
|
+
nil
|
|
69
|
+
end
|
|
70
|
+
end
|
|
70
71
|
end
|
|
71
72
|
end
|
|
@@ -1,34 +1,38 @@
|
|
|
1
1
|
require_relative '../driver/base'
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
3
|
+
module Capybara
|
|
4
|
+
module Selenium
|
|
5
|
+
class Driver
|
|
6
|
+
def get_cookies
|
|
7
|
+
browser.manage.all_cookies
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def set_cookie(name, value, options = {})
|
|
11
|
+
options[:name] ||= name
|
|
12
|
+
options[:value] ||= value
|
|
13
|
+
|
|
14
|
+
browser.manage.add_cookie(options)
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def set_cookies(cookies)
|
|
18
|
+
cookies.each do |cookie|
|
|
19
|
+
set_cookie(cookie[:name], cookie[:value], cookie)
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def clear_cookies
|
|
24
|
+
browser.manage.delete_all_cookies
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
###
|
|
28
|
+
|
|
29
|
+
def pid
|
|
30
|
+
@pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def port
|
|
34
|
+
@port ||= browser.send(:bridge).instance_variable_get('@http').instance_variable_get('@server_url').port
|
|
35
|
+
end
|
|
18
36
|
end
|
|
19
37
|
end
|
|
20
|
-
|
|
21
|
-
def clear_cookies
|
|
22
|
-
browser.manage.delete_all_cookies
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
###
|
|
26
|
-
|
|
27
|
-
def pid
|
|
28
|
-
@pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
def port
|
|
32
|
-
@port ||= browser.send(:bridge).instance_variable_get("@http").instance_variable_get("@server_url").port
|
|
33
|
-
end
|
|
34
38
|
end
|
|
@@ -7,11 +7,12 @@ module Capybara
|
|
|
7
7
|
class Session
|
|
8
8
|
attr_accessor :spider
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
alias original_visit visit
|
|
11
11
|
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
|
|
12
12
|
if spider
|
|
13
13
|
process_delay(delay) if delay
|
|
14
|
-
retries
|
|
14
|
+
retries = 0
|
|
15
|
+
sleep_interval = 0
|
|
15
16
|
|
|
16
17
|
begin
|
|
17
18
|
check_request_options(visit_uri) unless skip_request_options
|
|
@@ -19,7 +20,7 @@ module Capybara
|
|
|
19
20
|
spider.class.update(:visits, :requests) if spider.with_info
|
|
20
21
|
|
|
21
22
|
original_visit(visit_uri)
|
|
22
|
-
rescue => e
|
|
23
|
+
rescue StandardError => e
|
|
23
24
|
if match_error?(e, type: :to_skip)
|
|
24
25
|
logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
|
|
25
26
|
spider.add_event(:requests_errors, e.inspect) if spider.with_info
|
|
@@ -29,7 +30,7 @@ module Capybara
|
|
|
29
30
|
spider.add_event(:requests_errors, e.inspect) if spider.with_info
|
|
30
31
|
|
|
31
32
|
if (retries += 1) <= max_retries
|
|
32
|
-
logger.info "Browser: sleep #{
|
|
33
|
+
logger.info "Browser: sleep #{sleep_interval += 15} seconds and process retry № #{retries} to the url: #{visit_uri}"
|
|
33
34
|
sleep sleep_interval and retry
|
|
34
35
|
else
|
|
35
36
|
logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
|
|
@@ -48,7 +49,7 @@ module Capybara
|
|
|
48
49
|
logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
|
|
49
50
|
end
|
|
50
51
|
|
|
51
|
-
if memory = driver.current_memory
|
|
52
|
+
if (memory = driver.current_memory)
|
|
52
53
|
logger.debug "Browser: driver.current_memory: #{memory}"
|
|
53
54
|
end
|
|
54
55
|
end
|
|
@@ -62,7 +63,7 @@ module Capybara
|
|
|
62
63
|
begin
|
|
63
64
|
@driver.quit
|
|
64
65
|
# handle Net::ReadTimeout error for Selenium like drivers
|
|
65
|
-
rescue Net::ReadTimeout
|
|
66
|
+
rescue Net::ReadTimeout
|
|
66
67
|
@driver.quit
|
|
67
68
|
end
|
|
68
69
|
|
|
@@ -74,13 +75,8 @@ module Capybara
|
|
|
74
75
|
end
|
|
75
76
|
|
|
76
77
|
def restart!
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
@driver.requests, @driver.responses = 0, 0
|
|
80
|
-
else
|
|
81
|
-
destroy_driver!
|
|
82
|
-
driver
|
|
83
|
-
end
|
|
78
|
+
destroy_driver!
|
|
79
|
+
driver
|
|
84
80
|
|
|
85
81
|
logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
|
|
86
82
|
end
|
|
@@ -88,7 +84,16 @@ module Capybara
|
|
|
88
84
|
def current_response(response_type = :html)
|
|
89
85
|
case response_type
|
|
90
86
|
when :html
|
|
91
|
-
|
|
87
|
+
if config.encoding
|
|
88
|
+
if config.encoding == :auto
|
|
89
|
+
charset = body.force_encoding('ISO-8859-1').encode('UTF-8')[/<meta.*?charset="?([\w+\d+-]*)/i, 1]
|
|
90
|
+
Nokogiri::HTML(body, nil, charset)
|
|
91
|
+
else
|
|
92
|
+
Nokogiri::HTML(body, nil, config.encoding)
|
|
93
|
+
end
|
|
94
|
+
else
|
|
95
|
+
Nokogiri::HTML(body)
|
|
96
|
+
end
|
|
92
97
|
when :json
|
|
93
98
|
JSON.parse(body)
|
|
94
99
|
end
|
|
@@ -99,23 +104,22 @@ module Capybara
|
|
|
99
104
|
# Handy method to perform some processing in the new tab within block and then automatically close this tab:
|
|
100
105
|
# Usage (url):
|
|
101
106
|
# browser.within_new_window_by(url: "https://google.com") do
|
|
102
|
-
|
|
107
|
+
# do some stuff and then automatically close this tab and return back to the first tab
|
|
103
108
|
# end
|
|
104
109
|
# Usage (action) (when new tab opening by some action, for example by clicking
|
|
105
110
|
# on a particular element):
|
|
106
111
|
# action = -> { browser.find("//some/element/path").click }
|
|
107
112
|
# browser.within_new_window_by(action: action) do
|
|
108
|
-
|
|
113
|
+
# do some stuff and then automatically close this tab and return back to the first tab
|
|
109
114
|
# end
|
|
110
115
|
def within_new_window_by(action: nil, url: nil)
|
|
111
|
-
|
|
112
|
-
when action
|
|
116
|
+
if action
|
|
113
117
|
opened_window = window_opened_by { action.call }
|
|
114
118
|
within_window(opened_window) do
|
|
115
119
|
yield
|
|
116
120
|
current_window.close
|
|
117
121
|
end
|
|
118
|
-
|
|
122
|
+
elsif url
|
|
119
123
|
within_window(open_new_window) do
|
|
120
124
|
visit(url)
|
|
121
125
|
|
|
@@ -128,14 +132,14 @@ module Capybara
|
|
|
128
132
|
###
|
|
129
133
|
|
|
130
134
|
def scroll_to_bottom
|
|
131
|
-
execute_script(
|
|
135
|
+
execute_script('window.scrollBy(0,10000)')
|
|
132
136
|
end
|
|
133
137
|
|
|
134
138
|
private
|
|
135
139
|
|
|
136
140
|
def skip_error_on_failure?(e)
|
|
137
141
|
config.retry_request_errors.any? do |error|
|
|
138
|
-
error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.
|
|
142
|
+
error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.is_a?(Hash)
|
|
139
143
|
end
|
|
140
144
|
end
|
|
141
145
|
|
|
@@ -147,10 +151,10 @@ module Capybara
|
|
|
147
151
|
end
|
|
148
152
|
|
|
149
153
|
errors.any? do |error|
|
|
150
|
-
if error.
|
|
154
|
+
if error.is_a?(Hash)
|
|
151
155
|
match_class = e.class.ancestors.include?(error[:error])
|
|
152
156
|
if error[:message].present?
|
|
153
|
-
if error[:message].
|
|
157
|
+
if error[:message].is_a?(Regexp)
|
|
154
158
|
e.message&.match?(error[:message])
|
|
155
159
|
else
|
|
156
160
|
e.message&.include?(error[:message])
|
|
@@ -165,14 +169,14 @@ module Capybara
|
|
|
165
169
|
end
|
|
166
170
|
|
|
167
171
|
def process_delay(delay)
|
|
168
|
-
interval = (delay.
|
|
172
|
+
interval = (delay.instance_of?(Range) ? rand(delay) : delay)
|
|
169
173
|
logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
|
|
170
174
|
sleep interval
|
|
171
175
|
end
|
|
172
176
|
|
|
173
177
|
def check_request_options(url_to_visit)
|
|
174
178
|
# restart_if
|
|
175
|
-
if memory_limit = config.restart_if[:memory_limit]
|
|
179
|
+
if (memory_limit = config.restart_if[:memory_limit])
|
|
176
180
|
memory = driver.current_memory
|
|
177
181
|
if memory && memory >= memory_limit
|
|
178
182
|
logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
|
|
@@ -180,7 +184,7 @@ module Capybara
|
|
|
180
184
|
end
|
|
181
185
|
end
|
|
182
186
|
|
|
183
|
-
if requests_limit = config.restart_if[:requests_limit]
|
|
187
|
+
if (requests_limit = config.restart_if[:requests_limit])
|
|
184
188
|
requests = driver.requests
|
|
185
189
|
if requests >= requests_limit
|
|
186
190
|
logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
|
|
@@ -200,7 +204,7 @@ module Capybara
|
|
|
200
204
|
|
|
201
205
|
if config.before_request[:clear_cookies]
|
|
202
206
|
driver.clear_cookies
|
|
203
|
-
logger.debug
|
|
207
|
+
logger.debug 'Browser: cleared cookies before request'
|
|
204
208
|
end
|
|
205
209
|
|
|
206
210
|
if config.before_request[:clear_and_set_cookies]
|
|
@@ -208,29 +212,27 @@ module Capybara
|
|
|
208
212
|
|
|
209
213
|
# (Selenium only) if browser is not visited yet any page, visit url_to_visit
|
|
210
214
|
# first and then set cookies (needs after browser restart):
|
|
211
|
-
if driver.visited.nil? && mode.match?(/selenium/)
|
|
212
|
-
visit(url_to_visit, skip_request_options: true)
|
|
213
|
-
end
|
|
215
|
+
visit(url_to_visit, skip_request_options: true) if driver.visited.nil? && mode.match?(/selenium/)
|
|
214
216
|
|
|
215
217
|
config.cookies.each do |cookie|
|
|
216
218
|
driver.set_cookie(cookie[:name], cookie[:value], cookie)
|
|
217
219
|
end
|
|
218
220
|
|
|
219
|
-
logger.debug
|
|
221
|
+
logger.debug 'Browser: cleared and set cookies before request'
|
|
220
222
|
end
|
|
221
223
|
|
|
222
224
|
# user_agent
|
|
223
225
|
if config.before_request[:change_user_agent]
|
|
224
|
-
driver.add_header(
|
|
225
|
-
logger.debug
|
|
226
|
+
driver.add_header('User-Agent', config.user_agent.call)
|
|
227
|
+
logger.debug 'Browser: changed user_agent before request'
|
|
226
228
|
end
|
|
227
229
|
|
|
228
230
|
# proxy
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
231
|
+
return unless config.before_request[:change_proxy]
|
|
232
|
+
|
|
233
|
+
proxy_string = config.proxy.call
|
|
234
|
+
driver.set_proxy(*proxy_string.split(':'))
|
|
235
|
+
logger.debug 'Browser: changed proxy before request'
|
|
234
236
|
end
|
|
235
237
|
|
|
236
238
|
def logger
|
|
@@ -4,20 +4,20 @@ module Kimurai
|
|
|
4
4
|
include Thor::Actions
|
|
5
5
|
|
|
6
6
|
def self.source_root
|
|
7
|
-
File.dirname(File.expand_path(
|
|
7
|
+
File.dirname(File.expand_path(__dir__))
|
|
8
8
|
end
|
|
9
9
|
|
|
10
10
|
def generate_project(project_name)
|
|
11
|
-
directory
|
|
11
|
+
directory 'template', project_name
|
|
12
12
|
inside(project_name) do
|
|
13
|
-
run
|
|
14
|
-
run
|
|
13
|
+
run 'bundle install'
|
|
14
|
+
run 'git init'
|
|
15
15
|
end
|
|
16
16
|
end
|
|
17
17
|
|
|
18
18
|
def generate_spider(spider_name, in_project:)
|
|
19
19
|
spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
|
|
20
|
-
raise "Spider #{spider_path} already exists" if File.
|
|
20
|
+
raise "Spider #{spider_path} already exists" if File.exist? spider_path
|
|
21
21
|
|
|
22
22
|
spider_class = to_spider_class(spider_name)
|
|
23
23
|
create_file spider_path do
|
|
@@ -33,24 +33,24 @@ module Kimurai
|
|
|
33
33
|
RUBY
|
|
34
34
|
end
|
|
35
35
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
36
|
+
return if in_project
|
|
37
|
+
|
|
38
|
+
insert_into_file spider_path, " @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
|
|
39
|
+
prepend_to_file spider_path, "require 'kimurai'\n\n"
|
|
40
|
+
append_to_file spider_path, "\n#{spider_class}.crawl!"
|
|
41
41
|
end
|
|
42
42
|
|
|
43
43
|
def generate_schedule
|
|
44
|
-
copy_file
|
|
44
|
+
copy_file 'template/config/schedule.rb', './schedule.rb'
|
|
45
45
|
end
|
|
46
46
|
|
|
47
47
|
private
|
|
48
48
|
|
|
49
49
|
def to_spider_class(string)
|
|
50
|
-
string.sub(/^./) {
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
50
|
+
string.sub(/^./) { ::Regexp.last_match(0).capitalize }
|
|
51
|
+
.gsub(%r{(?:_|(/))([a-z\d]*)}) { "#{::Regexp.last_match(1)}#{::Regexp.last_match(2).capitalize}" }
|
|
52
|
+
.gsub(%r{(?:-|(/))([a-z\d]*)}) { "Dash#{::Regexp.last_match(2).capitalize}" }
|
|
53
|
+
.gsub(%r{(?:\.|(/))([a-z\d]*)}) { "#{::Regexp.last_match(1)}#{::Regexp.last_match(2).capitalize}" }
|
|
54
54
|
end
|
|
55
55
|
end
|
|
56
56
|
end
|