kimurai 1.3.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +29 -0
  4. data/Gemfile +2 -2
  5. data/README.md +478 -649
  6. data/Rakefile +6 -6
  7. data/bin/console +3 -4
  8. data/exe/kimurai +0 -1
  9. data/kimurai.gemspec +38 -37
  10. data/lib/kimurai/base/saver.rb +15 -19
  11. data/lib/kimurai/base/storage.rb +1 -1
  12. data/lib/kimurai/base.rb +42 -38
  13. data/lib/kimurai/base_helper.rb +5 -4
  14. data/lib/kimurai/browser_builder/mechanize_builder.rb +44 -38
  15. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +63 -51
  16. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +61 -55
  17. data/lib/kimurai/browser_builder.rb +7 -31
  18. data/lib/kimurai/capybara_configuration.rb +1 -1
  19. data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
  20. data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
  21. data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
  22. data/lib/kimurai/capybara_ext/session/config.rb +1 -1
  23. data/lib/kimurai/capybara_ext/session.rb +40 -38
  24. data/lib/kimurai/cli/generator.rb +15 -15
  25. data/lib/kimurai/cli.rb +52 -85
  26. data/lib/kimurai/core_ext/array.rb +2 -2
  27. data/lib/kimurai/core_ext/hash.rb +1 -1
  28. data/lib/kimurai/core_ext/numeric.rb +4 -4
  29. data/lib/kimurai/pipeline.rb +2 -1
  30. data/lib/kimurai/runner.rb +6 -6
  31. data/lib/kimurai/template/Gemfile +2 -2
  32. data/lib/kimurai/template/config/boot.rb +4 -4
  33. data/lib/kimurai/template/config/schedule.rb +15 -15
  34. data/lib/kimurai/template/spiders/application_spider.rb +14 -14
  35. data/lib/kimurai/version.rb +1 -1
  36. data/lib/kimurai.rb +7 -3
  37. metadata +58 -65
  38. data/.travis.yml +0 -5
  39. data/lib/kimurai/automation/deploy.yml +0 -54
  40. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
  41. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
  42. data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
  43. data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
  44. data/lib/kimurai/automation/setup.yml +0 -44
  45. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -171
  46. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
  47. data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
  48. data/lib/kimurai/template/config/automation.yml +0 -13
@@ -1,71 +1,72 @@
1
1
  require 'mechanize'
2
2
  require_relative '../driver/base'
3
3
 
4
- class Capybara::Mechanize::Driver
5
- # Extend capybara-mechnize to support Poltergeist-like methods
6
- # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver
4
+ module Capybara
5
+ module Mechanize
6
+ class Driver
7
+ def set_proxy(ip, port, _type, user = nil, password = nil)
8
+ # type is always "http", "socks" is not supported (yet)
9
+ browser.agent.set_proxy(ip, port, user, password)
10
+ end
7
11
 
8
- def set_proxy(ip, port, type, user = nil, password = nil)
9
- # type is always "http", "socks" is not supported (yet)
10
- browser.agent.set_proxy(ip, port, user, password)
11
- end
12
+ ###
12
13
 
13
- ###
14
+ def headers
15
+ browser.agent.request_headers
16
+ end
14
17
 
15
- def headers
16
- browser.agent.request_headers
17
- end
18
+ def headers=(headers)
19
+ browser.agent.request_headers = headers
20
+ end
18
21
 
19
- def headers=(headers)
20
- browser.agent.request_headers = headers
21
- end
22
+ def add_header(name, value)
23
+ browser.agent.request_headers[name] = value
24
+ end
22
25
 
23
- def add_header(name, value)
24
- browser.agent.request_headers[name] = value
25
- end
26
+ ###
26
27
 
27
- ###
28
+ def get_cookies
29
+ browser.agent.cookies
30
+ end
28
31
 
29
- def get_cookies
30
- browser.agent.cookies
31
- end
32
-
33
- def set_cookie(name, value, options = {})
34
- options[:name] ||= name
35
- options[:value] ||= value
32
+ def set_cookie(name, value, options = {})
33
+ options[:name] ||= name
34
+ options[:value] ||= value
36
35
 
37
- cookie = Mechanize::Cookie.new(options.merge path: "/")
38
- browser.agent.cookie_jar << cookie
39
- end
36
+ cookie = Mechanize::Cookie.new(options.merge(path: '/'))
37
+ browser.agent.cookie_jar << cookie
38
+ end
40
39
 
41
- def set_cookies(cookies)
42
- cookies.each do |cookie|
43
- set_cookie(cookie[:name], cookie[:value], cookie)
44
- end
45
- end
40
+ def set_cookies(cookies)
41
+ cookies.each do |cookie|
42
+ set_cookie(cookie[:name], cookie[:value], cookie)
43
+ end
44
+ end
46
45
 
47
- def clear_cookies
48
- browser.agent.cookie_jar.clear!
49
- end
46
+ def clear_cookies
47
+ browser.agent.cookie_jar.clear!
48
+ end
50
49
 
51
- ###
50
+ ###
52
51
 
53
- def quit
54
- browser.agent.shutdown
55
- end
52
+ def quit
53
+ browser.agent.shutdown
54
+ end
56
55
 
57
- ###
56
+ ###
58
57
 
59
- # Reset parent method `current_memory` for mechanize (we can't measure memory of Mechanize driver)
60
- def current_memory
61
- nil
62
- end
58
+ # Reset parent method `current_memory` for mechanize (we can't measure memory of Mechanize driver)
59
+ def current_memory
60
+ nil
61
+ end
63
62
 
64
- def pid
65
- nil
66
- end
63
+ def pid
64
+ nil
65
+ end
67
66
 
68
- def port
69
- nil
67
+ def port
68
+ nil
69
+ end
70
+ end
70
71
  end
71
72
  end
@@ -1,34 +1,38 @@
1
1
  require_relative '../driver/base'
2
2
 
3
- class Capybara::Selenium::Driver
4
- def get_cookies
5
- browser.manage.all_cookies
6
- end
7
-
8
- def set_cookie(name, value, options = {})
9
- options[:name] ||= name
10
- options[:value] ||= value
11
-
12
- browser.manage.add_cookie(options)
13
- end
14
-
15
- def set_cookies(cookies)
16
- cookies.each do |cookie|
17
- set_cookie(cookie[:name], cookie[:value], cookie)
3
+ module Capybara
4
+ module Selenium
5
+ class Driver
6
+ def get_cookies
7
+ browser.manage.all_cookies
8
+ end
9
+
10
+ def set_cookie(name, value, options = {})
11
+ options[:name] ||= name
12
+ options[:value] ||= value
13
+
14
+ browser.manage.add_cookie(options)
15
+ end
16
+
17
+ def set_cookies(cookies)
18
+ cookies.each do |cookie|
19
+ set_cookie(cookie[:name], cookie[:value], cookie)
20
+ end
21
+ end
22
+
23
+ def clear_cookies
24
+ browser.manage.delete_all_cookies
25
+ end
26
+
27
+ ###
28
+
29
+ def pid
30
+ @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
31
+ end
32
+
33
+ def port
34
+ @port ||= browser.send(:bridge).instance_variable_get('@http').instance_variable_get('@server_url').port
35
+ end
18
36
  end
19
37
  end
20
-
21
- def clear_cookies
22
- browser.manage.delete_all_cookies
23
- end
24
-
25
- ###
26
-
27
- def pid
28
- @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
29
- end
30
-
31
- def port
32
- @port ||= browser.send(:bridge).instance_variable_get("@http").instance_variable_get("@server_url").port
33
- end
34
38
  end
@@ -1,6 +1,6 @@
1
1
  module Capybara
2
2
  class SessionConfig
3
- attr_accessor :cookies, :proxy, :user_agent
3
+ attr_accessor :cookies, :proxy, :user_agent, :encoding
4
4
  attr_writer :retry_request_errors, :skip_request_errors
5
5
 
6
6
  def retry_request_errors
@@ -7,11 +7,12 @@ module Capybara
7
7
  class Session
8
8
  attr_accessor :spider
9
9
 
10
- alias_method :original_visit, :visit
10
+ alias original_visit visit
11
11
  def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
12
12
  if spider
13
13
  process_delay(delay) if delay
14
- retries, sleep_interval = 0, 0
14
+ retries = 0
15
+ sleep_interval = 0
15
16
 
16
17
  begin
17
18
  check_request_options(visit_uri) unless skip_request_options
@@ -19,7 +20,7 @@ module Capybara
19
20
  spider.class.update(:visits, :requests) if spider.with_info
20
21
 
21
22
  original_visit(visit_uri)
22
- rescue => e
23
+ rescue StandardError => e
23
24
  if match_error?(e, type: :to_skip)
24
25
  logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
25
26
  spider.add_event(:requests_errors, e.inspect) if spider.with_info
@@ -29,7 +30,7 @@ module Capybara
29
30
  spider.add_event(:requests_errors, e.inspect) if spider.with_info
30
31
 
31
32
  if (retries += 1) <= max_retries
32
- logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
33
+ logger.info "Browser: sleep #{sleep_interval += 15} seconds and process retry № #{retries} to the url: #{visit_uri}"
33
34
  sleep sleep_interval and retry
34
35
  else
35
36
  logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
@@ -48,7 +49,7 @@ module Capybara
48
49
  logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
49
50
  end
50
51
 
51
- if memory = driver.current_memory
52
+ if (memory = driver.current_memory)
52
53
  logger.debug "Browser: driver.current_memory: #{memory}"
53
54
  end
54
55
  end
@@ -62,7 +63,7 @@ module Capybara
62
63
  begin
63
64
  @driver.quit
64
65
  # handle Net::ReadTimeout error for Selenium like drivers
65
- rescue Net::ReadTimeout => e
66
+ rescue Net::ReadTimeout
66
67
  @driver.quit
67
68
  end
68
69
 
@@ -74,13 +75,8 @@ module Capybara
74
75
  end
75
76
 
76
77
  def restart!
77
- if mode.match?(/poltergeist/)
78
- @driver.browser.restart
79
- @driver.requests, @driver.responses = 0, 0
80
- else
81
- destroy_driver!
82
- driver
83
- end
78
+ destroy_driver!
79
+ driver
84
80
 
85
81
  logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
86
82
  end
@@ -88,7 +84,16 @@ module Capybara
88
84
  def current_response(response_type = :html)
89
85
  case response_type
90
86
  when :html
91
- Nokogiri::HTML(body)
87
+ if config.encoding
88
+ if config.encoding == :auto
89
+ charset = body.force_encoding('ISO-8859-1').encode('UTF-8')[/<meta.*?charset="?([\w+\d+-]*)/i, 1]
90
+ Nokogiri::HTML(body, nil, charset)
91
+ else
92
+ Nokogiri::HTML(body, nil, config.encoding)
93
+ end
94
+ else
95
+ Nokogiri::HTML(body)
96
+ end
92
97
  when :json
93
98
  JSON.parse(body)
94
99
  end
@@ -99,23 +104,22 @@ module Capybara
99
104
  # Handy method to perform some processing in the new tab within block and then automatically close this tab:
100
105
  # Usage (url):
101
106
  # browser.within_new_window_by(url: "https://google.com") do
102
- # do some stuff and then automatically close this tab and return back to the first tab
107
+ # do some stuff and then automatically close this tab and return back to the first tab
103
108
  # end
104
109
  # Usage (action) (when new tab opening by some action, for example by clicking
105
110
  # on a particular element):
106
111
  # action = -> { browser.find("//some/element/path").click }
107
112
  # browser.within_new_window_by(action: action) do
108
- # do some stuff and then automatically close this tab and return back to the first tab
113
+ # do some stuff and then automatically close this tab and return back to the first tab
109
114
  # end
110
115
  def within_new_window_by(action: nil, url: nil)
111
- case
112
- when action
116
+ if action
113
117
  opened_window = window_opened_by { action.call }
114
118
  within_window(opened_window) do
115
119
  yield
116
120
  current_window.close
117
121
  end
118
- when url
122
+ elsif url
119
123
  within_window(open_new_window) do
120
124
  visit(url)
121
125
 
@@ -128,14 +132,14 @@ module Capybara
128
132
  ###
129
133
 
130
134
  def scroll_to_bottom
131
- execute_script("window.scrollBy(0,10000)")
135
+ execute_script('window.scrollBy(0,10000)')
132
136
  end
133
137
 
134
138
  private
135
139
 
136
140
  def skip_error_on_failure?(e)
137
141
  config.retry_request_errors.any? do |error|
138
- error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash)
142
+ error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.is_a?(Hash)
139
143
  end
140
144
  end
141
145
 
@@ -147,10 +151,10 @@ module Capybara
147
151
  end
148
152
 
149
153
  errors.any? do |error|
150
- if error.kind_of?(Hash)
154
+ if error.is_a?(Hash)
151
155
  match_class = e.class.ancestors.include?(error[:error])
152
156
  if error[:message].present?
153
- if error[:message].kind_of?(Regexp)
157
+ if error[:message].is_a?(Regexp)
154
158
  e.message&.match?(error[:message])
155
159
  else
156
160
  e.message&.include?(error[:message])
@@ -165,14 +169,14 @@ module Capybara
165
169
  end
166
170
 
167
171
  def process_delay(delay)
168
- interval = (delay.class == Range ? rand(delay) : delay)
172
+ interval = (delay.instance_of?(Range) ? rand(delay) : delay)
169
173
  logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
170
174
  sleep interval
171
175
  end
172
176
 
173
177
  def check_request_options(url_to_visit)
174
178
  # restart_if
175
- if memory_limit = config.restart_if[:memory_limit]
179
+ if (memory_limit = config.restart_if[:memory_limit])
176
180
  memory = driver.current_memory
177
181
  if memory && memory >= memory_limit
178
182
  logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
@@ -180,7 +184,7 @@ module Capybara
180
184
  end
181
185
  end
182
186
 
183
- if requests_limit = config.restart_if[:requests_limit]
187
+ if (requests_limit = config.restart_if[:requests_limit])
184
188
  requests = driver.requests
185
189
  if requests >= requests_limit
186
190
  logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
@@ -200,7 +204,7 @@ module Capybara
200
204
 
201
205
  if config.before_request[:clear_cookies]
202
206
  driver.clear_cookies
203
- logger.debug "Browser: cleared cookies before request"
207
+ logger.debug 'Browser: cleared cookies before request'
204
208
  end
205
209
 
206
210
  if config.before_request[:clear_and_set_cookies]
@@ -208,29 +212,27 @@ module Capybara
208
212
 
209
213
  # (Selenium only) if browser is not visited yet any page, visit url_to_visit
210
214
  # first and then set cookies (needs after browser restart):
211
- if driver.visited.nil? && mode.match?(/selenium/)
212
- visit(url_to_visit, skip_request_options: true)
213
- end
215
+ visit(url_to_visit, skip_request_options: true) if driver.visited.nil? && mode.match?(/selenium/)
214
216
 
215
217
  config.cookies.each do |cookie|
216
218
  driver.set_cookie(cookie[:name], cookie[:value], cookie)
217
219
  end
218
220
 
219
- logger.debug "Browser: cleared and set cookies before request"
221
+ logger.debug 'Browser: cleared and set cookies before request'
220
222
  end
221
223
 
222
224
  # user_agent
223
225
  if config.before_request[:change_user_agent]
224
- driver.add_header("User-Agent", config.user_agent.call)
225
- logger.debug "Browser: changed user_agent before request"
226
+ driver.add_header('User-Agent', config.user_agent.call)
227
+ logger.debug 'Browser: changed user_agent before request'
226
228
  end
227
229
 
228
230
  # proxy
229
- if config.before_request[:change_proxy]
230
- proxy_string = config.proxy.call
231
- driver.set_proxy(*proxy_string.split(":"))
232
- logger.debug "Browser: changed proxy before request"
233
- end
231
+ return unless config.before_request[:change_proxy]
232
+
233
+ proxy_string = config.proxy.call
234
+ driver.set_proxy(*proxy_string.split(':'))
235
+ logger.debug 'Browser: changed proxy before request'
234
236
  end
235
237
 
236
238
  def logger
@@ -4,20 +4,20 @@ module Kimurai
4
4
  include Thor::Actions
5
5
 
6
6
  def self.source_root
7
- File.dirname(File.expand_path('..', __FILE__))
7
+ File.dirname(File.expand_path(__dir__))
8
8
  end
9
9
 
10
10
  def generate_project(project_name)
11
- directory "template", project_name
11
+ directory 'template', project_name
12
12
  inside(project_name) do
13
- run "bundle install"
14
- run "git init"
13
+ run 'bundle install'
14
+ run 'git init'
15
15
  end
16
16
  end
17
17
 
18
18
  def generate_spider(spider_name, in_project:)
19
19
  spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
20
- raise "Spider #{spider_path} already exists" if File.exists? spider_path
20
+ raise "Spider #{spider_path} already exists" if File.exist? spider_path
21
21
 
22
22
  spider_class = to_spider_class(spider_name)
23
23
  create_file spider_path do
@@ -33,24 +33,24 @@ module Kimurai
33
33
  RUBY
34
34
  end
35
35
 
36
- unless in_project
37
- insert_into_file spider_path, " @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
38
- prepend_to_file spider_path, "require 'kimurai'\n\n"
39
- append_to_file spider_path, "\n#{spider_class}.crawl!"
40
- end
36
+ return if in_project
37
+
38
+ insert_into_file spider_path, " @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
39
+ prepend_to_file spider_path, "require 'kimurai'\n\n"
40
+ append_to_file spider_path, "\n#{spider_class}.crawl!"
41
41
  end
42
42
 
43
43
  def generate_schedule
44
- copy_file "template/config/schedule.rb", "./schedule.rb"
44
+ copy_file 'template/config/schedule.rb', './schedule.rb'
45
45
  end
46
46
 
47
47
  private
48
48
 
49
49
  def to_spider_class(string)
50
- string.sub(/^./) { $&.capitalize }
51
- .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
52
- .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
53
- .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
50
+ string.sub(/^./) { ::Regexp.last_match(0).capitalize }
51
+ .gsub(%r{(?:_|(/))([a-z\d]*)}) { "#{::Regexp.last_match(1)}#{::Regexp.last_match(2).capitalize}" }
52
+ .gsub(%r{(?:-|(/))([a-z\d]*)}) { "Dash#{::Regexp.last_match(2).capitalize}" }
53
+ .gsub(%r{(?:\.|(/))([a-z\d]*)}) { "#{::Regexp.last_match(1)}#{::Regexp.last_match(2).capitalize}" }
54
54
  end
55
55
  end
56
56
  end