kimurai 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -82,7 +82,7 @@ module Kimurai
82
82
  if user_agent = @config[:user_agent].presence
83
83
  user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
84
84
  driver_options.profile["general.useragent.override"] = user_agent_string
85
- logger.debug "BrowserBuilder (selenium_firefox): enabled custom user-agent"
85
+ logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent"
86
86
  end
87
87
 
88
88
  # Headless mode
@@ -114,6 +114,10 @@ module Kimurai
114
114
  @browser.spider = spider
115
115
  logger.debug "BrowserBuilder (selenium_firefox): created browser instance"
116
116
 
117
+ if @config[:extensions].present?
118
+ logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped"
119
+ end
120
+
117
121
  # Window size
118
122
  if size = @config[:window_size].presence
119
123
  @browser.current_window.resize_to(*size)
@@ -128,53 +132,53 @@ module Kimurai
128
132
 
129
133
  # Browser instance options
130
134
  # retry_request_errors
131
- if errors = @config.dig(:browser, :retry_request_errors).presence
135
+ if errors = @config[:retry_request_errors].presence
132
136
  @browser.config.retry_request_errors = errors
133
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser retry_request_errors`"
137
+ logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors"
134
138
  end
135
139
 
136
140
  # restart_if
137
- if requests_limit = @config.dig(:browser, :restart_if, :requests_limit).presence
141
+ if requests_limit = @config.dig(:restart_if, :requests_limit).presence
138
142
  @browser.config.restart_if[:requests_limit] = requests_limit
139
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser restart_if requests_limit` >= #{requests_limit}"
143
+ logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}"
140
144
  end
141
145
 
142
- if memory_limit = @config.dig(:browser, :restart_if, :memory_limit).presence
146
+ if memory_limit = @config.dig(:restart_if, :memory_limit).presence
143
147
  @browser.config.restart_if[:memory_limit] = memory_limit
144
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser restart_if memory_limit` >= #{memory_limit}"
148
+ logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}"
145
149
  end
146
150
 
147
151
  # before_request clear_cookies
148
- if @config.dig(:browser, :before_request, :clear_cookies)
152
+ if @config.dig(:before_request, :clear_cookies)
149
153
  @browser.config.before_request[:clear_cookies] = true
150
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser before_request clear_cookies`"
154
+ logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies"
151
155
  end
152
156
 
153
157
  # before_request clear_and_set_cookies
154
- if @config.dig(:browser, :before_request, :clear_and_set_cookies)
158
+ if @config.dig(:before_request, :clear_and_set_cookies)
155
159
  if cookies = @config[:cookies].presence
156
160
  @browser.config.cookies = cookies
157
161
  @browser.config.before_request[:clear_and_set_cookies] = true
158
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser before_request clear_and_set_cookies`"
162
+ logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies"
159
163
  else
160
- logger.error "BrowserBuilder (selenium_firefox): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
164
+ logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
161
165
  end
162
166
  end
163
167
 
164
168
  # before_request change_user_agent
165
- if @config.dig(:browser, :before_request, :change_user_agent)
166
- logger.error "BrowserBuilder (selenium_firefox): `browser before_request change_user_agent` option not supported by Selenium, skipped"
169
+ if @config.dig(:before_request, :change_user_agent)
170
+ logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped"
167
171
  end
168
172
 
169
173
  # before_request change_proxy
170
- if @config.dig(:browser, :before_request, :change_proxy)
171
- logger.error "BrowserBuilder (selenium_firefox): `browser before_request change_proxy` option not supported by Selenium, skipped"
174
+ if @config.dig(:before_request, :change_proxy)
175
+ logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped"
172
176
  end
173
177
 
174
178
  # before_request delay
175
- if delay = @config.dig(:browser, :before_request, :delay).presence
179
+ if delay = @config.dig(:before_request, :delay).presence
176
180
  @browser.config.before_request[:delay] = delay
177
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser before_request delay`"
181
+ logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay"
178
182
  end
179
183
 
180
184
  # return Capybara session instance
@@ -5,7 +5,7 @@ class Capybara::Mechanize::Driver
5
5
  # Extend capybara-mechanize to support Poltergeist-like methods
6
6
  # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver
7
7
 
8
- def set_proxy(ip, port, type, user, password)
8
+ def set_proxy(ip, port, type, user = nil, password = nil)
9
9
  # type is always "http", "socks" is not supported (yet)
10
10
  browser.agent.set_proxy(ip, port, user, password)
11
11
  end
@@ -6,10 +6,6 @@ module Capybara
6
6
  class Session
7
7
  attr_accessor :spider
8
8
 
9
- def current_response
10
- Nokogiri::HTML(body)
11
- end
12
-
13
9
  alias_method :original_visit, :visit
14
10
  def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
15
11
  if spider
@@ -24,12 +20,13 @@ module Capybara
24
20
  original_visit(visit_uri)
25
21
  rescue *config.retry_request_errors => e
26
22
  logger.error "Browser: request visit error: #{e.inspect}, url: #{visit_uri}"
23
+ spider.add_event(:requests_errors, e.inspect) if spider.with_info
27
24
 
28
- if (retries += 1) < max_retries
25
+ if (retries += 1) <= max_retries
29
26
  logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
30
27
  sleep sleep_interval and retry
31
28
  else
32
- logger.error "Browser: all retries (#{retries}) to the url `#{visit_uri}` are gone"
29
+ logger.error "Browser: all retries (#{retries - 1}) to the url `#{visit_uri}` are gone"
33
30
  raise e
34
31
  end
35
32
  else
@@ -52,7 +49,13 @@ module Capybara
52
49
 
53
50
  def destroy_driver!
54
51
  if @driver
55
- @driver.quit
52
+ begin
53
+ @driver.quit
54
+ # handle Net::ReadTimeout error for Selenium like drivers
55
+ rescue Net::ReadTimeout => e
56
+ @driver.quit
57
+ end
58
+
56
59
  @driver = nil
57
60
  logger.info "Browser: driver #{mode} has been destroyed"
58
61
  else
@@ -72,6 +75,43 @@ module Capybara
72
75
  logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
73
76
  end
74
77
 
78
+ def current_response
79
+ Nokogiri::HTML(body)
80
+ end
81
+
82
+ ###
83
+
84
+ # Handy method to perform some processing in the new tab within block and then automatically close this tab:
85
+ # Usage (url):
86
+ # browser.within_new_window_by(url: "https://google.com") do
87
+ # do some stuff and then automatically close this tab and return back to the first tab
88
+ # end
89
+ # Usage (action) (when a new tab is opened by some action, for example by clicking
90
+ # on a particular element):
91
+ # action = -> { browser.find("//some/element/path").click }
92
+ # browser.within_new_window_by(action: action) do
93
+ # do some stuff and then automatically close this tab and return back to the first tab
94
+ # end
95
+ def within_new_window_by(action: nil, url: nil)
96
+ case
97
+ when action
98
+ opened_window = window_opened_by { action.call }
99
+ within_window(opened_window) do
100
+ yield
101
+ current_window.close
102
+ end
103
+ when url
104
+ within_window(open_new_window) do
105
+ visit(url)
106
+
107
+ yield
108
+ current_window.close
109
+ end
110
+ end
111
+ end
112
+
113
+ ###
114
+
75
115
  private
76
116
 
77
117
  def process_delay(delay)
@@ -66,6 +66,7 @@ module Kimurai
66
66
  ###
67
67
 
68
68
  desc "crawl", "Run a particular spider by it's name"
69
+ option :continue, aliases: :c, type: :boolean, default: false, banner: "Continue previous crawling"
69
70
  def crawl(spider_name)
70
71
  raise "Can't find Kimurai project" unless inside_project?
71
72
  require './config/boot'
@@ -80,7 +81,7 @@ module Kimurai
80
81
  Kimurai.time_zone = time_zone
81
82
  end
82
83
 
83
- klass.crawl!
84
+ klass.crawl!(continue: options["continue"])
84
85
  end
85
86
 
86
87
  desc "parse", "Parse url in the particular spider method"
@@ -14,12 +14,16 @@ module Kimurai
14
14
 
15
15
  ###
16
16
 
17
+ def storage
18
+ spider.storage
19
+ end
20
+
17
21
  def unique?(scope, value)
18
22
  spider.unique?(scope, value)
19
23
  end
20
24
 
21
- def save_to(path, item, format:, position: true)
22
- spider.save_to(path, item, format: format, position: position)
25
+ def save_to(path, item, format:, position: true, append: false)
26
+ spider.save_to(path, item, format: format, position: position, append: append)
23
27
  end
24
28
 
25
29
  def logger
@@ -18,3 +18,11 @@ group :development do
18
18
  gem 'pry'
19
19
  end
20
20
 
21
+ # If you want to save items to the database, require one of these gems:
22
+ # gem 'sqlite3'
23
+ # gem 'pg'
24
+ # gem 'mysql2'
25
+
26
+ # And use your preferred ORM/database connector:
27
+ # gem 'activerecord', require: 'active_record'
28
+ # gem 'sequel'
@@ -64,41 +64,56 @@ class ApplicationSpider < Kimurai::Base
64
64
  # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
65
65
  # ssl_cert_path: "path/to/ssl_cert",
66
66
 
67
- # Browser (Capybara session instance) options:
68
- browser: {
69
- # Array of errors to retry while processing a request
70
- # retry_request_errors: [Net::ReadTimeout],
71
- # Restart browser if one of the options is true:
72
- restart_if: {
73
- # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
74
- # memory_limit: 350_000,
75
-
76
- # Restart browser if provided requests limit is exceeded (works for all engines)
77
- # requests_limit: 100
78
- },
79
- before_request: {
80
- # Change proxy before each request. The `proxy:` option above should be presented
81
- # and has lambda format. Works only for poltergeist and mechanize engines
82
- # (Selenium doesn't support proxy rotation).
83
- # change_proxy: true,
84
-
85
- # Change user agent before each request. The `user_agent:` option above should be presented
86
- # and has lambda format. Works only for poltergeist and mechanize engines
87
- # (selenium doesn't support to get/set headers).
88
- # change_user_agent: true,
89
-
90
- # Clear all cookies before each request, works for all engines
91
- # clear_cookies: true,
92
-
93
- # If you want to clear all cookies + set custom cookies (`cookies:` option above should be presented)
94
- # use this option instead (works for all engines)
95
- # clear_and_set_cookies: true,
96
-
97
- # Global option to set delay between requests.
98
- # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
99
- # delay number will be chosen randomly for each request: `rand (2..5) # => 3`
100
- # delay: 1..3
101
- }
67
+ # Inject some JavaScript code to the browser.
68
+ # Format: array of strings, where each string is a path to JS file.
69
+ # Works only for poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
70
+ # extensions: ["lib/code_to_inject.js"],
71
+
72
+ # Automatically skip duplicated (already visited) urls when using `request_to` method.
73
+ # Possible values: `true` or `hash` with options.
74
+ # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`
75
+ # and if the url is already in this scope, the request will be skipped.
76
+ # You can configure this setting by providing additional options as hash:
77
+ # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
78
+ # `scope:` - use a custom scope instead of `:requests_urls`
79
+ # `check_only:` - if true, then scope will be only checked for url, url will not
80
+ # be added to the scope if the scope doesn't contain it.
81
+ # works for all drivers
82
+ # skip_duplicate_requests: true,
83
+
84
+ # Array of errors to retry while processing a request
85
+ # retry_request_errors: [Net::ReadTimeout],
86
+
87
+ # Restart browser if one of the options is true:
88
+ restart_if: {
89
+ # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
90
+ # memory_limit: 350_000,
91
+
92
+ # Restart browser if provided requests limit is exceeded (works for all engines)
93
+ # requests_limit: 100
94
+ },
95
+ before_request: {
96
+ # Change proxy before each request. The `proxy:` option above should be present
97
+ # and be in lambda format. Works only for poltergeist and mechanize engines
98
+ # (Selenium doesn't support proxy rotation).
99
+ # change_proxy: true,
100
+
101
+ # Change user agent before each request. The `user_agent:` option above should be present
102
+ # and be in lambda format. Works only for poltergeist and mechanize engines
103
+ # (Selenium doesn't support getting/setting headers).
104
+ # change_user_agent: true,
105
+
106
+ # Clear all cookies before each request, works for all engines
107
+ # clear_cookies: true,
108
+
109
+ # If you want to clear all cookies + set custom cookies (`cookies:` option above should be presented)
110
+ # use this option instead (works for all engines)
111
+ # clear_and_set_cookies: true,
112
+
113
+ # Global option to set delay between requests.
114
+ # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
115
+ # delay number will be chosen randomly for each request: `rand (2..5) # => 3`
116
+ # delay: 1..3
102
117
  }
103
118
  }
104
119
  end
@@ -1,3 +1,3 @@
1
1
  module Kimurai
2
- VERSION = "1.0.1"
2
+ VERSION = "1.1.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kimurai
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Victor Afanasev
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-08-27 00:00:00.000000000 Z
11
+ date: 2018-09-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -282,8 +282,8 @@ files:
282
282
  - lib/kimurai/automation/setup/phantomjs.yml
283
283
  - lib/kimurai/automation/setup/ruby_environment.yml
284
284
  - lib/kimurai/base.rb
285
- - lib/kimurai/base/simple_saver.rb
286
- - lib/kimurai/base/uniq_checker.rb
285
+ - lib/kimurai/base/saver.rb
286
+ - lib/kimurai/base/storage.rb
287
287
  - lib/kimurai/base_helper.rb
288
288
  - lib/kimurai/browser_builder.rb
289
289
  - lib/kimurai/browser_builder/mechanize_builder.rb
@@ -323,7 +323,7 @@ files:
323
323
  - lib/kimurai/template/spiders/application_spider.rb
324
324
  - lib/kimurai/template/tmp/.keep
325
325
  - lib/kimurai/version.rb
326
- homepage: https://github.com/vifreefly/kimurai
326
+ homepage: https://github.com/vifreefly/kimuraframework
327
327
  licenses:
328
328
  - MIT
329
329
  metadata: {}
@@ -1,22 +0,0 @@
1
- module Kimurai
2
- class Base
3
- class UniqChecker
4
- def initialize
5
- @database = {}
6
- @mutex = Mutex.new
7
- end
8
-
9
- def unique?(scope, value)
10
- @mutex.synchronize do
11
- @database[scope] ||= []
12
- if @database[scope].include?(value)
13
- false
14
- else
15
- @database[scope].push(value)
16
- true
17
- end
18
- end
19
- end
20
- end
21
- end
22
- end