kimurai 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -82,7 +82,7 @@ module Kimurai
82
82
  if user_agent = @config[:user_agent].presence
83
83
  user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
84
84
  driver_options.profile["general.useragent.override"] = user_agent_string
85
- logger.debug "BrowserBuilder (selenium_firefox): enabled custom user-agent"
85
+ logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent"
86
86
  end
87
87
 
88
88
  # Headless mode
@@ -114,6 +114,10 @@ module Kimurai
114
114
  @browser.spider = spider
115
115
  logger.debug "BrowserBuilder (selenium_firefox): created browser instance"
116
116
 
117
+ if @config[:extensions].present?
118
+ logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped"
119
+ end
120
+
117
121
  # Window size
118
122
  if size = @config[:window_size].presence
119
123
  @browser.current_window.resize_to(*size)
@@ -128,53 +132,53 @@ module Kimurai
128
132
 
129
133
  # Browser instance options
130
134
  # retry_request_errors
131
- if errors = @config.dig(:browser, :retry_request_errors).presence
135
+ if errors = @config[:retry_request_errors].presence
132
136
  @browser.config.retry_request_errors = errors
133
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser retry_request_errors`"
137
+ logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors"
134
138
  end
135
139
 
136
140
  # restart_if
137
- if requests_limit = @config.dig(:browser, :restart_if, :requests_limit).presence
141
+ if requests_limit = @config.dig(:restart_if, :requests_limit).presence
138
142
  @browser.config.restart_if[:requests_limit] = requests_limit
139
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser restart_if requests_limit` >= #{requests_limit}"
143
+ logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}"
140
144
  end
141
145
 
142
- if memory_limit = @config.dig(:browser, :restart_if, :memory_limit).presence
146
+ if memory_limit = @config.dig(:restart_if, :memory_limit).presence
143
147
  @browser.config.restart_if[:memory_limit] = memory_limit
144
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser restart_if memory_limit` >= #{memory_limit}"
148
+ logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}"
145
149
  end
146
150
 
147
151
  # before_request clear_cookies
148
- if @config.dig(:browser, :before_request, :clear_cookies)
152
+ if @config.dig(:before_request, :clear_cookies)
149
153
  @browser.config.before_request[:clear_cookies] = true
150
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser before_request clear_cookies`"
154
+ logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies"
151
155
  end
152
156
 
153
157
  # before_request clear_and_set_cookies
154
- if @config.dig(:browser, :before_request, :clear_and_set_cookies)
158
+ if @config.dig(:before_request, :clear_and_set_cookies)
155
159
  if cookies = @config[:cookies].presence
156
160
  @browser.config.cookies = cookies
157
161
  @browser.config.before_request[:clear_and_set_cookies] = true
158
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser before_request clear_and_set_cookies`"
162
+ logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies"
159
163
  else
160
- logger.error "BrowserBuilder (selenium_firefox): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
164
+ logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
161
165
  end
162
166
  end
163
167
 
164
168
  # before_request change_user_agent
165
- if @config.dig(:browser, :before_request, :change_user_agent)
166
- logger.error "BrowserBuilder (selenium_firefox): `browser before_request change_user_agent` option not supported by Selenium, skipped"
169
+ if @config.dig(:before_request, :change_user_agent)
170
+ logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped"
167
171
  end
168
172
 
169
173
  # before_request change_proxy
170
- if @config.dig(:browser, :before_request, :change_proxy)
171
- logger.error "BrowserBuilder (selenium_firefox): `browser before_request change_proxy` option not supported by Selenium, skipped"
174
+ if @config.dig(:before_request, :change_proxy)
175
+ logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped"
172
176
  end
173
177
 
174
178
  # before_request delay
175
- if delay = @config.dig(:browser, :before_request, :delay).presence
179
+ if delay = @config.dig(:before_request, :delay).presence
176
180
  @browser.config.before_request[:delay] = delay
177
- logger.debug "BrowserBuilder (selenium_firefox): enabled `browser before_request delay`"
181
+ logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay"
178
182
  end
179
183
 
180
184
  # return Capybara session instance
@@ -5,7 +5,7 @@ class Capybara::Mechanize::Driver
5
5
  # Extend capybara-mechanize to support Poltergeist-like methods
6
6
  # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver
7
7
 
8
- def set_proxy(ip, port, type, user, password)
8
+ def set_proxy(ip, port, type, user = nil, password = nil)
9
9
  # type is always "http", "socks" is not supported (yet)
10
10
  browser.agent.set_proxy(ip, port, user, password)
11
11
  end
@@ -6,10 +6,6 @@ module Capybara
6
6
  class Session
7
7
  attr_accessor :spider
8
8
 
9
- def current_response
10
- Nokogiri::HTML(body)
11
- end
12
-
13
9
  alias_method :original_visit, :visit
14
10
  def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
15
11
  if spider
@@ -24,12 +20,13 @@ module Capybara
24
20
  original_visit(visit_uri)
25
21
  rescue *config.retry_request_errors => e
26
22
  logger.error "Browser: request visit error: #{e.inspect}, url: #{visit_uri}"
23
+ spider.add_event(:requests_errors, e.inspect) if spider.with_info
27
24
 
28
- if (retries += 1) < max_retries
25
+ if (retries += 1) <= max_retries
29
26
  logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
30
27
  sleep sleep_interval and retry
31
28
  else
32
- logger.error "Browser: all retries (#{retries}) to the url `#{visit_uri}` are gone"
29
+ logger.error "Browser: all retries (#{retries - 1}) to the url `#{visit_uri}` are gone"
33
30
  raise e
34
31
  end
35
32
  else
@@ -52,7 +49,13 @@ module Capybara
52
49
 
53
50
  def destroy_driver!
54
51
  if @driver
55
- @driver.quit
52
+ begin
53
+ @driver.quit
54
+ # handle Net::ReadTimeout error for Selenium like drivers
55
+ rescue Net::ReadTimeout => e
56
+ @driver.quit
57
+ end
58
+
56
59
  @driver = nil
57
60
  logger.info "Browser: driver #{mode} has been destroyed"
58
61
  else
@@ -72,6 +75,43 @@ module Capybara
72
75
  logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
73
76
  end
74
77
 
78
+ def current_response
79
+ Nokogiri::HTML(body)
80
+ end
81
+
82
+ ###
83
+
84
+ # Handy method to perform some processing in the new tab within block and then automatically close this tab:
85
+ # Usage (url):
86
+ # browser.within_new_window_by(url: "https://google.com") do
87
+ # do some stuff and then automatically close this tab and return back to the first tab
88
+ # end
89
+ # Usage (action) (when a new tab is opened by some action, for example by clicking
90
+ # on a particular element):
91
+ # action = -> { browser.find("//some/element/path").click }
92
+ # browser.within_new_window_by(action: action) do
93
+ # do some stuff and then automatically close this tab and return back to the first tab
94
+ # end
95
+ def within_new_window_by(action: nil, url: nil)
96
+ case
97
+ when action
98
+ opened_window = window_opened_by { action.call }
99
+ within_window(opened_window) do
100
+ yield
101
+ current_window.close
102
+ end
103
+ when url
104
+ within_window(open_new_window) do
105
+ visit(url)
106
+
107
+ yield
108
+ current_window.close
109
+ end
110
+ end
111
+ end
112
+
113
+ ###
114
+
75
115
  private
76
116
 
77
117
  def process_delay(delay)
@@ -66,6 +66,7 @@ module Kimurai
66
66
  ###
67
67
 
68
68
  desc "crawl", "Run a particular spider by it's name"
69
+ option :continue, aliases: :c, type: :boolean, default: false, banner: "Continue previous crawling"
69
70
  def crawl(spider_name)
70
71
  raise "Can't find Kimurai project" unless inside_project?
71
72
  require './config/boot'
@@ -80,7 +81,7 @@ module Kimurai
80
81
  Kimurai.time_zone = time_zone
81
82
  end
82
83
 
83
- klass.crawl!
84
+ klass.crawl!(continue: options["continue"])
84
85
  end
85
86
 
86
87
  desc "parse", "Parse url in the particular spider method"
@@ -14,12 +14,16 @@ module Kimurai
14
14
 
15
15
  ###
16
16
 
17
+ def storage
18
+ spider.storage
19
+ end
20
+
17
21
  def unique?(scope, value)
18
22
  spider.unique?(scope, value)
19
23
  end
20
24
 
21
- def save_to(path, item, format:, position: true)
22
- spider.save_to(path, item, format: format, position: position)
25
+ def save_to(path, item, format:, position: true, append: false)
26
+ spider.save_to(path, item, format: format, position: position, append: append)
23
27
  end
24
28
 
25
29
  def logger
@@ -18,3 +18,11 @@ group :development do
18
18
  gem 'pry'
19
19
  end
20
20
 
21
+ # If you want to save items to the database, require one of these gems:
22
+ # gem 'sqlite3'
23
+ # gem 'pg'
24
+ # gem 'mysql2'
25
+
26
+ # And use your preferred ORM/database connector:
27
+ # gem 'activerecord', require: 'active_record'
28
+ # gem 'sequel'
@@ -64,41 +64,56 @@ class ApplicationSpider < Kimurai::Base
64
64
  # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
65
65
  # ssl_cert_path: "path/to/ssl_cert",
66
66
 
67
- # Browser (Capybara session instance) options:
68
- browser: {
69
- # Array of errors to retry while processing a request
70
- # retry_request_errors: [Net::ReadTimeout],
71
- # Restart browser if one of the options is true:
72
- restart_if: {
73
- # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
74
- # memory_limit: 350_000,
75
-
76
- # Restart browser if provided requests limit is exceeded (works for all engines)
77
- # requests_limit: 100
78
- },
79
- before_request: {
80
- # Change proxy before each request. The `proxy:` option above should be presented
81
- # and has lambda format. Works only for poltergeist and mechanize engines
82
- # (Selenium doesn't support proxy rotation).
83
- # change_proxy: true,
84
-
85
- # Change user agent before each request. The `user_agent:` option above should be presented
86
- # and has lambda format. Works only for poltergeist and mechanize engines
87
- # (selenium doesn't support to get/set headers).
88
- # change_user_agent: true,
89
-
90
- # Clear all cookies before each request, works for all engines
91
- # clear_cookies: true,
92
-
93
- # If you want to clear all cookies + set custom cookies (`cookies:` option above should be presented)
94
- # use this option instead (works for all engines)
95
- # clear_and_set_cookies: true,
96
-
97
- # Global option to set delay between requests.
98
- # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
99
- # delay number will be chosen randomly for each request: `rand (2..5) # => 3`
100
- # delay: 1..3
101
- }
67
+ # Inject some JavaScript code to the browser.
68
+ # Format: array of strings, where each string is a path to JS file.
69
+ # Works only for poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
70
+ # extensions: ["lib/code_to_inject.js"],
71
+
72
+ # Automatically skip duplicated (already visited) urls when using `request_to` method.
73
+ # Possible values: `true` or `hash` with options.
74
+ # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`
75
+ # and if the url is already contained in this scope, the request will be skipped.
76
+ # You can configure this setting by providing additional options as hash:
77
+ # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
78
+ # `scope:` - use a custom scope rather than `:requests_urls`
79
+ # `check_only:` - if true, then scope will be only checked for url, url will not
80
+ # be added to the scope if the scope doesn't contain it.
81
+ # works for all drivers
82
+ # skip_duplicate_requests: true,
83
+
84
+ # Array of errors to retry while processing a request
85
+ # retry_request_errors: [Net::ReadTimeout],
86
+
87
+ # Restart browser if one of the options is true:
88
+ restart_if: {
89
+ # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
90
+ # memory_limit: 350_000,
91
+
92
+ # Restart browser if provided requests limit is exceeded (works for all engines)
93
+ # requests_limit: 100
94
+ },
95
+ before_request: {
96
+ # Change proxy before each request. The `proxy:` option above should be present
97
+ # and have lambda format. Works only for poltergeist and mechanize engines
98
+ # (Selenium doesn't support proxy rotation).
99
+ # change_proxy: true,
100
+
101
+ # Change user agent before each request. The `user_agent:` option above should be present
102
+ # and have lambda format. Works only for poltergeist and mechanize engines
103
+ # (Selenium doesn't support getting/setting headers).
104
+ # change_user_agent: true,
105
+
106
+ # Clear all cookies before each request, works for all engines
107
+ # clear_cookies: true,
108
+
109
+ # If you want to clear all cookies + set custom cookies (`cookies:` option above should be present)
110
+ # use this option instead (works for all engines)
111
+ # clear_and_set_cookies: true,
112
+
113
+ # Global option to set delay between requests.
114
+ # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
115
+ # delay number will be chosen randomly for each request: `rand (2..5) # => 3`
116
+ # delay: 1..3
102
117
  }
103
118
  }
104
119
  end
@@ -1,3 +1,3 @@
1
1
  module Kimurai
2
- VERSION = "1.0.1"
2
+ VERSION = "1.1.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kimurai
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Victor Afanasev
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-08-27 00:00:00.000000000 Z
11
+ date: 2018-09-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -282,8 +282,8 @@ files:
282
282
  - lib/kimurai/automation/setup/phantomjs.yml
283
283
  - lib/kimurai/automation/setup/ruby_environment.yml
284
284
  - lib/kimurai/base.rb
285
- - lib/kimurai/base/simple_saver.rb
286
- - lib/kimurai/base/uniq_checker.rb
285
+ - lib/kimurai/base/saver.rb
286
+ - lib/kimurai/base/storage.rb
287
287
  - lib/kimurai/base_helper.rb
288
288
  - lib/kimurai/browser_builder.rb
289
289
  - lib/kimurai/browser_builder/mechanize_builder.rb
@@ -323,7 +323,7 @@ files:
323
323
  - lib/kimurai/template/spiders/application_spider.rb
324
324
  - lib/kimurai/template/tmp/.keep
325
325
  - lib/kimurai/version.rb
326
- homepage: https://github.com/vifreefly/kimurai
326
+ homepage: https://github.com/vifreefly/kimuraframework
327
327
  licenses:
328
328
  - MIT
329
329
  metadata: {}
@@ -1,22 +0,0 @@
1
- module Kimurai
2
- class Base
3
- class UniqChecker
4
- def initialize
5
- @database = {}
6
- @mutex = Mutex.new
7
- end
8
-
9
- def unique?(scope, value)
10
- @mutex.synchronize do
11
- @database[scope] ||= []
12
- if @database[scope].include?(value)
13
- false
14
- else
15
- @database[scope].push(value)
16
- true
17
- end
18
- end
19
- end
20
- end
21
- end
22
- end