kimurai 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -1
- data/README.md +183 -69
- data/kimurai.gemspec +1 -1
- data/lib/kimurai/base.rb +96 -36
- data/lib/kimurai/base/{simple_saver.rb → saver.rb} +25 -17
- data/lib/kimurai/base/storage.rb +91 -0
- data/lib/kimurai/browser_builder.rb +6 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +22 -18
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +25 -20
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +21 -23
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +22 -18
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +1 -1
- data/lib/kimurai/capybara_ext/session.rb +47 -7
- data/lib/kimurai/cli.rb +2 -1
- data/lib/kimurai/pipeline.rb +6 -2
- data/lib/kimurai/template/Gemfile +8 -0
- data/lib/kimurai/template/spiders/application_spider.rb +50 -35
- data/lib/kimurai/version.rb +1 -1
- metadata +5 -5
- data/lib/kimurai/base/uniq_checker.rb +0 -22
@@ -82,7 +82,7 @@ module Kimurai
|
|
82
82
|
if user_agent = @config[:user_agent].presence
|
83
83
|
user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
|
84
84
|
driver_options.profile["general.useragent.override"] = user_agent_string
|
85
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled custom
|
85
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent"
|
86
86
|
end
|
87
87
|
|
88
88
|
# Headless mode
|
@@ -114,6 +114,10 @@ module Kimurai
|
|
114
114
|
@browser.spider = spider
|
115
115
|
logger.debug "BrowserBuilder (selenium_firefox): created browser instance"
|
116
116
|
|
117
|
+
if @config[:extensions].present?
|
118
|
+
logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped"
|
119
|
+
end
|
120
|
+
|
117
121
|
# Window size
|
118
122
|
if size = @config[:window_size].presence
|
119
123
|
@browser.current_window.resize_to(*size)
|
@@ -128,53 +132,53 @@ module Kimurai
|
|
128
132
|
|
129
133
|
# Browser instance options
|
130
134
|
# retry_request_errors
|
131
|
-
if errors = @config
|
135
|
+
if errors = @config[:retry_request_errors].presence
|
132
136
|
@browser.config.retry_request_errors = errors
|
133
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
137
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors"
|
134
138
|
end
|
135
139
|
|
136
140
|
# restart_if
|
137
|
-
if requests_limit = @config.dig(:
|
141
|
+
if requests_limit = @config.dig(:restart_if, :requests_limit).presence
|
138
142
|
@browser.config.restart_if[:requests_limit] = requests_limit
|
139
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
143
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}"
|
140
144
|
end
|
141
145
|
|
142
|
-
if memory_limit = @config.dig(:
|
146
|
+
if memory_limit = @config.dig(:restart_if, :memory_limit).presence
|
143
147
|
@browser.config.restart_if[:memory_limit] = memory_limit
|
144
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
148
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}"
|
145
149
|
end
|
146
150
|
|
147
151
|
# before_request clear_cookies
|
148
|
-
if @config.dig(:
|
152
|
+
if @config.dig(:before_request, :clear_cookies)
|
149
153
|
@browser.config.before_request[:clear_cookies] = true
|
150
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
154
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies"
|
151
155
|
end
|
152
156
|
|
153
157
|
# before_request clear_and_set_cookies
|
154
|
-
if @config.dig(:
|
158
|
+
if @config.dig(:before_request, :clear_and_set_cookies)
|
155
159
|
if cookies = @config[:cookies].presence
|
156
160
|
@browser.config.cookies = cookies
|
157
161
|
@browser.config.before_request[:clear_and_set_cookies] = true
|
158
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
162
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies"
|
159
163
|
else
|
160
|
-
logger.error "BrowserBuilder (selenium_firefox):
|
164
|
+
logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
|
161
165
|
end
|
162
166
|
end
|
163
167
|
|
164
168
|
# before_request change_user_agent
|
165
|
-
if @config.dig(:
|
166
|
-
logger.error "BrowserBuilder (selenium_firefox):
|
169
|
+
if @config.dig(:before_request, :change_user_agent)
|
170
|
+
logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped"
|
167
171
|
end
|
168
172
|
|
169
173
|
# before_request change_proxy
|
170
|
-
if @config.dig(:
|
171
|
-
logger.error "BrowserBuilder (selenium_firefox):
|
174
|
+
if @config.dig(:before_request, :change_proxy)
|
175
|
+
logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped"
|
172
176
|
end
|
173
177
|
|
174
178
|
# before_request delay
|
175
|
-
if delay = @config.dig(:
|
179
|
+
if delay = @config.dig(:before_request, :delay).presence
|
176
180
|
@browser.config.before_request[:delay] = delay
|
177
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
181
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay"
|
178
182
|
end
|
179
183
|
|
180
184
|
# return Capybara session instance
|
@@ -5,7 +5,7 @@ class Capybara::Mechanize::Driver
|
|
5
5
|
# Extend capybara-mechanize to support Poltergeist-like methods
|
6
6
|
# https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver
|
7
7
|
|
8
|
-
def set_proxy(ip, port, type, user, password)
|
8
|
+
def set_proxy(ip, port, type, user = nil, password = nil)
|
9
9
|
# type is always "http", "socks" is not supported (yet)
|
10
10
|
browser.agent.set_proxy(ip, port, user, password)
|
11
11
|
end
|
@@ -6,10 +6,6 @@ module Capybara
|
|
6
6
|
class Session
|
7
7
|
attr_accessor :spider
|
8
8
|
|
9
|
-
def current_response
|
10
|
-
Nokogiri::HTML(body)
|
11
|
-
end
|
12
|
-
|
13
9
|
alias_method :original_visit, :visit
|
14
10
|
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
|
15
11
|
if spider
|
@@ -24,12 +20,13 @@ module Capybara
|
|
24
20
|
original_visit(visit_uri)
|
25
21
|
rescue *config.retry_request_errors => e
|
26
22
|
logger.error "Browser: request visit error: #{e.inspect}, url: #{visit_uri}"
|
23
|
+
spider.add_event(:requests_errors, e.inspect) if spider.with_info
|
27
24
|
|
28
|
-
if (retries += 1)
|
25
|
+
if (retries += 1) <= max_retries
|
29
26
|
logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
|
30
27
|
sleep sleep_interval and retry
|
31
28
|
else
|
32
|
-
logger.error "Browser: all retries (#{retries}) to the url `#{visit_uri}` are gone"
|
29
|
+
logger.error "Browser: all retries (#{retries - 1}) to the url `#{visit_uri}` are gone"
|
33
30
|
raise e
|
34
31
|
end
|
35
32
|
else
|
@@ -52,7 +49,13 @@ module Capybara
|
|
52
49
|
|
53
50
|
def destroy_driver!
|
54
51
|
if @driver
|
55
|
-
|
52
|
+
begin
|
53
|
+
@driver.quit
|
54
|
+
# handle Net::ReadTimeout error for Selenium like drivers
|
55
|
+
rescue Net::ReadTimeout => e
|
56
|
+
@driver.quit
|
57
|
+
end
|
58
|
+
|
56
59
|
@driver = nil
|
57
60
|
logger.info "Browser: driver #{mode} has been destroyed"
|
58
61
|
else
|
@@ -72,6 +75,43 @@ module Capybara
|
|
72
75
|
logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
|
73
76
|
end
|
74
77
|
|
78
|
+
def current_response
|
79
|
+
Nokogiri::HTML(body)
|
80
|
+
end
|
81
|
+
|
82
|
+
###
|
83
|
+
|
84
|
+
# Handy method to perform some processing in the new tab within block and then automatically close this tab:
|
85
|
+
# Usage (url):
|
86
|
+
# browser.within_new_window_by(url: "https://google.com") do
|
87
|
+
# do some stuff and then automatically close this tab and return back to the first tab
|
88
|
+
# end
|
89
|
+
# Usage (action) (when new tab opening by some action, for example by clicking
|
90
|
+
# on a particular element):
|
91
|
+
# action = -> { browser.find("//some/element/path").click }
|
92
|
+
# browser.within_new_window_by(action: action) do
|
93
|
+
# do some stuff and then automatically close this tab and return back to the first tab
|
94
|
+
# end
|
95
|
+
def within_new_window_by(action: nil, url: nil)
|
96
|
+
case
|
97
|
+
when action
|
98
|
+
opened_window = window_opened_by { action.call }
|
99
|
+
within_window(opened_window) do
|
100
|
+
yield
|
101
|
+
current_window.close
|
102
|
+
end
|
103
|
+
when url
|
104
|
+
within_window(open_new_window) do
|
105
|
+
visit(url)
|
106
|
+
|
107
|
+
yield
|
108
|
+
current_window.close
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
###
|
114
|
+
|
75
115
|
private
|
76
116
|
|
77
117
|
def process_delay(delay)
|
data/lib/kimurai/cli.rb
CHANGED
@@ -66,6 +66,7 @@ module Kimurai
|
|
66
66
|
###
|
67
67
|
|
68
68
|
desc "crawl", "Run a particular spider by it's name"
|
69
|
+
option :continue, aliases: :c, type: :boolean, default: false, banner: "Continue previous crawling"
|
69
70
|
def crawl(spider_name)
|
70
71
|
raise "Can't find Kimurai project" unless inside_project?
|
71
72
|
require './config/boot'
|
@@ -80,7 +81,7 @@ module Kimurai
|
|
80
81
|
Kimurai.time_zone = time_zone
|
81
82
|
end
|
82
83
|
|
83
|
-
klass.crawl!
|
84
|
+
klass.crawl!(continue: options["continue"])
|
84
85
|
end
|
85
86
|
|
86
87
|
desc "parse", "Parse url in the particular spider method"
|
data/lib/kimurai/pipeline.rb
CHANGED
@@ -14,12 +14,16 @@ module Kimurai
|
|
14
14
|
|
15
15
|
###
|
16
16
|
|
17
|
+
def storage
|
18
|
+
spider.storage
|
19
|
+
end
|
20
|
+
|
17
21
|
def unique?(scope, value)
|
18
22
|
spider.unique?(scope, value)
|
19
23
|
end
|
20
24
|
|
21
|
-
def save_to(path, item, format:, position: true)
|
22
|
-
spider.save_to(path, item, format: format, position: position)
|
25
|
+
def save_to(path, item, format:, position: true, append: false)
|
26
|
+
spider.save_to(path, item, format: format, position: position, append: append)
|
23
27
|
end
|
24
28
|
|
25
29
|
def logger
|
@@ -18,3 +18,11 @@ group :development do
|
|
18
18
|
gem 'pry'
|
19
19
|
end
|
20
20
|
|
21
|
+
# If you want to save items to the database, require one of these gems:
|
22
|
+
# gem 'sqlite3'
|
23
|
+
# gem 'pg'
|
24
|
+
# gem 'mysql2'
|
25
|
+
|
26
|
+
# And use your preferred ORM/database connector:
|
27
|
+
# gem 'activerecord', require: 'active_record'
|
28
|
+
# gem 'sequel'
|
@@ -64,41 +64,56 @@ class ApplicationSpider < Kimurai::Base
|
|
64
64
|
# Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
|
65
65
|
# ssl_cert_path: "path/to/ssl_cert",
|
66
66
|
|
67
|
-
#
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
67
|
+
# Inject some JavaScript code to the browser.
|
68
|
+
# Format: array of strings, where each string is a path to JS file.
|
69
|
+
# Works only for poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
|
70
|
+
# extensions: ["lib/code_to_inject.js"],
|
71
|
+
|
72
|
+
# Automatically skip duplicated (already visited) urls when using `request_to` method.
|
73
|
+
# Possible values: `true` or `hash` with options.
|
74
|
+
# In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`
|
75
|
+
# and if the url is already present in this scope, the request will be skipped.
|
76
|
+
# You can configure this setting by providing additional options as hash:
|
77
|
+
# `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
|
78
|
+
# `scope:` - use a custom scope instead of `:requests_urls`
|
79
|
+
# `check_only:` - if true, then scope will be only checked for url, url will not
|
80
|
+
# be added to the scope if the scope doesn't contain it.
|
81
|
+
# works for all drivers
|
82
|
+
# skip_duplicate_requests: true,
|
83
|
+
|
84
|
+
# Array of errors to retry while processing a request
|
85
|
+
# retry_request_errors: [Net::ReadTimeout],
|
86
|
+
|
87
|
+
# Restart browser if one of the options is true:
|
88
|
+
restart_if: {
|
89
|
+
# Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
|
90
|
+
# memory_limit: 350_000,
|
91
|
+
|
92
|
+
# Restart browser if provided requests limit is exceeded (works for all engines)
|
93
|
+
# requests_limit: 100
|
94
|
+
},
|
95
|
+
before_request: {
|
96
|
+
# Change proxy before each request. The `proxy:` option above should be present
|
97
|
+
# and has lambda format. Works only for poltergeist and mechanize engines
|
98
|
+
# (Selenium doesn't support proxy rotation).
|
99
|
+
# change_proxy: true,
|
100
|
+
|
101
|
+
# Change user agent before each request. The `user_agent:` option above should be present
|
102
|
+
# and has lambda format. Works only for poltergeist and mechanize engines
|
103
|
+
# (Selenium doesn't support getting/setting headers).
|
104
|
+
# change_user_agent: true,
|
105
|
+
|
106
|
+
# Clear all cookies before each request, works for all engines
|
107
|
+
# clear_cookies: true,
|
108
|
+
|
109
|
+
# If you want to clear all cookies + set custom cookies (`cookies:` option above should be present)
|
110
|
+
# use this option instead (works for all engines)
|
111
|
+
# clear_and_set_cookies: true,
|
112
|
+
|
113
|
+
# Global option to set delay between requests.
|
114
|
+
# Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
|
115
|
+
# delay number will be chosen randomly for each request: `rand (2..5) # => 3`
|
116
|
+
# delay: 1..3
|
102
117
|
}
|
103
118
|
}
|
104
119
|
end
|
data/lib/kimurai/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kimurai
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Afanasev
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-09-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -282,8 +282,8 @@ files:
|
|
282
282
|
- lib/kimurai/automation/setup/phantomjs.yml
|
283
283
|
- lib/kimurai/automation/setup/ruby_environment.yml
|
284
284
|
- lib/kimurai/base.rb
|
285
|
-
- lib/kimurai/base/
|
286
|
-
- lib/kimurai/base/
|
285
|
+
- lib/kimurai/base/saver.rb
|
286
|
+
- lib/kimurai/base/storage.rb
|
287
287
|
- lib/kimurai/base_helper.rb
|
288
288
|
- lib/kimurai/browser_builder.rb
|
289
289
|
- lib/kimurai/browser_builder/mechanize_builder.rb
|
@@ -323,7 +323,7 @@ files:
|
|
323
323
|
- lib/kimurai/template/spiders/application_spider.rb
|
324
324
|
- lib/kimurai/template/tmp/.keep
|
325
325
|
- lib/kimurai/version.rb
|
326
|
-
homepage: https://github.com/vifreefly/
|
326
|
+
homepage: https://github.com/vifreefly/kimuraframework
|
327
327
|
licenses:
|
328
328
|
- MIT
|
329
329
|
metadata: {}
|
@@ -1,22 +0,0 @@
|
|
1
|
-
module Kimurai
|
2
|
-
class Base
|
3
|
-
class UniqChecker
|
4
|
-
def initialize
|
5
|
-
@database = {}
|
6
|
-
@mutex = Mutex.new
|
7
|
-
end
|
8
|
-
|
9
|
-
def unique?(scope, value)
|
10
|
-
@mutex.synchronize do
|
11
|
-
@database[scope] ||= []
|
12
|
-
if @database[scope].include?(value)
|
13
|
-
false
|
14
|
-
else
|
15
|
-
@database[scope].push(value)
|
16
|
-
true
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|