kimurai 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -1
- data/README.md +183 -69
- data/kimurai.gemspec +1 -1
- data/lib/kimurai/base.rb +96 -36
- data/lib/kimurai/base/{simple_saver.rb → saver.rb} +25 -17
- data/lib/kimurai/base/storage.rb +91 -0
- data/lib/kimurai/browser_builder.rb +6 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +22 -18
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +25 -20
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +21 -23
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +22 -18
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +1 -1
- data/lib/kimurai/capybara_ext/session.rb +47 -7
- data/lib/kimurai/cli.rb +2 -1
- data/lib/kimurai/pipeline.rb +6 -2
- data/lib/kimurai/template/Gemfile +8 -0
- data/lib/kimurai/template/spiders/application_spider.rb +50 -35
- data/lib/kimurai/version.rb +1 -1
- metadata +5 -5
- data/lib/kimurai/base/uniq_checker.rb +0 -22
@@ -82,7 +82,7 @@ module Kimurai
|
|
82
82
|
if user_agent = @config[:user_agent].presence
|
83
83
|
user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
|
84
84
|
driver_options.profile["general.useragent.override"] = user_agent_string
|
85
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled custom
|
85
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent"
|
86
86
|
end
|
87
87
|
|
88
88
|
# Headless mode
|
@@ -114,6 +114,10 @@ module Kimurai
|
|
114
114
|
@browser.spider = spider
|
115
115
|
logger.debug "BrowserBuilder (selenium_firefox): created browser instance"
|
116
116
|
|
117
|
+
if @config[:extensions].present?
|
118
|
+
logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped"
|
119
|
+
end
|
120
|
+
|
117
121
|
# Window size
|
118
122
|
if size = @config[:window_size].presence
|
119
123
|
@browser.current_window.resize_to(*size)
|
@@ -128,53 +132,53 @@ module Kimurai
|
|
128
132
|
|
129
133
|
# Browser instance options
|
130
134
|
# retry_request_errors
|
131
|
-
if errors = @config
|
135
|
+
if errors = @config[:retry_request_errors].presence
|
132
136
|
@browser.config.retry_request_errors = errors
|
133
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
137
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors"
|
134
138
|
end
|
135
139
|
|
136
140
|
# restart_if
|
137
|
-
if requests_limit = @config.dig(:
|
141
|
+
if requests_limit = @config.dig(:restart_if, :requests_limit).presence
|
138
142
|
@browser.config.restart_if[:requests_limit] = requests_limit
|
139
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
143
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}"
|
140
144
|
end
|
141
145
|
|
142
|
-
if memory_limit = @config.dig(:
|
146
|
+
if memory_limit = @config.dig(:restart_if, :memory_limit).presence
|
143
147
|
@browser.config.restart_if[:memory_limit] = memory_limit
|
144
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
148
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}"
|
145
149
|
end
|
146
150
|
|
147
151
|
# before_request clear_cookies
|
148
|
-
if @config.dig(:
|
152
|
+
if @config.dig(:before_request, :clear_cookies)
|
149
153
|
@browser.config.before_request[:clear_cookies] = true
|
150
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
154
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies"
|
151
155
|
end
|
152
156
|
|
153
157
|
# before_request clear_and_set_cookies
|
154
|
-
if @config.dig(:
|
158
|
+
if @config.dig(:before_request, :clear_and_set_cookies)
|
155
159
|
if cookies = @config[:cookies].presence
|
156
160
|
@browser.config.cookies = cookies
|
157
161
|
@browser.config.before_request[:clear_and_set_cookies] = true
|
158
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
162
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies"
|
159
163
|
else
|
160
|
-
logger.error "BrowserBuilder (selenium_firefox):
|
164
|
+
logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
|
161
165
|
end
|
162
166
|
end
|
163
167
|
|
164
168
|
# before_request change_user_agent
|
165
|
-
if @config.dig(:
|
166
|
-
logger.error "BrowserBuilder (selenium_firefox):
|
169
|
+
if @config.dig(:before_request, :change_user_agent)
|
170
|
+
logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped"
|
167
171
|
end
|
168
172
|
|
169
173
|
# before_request change_proxy
|
170
|
-
if @config.dig(:
|
171
|
-
logger.error "BrowserBuilder (selenium_firefox):
|
174
|
+
if @config.dig(:before_request, :change_proxy)
|
175
|
+
logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped"
|
172
176
|
end
|
173
177
|
|
174
178
|
# before_request delay
|
175
|
-
if delay = @config.dig(:
|
179
|
+
if delay = @config.dig(:before_request, :delay).presence
|
176
180
|
@browser.config.before_request[:delay] = delay
|
177
|
-
logger.debug "BrowserBuilder (selenium_firefox): enabled
|
181
|
+
logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay"
|
178
182
|
end
|
179
183
|
|
180
184
|
# return Capybara session instance
|
@@ -5,7 +5,7 @@ class Capybara::Mechanize::Driver
|
|
5
5
|
# Extend capybara-mechanize to support Poltergeist-like methods
|
6
6
|
# https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver
|
7
7
|
|
8
|
-
def set_proxy(ip, port, type, user, password)
|
8
|
+
def set_proxy(ip, port, type, user = nil, password = nil)
|
9
9
|
# type is always "http", "socks" is not supported (yet)
|
10
10
|
browser.agent.set_proxy(ip, port, user, password)
|
11
11
|
end
|
@@ -6,10 +6,6 @@ module Capybara
|
|
6
6
|
class Session
|
7
7
|
attr_accessor :spider
|
8
8
|
|
9
|
-
def current_response
|
10
|
-
Nokogiri::HTML(body)
|
11
|
-
end
|
12
|
-
|
13
9
|
alias_method :original_visit, :visit
|
14
10
|
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
|
15
11
|
if spider
|
@@ -24,12 +20,13 @@ module Capybara
|
|
24
20
|
original_visit(visit_uri)
|
25
21
|
rescue *config.retry_request_errors => e
|
26
22
|
logger.error "Browser: request visit error: #{e.inspect}, url: #{visit_uri}"
|
23
|
+
spider.add_event(:requests_errors, e.inspect) if spider.with_info
|
27
24
|
|
28
|
-
if (retries += 1)
|
25
|
+
if (retries += 1) <= max_retries
|
29
26
|
logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
|
30
27
|
sleep sleep_interval and retry
|
31
28
|
else
|
32
|
-
logger.error "Browser: all retries (#{retries}) to the url `#{visit_uri}` are gone"
|
29
|
+
logger.error "Browser: all retries (#{retries - 1}) to the url `#{visit_uri}` are gone"
|
33
30
|
raise e
|
34
31
|
end
|
35
32
|
else
|
@@ -52,7 +49,13 @@ module Capybara
|
|
52
49
|
|
53
50
|
def destroy_driver!
|
54
51
|
if @driver
|
55
|
-
|
52
|
+
begin
|
53
|
+
@driver.quit
|
54
|
+
# handle Net::ReadTimeout error for Selenium like drivers
|
55
|
+
rescue Net::ReadTimeout => e
|
56
|
+
@driver.quit
|
57
|
+
end
|
58
|
+
|
56
59
|
@driver = nil
|
57
60
|
logger.info "Browser: driver #{mode} has been destroyed"
|
58
61
|
else
|
@@ -72,6 +75,43 @@ module Capybara
|
|
72
75
|
logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
|
73
76
|
end
|
74
77
|
|
78
|
+
def current_response
|
79
|
+
Nokogiri::HTML(body)
|
80
|
+
end
|
81
|
+
|
82
|
+
###
|
83
|
+
|
84
|
+
# Handy method to perform some processing in the new tab within block and then automatically close this tab:
|
85
|
+
# Usage (url):
|
86
|
+
# browser.within_new_window_by(url: "https://google.com") do
|
87
|
+
# do some stuff and then automatically close this tab and return back to the first tab
|
88
|
+
# end
|
89
|
+
# Usage (action) (when a new tab is opened by some action, for example by clicking
|
90
|
+
# on a particular element):
|
91
|
+
# action = -> { browser.find("//some/element/path").click }
|
92
|
+
# browser.within_new_window_by(action: action) do
|
93
|
+
# do some stuff and then automatically close this tab and return back to the first tab
|
94
|
+
# end
|
95
|
+
def within_new_window_by(action: nil, url: nil)
|
96
|
+
case
|
97
|
+
when action
|
98
|
+
opened_window = window_opened_by { action.call }
|
99
|
+
within_window(opened_window) do
|
100
|
+
yield
|
101
|
+
current_window.close
|
102
|
+
end
|
103
|
+
when url
|
104
|
+
within_window(open_new_window) do
|
105
|
+
visit(url)
|
106
|
+
|
107
|
+
yield
|
108
|
+
current_window.close
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
###
|
114
|
+
|
75
115
|
private
|
76
116
|
|
77
117
|
def process_delay(delay)
|
data/lib/kimurai/cli.rb
CHANGED
@@ -66,6 +66,7 @@ module Kimurai
|
|
66
66
|
###
|
67
67
|
|
68
68
|
desc "crawl", "Run a particular spider by it's name"
|
69
|
+
option :continue, aliases: :c, type: :boolean, default: false, banner: "Continue previous crawling"
|
69
70
|
def crawl(spider_name)
|
70
71
|
raise "Can't find Kimurai project" unless inside_project?
|
71
72
|
require './config/boot'
|
@@ -80,7 +81,7 @@ module Kimurai
|
|
80
81
|
Kimurai.time_zone = time_zone
|
81
82
|
end
|
82
83
|
|
83
|
-
klass.crawl!
|
84
|
+
klass.crawl!(continue: options["continue"])
|
84
85
|
end
|
85
86
|
|
86
87
|
desc "parse", "Parse url in the particular spider method"
|
data/lib/kimurai/pipeline.rb
CHANGED
@@ -14,12 +14,16 @@ module Kimurai
|
|
14
14
|
|
15
15
|
###
|
16
16
|
|
17
|
+
def storage
|
18
|
+
spider.storage
|
19
|
+
end
|
20
|
+
|
17
21
|
def unique?(scope, value)
|
18
22
|
spider.unique?(scope, value)
|
19
23
|
end
|
20
24
|
|
21
|
-
def save_to(path, item, format:, position: true)
|
22
|
-
spider.save_to(path, item, format: format, position: position)
|
25
|
+
def save_to(path, item, format:, position: true, append: false)
|
26
|
+
spider.save_to(path, item, format: format, position: position, append: append)
|
23
27
|
end
|
24
28
|
|
25
29
|
def logger
|
@@ -18,3 +18,11 @@ group :development do
|
|
18
18
|
gem 'pry'
|
19
19
|
end
|
20
20
|
|
21
|
+
# If you want to save items to the database, require one of these gems:
|
22
|
+
# gem 'sqlite3'
|
23
|
+
# gem 'pg'
|
24
|
+
# gem 'mysql2'
|
25
|
+
|
26
|
+
# And use your preferred ORM/database connector:
|
27
|
+
# gem 'activerecord', require: 'active_record'
|
28
|
+
# gem 'sequel'
|
@@ -64,41 +64,56 @@ class ApplicationSpider < Kimurai::Base
|
|
64
64
|
# Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
|
65
65
|
# ssl_cert_path: "path/to/ssl_cert",
|
66
66
|
|
67
|
-
#
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
67
|
+
# Inject some JavaScript code to the browser.
|
68
|
+
# Format: array of strings, where each string is a path to JS file.
|
69
|
+
# Works only for poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
|
70
|
+
# extensions: ["lib/code_to_inject.js"],
|
71
|
+
|
72
|
+
# Automatically skip duplicated (already visited) urls when using `request_to` method.
|
73
|
+
# Possible values: `true` or `hash` with options.
|
74
|
+
# In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`
|
75
|
+
# and if the url is already present in this scope, the request will be skipped.
|
76
|
+
# You can configure this setting by providing additional options as hash:
|
77
|
+
# `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
|
78
|
+
# `scope:` - use a custom scope instead of `:requests_urls`
|
79
|
+
# `check_only:` - if true, the scope will only be checked for the url; the url will not
|
80
|
+
# be added to the scope if the scope doesn't contain it.
|
81
|
+
# works for all drivers
|
82
|
+
# skip_duplicate_requests: true,
|
83
|
+
|
84
|
+
# Array of errors to retry while processing a request
|
85
|
+
# retry_request_errors: [Net::ReadTimeout],
|
86
|
+
|
87
|
+
# Restart browser if one of the options is true:
|
88
|
+
restart_if: {
|
89
|
+
# Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
|
90
|
+
# memory_limit: 350_000,
|
91
|
+
|
92
|
+
# Restart browser if provided requests limit is exceeded (works for all engines)
|
93
|
+
# requests_limit: 100
|
94
|
+
},
|
95
|
+
before_request: {
|
96
|
+
# Change proxy before each request. The `proxy:` option above should be present
|
97
|
+
# and be a lambda. Works only for the poltergeist and mechanize engines
|
98
|
+
# (Selenium doesn't support proxy rotation).
|
99
|
+
# change_proxy: true,
|
100
|
+
|
101
|
+
# Change user agent before each request. The `user_agent:` option above should be present
|
102
|
+
# and be a lambda. Works only for the poltergeist and mechanize engines
|
103
|
+
# (Selenium doesn't support getting/setting headers).
|
104
|
+
# change_user_agent: true,
|
105
|
+
|
106
|
+
# Clear all cookies before each request, works for all engines
|
107
|
+
# clear_cookies: true,
|
108
|
+
|
109
|
+
# If you want to clear all cookies + set custom cookies (`cookies:` option above should be present)
|
110
|
+
# use this option instead (works for all engines)
|
111
|
+
# clear_and_set_cookies: true,
|
112
|
+
|
113
|
+
# Global option to set delay between requests.
|
114
|
+
# Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
|
115
|
+
# delay number will be chosen randomly for each request: `rand (2..5) # => 3`
|
116
|
+
# delay: 1..3
|
102
117
|
}
|
103
118
|
}
|
104
119
|
end
|
data/lib/kimurai/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kimurai
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Afanasev
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-09-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -282,8 +282,8 @@ files:
|
|
282
282
|
- lib/kimurai/automation/setup/phantomjs.yml
|
283
283
|
- lib/kimurai/automation/setup/ruby_environment.yml
|
284
284
|
- lib/kimurai/base.rb
|
285
|
-
- lib/kimurai/base/
|
286
|
-
- lib/kimurai/base/
|
285
|
+
- lib/kimurai/base/saver.rb
|
286
|
+
- lib/kimurai/base/storage.rb
|
287
287
|
- lib/kimurai/base_helper.rb
|
288
288
|
- lib/kimurai/browser_builder.rb
|
289
289
|
- lib/kimurai/browser_builder/mechanize_builder.rb
|
@@ -323,7 +323,7 @@ files:
|
|
323
323
|
- lib/kimurai/template/spiders/application_spider.rb
|
324
324
|
- lib/kimurai/template/tmp/.keep
|
325
325
|
- lib/kimurai/version.rb
|
326
|
-
homepage: https://github.com/vifreefly/
|
326
|
+
homepage: https://github.com/vifreefly/kimuraframework
|
327
327
|
licenses:
|
328
328
|
- MIT
|
329
329
|
metadata: {}
|
@@ -1,22 +0,0 @@
|
|
1
|
-
module Kimurai
|
2
|
-
class Base
|
3
|
-
class UniqChecker
|
4
|
-
def initialize
|
5
|
-
@database = {}
|
6
|
-
@mutex = Mutex.new
|
7
|
-
end
|
8
|
-
|
9
|
-
def unique?(scope, value)
|
10
|
-
@mutex.synchronize do
|
11
|
-
@database[scope] ||= []
|
12
|
-
if @database[scope].include?(value)
|
13
|
-
false
|
14
|
-
else
|
15
|
-
@database[scope].push(value)
|
16
|
-
true
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|