kimurai 1.3.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +29 -0
- data/Gemfile +2 -2
- data/README.md +478 -649
- data/Rakefile +6 -6
- data/bin/console +3 -4
- data/exe/kimurai +0 -1
- data/kimurai.gemspec +38 -37
- data/lib/kimurai/base/saver.rb +15 -19
- data/lib/kimurai/base/storage.rb +1 -1
- data/lib/kimurai/base.rb +42 -38
- data/lib/kimurai/base_helper.rb +5 -4
- data/lib/kimurai/browser_builder/mechanize_builder.rb +44 -38
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +63 -51
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +61 -55
- data/lib/kimurai/browser_builder.rb +7 -31
- data/lib/kimurai/capybara_configuration.rb +1 -1
- data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
- data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
- data/lib/kimurai/capybara_ext/session/config.rb +1 -1
- data/lib/kimurai/capybara_ext/session.rb +40 -38
- data/lib/kimurai/cli/generator.rb +15 -15
- data/lib/kimurai/cli.rb +52 -85
- data/lib/kimurai/core_ext/array.rb +2 -2
- data/lib/kimurai/core_ext/hash.rb +1 -1
- data/lib/kimurai/core_ext/numeric.rb +4 -4
- data/lib/kimurai/pipeline.rb +2 -1
- data/lib/kimurai/runner.rb +6 -6
- data/lib/kimurai/template/Gemfile +2 -2
- data/lib/kimurai/template/config/boot.rb +4 -4
- data/lib/kimurai/template/config/schedule.rb +15 -15
- data/lib/kimurai/template/spiders/application_spider.rb +14 -14
- data/lib/kimurai/version.rb +1 -1
- data/lib/kimurai.rb +7 -3
- metadata +58 -65
- data/.travis.yml +0 -5
- data/lib/kimurai/automation/deploy.yml +0 -54
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
- data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
- data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
- data/lib/kimurai/automation/setup.yml +0 -44
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -171
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
- data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
- data/lib/kimurai/template/config/automation.yml +0 -13
data/Rakefile
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
require
|
|
2
|
-
require
|
|
1
|
+
require 'bundler/gem_tasks'
|
|
2
|
+
require 'rake/testtask'
|
|
3
3
|
|
|
4
4
|
Rake::TestTask.new(:test) do |t|
|
|
5
|
-
t.libs <<
|
|
6
|
-
t.libs <<
|
|
7
|
-
t.test_files = FileList[
|
|
5
|
+
t.libs << 'test'
|
|
6
|
+
t.libs << 'lib'
|
|
7
|
+
t.test_files = FileList['test/**/*_test.rb']
|
|
8
8
|
end
|
|
9
9
|
|
|
10
|
-
task :
|
|
10
|
+
task default: :test
|
data/bin/console
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
|
-
require
|
|
4
|
-
require "kimurai"
|
|
2
|
+
require 'bundler/setup'
|
|
3
|
+
require 'kimurai'
|
|
5
4
|
|
|
6
5
|
# You can add fixtures and/or initialization code here to make experimenting
|
|
7
6
|
# with your gem easier. You can also use a different console, if you like.
|
|
@@ -10,5 +9,5 @@ require "kimurai"
|
|
|
10
9
|
# require "pry"
|
|
11
10
|
# Pry.start
|
|
12
11
|
|
|
13
|
-
require
|
|
12
|
+
require 'irb'
|
|
14
13
|
IRB.start(__FILE__)
|
data/exe/kimurai
CHANGED
data/kimurai.gemspec
CHANGED
|
@@ -1,48 +1,49 @@
|
|
|
1
|
-
|
|
2
|
-
lib = File.expand_path("../lib", __FILE__)
|
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
|
-
require
|
|
3
|
+
require 'kimurai/version'
|
|
5
4
|
|
|
6
5
|
Gem::Specification.new do |spec|
|
|
7
|
-
spec.name =
|
|
6
|
+
spec.name = 'kimurai'
|
|
8
7
|
spec.version = Kimurai::VERSION
|
|
9
|
-
spec.authors = [
|
|
10
|
-
spec.email = [
|
|
8
|
+
spec.authors = ['Victor Afanasev']
|
|
9
|
+
spec.email = ['vicfreefly@gmail.com']
|
|
11
10
|
|
|
12
|
-
spec.summary =
|
|
13
|
-
spec.homepage =
|
|
14
|
-
spec.license =
|
|
11
|
+
spec.summary = 'Modern web scraping framework written in Ruby and based on Capybara/Nokogiri'
|
|
12
|
+
spec.homepage = 'https://github.com/vifreefly/kimuraframework'
|
|
13
|
+
spec.license = 'MIT'
|
|
15
14
|
|
|
16
15
|
# Specify which files should be added to the gem when it is released.
|
|
17
16
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
18
|
-
spec.files = Dir.chdir(File.expand_path(
|
|
17
|
+
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
|
19
18
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
|
20
19
|
end
|
|
21
|
-
spec.bindir =
|
|
22
|
-
spec.executables =
|
|
23
|
-
spec.require_paths = [
|
|
24
|
-
spec.required_ruby_version =
|
|
25
|
-
|
|
26
|
-
spec.add_dependency
|
|
27
|
-
spec.add_dependency
|
|
28
|
-
spec.add_dependency
|
|
29
|
-
spec.add_dependency
|
|
30
|
-
spec.add_dependency
|
|
31
|
-
|
|
32
|
-
spec.add_dependency
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
spec.add_dependency
|
|
36
|
-
|
|
37
|
-
spec.add_dependency
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
spec.add_dependency
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
spec.add_dependency
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
spec.
|
|
47
|
-
|
|
20
|
+
spec.bindir = 'exe'
|
|
21
|
+
spec.executables = 'kimurai'
|
|
22
|
+
spec.require_paths = ['lib']
|
|
23
|
+
spec.required_ruby_version = '>= 3.1.0'
|
|
24
|
+
|
|
25
|
+
spec.add_dependency 'activesupport'
|
|
26
|
+
spec.add_dependency 'cliver'
|
|
27
|
+
spec.add_dependency 'csv'
|
|
28
|
+
spec.add_dependency 'murmurhash3'
|
|
29
|
+
spec.add_dependency 'nokogiri'
|
|
30
|
+
spec.add_dependency 'ostruct'
|
|
31
|
+
spec.add_dependency 'thor'
|
|
32
|
+
|
|
33
|
+
# for capybara-mechanize compatibility
|
|
34
|
+
spec.add_dependency 'mutex_m'
|
|
35
|
+
spec.add_dependency 'nkf'
|
|
36
|
+
spec.add_dependency 'reline'
|
|
37
|
+
|
|
38
|
+
spec.add_dependency 'capybara', '~> 3.40'
|
|
39
|
+
spec.add_dependency 'capybara-mechanize', '~> 1.13'
|
|
40
|
+
spec.add_dependency 'selenium-webdriver', '~> 4.27'
|
|
41
|
+
|
|
42
|
+
spec.add_dependency 'headless'
|
|
43
|
+
spec.add_dependency 'pmap'
|
|
44
|
+
|
|
45
|
+
spec.add_dependency 'whenever'
|
|
46
|
+
|
|
47
|
+
spec.add_dependency 'pry'
|
|
48
|
+
spec.add_dependency 'rbcat', '~> 1.0'
|
|
48
49
|
end
|
data/lib/kimurai/base/saver.rb
CHANGED
|
@@ -7,9 +7,7 @@ module Kimurai
|
|
|
7
7
|
attr_reader :format, :path, :position, :append
|
|
8
8
|
|
|
9
9
|
def initialize(path, format:, position: true, append: false)
|
|
10
|
-
unless %i
|
|
11
|
-
raise "SimpleSaver: wrong type of format: #{format}"
|
|
12
|
-
end
|
|
10
|
+
raise "SimpleSaver: wrong type of format: #{format}" unless %i[json pretty_json jsonlines csv].include?(format)
|
|
13
11
|
|
|
14
12
|
@path = path
|
|
15
13
|
@format = format
|
|
@@ -42,48 +40,48 @@ module Kimurai
|
|
|
42
40
|
def save_to_json(item)
|
|
43
41
|
data = JSON.generate([item])
|
|
44
42
|
|
|
45
|
-
if @index > 1 || append && File.
|
|
43
|
+
if @index > 1 || append && File.exist?(path)
|
|
46
44
|
file_content = File.read(path).sub(/\}\]\Z/, "\}\,")
|
|
47
|
-
File.open(path,
|
|
48
|
-
f.write(file_content + data.sub(/\A\[/,
|
|
45
|
+
File.open(path, 'w') do |f|
|
|
46
|
+
f.write(file_content + data.sub(/\A\[/, ''))
|
|
49
47
|
end
|
|
50
48
|
else
|
|
51
|
-
File.open(path,
|
|
49
|
+
File.open(path, 'w') { |f| f.write(data) }
|
|
52
50
|
end
|
|
53
51
|
end
|
|
54
52
|
|
|
55
53
|
def save_to_pretty_json(item)
|
|
56
54
|
data = JSON.pretty_generate([item])
|
|
57
55
|
|
|
58
|
-
if @index > 1 || append && File.
|
|
56
|
+
if @index > 1 || append && File.exist?(path)
|
|
59
57
|
file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n")
|
|
60
|
-
File.open(path,
|
|
61
|
-
f.write(file_content + data.sub(/\A\[\n/,
|
|
58
|
+
File.open(path, 'w') do |f|
|
|
59
|
+
f.write(file_content + data.sub(/\A\[\n/, ''))
|
|
62
60
|
end
|
|
63
61
|
else
|
|
64
|
-
File.open(path,
|
|
62
|
+
File.open(path, 'w') { |f| f.write(data) }
|
|
65
63
|
end
|
|
66
64
|
end
|
|
67
65
|
|
|
68
66
|
def save_to_jsonlines(item)
|
|
69
67
|
data = JSON.generate(item)
|
|
70
68
|
|
|
71
|
-
if @index > 1 || append && File.
|
|
72
|
-
File.open(path,
|
|
69
|
+
if @index > 1 || append && File.exist?(path)
|
|
70
|
+
File.open(path, 'a') { |file| file.write("\n#{data}") }
|
|
73
71
|
else
|
|
74
|
-
File.open(path,
|
|
72
|
+
File.open(path, 'w') { |file| file.write(data) }
|
|
75
73
|
end
|
|
76
74
|
end
|
|
77
75
|
|
|
78
76
|
def save_to_csv(item)
|
|
79
77
|
data = flatten_hash(item)
|
|
80
78
|
|
|
81
|
-
if @index > 1 || append && File.
|
|
82
|
-
CSV.open(path,
|
|
79
|
+
if @index > 1 || append && File.exist?(path)
|
|
80
|
+
CSV.open(path, 'a+', force_quotes: true) do |csv|
|
|
83
81
|
csv << data.values
|
|
84
82
|
end
|
|
85
83
|
else
|
|
86
|
-
CSV.open(path,
|
|
84
|
+
CSV.open(path, 'w', force_quotes: true) do |csv|
|
|
87
85
|
csv << data.keys
|
|
88
86
|
csv << data.values
|
|
89
87
|
end
|
|
@@ -102,5 +100,3 @@ module Kimurai
|
|
|
102
100
|
end
|
|
103
101
|
end
|
|
104
102
|
end
|
|
105
|
-
|
|
106
|
-
|
data/lib/kimurai/base/storage.rb
CHANGED
data/lib/kimurai/base.rb
CHANGED
|
@@ -1,19 +1,22 @@
|
|
|
1
|
+
require 'English'
|
|
1
2
|
require_relative 'base/saver'
|
|
2
3
|
require_relative 'base/storage'
|
|
3
4
|
|
|
4
5
|
module Kimurai
|
|
5
6
|
class Base
|
|
7
|
+
class InvalidUrlError < StandardError; end
|
|
8
|
+
|
|
6
9
|
# don't deep merge config's headers hash option
|
|
7
|
-
DMERGE_EXCLUDE = [:headers]
|
|
10
|
+
DMERGE_EXCLUDE = [:headers].freeze
|
|
8
11
|
|
|
9
12
|
LoggerFormatter = proc do |severity, datetime, progname, msg|
|
|
10
13
|
current_thread_id = Thread.current.object_id
|
|
11
|
-
thread_type = Thread.main == Thread.current ?
|
|
12
|
-
output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
|
|
13
|
-
|
|
14
|
+
thread_type = Thread.main == Thread.current ? 'M' : 'C'
|
|
15
|
+
output = format("%s, [%s#%d] [%s: %s] %5s -- %s: %s\n", severity[0..0], datetime, $PROCESS_ID, thread_type,
|
|
16
|
+
current_thread_id, severity, progname, msg)
|
|
14
17
|
|
|
15
|
-
if Kimurai.configuration.colorize_logger != false && Kimurai.env ==
|
|
16
|
-
Rbcat.colorize(output, predefined: [
|
|
18
|
+
if Kimurai.configuration.colorize_logger != false && Kimurai.env == 'development'
|
|
19
|
+
Rbcat.colorize(output, predefined: %i[jsonhash logger])
|
|
17
20
|
else
|
|
18
21
|
output
|
|
19
22
|
end
|
|
@@ -49,11 +52,13 @@ module Kimurai
|
|
|
49
52
|
|
|
50
53
|
def self.update(type, subtype)
|
|
51
54
|
return unless @run_info
|
|
55
|
+
|
|
52
56
|
@update_mutex.synchronize { @run_info[type][subtype] += 1 }
|
|
53
57
|
end
|
|
54
58
|
|
|
55
59
|
def self.add_event(scope, event)
|
|
56
60
|
return unless @run_info
|
|
61
|
+
|
|
57
62
|
@update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
|
|
58
63
|
end
|
|
59
64
|
|
|
@@ -91,9 +96,9 @@ module Kimurai
|
|
|
91
96
|
|
|
92
97
|
def self.logger
|
|
93
98
|
@logger ||= Kimurai.configuration.logger || begin
|
|
94
|
-
log_level = (ENV[
|
|
99
|
+
log_level = (ENV['LOG_LEVEL'] || Kimurai.configuration.log_level || 'DEBUG').to_s.upcase
|
|
95
100
|
log_level = "Logger::#{log_level}".constantize
|
|
96
|
-
Logger.new(
|
|
101
|
+
Logger.new($stdout, formatter: LoggerFormatter, level: log_level, progname: name)
|
|
97
102
|
end
|
|
98
103
|
end
|
|
99
104
|
|
|
@@ -114,13 +119,13 @@ module Kimurai
|
|
|
114
119
|
###
|
|
115
120
|
|
|
116
121
|
logger.info "Spider: started: #{name}"
|
|
117
|
-
open_spider if
|
|
122
|
+
open_spider if respond_to? :open_spider
|
|
118
123
|
|
|
119
|
-
spider =
|
|
124
|
+
spider = new
|
|
120
125
|
spider.with_info = true
|
|
121
126
|
if start_urls
|
|
122
127
|
start_urls.each do |start_url|
|
|
123
|
-
if start_url.
|
|
128
|
+
if start_url.instance_of?(Hash)
|
|
124
129
|
spider.request_to(:parse, start_url)
|
|
125
130
|
else
|
|
126
131
|
spider.request_to(:parse, url: start_url)
|
|
@@ -136,13 +141,13 @@ module Kimurai
|
|
|
136
141
|
@run_info.merge!(status: :completed)
|
|
137
142
|
ensure
|
|
138
143
|
if spider
|
|
139
|
-
spider.browser.destroy_driver! if spider.instance_variable_get(
|
|
144
|
+
spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
|
|
140
145
|
|
|
141
146
|
stop_time = Time.now
|
|
142
147
|
total_time = (stop_time - @run_info[:start_time]).round(3)
|
|
143
148
|
@run_info.merge!(stop_time: stop_time, running_time: total_time)
|
|
144
149
|
|
|
145
|
-
close_spider if
|
|
150
|
+
close_spider if respond_to? :close_spider
|
|
146
151
|
|
|
147
152
|
message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
|
|
148
153
|
failed? ? logger.fatal(message) : logger.info(message)
|
|
@@ -152,7 +157,7 @@ module Kimurai
|
|
|
152
157
|
end
|
|
153
158
|
|
|
154
159
|
def self.parse!(handler, *args, **request)
|
|
155
|
-
spider =
|
|
160
|
+
spider = new
|
|
156
161
|
|
|
157
162
|
if args.present?
|
|
158
163
|
spider.public_send(handler, *args)
|
|
@@ -162,7 +167,7 @@ module Kimurai
|
|
|
162
167
|
spider.public_send(handler)
|
|
163
168
|
end
|
|
164
169
|
ensure
|
|
165
|
-
spider.browser.destroy_driver! if spider.instance_variable_get(
|
|
170
|
+
spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
|
|
166
171
|
end
|
|
167
172
|
|
|
168
173
|
###
|
|
@@ -171,7 +176,7 @@ module Kimurai
|
|
|
171
176
|
attr_accessor :with_info
|
|
172
177
|
|
|
173
178
|
def initialize(engine = self.class.engine, config: {})
|
|
174
|
-
@engine = engine
|
|
179
|
+
@engine = engine || self.class.engine
|
|
175
180
|
@config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
|
|
176
181
|
@pipelines = self.class.pipelines.map do |pipeline_name|
|
|
177
182
|
klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
|
|
@@ -189,15 +194,17 @@ module Kimurai
|
|
|
189
194
|
end
|
|
190
195
|
|
|
191
196
|
def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
|
|
197
|
+
raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).is_a?(URI::HTTP)
|
|
198
|
+
|
|
192
199
|
if @config[:skip_duplicate_requests] && !unique_request?(url)
|
|
193
|
-
add_event(:duplicate_requests) if
|
|
200
|
+
add_event(:duplicate_requests) if with_info
|
|
194
201
|
logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
|
|
195
202
|
end
|
|
196
203
|
|
|
197
204
|
visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
|
|
198
205
|
return unless visited
|
|
199
206
|
|
|
200
|
-
public_send(handler, browser.current_response(response_type), { url: url, data: data })
|
|
207
|
+
public_send(handler, browser.current_response(response_type), **{ url: url, data: data })
|
|
201
208
|
end
|
|
202
209
|
|
|
203
210
|
def console(response = nil, url: nil, data: {})
|
|
@@ -207,9 +214,9 @@ module Kimurai
|
|
|
207
214
|
###
|
|
208
215
|
|
|
209
216
|
def storage
|
|
210
|
-
#
|
|
217
|
+
# NOTE: for `.crawl!` uses shared thread safe Storage instance,
|
|
211
218
|
# otherwise, each spider instance will have it's own Storage
|
|
212
|
-
@storage ||=
|
|
219
|
+
@storage ||= with_info ? self.class.storage : Storage.new
|
|
213
220
|
end
|
|
214
221
|
|
|
215
222
|
def unique?(scope, value)
|
|
@@ -219,10 +226,10 @@ module Kimurai
|
|
|
219
226
|
def save_to(path, item, format:, position: true, append: false)
|
|
220
227
|
@savers[path] ||= begin
|
|
221
228
|
options = { format: format, position: position, append: append }
|
|
222
|
-
if
|
|
223
|
-
self.class.savers[path] ||= Saver.new(path, options)
|
|
229
|
+
if with_info
|
|
230
|
+
self.class.savers[path] ||= Saver.new(path, **options)
|
|
224
231
|
else
|
|
225
|
-
Saver.new(path, options)
|
|
232
|
+
Saver.new(path, **options)
|
|
226
233
|
end
|
|
227
234
|
end
|
|
228
235
|
|
|
@@ -232,11 +239,8 @@ module Kimurai
|
|
|
232
239
|
###
|
|
233
240
|
|
|
234
241
|
def add_event(scope = :custom, event)
|
|
235
|
-
|
|
236
|
-
raise "It's allowed to use `add_event` only while performing a full run (`.crawl!` method)"
|
|
237
|
-
end
|
|
242
|
+
self.class.add_event(scope, event) if with_info
|
|
238
243
|
|
|
239
|
-
self.class.add_event(scope, event)
|
|
240
244
|
logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
|
|
241
245
|
end
|
|
242
246
|
|
|
@@ -250,35 +254,35 @@ module Kimurai
|
|
|
250
254
|
|
|
251
255
|
def unique_request?(url)
|
|
252
256
|
options = @config[:skip_duplicate_requests]
|
|
253
|
-
if options.
|
|
257
|
+
if options.instance_of?(Hash)
|
|
254
258
|
scope = options[:scope] || :requests_urls
|
|
255
259
|
if options[:check_only]
|
|
256
260
|
storage.include?(scope, url) ? false : true
|
|
257
261
|
else
|
|
258
|
-
storage.unique?(scope, url)
|
|
262
|
+
storage.unique?(scope, url) || false
|
|
259
263
|
end
|
|
260
264
|
else
|
|
261
|
-
storage.unique?(:requests_urls, url)
|
|
265
|
+
storage.unique?(:requests_urls, url) || false
|
|
262
266
|
end
|
|
263
267
|
end
|
|
264
268
|
|
|
265
269
|
def send_item(item, options = {})
|
|
266
270
|
logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
|
|
267
|
-
self.class.update(:items, :sent) if
|
|
271
|
+
self.class.update(:items, :sent) if with_info
|
|
268
272
|
|
|
269
273
|
@pipelines.each do |name, instance|
|
|
270
274
|
item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
|
|
271
275
|
end
|
|
272
|
-
rescue => e
|
|
276
|
+
rescue StandardError => e
|
|
273
277
|
logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
|
|
274
|
-
add_event(:drop_items_errors, e.inspect) if
|
|
278
|
+
add_event(:drop_items_errors, e.inspect) if with_info
|
|
275
279
|
false
|
|
276
280
|
else
|
|
277
|
-
self.class.update(:items, :processed) if
|
|
281
|
+
self.class.update(:items, :processed) if with_info
|
|
278
282
|
logger.info "Pipeline: processed: #{JSON.generate(item)}"
|
|
279
283
|
true
|
|
280
284
|
ensure
|
|
281
|
-
if
|
|
285
|
+
if with_info
|
|
282
286
|
logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
|
|
283
287
|
end
|
|
284
288
|
end
|
|
@@ -296,10 +300,10 @@ module Kimurai
|
|
|
296
300
|
Thread.current.abort_on_exception = true
|
|
297
301
|
|
|
298
302
|
spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
|
|
299
|
-
spider.with_info = true if
|
|
303
|
+
spider.with_info = true if with_info
|
|
300
304
|
|
|
301
305
|
part.each do |url_data|
|
|
302
|
-
if url_data.
|
|
306
|
+
if url_data.instance_of?(Hash)
|
|
303
307
|
if url_data[:url].present? && url_data[:data].present?
|
|
304
308
|
spider.request_to(handler, delay, url_data)
|
|
305
309
|
else
|
|
@@ -310,7 +314,7 @@ module Kimurai
|
|
|
310
314
|
end
|
|
311
315
|
end
|
|
312
316
|
ensure
|
|
313
|
-
spider.browser.destroy_driver! if spider.instance_variable_get(
|
|
317
|
+
spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
|
|
314
318
|
end
|
|
315
319
|
|
|
316
320
|
sleep 0.5
|
data/lib/kimurai/base_helper.rb
CHANGED
|
@@ -4,13 +4,14 @@ module Kimurai
|
|
|
4
4
|
|
|
5
5
|
def absolute_url(url, base:)
|
|
6
6
|
return unless url
|
|
7
|
-
|
|
7
|
+
|
|
8
|
+
URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
|
|
8
9
|
end
|
|
9
10
|
|
|
10
11
|
def escape_url(url)
|
|
11
|
-
|
|
12
|
-
rescue URI::InvalidURIError
|
|
13
|
-
URI.parse(URI.escape
|
|
12
|
+
URI.parse(url)
|
|
13
|
+
rescue URI::InvalidURIError
|
|
14
|
+
URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s rescue url
|
|
14
15
|
else
|
|
15
16
|
url
|
|
16
17
|
end
|
|
@@ -5,7 +5,7 @@ require_relative '../capybara_ext/mechanize/driver'
|
|
|
5
5
|
require_relative '../capybara_ext/session'
|
|
6
6
|
|
|
7
7
|
module Kimurai
|
|
8
|
-
|
|
8
|
+
module BrowserBuilder
|
|
9
9
|
class MechanizeBuilder
|
|
10
10
|
attr_reader :logger, :spider
|
|
11
11
|
|
|
@@ -17,8 +17,8 @@ module Kimurai
|
|
|
17
17
|
|
|
18
18
|
def build
|
|
19
19
|
# Register driver
|
|
20
|
-
Capybara.register_driver :mechanize do |
|
|
21
|
-
driver = Capybara::Mechanize::Driver.new(
|
|
20
|
+
Capybara.register_driver :mechanize do |_app|
|
|
21
|
+
driver = Capybara::Mechanize::Driver.new('app')
|
|
22
22
|
# keep the history as small as possible (by default it's unlimited)
|
|
23
23
|
driver.configure { |a| a.history.max_size = 2 }
|
|
24
24
|
driver
|
|
@@ -27,19 +27,19 @@ module Kimurai
|
|
|
27
27
|
# Create browser instance (Capybara session)
|
|
28
28
|
@browser = Capybara::Session.new(:mechanize)
|
|
29
29
|
@browser.spider = spider
|
|
30
|
-
logger.debug
|
|
30
|
+
logger.debug 'BrowserBuilder (mechanize): created browser instance'
|
|
31
31
|
|
|
32
32
|
if @config[:extensions].present?
|
|
33
|
-
logger.error
|
|
33
|
+
logger.error 'BrowserBuilder (mechanize): `extensions` option not supported, skipped'
|
|
34
34
|
end
|
|
35
35
|
|
|
36
36
|
# Proxy
|
|
37
|
-
if proxy = @config[:proxy].presence
|
|
38
|
-
proxy_string = (proxy.
|
|
39
|
-
ip, port, type = proxy_string.split(
|
|
37
|
+
if (proxy = @config[:proxy].presence)
|
|
38
|
+
proxy_string = (proxy.instance_of?(Proc) ? proxy.call : proxy).strip
|
|
39
|
+
ip, port, type = proxy_string.split(':')
|
|
40
40
|
|
|
41
|
-
if type ==
|
|
42
|
-
@browser.driver.set_proxy(*proxy_string.split(
|
|
41
|
+
if type == 'http'
|
|
42
|
+
@browser.driver.set_proxy(*proxy_string.split(':'))
|
|
43
43
|
logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
|
|
44
44
|
else
|
|
45
45
|
logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
|
|
@@ -47,99 +47,105 @@ module Kimurai
|
|
|
47
47
|
end
|
|
48
48
|
|
|
49
49
|
# SSL
|
|
50
|
-
if ssl_cert_path = @config[:ssl_cert_path].presence
|
|
50
|
+
if (ssl_cert_path = @config[:ssl_cert_path].presence)
|
|
51
51
|
@browser.driver.browser.agent.http.ca_file = ssl_cert_path
|
|
52
|
-
logger.debug
|
|
52
|
+
logger.debug 'BrowserBuilder (mechanize): enabled custom ssl_cert'
|
|
53
53
|
end
|
|
54
54
|
|
|
55
55
|
if @config[:ignore_ssl_errors].present?
|
|
56
56
|
@browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
57
|
-
logger.debug
|
|
57
|
+
logger.debug 'BrowserBuilder (mechanize): enabled ignore_ssl_errors'
|
|
58
58
|
end
|
|
59
59
|
|
|
60
60
|
# Headers
|
|
61
|
-
if headers = @config[:headers].presence
|
|
61
|
+
if (headers = @config[:headers].presence)
|
|
62
62
|
@browser.driver.headers = headers
|
|
63
|
-
logger.debug
|
|
63
|
+
logger.debug 'BrowserBuilder (mechanize): enabled custom headers'
|
|
64
64
|
end
|
|
65
65
|
|
|
66
|
-
if user_agent = @config[:user_agent].presence
|
|
67
|
-
user_agent_string = (user_agent.
|
|
66
|
+
if (user_agent = @config[:user_agent].presence)
|
|
67
|
+
user_agent_string = (user_agent.instance_of?(Proc) ? user_agent.call : user_agent).strip
|
|
68
68
|
|
|
69
|
-
@browser.driver.add_header(
|
|
70
|
-
logger.debug
|
|
69
|
+
@browser.driver.add_header('User-Agent', user_agent_string)
|
|
70
|
+
logger.debug 'BrowserBuilder (mechanize): enabled custom user_agent'
|
|
71
71
|
end
|
|
72
72
|
|
|
73
73
|
# Cookies
|
|
74
|
-
if cookies = @config[:cookies].presence
|
|
74
|
+
if (cookies = @config[:cookies].presence)
|
|
75
75
|
cookies.each do |cookie|
|
|
76
76
|
@browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
|
|
77
77
|
end
|
|
78
78
|
|
|
79
|
-
logger.debug
|
|
79
|
+
logger.debug 'BrowserBuilder (mechanize): enabled custom cookies'
|
|
80
80
|
end
|
|
81
81
|
|
|
82
82
|
# Browser instance options
|
|
83
83
|
# skip_request_errors
|
|
84
|
-
if skip_errors = @config[:skip_request_errors].presence
|
|
84
|
+
if (skip_errors = @config[:skip_request_errors].presence)
|
|
85
85
|
@browser.config.skip_request_errors = skip_errors
|
|
86
|
-
logger.debug
|
|
86
|
+
logger.debug 'BrowserBuilder (mechanize): enabled skip_request_errors'
|
|
87
87
|
end
|
|
88
88
|
|
|
89
89
|
# retry_request_errors
|
|
90
|
-
if retry_errors = @config[:retry_request_errors].presence
|
|
90
|
+
if (retry_errors = @config[:retry_request_errors].presence)
|
|
91
91
|
@browser.config.retry_request_errors = retry_errors
|
|
92
|
-
logger.debug
|
|
92
|
+
logger.debug 'BrowserBuilder (mechanize): enabled retry_request_errors'
|
|
93
93
|
end
|
|
94
94
|
|
|
95
95
|
# restart_if
|
|
96
96
|
if @config[:restart_if].present?
|
|
97
|
-
logger.warn
|
|
97
|
+
logger.warn 'BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped'
|
|
98
98
|
end
|
|
99
99
|
|
|
100
100
|
# before_request clear_cookies
|
|
101
101
|
if @config.dig(:before_request, :clear_cookies)
|
|
102
102
|
@browser.config.before_request[:clear_cookies] = true
|
|
103
|
-
logger.debug
|
|
103
|
+
logger.debug 'BrowserBuilder (mechanize): enabled before_request.clear_cookies'
|
|
104
104
|
end
|
|
105
105
|
|
|
106
106
|
# before_request clear_and_set_cookies
|
|
107
107
|
if @config.dig(:before_request, :clear_and_set_cookies)
|
|
108
|
-
if cookies = @config[:cookies].presence
|
|
108
|
+
if (cookies = @config[:cookies].presence)
|
|
109
109
|
@browser.config.cookies = cookies
|
|
110
110
|
@browser.config.before_request[:clear_and_set_cookies] = true
|
|
111
|
-
logger.debug
|
|
111
|
+
logger.debug 'BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies'
|
|
112
112
|
else
|
|
113
|
-
logger.error
|
|
113
|
+
logger.error 'BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped'
|
|
114
114
|
end
|
|
115
115
|
end
|
|
116
116
|
|
|
117
117
|
# before_request change_user_agent
|
|
118
118
|
if @config.dig(:before_request, :change_user_agent)
|
|
119
|
-
if @config[:user_agent].present? && @config[:user_agent].
|
|
119
|
+
if @config[:user_agent].present? && @config[:user_agent].instance_of?(Proc)
|
|
120
120
|
@browser.config.user_agent = @config[:user_agent]
|
|
121
121
|
@browser.config.before_request[:change_user_agent] = true
|
|
122
|
-
logger.debug
|
|
122
|
+
logger.debug 'BrowserBuilder (mechanize): enabled before_request.change_user_agent'
|
|
123
123
|
else
|
|
124
|
-
logger.error
|
|
124
|
+
logger.error 'BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped'
|
|
125
125
|
end
|
|
126
126
|
end
|
|
127
127
|
|
|
128
128
|
# before_request change_proxy
|
|
129
129
|
if @config.dig(:before_request, :change_proxy)
|
|
130
|
-
if @config[:proxy].present? && @config[:proxy].
|
|
130
|
+
if @config[:proxy].present? && @config[:proxy].instance_of?(Proc)
|
|
131
131
|
@browser.config.proxy = @config[:proxy]
|
|
132
132
|
@browser.config.before_request[:change_proxy] = true
|
|
133
|
-
logger.debug
|
|
133
|
+
logger.debug 'BrowserBuilder (mechanize): enabled before_request.change_proxy'
|
|
134
134
|
else
|
|
135
|
-
logger.error
|
|
135
|
+
logger.error 'BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped'
|
|
136
136
|
end
|
|
137
137
|
end
|
|
138
138
|
|
|
139
139
|
# before_request delay
|
|
140
|
-
if delay = @config.dig(:before_request, :delay).presence
|
|
140
|
+
if (delay = @config.dig(:before_request, :delay).presence)
|
|
141
141
|
@browser.config.before_request[:delay] = delay
|
|
142
|
-
logger.debug
|
|
142
|
+
logger.debug 'BrowserBuilder (mechanize): enabled before_request.delay'
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
# encoding
|
|
146
|
+
if (encoding = @config[:encoding])
|
|
147
|
+
@browser.config.encoding = encoding
|
|
148
|
+
logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}"
|
|
143
149
|
end
|
|
144
150
|
|
|
145
151
|
# return Capybara session instance
|