kimurai 1.4.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +21 -0
- data/Gemfile +2 -2
- data/README.md +476 -648
- data/Rakefile +6 -6
- data/bin/console +3 -4
- data/exe/kimurai +0 -1
- data/kimurai.gemspec +38 -37
- data/lib/kimurai/base/saver.rb +15 -19
- data/lib/kimurai/base/storage.rb +1 -1
- data/lib/kimurai/base.rb +38 -38
- data/lib/kimurai/base_helper.rb +5 -4
- data/lib/kimurai/browser_builder/mechanize_builder.rb +121 -119
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +160 -152
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +162 -160
- data/lib/kimurai/browser_builder.rb +1 -7
- data/lib/kimurai/capybara_configuration.rb +1 -1
- data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
- data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
- data/lib/kimurai/capybara_ext/session.rb +31 -38
- data/lib/kimurai/cli/generator.rb +15 -15
- data/lib/kimurai/cli.rb +49 -86
- data/lib/kimurai/core_ext/array.rb +2 -2
- data/lib/kimurai/core_ext/hash.rb +1 -1
- data/lib/kimurai/core_ext/numeric.rb +4 -4
- data/lib/kimurai/pipeline.rb +2 -1
- data/lib/kimurai/runner.rb +6 -6
- data/lib/kimurai/template/Gemfile +2 -2
- data/lib/kimurai/template/config/boot.rb +4 -4
- data/lib/kimurai/template/config/schedule.rb +15 -15
- data/lib/kimurai/template/spiders/application_spider.rb +8 -14
- data/lib/kimurai/version.rb +1 -1
- data/lib/kimurai.rb +7 -3
- metadata +58 -65
- data/.travis.yml +0 -5
- data/lib/kimurai/automation/deploy.yml +0 -54
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
- data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
- data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
- data/lib/kimurai/automation/setup.yml +0 -44
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -175
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
- data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
- data/lib/kimurai/template/config/automation.yml +0 -13
data/Rakefile
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
require
|
|
2
|
-
require
|
|
1
|
+
require 'bundler/gem_tasks'
|
|
2
|
+
require 'rake/testtask'
|
|
3
3
|
|
|
4
4
|
Rake::TestTask.new(:test) do |t|
|
|
5
|
-
t.libs <<
|
|
6
|
-
t.libs <<
|
|
7
|
-
t.test_files = FileList[
|
|
5
|
+
t.libs << 'test'
|
|
6
|
+
t.libs << 'lib'
|
|
7
|
+
t.test_files = FileList['test/**/*_test.rb']
|
|
8
8
|
end
|
|
9
9
|
|
|
10
|
-
task :
|
|
10
|
+
task default: :test
|
data/bin/console
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
|
-
require
|
|
4
|
-
require "kimurai"
|
|
2
|
+
require 'bundler/setup'
|
|
3
|
+
require 'kimurai'
|
|
5
4
|
|
|
6
5
|
# You can add fixtures and/or initialization code here to make experimenting
|
|
7
6
|
# with your gem easier. You can also use a different console, if you like.
|
|
@@ -10,5 +9,5 @@ require "kimurai"
|
|
|
10
9
|
# require "pry"
|
|
11
10
|
# Pry.start
|
|
12
11
|
|
|
13
|
-
require
|
|
12
|
+
require 'irb'
|
|
14
13
|
IRB.start(__FILE__)
|
data/exe/kimurai
CHANGED
data/kimurai.gemspec
CHANGED
|
@@ -1,48 +1,49 @@
|
|
|
1
|
-
|
|
2
|
-
lib = File.expand_path("../lib", __FILE__)
|
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
|
-
require
|
|
3
|
+
require 'kimurai/version'
|
|
5
4
|
|
|
6
5
|
Gem::Specification.new do |spec|
|
|
7
|
-
spec.name =
|
|
6
|
+
spec.name = 'kimurai'
|
|
8
7
|
spec.version = Kimurai::VERSION
|
|
9
|
-
spec.authors = [
|
|
10
|
-
spec.email = [
|
|
8
|
+
spec.authors = ['Victor Afanasev']
|
|
9
|
+
spec.email = ['vicfreefly@gmail.com']
|
|
11
10
|
|
|
12
|
-
spec.summary =
|
|
13
|
-
spec.homepage =
|
|
14
|
-
spec.license =
|
|
11
|
+
spec.summary = 'Modern web scraping framework written in Ruby and based on Capybara/Nokogiri'
|
|
12
|
+
spec.homepage = 'https://github.com/vifreefly/kimuraframework'
|
|
13
|
+
spec.license = 'MIT'
|
|
15
14
|
|
|
16
15
|
# Specify which files should be added to the gem when it is released.
|
|
17
16
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
18
|
-
spec.files = Dir.chdir(File.expand_path(
|
|
17
|
+
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
|
19
18
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
|
20
19
|
end
|
|
21
|
-
spec.bindir =
|
|
22
|
-
spec.executables =
|
|
23
|
-
spec.require_paths = [
|
|
24
|
-
spec.required_ruby_version =
|
|
25
|
-
|
|
26
|
-
spec.add_dependency
|
|
27
|
-
spec.add_dependency
|
|
28
|
-
spec.add_dependency
|
|
29
|
-
spec.add_dependency
|
|
30
|
-
spec.add_dependency
|
|
31
|
-
|
|
32
|
-
spec.add_dependency
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
spec.add_dependency
|
|
36
|
-
|
|
37
|
-
spec.add_dependency
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
spec.add_dependency
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
spec.add_dependency
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
spec.
|
|
47
|
-
|
|
20
|
+
spec.bindir = 'exe'
|
|
21
|
+
spec.executables = 'kimurai'
|
|
22
|
+
spec.require_paths = ['lib']
|
|
23
|
+
spec.required_ruby_version = '>= 3.1.0'
|
|
24
|
+
|
|
25
|
+
spec.add_dependency 'activesupport'
|
|
26
|
+
spec.add_dependency 'cliver'
|
|
27
|
+
spec.add_dependency 'csv'
|
|
28
|
+
spec.add_dependency 'murmurhash3'
|
|
29
|
+
spec.add_dependency 'nokogiri'
|
|
30
|
+
spec.add_dependency 'ostruct'
|
|
31
|
+
spec.add_dependency 'thor'
|
|
32
|
+
|
|
33
|
+
# for capybara-mechanize compatibility
|
|
34
|
+
spec.add_dependency 'mutex_m'
|
|
35
|
+
spec.add_dependency 'nkf'
|
|
36
|
+
spec.add_dependency 'reline'
|
|
37
|
+
|
|
38
|
+
spec.add_dependency 'capybara', '~> 3.40'
|
|
39
|
+
spec.add_dependency 'capybara-mechanize', '~> 1.13'
|
|
40
|
+
spec.add_dependency 'selenium-webdriver', '~> 4.27'
|
|
41
|
+
|
|
42
|
+
spec.add_dependency 'headless'
|
|
43
|
+
spec.add_dependency 'pmap'
|
|
44
|
+
|
|
45
|
+
spec.add_dependency 'whenever'
|
|
46
|
+
|
|
47
|
+
spec.add_dependency 'pry'
|
|
48
|
+
spec.add_dependency 'rbcat', '~> 1.0'
|
|
48
49
|
end
|
data/lib/kimurai/base/saver.rb
CHANGED
|
@@ -7,9 +7,7 @@ module Kimurai
|
|
|
7
7
|
attr_reader :format, :path, :position, :append
|
|
8
8
|
|
|
9
9
|
def initialize(path, format:, position: true, append: false)
|
|
10
|
-
unless %i
|
|
11
|
-
raise "SimpleSaver: wrong type of format: #{format}"
|
|
12
|
-
end
|
|
10
|
+
raise "SimpleSaver: wrong type of format: #{format}" unless %i[json pretty_json jsonlines csv].include?(format)
|
|
13
11
|
|
|
14
12
|
@path = path
|
|
15
13
|
@format = format
|
|
@@ -42,48 +40,48 @@ module Kimurai
|
|
|
42
40
|
def save_to_json(item)
|
|
43
41
|
data = JSON.generate([item])
|
|
44
42
|
|
|
45
|
-
if @index > 1 || append && File.
|
|
43
|
+
if @index > 1 || append && File.exist?(path)
|
|
46
44
|
file_content = File.read(path).sub(/\}\]\Z/, "\}\,")
|
|
47
|
-
File.open(path,
|
|
48
|
-
f.write(file_content + data.sub(/\A\[/,
|
|
45
|
+
File.open(path, 'w') do |f|
|
|
46
|
+
f.write(file_content + data.sub(/\A\[/, ''))
|
|
49
47
|
end
|
|
50
48
|
else
|
|
51
|
-
File.open(path,
|
|
49
|
+
File.open(path, 'w') { |f| f.write(data) }
|
|
52
50
|
end
|
|
53
51
|
end
|
|
54
52
|
|
|
55
53
|
def save_to_pretty_json(item)
|
|
56
54
|
data = JSON.pretty_generate([item])
|
|
57
55
|
|
|
58
|
-
if @index > 1 || append && File.
|
|
56
|
+
if @index > 1 || append && File.exist?(path)
|
|
59
57
|
file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n")
|
|
60
|
-
File.open(path,
|
|
61
|
-
f.write(file_content + data.sub(/\A\[\n/,
|
|
58
|
+
File.open(path, 'w') do |f|
|
|
59
|
+
f.write(file_content + data.sub(/\A\[\n/, ''))
|
|
62
60
|
end
|
|
63
61
|
else
|
|
64
|
-
File.open(path,
|
|
62
|
+
File.open(path, 'w') { |f| f.write(data) }
|
|
65
63
|
end
|
|
66
64
|
end
|
|
67
65
|
|
|
68
66
|
def save_to_jsonlines(item)
|
|
69
67
|
data = JSON.generate(item)
|
|
70
68
|
|
|
71
|
-
if @index > 1 || append && File.
|
|
72
|
-
File.open(path,
|
|
69
|
+
if @index > 1 || append && File.exist?(path)
|
|
70
|
+
File.open(path, 'a') { |file| file.write("\n#{data}") }
|
|
73
71
|
else
|
|
74
|
-
File.open(path,
|
|
72
|
+
File.open(path, 'w') { |file| file.write(data) }
|
|
75
73
|
end
|
|
76
74
|
end
|
|
77
75
|
|
|
78
76
|
def save_to_csv(item)
|
|
79
77
|
data = flatten_hash(item)
|
|
80
78
|
|
|
81
|
-
if @index > 1 || append && File.
|
|
82
|
-
CSV.open(path,
|
|
79
|
+
if @index > 1 || append && File.exist?(path)
|
|
80
|
+
CSV.open(path, 'a+', force_quotes: true) do |csv|
|
|
83
81
|
csv << data.values
|
|
84
82
|
end
|
|
85
83
|
else
|
|
86
|
-
CSV.open(path,
|
|
84
|
+
CSV.open(path, 'w', force_quotes: true) do |csv|
|
|
87
85
|
csv << data.keys
|
|
88
86
|
csv << data.values
|
|
89
87
|
end
|
|
@@ -102,5 +100,3 @@ module Kimurai
|
|
|
102
100
|
end
|
|
103
101
|
end
|
|
104
102
|
end
|
|
105
|
-
|
|
106
|
-
|
data/lib/kimurai/base/storage.rb
CHANGED
data/lib/kimurai/base.rb
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
require 'English'
|
|
1
2
|
require_relative 'base/saver'
|
|
2
3
|
require_relative 'base/storage'
|
|
3
4
|
|
|
@@ -6,16 +7,16 @@ module Kimurai
|
|
|
6
7
|
class InvalidUrlError < StandardError; end
|
|
7
8
|
|
|
8
9
|
# don't deep merge config's headers hash option
|
|
9
|
-
DMERGE_EXCLUDE = [:headers]
|
|
10
|
+
DMERGE_EXCLUDE = [:headers].freeze
|
|
10
11
|
|
|
11
12
|
LoggerFormatter = proc do |severity, datetime, progname, msg|
|
|
12
13
|
current_thread_id = Thread.current.object_id
|
|
13
|
-
thread_type = Thread.main == Thread.current ?
|
|
14
|
-
output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
|
|
15
|
-
|
|
14
|
+
thread_type = Thread.main == Thread.current ? 'M' : 'C'
|
|
15
|
+
output = format("%s, [%s#%d] [%s: %s] %5s -- %s: %s\n", severity[0..0], datetime, $PROCESS_ID, thread_type,
|
|
16
|
+
current_thread_id, severity, progname, msg)
|
|
16
17
|
|
|
17
|
-
if Kimurai.configuration.colorize_logger != false && Kimurai.env ==
|
|
18
|
-
Rbcat.colorize(output, predefined: [
|
|
18
|
+
if Kimurai.configuration.colorize_logger != false && Kimurai.env == 'development'
|
|
19
|
+
Rbcat.colorize(output, predefined: %i[jsonhash logger])
|
|
19
20
|
else
|
|
20
21
|
output
|
|
21
22
|
end
|
|
@@ -51,11 +52,13 @@ module Kimurai
|
|
|
51
52
|
|
|
52
53
|
def self.update(type, subtype)
|
|
53
54
|
return unless @run_info
|
|
55
|
+
|
|
54
56
|
@update_mutex.synchronize { @run_info[type][subtype] += 1 }
|
|
55
57
|
end
|
|
56
58
|
|
|
57
59
|
def self.add_event(scope, event)
|
|
58
60
|
return unless @run_info
|
|
61
|
+
|
|
59
62
|
@update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
|
|
60
63
|
end
|
|
61
64
|
|
|
@@ -93,9 +96,9 @@ module Kimurai
|
|
|
93
96
|
|
|
94
97
|
def self.logger
|
|
95
98
|
@logger ||= Kimurai.configuration.logger || begin
|
|
96
|
-
log_level = (ENV[
|
|
99
|
+
log_level = (ENV['LOG_LEVEL'] || Kimurai.configuration.log_level || 'DEBUG').to_s.upcase
|
|
97
100
|
log_level = "Logger::#{log_level}".constantize
|
|
98
|
-
Logger.new(
|
|
101
|
+
Logger.new($stdout, formatter: LoggerFormatter, level: log_level, progname: name)
|
|
99
102
|
end
|
|
100
103
|
end
|
|
101
104
|
|
|
@@ -116,13 +119,13 @@ module Kimurai
|
|
|
116
119
|
###
|
|
117
120
|
|
|
118
121
|
logger.info "Spider: started: #{name}"
|
|
119
|
-
open_spider if
|
|
122
|
+
open_spider if respond_to? :open_spider
|
|
120
123
|
|
|
121
|
-
spider =
|
|
124
|
+
spider = new
|
|
122
125
|
spider.with_info = true
|
|
123
126
|
if start_urls
|
|
124
127
|
start_urls.each do |start_url|
|
|
125
|
-
if start_url.
|
|
128
|
+
if start_url.instance_of?(Hash)
|
|
126
129
|
spider.request_to(:parse, start_url)
|
|
127
130
|
else
|
|
128
131
|
spider.request_to(:parse, url: start_url)
|
|
@@ -138,13 +141,13 @@ module Kimurai
|
|
|
138
141
|
@run_info.merge!(status: :completed)
|
|
139
142
|
ensure
|
|
140
143
|
if spider
|
|
141
|
-
spider.browser.destroy_driver! if spider.instance_variable_get(
|
|
144
|
+
spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
|
|
142
145
|
|
|
143
146
|
stop_time = Time.now
|
|
144
147
|
total_time = (stop_time - @run_info[:start_time]).round(3)
|
|
145
148
|
@run_info.merge!(stop_time: stop_time, running_time: total_time)
|
|
146
149
|
|
|
147
|
-
close_spider if
|
|
150
|
+
close_spider if respond_to? :close_spider
|
|
148
151
|
|
|
149
152
|
message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
|
|
150
153
|
failed? ? logger.fatal(message) : logger.info(message)
|
|
@@ -154,7 +157,7 @@ module Kimurai
|
|
|
154
157
|
end
|
|
155
158
|
|
|
156
159
|
def self.parse!(handler, *args, **request)
|
|
157
|
-
spider =
|
|
160
|
+
spider = new
|
|
158
161
|
|
|
159
162
|
if args.present?
|
|
160
163
|
spider.public_send(handler, *args)
|
|
@@ -164,7 +167,7 @@ module Kimurai
|
|
|
164
167
|
spider.public_send(handler)
|
|
165
168
|
end
|
|
166
169
|
ensure
|
|
167
|
-
spider.browser.destroy_driver! if spider.instance_variable_get(
|
|
170
|
+
spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
|
|
168
171
|
end
|
|
169
172
|
|
|
170
173
|
###
|
|
@@ -191,17 +194,17 @@ module Kimurai
|
|
|
191
194
|
end
|
|
192
195
|
|
|
193
196
|
def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
|
|
194
|
-
raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).
|
|
197
|
+
raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).is_a?(URI::HTTP)
|
|
195
198
|
|
|
196
199
|
if @config[:skip_duplicate_requests] && !unique_request?(url)
|
|
197
|
-
add_event(:duplicate_requests) if
|
|
200
|
+
add_event(:duplicate_requests) if with_info
|
|
198
201
|
logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
|
|
199
202
|
end
|
|
200
203
|
|
|
201
204
|
visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
|
|
202
205
|
return unless visited
|
|
203
206
|
|
|
204
|
-
public_send(handler, browser.current_response(response_type), { url: url, data: data })
|
|
207
|
+
public_send(handler, browser.current_response(response_type), **{ url: url, data: data })
|
|
205
208
|
end
|
|
206
209
|
|
|
207
210
|
def console(response = nil, url: nil, data: {})
|
|
@@ -211,9 +214,9 @@ module Kimurai
|
|
|
211
214
|
###
|
|
212
215
|
|
|
213
216
|
def storage
|
|
214
|
-
#
|
|
217
|
+
# NOTE: for `.crawl!` uses shared thread safe Storage instance,
|
|
215
218
|
# otherwise, each spider instance will have it's own Storage
|
|
216
|
-
@storage ||=
|
|
219
|
+
@storage ||= with_info ? self.class.storage : Storage.new
|
|
217
220
|
end
|
|
218
221
|
|
|
219
222
|
def unique?(scope, value)
|
|
@@ -223,10 +226,10 @@ module Kimurai
|
|
|
223
226
|
def save_to(path, item, format:, position: true, append: false)
|
|
224
227
|
@savers[path] ||= begin
|
|
225
228
|
options = { format: format, position: position, append: append }
|
|
226
|
-
if
|
|
227
|
-
self.class.savers[path] ||= Saver.new(path, options)
|
|
229
|
+
if with_info
|
|
230
|
+
self.class.savers[path] ||= Saver.new(path, **options)
|
|
228
231
|
else
|
|
229
|
-
Saver.new(path, options)
|
|
232
|
+
Saver.new(path, **options)
|
|
230
233
|
end
|
|
231
234
|
end
|
|
232
235
|
|
|
@@ -236,11 +239,8 @@ module Kimurai
|
|
|
236
239
|
###
|
|
237
240
|
|
|
238
241
|
def add_event(scope = :custom, event)
|
|
239
|
-
|
|
240
|
-
raise "It's allowed to use `add_event` only while performing a full run (`.crawl!` method)"
|
|
241
|
-
end
|
|
242
|
+
self.class.add_event(scope, event) if with_info
|
|
242
243
|
|
|
243
|
-
self.class.add_event(scope, event)
|
|
244
244
|
logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
|
|
245
245
|
end
|
|
246
246
|
|
|
@@ -254,35 +254,35 @@ module Kimurai
|
|
|
254
254
|
|
|
255
255
|
def unique_request?(url)
|
|
256
256
|
options = @config[:skip_duplicate_requests]
|
|
257
|
-
if options.
|
|
257
|
+
if options.instance_of?(Hash)
|
|
258
258
|
scope = options[:scope] || :requests_urls
|
|
259
259
|
if options[:check_only]
|
|
260
260
|
storage.include?(scope, url) ? false : true
|
|
261
261
|
else
|
|
262
|
-
storage.unique?(scope, url)
|
|
262
|
+
storage.unique?(scope, url) || false
|
|
263
263
|
end
|
|
264
264
|
else
|
|
265
|
-
storage.unique?(:requests_urls, url)
|
|
265
|
+
storage.unique?(:requests_urls, url) || false
|
|
266
266
|
end
|
|
267
267
|
end
|
|
268
268
|
|
|
269
269
|
def send_item(item, options = {})
|
|
270
270
|
logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
|
|
271
|
-
self.class.update(:items, :sent) if
|
|
271
|
+
self.class.update(:items, :sent) if with_info
|
|
272
272
|
|
|
273
273
|
@pipelines.each do |name, instance|
|
|
274
274
|
item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
|
|
275
275
|
end
|
|
276
|
-
rescue => e
|
|
276
|
+
rescue StandardError => e
|
|
277
277
|
logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
|
|
278
|
-
add_event(:drop_items_errors, e.inspect) if
|
|
278
|
+
add_event(:drop_items_errors, e.inspect) if with_info
|
|
279
279
|
false
|
|
280
280
|
else
|
|
281
|
-
self.class.update(:items, :processed) if
|
|
281
|
+
self.class.update(:items, :processed) if with_info
|
|
282
282
|
logger.info "Pipeline: processed: #{JSON.generate(item)}"
|
|
283
283
|
true
|
|
284
284
|
ensure
|
|
285
|
-
if
|
|
285
|
+
if with_info
|
|
286
286
|
logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
|
|
287
287
|
end
|
|
288
288
|
end
|
|
@@ -300,10 +300,10 @@ module Kimurai
|
|
|
300
300
|
Thread.current.abort_on_exception = true
|
|
301
301
|
|
|
302
302
|
spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
|
|
303
|
-
spider.with_info = true if
|
|
303
|
+
spider.with_info = true if with_info
|
|
304
304
|
|
|
305
305
|
part.each do |url_data|
|
|
306
|
-
if url_data.
|
|
306
|
+
if url_data.instance_of?(Hash)
|
|
307
307
|
if url_data[:url].present? && url_data[:data].present?
|
|
308
308
|
spider.request_to(handler, delay, url_data)
|
|
309
309
|
else
|
|
@@ -314,7 +314,7 @@ module Kimurai
|
|
|
314
314
|
end
|
|
315
315
|
end
|
|
316
316
|
ensure
|
|
317
|
-
spider.browser.destroy_driver! if spider.instance_variable_get(
|
|
317
|
+
spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
|
|
318
318
|
end
|
|
319
319
|
|
|
320
320
|
sleep 0.5
|
data/lib/kimurai/base_helper.rb
CHANGED
|
@@ -4,13 +4,14 @@ module Kimurai
|
|
|
4
4
|
|
|
5
5
|
def absolute_url(url, base:)
|
|
6
6
|
return unless url
|
|
7
|
-
|
|
7
|
+
|
|
8
|
+
URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
|
|
8
9
|
end
|
|
9
10
|
|
|
10
11
|
def escape_url(url)
|
|
11
|
-
|
|
12
|
-
rescue URI::InvalidURIError
|
|
13
|
-
URI.parse(URI.escape
|
|
12
|
+
URI.parse(url)
|
|
13
|
+
rescue URI::InvalidURIError
|
|
14
|
+
URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s rescue url
|
|
14
15
|
else
|
|
15
16
|
url
|
|
16
17
|
end
|