kimurai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +1923 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/kimurai +6 -0
- data/kimurai.gemspec +48 -0
- data/lib/kimurai.rb +53 -0
- data/lib/kimurai/automation/deploy.yml +54 -0
- data/lib/kimurai/automation/setup.yml +44 -0
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
- data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
- data/lib/kimurai/base.rb +249 -0
- data/lib/kimurai/base/simple_saver.rb +98 -0
- data/lib/kimurai/base/uniq_checker.rb +22 -0
- data/lib/kimurai/base_helper.rb +22 -0
- data/lib/kimurai/browser_builder.rb +32 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
- data/lib/kimurai/capybara_configuration.rb +10 -0
- data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
- data/lib/kimurai/capybara_ext/session.rb +150 -0
- data/lib/kimurai/capybara_ext/session/config.rb +18 -0
- data/lib/kimurai/cli.rb +157 -0
- data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
- data/lib/kimurai/cli/generator.rb +57 -0
- data/lib/kimurai/core_ext/array.rb +14 -0
- data/lib/kimurai/core_ext/numeric.rb +19 -0
- data/lib/kimurai/core_ext/string.rb +7 -0
- data/lib/kimurai/pipeline.rb +25 -0
- data/lib/kimurai/runner.rb +72 -0
- data/lib/kimurai/template/.gitignore +18 -0
- data/lib/kimurai/template/.ruby-version +1 -0
- data/lib/kimurai/template/Gemfile +20 -0
- data/lib/kimurai/template/README.md +3 -0
- data/lib/kimurai/template/config/application.rb +32 -0
- data/lib/kimurai/template/config/automation.yml +13 -0
- data/lib/kimurai/template/config/boot.rb +22 -0
- data/lib/kimurai/template/config/initializers/.keep +0 -0
- data/lib/kimurai/template/config/schedule.rb +57 -0
- data/lib/kimurai/template/db/.keep +0 -0
- data/lib/kimurai/template/helpers/application_helper.rb +3 -0
- data/lib/kimurai/template/lib/.keep +0 -0
- data/lib/kimurai/template/log/.keep +0 -0
- data/lib/kimurai/template/pipelines/saver.rb +11 -0
- data/lib/kimurai/template/pipelines/validator.rb +24 -0
- data/lib/kimurai/template/spiders/application_spider.rb +104 -0
- data/lib/kimurai/template/tmp/.keep +0 -0
- data/lib/kimurai/version.rb +3 -0
- metadata +349 -0
require_relative 'base/simple_saver'
require_relative 'base/uniq_checker'

module Kimurai
  # Base class for all spiders. Subclasses configure themselves through the
  # class-level instance variables (@name, @engine, @pipelines, @config,
  # @start_urls) and implement #parse (and optionally other handlers).
  #
  # Run statistics for the current crawl live in the class-level @run_info
  # hash; it exists only while a crawl is in progress (see .crawl!).
  class Base
    # Log formatter used for the default STDOUT logger. Prefixes each entry
    # with the process id and a thread marker: "M" for the main thread,
    # "C" for a child (crawler) thread, plus the thread object_id.
    # In development (unless disabled via configuration) the line is run
    # through Rbcat for colorized output.
    LoggerFormatter = proc do |severity, datetime, progname, msg|
      current_thread_id = Thread.current.object_id
      thread_type = Thread.main == Thread.current ? "M" : "C"
      output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
        .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]

      if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development"
        Rbcat.colorize(output, predefined: [:jsonhash, :logger])
      else
        output
      end
    end

    include BaseHelper

    ###

    class << self
      # Crawl statistics for the run in progress (nil when not crawling).
      attr_reader :run_info
    end

    # True while .crawl! is executing for this spider class.
    def self.running?
      @run_info && @run_info[:status] == :running
    end

    # True if the last crawl finished without raising.
    def self.completed?
      @run_info && @run_info[:status] == :completed
    end

    # True if the last crawl raised an error (or was interrupted by a signal).
    def self.failed?
      @run_info && @run_info[:status] == :failed
    end

    # Visits counters hash ({ requests:, responses: }) or nil outside a crawl.
    def self.visits
      @run_info && @run_info[:visits]
    end

    # Items counters hash ({ sent:, processed: }) or nil outside a crawl.
    def self.items
      @run_info && @run_info[:items]
    end

    # Thread-safely increments a run_info counter, e.g. update(:items, :sent).
    # No-op when no crawl is running. The mutex is lazily created because
    # counters can be bumped from multiple in_parallel threads at once.
    def self.update(type, subtype)
      return unless @run_info

      (@update_mutex ||= Mutex.new).synchronize do
        @run_info[type][subtype] += 1
      end
    end

    ###

    # Defaults for the Base class itself; subclasses inherit them through
    # the fallback logic in .engine / .pipelines / .config below.
    @engine = :mechanize
    @pipelines = []
    @config = {}

    ###

    # NOTE: shadows Class#name on purpose — returns the spider's declared
    # @name (nil unless the subclass sets it), not the Ruby class name.
    def self.name
      @name
    end

    # Engine for this spider; falls back to the superclass (chain ends at
    # Base, where @engine is :mechanize).
    def self.engine
      @engine ||= superclass.engine
    end

    # Pipeline name list; falls back to the superclass like .engine.
    def self.pipelines
      @pipelines ||= superclass.pipelines
    end

    # URLs crawl! will feed to #parse; nil means call #parse without a URL.
    def self.start_urls
      @start_urls
    end

    # Effective config: this class's @config deep-merged over all ancestors'
    # configs (deep_merge is an ActiveSupport Hash extension).
    def self.config
      superclass.equal?(::Object) ? @config : superclass.config.deep_merge(@config || {})
    end

    ###

    # Memoized logger: the user-configured logger if provided, otherwise a
    # STDOUT logger using LoggerFormatter. Level comes from ENV["LOG_LEVEL"],
    # then configuration, then DEBUG.
    def self.logger
      @logger ||= Kimurai.configuration.logger || begin
        log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
        log_level = "Logger::#{log_level}".constantize
        Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
      end
    end

    ###

    # Per-class uniqueness tracker, shared by all spider instances (and
    # threads) of this class during a run; reset at the end of crawl!.
    def self.checker
      @checker ||= UniqChecker.new
    end

    # Returns true the first time `value` is seen within `scope`, false after.
    def unique?(scope, value)
      self.class.checker.unique?(scope, value)
    end

    # Per-class item saver, shared like .checker; reset at the end of crawl!.
    def self.saver
      @saver ||= SimpleSaver.new
    end

    # Appends `item` to the file at `path` in the given format
    # (:json, :pretty_json, :jsonlines or :csv). When `position` is true a
    # sequential :position key is added to the item.
    def save_to(path, item, format:, position: true)
      self.class.saver.save(path, item, format: format, position: position)
    end

    ###

    # Runs a full crawl: initializes @run_info, calls the optional
    # open_spider/close_spider hooks, visits start_urls (or calls #parse
    # directly), and always tears the browser down. Returns @run_info on
    # success, false if a crawl is already running; re-raises any error
    # after marking the run as failed. Method-level rescue/else/ensure
    # clauses implement the state transitions.
    def self.crawl!
      logger.error "Spider: already running: #{name}" and return false if running?
      @run_info = {
        spider_name: name, status: :running, environment: Kimurai.env,
        start_time: Time.new, stop_time: nil, running_time: nil,
        visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 }, error: nil
      }

      logger.info "Spider: started: #{name}"
      open_spider if self.respond_to? :open_spider

      spider = self.new
      spider.with_info = true
      if start_urls
        start_urls.each do |start_url|
          spider.request_to(:parse, url: start_url)
        end
      else
        spider.parse
      end
    rescue StandardError, SignalException => e
      # SignalException is rescued deliberately so Ctrl-C still records a
      # failed run before the ensure block cleans up.
      @run_info.merge!(status: :failed, error: e.inspect)
      raise e
    else
      @run_info[:status] = :completed
      @run_info
    ensure
      if spider
        spider.browser.destroy_driver!

        stop_time = Time.now
        total_time = (stop_time - @run_info[:start_time]).round(3)
        @run_info.merge!(stop_time: stop_time, running_time: total_time)

        close_spider if self.respond_to? :close_spider
        # `duration` is a project Numeric core_ext (human-readable duration).
        message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
        failed? ? @logger.fatal(message) : @logger.info(message)

        # Multiple assignment from a single nil sets all three to nil,
        # resetting class-level run state for the next crawl.
        @run_info, @checker, @saver = nil
      end
    end

    # One-off run of a single handler (useful for development/testing):
    # builds a spider, optionally visits `url`, and always destroys the
    # browser driver. Does not touch @run_info.
    def self.parse!(handler, engine = nil, url: nil, data: {})
      spider = engine ? self.new(engine) : self.new
      url.present? ? spider.request_to(handler, url: url, data: data) : spider.public_send(handler)
    ensure
      spider.browser.destroy_driver!
    end

    ###

    attr_reader :logger
    # True when this instance reports statistics into self.class.run_info.
    attr_accessor :with_info

    # Builds a spider instance: resolves the effective config, instantiates
    # each declared pipeline (looked up by name among Pipeline.descendants)
    # and wires it back to this spider.
    def initialize(engine = self.class.engine, config: {})
      @engine = engine
      @config = self.class.config.deep_merge(config)
      @pipelines = self.class.pipelines.map do |pipeline_name|
        klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
        instance = klass.new
        instance.spider = self
        [pipeline_name, instance]
      end.to_h

      @logger = self.class.logger
    end

    # Lazily-built Capybara session for this spider instance.
    def browser
      @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
    end

    # Visits `url` (optionally with a delay) and hands the response to the
    # given handler method along with { url:, data: }.
    def request_to(handler, delay = nil, url:, data: {})
      request_data = { url: url, data: data }
      delay ? browser.visit(url, delay: delay) : browser.visit(url)
      public_send(handler, browser.current_response, request_data)
    end

    # Interactive debugging entry point — drops into a Pry session with the
    # handler's usual arguments in scope.
    def console(response = nil, url: nil, data: {})
      binding.pry
    end

    private

    # Passes `item` through each pipeline in order (each pipeline receives
    # the previous pipeline's return value). Returns true if all pipelines
    # succeeded, false if any raised (the item is then "dropped").
    # `options` may carry per-pipeline option hashes keyed by pipeline name.
    def send_item(item, options = {})
      logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
      self.class.update(:items, :sent) if self.with_info

      @pipelines.each do |name, instance|
        item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
      end
    rescue => e
      logger.error "Pipeline: dropped: #{e.inspect}, item: #{item}"
      false
    else
      self.class.update(:items, :processed) if self.with_info
      logger.info "Pipeline: processed: #{JSON.generate(item)}"
      true
    ensure
      if self.with_info
        logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
      end
    end

    # Processes `urls` concurrently: splits them into `threads` groups
    # (in_sorted_groups is a project Array core_ext), runs each group in its
    # own thread with its own spider instance (and thus its own browser),
    # and joins all threads before returning. Each url may be a plain URL or
    # a { url:, data: } hash.
    def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
      parts = urls.in_sorted_groups(threads, false)
      urls_count = urls.size

      all = []
      start_time = Time.now
      logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"

      parts.each do |part|
        all << Thread.new(part) do |part|
          Thread.current.abort_on_exception = true

          spider = self.class.new(engine, config: config)
          spider.with_info = true if self.with_info

          part.each do |url_data|
            if url_data.class == Hash
              spider.request_to(handler, delay, url_data)
            else
              spider.request_to(handler, delay, url: url_data, data: data)
            end
          end
        ensure
          # Block-level ensure (Ruby 2.6+): each thread always destroys its
          # own browser driver, even if a request raised.
          spider.browser.destroy_driver!
        end

        # Stagger thread startup to avoid racing driver creation.
        sleep 0.5
      end

      all.each(&:join)
      logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
    end
  end
end
require 'json'
require 'csv'

module Kimurai
  class Base
    # Thread-safe file saver used by Base#save_to. Appends one item at a
    # time to a file in :json, :pretty_json, :jsonlines or :csv format.
    # A single mutex serializes all writes, and @index counts how many
    # items have been written so the first write can create the file
    # (with CSV headers / JSON array opening) while later writes append.
    class SimpleSaver
      def initialize
        @index = 0
        @mutex = Mutex.new
      end

      # Saves `item` (a Hash) to the file at `path`.
      #
      # format   - one of :json, :pretty_json, :jsonlines, :csv.
      # position - when truthy, adds a sequential :position key to the item
      #            (mutates the caller's hash).
      #
      # Raises RuntimeError for an unknown format.
      def save(path, item, format:, position:)
        @mutex.synchronize do
          @index += 1
          item[:position] = @index if position

          case format
          when :json
            save_to_json(item, path)
          when :pretty_json
            save_to_pretty_json(item, path)
          when :jsonlines
            save_to_jsonlines(item, path)
          when :csv
            save_to_csv(item, path)
          else
            raise "SimpleSaver: wrong type of format: #{format}"
          end
        end
      end

      private

      # Maintains the file as a single JSON array: the first item writes
      # "[{...}]"; later items strip the trailing "}]", append "},", and
      # add the new item without its leading "[".
      def save_to_json(item, path)
        data = JSON.generate([item])

        if @index > 1
          file_content = File.read(path).sub(/\}\]\Z/, "\}\,")
          File.open(path, "w") do |f|
            f.write(file_content + data.sub(/\A\[/, ""))
          end
        else
          File.open(path, "w") { |f| f.write(data) }
        end
      end

      # Same splice-append strategy as save_to_json, but for the
      # pretty-printed (multi-line) JSON array layout.
      def save_to_pretty_json(item, path)
        data = JSON.pretty_generate([item])

        if @index > 1
          file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n")
          File.open(path, "w") do |f|
            f.write(file_content + data.sub(/\A\[\n/, ""))
          end
        else
          File.open(path, "w") { |f| f.write(data) }
        end
      end

      # JSON Lines: one JSON object per line; simple append after the first.
      def save_to_jsonlines(item, path)
        data = JSON.generate(item)

        if @index > 1
          File.open(path, "a") { |file| file.write("\n" + data) }
        else
          File.open(path, "w") { |file| file.write(data) }
        end
      end

      # CSV: the first item writes a header row from the flattened keys;
      # later items append values only. NOTE: assumes every item has the
      # same keys in the same order as the first one.
      def save_to_csv(item, path)
        data = flatten_hash(item)

        if @index > 1
          CSV.open(path, "a+", force_quotes: true) do |csv|
            csv << data.values
          end
        else
          CSV.open(path, "w", force_quotes: true) do |csv|
            csv << data.keys
            csv << data.values
          end
        end
      end

      # Flattens a nested hash into a single level, joining keys with "."
      # (e.g. { a: { b: 1 } } => { "a.b" => 1 }). All keys become strings.
      def flatten_hash(hash)
        hash.each_with_object({}) do |(k, v), h|
          if v.is_a? Hash
            # `each`, not `map`: we iterate purely for the side effect of
            # populating `h`.
            flatten_hash(v).each { |h_k, h_v| h["#{k}.#{h_k}"] = h_v }
          else
            h[k&.to_s] = v
          end
        end
      end
    end
  end
end
|
|
1
|
+
module Kimurai
|
2
|
+
class Base
|
3
|
+
class UniqChecker
|
4
|
+
def initialize
|
5
|
+
@database = {}
|
6
|
+
@mutex = Mutex.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def unique?(scope, value)
|
10
|
+
@mutex.synchronize do
|
11
|
+
@database[scope] ||= []
|
12
|
+
if @database[scope].include?(value)
|
13
|
+
false
|
14
|
+
else
|
15
|
+
@database[scope].push(value)
|
16
|
+
true
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
module Kimurai
  # URL helper methods mixed into Base. All helpers are private so they
  # are available inside spider handlers but are not part of the spider's
  # public interface.
  module BaseHelper
    private

    # Resolves `url` against `base` and returns an absolute URL string,
    # percent-escaping the input first. Returns nil when url is nil.
    #
    # NOTE: the original used URI.escape, which was deprecated in Ruby 2.7
    # and removed in 3.0; URI::DEFAULT_PARSER.escape is the direct,
    # behavior-compatible replacement.
    def absolute_url(url, base:)
      return unless url
      URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
    end

    # Returns `url` unchanged when it already parses; otherwise attempts
    # to percent-escape it into a valid URL. Falls back to the raw input
    # if even the escaped form cannot be parsed (best-effort by design).
    def escape_url(url)
      URI.parse(url)
    rescue URI::InvalidURIError
      URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s rescue url
    else
      url
    end

    # Convenience: absolutize then escape in one step.
    def normalize_url(url, base:)
      escape_url(absolute_url(url, base: base))
    end
  end
end
module Kimurai
  # Factory that maps an engine symbol to its concrete builder and returns
  # a ready-to-use browser (Capybara session). Builder files are required
  # lazily so that only the chosen engine's dependencies get loaded.
  class BrowserBuilder
    AVAILABLE_ENGINES = [
      :mechanize,
      :mechanize_standalone,
      :poltergeist_phantomjs,
      :selenium_firefox,
      :selenium_chrome
    ]

    # Builds and returns a browser for `engine`, passing `config` and the
    # owning `spider` through to the engine-specific builder.
    # Raises RuntimeError when the engine name is not recognized.
    def self.build(engine, config = {}, spider:)
      unless AVAILABLE_ENGINES.include? engine
        raise "BrowserBuilder: wrong name of engine, available engines: #{AVAILABLE_ENGINES.join(', ')}"
      end

      # Resolve the builder class for the engine, loading its file on demand.
      # (:mechanize_standalone is accepted above but has no builder here,
      # so it yields nil — matching the original case with no else branch.)
      builder_class =
        case engine
        when :mechanize
          require_relative 'browser_builder/mechanize_builder'
          MechanizeBuilder
        when :selenium_chrome
          require_relative 'browser_builder/selenium_chrome_builder'
          SeleniumChromeBuilder
        when :poltergeist_phantomjs
          require_relative 'browser_builder/poltergeist_phantomjs_builder'
          PoltergeistPhantomJSBuilder
        when :selenium_firefox
          require_relative 'browser_builder/selenium_firefox_builder'
          SeleniumFirefoxBuilder
        end

      builder_class&.new(config, spider: spider)&.build
    end
  end
end
require 'capybara'
require 'capybara/mechanize'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/mechanize/driver'
require_relative '../capybara_ext/session'

module Kimurai
  class BrowserBuilder
    # Builds a Capybara session backed by the Mechanize driver and applies
    # every relevant option from the spider's config hash (proxy, SSL,
    # headers, cookies, and `browser:` instance options). Options that
    # Mechanize cannot support (socks5 proxy, restart_if) are logged as
    # errors and skipped rather than raising.
    #
    # NOTE: `presence`/`present?` are ActiveSupport extensions; `spider=`,
    # `set_proxy`, `driver.headers=`, `set_cookie` and `@browser.config`
    # come from the project's capybara_ext files required above.
    class MechanizeBuilder
      attr_reader :logger, :spider

      # config - effective spider config hash (see Base.config).
      # spider - the owning spider instance; its logger is reused here.
      def initialize(config, spider:)
        @config = config
        @spider = spider
        @logger = spider.logger
      end

      # Returns the configured Capybara::Session. Driver registration must
      # happen before the session is created, and the session must exist
      # before any of the option sections below can touch it.
      def build
        # Register driver
        Capybara.register_driver :mechanize do |app|
          driver = Capybara::Mechanize::Driver.new("app")
          # keep the history as small as possible (by default it's unlimited)
          driver.configure { |a| a.history.max_size = 2 }
          driver
        end

        # Create browser instance (Capybara session)
        @browser = Capybara::Session.new(:mechanize)
        @browser.spider = spider
        logger.debug "BrowserBuilder (mechanize): created browser instance"

        # Proxy. Accepts "ip:port:type" (optionally with credentials after),
        # or a Proc returning such a string for per-build proxy rotation.
        if proxy = @config[:proxy].presence
          proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
          ip, port, type = proxy_string.split(":")

          if type == "socks5"
            logger.error "BrowserBuilder (mechanize): can't set socks5 proxy (not supported), skipped"
          else
            @browser.set_proxy(*proxy_string.split(":"))
            logger.debug "BrowserBuilder (mechanize): enabled #{type} proxy, ip: #{ip}, port: #{port}"
          end
        end

        # SSL: custom CA certificate and/or disabling verification entirely.
        if ssl_cert_path = @config[:ssl_cert_path].presence
          @browser.driver.browser.agent.http.ca_file = ssl_cert_path
          logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
        end

        if @config[:ignore_ssl_errors].present?
          @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
          logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
        end

        # Headers
        if headers = @config[:headers].presence
          @browser.driver.headers = headers
          logger.debug "BrowserBuilder (mechanize): enabled custom headers"
        end

        # User-Agent: a plain string, or a Proc for per-build rotation.
        if user_agent = @config[:user_agent].presence
          user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip

          @browser.driver.add_header("User-Agent", user_agent_string)
          logger.debug "BrowserBuilder (mechanize): enabled custom user-agent"
        end

        # Cookies: array of hashes with at least :name and :value; the whole
        # hash is passed along so extra attributes (domain, path...) apply.
        if cookies = @config[:cookies].presence
          cookies.each do |cookie|
            @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
          end

          logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
        end

        # Browser instance options
        # retry_request_errors: error classes that session#visit should retry.
        if errors = @config.dig(:browser, :retry_request_errors).presence
          @browser.config.retry_request_errors = errors
          logger.debug "BrowserBuilder (mechanize): enabled `browser retry_request_errors`"
        end

        # restart_if: memory-based driver restarts make no sense for the
        # headless-less Mechanize agent, so the option is rejected.
        if @config.dig(:browser, :restart_if).present?
          logger.error "BrowserBuilder (mechanize): `browser restart_if` options not supported by Mechanize, skipped"
        end

        # before_request clear_cookies: wipe cookies before each request.
        if @config.dig(:browser, :before_request, :clear_cookies)
          @browser.config.before_request[:clear_cookies] = true
          logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_cookies`"
        end

        # before_request clear_and_set_cookies: wipe then re-set the
        # configured cookies before each request (requires `cookies`).
        if @config.dig(:browser, :before_request, :clear_and_set_cookies)
          if cookies = @config[:cookies].presence
            @browser.config.cookies = cookies
            @browser.config.before_request[:clear_and_set_cookies] = true
            logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_and_set_cookies`"
          else
            logger.error "BrowserBuilder (mechanize): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
          end
        end

        # before_request change_user_agent: rotate UA per request
        # (requires `user_agent` to be a Proc so each call yields a new one).
        if @config.dig(:browser, :before_request, :change_user_agent)
          if @config[:user_agent].present? && @config[:user_agent].class == Proc
            @browser.config.user_agent = @config[:user_agent]
            @browser.config.before_request[:change_user_agent] = true
            logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_user_agent`"
          else
            logger.error "BrowserBuilder (mechanize): `user_agent` should be present and has lambda format to enable `browser before_request change_user_agent`, skipped"
          end
        end

        # before_request change_proxy: rotate proxy per request
        # (requires `proxy` to be a Proc, mirroring change_user_agent).
        if @config.dig(:browser, :before_request, :change_proxy)
          if @config[:proxy].present? && @config[:proxy].class == Proc
            @browser.config.proxy = @config[:proxy]
            @browser.config.before_request[:change_proxy] = true
            logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_proxy`"
          else
            logger.error "BrowserBuilder (mechanize): `proxy` should be present and has lambda format to enable `browser before_request change_proxy`, skipped"
          end
        end

        # before_request delay: sleep between requests (seconds or range).
        if delay = @config.dig(:browser, :before_request, :delay).presence
          @browser.config.before_request[:delay] = delay
          logger.debug "BrowserBuilder (mechanize): enabled `browser before_request delay`"
        end

        # return Capybara session instance
        @browser
      end
    end
  end
end