tanakai 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +118 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/tanakai +6 -0
- data/lib/tanakai/automation/deploy.yml +54 -0
- data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
- data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
- data/lib/tanakai/automation/setup.yml +45 -0
- data/lib/tanakai/base/saver.rb +106 -0
- data/lib/tanakai/base/storage.rb +54 -0
- data/lib/tanakai/base.rb +326 -0
- data/lib/tanakai/base_helper.rb +22 -0
- data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
- data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
- data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/tanakai/browser_builder.rb +20 -0
- data/lib/tanakai/capybara_configuration.rb +10 -0
- data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
- data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/tanakai/capybara_ext/session/config.rb +22 -0
- data/lib/tanakai/capybara_ext/session.rb +249 -0
- data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
- data/lib/tanakai/cli/generator.rb +57 -0
- data/lib/tanakai/cli.rb +183 -0
- data/lib/tanakai/core_ext/array.rb +14 -0
- data/lib/tanakai/core_ext/hash.rb +5 -0
- data/lib/tanakai/core_ext/numeric.rb +19 -0
- data/lib/tanakai/core_ext/string.rb +7 -0
- data/lib/tanakai/pipeline.rb +33 -0
- data/lib/tanakai/runner.rb +60 -0
- data/lib/tanakai/template/.gitignore +18 -0
- data/lib/tanakai/template/Gemfile +28 -0
- data/lib/tanakai/template/README.md +3 -0
- data/lib/tanakai/template/config/application.rb +37 -0
- data/lib/tanakai/template/config/automation.yml +13 -0
- data/lib/tanakai/template/config/boot.rb +22 -0
- data/lib/tanakai/template/config/initializers/.keep +0 -0
- data/lib/tanakai/template/config/schedule.rb +57 -0
- data/lib/tanakai/template/db/.keep +0 -0
- data/lib/tanakai/template/helpers/application_helper.rb +3 -0
- data/lib/tanakai/template/lib/.keep +0 -0
- data/lib/tanakai/template/log/.keep +0 -0
- data/lib/tanakai/template/pipelines/saver.rb +11 -0
- data/lib/tanakai/template/pipelines/validator.rb +24 -0
- data/lib/tanakai/template/spiders/application_spider.rb +143 -0
- data/lib/tanakai/template/tmp/.keep +0 -0
- data/lib/tanakai/version.rb +3 -0
- data/lib/tanakai.rb +54 -0
- data/tanakai.gemspec +50 -0
- metadata +382 -0
data/lib/tanakai/base.rb
ADDED
@@ -0,0 +1,326 @@
|
|
1
|
+
require_relative 'base/saver'
require_relative 'base/storage'

module Tanakai
  # Base class for all spiders. Holds per-spider class-level configuration
  # (engine, pipelines, start_urls, config) and the class-level run state
  # (@run_info) used while `.crawl!` is in progress.
  class Base
    # Raised by #request_to when the given url does not parse as HTTP(S).
    class InvalidUrlError < StandardError; end

    # Config keys that must NOT be deep-merged between parent and child
    # spider configs (a child's :headers hash fully replaces the parent's).
    DMERGE_EXCLUDE = [:headers]

    # Logger formatter that prefixes each line with severity, pid and the
    # current thread ("M" = main thread, "C" = child thread + object_id).
    # Output is colorized via Rbcat in development unless disabled.
    LoggerFormatter = proc do |severity, datetime, progname, msg|
      current_thread_id = Thread.current.object_id
      thread_type = Thread.main == Thread.current ? "M" : "C"
      output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
        .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]

      if Tanakai.configuration.colorize_logger != false && Tanakai.env == "development"
        Rbcat.colorize(output, predefined: [:jsonhash, :logger])
      else
        output
      end
    end

    include BaseHelper

    ###

    # Class-level run state, shared across all spider instances of a crawl.
    class << self
      attr_reader :run_info, :savers, :storage
    end

    # True while a `.crawl!` for this spider class is in progress.
    def self.running?
      @run_info && @run_info[:status] == :running
    end

    # True when the last `.crawl!` finished without error.
    def self.completed?
      @run_info && @run_info[:status] == :completed
    end

    # True when the last `.crawl!` raised.
    def self.failed?
      @run_info && @run_info[:status] == :failed
    end

    # Requests/responses counters hash of the current run (or nil).
    def self.visits
      @run_info && @run_info[:visits]
    end

    # Sent/processed items counters hash of the current run (or nil).
    def self.items
      @run_info && @run_info[:items]
    end

    # Thread-safe increment of a run counter, e.g. update(:items, :sent).
    # No-op when no crawl is running.
    def self.update(type, subtype)
      return unless @run_info
      @update_mutex.synchronize { @run_info[type][subtype] += 1 }
    end

    # Thread-safe increment of an event counter under @run_info[:events].
    # No-op when no crawl is running.
    def self.add_event(scope, event)
      return unless @run_info
      @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
    end

    ###

    # Defaults for Base itself; subclasses inherit via the readers below.
    @engine = :mechanize
    @pipelines = []
    @config = {}

    # Spider name set by the subclass (used in logs and run_info).
    def self.name
      @name
    end

    # Engine for this class, falling back to the superclass default.
    def self.engine
      @engine ||= superclass.engine
    end

    # Pipelines for this class, falling back to the superclass default.
    def self.pipelines
      @pipelines ||= superclass.pipelines
    end

    # Urls the crawl starts from (set by the subclass; may be nil).
    def self.start_urls
      @start_urls
    end

    # Effective config: own @config deep-merged over the inherited config,
    # except DMERGE_EXCLUDE keys which are replaced instead of merged.
    def self.config
      if superclass.equal?(::Object)
        @config
      else
        superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)
      end
    end

    ###

    # Memoized logger: the globally configured one, or a STDOUT logger with
    # LoggerFormatter. Level comes from ENV["LOG_LEVEL"], then the global
    # config, then DEBUG.
    def self.logger
      @logger ||= Tanakai.configuration.logger || begin
        log_level = (ENV["LOG_LEVEL"] || Tanakai.configuration.log_level || "DEBUG").to_s.upcase
        log_level = "Logger::#{log_level}".constantize
        Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
      end
    end

    # Runs a full crawl: sets up shared class-level state (storage, savers,
    # mutex, run_info), visits all start_urls (or calls #parse), and always
    # tears state back down in the ensure block. Returns/raises depending on
    # exception_on_fail when an error occurs.
    # NOTE: SignalException/SystemExit are intentionally rescued so run_info
    # is marked :failed and cleanup runs even on Ctrl-C / exit.
    def self.crawl!(exception_on_fail: true)
      logger.error "Spider: already running: #{name}" and return false if running?

      @storage = Storage.new
      @savers = {}
      @update_mutex = Mutex.new

      @run_info = {
        spider_name: name, status: :running, error: nil, environment: Tanakai.env,
        start_time: Time.new, stop_time: nil, running_time: nil,
        visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
        events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
      }

      ###

      logger.info "Spider: started: #{name}"
      open_spider if self.respond_to? :open_spider

      spider = self.new
      spider.with_info = true
      if start_urls
        start_urls.each do |start_url|
          # A Hash start_url carries url: plus extra request options/data.
          if start_url.class == Hash
            spider.request_to(:parse, start_url)
          else
            spider.request_to(:parse, url: start_url)
          end
        end
      else
        spider.parse
      end
    rescue StandardError, SignalException, SystemExit => e
      @run_info.merge!(status: :failed, error: e.inspect)
      exception_on_fail ? raise(e) : [@run_info, e]
    else
      @run_info.merge!(status: :completed)
    ensure
      if spider
        # Only destroy the driver if a browser was actually created.
        spider.browser.destroy_driver! if spider.instance_variable_get("@browser")

        stop_time = Time.now
        total_time = (stop_time - @run_info[:start_time]).round(3)
        @run_info.merge!(stop_time: stop_time, running_time: total_time)

        close_spider if self.respond_to? :close_spider

        message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
        failed? ? logger.fatal(message) : logger.info(message)

        # Reset all class-level run state for the next crawl.
        @run_info, @storage, @savers, @update_mutex = nil
      end
    end

    # Runs a single handler on a fresh spider instance (used by the CLI for
    # `parse!`), without the crawl lifecycle. Positional args are passed
    # through; keyword args are treated as a request (url:, data:, ...).
    def self.parse!(handler, *args, **request)
      spider = self.new

      if args.present?
        spider.public_send(handler, *args)
      elsif request.present?
        spider.request_to(handler, request)
      else
        spider.public_send(handler)
      end
    ensure
      spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
    end

    ###

    attr_reader :logger
    # with_info is true for instances participating in a `.crawl!` run; it
    # routes stats/savers/storage to the shared class-level state.
    attr_accessor :with_info

    # engine: overrides the class-level engine; config: is deep-merged over
    # the class-level config (DMERGE_EXCLUDE keys replaced, not merged).
    # Instantiates one object per configured pipeline and wires it back to
    # this spider.
    def initialize(engine = self.class.engine, config: {})
      @engine = engine || self.class.engine
      @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
      @pipelines = self.class.pipelines.map do |pipeline_name|
        klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
        instance = klass.new
        instance.spider = self
        [pipeline_name, instance]
      end.to_h

      @logger = self.class.logger
      @savers = {}
    end

    # Lazily builds the Capybara session for this spider's engine.
    def browser
      @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
    end

    # Visits `url:` (optionally with a delay) and passes the parsed response
    # to `handler` together with { url:, data: }. Skips duplicate urls when
    # config[:skip_duplicate_requests] is enabled.
    # Raises InvalidUrlError for non-HTTP(S) urls.
    def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
      raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)

      if @config[:skip_duplicate_requests] && !unique_request?(url)
        add_event(:duplicate_requests) if self.with_info
        logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
      end

      visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
      return unless visited

      public_send(handler, browser.current_response(response_type), { url: url, data: data })
    end

    # Interactive debugging entry point: drops into a Pry session with the
    # handler's arguments in scope.
    def console(response = nil, url: nil, data: {})
      binding.pry
    end

    ###

    def storage
      # Note: for `.crawl!` uses shared thread safe Storage instance,
      # otherwise, each spider instance will have its own Storage
      @storage ||= self.with_info ? self.class.storage : Storage.new
    end

    # True if `value` was not seen before within `scope` (and records it).
    def unique?(scope, value)
      storage.unique?(scope, value)
    end

    # Appends `item` to the file at `path` via a (memoized) Saver. During a
    # `.crawl!`, savers are shared class-wide so parallel instances write to
    # the same file.
    def save_to(path, item, format:, position: true, append: false)
      @savers[path] ||= begin
        options = { format: format, position: position, append: append }
        if self.with_info
          self.class.savers[path] ||= Saver.new(path, options)
        else
          Saver.new(path, options)
        end
      end

      @savers[path].save(item)
    end

    ###

    # Records a run event (counted in class-level run_info during a crawl).
    # Custom-scope events are additionally logged.
    def add_event(scope = :custom, event)
      if self.with_info
        self.class.add_event(scope, event)
      end

      logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
    end

    ###

    private

    # Builds an additional, independent browser session (e.g. to use a
    # second engine from within a handler).
    def create_browser(engine, config = {})
      Tanakai::BrowserBuilder.build(engine, config, spider: self)
    end

    # Duplicate-request check backing config[:skip_duplicate_requests].
    # With a Hash config, :scope selects the storage scope and :check_only
    # only tests membership without recording the url.
    def unique_request?(url)
      options = @config[:skip_duplicate_requests]
      if options.class == Hash
        scope = options[:scope] || :requests_urls
        if options[:check_only]
          storage.include?(scope, url) ? false : true
        else
          storage.unique?(scope, url) ? true : false
        end
      else
        storage.unique?(:requests_urls, url) ? true : false
      end
    end

    # Pushes `item` through each configured pipeline in order (each pipeline
    # receives the previous pipeline's return value). Returns true when all
    # pipelines processed the item, false when one raised (item "dropped").
    def send_item(item, options = {})
      logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
      self.class.update(:items, :sent) if self.with_info

      @pipelines.each do |name, instance|
        item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
      end
    rescue => e
      logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
      add_event(:drop_items_errors, e.inspect) if self.with_info
      false
    else
      self.class.update(:items, :processed) if self.with_info
      logger.info "Pipeline: processed: #{JSON.generate(item)}"
      true
    ensure
      if self.with_info
        logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
      end
    end

    # Processes `urls` concurrently: splits them into `threads:` groups and
    # runs each group in its own thread with its own spider instance (and
    # therefore its own browser). Blocks until all threads finish.
    def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
      parts = urls.in_sorted_groups(threads, false)
      urls_count = urls.size

      all = []
      start_time = Time.now
      logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"

      parts.each do |part|
        all << Thread.new(part) do |part|
          Thread.current.abort_on_exception = true

          spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
          spider.with_info = true if self.with_info

          part.each do |url_data|
            if url_data.class == Hash
              # Hash entries with both :url and :data become full requests;
              # other hashes are passed to the handler directly.
              if url_data[:url].present? && url_data[:data].present?
                spider.request_to(handler, delay, url_data)
              else
                spider.public_send(handler, url_data)
              end
            else
              spider.request_to(handler, delay, url: url_data, data: data)
            end
          end
        ensure
          spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
        end

        # stagger thread startup to avoid spawning all browsers at once
        sleep 0.5
      end

      all.each(&:join)
      logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Tanakai
  # Private URL helper methods mixed into spiders (included by Base).
  module BaseHelper
    private

    # Joins a possibly-relative `url` against `base` and returns the
    # absolute, percent-escaped URL as a String. Returns nil when url is nil.
    # NOTE: `URI.escape` was removed in Ruby 3.0, so we use the equivalent
    # `URI::DEFAULT_PARSER.escape` instead.
    def absolute_url(url, base:)
      return unless url
      URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
    end

    # Returns `url` unchanged when it already parses; otherwise tries to
    # percent-escape it into a parseable form, falling back to the raw
    # input if even the escaped form is invalid.
    def escape_url(url)
      URI.parse(url)
    rescue URI::InvalidURIError
      URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s rescue url
    else
      url
    end

    # Absolute + escaped form of `url` resolved against `base:`.
    def normalize_url(url, base:)
      escape_url(absolute_url(url, base: base))
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'capybara/apparition'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/session'
require_relative '../capybara_ext/apparition/driver'

module Tanakai::BrowserBuilder
  # Builds a Capybara session backed by the Apparition (headless Chrome)
  # driver, configured from the spider's config hash.
  class ApparitionBuilder
    attr_reader :logger, :spider

    # config: the spider's merged config hash; spider: the owning spider.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :apparition driver, creates the session, and applies
    # headers, user_agent and cookies from @config. Returns the session.
    def build
      # Register driver
      Capybara.register_driver :apparition do |app|
        # TIMEOUT (seconds) and DEBUG come from the environment.
        timeout = ENV.fetch('TIMEOUT', 30).to_i
        driver_options = { js_errors: false, timeout: timeout, debug: ENV['DEBUG'] }

        # HEADLESS=false runs a visible browser window.
        driver_options[:headless] = ENV.fetch("HEADLESS", "true") == "true"
        logger.debug "BrowserBuilder (apparition): enabled extensions"

        Capybara::Apparition::Driver.new(app, driver_options)
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:apparition)
      @browser.spider = spider
      logger.debug "BrowserBuilder (apparition): created browser instance"

      # Headers
      if headers = @config[:headers].presence
        @browser.driver.headers = headers
        logger.debug "BrowserBuilder (apparition): enabled custom headers"
      end

      # user_agent may be a Proc (called once here) or a plain string.
      if user_agent = @config[:user_agent].presence
        user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip

        @browser.driver.add_header("User-Agent", user_agent_string)
        logger.debug "BrowserBuilder (apparition): enabled custom user_agent"
      end

      # Cookies: each entry needs :name and :value; remaining keys are
      # passed through as cookie attributes.
      if cookies = @config[:cookies].presence
        cookies.each do |cookie|
          @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end

        logger.debug "BrowserBuilder (apparition): enabled custom cookies"
      end

      @browser
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'capybara/cuprite'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/session'
require_relative '../capybara_ext/cuprite/driver'

module Tanakai::BrowserBuilder
  # Builds a Capybara session backed by the Cuprite (CDP headless Chrome)
  # driver, configured from the spider's config hash.
  class CupriteBuilder
    attr_reader :logger, :spider

    # config: the spider's merged config hash; spider: the owning spider.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :cuprite driver, creates the session, and applies
    # headers, user_agent and cookies from @config. Returns the session.
    def build
      # Register driver
      Capybara.register_driver :cuprite do |app|
        # HEADLESS=false runs a visible browser window.
        driver_options = { headless: ENV.fetch("HEADLESS", "true") == "true" }
        logger.debug "BrowserBuilder (cuprite): enabled extensions"

        Capybara::Cuprite::Driver.new(app, driver_options)
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:cuprite)
      @browser.spider = spider
      logger.debug "BrowserBuilder (cuprite): created browser instance"

      # Headers
      if headers = @config[:headers].presence
        @browser.driver.headers = headers
        logger.debug "BrowserBuilder (cuprite): enabled custom headers"
      end

      # user_agent may be a Proc (called once here) or a plain string.
      # NOTE: this assigns driver.headers, so it replaces any headers set
      # by the :headers option above with just the User-Agent.
      if user_agent = @config[:user_agent].presence
        user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
        @browser.driver.headers = {"User-Agent" => user_agent_string}
        logger.debug "BrowserBuilder (cuprite): enabled custom user_agent"
      end

      # Cookies: each entry needs :name and :value; remaining keys are
      # passed through as cookie attributes.
      if cookies = @config[:cookies].presence
        cookies.each do |cookie|
          @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end

        logger.debug "BrowserBuilder (cuprite): enabled custom cookies"
      end

      @browser
    end
  end
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
require 'capybara'
require 'capybara/mechanize'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/mechanize/driver'
require_relative '../capybara_ext/session'

module Tanakai::BrowserBuilder
  # Builds a Capybara session backed by the Mechanize driver (pure-HTTP,
  # no JavaScript), configured from the spider's config hash.
  class MechanizeBuilder
    attr_reader :logger, :spider

    # config: the spider's merged config hash; spider: the owning spider.
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :mechanize driver, creates the session, and applies
    # proxy/SSL/headers/cookies plus the session-level before_request
    # options from @config. Returns the session.
    def build
      # Register driver
      Capybara.register_driver :mechanize do |app|
        driver = Capybara::Mechanize::Driver.new("app")
        # keep the history as small as possible (by default it's unlimited)
        driver.configure { |a| a.history.max_size = 2 }
        driver
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:mechanize)
      @browser.spider = spider
      logger.debug "BrowserBuilder (mechanize): created browser instance"

      # Browser extensions are a JS feature; not applicable to Mechanize.
      if @config[:extensions].present?
        logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
      end

      # Proxy: expects "ip:port:type[:user:password]"; only http is supported.
      if proxy = @config[:proxy].presence
        proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
        ip, port, type = proxy_string.split(":")

        if type == "http"
          @browser.driver.set_proxy(*proxy_string.split(":"))
          logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
        else
          logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
        end
      end

      # SSL
      if ssl_cert_path = @config[:ssl_cert_path].presence
        @browser.driver.browser.agent.http.ca_file = ssl_cert_path
        logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
      end

      # Disables certificate verification on the underlying Mechanize agent.
      if @config[:ignore_ssl_errors].present?
        @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
        logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
      end

      # Headers
      if headers = @config[:headers].presence
        @browser.driver.headers = headers
        logger.debug "BrowserBuilder (mechanize): enabled custom headers"
      end

      # user_agent may be a Proc (called once here) or a plain string.
      if user_agent = @config[:user_agent].presence
        user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip

        @browser.driver.add_header("User-Agent", user_agent_string)
        logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
      end

      # Cookies: each entry needs :name and :value; remaining keys are
      # passed through as cookie attributes.
      if cookies = @config[:cookies].presence
        cookies.each do |cookie|
          @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end

        logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
      end

      # Browser instance options
      # skip_request_errors
      if skip_errors = @config[:skip_request_errors].presence
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors"
      end

      # retry_request_errors
      if retry_errors = @config[:retry_request_errors].presence
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
      end

      # restart_if: only meaningful for real browser engines.
      if @config[:restart_if].present?
        logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped"
      end

      # before_request clear_cookies
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
      end

      # before_request clear_and_set_cookies (requires :cookies to restore)
      if @config.dig(:before_request, :clear_and_set_cookies)
        if cookies = @config[:cookies].presence
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      # before_request change_user_agent (requires :user_agent as a Proc,
      # so a fresh value can be generated before each request)
      if @config.dig(:before_request, :change_user_agent)
        if @config[:user_agent].present? && @config[:user_agent].class == Proc
          @browser.config.user_agent = @config[:user_agent]
          @browser.config.before_request[:change_user_agent] = true
          logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
        else
          logger.error "BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
        end
      end

      # before_request change_proxy (requires :proxy as a Proc, same reason)
      if @config.dig(:before_request, :change_proxy)
        if @config[:proxy].present? && @config[:proxy].class == Proc
          @browser.config.proxy = @config[:proxy]
          @browser.config.before_request[:change_proxy] = true
          logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
        else
          logger.error "BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
        end
      end

      # before_request delay (seconds to sleep before each request)
      if delay = @config.dig(:before_request, :delay).presence
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
      end

      # encoding: forces the response body encoding
      if encoding = @config[:encoding]
        @browser.config.encoding = encoding
        logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}"
      end

      # return Capybara session instance
      @browser
    end
  end
end
|