tanakai 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +118 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/tanakai +6 -0
- data/lib/tanakai/automation/deploy.yml +54 -0
- data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
- data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
- data/lib/tanakai/automation/setup.yml +45 -0
- data/lib/tanakai/base/saver.rb +106 -0
- data/lib/tanakai/base/storage.rb +54 -0
- data/lib/tanakai/base.rb +326 -0
- data/lib/tanakai/base_helper.rb +22 -0
- data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
- data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
- data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/tanakai/browser_builder.rb +20 -0
- data/lib/tanakai/capybara_configuration.rb +10 -0
- data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
- data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/tanakai/capybara_ext/session/config.rb +22 -0
- data/lib/tanakai/capybara_ext/session.rb +249 -0
- data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
- data/lib/tanakai/cli/generator.rb +57 -0
- data/lib/tanakai/cli.rb +183 -0
- data/lib/tanakai/core_ext/array.rb +14 -0
- data/lib/tanakai/core_ext/hash.rb +5 -0
- data/lib/tanakai/core_ext/numeric.rb +19 -0
- data/lib/tanakai/core_ext/string.rb +7 -0
- data/lib/tanakai/pipeline.rb +33 -0
- data/lib/tanakai/runner.rb +60 -0
- data/lib/tanakai/template/.gitignore +18 -0
- data/lib/tanakai/template/Gemfile +28 -0
- data/lib/tanakai/template/README.md +3 -0
- data/lib/tanakai/template/config/application.rb +37 -0
- data/lib/tanakai/template/config/automation.yml +13 -0
- data/lib/tanakai/template/config/boot.rb +22 -0
- data/lib/tanakai/template/config/initializers/.keep +0 -0
- data/lib/tanakai/template/config/schedule.rb +57 -0
- data/lib/tanakai/template/db/.keep +0 -0
- data/lib/tanakai/template/helpers/application_helper.rb +3 -0
- data/lib/tanakai/template/lib/.keep +0 -0
- data/lib/tanakai/template/log/.keep +0 -0
- data/lib/tanakai/template/pipelines/saver.rb +11 -0
- data/lib/tanakai/template/pipelines/validator.rb +24 -0
- data/lib/tanakai/template/spiders/application_spider.rb +143 -0
- data/lib/tanakai/template/tmp/.keep +0 -0
- data/lib/tanakai/version.rb +3 -0
- data/lib/tanakai.rb +54 -0
- data/tanakai.gemspec +50 -0
- metadata +382 -0
data/lib/tanakai/base.rb
ADDED
@@ -0,0 +1,326 @@
|
|
1
|
+
require_relative 'base/saver'
require_relative 'base/storage'

module Tanakai
  # Base class for all spiders. Holds class-level run state for `.crawl!`
  # (counters, storage, savers) and provides the per-instance request/parse,
  # item-pipeline and parallel-crawling machinery that subclasses build on.
  class Base
    # Raised by #request_to when the given url does not parse as HTTP(S).
    class InvalidUrlError < StandardError; end

    # don't deep merge config's headers hash option
    DMERGE_EXCLUDE = [:headers]

    # Log formatter used for the default logger. Prefixes each line with
    # severity, timestamp, pid and a thread tag ("M" = main thread,
    # "C" = child/worker thread, plus the thread object id). Output is
    # colorized via Rbcat in development unless colorize_logger is false.
    LoggerFormatter = proc do |severity, datetime, progname, msg|
      current_thread_id = Thread.current.object_id
      thread_type = Thread.main == Thread.current ? "M" : "C"
      output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
        .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]

      if Tanakai.configuration.colorize_logger != false && Tanakai.env == "development"
        Rbcat.colorize(output, predefined: [:jsonhash, :logger])
      else
        output
      end
    end

    include BaseHelper

    ###

    class << self
      # Run-wide state; populated by `.crawl!` and nil outside of a run.
      attr_reader :run_info, :savers, :storage
    end

    # True while a `.crawl!` session for this spider class is in progress.
    def self.running?
      @run_info && @run_info[:status] == :running
    end

    # True after a `.crawl!` session finished without raising.
    def self.completed?
      @run_info && @run_info[:status] == :completed
    end

    # True after a `.crawl!` session raised an error.
    def self.failed?
      @run_info && @run_info[:status] == :failed
    end

    # { requests:, responses: } counters for the current run (nil if not running).
    def self.visits
      @run_info && @run_info[:visits]
    end

    # { sent:, processed: } item counters for the current run (nil if not running).
    def self.items
      @run_info && @run_info[:items]
    end

    # Thread-safely increments a run_info counter, e.g. update(:visits, :requests).
    # No-op when no crawl session is active.
    def self.update(type, subtype)
      return unless @run_info
      @update_mutex.synchronize { @run_info[type][subtype] += 1 }
    end

    # Thread-safely bumps an event counter under run_info[:events][scope].
    # No-op when no crawl session is active.
    def self.add_event(scope, event)
      return unless @run_info
      @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
    end

    ###

    # Class-level defaults; subclasses inherit them lazily through the
    # `superclass` fallbacks in .engine / .pipelines / .config below.
    @engine = :mechanize
    @pipelines = []
    @config = {}

    # Spider name set by subclasses via @name. NOTE(review): this overrides
    # Class#name, so an anonymous/base spider returns nil here.
    def self.name
      @name
    end

    # Browser engine symbol (e.g. :mechanize); falls back to the superclass.
    def self.engine
      @engine ||= superclass.engine
    end

    # Pipeline name list; falls back to the superclass.
    def self.pipelines
      @pipelines ||= superclass.pipelines
    end

    # Array of urls (or request hashes) that crawl! will feed to :parse.
    def self.start_urls
      @start_urls
    end

    # Effective config: all superclass configs deep-merged together, except
    # DMERGE_EXCLUDE keys (:headers) which are replaced wholesale.
    def self.config
      if superclass.equal?(::Object)
        @config
      else
        superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)
      end
    end

    ###

    # Memoized logger: the configured one, or a STDOUT Logger using
    # LoggerFormatter. Level comes from ENV["LOG_LEVEL"], then configuration,
    # then DEBUG; resolved via constantize into a Logger severity constant.
    def self.logger
      @logger ||= Tanakai.configuration.logger || begin
        log_level = (ENV["LOG_LEVEL"] || Tanakai.configuration.log_level || "DEBUG").to_s.upcase
        log_level = "Logger::#{log_level}".constantize
        Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
      end
    end

    # Runs a full crawl session: sets up shared storage/savers/mutex and
    # run_info, visits each of start_urls via :parse (or calls #parse when
    # there are none), and tears everything down in the ensure block.
    # Returns false if a session is already running. When exception_on_fail
    # is false, failures return [run_info, error] instead of raising.
    def self.crawl!(exception_on_fail: true)
      logger.error "Spider: already running: #{name}" and return false if running?

      @storage = Storage.new
      @savers = {}
      @update_mutex = Mutex.new

      @run_info = {
        spider_name: name, status: :running, error: nil, environment: Tanakai.env,
        start_time: Time.new, stop_time: nil, running_time: nil,
        visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
        events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
      }

      ###

      logger.info "Spider: started: #{name}"
      open_spider if self.respond_to? :open_spider

      spider = self.new
      spider.with_info = true
      if start_urls
        start_urls.each do |start_url|
          if start_url.class == Hash
            # NOTE(review): passes a hash positionally into keyword params;
            # relies on Ruby 2.x hash-to-keywords conversion — verify on Ruby 3.
            spider.request_to(:parse, start_url)
          else
            spider.request_to(:parse, url: start_url)
          end
        end
      else
        spider.parse
      end
    rescue StandardError, SignalException, SystemExit => e
      # SignalException/SystemExit are caught deliberately so run_info gets a
      # final status before the process exits.
      @run_info.merge!(status: :failed, error: e.inspect)
      exception_on_fail ? raise(e) : [@run_info, e]
    else
      @run_info.merge!(status: :completed)
    ensure
      if spider
        spider.browser.destroy_driver! if spider.instance_variable_get("@browser")

        stop_time = Time.now
        total_time = (stop_time - @run_info[:start_time]).round(3)
        @run_info.merge!(stop_time: stop_time, running_time: total_time)

        close_spider if self.respond_to? :close_spider

        message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
        failed? ? logger.fatal(message) : logger.info(message)

        # Reset all shared class-level run state (multiple assignment: every
        # variable on the left becomes nil).
        @run_info, @storage, @savers, @update_mutex = nil
      end
    end

    # One-shot invocation of a single handler (used by the CLI): calls the
    # handler with positional args, or as a request when a request hash is
    # given, and always destroys the browser driver afterwards.
    def self.parse!(handler, *args, **request)
      spider = self.new

      if args.present?
        spider.public_send(handler, *args)
      elsif request.present?
        spider.request_to(handler, request)
      else
        spider.public_send(handler)
      end
    ensure
      spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
    end

    ###

    attr_reader :logger
    # Set to true for spiders participating in a `.crawl!` session so they
    # report into the shared class-level run_info/storage/savers.
    attr_accessor :with_info

    # engine - overrides the class-level engine; config: - deep-merged over
    # the class-level config (except :headers, replaced wholesale).
    # Instantiates every registered pipeline and points it back at self.
    def initialize(engine = self.class.engine, config: {})
      @engine = engine || self.class.engine
      @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
      @pipelines = self.class.pipelines.map do |pipeline_name|
        klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
        instance = klass.new
        instance.spider = self
        [pipeline_name, instance]
      end.to_h

      @logger = self.class.logger
      @savers = {}
    end

    # Lazily-built Capybara session for this spider's engine/config.
    def browser
      @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
    end

    # Visits `url` (optionally after `delay` seconds) and passes the parsed
    # response plus { url:, data: } to the given handler method.
    # Skips non-unique urls when config[:skip_duplicate_requests] is set,
    # and returns early when the browser reports the visit was skipped.
    # Raises InvalidUrlError for urls that aren't HTTP(S).
    def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
      raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)

      if @config[:skip_duplicate_requests] && !unique_request?(url)
        add_event(:duplicate_requests) if self.with_info
        logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
      end

      visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
      return unless visited

      public_send(handler, browser.current_response(response_type), { url: url, data: data })
    end

    # Drops into an interactive Pry session; signature mirrors a handler so
    # it can be used as a parse callback while debugging.
    def console(response = nil, url: nil, data: {})
      binding.pry
    end

    ###

    def storage
      # Note: for `.crawl!` uses shared thread safe Storage instance,
      # otherwise, each spider instance will have it's own Storage
      @storage ||= self.with_info ? self.class.storage : Storage.new
    end

    # True the first time `value` is seen within `scope` (delegates to storage).
    def unique?(scope, value)
      storage.unique?(scope, value)
    end

    # Appends `item` to the saver for `path`, creating it on first use.
    # During a `.crawl!` run the saver is shared at class level so parallel
    # spider instances write to the same file.
    def save_to(path, item, format:, position: true, append: false)
      @savers[path] ||= begin
        options = { format: format, position: position, append: append }
        if self.with_info
          self.class.savers[path] ||= Saver.new(path, options)
        else
          Saver.new(path, options)
        end
      end

      @savers[path].save(item)
    end

    ###

    # Records an event into the shared run_info (when part of a crawl run)
    # and logs it when it's a :custom-scoped event.
    def add_event(scope = :custom, event)
      if self.with_info
        self.class.add_event(scope, event)
      end

      logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
    end

    ###

    private

    # Builds an additional, independent browser session (not memoized).
    def create_browser(engine, config = {})
      Tanakai::BrowserBuilder.build(engine, config, spider: self)
    end

    # Decides whether `url` should be fetched under skip_duplicate_requests.
    # Hash form supports :scope (default :requests_urls) and :check_only
    # (only test membership, don't record); otherwise the url is recorded
    # into storage as a side effect of unique?.
    def unique_request?(url)
      options = @config[:skip_duplicate_requests]
      if options.class == Hash
        scope = options[:scope] || :requests_urls
        if options[:check_only]
          storage.include?(scope, url) ? false : true
        else
          storage.unique?(scope, url) ? true : false
        end
      else
        storage.unique?(:requests_urls, url) ? true : false
      end
    end

    # Runs `item` through every pipeline in order (each pipeline receives the
    # previous one's return value). Returns true when all pipelines succeed,
    # false when any raises (the item is "dropped" and the error counted).
    def send_item(item, options = {})
      logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
      self.class.update(:items, :sent) if self.with_info

      @pipelines.each do |name, instance|
        item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
      end
    rescue => e
      logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
      add_event(:drop_items_errors, e.inspect) if self.with_info
      false
    else
      self.class.update(:items, :processed) if self.with_info
      logger.info "Pipeline: processed: #{JSON.generate(item)}"
      true
    ensure
      if self.with_info
        logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
      end
    end

    # Splits `urls` into `threads` groups and processes each group in its own
    # thread with a dedicated spider instance (its own browser). Hash entries
    # with both :url and :data are treated as requests; other hashes are passed
    # straight to the handler. Threads are staggered by 0.5s and joined before
    # returning.
    def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
      parts = urls.in_sorted_groups(threads, false)
      urls_count = urls.size

      all = []
      start_time = Time.now
      logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"

      parts.each do |part|
        all << Thread.new(part) do |part|
          Thread.current.abort_on_exception = true

          spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
          spider.with_info = true if self.with_info

          part.each do |url_data|
            if url_data.class == Hash
              if url_data[:url].present? && url_data[:data].present?
                # NOTE(review): hash passed positionally into keyword params;
                # relies on Ruby 2.x conversion — verify on Ruby 3.
                spider.request_to(handler, delay, url_data)
              else
                spider.public_send(handler, url_data)
              end
            else
              spider.request_to(handler, delay, url: url_data, data: data)
            end
          end
        ensure
          # Always tear the thread-local browser down, even on error.
          spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
        end

        sleep 0.5
      end

      all.each(&:join)
      logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Tanakai
  # Private url helpers mixed into spiders (see Base `include BaseHelper`).
  module BaseHelper
    private

    # Resolves +url+ against +base+ and returns the absolute url string.
    # Returns nil when +url+ is nil. Unsafe characters (e.g. spaces) are
    # percent-escaped first so URI.join doesn't raise.
    #
    # Fix: the original called URI.escape, which was deprecated and then
    # removed in Ruby 3.0 (NoMethodError on modern rubies); the parser's
    # escape provides the same behavior.
    def absolute_url(url, base:)
      return unless url
      URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
    end

    # Returns +url+ unchanged when it already parses as a URI; otherwise
    # percent-escapes it and returns the escaped form, falling back to the
    # original string if even the escaped version can't be parsed.
    def escape_url(url)
      URI.parse(url)
    rescue URI::InvalidURIError
      URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s rescue url
    else
      url
    end

    # Absolutizes +url+ against +base+, then escapes the result.
    def normalize_url(url, base:)
      escape_url(absolute_url(url, base: base))
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'capybara/apparition'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/session'
require_relative '../capybara_ext/apparition/driver'

module Tanakai::BrowserBuilder
  # Builds a Capybara session driven by Apparition (headless Chrome) and
  # applies headers, user_agent and cookies from the spider config.
  class ApparitionBuilder
    attr_reader :logger, :spider

    # config - the spider's config hash; spider - owning spider (supplies logger).
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :apparition driver and returns a configured
    # Capybara::Session bound to the spider.
    def build
      # Driver registration: timeout via TIMEOUT env (default 30s),
      # headless unless HEADLESS env is "false", debug via DEBUG env.
      Capybara.register_driver :apparition do |app|
        opts = {
          js_errors: false,
          timeout: ENV.fetch('TIMEOUT', 30).to_i,
          debug: ENV['DEBUG'],
          headless: ENV.fetch("HEADLESS", "true") == "true"
        }
        logger.debug "BrowserBuilder (apparition): enabled extensions"
        Capybara::Apparition::Driver.new(app, opts)
      end

      # Session creation
      @browser = Capybara::Session.new(:apparition)
      @browser.spider = spider
      logger.debug "BrowserBuilder (apparition): created browser instance"

      # Custom request headers
      headers = @config[:headers].presence
      if headers
        @browser.driver.headers = headers
        logger.debug "BrowserBuilder (apparition): enabled custom headers"
      end

      # Custom user agent (plain string or a Proc returning one)
      user_agent = @config[:user_agent].presence
      if user_agent
        resolved = user_agent.is_a?(Proc) ? user_agent.call : user_agent
        @browser.driver.add_header("User-Agent", resolved.strip)
        logger.debug "BrowserBuilder (apparition): enabled custom user_agent"
      end

      # Custom cookies (array of { name:, value:, ... } hashes)
      cookies = @config[:cookies].presence
      if cookies
        cookies.each { |c| @browser.driver.set_cookie(c[:name], c[:value], c) }
        logger.debug "BrowserBuilder (apparition): enabled custom cookies"
      end

      @browser
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'capybara/cuprite'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/session'
require_relative '../capybara_ext/cuprite/driver'

module Tanakai::BrowserBuilder
  # Builds a Capybara session backed by Cuprite (CDP-driven headless Chrome)
  # and applies headers, user_agent and cookies from the spider config.
  class CupriteBuilder
    attr_reader :logger, :spider

    # config - the spider's config hash; spider - owning spider (supplies logger).
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :cuprite driver and returns a configured
    # Capybara::Session bound to the spider.
    def build
      # Driver registration: headless unless HEADLESS env is "false".
      Capybara.register_driver :cuprite do |app|
        headless = ENV.fetch("HEADLESS", "true") == "true"
        logger.debug "BrowserBuilder (cuprite): enabled extensions"
        Capybara::Cuprite::Driver.new(app, { headless: headless })
      end

      # Session creation
      @browser = Capybara::Session.new(:cuprite)
      @browser.spider = spider
      logger.debug "BrowserBuilder (cuprite): created browser instance"

      # Custom request headers
      headers = @config[:headers].presence
      if headers
        @browser.driver.headers = headers
        logger.debug "BrowserBuilder (cuprite): enabled custom headers"
      end

      # Custom user agent (plain string or a Proc returning one).
      # Note: cuprite takes the UA via the headers setter.
      user_agent = @config[:user_agent].presence
      if user_agent
        resolved = user_agent.is_a?(Proc) ? user_agent.call : user_agent
        @browser.driver.headers = { "User-Agent" => resolved.strip }
        logger.debug "BrowserBuilder (cuprite): enabled custom user_agent"
      end

      # Custom cookies (array of { name:, value:, ... } hashes)
      cookies = @config[:cookies].presence
      if cookies
        cookies.each { |c| @browser.driver.set_cookie(c[:name], c[:value], c) }
        logger.debug "BrowserBuilder (cuprite): enabled custom cookies"
      end

      @browser
    end
  end
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
require 'capybara'
require 'capybara/mechanize'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/mechanize/driver'
require_relative '../capybara_ext/session'

module Tanakai::BrowserBuilder
  # Builds a Capybara session backed by Mechanize (pure-HTTP, no JS) and
  # applies proxy, SSL, header, cookie and before_request options from the
  # spider config. Options Mechanize can't honor are logged and skipped.
  class MechanizeBuilder
    attr_reader :logger, :spider

    # config - the spider's config hash; spider - owning spider (supplies logger).
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :mechanize driver and returns a configured
    # Capybara::Session bound to the spider.
    def build
      # Register driver
      Capybara.register_driver :mechanize do |app|
        driver = Capybara::Mechanize::Driver.new("app")
        # keep the history as small as possible (by default it's unlimited)
        driver.configure { |a| a.history.max_size = 2 }
        driver
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:mechanize)
      @browser.spider = spider
      logger.debug "BrowserBuilder (mechanize): created browser instance"

      # Browser extensions require a JS engine, which Mechanize lacks.
      if @config[:extensions].present?
        logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
      end

      # Proxy: expects "ip:port:type[:user:password]"; only http is supported.
      # A Proc value is called so a fresh proxy can be picked per build.
      if proxy = @config[:proxy].presence
        proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
        ip, port, type = proxy_string.split(":")

        if type == "http"
          @browser.driver.set_proxy(*proxy_string.split(":"))
          logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
        else
          logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
        end
      end

      # SSL: custom CA file, or disable verification entirely.
      if ssl_cert_path = @config[:ssl_cert_path].presence
        @browser.driver.browser.agent.http.ca_file = ssl_cert_path
        logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
      end

      if @config[:ignore_ssl_errors].present?
        @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
        logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
      end

      # Headers
      if headers = @config[:headers].presence
        @browser.driver.headers = headers
        logger.debug "BrowserBuilder (mechanize): enabled custom headers"
      end

      # Custom user agent (plain string or a Proc returning one)
      if user_agent = @config[:user_agent].presence
        user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip

        @browser.driver.add_header("User-Agent", user_agent_string)
        logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
      end

      # Cookies: array of { name:, value:, ... } hashes.
      if cookies = @config[:cookies].presence
        cookies.each do |cookie|
          @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
        end

        logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
      end

      # Browser instance options
      # skip_request_errors
      if skip_errors = @config[:skip_request_errors].presence
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors"
      end

      # retry_request_errors
      if retry_errors = @config[:retry_request_errors].presence
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
      end

      # restart_if (memory-based browser restarts) needs a real browser process.
      if @config[:restart_if].present?
        logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped"
      end

      # before_request clear_cookies
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
      end

      # before_request clear_and_set_cookies (takes priority over clear_cookies
      # in the session; requires :cookies to be configured)
      if @config.dig(:before_request, :clear_and_set_cookies)
        if cookies = @config[:cookies].presence
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      # before_request change_user_agent: only meaningful with a Proc
      # user_agent so a new value can be produced before each request.
      if @config.dig(:before_request, :change_user_agent)
        if @config[:user_agent].present? && @config[:user_agent].class == Proc
          @browser.config.user_agent = @config[:user_agent]
          @browser.config.before_request[:change_user_agent] = true
          logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
        else
          logger.error "BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
        end
      end

      # before_request change_proxy: same Proc requirement as change_user_agent.
      if @config.dig(:before_request, :change_proxy)
        if @config[:proxy].present? && @config[:proxy].class == Proc
          @browser.config.proxy = @config[:proxy]
          @browser.config.before_request[:change_proxy] = true
          logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
        else
          logger.error "BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
        end
      end

      # before_request delay (seconds to sleep before each request)
      if delay = @config.dig(:before_request, :delay).presence
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
      end

      # encoding override for response bodies
      if encoding = @config[:encoding]
        @browser.config.encoding = encoding
        logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}"
      end

      # return Capybara session instance
      @browser
    end
  end
end
|