scraper_utils 0.1.0 → 0.3.0
This diff shows the content of publicly released package versions as published to their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +5 -8
- data/CHANGELOG.md +14 -0
- data/GUIDELINES.md +75 -0
- data/Gemfile +6 -3
- data/IMPLEMENTATION.md +33 -0
- data/README.md +226 -177
- data/SPECS.md +25 -0
- data/bin/console +1 -0
- data/bin/setup +2 -1
- data/docs/example_scrape_with_fibers.rb +31 -0
- data/docs/example_scraper.rb +93 -0
- data/lib/scraper_utils/adaptive_delay.rb +70 -0
- data/lib/scraper_utils/authority_utils.rb +2 -2
- data/lib/scraper_utils/data_quality_monitor.rb +64 -0
- data/lib/scraper_utils/date_range_utils.rb +159 -0
- data/lib/scraper_utils/db_utils.rb +1 -2
- data/lib/scraper_utils/debug_utils.rb +63 -23
- data/lib/scraper_utils/fiber_scheduler.rb +229 -0
- data/lib/scraper_utils/log_utils.rb +58 -25
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +276 -0
- data/lib/scraper_utils/mechanize_utils.rb +32 -30
- data/lib/scraper_utils/randomize_utils.rb +34 -0
- data/lib/scraper_utils/robots_checker.rb +149 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +6 -10
- data/scraper_utils.gemspec +3 -8
- metadata +17 -74
data/lib/scraper_utils/fiber_scheduler.rb (new file, +229):

```diff
@@ -0,0 +1,229 @@
+# frozen_string_literal: true
+
+require "fiber"
+
+module ScraperUtils
+  # A utility module for interleaving multiple scraping operations
+  # using fibers during connection delay periods. This allows efficient
+  # use of wait time by switching between operations.
+  module FiberScheduler
+    # @return [Array<Fiber>] List of active fibers managed by the scheduler
+    def self.registry
+      @registry ||= []
+    end
+
+    # Checks if the current code is running within a registered fiber
+    #
+    # @return [Boolean] true if running in a registered fiber, false otherwise
+    def self.in_fiber?
+      !Fiber.current.nil? && registry.include?(Fiber.current)
+    end
+
+    # Gets the authority associated with the current fiber
+    #
+    # @return [String, nil] the authority name or nil if not in a fiber
+    def self.current_authority
+      return nil unless in_fiber?
+
+      Fiber.current.instance_variable_get(:@authority)
+    end
+
+    # Logs a message, automatically prefixing with authority name if in a fiber
+    #
+    # @param message [String] the message to log
+    # @return [void]
+    def self.log(message)
+      authority = current_authority
+      $stderr.flush
+      if authority
+        puts "[#{authority}] #{message}"
+      else
+        puts message
+      end
+      $stdout.flush
+    end
+
+    # Returns a hash of exceptions encountered during processing, indexed by authority
+    #
+    # @return [Hash{Symbol => Exception}] exceptions by authority
+    def self.exceptions
+      @exceptions ||= {}
+    end
+
+    # Returns a hash of the yielded / block values
+    #
+    # @return [Hash{Symbol => Any}] values by authority
+    def self.values
+      @values ||= {}
+    end
+
+    # Checks if fiber scheduling is currently enabled
+    #
+    # @return [Boolean] true if enabled, false otherwise
+    def self.enabled?
+      @enabled ||= false
+    end
+
+    # Enables fiber scheduling
+    #
+    # @return [void]
+    def self.enable!
+      reset! unless enabled?
+      @enabled = true
+    end
+
+    # Disables fiber scheduling
+    #
+    # @return [void]
+    def self.disable!
+      @enabled = false
+    end
+
+    # Resets the scheduler state, and disables. Use before retrying failed authorities.
+    #
+    # @return [void]
+    def self.reset!
+      @registry = []
+      @exceptions = {}
+      @values = {}
+      @enabled = false
+      @delay_requested = 0.0
+      @time_slept = 0.0
+      @resume_count = 0
+      @initial_resume_at = Time.now - 60.0 # one minute ago
+    end
+
+    # Registers a block to scrape for a specific authority
+    #
+    # @param authority [String] the name of the authority being processed
+    # @yield to the block containing the scraping operation to be run in the fiber
+    # @return [Fiber] a fiber that calls the block. With @authority and @resume_at instance vars
+    def self.register_operation(authority, &block)
+      # Automatically enable fiber scheduling when operations are registered
+      enable!
+
+      fiber = Fiber.new do
+        values[authority] = block.call
+      rescue StandardError => e
+        # Store exception against the authority
+        exceptions[authority] = e
+      ensure
+        # Remove itself when done regardless of success/failure
+        registry.delete(Fiber.current)
+      end
+
+      # Start fibres in registration order
+      @initial_resume_at += 0.1
+      fiber.instance_variable_set(:@resume_at, @initial_resume_at)
+      fiber.instance_variable_set(:@authority, authority)
+      registry << fiber
+
+      if DebugUtils.basic?
+        FiberScheduler.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
+      end
+      # Process immediately when testing
+      fiber.resume if ScraperUtils::RandomizeUtils.sequential?
+      fiber
+    end
+
+    # Run all registered fibers until completion
+    #
+    # @return [Hash] Exceptions that occurred during execution
+    def self.run_all
+      count = registry.size
+      while (fiber = find_earliest_fiber)
+        if fiber.alive?
+          authority = begin
+            fiber.instance_variable_get(:@authority)
+          rescue StandardError
+            nil
+          end
+          @resume_count ||= 0
+          @resume_count += 1
+          values[authority] = fiber.resume
+        else
+          FiberScheduler.log "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
+          registry.delete(fiber)
+        end
+      end
+
+      if @time_slept&.positive? && @delay_requested&.positive?
+        percent_slept = (100.0 * @time_slept / @delay_requested).round(1)
+      end
+      puts
+      FiberScheduler.log "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, " \
+                           "sleeping #{percent_slept}% (#{@time_slept&.round(1)}) of the " \
+                           "#{@delay_requested&.round(1)} seconds requested."
+      puts
+
+      exceptions
+    end
+
+    # Delays the current fiber and potentially runs another one
+    # Falls back to regular sleep if fiber scheduling is not enabled
+    #
+    # @param seconds [Numeric] the number of seconds to delay
+    # @return [Integer] return from sleep operation or 0
+    def self.delay(seconds)
+      seconds = 0.0 unless seconds&.positive?
+      @delay_requested ||= 0.0
+      @delay_requested += seconds
+
+      current_fiber = Fiber.current
+
+      if !enabled? || !current_fiber || registry.size <= 1
+        @time_slept ||= 0.0
+        @time_slept += seconds
+        log("Sleeping #{seconds.round(3)} seconds") if DebugUtils.basic?
+        return sleep(seconds)
+      end
+
+      now = Time.now
+      resume_at = now + seconds
+
+      # Don't resume at the same time as someone else,
+      # FIFO queue if seconds == 0
+      @other_resumes ||= []
+      @other_resumes = @other_resumes.delete_if { |t| t < now }
+      while @other_resumes.include?(resume_at) && resume_at
+        resume_at += 0.01
+      end
+
+      # Used to compare when other fibers need to be resumed
+      current_fiber.instance_variable_set(:@resume_at, resume_at)
+
+      # Yield control back to the scheduler so another fiber can run
+      Fiber.yield
+
+      # When we get control back, check if we need to sleep more
+      remaining = resume_at - Time.now
+      if remaining.positive?
+        @time_slept ||= 0.0
+        @time_slept += remaining
+        log("Sleeping remaining #{remaining.round(3)} seconds") if DebugUtils.basic?
+        sleep(remaining)
+      end || 0
+    end
+
+    # Finds the fiber with the earliest wake-up time
+    #
+    # @return [Fiber, nil] the fiber with the earliest wake-up time or nil if none found
+    def self.find_earliest_fiber
+      earliest_time = nil
+      earliest_fiber = nil
+
+      registry.each do |fiber|
+        resume_at = fiber.instance_variable_get(:@resume_at)
+        if earliest_time.nil? || resume_at < earliest_time
+          earliest_time = resume_at
+          earliest_fiber = fiber
+        end
+      end
+
+      earliest_fiber
+    end
+
+    # Mark methods as private
+    private_class_method :find_earliest_fiber
+  end
+end
```
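Taken together, the new API suggests a usage pattern like the following minimal sketch. The authority names and the `scrape_authority` helper are hypothetical placeholders, not part of the gem; `register_operation`, `delay`, and `run_all` are the calls shown in the diff above:

```ruby
# Minimal sketch of interleaving two scrapers with FiberScheduler.
# :example_a/:example_b and scrape_authority are hypothetical placeholders.
require "scraper_utils"

%i[example_a example_b].each do |authority|
  ScraperUtils::FiberScheduler.register_operation(authority) do
    # Your scraping code. Any waits should go through
    # ScraperUtils::FiberScheduler.delay(seconds) so the scheduler can
    # resume another authority's fiber while this one is idle.
    scrape_authority(authority)
  end
end

# Resumes fibers in order of earliest @resume_at until all complete,
# then returns the Hash{Symbol => Exception} of failures.
exceptions = ScraperUtils::FiberScheduler.run_all
exceptions.each { |authority, e| warn "#{authority} failed: #{e}" }
```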
data/lib/scraper_utils/log_utils.rb (+58 -25; some removed lines are truncated in the source diff and kept as-is):

```diff
@@ -13,13 +13,10 @@ module ScraperUtils
     # @param start_time [Time] When this scraping attempt was started
     # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-    # @param
-    #
-    # - :unprocessable_records [Integer] Optional Number of unprocessable record like regions
-    # - :error [Exception, nil] Any exception that occurred during scraping
-    # - :proxy_used [Boolean] Whether a proxy was used
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    #   `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
     # @return [void]
-    def self.log_scraping_run(start_time, attempt, authorities,
+    def self.log_scraping_run(start_time, attempt, authorities, exceptions)
       raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
       raise ArgumentError, "Authorities must be a non-empty array" if authorities.empty?
 
@@ -31,10 +28,11 @@ module ScraperUtils
       interrupted = []
 
       authorities.each do |authority_label|
-
+        stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority_label, nil) || {}
 
-
-
+        exception = exceptions[authority_label]
+        status = if stats[:saved]&.positive?
+                   exception ? :interrupted : :successful
                  else
                    :failed
                  end
@@ -51,13 +49,12 @@ module ScraperUtils
           "run_at" => start_time.iso8601,
           "attempt" => attempt,
           "authority_label" => authority_label.to_s,
-          "
-          "unprocessable_records" =>
-          "used_proxy" => result[:proxy_used] ? 1 : 0,
+          "records_saved" => stats[:saved] || 0,
+          "unprocessable_records" => stats[:unprocessed] || 0,
           "status" => status.to_s,
-          "error_message" =>
-          "error_class" =>
-          "error_backtrace" => extract_meaningful_backtrace(
+          "error_message" => exception&.message,
+          "error_class" => exception&.class&.to_s,
+          "error_backtrace" => extract_meaningful_backtrace(exception)
         }
 
         save_log_record(record)
@@ -76,37 +73,72 @@ module ScraperUtils
       cleanup_old_records
     end
 
-
-
+    # Report on the results
+    # @param authorities [Array<Symbol>] List of authorities attempted to scrape
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    #   `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
+    # @return [void]
+    def self.report_on_results(authorities, exceptions)
+      if ENV["MORPH_EXPECT_BAD"]
+        expect_bad = ENV["MORPH_EXPECT_BAD"].split(",").map(&:strip).map(&:to_sym)
+      end
+      expect_bad ||= []
+
+      $stderr.flush
+      puts "MORPH_EXPECT_BAD=#{ENV.fetch('MORPH_EXPECT_BAD', nil)}"
+
+      # Print summary table
+      puts "\nScraping Summary:"
+      summary_format = "%-20s %6s %6s %s"
 
-      puts
+      puts format(summary_format, 'Authority', 'OK', 'Bad', 'Exception')
+      puts format(summary_format, "-" * 20, "-" * 6, "-" * 6, "-" * 50)
+
+      authorities.each do |authority|
+        stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
+
+        ok_records = stats[:saved] || 0
+        bad_records = stats[:unprocessed] || 0
+
+        expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
+        exception_msg = if exceptions[authority]
+                          "#{exceptions[authority].class} - #{exceptions[authority].message}"
+                        else
+                          "-"
+                        end
+        puts format(summary_format, authority.to_s, ok_records, bad_records,
+                    "#{expect_bad_prefix}#{exception_msg}".slice(0, 70))
+      end
+      puts
 
       errors = []
 
       # Check for authorities that were expected to be bad but are now working
       unexpected_working = expect_bad.select do |authority|
-
-
+        stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
+        stats[:saved]&.positive? && !exceptions[authority]
       end
 
       if unexpected_working.any?
-        errors <<
+        errors <<
+          "WARNING: Remove #{unexpected_working.join(',')} from MORPH_EXPECT_BAD as it now works!"
      end
 
       # Check for authorities with unexpected errors
       unexpected_errors = authorities
-                          .select { |authority|
+                          .select { |authority| exceptions[authority] }
                           .reject { |authority| expect_bad.include?(authority) }
 
       if unexpected_errors.any?
         errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
-
+                  "(Add to MORPH_EXPECT_BAD?)"
         unexpected_errors.each do |authority|
-          error =
+          error = exceptions[authority]
           errors << " #{authority}: #{error.class} - #{error.message}"
         end
       end
 
+      $stdout.flush
       if errors.any?
         errors << "See earlier output for details"
         raise errors.join("\n")
@@ -134,7 +166,8 @@ module ScraperUtils
         "interrupted" => interrupted.join(","),
         "successful_count" => successful.size,
         "interrupted_count" => interrupted.size,
-        "failed_count" => failed.size
+        "failed_count" => failed.size,
+        "public_ip" => ScraperUtils::MechanizeUtils.public_ip
       }
 
       ScraperWiki.save_sqlite(
```
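The new signatures fit together with `FiberScheduler.run_all` as in this hedged sketch (assuming the conventional `LogUtils` module name for `log_utils.rb`; the authorities list is a placeholder):

```ruby
# Sketch of the 0.3.0 logging flow: exceptions come from the scheduler,
# and record counts come from DataQualityMonitor.stats rather than a
# per-authority results hash.
authorities = %i[example_a example_b] # placeholder
start_time = Time.now
attempt = 1

exceptions = ScraperUtils::FiberScheduler.run_all

ScraperUtils::LogUtils.log_scraping_run(start_time, attempt, authorities, exceptions)
# Prints the summary table and raises if there are unexpected errors
# (or if an authority listed in MORPH_EXPECT_BAD starts working again).
ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
```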
data/lib/scraper_utils/mechanize_utils/agent_config.rb (new file, +276):

```diff
@@ -0,0 +1,276 @@
+# frozen_string_literal: true
+
+require "mechanize"
+require "ipaddr"
+
+module ScraperUtils
+  module MechanizeUtils
+    # Configuration for a Mechanize agent with sensible defaults and configurable settings.
+    # Supports global configuration through {.configure} and per-instance overrides.
+    #
+    # @example Setting global defaults
+    #   ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
+    #     config.default_timeout = 90
+    #     config.default_random_delay = 5
+    #   end
+    #
+    # @example Creating an instance with defaults
+    #   config = ScraperUtils::MechanizeUtils::AgentConfig.new
+    #
+    # @example Overriding specific settings
+    #   config = ScraperUtils::MechanizeUtils::AgentConfig.new(
+    #     timeout: 120,
+    #     random_delay: 10
+    #   )
+    class AgentConfig
+      DEFAULT_TIMEOUT = 60
+      DEFAULT_RANDOM_DELAY = 5
+      DEFAULT_MAX_LOAD = 33.3
+      MAX_LOAD_CAP = 50.0
+
+      # Class-level defaults that can be modified
+      class << self
+        # @return [Integer] Default timeout in seconds for agent connections
+        attr_accessor :default_timeout
+
+        # @return [Boolean] Default setting for compliance with headers and robots.txt
+        attr_accessor :default_compliant_mode
+
+        # @return [Integer, nil] Default average random delay in seconds
+        attr_accessor :default_random_delay
+
+        # @return [Float, nil] Default maximum server load percentage (nil = no response delay)
+        attr_accessor :default_max_load
+
+        # @return [Boolean] Default setting for SSL certificate verification
+        attr_accessor :default_disable_ssl_certificate_check
+
+        # @return [Boolean] Default flag for Australian proxy preference
+        attr_accessor :default_australian_proxy
+
+        # @return [String, nil] Default Mechanize user agent
+        attr_accessor :default_user_agent
+
+        # Configure default settings for all AgentConfig instances
+        # @yield [self] Yields self for configuration
+        # @example
+        #   AgentConfig.configure do |config|
+        #     config.default_timeout = 90
+        #     config.default_random_delay = 5
+        #     config.default_max_load = 15
+        #   end
+        # @return [void]
+        def configure
+          yield self if block_given?
+        end
+
+        # Reset all configuration options to their default values
+        # @return [void]
+        def reset_defaults!
+          @default_timeout = ENV.fetch('MORPH_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
+          @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
+          @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 33
+          @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 20
+          @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
+          @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
+          @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
+        end
+      end
+
+      # Set defaults on load
+      reset_defaults!
+
+      # @return [String] User agent string
+      attr_reader :user_agent
+
+      # Give access for testing
+
+      attr_reader :max_load, :min_random, :max_random
+
+      # Creates Mechanize agent configuration with sensible defaults overridable via configure
+      # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
+      # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true)
+      # @param random_delay [Integer, nil] Average random delay in seconds (default: 3)
+      # @param max_load [Float, nil] Maximum server load percentage (nil = no delay, default: 20%)
+      #   When compliant_mode is true, max_load is capped at 33%
+      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false)
+      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false)
+      # @param user_agent [String, nil] Configure Mechanize user agent
+      def initialize(timeout: nil,
+                     compliant_mode: nil,
+                     random_delay: nil,
+                     max_load: nil,
+                     disable_ssl_certificate_check: nil,
+                     australian_proxy: nil,
+                     user_agent: nil)
+        @timeout = timeout.nil? ? self.class.default_timeout : timeout
+        @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
+        @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
+        @max_load = max_load.nil? ? self.class.default_max_load : max_load
+        @max_load = [@max_load || DEFAULT_MAX_LOAD, MAX_LOAD_CAP].min if @compliant_mode
+        @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent
+
+        @disable_ssl_certificate_check = if disable_ssl_certificate_check.nil?
+                                           self.class.default_disable_ssl_certificate_check
+                                         else
+                                           disable_ssl_certificate_check
+                                         end
+        @australian_proxy = if australian_proxy.nil?
+                              self.class.default_australian_proxy
+                            else
+                              australian_proxy
+                            end
+
+        # Validate proxy URL format if proxy will be used
+        @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
+        if @australian_proxy
+          uri = begin
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+          end
+          unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+            raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
+          end
+          unless uri.host && uri.port
+            raise URI::InvalidURIError, "Proxy URL must include host and port"
+          end
+        end
+
+        if @random_delay
+          @min_random = Math.sqrt(@random_delay * 3.0 / 13.0).round(3)
+          @max_random = (3 * @min_random).round(3)
+        end
+
+        today = Date.today.strftime("%Y-%m-%d")
+        @user_agent = ENV.fetch("MORPH_USER_AGENT", nil)&.sub("TODAY", today)
+        if @compliant_mode
+          version = ScraperUtils::VERSION
+          @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
+        end
+
+        @robots_checker = RobotsChecker.new(@user_agent) if @user_agent
+        @adaptive_delay = AdaptiveDelay.new(max_load: @max_load) if @max_load
+        display_options
+      end
+
+      # Configures a Mechanize agent with these settings
+      # @param agent [Mechanize] The agent to configure
+      # @return [void]
+      def configure_agent(agent)
+        agent.verify_mode = OpenSSL::SSL::VERIFY_NONE if @disable_ssl_certificate_check
+
+        if @timeout
+          agent.open_timeout = @timeout
+          agent.read_timeout = @timeout
+        end
+        if @compliant_mode
+          agent.user_agent = user_agent
+          agent.request_headers ||= {}
+          agent.request_headers["Accept"] =
+            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+          agent.request_headers["Upgrade-Insecure-Requests"] = "1"
+        end
+        if @australian_proxy
+          agent.agent.set_proxy(ScraperUtils.australian_proxy)
+          agent.request_headers["Accept-Language"] = "en-AU,en-US;q=0.9,en;q=0.8"
+          verify_proxy_works(agent)
+        end
+
+        @connection_started_at = nil
+        agent.pre_connect_hooks << method(:pre_connect_hook)
+        agent.post_connect_hooks << method(:post_connect_hook)
+      end
+
+      private
+
+      def display_options
+        display_args = []
+        display_args << "timeout=#{@timeout}" if @timeout
+        display_args << if ScraperUtils.australian_proxy.to_s.empty? && !@australian_proxy
+                          "#{ScraperUtils::AUSTRALIAN_PROXY_ENV_VAR} not set"
+                        else
+                          "australian_proxy=#{@australian_proxy.inspect}"
+                        end
+        display_args << "compliant_mode" if @compliant_mode
+        display_args << "random_delay=#{@random_delay}" if @random_delay
+        display_args << "max_load=#{@max_load}%" if @max_load
+        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
+        display_args << "default args" if display_args.empty?
+        ScraperUtils::FiberScheduler.log(
+          "Configuring Mechanize agent with #{display_args.join(', ')}"
+        )
+      end
+
+      def pre_connect_hook(_agent, request)
+        @connection_started_at = Time.now
+        return unless DebugUtils.verbose?
+
+        ScraperUtils::FiberScheduler.log(
+          "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
+        )
+      end
+
+      def post_connect_hook(_agent, uri, response, _body)
+        raise ArgumentError, "URI must be present in post-connect hook" unless uri
+
+        response_time = Time.now - @connection_started_at
+        if DebugUtils.basic?
+          ScraperUtils::FiberScheduler.log(
+            "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
+            "after #{response_time} seconds"
+          )
+        end
+
+        if @robots_checker&.disallowed?(uri)
+          raise ScraperUtils::UnprocessableSite,
+                "URL is disallowed by robots.txt specific rules: #{uri}"
+        end
+
+        delays = {
+          robot_txt: @robots_checker&.crawl_delay&.round(3),
+          max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+          random: (@min_random ? (rand(@min_random..@max_random)**2).round(3) : nil)
+        }
+        @delay = delays.values.compact.max
+        if @delay&.positive?
+          $stderr.flush
+          ScraperUtils::FiberScheduler.log("Delaying #{@delay} seconds, max of #{delays.inspect}") if ENV["DEBUG"]
+          $stdout.flush
+          ScraperUtils::FiberScheduler.delay(@delay)
+        end
+
+        response
+      end
+
+      def verify_proxy_works(agent)
+        $stderr.flush
+        $stdout.flush
+        FiberScheduler.log "Checking proxy works..."
+        my_ip = MechanizeUtils.public_ip(agent)
+        begin
+          IPAddr.new(my_ip)
+        rescue IPAddr::InvalidAddressError => e
+          raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
+        end
+        ScraperUtils::FiberScheduler.log "Proxy is using IP address: #{my_ip.inspect}"
+        my_headers = MechanizeUtils.public_headers(agent)
+        begin
+          # Check response is JSON just to be safe!
+          headers = JSON.parse(my_headers)
+          puts "Proxy is passing headers:"
+          puts JSON.pretty_generate(headers["headers"])
+        rescue JSON::ParserError => e
+          puts "Couldn't parse public_headers: #{e}! Raw response:"
+          puts my_headers.inspect
+        end
+      rescue Timeout::Error => e # Includes Net::OpenTimeout
+        raise "Proxy check timed out: #{e}"
+      rescue Errno::ECONNREFUSED, Net::HTTP::Persistent::Error => e
+        raise "Failed to connect to proxy: #{e}"
+      rescue Mechanize::ResponseCodeError => e
+        raise "Proxy check error: #{e}"
+      end
+    end
+  end
+end
```
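One detail worth noting: `@min_random = Math.sqrt(random_delay * 3.0 / 13.0)` is chosen so the expected delay equals `random_delay`. The post-connect hook squares a uniform sample from `min_random..max_random` with `max_random = 3 * min_random`, and for U uniform on [a, 3a], E[U²] = 13a²/3, so a = √(3·random_delay/13) makes the mean squared sample exactly `random_delay` seconds. A minimal usage sketch, combining the documented `configure` block with `configure_agent` on a plain Mechanize instance (all values illustrative):

```ruby
# Sketch: apply AgentConfig defaults plus a per-instance override to a
# Mechanize agent. The values shown are illustrative, not recommendations.
require "mechanize"
require "scraper_utils"

ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
  config.default_timeout = 90
  config.default_random_delay = 5
  config.default_max_load = 15
end

agent = Mechanize.new
config = ScraperUtils::MechanizeUtils::AgentConfig.new(timeout: 120)
# Installs compliant-mode headers, timeouts, optional proxy, and the
# pre/post connect hooks that enforce robots.txt and the delays above.
config.configure_agent(agent)
```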