scraper_utils 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +5 -0
- data/CHANGELOG.md +7 -0
- data/GUIDELINES.md +2 -1
- data/Gemfile +1 -0
- data/IMPLEMENTATION.md +40 -0
- data/README.md +29 -23
- data/SPECS.md +13 -1
- data/bin/rspec +27 -0
- data/docs/example_scrape_with_fibers.rb +4 -4
- data/docs/fibers_and_threads.md +72 -0
- data/docs/getting_started.md +6 -6
- data/docs/interleaving_requests.md +7 -7
- data/docs/parallel_requests.md +138 -0
- data/docs/randomizing_requests.md +12 -8
- data/docs/reducing_server_load.md +6 -6
- data/lib/scraper_utils/data_quality_monitor.rb +2 -3
- data/lib/scraper_utils/date_range_utils.rb +37 -78
- data/lib/scraper_utils/debug_utils.rb +5 -5
- data/lib/scraper_utils/log_utils.rb +15 -0
- data/lib/scraper_utils/mechanize_actions.rb +37 -8
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
- data/lib/scraper_utils/mechanize_utils.rb +8 -5
- data/lib/scraper_utils/randomize_utils.rb +22 -19
- data/lib/scraper_utils/scheduler/constants.rb +12 -0
- data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
- data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
- data/lib/scraper_utils/scheduler/process_request.rb +59 -0
- data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
- data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
- data/lib/scraper_utils/scheduler.rb +286 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +11 -14
- metadata +16 -6
- data/lib/scraper_utils/adaptive_delay.rb +0 -70
- data/lib/scraper_utils/fiber_scheduler.rb +0 -229
- data/lib/scraper_utils/robots_checker.rb +0 -149
data/lib/scraper_utils/date_range_utils.rb

```diff
@@ -3,7 +3,7 @@
 module ScraperUtils
   class DateRangeUtils
     MERGE_ADJACENT_RANGES = true
-    PERIODS = [2, 3,
+    PERIODS = [2, 3, 4].freeze
 
     class << self
       # @return [Integer] Default number of days to cover
@@ -33,7 +33,7 @@ module ScraperUtils
      def reset_defaults!
        @default_days = ENV.fetch('MORPH_DAYS', 33).to_i # 33
        @default_everytime = ENV.fetch('MORPH_EVERYTIME', 4).to_i # 4
-       @default_max_period = ENV.fetch('MORPH_MAX_PERIOD',
+       @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 2).to_i # 3
      end
    end
 
@@ -46,8 +46,8 @@ module ScraperUtils
    # Generates one or more date ranges to check the most recent daily through to checking each max_period
    # There is a graduated schedule from the latest `everytime` days through to the oldest of `days` dates which is checked each `max_period` days.
    # @param days [Integer, nil] create ranges that cover the last `days` dates
-   # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates
-   # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days
+   # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates (minimum 1)
+   # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days (1..4)
    # @param today [Date, nil] overrides the default determination of today at UTC+09:30 (middle of Australia)
    # @return [Array{[Date, Date, String]}] being from_date, to_date and a comment
    #
@@ -58,7 +58,7 @@ module ScraperUtils
    def calculate_date_ranges(days: nil, everytime: nil, max_period: nil, today: nil)
      _calculate_date_ranges(
        Integer(days || self.class.default_days),
-       Integer(everytime || self.class.default_everytime),
+       [1, Integer(everytime || self.class.default_everytime)].max,
        Integer(max_period || self.class.default_max_period),
        today || Time.now(in: '+09:30').to_date
      )
@@ -76,84 +76,43 @@ module ScraperUtils
        # cover everything everytime
        return [[today + 1 - days, today, "everything"]]
      end
-
      max_period = valid_periods.max
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+     @max_period_used = max_period
+
+     one_half = ((days - everytime) / 2).to_i
+     one_third = ((days - everytime) / 3).to_i
+     two_ninths = (2 * (days - everytime) / 9).to_i
+     run_ranges =
+       case max_period
+       when 2
+         [
+           [[to_date - (one_half + everytime), to_date, "#{max_period}#0+everytime"]],
+           [[to_date - days, to_date - (one_half + everytime), "#{max_period}#1"], [to_date - everytime, to_date, "everytime"]]
+         ]
+       when 3
+         [
+           [[to_date - days - 1, to_date + two_ninths - days, "3#0"], [to_date - (one_third + everytime), to_date, "2#0+everytime"]],
+           [[to_date + two_ninths - days, to_date + 2 * two_ninths - days, "3#1"], [to_date - everytime, to_date, "everytime"]],
+           [[to_date + 2 * two_ninths - days, to_date, "3#2+2#0+everytime"]],
+           [[to_date - days - 1, to_date + two_ninths - days, "3#3"], [to_date - everytime, to_date, "everytime"]],
+           [[to_date + two_ninths - days, to_date + 2 * two_ninths - days, "3#4"], [to_date - (one_third + everytime), to_date, "2#2+everytime"]],
+           [[to_date + 2 * two_ninths - days, to_date - (one_third + everytime), "3#5"], [to_date - everytime, to_date, "everytime"]]
+         ]
+       else
+         [
+           [[to_date - (one_half + everytime), to_date, "2#0+everytime"]],
+           [[to_date - days - 2, to_date - (one_half + everytime), "4#0"], [to_date - everytime, to_date, "everytime"]],
+           [[to_date - (one_half + everytime), to_date, "2#1+everytime"]],
+           [[to_date - everytime, to_date, "everytime"]]
+         ]
       end
-
-       break unless days.positive?
-
-         this_period = [days, period].min
-         break if this_period <= 0
-
-         earliest_from = to_date - days
-         # we are working from the oldest back towards today
-         if run_number % period == index
-           from = to_date - index - (this_period - 1)
-           from = earliest_from if from < earliest_from
-           to = [today, to_date - index].min
-           break if from > to
+     run_number = today.to_date.jd % run_ranges.size
 
-
-           if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
-             # extend adjacent range
-             ranges.last[0] = [from, ranges.last[0]].min
-             ranges.last[2] = "#{period}\##{index},#{ranges.last[2]}"
-           else
-             to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
-             ranges << [from, to, "#{period}\##{index}"]
-           end
-         end
-         days -= this_period
-         to_date -= this_period
-       end
-     end
-     # remainder of range at max_period, whatever that is
+     ranges = run_ranges[run_number]
      if days.positive? && ScraperUtils::DebugUtils.trace?
-
-     end
-     index = -1
-     while days.positive?
-       index += 1
-       this_period = [days, max_period].min
-       break if this_period <= 0
-
-       earliest_from = to_date - days
-       if (run_number % max_period) == (index % max_period)
-         from = to_date - index - (this_period - 1)
-         from = earliest_from if from < earliest_from
-         to = to_date - index
-         break if from > to
-
-         @max_period_used = [this_period, @max_period_used].max
-         if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
-           # extend adjacent range
-           ranges.last[0] = [from, ranges.last[0]].min
-           ranges.last[2] = "#{this_period}\##{index},#{ranges.last[2]}"
-         else
-           to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
-           ranges << [from, to, "#{this_period}\##{index}"]
-         end
-       end
-       days -= this_period
-       to_date -= this_period
+       LogUtils.log "DEBUG: #{max_period} ranges: #{ranges.inspect}"
      end
-     ranges
+     ranges
    end
  end
 end
```
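The date-range scheduling moves from the old incremental while-loop to a precomputed table of run ranges selected by the Julian day number. For reference, a minimal usage sketch of the reworked method (the argument values here are illustrative, not defaults):

```ruby
require "date"
require "scraper_utils"

# Ask for ranges covering the last 30 days, always re-checking the newest 4,
# and checking the remainder at least every 3 days.
ranges = ScraperUtils::DateRangeUtils.new.calculate_date_ranges(
  days: 30, everytime: 4, max_period: 3, today: Date.today
)

ranges.each do |from_date, to_date, comment|
  puts "search #{from_date} .. #{to_date} (#{comment})"
end
```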
data/lib/scraper_utils/debug_utils.rb

```diff
@@ -51,17 +51,17 @@ module ScraperUtils
 
     # Logs details of an HTTP request when debug mode is enabled
     #
-    # @param
+    # @param http_method [String] HTTP http_method (GET, POST, etc.)
     # @param url [String] Request URL
     # @param parameters [Hash, nil] Optional request parameters
     # @param headers [Hash, nil] Optional request headers
     # @param body [Hash, nil] Optional request body
     # @return [void]
-    def self.debug_request(
+    def self.debug_request(http_method, url, parameters: nil, headers: nil, body: nil)
      return unless basic?
 
      puts
-
+      LogUtils.log "🔍 #{http_method.upcase} #{url}"
      puts "Parameters:", JSON.pretty_generate(parameters) if parameters
      puts "Headers:", JSON.pretty_generate(headers) if headers
      puts "Body:", JSON.pretty_generate(body) if body
@@ -77,7 +77,7 @@ module ScraperUtils
      return unless trace?
 
      puts
-
+      LogUtils.log "🔍 DEBUG: #{message}"
      puts "Current URL: #{page.uri}"
      puts "Page title: #{page.at('title').text.strip}" if page.at("title")
      puts "",
@@ -98,7 +98,7 @@ module ScraperUtils
      return unless trace?
 
      puts
-
+      LogUtils.log "🔍 DEBUG: #{message}"
      puts "Looking for selector: #{selector}"
      element = page.at(selector)
      if element
```
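With the widened `debug_request` signature, a call site might look like the sketch below (URL and parameters are illustrative; output only appears when the gem's debug level allows it, per the `basic?` guard above):

```ruby
# Hypothetical call site showing the new positional http_method/url arguments.
ScraperUtils::DebugUtils.debug_request(
  "GET",
  "https://example.com/planning/applications",
  parameters: { page: 2 },
  headers: { "Accept" => "application/json" }
)
```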
data/lib/scraper_utils/log_utils.rb

```diff
@@ -9,6 +9,21 @@ module ScraperUtils
     LOG_TABLE = "scrape_log"
     LOG_RETENTION_DAYS = 30
 
+    # Logs a message, automatically prefixing with authority name if in a fiber
+    #
+    # @param message [String] the message to log
+    # @return [void]
+    def self.log(message, authority = nil)
+      authority ||= Scheduler.current_authority
+      $stderr.flush
+      if authority
+        puts "[#{authority}] #{message}"
+      else
+        puts message
+      end
+      $stdout.flush
+    end
+
     # Log details about a scraping run for one or more authorities
     # @param start_time [Time] When this scraping attempt was started
     # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
```
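A small sketch of the new helper in use; the authority prefix is applied automatically when `Scheduler.current_authority` is set inside a scheduled fiber, or can be passed explicitly (the `:albury` symbol below is just an example value):

```ruby
ScraperUtils::LogUtils.log "Collected 12 records"
# => "Collected 12 records"  (or "[authority] Collected 12 records" inside a scheduled fiber)

ScraperUtils::LogUtils.log "Collected 12 records", :albury
# => "[albury] Collected 12 records"
```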
data/lib/scraper_utils/mechanize_actions.rb

```diff
@@ -9,7 +9,7 @@ module ScraperUtils
  #
  #   actions = [
  #     [:click, "Next Page"],
-  #     [:click, ["Option A", "
+  #     [:click, ["Option A", "xpath://div[@id='results']/a", "css:.some-button"]] # Will select one randomly
  #   ]
  #
  #   processor = ScraperUtils::MechanizeActions.new(agent)
@@ -50,7 +50,7 @@ module ScraperUtils
    # @example Action format
    #   actions = [
    #     [:click, "Link Text"],                     # Click on link with this text
-   #     [:click, ["Option A", "Option B"]],
+   #     [:click, ["Option A", "text:Option B"]],   # Click on one of these options (randomly selected)
    #     [:click, "css:.some-button"],              # Use CSS selector
    #     [:click, "xpath://div[@id='results']/a"],  # Use XPath selector
    #     [:block, ->(page, args, agent, results) { [page, { custom_results: 'data' }] }] # Custom block
@@ -67,8 +67,7 @@ module ScraperUtils
        when :click
          handle_click(current_page, args)
        when :block
-
-          block.call(current_page, args, agent, @results.dup)
+          handle_block(current_page, args)
        else
          raise ArgumentError, "Unknown action type: #{action_type}"
        end
@@ -81,6 +80,18 @@ module ScraperUtils
 
    private
 
+    # Process a block action
+    #
+    # @param page [Mechanize::Page] The current page
+    # @param args [Array] The block and its arguments
+    # @return [Array<Mechanize::Page, Hash>] The resulting page and status
+    def handle_block(page, args)
+      block = args.shift
+      # Apply replacements to all remaining arguments
+      processed_args = args.map { |arg| apply_replacements(arg) }
+      block.call(page, processed_args.first, agent, @results.dup)
+    end
+
    # Handle a click action
    #
    # @param page [Mechanize::Page] The current page
@@ -105,16 +116,34 @@ module ScraperUtils
    # Select an element on the page based on selector string
    #
    # @param page [Mechanize::Page] The page to search in
-   # @param selector_string [String] The selector string
+   # @param selector_string [String] The selector string, optionally with "css:", "xpath:" or "text:" prefix
    # @return [Mechanize::Element, nil] The selected element or nil if not found
    def select_element(page, selector_string)
      # Handle different selector types based on prefixes
      if selector_string.start_with?("css:")
        selector = selector_string.sub(/^css:/, '')
-
+        # We need to convert Nokogiri elements to Mechanize elements for clicking
+        css_element = page.at_css(selector)
+        return nil unless css_element
+
+        # If it's a link, find the matching Mechanize link
+        if css_element.name.downcase == 'a' && css_element['href']
+          return page.links.find { |link| link.href == css_element['href'] }
+        end
+
+        return css_element
      elsif selector_string.start_with?("xpath:")
        selector = selector_string.sub(/^xpath:/, '')
-
+        # We need to convert Nokogiri elements to Mechanize elements for clicking
+        xpath_element = page.at_xpath(selector)
+        return nil unless xpath_element
+
+        # If it's a link, find the matching Mechanize link
+        if xpath_element.name.downcase == 'a' && xpath_element['href']
+          return page.links.find { |link| link.href == xpath_element['href'] }
+        end
+
+        return xpath_element
      else
        # Default to text: for links
        selector = selector_string.sub(/^text:/, '')
@@ -133,7 +162,7 @@ module ScraperUtils
        end
      end
 
-     # Get the link with the shortest (closest matching) text then the longest href
+     # Get the link with the a. shortest (closest matching) text and then b. the longest href
      matching_links.min_by { |l| [l.text.strip.length, -l.href.length] }
    end
  end
```
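Putting the selector prefixes and the reworked block handling together, an actions array might look like the sketch below. The constructor call is taken from the class docs shown in this diff; the name of the processing method itself is not shown here and is assumed, as flagged in the comment:

```ruby
require "mechanize"
require "scraper_utils"

agent = Mechanize.new # or an agent configured via ScraperUtils::MechanizeUtils
processor = ScraperUtils::MechanizeActions.new(agent)
page = agent.get("https://example.com/planning") # hypothetical starting page

actions = [
  [:click, "Planning applications"],                          # match link by text (default "text:" behaviour)
  [:click, ["css:.next-button", "xpath://a[@rel='next']"]],   # one of these selectors picked at random
  [:block, ->(pg, args, agt, results) { [pg, { note: "custom step" }] }]
]

# NOTE: the processing method name is assumed from the class-level docs,
# which only show the constructor in this diff; check the gem documentation.
final_page = processor.process(page, actions)
```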
data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb (new file)

```diff
@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+
+require "uri"
+
+module ScraperUtils
+  module MechanizeUtils
+    # Adapts delays between requests based on server response times.
+    # Target delay is proportional to response time based on max_load setting.
+    # Uses an exponential moving average to smooth variations in response times.
+    class AdaptiveDelay
+      DEFAULT_MIN_DELAY = 0.0
+      DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
+
+      attr_reader :min_delay, :max_delay, :max_load
+
+      # Creates a new adaptive delay calculator
+      #
+      # @param min_delay [Float] Minimum delay between requests in seconds
+      # @param max_delay [Float] Maximum delay between requests in seconds
+      # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+      #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+      def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+        @delays = {} # domain -> last delay used
+        @min_delay = min_delay.to_f
+        @max_delay = max_delay.to_f
+        @max_load = max_load.to_f.clamp(1.0, 99.0)
+        @response_multiplier = (100.0 - @max_load) / @max_load
+
+        return unless DebugUtils.basic?
+
+        ScraperUtils::LogUtils.log(
+          "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
+          "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
+        )
+      end
+
+      # @param uri [URI::Generic, String] URL to get delay for
+      # @return [Float] Current delay for the domain, or min_delay if no delay set
+      def delay(uri)
+        @delays[domain(uri)] || @min_delay
+      end
+
+      # Returns the next_delay calculated from a smoothed average of response_time to use less than max_load% of server
+      #
+      # @param uri [URI::Generic, String] URL the response came from
+      # @param response_time [Float] Time in seconds the server took to respond
+      # @return [Float] The calculated delay to use with the next request
+      def next_delay(uri, response_time)
+        uris_domain = domain(uri)
+        # calculate target_delay to achieve desired max_load% using pre-calculated multiplier
+        target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+        # Initialise average from initial_response_time rather than zero to start with reasonable approximation
+        current_delay = @delays[uris_domain] || target_delay
+        # exponential smooth the delay to smooth out wild swings (Equivalent to an RC low pass filter)
+        delay = ((3.0 * current_delay) + target_delay) / 4.0
+        delay = delay.clamp(@min_delay, @max_delay)
+
+        if DebugUtils.basic?
+          ScraperUtils::LogUtils.log(
+            "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
+            "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
+          )
+        end
+
+        @delays[uris_domain] = delay
+        delay
+      end
+
+      private
+
+      # @param uri [URI::Generic, String] The URL to extract the domain from
+      # @return [String] The domain in the format "scheme://host"
+      def domain(uri)
+        uri = URI(uri) unless uri.is_a?(URI)
+        "#{uri.scheme}://#{uri.host}".downcase
+      end
+    end
+  end
+end
```
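A minimal sketch of exercising the new `AdaptiveDelay` class directly (the URL and response time are made up; inside the gem it is driven from the post-connect hook shown in the agent_config changes below):

```ruby
require "scraper_utils"

# Aim to keep the target server at ~20% load: the target delay is about 4x the
# observed response time, smoothed with an exponential moving average and capped at max_delay.
delay_calc = ScraperUtils::MechanizeUtils::AdaptiveDelay.new(min_delay: 0.5, max_delay: 30.0, max_load: 20.0)

url = "https://example.com/planning/applications" # hypothetical URL
sleep_for = delay_calc.next_delay(url, 0.8)        # server took 0.8s to respond
puts "next delay for this domain: #{sleep_for.round(2)}s"
puts "current stored delay: #{delay_calc.delay(url).round(2)}s"
```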
data/lib/scraper_utils/mechanize_utils/agent_config.rb

```diff
@@ -24,7 +24,7 @@ module ScraperUtils
    #   )
    class AgentConfig
      DEFAULT_TIMEOUT = 60
-     DEFAULT_RANDOM_DELAY =
+     DEFAULT_RANDOM_DELAY = 0
      DEFAULT_MAX_LOAD = 33.3
      MAX_LOAD_CAP = 50.0
 
@@ -67,7 +67,7 @@ module ScraperUtils
        # Reset all configuration options to their default values
        # @return [void]
        def reset_defaults!
-         @default_timeout = ENV.fetch('
+         @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
          @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
          @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 5
          @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 33.3
@@ -85,7 +85,7 @@ module ScraperUtils
 
      # Give access for testing
 
-     attr_reader :max_load, :
+     attr_reader :max_load, :random_range
 
      # Creates Mechanize agent configuration with sensible defaults overridable via configure
      # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
@@ -125,21 +125,21 @@ module ScraperUtils
        @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
        if @australian_proxy
          uri = begin
-
-
-
-
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+          end
          unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
            raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
          end
-         unless uri.host && uri.port
+         unless !uri.host.to_s.empty? && uri.port&.positive?
            raise URI::InvalidURIError, "Proxy URL must include host and port"
          end
        end
 
-       if @random_delay
-
-         @
+       if @random_delay&.positive?
+         min_random = Math.sqrt(@random_delay * 3.0 / 13.0)
+         @random_range = min_random.round(3)..(3 * min_random).round(3)
        end
 
        today = Date.today.strftime("%Y-%m-%d")
@@ -177,7 +177,6 @@ module ScraperUtils
          verify_proxy_works(agent)
        end
 
-       @connection_started_at = nil
        agent.pre_connect_hooks << method(:pre_connect_hook)
        agent.post_connect_hooks << method(:post_connect_hook)
      end
@@ -193,11 +192,11 @@ module ScraperUtils
                           "australian_proxy=#{@australian_proxy.inspect}"
        end
        display_args << "compliant_mode" if @compliant_mode
-       display_args << "random_delay=#{@random_delay}" if @random_delay
+       display_args << "random_delay=#{@random_delay}" if @random_delay&.positive?
        display_args << "max_load=#{@max_load}%" if @max_load
        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
        display_args << "default args" if display_args.empty?
-       ScraperUtils::
+       ScraperUtils::LogUtils.log(
          "Configuring Mechanize agent with #{display_args.join(', ')}"
        )
      end
@@ -206,7 +205,7 @@ module ScraperUtils
        @connection_started_at = Time.now
        return unless DebugUtils.verbose?
 
-       ScraperUtils::
+       ScraperUtils::LogUtils.log(
          "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
        )
      end
@@ -216,9 +215,9 @@ module ScraperUtils
 
        response_time = Time.now - @connection_started_at
        if DebugUtils.basic?
-         ScraperUtils::
+         ScraperUtils::LogUtils.log(
            "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
-
+           "after #{response_time} seconds"
          )
        end
 
@@ -227,33 +226,35 @@ module ScraperUtils
                "URL is disallowed by robots.txt specific rules: #{uri}"
        end
 
-
-
-
-
-
-
+       @delay_till = nil
+       @delay = @robots_checker&.crawl_delay&.round(3)
+       debug_msg = "Delaying robots.txt: crawl_delay #{@delay} seconds"
+       unless @delay&.positive?
+         delays = {
+           max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+           random: (@random_range ? (rand(@random_range) ** 2).round(3) : nil)
+         }
+         @delay = [delays[:max_load], delays[:random]].compact.sum
+         debug_msg = "Delaying #{@delay} seconds, sum of: #{delays.inspect}"
+       end
        if @delay&.positive?
-
-         ScraperUtils::
-         $stdout.flush
-         ScraperUtils::FiberScheduler.delay(@delay)
+         @delay_till = Time.now + @delay
+         ScraperUtils::LogUtils.log(debug_msg) if ScraperUtils::DebugUtils.basic?
        end
-
        response
      end
 
      def verify_proxy_works(agent)
        $stderr.flush
        $stdout.flush
-
+       LogUtils.log "Checking proxy works..."
        my_ip = MechanizeUtils.public_ip(agent)
        begin
          IPAddr.new(my_ip)
        rescue IPAddr::InvalidAddressError => e
          raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
        end
-       ScraperUtils::
+       ScraperUtils::LogUtils.log "Proxy is using IP address: #{my_ip.inspect}"
        my_headers = MechanizeUtils.public_headers(agent)
        begin
          # Check response is JSON just to be safe!
```