scraper_utils 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/.yardopts +5 -0
  3. data/CHANGELOG.md +7 -0
  4. data/GUIDELINES.md +2 -1
  5. data/Gemfile +1 -0
  6. data/IMPLEMENTATION.md +40 -0
  7. data/README.md +29 -23
  8. data/SPECS.md +13 -1
  9. data/bin/rspec +27 -0
  10. data/docs/example_scrape_with_fibers.rb +4 -4
  11. data/docs/fibers_and_threads.md +72 -0
  12. data/docs/getting_started.md +6 -6
  13. data/docs/interleaving_requests.md +7 -7
  14. data/docs/parallel_requests.md +138 -0
  15. data/docs/randomizing_requests.md +12 -8
  16. data/docs/reducing_server_load.md +6 -6
  17. data/lib/scraper_utils/data_quality_monitor.rb +2 -3
  18. data/lib/scraper_utils/date_range_utils.rb +37 -78
  19. data/lib/scraper_utils/debug_utils.rb +5 -5
  20. data/lib/scraper_utils/log_utils.rb +15 -0
  21. data/lib/scraper_utils/mechanize_actions.rb +37 -8
  22. data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
  23. data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
  24. data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
  25. data/lib/scraper_utils/mechanize_utils.rb +8 -5
  26. data/lib/scraper_utils/randomize_utils.rb +22 -19
  27. data/lib/scraper_utils/scheduler/constants.rb +12 -0
  28. data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
  29. data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
  30. data/lib/scraper_utils/scheduler/process_request.rb +59 -0
  31. data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
  32. data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
  33. data/lib/scraper_utils/scheduler.rb +286 -0
  34. data/lib/scraper_utils/version.rb +1 -1
  35. data/lib/scraper_utils.rb +11 -14
  36. metadata +16 -6
  37. data/lib/scraper_utils/adaptive_delay.rb +0 -70
  38. data/lib/scraper_utils/fiber_scheduler.rb +0 -229
  39. data/lib/scraper_utils/robots_checker.rb +0 -149
data/lib/scraper_utils/date_range_utils.rb

@@ -3,7 +3,7 @@
 module ScraperUtils
   class DateRangeUtils
     MERGE_ADJACENT_RANGES = true
-    PERIODS = [2, 3, 5, 8].freeze
+    PERIODS = [2, 3, 4].freeze

     class << self
       # @return [Integer] Default number of days to cover
@@ -33,7 +33,7 @@ module ScraperUtils
       def reset_defaults!
         @default_days = ENV.fetch('MORPH_DAYS', 33).to_i # 33
         @default_everytime = ENV.fetch('MORPH_EVERYTIME', 4).to_i # 4
-        @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 3).to_i # 3
+        @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 2).to_i # 3
       end
     end

@@ -46,8 +46,8 @@ module ScraperUtils
     # Generates one or more date ranges to check the most recent daily through to checking each max_period
     # There is a graduated schedule from the latest `everytime` days through to the oldest of `days` dates which is checked each `max_period` days.
     # @param days [Integer, nil] create ranges that cover the last `days` dates
-    # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates
-    # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days
+    # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates (minimum 1)
+    # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days (1..4)
     # @param today [Date, nil] overrides the default determination of today at UTC+09:30 (middle of Australia)
     # @return [Array{[Date, Date, String]}] being from_date, to_date and a comment
     #
@@ -58,7 +58,7 @@ module ScraperUtils
     def calculate_date_ranges(days: nil, everytime: nil, max_period: nil, today: nil)
       _calculate_date_ranges(
         Integer(days || self.class.default_days),
-        Integer(everytime || self.class.default_everytime),
+        [1, Integer(everytime || self.class.default_everytime)].max,
         Integer(max_period || self.class.default_max_period),
         today || Time.now(in: '+09:30').to_date
       )
@@ -76,84 +76,43 @@ module ScraperUtils
         # cover everything everytime
         return [[today + 1 - days, today, "everything"]]
       end
-
       max_period = valid_periods.max
-
-      run_number = today.to_date.jd
-      ranges = []
-      if everytime.positive?
-        ranges << [to_date + 1 - everytime, to_date, "everytime"]
-        days -= everytime
-        to_date -= everytime
-      end
-
-      periods = valid_periods.dup
-      loop do
-        period = periods.shift
-        break if period.nil? || period >= max_period || !days.positive?
-
-        if DebugUtils.trace?
-          FiberScheduler.log "DEBUG: #{period} day periods started #{(today - to_date).to_i} days in."
+      @max_period_used = max_period
+
+      one_half = ((days - everytime) / 2).to_i
+      one_third = ((days - everytime) / 3).to_i
+      two_ninths = (2 * (days - everytime) / 9).to_i
+      run_ranges =
+        case max_period
+        when 2
+          [
+            [[to_date - (one_half + everytime), to_date, "#{max_period}#0+everytime"]],
+            [[to_date - days, to_date - (one_half + everytime), "#{max_period}#1"], [to_date - everytime, to_date, "everytime"]]
+          ]
+        when 3
+          [
+            [[to_date - days - 1, to_date + two_ninths - days, "3#0"], [to_date - (one_third + everytime), to_date, "2#0+everytime"]],
+            [[to_date + two_ninths - days, to_date + 2 * two_ninths - days, "3#1"], [to_date - everytime, to_date, "everytime"]],
+            [[to_date + 2 * two_ninths - days, to_date, "3#2+2#0+everytime"]],
+            [[to_date - days - 1, to_date + two_ninths - days, "3#3"], [to_date - everytime, to_date, "everytime"]],
+            [[to_date + two_ninths - days, to_date + 2 * two_ninths - days, "3#4"], [to_date - (one_third + everytime), to_date, "2#2+everytime"]],
+            [[to_date + 2 * two_ninths - days, to_date - (one_third + everytime), "3#5"], [to_date - everytime, to_date, "everytime"]]
+          ]
+        else
+          [
+            [[to_date - (one_half + everytime), to_date, "2#0+everytime"]],
+            [[to_date - days - 2, to_date - (one_half + everytime), "4#0"], [to_date - everytime, to_date, "everytime"]],
+            [[to_date - (one_half + everytime), to_date, "2#1+everytime"]],
+            [[to_date - everytime, to_date, "everytime"]]
+          ]
        end
-        period.times do |index|
-          break unless days.positive?
-
-          this_period = [days, period].min
-          break if this_period <= 0
-
-          earliest_from = to_date - days
-          # we are working from the oldest back towards today
-          if run_number % period == index
-            from = to_date - index - (this_period - 1)
-            from = earliest_from if from < earliest_from
-            to = [today, to_date - index].min
-            break if from > to
+      run_number = today.to_date.jd % run_ranges.size

-            @max_period_used = [this_period, @max_period_used].max
-            if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
-              # extend adjacent range
-              ranges.last[0] = [from, ranges.last[0]].min
-              ranges.last[2] = "#{period}\##{index},#{ranges.last[2]}"
-            else
-              to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
-              ranges << [from, to, "#{period}\##{index}"]
-            end
-          end
-          days -= this_period
-          to_date -= this_period
-        end
-      end
-      # remainder of range at max_period, whatever that is
+      ranges = run_ranges[run_number]
       if days.positive? && ScraperUtils::DebugUtils.trace?
-        FiberScheduler.log "DEBUG: #{max_period} day periods started #{(today - to_date).to_i} days in."
-      end
-      index = -1
-      while days.positive?
-        index += 1
-        this_period = [days, max_period].min
-        break if this_period <= 0
-
-        earliest_from = to_date - days
-        if (run_number % max_period) == (index % max_period)
-          from = to_date - index - (this_period - 1)
-          from = earliest_from if from < earliest_from
-          to = to_date - index
-          break if from > to
-
-          @max_period_used = [this_period, @max_period_used].max
-          if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
-            # extend adjacent range
-            ranges.last[0] = [from, ranges.last[0]].min
-            ranges.last[2] = "#{this_period}\##{index},#{ranges.last[2]}"
-          else
-            to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
-            ranges << [from, to, "#{this_period}\##{index}"]
-          end
-        end
-        days -= this_period
-        to_date -= this_period
+        LogUtils.log "DEBUG: #{max_period} ranges: #{ranges.inspect}"
       end
-      ranges.reverse
+      ranges
     end
   end
 end
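
For orientation, a minimal usage sketch of the reworked method (illustrative, not part of this diff; the keyword arguments and the [from_date, to_date, comment] return shape come from the doc comments above, the concrete values are only examples):

    require "date"
    require "scraper_utils"

    # With the new defaults (33 days, everytime 4, max_period 2) each run covers
    # the latest `everytime` dates plus roughly half of the remaining dates,
    # alternating halves between runs.
    ranges = ScraperUtils::DateRangeUtils.new.calculate_date_ranges(
      days: 33, everytime: 4, max_period: 2, today: Date.today
    )
    ranges.each { |from, to, comment| puts "#{from}..#{to} (#{comment})" }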
data/lib/scraper_utils/debug_utils.rb

@@ -51,17 +51,17 @@

    # Logs details of an HTTP request when debug mode is enabled
    #
-   # @param method [String] HTTP method (GET, POST, etc.)
+   # @param http_method [String] HTTP http_method (GET, POST, etc.)
    # @param url [String] Request URL
    # @param parameters [Hash, nil] Optional request parameters
    # @param headers [Hash, nil] Optional request headers
    # @param body [Hash, nil] Optional request body
    # @return [void]
-   def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
+   def self.debug_request(http_method, url, parameters: nil, headers: nil, body: nil)
      return unless basic?

      puts
-     FiberScheduler.log "🔍 #{method.upcase} #{url}"
+     LogUtils.log "🔍 #{http_method.upcase} #{url}"
      puts "Parameters:", JSON.pretty_generate(parameters) if parameters
      puts "Headers:", JSON.pretty_generate(headers) if headers
      puts "Body:", JSON.pretty_generate(body) if body
@@ -77,7 +77,7 @@ module ScraperUtils
      return unless trace?

      puts
-     FiberScheduler.log "🔍 DEBUG: #{message}"
+     LogUtils.log "🔍 DEBUG: #{message}"
      puts "Current URL: #{page.uri}"
      puts "Page title: #{page.at('title').text.strip}" if page.at("title")
      puts "",
@@ -98,7 +98,7 @@ module ScraperUtils
      return unless trace?

      puts
-     FiberScheduler.log "🔍 DEBUG: #{message}"
+     LogUtils.log "🔍 DEBUG: #{message}"
      puts "Looking for selector: #{selector}"
      element = page.at(selector)
      if element
data/lib/scraper_utils/log_utils.rb

@@ -9,6 +9,21 @@ module ScraperUtils
    LOG_TABLE = "scrape_log"
    LOG_RETENTION_DAYS = 30

+   # Logs a message, automatically prefixing with authority name if in a fiber
+   #
+   # @param message [String] the message to log
+   # @return [void]
+   def self.log(message, authority = nil)
+     authority ||= Scheduler.current_authority
+     $stderr.flush
+     if authority
+       puts "[#{authority}] #{message}"
+     else
+       puts message
+     end
+     $stdout.flush
+   end
+
    # Log details about a scraping run for one or more authorities
    # @param start_time [Time] When this scraping attempt was started
    # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
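
A short sketch of calling the new LogUtils.log helper (illustrative; the authority name is hypothetical). When ScraperUtils::Scheduler.current_authority is set the message is prefixed automatically, otherwise an authority can be passed as the second positional argument:

    # Inside a scheduled operation the current authority is picked up automatically:
    ScraperUtils::LogUtils.log "Saved 12 records"
    # => "[some_authority] Saved 12 records" (when an authority is current)

    # Outside the scheduler, pass the authority yourself:
    ScraperUtils::LogUtils.log "Saved 12 records", :some_authority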
data/lib/scraper_utils/mechanize_actions.rb

@@ -9,7 +9,7 @@ module ScraperUtils
  #
  # actions = [
  #   [:click, "Next Page"],
- #   [:click, ["Option A", "Option B"]] # Will select one randomly
+ #   [:click, ["Option A", "xpath://div[@id='results']/a", "css:.some-button"]] # Will select one randomly
  # ]
  #
  # processor = ScraperUtils::MechanizeActions.new(agent)
@@ -50,7 +50,7 @@ module ScraperUtils

    # @example Action format
    #   actions = [
    #     [:click, "Link Text"], # Click on link with this text
-   #     [:click, ["Option A", "Option B"]], # Click on one of these options (randomly selected)
+   #     [:click, ["Option A", "text:Option B"]], # Click on one of these options (randomly selected)
    #     [:click, "css:.some-button"], # Use CSS selector
    #     [:click, "xpath://div[@id='results']/a"], # Use XPath selector
    #     [:block, ->(page, args, agent, results) { [page, { custom_results: 'data' }] }] # Custom block
@@ -67,8 +67,7 @@ module ScraperUtils
        when :click
          handle_click(current_page, args)
        when :block
-         block = args.shift
-         block.call(current_page, args, agent, @results.dup)
+         handle_block(current_page, args)
        else
          raise ArgumentError, "Unknown action type: #{action_type}"
        end
@@ -81,6 +80,18 @@ module ScraperUtils

    private

+   # Process a block action
+   #
+   # @param page [Mechanize::Page] The current page
+   # @param args [Array] The block and its arguments
+   # @return [Array<Mechanize::Page, Hash>] The resulting page and status
+   def handle_block(page, args)
+     block = args.shift
+     # Apply replacements to all remaining arguments
+     processed_args = args.map { |arg| apply_replacements(arg) }
+     block.call(page, processed_args.first, agent, @results.dup)
+   end
+
    # Handle a click action
    #
    # @param page [Mechanize::Page] The current page
@@ -105,16 +116,34 @@ module ScraperUtils
    # Select an element on the page based on selector string
    #
    # @param page [Mechanize::Page] The page to search in
-   # @param selector_string [String] The selector string
+   # @param selector_string [String] The selector string, optionally with "css:", "xpath:" or "text:" prefix
    # @return [Mechanize::Element, nil] The selected element or nil if not found
    def select_element(page, selector_string)
      # Handle different selector types based on prefixes
      if selector_string.start_with?("css:")
        selector = selector_string.sub(/^css:/, '')
-       page.at_css(selector)
+       # We need to convert Nokogiri elements to Mechanize elements for clicking
+       css_element = page.at_css(selector)
+       return nil unless css_element
+
+       # If it's a link, find the matching Mechanize link
+       if css_element.name.downcase == 'a' && css_element['href']
+         return page.links.find { |link| link.href == css_element['href'] }
+       end
+
+       return css_element
      elsif selector_string.start_with?("xpath:")
        selector = selector_string.sub(/^xpath:/, '')
-       page.at_xpath(selector)
+       # We need to convert Nokogiri elements to Mechanize elements for clicking
+       xpath_element = page.at_xpath(selector)
+       return nil unless xpath_element
+
+       # If it's a link, find the matching Mechanize link
+       if xpath_element.name.downcase == 'a' && xpath_element['href']
+         return page.links.find { |link| link.href == xpath_element['href'] }
+       end
+
+       return xpath_element
      else
        # Default to text: for links
        selector = selector_string.sub(/^text:/, '')
@@ -133,7 +162,7 @@ module ScraperUtils
        end
      end

-     # Get the link with the shortest (closest matching) text then the longest href
+     # Get the link with the a. shortest (closest matching) text and then b. the longest href
      matching_links.min_by { |l| [l.text.strip.length, -l.href.length] }
    end
  end
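
Putting the extended selector support together, a sketch of driving MechanizeActions with the new "css:", "xpath:" and "text:" prefixes (the URL and selectors are hypothetical, and the process call follows the class's usage docs rather than anything shown in this diff):

    require "mechanize"
    require "scraper_utils"

    agent = Mechanize.new
    page = agent.get("https://example.com/search")   # hypothetical URL

    actions = [
      [:click, "Next Page"],                         # plain text match (shortest text, then longest href wins)
      [:click, ["css:.some-button", "xpath://div[@id='results']/a", "text:Next"]], # one chosen at random
      [:block, ->(page, args, agent, results) { [page, { custom_results: "data" }] }]
    ]

    processor = ScraperUtils::MechanizeActions.new(agent)
    result_page = processor.process(page, actions)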
data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb (new file)

@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+
+require "uri"
+
+module ScraperUtils
+  module MechanizeUtils
+    # Adapts delays between requests based on server response times.
+    # Target delay is proportional to response time based on max_load setting.
+    # Uses an exponential moving average to smooth variations in response times.
+    class AdaptiveDelay
+      DEFAULT_MIN_DELAY = 0.0
+      DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
+
+      attr_reader :min_delay, :max_delay, :max_load
+
+      # Creates a new adaptive delay calculator
+      #
+      # @param min_delay [Float] Minimum delay between requests in seconds
+      # @param max_delay [Float] Maximum delay between requests in seconds
+      # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+      #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+      def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+        @delays = {} # domain -> last delay used
+        @min_delay = min_delay.to_f
+        @max_delay = max_delay.to_f
+        @max_load = max_load.to_f.clamp(1.0, 99.0)
+        @response_multiplier = (100.0 - @max_load) / @max_load
+
+        return unless DebugUtils.basic?
+
+        ScraperUtils::LogUtils.log(
+          "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
+          "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
+        )
+      end
+
+      # @param uri [URI::Generic, String] URL to get delay for
+      # @return [Float] Current delay for the domain, or min_delay if no delay set
+      def delay(uri)
+        @delays[domain(uri)] || @min_delay
+      end
+
+      # Returns the next_delay calculated from a smoothed average of response_time to use less than max_load% of server
+      #
+      # @param uri [URI::Generic, String] URL the response came from
+      # @param response_time [Float] Time in seconds the server took to respond
+      # @return [Float] The calculated delay to use with the next request
+      def next_delay(uri, response_time)
+        uris_domain = domain(uri)
+        # calculate target_delay to achieve desired max_load% using pre-calculated multiplier
+        target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+        # Initialise average from initial_response_time rather than zero to start with reasonable approximation
+        current_delay = @delays[uris_domain] || target_delay
+        # exponential smooth the delay to smooth out wild swings (Equivalent to an RC low pass filter)
+        delay = ((3.0 * current_delay) + target_delay) / 4.0
+        delay = delay.clamp(@min_delay, @max_delay)
+
+        if DebugUtils.basic?
+          ScraperUtils::LogUtils.log(
+            "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
+            "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
+          )
+        end
+
+        @delays[uris_domain] = delay
+        delay
+      end
+
+      private
+
+      # @param uri [URI::Generic, String] The URL to extract the domain from
+      # @return [String] The domain in the format "scheme://host"
+      def domain(uri)
+        uri = URI(uri) unless uri.is_a?(URI)
+        "#{uri.scheme}://#{uri.host}".downcase
+      end
+    end
+  end
+end
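
A worked example of the next_delay arithmetic above (illustrative numbers, not part of the diff): with max_load 20% the response multiplier is (100 - 20) / 20 = 4, and each new target is blended 1:3 with the previous smoothed delay:

    calc = ScraperUtils::MechanizeUtils::AdaptiveDelay.new(max_load: 20.0)

    # First response takes 0.5s: target = 0.5 * 4 = 2.0s; with no previous delay
    # the moving average starts at the target.
    calc.next_delay("https://example.com/a", 0.5)  # => 2.0

    # Next response from the same domain takes 1.0s: target = 4.0s,
    # smoothed delay = (3 * 2.0 + 4.0) / 4 = 2.5s
    calc.next_delay("https://example.com/b", 1.0)  # => 2.5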
data/lib/scraper_utils/mechanize_utils/agent_config.rb

@@ -24,7 +24,7 @@ module ScraperUtils
    # )
    class AgentConfig
      DEFAULT_TIMEOUT = 60
-     DEFAULT_RANDOM_DELAY = 5
+     DEFAULT_RANDOM_DELAY = 0
      DEFAULT_MAX_LOAD = 33.3
      MAX_LOAD_CAP = 50.0

@@ -67,7 +67,7 @@ module ScraperUtils
        # Reset all configuration options to their default values
        # @return [void]
        def reset_defaults!
-         @default_timeout = ENV.fetch('MORPH_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
+         @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
          @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
          @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 5
          @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 33.3
@@ -85,7 +85,7 @@

      # Give access for testing

-     attr_reader :max_load, :min_random, :max_random
+     attr_reader :max_load, :random_range

      # Creates Mechanize agent configuration with sensible defaults overridable via configure
      # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
@@ -125,21 +125,21 @@ module ScraperUtils
        @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
        if @australian_proxy
          uri = begin
-                  URI.parse(ScraperUtils.australian_proxy.to_s)
-                rescue URI::InvalidURIError => e
-                  raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
-                end
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+          end
          unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
            raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
          end
-         unless uri.host && uri.port
+         unless !uri.host.to_s.empty? && uri.port&.positive?
            raise URI::InvalidURIError, "Proxy URL must include host and port"
          end
        end

-       if @random_delay
-         @min_random = Math.sqrt(@random_delay * 3.0 / 13.0).round(3)
-         @max_random = (3 * @min_random).round(3)
+       if @random_delay&.positive?
+         min_random = Math.sqrt(@random_delay * 3.0 / 13.0)
+         @random_range = min_random.round(3)..(3 * min_random).round(3)
        end

        today = Date.today.strftime("%Y-%m-%d")
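
The 3.0 / 13.0 factor in the new random_range keeps the average extra delay close to the configured random_delay: the square of a number drawn uniformly from min..(3 * min) has a mean of 13/3 * min^2, so min = sqrt(random_delay * 3 / 13) makes the long-run average of rand(random_range) ** 2 come out at roughly random_delay seconds. A quick illustrative check (not part of the diff):

    random_delay = 5
    min_random = Math.sqrt(random_delay * 3.0 / 13.0)               # ~1.074
    random_range = min_random.round(3)..(3 * min_random).round(3)   # ~1.074..3.223

    # Average of the squared samples tends towards random_delay (about 5.0 here)
    samples = Array.new(10_000) { rand(random_range)**2 }
    puts samples.sum / samples.size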
@@ -177,7 +177,6 @@
          verify_proxy_works(agent)
        end

-       @connection_started_at = nil
        agent.pre_connect_hooks << method(:pre_connect_hook)
        agent.post_connect_hooks << method(:post_connect_hook)
      end
@@ -193,11 +192,11 @@
          "australian_proxy=#{@australian_proxy.inspect}"
        end
        display_args << "compliant_mode" if @compliant_mode
-       display_args << "random_delay=#{@random_delay}" if @random_delay
+       display_args << "random_delay=#{@random_delay}" if @random_delay&.positive?
        display_args << "max_load=#{@max_load}%" if @max_load
        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
        display_args << "default args" if display_args.empty?
-       ScraperUtils::FiberScheduler.log(
+       ScraperUtils::LogUtils.log(
          "Configuring Mechanize agent with #{display_args.join(', ')}"
        )
      end
@@ -206,7 +205,7 @@
        @connection_started_at = Time.now
        return unless DebugUtils.verbose?

-       ScraperUtils::FiberScheduler.log(
+       ScraperUtils::LogUtils.log(
          "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
        )
      end
@@ -216,9 +215,9 @@

        response_time = Time.now - @connection_started_at
        if DebugUtils.basic?
-         ScraperUtils::FiberScheduler.log(
+         ScraperUtils::LogUtils.log(
            "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
-             "after #{response_time} seconds"
+           "after #{response_time} seconds"
          )
        end

@@ -227,33 +226,35 @@ module ScraperUtils
                "URL is disallowed by robots.txt specific rules: #{uri}"
        end

-       delays = {
-         robot_txt: @robots_checker&.crawl_delay&.round(3),
-         max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
-         random: (@min_random ? (rand(@min_random..@max_random)**2).round(3) : nil)
-       }
-       @delay = delays.values.compact.max
+       @delay_till = nil
+       @delay = @robots_checker&.crawl_delay&.round(3)
+       debug_msg = "Delaying robots.txt: crawl_delay #{@delay} seconds"
+       unless @delay&.positive?
+         delays = {
+           max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+           random: (@random_range ? (rand(@random_range) ** 2).round(3) : nil)
+         }
+         @delay = [delays[:max_load], delays[:random]].compact.sum
+         debug_msg = "Delaying #{@delay} seconds, sum of: #{delays.inspect}"
+       end
        if @delay&.positive?
-         $stderr.flush
-         ScraperUtils::FiberScheduler.log("Delaying #{@delay} seconds, max of #{delays.inspect}") if ENV["DEBUG"]
-         $stdout.flush
-         ScraperUtils::FiberScheduler.delay(@delay)
+         @delay_till = Time.now + @delay
+         ScraperUtils::LogUtils.log(debug_msg) if ScraperUtils::DebugUtils.basic?
        end
-
        response
      end

      def verify_proxy_works(agent)
        $stderr.flush
        $stdout.flush
-       FiberScheduler.log "Checking proxy works..."
+       LogUtils.log "Checking proxy works..."
        my_ip = MechanizeUtils.public_ip(agent)
        begin
          IPAddr.new(my_ip)
        rescue IPAddr::InvalidAddressError => e
          raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
        end
-       ScraperUtils::FiberScheduler.log "Proxy is using IP address: #{my_ip.inspect}"
+       ScraperUtils::LogUtils.log "Proxy is using IP address: #{my_ip.inspect}"
        my_headers = MechanizeUtils.public_headers(agent)
        begin
          # Check response is JSON just to be safe!
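
A sketch of the delay selection the reworked post_connect_hook now performs (illustrative values, not part of the diff): a positive robots.txt crawl_delay is used on its own; otherwise the adaptive max_load delay and the squared random delay are summed, and the result is recorded (as @delay_till) presumably for the scheduler to honour, rather than being slept inline:

    crawl_delay = nil            # robots.txt crawl_delay for this site, if any
    adaptive    = 2.5            # AdaptiveDelay#next_delay for this response
    random_part = 1.8            # rand(random_range) ** 2

    delay = if crawl_delay&.positive?
              crawl_delay                        # robots.txt wins outright
            else
              [adaptive, random_part].compact.sum
            end
    delay_till = Time.now + delay  # recorded for later use instead of sleeping here
    puts "would delay #{delay} seconds until #{delay_till}"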