scraper_utils 0.1.0 → 0.3.0

This diff compares the content of publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -0,0 +1,229 @@
+# frozen_string_literal: true
+
+require "fiber"
+
+module ScraperUtils
+  # A utility module for interleaving multiple scraping operations
+  # using fibers during connection delay periods. This allows efficient
+  # use of wait time by switching between operations.
+  module FiberScheduler
+    # @return [Array<Fiber>] List of active fibers managed by the scheduler
+    def self.registry
+      @registry ||= []
+    end
+
+    # Checks if the current code is running within a registered fiber
+    #
+    # @return [Boolean] true if running in a registered fiber, false otherwise
+    def self.in_fiber?
+      !Fiber.current.nil? && registry.include?(Fiber.current)
+    end
+
+    # Gets the authority associated with the current fiber
+    #
+    # @return [String, nil] the authority name or nil if not in a fiber
+    def self.current_authority
+      return nil unless in_fiber?
+
+      Fiber.current.instance_variable_get(:@authority)
+    end
+
+    # Logs a message, automatically prefixing with authority name if in a fiber
+    #
+    # @param message [String] the message to log
+    # @return [void]
+    def self.log(message)
+      authority = current_authority
+      $stderr.flush
+      if authority
+        puts "[#{authority}] #{message}"
+      else
+        puts message
+      end
+      $stdout.flush
+    end
+
+    # Returns a hash of exceptions encountered during processing, indexed by authority
+    #
+    # @return [Hash{Symbol => Exception}] exceptions by authority
+    def self.exceptions
+      @exceptions ||= {}
+    end
+
+    # Returns a hash of the yielded / block values
+    #
+    # @return [Hash{Symbol => Any}] values by authority
+    def self.values
+      @values ||= {}
+    end
+
+    # Checks if fiber scheduling is currently enabled
+    #
+    # @return [Boolean] true if enabled, false otherwise
+    def self.enabled?
+      @enabled ||= false
+    end
+
+    # Enables fiber scheduling
+    #
+    # @return [void]
+    def self.enable!
+      reset! unless enabled?
+      @enabled = true
+    end
+
+    # Disables fiber scheduling
+    #
+    # @return [void]
+    def self.disable!
+      @enabled = false
+    end
+
+    # Resets the scheduler state, and disables. Use before retrying failed authorities.
+    #
+    # @return [void]
+    def self.reset!
+      @registry = []
+      @exceptions = {}
+      @values = {}
+      @enabled = false
+      @delay_requested = 0.0
+      @time_slept = 0.0
+      @resume_count = 0
+      @initial_resume_at = Time.now - 60.0 # one minute ago
+    end
+
+    # Registers a block to scrape for a specific authority
+    #
+    # @param authority [String] the name of the authority being processed
+    # @yield to the block containing the scraping operation to be run in the fiber
+    # @return [Fiber] a fiber that calls the block. With @authority and @resume_at instance vars
+    def self.register_operation(authority, &block)
+      # Automatically enable fiber scheduling when operations are registered
+      enable!
+
+      fiber = Fiber.new do
+        values[authority] = block.call
+      rescue StandardError => e
+        # Store exception against the authority
+        exceptions[authority] = e
+      ensure
+        # Remove itself when done regardless of success/failure
+        registry.delete(Fiber.current)
+      end
+
+      # Start fibres in registration order
+      @initial_resume_at += 0.1
+      fiber.instance_variable_set(:@resume_at, @initial_resume_at)
+      fiber.instance_variable_set(:@authority, authority)
+      registry << fiber
+
+      if DebugUtils.basic?
+        FiberScheduler.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
+      end
+      # Process immediately when testing
+      fiber.resume if ScraperUtils::RandomizeUtils.sequential?
+      fiber
+    end
+
+    # Run all registered fibers until completion
+    #
+    # @return [Hash] Exceptions that occurred during execution
+    def self.run_all
+      count = registry.size
+      while (fiber = find_earliest_fiber)
+        if fiber.alive?
+          authority = begin
+            fiber.instance_variable_get(:@authority)
+          rescue StandardError
+            nil
+          end
+          @resume_count ||= 0
+          @resume_count += 1
+          values[authority] = fiber.resume
+        else
+          FiberScheduler.log "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
+          registry.delete(fiber)
+        end
+      end
+
+      if @time_slept&.positive? && @delay_requested&.positive?
+        percent_slept = (100.0 * @time_slept / @delay_requested).round(1)
+      end
+      puts
+      FiberScheduler.log "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, " \
+                         "sleeping #{percent_slept}% (#{@time_slept&.round(1)}) of the " \
+                         "#{@delay_requested&.round(1)} seconds requested."
+      puts
+
+      exceptions
+    end
+
+    # Delays the current fiber and potentially runs another one
+    # Falls back to regular sleep if fiber scheduling is not enabled
+    #
+    # @param seconds [Numeric] the number of seconds to delay
+    # @return [Integer] return from sleep operation or 0
+    def self.delay(seconds)
+      seconds = 0.0 unless seconds&.positive?
+      @delay_requested ||= 0.0
+      @delay_requested += seconds
+
+      current_fiber = Fiber.current
+
+      if !enabled? || !current_fiber || registry.size <= 1
+        @time_slept ||= 0.0
+        @time_slept += seconds
+        log("Sleeping #{seconds.round(3)} seconds") if DebugUtils.basic?
+        return sleep(seconds)
+      end
+
+      now = Time.now
+      resume_at = now + seconds
+
+      # Don't resume at the same time as someone else,
+      # FIFO queue if seconds == 0
+      @other_resumes ||= []
+      @other_resumes = @other_resumes.delete_if { |t| t < now }
+      while @other_resumes.include?(resume_at) && resume_at
+        resume_at += 0.01
+      end
+
+      # Used to compare when other fibers need to be resumed
+      current_fiber.instance_variable_set(:@resume_at, resume_at)
+
+      # Yield control back to the scheduler so another fiber can run
+      Fiber.yield
+
+      # When we get control back, check if we need to sleep more
+      remaining = resume_at - Time.now
+      if remaining.positive?
+        @time_slept ||= 0.0
+        @time_slept += remaining
+        log("Sleeping remaining #{remaining.round(3)} seconds") if DebugUtils.basic?
+        sleep(remaining)
+      end || 0
+    end
+
+    # Finds the fiber with the earliest wake-up time
+    #
+    # @return [Fiber, nil] the fiber with the earliest wake-up time or nil if none found
+    def self.find_earliest_fiber
+      earliest_time = nil
+      earliest_fiber = nil
+
+      registry.each do |fiber|
+        resume_at = fiber.instance_variable_get(:@resume_at)
+        if earliest_time.nil? || resume_at < earliest_time
+          earliest_time = resume_at
+          earliest_fiber = fiber
+        end
+      end
+
+      earliest_fiber
+    end
+
+    # Mark methods as private
+    private_class_method :find_earliest_fiber
+  end
+end
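
For orientation, a minimal usage sketch of the new FiberScheduler API (not part of the diff), assuming the gem is loaded via require "scraper_utils". The authority names and the scrape_authority helper are hypothetical placeholders; each registered block is expected to call ScraperUtils::FiberScheduler.delay (directly, or indirectly through the Mechanize hooks later in this diff) instead of sleep, so other authorities can run during the wait.

    # Hypothetical scraper wiring; authority names and scrape_authority are placeholders.
    require "scraper_utils"

    AUTHORITIES = %i[example_city example_shire].freeze

    AUTHORITIES.each do |authority|
      ScraperUtils::FiberScheduler.register_operation(authority) do
        # Long-running scrape; delays inside should go through
        # ScraperUtils::FiberScheduler.delay so other fibers can run meanwhile.
        scrape_authority(authority)
      end
    end

    exceptions = ScraperUtils::FiberScheduler.run_all
    exceptions.each do |authority, e|
      puts "#{authority} failed: #{e.class} - #{e.message}"
    end

run_all always resumes the fiber with the earliest @resume_at, so the delay one authority requests becomes working time for the others.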
@@ -13,13 +13,10 @@ module ScraperUtils
     # @param start_time [Time] When this scraping attempt was started
     # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-    # @param results [Hash] Results for each authority containing:
-    #   - :records_scraped [Integer] Number of records successfully scraped
-    #   - :unprocessable_records [Integer] Optional Number of unprocessable record like regions
-    #   - :error [Exception, nil] Any exception that occurred during scraping
-    #   - :proxy_used [Boolean] Whether a proxy was used
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    #   `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
     # @return [void]
-    def self.log_scraping_run(start_time, attempt, authorities, results)
+    def self.log_scraping_run(start_time, attempt, authorities, exceptions)
       raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
       raise ArgumentError, "Authorities must be a non-empty array" if authorities.empty?
 
@@ -31,10 +28,11 @@ module ScraperUtils
       interrupted = []
 
       authorities.each do |authority_label|
-        result = results[authority_label] || {}
+        stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority_label, nil) || {}
 
-        status = if result[:records_scraped]&.positive?
-                   result[:error] ? :interrupted : :successful
+        exception = exceptions[authority_label]
+        status = if stats[:saved]&.positive?
+                   exception ? :interrupted : :successful
                  else
                    :failed
                  end
@@ -51,13 +49,12 @@ module ScraperUtils
           "run_at" => start_time.iso8601,
           "attempt" => attempt,
           "authority_label" => authority_label.to_s,
-          "records_scraped" => result[:records_scraped] || 0,
-          "unprocessable_records" => result[:unprocessable_records] || 0,
-          "used_proxy" => result[:proxy_used] ? 1 : 0,
+          "records_saved" => stats[:saved] || 0,
+          "unprocessable_records" => stats[:unprocessed] || 0,
           "status" => status.to_s,
-          "error_message" => result[:error]&.message,
-          "error_class" => result[:error]&.class&.to_s,
-          "error_backtrace" => extract_meaningful_backtrace(result[:error])
+          "error_message" => exception&.message,
+          "error_class" => exception&.class&.to_s,
+          "error_backtrace" => extract_meaningful_backtrace(exception)
         }
 
         save_log_record(record)
@@ -76,37 +73,72 @@ module ScraperUtils
       cleanup_old_records
     end
 
-    def self.report_on_results(authorities, results)
-      expect_bad = ENV["MORPH_EXPECT_BAD"]&.split(",")&.map(&:to_sym) || []
+    # Report on the results
+    # @param authorities [Array<Symbol>] List of authorities attempted to scrape
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    #   `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
+    # @return [void]
+    def self.report_on_results(authorities, exceptions)
+      if ENV["MORPH_EXPECT_BAD"]
+        expect_bad = ENV["MORPH_EXPECT_BAD"].split(",").map(&:strip).map(&:to_sym)
+      end
+      expect_bad ||= []
+
+      $stderr.flush
+      puts "MORPH_EXPECT_BAD=#{ENV.fetch('MORPH_EXPECT_BAD', nil)}"
+
+      # Print summary table
+      puts "\nScraping Summary:"
+      summary_format = "%-20s %6s %6s %s"
 
-      puts "MORPH_EXPECT_BAD=#{ENV['MORPH_EXPECT_BAD']}" if expect_bad.any?
+      puts format(summary_format, 'Authority', 'OK', 'Bad', 'Exception')
+      puts format(summary_format, "-" * 20, "-" * 6, "-" * 6, "-" * 50)
+
+      authorities.each do |authority|
+        stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
+
+        ok_records = stats[:saved] || 0
+        bad_records = stats[:unprocessed] || 0
+
+        expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
+        exception_msg = if exceptions[authority]
+                          "#{exceptions[authority].class} - #{exceptions[authority].message}"
+                        else
+                          "-"
+                        end
+        puts format(summary_format, authority.to_s, ok_records, bad_records,
+                    "#{expect_bad_prefix}#{exception_msg}".slice(0, 70))
+      end
+      puts
 
       errors = []
 
       # Check for authorities that were expected to be bad but are now working
       unexpected_working = expect_bad.select do |authority|
-        result = results[authority]
-        result && result[:records_scraped]&.positive? && result[:error].nil?
+        stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
+        stats[:saved]&.positive? && !exceptions[authority]
       end
 
       if unexpected_working.any?
-        errors << "WARNING: Remove #{unexpected_working.join(',')} from EXPECT_BAD as it now works!"
+        errors <<
+          "WARNING: Remove #{unexpected_working.join(',')} from MORPH_EXPECT_BAD as it now works!"
       end
 
       # Check for authorities with unexpected errors
       unexpected_errors = authorities
-                          .select { |authority| results[authority]&.dig(:error) }
+                          .select { |authority| exceptions[authority] }
                           .reject { |authority| expect_bad.include?(authority) }
 
       if unexpected_errors.any?
         errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
-                  "(Add to MORPH_EXPECT_BAD?)"
+                  "(Add to MORPH_EXPECT_BAD?)"
         unexpected_errors.each do |authority|
-          error = results[authority][:error]
+          error = exceptions[authority]
           errors << " #{authority}: #{error.class} - #{error.message}"
         end
       end
 
+      $stdout.flush
       if errors.any?
         errors << "See earlier output for details"
         raise errors.join("\n")
@@ -134,7 +166,8 @@ module ScraperUtils
         "interrupted" => interrupted.join(","),
         "successful_count" => successful.size,
         "interrupted_count" => interrupted.size,
-        "failed_count" => failed.size
+        "failed_count" => failed.size,
+        "public_ip" => ScraperUtils::MechanizeUtils.public_ip
       }
 
       ScraperWiki.save_sqlite(
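
Taken together, these hunks switch the logging entry points from a per-authority results hash to the exceptions hash returned by FiberScheduler.run_all, with record counts read from ScraperUtils::DataQualityMonitor.stats. A sketch of a caller after this change (not part of the diff; the enclosing module name, shown here as ScraperUtils::LogUtils, and the authority list are assumptions):

    # Hypothetical caller; start_time, attempt and authorities mirror the documented parameters.
    authorities = %i[example_city example_shire]
    start_time = Time.now
    attempt = 1

    exceptions = ScraperUtils::FiberScheduler.run_all

    # 0.3.0 signature: exceptions replaces the old results hash; per-authority
    # :saved / :unprocessed counts come from ScraperUtils::DataQualityMonitor.stats.
    ScraperUtils::LogUtils.log_scraping_run(start_time, attempt, authorities, exceptions)
    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)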
@@ -0,0 +1,276 @@
+# frozen_string_literal: true
+
+require "mechanize"
+require "ipaddr"
+
+module ScraperUtils
+  module MechanizeUtils
+    # Configuration for a Mechanize agent with sensible defaults and configurable settings.
+    # Supports global configuration through {.configure} and per-instance overrides.
+    #
+    # @example Setting global defaults
+    #   ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
+    #     config.default_timeout = 90
+    #     config.default_random_delay = 5
+    #   end
+    #
+    # @example Creating an instance with defaults
+    #   config = ScraperUtils::MechanizeUtils::AgentConfig.new
+    #
+    # @example Overriding specific settings
+    #   config = ScraperUtils::MechanizeUtils::AgentConfig.new(
+    #     timeout: 120,
+    #     random_delay: 10
+    #   )
+    class AgentConfig
+      DEFAULT_TIMEOUT = 60
+      DEFAULT_RANDOM_DELAY = 5
+      DEFAULT_MAX_LOAD = 33.3
+      MAX_LOAD_CAP = 50.0
+
+      # Class-level defaults that can be modified
+      class << self
+        # @return [Integer] Default timeout in seconds for agent connections
+        attr_accessor :default_timeout
+
+        # @return [Boolean] Default setting for compliance with headers and robots.txt
+        attr_accessor :default_compliant_mode
+
+        # @return [Integer, nil] Default average random delay in seconds
+        attr_accessor :default_random_delay
+
+        # @return [Float, nil] Default maximum server load percentage (nil = no response delay)
+        attr_accessor :default_max_load
+
+        # @return [Boolean] Default setting for SSL certificate verification
+        attr_accessor :default_disable_ssl_certificate_check
+
+        # @return [Boolean] Default flag for Australian proxy preference
+        attr_accessor :default_australian_proxy
+
+        # @return [String, nil] Default Mechanize user agent
+        attr_accessor :default_user_agent
+
+        # Configure default settings for all AgentConfig instances
+        # @yield [self] Yields self for configuration
+        # @example
+        #   AgentConfig.configure do |config|
+        #     config.default_timeout = 90
+        #     config.default_random_delay = 5
+        #     config.default_max_load = 15
+        #   end
+        # @return [void]
+        def configure
+          yield self if block_given?
+        end
+
+        # Reset all configuration options to their default values
+        # @return [void]
+        def reset_defaults!
+          @default_timeout = ENV.fetch('MORPH_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
+          @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
+          @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 33
+          @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 20
+          @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
+          @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
+          @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
+        end
+      end
+
+      # Set defaults on load
+      reset_defaults!
+
+      # @return [String] User agent string
+      attr_reader :user_agent
+
+      # Give access for testing
+
+      attr_reader :max_load, :min_random, :max_random
+
+      # Creates Mechanize agent configuration with sensible defaults overridable via configure
+      # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
+      # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true)
+      # @param random_delay [Integer, nil] Average random delay in seconds (default: 3)
+      # @param max_load [Float, nil] Maximum server load percentage (nil = no delay, default: 20%)
+      #   When compliant_mode is true, max_load is capped at 33%
+      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false)
+      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false)
+      # @param user_agent [String, nil] Configure Mechanize user agent
+      def initialize(timeout: nil,
+                     compliant_mode: nil,
+                     random_delay: nil,
+                     max_load: nil,
+                     disable_ssl_certificate_check: nil,
+                     australian_proxy: nil,
+                     user_agent: nil)
+        @timeout = timeout.nil? ? self.class.default_timeout : timeout
+        @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
+        @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
+        @max_load = max_load.nil? ? self.class.default_max_load : max_load
+        @max_load = [@max_load || DEFAULT_MAX_LOAD, MAX_LOAD_CAP].min if @compliant_mode
+        @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent
+
+        @disable_ssl_certificate_check = if disable_ssl_certificate_check.nil?
+                                           self.class.default_disable_ssl_certificate_check
+                                         else
+                                           disable_ssl_certificate_check
+                                         end
+        @australian_proxy = if australian_proxy.nil?
+                              self.class.default_australian_proxy
+                            else
+                              australian_proxy
+                            end
+
+        # Validate proxy URL format if proxy will be used
+        @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
+        if @australian_proxy
+          uri = begin
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+          end
+          unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+            raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
+          end
+          unless uri.host && uri.port
+            raise URI::InvalidURIError, "Proxy URL must include host and port"
+          end
+        end
+
+        if @random_delay
+          @min_random = Math.sqrt(@random_delay * 3.0 / 13.0).round(3)
+          @max_random = (3 * @min_random).round(3)
+        end
+
+        today = Date.today.strftime("%Y-%m-%d")
+        @user_agent = ENV.fetch("MORPH_USER_AGENT", nil)&.sub("TODAY", today)
+        if @compliant_mode
+          version = ScraperUtils::VERSION
+          @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
+        end
+
+        @robots_checker = RobotsChecker.new(@user_agent) if @user_agent
+        @adaptive_delay = AdaptiveDelay.new(max_load: @max_load) if @max_load
+        display_options
+      end
+
+      # Configures a Mechanize agent with these settings
+      # @param agent [Mechanize] The agent to configure
+      # @return [void]
+      def configure_agent(agent)
+        agent.verify_mode = OpenSSL::SSL::VERIFY_NONE if @disable_ssl_certificate_check
+
+        if @timeout
+          agent.open_timeout = @timeout
+          agent.read_timeout = @timeout
+        end
+        if @compliant_mode
+          agent.user_agent = user_agent
+          agent.request_headers ||= {}
+          agent.request_headers["Accept"] =
+            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+          agent.request_headers["Upgrade-Insecure-Requests"] = "1"
+        end
+        if @australian_proxy
+          agent.agent.set_proxy(ScraperUtils.australian_proxy)
+          agent.request_headers["Accept-Language"] = "en-AU,en-US;q=0.9,en;q=0.8"
+          verify_proxy_works(agent)
+        end
+
+        @connection_started_at = nil
+        agent.pre_connect_hooks << method(:pre_connect_hook)
+        agent.post_connect_hooks << method(:post_connect_hook)
+      end
+
+      private
+
+      def display_options
+        display_args = []
+        display_args << "timeout=#{@timeout}" if @timeout
+        display_args << if ScraperUtils.australian_proxy.to_s.empty? && !@australian_proxy
+                          "#{ScraperUtils::AUSTRALIAN_PROXY_ENV_VAR} not set"
+                        else
+                          "australian_proxy=#{@australian_proxy.inspect}"
+                        end
+        display_args << "compliant_mode" if @compliant_mode
+        display_args << "random_delay=#{@random_delay}" if @random_delay
+        display_args << "max_load=#{@max_load}%" if @max_load
+        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
+        display_args << "default args" if display_args.empty?
+        ScraperUtils::FiberScheduler.log(
+          "Configuring Mechanize agent with #{display_args.join(', ')}"
+        )
+      end
+
+      def pre_connect_hook(_agent, request)
+        @connection_started_at = Time.now
+        return unless DebugUtils.verbose?
+
+        ScraperUtils::FiberScheduler.log(
+          "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
+        )
+      end
+
+      def post_connect_hook(_agent, uri, response, _body)
+        raise ArgumentError, "URI must be present in post-connect hook" unless uri
+
+        response_time = Time.now - @connection_started_at
+        if DebugUtils.basic?
+          ScraperUtils::FiberScheduler.log(
+            "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
+            "after #{response_time} seconds"
+          )
+        end
+
+        if @robots_checker&.disallowed?(uri)
+          raise ScraperUtils::UnprocessableSite,
+                "URL is disallowed by robots.txt specific rules: #{uri}"
+        end
+
+        delays = {
+          robot_txt: @robots_checker&.crawl_delay&.round(3),
+          max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+          random: (@min_random ? (rand(@min_random..@max_random)**2).round(3) : nil)
+        }
+        @delay = delays.values.compact.max
+        if @delay&.positive?
+          $stderr.flush
+          ScraperUtils::FiberScheduler.log("Delaying #{@delay} seconds, max of #{delays.inspect}") if ENV["DEBUG"]
+          $stdout.flush
+          ScraperUtils::FiberScheduler.delay(@delay)
+        end
+
+        response
+      end
+
+      def verify_proxy_works(agent)
+        $stderr.flush
+        $stdout.flush
+        FiberScheduler.log "Checking proxy works..."
+        my_ip = MechanizeUtils.public_ip(agent)
+        begin
+          IPAddr.new(my_ip)
+        rescue IPAddr::InvalidAddressError => e
+          raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
+        end
+        ScraperUtils::FiberScheduler.log "Proxy is using IP address: #{my_ip.inspect}"
+        my_headers = MechanizeUtils.public_headers(agent)
+        begin
+          # Check response is JSON just to be safe!
+          headers = JSON.parse(my_headers)
+          puts "Proxy is passing headers:"
+          puts JSON.pretty_generate(headers["headers"])
+        rescue JSON::ParserError => e
+          puts "Couldn't parse public_headers: #{e}! Raw response:"
+          puts my_headers.inspect
+        end
+      rescue Timeout::Error => e # Includes Net::OpenTimeout
+        raise "Proxy check timed out: #{e}"
+      rescue Errno::ECONNREFUSED, Net::HTTP::Persistent::Error => e
+        raise "Failed to connect to proxy: #{e}"
+      rescue Mechanize::ResponseCodeError => e
+        raise "Proxy check error: #{e}"
+      end
+    end
+  end
+end
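
Finally, a minimal sketch of how the new AgentConfig might be applied to a Mechanize agent (not part of the diff). The global defaults mirror the class's own @example; the target URL is illustrative only.

    # Hypothetical wiring of AgentConfig onto a Mechanize agent.
    require "mechanize"
    require "scraper_utils"

    # Optional global defaults (same values as the @example in the class comment)
    ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
      config.default_timeout = 90
      config.default_random_delay = 5
    end

    # Per-instance overrides, then apply the settings to an agent
    config = ScraperUtils::MechanizeUtils::AgentConfig.new(timeout: 120, australian_proxy: false)
    agent = Mechanize.new
    config.configure_agent(agent) # installs timeouts, headers and the pre/post connect hooks

    page = agent.get("https://example.com/planning-applications")

Because configure_agent registers post_connect_hook, every response then feeds the robots.txt, adaptive-load and random delays into ScraperUtils::FiberScheduler.delay, tying this class back to the scheduler added earlier in this diff.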