scraper_utils 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,65 @@
+ # frozen_string_literal: true
+
+ require "uri"
+
+ # Adapts delays between requests based on server response times.
+ # Target delay is proportional to response time based on max_load setting.
+ # Uses an exponential moving average to smooth variations in response times.
+ class AdaptiveDelay
+   DEFAULT_MIN_DELAY = 0.0
+   DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
+
+   attr_reader :min_delay, :max_delay, :max_load
+
+   # Creates a new adaptive delay calculator
+   #
+   # @param min_delay [Float] Minimum delay between requests in seconds
+   # @param max_delay [Float] Maximum delay between requests in seconds
+   # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+   #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+   def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+     @delays = {} # domain -> last delay used
+     @min_delay = min_delay.to_f
+     @max_delay = max_delay.to_f
+     @max_load = max_load.to_f.clamp(1.0, 99.0)
+     @response_multiplier = (100.0 - @max_load) / @max_load
+
+     if ENV["DEBUG"]
+       ScraperUtils::FiberScheduler.log "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds"
+       ScraperUtils::FiberScheduler.log "Using max_load of #{@max_load}% (response time multiplier: #{@response_multiplier.round(2)}x)"
+     end
+   end
+
+   # @param uri [URI::Generic, String] The URL to extract the domain from
+   # @return [String] The domain in the format "scheme://host"
+   def domain(uri)
+     uri = URI(uri) unless uri.is_a?(URI)
+     "#{uri.scheme}://#{uri.host}".downcase
+   end
+
+   # @param uri [URI::Generic, String] URL to get delay for
+   # @return [Float] Current delay for the domain, or min_delay if no delay set
+   def delay(uri)
+     @delays[domain(uri)] || @min_delay
+   end
+
+   # @param uri [URI::Generic, String] URL the response came from
+   # @param response_time [Float] Time in seconds the server took to respond
+   # @return [Float] The calculated delay to use with the next request
+   def next_delay(uri, response_time)
+     uris_domain = domain(uri)
+     target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+     current_delay = @delays[uris_domain] || target_delay
+     delay = ((9.0 * current_delay) + target_delay) / 10.0
+     delay = delay.clamp(@min_delay, @max_delay)
+
+     if ENV["DEBUG"]
+       ScraperUtils::FiberScheduler.log "Adaptive delay for #{uris_domain} updated to " \
+         "#{delay.round(2)}s (target: #{@response_multiplier.round(1)}x " \
+         "response_time of #{response_time.round(2)}s)"
+     end
+
+     @delays[uris_domain] = delay
+     delay
+   end
+ end
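
For orientation, a minimal usage sketch of the new AdaptiveDelay class (the URL, the elided request step, and the 0.5s/20s bounds are illustrative assumptions, not part of the package):

    # Illustrative sketch only - the HTTP request itself is elided.
    delay_calc = AdaptiveDelay.new(min_delay: 0.5, max_delay: 20.0, max_load: 20.0)

    url = "https://example.com/planning/applications" # hypothetical target
    before = Time.now
    # ... perform the HTTP request for url here (e.g. via Mechanize) ...
    response_time = Time.now - before

    # With max_load: 20.0 the target delay is 4x the response time,
    # smoothed by the 90/10 exponential moving average in next_delay.
    sleep(delay_calc.next_delay(url, response_time))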
@@ -3,13 +3,13 @@
  module ScraperUtils
    # Utilities for managing and selecting authorities
    module AuthorityUtils
+     AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
+
      # Selects authorities based on environment variable or returns all authorities
      #
      # @param all_authorities [Array<Symbol>] Full list of available authorities
      # @return [Array<Symbol>] Selected subset of authorities or all authorities
      # @raise [ScraperUtils::Error] If invalid authorities are specified in MORPH_AUTHORITIES
-     AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
-
      def self.selected_authorities(all_authorities)
        if ENV[AUTHORITIES_ENV_VAR]
          authorities = ENV[AUTHORITIES_ENV_VAR].split(",").map(&:strip).map(&:to_sym)
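
A quick illustration of selected_authorities (the authority names below are made up):

    # Hypothetical example only.
    all = %i[example_east example_west example_north]

    # With MORPH_AUTHORITIES unset, every authority is returned.
    ScraperUtils::AuthorityUtils.selected_authorities(all)
    # => [:example_east, :example_west, :example_north]

    # With MORPH_AUTHORITIES="example_east, example_west" only that subset is
    # returned; a name not in the full list raises ScraperUtils::Error.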
@@ -0,0 +1,53 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   # Monitors data quality during scraping by tracking successful vs failed record processing
+   # Automatically triggers an exception if the error rate exceeds a threshold
+   class DataQualityMonitor
+     # Get the statistics for all authorities
+     # @return [Hash, nil] Hash of statistics per authority or nil if none started
+     def self.stats
+       @stats
+     end
+
+     # Notes the start of processing an authority and clears any previous stats
+     #
+     # @param authority_label [Symbol] The authority we are processing
+     # @return [void]
+     def self.start_authority(authority_label)
+       @stats ||= {}
+       @authority_label = authority_label
+       @stats[@authority_label] = { saved: 0, unprocessed: 0 }
+     end
+
+     def self.threshold
+       5.01 + @stats[@authority_label][:saved] * 0.1 if @stats&.fetch(@authority_label, nil)
+     end
+
+     # Logs an unprocessable record and raises an exception if error threshold is exceeded
+     # The threshold is 5 + 10% of saved records
+     #
+     # @param e [Exception] The exception that caused the record to be unprocessable
+     # @param record [Hash, nil] The record that couldn't be processed
+     # @raise [ScraperUtils::UnprocessableSite] When too many records are unprocessable
+     # @return [void]
+     def self.log_unprocessable_record(e, record)
+       start_authority(:"") unless @stats
+       @stats[@authority_label][:unprocessed] += 1
+       ScraperUtils::FiberScheduler.log "Erroneous record #{@authority_label} - #{record&.fetch('address', nil) || record.inspect}: #{e}"
+       if @stats[@authority_label][:unprocessed] > threshold
+         raise ScraperUtils::UnprocessableSite, "Too many unprocessable_records for #{@authority_label}: #{@stats[@authority_label].inspect} - aborting processing of site!"
+       end
+     end
+
+     # Logs a successfully saved record
+     #
+     # @param record [Hash] The record that was saved
+     # @return [void]
+     def self.log_saved_record(record)
+       start_authority(:"") unless @stats
+       @stats[@authority_label][:saved] += 1
+       ScraperUtils::FiberScheduler.log "Saving record #{@authority_label} - #{record['address']}"
+     end
+   end
+ end
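
A sketch of how a scraper would typically drive DataQualityMonitor (the records array and the elided parsing step are placeholders):

    # Hypothetical usage only.
    ScraperUtils::DataQualityMonitor.start_authority(:example_council)

    records = [] # placeholder: parsed application records
    records.each do |record|
      begin
        # ... parse and save the record here ...
        ScraperUtils::DataQualityMonitor.log_saved_record(record)
      rescue StandardError => e
        # Counted per authority; raises ScraperUtils::UnprocessableSite once
        # unprocessed records exceed roughly 5 + 10% of saved records.
        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
      end
    end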
@@ -34,8 +34,9 @@ module ScraperUtils
          ["council_reference"]
        end

-       puts "Saving record #{record['council_reference']} - #{record['address']}"
+
        ScraperWiki.save_sqlite(primary_key, record)
+       ScraperUtils::DataQualityMonitor.log_saved_record(record)
      end
    end
  end
@@ -16,19 +16,11 @@ module ScraperUtils
      def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
        return unless ScraperUtils.debug?

-       puts "\n🔍 #{method.upcase} #{url}"
-       if parameters
-         puts "Parameters:"
-         puts JSON.pretty_generate(parameters)
-       end
-       if headers
-         puts "Headers:"
-         puts JSON.pretty_generate(headers)
-       end
-       return unless body
-
-       puts "Body:"
-       puts JSON.pretty_generate(body)
+       puts
+       ScraperUtils::FiberScheduler.log "🔍 #{method.upcase} #{url}"
+       puts "Parameters:", JSON.pretty_generate(parameters) if parameters
+       puts "Headers:", JSON.pretty_generate(headers) if headers
+       puts "Body:", JSON.pretty_generate(body) if body
      end

      # Logs details of a web page when debug mode is enabled
@@ -39,15 +31,15 @@ module ScraperUtils
      def self.debug_page(page, message)
        return unless ScraperUtils.debug?

-       puts "",
-            "🔍 DEBUG: #{message}"
+       puts
+       ScraperUtils::FiberScheduler.log "🔍 DEBUG: #{message}"
        puts "Current URL: #{page.uri}"
        puts "Page title: #{page.at('title').text.strip}" if page.at("title")
        puts "",
-            "Page content:"
-       puts "-" * 40
-       puts page.body
-       puts "-" * 40
+            "Page content:",
+            "-" * 40,
+            page.body,
+            "-" * 40
      end

      # Logs details about a specific page selector when debug mode is enabled
@@ -59,7 +51,8 @@ module ScraperUtils
      def self.debug_selector(page, selector, message)
        return unless ScraperUtils.debug?

-       puts "\n🔍 DEBUG: #{message}"
+       puts
+       ScraperUtils::FiberScheduler.log "🔍 DEBUG: #{message}"
        puts "Looking for selector: #{selector}"
        element = page.at(selector)
        if element
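
These helpers are no-ops unless ScraperUtils.debug? is true; a hypothetical call, assuming the methods shown here live in the gem's ScraperUtils::DebugUtils module (URL and form data invented):

    # Hypothetical call only.
    ScraperUtils::DebugUtils.debug_request(
      "POST",
      "https://example.com/search",
      parameters: { "applicationType" => "DA" },
      headers: { "Accept" => "text/html" }
    )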
@@ -0,0 +1,206 @@
+ # frozen_string_literal: true
+
+ require 'fiber'
+
+ module ScraperUtils
+   # A utility module for interleaving multiple scraping operations
+   # using fibers during connection delay periods. This allows efficient
+   # use of wait time by switching between operations.
+   module FiberScheduler
+     # @return [Array<Fiber>] List of active fibers managed by the scheduler
+     def self.registry
+       @registry ||= []
+     end
+
+     # Checks if the current code is running within a registered fiber
+     #
+     # @return [Boolean] true if running in a registered fiber, false otherwise
+     def self.in_fiber?
+       !Fiber.current.nil? && registry.include?(Fiber.current)
+     end
+
+     # Gets the authority associated with the current fiber
+     #
+     # @return [String, nil] the authority name or nil if not in a fiber
+     def self.current_authority
+       return nil unless in_fiber?
+       Fiber.current.instance_variable_get(:@authority)
+     end
+
+     # Logs a message, automatically prefixing with authority name if in a fiber
+     #
+     # @param message [String] the message to log
+     # @return [void]
+     def self.log(message)
+       authority = current_authority
+       if authority
+         puts "[#{authority}] #{message}"
+       else
+         puts message
+       end
+     end
+
+     # Returns a hash of exceptions encountered during processing, indexed by authority
+     #
+     # @return [Hash{Symbol => Exception}] exceptions by authority
+     def self.exceptions
+       @exceptions ||= {}
+     end
+
+     # Returns a hash of values which will be the values yielded along the way then the block value when it completes
+     #
+     # @return [Hash{Symbol => Any}] values by authority
+     def self.values
+       @values ||= {}
+     end
+
+     # Checks if fiber scheduling is currently enabled
+     #
+     # @return [Boolean] true if enabled, false otherwise
+     def self.enabled?
+       @enabled ||= false
+     end
+
+     # Enables fiber scheduling
+     #
+     # @return [void]
+     def self.enable!
+       reset! unless enabled?
+       @enabled = true
+     end
+
+     # Disables fiber scheduling
+     #
+     # @return [void]
+     def self.disable!
+       @enabled = false
+     end
+
+     # Resets the scheduler state, and disables the scheduler. Use this before retrying failed authorities.
+     #
+     # @return [void]
+     def self.reset!
+       @registry = []
+       @exceptions = {}
+       @values = {}
+       @enabled = false
+       @delay_requested = 0.0
+       @time_slept = 0.0
+       @resume_count = 0
+       @initial_resume_at = Time.now - 60.0 # one minute ago
+     end
+
+     # Registers a block to scrape for a specific authority
+     #
+     # @param authority [String] the name of the authority being processed
+     # @yield to the block containing the scraping operation to be run in the fiber
+     # @return [Fiber] the created fiber that calls the block. With @authority and @resume_at instance variables
+     def self.register_operation(authority, &block)
+       # Automatically enable fiber scheduling when operations are registered
+       enable!
+
+       fiber = Fiber.new do
+         begin
+           values[authority] = block.call
+         rescue StandardError => e
+           # Store exception against the authority
+           exceptions[authority] = e
+         ensure
+           # Remove itself when done regardless of success/failure
+           registry.delete(Fiber.current)
+         end
+       end
+
+       # Start fibres in registration order
+       @initial_resume_at += 0.1
+       fiber.instance_variable_set(:@resume_at, @initial_resume_at)
+       fiber.instance_variable_set(:@authority, authority)
+       registry << fiber
+
+       puts "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving" if ENV['DEBUG']
+       # Important: Don't immediately resume the fiber here
+       # Let the caller decide when to start or coordinate fibers
+       fiber
+     end
+
+     # Run all registered fibers until completion
+     #
+     # @return [Hash] Exceptions that occurred during execution
+     def self.run_all
+       count = registry.size
+       while (fiber = find_earliest_fiber)
+         if fiber.alive?
+           authority = fiber.instance_variable_get(:@authority) rescue nil
+           @resume_count ||= 0
+           @resume_count += 1
+           values[authority] = fiber.resume
+         else
+           puts "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
+           registry.delete(fiber)
+         end
+       end
+
+       percent_slept = (100.0 * @time_slept / @delay_requested).round(1) if @time_slept&.positive? && @delay_requested&.positive?
+       puts "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, sleeping " \
+         "#{percent_slept}% (#{@time_slept&.round(1)}) of the #{@delay_requested&.round(1)} seconds requested."
+
+       exceptions
+     end
+
+     # Delays the current fiber and potentially runs another one
+     # Falls back to regular sleep if fiber scheduling is not enabled
+     #
+     # @param seconds [Numeric] the number of seconds to delay
+     # @return [Integer] return from sleep operation or 0
+     def self.delay(seconds)
+       seconds = 0.0 unless seconds&.positive?
+       @delay_requested ||= 0.0
+       @delay_requested += seconds
+
+       current_fiber = Fiber.current
+
+       if !enabled? || !current_fiber || registry.size <= 1
+         @time_slept ||= 0.0
+         @time_slept += seconds
+         return sleep(seconds)
+       end
+
+       resume_at = Time.now + seconds
+
+       # Used to compare when other fibers need to be resumed
+       current_fiber.instance_variable_set(:@resume_at, resume_at)
+
+       # Yield control back to the scheduler so another fiber can run
+       Fiber.yield
+
+       # When we get control back, check if we need to sleep more
+       remaining = resume_at - Time.now
+       if remaining.positive?
+         @time_slept ||= 0.0
+         @time_slept += remaining
+         sleep(remaining)
+       end || 0
+     end
+
+     # Finds the fiber with the earliest wake-up time
+     #
+     # @return [Fiber, nil] the fiber with the earliest wake-up time or nil if none found
+     def self.find_earliest_fiber
+       earliest_time = nil
+       earliest_fiber = nil
+
+       registry.each do |fiber|
+         resume_at = fiber.instance_variable_get(:@resume_at)
+         if earliest_time.nil? || resume_at < earliest_time
+           earliest_time = resume_at
+           earliest_fiber = fiber
+         end
+       end
+
+       earliest_fiber
+     end
+
+     # Mark methods as private
+     private_class_method :find_earliest_fiber
+   end
+ end
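
Putting the pieces together, the intended flow is to register one block per authority and let run_all interleave them; a condensed sketch (the authority names and the per-authority work are placeholders):

    # Hypothetical usage only.
    authorities = %i[example_north example_south]

    authorities.each do |authority|
      ScraperUtils::FiberScheduler.register_operation(authority) do
        ScraperUtils::DataQualityMonitor.start_authority(authority)
        # ... fetch and save records for this authority ...
        # Use FiberScheduler.delay rather than sleep so the scheduler can
        # switch to another authority's fiber while this one waits.
        ScraperUtils::FiberScheduler.delay(2.0)
      end
    end

    exceptions = ScraperUtils::FiberScheduler.run_all # exceptions keyed by authority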
@@ -13,13 +13,10 @@ module ScraperUtils
    # @param start_time [Time] When this scraping attempt was started
    # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
    # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-   # @param results [Hash] Results for each authority containing:
-   #   - :records_scraped [Integer] Number of records successfully scraped
-   #   - :unprocessable_records [Integer] Optional Number of unprocessable record like regions
-   #   - :error [Exception, nil] Any exception that occurred during scraping
-   #   - :proxy_used [Boolean] Whether a proxy was used
+   # @param exceptions [Hash{Symbol => Exception}] Any exception that occurred during scraping
+   #   DataQualityMonitor.stats is checked for :saved and :unprocessed entries
    # @return [void]
-   def self.log_scraping_run(start_time, attempt, authorities, results)
+   def self.log_scraping_run(start_time, attempt, authorities, exceptions)
      raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
      raise ArgumentError, "Authorities must be a non-empty array" if authorities.empty?

@@ -31,10 +28,11 @@ module ScraperUtils
      interrupted = []

      authorities.each do |authority_label|
-       result = results[authority_label] || {}
+       stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority_label, nil) || {}

-       status = if result[:records_scraped]&.positive?
-                  result[:error] ? :interrupted : :successful
+       exception = exceptions[authority_label]
+       status = if stats[:saved]&.positive?
+                  exception ? :interrupted : :successful
                 else
                   :failed
                 end
@@ -51,13 +49,12 @@ module ScraperUtils
          "run_at" => start_time.iso8601,
          "attempt" => attempt,
          "authority_label" => authority_label.to_s,
-         "records_scraped" => result[:records_scraped] || 0,
-         "unprocessable_records" => result[:unprocessable_records] || 0,
-         "used_proxy" => result[:proxy_used] ? 1 : 0,
+         "records_saved" => stats[:saved] || 0,
+         "unprocessable_records" => stats[:unprocessed] || 0,
          "status" => status.to_s,
-         "error_message" => result[:error]&.message,
-         "error_class" => result[:error]&.class&.to_s,
-         "error_backtrace" => extract_meaningful_backtrace(result[:error])
+         "error_message" => exception&.message,
+         "error_class" => exception&.class&.to_s,
+         "error_backtrace" => extract_meaningful_backtrace(exception)
        }

        save_log_record(record)
@@ -76,33 +73,66 @@ module ScraperUtils
      cleanup_old_records
    end

-   def self.report_on_results(authorities, results)
-     expect_bad = ENV["MORPH_EXPECT_BAD"]&.split(",")&.map(&:to_sym) || []
-
-     puts "MORPH_EXPECT_BAD=#{ENV['MORPH_EXPECT_BAD']}" if expect_bad.any?
+   # Report on the results
+   # @param authorities [Array<Symbol>] List of authorities attempted to scrape
+   # @param exceptions [Hash{Symbol => Exception}] Any exception that occurred during scraping
+   #   DataQualityMonitor.stats is checked for :saved and :unprocessed entries
+   # @return [void]
+   def self.report_on_results(authorities, exceptions)
+     expect_bad = ENV["MORPH_EXPECT_BAD"]&.split(",")&.map(&:strip)&.map(&:to_sym) || []
+
+     puts "MORPH_EXPECT_BAD=#{ENV.fetch('MORPH_EXPECT_BAD', nil)}" if expect_bad.any?
+
+     # Print summary table
+     puts "\nScraping Summary:"
+     summary_format = "%-20s %6s %6s %s"
+
+     puts summary_format % %w[Authority OK Bad Exception]
+     puts summary_format % ['-' * 20, '-' * 6, '-' * 6, '-' * 50]
+
+     authorities.each do |authority|
+       stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
+
+       ok_records = stats[:saved] || 0
+       bad_records = stats[:unprocessed] || 0
+
+       expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
+       exception_msg = if exceptions[authority]
+                         "#{exceptions[authority].class} - #{exceptions[authority].message}"
+                       else
+                         "-"
+                       end
+       puts summary_format % [
+         authority.to_s,
+         ok_records,
+         bad_records,
+         "#{expect_bad_prefix}#{exception_msg}".slice(0, 70)
+       ]
+     end
+     puts

      errors = []

      # Check for authorities that were expected to be bad but are now working
      unexpected_working = expect_bad.select do |authority|
-       result = results[authority]
-       result && result[:records_scraped]&.positive? && result[:error].nil?
+       stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
+       stats[:saved]&.positive? && !exceptions[authority]
      end

      if unexpected_working.any?
-       errors << "WARNING: Remove #{unexpected_working.join(',')} from EXPECT_BAD as it now works!"
+       errors << "WARNING: Remove #{unexpected_working.join(',')} from MORPH_EXPECT_BAD as it now works!"
      end

      # Check for authorities with unexpected errors
      unexpected_errors = authorities
-       .select { |authority| results[authority]&.dig(:error) }
+       .select { |authority| exceptions[authority] }
        .reject { |authority| expect_bad.include?(authority) }

      if unexpected_errors.any?
        errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
-         "(Add to MORPH_EXPECT_BAD?)"
+                 "(Add to MORPH_EXPECT_BAD?)"
        unexpected_errors.each do |authority|
-         error = results[authority][:error]
+         error = exceptions[authority]
          errors << " #{authority}: #{error.class} - #{error.message}"
        end
      end
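
With the new signature, the scraper passes the exceptions hash collected by FiberScheduler straight through, and record counts come from DataQualityMonitor.stats rather than a per-authority results hash. A hedged sketch, assuming the methods above live in ScraperUtils::LogUtils:

    # Hypothetical wrap-up of a scraping run.
    start_time = Time.now
    authorities = %i[example_north example_south] # placeholder list
    exceptions = ScraperUtils::FiberScheduler.run_all

    ScraperUtils::LogUtils.log_scraping_run(start_time, 1, authorities, exceptions)
    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)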
@@ -134,7 +164,8 @@ module ScraperUtils
        "interrupted" => interrupted.join(","),
        "successful_count" => successful.size,
        "interrupted_count" => interrupted.size,
-       "failed_count" => failed.size
+       "failed_count" => failed.size,
+       "public_ip" => ScraperUtils::MechanizeUtils.public_ip
      }

      ScraperWiki.save_sqlite(