scraper_utils 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/.gitignore +39 -9
  3. data/CHANGELOG.md +34 -0
  4. data/README.md +7 -55
  5. data/docs/enhancing_specs.md +86 -47
  6. data/docs/example_custom_Rakefile +38 -0
  7. data/docs/example_dot_scraper_validation.yml +23 -0
  8. data/docs/mechanize_utilities.md +0 -3
  9. data/docs/testing_custom_scrapers.md +74 -0
  10. data/exe/validate_scraper_data +150 -0
  11. data/lib/scraper_utils/log_utils.rb +5 -5
  12. data/lib/scraper_utils/maths_utils.rb +23 -0
  13. data/lib/scraper_utils/mechanize_utils/agent_config.rb +9 -65
  14. data/lib/scraper_utils/mechanize_utils.rb +0 -2
  15. data/lib/scraper_utils/spec_support.rb +189 -6
  16. data/lib/scraper_utils/version.rb +1 -1
  17. data/lib/scraper_utils.rb +1 -5
  18. data/scraper_utils.gemspec +1 -0
  19. metadata +11 -24
  20. data/docs/example_scrape_with_fibers.rb +0 -31
  21. data/docs/fibers_and_threads.md +0 -72
  22. data/docs/interleaving_requests.md +0 -33
  23. data/docs/parallel_requests.md +0 -138
  24. data/docs/randomizing_requests.md +0 -38
  25. data/docs/reducing_server_load.md +0 -63
  26. data/lib/scraper_utils/cycle_utils.rb +0 -26
  27. data/lib/scraper_utils/date_range_utils.rb +0 -118
  28. data/lib/scraper_utils/mechanize_actions.rb +0 -183
  29. data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +0 -80
  30. data/lib/scraper_utils/mechanize_utils/robots_checker.rb +0 -151
  31. data/lib/scraper_utils/randomize_utils.rb +0 -37
  32. data/lib/scraper_utils/scheduler/constants.rb +0 -12
  33. data/lib/scraper_utils/scheduler/operation_registry.rb +0 -101
  34. data/lib/scraper_utils/scheduler/operation_worker.rb +0 -199
  35. data/lib/scraper_utils/scheduler/process_request.rb +0 -59
  36. data/lib/scraper_utils/scheduler/thread_request.rb +0 -51
  37. data/lib/scraper_utils/scheduler/thread_response.rb +0 -59
  38. data/lib/scraper_utils/scheduler.rb +0 -286
data/lib/scraper_utils/log_utils.rb CHANGED
@@ -9,12 +9,12 @@ module ScraperUtils
     LOG_TABLE = "scrape_log"
     LOG_RETENTION_DAYS = 30

-    # Logs a message, automatically prefixing with authority name if in a fiber
+    # Logs a message, automatically prefixing with authority name if in a sub process
     #
     # @param message [String] the message to log
     # @return [void]
     def self.log(message, authority = nil)
-      authority ||= Scheduler.current_authority
+      authority ||= ENV['AUTHORITY']
      $stderr.flush
      if authority
        puts "[#{authority}] #{message}"
@@ -174,12 +174,12 @@ module ScraperUtils

      # Check for authorities with unexpected errors
      unexpected_errors = authorities
-                           .select { |authority| exceptions[authority] }
-                           .reject { |authority| expect_bad.include?(authority) }
+                           .select { |authority| exceptions[authority] }
+                           .reject { |authority| expect_bad.include?(authority) }

      if unexpected_errors.any?
        errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
-                  "(Add to MORPH_EXPECT_BAD?)"
+                  "(Add to MORPH_EXPECT_BAD?)"
        unexpected_errors.each do |authority|
          error = exceptions[authority]
          errors << " #{authority}: #{error.class} - #{error}"
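
With the Scheduler gone, the authority prefix now comes from the AUTHORITY environment variable when no explicit authority is passed. A minimal sketch of the resulting behaviour, based only on the lines above (the values are illustrative):

  # Minimal sketch of LogUtils.log after this change (values are illustrative)
  ENV['AUTHORITY'] = 'example_council'
  ScraperUtils::LogUtils.log("Collected 12 records")         # prints: [example_council] Collected 12 records
  ScraperUtils::LogUtils.log("Collected 12 records", :other) # prints: [other] Collected 12 records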
data/lib/scraper_utils/maths_utils.rb ADDED
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+require "scraperwiki"
+
+module ScraperUtils
+  # Misc Maths Utilities
+  module MathsUtils
+    # Generate a fibonacci series
+    # @param max [Integer] The max the sequence goes up to
+    # @return [Array<Integer>] The fibonacci numbers up to max
+    def self.fibonacci_series(max)
+      result = []
+      # Start with the basic Fibonacci sequence
+      last_fib, this_fib = 1, 0
+      while this_fib <= max
+        result << this_fib
+        yield this_fib if block_given?
+        last_fib, this_fib = this_fib, this_fib + last_fib
+      end
+      result
+    end
+  end
+end
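
MathsUtils.fibonacci_series returns (and optionally yields) every Fibonacci number up to max. A quick sketch of the behaviour implied by the code above, including how spec_support.rb (further down) uses it to pick a spread of record indices to spot-check:

  # Sketch of the new helper's behaviour (derived from the code above)
  ScraperUtils::MathsUtils.fibonacci_series(20)
  # => [0, 1, 1, 2, 3, 5, 8, 13]

  # spec_support.rb uses it to choose which records to spot-check, e.g. for 10 results:
  ScraperUtils::MathsUtils.fibonacci_series(10 - 1).uniq
  # => [0, 1, 2, 3, 5, 8]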
data/lib/scraper_utils/mechanize_utils/agent_config.rb CHANGED
@@ -10,8 +10,7 @@ module ScraperUtils
    #
    # @example Setting global defaults
    #   ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
-   #     config.default_timeout = 90
-   #     config.default_random_delay = 5
+   #     config.default_timeout = 500
    #   end
    #
    # @example Creating an instance with defaults
@@ -24,24 +23,12 @@ module ScraperUtils
    #   )
    class AgentConfig
      DEFAULT_TIMEOUT = 60
-     DEFAULT_RANDOM_DELAY = 0
-     DEFAULT_MAX_LOAD = 50.0
-     MAX_LOAD_CAP = 80.0

      # Class-level defaults that can be modified
      class << self
        # @return [Integer] Default timeout in seconds for agent connections
        attr_accessor :default_timeout

-       # @return [Boolean] Default setting for compliance with headers and robots.txt
-       attr_accessor :default_compliant_mode
-
-       # @return [Integer, nil] Default average random delay in seconds
-       attr_accessor :default_random_delay
-
-       # @return [Float, nil] Default maximum server load percentage (nil = no response delay)
-       attr_accessor :default_max_load
-
        # @return [Boolean] Default setting for SSL certificate verification
        attr_accessor :default_disable_ssl_certificate_check

@@ -55,9 +42,7 @@ module ScraperUtils
        # @yield [self] Yields self for configuration
        # @example
        #   AgentConfig.configure do |config|
-       #     config.default_timeout = 90
-       #     config.default_random_delay = 5
-       #     config.default_max_load = 15
+       #     config.default_timeout = 300
        #   end
        # @return [void]
        def configure
@@ -68,9 +53,6 @@ module ScraperUtils
        # @return [void]
        def reset_defaults!
          @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
-         @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
-         @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 0
-         @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 50.0
          @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
          @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
          @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
@@ -89,10 +71,6 @@ module ScraperUtils

      # Creates Mechanize agent configuration with sensible defaults overridable via configure
      # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
-     # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true)
-     # @param random_delay [Integer, nil] Average random delay in seconds (default: 3)
-     # @param max_load [Float, nil] Maximum server load percentage (nil = no delay, default: 20%)
-     #   When compliant_mode is true, max_load is capped at 33%
      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false)
      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false)
      # @param user_agent [String, nil] Configure Mechanize user agent
@@ -104,10 +82,6 @@ module ScraperUtils
                     australian_proxy: nil,
                     user_agent: nil)
        @timeout = timeout.nil? ? self.class.default_timeout : timeout
-       @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
-       @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
-       @max_load = max_load.nil? ? self.class.default_max_load : max_load
-       @max_load = [@max_load || DEFAULT_MAX_LOAD, MAX_LOAD_CAP].min if @compliant_mode
        @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent

        @disable_ssl_certificate_check = if disable_ssl_certificate_check.nil?
@@ -144,13 +118,9 @@ module ScraperUtils

        today = Date.today.strftime("%Y-%m-%d")
        @user_agent = ENV.fetch("MORPH_USER_AGENT", nil)&.sub("TODAY", today)
-       if @compliant_mode
-         version = ScraperUtils::VERSION
-         @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
-       end
+       version = ScraperUtils::VERSION
+       @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"

-       @robots_checker = RobotsChecker.new(@user_agent) if @user_agent
-       @adaptive_delay = AdaptiveDelay.new(max_load: @max_load) if @max_load
        display_options
      end

@@ -164,13 +134,11 @@ module ScraperUtils
          agent.open_timeout = @timeout
          agent.read_timeout = @timeout
        end
-       if @compliant_mode
-         agent.user_agent = user_agent
-         agent.request_headers ||= {}
-         agent.request_headers["Accept"] =
-           "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-         agent.request_headers["Upgrade-Insecure-Requests"] = "1"
-       end
+       agent.user_agent = user_agent
+       agent.request_headers ||= {}
+       agent.request_headers["Accept"] =
+         "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+       agent.request_headers["Upgrade-Insecure-Requests"] = "1"
        if @australian_proxy
          agent.agent.set_proxy(ScraperUtils.australian_proxy)
          agent.request_headers["Accept-Language"] = "en-AU,en-US;q=0.9,en;q=0.8"
@@ -191,9 +159,6 @@ module ScraperUtils
                        else
                          "australian_proxy=#{@australian_proxy.inspect}"
                        end
-       display_args << "compliant_mode" if @compliant_mode
-       display_args << "random_delay=#{@random_delay}" if @random_delay&.positive?
-       display_args << "max_load=#{@max_load}%" if @max_load
        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
        display_args << "default args" if display_args.empty?
        ScraperUtils::LogUtils.log(
@@ -220,27 +185,6 @@ module ScraperUtils
            "after #{response_time} seconds"
          )
        end
-
-       if @robots_checker&.disallowed?(uri)
-         raise ScraperUtils::UnprocessableSite,
-               "URL is disallowed by robots.txt specific rules: #{uri}"
-       end
-
-       @delay_till = nil
-       @delay = @robots_checker&.crawl_delay&.round(3)
-       debug_msg = "Delaying robots.txt: crawl_delay #{@delay} seconds"
-       unless @delay&.positive?
-         delays = {
-           max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
-           random: (@random_range ? (rand(@random_range) ** 2).round(3) : nil)
-         }
-         @delay = [delays[:max_load], delays[:random]].compact.sum
-         debug_msg = "Delaying #{@delay} seconds, sum of: #{delays.inspect}"
-       end
-       if @delay&.positive?
-         @delay_till = Time.now + @delay
-         ScraperUtils::LogUtils.log(debug_msg) if ScraperUtils::DebugUtils.basic?
-       end
        response
      end

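With compliant_mode, random_delay and max_load removed, the remaining configuration surface is the timeout, SSL-certificate check, Australian proxy and user agent. A minimal sketch of what is left, using only the options visible in the diff above (the values chosen are illustrative):

  # Sketch using only options visible in the diff above (values are illustrative)
  ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
    config.default_timeout = 300 # or set MORPH_CLIENT_TIMEOUT
  end

  config = ScraperUtils::MechanizeUtils::AgentConfig.new(
    timeout: 120,
    disable_ssl_certificate_check: false,
    australian_proxy: true
  )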
data/lib/scraper_utils/mechanize_utils.rb CHANGED
@@ -3,9 +3,7 @@
 require "mechanize"
 require "ipaddr"

-require_relative "mechanize_utils/adaptive_delay"
 require_relative "mechanize_utils/agent_config"
-require_relative "mechanize_utils/robots_checker"

 module ScraperUtils
   # Utilities for configuring and using Mechanize for web scraping
data/lib/scraper_utils/spec_support.rb CHANGED
@@ -1,6 +1,7 @@
 # frozen_string_literal: true

 require "scraperwiki"
+require "cgi"

 module ScraperUtils
   # Methods to support specs
@@ -35,6 +36,41 @@ module ScraperUtils

    AUSTRALIAN_POSTCODES = /\b\d{4}\b/.freeze

+    def self.fetch_url_with_redirects(url)
+      agent = Mechanize.new
+      # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
+      agent.get(url)
+    end
+
+    def self.authority_label(results, prefix: '', suffix: '')
+      return nil if results.nil?
+
+      authority_labels = results.map { |record| record['authority_label'] }.compact.uniq
+      return nil if authority_labels.empty?
+
+      raise "Expected one authority_label, not #{authority_labels.inspect}" if authority_labels.size > 1
+      "#{prefix}#{authority_labels.first}#{suffix}"
+    end
+
+    # Validates enough addresses are geocodable
+    # @param results [Array<Hash>] The results from scraping an authority
+    # @param percentage [Integer] The min percentage of addresses expected to be geocodable (default:50)
+    # @param variation [Integer] The variation allowed in addition to percentage (default:3)
+    # @raise RuntimeError if insufficient addresses are geocodable
+    def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
+      return nil if results.empty?
+
+      geocodable = results
+                     .map { |record| record["address"] }
+                     .uniq
+                     .count { |text| ScraperUtils::SpecSupport.geocodable? text }
+      puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
+           "(#{(100.0 * geocodable / results.count).round(1)}%)"
+      expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
+      raise "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}" unless geocodable >= expected
+      geocodable
+    end
+
    # Check if an address is likely to be geocodable by analyzing its format.
    # This is a bit stricter than needed - typically assert >= 75% match
    # @param address [String] The address to check
@@ -43,7 +79,7 @@
      return false if address.nil? || address.empty?
      check_address = ignore_case ? address.upcase : address

-     # Basic structure check - must have a street name, suburb, state and postcode
+     # Basic structure check - must have a street type or unit/lot, uppercase suburb or postcode, state
      has_state = AUSTRALIAN_STATES.any? { |state| check_address.end_with?(" #{state}") || check_address.include?(" #{state} ") }
      has_postcode = address.match?(AUSTRALIAN_POSTCODES)

@@ -52,20 +88,20 @@

      has_unit_or_lot = address.match?(/\b(Unit|Lot:?)\s+\d+/i)

-     has_suburb_stats = check_address.match?(/(\b[A-Z]{2,}(\s+[A-Z]+)*,?|,\s+[A-Z][A-Za-z ]+)(\s+\d{4})?\s+(#{AUSTRALIAN_STATES.join('|')})\b/)
+     uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
+     has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }

      if ENV["DEBUG"]
        missing = []
        unless has_street_type || has_unit_or_lot
          missing << "street type / unit / lot"
        end
+       missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
        missing << "state" unless has_state
-       missing << "postcode" unless has_postcode
-       missing << "suburb state" unless has_suburb_stats
        puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
      end

-     (has_street_type || has_unit_or_lot) && has_state && has_postcode && has_suburb_stats
+     (has_street_type || has_unit_or_lot) && (has_postcode || has_uppercase_suburb) && has_state
    end

    PLACEHOLDERS = [
@@ -80,11 +116,158 @@ module ScraperUtils
      PLACEHOLDERS.any? { |placeholder| text.to_s.match?(placeholder) }
    end

+    # Validates enough descriptions are reasonable
+    # @param results [Array<Hash>] The results from scraping an authority
+    # @param percentage [Integer] The min percentage of descriptions expected to be reasonable (default:50)
+    # @param variation [Integer] The variation allowed in addition to percentage (default:3)
+    # @raise RuntimeError if insufficient descriptions are reasonable
+    def self.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3)
+      return nil if results.empty?
+
+      descriptions = results
+                       .map { |record| record["description"] }
+                       .uniq
+                       .count do |text|
+        selected = ScraperUtils::SpecSupport.reasonable_description? text
+        puts " description: #{text} is not reasonable" if ENV["DEBUG"] && !selected
+        selected
+      end
+      puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
+           "(#{(100.0 * descriptions / results.count).round(1)}%)"
+      expected = [(percentage.to_f / 100.0) * results.count - variation, 1].max
+      raise "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}" unless descriptions >= expected
+      descriptions
+    end
+
    # Check if this looks like a "reasonable" description
    # This is a bit stricter than needed - typically assert >= 75% match
    def self.reasonable_description?(text)
      !placeholder?(text) && text.to_s.split.size >= 3
    end
+
+    # Validates that all records use the expected global info_url and it returns 200
+    # @param results [Array<Hash>] The results from scraping an authority
+    # @param expected_url [String] The expected global info_url for this authority
+    # @raise RuntimeError if records don't use the expected URL or it doesn't return 200
+    def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false)
+      info_urls = results.map { |record| record["info_url"] }.uniq
+
+      unless info_urls.size == 1
+        raise "Expected all records to use one info_url '#{expected_url}', found: #{info_urls.size}"
+      end
+      unless info_urls.first == expected_url
+        raise "Expected all records to use global info_url '#{expected_url}', found: #{info_urls.first}"
+      end
+
+      puts "Checking the one expected info_url returns 200: #{expected_url}"
+
+      if defined?(VCR)
+        VCR.use_cassette("#{authority_label(results, suffix: '_')}one_info_url") do
+          page = fetch_url_with_redirects(expected_url)
+          validate_page_response(page, bot_check_expected)
+        end
+      else
+        page = fetch_url_with_redirects(expected_url)
+        validate_page_response(page, bot_check_expected)
+      end
+    end
+
+    # Validates that info_urls have expected details (unique URLs with content validation)
+    # @param results [Array<Hash>] The results from scraping an authority
+    # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
+    # @param variation [Integer] The variation allowed in addition to percentage (default:3)
+    # @raise RuntimeError if insufficient detail checks pass
+    def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false)
+      if defined?(VCR)
+        VCR.use_cassette("#{authority_label(results, suffix: '_')}info_url_details") do
+          check_info_url_details(results, percentage, variation, bot_check_expected)
+        end
+      else
+        check_info_url_details(results, percentage, variation, bot_check_expected)
+      end
+    end
+
+    # Check if the page response indicates bot protection
+    # @param page [Mechanize::Page] The page response to check
+    # @return [Boolean] True if bot protection is detected
+    def self.bot_protection_detected?(page)
+      return true if %w[403 429].include?(page.code)
+
+      return false unless page.body
+
+      body_lower = page.body.downcase
+
+      # Check for common bot protection indicators
+      bot_indicators = [
+        'recaptcha',
+        'cloudflare',
+        'are you human',
+        'bot detection',
+        'security check',
+        'verify you are human',
+        'access denied',
+        'blocked',
+        'captcha'
+      ]
+
+      bot_indicators.any? { |indicator| body_lower.include?(indicator) }
+    end
+
+    # Validate page response, accounting for bot protection
+    # @param page [Mechanize::Page] The page response to validate
+    # @param bot_check_expected [Boolean] Whether bot protection is acceptable
+    # @raise RuntimeError if page response is invalid and bot protection not expected
+    def self.validate_page_response(page, bot_check_expected)
+      if bot_check_expected && bot_protection_detected?(page)
+        puts " Bot protection detected - accepting as valid response"
+        return
+      end
+
+      raise "Expected 200 response from the one expected info_url, got #{page.code}" unless page.code == "200"
+    end
+
+    private
+
+    def self.check_info_url_details(results, percentage, variation, bot_check_expected)
+      count = 0
+      failed = 0
+      fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
+
+      fib_indices.each do |index|
+        record = results[index]
+        info_url = record["info_url"]
+        puts "Checking info_url[#{index}]: #{info_url} has the expected reference, address and description..."
+
+        page = fetch_url_with_redirects(info_url)
+
+        if bot_check_expected && bot_protection_detected?(page)
+          puts " Bot protection detected - skipping detailed validation"
+          next
+        end
+
+        raise "Expected 200 response, got #{page.code}" unless page.code == "200"
+
+        page_body = page.body.dup.force_encoding("UTF-8").gsub(/\s\s+/, " ")
+
+        %w[council_reference address description].each do |attribute|
+          count += 1
+          expected = CGI.escapeHTML(record[attribute]).gsub(/\s\s+/, " ")
+          expected2 = expected.gsub(/(\S+)\s+(\S+)\z/, '\2 \1') # Handle Lismore post-code/state swap
+
+          next if page_body.include?(expected) || page_body.include?(expected2)
+
+          failed += 1
+          puts " Missing: #{expected}"
+          puts " IN: #{page_body}" if ENV['DEBUG']
+
+          min_required = [((percentage.to_f / 100.0) * count - variation), 1].max
+          passed = count - failed
+          raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
+        end
+      end
+
+      puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!" if count > 0
+    end
+
   end
 end
-
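
The new validators are meant to run against an authority's scraped results. A hedged sketch of how they might be called — the ScraperWiki.select call, URL and thresholds are assumptions; the validator names, parameters and defaults come from the diff above:

  # Hedged sketch; URL and thresholds are illustrative only
  results = ScraperWiki.select("* from data where authority_label = 'example_council'")

  ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results, percentage: 70, variation: 3)
  ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(results, percentage: 55, variation: 3)
  ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(
    results, "https://planning.example.gov.au/applications", bot_check_expected: true
  )
  ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3)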
data/lib/scraper_utils/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true

 module ScraperUtils
-  VERSION = "0.8.2"
+  VERSION = "0.8.3"
 end
data/lib/scraper_utils.rb CHANGED
@@ -4,18 +4,14 @@ require "scraper_utils/version"

 # Public Apis (responsible for requiring their own dependencies)
 require "scraper_utils/authority_utils"
-require "scraper_utils/cycle_utils"
 require "scraper_utils/data_quality_monitor"
-require "scraper_utils/date_range_utils"
 require "scraper_utils/db_utils"
 require "scraper_utils/debug_utils"
 require "scraper_utils/log_utils"
-require "scraper_utils/randomize_utils"
-require "scraper_utils/scheduler"
+require "scraper_utils/maths_utils"
 require "scraper_utils/spec_support"

 # Mechanize utilities
-require "scraper_utils/mechanize_actions"
 require "scraper_utils/mechanize_utils"

 # Utilities for planningalerts scrapers
data/scraper_utils.gemspec CHANGED
@@ -41,5 +41,6 @@ Gem::Specification.new do |spec|
   spec.add_dependency "mechanize"
   spec.add_dependency "nokogiri"
   spec.add_dependency "sqlite3"
+  # Do NOT add - it depends on a non-default branch spec.add_dependency "scraperwiki"
   spec.metadata["rubygems_mfa_required"] = "true"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.8.2
+  version: 0.8.3
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-05-06 00:00:00.000000000 Z
+date: 2025-07-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -56,7 +56,8 @@ description: Utilities to help make planningalerts scrapers, especially multi au
   scrapers, easier to develop, run and debug.
 email:
 - ian@heggie.biz
-executables: []
+executables:
+- validate_scraper_data
 extensions: []
 extra_rdoc_files: []
 files:
@@ -79,36 +80,22 @@ files:
 - bin/setup
 - docs/debugging.md
 - docs/enhancing_specs.md
-- docs/example_scrape_with_fibers.rb
+- docs/example_custom_Rakefile
+- docs/example_dot_scraper_validation.yml
 - docs/example_scraper.rb
-- docs/fibers_and_threads.md
 - docs/getting_started.md
-- docs/interleaving_requests.md
 - docs/mechanize_utilities.md
-- docs/parallel_requests.md
-- docs/randomizing_requests.md
-- docs/reducing_server_load.md
+- docs/testing_custom_scrapers.md
+- exe/validate_scraper_data
 - lib/scraper_utils.rb
 - lib/scraper_utils/authority_utils.rb
-- lib/scraper_utils/cycle_utils.rb
 - lib/scraper_utils/data_quality_monitor.rb
-- lib/scraper_utils/date_range_utils.rb
 - lib/scraper_utils/db_utils.rb
 - lib/scraper_utils/debug_utils.rb
 - lib/scraper_utils/log_utils.rb
-- lib/scraper_utils/mechanize_actions.rb
+- lib/scraper_utils/maths_utils.rb
 - lib/scraper_utils/mechanize_utils.rb
-- lib/scraper_utils/mechanize_utils/adaptive_delay.rb
 - lib/scraper_utils/mechanize_utils/agent_config.rb
-- lib/scraper_utils/mechanize_utils/robots_checker.rb
-- lib/scraper_utils/randomize_utils.rb
-- lib/scraper_utils/scheduler.rb
-- lib/scraper_utils/scheduler/constants.rb
-- lib/scraper_utils/scheduler/operation_registry.rb
-- lib/scraper_utils/scheduler/operation_worker.rb
-- lib/scraper_utils/scheduler/process_request.rb
-- lib/scraper_utils/scheduler/thread_request.rb
-- lib/scraper_utils/scheduler/thread_response.rb
 - lib/scraper_utils/spec_support.rb
 - lib/scraper_utils/version.rb
 - scraper_utils.gemspec
@@ -119,7 +106,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.8.2
+  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.8.3
   changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
   rubygems_mfa_required: 'true'
 post_install_message:
@@ -137,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.4.19
+rubygems_version: 3.4.10
 signing_key:
 specification_version: 4
 summary: planningalerts scraper utilities
data/docs/example_scrape_with_fibers.rb DELETED
@@ -1,31 +0,0 @@
-# frozen_string_literal: true
-
-# Example scrape method updated to use ScraperUtils::FibreScheduler
-
-def scrape(authorities, attempt)
-  ScraperUtils::Scheduler.reset!
-  exceptions = {}
-  authorities.each do |authority_label|
-    ScraperUtils::Scheduler.register_operation(authority_label) do
-      ScraperUtils::LogUtils.log(
-        "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
-      )
-      ScraperUtils::DataQualityMonitor.start_authority(authority_label)
-      YourScraper.scrape(authority_label) do |record|
-        record["authority_label"] = authority_label.to_s
-        ScraperUtils::DbUtils.save_record(record)
-      rescue ScraperUtils::UnprocessableRecord => e
-        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-        exceptions[authority_label] = e
-        # Continues processing other records
-      end
-    rescue StandardError => e
-      warn "#{authority_label}: ERROR: #{e}"
-      warn e.backtrace || "No backtrace available"
-      exceptions[authority_label] = e
-    end
-    # end of register_operation block
-  end
-  ScraperUtils::Scheduler.run_operations
-  exceptions
-end