scraper_utils 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +5 -8
- data/CHANGELOG.md +14 -0
- data/GUIDELINES.md +75 -0
- data/Gemfile +6 -3
- data/IMPLEMENTATION.md +33 -0
- data/README.md +226 -177
- data/SPECS.md +25 -0
- data/bin/console +1 -0
- data/bin/setup +2 -1
- data/docs/example_scrape_with_fibers.rb +31 -0
- data/docs/example_scraper.rb +93 -0
- data/lib/scraper_utils/adaptive_delay.rb +70 -0
- data/lib/scraper_utils/authority_utils.rb +2 -2
- data/lib/scraper_utils/data_quality_monitor.rb +64 -0
- data/lib/scraper_utils/date_range_utils.rb +159 -0
- data/lib/scraper_utils/db_utils.rb +1 -2
- data/lib/scraper_utils/debug_utils.rb +63 -23
- data/lib/scraper_utils/fiber_scheduler.rb +229 -0
- data/lib/scraper_utils/log_utils.rb +58 -25
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +276 -0
- data/lib/scraper_utils/mechanize_utils.rb +32 -30
- data/lib/scraper_utils/randomize_utils.rb +34 -0
- data/lib/scraper_utils/robots_checker.rb +149 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +6 -10
- data/scraper_utils.gemspec +3 -8
- metadata +17 -74
data/SPECS.md
ADDED
@@ -0,0 +1,25 @@
+SPECS
+=====
+
+These project specific Specifications go into further details than the
+installation and usage notes in `README.md`.
+
+ASK for clarification of any apparent conflicts with IMPLEMENTATION, GUIDELINES or project instructions.
+
+## Core Design Principles
+
+### Error Handling
+- Record-level errors abort only that record's processing
+- Allow up to 5 + 10% unprocessable records before failing
+- External service reliability (e.g., robots.txt) should not block core functionality
+
+### Rate Limiting
+- Honor site-specific rate limits when clearly specified
+- Apply adaptive delays based on response times
+- Use randomized delays to avoid looking like a bot
+- Support proxy configuration for geolocation needs
+
+### Testing
+- Ensure components are independently testable
+- Avoid timing-based tests in favor of logic validation
+- Keep test scenarios focused and under 20 lines
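
The "5 + 10% unprocessable records" rule above is plain arithmetic on the running count of saved records. A minimal illustrative sketch (the numbers are made up; the gem's actual check, which uses 5.01 to avoid float-equality edge cases, appears in lib/scraper_utils/data_quality_monitor.rb further down in this diff):

  saved = 40
  unprocessed = 9
  threshold = 5 + (saved * 0.1)          # => 9.0
  abort_site = unprocessed > threshold   # => false; a 10th unprocessable record would abort
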
data/bin/console
CHANGED
data/bin/setup
CHANGED
data/docs/example_scrape_with_fibers.rb
ADDED
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+# Example scrape method updated to use ScraperUtils::FiberScheduler
+
+def scrape(authorities, attempt)
+  ScraperUtils::FiberScheduler.reset!
+  exceptions = {}
+  authorities.each do |authority_label|
+    ScraperUtils::FiberScheduler.register_operation(authority_label) do
+      ScraperUtils::FiberScheduler.log(
+        "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
+      )
+      ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+      YourScraper.scrape(authority_label) do |record|
+        record["authority_label"] = authority_label.to_s
+        ScraperUtils::DbUtils.save_record(record)
+      rescue ScraperUtils::UnprocessableRecord => e
+        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+        exceptions[authority_label] = e
+        # Continues processing other records
+      end
+    rescue StandardError => e
+      warn "#{authority_label}: ERROR: #{e}"
+      warn e.backtrace || "No backtrace available"
+      exceptions[authority_label] = e
+    end
+    # end of register_operation block
+  end
+  ScraperUtils::FiberScheduler.run_all
+  exceptions
+end
data/docs/example_scraper.rb
ADDED
@@ -0,0 +1,93 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+$LOAD_PATH << "./lib"
+
+require "scraper_utils"
+require "technology_one_scraper"
+
+# Main Scraper class
+class Scraper
+  AUTHORITIES = YourScraper::AUTHORITIES
+
+  # ADD: attempt argument
+  def scrape(authorities, attempt)
+    exceptions = {}
+    # ADD: Report attempt number
+    authorities.each do |authority_label|
+      puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
+
+      begin
+        # REPLACE:
+        # YourScraper.scrape(authority_label) do |record|
+        #   record["authority_label"] = authority_label.to_s
+        #   YourScraper.log(record)
+        #   ScraperWiki.save_sqlite(%w[authority_label council_reference], record)
+        # end
+        # WITH:
+        ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+        YourScraper.scrape(authority_label) do |record|
+          begin
+            record["authority_label"] = authority_label.to_s
+            ScraperUtils::DbUtils.save_record(record)
+          rescue ScraperUtils::UnprocessableRecord => e
+            ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+            exceptions[authority_label] = e
+          end
+        end
+        # END OF REPLACE
+      end
+    rescue StandardError => e
+      warn "#{authority_label}: ERROR: #{e}"
+      warn e.backtrace
+      exceptions[authority_label] = e
+    end
+
+    exceptions
+  end
+
+  def self.selected_authorities
+    ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
+  end
+
+  def self.run(authorities)
+    puts "Scraping authorities: #{authorities.join(', ')}"
+    start_time = Time.now
+    exceptions = scrape(authorities, 1)
+    # Set start_time and attempt to the call above and log run below
+    ScraperUtils::LogUtils.log_scraping_run(
+      start_time,
+      1,
+      authorities,
+      exceptions
+    )
+
+    unless exceptions.empty?
+      puts "\n***************************************************"
+      puts "Now retrying authorities which earlier had failures"
+      puts exceptions.keys.join(", ").to_s
+      puts "***************************************************"
+
+      start_time = Time.now
+      exceptions = scrape(exceptions.keys, 2)
+      # Set start_time and attempt to the call above and log run below
+      ScraperUtils::LogUtils.log_scraping_run(
+        start_time,
+        2,
+        authorities,
+        exceptions
+      )
+    end
+
+    # Report on results, raising errors for unexpected conditions
+    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
+  end
+end
+
+if __FILE__ == $PROGRAM_NAME
+  # Default to list of authorities we can't or won't fix in code, explain why
+  # wagga: url redirects and then reports Application error
+
+  ENV["MORPH_EXPECT_BAD"] ||= "wagga"
+  Scraper.run(Scraper.selected_authorities)
+end
data/lib/scraper_utils/adaptive_delay.rb
ADDED
@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+
+require "uri"
+
+module ScraperUtils
+  # Adapts delays between requests based on server response times.
+  # Target delay is proportional to response time based on max_load setting.
+  # Uses an exponential moving average to smooth variations in response times.
+  class AdaptiveDelay
+    DEFAULT_MIN_DELAY = 0.0
+    DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
+
+    attr_reader :min_delay, :max_delay, :max_load
+
+    # Creates a new adaptive delay calculator
+    #
+    # @param min_delay [Float] Minimum delay between requests in seconds
+    # @param max_delay [Float] Maximum delay between requests in seconds
+    # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+    #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+    def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+      @delays = {} # domain -> last delay used
+      @min_delay = min_delay.to_f
+      @max_delay = max_delay.to_f
+      @max_load = max_load.to_f.clamp(1.0, 99.0)
+      @response_multiplier = (100.0 - @max_load) / @max_load
+
+      return unless DebugUtils.basic?
+
+      ScraperUtils::FiberScheduler.log(
+        "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
+        "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
+      )
+    end
+
+    # @param uri [URI::Generic, String] The URL to extract the domain from
+    # @return [String] The domain in the format "scheme://host"
+    def domain(uri)
+      uri = URI(uri) unless uri.is_a?(URI)
+      "#{uri.scheme}://#{uri.host}".downcase
+    end
+
+    # @param uri [URI::Generic, String] URL to get delay for
+    # @return [Float] Current delay for the domain, or min_delay if no delay set
+    def delay(uri)
+      @delays[domain(uri)] || @min_delay
+    end
+
+    # @param uri [URI::Generic, String] URL the response came from
+    # @param response_time [Float] Time in seconds the server took to respond
+    # @return [Float] The calculated delay to use with the next request
+    def next_delay(uri, response_time)
+      uris_domain = domain(uri)
+      target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+      current_delay = @delays[uris_domain] || target_delay
+      delay = ((9.0 * current_delay) + target_delay) / 10.0
+      delay = delay.clamp(@min_delay, @max_delay)
+
+      if DebugUtils.basic?
+        ScraperUtils::FiberScheduler.log(
+          "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
+          "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
+        )
+      end
+
+      @delays[uris_domain] = delay
+      delay
+    end
+  end
+end
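
As the class comment above states, AdaptiveDelay keeps an exponential moving average per domain of a target delay proportional to the response time; with the default max_load of 20%, the multiplier is (100 - 20) / 20 = 4. A rough usage sketch, assuming the caller applies the returned delay itself (the URL and response time below are illustrative):

  require "scraper_utils"

  delay_calc = ScraperUtils::AdaptiveDelay.new(max_load: 20.0) # target = 4x response time
  response_time = 0.5                                          # seconds the server took
  wait = delay_calc.next_delay("https://example.com/page", response_time)
  sleep(wait) # first call for a domain seeds the average at the 2.0s target, so ~2.0s here
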
data/lib/scraper_utils/authority_utils.rb
CHANGED
@@ -3,13 +3,13 @@
 module ScraperUtils
   # Utilities for managing and selecting authorities
   module AuthorityUtils
+    AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
+
     # Selects authorities based on environment variable or returns all authorities
     #
     # @param all_authorities [Array<Symbol>] Full list of available authorities
     # @return [Array<Symbol>] Selected subset of authorities or all authorities
     # @raise [ScraperUtils::Error] If invalid authorities are specified in MORPH_AUTHORITIES
-    AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
-
     def self.selected_authorities(all_authorities)
       if ENV[AUTHORITIES_ENV_VAR]
         authorities = ENV[AUTHORITIES_ENV_VAR].split(",").map(&:strip).map(&:to_sym)
data/lib/scraper_utils/data_quality_monitor.rb
ADDED
@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  # Monitors data quality during scraping by tracking successful vs failed record processing
+  # Automatically triggers an exception if the error rate exceeds a threshold
+  class DataQualityMonitor
+    # Get the statistics for all authorities
+    # @return [Hash, nil] Hash of statistics per authority or nil if none started
+    class << self
+      attr_reader :stats
+    end
+
+    # Notes the start of processing an authority and clears any previous stats
+    #
+    # @param authority_label [Symbol] The authority we are processing
+    # @return [void]
+    def self.start_authority(authority_label)
+      @stats ||= {}
+      @stats[authority_label] = { saved: 0, unprocessed: 0 }
+    end
+
+    # Extracts authority label and ensures stats are setup for record
+    def self.extract_authority(record)
+      authority_label = (record&.key?("authority_label") ? record["authority_label"] : "").to_sym
+      @stats ||= {}
+      @stats[authority_label] ||= { saved: 0, unprocessed: 0 }
+      authority_label
+    end
+
+    def self.threshold(authority_label)
+      5.01 + (@stats[authority_label][:saved] * 0.1) if @stats&.fetch(authority_label, nil)
+    end
+
+    # Logs an unprocessable record and raises an exception if error threshold is exceeded
+    # The threshold is 5 + 10% of saved records
+    #
+    # @param exception [Exception] The exception that caused the record to be unprocessable
+    # @param record [Hash, nil] The record that couldn't be processed
+    # @raise [ScraperUtils::UnprocessableSite] When too many records are unprocessable
+    # @return [void]
+    def self.log_unprocessable_record(exception, record)
+      authority_label = extract_authority(record)
+      @stats[authority_label][:unprocessed] += 1
+      ScraperUtils::FiberScheduler.log "Erroneous record #{authority_label} - #{record&.fetch(
+        'address', nil
+      ) || record.inspect}: #{exception}"
+      return unless @stats[authority_label][:unprocessed] > threshold(authority_label)
+
+      raise ScraperUtils::UnprocessableSite,
+            "Too many unprocessable_records for #{authority_label}: " \
+            "#{@stats[authority_label].inspect} - aborting processing of site!"
+    end
+
+    # Logs a successfully saved record
+    #
+    # @param record [Hash] The record that was saved
+    # @return [void]
+    def self.log_saved_record(record)
+      authority_label = extract_authority(record)
+      @stats[authority_label][:saved] += 1
+      ScraperUtils::FiberScheduler.log "Saving record #{authority_label} - #{record['address']}"
+    end
+  end
+end
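
A short usage sketch of the monitor as it is wired up in the example scrapers earlier in this diff (the authority label and record contents are illustrative):

  ScraperUtils::DataQualityMonitor.start_authority(:example_council)

  record = { "authority_label" => "example_council", "address" => "1 Example St" }
  ScraperUtils::DataQualityMonitor.log_saved_record(record)

  ScraperUtils::DataQualityMonitor.stats[:example_council]
  # => { saved: 1, unprocessed: 0 }
  # log_unprocessable_record raises ScraperUtils::UnprocessableSite once the
  # unprocessed count exceeds 5.01 + 0.1 * saved for that authority
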
data/lib/scraper_utils/date_range_utils.rb
ADDED
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  class DateRangeUtils
+    MERGE_ADJACENT_RANGES = true
+    PERIODS = [2, 3, 5, 8].freeze
+
+    class << self
+      # @return [Integer] Default number of days to cover
+      attr_accessor :default_days
+
+      # @return [Integer] Default days to always include in ranges
+      attr_accessor :default_everytime
+
+      # @return [Integer, nil] Default max days between any one date being in a range
+      attr_accessor :default_max_period
+
+      # Configure default settings for all DateRangeUtils instances
+      # @yield [self] Yields self for configuration
+      # @example
+      #   AgentConfig.configure do |config|
+      #     config.default_everytime = 3
+      #     config.default_days = 35
+      #     config.default_max_period = 5
+      #   end
+      # @return [void]
+      def configure
+        yield self if block_given?
+      end
+
+      # Reset all configuration options to their default values
+      # @return [void]
+      def reset_defaults!
+        @default_days = ENV.fetch('MORPH_DAYS', 33).to_i # 33
+        @default_everytime = ENV.fetch('MORPH_EVERYTIME', 4).to_i # 4
+        @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 3).to_i # 3
+      end
+    end
+
+    # Set defaults on load
+    reset_defaults!
+
+    attr_reader :max_period_used
+    attr_reader :extended_max_period
+
+    # Generates one or more date ranges to check the most recent daily through to checking each max_period
+    # There is a graduated schedule from the latest `everytime` days through to the oldest of `days` dates which is checked each `max_period` days.
+    # @param days [Integer, nil] create ranges that cover the last `days` dates
+    # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates
+    # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days
+    # @param today [Date, nil] overrides the default determination of today at UTC+09:30 (middle of Australia)
+    # @return [Array{[Date, Date, String]}] being from_date, to_date and a comment
+    #
+    # Uses a Fibonacci sequence to create a natural progression of check frequencies.
+    # Newer data is checked more frequently, with periods between checks growing
+    # according to the Fibonacci sequence (2, 3, 5, 8, 13...) until reaching max_period.
+    # This creates an efficient schedule that mimics natural information decay patterns.
+    def calculate_date_ranges(days: nil, everytime: nil, max_period: nil, today: nil)
+      _calculate_date_ranges(
+        Integer(days || self.class.default_days),
+        Integer(everytime || self.class.default_everytime),
+        Integer(max_period || self.class.default_max_period),
+        today || Time.now(in: '+09:30').to_date
+      )
+    end
+
+    private
+
+    def _calculate_date_ranges(days, everytime, max_period, today)
+      @max_period_used = 1
+      to_date = today
+      valid_periods = PERIODS.select { |p| p <= max_period }
+      if !max_period.positive? || !days.positive?
+        return []
+      elsif valid_periods.empty? || everytime >= days
+        # cover everything everytime
+        return [[today + 1 - days, today, "everything"]]
+      end
+
+      max_period = valid_periods.max
+
+      run_number = today.to_date.jd
+      ranges = []
+      if everytime.positive?
+        ranges << [to_date + 1 - everytime, to_date, "everytime"]
+        days -= everytime
+        to_date -= everytime
+      end
+
+      periods = valid_periods.dup
+      loop do
+        period = periods.shift
+        break if period.nil? || period >= max_period || !days.positive?

+        if DebugUtils.trace?
+          FiberScheduler.log "DEBUG: #{period} day periods started #{(today - to_date).to_i} days in."
+        end
+        period.times do |index|
+          break unless days.positive?
+
+          this_period = [days, period].min
+          break if this_period <= 0
+
+          earliest_from = to_date - days
+          # we are working from the oldest back towards today
+          if run_number % period == index
+            from = to_date - index - (this_period - 1)
+            from = earliest_from if from < earliest_from
+            to = [today, to_date - index].min
+            break if from > to
+
+            @max_period_used = [this_period, @max_period_used].max
+            if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+              # extend adjacent range
+              ranges.last[0] = [from, ranges.last[0]].min
+              ranges.last[2] = "#{period}\##{index},#{ranges.last[2]}"
+            else
+              to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+              ranges << [from, to, "#{period}\##{index}"]
+            end
+          end
+          days -= this_period
+          to_date -= this_period
+        end
+      end
+      # remainder of range at max_period, whatever that is
+      if days.positive? && ScraperUtils::DebugUtils.trace?
+        FiberScheduler.log "DEBUG: #{max_period} day periods started #{(today - to_date).to_i} days in."
+      end
+      index = -1
+      while days.positive?
+        index += 1
+        this_period = [days, max_period].min
+        break if this_period <= 0
+
+        earliest_from = to_date - days
+        if (run_number % max_period) == (index % max_period)
+          from = to_date - index - (this_period - 1)
+          from = earliest_from if from < earliest_from
+          to = to_date - index
+          break if from > to
+
+          @max_period_used = [this_period, @max_period_used].max
+          if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+            # extend adjacent range
+            ranges.last[0] = [from, ranges.last[0]].min
+            ranges.last[2] = "#{this_period}\##{index},#{ranges.last[2]}"
+          else
+            to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+            ranges << [from, to, "#{this_period}\##{index}"]
+          end
+        end
+        days -= this_period
+        to_date -= this_period
+      end
+      ranges.reverse
+    end
+  end
+end
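
A hedged usage sketch of the new DateRangeUtils class: calculate_date_ranges returns [from_date, to_date, comment] triples, with the most recent `everytime` days always covered and older days rotated through the 2/3/5/8-day periods, so the exact ranges vary with the Julian day number of `today` (the argument values below are illustrative):

  require "scraper_utils"

  ranges = ScraperUtils::DateRangeUtils.new.calculate_date_ranges(
    days: 33, everytime: 4, max_period: 3
  )
  ranges.each do |from_date, to_date, comment|
    puts "Check #{from_date} .. #{to_date} (#{comment})"
    # e.g. pass from_date/to_date to the site's date-filtered search
  end
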
data/lib/scraper_utils/db_utils.rb
CHANGED
@@ -33,9 +33,8 @@ module ScraperUtils
                     else
                       ["council_reference"]
                     end
-
-      puts "Saving record #{record['council_reference']} - #{record['address']}"
       ScraperWiki.save_sqlite(primary_key, record)
+      ScraperUtils::DataQualityMonitor.log_saved_record(record)
    end
  end
end
data/lib/scraper_utils/debug_utils.rb
CHANGED
@@ -5,6 +5,50 @@ require "json"
 module ScraperUtils
   # Utilities for debugging web scraping processes
   module DebugUtils
+    DEBUG_ENV_VAR = "DEBUG"
+    MORPH_DEBUG_ENV_VAR = "MORPH_DEBUG"
+
+    # Debug level constants
+    DISABLED_LEVEL = 0
+    BASIC_LEVEL = 1
+    VERBOSE_LEVEL = 2
+    TRACE_LEVEL = 3
+
+    # Get current debug level (0 = disabled, 1 = basic, 2 = verbose, 3 = trace)
+    # Checks DEBUG and MORPH_DEBUG env variables
+    # @return [Integer] Debug level
+    def self.debug_level
+      debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, '0'))
+      debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
+    end
+
+    # Check if debug is enabled at specified level or higher
+    #
+    # @param level [Integer] Minimum debug level to check for
+    # @return [Boolean] true if debugging at specified level is enabled
+    def self.debug?(level = BASIC_LEVEL)
+      debug_level >= level
+    end
+
+    # Check if basic debug output or higher is enabled
+    # @return [Boolean] true if debugging is enabled
+    def self.basic?
+      debug?(BASIC_LEVEL)
+    end
+
+    # Check if verbose debug output or higher is enabled
+    # @return [Boolean] true if verbose debugging is enabled
+    def self.verbose?
+      debug?(VERBOSE_LEVEL)
+    end
+
+    # Check if debug tracing or higher is enabled
+    # @return [Boolean] true if debugging is enabled at trace level
+    def self.trace?
+      debug?(TRACE_LEVEL)
+    end
+
+
     # Logs details of an HTTP request when debug mode is enabled
     #
     # @param method [String] HTTP method (GET, POST, etc.)
@@ -14,21 +58,14 @@ module ScraperUtils
     # @param body [Hash, nil] Optional request body
     # @return [void]
     def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
-      return unless
+      return unless basic?
 
-      puts
-
-
-
-
-
-      puts "Headers:"
-      puts JSON.pretty_generate(headers)
-      end
-      return unless body
-
-      puts "Body:"
-      puts JSON.pretty_generate(body)
+      puts
+      FiberScheduler.log "š #{method.upcase} #{url}"
+      puts "Parameters:", JSON.pretty_generate(parameters) if parameters
+      puts "Headers:", JSON.pretty_generate(headers) if headers
+      puts "Body:", JSON.pretty_generate(body) if body
+      $stdout.flush
     end
 
     # Logs details of a web page when debug mode is enabled
@@ -37,17 +74,18 @@ module ScraperUtils
     # @param message [String] Context or description for the debug output
     # @return [void]
     def self.debug_page(page, message)
-      return unless
+      return unless trace?
 
-      puts
-
+      puts
+      FiberScheduler.log "š DEBUG: #{message}"
       puts "Current URL: #{page.uri}"
       puts "Page title: #{page.at('title').text.strip}" if page.at("title")
       puts "",
-           "Page content:"
-
-
-
+           "Page content:",
+           "-" * 40,
+           page.body,
+           "-" * 40
+      $stdout.flush
     end
 
     # Logs details about a specific page selector when debug mode is enabled
@@ -57,9 +95,10 @@ module ScraperUtils
     # @param message [String] Context or description for the debug output
     # @return [void]
     def self.debug_selector(page, selector, message)
-      return unless
+      return unless trace?
 
-      puts
+      puts
+      FiberScheduler.log "š DEBUG: #{message}"
       puts "Looking for selector: #{selector}"
       element = page.at(selector)
       if element
@@ -71,6 +110,7 @@ module ScraperUtils
         puts page.body
         puts "-" * 40
       end
+      $stdout.flush
     end
   end
 end
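
The reworked debug helpers key off the DEBUG and MORPH_DEBUG environment variables via debug_level. A small sketch of guarding extra diagnostics in a scraper (assuming DEBUG itself is not set, so MORPH_DEBUG is used):

  require "scraper_utils"

  ENV["MORPH_DEBUG"] = "2" # 0 = disabled, 1 = basic, 2 = verbose, 3 = trace

  if ScraperUtils::DebugUtils.verbose?
    puts "Verbose diagnostics enabled (level #{ScraperUtils::DebugUtils.debug_level})"
  end
  # debug_page and debug_selector only emit output at trace level (3)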