scraper_utils 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/SPECS.md ADDED
@@ -0,0 +1,25 @@
+ SPECS
+ =====
+
+ These project-specific specifications go into further detail than the
+ installation and usage notes in `README.md`.
+
+ ASK for clarification of any apparent conflicts with IMPLEMENTATION, GUIDELINES or project instructions.
+
+ ## Core Design Principles
+
+ ### Error Handling
+ - Record-level errors abort only that record's processing
+ - Allow up to 5 + 10% of saved records to be unprocessable before failing
+ - External service reliability (e.g., robots.txt) should not block core functionality
+
+ ### Rate Limiting
+ - Honor site-specific rate limits when clearly specified
+ - Apply adaptive delays based on response times
+ - Use randomized delays to avoid looking like a bot
+ - Support proxy configuration for geolocation needs
+
+ ### Testing
+ - Ensure components are independently testable
+ - Avoid timing-based tests in favor of logic validation
+ - Keep test scenarios focused and under 20 lines
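The "5 + 10%" error-handling rule above is plain arithmetic on the count of records already saved; a minimal illustrative sketch (the helper name is hypothetical, not part of the gem; the gem's actual implementation appears later in this diff in DataQualityMonitor):

```ruby
# Hypothetical helper, shown only to illustrate the "5 + 10%" rule.
def unprocessable_threshold(saved_count)
  5 + (saved_count * 0.10)
end

unprocessable_threshold(0)   # => 5.0   a brand-new authority tolerates ~5 bad records
unprocessable_threshold(200) # => 25.0  200 saved records tolerate ~25 bad records
```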
data/bin/console CHANGED
@@ -1,4 +1,5 @@
  #!/usr/bin/env ruby
+ # frozen_string_literal: true

  require "bundler/setup"
  require "scraper_utils"
data/bin/setup CHANGED
@@ -1,4 +1,5 @@
- #!/usr/bin/env bash
+ #!/bin/bash
+
  set -euo pipefail
  IFS=$'\n\t'
  set -vx
@@ -0,0 +1,31 @@
+ # frozen_string_literal: true
+
+ # Example scrape method updated to use ScraperUtils::FiberScheduler
+
+ def scrape(authorities, attempt)
+   ScraperUtils::FiberScheduler.reset!
+   exceptions = {}
+   authorities.each do |authority_label|
+     ScraperUtils::FiberScheduler.register_operation(authority_label) do
+       ScraperUtils::FiberScheduler.log(
+         "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
+       )
+       ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+       YourScraper.scrape(authority_label) do |record|
+         record["authority_label"] = authority_label.to_s
+         ScraperUtils::DbUtils.save_record(record)
+       rescue ScraperUtils::UnprocessableRecord => e
+         ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+         exceptions[authority_label] = e
+         # Continues processing other records
+       end
+     rescue StandardError => e
+       warn "#{authority_label}: ERROR: #{e}"
+       warn e.backtrace || "No backtrace available"
+       exceptions[authority_label] = e
+     end
+     # end of register_operation block
+   end
+   ScraperUtils::FiberScheduler.run_all
+   exceptions
+ end
@@ -0,0 +1,93 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ $LOAD_PATH << "./lib"
+
+ require "scraper_utils"
+ require "technology_one_scraper"
+
+ # Main Scraper class
+ class Scraper
+   AUTHORITIES = YourScraper::AUTHORITIES
+
+   # ADD: attempt argument
+   def self.scrape(authorities, attempt)
+     exceptions = {}
+     # ADD: Report attempt number
+     authorities.each do |authority_label|
+       puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
+
+       begin
+         # REPLACE:
+         # YourScraper.scrape(authority_label) do |record|
+         #   record["authority_label"] = authority_label.to_s
+         #   YourScraper.log(record)
+         #   ScraperWiki.save_sqlite(%w[authority_label council_reference], record)
+         # end
+         # WITH:
+         ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+         YourScraper.scrape(authority_label) do |record|
+           begin
+             record["authority_label"] = authority_label.to_s
+             ScraperUtils::DbUtils.save_record(record)
+           rescue ScraperUtils::UnprocessableRecord => e
+             ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+             exceptions[authority_label] = e
+           end
+         end
+         # END OF REPLACE
+       end
+     rescue StandardError => e
+       warn "#{authority_label}: ERROR: #{e}"
+       warn e.backtrace
+       exceptions[authority_label] = e
+     end
+
+     exceptions
+   end
+
+   def self.selected_authorities
+     ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
+   end
+
+   def self.run(authorities)
+     puts "Scraping authorities: #{authorities.join(', ')}"
+     start_time = Time.now
+     exceptions = scrape(authorities, 1)
+     # Set start_time and attempt to the call above and log run below
+     ScraperUtils::LogUtils.log_scraping_run(
+       start_time,
+       1,
+       authorities,
+       exceptions
+     )
+
+     unless exceptions.empty?
+       puts "\n***************************************************"
+       puts "Now retrying authorities which earlier had failures"
+       puts exceptions.keys.join(", ").to_s
+       puts "***************************************************"
+
+       start_time = Time.now
+       exceptions = scrape(exceptions.keys, 2)
+       # Set start_time and attempt to the call above and log run below
+       ScraperUtils::LogUtils.log_scraping_run(
+         start_time,
+         2,
+         authorities,
+         exceptions
+       )
+     end
+
+     # Report on results, raising errors for unexpected conditions
+     ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
+   end
+ end
+
+ if __FILE__ == $PROGRAM_NAME
+   # Default to list of authorities we can't or won't fix in code, explain why
+   # wagga: url redirects and then reports Application error
+
+   ENV["MORPH_EXPECT_BAD"] ||= "wagga"
+   Scraper.run(Scraper.selected_authorities)
+ end
@@ -0,0 +1,70 @@
+ # frozen_string_literal: true
+
+ require "uri"
+
+ module ScraperUtils
+   # Adapts delays between requests based on server response times.
+   # Target delay is proportional to response time based on max_load setting.
+   # Uses an exponential moving average to smooth variations in response times.
+   class AdaptiveDelay
+     DEFAULT_MIN_DELAY = 0.0
+     DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
+
+     attr_reader :min_delay, :max_delay, :max_load
+
+     # Creates a new adaptive delay calculator
+     #
+     # @param min_delay [Float] Minimum delay between requests in seconds
+     # @param max_delay [Float] Maximum delay between requests in seconds
+     # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+     #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+     def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+       @delays = {} # domain -> last delay used
+       @min_delay = min_delay.to_f
+       @max_delay = max_delay.to_f
+       @max_load = max_load.to_f.clamp(1.0, 99.0)
+       @response_multiplier = (100.0 - @max_load) / @max_load
+
+       return unless DebugUtils.basic?
+
+       ScraperUtils::FiberScheduler.log(
+         "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
+         "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
+       )
+     end
+
+     # @param uri [URI::Generic, String] The URL to extract the domain from
+     # @return [String] The domain in the format "scheme://host"
+     def domain(uri)
+       uri = URI(uri) unless uri.is_a?(URI)
+       "#{uri.scheme}://#{uri.host}".downcase
+     end
+
+     # @param uri [URI::Generic, String] URL to get delay for
+     # @return [Float] Current delay for the domain, or min_delay if no delay set
+     def delay(uri)
+       @delays[domain(uri)] || @min_delay
+     end
+
+     # @param uri [URI::Generic, String] URL the response came from
+     # @param response_time [Float] Time in seconds the server took to respond
+     # @return [Float] The calculated delay to use with the next request
+     def next_delay(uri, response_time)
+       uris_domain = domain(uri)
+       target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+       current_delay = @delays[uris_domain] || target_delay
+       delay = ((9.0 * current_delay) + target_delay) / 10.0
+       delay = delay.clamp(@min_delay, @max_delay)
+
+       if DebugUtils.basic?
+         ScraperUtils::FiberScheduler.log(
+           "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
+           "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
+         )
+       end
+
+       @delays[uris_domain] = delay
+       delay
+     end
+   end
+ end
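A minimal usage sketch for the AdaptiveDelay class added above, using only the constructor and next_delay shown in this diff; the URL and response time are illustrative.

```ruby
require "scraper_utils"

# max_load: 20 means the target delay is roughly 4x the observed response time.
delayer = ScraperUtils::AdaptiveDelay.new(min_delay: 0.5, max_load: 20.0)

response_time = 0.5 # seconds, as measured around the HTTP call
sleep delayer.next_delay("https://example.com/planning", response_time)
# Each call blends the previous delay with the new target (a 9:1 exponential
# moving average), clamped between min_delay and max_delay, per scheme://host.
```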
@@ -3,13 +3,13 @@
  module ScraperUtils
    # Utilities for managing and selecting authorities
    module AuthorityUtils
+     AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
+
      # Selects authorities based on environment variable or returns all authorities
      #
      # @param all_authorities [Array<Symbol>] Full list of available authorities
      # @return [Array<Symbol>] Selected subset of authorities or all authorities
      # @raise [ScraperUtils::Error] If invalid authorities are specified in MORPH_AUTHORITIES
-     AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
-
      def self.selected_authorities(all_authorities)
        if ENV[AUTHORITIES_ENV_VAR]
          authorities = ENV[AUTHORITIES_ENV_VAR].split(",").map(&:strip).map(&:to_sym)
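For context, the relocated AUTHORITIES_ENV_VAR constant drives authority selection; a hedged sketch of typical use (the authority names are made up):

```ruby
# Run a subset of authorities, e.g. while debugging locally:
#   MORPH_AUTHORITIES="armidale, ballina" bundle exec ruby scraper.rb
ENV["MORPH_AUTHORITIES"] = "armidale, ballina"

ScraperUtils::AuthorityUtils.selected_authorities(%i[armidale ballina wagga])
# => [:armidale, :ballina]
# Unset the variable to get the full list back; per the doc comment above,
# unknown names raise ScraperUtils::Error.
```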
@@ -0,0 +1,64 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   # Monitors data quality during scraping by tracking successful vs failed record processing
+   # Automatically triggers an exception if the error rate exceeds a threshold
+   class DataQualityMonitor
+     # Get the statistics for all authorities
+     # @return [Hash, nil] Hash of statistics per authority or nil if none started
+     class << self
+       attr_reader :stats
+     end
+
+     # Notes the start of processing an authority and clears any previous stats
+     #
+     # @param authority_label [Symbol] The authority we are processing
+     # @return [void]
+     def self.start_authority(authority_label)
+       @stats ||= {}
+       @stats[authority_label] = { saved: 0, unprocessed: 0 }
+     end
+
+     # Extracts authority label and ensures stats are setup for record
+     def self.extract_authority(record)
+       authority_label = (record&.key?("authority_label") ? record["authority_label"] : "").to_sym
+       @stats ||= {}
+       @stats[authority_label] ||= { saved: 0, unprocessed: 0 }
+       authority_label
+     end
+
+     def self.threshold(authority_label)
+       5.01 + (@stats[authority_label][:saved] * 0.1) if @stats&.fetch(authority_label, nil)
+     end
+
+     # Logs an unprocessable record and raises an exception if error threshold is exceeded
+     # The threshold is 5 + 10% of saved records
+     #
+     # @param exception [Exception] The exception that caused the record to be unprocessable
+     # @param record [Hash, nil] The record that couldn't be processed
+     # @raise [ScraperUtils::UnprocessableSite] When too many records are unprocessable
+     # @return [void]
+     def self.log_unprocessable_record(exception, record)
+       authority_label = extract_authority(record)
+       @stats[authority_label][:unprocessed] += 1
+       ScraperUtils::FiberScheduler.log "Erroneous record #{authority_label} - #{record&.fetch(
+         'address', nil
+       ) || record.inspect}: #{exception}"
+       return unless @stats[authority_label][:unprocessed] > threshold(authority_label)
+
+       raise ScraperUtils::UnprocessableSite,
+             "Too many unprocessable_records for #{authority_label}: " \
+             "#{@stats[authority_label].inspect} - aborting processing of site!"
+     end
+
+     # Logs a successfully saved record
+     #
+     # @param record [Hash] The record that was saved
+     # @return [void]
+     def self.log_saved_record(record)
+       authority_label = extract_authority(record)
+       @stats[authority_label][:saved] += 1
+       ScraperUtils::FiberScheduler.log "Saving record #{authority_label} - #{record['address']}"
+     end
+   end
+ end
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ class DateRangeUtils
5
+ MERGE_ADJACENT_RANGES = true
6
+ PERIODS = [2, 3, 5, 8].freeze
7
+
8
+ class << self
9
+ # @return [Integer] Default number of days to cover
10
+ attr_accessor :default_days
11
+
12
+ # @return [Integer] Default days to always include in ranges
13
+ attr_accessor :default_everytime
14
+
15
+ # @return [Integer, nil] Default max days between any one date being in a range
16
+ attr_accessor :default_max_period
17
+
18
+ # Configure default settings for all DateRangeUtils instances
19
+ # @yield [self] Yields self for configuration
20
+ # @example
21
+ # AgentConfig.configure do |config|
22
+ # config.default_everytime = 3
23
+ # config.default_days = 35
24
+ # config.default_max_period = 5
25
+ # end
26
+ # @return [void]
27
+ def configure
28
+ yield self if block_given?
29
+ end
30
+
31
+ # Reset all configuration options to their default values
32
+ # @return [void]
33
+ def reset_defaults!
34
+ @default_days = ENV.fetch('MORPH_DAYS', 33).to_i # 33
35
+ @default_everytime = ENV.fetch('MORPH_EVERYTIME', 4).to_i # 4
36
+ @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 3).to_i # 3
37
+ end
38
+ end
39
+
40
+ # Set defaults on load
41
+ reset_defaults!
42
+
43
+ attr_reader :max_period_used
44
+ attr_reader :extended_max_period
45
+
46
+ # Generates one or more date ranges to check the most recent daily through to checking each max_period
47
+ # There is a graduated schedule from the latest `everytime` days through to the oldest of `days` dates which is checked each `max_period` days.
48
+ # @param days [Integer, nil] create ranges that cover the last `days` dates
49
+ # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates
50
+ # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days
51
+ # @param today [Date, nil] overrides the default determination of today at UTC+09:30 (middle of Australia)
52
+ # @return [Array{[Date, Date, String]}] being from_date, to_date and a comment
53
+ #
54
+ # Uses a Fibonacci sequence to create a natural progression of check frequencies.
55
+ # Newer data is checked more frequently, with periods between checks growing
56
+ # according to the Fibonacci sequence (2, 3, 5, 8, 13...) until reaching max_period.
57
+ # This creates an efficient schedule that mimics natural information decay patterns.
58
+ def calculate_date_ranges(days: nil, everytime: nil, max_period: nil, today: nil)
59
+ _calculate_date_ranges(
60
+ Integer(days || self.class.default_days),
61
+ Integer(everytime || self.class.default_everytime),
62
+ Integer(max_period || self.class.default_max_period),
63
+ today || Time.now(in: '+09:30').to_date
64
+ )
65
+ end
66
+
67
+ private
68
+
69
+ def _calculate_date_ranges(days, everytime, max_period, today)
70
+ @max_period_used = 1
71
+ to_date = today
72
+ valid_periods = PERIODS.select { |p| p <= max_period }
73
+ if !max_period.positive? || !days.positive?
74
+ return []
75
+ elsif valid_periods.empty? || everytime >= days
76
+ # cover everything everytime
77
+ return [[today + 1 - days, today, "everything"]]
78
+ end
79
+
80
+ max_period = valid_periods.max
81
+
82
+ run_number = today.to_date.jd
83
+ ranges = []
84
+ if everytime.positive?
85
+ ranges << [to_date + 1 - everytime, to_date, "everytime"]
86
+ days -= everytime
87
+ to_date -= everytime
88
+ end
89
+
90
+ periods = valid_periods.dup
91
+ loop do
92
+ period = periods.shift
93
+ break if period.nil? || period >= max_period || !days.positive?
94
+
95
+ if DebugUtils.trace?
96
+ FiberScheduler.log "DEBUG: #{period} day periods started #{(today - to_date).to_i} days in."
97
+ end
98
+ period.times do |index|
99
+ break unless days.positive?
100
+
101
+ this_period = [days, period].min
102
+ break if this_period <= 0
103
+
104
+ earliest_from = to_date - days
105
+ # we are working from the oldest back towards today
106
+ if run_number % period == index
107
+ from = to_date - index - (this_period - 1)
108
+ from = earliest_from if from < earliest_from
109
+ to = [today, to_date - index].min
110
+ break if from > to
111
+
112
+ @max_period_used = [this_period, @max_period_used].max
113
+ if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
114
+ # extend adjacent range
115
+ ranges.last[0] = [from, ranges.last[0]].min
116
+ ranges.last[2] = "#{period}\##{index},#{ranges.last[2]}"
117
+ else
118
+ to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
119
+ ranges << [from, to, "#{period}\##{index}"]
120
+ end
121
+ end
122
+ days -= this_period
123
+ to_date -= this_period
124
+ end
125
+ end
126
+ # remainder of range at max_period, whatever that is
127
+ if days.positive? && ScraperUtils::DebugUtils.trace?
128
+ FiberScheduler.log "DEBUG: #{max_period} day periods started #{(today - to_date).to_i} days in."
129
+ end
130
+ index = -1
131
+ while days.positive?
132
+ index += 1
133
+ this_period = [days, max_period].min
134
+ break if this_period <= 0
135
+
136
+ earliest_from = to_date - days
137
+ if (run_number % max_period) == (index % max_period)
138
+ from = to_date - index - (this_period - 1)
139
+ from = earliest_from if from < earliest_from
140
+ to = to_date - index
141
+ break if from > to
142
+
143
+ @max_period_used = [this_period, @max_period_used].max
144
+ if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
145
+ # extend adjacent range
146
+ ranges.last[0] = [from, ranges.last[0]].min
147
+ ranges.last[2] = "#{this_period}\##{index},#{ranges.last[2]}"
148
+ else
149
+ to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
150
+ ranges << [from, to, "#{this_period}\##{index}"]
151
+ end
152
+ end
153
+ days -= this_period
154
+ to_date -= this_period
155
+ end
156
+ ranges.reverse
157
+ end
158
+ end
159
+ end
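A minimal sketch of calling the scheduler above with explicit arguments instead of the MORPH_* environment defaults; the values and output handling are illustrative.

```ruby
utils = ScraperUtils::DateRangeUtils.new
ranges = utils.calculate_date_ranges(days: 30, everytime: 4, max_period: 3)

ranges.each do |from_date, to_date, comment|
  puts "#{from_date} .. #{to_date} (#{comment})"
end
# The newest 4 days are covered on every run ("everytime"); older dates within the
# 30-day window rotate through the 2- and 3-day periods so each date is still
# checked at least every max_period days.
```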
@@ -33,9 +33,8 @@ module ScraperUtils
                    else
                      ["council_reference"]
                    end
-
-     puts "Saving record #{record['council_reference']} - #{record['address']}"
      ScraperWiki.save_sqlite(primary_key, record)
+     ScraperUtils::DataQualityMonitor.log_saved_record(record)
    end
  end
end
@@ -5,6 +5,50 @@ require "json"
  module ScraperUtils
    # Utilities for debugging web scraping processes
    module DebugUtils
+     DEBUG_ENV_VAR = "DEBUG"
+     MORPH_DEBUG_ENV_VAR = "MORPH_DEBUG"
+
+     # Debug level constants
+     DISABLED_LEVEL = 0
+     BASIC_LEVEL = 1
+     VERBOSE_LEVEL = 2
+     TRACE_LEVEL = 3
+
+     # Get current debug level (0 = disabled, 1 = basic, 2 = verbose, 3 = trace)
+     # Checks DEBUG and MORPH_DEBUG env variables
+     # @return [Integer] Debug level
+     def self.debug_level
+       debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, '0'))
+       debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
+     end
+
+     # Check if debug is enabled at specified level or higher
+     #
+     # @param level [Integer] Minimum debug level to check for
+     # @return [Boolean] true if debugging at specified level is enabled
+     def self.debug?(level = BASIC_LEVEL)
+       debug_level >= level
+     end
+
+     # Check if basic debug output or higher is enabled
+     # @return [Boolean] true if debugging is enabled
+     def self.basic?
+       debug?(BASIC_LEVEL)
+     end
+
+     # Check if verbose debug output or higher is enabled
+     # @return [Boolean] true if verbose debugging is enabled
+     def self.verbose?
+       debug?(VERBOSE_LEVEL)
+     end
+
+     # Check if debug tracing or higher is enabled
+     # @return [Boolean] true if debugging is enabled at trace level
+     def self.trace?
+       debug?(TRACE_LEVEL)
+     end
+
+
      # Logs details of an HTTP request when debug mode is enabled
      #
      # @param method [String] HTTP method (GET, POST, etc.)
@@ -14,21 +58,14 @@ module ScraperUtils
    # @param body [Hash, nil] Optional request body
    # @return [void]
    def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
-     return unless ScraperUtils.debug?
+     return unless basic?

-     puts "\nšŸ” #{method.upcase} #{url}"
-     if parameters
-       puts "Parameters:"
-       puts JSON.pretty_generate(parameters)
-     end
-     if headers
-       puts "Headers:"
-       puts JSON.pretty_generate(headers)
-     end
-     return unless body
-
-     puts "Body:"
-     puts JSON.pretty_generate(body)
+     puts
+     FiberScheduler.log "šŸ” #{method.upcase} #{url}"
+     puts "Parameters:", JSON.pretty_generate(parameters) if parameters
+     puts "Headers:", JSON.pretty_generate(headers) if headers
+     puts "Body:", JSON.pretty_generate(body) if body
+     $stdout.flush
    end

    # Logs details of a web page when debug mode is enabled
@@ -37,17 +74,18 @@ module ScraperUtils
    # @param message [String] Context or description for the debug output
    # @return [void]
    def self.debug_page(page, message)
-     return unless ScraperUtils.debug?
+     return unless trace?

-     puts "",
-          "šŸ” DEBUG: #{message}"
+     puts
+     FiberScheduler.log "šŸ” DEBUG: #{message}"
      puts "Current URL: #{page.uri}"
      puts "Page title: #{page.at('title').text.strip}" if page.at("title")
      puts "",
-          "Page content:"
-     puts "-" * 40
-     puts page.body
-     puts "-" * 40
+          "Page content:",
+          "-" * 40,
+          page.body,
+          "-" * 40
+     $stdout.flush
    end

    # Logs details about a specific page selector when debug mode is enabled
@@ -57,9 +95,10 @@ module ScraperUtils
    # @param message [String] Context or description for the debug output
    # @return [void]
    def self.debug_selector(page, selector, message)
-     return unless ScraperUtils.debug?
+     return unless trace?

-     puts "\nšŸ” DEBUG: #{message}"
+     puts
+     FiberScheduler.log "šŸ” DEBUG: #{message}"
      puts "Looking for selector: #{selector}"
      element = page.at(selector)
      if element
@@ -71,6 +110,7 @@ module ScraperUtils
        puts page.body
        puts "-" * 40
      end
+     $stdout.flush
    end
  end
end
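A short sketch of driving the new debug levels above from the environment; the value shown is illustrative and assumes DEBUG itself is unset (it takes precedence over MORPH_DEBUG).

```ruby
# MORPH_DEBUG=2 bundle exec ruby scraper.rb   # or DEBUG=2 ...
ENV["MORPH_DEBUG"] = "2"

ScraperUtils::DebugUtils.debug_level # => 2
ScraperUtils::DebugUtils.basic?      # => true  (debug_request output enabled)
ScraperUtils::DebugUtils.verbose?    # => true
ScraperUtils::DebugUtils.trace?      # => false (debug_page/debug_selector stay quiet)
```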