scraper_utils 0.5.1 → 0.7.0
This diff shows the changes between publicly released versions of this package, as they appear in its public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.yardopts +5 -0
- data/CHANGELOG.md +19 -0
- data/GUIDELINES.md +2 -1
- data/Gemfile +1 -0
- data/IMPLEMENTATION.md +39 -0
- data/README.md +29 -23
- data/SPECS.md +13 -1
- data/bin/rspec +27 -0
- data/docs/enhancing_specs.md +100 -0
- data/docs/example_scrape_with_fibers.rb +4 -4
- data/docs/fibers_and_threads.md +72 -0
- data/docs/getting_started.md +6 -6
- data/docs/interleaving_requests.md +9 -8
- data/docs/mechanize_utilities.md +4 -4
- data/docs/parallel_requests.md +138 -0
- data/docs/randomizing_requests.md +12 -8
- data/docs/reducing_server_load.md +6 -6
- data/lib/scraper_utils/data_quality_monitor.rb +2 -3
- data/lib/scraper_utils/date_range_utils.rb +37 -78
- data/lib/scraper_utils/debug_utils.rb +5 -5
- data/lib/scraper_utils/log_utils.rb +15 -0
- data/lib/scraper_utils/mechanize_actions.rb +37 -8
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +80 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +35 -34
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
- data/lib/scraper_utils/mechanize_utils.rb +8 -5
- data/lib/scraper_utils/randomize_utils.rb +22 -19
- data/lib/scraper_utils/scheduler/constants.rb +12 -0
- data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
- data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
- data/lib/scraper_utils/scheduler/process_request.rb +59 -0
- data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
- data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
- data/lib/scraper_utils/scheduler.rb +286 -0
- data/lib/scraper_utils/spec_support.rb +67 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +12 -14
- metadata +18 -6
- data/lib/scraper_utils/adaptive_delay.rb +0 -70
- data/lib/scraper_utils/fiber_scheduler.rb +0 -229
- data/lib/scraper_utils/robots_checker.rb +0 -149
data/lib/scraper_utils/spec_support.rb
ADDED
@@ -0,0 +1,67 @@
+# frozen_string_literal: true
+
+require "scraperwiki"
+
+module ScraperUtils
+  # Methods to support specs
+  module SpecSupport
+    AUSTRALIAN_STATES = %w[ACT NSW NT QLD SA TAS VIC WA].freeze
+    COMMON_STREET_TYPES =
+      %w[
+        Avenue Ave Boulevard Court Crt Circle Chase Circuit Close Crescent
+        Drive Drv Lane Loop Parkway Place Parade Road Rd Street St Square Terrace Way
+      ].freeze
+    AUSTRALIAN_POSTCODES = /\b\d{4}\b/.freeze
+
+    # Check if an address is likely to be geocodable by analyzing its format.
+    # This is a bit stricter than needed - typically assert >= 75% match
+    # @param address [String] The address to check
+    # @return [Boolean] True if the address appears to be geocodable.
+    def self.geocodable?(address, ignore_case: false)
+      return false if address.nil? || address.empty?
+      check_address = ignore_case ? address.upcase : address
+
+      # Basic structure check - must have a street name, suburb, state and postcode
+      has_state = AUSTRALIAN_STATES.any? { |state| check_address.end_with?(" #{state}") || check_address.include?(" #{state} ") }
+      has_postcode = address.match?(AUSTRALIAN_POSTCODES)
+
+      has_street_type = COMMON_STREET_TYPES.any? { |type| check_address.include?(" #{type}") || check_address.include?(" #{type.upcase}") }
+
+      has_unit_or_lot = address.match?(/\b(Unit|Lot:?)\s+\d+/i)
+
+      has_suburb_stats = check_address.match?(/\b[A-Z]{2,}(\s+[A-Z]+)*,?\s+(#{AUSTRALIAN_STATES.join('|')})\b/)
+
+      if ENV["DEBUG"]
+        missing = []
+        unless has_street_type || has_unit_or_lot
+          missing << "street type / unit / lot"
+        end
+        missing << "state" unless has_state
+        missing << "postcode" unless has_postcode
+        missing << "#{ignore_case ? '' : 'uppercase '}suburb state" unless has_suburb_stats
+        puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
+      end
+
+      (has_street_type || has_unit_or_lot) && has_state && has_postcode && has_suburb_stats
+    end
+
+    PLACEHOLDERS = [
+      /no description/i,
+      /not available/i,
+      /to be confirmed/i,
+      /\btbc\b/i,
+      %r{\bn/a\b}i
+    ].freeze
+
+    def self.placeholder?(text)
+      PLACEHOLDERS.any? { |placeholder| text.to_s.match?(placeholder) }
+    end
+
+    # Check if this looks like a "reasonable" description
+    # This is a bit stricter than needed - typically assert >= 75% match
+    def self.reasonable_description?(text)
+      !placeholder?(text) && text.to_s.split.size >= 3
+    end
+  end
+end
+
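Note: a minimal sketch of how these new helpers might be asserted in a scraper's specs, following the ">= 75% match" guidance in the comments above. The results table and the "address" / "description" field names are assumptions, not confirmed by this diff; adjust to your scraper's schema.

# Hypothetical RSpec usage of ScraperUtils::SpecSupport (illustrative only)
require "scraperwiki"
require "scraper_utils"

RSpec.describe "scraped data quality" do
  let(:records) { ScraperWiki.select("* from data") }

  it "has geocodable addresses for at least 75% of records" do
    geocodable = records.count { |r| ScraperUtils::SpecSupport.geocodable?(r["address"]) }
    expect(geocodable).to be >= (0.75 * records.size).ceil
  end

  it "has reasonable descriptions for at least 75% of records" do
    reasonable = records.count { |r| ScraperUtils::SpecSupport.reasonable_description?(r["description"]) }
    expect(reasonable).to be >= (0.75 * records.size).ceil
  end
end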
data/lib/scraper_utils.rb
CHANGED
@@ -1,20 +1,22 @@
 # frozen_string_literal: true
 
-require "scraper_utils/
+require "scraper_utils/version"
+
+# Public Apis (responsible for requiring their own dependencies)
 require "scraper_utils/authority_utils"
 require "scraper_utils/cycle_utils"
 require "scraper_utils/data_quality_monitor"
 require "scraper_utils/date_range_utils"
 require "scraper_utils/db_utils"
 require "scraper_utils/debug_utils"
-require "scraper_utils/fiber_scheduler"
 require "scraper_utils/log_utils"
+require "scraper_utils/randomize_utils"
+require "scraper_utils/scheduler"
+require "scraper_utils/spec_support"
+
+# Mechanize utilities
 require "scraper_utils/mechanize_actions"
-require "scraper_utils/mechanize_utils/agent_config"
 require "scraper_utils/mechanize_utils"
-require "scraper_utils/randomize_utils"
-require "scraper_utils/robots_checker"
-require "scraper_utils/version"
 
 # Utilities for planningalerts scrapers
 module ScraperUtils
@@ -22,17 +24,13 @@ module ScraperUtils
   AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"
 
   # Fatal Error
-  class Error < StandardError
-  end
+  class Error < StandardError; end
 
   # Fatal error with the site - retrying won't help
-  class UnprocessableSite < Error
-  end
+  class UnprocessableSite < Error; end
 
-  #
-
-  class UnprocessableRecord < Error
-  end
+  # Fatal Error for a record - other records may be processable
+  class UnprocessableRecord < Error; end
 
   def self.australian_proxy
     ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
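Note: as a small illustration (not taken from this diff), the error hierarchy documented above is the kind of thing a scraper loop can use to skip a single bad record without aborting the whole authority. The process(record) helper below is assumed.

# Hypothetical use of ScraperUtils::UnprocessableRecord (illustrative only)
records.each do |record|
  process(record) # assumed scraper-specific method
rescue ScraperUtils::UnprocessableRecord => e
  # Fatal for this record only - log and continue with the remaining records
  puts "Skipping record: #{e.message}"
end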
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.7.0
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-
+date: 2025-04-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -64,6 +64,7 @@ files:
 - ".rspec"
 - ".rubocop.yml"
 - ".travis.yml"
+- ".yardopts"
 - CHANGELOG.md
 - GUIDELINES.md
 - Gemfile
@@ -73,30 +74,41 @@ files:
 - Rakefile
 - SPECS.md
 - bin/console
+- bin/rspec
 - bin/setup
 - docs/debugging.md
+- docs/enhancing_specs.md
 - docs/example_scrape_with_fibers.rb
 - docs/example_scraper.rb
+- docs/fibers_and_threads.md
 - docs/getting_started.md
 - docs/interleaving_requests.md
 - docs/mechanize_utilities.md
+- docs/parallel_requests.md
 - docs/randomizing_requests.md
 - docs/reducing_server_load.md
 - lib/scraper_utils.rb
-- lib/scraper_utils/adaptive_delay.rb
 - lib/scraper_utils/authority_utils.rb
 - lib/scraper_utils/cycle_utils.rb
 - lib/scraper_utils/data_quality_monitor.rb
 - lib/scraper_utils/date_range_utils.rb
 - lib/scraper_utils/db_utils.rb
 - lib/scraper_utils/debug_utils.rb
-- lib/scraper_utils/fiber_scheduler.rb
 - lib/scraper_utils/log_utils.rb
 - lib/scraper_utils/mechanize_actions.rb
 - lib/scraper_utils/mechanize_utils.rb
+- lib/scraper_utils/mechanize_utils/adaptive_delay.rb
 - lib/scraper_utils/mechanize_utils/agent_config.rb
+- lib/scraper_utils/mechanize_utils/robots_checker.rb
 - lib/scraper_utils/randomize_utils.rb
-- lib/scraper_utils/
+- lib/scraper_utils/scheduler.rb
+- lib/scraper_utils/scheduler/constants.rb
+- lib/scraper_utils/scheduler/operation_registry.rb
+- lib/scraper_utils/scheduler/operation_worker.rb
+- lib/scraper_utils/scheduler/process_request.rb
+- lib/scraper_utils/scheduler/thread_request.rb
+- lib/scraper_utils/scheduler/thread_response.rb
+- lib/scraper_utils/spec_support.rb
 - lib/scraper_utils/version.rb
 - scraper_utils.gemspec
 homepage: https://github.com/ianheggie-oaf/scraper_utils
@@ -106,7 +118,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.
+  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.7.0
   changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
   rubygems_mfa_required: 'true'
 post_install_message:
data/lib/scraper_utils/adaptive_delay.rb
REMOVED
@@ -1,70 +0,0 @@
-# frozen_string_literal: true
-
-require "uri"
-
-module ScraperUtils
-  # Adapts delays between requests based on server response times.
-  # Target delay is proportional to response time based on max_load setting.
-  # Uses an exponential moving average to smooth variations in response times.
-  class AdaptiveDelay
-    DEFAULT_MIN_DELAY = 0.0
-    DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
-
-    attr_reader :min_delay, :max_delay, :max_load
-
-    # Creates a new adaptive delay calculator
-    #
-    # @param min_delay [Float] Minimum delay between requests in seconds
-    # @param max_delay [Float] Maximum delay between requests in seconds
-    # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
-    #   Lower values are more conservative (e.g., 20% = 4x response time delay)
-    def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
-      @delays = {} # domain -> last delay used
-      @min_delay = min_delay.to_f
-      @max_delay = max_delay.to_f
-      @max_load = max_load.to_f.clamp(1.0, 99.0)
-      @response_multiplier = (100.0 - @max_load) / @max_load
-
-      return unless DebugUtils.basic?
-
-      ScraperUtils::FiberScheduler.log(
-        "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
-        "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
-      )
-    end
-
-    # @param uri [URI::Generic, String] The URL to extract the domain from
-    # @return [String] The domain in the format "scheme://host"
-    def domain(uri)
-      uri = URI(uri) unless uri.is_a?(URI)
-      "#{uri.scheme}://#{uri.host}".downcase
-    end
-
-    # @param uri [URI::Generic, String] URL to get delay for
-    # @return [Float] Current delay for the domain, or min_delay if no delay set
-    def delay(uri)
-      @delays[domain(uri)] || @min_delay
-    end
-
-    # @param uri [URI::Generic, String] URL the response came from
-    # @param response_time [Float] Time in seconds the server took to respond
-    # @return [Float] The calculated delay to use with the next request
-    def next_delay(uri, response_time)
-      uris_domain = domain(uri)
-      target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
-      current_delay = @delays[uris_domain] || target_delay
-      delay = ((9.0 * current_delay) + target_delay) / 10.0
-      delay = delay.clamp(@min_delay, @max_delay)
-
-      if DebugUtils.basic?
-        ScraperUtils::FiberScheduler.log(
-          "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
-          "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
-        )
-      end
-
-      @delays[uris_domain] = delay
-      delay
-    end
-  end
-end
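Note: per the file list above, this class is removed here and a replacement appears under lib/scraper_utils/mechanize_utils/adaptive_delay.rb (not shown). A minimal sketch of the removed class's own API, to illustrate the exponential-moving-average behaviour it implemented; the URL is arbitrary.

# Illustrative use of the (removed) ScraperUtils::AdaptiveDelay shown above.
# With max_load: 20.0 the response multiplier is (100 - 20) / 20 = 4.0,
# so the target delay is 4x the observed response time, then smoothed 9:1
# with the previous delay recorded for the same domain.
delay_calc = ScraperUtils::AdaptiveDelay.new(max_load: 20.0)

url = "https://example.com/planning"
delay_calc.delay(url)           # => 0.0 (min_delay, nothing recorded yet)
delay_calc.next_delay(url, 0.5) # => 2.0 (first call: target is 4 * 0.5)
delay_calc.next_delay(url, 1.0) # => 2.2 ((9 * 2.0 + 4.0) / 10.0)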
data/lib/scraper_utils/fiber_scheduler.rb
REMOVED
@@ -1,229 +0,0 @@
-# frozen_string_literal: true
-
-require "fiber"
-
-module ScraperUtils
-  # A utility module for interleaving multiple scraping operations
-  # using fibers during connection delay periods. This allows efficient
-  # use of wait time by switching between operations.
-  module FiberScheduler
-    # @return [Array<Fiber>] List of active fibers managed by the scheduler
-    def self.registry
-      @registry ||= []
-    end
-
-    # Checks if the current code is running within a registered fiber
-    #
-    # @return [Boolean] true if running in a registered fiber, false otherwise
-    def self.in_fiber?
-      !Fiber.current.nil? && registry.include?(Fiber.current)
-    end
-
-    # Gets the authority associated with the current fiber
-    #
-    # @return [String, nil] the authority name or nil if not in a fiber
-    def self.current_authority
-      return nil unless in_fiber?
-
-      Fiber.current.instance_variable_get(:@authority)
-    end
-
-    # Logs a message, automatically prefixing with authority name if in a fiber
-    #
-    # @param message [String] the message to log
-    # @return [void]
-    def self.log(message)
-      authority = current_authority
-      $stderr.flush
-      if authority
-        puts "[#{authority}] #{message}"
-      else
-        puts message
-      end
-      $stdout.flush
-    end
-
-    # Returns a hash of exceptions encountered during processing, indexed by authority
-    #
-    # @return [Hash{Symbol => Exception}] exceptions by authority
-    def self.exceptions
-      @exceptions ||= {}
-    end
-
-    # Returns a hash of the yielded / block values
-    #
-    # @return [Hash{Symbol => Any}] values by authority
-    def self.values
-      @values ||= {}
-    end
-
-    # Checks if fiber scheduling is currently enabled
-    #
-    # @return [Boolean] true if enabled, false otherwise
-    def self.enabled?
-      @enabled ||= false
-    end
-
-    # Enables fiber scheduling
-    #
-    # @return [void]
-    def self.enable!
-      reset! unless enabled?
-      @enabled = true
-    end
-
-    # Disables fiber scheduling
-    #
-    # @return [void]
-    def self.disable!
-      @enabled = false
-    end
-
-    # Resets the scheduler state, and disables. Use before retrying failed authorities.
-    #
-    # @return [void]
-    def self.reset!
-      @registry = []
-      @exceptions = {}
-      @values = {}
-      @enabled = false
-      @delay_requested = 0.0
-      @time_slept = 0.0
-      @resume_count = 0
-      @initial_resume_at = Time.now - 60.0 # one minute ago
-    end
-
-    # Registers a block to scrape for a specific authority
-    #
-    # @param authority [String] the name of the authority being processed
-    # @yield to the block containing the scraping operation to be run in the fiber
-    # @return [Fiber] a fiber that calls the block. With @authority and @resume_at instance vars
-    def self.register_operation(authority, &block)
-      # Automatically enable fiber scheduling when operations are registered
-      enable!
-
-      fiber = Fiber.new do
-        values[authority] = block.call
-      rescue StandardError => e
-        # Store exception against the authority
-        exceptions[authority] = e
-      ensure
-        # Remove itself when done regardless of success/failure
-        registry.delete(Fiber.current)
-      end
-
-      # Start fibres in registration order
-      @initial_resume_at += 0.1
-      fiber.instance_variable_set(:@resume_at, @initial_resume_at)
-      fiber.instance_variable_set(:@authority, authority)
-      registry << fiber
-
-      if DebugUtils.basic?
-        FiberScheduler.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
-      end
-      # Process immediately when testing
-      fiber.resume if ScraperUtils::RandomizeUtils.sequential?
-      fiber
-    end
-
-    # Run all registered fibers until completion
-    #
-    # @return [Hash] Exceptions that occurred during execution
-    def self.run_all
-      count = registry.size
-      while (fiber = find_earliest_fiber)
-        if fiber.alive?
-          authority = begin
-            fiber.instance_variable_get(:@authority)
-          rescue StandardError
-            nil
-          end
-          @resume_count ||= 0
-          @resume_count += 1
-          values[authority] = fiber.resume
-        else
-          FiberScheduler.log "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
-          registry.delete(fiber)
-        end
-      end
-
-      if @time_slept&.positive? && @delay_requested&.positive?
-        percent_slept = (100.0 * @time_slept / @delay_requested).round(1)
-      end
-      puts
-      FiberScheduler.log "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, " \
-                         "sleeping #{percent_slept}% (#{@time_slept&.round(1)}) of the " \
-                         "#{@delay_requested&.round(1)} seconds requested."
-      puts
-
-      exceptions
-    end
-
-    # Delays the current fiber and potentially runs another one
-    # Falls back to regular sleep if fiber scheduling is not enabled
-    #
-    # @param seconds [Numeric] the number of seconds to delay
-    # @return [Integer] return from sleep operation or 0
-    def self.delay(seconds)
-      seconds = 0.0 unless seconds&.positive?
-      @delay_requested ||= 0.0
-      @delay_requested += seconds
-
-      current_fiber = Fiber.current
-
-      if !enabled? || !current_fiber || registry.size <= 1
-        @time_slept ||= 0.0
-        @time_slept += seconds
-        log("Sleeping #{seconds.round(3)} seconds") if DebugUtils.basic?
-        return sleep(seconds)
-      end
-
-      now = Time.now
-      resume_at = now + seconds
-
-      # Don't resume at the same time as someone else,
-      # FIFO queue if seconds == 0
-      @other_resumes ||= []
-      @other_resumes = @other_resumes.delete_if { |t| t < now }
-      while @other_resumes.include?(resume_at) && resume_at
-        resume_at += 0.01
-      end
-
-      # Used to compare when other fibers need to be resumed
-      current_fiber.instance_variable_set(:@resume_at, resume_at)
-
-      # Yield control back to the scheduler so another fiber can run
-      Fiber.yield
-
-      # When we get control back, check if we need to sleep more
-      remaining = resume_at - Time.now
-      if remaining.positive?
-        @time_slept ||= 0.0
-        @time_slept += remaining
-        log("Sleeping remaining #{remaining.round(3)} seconds") if DebugUtils.basic?
-        sleep(remaining)
-      end || 0
-    end
-
-    # Finds the fiber with the earliest wake-up time
-    #
-    # @return [Fiber, nil] the fiber with the earliest wake-up time or nil if none found
-    def self.find_earliest_fiber
-      earliest_time = nil
-      earliest_fiber = nil
-
-      registry.each do |fiber|
-        resume_at = fiber.instance_variable_get(:@resume_at)
-        if earliest_time.nil? || resume_at < earliest_time
-          earliest_time = resume_at
-          earliest_fiber = fiber
-        end
-      end
-
-      earliest_fiber
-    end
-
-    # Mark methods as private
-    private_class_method :find_earliest_fiber
-  end
-end
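Note: FiberScheduler is removed in 0.7.0; per the file list it is superseded by ScraperUtils::Scheduler, whose API is not shown in this diff. A minimal sketch of the removed module's own API, for readers maintaining scrapers still on 0.5.x; some_urls and scrape_page are assumed scraper-specific helpers.

# Illustrative use of the removed FiberScheduler API shown above
%w[authority_a authority_b].each do |authority|
  ScraperUtils::FiberScheduler.register_operation(authority) do
    some_urls.each do |url|
      scrape_page(url)
      # Yields to other registered fibers instead of blocking the whole run
      ScraperUtils::FiberScheduler.delay(2)
    end
  end
end

exceptions = ScraperUtils::FiberScheduler.run_all
exceptions.each { |authority, error| puts "#{authority} failed: #{error.message}" }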
data/lib/scraper_utils/robots_checker.rb
REMOVED
@@ -1,149 +0,0 @@
-# frozen_string_literal: true
-
-module ScraperUtils
-  # robots.txt checker with deliberately simplistic rules
-  class RobotsChecker
-    # @return [String] Lowercased user_agent for matching
-    attr_reader :user_agent
-
-    # Initialize with full user agent string like:
-    # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
-    # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
-    # Checks for
-    # * Disallow for User-agent: bot_name and
-    # * Crawl-delay from either User-agent: bot name or * (default)
-    def initialize(user_agent)
-      @user_agent = extract_user_agent(user_agent).downcase
-      if DebugUtils.basic?
-        ScraperUtils::FiberScheduler.log(
-          "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
-        )
-      end
-      @rules = {} # domain -> {rules: [], delay: int}
-      @delay = nil # Delay from last robots.txt check
-    end
-
-    # Check if a URL is disallowed based on robots.txt rules specific to our user agent
-    # @param url [String] The full URL to check
-    # @return [Boolean] true if specifically blocked for our user agent, otherwise false
-    def disallowed?(url)
-      return false unless url
-
-      uri = URI(url)
-      domain = "#{uri.scheme}://#{uri.host}"
-      path = uri.path || "/"
-
-      # Get or fetch robots.txt rules
-      rules = get_rules(domain)
-      return false unless rules # If we can't get robots.txt, assume allowed
-
-      # Store any delay found for this domain
-      @delay = rules[:our_delay]
-
-      # Check rules specific to our user agent
-      matches_any_rule?(path, rules[:our_rules])
-    end
-
-    # Returns the crawl delay (if any) that applied to the last URL checked
-    # Should be called after disallowed? to get relevant delay
-    # @return [Integer, nil] The delay in seconds, or nil if no delay specified
-    def crawl_delay
-      @delay
-    end
-
-    private
-
-    def extract_user_agent(user_agent)
-      if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
-        user_agent = ::Regexp.last_match(2)&.strip
-      end
-      user_agent&.strip
-    end
-
-    def matches_any_rule?(path, rules)
-      rules&.any? { |rule| path.start_with?(rule) }
-    end
-
-    def get_rules(domain)
-      return @rules[domain] if @rules.key?(domain)
-
-      begin
-        response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
-        return nil unless response.code.start_with?("2") # 2xx response
-
-        rules = parse_robots_txt(response.body)
-        @rules[domain] = rules
-        rules
-      rescue StandardError => e
-        if DebugUtils.basic?
-          ScraperUtils::FiberScheduler.log(
-            "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
-          )
-        end
-        nil
-      end
-    end
-
-    # Parse robots.txt content into structured rules
-    # Only collects rules for our specific user agent and generic crawl-delay
-    # @param content [String] The robots.txt content
-    # @return [Hash] Hash containing :our_rules and :our_delay
-    def parse_robots_txt(content)
-      sections = [] # Array of {agent:, rules:[], delay:} hashes
-      current_section = nil
-
-      content.each_line do |line|
-        line = line.strip.downcase
-        next if line.empty? || line.start_with?("#")
-
-        if line.start_with?("user-agent:")
-          agent = line.split(":", 2).last.strip
-          # Check if this is a continuation of the previous section
-          if current_section && current_section[:rules].empty? && current_section[:delay].nil?
-            current_section[:agents] << agent
-          else
-            current_section = { agents: [agent], rules: [], delay: nil }
-            sections << current_section
-          end
-          next
-        end
-
-        next unless current_section # Skip rules before first user-agent
-
-        if line.start_with?("disallow:")
-          path = line.split(":", 2).last.strip
-          current_section[:rules] << path unless path.empty?
-        elsif line.start_with?("crawl-delay:")
-          delay = line.split(":", 2).last.strip.to_i
-          current_section[:delay] = delay if delay.positive?
-        end
-      end
-
-      # Sort sections by most specific agent match first
-      matched_section = sections.find do |section|
-        section[:agents].any? do |agent|
-          # Our user agent starts with the agent from robots.txt
-          @user_agent.start_with?(agent) ||
-            # Or the agent from robots.txt starts with our user agent
-            # (handles ScraperUtils matching ScraperUtils/1.0)
-            agent.start_with?(@user_agent)
-        end
-      end
-
-      # Use matched section or fall back to wildcard
-      if matched_section
-        {
-          our_rules: matched_section[:rules],
-          our_delay: matched_section[:delay]
-        }
-      else
-        # Find default section
-        default_section = sections.find { |s| s[:agents].include?("*") }
-        {
-          our_rules: [],
-          our_delay: default_section&.dig(:delay)
-        }
-      end
-    end
-  end
-end
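Note: the robots.txt checker moves under lib/scraper_utils/mechanize_utils/robots_checker.rb in 0.7.0 per the file list. A minimal sketch of the removed top-level class's own API shown above; the user agent string and URL are placeholders.

# Illustrative use of the removed ScraperUtils::RobotsChecker (0.5.x)
require "net/http"

checker = ScraperUtils::RobotsChecker.new(
  "Mozilla/5.0 (compatible; ScraperUtils/0.5.1; +https://github.com/ianheggie-oaf/scraper_utils)"
)

url = "https://example.com/planning/applications"
if checker.disallowed?(url)
  puts "Skipping #{url} - disallowed for our user agent"
else
  sleep(checker.crawl_delay || 0) # honour any Crawl-delay that applied
  # fetch the page ...
end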