scraper_utils 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +5 -0
- data/CHANGELOG.md +7 -0
- data/GUIDELINES.md +2 -1
- data/Gemfile +1 -0
- data/IMPLEMENTATION.md +40 -0
- data/README.md +29 -23
- data/SPECS.md +13 -1
- data/bin/rspec +27 -0
- data/docs/example_scrape_with_fibers.rb +4 -4
- data/docs/fibers_and_threads.md +72 -0
- data/docs/getting_started.md +6 -6
- data/docs/interleaving_requests.md +7 -7
- data/docs/parallel_requests.md +138 -0
- data/docs/randomizing_requests.md +12 -8
- data/docs/reducing_server_load.md +6 -6
- data/lib/scraper_utils/data_quality_monitor.rb +2 -3
- data/lib/scraper_utils/date_range_utils.rb +37 -78
- data/lib/scraper_utils/debug_utils.rb +5 -5
- data/lib/scraper_utils/log_utils.rb +15 -0
- data/lib/scraper_utils/mechanize_actions.rb +37 -8
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
- data/lib/scraper_utils/mechanize_utils.rb +8 -5
- data/lib/scraper_utils/randomize_utils.rb +22 -19
- data/lib/scraper_utils/scheduler/constants.rb +12 -0
- data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
- data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
- data/lib/scraper_utils/scheduler/process_request.rb +59 -0
- data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
- data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
- data/lib/scraper_utils/scheduler.rb +286 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +11 -14
- metadata +16 -6
- data/lib/scraper_utils/adaptive_delay.rb +0 -70
- data/lib/scraper_utils/fiber_scheduler.rb +0 -229
- data/lib/scraper_utils/robots_checker.rb +0 -149
```diff
--- a/data/lib/scraper_utils/fiber_scheduler.rb
+++ /dev/null
@@ -1,229 +0,0 @@
-# frozen_string_literal: true
-
-require "fiber"
-
-module ScraperUtils
-  # A utility module for interleaving multiple scraping operations
-  # using fibers during connection delay periods. This allows efficient
-  # use of wait time by switching between operations.
-  module FiberScheduler
-    # @return [Array<Fiber>] List of active fibers managed by the scheduler
-    def self.registry
-      @registry ||= []
-    end
-
-    # Checks if the current code is running within a registered fiber
-    #
-    # @return [Boolean] true if running in a registered fiber, false otherwise
-    def self.in_fiber?
-      !Fiber.current.nil? && registry.include?(Fiber.current)
-    end
-
-    # Gets the authority associated with the current fiber
-    #
-    # @return [String, nil] the authority name or nil if not in a fiber
-    def self.current_authority
-      return nil unless in_fiber?
-
-      Fiber.current.instance_variable_get(:@authority)
-    end
-
-    # Logs a message, automatically prefixing with authority name if in a fiber
-    #
-    # @param message [String] the message to log
-    # @return [void]
-    def self.log(message)
-      authority = current_authority
-      $stderr.flush
-      if authority
-        puts "[#{authority}] #{message}"
-      else
-        puts message
-      end
-      $stdout.flush
-    end
-
-    # Returns a hash of exceptions encountered during processing, indexed by authority
-    #
-    # @return [Hash{Symbol => Exception}] exceptions by authority
-    def self.exceptions
-      @exceptions ||= {}
-    end
-
-    # Returns a hash of the yielded / block values
-    #
-    # @return [Hash{Symbol => Any}] values by authority
-    def self.values
-      @values ||= {}
-    end
-
-    # Checks if fiber scheduling is currently enabled
-    #
-    # @return [Boolean] true if enabled, false otherwise
-    def self.enabled?
-      @enabled ||= false
-    end
-
-    # Enables fiber scheduling
-    #
-    # @return [void]
-    def self.enable!
-      reset! unless enabled?
-      @enabled = true
-    end
-
-    # Disables fiber scheduling
-    #
-    # @return [void]
-    def self.disable!
-      @enabled = false
-    end
-
-    # Resets the scheduler state, and disables. Use before retrying failed authorities.
-    #
-    # @return [void]
-    def self.reset!
-      @registry = []
-      @exceptions = {}
-      @values = {}
-      @enabled = false
-      @delay_requested = 0.0
-      @time_slept = 0.0
-      @resume_count = 0
-      @initial_resume_at = Time.now - 60.0 # one minute ago
-    end
-
-    # Registers a block to scrape for a specific authority
-    #
-    # @param authority [String] the name of the authority being processed
-    # @yield to the block containing the scraping operation to be run in the fiber
-    # @return [Fiber] a fiber that calls the block. With @authority and @resume_at instance vars
-    def self.register_operation(authority, &block)
-      # Automatically enable fiber scheduling when operations are registered
-      enable!
-
-      fiber = Fiber.new do
-        values[authority] = block.call
-      rescue StandardError => e
-        # Store exception against the authority
-        exceptions[authority] = e
-      ensure
-        # Remove itself when done regardless of success/failure
-        registry.delete(Fiber.current)
-      end
-
-      # Start fibres in registration order
-      @initial_resume_at += 0.1
-      fiber.instance_variable_set(:@resume_at, @initial_resume_at)
-      fiber.instance_variable_set(:@authority, authority)
-      registry << fiber
-
-      if DebugUtils.basic?
-        FiberScheduler.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
-      end
-      # Process immediately when testing
-      fiber.resume if ScraperUtils::RandomizeUtils.sequential?
-      fiber
-    end
-
-    # Run all registered fibers until completion
-    #
-    # @return [Hash] Exceptions that occurred during execution
-    def self.run_all
-      count = registry.size
-      while (fiber = find_earliest_fiber)
-        if fiber.alive?
-          authority = begin
-            fiber.instance_variable_get(:@authority)
-          rescue StandardError
-            nil
-          end
-          @resume_count ||= 0
-          @resume_count += 1
-          values[authority] = fiber.resume
-        else
-          FiberScheduler.log "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
-          registry.delete(fiber)
-        end
-      end
-
-      if @time_slept&.positive? && @delay_requested&.positive?
-        percent_slept = (100.0 * @time_slept / @delay_requested).round(1)
-      end
-      puts
-      FiberScheduler.log "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, " \
-                         "sleeping #{percent_slept}% (#{@time_slept&.round(1)}) of the " \
-                         "#{@delay_requested&.round(1)} seconds requested."
-      puts
-
-      exceptions
-    end
-
-    # Delays the current fiber and potentially runs another one
-    # Falls back to regular sleep if fiber scheduling is not enabled
-    #
-    # @param seconds [Numeric] the number of seconds to delay
-    # @return [Integer] return from sleep operation or 0
-    def self.delay(seconds)
-      seconds = 0.0 unless seconds&.positive?
-      @delay_requested ||= 0.0
-      @delay_requested += seconds
-
-      current_fiber = Fiber.current
-
-      if !enabled? || !current_fiber || registry.size <= 1
-        @time_slept ||= 0.0
-        @time_slept += seconds
-        log("Sleeping #{seconds.round(3)} seconds") if DebugUtils.basic?
-        return sleep(seconds)
-      end
-
-      now = Time.now
-      resume_at = now + seconds
-
-      # Don't resume at the same time as someone else,
-      # FIFO queue if seconds == 0
-      @other_resumes ||= []
-      @other_resumes = @other_resumes.delete_if { |t| t < now }
-      while @other_resumes.include?(resume_at) && resume_at
-        resume_at += 0.01
-      end
-
-      # Used to compare when other fibers need to be resumed
-      current_fiber.instance_variable_set(:@resume_at, resume_at)
-
-      # Yield control back to the scheduler so another fiber can run
-      Fiber.yield
-
-      # When we get control back, check if we need to sleep more
-      remaining = resume_at - Time.now
-      if remaining.positive?
-        @time_slept ||= 0.0
-        @time_slept += remaining
-        log("Sleeping remaining #{remaining.round(3)} seconds") if DebugUtils.basic?
-        sleep(remaining)
-      end || 0
-    end
-
-    # Finds the fiber with the earliest wake-up time
-    #
-    # @return [Fiber, nil] the fiber with the earliest wake-up time or nil if none found
-    def self.find_earliest_fiber
-      earliest_time = nil
-      earliest_fiber = nil
-
-      registry.each do |fiber|
-        resume_at = fiber.instance_variable_get(:@resume_at)
-        if earliest_time.nil? || resume_at < earliest_time
-          earliest_time = resume_at
-          earliest_fiber = fiber
-        end
-      end
-
-      earliest_fiber
-    end
-
-    # Mark methods as private
-    private_class_method :find_earliest_fiber
-  end
-end
```
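For readers comparing against the new `ScraperUtils::Scheduler` files added in this release, the removed `FiberScheduler` above was driven by registering one block per authority and then running them all. The following is a minimal sketch of that old usage, based only on the methods documented in the removed file (`register_operation`, `run_all`, `delay`); the authority names and the `scrape_authority` helper are hypothetical placeholders:

```ruby
# Illustrative sketch of the removed FiberScheduler API (not the new 0.6.0 Scheduler).
# Authority names and scrape_authority are hypothetical examples.
require "scraper_utils"

AUTHORITIES = %w[example_city example_shire]

AUTHORITIES.each do |authority|
  ScraperUtils::FiberScheduler.register_operation(authority) do
    # Inside the block, calling ScraperUtils::FiberScheduler.delay(seconds)
    # instead of sleep yields control so another authority's fiber can run.
    scrape_authority(authority) # hypothetical per-authority scrape method
  end
end

# run_all resumes whichever fiber is due to wake earliest until all finish,
# returning a hash of exceptions keyed by authority.
exceptions = ScraperUtils::FiberScheduler.run_all
exceptions.each { |authority, e| warn "#{authority} failed: #{e.message}" }
```

Each registered block cooperated by calling `delay` rather than `sleep`, which recorded a `@resume_at` time on the current fiber and yielded, letting `run_all` pick the fiber with the earliest wake-up time next.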
```diff
--- a/data/lib/scraper_utils/robots_checker.rb
+++ /dev/null
@@ -1,149 +0,0 @@
-# frozen_string_literal: true
-
-module ScraperUtils
-  # robots.txt checker with deliberately simplistic rules
-  class RobotsChecker
-    # @return [String] Lowercased user_agent for matching
-    attr_reader :user_agent
-
-    # Initialize with full user agent string like:
-    # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
-    # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
-    # Checks for
-    # * Disallow for User-agent: bot_name and
-    # * Crawl-delay from either User-agent: bot name or * (default)
-    def initialize(user_agent)
-      @user_agent = extract_user_agent(user_agent).downcase
-      if DebugUtils.basic?
-        ScraperUtils::FiberScheduler.log(
-          "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
-        )
-      end
-      @rules = {} # domain -> {rules: [], delay: int}
-      @delay = nil # Delay from last robots.txt check
-    end
-
-    # Check if a URL is disallowed based on robots.txt rules specific to our user agent
-    # @param url [String] The full URL to check
-    # @return [Boolean] true if specifically blocked for our user agent, otherwise false
-    def disallowed?(url)
-      return false unless url
-
-      uri = URI(url)
-      domain = "#{uri.scheme}://#{uri.host}"
-      path = uri.path || "/"
-
-      # Get or fetch robots.txt rules
-      rules = get_rules(domain)
-      return false unless rules # If we can't get robots.txt, assume allowed
-
-      # Store any delay found for this domain
-      @delay = rules[:our_delay]
-
-      # Check rules specific to our user agent
-      matches_any_rule?(path, rules[:our_rules])
-    end
-
-    # Returns the crawl delay (if any) that applied to the last URL checked
-    # Should be called after disallowed? to get relevant delay
-    # @return [Integer, nil] The delay in seconds, or nil if no delay specified
-    def crawl_delay
-      @delay
-    end
-
-    private
-
-    def extract_user_agent(user_agent)
-      if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
-        user_agent = ::Regexp.last_match(2)&.strip
-      end
-      user_agent&.strip
-    end
-
-    def matches_any_rule?(path, rules)
-      rules&.any? { |rule| path.start_with?(rule) }
-    end
-
-    def get_rules(domain)
-      return @rules[domain] if @rules.key?(domain)
-
-      begin
-        response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
-        return nil unless response.code.start_with?("2") # 2xx response
-
-        rules = parse_robots_txt(response.body)
-        @rules[domain] = rules
-        rules
-      rescue StandardError => e
-        if DebugUtils.basic?
-          ScraperUtils::FiberScheduler.log(
-            "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
-          )
-        end
-        nil
-      end
-    end
-
-    # Parse robots.txt content into structured rules
-    # Only collects rules for our specific user agent and generic crawl-delay
-    # @param content [String] The robots.txt content
-    # @return [Hash] Hash containing :our_rules and :our_delay
-    def parse_robots_txt(content)
-      sections = [] # Array of {agent:, rules:[], delay:} hashes
-      current_section = nil
-
-      content.each_line do |line|
-        line = line.strip.downcase
-        next if line.empty? || line.start_with?("#")
-
-        if line.start_with?("user-agent:")
-          agent = line.split(":", 2).last.strip
-          # Check if this is a continuation of the previous section
-          if current_section && current_section[:rules].empty? && current_section[:delay].nil?
-            current_section[:agents] << agent
-          else
-            current_section = { agents: [agent], rules: [], delay: nil }
-            sections << current_section
-          end
-          next
-        end
-
-        next unless current_section # Skip rules before first user-agent
-
-        if line.start_with?("disallow:")
-          path = line.split(":", 2).last.strip
-          current_section[:rules] << path unless path.empty?
-        elsif line.start_with?("crawl-delay:")
-          delay = line.split(":", 2).last.strip.to_i
-          current_section[:delay] = delay if delay.positive?
-        end
-      end
-
-      # Sort sections by most specific agent match first
-      matched_section = sections.find do |section|
-        section[:agents].any? do |agent|
-          # Our user agent starts with the agent from robots.txt
-          @user_agent.start_with?(agent) ||
-            # Or the agent from robots.txt starts with our user agent
-            # (handles ScraperUtils matching ScraperUtils/1.0)
-            agent.start_with?(@user_agent)
-        end
-      end
-
-      # Use matched section or fall back to wildcard
-      if matched_section
-        {
-          our_rules: matched_section[:rules],
-          our_delay: matched_section[:delay]
-        }
-      else
-        # Find default section
-        default_section = sections.find { |s| s[:agents].include?("*") }
-        {
-          our_rules: [],
-          our_delay: default_section&.dig(:delay)
-        }
-      end
-    end
-  end
-end
```
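The file list above shows a `data/lib/scraper_utils/mechanize_utils/robots_checker.rb` being added, so the class itself apparently moves into a new namespace in 0.6.0 rather than disappearing. As a reminder of the old top-level API (`new`, `disallowed?`, `crawl_delay`), here is a minimal sketch based only on the removed code; the user agent string and URL are hypothetical examples:

```ruby
# Illustrative sketch of the removed top-level RobotsChecker API.
# The user agent string and URL below are hypothetical examples.
require "scraper_utils"

checker = ScraperUtils::RobotsChecker.new(
  "Mozilla/5.0 (compatible; ScraperUtils/0.5.1; +https://github.com/ianheggie-oaf/scraper_utils)"
)

url = "https://www.example.gov.au/planning/applications"
if checker.disallowed?(url)
  puts "Skipping #{url} - disallowed by robots.txt for this bot"
else
  # crawl_delay reflects the robots.txt fetched for the last URL checked, if any
  delay = checker.crawl_delay
  sleep(delay) if delay
  # ... fetch and parse the page
end
```

Per the removed code, `disallowed?` only honours `Disallow` rules in the section whose user agent matches the bot name extracted from the full user agent string, and `crawl_delay` falls back to the wildcard (`*`) section's `Crawl-delay` when no bot-specific section matches.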