scraper_utils 0.5.1 → 0.6.0

This diff shows the changes between publicly available package versions as released to their public registries, and is provided for informational purposes only.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/.yardopts +5 -0
  3. data/CHANGELOG.md +7 -0
  4. data/GUIDELINES.md +2 -1
  5. data/Gemfile +1 -0
  6. data/IMPLEMENTATION.md +40 -0
  7. data/README.md +29 -23
  8. data/SPECS.md +13 -1
  9. data/bin/rspec +27 -0
  10. data/docs/example_scrape_with_fibers.rb +4 -4
  11. data/docs/fibers_and_threads.md +72 -0
  12. data/docs/getting_started.md +6 -6
  13. data/docs/interleaving_requests.md +7 -7
  14. data/docs/parallel_requests.md +138 -0
  15. data/docs/randomizing_requests.md +12 -8
  16. data/docs/reducing_server_load.md +6 -6
  17. data/lib/scraper_utils/data_quality_monitor.rb +2 -3
  18. data/lib/scraper_utils/date_range_utils.rb +37 -78
  19. data/lib/scraper_utils/debug_utils.rb +5 -5
  20. data/lib/scraper_utils/log_utils.rb +15 -0
  21. data/lib/scraper_utils/mechanize_actions.rb +37 -8
  22. data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
  23. data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
  24. data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
  25. data/lib/scraper_utils/mechanize_utils.rb +8 -5
  26. data/lib/scraper_utils/randomize_utils.rb +22 -19
  27. data/lib/scraper_utils/scheduler/constants.rb +12 -0
  28. data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
  29. data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
  30. data/lib/scraper_utils/scheduler/process_request.rb +59 -0
  31. data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
  32. data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
  33. data/lib/scraper_utils/scheduler.rb +286 -0
  34. data/lib/scraper_utils/version.rb +1 -1
  35. data/lib/scraper_utils.rb +11 -14
  36. metadata +16 -6
  37. data/lib/scraper_utils/adaptive_delay.rb +0 -70
  38. data/lib/scraper_utils/fiber_scheduler.rb +0 -229
  39. data/lib/scraper_utils/robots_checker.rb +0 -149
data/lib/scraper_utils/fiber_scheduler.rb
@@ -1,229 +0,0 @@
- # frozen_string_literal: true
-
- require "fiber"
-
- module ScraperUtils
-   # A utility module for interleaving multiple scraping operations
-   # using fibers during connection delay periods. This allows efficient
-   # use of wait time by switching between operations.
-   module FiberScheduler
-     # @return [Array<Fiber>] List of active fibers managed by the scheduler
-     def self.registry
-       @registry ||= []
-     end
-
-     # Checks if the current code is running within a registered fiber
-     #
-     # @return [Boolean] true if running in a registered fiber, false otherwise
-     def self.in_fiber?
-       !Fiber.current.nil? && registry.include?(Fiber.current)
-     end
-
-     # Gets the authority associated with the current fiber
-     #
-     # @return [String, nil] the authority name or nil if not in a fiber
-     def self.current_authority
-       return nil unless in_fiber?
-
-       Fiber.current.instance_variable_get(:@authority)
-     end
-
-     # Logs a message, automatically prefixing with authority name if in a fiber
-     #
-     # @param message [String] the message to log
-     # @return [void]
-     def self.log(message)
-       authority = current_authority
-       $stderr.flush
-       if authority
-         puts "[#{authority}] #{message}"
-       else
-         puts message
-       end
-       $stdout.flush
-     end
-
-     # Returns a hash of exceptions encountered during processing, indexed by authority
-     #
-     # @return [Hash{Symbol => Exception}] exceptions by authority
-     def self.exceptions
-       @exceptions ||= {}
-     end
-
-     # Returns a hash of the yielded / block values
-     #
-     # @return [Hash{Symbol => Any}] values by authority
-     def self.values
-       @values ||= {}
-     end
-
-     # Checks if fiber scheduling is currently enabled
-     #
-     # @return [Boolean] true if enabled, false otherwise
-     def self.enabled?
-       @enabled ||= false
-     end
-
-     # Enables fiber scheduling
-     #
-     # @return [void]
-     def self.enable!
-       reset! unless enabled?
-       @enabled = true
-     end
-
-     # Disables fiber scheduling
-     #
-     # @return [void]
-     def self.disable!
-       @enabled = false
-     end
-
-     # Resets the scheduler state, and disables. Use before retrying failed authorities.
-     #
-     # @return [void]
-     def self.reset!
-       @registry = []
-       @exceptions = {}
-       @values = {}
-       @enabled = false
-       @delay_requested = 0.0
-       @time_slept = 0.0
-       @resume_count = 0
-       @initial_resume_at = Time.now - 60.0 # one minute ago
-     end
-
-     # Registers a block to scrape for a specific authority
-     #
-     # @param authority [String] the name of the authority being processed
-     # @yield to the block containing the scraping operation to be run in the fiber
-     # @return [Fiber] a fiber that calls the block. With @authority and @resume_at instance vars
-     def self.register_operation(authority, &block)
-       # Automatically enable fiber scheduling when operations are registered
-       enable!
-
-       fiber = Fiber.new do
-         values[authority] = block.call
-       rescue StandardError => e
-         # Store exception against the authority
-         exceptions[authority] = e
-       ensure
-         # Remove itself when done regardless of success/failure
-         registry.delete(Fiber.current)
-       end
-
-       # Start fibers in registration order
-       @initial_resume_at += 0.1
-       fiber.instance_variable_set(:@resume_at, @initial_resume_at)
-       fiber.instance_variable_set(:@authority, authority)
-       registry << fiber
-
-       if DebugUtils.basic?
-         FiberScheduler.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
-       end
-       # Process immediately when testing
-       fiber.resume if ScraperUtils::RandomizeUtils.sequential?
-       fiber
-     end
-
-     # Run all registered fibers until completion
-     #
-     # @return [Hash] Exceptions that occurred during execution
-     def self.run_all
-       count = registry.size
-       while (fiber = find_earliest_fiber)
-         if fiber.alive?
-           authority = begin
-             fiber.instance_variable_get(:@authority)
-           rescue StandardError
-             nil
-           end
-           @resume_count ||= 0
-           @resume_count += 1
-           values[authority] = fiber.resume
-         else
-           FiberScheduler.log "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
-           registry.delete(fiber)
-         end
-       end
-
-       if @time_slept&.positive? && @delay_requested&.positive?
-         percent_slept = (100.0 * @time_slept / @delay_requested).round(1)
-       end
-       puts
-       FiberScheduler.log "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, " \
-                          "sleeping #{percent_slept}% (#{@time_slept&.round(1)}) of the " \
-                          "#{@delay_requested&.round(1)} seconds requested."
-       puts
-
-       exceptions
-     end
-
-     # Delays the current fiber and potentially runs another one
-     # Falls back to regular sleep if fiber scheduling is not enabled
-     #
-     # @param seconds [Numeric] the number of seconds to delay
-     # @return [Integer] return from sleep operation or 0
-     def self.delay(seconds)
-       seconds = 0.0 unless seconds&.positive?
-       @delay_requested ||= 0.0
-       @delay_requested += seconds
-
-       current_fiber = Fiber.current
-
-       if !enabled? || !current_fiber || registry.size <= 1
-         @time_slept ||= 0.0
-         @time_slept += seconds
-         log("Sleeping #{seconds.round(3)} seconds") if DebugUtils.basic?
-         return sleep(seconds)
-       end
-
-       now = Time.now
-       resume_at = now + seconds
-
-       # Don't resume at the same time as someone else,
-       # FIFO queue if seconds == 0
-       @other_resumes ||= []
-       @other_resumes = @other_resumes.delete_if { |t| t < now }
-       while @other_resumes.include?(resume_at) && resume_at
-         resume_at += 0.01
-       end
-
-       # Used to compare when other fibers need to be resumed
-       current_fiber.instance_variable_set(:@resume_at, resume_at)
-
-       # Yield control back to the scheduler so another fiber can run
-       Fiber.yield
-
-       # When we get control back, check if we need to sleep more
-       remaining = resume_at - Time.now
-       if remaining.positive?
-         @time_slept ||= 0.0
-         @time_slept += remaining
-         log("Sleeping remaining #{remaining.round(3)} seconds") if DebugUtils.basic?
-         sleep(remaining)
-       end || 0
-     end
-
-     # Finds the fiber with the earliest wake-up time
-     #
-     # @return [Fiber, nil] the fiber with the earliest wake-up time or nil if none found
-     def self.find_earliest_fiber
-       earliest_time = nil
-       earliest_fiber = nil
-
-       registry.each do |fiber|
-         resume_at = fiber.instance_variable_get(:@resume_at)
-         if earliest_time.nil? || resume_at < earliest_time
-           earliest_time = resume_at
-           earliest_fiber = fiber
-         end
-       end
-
-       earliest_fiber
-     end
-
-     # Mark methods as private
-     private_class_method :find_earliest_fiber
-   end
- end
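
For context, a minimal sketch of how the removed FiberScheduler API fit together, pieced together from the signatures above: register a block per authority, then run_all interleaves them, with delay yielding to whichever fiber is due next. The authority names and fetch_pages helper here are hypothetical placeholders, not part of the gem.

require "scraper_utils"

# Hypothetical stand-in for real scraping work
def fetch_pages(authority)
  "records for #{authority}"
end

%w[example_north example_south].each do |authority|
  ScraperUtils::FiberScheduler.register_operation(authority) do
    # delay yields to whichever registered fiber is due to resume next,
    # so one authority's wait time is spent scraping another
    ScraperUtils::FiberScheduler.delay(2.0)
    fetch_pages(authority)
  end
end

exceptions = ScraperUtils::FiberScheduler.run_all
exceptions.each { |authority, e| warn "#{authority} failed: #{e.message}" }

In 0.6.0 this role is taken over by the new data/lib/scraper_utils/scheduler.rb and its operation workers listed above.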
data/lib/scraper_utils/robots_checker.rb
@@ -1,149 +0,0 @@
- # frozen_string_literal: true
-
- module ScraperUtils
-   # robots.txt checker with deliberately simplistic rules
-   class RobotsChecker
-     # @return [String] Lowercased user_agent for matching
-     attr_reader :user_agent
-
-     # Initialize with full user agent string like:
-     # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
-     # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
-     # Checks for
-     # * Disallow for User-agent: bot_name and
-     # * Crawl-delay from either User-agent: bot name or * (default)
-     def initialize(user_agent)
-       @user_agent = extract_user_agent(user_agent).downcase
-       if DebugUtils.basic?
-         ScraperUtils::FiberScheduler.log(
-           "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
-         )
-       end
-       @rules = {} # domain -> {rules: [], delay: int}
-       @delay = nil # Delay from last robots.txt check
-     end
-
-     # Check if a URL is disallowed based on robots.txt rules specific to our user agent
-     # @param url [String] The full URL to check
-     # @return [Boolean] true if specifically blocked for our user agent, otherwise false
-     def disallowed?(url)
-       return false unless url
-
-       uri = URI(url)
-       domain = "#{uri.scheme}://#{uri.host}"
-       path = uri.path || "/"
-
-       # Get or fetch robots.txt rules
-       rules = get_rules(domain)
-       return false unless rules # If we can't get robots.txt, assume allowed
-
-       # Store any delay found for this domain
-       @delay = rules[:our_delay]
-
-       # Check rules specific to our user agent
-       matches_any_rule?(path, rules[:our_rules])
-     end
-
-     # Returns the crawl delay (if any) that applied to the last URL checked
-     # Should be called after disallowed? to get relevant delay
-     # @return [Integer, nil] The delay in seconds, or nil if no delay specified
-     def crawl_delay
-       @delay
-     end
-
-     private
-
-     def extract_user_agent(user_agent)
-       if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
-         user_agent = ::Regexp.last_match(2)&.strip
-       end
-       user_agent&.strip
-     end
-
-     def matches_any_rule?(path, rules)
-       rules&.any? { |rule| path.start_with?(rule) }
-     end
-
-     def get_rules(domain)
-       return @rules[domain] if @rules.key?(domain)
-
-       begin
-         response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
-         return nil unless response.code.start_with?("2") # 2xx response
-
-         rules = parse_robots_txt(response.body)
-         @rules[domain] = rules
-         rules
-       rescue StandardError => e
-         if DebugUtils.basic?
-           ScraperUtils::FiberScheduler.log(
-             "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
-           )
-         end
-         nil
-       end
-     end
-
-     # Parse robots.txt content into structured rules
-     # Only collects rules for our specific user agent and generic crawl-delay
-     # @param content [String] The robots.txt content
-     # @return [Hash] Hash containing :our_rules and :our_delay
-     def parse_robots_txt(content)
-       sections = [] # Array of {agent:, rules:[], delay:} hashes
-       current_section = nil
-
-       content.each_line do |line|
-         line = line.strip.downcase
-         next if line.empty? || line.start_with?("#")
-
-         if line.start_with?("user-agent:")
-           agent = line.split(":", 2).last.strip
-           # Check if this is a continuation of the previous section
-           if current_section && current_section[:rules].empty? && current_section[:delay].nil?
-             current_section[:agents] << agent
-           else
-             current_section = { agents: [agent], rules: [], delay: nil }
-             sections << current_section
-           end
-           next
-         end
-
-         next unless current_section # Skip rules before first user-agent
-
-         if line.start_with?("disallow:")
-           path = line.split(":", 2).last.strip
-           current_section[:rules] << path unless path.empty?
-         elsif line.start_with?("crawl-delay:")
-           delay = line.split(":", 2).last.strip.to_i
-           current_section[:delay] = delay if delay.positive?
-         end
-       end
-
-       # Sort sections by most specific agent match first
-       matched_section = sections.find do |section|
-         section[:agents].any? do |agent|
-           # Our user agent starts with the agent from robots.txt
-           @user_agent.start_with?(agent) ||
-             # Or the agent from robots.txt starts with our user agent
-             # (handles ScraperUtils matching ScraperUtils/1.0)
-             agent.start_with?(@user_agent)
-         end
-       end
-
-       # Use matched section or fall back to wildcard
-       if matched_section
-         {
-           our_rules: matched_section[:rules],
-           our_delay: matched_section[:delay]
-         }
-       else
-         # Find default section
-         default_section = sections.find { |s| s[:agents].include?("*") }
-         {
-           our_rules: [],
-           our_delay: default_section&.dig(:delay)
-         }
-       end
-     end
-   end
- end
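
For context, a minimal sketch of the removed RobotsChecker API, based on the code above: construct it with a full user agent string (the format shown in the class comment), call disallowed? per URL, then consult crawl_delay. The example URL is a placeholder; this class now lives at data/lib/scraper_utils/mechanize_utils/robots_checker.rb per the file list.

require "net/http"
require "scraper_utils"

checker = ScraperUtils::RobotsChecker.new(
  "Mozilla/5.0 (compatible; ScraperUtils/0.5.1 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
)

url = "https://example.com/planning/applications" # placeholder URL
if checker.disallowed?(url)
  puts "Skipping #{url}: disallowed for our user agent"
else
  # crawl_delay applies to the domain checked by the last disallowed? call
  sleep(checker.crawl_delay || 0)
  # ... fetch and parse the page ...
end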