scraper_utils 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -8
- data/CHANGELOG.md +5 -0
- data/GUIDELINES.md +75 -0
- data/Gemfile +1 -1
- data/IMPLEMENTATION.md +33 -0
- data/README.md +226 -131
- data/SPECS.md +25 -0
- data/bin/console +1 -0
- data/bin/setup +2 -1
- data/lib/scraper_utils/adaptive_delay.rb +65 -0
- data/lib/scraper_utils/authority_utils.rb +2 -2
- data/lib/scraper_utils/data_quality_monitor.rb +53 -0
- data/lib/scraper_utils/db_utils.rb +2 -1
- data/lib/scraper_utils/debug_utils.rb +13 -20
- data/lib/scraper_utils/fiber_scheduler.rb +206 -0
- data/lib/scraper_utils/log_utils.rb +57 -26
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +255 -0
- data/lib/scraper_utils/mechanize_utils.rb +23 -29
- data/lib/scraper_utils/robots_checker.rb +144 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +3 -0
- data/scraper_utils.gemspec +3 -8
- metadata +13 -74
data/lib/scraper_utils/mechanize_utils/agent_config.rb
ADDED
@@ -0,0 +1,255 @@
+# frozen_string_literal: true
+
+require "mechanize"
+require "ipaddr"
+
+module ScraperUtils
+  module MechanizeUtils
+    # Configuration for a Mechanize agent with sensible defaults and configurable settings.
+    # Supports global configuration through {.configure} and per-instance overrides.
+    #
+    # @example Setting global defaults
+    #   ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
+    #     config.default_timeout = 90
+    #     config.default_random_delay = 5
+    #   end
+    #
+    # @example Creating an instance with defaults
+    #   config = ScraperUtils::MechanizeUtils::AgentConfig.new
+    #
+    # @example Overriding specific settings
+    #   config = ScraperUtils::MechanizeUtils::AgentConfig.new(
+    #     timeout: 120,
+    #     random_delay: 10
+    #   )
+    class AgentConfig
+      # Class-level defaults that can be modified
+      class << self
+        # @return [Integer] Default timeout in seconds for agent connections
+        attr_accessor :default_timeout
+
+        # @return [Boolean] Default setting for compliance with headers and robots.txt
+        attr_accessor :default_compliant_mode
+
+        # @return [Integer, nil] Default average random delay in seconds
+        attr_accessor :default_random_delay
+
+        # @return [Float, nil] Default maximum server load percentage (nil = no response delay)
+        attr_accessor :default_max_load
+
+        # @return [Boolean] Default setting for SSL certificate verification
+        attr_accessor :default_disable_ssl_certificate_check
+
+        # @return [Boolean] Default flag for Australian proxy preference
+        attr_accessor :default_australian_proxy
+
+        # @return [String, nil] Default Mechanize user agent
+        attr_accessor :default_user_agent
+
+        # Configure default settings for all AgentConfig instances
+        # @yield [self] Yields self for configuration
+        # @example
+        #   AgentConfig.configure do |config|
+        #     config.default_timeout = 90
+        #     config.default_random_delay = 5
+        #     config.default_max_load = 15
+        #   end
+        # @return [void]
+        def configure
+          yield self if block_given?
+        end
+
+        # Reset all configuration options to their default values
+        # @return [void]
+        def reset_defaults!
+          @default_timeout = 60
+          @default_compliant_mode = true
+          @default_random_delay = 3
+          @default_max_load = 20.0
+          @default_disable_ssl_certificate_check = false
+          @default_australian_proxy = nil
+          @default_user_agent = nil
+        end
+      end
+
+      # Set defaults on load
+      reset_defaults!
+
+
+      # @return [String] User agent string
+      attr_reader :user_agent
+
+      # Give access for testing
+
+      attr_reader :max_load
+      attr_reader :min_random
+      attr_reader :max_random
+
+      # Creates configuration for a Mechanize agent with sensible defaults
+      # @param timeout [Integer, nil] Timeout for agent connections (default: 60 unless changed)
+      # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true unless changed)
+      # @param random_delay [Integer, nil] Average random delay in seconds (default: 3 unless changed)
+      # @param max_load [Float, nil] Maximum server load percentage (nil = no response delay, default: 20%)
+      #   When compliant_mode is true, max_load is capped at 33%
+      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false unless changed)
+      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false unless changed)
+      # @param user_agent [String, nil] Configure Mechanize user agent
+      def initialize(timeout: nil,
+                     compliant_mode: nil,
+                     random_delay: nil,
+                     max_load: nil,
+                     disable_ssl_certificate_check: nil,
+                     australian_proxy: false,
+                     user_agent: nil)
+        @timeout = timeout.nil? ? self.class.default_timeout : timeout
+        @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
+        @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
+        @max_load = max_load.nil? ? self.class.default_max_load : max_load
+        @max_load = [@max_load || 20.0, 33.0].min if @compliant_mode
+        @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent
+
+        @disable_ssl_certificate_check = disable_ssl_certificate_check.nil? ?
+                                           self.class.default_disable_ssl_certificate_check :
+                                           disable_ssl_certificate_check
+        @australian_proxy = australian_proxy.nil? ? self.class.default_australian_proxy : australian_proxy
+
+        # Validate proxy URL format if proxy will be used
+        @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
+        if @australian_proxy
+          uri = begin
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+          end
+          unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+            raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
+          end
+          unless uri.host && uri.port
+            raise URI::InvalidURIError, "Proxy URL must include host and port"
+          end
+        end
+
+        if @random_delay
+          @min_random = Math.sqrt(@random_delay * 3.0 / 13.0).round(3)
+          @max_random = (3 * @min_random).round(3)
+        end
+
+        today = Date.today.strftime("%Y-%m-%d")
+        @user_agent = ENV['MORPH_USER_AGENT']&.sub("TODAY", today)
+        if @compliant_mode
+          version = ScraperUtils::VERSION
+          @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
+        end
+
+        @robots_checker = RobotsChecker.new(@user_agent) if @user_agent
+        @adaptive_delay = AdaptiveDelay.new(max_load: @max_load) if @max_load
+        display_options
+      end
+
+      # Configures a Mechanize agent with these settings
+      # @param agent [Mechanize] The agent to configure
+      # @return [void]
+      def configure_agent(agent)
+        agent.verify_mode = OpenSSL::SSL::VERIFY_NONE if @disable_ssl_certificate_check
+
+        if @timeout
+          agent.open_timeout = @timeout
+          agent.read_timeout = @timeout
+        end
+        if @compliant_mode
+          agent.user_agent = user_agent
+          agent.request_headers ||= {}
+          agent.request_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+          agent.request_headers["Upgrade-Insecure-Requests"] = "1"
+        end
+        if @australian_proxy
+          agent.agent.set_proxy(ScraperUtils.australian_proxy)
+          agent.request_headers["Accept-Language"] = "en-AU,en-US;q=0.9,en;q=0.8"
+          verify_proxy_works(agent)
+        end
+
+        @connection_started_at = nil
+        agent.pre_connect_hooks << method(:pre_connect_hook)
+        agent.post_connect_hooks << method(:post_connect_hook)
+      end
+
+      private
+
+      def display_options
+        display_args = []
+        display_args << "timeout=#{@timeout}" if @timeout
+        if @australian_proxy
+          display_args << "australian_proxy=#{@australian_proxy.inspect}"
+        elsif ScraperUtils.australian_proxy.to_s.empty?
+          display_args << "#{ScraperUtils::AUSTRALIAN_PROXY_ENV_VAR} not set"
+        else
+          display_args << "australian_proxy=#{@australian_proxy.inspect}"
+        end
+        display_args << "compliant_mode" if @compliant_mode
+        display_args << "random_delay=#{@random_delay}" if @random_delay
+        display_args << "max_load=#{@max_load}%" if @max_load
+        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
+        display_args << "default args" if display_args.empty?
+        ScraperUtils::FiberScheduler.log "Configuring Mechanize agent with #{display_args.join(', ')}"
+      end
+
+      def pre_connect_hook(_agent, request)
+        @connection_started_at = Time.now
+        ScraperUtils::FiberScheduler.log "Pre Connect request: #{request.inspect} at #{@connection_started_at}" if ENV["DEBUG"]
+      end
+
+      def post_connect_hook(_agent, uri, response, _body)
+        raise ArgumentError, "URI must be present in post-connect hook" unless uri
+
+        response_time = Time.now - @connection_started_at
+        if ENV["DEBUG"]
+          ScraperUtils::FiberScheduler.log "Post Connect uri: #{uri.inspect}, response: #{response.inspect} after #{response_time} seconds"
+        end
+
+        if @robots_checker&.disallowed?(uri)
+          raise ScraperUtils::UnprocessableSite,
+                "URL is disallowed by robots.txt specific rules: #{uri}"
+        end
+
+        delays = {
+          robot_txt: @robots_checker&.crawl_delay&.round(3),
+          max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+          random: (@min_random ? (rand(@min_random..@max_random) ** 2).round(3) : nil)
+        }
+        @delay = delays.values.compact.max
+        if @delay&.positive?
+          puts "Delaying #{@delay} seconds, max of #{delays.inspect}" if ENV["DEBUG"]
+          sleep(@delay)
+        end
+
+        response
+      end
+
+      def verify_proxy_works(agent)
+        my_ip = MechanizeUtils.public_ip(agent)
+        begin
+          IPAddr.new(my_ip)
+        rescue IPAddr::InvalidAddressError => e
+          raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
+        end
+        ScraperUtils::FiberScheduler.log "Proxy is using IP address: #{my_ip.inspect}"
+        my_headers = MechanizeUtils::public_headers(agent)
+        begin
+          # Check response is JSON just to be safe!
+          headers = JSON.parse(my_headers)
+          puts "Proxy is passing headers:"
+          puts JSON.pretty_generate(headers['headers'])
+        rescue JSON::ParserError => e
+          puts "Couldn't parse public_headers: #{e}! Raw response:"
+          puts my_headers.inspect
+        end
+      rescue Net::OpenTimeout, Timeout::Error => e
+        raise "Proxy check timed out: #{e}"
+      rescue Errno::ECONNREFUSED, Net::HTTP::Persistent::Error => e
+        raise "Failed to connect to proxy: #{e}"
+      rescue Mechanize::ResponseCodeError => e
+        raise "Proxy check error: #{e}"
+      end
+    end
+  end
+end
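Usage sketch (not part of the published diff): the new AgentConfig is designed to take global defaults via .configure and allow per-instance overrides, as the @example comments above show. The values below are illustrative only.

  # Global defaults for every agent configured afterwards (example values)
  ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
    config.default_timeout = 90        # seconds
    config.default_random_delay = 5    # average delay in seconds
    config.default_max_load = 15       # target server load percentage
  end

  # Per-instance keyword arguments still override the defaults above
  config = ScraperUtils::MechanizeUtils::AgentConfig.new(timeout: 120, random_delay: 10)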
data/lib/scraper_utils/mechanize_utils.rb
CHANGED
@@ -1,32 +1,23 @@
 # frozen_string_literal: true
 
 require "mechanize"
+require "ipaddr"
+require "scraper_utils/mechanize_utils/agent_config"
 
 module ScraperUtils
   # Utilities for configuring and using Mechanize for web scraping
   module MechanizeUtils
     PUBLIC_IP_URL = "https://whatismyip.akamai.com/"
+    HEADERS_ECHO_URL = "https://httpbin.org/headers"
 
-    # Creates and configures a Mechanize agent
-    #
-    # @param timeout [Integer, nil] Timeout for agent connections
-    # @param australian_proxy [Boolean] Whether to use an Australian proxy
+    # Creates and configures a Mechanize agent
+    # @param (see AgentConfig#initialize)
     # @return [Mechanize] Configured Mechanize agent
-    def self.mechanize_agent(
+    def self.mechanize_agent(**options)
       agent = Mechanize.new
-
-
-
-      # On morph.io set the environment variable MORPH_AUSTRALIAN_PROXY to
-      # http://morph:password@au.proxy.oaf.org.au:8888 replacing password with
-      # the real password.
-        agent.agent.set_proxy(ScraperUtils.australian_proxy)
-      end
-      if timeout
-        agent.open_timeout = timeout
-        agent.read_timeout = timeout
-      end
-      public_ip(agent) if use_proxy
+      config = AgentConfig.new(**options)
+      config.configure_agent(agent)
+      agent.instance_variable_set(:@scraper_utils_config, config)
       agent
     end
 
@@ -47,24 +38,27 @@ module ScraperUtils
         text = element.inner_text
         return "Maintenance: #{text}" if text&.match?(/maintenance/i)
       end
-
-      # Not in maintenance mode
       nil
     end
 
     # Retrieves and logs the public IP address
     #
-    # @param agent [Mechanize] Mechanize agent to use for IP lookup
-    # @param force [Boolean] Force a new IP lookup,
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+    # @param force [Boolean] Force a new IP lookup, by clearing cache first
    # @return [String] The public IP address
-    def self.public_ip(agent, force: false)
+    def self.public_ip(agent = nil, force: false)
      @public_ip = nil if force
-      @public_ip ||=
-
-
-
-
-
+      @public_ip ||= agent&.get(PUBLIC_IP_URL)&.body&.strip if agent
+    end
+
+    # Retrieves and logs the headers that make it through the proxy
+    #
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+    # @param force [Boolean] Force a new IP lookup, by clearing cache first
+    # @return [String] The list of headers in json format
+    def self.public_headers(agent = nil, force: false)
+      @public_headers = nil if force
+      @public_headers ||= agent&.get(HEADERS_ECHO_URL)&.body&.strip if agent
     end
   end
 end
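Usage sketch (not part of the published diff): mechanize_agent now simply forwards its keyword options to AgentConfig, so a scraper can request a fully configured agent in one call. The option values and URL below are illustrative.

  require "scraper_utils"

  agent = ScraperUtils::MechanizeUtils.mechanize_agent(
    timeout: 60,
    compliant_mode: true,
    australian_proxy: false
  )
  page = agent.get("https://example.com/planning-applications") # illustrative URL
  puts ScraperUtils::MechanizeUtils.public_ip(agent)             # cached after the first lookup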
data/lib/scraper_utils/robots_checker.rb
ADDED
@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  # robots.txt checker with deliberately simplistic rules
+  class RobotsChecker
+    # @return [String] Lowercased user_agent for matching
+    attr_reader :user_agent
+
+    # Initialize with full user agent string like:
+    # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
+    # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
+    # Checks for
+    # * Disallow for User-agent: bot_name and
+    # * Crawl-delay from either User-agent: bot name or * (default)
+    def initialize(user_agent)
+      @user_agent = extract_user_agent(user_agent).downcase
+      if ENV["DEBUG"]
+        ScraperUtils::FiberScheduler.log "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+      end
+      @rules = {} # domain -> {rules: [], delay: int}
+      @delay = nil # Delay from last robots.txt check
+    end
+
+    # Check if a URL is disallowed based on robots.txt rules specific to our user agent
+    # @param url [String] The full URL to check
+    # @return [Boolean] true if specifically blocked for our user agent, otherwise false
+    def disallowed?(url)
+      return false unless url
+
+      uri = URI(url)
+      domain = "#{uri.scheme}://#{uri.host}"
+      path = uri.path || "/"
+
+      # Get or fetch robots.txt rules
+      rules = get_rules(domain)
+      return false unless rules # If we can't get robots.txt, assume allowed
+
+      # Store any delay found for this domain
+      @delay = rules[:our_delay]
+
+      # Check rules specific to our user agent
+      matches_any_rule?(path, rules[:our_rules])
+    end
+
+    # Returns the crawl delay (if any) that applied to the last URL checked
+    # Should be called after disallowed? to get relevant delay
+    # @return [Integer, nil] The delay in seconds, or nil if no delay specified
+    def crawl_delay
+      @delay
+    end
+
+    private
+
+    def extract_user_agent(user_agent)
+      if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
+        user_agent = ::Regexp.last_match(2)&.strip
+      end
+      user_agent&.strip
+    end
+
+    def matches_any_rule?(path, rules)
+      rules&.any? { |rule| path.start_with?(rule) }
+    end
+
+    def get_rules(domain)
+      return @rules[domain] if @rules.key?(domain)
+
+      begin
+        response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
+        return nil unless response.code.start_with?("2") # 2xx response
+
+        rules = parse_robots_txt(response.body)
+        @rules[domain] = rules
+        rules
+      rescue StandardError => e
+        ScraperUtils::FiberScheduler.log "Warning: Failed to fetch robots.txt for #{domain}: #{e.message}" if ENV["DEBUG"]
+        nil
+      end
+    end
+
+    # Parse robots.txt content into structured rules
+    # Only collects rules for our specific user agent and generic crawl-delay
+    # @param content [String] The robots.txt content
+    # @return [Hash] Hash containing :our_rules and :our_delay
+    def parse_robots_txt(content)
+      sections = [] # Array of {agent:, rules:[], delay:} hashes
+      current_section = nil
+
+      content.each_line do |line|
+        line = line.strip.downcase
+        next if line.empty? || line.start_with?("#")
+
+        if line.start_with?("user-agent:")
+          agent = line.split(":", 2).last.strip
+          # Check if this is a continuation of the previous section
+          if current_section && current_section[:rules].empty? && current_section[:delay].nil?
+            current_section[:agents] << agent
+          else
+            current_section = { agents: [agent], rules: [], delay: nil }
+            sections << current_section
+          end
+          next
+        end
+
+        next unless current_section # Skip rules before first user-agent
+
+        if line.start_with?("disallow:")
+          path = line.split(":", 2).last.strip
+          current_section[:rules] << path unless path.empty?
+        elsif line.start_with?("crawl-delay:")
+          delay = line.split(":", 2).last.strip.to_i
+          current_section[:delay] = delay if delay.positive?
+        end
+      end
+
+      # Sort sections by most specific agent match first
+      matched_section = sections.find do |section|
+        section[:agents].any? do |agent|
+          # Our user agent starts with the agent from robots.txt
+          @user_agent.start_with?(agent) ||
+            # Or the agent from robots.txt starts with our user agent
+            # (handles ScraperUtils matching ScraperUtils/1.0)
+            agent.start_with?(@user_agent)
+        end
+      end
+
+      # Use matched section or fall back to wildcard
+      if matched_section
+        {
+          our_rules: matched_section[:rules],
+          our_delay: matched_section[:delay]
+        }
+      else
+        # Find default section
+        default_section = sections.find { |s| s[:agents].include?("*") }
+        {
+          our_rules: [],
+          our_delay: default_section&.dig(:delay)
+        }
+      end
+    end
+  end
+end
+
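Usage sketch (not part of the published diff): AgentConfig drives RobotsChecker from its post-connect hook, but the class can also be used directly. The user agent string and URL below are illustrative.

  checker = ScraperUtils::RobotsChecker.new(
    "Mozilla/5.0 (compatible; ScraperUtils/0.2.0 2025-02-27; +https://github.com/ianheggie-oaf/scraper_utils)"
  )
  url = "https://example.com/planning/application/123" # illustrative
  raise ScraperUtils::UnprocessableSite, "Disallowed: #{url}" if checker.disallowed?(url)

  delay = checker.crawl_delay       # nil unless robots.txt specifies one
  sleep(delay) if delay&.positive?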
data/lib/scraper_utils.rb
CHANGED
@@ -1,10 +1,13 @@
 # frozen_string_literal: true
 
+require "scraper_utils/adaptive_delay"
 require "scraper_utils/authority_utils"
+require "scraper_utils/data_quality_monitor"
 require "scraper_utils/db_utils"
 require "scraper_utils/debug_utils"
 require "scraper_utils/log_utils"
 require "scraper_utils/mechanize_utils"
+require "scraper_utils/robots_checker"
 require "scraper_utils/version"
 
 # Utilities for planningalerts scrapers
data/scraper_utils.gemspec
CHANGED
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
 
   spec.summary = "planningalerts scraper utilities"
   spec.description = "Utilities to help make planningalerts scrapers, " \
-
+                     "+especially multis easier to develop, run and debug."
   spec.homepage = "https://github.com/ianheggie-oaf/scraper_utils"
   spec.license = "MIT"
 
@@ -25,7 +25,7 @@ Gem::Specification.new do |spec|
   # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
   else
     raise "RubyGems 2.0 or newer is required to protect against " \
-
+          "public gem pushes."
   end
 
   # Specify which files should be added to the gem when it is released.
@@ -40,10 +40,5 @@ Gem::Specification.new do |spec|
   spec.add_dependency "mechanize"
   spec.add_dependency "nokogiri"
   spec.add_dependency "sqlite3"
-
-  spec.add_development_dependency "rake"
-  spec.add_development_dependency "rspec"
-  spec.add_development_dependency "rubocop"
-  spec.add_development_dependency "simplecov"
-  spec.add_development_dependency "simplecov-console"
+  spec.metadata["rubygems_mfa_required"] = "true"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-02-
+date: 2025-02-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -52,76 +52,6 @@ dependencies:
     - - ">="
     - !ruby/object:Gem::Version
       version: '0'
-- !ruby/object:Gem::Dependency
-  name: rake
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-- !ruby/object:Gem::Dependency
-  name: rspec
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-- !ruby/object:Gem::Dependency
-  name: rubocop
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-- !ruby/object:Gem::Dependency
-  name: simplecov
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-- !ruby/object:Gem::Dependency
-  name: simplecov-console
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
 description: Utilities to help make planningalerts scrapers, +especially multis easier
   to develop, run and debug.
 email:
@@ -134,18 +64,27 @@ files:
 - ".rspec"
 - ".rubocop.yml"
 - ".travis.yml"
+- CHANGELOG.md
+- GUIDELINES.md
 - Gemfile
+- IMPLEMENTATION.md
 - LICENSE.txt
 - README.md
 - Rakefile
+- SPECS.md
 - bin/console
 - bin/setup
 - lib/scraper_utils.rb
+- lib/scraper_utils/adaptive_delay.rb
 - lib/scraper_utils/authority_utils.rb
+- lib/scraper_utils/data_quality_monitor.rb
 - lib/scraper_utils/db_utils.rb
 - lib/scraper_utils/debug_utils.rb
+- lib/scraper_utils/fiber_scheduler.rb
 - lib/scraper_utils/log_utils.rb
 - lib/scraper_utils/mechanize_utils.rb
+- lib/scraper_utils/mechanize_utils/agent_config.rb
+- lib/scraper_utils/robots_checker.rb
 - lib/scraper_utils/version.rb
 - scraper_utils.gemspec
 homepage: https://github.com/ianheggie-oaf/scraper_utils
@@ -155,6 +94,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
+  rubygems_mfa_required: 'true'
 post_install_message:
 rdoc_options: []
 require_paths:
@@ -170,8 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-
-rubygems_version: 2.7.6.2
+rubygems_version: 3.4.10
 signing_key:
 specification_version: 4
 summary: planningalerts scraper utilities