scraper_utils 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +4 -0
- data/CHANGELOG.md +22 -1
- data/Gemfile +5 -2
- data/README.md +128 -149
- data/docs/example_scrape_with_fibers.rb +31 -0
- data/docs/example_scraper.rb +93 -0
- data/lib/scraper_utils/adaptive_delay.rb +55 -50
- data/lib/scraper_utils/cycle_utils.rb +25 -0
- data/lib/scraper_utils/data_quality_monitor.rb +28 -17
- data/lib/scraper_utils/date_range_utils.rb +159 -0
- data/lib/scraper_utils/db_utils.rb +0 -2
- data/lib/scraper_utils/debug_utils.rb +53 -6
- data/lib/scraper_utils/fiber_scheduler.rb +45 -22
- data/lib/scraper_utils/log_utils.rb +19 -17
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +67 -46
- data/lib/scraper_utils/mechanize_utils.rb +12 -4
- data/lib/scraper_utils/randomize_utils.rb +34 -0
- data/lib/scraper_utils/robots_checker.rb +9 -4
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +3 -10
- metadata +7 -2
data/lib/scraper_utils/log_utils.rb CHANGED

```diff
@@ -13,8 +13,8 @@ module ScraperUtils
     # @param start_time [Time] When this scraping attempt was started
     # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-    # @param exceptions [Hash
-    # DataQualityMonitor.stats is checked for :saved and :unprocessed entries
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    # `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
     # @return [void]
     def self.log_scraping_run(start_time, attempt, authorities, exceptions)
       raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
@@ -75,39 +75,39 @@ module ScraperUtils

     # Report on the results
     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-    # @param exceptions [Hash
-    # DataQualityMonitor.stats is checked for :saved and :unprocessed entries
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    # `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
     # @return [void]
     def self.report_on_results(authorities, exceptions)
-
+      if ENV["MORPH_EXPECT_BAD"]
+        expect_bad = ENV["MORPH_EXPECT_BAD"].split(",").map(&:strip).map(&:to_sym)
+      end
+      expect_bad ||= []

-
+      $stderr.flush
+      puts "MORPH_EXPECT_BAD=#{ENV.fetch('MORPH_EXPECT_BAD', nil)}"

       # Print summary table
       puts "\nScraping Summary:"
       summary_format = "%-20s %6s %6s %s"

-      puts summary_format
-      puts summary_format
+      puts format(summary_format, 'Authority', 'OK', 'Bad', 'Exception')
+      puts format(summary_format, "-" * 20, "-" * 6, "-" * 6, "-" * 50)

       authorities.each do |authority|
         stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
-
+
         ok_records = stats[:saved] || 0
         bad_records = stats[:unprocessed] || 0
-
+
         expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
         exception_msg = if exceptions[authority]
                           "#{exceptions[authority].class} - #{exceptions[authority].message}"
                         else
                           "-"
                         end
-        puts summary_format
-
-          ok_records,
-          bad_records,
-          "#{expect_bad_prefix}#{exception_msg}".slice(0, 70)
-        ]
+        puts format(summary_format, authority.to_s, ok_records, bad_records,
+                    "#{expect_bad_prefix}#{exception_msg}".slice(0, 70))
       end
       puts

@@ -120,7 +120,8 @@ module ScraperUtils
       end

       if unexpected_working.any?
-        errors <<
+        errors <<
+          "WARNING: Remove #{unexpected_working.join(',')} from MORPH_EXPECT_BAD as it now works!"
       end

       # Check for authorities with unexpected errors
@@ -137,6 +138,7 @@ module ScraperUtils
         end
       end

+      $stdout.flush
       if errors.any?
         errors << "See earlier output for details"
         raise errors.join("\n")
```
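The reworked `report_on_results` reads a comma-separated `MORPH_EXPECT_BAD` list before printing the summary table. A minimal usage sketch, assuming the method lives on `ScraperUtils::LogUtils` (inferred from the file path); the authority names and exception below are placeholders:

```ruby
require "scraper_utils"

# Sketch only: authorities and the exception are hypothetical examples.
ENV["MORPH_EXPECT_BAD"] = "broken_town,flaky_shire" # authorities expected to fail

authorities = %i[example_city broken_town flaky_shire]
exceptions = { broken_town: RuntimeError.new("site returned 500") }

# Prints the "Scraping Summary:" table (OK / Bad counts per authority),
# prefixes "[EXPECT BAD] " for authorities listed in MORPH_EXPECT_BAD,
# and raises when errors remain (unexpected failures, or expected-bad
# authorities that now work and should be removed from the list).
ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
```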
data/lib/scraper_utils/mechanize_utils/agent_config.rb CHANGED

```diff
@@ -23,6 +23,11 @@ module ScraperUtils
     #     random_delay: 10
     #   )
     class AgentConfig
+      DEFAULT_TIMEOUT = 60
+      DEFAULT_RANDOM_DELAY = 5
+      DEFAULT_MAX_LOAD = 33.3
+      MAX_LOAD_CAP = 50.0
+
       # Class-level defaults that can be modified
       class << self
         # @return [Integer] Default timeout in seconds for agent connections
@@ -62,65 +67,68 @@ module ScraperUtils
         # Reset all configuration options to their default values
         # @return [void]
         def reset_defaults!
-          @default_timeout = 60
-          @default_compliant_mode = true
-          @default_random_delay =
-          @default_max_load =
-          @default_disable_ssl_certificate_check = false
-          @default_australian_proxy = nil
-          @default_user_agent = nil
+          @default_timeout = ENV.fetch('MORPH_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
+          @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
+          @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 5
+          @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 33.3
+          @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
+          @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
+          @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
         end
       end

       # Set defaults on load
       reset_defaults!

-
       # @return [String] User agent string
       attr_reader :user_agent

       # Give access for testing

-      attr_reader :max_load
-      attr_reader :min_random
-      attr_reader :max_random
+      attr_reader :max_load, :min_random, :max_random

-      # Creates
-      # @param timeout [Integer, nil] Timeout for agent connections (default: 60
-      # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true
-      # @param random_delay [Integer, nil] Average random delay in seconds (default: 3
-      # @param max_load [Float, nil] Maximum server load percentage (nil = no
+      # Creates Mechanize agent configuration with sensible defaults overridable via configure
+      # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
+      # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true)
+      # @param random_delay [Integer, nil] Average random delay in seconds (default: 3)
+      # @param max_load [Float, nil] Maximum server load percentage (nil = no delay, default: 20%)
       #   When compliant_mode is true, max_load is capped at 33%
-      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false
-      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false
+      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false)
+      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false)
       # @param user_agent [String, nil] Configure Mechanize user agent
       def initialize(timeout: nil,
                      compliant_mode: nil,
                      random_delay: nil,
                      max_load: nil,
                      disable_ssl_certificate_check: nil,
-                     australian_proxy:
+                     australian_proxy: nil,
                      user_agent: nil)
        @timeout = timeout.nil? ? self.class.default_timeout : timeout
        @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
        @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
        @max_load = max_load.nil? ? self.class.default_max_load : max_load
-        @max_load = [@max_load ||
+        @max_load = [@max_load || DEFAULT_MAX_LOAD, MAX_LOAD_CAP].min if @compliant_mode
        @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent

-        @disable_ssl_certificate_check = disable_ssl_certificate_check.nil?
-                                           self.class.default_disable_ssl_certificate_check
+        @disable_ssl_certificate_check = if disable_ssl_certificate_check.nil?
+                                           self.class.default_disable_ssl_certificate_check
+                                         else
                                            disable_ssl_certificate_check
-
+                                         end
+        @australian_proxy = if australian_proxy.nil?
+                              self.class.default_australian_proxy
+                            else
+                              australian_proxy
+                            end

        # Validate proxy URL format if proxy will be used
        @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
        if @australian_proxy
          uri = begin
-
-
-
-
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+          end
          unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
            raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
          end
@@ -135,7 +143,7 @@ module ScraperUtils
        end

        today = Date.today.strftime("%Y-%m-%d")
-        @user_agent = ENV
+        @user_agent = ENV.fetch("MORPH_USER_AGENT", nil)&.sub("TODAY", today)
        if @compliant_mode
          version = ScraperUtils::VERSION
          @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
@@ -159,7 +167,8 @@ module ScraperUtils
        if @compliant_mode
          agent.user_agent = user_agent
          agent.request_headers ||= {}
-          agent.request_headers["Accept"] =
+          agent.request_headers["Accept"] =
+            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
          agent.request_headers["Upgrade-Insecure-Requests"] = "1"
        end
        if @australian_proxy
@@ -178,32 +187,39 @@ module ScraperUtils
      def display_options
        display_args = []
        display_args << "timeout=#{@timeout}" if @timeout
-        if
-
-
-
-
-          display_args << "australian_proxy=#{@australian_proxy.inspect}"
-        end
+        display_args << if ScraperUtils.australian_proxy.to_s.empty? && !@australian_proxy
+                          "#{ScraperUtils::AUSTRALIAN_PROXY_ENV_VAR} not set"
+                        else
+                          "australian_proxy=#{@australian_proxy.inspect}"
+                        end
        display_args << "compliant_mode" if @compliant_mode
        display_args << "random_delay=#{@random_delay}" if @random_delay
        display_args << "max_load=#{@max_load}%" if @max_load
        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
        display_args << "default args" if display_args.empty?
-        ScraperUtils::FiberScheduler.log
+        ScraperUtils::FiberScheduler.log(
+          "Configuring Mechanize agent with #{display_args.join(', ')}"
+        )
      end

      def pre_connect_hook(_agent, request)
        @connection_started_at = Time.now
-
+        return unless DebugUtils.verbose?
+
+        ScraperUtils::FiberScheduler.log(
+          "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
+        )
      end

      def post_connect_hook(_agent, uri, response, _body)
        raise ArgumentError, "URI must be present in post-connect hook" unless uri

        response_time = Time.now - @connection_started_at
-        if
-          ScraperUtils::FiberScheduler.log
+        if DebugUtils.basic?
+          ScraperUtils::FiberScheduler.log(
+            "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
+            "after #{response_time} seconds"
+          )
        end

        if @robots_checker&.disallowed?(uri)
@@ -214,18 +230,23 @@ module ScraperUtils
        delays = {
          robot_txt: @robots_checker&.crawl_delay&.round(3),
          max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
-          random: (@min_random ? (rand(@min_random..@max_random)
+          random: (@min_random ? (rand(@min_random..@max_random)**2).round(3) : nil)
        }
        @delay = delays.values.compact.max
        if @delay&.positive?
-
-
+          $stderr.flush
+          ScraperUtils::FiberScheduler.log("Delaying #{@delay} seconds, max of #{delays.inspect}") if ENV["DEBUG"]
+          $stdout.flush
+          ScraperUtils::FiberScheduler.delay(@delay)
        end

        response
      end

      def verify_proxy_works(agent)
+        $stderr.flush
+        $stdout.flush
+        FiberScheduler.log "Checking proxy works..."
        my_ip = MechanizeUtils.public_ip(agent)
        begin
          IPAddr.new(my_ip)
@@ -233,17 +254,17 @@ module ScraperUtils
          raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
        end
        ScraperUtils::FiberScheduler.log "Proxy is using IP address: #{my_ip.inspect}"
-        my_headers = MechanizeUtils
+        my_headers = MechanizeUtils.public_headers(agent)
        begin
          # Check response is JSON just to be safe!
          headers = JSON.parse(my_headers)
          puts "Proxy is passing headers:"
-          puts JSON.pretty_generate(headers[
+          puts JSON.pretty_generate(headers["headers"])
        rescue JSON::ParserError => e
          puts "Couldn't parse public_headers: #{e}! Raw response:"
          puts my_headers.inspect
        end
-      rescue
+      rescue Timeout::Error => e # Includes Net::OpenTimeout
        raise "Proxy check timed out: #{e}"
      rescue Errno::ECONNREFUSED, Net::HTTP::Persistent::Error => e
        raise "Failed to connect to proxy: #{e}"
```
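With `reset_defaults!` now reading `MORPH_TIMEOUT`, `MORPH_NOT_COMPLIANT`, `MORPH_RANDOM_DELAY`, `MORPH_MAX_LOAD`, `MORPH_DISABLE_SSL_CHECK`, `MORPH_USE_PROXY` and `MORPH_USER_AGENT`, the class-level defaults can be driven entirely from the environment, and `initialize` falls back to them for any keyword left as nil. A hedged sketch of that precedence, assuming the full constant path `ScraperUtils::MechanizeUtils::AgentConfig` (inferred from the file location); the values are illustrative only:

```ruby
require "scraper_utils"

# Sketch: env-var values below are illustrative, not recommendations.
ENV["MORPH_TIMEOUT"] = "30"      # picked up by reset_defaults! as default_timeout
ENV["MORPH_RANDOM_DELAY"] = "10" # picked up as default_random_delay

ScraperUtils::MechanizeUtils::AgentConfig.reset_defaults!

config = ScraperUtils::MechanizeUtils::AgentConfig.new(
  timeout: nil,         # nil => falls back to default_timeout (30 via MORPH_TIMEOUT)
  compliant_mode: true, # caps max_load via MAX_LOAD_CAP and supplies a user agent when none is set
  max_load: 20.0,       # explicit argument overrides MORPH_MAX_LOAD / DEFAULT_MAX_LOAD
  australian_proxy: false
)
```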
data/lib/scraper_utils/mechanize_utils.rb CHANGED

```diff
@@ -45,20 +45,28 @@ module ScraperUtils
    #
    # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
    # @param force [Boolean] Force a new IP lookup, by clearing cache first
-    # @return [String] The public IP address
+    # @return [String, nil] The public IP address
    def self.public_ip(agent = nil, force: false)
      @public_ip = nil if force
-      @public_ip ||=
+      @public_ip ||= begin
+        response = agent&.get(PUBLIC_IP_URL)
+        response&.body&.strip
+      end
+      @public_ip
    end

    # Retrieves and logs the headers that make it through the proxy
    #
    # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
    # @param force [Boolean] Force a new IP lookup, by clearing cache first
-    # @return [String] The list of headers in json format
+    # @return [String, nil] The list of headers in json format
    def self.public_headers(agent = nil, force: false)
      @public_headers = nil if force
-      @public_headers ||=
+      @public_headers ||= begin
+        response = agent&.get(HEADERS_ECHO_URL)
+        response&.body&.strip
+      end
+      @public_headers
    end
  end
end
```
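`public_ip` and `public_headers` now guard against a nil agent and memoize the stripped response body, so both can return nil. A small sketch (a bare Mechanize agent is used here for illustration; in practice the agent would come from the gem's configured helpers):

```ruby
require "mechanize"
require "scraper_utils"

agent = Mechanize.new # illustration only; normally a configured agent is used

ip = ScraperUtils::MechanizeUtils.public_ip(agent)                # String such as "203.0.113.5", or nil
headers_json = ScraperUtils::MechanizeUtils.public_headers(agent) # JSON string of echoed headers, or nil

ScraperUtils::MechanizeUtils.public_ip(nil)                # returns the memoized value (or nil)
ScraperUtils::MechanizeUtils.public_ip(agent, force: true) # clears the cache and looks up again
```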
data/lib/scraper_utils/randomize_utils.rb ADDED

```diff
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  # Provides utilities for randomizing processing order in scrapers,
+  # particularly helpful for distributing load and avoiding predictable patterns
+  module RandomizeUtils
+    # Returns a randomized version of the input collection when in production mode,
+    # or the original collection when in test/sequential mode
+    #
+    # @param collection [Array, Enumerable] Collection of items to potentially randomize
+    # @return [Array] Randomized or original collection depending on environment
+    def self.randomize_order(collection)
+      return collection.to_a if sequential?
+
+      collection.to_a.shuffle
+    end
+
+    # Checks if sequential processing is enabled
+    #
+    # @return [Boolean] true when in test mode or MORPH_PROCESS_SEQUENTIALLY is set
+    def self.sequential?
+      @sequential = !ENV["MORPH_PROCESS_SEQUENTIALLY"].to_s.empty? if @sequential.nil?
+      @sequential || false
+    end
+
+    # Explicitly set sequential mode for testing
+    #
+    # @param value [Boolean, nil] true to enable sequential mode, false to disable, nil to clear cache
+    # @return [Boolean, nil]
+    def self.sequential=(value)
+      @sequential = value
+    end
+  end
+end
```
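A short usage sketch for the new module; the authority symbols are placeholders:

```ruby
require "scraper_utils"

authorities = %i[alpha bravo charlie] # placeholder names

# Shuffled on normal runs; returned in original order when
# MORPH_PROCESS_SEQUENTIALLY is set or sequential mode is forced.
ScraperUtils::RandomizeUtils.randomize_order(authorities)

# In specs, force deterministic ordering:
ScraperUtils::RandomizeUtils.sequential = true
ScraperUtils::RandomizeUtils.randomize_order(authorities) # => [:alpha, :bravo, :charlie]

# Clear the override so MORPH_PROCESS_SEQUENTIALLY is consulted again:
ScraperUtils::RandomizeUtils.sequential = nil
```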
data/lib/scraper_utils/robots_checker.rb CHANGED

```diff
@@ -14,8 +14,10 @@ module ScraperUtils
    # * Crawl-delay from either User-agent: bot name or * (default)
    def initialize(user_agent)
      @user_agent = extract_user_agent(user_agent).downcase
-      if
-        ScraperUtils::FiberScheduler.log
+      if DebugUtils.basic?
+        ScraperUtils::FiberScheduler.log(
+          "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+        )
      end
      @rules = {} # domain -> {rules: [], delay: int}
      @delay = nil # Delay from last robots.txt check
@@ -73,7 +75,11 @@ module ScraperUtils
      @rules[domain] = rules
      rules
    rescue StandardError => e
-
+      if DebugUtils.basic?
+        ScraperUtils::FiberScheduler.log(
+          "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
+        )
+      end
      nil
    end
  end
@@ -141,4 +147,3 @@ module ScraperUtils
    end
  end
end
-
```
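For reference, a hedged sketch of the call pattern this class supports, mirroring the `@robots_checker&.disallowed?(uri)` and `crawl_delay` calls in the agent_config hunks above; the user agent string and URL are placeholders:

```ruby
require "scraper_utils"
require "uri"

# Placeholder user agent; compliant agents in this gem look like
# "Mozilla/5.0 (compatible; ScraperUtils/x.y.z ...)".
checker = ScraperUtils::RobotsChecker.new("Mozilla/5.0 (compatible; ScraperUtils/0.4.0)")

uri = URI("https://example.com/planning/applications") # placeholder URL
unless checker.disallowed?(uri)
  # safe to fetch uri according to the robots.txt rules for this user agent prefix
end

checker.crawl_delay # Crawl-delay from the most recently fetched robots.txt, if any
```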
data/lib/scraper_utils.rb CHANGED

```diff
@@ -5,8 +5,11 @@ require "scraper_utils/authority_utils"
 require "scraper_utils/data_quality_monitor"
 require "scraper_utils/db_utils"
 require "scraper_utils/debug_utils"
+require "scraper_utils/fiber_scheduler"
 require "scraper_utils/log_utils"
+require "scraper_utils/mechanize_utils/agent_config"
 require "scraper_utils/mechanize_utils"
+require "scraper_utils/randomize_utils"
 require "scraper_utils/robots_checker"
 require "scraper_utils/version"

@@ -15,9 +18,6 @@ module ScraperUtils
   # Constants for configuration on Morph.io
   AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"

-  # Enable debug locally, not on morph.io
-  DEBUG_ENV_VAR = "DEBUG"
-
   # Fatal Error
   class Error < StandardError
   end
@@ -31,13 +31,6 @@ module ScraperUtils
   class UnprocessableRecord < Error
   end

-  # Check if debug mode is enabled
-  #
-  # @return [Boolean] Whether debug mode is active
-  def self.debug?
-    !ENV[DEBUG_ENV_VAR].to_s.empty?
-  end
-
   def self.australian_proxy
     ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
     ap.empty? ? nil : ap
```
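With `DEBUG_ENV_VAR` and `ScraperUtils.debug?` removed from the top-level module, debug gating in these hunks goes through `DebugUtils` (`DebugUtils.basic?` / `DebugUtils.verbose?`), while `ScraperUtils.australian_proxy` remains the accessor for `MORPH_AUSTRALIAN_PROXY`. A minimal sketch; the proxy URL is a placeholder, and the exact env var `DebugUtils` reads lives in debug_utils.rb (changed in this release but not shown here):

```ruby
require "scraper_utils"

# Placeholder proxy URL, not a real endpoint.
ENV["MORPH_AUSTRALIAN_PROXY"] = "http://user:password@proxy.example.com:8888"
ScraperUtils.australian_proxy # => the URL string above, or nil when the env var is empty/unset

# Debug checks now go through DebugUtils rather than ScraperUtils.debug?, e.g.:
if ScraperUtils::DebugUtils.basic?
  ScraperUtils::FiberScheduler.log("extra diagnostic output ...")
end
```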
metadata CHANGED

```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-
+date: 2025-03-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -74,16 +74,21 @@ files:
 - SPECS.md
 - bin/console
 - bin/setup
+- docs/example_scrape_with_fibers.rb
+- docs/example_scraper.rb
 - lib/scraper_utils.rb
 - lib/scraper_utils/adaptive_delay.rb
 - lib/scraper_utils/authority_utils.rb
+- lib/scraper_utils/cycle_utils.rb
 - lib/scraper_utils/data_quality_monitor.rb
+- lib/scraper_utils/date_range_utils.rb
 - lib/scraper_utils/db_utils.rb
 - lib/scraper_utils/debug_utils.rb
 - lib/scraper_utils/fiber_scheduler.rb
 - lib/scraper_utils/log_utils.rb
 - lib/scraper_utils/mechanize_utils.rb
 - lib/scraper_utils/mechanize_utils/agent_config.rb
+- lib/scraper_utils/randomize_utils.rb
 - lib/scraper_utils/robots_checker.rb
 - lib/scraper_utils/version.rb
 - scraper_utils.gemspec
```