scraper_utils 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/scraper_utils/mechanize_utils/agent_config.rb ADDED
@@ -0,0 +1,255 @@
+ # frozen_string_literal: true
+
+ require "mechanize"
+ require "ipaddr"
+
+ module ScraperUtils
+   module MechanizeUtils
+     # Configuration for a Mechanize agent with sensible defaults and configurable settings.
+     # Supports global configuration through {.configure} and per-instance overrides.
+     #
+     # @example Setting global defaults
+     #   ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
+     #     config.default_timeout = 90
+     #     config.default_random_delay = 5
+     #   end
+     #
+     # @example Creating an instance with defaults
+     #   config = ScraperUtils::MechanizeUtils::AgentConfig.new
+     #
+     # @example Overriding specific settings
+     #   config = ScraperUtils::MechanizeUtils::AgentConfig.new(
+     #     timeout: 120,
+     #     random_delay: 10
+     #   )
+     class AgentConfig
+       # Class-level defaults that can be modified
+       class << self
+         # @return [Integer] Default timeout in seconds for agent connections
+         attr_accessor :default_timeout
+
+         # @return [Boolean] Default setting for compliance with headers and robots.txt
+         attr_accessor :default_compliant_mode
+
+         # @return [Integer, nil] Default average random delay in seconds
+         attr_accessor :default_random_delay
+
+         # @return [Float, nil] Default maximum server load percentage (nil = no response delay)
+         attr_accessor :default_max_load
+
+         # @return [Boolean] Default setting for SSL certificate verification
+         attr_accessor :default_disable_ssl_certificate_check
+
+         # @return [Boolean] Default flag for Australian proxy preference
+         attr_accessor :default_australian_proxy
+
+         # @return [String, nil] Default Mechanize user agent
+         attr_accessor :default_user_agent
+
+         # Configure default settings for all AgentConfig instances
+         # @yield [self] Yields self for configuration
+         # @example
+         #   AgentConfig.configure do |config|
+         #     config.default_timeout = 90
+         #     config.default_random_delay = 5
+         #     config.default_max_load = 15
+         #   end
+         # @return [void]
+         def configure
+           yield self if block_given?
+         end
+
+         # Reset all configuration options to their default values
+         # @return [void]
+         def reset_defaults!
+           @default_timeout = 60
+           @default_compliant_mode = true
+           @default_random_delay = 3
+           @default_max_load = 20.0
+           @default_disable_ssl_certificate_check = false
+           @default_australian_proxy = nil
+           @default_user_agent = nil
+         end
+       end
+
+       # Set defaults on load
+       reset_defaults!
+
+
+       # @return [String] User agent string
+       attr_reader :user_agent
+
+       # Give access for testing
+
+       attr_reader :max_load
+       attr_reader :min_random
+       attr_reader :max_random
+
+       # Creates configuration for a Mechanize agent with sensible defaults
+       # @param timeout [Integer, nil] Timeout for agent connections (default: 60 unless changed)
+       # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true unless changed)
+       # @param random_delay [Integer, nil] Average random delay in seconds (default: 3 unless changed)
+       # @param max_load [Float, nil] Maximum server load percentage (nil = no response delay, default: 20%)
+       #   When compliant_mode is true, max_load is capped at 33%
+       # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false unless changed)
+       # @param australian_proxy [Boolean, nil] Use proxy if available (default: false unless changed)
+       # @param user_agent [String, nil] Configure Mechanize user agent
+       def initialize(timeout: nil,
+                      compliant_mode: nil,
+                      random_delay: nil,
+                      max_load: nil,
+                      disable_ssl_certificate_check: nil,
+                      australian_proxy: false,
+                      user_agent: nil)
+         @timeout = timeout.nil? ? self.class.default_timeout : timeout
+         @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
+         @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
+         @max_load = max_load.nil? ? self.class.default_max_load : max_load
+         @max_load = [@max_load || 20.0, 33.0].min if @compliant_mode
+         @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent
+
+         @disable_ssl_certificate_check = disable_ssl_certificate_check.nil? ?
+                                            self.class.default_disable_ssl_certificate_check :
+                                            disable_ssl_certificate_check
+         @australian_proxy = australian_proxy.nil? ? self.class.default_australian_proxy : australian_proxy
+
+         # Validate proxy URL format if proxy will be used
+         @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
+         if @australian_proxy
+           uri = begin
+             URI.parse(ScraperUtils.australian_proxy.to_s)
+           rescue URI::InvalidURIError => e
+             raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+           end
+           unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+             raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
+           end
+           unless uri.host && uri.port
+             raise URI::InvalidURIError, "Proxy URL must include host and port"
+           end
+         end
+
+         if @random_delay
+           @min_random = Math.sqrt(@random_delay * 3.0 / 13.0).round(3)
+           @max_random = (3 * @min_random).round(3)
+         end
+
+         today = Date.today.strftime("%Y-%m-%d")
+         @user_agent = ENV['MORPH_USER_AGENT']&.sub("TODAY", today)
+         if @compliant_mode
+           version = ScraperUtils::VERSION
+           @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
+         end
+
+         @robots_checker = RobotsChecker.new(@user_agent) if @user_agent
+         @adaptive_delay = AdaptiveDelay.new(max_load: @max_load) if @max_load
+         display_options
+       end
+
+       # Configures a Mechanize agent with these settings
+       # @param agent [Mechanize] The agent to configure
+       # @return [void]
+       def configure_agent(agent)
+         agent.verify_mode = OpenSSL::SSL::VERIFY_NONE if @disable_ssl_certificate_check
+
+         if @timeout
+           agent.open_timeout = @timeout
+           agent.read_timeout = @timeout
+         end
+         if @compliant_mode
+           agent.user_agent = user_agent
+           agent.request_headers ||= {}
+           agent.request_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+           agent.request_headers["Upgrade-Insecure-Requests"] = "1"
+         end
+         if @australian_proxy
+           agent.agent.set_proxy(ScraperUtils.australian_proxy)
+           agent.request_headers["Accept-Language"] = "en-AU,en-US;q=0.9,en;q=0.8"
+           verify_proxy_works(agent)
+         end
+
+         @connection_started_at = nil
+         agent.pre_connect_hooks << method(:pre_connect_hook)
+         agent.post_connect_hooks << method(:post_connect_hook)
+       end
+
+       private
+
+       def display_options
+         display_args = []
+         display_args << "timeout=#{@timeout}" if @timeout
+         if @australian_proxy
+           display_args << "australian_proxy=#{@australian_proxy.inspect}"
+         elsif ScraperUtils.australian_proxy.to_s.empty?
+           display_args << "#{ScraperUtils::AUSTRALIAN_PROXY_ENV_VAR} not set"
+         else
+           display_args << "australian_proxy=#{@australian_proxy.inspect}"
+         end
+         display_args << "compliant_mode" if @compliant_mode
+         display_args << "random_delay=#{@random_delay}" if @random_delay
+         display_args << "max_load=#{@max_load}%" if @max_load
+         display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
+         display_args << "default args" if display_args.empty?
+         ScraperUtils::FiberScheduler.log "Configuring Mechanize agent with #{display_args.join(', ')}"
+       end
+
+       def pre_connect_hook(_agent, request)
+         @connection_started_at = Time.now
+         ScraperUtils::FiberScheduler.log "Pre Connect request: #{request.inspect} at #{@connection_started_at}" if ENV["DEBUG"]
+       end
+
+       def post_connect_hook(_agent, uri, response, _body)
+         raise ArgumentError, "URI must be present in post-connect hook" unless uri
+
+         response_time = Time.now - @connection_started_at
+         if ENV["DEBUG"]
+           ScraperUtils::FiberScheduler.log "Post Connect uri: #{uri.inspect}, response: #{response.inspect} after #{response_time} seconds"
+         end
+
+         if @robots_checker&.disallowed?(uri)
+           raise ScraperUtils::UnprocessableSite,
+                 "URL is disallowed by robots.txt specific rules: #{uri}"
+         end
+
+         delays = {
+           robot_txt: @robots_checker&.crawl_delay&.round(3),
+           max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+           random: (@min_random ? (rand(@min_random..@max_random) ** 2).round(3) : nil)
+         }
+         @delay = delays.values.compact.max
+         if @delay&.positive?
+           puts "Delaying #{@delay} seconds, max of #{delays.inspect}" if ENV["DEBUG"]
+           sleep(@delay)
+         end
+
+         response
+       end
+
+       def verify_proxy_works(agent)
+         my_ip = MechanizeUtils.public_ip(agent)
+         begin
+           IPAddr.new(my_ip)
+         rescue IPAddr::InvalidAddressError => e
+           raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
+         end
+         ScraperUtils::FiberScheduler.log "Proxy is using IP address: #{my_ip.inspect}"
+         my_headers = MechanizeUtils::public_headers(agent)
+         begin
+           # Check response is JSON just to be safe!
+           headers = JSON.parse(my_headers)
+           puts "Proxy is passing headers:"
+           puts JSON.pretty_generate(headers['headers'])
+         rescue JSON::ParserError => e
+           puts "Couldn't parse public_headers: #{e}! Raw response:"
+           puts my_headers.inspect
+         end
+       rescue Net::OpenTimeout, Timeout::Error => e
+         raise "Proxy check timed out: #{e}"
+       rescue Errno::ECONNREFUSED, Net::HTTP::Persistent::Error => e
+         raise "Failed to connect to proxy: #{e}"
+       rescue Mechanize::ResponseCodeError => e
+         raise "Proxy check error: #{e}"
+       end
+     end
+   end
+ end
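
The class above is used in two steps: set gem-wide defaults once, then create per-agent instances that override only what they need. A minimal usage sketch based on the YARD examples above (option values are illustrative only):

    # Gem-wide defaults, e.g. in scraper.rb
    ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
      config.default_timeout = 90       # open/read timeout in seconds
      config.default_random_delay = 5   # average extra delay in seconds
    end

    # Per-instance overrides win over the class-level defaults
    config = ScraperUtils::MechanizeUtils::AgentConfig.new(timeout: 120, australian_proxy: true)

    # Attach the settings (headers, proxy, pre/post connect hooks) to an agent
    agent = Mechanize.new
    config.configure_agent(agent)

On the random delay maths in initialize and post_connect_hook: with min_random = sqrt(3 * random_delay / 13) and max_random = 3 * min_random, the squared uniform sample rand(min_random..max_random) ** 2 has mean (min_random**2 + min_random * max_random + max_random**2) / 3 = 13 * min_random**2 / 3 = random_delay, so random_delay really is the average added delay, as documented.
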
data/lib/scraper_utils/mechanize_utils.rb CHANGED
@@ -1,32 +1,23 @@
  # frozen_string_literal: true

  require "mechanize"
+ require "ipaddr"
+ require "scraper_utils/mechanize_utils/agent_config"

  module ScraperUtils
    # Utilities for configuring and using Mechanize for web scraping
    module MechanizeUtils
      PUBLIC_IP_URL = "https://whatismyip.akamai.com/"
+     HEADERS_ECHO_URL = "https://httpbin.org/headers"

-     # Creates and configures a Mechanize agent with optional proxy and timeout
-     #
-     # @param timeout [Integer, nil] Timeout for agent connections
-     # @param australian_proxy [Boolean] Whether to use an Australian proxy
+     # Creates and configures a Mechanize agent
+     # @param (see AgentConfig#initialize)
      # @return [Mechanize] Configured Mechanize agent
-     def self.mechanize_agent(timeout: nil, use_proxy: true)
+     def self.mechanize_agent(**options)
        agent = Mechanize.new
-       agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
-       use_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
-       if use_proxy
-         # On morph.io set the environment variable MORPH_AUSTRALIAN_PROXY to
-         # http://morph:password@au.proxy.oaf.org.au:8888 replacing password with
-         # the real password.
-         agent.agent.set_proxy(ScraperUtils.australian_proxy)
-       end
-       if timeout
-         agent.open_timeout = timeout
-         agent.read_timeout = timeout
-       end
-       public_ip(agent) if use_proxy
+       config = AgentConfig.new(**options)
+       config.configure_agent(agent)
+       agent.instance_variable_set(:@scraper_utils_config, config)
        agent
      end

@@ -47,24 +38,27 @@ module ScraperUtils
        text = element.inner_text
        return "Maintenance: #{text}" if text&.match?(/maintenance/i)
      end
-
-     # Not in maintenance mode
      nil
    end

    # Retrieves and logs the public IP address
    #
-   # @param agent [Mechanize] Mechanize agent to use for IP lookup
-   # @param force [Boolean] Force a new IP lookup, bypassing cache
+   # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+   # @param force [Boolean] Force a new IP lookup, by clearing cache first
    # @return [String] The public IP address
-   def self.public_ip(agent, force: false)
+   def self.public_ip(agent = nil, force: false)
      @public_ip = nil if force
-     @public_ip ||=
-       begin
-         ip = agent.get(PUBLIC_IP_URL).body.strip
-         puts "Public IP: #{ip}"
-         ip
-       end
+     @public_ip ||= agent&.get(PUBLIC_IP_URL)&.body&.strip if agent
+   end
+
+   # Retrieves and logs the headers that make it through the proxy
+   #
+   # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+   # @param force [Boolean] Force a new IP lookup, by clearing cache first
+   # @return [String] The list of headers in json format
+   def self.public_headers(agent = nil, force: false)
+     @public_headers = nil if force
+     @public_headers ||= agent&.get(HEADERS_ECHO_URL)&.body&.strip if agent
    end
  end
end
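
With the refactor above, mechanize_agent forwards all keyword arguments to AgentConfig, replacing the old timeout:/use_proxy: pair. A hedged sketch of the new call (option values and the target URL are illustrative):

    agent = ScraperUtils::MechanizeUtils.mechanize_agent(
      timeout: 60,
      compliant_mode: true,
      random_delay: 3,
      max_load: 20.0,
      australian_proxy: true
    )
    page = agent.get("https://example.com/planning-applications")

    # public_ip and public_headers also act as cache clearers when called without an agent:
    ScraperUtils::MechanizeUtils.public_ip(nil, force: true)
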
data/lib/scraper_utils/robots_checker.rb ADDED
@@ -0,0 +1,144 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   # robots.txt checker with deliberately simplistic rules
+   class RobotsChecker
+     # @return [String] Lowercased user_agent for matching
+     attr_reader :user_agent
+
+     # Initialize with full user agent string like:
+     # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
+     # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
+     # Checks for
+     # * Disallow for User-agent: bot_name and
+     # * Crawl-delay from either User-agent: bot name or * (default)
+     def initialize(user_agent)
+       @user_agent = extract_user_agent(user_agent).downcase
+       if ENV["DEBUG"]
+         ScraperUtils::FiberScheduler.log "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+       end
+       @rules = {} # domain -> {rules: [], delay: int}
+       @delay = nil # Delay from last robots.txt check
+     end
+
+     # Check if a URL is disallowed based on robots.txt rules specific to our user agent
+     # @param url [String] The full URL to check
+     # @return [Boolean] true if specifically blocked for our user agent, otherwise false
+     def disallowed?(url)
+       return false unless url
+
+       uri = URI(url)
+       domain = "#{uri.scheme}://#{uri.host}"
+       path = uri.path || "/"
+
+       # Get or fetch robots.txt rules
+       rules = get_rules(domain)
+       return false unless rules # If we can't get robots.txt, assume allowed
+
+       # Store any delay found for this domain
+       @delay = rules[:our_delay]
+
+       # Check rules specific to our user agent
+       matches_any_rule?(path, rules[:our_rules])
+     end
+
+     # Returns the crawl delay (if any) that applied to the last URL checked
+     # Should be called after disallowed? to get relevant delay
+     # @return [Integer, nil] The delay in seconds, or nil if no delay specified
+     def crawl_delay
+       @delay
+     end
+
+     private
+
+     def extract_user_agent(user_agent)
+       if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
+         user_agent = ::Regexp.last_match(2)&.strip
+       end
+       user_agent&.strip
+     end
+
+     def matches_any_rule?(path, rules)
+       rules&.any? { |rule| path.start_with?(rule) }
+     end
+
+     def get_rules(domain)
+       return @rules[domain] if @rules.key?(domain)
+
+       begin
+         response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
+         return nil unless response.code.start_with?("2") # 2xx response
+
+         rules = parse_robots_txt(response.body)
+         @rules[domain] = rules
+         rules
+       rescue StandardError => e
+         ScraperUtils::FiberScheduler.log "Warning: Failed to fetch robots.txt for #{domain}: #{e.message}" if ENV["DEBUG"]
+         nil
+       end
+     end
+
+     # Parse robots.txt content into structured rules
+     # Only collects rules for our specific user agent and generic crawl-delay
+     # @param content [String] The robots.txt content
+     # @return [Hash] Hash containing :our_rules and :our_delay
+     def parse_robots_txt(content)
+       sections = [] # Array of {agent:, rules:[], delay:} hashes
+       current_section = nil
+
+       content.each_line do |line|
+         line = line.strip.downcase
+         next if line.empty? || line.start_with?("#")
+
+         if line.start_with?("user-agent:")
+           agent = line.split(":", 2).last.strip
+           # Check if this is a continuation of the previous section
+           if current_section && current_section[:rules].empty? && current_section[:delay].nil?
+             current_section[:agents] << agent
+           else
+             current_section = { agents: [agent], rules: [], delay: nil }
+             sections << current_section
+           end
+           next
+         end
+
+         next unless current_section # Skip rules before first user-agent
+
+         if line.start_with?("disallow:")
+           path = line.split(":", 2).last.strip
+           current_section[:rules] << path unless path.empty?
+         elsif line.start_with?("crawl-delay:")
+           delay = line.split(":", 2).last.strip.to_i
+           current_section[:delay] = delay if delay.positive?
+         end
+       end
+
+       # Sort sections by most specific agent match first
+       matched_section = sections.find do |section|
+         section[:agents].any? do |agent|
+           # Our user agent starts with the agent from robots.txt
+           @user_agent.start_with?(agent) ||
+             # Or the agent from robots.txt starts with our user agent
+             # (handles ScraperUtils matching ScraperUtils/1.0)
+             agent.start_with?(@user_agent)
+         end
+       end
+
+       # Use matched section or fall back to wildcard
+       if matched_section
+         {
+           our_rules: matched_section[:rules],
+           our_delay: matched_section[:delay]
+         }
+       else
+         # Find default section
+         default_section = sections.find { |s| s[:agents].include?("*") }
+         {
+           our_rules: [],
+           our_delay: default_section&.dig(:delay)
+         }
+       end
+     end
+   end
+ end
+
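
RobotsChecker is normally driven from AgentConfig's post_connect_hook, but it can also be exercised directly. A minimal sketch (the user agent string and URL are illustrative):

    checker = ScraperUtils::RobotsChecker.new(
      "Mozilla/5.0 (compatible; ScraperUtils/0.2.0 2025-02-27; +https://github.com/ianheggie-oaf/scraper_utils)"
    )
    if checker.disallowed?("https://example.com/admin/reports")
      raise ScraperUtils::UnprocessableSite, "Blocked by robots.txt specific rules"
    end
    sleep(checker.crawl_delay) if checker.crawl_delay  # honour any Crawl-delay that applied
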
data/lib/scraper_utils/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module ScraperUtils
-   VERSION = "0.1.0"
+   VERSION = "0.2.0"
  end
data/lib/scraper_utils.rb CHANGED
@@ -1,10 +1,13 @@
  # frozen_string_literal: true

+ require "scraper_utils/adaptive_delay"
  require "scraper_utils/authority_utils"
+ require "scraper_utils/data_quality_monitor"
  require "scraper_utils/db_utils"
  require "scraper_utils/debug_utils"
  require "scraper_utils/log_utils"
  require "scraper_utils/mechanize_utils"
+ require "scraper_utils/robots_checker"
  require "scraper_utils/version"

  # Utilities for planningalerts scrapers
scraper_utils.gemspec CHANGED
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|

    spec.summary = "planningalerts scraper utilities"
    spec.description = "Utilities to help make planningalerts scrapers, " \
-                      "+especially multis easier to develop, run and debug."
+                      "+especially multis easier to develop, run and debug."
    spec.homepage = "https://github.com/ianheggie-oaf/scraper_utils"
    spec.license = "MIT"

@@ -25,7 +25,7 @@ Gem::Specification.new do |spec|
      # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
    else
      raise "RubyGems 2.0 or newer is required to protect against " \
-           "public gem pushes."
+           "public gem pushes."
    end

    # Specify which files should be added to the gem when it is released.
@@ -40,10 +40,5 @@ Gem::Specification.new do |spec|
    spec.add_dependency "mechanize"
    spec.add_dependency "nokogiri"
    spec.add_dependency "sqlite3"
-
-   spec.add_development_dependency "rake"
-   spec.add_development_dependency "rspec"
-   spec.add_development_dependency "rubocop"
-   spec.add_development_dependency "simplecov"
-   spec.add_development_dependency "simplecov-console"
+   spec.metadata["rubygems_mfa_required"] = "true"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: scraper_utils
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.2.0
  platform: ruby
  authors:
  - Ian Heggie
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2025-02-22 00:00:00.000000000 Z
+ date: 2025-02-27 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: mechanize
@@ -52,76 +52,6 @@ dependencies:
      - - ">="
        - !ruby/object:Gem::Version
          version: '0'
- - !ruby/object:Gem::Dependency
-   name: rake
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: rspec
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: rubocop
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: simplecov
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: simplecov-console
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
  description: Utilities to help make planningalerts scrapers, +especially multis easier
    to develop, run and debug.
  email:
@@ -134,18 +64,27 @@ files:
  - ".rspec"
  - ".rubocop.yml"
  - ".travis.yml"
+ - CHANGELOG.md
+ - GUIDELINES.md
  - Gemfile
+ - IMPLEMENTATION.md
  - LICENSE.txt
  - README.md
  - Rakefile
+ - SPECS.md
  - bin/console
  - bin/setup
  - lib/scraper_utils.rb
+ - lib/scraper_utils/adaptive_delay.rb
  - lib/scraper_utils/authority_utils.rb
+ - lib/scraper_utils/data_quality_monitor.rb
  - lib/scraper_utils/db_utils.rb
  - lib/scraper_utils/debug_utils.rb
+ - lib/scraper_utils/fiber_scheduler.rb
  - lib/scraper_utils/log_utils.rb
  - lib/scraper_utils/mechanize_utils.rb
+ - lib/scraper_utils/mechanize_utils/agent_config.rb
+ - lib/scraper_utils/robots_checker.rb
  - lib/scraper_utils/version.rb
  - scraper_utils.gemspec
  homepage: https://github.com/ianheggie-oaf/scraper_utils
@@ -155,6 +94,7 @@ metadata:
    allowed_push_host: https://rubygems.org
    homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
    source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
+   rubygems_mfa_required: 'true'
  post_install_message:
  rdoc_options: []
  require_paths:
@@ -170,8 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      - !ruby/object:Gem::Version
        version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.6.2
+ rubygems_version: 3.4.10
  signing_key:
  specification_version: 4
  summary: planningalerts scraper utilities