scraper_utils 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,32 +1,23 @@
  # frozen_string_literal: true

  require "mechanize"
+ require "ipaddr"
+ require "scraper_utils/mechanize_utils/agent_config"

  module ScraperUtils
    # Utilities for configuring and using Mechanize for web scraping
    module MechanizeUtils
      PUBLIC_IP_URL = "https://whatismyip.akamai.com/"
+     HEADERS_ECHO_URL = "https://httpbin.org/headers"

-     # Creates and configures a Mechanize agent with optional proxy and timeout
-     #
-     # @param timeout [Integer, nil] Timeout for agent connections
-     # @param australian_proxy [Boolean] Whether to use an Australian proxy
+     # Creates and configures a Mechanize agent
+     # @param (see AgentConfig#initialize)
      # @return [Mechanize] Configured Mechanize agent
-     def self.mechanize_agent(timeout: nil, use_proxy: true)
+     def self.mechanize_agent(**options)
        agent = Mechanize.new
-       agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
-       use_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
-       if use_proxy
-         # On morph.io set the environment variable MORPH_AUSTRALIAN_PROXY to
-         # http://morph:password@au.proxy.oaf.org.au:8888 replacing password with
-         # the real password.
-         agent.agent.set_proxy(ScraperUtils.australian_proxy)
-       end
-       if timeout
-         agent.open_timeout = timeout
-         agent.read_timeout = timeout
-       end
-       public_ip(agent) if use_proxy
+       config = AgentConfig.new(**options)
+       config.configure_agent(agent)
+       agent.instance_variable_set(:@scraper_utils_config, config)
        agent
      end

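Usage note: agent options now flow through AgentConfig rather than explicit keyword arguments. A minimal sketch of calling the new entry point, assuming the gem is installed; the exact options AgentConfig#initialize accepts are defined in agent_config.rb, which is not part of this diff, so none are passed here and the target URL is a placeholder:

    require "scraper_utils"

    # Any keyword options given here would be forwarded to AgentConfig#initialize.
    agent = ScraperUtils::MechanizeUtils.mechanize_agent
    page = agent.get("https://example.com/")
    puts page.title
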
@@ -47,24 +38,35 @@ module ScraperUtils
          text = element.inner_text
          return "Maintenance: #{text}" if text&.match?(/maintenance/i)
        end
-
-       # Not in maintenance mode
        nil
      end

      # Retrieves and logs the public IP address
      #
-     # @param agent [Mechanize] Mechanize agent to use for IP lookup
-     # @param force [Boolean] Force a new IP lookup, bypassing cache
-     # @return [String] The public IP address
-     def self.public_ip(agent, force: false)
+     # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+     # @param force [Boolean] Force a new IP lookup, by clearing cache first
+     # @return [String, nil] The public IP address
+     def self.public_ip(agent = nil, force: false)
        @public_ip = nil if force
-       @public_ip ||=
-         begin
-           ip = agent.get(PUBLIC_IP_URL).body.strip
-           puts "Public IP: #{ip}"
-           ip
-         end
+       @public_ip ||= begin
+         response = agent&.get(PUBLIC_IP_URL)
+         response&.body&.strip
+       end
+       @public_ip
+     end
+
+     # Retrieves and logs the headers that make it through the proxy
+     #
+     # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+     # @param force [Boolean] Force a new IP lookup, by clearing cache first
+     # @return [String, nil] The list of headers in json format
+     def self.public_headers(agent = nil, force: false)
+       @public_headers = nil if force
+       @public_headers ||= begin
+         response = agent&.get(HEADERS_ECHO_URL)
+         response&.body&.strip
+       end
+       @public_headers
      end
    end
  end
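Usage note: both helpers now accept a nil agent (handy for clearing the cache) and memoise their result. A sketch under the assumption that the Akamai IP echo service and httpbin.org are reachable from the scraper:

    require "scraper_utils"

    agent = ScraperUtils::MechanizeUtils.mechanize_agent

    # Cached after the first lookup; force: true clears the cache first.
    puts ScraperUtils::MechanizeUtils.public_ip(agent)

    # JSON string describing the request headers as seen by httpbin.org.
    puts ScraperUtils::MechanizeUtils.public_headers(agent, force: true)

    # With no agent, force: true simply clears the cached value and returns nil.
    ScraperUtils::MechanizeUtils.public_ip(nil, force: true)
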
@@ -0,0 +1,34 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   # Provides utilities for randomizing processing order in scrapers,
+   # particularly helpful for distributing load and avoiding predictable patterns
+   module RandomizeUtils
+     # Returns a randomized version of the input collection when in production mode,
+     # or the original collection when in test/sequential mode
+     #
+     # @param collection [Array, Enumerable] Collection of items to potentially randomize
+     # @return [Array] Randomized or original collection depending on environment
+     def self.randomize_order(collection)
+       return collection.to_a if sequential?
+
+       collection.to_a.shuffle
+     end
+
+     # Checks if sequential processing is enabled
+     #
+     # @return [Boolean] true when in test mode or MORPH_PROCESS_SEQUENTIALLY is set
+     def self.sequential?
+       @sequential = !ENV["MORPH_PROCESS_SEQUENTIALLY"].to_s.empty? if @sequential.nil?
+       @sequential || false
+     end
+
+     # Explicitly set sequential mode for testing
+     #
+     # @param value [Boolean, nil] true to enable sequential mode, false to disable, nil to clear cache
+     # @return [Boolean, nil]
+     def self.sequential=(value)
+       @sequential = value
+     end
+   end
+ end
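Usage note: a sketch of driving RandomizeUtils from a multi-authority scraper; the authority names below are placeholders, and MORPH_PROCESS_SEQUENTIALLY is the only environment switch the module reads:

    require "scraper_utils"

    authorities = %i[authority_a authority_b authority_c] # placeholder names

    # Shuffled normally; returned unchanged when sequential mode is enabled.
    ScraperUtils::RandomizeUtils.randomize_order(authorities).each do |authority|
      puts "Scraping #{authority}..."
    end

    # In tests, force deterministic ordering, then clear the cached flag.
    ScraperUtils::RandomizeUtils.sequential = true
    ScraperUtils::RandomizeUtils.sequential = nil
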
@@ -0,0 +1,149 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   # robots.txt checker with deliberately simplistic rules
+   class RobotsChecker
+     # @return [String] Lowercased user_agent for matching
+     attr_reader :user_agent
+
+     # Initialize with full user agent string like:
+     # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
+     # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
+     # Checks for
+     # * Disallow for User-agent: bot_name and
+     # * Crawl-delay from either User-agent: bot name or * (default)
+     def initialize(user_agent)
+       @user_agent = extract_user_agent(user_agent).downcase
+       if DebugUtils.basic?
+         ScraperUtils::FiberScheduler.log(
+           "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+         )
+       end
+       @rules = {} # domain -> {rules: [], delay: int}
+       @delay = nil # Delay from last robots.txt check
+     end
+
+     # Check if a URL is disallowed based on robots.txt rules specific to our user agent
+     # @param url [String] The full URL to check
+     # @return [Boolean] true if specifically blocked for our user agent, otherwise false
+     def disallowed?(url)
+       return false unless url
+
+       uri = URI(url)
+       domain = "#{uri.scheme}://#{uri.host}"
+       path = uri.path || "/"
+
+       # Get or fetch robots.txt rules
+       rules = get_rules(domain)
+       return false unless rules # If we can't get robots.txt, assume allowed
+
+       # Store any delay found for this domain
+       @delay = rules[:our_delay]
+
+       # Check rules specific to our user agent
+       matches_any_rule?(path, rules[:our_rules])
+     end
+
+     # Returns the crawl delay (if any) that applied to the last URL checked
+     # Should be called after disallowed? to get relevant delay
+     # @return [Integer, nil] The delay in seconds, or nil if no delay specified
+     def crawl_delay
+       @delay
+     end
+
+     private
+
+     def extract_user_agent(user_agent)
+       if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
+         user_agent = ::Regexp.last_match(2)&.strip
+       end
+       user_agent&.strip
+     end
+
+     def matches_any_rule?(path, rules)
+       rules&.any? { |rule| path.start_with?(rule) }
+     end
+
+     def get_rules(domain)
+       return @rules[domain] if @rules.key?(domain)
+
+       begin
+         response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
+         return nil unless response.code.start_with?("2") # 2xx response
+
+         rules = parse_robots_txt(response.body)
+         @rules[domain] = rules
+         rules
+       rescue StandardError => e
+         if DebugUtils.basic?
+           ScraperUtils::FiberScheduler.log(
+             "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
+           )
+         end
+         nil
+       end
+     end
+
+     # Parse robots.txt content into structured rules
+     # Only collects rules for our specific user agent and generic crawl-delay
+     # @param content [String] The robots.txt content
+     # @return [Hash] Hash containing :our_rules and :our_delay
+     def parse_robots_txt(content)
+       sections = [] # Array of {agent:, rules:[], delay:} hashes
+       current_section = nil
+
+       content.each_line do |line|
+         line = line.strip.downcase
+         next if line.empty? || line.start_with?("#")
+
+         if line.start_with?("user-agent:")
+           agent = line.split(":", 2).last.strip
+           # Check if this is a continuation of the previous section
+           if current_section && current_section[:rules].empty? && current_section[:delay].nil?
+             current_section[:agents] << agent
+           else
+             current_section = { agents: [agent], rules: [], delay: nil }
+             sections << current_section
+           end
+           next
+         end
+
+         next unless current_section # Skip rules before first user-agent
+
+         if line.start_with?("disallow:")
+           path = line.split(":", 2).last.strip
+           current_section[:rules] << path unless path.empty?
+         elsif line.start_with?("crawl-delay:")
+           delay = line.split(":", 2).last.strip.to_i
+           current_section[:delay] = delay if delay.positive?
+         end
+       end
+
+       # Sort sections by most specific agent match first
+       matched_section = sections.find do |section|
+         section[:agents].any? do |agent|
+           # Our user agent starts with the agent from robots.txt
+           @user_agent.start_with?(agent) ||
+             # Or the agent from robots.txt starts with our user agent
+             # (handles ScraperUtils matching ScraperUtils/1.0)
+             agent.start_with?(@user_agent)
+         end
+       end
+
+       # Use matched section or fall back to wildcard
+       if matched_section
+         {
+           our_rules: matched_section[:rules],
+           our_delay: matched_section[:delay]
+         }
+       else
+         # Find default section
+         default_section = sections.find { |s| s[:agents].include?("*") }
+         {
+           our_rules: [],
+           our_delay: default_section&.dig(:delay)
+         }
+       end
+     end
+   end
+ end
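Usage note: a sketch of how the checker might wrap page fetches; the user-agent string and URL are illustrative only, and get_rules relies on Net::HTTP, so net/http needs to be loaded by whatever requires this class:

    require "net/http"
    require "scraper_utils"

    user_agent = "Mozilla/5.0 (compatible; ScraperUtils/0.3.0; +https://github.com/ianheggie-oaf/scraper_utils)"
    checker = ScraperUtils::RobotsChecker.new(user_agent)

    url = "https://example.com/planning/applications" # placeholder URL
    if checker.disallowed?(url)
      puts "Skipping #{url} - disallowed for our user agent"
    else
      delay = checker.crawl_delay # delay that applied to the last URL checked, if any
      sleep(delay) if delay
      # fetch the page here ...
    end
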
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module ScraperUtils
-   VERSION = "0.1.0"
+   VERSION = "0.3.0"
  end
data/lib/scraper_utils.rb CHANGED
@@ -1,10 +1,16 @@
  # frozen_string_literal: true

+ require "scraper_utils/adaptive_delay"
  require "scraper_utils/authority_utils"
+ require "scraper_utils/data_quality_monitor"
  require "scraper_utils/db_utils"
  require "scraper_utils/debug_utils"
+ require "scraper_utils/fiber_scheduler"
  require "scraper_utils/log_utils"
+ require "scraper_utils/mechanize_utils/agent_config"
  require "scraper_utils/mechanize_utils"
+ require "scraper_utils/randomize_utils"
+ require "scraper_utils/robots_checker"
  require "scraper_utils/version"

  # Utilities for planningalerts scrapers
@@ -12,9 +18,6 @@ module ScraperUtils
    # Constants for configuration on Morph.io
    AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"

-   # Enable debug locally, not on morph.io
-   DEBUG_ENV_VAR = "DEBUG"
-
    # Fatal Error
    class Error < StandardError
    end
@@ -28,13 +31,6 @@ module ScraperUtils
    class UnprocessableRecord < Error
    end

-   # Check if debug mode is enabled
-   #
-   # @return [Boolean] Whether debug mode is active
-   def self.debug?
-     !ENV[DEBUG_ENV_VAR].to_s.empty?
-   end
-
    def self.australian_proxy
      ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
      ap.empty? ? nil : ap
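Usage note: australian_proxy returns nil when MORPH_AUSTRALIAN_PROXY is unset, so callers can branch on its presence rather than re-reading ENV. A small sketch:

    require "scraper_utils"

    if ScraperUtils.australian_proxy
      puts "Australian proxy configured via MORPH_AUSTRALIAN_PROXY"
    else
      puts "No Australian proxy configured; requests go out directly"
    end
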
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|

    spec.summary = "planningalerts scraper utilities"
    spec.description = "Utilities to help make planningalerts scrapers, " \
-                      "+especially multis easier to develop, run and debug."
+                      "+especially multis easier to develop, run and debug."
    spec.homepage = "https://github.com/ianheggie-oaf/scraper_utils"
    spec.license = "MIT"

@@ -25,7 +25,7 @@ Gem::Specification.new do |spec|
      # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
    else
      raise "RubyGems 2.0 or newer is required to protect against " \
-           "public gem pushes."
+           "public gem pushes."
    end

    # Specify which files should be added to the gem when it is released.
@@ -40,10 +40,5 @@ Gem::Specification.new do |spec|
    spec.add_dependency "mechanize"
    spec.add_dependency "nokogiri"
    spec.add_dependency "sqlite3"
-
-   spec.add_development_dependency "rake"
-   spec.add_development_dependency "rspec"
-   spec.add_development_dependency "rubocop"
-   spec.add_development_dependency "simplecov"
-   spec.add_development_dependency "simplecov-console"
+   spec.metadata["rubygems_mfa_required"] = "true"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: scraper_utils
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.3.0
  platform: ruby
  authors:
  - Ian Heggie
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2025-02-22 00:00:00.000000000 Z
+ date: 2025-03-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: mechanize
@@ -52,76 +52,6 @@ dependencies:
      - - ">="
        - !ruby/object:Gem::Version
          version: '0'
- - !ruby/object:Gem::Dependency
-   name: rake
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: rspec
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: rubocop
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: simplecov
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: simplecov-console
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
  description: Utilities to help make planningalerts scrapers, +especially multis easier
    to develop, run and debug.
  email:
134
64
  - ".rspec"
135
65
  - ".rubocop.yml"
136
66
  - ".travis.yml"
67
+ - CHANGELOG.md
68
+ - GUIDELINES.md
137
69
  - Gemfile
70
+ - IMPLEMENTATION.md
138
71
  - LICENSE.txt
139
72
  - README.md
140
73
  - Rakefile
74
+ - SPECS.md
141
75
  - bin/console
142
76
  - bin/setup
77
+ - docs/example_scrape_with_fibers.rb
78
+ - docs/example_scraper.rb
143
79
  - lib/scraper_utils.rb
80
+ - lib/scraper_utils/adaptive_delay.rb
144
81
  - lib/scraper_utils/authority_utils.rb
82
+ - lib/scraper_utils/data_quality_monitor.rb
83
+ - lib/scraper_utils/date_range_utils.rb
145
84
  - lib/scraper_utils/db_utils.rb
146
85
  - lib/scraper_utils/debug_utils.rb
86
+ - lib/scraper_utils/fiber_scheduler.rb
147
87
  - lib/scraper_utils/log_utils.rb
148
88
  - lib/scraper_utils/mechanize_utils.rb
89
+ - lib/scraper_utils/mechanize_utils/agent_config.rb
90
+ - lib/scraper_utils/randomize_utils.rb
91
+ - lib/scraper_utils/robots_checker.rb
149
92
  - lib/scraper_utils/version.rb
150
93
  - scraper_utils.gemspec
151
94
  homepage: https://github.com/ianheggie-oaf/scraper_utils
@@ -155,6 +98,7 @@ metadata:
    allowed_push_host: https://rubygems.org
    homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
    source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
+   rubygems_mfa_required: 'true'
  post_install_message:
  rdoc_options: []
  require_paths:
@@ -170,8 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      - !ruby/object:Gem::Version
        version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.6.2
+ rubygems_version: 3.4.10
  signing_key:
  specification_version: 4
  summary: planningalerts scraper utilities