scraper_utils 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 13ad14102f284c98d658bb928bcf7806ea7594326d11c5426903ebc6b1f919e0
- data.tar.gz: 260cd94a1b76e9851f5af47f716dc754386b081f3923ee0a0eb6fb2b2d086c4f
+ metadata.gz: 4293294d99565b0ee4adc097e9d31619868477e6efa8255f3cd5ec02dc8b275b
+ data.tar.gz: e78a08bced34a1f55b8d6e869158887da33f1d6b9803493c7d836402d64aa4ef
  SHA512:
- metadata.gz: 824b9e64ae7debdf9cddfc90b47de6dff7865e3f655ad6022f58181f38efd06788413e0251525410736e58d6fd325917a8b7a0ad6b2468fa7ab9de3b697955af
- data.tar.gz: fd154118b2eaa22962f4343a3f62ca1daabc19de924a36e4c3cd4f61c9c7bb08a232554f10141b4e3534f3fa2e546a86f860d3bc7997f54d29a067cfb3c4f451
+ metadata.gz: ea00598b58e69cf5d911b62148d5d38e13ef017dd5a22159839f3b4437bf28c7c2b17c4832e3869688970037048f69fb543f9645922d6b92888a5fc68d88fc33
+ data.tar.gz: 57500daacbec8602c9288bf27bd7ac5f2ff407f621f9ac2c9e114a85fbbb0c555ebebfdc62782b814616b9b46dad65bd5a665b24bb1c9e6774625469bb290485
data/CHANGELOG.md CHANGED
@@ -1,5 +1,22 @@
  # Changelog

+ ## 0.7.1 - 2025-04-15
+
+ * Accept mixed case suburb names after a comma as well as uppercase suburb names as geocodable
+ * Accept more street type abbreviations and check they are on word boundaries
+
+ ## 0.7.0 - 2025-04-15
+
+ * Added Spec helpers and associated doc: `docs/enhancing_specs.md`
+   * `ScraperUtils::SpecSupport.geocodable?`
+   * `ScraperUtils::SpecSupport.reasonable_description?`
+
+ ## 0.6.1 - 2025-03-28
+
+ * Changed DEFAULT_MAX_LOAD to 50.0 as we were overestimating the load we present, since network latency is included
+ * Correct documentation of spec_helper extra lines
+ * Fix misc bugs found in use
+
  ## 0.6.0 - 2025-03-16

  * Add threads for more efficient scraping
data/IMPLEMENTATION.md CHANGED
@@ -54,7 +54,6 @@ Our test directory structure reflects various testing strategies and aspects of
  These specs check the options we use when things go wrong in production

  - `spec/scraper_utils/no_threads/` - Tests with threads disabled (`MORPH_DISABLE_THREADS=1`)
- - `spec/scraper_utils/no_fibers/` - Tests with fibers disabled (`MORPH_MAX_WORKERS=0`)
  - `spec/scraper_utils/sequential/` - Tests with exactly one worker (`MORPH_MAX_WORKERS=1`)

  ### Directories to break up large specs
data/README.md CHANGED
@@ -15,11 +15,11 @@ Our goal is to access public planning information with minimal impact on your se
  default:

  - **Limit server load**:
-   - We limit the max load we present to your server to well less than a third of a single cpu
-   - The more loaded your server is, the longer we wait between requests
+   - We limit the max load we present to your server to less than a half of one of your cpu cores
+   - The more loaded your server is, the longer we wait between requests!
    - We respect Crawl-delay from robots.txt (see section below), so you can tell us an acceptable rate
    - Scraper developers can
-     - reduce the max_load we present to your server even lower
+     - reduce the max_load we present to your server even further
      - add random extra delays to give your server a chance to catch up with background tasks

  - **Identify themselves**: Our user agent clearly indicates who we are and provides a link to the project repository:
data/docs/enhancing_specs.md ADDED
@@ -0,0 +1,100 @@
+ # Enhancing specs
+
+ ScraperUtils provides two methods to help with checking results
+
+ * `ScraperUtils::SpecSupport.geocodable?`
+ * `ScraperUtils::SpecSupport.reasonable_description?`
+
+ ## Example Code:
+
+ ```ruby
+ # frozen_string_literal: true
+
+ require "timecop"
+ require_relative "../scraper"
+
+ RSpec.describe Scraper do
+   describe ".scrape" do
+     def test_scrape(authority)
+       File.delete("./data.sqlite") if File.exist?("./data.sqlite")
+
+       VCR.use_cassette(authority) do
+         date = Date.new(2025, 4, 15)
+         Timecop.freeze(date) do
+           Scraper.scrape([authority], 1)
+         end
+       end
+
+       expected = if File.exist?("spec/expected/#{authority}.yml")
+                    YAML.safe_load(File.read("spec/expected/#{authority}.yml"))
+                  else
+                    []
+                  end
+       results = ScraperWiki.select("* from data order by council_reference")
+
+       ScraperWiki.close_sqlite
+
+       if results != expected
+         # Overwrite expected so that we can compare with version control
+         # (and maybe commit if it is correct)
+         File.open("spec/expected/#{authority}.yml", "w") do |f|
+           f.write(results.to_yaml)
+         end
+       end
+
+       expect(results).to eq expected
+
+       geocodable = results
+         .map { |record| record["address"] }
+         .uniq
+         .count { |text| SpecHelper.geocodable? text }
+       puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
+            "(#{(100.0 * geocodable / results.count).round(1)}%)"
+       expect(geocodable).to be > (0.7 * results.count)
+
+       descriptions = results
+         .map { |record| record["description"] }
+         .uniq
+         .count do |text|
+           selected = SpecHelper.reasonable_description? text
+           puts " description: #{text} is not reasonable" if ENV["DEBUG"] && !selected
+           selected
+         end
+       puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
+            "(#{(100.0 * descriptions / results.count).round(1)}%)"
+       expect(descriptions).to be > (0.55 * results.count)
+
+       info_urls = results
+         .map { |record| record["info_url"] }
+         .uniq
+         .count { |text| text.to_s.match(%r{\Ahttps?://}) }
+       puts "Found #{info_urls} out of #{results.count} unique info_urls " \
+            "(#{(100.0 * info_urls / results.count).round(1)}%)"
+       expect(info_urls).to be > (0.7 * results.count) if info_urls != 1
+
+       VCR.use_cassette("#{authority}.info_urls") do
+         results.each do |record|
+           info_url = record["info_url"]
+           puts "Checking info_url #{info_url} #{info_urls > 1 ? ' has expected details' : ''} ..."
+           response = Net::HTTP.get_response(URI(info_url))
+
+           expect(response.code).to eq("200")
+           # If info_url is the same for all records, then it won't have details
+           break if info_urls == 1
+
+           expect(response.body).to include(record["council_reference"])
+           expect(response.body).to include(record["address"])
+           expect(response.body).to include(record["description"])
+         end
+       end
+     end
+
+     Scraper.selected_authorities.each do |authority|
+       it authority do
+         test_scrape(authority)
+       end
+     end
+   end
+ end
+
+ ```
@@ -26,7 +26,8 @@ This uses {ScraperUtils::RandomizeUtils} for determining the order of operations
  `spec/spec_helper.rb`:

  ```ruby
- ScraperUtils::RandomizeUtils.sequential = true
+ ScraperUtils::RandomizeUtils.random = false
+ ScraperUtils::Scheduler.max_workers = 1
  ```

  For full details, see the {Scheduler}.
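
For context, the new two-line setup is typically all a scraper's spec support needs from this gem. A minimal sketch of a complete `spec/spec_helper.rb` follows (assuming the scraper already depends on the gem; the comments describe the usual intent rather than documented guarantees):

```ruby
# spec/spec_helper.rb (minimal sketch)
require "scraper_utils"

# Keep the order of operations deterministic so spec runs are repeatable
ScraperUtils::RandomizeUtils.random = false

# Run a single worker so requests happen sequentially, which keeps recorded
# HTTP interactions (for example VCR cassettes) replaying in a stable order
ScraperUtils::Scheduler.max_workers = 1
```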
@@ -29,16 +29,16 @@ The agent returned is configured using Mechanize hooks to implement the desired
  ### Default Configuration

  By default, the Mechanize agent is configured with the following settings.
- As you can see, the defaults can be changed using env variables.
+ As you can see, the defaults can be changed using env variables or via code.

  Note - compliant mode forces max_load to be set to a value no greater than 50.

  ```ruby
  ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
-   config.default_timeout = ENV.fetch('MORPH_TIMEOUT', 60).to_i # 60
+   config.default_timeout = ENV.fetch('MORPH_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
    config.default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
-   config.default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', 5).to_i # 5
-   config.default_max_load = ENV.fetch('MORPH_MAX_LOAD', 33.3).to_f # 33.3
+   config.default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 0
+   config.default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 50.0
    config.default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
    config.default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
    config.default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
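
As a concrete illustration of changing defaults "via code" rather than env variables, a scraper could override just the values it cares about before creating any agents. This is a sketch only; the specific numbers are arbitrary examples, not recommendations:

```ruby
# Early in scraper.rb, before any Mechanize agents are built
ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
  config.default_max_load = 20.0   # present even less load than the shipped default of 50.0
  config.default_random_delay = 5  # add random extra delays (seconds) to give the server a chance to catch up
end
```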
@@ -1,6 +1,7 @@
  # frozen_string_literal: true

  require "uri"
+ require_relative "agent_config"

  module ScraperUtils
    module MechanizeUtils
@@ -17,13 +18,13 @@ module ScraperUtils
  #
  # @param min_delay [Float] Minimum delay between requests in seconds
  # @param max_delay [Float] Maximum delay between requests in seconds
- # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+ # @param max_load [Float] Maximum load percentage (1..Constants::MAX_LOAD_CAP) we aim to place on the server
  #   Lower values are more conservative (e.g., 20% = 4x response time delay)
- def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+ def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: AgentConfig::DEFAULT_MAX_LOAD)
    @delays = {} # domain -> last delay used
    @min_delay = min_delay.to_f
    @max_delay = max_delay.to_f
-   @max_load = max_load.to_f.clamp(1.0, 99.0)
+   @max_load = max_load.to_f.clamp(1.0, AgentConfig::MAX_LOAD_CAP)
    @response_multiplier = (100.0 - @max_load) / @max_load

    return unless DebugUtils.basic?
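
The `@response_multiplier` assignment is where `max_load` becomes an actual wait: the doc comment's "20% = 4x response time delay" is just `(100.0 - 20.0) / 20.0`. The throwaway sketch below (not part of the gem) evaluates that formula for the old and new defaults plus the new cap, assuming the delay applied is roughly the last response time scaled by this multiplier, as the doc comment suggests:

```ruby
# Evaluate (100.0 - max_load) / max_load for a few interesting values
[20.0, 33.3, 50.0, 80.0].each do |max_load|
  multiplier = (100.0 - max_load) / max_load
  puts format("max_load %5.1f%% -> delay ~= %.2f x last response time", max_load, multiplier)
end
# max_load  20.0% -> delay ~= 4.00 x last response time  (the "20% = 4x" example)
# max_load  33.3% -> delay ~= 2.00 x last response time  (old DEFAULT_MAX_LOAD)
# max_load  50.0% -> delay ~= 1.00 x last response time  (new DEFAULT_MAX_LOAD)
# max_load  80.0% -> delay ~= 0.25 x last response time  (new MAX_LOAD_CAP)
```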
@@ -25,8 +25,8 @@ module ScraperUtils
  class AgentConfig
    DEFAULT_TIMEOUT = 60
    DEFAULT_RANDOM_DELAY = 0
-   DEFAULT_MAX_LOAD = 33.3
-   MAX_LOAD_CAP = 50.0
+   DEFAULT_MAX_LOAD = 50.0
+   MAX_LOAD_CAP = 80.0

    # Class-level defaults that can be modified
    class << self
@@ -69,8 +69,8 @@ module ScraperUtils
  def reset_defaults!
    @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
    @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
-   @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 5
-   @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 33.3
+   @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 0
+   @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 50.0
    @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
    @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
    @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
@@ -68,7 +68,7 @@ module ScraperUtils

  # Controls whether Mechanize network requests are executed in parallel using threads
  #
- # @return [Integer] max concurrent workers using fibers and threads, defaults to MAX_WORKERS env variable or 50
+ # @return [Integer] max concurrent workers using fibers and threads, defaults to MORPH_MAX_WORKERS env variable or 50
  attr_accessor :max_workers

  # @return [Hash{Symbol => Exception}] exceptions by authority
@@ -259,7 +259,7 @@ module ScraperUtils
  #
  # @return [ThreadResponse, nil] Result of request execution
  def self.get_response(non_block = true)
-   return nil if non_block && @response_queue.empty?
+   return nil if @response_queue.nil? || (non_block && @response_queue.empty?)

    @response_queue.pop(non_block)
  end
data/lib/scraper_utils/spec_support.rb ADDED
@@ -0,0 +1,91 @@
+ # frozen_string_literal: true
+
+ require "scraperwiki"
+
+ module ScraperUtils
+   # Methods to support specs
+   module SpecSupport
+     AUSTRALIAN_STATES = %w[ACT NSW NT QLD SA TAS VIC WA].freeze
+     STREET_TYPE_PATTERNS = [
+       /\bAv(e(nue)?)?\b/i,
+       /\bB(oulevard|lvd)\b/i,
+       /\b(Circuit|Cct)\b/i,
+       /\bCl(ose)?\b/i,
+       /\bC(our|r)t\b/i,
+       /\bCircle\b/i,
+       /\bChase\b/i,
+       /\bCr(escent)?\b/i,
+       /\bDr((ive)?|v)\b/i,
+       /\bEnt(rance)?\b/i,
+       /\bGr(ove)?\b/i,
+       /\bH(ighwa|w)y\b/i,
+       /\bLane\b/i,
+       /\bLoop\b/i,
+       /\bParkway\b/i,
+       /\bPl(ace)?\b/i,
+       /\bPriv(ate)?\b/i,
+       /\bParade\b/i,
+       /\bR(oa)?d\b/i,
+       /\bRise\b/i,
+       /\bSt(reet)?\b/i,
+       /\bSquare\b/i,
+       /\bTerrace\b/i,
+       /\bWay\b/i
+     ].freeze
+
+
+     AUSTRALIAN_POSTCODES = /\b\d{4}\b/.freeze
+
+     # Check if an address is likely to be geocodable by analyzing its format.
+     # This is a bit stricter than needed - typically assert >= 75% match
+     # @param address [String] The address to check
+     # @return [Boolean] True if the address appears to be geocodable.
+     def self.geocodable?(address, ignore_case: false)
+       return false if address.nil? || address.empty?
+       check_address = ignore_case ? address.upcase : address
+
+       # Basic structure check - must have a street name, suburb, state and postcode
+       has_state = AUSTRALIAN_STATES.any? { |state| check_address.end_with?(" #{state}") || check_address.include?(" #{state} ") }
+       has_postcode = address.match?(AUSTRALIAN_POSTCODES)
+
+       # Using the pre-compiled patterns
+       has_street_type = STREET_TYPE_PATTERNS.any? { |pattern| check_address.match?(pattern) }
+
+       has_unit_or_lot = address.match?(/\b(Unit|Lot:?)\s+\d+/i)
+
+       has_suburb_stats = check_address.match?(/(\b[A-Z]{2,}(\s+[A-Z]+)*,?|,\s+[A-Z][A-Za-z ]+)\s+(#{AUSTRALIAN_STATES.join('|')})\b/)
+
+       if ENV["DEBUG"]
+         missing = []
+         unless has_street_type || has_unit_or_lot
+           missing << "street type / unit / lot"
+         end
+         missing << "state" unless has_state
+         missing << "postcode" unless has_postcode
+         missing << "suburb state" unless has_suburb_stats
+         puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
+       end
+
+       (has_street_type || has_unit_or_lot) && has_state && has_postcode && has_suburb_stats
+     end
+
+     PLACEHOLDERS = [
+       /no description/i,
+       /not available/i,
+       /to be confirmed/i,
+       /\btbc\b/i,
+       %r{\bn/a\b}i
+     ].freeze
+
+     def self.placeholder?(text)
+       PLACEHOLDERS.any? { |placeholder| text.to_s.match?(placeholder) }
+     end
+
+     # Check if this looks like a "reasonable" description
+     # This is a bit stricter than needed - typically assert >= 75% match
+     def self.reasonable_description?(text)
+       !placeholder?(text) && text.to_s.split.size >= 3
+     end
+   end
+ end
+
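
For a quick feel for what these helpers accept and reject, here is a short sketch using made-up addresses and descriptions (hypothetical values, chosen only to illustrate the checks implemented above):

```ruby
require "scraper_utils"

ScraperUtils::SpecSupport.geocodable?("12 Example St, Sampleville NSW 2000")
# => true  (street type, suburb after a comma, state and postcode all present)
ScraperUtils::SpecSupport.geocodable?("Lot 3 Example Road")
# => false (no state or postcode)

ScraperUtils::SpecSupport.reasonable_description?("Construction of a single storey dwelling")
# => true
ScraperUtils::SpecSupport.reasonable_description?("N/A")
# => false (matches a placeholder pattern)
ScraperUtils::SpecSupport.reasonable_description?("New dwelling")
# => false (fewer than three words)
```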
data/lib/scraper_utils/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module ScraperUtils
-   VERSION = "0.6.0"
+   VERSION = "0.7.1"
  end
data/lib/scraper_utils.rb CHANGED
@@ -12,6 +12,7 @@ require "scraper_utils/debug_utils"
  require "scraper_utils/log_utils"
  require "scraper_utils/randomize_utils"
  require "scraper_utils/scheduler"
+ require "scraper_utils/spec_support"

  # Mechanize utilities
  require "scraper_utils/mechanize_actions"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: scraper_utils
  version: !ruby/object:Gem::Version
-   version: 0.6.0
+   version: 0.7.1
  platform: ruby
  authors:
  - Ian Heggie
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2025-03-27 00:00:00.000000000 Z
+ date: 2025-04-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: mechanize
@@ -77,6 +77,7 @@ files:
  - bin/rspec
  - bin/setup
  - docs/debugging.md
+ - docs/enhancing_specs.md
  - docs/example_scrape_with_fibers.rb
  - docs/example_scraper.rb
  - docs/fibers_and_threads.md
@@ -107,6 +108,7 @@ files:
  - lib/scraper_utils/scheduler/process_request.rb
  - lib/scraper_utils/scheduler/thread_request.rb
  - lib/scraper_utils/scheduler/thread_response.rb
+ - lib/scraper_utils/spec_support.rb
  - lib/scraper_utils/version.rb
  - scraper_utils.gemspec
  homepage: https://github.com/ianheggie-oaf/scraper_utils
@@ -116,7 +118,7 @@ metadata:
  allowed_push_host: https://rubygems.org
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.6.0
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.7.1
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
  rubygems_mfa_required: 'true'
  post_install_message: