scraper_utils 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da26385a1d788bc9ad9d725f0eaefe233d4b70a8bf9aeab0af3168041adc0bc2
4
- data.tar.gz: '02892db893cc706ec67845bde0a05d80b89cdfc2d48759b2e66574e6f87b0031'
3
+ metadata.gz: ba66a28129ee09ab76cb0937d195ff68aa5058d1c73805235de4898384fe495d
4
+ data.tar.gz: 76c20b7ce9bd581e59fda41b8801bca8a26909d58ddcc74c46e7755e6038970a
5
5
  SHA512:
6
- metadata.gz: b8101b0b0d2ed1d775de54f0e8bac5a1a22ca6f540cea2752de76218a4915c325ec7569ac76a719bf540474f54123cb32f700635160cdbabdcc68679ac33c2e4
7
- data.tar.gz: ae1e5d72f45b077f0525e62dc0399f9bd6f519d52222518e95f57f8f45e03575f3b8b909fb02b030a27b33bfccaefec74bb143d9d13ff1bcfe094ceaa8e369d3
6
+ metadata.gz: 5ae13c2c5e4b8bb1c30c2c8a10dd30b42b349dfc7c416fc65bec504e27d7b5c9dad84b6ef195c52788412f238a6740e4fb8fc400e315a42aeb7ee57f8ada9a25
7
+ data.tar.gz: bf3d40831ee8667f663b442f92e76943db04eaa995095e09f4c3736e80919d26c924aa17d733a533ec32b278496c2aedd8c368eb40988e1fd9619c5febfb1567
data/CHANGELOG.md CHANGED
@@ -1,5 +1,14 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.7.2 - 2025-04-15
4
+
5
+ * Accept postcode before state as well as after
6
+
7
+ ## 0.7.1 - 2025-04-15
8
+
9
+ * Accept mixed case suburb names after a comma as well as uppercase suburb names as geocodable
10
+ * Accept more street type abbreviations and check they are on word boundaries
11
+
3
12
  ## 0.7.0 - 2025-04-15
4
13
 
5
14
  * Added Spec helpers and associated doc: `docs/enhancing_specs.md`
@@ -6,11 +6,33 @@ module ScraperUtils
6
6
  # Methods to support specs
7
7
  module SpecSupport
8
8
  AUSTRALIAN_STATES = %w[ACT NSW NT QLD SA TAS VIC WA].freeze
9
- COMMON_STREET_TYPES =
10
- %w[
11
- Avenue Ave Boulevard Court Crt Circle Chase Circuit Close Crescent
12
- Drive Drv Lane Loop Parkway Place Parade Road Rd Street St Square Terrace Way
13
- ].freeze
9
+ STREET_TYPE_PATTERNS = [
10
+ /\bAv(e(nue)?)?\b/i,
11
+ /\bB(oulevard|lvd)\b/i,
12
+ /\b(Circuit|Cct)\b/i,
13
+ /\bCl(ose)?\b/i,
14
+ /\bC(our|r)?t\b/i,
15
+ /\bCircle\b/i,
16
+ /\bChase\b/i,
17
+ /\bCr(es(cent)?)?\b/i,
18
+ /\bDr((ive)?|v)\b/i,
19
+ /\bEnt(rance)?\b/i,
20
+ /\bGr(ove)?\b/i,
21
+ /\bH(ighwa|w)y\b/i,
22
+ /\bLane\b/i,
23
+ /\bLoop\b/i,
24
+ /\bParkway\b/i,
25
+ /\bPl(ace)?\b/i,
26
+ /\bPriv(ate)?\b/i,
27
+ /\bParade\b/i,
28
+ /\bR(oa)?d\b/i,
29
+ /\bRise\b/i,
30
+ /\bSt(reet)?\b/i,
31
+ /\bSquare\b/i,
32
+ /\bTerrace\b/i,
33
+ /\bWay\b/i
34
+ ].freeze
35
+
14
36
  AUSTRALIAN_POSTCODES = /\b\d{4}\b/.freeze
15
37
 
16
38
  # Check if an address is likely to be geocodable by analyzing its format.
@@ -25,11 +47,12 @@ module ScraperUtils
25
47
  has_state = AUSTRALIAN_STATES.any? { |state| check_address.end_with?(" #{state}") || check_address.include?(" #{state} ") }
26
48
  has_postcode = address.match?(AUSTRALIAN_POSTCODES)
27
49
 
28
- has_street_type = COMMON_STREET_TYPES.any? { |type| check_address.include?(" #{type}") || check_address.include?(" #{type.upcase}") }
50
+ # Using the pre-compiled patterns
51
+ has_street_type = STREET_TYPE_PATTERNS.any? { |pattern| check_address.match?(pattern) }
29
52
 
30
53
  has_unit_or_lot = address.match?(/\b(Unit|Lot:?)\s+\d+/i)
31
54
 
32
- has_suburb_stats = check_address.match?(/\b[A-Z]{2,}(\s+[A-Z]+)*,?\s+(#{AUSTRALIAN_STATES.join('|')})\b/)
55
+ has_suburb_stats = check_address.match?(/(\b[A-Z]{2,}(\s+[A-Z]+)*,?|,\s+[A-Z][A-Za-z ]+)(\s+\d{4})?\s+(#{AUSTRALIAN_STATES.join('|')})\b/)
33
56
 
34
57
  if ENV["DEBUG"]
35
58
  missing = []
@@ -38,7 +61,7 @@ module ScraperUtils
38
61
  end
39
62
  missing << "state" unless has_state
40
63
  missing << "postcode" unless has_postcode
41
- missing << "#{ignore_case ? '' : 'uppercase '}suburb state" unless has_suburb_stats
64
+ missing << "suburb state" unless has_suburb_stats
42
65
  puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
43
66
  end
44
67
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.7.0"
4
+ VERSION = "0.7.2"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
@@ -118,7 +118,7 @@ metadata:
118
118
  allowed_push_host: https://rubygems.org
119
119
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
120
120
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
121
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.7.0
121
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.7.2
122
122
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
123
123
  rubygems_mfa_required: 'true'
124
124
  post_install_message: