scraper_utils 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/lib/scraper_utils/spec_support.rb +31 -8
- data/lib/scraper_utils/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ba66a28129ee09ab76cb0937d195ff68aa5058d1c73805235de4898384fe495d
|
4
|
+
data.tar.gz: 76c20b7ce9bd581e59fda41b8801bca8a26909d58ddcc74c46e7755e6038970a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5ae13c2c5e4b8bb1c30c2c8a10dd30b42b349dfc7c416fc65bec504e27d7b5c9dad84b6ef195c52788412f238a6740e4fb8fc400e315a42aeb7ee57f8ada9a25
|
7
|
+
data.tar.gz: bf3d40831ee8667f663b442f92e76943db04eaa995095e09f4c3736e80919d26c924aa17d733a533ec32b278496c2aedd8c368eb40988e1fd9619c5febfb1567
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## 0.7.2 - 2025-04-15
|
4
|
+
|
5
|
+
* Accept postcode before state as well as after
|
6
|
+
|
7
|
+
## 0.7.1 - 2025-04-15
|
8
|
+
|
9
|
+
* Accept mixed case suburb names after a comma as well as uppercase suburb names as geocodable
|
10
|
+
* Accept more street type abbreviations and check they are on word boundaries
|
11
|
+
|
3
12
|
## 0.7.0 - 2025-04-15
|
4
13
|
|
5
14
|
* Added Spec helpers and associated doc: `docs/enhancing_specs.md`
|
@@ -6,11 +6,33 @@ module ScraperUtils
|
|
6
6
|
# Methods to support specs
|
7
7
|
module SpecSupport
|
8
8
|
AUSTRALIAN_STATES = %w[ACT NSW NT QLD SA TAS VIC WA].freeze
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
9
|
+
STREET_TYPE_PATTERNS = [
|
10
|
+
/\bAv(e(nue)?)?\b/i,
|
11
|
+
/\bB(oulevard|lvd)\b/i,
|
12
|
+
/\b(Circuit|Cct)\b/i,
|
13
|
+
/\bCl(ose)?\b/i,
|
14
|
+
/\bC(our|r)?t\b/i,
|
15
|
+
/\bCircle\b/i,
|
16
|
+
/\bChase\b/i,
|
17
|
+
/\bCr(es(cent)?)?\b/i,
|
18
|
+
/\bDr((ive)?|v)\b/i,
|
19
|
+
/\bEnt(rance)?\b/i,
|
20
|
+
/\bGr(ove)?\b/i,
|
21
|
+
/\bH(ighwa|w)y\b/i,
|
22
|
+
/\bLane\b/i,
|
23
|
+
/\bLoop\b/i,
|
24
|
+
/\bParkway\b/i,
|
25
|
+
/\bPl(ace)?\b/i,
|
26
|
+
/\bPriv(ate)?\b/i,
|
27
|
+
/\bParade\b/i,
|
28
|
+
/\bR(oa)?d\b/i,
|
29
|
+
/\bRise\b/i,
|
30
|
+
/\bSt(reet)?\b/i,
|
31
|
+
/\bSquare\b/i,
|
32
|
+
/\bTerrace\b/i,
|
33
|
+
/\bWay\b/i
|
34
|
+
].freeze
|
35
|
+
|
14
36
|
AUSTRALIAN_POSTCODES = /\b\d{4}\b/.freeze
|
15
37
|
|
16
38
|
# Check if an address is likely to be geocodable by analyzing its format.
|
@@ -25,11 +47,12 @@ module ScraperUtils
|
|
25
47
|
has_state = AUSTRALIAN_STATES.any? { |state| check_address.end_with?(" #{state}") || check_address.include?(" #{state} ") }
|
26
48
|
has_postcode = address.match?(AUSTRALIAN_POSTCODES)
|
27
49
|
|
28
|
-
|
50
|
+
# Using the pre-compiled patterns
|
51
|
+
has_street_type = STREET_TYPE_PATTERNS.any? { |pattern| check_address.match?(pattern) }
|
29
52
|
|
30
53
|
has_unit_or_lot = address.match?(/\b(Unit|Lot:?)\s+\d+/i)
|
31
54
|
|
32
|
-
has_suburb_stats = check_address.match?(
|
55
|
+
has_suburb_stats = check_address.match?(/(\b[A-Z]{2,}(\s+[A-Z]+)*,?|,\s+[A-Z][A-Za-z ]+)(\s+\d{4})?\s+(#{AUSTRALIAN_STATES.join('|')})\b/)
|
33
56
|
|
34
57
|
if ENV["DEBUG"]
|
35
58
|
missing = []
|
@@ -38,7 +61,7 @@ module ScraperUtils
|
|
38
61
|
end
|
39
62
|
missing << "state" unless has_state
|
40
63
|
missing << "postcode" unless has_postcode
|
41
|
-
missing << "
|
64
|
+
missing << "suburb state" unless has_suburb_stats
|
42
65
|
puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
|
43
66
|
end
|
44
67
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraper_utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.0
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ian Heggie
|
@@ -118,7 +118,7 @@ metadata:
|
|
118
118
|
allowed_push_host: https://rubygems.org
|
119
119
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
120
120
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
121
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.7.0
|
121
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.7.2
|
122
122
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
123
123
|
rubygems_mfa_required: 'true'
|
124
124
|
post_install_message:
|