scraper_utils 0.11.1 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/docs/example_scraper.rb +5 -1
- data/lib/scraper_utils/data_quality_monitor.rb +11 -4
- data/lib/scraper_utils/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fc8aab3f24d29cc1e3fe44880d9030e7149372189a8691acd6e628e2a260c57c
|
|
4
|
+
data.tar.gz: a82ecb97878e57f5d0cf75d623c94a6ff54e7805c43a7be140e6a2da7bbcca49
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bd6d178afa8669916b70f2c6bdb56b77a8a5995d18e47c64b360a8d5d334b479645d20792d80759afde1aac6550d79f46a0504030d8a8920c8dea55fa6ad6132
|
|
7
|
+
data.tar.gz: 20a2c5f144cfc8e2106d2e4643d7f3b0cb35110a5519d63bb0d8c1e655c8958fe73115089352a71272a419168c50299c8ce985d318be6bfa9635e64c9c4fb238
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.12.1 - 2026-02-18
|
|
4
|
+
|
|
5
|
+
* Added override for the threshold of when to abandon scraping due to unprocessable records
|
|
6
|
+
* $MORPH_UNPROCESSABLE_BASE - default 5.01
|
|
7
|
+
* $MORPH_UNPROCESSABLE_PERCENTAGE - default 10
|
|
8
|
+
* Log council_reference as well as address for unprocessable records
|
|
9
|
+
|
|
3
10
|
## 0.11.1 - 2026-02-13
|
|
4
11
|
|
|
5
12
|
* Output council_reference from log_saved_record as it helps me debug some web query issues
|
data/docs/example_scraper.rb
CHANGED
|
@@ -74,7 +74,7 @@ class Scraper
|
|
|
74
74
|
)
|
|
75
75
|
end
|
|
76
76
|
|
|
77
|
-
ScraperUtils::DbUtils.cleanup_old_records
|
|
77
|
+
ScraperUtils::DbUtils.cleanup_old_records
|
|
78
78
|
# Report on results, raising errors for unexpected conditions
|
|
79
79
|
ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
|
|
80
80
|
end
|
|
@@ -92,6 +92,10 @@ if __FILE__ == $PROGRAM_NAME
|
|
|
92
92
|
|
|
93
93
|
ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(',')
|
|
94
94
|
end
|
|
95
|
+
# If the sites have many unusable records - raise defaults
|
|
96
|
+
# ENV['MORPH_UNPROCESSABLE_BASE'] ||= "10"
|
|
97
|
+
# ENV['MORPH_UNPROCESSABLE_PERCENTAGE'] ||= "20"
|
|
98
|
+
|
|
95
99
|
Scraper.run(Scraper.selected_authorities)
|
|
96
100
|
|
|
97
101
|
# Dump database for morph-cli
|
|
data/lib/scraper_utils/data_quality_monitor.rb
CHANGED
|
@@ -26,8 +26,12 @@ module ScraperUtils
|
|
|
26
26
|
authority_label
|
|
27
27
|
end
|
|
28
28
|
|
|
29
|
+
# Threshold for unprocessable records
|
|
30
|
+
# Initial base of 5.01 (override using MORPH_UNPROCESSABLE_BASE)
|
|
31
|
+
# Initial percentage of 10% (override using MORPH_UNPROCESSABLE_PERCENTAGE)
|
|
29
32
|
def self.threshold(authority_label)
|
|
30
|
-
|
|
33
|
+
ENV.fetch('MORPH_UNPROCESSABLE_BASE', 5.01).to_f +
|
|
34
|
+
(@stats[authority_label][:saved].to_i * ENV.fetch('MORPH_UNPROCESSABLE_PERCENTAGE', 10.0).to_f / 100.0) if @stats&.fetch(authority_label, nil)
|
|
31
35
|
end
|
|
32
36
|
|
|
33
37
|
# Logs an unprocessable record and raises an exception if error threshold is exceeded
|
|
@@ -40,9 +44,12 @@ module ScraperUtils
|
|
|
40
44
|
def self.log_unprocessable_record(exception, record)
|
|
41
45
|
authority_label = extract_authority(record)
|
|
42
46
|
@stats[authority_label][:unprocessed] += 1
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
47
|
+
details = if record&.key?('council_reference') && record&.key?('address')
|
|
48
|
+
"#{record['council_reference']} - #{record['address']}"
|
|
49
|
+
else
|
|
50
|
+
record.inspect
|
|
51
|
+
end
|
|
52
|
+
ScraperUtils::LogUtils.log "Erroneous record #{details}: #{exception}"
|
|
46
53
|
return unless @stats[authority_label][:unprocessed] > threshold(authority_label)
|
|
47
54
|
|
|
48
55
|
raise ScraperUtils::UnprocessableSite,
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.11.1
|
|
4
|
+
version: 0.12.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-18 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -112,7 +112,7 @@ metadata:
|
|
|
112
112
|
allowed_push_host: https://rubygems.org
|
|
113
113
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
114
114
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
115
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.11.1
|
|
115
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.12.1
|
|
116
116
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
117
117
|
rubygems_mfa_required: 'true'
|
|
118
118
|
post_install_message:
|