scraper_utils 0.11.1 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2437b1d72d0a7d4924b4f22d906844fc15eef3448095e04365b210df368a0e80
4
- data.tar.gz: 860865eff6d101a28bc4c6be564a3cbd1c23ce97c7da258d4baba038c8e4713d
3
+ metadata.gz: fc8aab3f24d29cc1e3fe44880d9030e7149372189a8691acd6e628e2a260c57c
4
+ data.tar.gz: a82ecb97878e57f5d0cf75d623c94a6ff54e7805c43a7be140e6a2da7bbcca49
5
5
  SHA512:
6
- metadata.gz: 8e226c73b7530ffc0e382a438896ad0f0d9fbbc8acdc6064f4d33874be017b663fa525a1de7428cba0740849dbfae38152566ddb85bae8d71669e9ef995cefff
7
- data.tar.gz: 7ff0da628481fc403ca849e750b6e880f41cdca4f83ab92476d33f0aa3b42898a23825a2b92dc1618f371c47754e07a4a98fbf70d134c39bbd0e929281abd790
6
+ metadata.gz: bd6d178afa8669916b70f2c6bdb56b77a8a5995d18e47c64b360a8d5d334b479645d20792d80759afde1aac6550d79f46a0504030d8a8920c8dea55fa6ad6132
7
+ data.tar.gz: 20a2c5f144cfc8e2106d2e4643d7f3b0cb35110a5519d63bb0d8c1e655c8958fe73115089352a71272a419168c50299c8ce985d318be6bfa9635e64c9c4fb238
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.12.1 - 2026-02-18
4
+
5
+ * Added override for the threshold of when to abandon scraping due to unprocessable records
6
+ * $MORPH_UNPROCESSABLE_BASE - default 5.01
7
+ * $MORPH_UNPROCESSABLE_PERCENTAGE - default 10
8
+ * Log council_reference as well as address for unprocessable records
9
+
3
10
  ## 0.11.1 - 2026-02-13
4
11
 
5
12
  * Output council_reference from log_saved_record as it helps me debug some web query issues
@@ -74,7 +74,7 @@ class Scraper
74
74
  )
75
75
  end
76
76
 
77
- ScraperUtils::DbUtils.cleanup_old_records
77
+ ScraperUtils::DbUtils.cleanup_old_records
78
78
  # Report on results, raising errors for unexpected conditions
79
79
  ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
80
80
  end
@@ -92,6 +92,10 @@ if __FILE__ == $PROGRAM_NAME
92
92
 
93
93
  ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(',')
94
94
  end
95
+ # If the sites have many unusable records - raise defaults
96
+ # ENV['MORPH_UNPROCESSABLE_BASE'] ||= "10"
97
+ # ENV['MORPH_UNPROCESSABLE_PERCENTAGE'] ||= "20"
98
+
95
99
  Scraper.run(Scraper.selected_authorities)
96
100
 
97
101
  # Dump database for morph-cli
@@ -26,8 +26,12 @@ module ScraperUtils
26
26
  authority_label
27
27
  end
28
28
 
29
+ # Threshold for unprocessable records
30
+ # Initial base of 5.01 (override using MORPH_UNPROCESSABLE_BASE)
31
+ # Initial percentage of 10% (override using MORPH_UNPROCESSABLE_PERCENTAGE)
29
32
  def self.threshold(authority_label)
30
- 5.01 + (@stats[authority_label][:saved] * 0.1) if @stats&.fetch(authority_label, nil)
33
+ ENV.fetch('MORPH_UNPROCESSABLE_BASE', 5.01).to_f +
34
+ (@stats[authority_label][:saved].to_i * ENV.fetch('MORPH_UNPROCESSABLE_PERCENTAGE', 10.0).to_f / 100.0) if @stats&.fetch(authority_label, nil)
31
35
  end
32
36
 
33
37
  # Logs an unprocessable record and raises an exception if error threshold is exceeded
@@ -40,9 +44,12 @@ module ScraperUtils
40
44
  def self.log_unprocessable_record(exception, record)
41
45
  authority_label = extract_authority(record)
42
46
  @stats[authority_label][:unprocessed] += 1
43
- ScraperUtils::LogUtils.log "Erroneous record #{authority_label} - #{record&.fetch(
44
- 'address', nil
45
- ) || record.inspect}: #{exception}"
47
+ details = if record&.key?('council_reference') && record&.key?('address')
48
+ "#{record['council_reference']} - #{record['address']}"
49
+ else
50
+ record.inspect
51
+ end
52
+ ScraperUtils::LogUtils.log "Erroneous record #{details}: #{exception}"
46
53
  return unless @stats[authority_label][:unprocessed] > threshold(authority_label)
47
54
 
48
55
  raise ScraperUtils::UnprocessableSite,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.11.1"
4
+ VERSION = "0.12.1"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.1
4
+ version: 0.12.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-13 00:00:00.000000000 Z
11
+ date: 2026-02-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -112,7 +112,7 @@ metadata:
112
112
  allowed_push_host: https://rubygems.org
113
113
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
114
114
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
115
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.11.1
115
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.12.1
116
116
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
117
117
  rubygems_mfa_required: 'true'
118
118
  post_install_message: