scraper_utils 0.10.1 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ffc82e602be6fd69f2086d9abb9c5003e248f96f3d427e31208153b134696eb6
4
- data.tar.gz: f60ba37d659d07f55f8c04a02a32c229f083e60e99deddb5507b58390ddaeacd
3
+ metadata.gz: 2437b1d72d0a7d4924b4f22d906844fc15eef3448095e04365b210df368a0e80
4
+ data.tar.gz: 860865eff6d101a28bc4c6be564a3cbd1c23ce97c7da258d4baba038c8e4713d
5
5
  SHA512:
6
- metadata.gz: 6ae035de4c7cdc76e6bad9bf6fe8053b835b85689d106172ae6916a4fc67bb712796242443ee710a0c96475607ffd49f93069746f6fec79b583c9a19ef75f68c
7
- data.tar.gz: 94e0a9ad963d5557161781a26233d792aa18ec63248aee3b79fcf9541f00fd9016e671d050571b53f2ae1d16e5f8e33d42f977e555e71af6e9565a4b2f5831c4
6
+ metadata.gz: 8e226c73b7530ffc0e382a438896ad0f0d9fbbc8acdc6064f4d33874be017b663fa525a1de7428cba0740849dbfae38152566ddb85bae8d71669e9ef995cefff
7
+ data.tar.gz: 7ff0da628481fc403ca849e750b6e880f41cdca4f83ab92476d33f0aa3b42898a23825a2b92dc1618f371c47754e07a4a98fbf70d134c39bbd0e929281abd790
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.11.1 - 2026-02-13
4
+
5
+ * Output council_reference from log_saved_record as it helps me debug some web query issues
6
+
7
+ ## 0.10.2 - 2026-02-09
8
+
9
+ * Added `ScraperUtils::MiscUtils.throttle_block` as documented in `docs/misc_utilities.md` for use with HTTParty
10
+ * Added spec for cleanup_old_records
11
+ * Fixed cleanup_old_records for multi scrapers if extra status tables were missing
12
+
3
13
  ## 0.10.1 - 2026-01-27
4
14
 
5
15
  * Added `ScraperUtils::DbUtils.cleanup_old_records` to Clean up records older than 30 days and approx once a month
@@ -0,0 +1,28 @@
1
+ # Misc Utilities
2
+
3
+ ## Throttling Requests
4
+
5
+ Use `ScraperUtils::MiscUtils.throttle_block` to automatically pace requests based on server response time:
6
+
7
+ ```ruby
8
+ response = ScraperUtils::MiscUtils.throttle_block do
9
+ HTTParty.get(url)
10
+ end
11
+ # process response
12
+ ```
13
+
14
+ The throttle automatically:
15
+
16
+ - Measures block execution time
17
+ - Adds 0.5s delay (configurable via `extra_delay:`)
18
+ - Pauses before next request based on previous timing
19
+ - Caps pause at 120s maximum
20
+
21
+ Override the next pause duration manually if needed:
22
+
23
+ ```ruby
24
+ ScraperUtils::MiscUtils.pause_duration = 2.0
25
+ ```
26
+
27
+ **Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
28
+ each request is made and thus does not need to be wrapped with the helper.
@@ -57,7 +57,7 @@ module ScraperUtils
57
57
  def self.log_saved_record(record)
58
58
  authority_label = extract_authority(record)
59
59
  @stats[authority_label][:saved] += 1
60
- ScraperUtils::LogUtils.log "Saving record #{authority_label} - #{record['address']}"
60
+ ScraperUtils::LogUtils.log "Saving record #{authority_label&.empty? ? '' : "for #{authority_label}: "}#{record['council_reference']} - #{record['address']}"
61
61
  end
62
62
  end
63
63
  end
@@ -236,6 +236,8 @@ module ScraperUtils
236
236
  "DELETE FROM #{table} WHERE date(run_at) < date(?)",
237
237
  [cutoff]
238
238
  )
239
+ rescue SqliteMagic::NoSuchTable => e
240
+ ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records" if ScraperUtils::DebugUtils.trace?
239
241
  end
240
242
  end
241
243
 
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ # Misc Standalone Utilities
5
+ module MiscUtils
6
+ MAX_PAUSE = 120.0
7
+
8
+ class << self
9
+ attr_accessor :pause_duration
10
+
11
+ # Throttle block to be nice to servers we are scraping
12
+ def throttle_block(extra_delay: 0.5)
13
+ if @pause_duration&.positive?
14
+ puts "Pausing #{@pause_duration}s" if ScraperUtils::DebugUtils.trace?
15
+ sleep(@pause_duration)
16
+ end
17
+ start_time = Time.now.to_f
18
+ result = yield
19
+ @pause_duration = (Time.now.to_f - start_time + extra_delay).round(3).clamp(0.0, MAX_PAUSE)
20
+ result
21
+ end
22
+ end
23
+ end
24
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.10.1"
4
+ VERSION = "0.11.1"
5
5
  end
data/lib/scraper_utils.rb CHANGED
@@ -9,6 +9,7 @@ require "scraper_utils/db_utils"
9
9
  require "scraper_utils/debug_utils"
10
10
  require "scraper_utils/log_utils"
11
11
  require "scraper_utils/maths_utils"
12
+ require "scraper_utils/misc_utils"
12
13
  require "scraper_utils/spec_support"
13
14
 
14
15
  # Mechanize utilities
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.11.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-04 00:00:00.000000000 Z
11
+ date: 2026-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -88,6 +88,7 @@ files:
88
88
  - docs/example_scraper.rb
89
89
  - docs/getting_started.md
90
90
  - docs/mechanize_utilities.md
91
+ - docs/misc_utilities.md
91
92
  - docs/parallel_scrapers.md
92
93
  - docs/testing_custom_scrapers.md
93
94
  - exe/validate_scraper_data
@@ -100,6 +101,7 @@ files:
100
101
  - lib/scraper_utils/maths_utils.rb
101
102
  - lib/scraper_utils/mechanize_utils.rb
102
103
  - lib/scraper_utils/mechanize_utils/agent_config.rb
104
+ - lib/scraper_utils/misc_utils.rb
103
105
  - lib/scraper_utils/spec_support.rb
104
106
  - lib/scraper_utils/version.rb
105
107
  - scraper_utils.gemspec
@@ -110,7 +112,7 @@ metadata:
110
112
  allowed_push_host: https://rubygems.org
111
113
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
112
114
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
113
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.10.1
115
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.11.1
114
116
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
115
117
  rubygems_mfa_required: 'true'
116
118
  post_install_message: