scraper_utils 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ffc82e602be6fd69f2086d9abb9c5003e248f96f3d427e31208153b134696eb6
4
- data.tar.gz: f60ba37d659d07f55f8c04a02a32c229f083e60e99deddb5507b58390ddaeacd
3
+ metadata.gz: b3eb0cb212c43c54a13ffbd95bc09e0be7f1f371bd4169076ba9f7c2cfc38c19
4
+ data.tar.gz: 3ea1460b28059ec868f7d5acd618d435e725c48a1892c5820f31e7450f4aa925
5
5
  SHA512:
6
- metadata.gz: 6ae035de4c7cdc76e6bad9bf6fe8053b835b85689d106172ae6916a4fc67bb712796242443ee710a0c96475607ffd49f93069746f6fec79b583c9a19ef75f68c
7
- data.tar.gz: 94e0a9ad963d5557161781a26233d792aa18ec63248aee3b79fcf9541f00fd9016e671d050571b53f2ae1d16e5f8e33d42f977e555e71af6e9565a4b2f5831c4
6
+ metadata.gz: 5da0a39adeff7b05c9adf7c194d921c91395633bcd8057d257a4255f39f7fd03bf006bd627165187842356fca4243fa762e5054ca78295f9dd90dc864261b7f7
7
+ data.tar.gz: 4d7bfe3c874e4aa0a730ecae249841e9dabab3171681b007697ea7d7b6702f8c834c85c9a86a36136d2318ebdfb0f6d1e8eceed1cd09271d5384c80ad4419aef
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.10.2 - 2026-02-09
4
+
5
+ * Added `ScraperUtils::MiscUtils.throttle_block` as documented in `docs/misc_utilities.md` for use with HTTParty
6
+ * Added spec for cleanup_old_records
7
+ Fixed cleanup_old_records for multi scrapers when extra status tables were missing
8
+
3
9
  ## 0.10.1 - 2026-01-27
4
10
 
5
11
  * Added `ScraperUtils::DbUtils.cleanup_old_records` to clean up records older than 30 days, approximately once a month
@@ -0,0 +1,28 @@
1
+ # Misc Utilities
2
+
3
+ ## Throttling Requests
4
+
5
+ Use `ScraperUtils::MiscUtils.throttle_block` to automatically pace requests based on server response time:
6
+
7
+ ```ruby
8
+ response = ScraperUtils::MiscUtils.throttle_block do
9
+ HTTParty.get(url)
10
+ end
11
+ # process response
12
+ ```
13
+
14
+ The throttle automatically:
15
+
16
+ - Measures block execution time
17
+ - Adds 0.5s delay (configurable via `extra_delay:`)
18
+ - Pauses before next request based on previous timing
19
+ - Caps pause at 120s maximum
20
+
21
+ Override the next pause duration manually if needed:
22
+
23
+ ```ruby
24
+ ScraperUtils::MiscUtils.pause_duration = 2.0
25
+ ```
26
+
27
+ **Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
28
+ each request is made and thus does not need to be wrapped with the helper.
@@ -236,6 +236,8 @@ module ScraperUtils
236
236
  "DELETE FROM #{table} WHERE date(run_at) < date(?)",
237
237
  [cutoff]
238
238
  )
239
+ rescue SqliteMagic::NoSuchTable => e
240
+ ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records" if ScraperUtils::DebugUtils.trace?
239
241
  end
240
242
  end
241
243
 
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ # Misc Standalone Utilities
5
+ module MiscUtils
6
+ MAX_PAUSE = 120.0
7
+
8
+ class << self
9
+ attr_accessor :pause_duration
10
+
11
+ # Throttle block to be nice to servers we are scraping
12
+ def throttle_block(extra_delay: 0.5)
13
+ if @pause_duration&.positive?
14
+ puts "Pausing #{@pause_duration}s" if ScraperUtils::DebugUtils.trace?
15
+ sleep(@pause_duration)
16
+ end
17
+ start_time = Time.now.to_f
18
+ result = yield
19
+ @pause_duration = (Time.now.to_f - start_time + extra_delay).round(3).clamp(0.0, MAX_PAUSE)
20
+ result
21
+ end
22
+ end
23
+ end
24
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.10.1"
4
+ VERSION = "0.10.2"
5
5
  end
data/lib/scraper_utils.rb CHANGED
@@ -9,6 +9,7 @@ require "scraper_utils/db_utils"
9
9
  require "scraper_utils/debug_utils"
10
10
  require "scraper_utils/log_utils"
11
11
  require "scraper_utils/maths_utils"
12
+ require "scraper_utils/misc_utils"
12
13
  require "scraper_utils/spec_support"
13
14
 
14
15
  # Mechanize utilities
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.10.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-04 00:00:00.000000000 Z
11
+ date: 2026-02-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -88,6 +88,7 @@ files:
88
88
  - docs/example_scraper.rb
89
89
  - docs/getting_started.md
90
90
  - docs/mechanize_utilities.md
91
+ - docs/misc_utilities.md
91
92
  - docs/parallel_scrapers.md
92
93
  - docs/testing_custom_scrapers.md
93
94
  - exe/validate_scraper_data
@@ -100,6 +101,7 @@ files:
100
101
  - lib/scraper_utils/maths_utils.rb
101
102
  - lib/scraper_utils/mechanize_utils.rb
102
103
  - lib/scraper_utils/mechanize_utils/agent_config.rb
104
+ - lib/scraper_utils/misc_utils.rb
103
105
  - lib/scraper_utils/spec_support.rb
104
106
  - lib/scraper_utils/version.rb
105
107
  - scraper_utils.gemspec
@@ -110,7 +112,7 @@ metadata:
110
112
  allowed_push_host: https://rubygems.org
111
113
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
112
114
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
113
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.10.1
115
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.10.2
114
116
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
115
117
  rubygems_mfa_required: 'true'
116
118
  post_install_message: