scraper_utils 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/docs/misc_utilities.md +28 -0
- data/lib/scraper_utils/log_utils.rb +2 -0
- data/lib/scraper_utils/misc_utils.rb +24 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +1 -0
- metadata +5 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b3eb0cb212c43c54a13ffbd95bc09e0be7f1f371bd4169076ba9f7c2cfc38c19
|
|
4
|
+
data.tar.gz: 3ea1460b28059ec868f7d5acd618d435e725c48a1892c5820f31e7450f4aa925
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5da0a39adeff7b05c9adf7c194d921c91395633bcd8057d257a4255f39f7fd03bf006bd627165187842356fca4243fa762e5054ca78295f9dd90dc864261b7f7
|
|
7
|
+
data.tar.gz: 4d7bfe3c874e4aa0a730ecae249841e9dabab3171681b007697ea7d7b6702f8c834c85c9a86a36136d2318ebdfb0f6d1e8eceed1cd09271d5384c80ad4419aef
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.10.2 - 2026-02-09
|
|
4
|
+
|
|
5
|
+
* Added `ScraperUtils::MiscUtils.throttle_block` as documented in `docs/misc_utilities.md` for use with HTTParty
|
|
6
|
+
* Added spec for cleanup_old_records
|
|
7
|
+
* Fixed cleanup_old_records for multi-scrapers if extra status tables were missing
|
|
8
|
+
|
|
3
9
|
## 0.10.1 - 2026-01-27
|
|
4
10
|
|
|
5
11
|
* Added `ScraperUtils::DbUtils.cleanup_old_records` to clean up records older than 30 days, approximately once a month
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Misc Utilities
|
|
2
|
+
|
|
3
|
+
## Throttling Requests
|
|
4
|
+
|
|
5
|
+
Use `ScraperUtils::MiscUtils.throttle_block` to automatically pace requests based on server response time:
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
response = ScraperUtils::MiscUtils.throttle_block do
|
|
9
|
+
HTTParty.get(url)
|
|
10
|
+
end
|
|
11
|
+
# process response
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
The throttle automatically:
|
|
15
|
+
|
|
16
|
+
- Measures block execution time
|
|
17
|
+
- Adds 0.5s delay (configurable via `extra_delay:`)
|
|
18
|
+
- Pauses before next request based on previous timing
|
|
19
|
+
- Caps pause at 120s maximum
|
|
20
|
+
|
|
21
|
+
Override the next pause duration manually if needed:
|
|
22
|
+
|
|
23
|
+
```ruby
|
|
24
|
+
ScraperUtils::MiscUtils.pause_duration = 2.0
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
|
|
28
|
+
each request is made and thus does not need to be wrapped with the helper.
|
|
@@ -236,6 +236,8 @@ module ScraperUtils
|
|
|
236
236
|
"DELETE FROM #{table} WHERE date(run_at) < date(?)",
|
|
237
237
|
[cutoff]
|
|
238
238
|
)
|
|
239
|
+
rescue SqliteMagic::NoSuchTable => e
|
|
240
|
+
ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records" if ScraperUtils::DebugUtils.trace?
|
|
239
241
|
end
|
|
240
242
|
end
|
|
241
243
|
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ScraperUtils
|
|
4
|
+
# Misc Standalone Utilities
|
|
5
|
+
module MiscUtils
|
|
6
|
+
MAX_PAUSE = 120.0
|
|
7
|
+
|
|
8
|
+
class << self
|
|
9
|
+
attr_accessor :pause_duration
|
|
10
|
+
|
|
11
|
+
# Throttle block to be nice to servers we are scraping
|
|
12
|
+
def throttle_block(extra_delay: 0.5)
|
|
13
|
+
if @pause_duration&.positive?
|
|
14
|
+
puts "Pausing #{@pause_duration}s" if ScraperUtils::DebugUtils.trace?
|
|
15
|
+
sleep(@pause_duration)
|
|
16
|
+
end
|
|
17
|
+
start_time = Time.now.to_f
|
|
18
|
+
result = yield
|
|
19
|
+
@pause_duration = (Time.now.to_f - start_time + extra_delay).round(3).clamp(0.0, MAX_PAUSE)
|
|
20
|
+
result
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
data/lib/scraper_utils.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.10.
|
|
4
|
+
version: 0.10.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -88,6 +88,7 @@ files:
|
|
|
88
88
|
- docs/example_scraper.rb
|
|
89
89
|
- docs/getting_started.md
|
|
90
90
|
- docs/mechanize_utilities.md
|
|
91
|
+
- docs/misc_utilities.md
|
|
91
92
|
- docs/parallel_scrapers.md
|
|
92
93
|
- docs/testing_custom_scrapers.md
|
|
93
94
|
- exe/validate_scraper_data
|
|
@@ -100,6 +101,7 @@ files:
|
|
|
100
101
|
- lib/scraper_utils/maths_utils.rb
|
|
101
102
|
- lib/scraper_utils/mechanize_utils.rb
|
|
102
103
|
- lib/scraper_utils/mechanize_utils/agent_config.rb
|
|
104
|
+
- lib/scraper_utils/misc_utils.rb
|
|
103
105
|
- lib/scraper_utils/spec_support.rb
|
|
104
106
|
- lib/scraper_utils/version.rb
|
|
105
107
|
- scraper_utils.gemspec
|
|
@@ -110,7 +112,7 @@ metadata:
|
|
|
110
112
|
allowed_push_host: https://rubygems.org
|
|
111
113
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
112
114
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
113
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.10.
|
|
115
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.10.2
|
|
114
116
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
115
117
|
rubygems_mfa_required: 'true'
|
|
116
118
|
post_install_message:
|