scraper_utils 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/docs/misc_utilities.md +28 -0
- data/lib/scraper_utils/log_utils.rb +2 -0
- data/lib/scraper_utils/misc_utils.rb +24 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +1 -0
- metadata +5 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b3eb0cb212c43c54a13ffbd95bc09e0be7f1f371bd4169076ba9f7c2cfc38c19
|
|
4
|
+
data.tar.gz: 3ea1460b28059ec868f7d5acd618d435e725c48a1892c5820f31e7450f4aa925
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5da0a39adeff7b05c9adf7c194d921c91395633bcd8057d257a4255f39f7fd03bf006bd627165187842356fca4243fa762e5054ca78295f9dd90dc864261b7f7
|
|
7
|
+
data.tar.gz: 4d7bfe3c874e4aa0a730ecae249841e9dabab3171681b007697ea7d7b6702f8c834c85c9a86a36136d2318ebdfb0f6d1e8eceed1cd09271d5384c80ad4419aef
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.10.2 - 2026-02-09
|
|
4
|
+
|
|
5
|
+
* Added `ScraperUtils::MiscUtils.throttle_block` as documented in `docs/misc_utilities.md` for use with HTTParty
|
|
6
|
+
* Added spec for cleanup_old_records
|
|
7
|
+
* Fixed cleanup_old_records for multi-scrapers if extra status tables were missing
|
|
8
|
+
|
|
3
9
|
## 0.10.1 - 2026-01-27
|
|
4
10
|
|
|
5
11
|
* Added `ScraperUtils::DbUtils.cleanup_old_records` to clean up records older than 30 days, approximately once a month
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Misc Utilities
|
|
2
|
+
|
|
3
|
+
## Throttling Requests
|
|
4
|
+
|
|
5
|
+
Use `ScraperUtils::MiscUtils.throttle_block` to automatically pace requests based on server response time:
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
response = ScraperUtils::MiscUtils.throttle_block do
|
|
9
|
+
HTTParty.get(url)
|
|
10
|
+
end
|
|
11
|
+
# process response
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
The throttle automatically:
|
|
15
|
+
|
|
16
|
+
- Measures block execution time
|
|
17
|
+
- Adds 0.5s delay (configurable via `extra_delay:`)
|
|
18
|
+
- Pauses before next request based on previous timing
|
|
19
|
+
- Caps pause at 120s maximum
|
|
20
|
+
|
|
21
|
+
Override the next pause duration manually if needed:
|
|
22
|
+
|
|
23
|
+
```ruby
|
|
24
|
+
ScraperUtils::MiscUtils.pause_duration = 2.0
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
|
|
28
|
+
each request is made and thus does not need to be wrapped with the helper.
|
|
@@ -236,6 +236,8 @@ module ScraperUtils
|
|
|
236
236
|
"DELETE FROM #{table} WHERE date(run_at) < date(?)",
|
|
237
237
|
[cutoff]
|
|
238
238
|
)
|
|
239
|
+
rescue SqliteMagic::NoSuchTable => e
|
|
240
|
+
ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records" if ScraperUtils::DebugUtils.trace?
|
|
239
241
|
end
|
|
240
242
|
end
|
|
241
243
|
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ScraperUtils
|
|
4
|
+
# Misc Standalone Utilities
|
|
5
|
+
module MiscUtils
|
|
6
|
+
MAX_PAUSE = 120.0
|
|
7
|
+
|
|
8
|
+
class << self
|
|
9
|
+
attr_accessor :pause_duration
|
|
10
|
+
|
|
11
|
+
# Throttle block to be nice to servers we are scraping
|
|
12
|
+
def throttle_block(extra_delay: 0.5)
|
|
13
|
+
if @pause_duration&.positive?
|
|
14
|
+
puts "Pausing #{@pause_duration}s" if ScraperUtils::DebugUtils.trace?
|
|
15
|
+
sleep(@pause_duration)
|
|
16
|
+
end
|
|
17
|
+
start_time = Time.now.to_f
|
|
18
|
+
result = yield
|
|
19
|
+
@pause_duration = (Time.now.to_f - start_time + extra_delay).round(3).clamp(0.0, MAX_PAUSE)
|
|
20
|
+
result
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
data/lib/scraper_utils.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.10.
|
|
4
|
+
version: 0.10.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -88,6 +88,7 @@ files:
|
|
|
88
88
|
- docs/example_scraper.rb
|
|
89
89
|
- docs/getting_started.md
|
|
90
90
|
- docs/mechanize_utilities.md
|
|
91
|
+
- docs/misc_utilities.md
|
|
91
92
|
- docs/parallel_scrapers.md
|
|
92
93
|
- docs/testing_custom_scrapers.md
|
|
93
94
|
- exe/validate_scraper_data
|
|
@@ -100,6 +101,7 @@ files:
|
|
|
100
101
|
- lib/scraper_utils/maths_utils.rb
|
|
101
102
|
- lib/scraper_utils/mechanize_utils.rb
|
|
102
103
|
- lib/scraper_utils/mechanize_utils/agent_config.rb
|
|
104
|
+
- lib/scraper_utils/misc_utils.rb
|
|
103
105
|
- lib/scraper_utils/spec_support.rb
|
|
104
106
|
- lib/scraper_utils/version.rb
|
|
105
107
|
- scraper_utils.gemspec
|
|
@@ -110,7 +112,7 @@ metadata:
|
|
|
110
112
|
allowed_push_host: https://rubygems.org
|
|
111
113
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
112
114
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
113
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.10.
|
|
115
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.10.2
|
|
114
116
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
115
117
|
rubygems_mfa_required: 'true'
|
|
116
118
|
post_install_message:
|