scraper_utils 0.9.2 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +3 -1
- data/docs/example_scraper.rb +1 -0
- data/docs/mechanize_utilities.md +15 -4
- data/lib/scraper_utils/db_utils.rb +24 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +28 -9
- data/lib/scraper_utils/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ffc82e602be6fd69f2086d9abb9c5003e248f96f3d427e31208153b134696eb6
|
|
4
|
+
data.tar.gz: f60ba37d659d07f55f8c04a02a32c229f083e60e99deddb5507b58390ddaeacd
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6ae035de4c7cdc76e6bad9bf6fe8053b835b85689d106172ae6916a4fc67bb712796242443ee710a0c96475607ffd49f93069746f6fec79b583c9a19ef75f68c
|
|
7
|
+
data.tar.gz: 94e0a9ad963d5557161781a26233d792aa18ec63248aee3b79fcf9541f00fd9016e671d050571b53f2ae1d16e5f8e33d42f977e555e71af6e9565a4b2f5831c4
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.10.1 - 2026-01-27
|
|
4
|
+
|
|
5
|
+
* Added `ScraperUtils::DbUtils.cleanup_old_records` to clean up records older than 30 days and, approximately once a month,
|
|
6
|
+
vacuum the DB
|
|
7
|
+
* Pauses for the request time plus 0.5 seconds between requests
|
|
8
|
+
* Removed reference to random delay – it's either not required or it's not enough to make a difference
|
|
9
|
+
|
|
3
10
|
## 0.9.2 - 2026-01-27
|
|
4
11
|
|
|
5
12
|
* Removed Emoticons as they are four byte UTF-8 and some databases are configured to only store 3 byte UTF-8
|
data/README.md
CHANGED
|
@@ -23,6 +23,8 @@ see {file:docs/getting_started.md Getting Started guide}
|
|
|
23
23
|
|
|
24
24
|
- Configure Mechanize agents with sensible defaults
|
|
25
25
|
- Supports extra actions required to get to results page
|
|
26
|
+
- Plays nice with external servers by pausing (crawl_delay + response_time) between requests by default,
|
|
27
|
+
backing off when servers are slow
|
|
26
28
|
- {file:docs/mechanize_utilities.md Learn more about Mechanize utilities}
|
|
27
29
|
|
|
28
30
|
### Parallel Processing
|
|
@@ -77,4 +79,4 @@ on [ianheggie-oaf/scraper_utils | GitHub](https://github.com/ianheggie-oaf/scrap
|
|
|
77
79
|
|
|
78
80
|
## License
|
|
79
81
|
|
|
80
|
-
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
|
82
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/docs/example_scraper.rb
CHANGED
data/docs/mechanize_utilities.md
CHANGED
|
@@ -16,10 +16,18 @@ agent = ScraperUtils::MechanizeUtils.mechanize_agent(**options)
|
|
|
16
16
|
|
|
17
17
|
Add `client_options` to your AUTHORITIES configuration and move any of the following settings into it:
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
#### Connection Settings
|
|
20
|
+
|
|
21
|
+
* `timeout: Integer` - Timeout for agent connections in case the server is slower than normal (default: 60 seconds)
|
|
20
22
|
* `australian_proxy: true` - Use the proxy url in the `MORPH_AUSTRALIAN_PROXY` env variable if the site is geo-locked
|
|
21
23
|
* `disable_ssl_certificate_check: true` - Disables SSL verification for old / incorrect certificates
|
|
22
24
|
|
|
25
|
+
#### Delay Settings (Respectful Scraping)
|
|
26
|
+
|
|
27
|
+
* `crawl_delay: Float` - Minimum delay between requests in seconds (default: 0.5)
|
|
28
|
+
* `max_load: Float` - Maximum server load as a percentage. E.g., `50` pauses as long as the response took, making total
|
|
29
|
+
time 50% response + 50% pause (default: 50.0)
|
|
30
|
+
|
|
23
31
|
Then adjust your code to accept `client_options` and pass them through to:
|
|
24
32
|
`ScraperUtils::MechanizeUtils.mechanize_agent(client_options || {})`
|
|
25
33
|
to receive a `Mechanize::Agent` configured accordingly.
|
|
@@ -42,11 +50,13 @@ ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
|
|
|
42
50
|
end
|
|
43
51
|
```
|
|
44
52
|
|
|
45
|
-
For full details, see
|
|
53
|
+
For full details, see
|
|
54
|
+
the [MechanizeUtils class documentation](https://rubydoc.info/gems/scraper_utils/ScraperUtils/MechanizeUtils).
|
|
46
55
|
|
|
47
56
|
## MechanizeActions
|
|
48
57
|
|
|
49
|
-
The `ScraperUtils::MechanizeActions` class provides a convenient way to execute a series of actions (like clicking
|
|
58
|
+
The `ScraperUtils::MechanizeActions` class provides a convenient way to execute a series of actions (like clicking
|
|
59
|
+
links, filling forms) on a Mechanize page.
|
|
50
60
|
|
|
51
61
|
### Action Format
|
|
52
62
|
|
|
@@ -86,4 +96,5 @@ actions = [
|
|
|
86
96
|
]
|
|
87
97
|
```
|
|
88
98
|
|
|
89
|
-
For full details, see
|
|
99
|
+
For full details, see
|
|
100
|
+
the [MechanizeActions class documentation](https://rubydoc.info/gems/scraper_utils/ScraperUtils/MechanizeActions).
|
|
@@ -56,5 +56,29 @@ module ScraperUtils
|
|
|
56
56
|
ScraperUtils::DataQualityMonitor.log_saved_record(record)
|
|
57
57
|
end
|
|
58
58
|
end
|
|
59
|
+
|
|
60
|
+
# Clean up records older than 30 days and approx once a month vacuum the DB
|
|
61
|
+
def self.cleanup_old_records
|
|
62
|
+
cutoff_date = (Date.today - 30).to_s
|
|
63
|
+
vacuum_cutoff_date = (Date.today - 35).to_s
|
|
64
|
+
|
|
65
|
+
stats = ScraperWiki.sqliteexecute(
|
|
66
|
+
"SELECT COUNT(*) as count, MIN(date_scraped) as oldest FROM data WHERE date_scraped < ?",
|
|
67
|
+
[cutoff_date]
|
|
68
|
+
).first
|
|
69
|
+
|
|
70
|
+
deleted_count = stats["count"]
|
|
71
|
+
oldest_date = stats["oldest"]
|
|
72
|
+
|
|
73
|
+
return unless deleted_count.positive? || ENV["VACUUM"]
|
|
74
|
+
|
|
75
|
+
LogUtils.log "Deleting #{deleted_count} applications scraped between #{oldest_date} and #{cutoff_date}"
|
|
76
|
+
ScraperWiki.sqliteexecute("DELETE FROM data WHERE date_scraped < ?", [cutoff_date])
|
|
77
|
+
|
|
78
|
+
return unless rand < 0.03 || (oldest_date && oldest_date < vacuum_cutoff_date) || ENV["VACUUM"]
|
|
79
|
+
|
|
80
|
+
LogUtils.log " Running VACUUM to reclaim space..."
|
|
81
|
+
ScraperWiki.sqliteexecute("VACUUM")
|
|
82
|
+
end
|
|
59
83
|
end
|
|
60
84
|
end
|
|
@@ -19,10 +19,11 @@ module ScraperUtils
|
|
|
19
19
|
# @example Overriding specific settings
|
|
20
20
|
# config = ScraperUtils::MechanizeUtils::AgentConfig.new(
|
|
21
21
|
# timeout: 120,
|
|
22
|
-
# random_delay: 10
|
|
23
22
|
# )
|
|
24
23
|
class AgentConfig
|
|
25
24
|
DEFAULT_TIMEOUT = 60
|
|
25
|
+
DEFAULT_CRAWL_DELAY = 0.5
|
|
26
|
+
DEFAULT_MAX_LOAD = 50.0
|
|
26
27
|
|
|
27
28
|
# Class-level defaults that can be modified
|
|
28
29
|
class << self
|
|
@@ -38,6 +39,13 @@ module ScraperUtils
|
|
|
38
39
|
# @return [String, nil] Default Mechanize user agent
|
|
39
40
|
attr_accessor :default_user_agent
|
|
40
41
|
|
|
42
|
+
# @return [Float, nil] Default Crawl delay between requests in seconds
|
|
43
|
+
attr_accessor :default_crawl_delay
|
|
44
|
+
|
|
45
|
+
# @return [Float, nil] Default Max load presented to an external server as a percentage
|
|
46
|
+
# 50 will result in a pause the same length as the response (ie 50% of total time will be the response, 50% pausing)
|
|
47
|
+
attr_accessor :default_max_load
|
|
48
|
+
|
|
41
49
|
# Configure default settings for all AgentConfig instances
|
|
42
50
|
# @yield [self] Yields self for configuration
|
|
43
51
|
# @example
|
|
@@ -56,6 +64,8 @@ module ScraperUtils
|
|
|
56
64
|
@default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
|
|
57
65
|
@default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
|
|
58
66
|
@default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
|
|
67
|
+
@default_crawl_delay = ENV.fetch('MORPH_CLIENT_CRAWL_DELAY', DEFAULT_CRAWL_DELAY)
|
|
68
|
+
@default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD)
|
|
59
69
|
end
|
|
60
70
|
end
|
|
61
71
|
|
|
@@ -67,7 +77,7 @@ module ScraperUtils
|
|
|
67
77
|
|
|
68
78
|
# Give access for testing
|
|
69
79
|
|
|
70
|
-
attr_reader :max_load, :
|
|
80
|
+
attr_reader :max_load, :crawl_delay
|
|
71
81
|
|
|
72
82
|
# Creates Mechanize agent configuration with sensible defaults overridable via configure
|
|
73
83
|
# @param timeout [Integer, nil] Timeout for agent connections (default: 60)
|
|
@@ -76,8 +86,8 @@ module ScraperUtils
|
|
|
76
86
|
# @param user_agent [String, nil] Configure Mechanize user agent
|
|
77
87
|
def initialize(timeout: nil,
|
|
78
88
|
compliant_mode: nil,
|
|
79
|
-
random_delay: nil,
|
|
80
89
|
max_load: nil,
|
|
90
|
+
crawl_delay: nil,
|
|
81
91
|
disable_ssl_certificate_check: nil,
|
|
82
92
|
australian_proxy: nil,
|
|
83
93
|
user_agent: nil)
|
|
@@ -94,6 +104,9 @@ module ScraperUtils
|
|
|
94
104
|
else
|
|
95
105
|
australian_proxy
|
|
96
106
|
end
|
|
107
|
+
@crawl_delay = crawl_delay.nil? ? self.class.default_crawl_delay : crawl_delay.to_f
|
|
108
|
+
# Clamp between 10 (delay 9 x response) and 100 (no delay)
|
|
109
|
+
@max_load = (max_load.nil? ? self.class.default_max_load : max_load).to_f.clamp(10.0, 100.0)
|
|
97
110
|
|
|
98
111
|
# Validate proxy URL format if proxy will be used
|
|
99
112
|
@australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
|
|
@@ -111,11 +124,6 @@ module ScraperUtils
|
|
|
111
124
|
end
|
|
112
125
|
end
|
|
113
126
|
|
|
114
|
-
if @random_delay&.positive?
|
|
115
|
-
min_random = Math.sqrt(@random_delay * 3.0 / 13.0)
|
|
116
|
-
@random_range = min_random.round(3)..(3 * min_random).round(3)
|
|
117
|
-
end
|
|
118
|
-
|
|
119
127
|
today = Date.today.strftime("%Y-%m-%d")
|
|
120
128
|
@user_agent = ENV.fetch("MORPH_USER_AGENT", nil)&.sub("TODAY", today)
|
|
121
129
|
version = ScraperUtils::VERSION
|
|
@@ -179,12 +187,23 @@ module ScraperUtils
|
|
|
179
187
|
raise ArgumentError, "URI must be present in post-connect hook" unless uri
|
|
180
188
|
|
|
181
189
|
response_time = Time.now - @connection_started_at
|
|
190
|
+
|
|
191
|
+
response_delay = @crawl_delay || 0.0
|
|
192
|
+
if @crawl_delay ||@max_load
|
|
193
|
+
response_delay += response_time
|
|
194
|
+
if @max_load && @max_load >= 1
|
|
195
|
+
response_delay += (100.0 - @max_load) * response_time / @max_load
|
|
196
|
+
end
|
|
197
|
+
response_delay = response_delay.round(3)
|
|
198
|
+
end
|
|
199
|
+
|
|
182
200
|
if DebugUtils.basic?
|
|
183
201
|
ScraperUtils::LogUtils.log(
|
|
184
202
|
"Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
|
|
185
|
-
"after #{response_time} seconds"
|
|
203
|
+
"after #{response_time} seconds#{response_delay > 0.0 ? ", pausing for #{response_delay} seconds" : ""}"
|
|
186
204
|
)
|
|
187
205
|
end
|
|
206
|
+
sleep(response_delay) if response_delay > 0.0
|
|
188
207
|
response
|
|
189
208
|
end
|
|
190
209
|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.10.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-02-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -110,7 +110,7 @@ metadata:
|
|
|
110
110
|
allowed_push_host: https://rubygems.org
|
|
111
111
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
112
112
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
113
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.
|
|
113
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.10.1
|
|
114
114
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
115
115
|
rubygems_mfa_required: 'true'
|
|
116
116
|
post_install_message:
|