scraper_utils 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +4 -0
- data/CHANGELOG.md +10 -1
- data/Gemfile +5 -2
- data/README.md +103 -149
- data/docs/example_scrape_with_fibers.rb +31 -0
- data/docs/example_scraper.rb +93 -0
- data/lib/scraper_utils/adaptive_delay.rb +55 -50
- data/lib/scraper_utils/data_quality_monitor.rb +28 -17
- data/lib/scraper_utils/date_range_utils.rb +159 -0
- data/lib/scraper_utils/db_utils.rb +0 -2
- data/lib/scraper_utils/debug_utils.rb +53 -6
- data/lib/scraper_utils/fiber_scheduler.rb +45 -22
- data/lib/scraper_utils/log_utils.rb +19 -17
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +67 -46
- data/lib/scraper_utils/mechanize_utils.rb +12 -4
- data/lib/scraper_utils/randomize_utils.rb +34 -0
- data/lib/scraper_utils/robots_checker.rb +9 -4
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +3 -10
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a2082c406ad96266f644fc1dfc588046aa43494e9e79b8fec38fe59252c09f06
+  data.tar.gz: 5100907cdcc8c55ddd59b25cf393d212f681b573ef874c5a6c65f748a8c852ec
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: feabea23ac5b14f6b642769db303bcab954b22cfd2d95d7694af51afd661c73f1ff70a0c17b54557a730fe811275c126c0fdd4473e2ae6d2b83cfc495aa52bc6
+  data.tar.gz: d91154c0dbccfb4271fd4830a82316676c1ad5665773c30d212bcc1ab6178a09d4fee97b2b2a32480c32b8ee69da08fc9a78575460d49f8233fb91b74bf7df66
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,14 @@
 # Changelog
 
+
+## 0.2.1 - 2025-02-28
+
+Fixed broken v0.2.0
+
+## 0.2.0 - 2025-02-28
+
+Added FiberScheduler, enabled compliant mode with delays by default and simplified usage, removing the third retry without proxy
+
 ## 0.1.0 - 2025-02-23
 
-
+First release for development
data/Gemfile
CHANGED
@@ -22,12 +22,15 @@ gem "sqlite3", platform && (platform == :heroku16 ? "~> 1.4.0" : "~> 1.6.3")
 gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git",
     branch: "morph_defaults"
 
-# development and test
+# development and test gems
 gem "rake", platform && (platform == :heroku16 ? "~> 12.3.3" : "~> 13.0")
 gem "rspec", platform && (platform == :heroku16 ? "~> 3.9.0" : "~> 3.12")
-gem "rubocop", platform && (platform == :heroku16 ? "~>
+gem "rubocop", platform && (platform == :heroku16 ? "~> 1.28.2" : "~> 1.73")
+gem "rubocop-rake", platform && (platform == :heroku16 ? "~> 0.6.0" : "~> 0.7")
+gem "rubocop-rspec", platform && (platform == :heroku16 ? "~> 2.10.0" : "~> 3.5")
 gem "simplecov", platform && (platform == :heroku16 ? "~> 0.18.0" : "~> 0.22.0")
 gem "simplecov-console"
+gem "terminal-table"
 gem "webmock", platform && (platform == :heroku16 ? "~> 3.14.0" : "~> 3.19.0")
 
 gemspec
data/README.md
CHANGED
@@ -13,11 +13,12 @@ our scraper accessing your systems, here's what you should know:
 
 ### How to Control Our Behavior
 
-Our scraper utilities respect the standard server **robots.txt** control mechanisms (by default).
+Our scraper utilities respect the standard server **robots.txt** control mechanisms (by default).
+To control our access:
 
 - Add a section for our user agent: `User-agent: ScraperUtils` (default)
-- Set a crawl delay: `Crawl-delay:
-- If needed specify disallowed paths
+- Set a crawl delay, e.g.: `Crawl-delay: 20`
+- If needed, specify disallowed paths: `Disallow: /private/`
 
 ### Built-in Politeness Features
 
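For reference, a robots.txt entry combining the three directives described above might look like the following; the crawl delay and path are purely illustrative values:

```
User-agent: ScraperUtils
Crawl-delay: 20
Disallow: /private/
```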
@@ -26,13 +27,22 @@ Even without specific configuration, our scrapers will, by default:
 - **Identify themselves**: Our user agent clearly indicates who we are and provides a link to the project repository:
   `Mozilla/5.0 (compatible; ScraperUtils/0.2.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)`
 
-- **Limit server load**: We
-
-  The slower your server is running, the longer the delay we add between requests to help
-  In the default "compliant mode" this defaults to 20% and
+- **Limit server load**: We slow down our requests so we should never be a significant load to your server, let alone
+  overload it.
+  The slower your server is running, the longer the delay we add between requests to help.
+  In the default "compliant mode" this defaults to a max load of 20% and is capped at 33%.
 
-- **Add randomized delays**: We add random delays between requests to
-
+- **Add randomized delays**: We add random delays between requests to further reduce our impact on servers, which should
+  bring us down to the load of a single industrious person.
+
+Extra utilities provided for scrapers to further reduce your server load:
+
+- **Interleave requests**: This spreads out the requests to your server rather than focusing on one scraper at a time.
+
+- **Intelligent Date Range selection**: This reduces server load by over 60% by a smarter choice of date range searches,
+  checking the recent 4 days each day and reducing down to checking every 3 days by the 33-day mark. This
+  replaces the simplistic check of the last 30 days each day.
 
 Our goal is to access public planning information without negatively impacting your services.
 
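As a rough worked example of the "max load" figure above, mirroring the delay calculation in `adaptive_delay.rb` further down this diff (a 20% max load means waiting roughly four times the server's response time):

```ruby
# Illustrative arithmetic only - mirrors response_multiplier = (100 - max_load) / max_load below
max_load = 20.0
response_multiplier = (100.0 - max_load) / max_load  # => 4.0
response_time = 0.5                                  # example: server took half a second to respond
target_delay = response_time * response_multiplier   # => 2.0 seconds before the next request
puts "At #{max_load}% max load, a #{response_time}s response leads to a ~#{target_delay}s delay"
```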
@@ -50,10 +60,6 @@ And then execute:
 
     $ bundle
 
-Or install it yourself for testing:
-
-    $ gem install scraper_utils
-
 Usage
 -----
 
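For completeness, the Gemfile line that the `bundle` step above assumes is the usual Bundler declaration for this gem:

```ruby
# In your scraper's Gemfile
gem "scraper_utils"
```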
@@ -101,12 +107,12 @@ export DEBUG=1
 Add `client_options` to your AUTHORITIES configuration and move any of the following settings into it:
 
 * `timeout: Integer` - Timeout for agent connections in case the server is slower than normal
-* `australian_proxy: true` - Use the MORPH_AUSTRALIAN_PROXY
+* `australian_proxy: true` - Use the proxy url in the `MORPH_AUSTRALIAN_PROXY` env variable if the site is geo-locked
 * `disable_ssl_certificate_check: true` - Disabled SSL verification for old / incorrect certificates
 
 See the documentation on `ScraperUtils::MechanizeUtils::AgentConfig` for more options
 
-Then adjust your code to accept client_options and pass then through to:
+Then adjust your code to accept `client_options` and pass them through to:
 `ScraperUtils::MechanizeUtils.mechanize_agent(client_options || {})`
 to receive a `Mechanize::Agent` configured accordingly.
 
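A minimal sketch of the `client_options` wiring described above; the authority key and option values are purely illustrative, while the `mechanize_agent` call is the one quoted in the README:

```ruby
# Hypothetical AUTHORITIES entry - the :example_town key and values are illustrative only
AUTHORITIES = {
  example_town: {
    url: "https://example.com/planning-applications",
    client_options: {
      timeout: 90,            # this server is slower than normal
      australian_proxy: true  # the site is geo-locked
    }
  }
}.freeze

# Inside your scraper, pass the options through when building the agent
client_options = AUTHORITIES[:example_town][:client_options]
agent = ScraperUtils::MechanizeUtils.mechanize_agent(client_options || {})
```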
@@ -115,130 +121,48 @@ The agent returned is configured using Mechanize hooks to implement the desired
 ### Default Configuration
 
 By default, the Mechanize agent is configured with the following settings.
+As you can see, the defaults can be changed using env variables.
+
+Note - compliant mode forces max_load to be set to a value no greater than 33.
+PLEASE don't use our user agent string with a max_load higher than 33!
 
 ```ruby
 ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
-  config.default_timeout = 60
-  config.default_compliant_mode = true
-  config.default_random_delay =
-  config.default_max_load = 20 #
-  config.default_disable_ssl_certificate_check = false
-  config.default_australian_proxy = false
+  config.default_timeout = ENV.fetch('MORPH_TIMEOUT', 60).to_i # 60
+  config.default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
+  config.default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', 15).to_i # 15
+  config.default_max_load = ENV.fetch('MORPH_MAX_LOAD', 20.0).to_f # 20
+  config.default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
+  config.default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
+  config.default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
 end
 ```
 
 You can modify these global defaults before creating any Mechanize agents. These settings will be used for all Mechanize
 agents created by `ScraperUtils::MechanizeUtils.mechanize_agent` unless overridden by passing parameters to that method.
 
-
-
-Update your `scraper.rb` as per the following example for basic utilities:
+To speed up testing, set the following in `spec_helper.rb`:
 
 ```ruby
-
-
-
-$LOAD_PATH << "./lib"
-
-require "scraper_utils"
-require "technology_one_scraper"
-
-# Main Scraper class
-class Scraper
-  AUTHORITIES = YourScraper::AUTHORITIES
-
-  # ADD: attempt argument
-  def scrape(authorities, attempt)
-    exceptions = {}
-    # ADD: Report attempt number
-    authorities.each do |authority_label|
-      puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
-
-      begin
-        # REPLACE:
-        # YourScraper.scrape(authority_label) do |record|
-        #   record["authority_label"] = authority_label.to_s
-        #   YourScraper.log(record)
-        #   ScraperWiki.save_sqlite(%w[authority_label council_reference], record)
-        # end
-        # WITH:
-        ScraperUtils::DataQualityMonitor.start_authority(authority_label)
-        YourScraper.scrape(authority_label) do |record|
-          begin
-            record["authority_label"] = authority_label.to_s
-            ScraperUtils::DbUtils.save_record(record)
-          rescue ScraperUtils::UnprocessableRecord => e
-            ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-            exceptions[authority_label] = e
-          end
-        end
-        # END OF REPLACE
-      end
-    rescue StandardError => e
-      warn "#{authority_label}: ERROR: #{e}"
-      warn e.backtrace
-      exceptions[authority_label] = e
-    end
-
-    exceptions
-  end
-
-  def self.selected_authorities
-    ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
-  end
-
-  def self.run(authorities)
-    puts "Scraping authorities: #{authorities.join(', ')}"
-    start_time = Time.now
-    exceptions = scrape(authorities, 1)
-    # Set start_time and attempt to the call above and log run below
-    ScraperUtils::LogUtils.log_scraping_run(
-      start_time,
-      1,
-      authorities,
-      exceptions
-    )
-
-    unless exceptions.empty?
-      puts "\n***************************************************"
-      puts "Now retrying authorities which earlier had failures"
-      puts exceptions.keys.join(", ").to_s
-      puts "***************************************************"
-
-      start_time = Time.now
-      exceptions = scrape(exceptions.keys, 2)
-      # Set start_time and attempt to the call above and log run below
-      ScraperUtils::LogUtils.log_scraping_run(
-        start_time,
-        2,
-        authorities,
-        exceptions
-      )
-    end
-
-    # Report on results, raising errors for unexpected conditions
-    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
-  end
+ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
+  config.default_random_delay = nil
+  config.default_max_load = 33
 end
+```
 
-
-# Default to list of authorities we can't or won't fix in code, explain why
-# wagga: url redirects and then reports Application error
+### Example updated `scraper.rb` file
 
-
-Scraper.run(Scraper.selected_authorities)
-end
-```
+Update your `scraper.rb` as per the [example scraper](docs/example_scraper.rb).
 
 Your code should raise ScraperUtils::UnprocessableRecord when there is a problem with the data presented on a page for a
 record.
 Then just before you would normally yield a record for saving, rescue that exception and:
 
-* Call ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+* Call `ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)`
 * NOT yield the record for saving
 
 In your code, update where you create a mechanize agent (often `YourScraper.scrape_period`) and the `AUTHORITIES` hash
-to move
+to move Mechanize agent options (like `australian_proxy` and `timeout`) to a hash under a new key: `client_options`.
 For example:
 
 ```ruby
@@ -297,44 +221,73 @@ The `ScraperUtils::FiberScheduler` provides a lightweight utility that:
 * thus optimizing the total scraper run time
 * allows you to increase the random delay for authorities without undue effect on total run time
 * For the curious, it uses [ruby fibers](https://ruby-doc.org/core-2.5.8/Fiber.html) rather than threads as that is
-  simpler to get right and debug!
+  a simpler system and thus easier to get right, understand and debug!
+* Cycles around the authorities when compliant_mode, max_load and random_delay are disabled
 
-To enable change the scrape method
+To enable, change the scrape method to be like the [example scrape method using fibers](docs/example_scrape_with_fibers.rb)
 
-
+And use `ScraperUtils::FiberScheduler.log` instead of `puts` when logging within the authority processing code.
+This will prefix the output lines with the authority name, which is needed since the system will interleave the work and
+thus the output.
+
+This uses `ScraperUtils::RandomizeUtils` as described below. Remember to add the recommended line to
+`spec/spec_helper.rb`.
+
+Intelligent Date Range Selection
+--------------------------------
+
+To further reduce server load and speed up scrapers, we provide an intelligent date range selection mechanism
+that can reduce server requests by 60% without significantly impacting delay in picking up changes.
+
+The `ScraperUtils::DateRangeUtils#calculate_date_ranges` method provides a smart approach to searching historical
+records:
+
+- Always checks the most recent 4 days daily (configurable)
+- Progressively reduces search frequency for older records
+- Uses a Fibonacci-like progression to create natural, efficient search intervals
+- Configurable `max_period` (default is 3 days)
+- Merges adjacent search ranges and handles the changeover in search frequency by extending some searches
 
-
-
-
-
-
-
-
-
-      YourScraper.scrape(authority_label) do |record|
-        begin
-          record["authority_label"] = authority_label.to_s
-          ScraperUtils::DbUtils.save_record(record)
-        rescue ScraperUtils::UnprocessableRecord => e
-          ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-          exceptions[authority_label] = e
-        end
-      end
-    rescue StandardError => e
-      warn "#{authority_label}: ERROR: #{e}"
-      warn e.backtrace
-      exceptions[authority_label] = e
-    end
-  end # end of register_operation block
+Example usage in your scraper:
+
+```ruby
+date_ranges = ScraperUtils::DateRangeUtils.new.calculate_date_ranges
+date_ranges.each do |from_date, to_date, _debugging_comment|
+  # Adjust your normal search code to use for this date range
+  your_search_records(from_date: from_date, to_date: to_date) do |record|
+    # process as normal
   end
-  ScraperUtils::FiberScheduler.run_all
-  exceptions
 end
 ```
 
-
-
-
+Typical server load reductions:
+
+* Max period 2 days : ~42% of the 33 days selected
+* Max period 3 days : ~37% of the 33 days selected (default)
+* Max period 5 days : ~35% (or ~31% when days = 45)
+
+See the class documentation for customizing defaults and passing options.
+
+Randomizing Requests
+--------------------
+
+Pass a `Collection` or `Array` to `ScraperUtils::RandomizeUtils.randomize_order` to randomize it in production, but
+receive it as is when testing.
+
+Use this with the list of records scraped from an index to randomise any requests for further information, to be less bot-like.
+
+### Spec setup
+
+You should enforce sequential mode when testing by adding the following code to `spec/spec_helper.rb`:
+
+```
+ScraperUtils::RandomizeUtils.sequential = true
+```
+
+Note:
+
+* You can also force sequential mode by setting the env variable `MORPH_PROCESS_SEQUENTIALLY` to `1` (any non-blank value)
+* testing using VCR requires sequential mode
 
 Development
 -----------
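A minimal sketch of the `ScraperUtils::RandomizeUtils.randomize_order` usage described in the Randomizing Requests section of the hunk above; `index_records` and `fetch_detail_page` are illustrative placeholders:

```ruby
# index_records: assumed to be the Array of records already scraped from an index page
detail_order = ScraperUtils::RandomizeUtils.randomize_order(index_records)
detail_order.each do |record|
  # Request the detail page for each record in randomized (less bot-like) order;
  # in specs (sequential mode) the original order is kept.
  fetch_detail_page(record) # hypothetical helper in your scraper
end
```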
@@ -356,7 +309,7 @@ NOTE: You need to use ruby 3.2.2 instead of 2.5.8 to release to OTP protected ac
 Contributing
 ------------
 
-Bug reports and pull requests are welcome on GitHub
+Bug reports and pull requests with working tests are welcome on [GitHub](https://github.com/ianheggie-oaf/scraper_utils)
 
 CHANGELOG.md is maintained by the author aiming to follow https://github.com/vweevers/common-changelog
 
@@ -364,3 +317,4 @@ License
 -------
 
 The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
+

data/docs/example_scrape_with_fibers.rb
ADDED
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+# Example scrape method updated to use ScraperUtils::FiberScheduler
+
+def scrape(authorities, attempt)
+  ScraperUtils::FiberScheduler.reset!
+  exceptions = {}
+  authorities.each do |authority_label|
+    ScraperUtils::FiberScheduler.register_operation(authority_label) do
+      ScraperUtils::FiberScheduler.log(
+        "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
+      )
+      ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+      YourScraper.scrape(authority_label) do |record|
+        record["authority_label"] = authority_label.to_s
+        ScraperUtils::DbUtils.save_record(record)
+      rescue ScraperUtils::UnprocessableRecord => e
+        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+        exceptions[authority_label] = e
+        # Continues processing other records
+      end
+    rescue StandardError => e
+      warn "#{authority_label}: ERROR: #{e}"
+      warn e.backtrace || "No backtrace available"
+      exceptions[authority_label] = e
+    end
+    # end of register_operation block
+  end
+  ScraperUtils::FiberScheduler.run_all
+  exceptions
+end

data/docs/example_scraper.rb
ADDED
@@ -0,0 +1,93 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+$LOAD_PATH << "./lib"
+
+require "scraper_utils"
+require "technology_one_scraper"
+
+# Main Scraper class
+class Scraper
+  AUTHORITIES = YourScraper::AUTHORITIES
+
+  # ADD: attempt argument
+  def scrape(authorities, attempt)
+    exceptions = {}
+    # ADD: Report attempt number
+    authorities.each do |authority_label|
+      puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
+
+      begin
+        # REPLACE:
+        # YourScraper.scrape(authority_label) do |record|
+        #   record["authority_label"] = authority_label.to_s
+        #   YourScraper.log(record)
+        #   ScraperWiki.save_sqlite(%w[authority_label council_reference], record)
+        # end
+        # WITH:
+        ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+        YourScraper.scrape(authority_label) do |record|
+          begin
+            record["authority_label"] = authority_label.to_s
+            ScraperUtils::DbUtils.save_record(record)
+          rescue ScraperUtils::UnprocessableRecord => e
+            ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+            exceptions[authority_label] = e
+          end
+        end
+        # END OF REPLACE
+      end
+    rescue StandardError => e
+      warn "#{authority_label}: ERROR: #{e}"
+      warn e.backtrace
+      exceptions[authority_label] = e
+    end
+
+    exceptions
+  end
+
+  def self.selected_authorities
+    ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
+  end
+
+  def self.run(authorities)
+    puts "Scraping authorities: #{authorities.join(', ')}"
+    start_time = Time.now
+    exceptions = scrape(authorities, 1)
+    # Set start_time and attempt to the call above and log run below
+    ScraperUtils::LogUtils.log_scraping_run(
+      start_time,
+      1,
+      authorities,
+      exceptions
+    )
+
+    unless exceptions.empty?
+      puts "\n***************************************************"
+      puts "Now retrying authorities which earlier had failures"
+      puts exceptions.keys.join(", ").to_s
+      puts "***************************************************"
+
+      start_time = Time.now
+      exceptions = scrape(exceptions.keys, 2)
+      # Set start_time and attempt to the call above and log run below
+      ScraperUtils::LogUtils.log_scraping_run(
+        start_time,
+        2,
+        authorities,
+        exceptions
+      )
+    end
+
+    # Report on results, raising errors for unexpected conditions
+    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
+  end
+end
+
+if __FILE__ == $PROGRAM_NAME
+  # Default to list of authorities we can't or won't fix in code, explain why
+  # wagga: url redirects and then reports Application error
+
+  ENV["MORPH_EXPECT_BAD"] ||= "wagga"
+  Scraper.run(Scraper.selected_authorities)
+end

data/lib/scraper_utils/adaptive_delay.rb
CHANGED
@@ -2,64 +2,69 @@
 
 require "uri"
 
-
-#
-#
-
-
-
+module ScraperUtils
+  # Adapts delays between requests based on server response times.
+  # Target delay is proportional to response time based on max_load setting.
+  # Uses an exponential moving average to smooth variations in response times.
+  class AdaptiveDelay
+    DEFAULT_MIN_DELAY = 0.0
+    DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
 
-
+    attr_reader :min_delay, :max_delay, :max_load
 
-
-
-
-
-
-
-
-
-
-
-
-
+    # Creates a new adaptive delay calculator
+    #
+    # @param min_delay [Float] Minimum delay between requests in seconds
+    # @param max_delay [Float] Maximum delay between requests in seconds
+    # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+    #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+    def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+      @delays = {} # domain -> last delay used
+      @min_delay = min_delay.to_f
+      @max_delay = max_delay.to_f
+      @max_load = max_load.to_f.clamp(1.0, 99.0)
+      @response_multiplier = (100.0 - @max_load) / @max_load
 
-
-
-      ScraperUtils::FiberScheduler.log
+      return unless DebugUtils.basic?
+
+      ScraperUtils::FiberScheduler.log(
+        "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
+        "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
+      )
     end
-    end
 
-
-
-
-
-
-
+    # @param uri [URI::Generic, String] The URL to extract the domain from
+    # @return [String] The domain in the format "scheme://host"
+    def domain(uri)
+      uri = URI(uri) unless uri.is_a?(URI)
+      "#{uri.scheme}://#{uri.host}".downcase
+    end
 
-
-
-
-
-
+    # @param uri [URI::Generic, String] URL to get delay for
+    # @return [Float] Current delay for the domain, or min_delay if no delay set
+    def delay(uri)
+      @delays[domain(uri)] || @min_delay
+    end
 
-
-
-
-
-
-
-
-
-
+    # @param uri [URI::Generic, String] URL the response came from
+    # @param response_time [Float] Time in seconds the server took to respond
+    # @return [Float] The calculated delay to use with the next request
+    def next_delay(uri, response_time)
+      uris_domain = domain(uri)
+      target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+      current_delay = @delays[uris_domain] || target_delay
+      delay = ((9.0 * current_delay) + target_delay) / 10.0
+      delay = delay.clamp(@min_delay, @max_delay)
 
-
-
-
-
-
+      if DebugUtils.basic?
+        ScraperUtils::FiberScheduler.log(
+          "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
+          "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
+        )
+      end
 
-
-
+      @delays[uris_domain] = delay
+      delay
+    end
   end
 end
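For illustration, a minimal sketch of how a scraper loop might drive the `AdaptiveDelay` class shown above. The URL, the Mechanize `agent`, and the timing code are assumptions for the sketch; the `AdaptiveDelay` calls are the ones in the diff:

```ruby
require "scraper_utils"

# 20% max load gives a response multiplier of (100 - 20) / 20 = 4x the response time
delay_calculator = ScraperUtils::AdaptiveDelay.new(max_load: 20.0)

url = "https://example.com/planning/applications" # placeholder URL
started = Time.now
page = agent.get(url) # assumes `agent` is a Mechanize agent built earlier
response_time = Time.now - started

# ... process `page` here ...

# e.g. a 0.5s response targets ~2.0s, smoothed by the exponential moving average
sleep(delay_calculator.next_delay(url, response_time))
```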