scraper_utils 0.5.0 → 0.5.1
This diff shows the changes between publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/docs/example_scraper.rb +14 -21
- data/docs/getting_started.md +10 -81
- data/docs/interleaving_requests.md +2 -31
- data/lib/scraper_utils/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 28f415290516d19f6ffc95a7d349a6ed269de987a0ffe45ed512ff29bfc82902
+  data.tar.gz: 9d337d1145754bf2375f4d2e18d89da7c21231d7b9b279bc51340f92781caa35
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8a3050451f512b2f77cf9cd806fc1602d6502b247f24248d93cfc12dea47bf5d7f02bd9be5453c0da7ee7b8acc0e3ee32cd375b52e04f0f366c2174ea7320bd9
+  data.tar.gz: 17befcb8b9305536385ddf6772aeee038df34bd42ce197c4951836bb396e1db9d3dabece2a543ece6f60e331ca244ef1f660f1b111af17d79703a2a783801183
```
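As a side note for readers who want to check these values themselves: a `.gem` file is a tar archive whose `metadata.gz` and `data.tar.gz` members are what `checksums.yaml` covers. A minimal Ruby sketch, assuming the gem has already been fetched locally (e.g. via `gem fetch scraper_utils -v 0.5.1`):

```ruby
require "digest"
require "rubygems/package"

# Hash the two archive members covered by checksums.yaml and print them
# for comparison against the SHA256 values in the diff above.
tar = Gem::Package::TarReader.new(File.open("scraper_utils-0.5.1.gem", "rb"))
tar.each do |entry|
  next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
  puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
end
tar.close
```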
data/CHANGELOG.md
CHANGED
data/docs/example_scraper.rb
CHANGED
```diff
@@ -4,7 +4,7 @@
 $LOAD_PATH << "./lib"
 
 require "scraper_utils"
-require "
+require "your_scraper"
 
 # Main Scraper class
 class Scraper
@@ -17,26 +17,18 @@ class Scraper
     authorities.each do |authority_label|
       puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
 
-
-
-
-
-
-
-
-
-
-      YourScraper.scrape(authority_label) do |record|
-        begin
-          record["authority_label"] = authority_label.to_s
-          ScraperUtils::DbUtils.save_record(record)
-        rescue ScraperUtils::UnprocessableRecord => e
-          ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-          exceptions[authority_label] = e
-        end
+      # REPLACE section with:
+      ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+      YourScraper.scrape(authority_label) do |record|
+        begin
+          record["authority_label"] = authority_label.to_s
+          ScraperUtils::DbUtils.save_record(record)
+        rescue ScraperUtils::UnprocessableRecord => e
+          ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+          exceptions[authority_label] = e
        end
-      # END OF REPLACE
      end
+      # END OF REPLACE
     rescue StandardError => e
       warn "#{authority_label}: ERROR: #{e}"
       warn e.backtrace
@@ -86,8 +78,9 @@ end
 
 if __FILE__ == $PROGRAM_NAME
   # Default to list of authorities we can't or won't fix in code, explain why
-  #
+  # some: url-for-issue Summary Reason
+  # councils : url-for-issue Summary Reason
 
-  ENV["MORPH_EXPECT_BAD"] ||= "
+  ENV["MORPH_EXPECT_BAD"] ||= "some,councils"
   Scraper.run(Scraper.selected_authorities)
 end
```
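The substance of this change: `ScraperUtils::DataQualityMonitor.start_authority` is now called before each authority is scraped, and the `# END OF REPLACE` marker moves after the closing `end`. Reassembled from the added lines above (indentation reconstructed, enclosing method omitted), the loop body now reads:

```ruby
# Inside authorities.each do |authority_label| ... end
ScraperUtils::DataQualityMonitor.start_authority(authority_label)
YourScraper.scrape(authority_label) do |record|
  begin
    record["authority_label"] = authority_label.to_s
    ScraperUtils::DbUtils.save_record(record)
  rescue ScraperUtils::UnprocessableRecord => e
    # Record the failure but keep processing the remaining records
    ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
    exceptions[authority_label] = e
  end
end
```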
data/docs/getting_started.md
CHANGED
````diff
@@ -54,90 +54,19 @@ export DEBUG=1 # for basic, or 2 for verbose or 3 for tracing nearly everything
 
 ## Example Scraper Implementation
 
-Update your `scraper.rb` as
-
-```ruby
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-$LOAD_PATH << "./lib"
-
-require "scraper_utils"
-require "your_scraper"
-
-# Main Scraper class
-class Scraper
-  AUTHORITIES = YourScraper::AUTHORITIES
-
-  def scrape(authorities, attempt)
-    exceptions = {}
-    authorities.each do |authority_label|
-      puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
-
-      begin
-        ScraperUtils::DataQualityMonitor.start_authority(authority_label)
-        YourScraper.scrape(authority_label) do |record|
-          begin
-            record["authority_label"] = authority_label.to_s
-            ScraperUtils::DbUtils.save_record(record)
-          rescue ScraperUtils::UnprocessableRecord => e
-            ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-            exceptions[authority_label] = e
-          end
-        end
-      rescue StandardError => e
-        warn "#{authority_label}: ERROR: #{e}"
-        warn e.backtrace
-        exceptions[authority_label] = e
-      end
-    end
-
-    exceptions
-  end
-
-  def self.selected_authorities
-    ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
-  end
-
-  def self.run(authorities)
-    puts "Scraping authorities: #{authorities.join(', ')}"
-    start_time = Time.now
-    exceptions = new.scrape(authorities, 1)
-    ScraperUtils::LogUtils.log_scraping_run(
-      start_time,
-      1,
-      authorities,
-      exceptions
-    )
-
-    unless exceptions.empty?
-      puts "\n***************************************************"
-      puts "Now retrying authorities which earlier had failures"
-      puts exceptions.keys.join(", ").to_s
-      puts "***************************************************"
-
-      start_time = Time.now
-      exceptions = new.scrape(exceptions.keys, 2)
-      ScraperUtils::LogUtils.log_scraping_run(
-        start_time,
-        2,
-        authorities,
-        exceptions
-      )
-    end
-
-    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
-  end
-end
-
-if __FILE__ == $PROGRAM_NAME
-  ENV["MORPH_EXPECT_BAD"] ||= "wagga"
-  Scraper.run(Scraper.selected_authorities)
-end
-```
+Update your `scraper.rb` as per [example scraper](example_scraper.rb)
 
 For more advanced implementations, see the [Interleaving Requests documentation](interleaving_requests.md).
 
+## Logging Tables
+
+The following logging tables are created for use in monitoring failure patterns and debugging issues.
+Records are automaticaly cleared after 30 days.
+
+The `ScraperUtils::LogUtils.log_scraping_run` call also logs the information to the `scrape_log` table.
+
+The `ScraperUtils::LogUtils.save_summary_record` call also logs the information to the `scrape_summary` table.
+
 ## Next Steps
 
 - [Reducing Server Load](reducing_server_load.md)
````
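The new Logging Tables note pairs with the `log_scraping_run` call shown in the removed inline example. A minimal sketch of a run that feeds the `scrape_log` table, reusing the four-argument call from that example (assumes the `Scraper` class from example_scraper.rb; `save_summary_record`'s arguments are not shown in this diff, so it is omitted):

```ruby
require "scraper_utils"

authorities = Scraper.selected_authorities
start_time = Time.now
exceptions = Scraper.new.scrape(authorities, 1)

# Logs the run to the scrape_log table; rows are cleared after
# 30 days per the note added above.
ScraperUtils::LogUtils.log_scraping_run(start_time, 1, authorities, exceptions)
```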
data/docs/interleaving_requests.md
CHANGED
````diff
@@ -11,37 +11,8 @@ The `ScraperUtils::FiberScheduler` provides a lightweight utility that:
 
 ## Implementation
 
-To enable fiber scheduling, change your scrape method
-
-```ruby
-def scrape(authorities, attempt)
-  ScraperUtils::FiberScheduler.reset!
-  exceptions = {}
-  authorities.each do |authority_label|
-    ScraperUtils::FiberScheduler.register_operation(authority_label) do
-      ScraperUtils::FiberScheduler.log(
-        "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
-      )
-      ScraperUtils::DataQualityMonitor.start_authority(authority_label)
-      YourScraper.scrape(authority_label) do |record|
-        record["authority_label"] = authority_label.to_s
-        ScraperUtils::DbUtils.save_record(record)
-      rescue ScraperUtils::UnprocessableRecord => e
-        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-        exceptions[authority_label] = e
-        # Continues processing other records
-      end
-    rescue StandardError => e
-      warn "#{authority_label}: ERROR: #{e}"
-      warn e.backtrace || "No backtrace available"
-      exceptions[authority_label] = e
-    end
-    # end of register_operation block
-  end
-  ScraperUtils::FiberScheduler.run_all
-  exceptions
-end
-```
+To enable fiber scheduling, change your scrape method as per
+[example scrape with fibers](example_scrape_with_fibers.rb)
 
 ## Logging with FiberScheduler
 
````
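Condensed to its skeleton, the fiber-scheduled pattern registers one block per authority and then runs them all; a sketch based on the removed example above (the interleaving behaviour is as described by this doc, not verified here):

```ruby
ScraperUtils::FiberScheduler.reset!
authorities.each do |authority_label|
  # Each registered block runs as its own operation; operations yield to
  # one another, so authorities are scraped interleaved rather than serially.
  ScraperUtils::FiberScheduler.register_operation(authority_label) do
    YourScraper.scrape(authority_label) do |record|
      ScraperUtils::DbUtils.save_record(record)
    end
  end
end
ScraperUtils::FiberScheduler.run_all
```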
metadata
CHANGED
```diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.5.1
 platform: ruby
 authors:
 - Ian Heggie
@@ -106,7 +106,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.5.0
+  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.5.1
   changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
   rubygems_mfa_required: 'true'
 post_install_message:
```
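To pick up this patch release in a consuming scraper, a pessimistic version pin works (a sketch, assuming Bundler with rubygems.org as the source):

```ruby
# Gemfile: allows 0.5.1 and later 0.5.x patch releases
gem "scraper_utils", "~> 0.5.1"
```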