scraper_utils 0.5.0 → 0.5.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 63a24c24b497494b79c4d7e12f04a1bd2555068f37f50389f3906c0033817d7e
-  data.tar.gz: 6d6b96112dc3e2f9dc5a54de6318a544c240c0e3d5246ab4178c07346d0de7dc
+  metadata.gz: 28f415290516d19f6ffc95a7d349a6ed269de987a0ffe45ed512ff29bfc82902
+  data.tar.gz: 9d337d1145754bf2375f4d2e18d89da7c21231d7b9b279bc51340f92781caa35
 SHA512:
-  metadata.gz: eda8d10d996d51b7ef1d2610e21da31390c10dd29f4daa70bd5d9c3c8dc6eb9bed651803ccd6a59f53b03dae4fcd1ea016802e693f8828f4a13b92e07a0b046e
-  data.tar.gz: eba2704a99c6599a2789ec573fa335d7939a63d0c27b06886d6e905cd785e2095d7d0307e7aa1195a1209e022340fa5d027a72ccca61a350590058e998355d5d
+  metadata.gz: 8a3050451f512b2f77cf9cd806fc1602d6502b247f24248d93cfc12dea47bf5d7f02bd9be5453c0da7ee7b8acc0e3ee32cd375b52e04f0f366c2174ea7320bd9
+  data.tar.gz: 17befcb8b9305536385ddf6772aeee038df34bd42ce197c4951836bb396e1db9d3dabece2a543ece6f60e331ca244ef1f660f1b111af17d79703a2a783801183
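
Both digests change because the packaged files changed. If you want to reproduce these values for a downloaded copy, Ruby's standard `digest` library is enough; a minimal sketch, assuming the `.gem` archive has been unpacked (e.g. `tar -xf scraper_utils-0.5.1.gem`) so that `metadata.gz` and `data.tar.gz` sit in the current directory:

```ruby
require "digest"

# Assumes metadata.gz and data.tar.gz were extracted from the .gem archive
# into the current directory; compare the output against the checksums above.
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file} SHA256: #{Digest::SHA256.file(file).hexdigest}"
  puts "#{file} SHA512: #{Digest::SHA512.file(file).hexdigest}"
end
```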
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
 # Changelog
 
+## 0.5.1 - 2025-03-05
+
+* Remove duplicated example code in docs
+
 ## 0.5.0 - 2025-03-05
 
 * Add action processing utility
docs/example_scraper.rb CHANGED
@@ -4,7 +4,7 @@
 $LOAD_PATH << "./lib"
 
 require "scraper_utils"
-require "technology_one_scraper"
+require "your_scraper"
 
 # Main Scraper class
 class Scraper
@@ -17,26 +17,18 @@ class Scraper
     authorities.each do |authority_label|
       puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
 
-      begin
-        # REPLACE:
-        # YourScraper.scrape(authority_label) do |record|
-        #   record["authority_label"] = authority_label.to_s
-        #   YourScraper.log(record)
-        #   ScraperWiki.save_sqlite(%w[authority_label council_reference], record)
-        # end
-        # WITH:
-        ScraperUtils::DataQualityMonitor.start_authority(authority_label)
-        YourScraper.scrape(authority_label) do |record|
-          begin
-            record["authority_label"] = authority_label.to_s
-            ScraperUtils::DbUtils.save_record(record)
-          rescue ScraperUtils::UnprocessableRecord => e
-            ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-            exceptions[authority_label] = e
-          end
+      # REPLACE section with:
+      ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+      YourScraper.scrape(authority_label) do |record|
+        begin
+          record["authority_label"] = authority_label.to_s
+          ScraperUtils::DbUtils.save_record(record)
+        rescue ScraperUtils::UnprocessableRecord => e
+          ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+          exceptions[authority_label] = e
        end
-        # END OF REPLACE
      end
+      # END OF REPLACE
    rescue StandardError => e
      warn "#{authority_label}: ERROR: #{e}"
      warn e.backtrace
@@ -86,8 +78,9 @@ end
 
 if __FILE__ == $PROGRAM_NAME
   # Default to list of authorities we can't or won't fix in code, explain why
-  # wagga: url redirects and then reports Application error
+  # some: url-for-issue Summary Reason
+  # councils : url-for-issue Summary Reason
 
-  ENV["MORPH_EXPECT_BAD"] ||= "wagga"
+  ENV["MORPH_EXPECT_BAD"] ||= "some,councils"
   Scraper.run(Scraper.selected_authorities)
 end
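
Both hunks above tidy the template rather than change behaviour: the first collapses the old REPLACE/WITH comment block into a single marker, the second replaces the wagga-specific notes with generic placeholders. The surviving error-handling pattern is worth reading on its own; a minimal sketch of the per-authority loop body, with `YourScraper` standing in for your scraper module as in the template:

```ruby
# Sketch of the pattern the template installs (YourScraper is a placeholder).
ScraperUtils::DataQualityMonitor.start_authority(authority_label)
YourScraper.scrape(authority_label) do |record|
  record["authority_label"] = authority_label.to_s
  ScraperUtils::DbUtils.save_record(record)
rescue ScraperUtils::UnprocessableRecord => e
  # A bad record is logged and counted, but this authority's run continues.
  ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
  exceptions[authority_label] = e
end
```

Any other `StandardError` is rescued one level up (see the hunk above), so a broken site aborts only that authority, which is then retried on the second pass.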
docs/getting_started.md CHANGED
@@ -54,90 +54,19 @@ export DEBUG=1 # for basic, or 2 for verbose or 3 for tracing nearly everything
 
 ## Example Scraper Implementation
 
-Update your `scraper.rb` as follows:
-
-```ruby
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-$LOAD_PATH << "./lib"
-
-require "scraper_utils"
-require "your_scraper"
-
-# Main Scraper class
-class Scraper
-  AUTHORITIES = YourScraper::AUTHORITIES
-
-  def scrape(authorities, attempt)
-    exceptions = {}
-    authorities.each do |authority_label|
-      puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
-
-      begin
-        ScraperUtils::DataQualityMonitor.start_authority(authority_label)
-        YourScraper.scrape(authority_label) do |record|
-          begin
-            record["authority_label"] = authority_label.to_s
-            ScraperUtils::DbUtils.save_record(record)
-          rescue ScraperUtils::UnprocessableRecord => e
-            ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-            exceptions[authority_label] = e
-          end
-        end
-      rescue StandardError => e
-        warn "#{authority_label}: ERROR: #{e}"
-        warn e.backtrace
-        exceptions[authority_label] = e
-      end
-    end
-
-    exceptions
-  end
-
-  def self.selected_authorities
-    ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
-  end
-
-  def self.run(authorities)
-    puts "Scraping authorities: #{authorities.join(', ')}"
-    start_time = Time.now
-    exceptions = new.scrape(authorities, 1)
-    ScraperUtils::LogUtils.log_scraping_run(
-      start_time,
-      1,
-      authorities,
-      exceptions
-    )
-
-    unless exceptions.empty?
-      puts "\n***************************************************"
-      puts "Now retrying authorities which earlier had failures"
-      puts exceptions.keys.join(", ").to_s
-      puts "***************************************************"
-
-      start_time = Time.now
-      exceptions = new.scrape(exceptions.keys, 2)
-      ScraperUtils::LogUtils.log_scraping_run(
-        start_time,
-        2,
-        authorities,
-        exceptions
-      )
-    end
-
-    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
-  end
-end
-
-if __FILE__ == $PROGRAM_NAME
-  ENV["MORPH_EXPECT_BAD"] ||= "wagga"
-  Scraper.run(Scraper.selected_authorities)
-end
-```
+Update your `scraper.rb` as per [example scraper](example_scraper.rb)
 
 For more advanced implementations, see the [Interleaving Requests documentation](interleaving_requests.md).
 
+## Logging Tables
+
+The following logging tables are created to help monitor failure patterns and debug issues.
+Records are automatically cleared after 30 days.
+
+The `ScraperUtils::LogUtils.log_scraping_run` call also logs the information to the `scrape_log` table.
+
+The `ScraperUtils::LogUtils.save_summary_record` call also logs the information to the `scrape_summary` table.
+
 ## Next Steps
 
 - [Reducing Server Load](reducing_server_load.md)
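
The `LogUtils.log_scraping_run` call named under Logging Tables is the same one the removed example used, so its argument order is still documented by this diff. A minimal sketch of a run that feeds the `scrape_log` table, reusing the `(start_time, attempt, authorities, exceptions)` order from that example and the `Scraper` class defined above (the arguments to `save_summary_record` are not shown anywhere in this diff, so it is omitted here):

```ruby
require "scraper_utils"

authorities = Scraper.selected_authorities # Scraper class as in the example above
start_time = Time.now
exceptions = Scraper.new.scrape(authorities, 1)

# Writes a row to the scrape_log table (rows are cleared after 30 days).
ScraperUtils::LogUtils.log_scraping_run(
  start_time,
  1, # attempt number
  authorities,
  exceptions
)
```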
docs/interleaving_requests.md CHANGED
@@ -11,37 +11,8 @@ The `ScraperUtils::FiberScheduler` provides a lightweight utility that:
 
 ## Implementation
 
-To enable fiber scheduling, change your scrape method to follow this pattern:
-
-```ruby
-def scrape(authorities, attempt)
-  ScraperUtils::FiberScheduler.reset!
-  exceptions = {}
-  authorities.each do |authority_label|
-    ScraperUtils::FiberScheduler.register_operation(authority_label) do
-      ScraperUtils::FiberScheduler.log(
-        "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
-      )
-      ScraperUtils::DataQualityMonitor.start_authority(authority_label)
-      YourScraper.scrape(authority_label) do |record|
-        record["authority_label"] = authority_label.to_s
-        ScraperUtils::DbUtils.save_record(record)
-      rescue ScraperUtils::UnprocessableRecord => e
-        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-        exceptions[authority_label] = e
-        # Continues processing other records
-      end
-    rescue StandardError => e
-      warn "#{authority_label}: ERROR: #{e}"
-      warn e.backtrace || "No backtrace available"
-      exceptions[authority_label] = e
-    end
-    # end of register_operation block
-  end
-  ScraperUtils::FiberScheduler.run_all
-  exceptions
-end
-```
+To enable fiber scheduling, change your scrape method as per
+[example scrape with fibers](example_scrape_with_fibers.rb)
 
 ## Logging with FiberScheduler
 
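The removed listing still documents the scheduler's surface: `reset!`, `register_operation(authority_label) { ... }`, `log`, and `run_all`. A stripped-down sketch of just that lifecycle, with the real scraping work replaced by a placeholder comment and hypothetical authority labels:

```ruby
require "scraper_utils"

ScraperUtils::FiberScheduler.reset!

%i[authority_a authority_b].each do |authority_label| # hypothetical labels
  ScraperUtils::FiberScheduler.register_operation(authority_label) do
    # Runs inside the fiber registered for this authority.
    ScraperUtils::FiberScheduler.log("Collecting feed data for #{authority_label}...")
    # ... scrape and save records for this authority here ...
  end
end

# Runs all registered operations, interleaving them, until every fiber has finished.
ScraperUtils::FiberScheduler.run_all
```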
lib/scraper_utils/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module ScraperUtils
-  VERSION = "0.5.0"
+  VERSION = "0.5.1"
 end
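
A single-constant version file like this is the conventional way to keep the gemspec, and therefore the `metadata` section below, in lockstep with one edit. The wiring is assumed here rather than shown in this diff:

```ruby
# scraper_utils.gemspec -- conventional pattern, assumed rather than taken from this diff
require_relative "lib/scraper_utils/version"

Gem::Specification.new do |spec|
  spec.name    = "scraper_utils"
  spec.version = ScraperUtils::VERSION # picks up "0.5.1"
end
```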
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.5.1
 platform: ruby
 authors:
 - Ian Heggie
@@ -106,7 +106,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.5.0
+  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.5.1
   changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
   rubygems_mfa_required: 'true'
 post_install_message: