scraper_utils 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Gemfile-heroku-18 +26 -0
- data/lib/scraper_utils/log_utils.rb +39 -5
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +1 -1
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +1 -1
- data/lib/scraper_utils/scheduler/thread_response.rb +1 -1
- data/lib/scraper_utils/spec_support.rb +3 -4
- data/lib/scraper_utils/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9be052a9483d914f35555b8a4a80992be5d18737f79063161053faae9e97a849
|
4
|
+
data.tar.gz: b170b9f1b2b973647d8e3ce6426da425773077bdb8ef10dc551b451895326e1c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 210d580999d5db053f1581e7ba5926fa4d968e5e4e915e73bebb063f3106d106cb83d9fbb4645ad568e3994caab909ca27a942f10b5f965d3881eceb9a83f64a
|
7
|
+
data.tar.gz: 063eec87dc3e124744a73494db47e27505d7de330568ba1e3562d75bccdafe1ab054f358c6b93d3904785a001274d6aec7056f3bbc2ae4665280c605a4b100e9
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## 0.8.0 - 2025-05-06
|
4
|
+
|
5
|
+
* Added ScraperUtils::LogUtils.project_backtrace_line to provide the first project-related backtrace line
|
6
|
+
* Included this summarized line in ScraperUtils::LogUtils.report_on_results report
|
7
|
+
* Allow up to 250 character error messages (was max 50)
|
8
|
+
|
9
|
+
## 0.7.2 - 2025-04-15
|
10
|
+
|
11
|
+
* Accept postcode before state as well as after
|
12
|
+
|
3
13
|
## 0.7.1 - 2025-04-15
|
4
14
|
|
5
15
|
* Accept mixed case suburb names after a comma as well as uppercase suburb names as geocachable
|
data/Gemfile-heroku-18
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
source "https://rubygems.org"
|
4
|
+
|
5
|
+
ruby "~> 3.2.2"
|
6
|
+
|
7
|
+
gem "mechanize", "~> 2.8.5"
|
8
|
+
gem "nokogiri", "~> 1.15.0"
|
9
|
+
gem "sqlite3", "~> 1.6.3"
|
10
|
+
|
11
|
+
# Unable to list in gemspec - Include it in your project's Gemfile when using this gem
|
12
|
+
gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git",
|
13
|
+
branch: "morph_defaults"
|
14
|
+
|
15
|
+
# development and test gems
|
16
|
+
gem "rake", "~> 13.0"
|
17
|
+
gem "rspec", "~> 3.12"
|
18
|
+
gem "rubocop", "~> 1.73"
|
19
|
+
gem "rubocop-rake", "~> 0.7"
|
20
|
+
gem "rubocop-rspec", "~> 3.5"
|
21
|
+
gem "simplecov", "~> 0.22.0"
|
22
|
+
gem "simplecov-console"
|
23
|
+
gem "terminal-table"
|
24
|
+
gem "webmock", "~> 3.19.0"
|
25
|
+
gem "yard"
|
26
|
+
|
@@ -67,7 +67,7 @@ module ScraperUtils
|
|
67
67
|
"records_saved" => stats[:saved] || 0,
|
68
68
|
"unprocessable_records" => stats[:unprocessed] || 0,
|
69
69
|
"status" => status.to_s,
|
70
|
-
"error_message" => exception&.
|
70
|
+
"error_message" => exception&.to_s,
|
71
71
|
"error_class" => exception&.class&.to_s,
|
72
72
|
"error_backtrace" => extract_meaningful_backtrace(exception)
|
73
73
|
}
|
@@ -88,6 +88,38 @@ module ScraperUtils
|
|
88
88
|
cleanup_old_records
|
89
89
|
end
|
90
90
|
|
91
|
+
# Extracts the first relevant line from backtrace that's from our project
|
92
|
+
# (not from gems, vendor, or Ruby standard library)
|
93
|
+
#
|
94
|
+
# @param backtrace [Array<String>] The exception backtrace
|
95
|
+
# @param options [Hash] Options hash
|
96
|
+
# @option options [String] :pwd The project root directory (defaults to current working directory)
|
97
|
+
# @option options [Boolean] :format If true, returns formatted string with brackets
|
98
|
+
# @return [String, nil] The relevant backtrace line without PWD prefix, or nil if none found
|
99
|
+
def self.project_backtrace_line(backtrace, options = {})
|
100
|
+
return nil if backtrace.nil? || backtrace.empty?
|
101
|
+
|
102
|
+
# Set defaults
|
103
|
+
pwd = options[:pwd] || Dir.pwd
|
104
|
+
format = options[:format] || false
|
105
|
+
|
106
|
+
# Normalize the root directory path with a trailing slash
|
107
|
+
pwd = File.join(pwd, '')
|
108
|
+
|
109
|
+
backtrace.each do |line|
|
110
|
+
next if line.include?('/gems/') ||
|
111
|
+
line.include?('/vendor/') ||
|
112
|
+
line.include?('/ruby/')
|
113
|
+
|
114
|
+
if line.start_with?(pwd)
|
115
|
+
relative_path = line.sub(pwd, '')
|
116
|
+
return format ? " [#{relative_path}]" : relative_path
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
format ? "" : nil
|
121
|
+
end
|
122
|
+
|
91
123
|
# Report on the results
|
92
124
|
# @param authorities [Array<Symbol>] List of authorities attempted to scrape
|
93
125
|
# @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
|
@@ -117,12 +149,14 @@ module ScraperUtils
|
|
117
149
|
|
118
150
|
expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
|
119
151
|
exception_msg = if exceptions[authority]
|
120
|
-
|
152
|
+
location = self.project_backtrace_line(exceptions[authority].backtrace, format: true)
|
153
|
+
puts "LOCATION: #{location.inspect}"
|
154
|
+
"#{exceptions[authority].class} - #{exceptions[authority]}#{location}"
|
121
155
|
else
|
122
156
|
"-"
|
123
157
|
end
|
124
158
|
puts format(summary_format, authority.to_s, ok_records, bad_records,
|
125
|
-
"#{expect_bad_prefix}#{exception_msg}".slice(0,
|
159
|
+
"#{expect_bad_prefix}#{exception_msg}".slice(0, 250))
|
126
160
|
end
|
127
161
|
puts
|
128
162
|
|
@@ -149,7 +183,7 @@ module ScraperUtils
|
|
149
183
|
"(Add to MORPH_EXPECT_BAD?)"
|
150
184
|
unexpected_errors.each do |authority|
|
151
185
|
error = exceptions[authority]
|
152
|
-
errors << " #{authority}: #{error.class} - #{error
|
186
|
+
errors << " #{authority}: #{error.class} - #{error}"
|
153
187
|
end
|
154
188
|
end
|
155
189
|
|
@@ -212,7 +246,7 @@ module ScraperUtils
|
|
212
246
|
|
213
247
|
lines = []
|
214
248
|
error.backtrace.each do |line|
|
215
|
-
lines << line if lines.length < 2 || !line.include?("/vendor/")
|
249
|
+
lines << line if lines.length < 2 || !(line.include?("/vendor/") || line.include?("/gems/") || line.include?("/ruby/"))
|
216
250
|
break if lines.length >= 6
|
217
251
|
end
|
218
252
|
|
@@ -127,7 +127,7 @@ module ScraperUtils
|
|
127
127
|
uri = begin
|
128
128
|
URI.parse(ScraperUtils.australian_proxy.to_s)
|
129
129
|
rescue URI::InvalidURIError => e
|
130
|
-
raise URI::InvalidURIError, "Invalid proxy URL format: #{e
|
130
|
+
raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
|
131
131
|
end
|
132
132
|
unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
|
133
133
|
raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
|
@@ -51,7 +51,7 @@ module ScraperUtils
|
|
51
51
|
# @return [String] Readable representation
|
52
52
|
def inspect
|
53
53
|
status = success? ? "success" : "FAILED"
|
54
|
-
error_info = success? ? "" : " - #{error.class}: #{error
|
54
|
+
error_info = success? ? "" : " - #{error.class}: #{error}"
|
55
55
|
"#<#{self.class} authority=#{authority} #{status}#{error_info} time=#{time_taken}>"
|
56
56
|
end
|
57
57
|
end
|
@@ -11,10 +11,10 @@ module ScraperUtils
|
|
11
11
|
/\bB(oulevard|lvd)\b/i,
|
12
12
|
/\b(Circuit|Cct)\b/i,
|
13
13
|
/\bCl(ose)?\b/i,
|
14
|
-
/\bC(our|r)t\b/i,
|
14
|
+
/\bC(our|r)?t\b/i,
|
15
15
|
/\bCircle\b/i,
|
16
16
|
/\bChase\b/i,
|
17
|
-
/\bCr(
|
17
|
+
/\bCr(es(cent)?)?\b/i,
|
18
18
|
/\bDr((ive)?|v)\b/i,
|
19
19
|
/\bEnt(rance)?\b/i,
|
20
20
|
/\bGr(ove)?\b/i,
|
@@ -33,7 +33,6 @@ module ScraperUtils
|
|
33
33
|
/\bWay\b/i
|
34
34
|
].freeze
|
35
35
|
|
36
|
-
|
37
36
|
AUSTRALIAN_POSTCODES = /\b\d{4}\b/.freeze
|
38
37
|
|
39
38
|
# Check if an address is likely to be geocodable by analyzing its format.
|
@@ -53,7 +52,7 @@ module ScraperUtils
|
|
53
52
|
|
54
53
|
has_unit_or_lot = address.match?(/\b(Unit|Lot:?)\s+\d+/i)
|
55
54
|
|
56
|
-
has_suburb_stats = check_address.match?(/(\b[A-Z]{2,}(\s+[A-Z]+)*,?|,\s+[A-Z][A-Za-z ]+)\s+(#{AUSTRALIAN_STATES.join('|')})\b/)
|
55
|
+
has_suburb_stats = check_address.match?(/(\b[A-Z]{2,}(\s+[A-Z]+)*,?|,\s+[A-Z][A-Za-z ]+)(\s+\d{4})?\s+(#{AUSTRALIAN_STATES.join('|')})\b/)
|
57
56
|
|
58
57
|
if ENV["DEBUG"]
|
59
58
|
missing = []
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraper_utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ian Heggie
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-05-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -68,6 +68,7 @@ files:
|
|
68
68
|
- CHANGELOG.md
|
69
69
|
- GUIDELINES.md
|
70
70
|
- Gemfile
|
71
|
+
- Gemfile-heroku-18
|
71
72
|
- IMPLEMENTATION.md
|
72
73
|
- LICENSE.txt
|
73
74
|
- README.md
|
@@ -118,7 +119,7 @@ metadata:
|
|
118
119
|
allowed_push_host: https://rubygems.org
|
119
120
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
120
121
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
121
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.
|
122
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.8.0
|
122
123
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
123
124
|
rubygems_mfa_required: 'true'
|
124
125
|
post_install_message:
|
@@ -136,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
136
137
|
- !ruby/object:Gem::Version
|
137
138
|
version: '0'
|
138
139
|
requirements: []
|
139
|
-
rubygems_version: 3.4.
|
140
|
+
rubygems_version: 3.4.19
|
140
141
|
signing_key:
|
141
142
|
specification_version: 4
|
142
143
|
summary: planningalerts scraper utilities
|