scraper_utils 0.7.2 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ba66a28129ee09ab76cb0937d195ff68aa5058d1c73805235de4898384fe495d
4
- data.tar.gz: 76c20b7ce9bd581e59fda41b8801bca8a26909d58ddcc74c46e7755e6038970a
3
+ metadata.gz: e83cc7c9eb0f93f1dd1b84e9528815cc9e81564dfbcacd7afdd32ec2acb60965
4
+ data.tar.gz: d1be3bb53939f60470b1ff247ac61c31a2946619306076a48c390e42431d2c1e
5
5
  SHA512:
6
- metadata.gz: 5ae13c2c5e4b8bb1c30c2c8a10dd30b42b349dfc7c416fc65bec504e27d7b5c9dad84b6ef195c52788412f238a6740e4fb8fc400e315a42aeb7ee57f8ada9a25
7
- data.tar.gz: bf3d40831ee8667f663b442f92e76943db04eaa995095e09f4c3736e80919d26c924aa17d733a533ec32b278496c2aedd8c368eb40988e1fd9619c5febfb1567
6
+ metadata.gz: 4a094c22431c7b4dd34512a8eca31e36f6a41555b436be222f615ada2914fe54d38e0e69e223689cbd6c299b8a523c4085007fae02bcef2f2524fa6c2dcbef89
7
+ data.tar.gz: cd86b1d0d217dfb8808db194c5d14d13a06753a7009267279396e2366c6251506ca36e0c95be856ca5afbc8b096857e19802b43badd3f326d32247346badb1f2
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.8.1 - 2025-05-06
4
+
5
+ * Removed debugging output accidentally left in
6
+
7
+ ## 0.8.0 - 2025-05-06
8
+
9
+ * Added ScraperUtils::LogUtils.project_backtrace_line to provide the first project-related backtrace line
10
+ * Included this summarized line in ScraperUtils::LogUtils.report_on_results report
11
+ * Allow up to 250 characters in the error message (was max 50)
12
+
3
13
  ## 0.7.2 - 2025-04-15
4
14
 
5
15
  * Accept postcode before state as well as after
data/Gemfile-heroku-18 ADDED
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ ruby "~> 3.2.2"
6
+
7
+ gem "mechanize", "~> 2.8.5"
8
+ gem "nokogiri", "~> 1.15.0"
9
+ gem "sqlite3", "~> 1.6.3"
10
+
11
+ # Unable to list in gemspec - include it in your project's Gemfile when using this gem
12
+ gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git",
13
+ branch: "morph_defaults"
14
+
15
+ # development and test gems
16
+ gem "rake", "~> 13.0"
17
+ gem "rspec", "~> 3.12"
18
+ gem "rubocop", "~> 1.73"
19
+ gem "rubocop-rake", "~> 0.7"
20
+ gem "rubocop-rspec", "~> 3.5"
21
+ gem "simplecov", "~> 0.22.0"
22
+ gem "simplecov-console"
23
+ gem "terminal-table"
24
+ gem "webmock", "~> 3.19.0"
25
+ gem "yard"
26
+
@@ -67,7 +67,7 @@ module ScraperUtils
67
67
  "records_saved" => stats[:saved] || 0,
68
68
  "unprocessable_records" => stats[:unprocessed] || 0,
69
69
  "status" => status.to_s,
70
- "error_message" => exception&.message,
70
+ "error_message" => exception&.to_s,
71
71
  "error_class" => exception&.class&.to_s,
72
72
  "error_backtrace" => extract_meaningful_backtrace(exception)
73
73
  }
@@ -88,6 +88,38 @@ module ScraperUtils
88
88
  cleanup_old_records
89
89
  end
90
90
 
91
+ # Extracts the first relevant line from backtrace that's from our project
92
+ # (not from gems, vendor, or Ruby standard library)
93
+ #
94
+ # @param backtrace [Array<String>] The exception backtrace
95
+ # @param options [Hash] Options hash
96
+ # @option options [String] :pwd The project root directory (defaults to current working directory)
97
+ # @option options [Boolean] :format If true, returns formatted string with brackets
98
+ # @return [String, nil] The relevant backtrace line without the PWD prefix (wrapped as " [line]" when :format is true); nil if none is found, or "" when :format is true
99
+ def self.project_backtrace_line(backtrace, options = {})
100
+ return nil if backtrace.nil? || backtrace.empty?
101
+
102
+ # Set defaults
103
+ pwd = options[:pwd] || Dir.pwd
104
+ format = options[:format] || false
105
+
106
+ # Normalize the root directory path with a trailing slash
107
+ pwd = File.join(pwd, '')
108
+
109
+ backtrace.each do |line|
110
+ next if line.include?('/gems/') ||
111
+ line.include?('/vendor/') ||
112
+ line.include?('/ruby/')
113
+
114
+ if line.start_with?(pwd)
115
+ relative_path = line.sub(pwd, '')
116
+ return format ? " [#{relative_path}]" : relative_path
117
+ end
118
+ end
119
+
120
+ format ? "" : nil
121
+ end
122
+
91
123
  # Report on the results
92
124
  # @param authorities [Array<Symbol>] List of authorities attempted to scrape
93
125
  # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
@@ -117,12 +149,13 @@ module ScraperUtils
117
149
 
118
150
  expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
119
151
  exception_msg = if exceptions[authority]
120
- "#{exceptions[authority].class} - #{exceptions[authority].message}"
152
+ location = self.project_backtrace_line(exceptions[authority].backtrace, format: true)
153
+ "#{exceptions[authority].class} - #{exceptions[authority]}#{location}"
121
154
  else
122
155
  "-"
123
156
  end
124
157
  puts format(summary_format, authority.to_s, ok_records, bad_records,
125
- "#{expect_bad_prefix}#{exception_msg}".slice(0, 70))
158
+ "#{expect_bad_prefix}#{exception_msg}".slice(0, 250))
126
159
  end
127
160
  puts
128
161
 
@@ -149,7 +182,7 @@ module ScraperUtils
149
182
  "(Add to MORPH_EXPECT_BAD?)"
150
183
  unexpected_errors.each do |authority|
151
184
  error = exceptions[authority]
152
- errors << " #{authority}: #{error.class} - #{error.message}"
185
+ errors << " #{authority}: #{error.class} - #{error}"
153
186
  end
154
187
  end
155
188
 
@@ -212,7 +245,7 @@ module ScraperUtils
212
245
 
213
246
  lines = []
214
247
  error.backtrace.each do |line|
215
- lines << line if lines.length < 2 || !line.include?("/vendor/")
248
+ lines << line if lines.length < 2 || !(line.include?("/vendor/") || line.include?("/gems/") || line.include?("/ruby/"))
216
249
  break if lines.length >= 6
217
250
  end
218
251
 
@@ -127,7 +127,7 @@ module ScraperUtils
127
127
  uri = begin
128
128
  URI.parse(ScraperUtils.australian_proxy.to_s)
129
129
  rescue URI::InvalidURIError => e
130
- raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
130
+ raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
131
131
  end
132
132
  unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
133
133
  raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
@@ -78,7 +78,7 @@ module ScraperUtils
78
78
  rescue StandardError => e
79
79
  if DebugUtils.basic?
80
80
  ScraperUtils::LogUtils.log(
81
- "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
81
+ "WARNING: Failed to fetch robots.txt for #{domain}: #{e}"
82
82
  )
83
83
  end
84
84
  nil
@@ -51,7 +51,7 @@ module ScraperUtils
51
51
  # @return [String] Readable representation
52
52
  def inspect
53
53
  status = success? ? "success" : "FAILED"
54
- error_info = success? ? "" : " - #{error.class}: #{error.message}"
54
+ error_info = success? ? "" : " - #{error.class}: #{error}"
55
55
  "#<#{self.class} authority=#{authority} #{status}#{error_info} time=#{time_taken}>"
56
56
  end
57
57
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.7.2"
4
+ VERSION = "0.8.1"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.2
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-04-15 00:00:00.000000000 Z
11
+ date: 2025-05-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -68,6 +68,7 @@ files:
68
68
  - CHANGELOG.md
69
69
  - GUIDELINES.md
70
70
  - Gemfile
71
+ - Gemfile-heroku-18
71
72
  - IMPLEMENTATION.md
72
73
  - LICENSE.txt
73
74
  - README.md
@@ -118,7 +119,7 @@ metadata:
118
119
  allowed_push_host: https://rubygems.org
119
120
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
120
121
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
121
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.7.2
122
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.8.1
122
123
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
123
124
  rubygems_mfa_required: 'true'
124
125
  post_install_message:
@@ -136,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
136
137
  - !ruby/object:Gem::Version
137
138
  version: '0'
138
139
  requirements: []
139
- rubygems_version: 3.4.10
140
+ rubygems_version: 3.4.19
140
141
  signing_key:
141
142
  specification_version: 4
142
143
  summary: planningalerts scraper utilities