scraper_utils 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ba66a28129ee09ab76cb0937d195ff68aa5058d1c73805235de4898384fe495d
4
- data.tar.gz: 76c20b7ce9bd581e59fda41b8801bca8a26909d58ddcc74c46e7755e6038970a
3
+ metadata.gz: 9be052a9483d914f35555b8a4a80992be5d18737f79063161053faae9e97a849
4
+ data.tar.gz: b170b9f1b2b973647d8e3ce6426da425773077bdb8ef10dc551b451895326e1c
5
5
  SHA512:
6
- metadata.gz: 5ae13c2c5e4b8bb1c30c2c8a10dd30b42b349dfc7c416fc65bec504e27d7b5c9dad84b6ef195c52788412f238a6740e4fb8fc400e315a42aeb7ee57f8ada9a25
7
- data.tar.gz: bf3d40831ee8667f663b442f92e76943db04eaa995095e09f4c3736e80919d26c924aa17d733a533ec32b278496c2aedd8c368eb40988e1fd9619c5febfb1567
6
+ metadata.gz: 210d580999d5db053f1581e7ba5926fa4d968e5e4e915e73bebb063f3106d106cb83d9fbb4645ad568e3994caab909ca27a942f10b5f965d3881eceb9a83f64a
7
+ data.tar.gz: 063eec87dc3e124744a73494db47e27505d7de330568ba1e3562d75bccdafe1ab054f358c6b93d3904785a001274d6aec7056f3bbc2ae4665280c605a4b100e9
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.8.0 - 2025-05-06
4
+
5
+ * Added ScraperUtils::LogUtils.project_backtrace_line to provide the first project-related backtrace line
6
+ * Included this summarized line in ScraperUtils::LogUtils.report_on_results report
7
+ * Allow up to 250 character error message (was max 50)
8
+
3
9
  ## 0.7.2 - 2025-04-15
4
10
 
5
11
  * Accept postcode before state as well as after
data/Gemfile-heroku-18 ADDED
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ ruby "~> 3.2.2"
6
+
7
+ gem "mechanize", "~> 2.8.5"
8
+ gem "nokogiri", "~> 1.15.0"
9
+ gem "sqlite3", "~> 1.6.3"
10
+
11
+ # Unable to list in gemspec - Include it in your project's Gemfile when using this gem
12
+ gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git",
13
+ branch: "morph_defaults"
14
+
15
+ # development and test gems
16
+ gem "rake", "~> 13.0"
17
+ gem "rspec", "~> 3.12"
18
+ gem "rubocop", "~> 1.73"
19
+ gem "rubocop-rake", "~> 0.7"
20
+ gem "rubocop-rspec", "~> 3.5"
21
+ gem "simplecov", "~> 0.22.0"
22
+ gem "simplecov-console"
23
+ gem "terminal-table"
24
+ gem "webmock", "~> 3.19.0"
25
+ gem "yard"
26
+
@@ -67,7 +67,7 @@ module ScraperUtils
67
67
  "records_saved" => stats[:saved] || 0,
68
68
  "unprocessable_records" => stats[:unprocessed] || 0,
69
69
  "status" => status.to_s,
70
- "error_message" => exception&.message,
70
+ "error_message" => exception&.to_s,
71
71
  "error_class" => exception&.class&.to_s,
72
72
  "error_backtrace" => extract_meaningful_backtrace(exception)
73
73
  }
@@ -88,6 +88,38 @@ module ScraperUtils
88
88
  cleanup_old_records
89
89
  end
90
90
 
91
+ # Extracts the first relevant line from backtrace that's from our project
92
+ # (not from gems, vendor, or Ruby standard library)
93
+ #
94
+ # @param backtrace [Array<String>] The exception backtrace
95
+ # @param options [Hash] Options hash
96
+ # @option options [String] :pwd The project root directory (defaults to current working directory)
97
+ # @option options [Boolean] :format If true, returns formatted string with brackets
98
+ # @return [String, nil] The relevant backtrace line without PWD prefix; if none found, nil (or "" when :format is true)
99
+ def self.project_backtrace_line(backtrace, options = {})
100
+ return nil if backtrace.nil? || backtrace.empty?
101
+
102
+ # Set defaults
103
+ pwd = options[:pwd] || Dir.pwd
104
+ format = options[:format] || false
105
+
106
+ # Normalize the root directory path with a trailing slash
107
+ pwd = File.join(pwd, '')
108
+
109
+ backtrace.each do |line|
110
+ next if line.include?('/gems/') ||
111
+ line.include?('/vendor/') ||
112
+ line.include?('/ruby/')
113
+
114
+ if line.start_with?(pwd)
115
+ relative_path = line.sub(pwd, '')
116
+ return format ? " [#{relative_path}]" : relative_path
117
+ end
118
+ end
119
+
120
+ format ? "" : nil
121
+ end
122
+
91
123
  # Report on the results
92
124
  # @param authorities [Array<Symbol>] List of authorities attempted to scrape
93
125
  # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
@@ -117,12 +149,14 @@ module ScraperUtils
117
149
 
118
150
  expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
119
151
  exception_msg = if exceptions[authority]
120
- "#{exceptions[authority].class} - #{exceptions[authority].message}"
152
+ location = self.project_backtrace_line(exceptions[authority].backtrace, format: true)
153
+ puts "LOCATION: #{location.inspect}"
154
+ "#{exceptions[authority].class} - #{exceptions[authority]}#{location}"
121
155
  else
122
156
  "-"
123
157
  end
124
158
  puts format(summary_format, authority.to_s, ok_records, bad_records,
125
- "#{expect_bad_prefix}#{exception_msg}".slice(0, 70))
159
+ "#{expect_bad_prefix}#{exception_msg}".slice(0, 250))
126
160
  end
127
161
  puts
128
162
 
@@ -149,7 +183,7 @@ module ScraperUtils
149
183
  "(Add to MORPH_EXPECT_BAD?)"
150
184
  unexpected_errors.each do |authority|
151
185
  error = exceptions[authority]
152
- errors << " #{authority}: #{error.class} - #{error.message}"
186
+ errors << " #{authority}: #{error.class} - #{error}"
153
187
  end
154
188
  end
155
189
 
@@ -212,7 +246,7 @@ module ScraperUtils
212
246
 
213
247
  lines = []
214
248
  error.backtrace.each do |line|
215
- lines << line if lines.length < 2 || !line.include?("/vendor/")
249
+ lines << line if lines.length < 2 || !(line.include?("/vendor/") || line.include?("/gems/") || line.include?("/ruby/"))
216
250
  break if lines.length >= 6
217
251
  end
218
252
 
@@ -127,7 +127,7 @@ module ScraperUtils
127
127
  uri = begin
128
128
  URI.parse(ScraperUtils.australian_proxy.to_s)
129
129
  rescue URI::InvalidURIError => e
130
- raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
130
+ raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
131
131
  end
132
132
  unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
133
133
  raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
@@ -78,7 +78,7 @@ module ScraperUtils
78
78
  rescue StandardError => e
79
79
  if DebugUtils.basic?
80
80
  ScraperUtils::LogUtils.log(
81
- "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
81
+ "WARNING: Failed to fetch robots.txt for #{domain}: #{e}"
82
82
  )
83
83
  end
84
84
  nil
@@ -51,7 +51,7 @@ module ScraperUtils
51
51
  # @return [String] Readable representation
52
52
  def inspect
53
53
  status = success? ? "success" : "FAILED"
54
- error_info = success? ? "" : " - #{error.class}: #{error.message}"
54
+ error_info = success? ? "" : " - #{error.class}: #{error}"
55
55
  "#<#{self.class} authority=#{authority} #{status}#{error_info} time=#{time_taken}>"
56
56
  end
57
57
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.7.2"
4
+ VERSION = "0.8.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.2
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-04-15 00:00:00.000000000 Z
11
+ date: 2025-05-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -68,6 +68,7 @@ files:
68
68
  - CHANGELOG.md
69
69
  - GUIDELINES.md
70
70
  - Gemfile
71
+ - Gemfile-heroku-18
71
72
  - IMPLEMENTATION.md
72
73
  - LICENSE.txt
73
74
  - README.md
@@ -118,7 +119,7 @@ metadata:
118
119
  allowed_push_host: https://rubygems.org
119
120
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
120
121
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
121
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.7.2
122
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.8.0
122
123
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
123
124
  rubygems_mfa_required: 'true'
124
125
  post_install_message:
@@ -136,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
136
137
  - !ruby/object:Gem::Version
137
138
  version: '0'
138
139
  requirements: []
139
- rubygems_version: 3.4.10
140
+ rubygems_version: 3.4.19
140
141
  signing_key:
141
142
  specification_version: 4
142
143
  summary: planningalerts scraper utilities