fair_champion_harvester 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 07365a6c33e7c66a530b30ea4b6e947a8355e3b378e4e88c7609942864adf98a
4
- data.tar.gz: 20df41a8f201c1e74d882bf17e40e942119b77d88f4d935281e572b7e73d175f
3
+ metadata.gz: bd0c455e1fd407ab2dc6b3432c42feea74fa6beccf13eb4f0aaee54be1ff0f59
4
+ data.tar.gz: 916a04e52444b526d0acd692fbe42a23de3750061d7479fdb67b527468da0236
5
5
  SHA512:
6
- metadata.gz: d4eddba2884a2f87fbf1aac1a005c2794061e43920fab8d38bc55a3e9eb5a81bc65ee16f33e6e4c507a5d8608f183026e6a696a63dd7c3a938b41ed6d289a039
7
- data.tar.gz: 61648adf249a8ffe45184486564bfcf63a443c859f1eb43cd4222ada73e5f88d66b35b88866c85c76d369650dca54036da1180ff56d754c8cd462e1195bf8885
6
+ metadata.gz: 00707e4eb4ca196e58aa2883beb27e93f1841311069dbd5bf67664a10d4aede2a18c84a2c544470913b4996d7673a43933c9b6038ac2bf6d66cb9a5e3e6d8410
7
+ data.tar.gz: 9bab85d0441581c21422e00d53963590e859fc549f1b15bc829844f8a3681120f051356c3e948617cc79397b8b4708384f37c08b4a8d47883b9a54979e94a6de
data/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.1.11] - 2026-05-26
4
+
5
+ ### Fixed
6
+ - `lib/uri.rb` was shadowing Ruby's stdlib `require "uri"` because `lib/` is on the load path; moved `FAIRChampionHarvester::Uri` class to `lib/uri_resolver.rb` and turned `lib/uri.rb` into a stdlib-forwarding shim — this was the root cause of all HTTP fetch failures (`uninitialized constant URI`)
7
+ - `parse_link_http_headers`: handle multiple separate `Link:` headers (Array input) in addition to comma-separated single-string headers, using `Array(links).flat_map { |l| l.split(",") }`
8
+ - `parse_link_http_headers`: `rel` regex `\w+` → `[\w-]+` so hyphenated rel types like `cite-as` are captured correctly rather than silently truncated
9
+ - `parse_link_http_headers`: added `next unless url` guard against nil URLs; tightened URL regex to non-greedy `<([^>]*)>`
10
+ - `parse_link_http_headers`: added `describedby` to the allowlist alongside `meta` and `alternate`
11
+ - `parse_link_body_headers`: `link_nodes << NodeSet` → `link_nodes + NodeSet` (NodeSet concatenation); the old `<<` raised `ArgumentError: node must be a Nokogiri::XML::Node`
12
+ - `simplefetch`: corrected copy-paste bug where `guid` was referenced instead of `url` parameter
13
+ - `Core.fetch`: backtrace now always logged on `StandardError` (removed `if ENV["DEBUG"]` guard)
14
+
15
+ ### Added
16
+ - RSpec test suite with 14 unit tests for `parse_link_http_headers` and 6 live integration tests against `https://fairsharing.org/1547`; live tests clear stale `/tmp/*_error` cache files before each run
17
+
18
+ ## [0.1.10] - 2026-05-26
19
+
20
+ - The variable `url` was not defined; the resulting error was caught by the enclosing `begin` block, so it went unreported
21
+
22
+
3
23
  ## [0.1.0] - 2026-03-27
4
24
 
5
25
  - Initial release
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FairChampionHarvester
4
- VERSION = "0.1.9"
4
+ VERSION = "0.1.11"
5
5
  end
data/lib/harvester.rb CHANGED
@@ -229,7 +229,8 @@ module FAIRChampionHarvester
229
229
 
230
230
  # Focus on <link> tags inside <head> (MetaInspector's head_links equivalent)
231
231
  # We use css selector for simplicity and readability
232
- link_nodes = doc.css('head link[rel="alternate"][type]') # only those with rel=alternate AND type attr
232
+ link_nodes = doc.css('head link[rel="alternate"][type]') +
233
+ doc.css('head link[rel="describedby"][type]')
233
234
 
234
235
  # Your format lists – assuming these are constants/hashes like:
235
236
  # FAIRChampionHarvester::Utils::RDF_FORMATS => { jsonld: "application/ld+json", ... }
@@ -267,46 +268,7 @@ module FAIRChampionHarvester
267
268
 
268
269
  urls
269
270
  end
270
- # def self.parse_link_body_headers(url, body)
271
- # m = MetaInspector.new(url, document: body)
272
- # # accept any alternate that is in structured data format
273
- # ls = m.head_links.select do |l|
274
- # l[:rel] == 'alternate' and
275
- # [FAIRChampionHarvester::Utils::RDF_FORMATS.values,
276
- # FAIRChampionHarvester::Utils::XML_FORMATS.values,
277
- # FAIRChampionHarvester::Utils::JSON_FORMATS.values].flatten
278
- # .include?(l[:type])
279
- # end
280
- # # ls is an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
281
- # urls = ls.map { |l| l[:href] }
282
- # urls.compact
283
- # warn "\n\nGOT BODY LINKS #{urls}\n\n"
284
- # urls
285
- # end
286
-
287
- # Recursively collects **all non-Hash values** (leaf values) from a nested Hash structure.
288
- #
289
- # Traverses the hash in depth-first order and gathers every value that is not itself
290
- # a Hash into a flat array. Keys are completely ignored.
291
- #
292
- # @param myHash [Hash] the nested hash to traverse
293
- # @param value [Object] currently unused (likely legacy or placeholder parameter)
294
- # @param vals [Array] accumulator for collected values (mutable, passed by reference)
295
- # @return [Array] flat list of all leaf (non-Hash) values in depth-first traversal order
296
- #
297
- # @example
298
- # h = {
299
- # name: "Alice",
300
- # info: {
301
- # age: 34,
302
- # address: { city: "Madrid", coords: { lat: 40.4168, lon: -3.7038 } },
303
- # hobbies: ["reading", "hiking"]
304
- # }
305
- # }
306
- #
307
- # deep_dive_values(h)
308
- # # => ["Alice", 34, "Madrid", 40.4168, -3.7038, "reading", "hiking"]
309
- #
271
+
310
272
  def self.deep_dive_values(myHash, value = nil, vals = [])
311
273
  myHash.each_pair do |_key, value|
312
274
  if value.is_a?(Hash)
@@ -320,38 +282,6 @@ module FAIRChampionHarvester
320
282
  vals
321
283
  end
322
284
 
323
- # Recursively collects **every key-value pair** from a nested Hash structure as [key, value] arrays.
324
- #
325
- # Traverses the entire nested hash in depth-first order and records every key-value pair
326
- # encountered — including pairs where the value is itself a Hash.
327
- #
328
- # Note: The `property` parameter is currently **not used** (dead code). Both branches
329
- # of the conditional do the same thing, so every pair is collected regardless of `property`.
330
- #
331
- # @param myHash [Hash] the nested hash to traverse
332
- # @param property [Symbol, String, nil] intended filter key (currently ineffective)
333
- # @param props [Array] accumulator for [key, value] pairs (mutable)
334
- # @return [Array<Array>] flat list of [key, value] tuples in depth-first order
335
- #
336
- # @example
337
- # h = {
338
- # user: "bob42",
339
- # config: {
340
- # theme: "dark",
341
- # alerts: { email: true, push: false }
342
- # }
343
- # }
344
- #
345
- # deep_dive_properties(h)
346
- # # => [[:user, "bob42"],
347
- # # [:config, {theme: "dark", alerts: {email: true, push: false}}],
348
- # # [:theme, "dark"],
349
- # # [:alerts, {email: true, push: false}],
350
- # # [:email, true],
351
- # # [:push, false]]
352
- #
353
- # deep_dive_properties(h, :email) # ← currently returns the same as above (bug)
354
- #
355
285
  def self.deep_dive_properties(myHash, property = nil, props = [])
356
286
  return props unless myHash.is_a?(Hash)
357
287
 
@@ -436,7 +366,7 @@ module FAIRChampionHarvester
436
366
  [response.headers, response.body.to_s] # return headers, body, and final URL
437
367
  else
438
368
  # Handle HTTP error status codes (4xx, 5xx, etc.)
439
- warn "HTTP Error #{response.status} for #{url}"
369
+ warn "HTTP Error #{response.status} for #{guid}"
440
370
  warn "Final URL: #{response.uri}" if response.uri
441
371
  FAIRChampionHarvester::Cache.writeErrorToCache(guid, headers)
442
372
  meta.comments << "WARN: HTTP error #{response.status} encountered when trying to resolve #{guid}\n" if meta
@@ -451,7 +381,7 @@ module FAIRChampionHarvester
451
381
  rescue StandardError => e
452
382
  # Catch any other unexpected errors
453
383
  warn "Unexpected error while fetching #{guid}: #{e.class} - #{e.message}"
454
- warn e.backtrace.first(5).join("\n") if ENV["DEBUG"]
384
+ warn e.backtrace.first(10).join("\n")
455
385
  FAIRChampionHarvester::Cache.writeErrorToCache(guid, headers)
456
386
  meta.comments << "WARN: HTTP error #{e.message} encountered when trying to resolve #{guid}\n" if meta
457
387
  false
@@ -487,7 +417,7 @@ module FAIRChampionHarvester
487
417
 
488
418
  response = HTTP
489
419
  .headers(headers).follow
490
- .get(guid.to_s) # or full URL
420
+ .get(url.to_s) # or full URL
491
421
 
492
422
  if response.status.success?
493
423
  [response.headers, response.body.to_s] # return headers, body, and final URL
@@ -499,11 +429,11 @@ module FAIRChampionHarvester
499
429
  end
500
430
  rescue HTTP::Error => e
501
431
  # This catches network errors, timeouts, connection failures, DNS errors, etc.
502
- warn "HTTP Request Failed for #{guid}: #{e.message}"
432
+ warn "HTTP Request Failed for #{url}: #{e.message}"
503
433
  false
504
434
  rescue StandardError => e
505
435
  # Catch any other unexpected errors
506
- warn "Unexpected error while fetching #{guid}: #{e.class} - #{e.message}"
436
+ warn "Unexpected error while fetching #{url}: #{e.class} - #{e.message}"
507
437
  false
508
438
  end
509
439
 
@@ -525,7 +455,7 @@ module FAIRChampionHarvester
525
455
  warn e.response
526
456
  false
527
457
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
528
- rescue Exception => e
458
+ rescue StandardError => e
529
459
  warn e
530
460
  false
531
461
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
@@ -550,7 +480,7 @@ module FAIRChampionHarvester
550
480
  warn e.response
551
481
  false
552
482
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
553
- rescue Exception => e
483
+ rescue StandardError => e
554
484
  warn e
555
485
  false
556
486
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
data/lib/uri.rb CHANGED
@@ -1,22 +1,5 @@
1
- module FAIRChampionHarvester
2
- class Uri
3
- def self.resolve_uri(guid, meta)
4
- type, url = Core.convertToURL(guid)
5
- meta.guidtype = type if meta.guidtype.nil?
1
+ # frozen_string_literal: true
6
2
 
7
- meta.comments << "INFO: Found a URI.\n"
8
- meta.comments << "INFO: Attempting to resolve #{url} using HTTP Headers #{FAIRChampionHarvester::Utils::AcceptHeader}.\n"
9
- FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false)
10
- meta.comments << "INFO: Attempting to resolve #{url} using HTTP Headers #{FAIRChampionHarvester::Utils::XML_FORMATS["xml"].join(",")}.\n"
11
- FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false,
12
- headers: { "Accept" => "#{FAIRChampionHarvester::Utils::XML_FORMATS["xml"].join(",")}" })
13
- meta.comments << "INFO: Attempting to resolve #{url} using HTTP Headers #{FAIRChampionHarvester::Utils::JSON_FORMATS["json"].join(",")}.\n"
14
- FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false,
15
- headers: { "Accept" => "#{FAIRChampionHarvester::Utils::JSON_FORMATS["json"].join(",")}" })
16
- meta.comments << "INFO: Attempting to resolve #{url} using HTTP Headers 'Accept: */*'.\n"
17
- FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false,
18
- headers: { "Accept" => "*/*" })
19
- meta
20
- end
21
- end
22
- end
3
+ # This file is named uri.rb, which means Ruby's load path resolves
4
+ # 'require "uri"' here instead of the stdlib. We forward to the real one.
5
+ require File.join(RbConfig::CONFIG["rubylibdir"], "uri")
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fair_champion_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.1.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - markwilkinson