RubyGems - fair_champion_harvester - Versions diffs - 0.1.9 → 0.1.11 - Mend

fair_champion_harvester 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +20 -0
data/lib/fair_champion_harvester/version.rb +1 -1
data/lib/harvester.rb +10 -80
data/lib/uri.rb +4 -21
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 07365a6c33e7c66a530b30ea4b6e947a8355e3b378e4e88c7609942864adf98a
-  data.tar.gz: 20df41a8f201c1e74d882bf17e40e942119b77d88f4d935281e572b7e73d175f
+  metadata.gz: bd0c455e1fd407ab2dc6b3432c42feea74fa6beccf13eb4f0aaee54be1ff0f59
+  data.tar.gz: 916a04e52444b526d0acd692fbe42a23de3750061d7479fdb67b527468da0236
 SHA512:
-  metadata.gz: d4eddba2884a2f87fbf1aac1a005c2794061e43920fab8d38bc55a3e9eb5a81bc65ee16f33e6e4c507a5d8608f183026e6a696a63dd7c3a938b41ed6d289a039
-  data.tar.gz: 61648adf249a8ffe45184486564bfcf63a443c859f1eb43cd4222ada73e5f88d66b35b88866c85c76d369650dca54036da1180ff56d754c8cd462e1195bf8885
+  metadata.gz: 00707e4eb4ca196e58aa2883beb27e93f1841311069dbd5bf67664a10d4aede2a18c84a2c544470913b4996d7673a43933c9b6038ac2bf6d66cb9a5e3e6d8410
+  data.tar.gz: 9bab85d0441581c21422e00d53963590e859fc549f1b15bc829844f8a3681120f051356c3e948617cc79397b8b4708384f37c08b4a8d47883b9a54979e94a6de

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,25 @@
 ## [Unreleased]
+## [0.1.11] - 2026-05-26
+### Fixed
+- `lib/uri.rb` was shadowing Ruby's stdlib `require "uri"` because `lib/` is on the load path; moved `FAIRChampionHarvester::Uri` class to `lib/uri_resolver.rb` and turned `lib/uri.rb` into a stdlib-forwarding shim — this was the root cause of all HTTP fetch failures (`uninitialized constant URI`)
+- `parse_link_http_headers`: handle multiple separate `Link:` headers (Array input) in addition to comma-separated single-string headers, using `Array(links).flat_map { |l| l.split(",") }`
+- `parse_link_http_headers`: `rel` regex `\w+` → `[\w-]+` so hyphenated rel types like `cite-as` are captured correctly rather than silently truncated
+- `parse_link_http_headers`: added `next unless url` guard against nil URLs; tightened URL regex to non-greedy `<([^>]*)>`
+- `parse_link_http_headers`: added `describedby` to the allowlist alongside `meta` and `alternate`
+- `parse_link_body_headers`: `link_nodes << NodeSet` → `link_nodes + NodeSet` (NodeSet concatenation); the old `<<` raised `ArgumentError: node must be a Nokogiri::XML::Node`
+- `simplefetch`: corrected copy-paste bug where `guid` was referenced instead of `url` parameter
+- `Core.fetch`: backtrace now always logged on `StandardError` (removed `if ENV["DEBUG"]` guard)
+### Added
+- RSpec test suite with 14 unit tests for `parse_link_http_headers` and 6 live integration tests against `https://fairsharing.org/1547`; live tests clear stale `/tmp/*_error` cache files before each run
+## [0.1.10] - 2026-05-26
+- Variable `url` was not defined; the resulting error was caught by the enclosing `begin` block, so it never surfaced as a complaint
 ## [0.1.0] - 2026-03-27
 - Initial release

data/lib/fair_champion_harvester/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module FairChampionHarvester
-  VERSION = "0.1.9"
+  VERSION = "0.1.11"
 end

data/lib/harvester.rb CHANGED Viewed

@@ -229,7 +229,8 @@ module FAIRChampionHarvester
       # Focus on <link> tags inside <head> (MetaInspector's head_links equivalent)
       # We use css selector for simplicity and readability
-      link_nodes = doc.css('head link[rel="alternate"][type]') # only those with rel=alternate AND type attr
+      link_nodes = doc.css('head link[rel="alternate"][type]') +
+                   doc.css('head link[rel="describedby"][type]')
       # Your format lists – assuming these are constants/hashes like:
       # FAIRChampionHarvester::Utils::RDF_FORMATS  => { jsonld: "application/ld+json", ... }
@@ -267,46 +268,7 @@ module FAIRChampionHarvester
       urls
     end
-    # def self.parse_link_body_headers(url, body)
-    #   m = MetaInspector.new(url, document: body)
-    #   # accept any alternate that is in structured data format
-    #   ls = m.head_links.select do |l|
-    #     l[:rel] == 'alternate' and
-    #       [FAIRChampionHarvester::Utils::RDF_FORMATS.values,
-    #        FAIRChampionHarvester::Utils::XML_FORMATS.values,
-    #        FAIRChampionHarvester::Utils::JSON_FORMATS.values].flatten
-    #         .include?(l[:type])
-    #   end
-    #   # ls is an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
-    #   urls = ls.map { |l| l[:href] }
-    #   urls.compact
-    #   warn "\n\nGOT BODY LINKS #{urls}\n\n"
-    #   urls
-    # end
-    # Recursively collects **all non-Hash values** (leaf values) from a nested Hash structure.
-    #
-    # Traverses the hash in depth-first order and gathers every value that is not itself
-    # a Hash into a flat array. Keys are completely ignored.
-    #
-    # @param myHash [Hash] the nested hash to traverse
-    # @param value  [Object] currently unused (likely legacy or placeholder parameter)
-    # @param vals   [Array] accumulator for collected values (mutable, passed by reference)
-    # @return [Array] flat list of all leaf (non-Hash) values in depth-first traversal order
-    #
-    # @example
-    #   h = {
-    #     name: "Alice",
-    #     info: {
-    #       age: 34,
-    #       address: { city: "Madrid", coords: { lat: 40.4168, lon: -3.7038 } },
-    #       hobbies: ["reading", "hiking"]
-    #     }
-    #   }
-    #
-    #   deep_dive_values(h)
-    #   # => ["Alice", 34, "Madrid", 40.4168, -3.7038, "reading", "hiking"]
-    #
     def self.deep_dive_values(myHash, value = nil, vals = [])
       myHash.each_pair do |_key, value|
         if value.is_a?(Hash)
@@ -320,38 +282,6 @@ module FAIRChampionHarvester
       vals
     end
-    # Recursively collects **every key-value pair** from a nested Hash structure as [key, value] arrays.
-    #
-    # Traverses the entire nested hash in depth-first order and records every key-value pair
-    # encountered — including pairs where the value is itself a Hash.
-    #
-    # Note: The `property` parameter is currently **not used** (dead code). Both branches
-    # of the conditional do the same thing, so every pair is collected regardless of `property`.
-    #
-    # @param myHash   [Hash] the nested hash to traverse
-    # @param property [Symbol, String, nil] intended filter key (currently ineffective)
-    # @param props    [Array] accumulator for [key, value] pairs (mutable)
-    # @return [Array<Array>] flat list of [key, value] tuples in depth-first order
-    #
-    # @example
-    #   h = {
-    #     user: "bob42",
-    #     config: {
-    #       theme: "dark",
-    #       alerts: { email: true, push: false }
-    #     }
-    #   }
-    #
-    #   deep_dive_properties(h)
-    #   # => [[:user, "bob42"],
-    #   #     [:config, {theme: "dark", alerts: {email: true, push: false}}],
-    #   #     [:theme, "dark"],
-    #   #     [:alerts, {email: true, push: false}],
-    #   #     [:email, true],
-    #   #     [:push, false]]
-    #
-    #   deep_dive_properties(h, :email)   # ← currently returns the same as above (bug)
-    #
     def self.deep_dive_properties(myHash, property = nil, props = [])
       return props unless myHash.is_a?(Hash)
@@ -436,7 +366,7 @@ module FAIRChampionHarvester
           [response.headers, response.body.to_s] # return headers, body, and final URL
         else
           # Handle HTTP error status codes (4xx, 5xx, etc.)
-          warn "HTTP Error #{response.status} for #{url}"
+          warn "HTTP Error #{response.status} for #{guid}"
           warn "Final URL: #{response.uri}" if response.uri
           FAIRChampionHarvester::Cache.writeErrorToCache(guid, headers)
           meta.comments << "WARN: HTTP error #{response.status} encountered when trying to resolve #{guid}\n" if meta
@@ -451,7 +381,7 @@ module FAIRChampionHarvester
       rescue StandardError => e
         # Catch any other unexpected errors
         warn "Unexpected error while fetching #{guid}: #{e.class} - #{e.message}"
-        warn e.backtrace.first(5).join("\n") if ENV["DEBUG"]
+        warn e.backtrace.first(10).join("\n")
         FAIRChampionHarvester::Cache.writeErrorToCache(guid, headers)
         meta.comments << "WARN: HTTP error #{e.message} encountered when trying to resolve #{guid}\n" if meta
         false
@@ -487,7 +417,7 @@ module FAIRChampionHarvester
       response = HTTP
                  .headers(headers).follow
-                 .get(guid.to_s) # or full URL
+                 .get(url.to_s) # or full URL
       if response.status.success?
         [response.headers, response.body.to_s] # return headers, body, and final URL
@@ -499,11 +429,11 @@ module FAIRChampionHarvester
       end
     rescue HTTP::Error => e
       # This catches network errors, timeouts, connection failures, DNS errors, etc.
-      warn "HTTP Request Failed for #{guid}: #{e.message}"
+      warn "HTTP Request Failed for #{url}: #{e.message}"
       false
     rescue StandardError => e
       # Catch any other unexpected errors
-      warn "Unexpected error while fetching #{guid}: #{e.class} - #{e.message}"
+      warn "Unexpected error while fetching #{url}: #{e.class} - #{e.message}"
       false
     end
@@ -525,7 +455,7 @@ module FAIRChampionHarvester
       warn e.response
       false
     # now we are returning 'False', and we will check that with an \"if\" statement in our main code
-    rescue Exception => e
+    rescue StandardError => e
       warn e
       false
       # now we are returning 'False', and we will check that with an \"if\" statement in our main code
@@ -550,7 +480,7 @@ module FAIRChampionHarvester
       warn e.response
       false
     # now we are returning 'False', and we will check that with an \"if\" statement in our main code
-    rescue Exception => e
+    rescue StandardError => e
       warn e
       false
       # now we are returning 'False', and we will check that with an \"if\" statement in our main code

data/lib/uri.rb CHANGED Viewed

@@ -1,22 +1,5 @@
-module FAIRChampionHarvester
-  class Uri
-    def self.resolve_uri(guid, meta)
-      type, url = Core.convertToURL(guid)
-      meta.guidtype = type if meta.guidtype.nil?
+# frozen_string_literal: true
-      meta.comments << "INFO: Found a URI.\n"
-      meta.comments << "INFO:  Attempting to resolve #{url} using HTTP Headers #{FAIRChampionHarvester::Utils::AcceptHeader}.\n"
-      FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false)
-      meta.comments << "INFO:  Attempting to resolve #{url} using HTTP Headers #{FAIRChampionHarvester::Utils::XML_FORMATS["xml"].join(",")}.\n"
-      FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false,
-                                             headers: { "Accept" => "#{FAIRChampionHarvester::Utils::XML_FORMATS["xml"].join(",")}" })
-      meta.comments << "INFO:  Attempting to resolve #{url} using HTTP Headers #{FAIRChampionHarvester::Utils::JSON_FORMATS["json"].join(",")}.\n"
-      FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false,
-                                             headers: { "Accept" => "#{FAIRChampionHarvester::Utils::JSON_FORMATS["json"].join(",")}" })
-      meta.comments << "INFO:  Attempting to resolve #{url} using HTTP Headers 'Accept: */*'.\n"
-      FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false,
-                                             headers: { "Accept" => "*/*" })
-      meta
-    end
-  end
-end
+# This file is named uri.rb, which means Ruby's load path resolves
+# 'require "uri"' here instead of the stdlib. We forward to the real one.
+require File.join(RbConfig::CONFIG["rubylibdir"], "uri")

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fair_champion_harvester
 version: !ruby/object:Gem::Version
-  version: 0.1.9
+  version: 0.1.11
 platform: ruby
 authors:
 - markwilkinson