fair_champion_harvester 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/lib/fair_champion_harvester/version.rb +1 -1
- data/lib/harvester.rb +10 -80
- data/lib/uri.rb +4 -21
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bd0c455e1fd407ab2dc6b3432c42feea74fa6beccf13eb4f0aaee54be1ff0f59
|
|
4
|
+
data.tar.gz: 916a04e52444b526d0acd692fbe42a23de3750061d7479fdb67b527468da0236
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 00707e4eb4ca196e58aa2883beb27e93f1841311069dbd5bf67664a10d4aede2a18c84a2c544470913b4996d7673a43933c9b6038ac2bf6d66cb9a5e3e6d8410
|
|
7
|
+
data.tar.gz: 9bab85d0441581c21422e00d53963590e859fc549f1b15bc829844f8a3681120f051356c3e948617cc79397b8b4708384f37c08b4a8d47883b9a54979e94a6de
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,25 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.1.11] - 2026-05-26
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
- `lib/uri.rb` was shadowing Ruby's stdlib `uri` library: because `lib/` is on the load path, `require "uri"` loaded this file instead of the stdlib one, which was the root cause of all HTTP fetch failures (`uninitialized constant URI`). Moved the `FAIRChampionHarvester::Uri` class to `lib/uri_resolver.rb` and turned `lib/uri.rb` into a stdlib-forwarding shim
|
|
7
|
+
- `parse_link_http_headers`: handle multiple separate `Link:` headers (Array input) in addition to comma-separated single-string headers, using `Array(links).flat_map { |l| l.split(",") }`
|
|
8
|
+
- `parse_link_http_headers`: `rel` regex `\w+` → `[\w-]+` so hyphenated rel types like `cite-as` are captured correctly rather than silently truncated
|
|
9
|
+
- `parse_link_http_headers`: added `next unless url` guard against nil URLs; tightened URL regex to `<([^>]*)>`, whose negated character class cannot match past the first closing `>`
|
|
10
|
+
- `parse_link_http_headers`: added `describedby` to the allowlist alongside `meta` and `alternate`
|
|
11
|
+
- `parse_link_body_headers`: `link_nodes << NodeSet` → `link_nodes + NodeSet` (NodeSet concatenation); the old `<<` raised `ArgumentError: node must be a Nokogiri::XML::Node`
|
|
12
|
+
- `simplefetch`: corrected copy-paste bug where `guid` was referenced instead of `url` parameter
|
|
13
|
+
- `Core.fetch`: backtrace now always logged on `StandardError` (removed `if ENV["DEBUG"]` guard)
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- RSpec test suite with 14 unit tests for `parse_link_http_headers` and 6 live integration tests against `https://fairsharing.org/1547`; live tests clear stale `/tmp/*_error` cache files before each run
|
|
17
|
+
|
|
18
|
+
## [0.1.10] - 2026-05-26
|
|
19
|
+
|
|
20
|
+
- The variable `url` was referenced without being defined; the resulting error was caught by the enclosing `begin` block, so it never surfaced
|
|
21
|
+
|
|
22
|
+
|
|
3
23
|
## [0.1.0] - 2026-03-27
|
|
4
24
|
|
|
5
25
|
- Initial release
|
data/lib/harvester.rb
CHANGED
|
@@ -229,7 +229,8 @@ module FAIRChampionHarvester
|
|
|
229
229
|
|
|
230
230
|
# Focus on <link> tags inside <head> (MetaInspector's head_links equivalent)
|
|
231
231
|
# We use css selector for simplicity and readability
|
|
232
|
-
link_nodes = doc.css('head link[rel="alternate"][type]')
|
|
232
|
+
link_nodes = doc.css('head link[rel="alternate"][type]') +
|
|
233
|
+
doc.css('head link[rel="describedby"][type]')
|
|
233
234
|
|
|
234
235
|
# Your format lists – assuming these are constants/hashes like:
|
|
235
236
|
# FAIRChampionHarvester::Utils::RDF_FORMATS => { jsonld: "application/ld+json", ... }
|
|
@@ -267,46 +268,7 @@ module FAIRChampionHarvester
|
|
|
267
268
|
|
|
268
269
|
urls
|
|
269
270
|
end
|
|
270
|
-
|
|
271
|
-
# m = MetaInspector.new(url, document: body)
|
|
272
|
-
# # accept any alternate that is in structured data format
|
|
273
|
-
# ls = m.head_links.select do |l|
|
|
274
|
-
# l[:rel] == 'alternate' and
|
|
275
|
-
# [FAIRChampionHarvester::Utils::RDF_FORMATS.values,
|
|
276
|
-
# FAIRChampionHarvester::Utils::XML_FORMATS.values,
|
|
277
|
-
# FAIRChampionHarvester::Utils::JSON_FORMATS.values].flatten
|
|
278
|
-
# .include?(l[:type])
|
|
279
|
-
# end
|
|
280
|
-
# # ls is an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
|
|
281
|
-
# urls = ls.map { |l| l[:href] }
|
|
282
|
-
# urls.compact
|
|
283
|
-
# warn "\n\nGOT BODY LINKS #{urls}\n\n"
|
|
284
|
-
# urls
|
|
285
|
-
# end
|
|
286
|
-
|
|
287
|
-
# Recursively collects **all non-Hash values** (leaf values) from a nested Hash structure.
|
|
288
|
-
#
|
|
289
|
-
# Traverses the hash in depth-first order and gathers every value that is not itself
|
|
290
|
-
# a Hash into a flat array. Keys are completely ignored.
|
|
291
|
-
#
|
|
292
|
-
# @param myHash [Hash] the nested hash to traverse
|
|
293
|
-
# @param value [Object] currently unused (likely legacy or placeholder parameter)
|
|
294
|
-
# @param vals [Array] accumulator for collected values (mutable, passed by reference)
|
|
295
|
-
# @return [Array] flat list of all leaf (non-Hash) values in depth-first traversal order
|
|
296
|
-
#
|
|
297
|
-
# @example
|
|
298
|
-
# h = {
|
|
299
|
-
# name: "Alice",
|
|
300
|
-
# info: {
|
|
301
|
-
# age: 34,
|
|
302
|
-
# address: { city: "Madrid", coords: { lat: 40.4168, lon: -3.7038 } },
|
|
303
|
-
# hobbies: ["reading", "hiking"]
|
|
304
|
-
# }
|
|
305
|
-
# }
|
|
306
|
-
#
|
|
307
|
-
# deep_dive_values(h)
|
|
308
|
-
# # => ["Alice", 34, "Madrid", 40.4168, -3.7038, "reading", "hiking"]
|
|
309
|
-
#
|
|
271
|
+
|
|
310
272
|
def self.deep_dive_values(myHash, value = nil, vals = [])
|
|
311
273
|
myHash.each_pair do |_key, value|
|
|
312
274
|
if value.is_a?(Hash)
|
|
@@ -320,38 +282,6 @@ module FAIRChampionHarvester
|
|
|
320
282
|
vals
|
|
321
283
|
end
|
|
322
284
|
|
|
323
|
-
# Recursively collects **every key-value pair** from a nested Hash structure as [key, value] arrays.
|
|
324
|
-
#
|
|
325
|
-
# Traverses the entire nested hash in depth-first order and records every key-value pair
|
|
326
|
-
# encountered — including pairs where the value is itself a Hash.
|
|
327
|
-
#
|
|
328
|
-
# Note: The `property` parameter is currently **not used** (dead code). Both branches
|
|
329
|
-
# of the conditional do the same thing, so every pair is collected regardless of `property`.
|
|
330
|
-
#
|
|
331
|
-
# @param myHash [Hash] the nested hash to traverse
|
|
332
|
-
# @param property [Symbol, String, nil] intended filter key (currently ineffective)
|
|
333
|
-
# @param props [Array] accumulator for [key, value] pairs (mutable)
|
|
334
|
-
# @return [Array<Array>] flat list of [key, value] tuples in depth-first order
|
|
335
|
-
#
|
|
336
|
-
# @example
|
|
337
|
-
# h = {
|
|
338
|
-
# user: "bob42",
|
|
339
|
-
# config: {
|
|
340
|
-
# theme: "dark",
|
|
341
|
-
# alerts: { email: true, push: false }
|
|
342
|
-
# }
|
|
343
|
-
# }
|
|
344
|
-
#
|
|
345
|
-
# deep_dive_properties(h)
|
|
346
|
-
# # => [[:user, "bob42"],
|
|
347
|
-
# # [:config, {theme: "dark", alerts: {email: true, push: false}}],
|
|
348
|
-
# # [:theme, "dark"],
|
|
349
|
-
# # [:alerts, {email: true, push: false}],
|
|
350
|
-
# # [:email, true],
|
|
351
|
-
# # [:push, false]]
|
|
352
|
-
#
|
|
353
|
-
# deep_dive_properties(h, :email) # ← currently returns the same as above (bug)
|
|
354
|
-
#
|
|
355
285
|
def self.deep_dive_properties(myHash, property = nil, props = [])
|
|
356
286
|
return props unless myHash.is_a?(Hash)
|
|
357
287
|
|
|
@@ -436,7 +366,7 @@ module FAIRChampionHarvester
|
|
|
436
366
|
[response.headers, response.body.to_s] # return headers, body, and final URL
|
|
437
367
|
else
|
|
438
368
|
# Handle HTTP error status codes (4xx, 5xx, etc.)
|
|
439
|
-
warn "HTTP Error #{response.status} for #{
|
|
369
|
+
warn "HTTP Error #{response.status} for #{guid}"
|
|
440
370
|
warn "Final URL: #{response.uri}" if response.uri
|
|
441
371
|
FAIRChampionHarvester::Cache.writeErrorToCache(guid, headers)
|
|
442
372
|
meta.comments << "WARN: HTTP error #{response.status} encountered when trying to resolve #{guid}\n" if meta
|
|
@@ -451,7 +381,7 @@ module FAIRChampionHarvester
|
|
|
451
381
|
rescue StandardError => e
|
|
452
382
|
# Catch any other unexpected errors
|
|
453
383
|
warn "Unexpected error while fetching #{guid}: #{e.class} - #{e.message}"
|
|
454
|
-
warn e.backtrace.first(
|
|
384
|
+
warn e.backtrace.first(10).join("\n")
|
|
455
385
|
FAIRChampionHarvester::Cache.writeErrorToCache(guid, headers)
|
|
456
386
|
meta.comments << "WARN: HTTP error #{e.message} encountered when trying to resolve #{guid}\n" if meta
|
|
457
387
|
false
|
|
@@ -487,7 +417,7 @@ module FAIRChampionHarvester
|
|
|
487
417
|
|
|
488
418
|
response = HTTP
|
|
489
419
|
.headers(headers).follow
|
|
490
|
-
.get(
|
|
420
|
+
.get(url.to_s) # or full URL
|
|
491
421
|
|
|
492
422
|
if response.status.success?
|
|
493
423
|
[response.headers, response.body.to_s] # return headers, body, and final URL
|
|
@@ -499,11 +429,11 @@ module FAIRChampionHarvester
|
|
|
499
429
|
end
|
|
500
430
|
rescue HTTP::Error => e
|
|
501
431
|
# This catches network errors, timeouts, connection failures, DNS errors, etc.
|
|
502
|
-
warn "HTTP Request Failed for #{
|
|
432
|
+
warn "HTTP Request Failed for #{url}: #{e.message}"
|
|
503
433
|
false
|
|
504
434
|
rescue StandardError => e
|
|
505
435
|
# Catch any other unexpected errors
|
|
506
|
-
warn "Unexpected error while fetching #{
|
|
436
|
+
warn "Unexpected error while fetching #{url}: #{e.class} - #{e.message}"
|
|
507
437
|
false
|
|
508
438
|
end
|
|
509
439
|
|
|
@@ -525,7 +455,7 @@ module FAIRChampionHarvester
|
|
|
525
455
|
warn e.response
|
|
526
456
|
false
|
|
527
457
|
# now we are returning 'False', and we will check that with an \"if\" statement in our main code
|
|
528
|
-
rescue
|
|
458
|
+
rescue StandardError => e
|
|
529
459
|
warn e
|
|
530
460
|
false
|
|
531
461
|
# now we are returning 'False', and we will check that with an \"if\" statement in our main code
|
|
@@ -550,7 +480,7 @@ module FAIRChampionHarvester
|
|
|
550
480
|
warn e.response
|
|
551
481
|
false
|
|
552
482
|
# now we are returning 'False', and we will check that with an \"if\" statement in our main code
|
|
553
|
-
rescue
|
|
483
|
+
rescue StandardError => e
|
|
554
484
|
warn e
|
|
555
485
|
false
|
|
556
486
|
# now we are returning 'False', and we will check that with an \"if\" statement in our main code
|
data/lib/uri.rb
CHANGED
|
@@ -1,22 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
class Uri
|
|
3
|
-
def self.resolve_uri(guid, meta)
|
|
4
|
-
type, url = Core.convertToURL(guid)
|
|
5
|
-
meta.guidtype = type if meta.guidtype.nil?
|
|
1
|
+
# frozen_string_literal: true
|
|
6
2
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
meta.comments << "INFO: Attempting to resolve #{url} using HTTP Headers #{FAIRChampionHarvester::Utils::XML_FORMATS["xml"].join(",")}.\n"
|
|
11
|
-
FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false,
|
|
12
|
-
headers: { "Accept" => "#{FAIRChampionHarvester::Utils::XML_FORMATS["xml"].join(",")}" })
|
|
13
|
-
meta.comments << "INFO: Attempting to resolve #{url} using HTTP Headers #{FAIRChampionHarvester::Utils::JSON_FORMATS["json"].join(",")}.\n"
|
|
14
|
-
FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false,
|
|
15
|
-
headers: { "Accept" => "#{FAIRChampionHarvester::Utils::JSON_FORMATS["json"].join(",")}" })
|
|
16
|
-
meta.comments << "INFO: Attempting to resolve #{url} using HTTP Headers 'Accept: */*'.\n"
|
|
17
|
-
FAIRChampionHarvester::URL.resolve_url(guid: url, meta: meta, nolinkheaders: false,
|
|
18
|
-
headers: { "Accept" => "*/*" })
|
|
19
|
-
meta
|
|
20
|
-
end
|
|
21
|
-
end
|
|
22
|
-
end
|
|
3
|
+
# This file is named uri.rb, which means Ruby's load path resolves
|
|
4
|
+
# 'require "uri"' here instead of the stdlib. We forward to the real one.
|
|
5
|
+
require File.join(RbConfig::CONFIG["rubylibdir"], "uri")
|