RubyGems - fsp_harvester - Versions diffs - 0.1.5 → 0.1.9 - Mend

fsp_harvester 0.1.5 → 0.1.9

Files changed (18) hide show

checksums.yaml +4 -4
data/.rspec_status +55 -0
data/Gemfile.lock +5 -4
data/Rakefile +3 -3
data/launch.json +11 -0
data/lib/config.conf_docker +8 -0
data/lib/config.conf_local +8 -0
data/lib/constants.rb +12 -13
data/lib/fsp_harvester/version.rb +1 -1
data/lib/fsp_harvester.rb +94 -74
data/lib/fsp_metadata_external_tools.rb +82 -0
data/lib/fsp_metadata_harvester.rb +164 -0
data/lib/fsp_metadata_parser.rb +109 -0
data/lib/metadata_object.rb +96 -4
data/lib/signposting_tests.rb +87 -0
data/lib/warnings.json +79 -2
data/lib/web_utils.rb +15 -15
metadata +12 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 79507c31b14bab423d95a72fe441756551fa445caccea733ee75993fd7e0222c
-  data.tar.gz: a18796aaff5e57940306fecd1d82df5c18d579ffe0d5fb1cd1948a9a29d1bb3b
+  metadata.gz: 923ccb362ef4a71fa2f221b1b224dbd5d3ec78a14cc9da0e12a1e0df804162ff
+  data.tar.gz: e70a9a994c504a0867ab10e05c07d440a76677d7ff27e27b9bdb0c3338c02ffe
 SHA512:
-  metadata.gz: ed211e876c70b7c6bd3dad6dc9a7dada1e4e6d54f5c9a92286b24b9912b06b26f6b3c2fd3b22c8ac225ddb8ceaa3eb2b98d35a983f6be3fe78f4575450f8d857
-  data.tar.gz: af6d5af7520061d418680b5b9a5f90e066b55be26322b7a4d9275bf74546eb56e80e538cd78443d45b28241246eb9329c84c08c53faee9db67e8b1c893507a54
+  metadata.gz: d819cb1c19d40f8a093b723cbeb4273d122d3207df2d874558327fa6a2622be1bf2c671cc9dfebee74c689722825c2dd1957f8be6bfcf2c14c09097c1dc05a5b
+  data.tar.gz: 508a484d93eab373d0389c11f4361068d5586759a73573901ea54f4d2a028f713f71bb492976785499f1074eb8b359d79be550c09b196eb82838a1e401398fc4

data/.rspec_status ADDED Viewed

@@ -0,0 +1,55 @@
+example_id                         | status | run_time        |
+---------------------------------- | ------ | --------------- |
+./spec/cite-as_spec.rb[1:1:1]      | passed | 1.17 seconds    |
+./spec/cite-as_spec.rb[1:1:2]      | passed | 0.98776 seconds |
+./spec/cite-as_spec.rb[1:1:3]      | passed | 0.69753 seconds |
+./spec/cite-as_spec.rb[1:1:4]      | passed | 1.31 seconds    |
+./spec/cite-as_spec.rb[1:1:5]      | passed | 2.07 seconds    |
+./spec/cite-as_spec.rb[1:1:6]      | passed | 1.45 seconds    |
+./spec/cite-as_spec.rb[1:1:7]      | passed | 2.75 seconds    |
+./spec/cite-as_spec.rb[1:1:8]      | passed | 1.83 seconds    |
+./spec/cite-as_spec.rb[1:1:9]      | passed | 2.51 seconds    |
+./spec/cite-as_spec.rb[1:1:10]     | passed | 1.73 seconds    |
+./spec/cite-as_spec.rb[1:1:11]     | passed | 2.35 seconds    |
+./spec/cite-as_spec.rb[1:1:12]     | passed | 2.01 seconds    |
+./spec/cite-as_spec.rb[1:1:13]     | passed | 2.56 seconds    |
+./spec/cite-as_spec.rb[1:1:14]     | passed | 1.68 seconds    |
+./spec/cite-as_spec.rb[1:1:15]     | passed | 1.06 seconds    |
+./spec/cite-as_spec.rb[1:1:16]     | passed | 1.03 seconds    |
+./spec/cite-as_spec.rb[1:1:17]     | passed | 0.94321 seconds |
+./spec/cite-as_spec.rb[1:1:18]     | passed | 1.1 seconds     |
+./spec/cite-as_spec.rb[1:1:19]     | passed | 1.45 seconds    |
+./spec/cite-as_spec.rb[1:1:20]     | passed | 1.53 seconds    |
+./spec/cite-as_spec.rb[1:1:21]     | passed | 1.64 seconds    |
+./spec/cite-as_spec.rb[1:1:22]     | passed | 1.01 seconds    |
+./spec/cite-as_spec.rb[1:1:23]     | passed | 1.09 seconds    |
+./spec/cite-as_spec.rb[1:1:24]     | failed | 1.22 seconds    |
+./spec/cite-as_spec.rb[1:1:25]     | passed | 0.38248 seconds |
+./spec/describedby_spec.rb[1:1:1]  | passed | 2.24 seconds    |
+./spec/describedby_spec.rb[1:1:2]  | passed | 1.08 seconds    |
+./spec/describedby_spec.rb[1:1:3]  | passed | 1 second        |
+./spec/describedby_spec.rb[1:1:4]  | passed | 1.14 seconds    |
+./spec/describedby_spec.rb[1:1:5]  | passed | 1.03 seconds    |
+./spec/describedby_spec.rb[1:1:6]  | passed | 0.81364 seconds |
+./spec/describedby_spec.rb[1:1:7]  | passed | 0.77543 seconds |
+./spec/describedby_spec.rb[1:1:8]  | passed | 2.01 seconds    |
+./spec/describedby_spec.rb[1:1:9]  | passed | 1.35 seconds    |
+./spec/describedby_spec.rb[1:1:10] | passed | 1.73 seconds    |
+./spec/describedby_spec.rb[1:1:11] | passed | 2.36 seconds    |
+./spec/describedby_spec.rb[1:1:12] | passed | 2.73 seconds    |
+./spec/describedby_spec.rb[1:1:13] | passed | 1.5 seconds     |
+./spec/describedby_spec.rb[1:1:14] | passed | 1.8 seconds     |
+./spec/describedby_spec.rb[1:1:15] | passed | 1.65 seconds    |
+./spec/fsp_harvester_spec.rb[1:1]  | passed | 0.00053 seconds |
+./spec/fsp_harvester_spec.rb[1:2]  | passed | 1.76 seconds    |
+./spec/item_spec.rb[1:1:1]         | passed | 2.08 seconds    |
+./spec/item_spec.rb[1:1:2]         | passed | 2.27 seconds    |
+./spec/item_spec.rb[1:1:3]         | passed | 1.22 seconds    |
+./spec/item_spec.rb[1:1:4]         | passed | 1.61 seconds    |
+./spec/item_spec.rb[1:1:5]         | passed | 1.74 seconds    |
+./spec/item_spec.rb[1:1:6]         | passed | 1.95 seconds    |
+./spec/item_spec.rb[1:1:7]         | passed | 3.59 seconds    |
+./spec/item_spec.rb[1:1:8]         | passed | 0.41001 seconds |
+./spec/type_spec.rb[1:1:1]         | passed | 1.14 seconds    |
+./spec/type_spec.rb[1:1:2]         | passed | 0.94799 seconds |
+./spec/type_spec.rb[1:1:3]         | passed | 1.04 seconds    |

data/Gemfile.lock CHANGED Viewed

@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    fsp_harvester (0.1.5)
+    fsp_harvester (0.1.9)
       json (~> 2.0)
       linkeddata (~> 3.2)
-      linkheaders-processor (~> 0.1.11)
+      linkheaders-processor (~> 0.1.15)
       metainspector (~> 5.11.2)
       parseconfig (~> 1.1)
       rake (~> 13.0)
@@ -126,10 +126,11 @@ GEM
       shex (~> 0.7)
       sparql (~> 3.2)
       sparql-client (~> 3.2)
-    linkheaders-processor (0.1.11)
+    linkheaders-processor (0.1.15)
       json (~> 2.0)
       json-ld (~> 3.2)
       json-ld-preloaded (~> 3.2)
+      link_header (~> 0.0.8)
       metainspector (~> 5.11.2)
       rest-client (~> 2.1)
       securerandom (~> 0.1.0)
@@ -248,7 +249,7 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.11.0)
     rspec-support (3.11.0)
-    rubocop (1.32.0)
+    rubocop (1.33.0)
       json (~> 2.3)
       parallel (~> 1.10)
       parser (>= 3.1.0.0)

data/Rakefile CHANGED Viewed

@@ -1,11 +1,11 @@
 # frozen_string_literal: true
-require "bundler/gem_tasks"
-require "rspec/core/rake_task"
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new(:spec)
-require "rubocop/rake_task"
+require 'rubocop/rake_task'
 RuboCop::RakeTask.new

data/launch.json ADDED Viewed

@@ -0,0 +1,11 @@
+{
+    "name": "RSpec - all",
+    "type": "Ruby",
+    "request": "launch",
+    "cwd": "${workspaceRoot}",
+    "program": "/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rspec",
+    "args": [
+      "-I",
+      "${workspaceRoot}"
+    ]
+  }

data/lib/config.conf_docker ADDED Viewed

@@ -0,0 +1,8 @@
+[extruct]
+command="extruct"
+[rdf]
+command="/usr/local/bundle/bin/rdf"
+[tika]
+command="http://tika:9998/meta"

data/lib/config.conf_local ADDED Viewed

@@ -0,0 +1,8 @@
+[extruct]
+command="extruct"
+[rdf]
+command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
+[tika]
+command="http://tika:9998/meta"

data/lib/constants.rb CHANGED Viewed

@@ -1,17 +1,20 @@
 ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
+ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
 TEXT_FORMATS = {
   'text' => ['text/plain']
 }
 RDF_FORMATS = {
-  'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'],  # NEW FOR DATACITE
+  'jsonld' => ['application/ld+json','application/x-ld+json', 'application/vnd.schemaorg.ld+json'],  # NEW FOR DATACITE
   'turtle' => ['text/turtle', 'application/n3', 'application/rdf+n3',
                'application/turtle', 'application/x-turtle', 'text/n3', 'text/turtle',
                'text/rdf+n3', 'text/rdf+turtle'],
   # 'rdfa'    => ['text/xhtml+xml', 'application/xhtml+xml'],
   'rdfxml' => ['application/rdf+xml'],
-  'triples' => ['application/n-triples', 'application/n-quads', 'application/trig']
+  'ntriples' => ['application/n-triples', 'application/trig'],
+  'nquads' => ['application/n-quads']
 }
 XML_FORMATS = {
@@ -73,12 +76,10 @@ GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
                'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
 CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
-if CONFIG['extruct'] && CONFIG['extruct']['command'] && !CONFIG['extruct']['command'].empty?
-  extruct = config['extruct']['command']
-end
-extruct = 'extruct' unless @extruct_command
+extruct = CONFIG.dig(:extruct, :command)
+extruct ||= 'extruct'
 extruct.strip!
-case @extruct
+case extruct
 when /[&|;`$\s]/
   abort 'The Extruct command in the config file appears to be subject to command injection.  I will not continue'
 when /echo/i
@@ -86,8 +87,8 @@ when /echo/i
 end
 EXTRUCT_COMMAND = extruct
-rdf_command = CONFIG['rdf']['command'] if CONFIG['rdf'] && CONFIG['rdf']['command'] && !CONFIG['rdf']['command'].empty?
-rdf_command = 'rdf' unless @rdf_command
+rdf_command = CONFIG.dig(:rdf, :command)
+rdf_command ||= 'rdf'
 rdf_command.strip
 case rdf_command
 when /[&|;`$\s]/
@@ -99,8 +100,6 @@ when !(/rdf$/ =~ $_)
 end
 RDF_COMMAND = rdf_command
-if CONFIG['tika'] && CONFIG['tika']['command'] && !CONFIG['tika']['command'].empty?
-  tika_command = CONFIG['tika']['command']
-end
-tika_command = 'http://localhost:9998/meta' unless @tika_command
+tika_command = CONFIG.dig(:tika, :command)
+tika_command ||= 'http://localhost:9998/meta'
 TIKA_COMMAND = tika_command

data/lib/fsp_harvester/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module FspHarvester
-  VERSION = "0.1.5"
+  VERSION = "0.1.9"
 end

data/lib/fsp_harvester.rb CHANGED Viewed

@@ -1,24 +1,28 @@
 # frozen_string_literal: true
-require_relative "fsp_harvester/version"
-require "json/ld"
-require "json/ld/preloaded"
-require "json"
-require "linkheaders/processor"
-require "addressable"
-require "tempfile"
-require "xmlsimple"
-require "nokogiri"
-require "parseconfig"
-require "rest-client"
-require "cgi"
-require "digest"
-require "open3"
-require "metainspector"
-require "rdf/xsd"
-require_relative "./metadata_object"
-require_relative "./constants"
-require_relative "./web_utils"
+require_relative 'fsp_harvester/version'
+require 'json/ld'
+require 'json/ld/preloaded'
+require 'json'
+require 'linkheaders/processor'
+require 'addressable'
+require 'tempfile'
+require 'xmlsimple'
+require 'nokogiri'
+require 'parseconfig'
+require 'rest-client'
+require 'cgi'
+require 'digest'
+require 'open3'
+require 'metainspector'
+require 'rdf/xsd'
+require_relative './metadata_object'
+require_relative './constants'
+require_relative './web_utils'
+require_relative './signposting_tests'
+require_relative './fsp_metadata_harvester'
+require_relative './fsp_metadata_parser'
 module FspHarvester
   class Error < StandardError
@@ -27,33 +31,45 @@ module FspHarvester
   class Utils
     # @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
     # @warnings = JSON.parse(File.read("warnings.json"))
-    @meta = FspHarvester::MetadataObject.new
     def self.resolve_guid(guid:)
-      @meta.finalURI = [guid]
+      @meta = FspHarvester::MetadataObject.new
+      @meta.all_uris = [guid]
       type, url = convertToURL(guid: guid)
       links = Array.new
-      unless type
-        @meta.warnings << ["006", guid, ""]
-        @meta.comments << "FATAL: GUID type not recognized.\n"
+      if type
+        links = resolve_url(url: url)
+        @meta.links << links
       else
-        links, @meta = resolve_url(url: url)
+        @meta.warnings << ['006', guid, '']
+        @meta.comments << "FATAL: GUID type not recognized.\n"
       end
       [links, @meta]
     end
+    def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
+      @meta = metadata
+      db = []
+      links.each do |l|
+        db << l if l.relation == 'describedby'
+      end
+      FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta)  # everything is gathered into the @meta metadata object
+      @meta
+    end
     def self.convertToURL(guid:)
       GUID_TYPES.each do |k, regex|
-        if k == "inchi" and regex.match(guid)
-          return "inchi", "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
-        elsif k == "handle1" and regex.match(guid)
-          return "handle", "http://hdl.handle.net/#{guid}"
-        elsif k == "handle2" and regex.match(guid)
-          return "handle", "http://hdl.handle.net/#{guid}"
-        elsif k == "uri" and regex.match(guid)
-          return "uri", guid
-        elsif k == "doi" and regex.match(guid)
-          return "doi", "https://doi.org/#{guid}"
+        if k == 'inchi' and regex.match(guid)
+          return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
+        elsif k == 'handle1' and regex.match(guid)
+          return 'handle', "http://hdl.handle.net/#{guid}"
+        elsif k == 'handle2' and regex.match(guid)
+          return 'handle', "http://hdl.handle.net/#{guid}"
+        elsif k == 'uri' and regex.match(guid)
+          return 'uri', guid
+        elsif k == 'doi' and regex.match(guid)
+          return 'doi', "https://doi.org/#{guid}"
         end
       end
       [nil, nil]
@@ -66,71 +82,75 @@ module FspHarvester
       false
     end
-    def self.resolve_url(url:, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
-      @meta.guidtype = "uri" if @meta.guidtype.nil?
+    def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
+      @meta.guidtype = 'uri' if @meta.guidtype.nil?
       warn "\n\n FETCHING #{url} #{header}\n\n"
-      response = FspHarvester::WebUtils.fspfetch(url: url, headers: header)
-      warn "\n\n head #{response.headers.inspect}\n\n"
+      response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
+      warn "\n\n head #{response.headers.inspect}\n\n" if response
       unless response
-        @meta.warnings << ["001", url, header]
+        @meta.warnings << ['001', url, header]
         @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
-        return [[], @meta]
+        return []
       end
-      @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.finalURI.last}.  Using the output from this URL for the next few tests..."
+      @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}.  Using the output from this URL for the next few tests..."
       @meta.full_response << response.body
       links = process_link_headers(response: response) unless nolinkheaders
-      [links, @meta]
+      links
     end
     def self.process_link_headers(response:)
       warn "\n\n parsing #{response.headers}\n\n"
-      parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
+      parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
       parser.extract_and_parse(response: response)
-      factory = parser.factory  # LinkHeaders::LinkFactory
+      factory = parser.factory # LinkHeaders::LinkFactory
+      warn "\n\n length bfore #{factory.all_links.length}\n\n"
+      signpostingcheck(factory: factory)
+      warn "\n\n length aftr #{factory.all_links.length}\n\n"
+      warn "\n\n links #{factory.all_links}\n\n"
+      factory.all_links
+    end
-      citeas = 0
-      describedby = 0
-      warn "\n\n length #{factory.all_links.length}\n\n"
+    def self.signpostingcheck(factory:)
+      citeas = Array.new
+      describedby = Array.new
+      item = Array.new
+      types = Array.new
       factory.all_links.each do |l|
         case l.relation
-        when "cite-as"
-          citeas += 1
-        when "describedby"
-          describedby += 1
-          unless l.respond_to? "type"
-            @meta.warnings << ["005", l.href, ""]
-            @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute\n"
-          end
+        when 'cite-as'
+          citeas << l
+        when 'item'
+          item << l
+        when 'describedby'
+          describedby << l
+        when 'type'
+          types << l
         end
       end
-      if citeas > 1
-        self.check_for_conflicts(factory: factory)  # this merelty adsds to the metadata objects if there are conflicts
+      check_describedby_rules(describedby: describedby)
+      check_item_rules(item: item)
+      if citeas.length > 1
+        warn "INFO: multiple cite-as links found. Checking for conflicts\n"
+        @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
+        citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
       end
-      unless citeas == 1 && describedby > 0
-        @meta.warnings << ["004", "", ""]
+      unless citeas.length == 1 && describedby.length > 0
+        @meta.warnings << ['004', '', '']
         @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
       end
-      factory.all_links
-    end
-    def self.check_for_conflicts(factory:) # incoming: {"link1" => {"sectiontype1" => value, "sectiontype2" => value2}}
-      @meta.comments << "INFO: checking for conflicting cite-as links"
-      citeas = Array.new
-      factory.all_links.each do |link|
-        next unless link.relation == 'cite-as'
-        citeas << link.href
-      end
-      unless citeas == citeas.uniq
-        @meta.warnings << ["007", "", ""]
-        @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers\n"
-      else
-        @meta.comments << "INFO: No conflicting cite-as links found."
+      unless types.length >=1
+        @meta.warnings << ['015', '', '']
+        @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
       end
     end
   end

data/lib/fsp_metadata_external_tools.rb ADDED Viewed

@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+module FspHarvester
+  class Error < StandardError
+  end
+  class ExternalTools
+    def initialize(metadata: FspHarvester::MetadataObject.new)
+      @meta = metadata
+    end
+    def process_with_distiller(body:)
+      bhash = Digest::SHA256.hexdigest(body)
+      if @@distillerknown[bhash]
+        @meta.comments << "INFO: data is already parsed by distiller.\n"
+        #parse_rdf(body: body)
+      else
+        @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
+        file = Tempfile.new('foo', encoding: 'UTF-8')
+        body = body.force_encoding('UTF-8')
+        body.scrub!
+        body = body.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"') # a bug in distiller, apparently
+        file.write(body)
+        file.rewind
+        @meta.comments << "INFO: The message body is being examined by Distiller\n"
+        # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
+        command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
+        warn "distiller command: #{command}"
+        result, _stderr, _status = Open3.capture3(command)
+        warn ''
+        warn "distiller errors: #{stderr}"
+        file.close
+        file.unlink
+        result = result.force_encoding('UTF-8')
+        warn "DIST RESULT: #{result}"
+        if result !~ /@context/i # failure returns nil
+          @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
+          @meta.warnings << ['018', '', '']
+        else
+          @meta.comments << "INFO: The Distiller found parseable data.  Parsing as JSON-LD\n"
+          parse_rdf(result: result, content_type: "application/ld+json")
+        end
+        @@distillerknown[bhash] = true
+      end
+    end
+    def processs_with_extruct(uri:)
+      @meta.comments << "INFO:  Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
+      warn 'begin open3'
+      stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
+      warn "open3 status: #{status} #{stdout}"
+      result = stderr # absurd that the output comes over stderr!  LOL!
+      if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
+        @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
+        @meta.warnings << ['019', '', '']
+        if result.to_s.match(/(ValueError:.*?)\n/)
+          @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
+          @meta.warnings << ['019', '', '']
+        end
+      elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
+        json = JSON.parse result
+        @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
+        parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
+        @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
+        @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
+        @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
+        parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
+        @meta.merge_hash(json.first) if json.first.is_a? Hash
+      else
+        @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
+      end
+    end
+  end
+end

data/lib/fsp_metadata_harvester.rb ADDED Viewed

@@ -0,0 +1,164 @@
+# frozen_string_literal: true
+module FspHarvester
+  class Error < StandardError
+  end
+  class MetadataHarvester
+    def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
+      @meta = metadata
+      @meta.comments << 'INFO:  now collecting both linked data and hash-style data using the harvested links'
+      describedby = links.select { |l| l if l.relation == 'describedby' }
+      hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
+      describedby.each do |link|
+        accepttype = ACCEPT_STAR_HEADER
+        accept = link.respond_to?('type') ? link.type : nil
+        accepttype = { 'Accept' => accept } if accept
+        response = attempt_to_resolve(link: link, headers: accepttype)
+        abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
+        unless abbreviation
+          @meta.warnings << ['017', url, header]
+          @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized.  Processing will end now.\n"
+          next
+        end
+        # process according to detected type
+        case abbreviation
+        when 'html'
+          @meta.comments << 'INFO: Processing html'
+          hvst.process_html(body: response.body, uri: link)
+        when 'xml'
+          @meta.comments << 'INFO: Processing xml'
+          hvst.process_xml(body: response.body)
+        when 'json'
+          @meta.comments << 'INFO: Processing json'
+          hvst.process_json(body: response.body)
+        when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
+          @meta.comments << 'INFO: Processing linked data'
+          hvst.process_ld(body: response.body, content_type: content_type)
+        when 'specialist'
+          warn 'no specialized parsers so far'
+        end
+      end
+    end
+    def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
+      @meta.comments << "INFO:  link #{link.href} being processed"
+      if link.respond_to? 'type'
+        header = { 'Accept' => link.type }
+      else
+        @meta.comments << "INFO:  link #{link.href} has no MIME type, defaulting to */*"
+      end
+      url = link.href
+      response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
+      unless response
+        @meta.warnings << ['016', url, header]
+        @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
+      end
+      response
+    end
+    def self.attempt_to_detect_type(body:, headers:)
+      #  described by should be an html, xml, json, or linked data document
+      abbreviation = nil
+      content_type = nil
+      @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
+      if body =~ /^\s*<\?xml/
+        if body =~ /<HTML/i
+          abbreviation = 'html'
+          content_type = 'text/html'
+          @meta.comments << 'INFO: appears to be HTML\n'
+        elsif body =~ /<rdf:RDF/i
+          abbreviation = 'rdfxml'
+          content_type = 'application/rdf+xml'
+          @meta.comments << 'INFO: appears to be RDF-XML\n'
+        else
+          abbreviation = 'xml'
+          content_type = 'application/xml'
+          @meta.comments << 'INFO: appears to be XML\n'
+        end
+      else
+        abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
+        abbreviation, content_type = check_json(body: body) unless abbreviation
+      end
+      unless content_type
+        @meta.warnings << ['017', url, header]
+        @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized.  Processing will end now.\n"
+      end
+      [abbreviation, content_type]
+    end
+    def self.check_ld(body:, claimed_type:)
+      detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
+      unless detected_type
+        detected_type = RDF::Format.for({ sample: body[0..5000] })
+        @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
+      end
+      contenttype = ''
+      abbreviation = ''
+      if detected_type
+        contenttype = detected_type.content_type.first # comes back as array
+        abbreviation = abbreviate_type(contenttype: contenttype)
+        @meta.comments << "INFO: using content-type #{contenttype}.\n"
+      else
+        @meta.comments << "INFO: metadata does not appear to be in a linked data format.  Trying other options.\n"
+      end
+      [abbreviation, contenttype]
+    end
+    def self.ntriples_hack(body:)  # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
+      detected_type = nil
+      body.split.each do |line|
+        line.strip!
+        next if line.empty?
+        if line =~ %r{\s*<[^>]+>\s*<[^>]+>\s\S+}
+          @meta.comments << "INFO: running ntriples hack on  #{line + " ."}\n"
+          detected_type = RDF::Format.for({ sample: "#{line} ." })  # adding a period allows detection of ntriples by distiller
+          break
+        end
+      end
+      @meta.comments << "INFO: ntriples hack found: #{detected_type.to_s}\n"
+      if detected_type != RDF::NTriples::Format   # only return the hacky case
+        return nil
+      end
+      return detected_type
+    end
+    def self.check_json(body:)
+      abbreviation = nil
+      parsed = nil
+      begin
+        parsed = JSON.parse(body)
+      rescue StandardError
+        abbreviation = nil
+      end
+      if parsed
+        abbreviation = 'json'
+      else
+        @meta.comments << "INFO: metadata does not appear to be in JSON format.  No options left.\n"
+      end
+      [abbreviation, 'application/ld+json']
+    end
+    def self.abbreviate_type(contenttype:)
+      foundtype = nil
+      RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
+        warn "\n\ntype #{type}\nvals #{vals}\n\n"
+        @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
+        next unless vals.include? contenttype
+        foundtype = type
+        @meta.comments << "INFO: detected a #{type} MIME type"
+        break
+      end
+      foundtype
+    end
+  end
+end

data/lib/fsp_metadata_parser.rb ADDED Viewed

@@ -0,0 +1,109 @@
+# frozen_string_literal: true
+module FspHarvester
+  class Error < StandardError
+  end
+  class MetadataParser
+    # attr_accessor :distillerknown
+    @@distillerknown = {}
+    def initialize(metadata_object: FspHarvester::MetadataObject.new)
+      @meta = metadata_object
+    end
+    def process_html(body:, uri:)
+      tools = FspHarvester::ExternalTools.new(metadata: @meta)
+      tools.process_with_distiller(body: body)
+      tools.process_with_extruct(uri: uri)
+    end
+    def process_xml(body:)
+      begin
+        hash = XmlSimple.xml_in(body)
+      rescue
+        @meta.comments << "CRITICAL: Malformed XML detected.  Cannot process metadata.\n"
+        @meta.warnings << ['020', '', '']
+      end
+      @meta.comments << "INFO: The XML is being merged in the metadata object\n"
+      @meta.hash.merge hash
+    end
+    def process_json(body:)
+      begin
+        hash = JSON.parse(body)
+      rescue
+        @meta.comments << "CRITICAL: Malformed JSON detected.  Cannot process metadata.\n"
+        @meta.warnings << ['021', '', '']
+      end
+      @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
+      @meta.hash.merge hash
+    end
+    def process_ld(body:, content_type:)
+      parse_rdf(body: body, content_type: content_type)
+    end
+    def parse_rdf(body:, content_type:)
+      unless body
+        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+        @meta.warnings << ['018', '', '']
+        return
+      end
+      unless body.match(/\w/)
+        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+        @meta.warnings << ['018', '', '']
+        return
+      end
+      rdfformat = RDF::Format.for(content_type: content_type)
+      unless rdfformat
+        @meta.comments << "CRITICAL: Found what appears to be RDF (sample:  #{body[0..300].delete!("\n")}), but it could not find a parser.  Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
+        @meta.warnings << ['018', '', '']
+        return
+      end
+      graph = FspHarvester::Cache.checkRDFCache(body: body)
+      if graph.size > 0
+        warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
+        @meta.merge_rdf(graph.to_a)
+      else
+        warn "\n\n\nfound format #{rdfformat}\n\n"
+        @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
+        reader = ''
+        begin
+          reader = rdfformat.reader.new(body)
+        rescue Exception => e
+          @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}).  This likely indicates some syntax error in the data.  As a result, no metadata will be extracted from this message.\n"
+          @meta.warnings << ['018', '', '']
+          return
+        end
+        begin
+          if reader.size == 0
+            @meta.comments << "WARN: Though linked data was found, it failed to parse.  This likely indicates some syntax error in the data.  As a result, no metadata will be extracted from this message.\n"
+            return
+          end
+          reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
+          warn 'WRITING TO CACHE'
+          FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
+          warn 'WRITING DONE'
+          reader = rdfformat.reader.new(body)  # frustrating that we cannot rewind!
+          warn 'RE-READING DONE'
+          @meta.merge_rdf(reader.to_a)
+          warn 'MERGE DONE'
+        rescue RDF::ReaderError => e
+          @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} ||  (sample of what was parsed:  #{body[0..300].delete("\n")})\n"
+          warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} ||  (sample of what was parsed:  #{body[0..300].delete("\n")})\n"
+          @meta.warnings << ['018', '', '']
+        rescue Exception => e
+          meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed:  #{body[0..300].delete("\n")}).  Moving on...\n"
+          warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body:  #{body}).  Moving on...\n"
+          @meta.warnings << ['018', '', '']
+        end
+      end
+    end
+  end
+end

data/lib/metadata_object.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module FspHarvester
   class MetadataObject
-    attr_accessor :hash, :graph, :comments, :warnings, :guidtype, :full_response, :finalURI  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response
+    attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response
     def initialize(_params = {}) # get a name from the "new" call, or set a default
       @hash = {}
@@ -8,15 +8,16 @@ module FspHarvester
       @comments =  []
       @warnings =  []
       @full_response = []
-      @finalURI = []
+      @links = []
+      @all_uris = []
     end
     def merge_hash(hash)
-      # $stderr.puts "\n\n\nIncoming Hash #{hash.inspect}"
+      # warn "\n\n\nIncoming Hash #{hash.inspect}"
       self.hash = self.hash.merge(hash)
     end
-    def merge_rdf(triples)  # incoming list of triples
+    def merge_rdf(triples) # incoming list of triples
       graph << triples
       graph
     end
@@ -25,4 +26,95 @@ module FspHarvester
       graph
     end
   end
+  class Cache
+    def self.retrieveMetaObject(uri)
+      filename = (Digest::MD5.hexdigest uri) + '_meta'
+      warn "Checking Meta cache for #{filename}"
+      if File.exist?("/tmp/#{filename}")
+        warn 'FOUND Meta object in cache'
+        meta = Marshal.load(File.read("/tmp/#{filename}"))
+        warn 'Returning....'
+        return meta
+      end
+      warn 'Meta objectNot Found in Cache'
+      false
+    end
+    def self.cacheMetaObject(meta, uri)
+      filename = (Digest::MD5.hexdigest uri) + '_meta'
+      warn "in cacheMetaObject Writing to cache for #{filename}"
+      File.open("/tmp/#{filename}", 'wb') { |f| f.write(Marshal.dump(meta)) }
+    end
+    def self.checkRDFCache(body: )
+      fs = File.join('/tmp/', '*_graphbody')
+      bodies = Dir.glob(fs)
+      g = RDF::Graph.new
+      bodies.each do |bodyfile|
+        next unless File.size(bodyfile) == body.bytesize # compare body size
+        next unless bodyfile.match(/(.*)_graphbody$/) # continue if there's no match
+        filename = Regexp.last_match(1)
+        warn "Regexp match for #{filename} FOUND"
+        next unless File.exist?("#{filename}_graph") # @ get the associated graph file
+        warn "RDF Cache File #{filename} FOUND"
+        graph = Marshal.load(File.read("#{filename}_graph")) # unmarshal it
+        graph.each do |statement|
+          g << statement # need to do this because the unmarshalled object isn't entirely functional as an RDF::Graph object
+        end
+        warn "returning a graph of #{g.size}"
+        break
+      end
+      # return an empty graph otherwise
+      g
+    end
+    def self.writeRDFCache(reader:, body:)
+      filename = Digest::MD5.hexdigest body
+      graph = RDF::Graph.new
+      reader.each_statement { |s| graph << s }
+      warn "WRITING RDF TO CACHE #{filename}"
+      File.open("/tmp/#{filename}_graph", 'wb') { |f| f.write(Marshal.dump(graph)) }
+      File.open("/tmp/#{filename}_graphbody", 'wb') { |f| f.write(body) }
+      warn "wrote RDF filename: #{filename}"
+    end
+    def self.checkCache(uri, headers)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "Checking Error cache for #{filename}"
+      if File.exist?("/tmp/#{filename}_error")
+        warn 'Error file found in cache... returning'
+        return ['ERROR', nil, nil]
+      end
+      if File.exist?("/tmp/#{filename}_head") and File.exist?("/tmp/#{filename}_body")
+        warn 'FOUND data in cache'
+        head = Marshal.load(File.read("/tmp/#{filename}_head"))
+        body = Marshal.load(File.read("/tmp/#{filename}_body"))
+        all_uris = ''
+        all_uris = Marshal.load(File.read("/tmp/#{filename}_uri")) if File.exist?("/tmp/#{filename}_uri")
+        warn 'Returning....'
+        return [head, body, all_uris]
+      end
+      warn 'Not Found in Cache'
+    end
+    def self.writeToCache(uri, headers, head, body, all_uris)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "in writeToCache Writing to cache for #{filename}"
+      headfilename = filename + '_head'
+      bodyfilename = filename + '_body'
+      urifilename = filename + '_uri'
+      File.open("/tmp/#{headfilename}", 'wb') { |f| f.write(Marshal.dump(head)) }
+      File.open("/tmp/#{bodyfilename}", 'wb') { |f| f.write(Marshal.dump(body)) }
+      File.open("/tmp/#{urifilename}", 'wb') { |f| f.write(Marshal.dump(all_uris)) }
+    end
+    def self.writeErrorToCache(uri, headers)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "in writeErrorToCache Writing error to cache for #{filename}"
+      File.open("/tmp/#{filename}_error", 'wb') { |f| f.write('ERROR') }
+    end
+  end
 end

data/lib/signposting_tests.rb ADDED Viewed

@@ -0,0 +1,87 @@
+def check_for_citeas_conflicts(citeas: )
+  @meta.comments << 'INFO: checking for conflicting cite-as links'
+  citeas_hrefs = Hash.new
+  citeas.each do |link|
+    warn "INFO: Adding citeas #{link.href} to the testing queue."
+    @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
+    citeas_hrefs[link.href] = link
+  end
+  if citeas_hrefs.length > 1
+    @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
+    @meta.warnings << ['007', '', '']
+    @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
+  end
+  citeas_hrefs.values  # return list of unique links
+end
+def check_describedby_rules(describedby:)
+  describedby.each do |l|
+    unless l.respond_to? 'type'
+      @meta.warnings << ['005', l.href, '']
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
+    end
+    type = l.type if l.respond_to? 'type'
+    type ||= '*/*'
+    header = { accept: type }
+    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    if response
+      responsetype = response.headers[:content_type]
+      @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
+      if responsetype =~ %r{^(.*/[^;]+)}
+        responsetype = Regexp.last_match(1).to_s # remove the e.g. charset information
+      end
+      @meta.comments << "INFO: testing content type |#{responsetype}| against |#{type}|\n"
+      if type != '*/*'
+        if responsetype == type
+          @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
+        else
+          @meta.warnings << ['009', l.href, header]
+          @meta.comments << "WARN: Content type of returned describedby link #{responsetype}does not match the 'type' attribute #{type}\n"
+        end
+      else
+        @meta.warnings << ['010', l.href, header]
+        @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
+      end
+    else
+      @meta.warnings << ['008', l.href, header]
+      @meta.comments << "WARN: describedby link doesn't resolve\n"
+    end
+  end
+end
+def check_item_rules(item:)
+  item.each do |l| # l = LinkHeaders::Link
+    unless l.respond_to? 'type'
+      @meta.warnings << ['011', l.href, '']
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
+    end
+    type = l.type if l.respond_to? 'type'
+    type ||= '*/*' # this becomes a frozen string
+    header = { accept: type }
+    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    if response
+      if response.headers[:content_type] and type != '*/*'
+        rtype = type.gsub(%r{/}, "\/")   # because type is a frozen string
+        rtype = rtype.gsub(/\+/, '.')
+        typeregex = Regexp.new(type)
+        if response.headers[:content_type].match(typeregex)
+          warn response.headers[:content_type]
+          warn typeregex.inspect
+          @meta.comments << "INFO: item link responds according to Signposting specifications\n"
+        else
+          @meta.warnings << ['012', l.href, header]
+          @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
+        end
+      else
+        @meta.warnings << ['013', l.href, header]
+        @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
+      end
+    else
+      @meta.warnings << ['014', l.href, header]
+      @meta.comments << "WARN: item link doesn't resolve\n"
+    end
+  end
+end

data/lib/warnings.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
    "001": {
-      "message": "Unable to resolve guid using Accept headers for Linked Data",
+      "message": "Unable to resolve guid using default (*/*) Accept headers",
       "linkout": "",
       "severity": "WARN"
    },
@@ -28,6 +28,83 @@
       "message": "GUID type not recognized",
       "linkout": "",
       "severity": "WARN"
-   }
+   },
+   "007": {
+      "message": "Conflicting cite-as links",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "008": {
+      "message": "describedby link does not resolve",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "009": {
+      "message": "Content-type of described-by link does not match the type attribute in the link header itself",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "010": {
+      "message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "011": {
+      "message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "012": {
+      "message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "013": {
+      "message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "014": {
+      "message": "Item link does not resolve",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "015": {
+      "message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "016": {
+      "message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "017": {
+      "message": "Metadata format not recognized.",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "018": {
+      "message": "RDF parsing error - likely malformed RDF document.",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "019": {
+      "message": "HTML parsing error - unable to extract linked data from HTML.",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "020": {
+      "message": "XML parsing error - unable to process XML document.",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "021": {
+      "message": "JSON parsing error - unable to process JSON document.",
+      "linkout": "",
+      "severity": "WARN"
+   }
 }

data/lib/web_utils.rb CHANGED Viewed

@@ -1,32 +1,32 @@
 module FspHarvester
   class WebUtils
-    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER)  # we will try to retrieve turtle whenever possible
+    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
       warn 'In fetch routine now.  '
       begin
         warn "executing call over the Web to #{url}"
         response = RestClient::Request.execute({
-                                                method: :get,
+                                                method: method,
                                                 url: url.to_s,
                                                 # user: user,
                                                 # password: pass,
                                                 headers: headers
                                               })
-        @meta.finalURI |= [response.request.url] if @meta  # it's possible to call this method without affecting the metadata object being created by the harvester
-        warn "There was a response to the call #{url}"
-        warn "There was a response to the call #{response.request.url}"
+        meta.all_uris |= [response.request.url]  # it's possible to call this method without affecting the metadata object being created by the harvester
+        warn "starting URL #{url}"
+        warn "final URL #{response.request.url}"
         warn "Response code #{response.code}"
-        if response.code == 203 && @meta
-          @meta.warnings << ["002", url, headers]
-          @meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}).  Headers may have been manipulated encountered when trying to resolve #{url}\n"
+        if response.code == 203
+          meta.warnings << ["002", url, headers]
+          meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}).  Headers may have been manipulated encountered when trying to resolve #{url}\n"
         end
         response
       rescue RestClient::ExceptionWithResponse => e
         warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
-        @meta.warnings << ["003", url, headers] if @meta
-        @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
-        if e.response.code == 500
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
+        if (e.response.code == 500 or e.response.code == 404)
           return false
         else
           e.response
@@ -34,14 +34,14 @@ module FspHarvester
         # now we are returning the headers and body that were returned
       rescue RestClient::Exception => e
         warn "EXCEPTION WITH NO RESPONSE! #{e}"
-        @meta.warnings << ["003", url, headers] if @meta
-        @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
         false
         # now we are returning 'False', and we will check that with an \"if\" statement in our main code
       rescue Exception => e
         warn "EXCEPTION UNKNOWN! #{e}"
-        @meta.warnings << ["003", url, headers] if @meta
-        @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
         false
         # now we are returning 'False', and we will check that with an \"if\" statement in our main code
       end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fsp_harvester
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.9
 platform: ruby
 authors:
 - Mark Wilkinson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-07-27 00:00:00.000000000 Z
+date: 2022-08-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json
@@ -44,14 +44,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.11
+        version: 0.1.16
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.11
+        version: 0.1.16
 - !ruby/object:Gem::Dependency
   name: metainspector
   requirement: !ruby/object:Gem::Requirement
@@ -171,6 +171,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".rspec_status"
 - CHANGELOG.md
 - Gemfile
 - Gemfile.lock
@@ -180,10 +181,17 @@ files:
 - bin/console
 - bin/setup
 - example_test.rb
+- launch.json
+- lib/config.conf_docker
+- lib/config.conf_local
 - lib/constants.rb
 - lib/fsp_harvester.rb
 - lib/fsp_harvester/version.rb
+- lib/fsp_metadata_external_tools.rb
+- lib/fsp_metadata_harvester.rb
+- lib/fsp_metadata_parser.rb
 - lib/metadata_object.rb
+- lib/signposting_tests.rb
 - lib/swagger.rb
 - lib/warnings.json
 - lib/web_utils.rb