RubyGems - fsp_harvester - Versions diffs - 0.1.11 → 0.1.12 - Mend

fsp_harvester 0.1.11 → 0.1.12

Files changed (16) hide show

checksums.yaml +4 -4
data/.rspec_status +53 -53
data/Gemfile.lock +1 -1
data/lib/config.conf +8 -0
data/lib/{fsp_metadata_external_tools.rb → external_tools.rb} +17 -15
data/lib/fsp_harvester/version.rb +1 -1
data/lib/fsp_harvester.rb +8 -106
data/lib/harvester.rb +27 -0
data/lib/harvester_utils.rb +75 -0
data/lib/{fsp_metadata_harvester.rb → metadata_harvester.rb} +8 -8
data/lib/metadata_object.rb +1 -1
data/lib/{fsp_metadata_parser.rb → metadata_parser.rb} +28 -13
data/lib/signposting_tests.rb +9 -6
data/lib/swagger.rb +137 -177
data/lib/web_utils.rb +2 -2
metadata +8 -5

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 895567e9edd571dbca7dee89a0270d1c14342fed06c3eb81c81e06f3c07ddbed
-  data.tar.gz: 7eee65295c206d6cee7b4ef28830f64087ba172a294cde7401490bffa20dbe1a
+  metadata.gz: b38eea15fa26a3fe07290024342f8b6121dbd78c3cd2dd3496ca118fca22f6d4
+  data.tar.gz: a25ea37ecd78b2ef8dc41dca391c161ba4b262d910dbadcf34f07f0cd8e54af5
 SHA512:
-  metadata.gz: f0c7727598525cb55b6c2bfaf36d5ce3dda5da6efddf85888328b7c93b874c508989122627e5deaa5101fc0a20279432aa023ecefef112926219f267e3622234
-  data.tar.gz: 29f834c57ec73e27f988948893dc92fe56550b829585df390a9a1398770845115202289f6f9557c01eb2fc3eec218f863371db60649f6a3fef01da9457c2862e
+  metadata.gz: 83b766f2896a0776ed75ab3fd1e235d2c80173d3ffc0c22aea80b234856392497daf312db36461f03d0ca168228e6e385edf6d789b42dc4b43fcd6a073cda234
+  data.tar.gz: 16d74c199a138db0225e88c0e092a5272a518785c397d27da74c511a69260444e6c1c440021e08413c131d672cd2d528e8d19dbd8a20ca1bff97dad38600c995

data/.rspec_status CHANGED Viewed

@@ -1,55 +1,55 @@
 example_id                         | status | run_time        |
 ---------------------------------- | ------ | --------------- |
-./spec/cite-as_spec.rb[1:1:1]      | passed | 1.61 seconds    |
-./spec/cite-as_spec.rb[1:1:2]      | passed | 1.18 seconds    |
-./spec/cite-as_spec.rb[1:1:3]      | passed | 1.02 seconds    |
-./spec/cite-as_spec.rb[1:1:4]      | passed | 1.6 seconds     |
-./spec/cite-as_spec.rb[1:1:5]      | passed | 2.78 seconds    |
-./spec/cite-as_spec.rb[1:1:6]      | passed | 2.09 seconds    |
-./spec/cite-as_spec.rb[1:1:7]      | passed | 2.98 seconds    |
-./spec/cite-as_spec.rb[1:1:8]      | passed | 2.2 seconds     |
-./spec/cite-as_spec.rb[1:1:9]      | passed | 2.87 seconds    |
-./spec/cite-as_spec.rb[1:1:10]     | passed | 2.18 seconds    |
-./spec/cite-as_spec.rb[1:1:11]     | passed | 3.16 seconds    |
-./spec/cite-as_spec.rb[1:1:12]     | passed | 2.36 seconds    |
-./spec/cite-as_spec.rb[1:1:13]     | passed | 2.89 seconds    |
-./spec/cite-as_spec.rb[1:1:14]     | passed | 2.13 seconds    |
-./spec/cite-as_spec.rb[1:1:15]     | passed | 1.18 seconds    |
-./spec/cite-as_spec.rb[1:1:16]     | passed | 1.3 seconds     |
-./spec/cite-as_spec.rb[1:1:17]     | passed | 1.17 seconds    |
-./spec/cite-as_spec.rb[1:1:18]     | passed | 1.2 seconds     |
-./spec/cite-as_spec.rb[1:1:19]     | passed | 1.71 seconds    |
-./spec/cite-as_spec.rb[1:1:20]     | passed | 1.69 seconds    |
-./spec/cite-as_spec.rb[1:1:21]     | passed | 2.22 seconds    |
-./spec/cite-as_spec.rb[1:1:22]     | passed | 1.09 seconds    |
-./spec/cite-as_spec.rb[1:1:23]     | passed | 1.17 seconds    |
-./spec/cite-as_spec.rb[1:1:24]     | failed | 1.2 seconds     |
-./spec/cite-as_spec.rb[1:1:25]     | passed | 0.48048 seconds |
-./spec/describedby_spec.rb[1:1:1]  | passed | 2.12 seconds    |
-./spec/describedby_spec.rb[1:1:2]  | passed | 0.96254 seconds |
-./spec/describedby_spec.rb[1:1:3]  | passed | 0.92669 seconds |
-./spec/describedby_spec.rb[1:1:4]  | passed | 0.92801 seconds |
-./spec/describedby_spec.rb[1:1:5]  | passed | 1 second        |
-./spec/describedby_spec.rb[1:1:6]  | passed | 0.66763 seconds |
-./spec/describedby_spec.rb[1:1:7]  | passed | 0.66021 seconds |
-./spec/describedby_spec.rb[1:1:8]  | passed | 1.89 seconds    |
-./spec/describedby_spec.rb[1:1:9]  | passed | 1.3 seconds     |
-./spec/describedby_spec.rb[1:1:10] | passed | 1.7 seconds     |
-./spec/describedby_spec.rb[1:1:11] | passed | 2.28 seconds    |
-./spec/describedby_spec.rb[1:1:12] | passed | 2.27 seconds    |
-./spec/describedby_spec.rb[1:1:13] | passed | 1.39 seconds    |
-./spec/describedby_spec.rb[1:1:14] | passed | 1.65 seconds    |
-./spec/describedby_spec.rb[1:1:15] | passed | 1.7 seconds     |
-./spec/fsp_harvester_spec.rb[1:1]  | passed | 0.00215 seconds |
-./spec/fsp_harvester_spec.rb[1:2]  | failed | 0.00021 seconds |
-./spec/item_spec.rb[1:1:1]         | passed | 2.04 seconds    |
-./spec/item_spec.rb[1:1:2]         | passed | 2 seconds       |
-./spec/item_spec.rb[1:1:3]         | passed | 0.92924 seconds |
-./spec/item_spec.rb[1:1:4]         | passed | 1.36 seconds    |
-./spec/item_spec.rb[1:1:5]         | passed | 1.71 seconds    |
-./spec/item_spec.rb[1:1:6]         | passed | 1.68 seconds    |
-./spec/item_spec.rb[1:1:7]         | passed | 2.37 seconds    |
-./spec/item_spec.rb[1:1:8]         | passed | 0.34241 seconds |
-./spec/type_spec.rb[1:1:1]         | passed | 0.9855 seconds  |
-./spec/type_spec.rb[1:1:2]         | passed | 0.96202 seconds |
-./spec/type_spec.rb[1:1:3]         | passed | 0.96005 seconds |
+./spec/cite-as_spec.rb[1:1:1]      | passed | 1.3 seconds     |
+./spec/cite-as_spec.rb[1:1:2]      | passed | 1.21 seconds    |
+./spec/cite-as_spec.rb[1:1:3]      | passed | 1.09 seconds    |
+./spec/cite-as_spec.rb[1:1:4]      | passed | 1.69 seconds    |
+./spec/cite-as_spec.rb[1:1:5]      | passed | 2.72 seconds    |
+./spec/cite-as_spec.rb[1:1:6]      | passed | 2.3 seconds     |
+./spec/cite-as_spec.rb[1:1:7]      | passed | 3.36 seconds    |
+./spec/cite-as_spec.rb[1:1:8]      | passed | 2.26 seconds    |
+./spec/cite-as_spec.rb[1:1:9]      | passed | 2.82 seconds    |
+./spec/cite-as_spec.rb[1:1:10]     | passed | 2.3 seconds     |
+./spec/cite-as_spec.rb[1:1:11]     | passed | 3.37 seconds    |
+./spec/cite-as_spec.rb[1:1:12]     | passed | 2.2 seconds     |
+./spec/cite-as_spec.rb[1:1:13]     | passed | 2.94 seconds    |
+./spec/cite-as_spec.rb[1:1:14]     | passed | 2.44 seconds    |
+./spec/cite-as_spec.rb[1:1:15]     | passed | 1.54 seconds    |
+./spec/cite-as_spec.rb[1:1:16]     | passed | 1.29 seconds    |
+./spec/cite-as_spec.rb[1:1:17]     | passed | 1.25 seconds    |
+./spec/cite-as_spec.rb[1:1:18]     | passed | 1.15 seconds    |
+./spec/cite-as_spec.rb[1:1:19]     | passed | 1.7 seconds     |
+./spec/cite-as_spec.rb[1:1:20]     | passed | 1.66 seconds    |
+./spec/cite-as_spec.rb[1:1:21]     | passed | 2.41 seconds    |
+./spec/cite-as_spec.rb[1:1:22]     | passed | 1.64 seconds    |
+./spec/cite-as_spec.rb[1:1:23]     | passed | 1.35 seconds    |
+./spec/cite-as_spec.rb[1:1:24]     | failed | 1.25 seconds    |
+./spec/cite-as_spec.rb[1:1:25]     | passed | 0.51152 seconds |
+./spec/describedby_spec.rb[1:1:1]  | passed | 2.71 seconds    |
+./spec/describedby_spec.rb[1:1:2]  | passed | 1.25 seconds    |
+./spec/describedby_spec.rb[1:1:3]  | passed | 1.22 seconds    |
+./spec/describedby_spec.rb[1:1:4]  | passed | 1.22 seconds    |
+./spec/describedby_spec.rb[1:1:5]  | passed | 1.21 seconds    |
+./spec/describedby_spec.rb[1:1:6]  | passed | 1.02 seconds    |
+./spec/describedby_spec.rb[1:1:7]  | passed | 0.99175 seconds |
+./spec/describedby_spec.rb[1:1:8]  | passed | 2.44 seconds    |
+./spec/describedby_spec.rb[1:1:9]  | passed | 1.72 seconds    |
+./spec/describedby_spec.rb[1:1:10] | passed | 2.15 seconds    |
+./spec/describedby_spec.rb[1:1:11] | passed | 3.17 seconds    |
+./spec/describedby_spec.rb[1:1:12] | passed | 3.1 seconds     |
+./spec/describedby_spec.rb[1:1:13] | passed | 1.7 seconds     |
+./spec/describedby_spec.rb[1:1:14] | passed | 2.21 seconds    |
+./spec/describedby_spec.rb[1:1:15] | passed | 2.18 seconds    |
+./spec/fsp_harvester_spec.rb[1:1]  | passed | 0.00058 seconds |
+./spec/fsp_harvester_spec.rb[1:2]  | failed | 2.92 seconds    |
+./spec/item_spec.rb[1:1:1]         | passed | 3.09 seconds    |
+./spec/item_spec.rb[1:1:2]         | passed | 2.92 seconds    |
+./spec/item_spec.rb[1:1:3]         | passed | 1.12 seconds    |
+./spec/item_spec.rb[1:1:4]         | passed | 1.7 seconds     |
+./spec/item_spec.rb[1:1:5]         | passed | 2.24 seconds    |
+./spec/item_spec.rb[1:1:6]         | passed | 2.87 seconds    |
+./spec/item_spec.rb[1:1:7]         | passed | 3.03 seconds    |
+./spec/item_spec.rb[1:1:8]         | passed | 0.52338 seconds |
+./spec/type_spec.rb[1:1:1]         | passed | 1.42 seconds    |
+./spec/type_spec.rb[1:1:2]         | passed | 1.28 seconds    |
+./spec/type_spec.rb[1:1:3]         | passed | 1.52 seconds    |

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    fsp_harvester (0.1.11)
+    fsp_harvester (0.1.12)
       json (~> 2.0)
       linkeddata (~> 3.2)
       linkheaders-processor (~> 0.1.16)

data/lib/config.conf ADDED Viewed

@@ -0,0 +1,8 @@
+[extruct]
+command="extruct"
+[rdf]
+command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
+[tika]
+command="http://tika:9998/meta"

data/lib/{fsp_metadata_external_tools.rb → external_tools.rb} RENAMED Viewed

@@ -1,12 +1,12 @@
 # frozen_string_literal: true
-module FspHarvester
+module HarvesterTools
   class Error < StandardError
   end
   class ExternalTools
-    def initialize(metadata: FspHarvester::MetadataObject.new)
+    def initialize(metadata: HarvesterTools::MetadataObject.new)
       @meta = metadata
     end
@@ -25,10 +25,7 @@ module FspHarvester
         file.rewind
         @meta.comments << "INFO: The message body is being examined by Distiller\n"
-        # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
-        command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
-        # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
-        # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
+        command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
         warn "distiller command: #{command}"
         result, _stderr, _status = Open3.capture3(command)
         warn ''
@@ -41,12 +38,13 @@ module FspHarvester
         if result !~ /@context/i # failure returns nil
           @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
           @meta.add_warning(['018', '', ''])
+          result = "{}"
         else
           @meta.comments << "INFO: The Distiller found parseable data.  Parsing as JSON-LD\n"
-          parse_rdf(result: result, content_type: "application/ld+json")
         end
         @@distillerknown[bhash] = true
       end
+      result
     end
     def processs_with_extruct(uri:)
@@ -55,6 +53,11 @@ module FspHarvester
       stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
       warn "open3 status: #{status} #{stdout}"
       result = stderr # absurd that the output comes over stderr!  LOL!
+      jsonld = {}
+      microdata = Hash.new
+      microformat = Hash.new
+      opengraph = Hash.new
+      rdfa = Hash.new
       if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
         @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
@@ -66,17 +69,16 @@ module FspHarvester
       elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
         json = JSON.parse result
         @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
-        parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
-        @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
-        @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
-        @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
-        parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
-        @meta.merge_hash(json.first) if json.first.is_a? Hash
+        jsonld = json['json-ld'].to_json if json['json-ld'].any?
+        microdata = json['microdata'].first if json['microdata'].any
+        microformat = json['microformat'].first if json['microformat'].any?
+        opengraph = json['opengraph'].first if json['opengraph'].any?
+        rdfa = json['rdfa'].to_json if json['rdfa'].any?
+        # @meta.merge_hash(json.first) if json.first.is_a? Hash
       else
         @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
       end
+      [jsonld, microdata, microformat, opengraph, rdfa]
     end
   end
 end

data/lib/fsp_harvester/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module FspHarvester
-  VERSION = "0.1.11"
+  VERSION = "0.1.12"
 end

data/lib/fsp_harvester.rb CHANGED Viewed

@@ -1,121 +1,23 @@
-# frozen_string_literal: true
-require_relative 'fsp_harvester/version'
-require 'json/ld'
-require 'json/ld/preloaded'
-require 'json'
-require 'linkheaders/processor'
-require 'addressable'
-require 'tempfile'
-require 'xmlsimple'
-require 'nokogiri'
-require 'parseconfig'
-require 'rest-client'
-require 'cgi'
-require 'digest'
-require 'open3'
-require 'metainspector'
-require 'rdf/xsd'
-require_relative './metadata_object'
-require_relative './constants'
-require_relative './web_utils'
-require_relative './signposting_tests'
-require_relative './fsp_metadata_harvester'
-require_relative './fsp_metadata_parser'
+require_relative 'harvester'
 module FspHarvester
   class Error < StandardError
   end
   class Utils
-    # @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
-    # @warnings = JSON.parse(File.read("warnings.json"))
-    def self.resolve_guid(guid:)
-      @meta = FspHarvester::MetadataObject.new
-      @meta.all_uris = [guid]
-      type, url = convertToURL(guid: guid)
-      links = Array.new
-      if type
-        links = resolve_url(url: url)
-        @meta.links << links
-      else
-        @meta.add_warning(['006', guid, ''])
-        @meta.comments << "FATAL: GUID type not recognized.\n"
-      end
-      [links, @meta]
-    end
-    def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
+    def self.gather_metadata_from_describedby_links(links: [], metadata: HarvesterTools::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
       @meta = metadata
       db = []
       links.each do |l|
         db << l if l.relation == 'describedby'
       end
-      FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta)  # everything is gathered into the @meta metadata object
+      HarvesterTools::MetadataHarvester.extract_metadata(links: db, metadata: @meta)  # everything is gathered into the @meta metadata object
       @meta
     end
-    def self.convertToURL(guid:)
-      GUID_TYPES.each do |k, regex|
-        if k == 'inchi' and regex.match(guid)
-          return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
-        elsif k == 'handle1' and regex.match(guid)
-          return 'handle', "http://hdl.handle.net/#{guid}"
-        elsif k == 'handle2' and regex.match(guid)
-          return 'handle', "http://hdl.handle.net/#{guid}"
-        elsif k == 'uri' and regex.match(guid)
-          return 'uri', guid
-        elsif k == 'doi' and regex.match(guid)
-          return 'doi', "https://doi.org/#{guid}"
-        end
-      end
-      [nil, nil]
-    end
-    def self.typeit(guid:)
-      Utils::GUID_TYPES.each do |type, regex|
-        return type if regex.match(guid)
-      end
-      false
-    end
-    def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
-      @meta.guidtype = 'uri' if @meta.guidtype.nil?
-      warn "\n\n FETCHING #{url} #{header}\n\n"
-      response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
-      warn "\n\n head #{response.headers.inspect}\n\n" if response
-      unless response
-        @meta.add_warning(['001', url, header])
-        @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
-        return []
-      end
-      @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}.  Using the output from this URL for the next few tests..."
-      @meta.full_response << response.body
-      links = process_link_headers(response: response) unless nolinkheaders
-      links
-    end
-    def self.process_link_headers(response:)
-      warn "\n\n parsing #{response.headers}\n\n"
-      parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
-      parser.extract_and_parse(response: response)
-      factory = parser.factory # LinkHeaders::LinkFactory
-      warn "\n\n length bfore #{factory.all_links.length}\n\n"
-      signpostingcheck(factory: factory)
-      warn "\n\n length aftr #{factory.all_links.length}\n\n"
-      warn "\n\n links #{factory.all_links}\n\n"
-      factory.all_links
-    end
-    def self.signpostingcheck(factory:)
+    def self.signpostingcheck(factory:, metadata: HarvesterTools::MetadataObject.new)
+      @meta = metadata
       citeas = Array.new
       describedby = Array.new
       item = Array.new
@@ -134,13 +36,13 @@ module FspHarvester
         end
       end
-      check_describedby_rules(describedby: describedby)
-      check_item_rules(item: item)
+      check_describedby_rules(describedby: describedby, metadata: @meta)
+      check_item_rules(item: item, metadata: @meta)
       if citeas.length > 1
         warn "INFO: multiple cite-as links found. Checking for conflicts\n"
         @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
-        citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
+        citeas = check_for_citeas_conflicts(citeas: citeas, metadata: @meta) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
       end
       unless citeas.length == 1 && describedby.length > 0

data/lib/harvester.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+#require_relative 'fsp_harvester/version'
+require 'json/ld'
+require 'json/ld/preloaded'
+require 'json'
+require 'linkheaders/processor'
+require 'addressable'
+require 'tempfile'
+require 'xmlsimple'
+require 'nokogiri'
+require 'parseconfig'
+require 'rest-client'
+require 'cgi'
+require 'digest'
+require 'open3'
+require 'metainspector'
+require 'rdf/xsd'
+require_relative './metadata_object'
+require_relative './constants'
+require_relative './web_utils'
+require_relative './signposting_tests'
+require_relative './metadata_harvester'
+require_relative './fsp_harvester'
+require_relative './harvester_utils'
+require_relative './external_tools'
+require_relative './metadata_parser'

data/lib/harvester_utils.rb ADDED Viewed

@@ -0,0 +1,75 @@
+module HarvesterTools
+  class Error < StandardError
+  end
+  class Utils
+    def self.resolve_guid(guid:)
+      @meta = HarvesterTools::MetadataObject.new
+      @meta.all_uris = [guid]
+      type, url = convertToURL(guid: guid)
+      links = Array.new
+      if type
+        links = resolve_url(url: url)
+        @meta.links = @meta.links | links
+      else
+        @meta.add_warning(['006', guid, ''])
+        @meta.comments << "FATAL: GUID type not recognized.\n"
+      end
+      [links, @meta]
+    end
+    def self.convertToURL(guid:)
+      GUID_TYPES.each do |k, regex|
+        if k == 'inchi' and regex.match(guid)
+          return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
+        elsif k == 'handle1' and regex.match(guid)
+          return 'handle', "http://hdl.handle.net/#{guid}"
+        elsif k == 'handle2' and regex.match(guid)
+          return 'handle', "http://hdl.handle.net/#{guid}"
+        elsif k == 'uri' and regex.match(guid)
+          return 'uri', guid
+        elsif k == 'doi' and regex.match(guid)
+          return 'doi', "https://doi.org/#{guid}"
+        end
+      end
+      [nil, nil]
+    end
+    def self.typeit(guid:)
+      GUID_TYPES.each do |type, regex|
+        return type if regex.match(guid)
+      end
+      false
+    end
+    def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
+      @meta.guidtype = 'uri' if @meta.guidtype.nil?
+      warn "\n\n FETCHING #{url} #{header}\n\n"
+      response = HarvesterTools::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
+      warn "\n\n head #{response.headers.inspect}\n\n" if response
+      unless response
+        @meta.add_warning(['001', url, header])
+        @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
+        return []
+      end
+      @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}.  Using the output from this URL for the next few tests..."
+      @meta.full_response << response.body
+      links = process_link_headers(response: response) unless nolinkheaders
+      links
+    end
+    def self.process_link_headers(response:)
+      warn "\n\n parsing #{response.headers}\n\n"
+      parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
+      parser.extract_and_parse(response: response)
+      factory = parser.factory # LinkHeaders::LinkFactory
+      FspHarvester::Utils.signpostingcheck(factory: factory, metadata: @meta)
+      factory.all_links
+    end
+  end
+end

data/lib/{fsp_metadata_harvester.rb → metadata_harvester.rb} RENAMED Viewed

@@ -1,17 +1,17 @@
 # frozen_string_literal: true
-module FspHarvester
+module HarvesterTools
   class Error < StandardError
   end
   class MetadataHarvester
-    def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
+    def self.extract_metadata(links: [], metadata: HarvesterTools::MetadataObject.new)
       @meta = metadata
       @meta.comments << 'INFO:  now collecting both linked data and hash-style data using the harvested links'
       describedby = links.select { |l| l if l.relation == 'describedby' }
-      hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
+      hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
       describedby.each do |link|
         accepttype = ACCEPT_STAR_HEADER
         accept = link.respond_to?('type') ? link.type : nil
@@ -30,16 +30,16 @@ module FspHarvester
         case abbreviation
         when 'html'
           @meta.comments << 'INFO: Processing html'
-          hvst.process_html(body: response.body, uri: link)
+          hvst.process_html(body: response.body, uri: link, metadata: @meta)
         when 'xml'
           @meta.comments << 'INFO: Processing xml'
-          hvst.process_xml(body: response.body)
+          hvst.process_xml(body: response.body, metadata: @meta)
         when 'json'
           @meta.comments << 'INFO: Processing json'
-          hvst.process_json(body: response.body)
+          hvst.process_json(body: response.body, metadata: @meta)
         when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
           @meta.comments << 'INFO: Processing linked data'
-          hvst.process_ld(body: response.body, content_type: content_type)
+          hvst.process_ld(body: response.body, content_type: content_type, metadata: @meta)
         when 'specialist'
           warn 'no specialized parsers so far'
         end
@@ -54,7 +54,7 @@ module FspHarvester
         @meta.comments << "INFO:  link #{link.href} has no MIME type, defaulting to */*"
       end
       url = link.href
-      response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
+      response = HarvesterTools::WebUtils.fspfetch(url: url, method: :get, headers: header)
       unless response
         @meta.add_warning(['016', url, header])
         @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"

data/lib/metadata_object.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-module FspHarvester
+module HarvesterTools
   class MetadataObject
     attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response

data/lib/{fsp_metadata_parser.rb → metadata_parser.rb} RENAMED Viewed

@@ -1,6 +1,6 @@
 # frozen_string_literal: true
-module FspHarvester
+module HarvesterTools
   class Error < StandardError
   end
@@ -9,17 +9,25 @@ module FspHarvester
     @@distillerknown = {}
-    def initialize(metadata_object: FspHarvester::MetadataObject.new)
+    def initialize(metadata_object: HarvesterTools::MetadataObject.new)
       @meta = metadata_object
     end
-    def process_html(body:, uri:)
-      tools = FspHarvester::ExternalTools.new(metadata: @meta)
-      tools.process_with_distiller(body: body)
-      tools.process_with_extruct(uri: uri)
+    def process_html(body:, uri:, metadata:)
+      @meta = metadata
+      tools = HarvesterTools::ExternalTools.new(metadata: @meta)
+      result = tools.process_with_distiller(body: body)
+      jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
+      parse_rdf(body: jsonld, content_type: 'application/ld+json')
+      @meta.merge_hash(microdata)
+      @meta.merge_hash(microformat)
+      @meta.merge_hash(opengraph)
+      parse_rdf(body: rdfa, content_type: 'application/ld+json')
     end
-    def process_xml(body:)
+    def process_xml(body:, metadata:)
+      @meta = metadata
       begin
         hash = XmlSimple.xml_in(body)
       rescue
@@ -30,7 +38,8 @@ module FspHarvester
       @meta.hash.merge hash
     end
-    def process_json(body:)
+    def process_json(body:, metadata:)
+      @meta = metadata
       begin
         hash = JSON.parse(body)
       rescue
@@ -41,11 +50,17 @@ module FspHarvester
       @meta.hash.merge hash
     end
-    def process_ld(body:, content_type:)
-      parse_rdf(body: body, content_type: content_type)
+    def process_ld(body:, content_type:, metadata:)
+      @meta = metadata
+      parse_rdf(body: body, content_type: content_type, metadata: @meta)
+    end
+    def parse_rdf(body:, content_type:, metadata:)
+      self.class.parse_rdf(body: body, content_type: content_type, metadata: metadata)
     end
-    def parse_rdf(body:, content_type:)
+    def self.parse_rdf(body:, content_type:, metadata:)
+      @meta = metadata
       unless body
         @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
         @meta.add_warning(['018', '', ''])
@@ -65,7 +80,7 @@ module FspHarvester
         return
       end
-      graph = FspHarvester::Cache.checkRDFCache(body: body)
+      graph = HarvesterTools::Cache.checkRDFCache(body: body)
       if graph.size > 0
         warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
         @meta.merge_rdf(graph.to_a)
@@ -88,7 +103,7 @@ module FspHarvester
           end
           reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
           warn 'WRITING TO CACHE'
-          FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
+          HarvesterTools::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
           warn 'WRITING DONE'
           reader = rdfformat.reader.new(body)  # frustrating that we cannot rewind!
           warn 'RE-READING DONE'

data/lib/signposting_tests.rb CHANGED Viewed

@@ -1,4 +1,5 @@
-def check_for_citeas_conflicts(citeas: )
+def check_for_citeas_conflicts(citeas:, metadata: )
+  @meta = metadata
   @meta.comments << 'INFO: checking for conflicting cite-as links'
   citeas_hrefs = Hash.new
   citeas.each do |link|
@@ -6,7 +7,7 @@ def check_for_citeas_conflicts(citeas: )
     @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
     citeas_hrefs[link.href] = link
   end
+#warn "finalhash #{citeas_hrefs}"
   if citeas_hrefs.length > 1
     @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
     @meta.add_warning(['007', '', ''])
@@ -16,7 +17,8 @@ def check_for_citeas_conflicts(citeas: )
 end
-def check_describedby_rules(describedby:)
+def check_describedby_rules(describedby:, metadata:)
+  @meta = metadata
   describedby.each do |l|
     unless l.respond_to? 'type'
       @meta.add_warning(['005', l.href, ''])
@@ -25,7 +27,7 @@ def check_describedby_rules(describedby:)
     type = l.type if l.respond_to? 'type'
     type ||= '*/*'
     header = { accept: type }
-    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
     if response
       responsetype = response.headers[:content_type]
       @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
@@ -51,7 +53,8 @@ def check_describedby_rules(describedby:)
   end
 end
-def check_item_rules(item:)
+def check_item_rules(item:, metadata:)
+  @meta = metadata
   item.each do |l| # l = LinkHeaders::Link
     unless l.respond_to? 'type'
       @meta.add_warning(['011', l.href, ''])
@@ -60,7 +63,7 @@ def check_item_rules(item:)
     type = l.type if l.respond_to? 'type'
     type ||= '*/*' # this becomes a frozen string
     header = { accept: type }
-    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
     if response
       if response.headers[:content_type] and type != '*/*'

data/lib/swagger.rb CHANGED Viewed

@@ -1,64 +1,39 @@
-class Swagger
-    attr_accessor :debug
-    attr_accessor :title
-    attr_accessor :tests_metric
-    attr_accessor :description
-    attr_accessor :applies_to_principle
-    attr_accessor :organization
-    attr_accessor :org_url
-    attr_accessor :responsible_developer
-    attr_accessor :email
-    attr_accessor :developer_ORCiD
-    attr_accessor :protocol
-    attr_accessor :host
-    attr_accessor :basePath
-    attr_accessor :path
-    attr_accessor :response_description
-    attr_accessor :schemas
-    attr_accessor :comments
-    attr_accessor :fairsharing_key_location
-    attr_accessor :score
-    attr_accessor :testedGUID
-    def initialize(params = {})
-        @debug = params.fetch(:debug, false)
-      @title = params.fetch(:title, 'unnamed')
-      @tests_metric = params.fetch(:tests_metric)
-      @description = params.fetch(:description, 'default_description')
-      @applies_to_principle = params.fetch(:applies_to_principle, 'some principle')
-      @version = params.fetch(:version, "0.1")
-      @organization = params.fetch(:organization, 'Some Organization')
-      @org_url = params.fetch(:org_url)
-      @responsible_develper = params.fetch(:responsible_developer, 'Some Person')
-      @email = params.fetch(:email)
-      @developer_ORCiD = params.fetch(:developer_ORCiD)
-      @host = params.fetch(:host)
-      @protocol = params.fetch(:protocol, "https")
-      @basePath = params.fetch(:basePath)
-      @path = params.fetch(:path)
-      @response_description = params.fetch(:response_description)
-      @schemas = params.fetch(:schemas, [])
-      @comments = params.fetch(:comments, [])
-      @fairsharing_key_location = params.fetch(:fairsharing_key_location)
-      @score = params.fetch(:score, 0)
-      @testedGUID = params.fetch(:testedGUID, "")
-    end
-    def fairsharing_key
-        return @fairsharing_key_location
-    end
-    def getSwagger
-  message = <<"EOF_EOF"
+class Swagger
+  attr_accessor :debug, :title, :tests_metric, :description, :applies_to_principle, :organization, :org_url,
+                :responsible_developer, :email, :developer_ORCiD, :protocol, :host, :basePath, :path,
+                :response_description, :schemas, :comments, :fairsharing_key_location, :score, :testedGUID
+  def initialize(params = {})
+    @debug = params.fetch(:debug, false)
+    @title = params.fetch(:title, 'unnamed')
+    @tests_metric = params.fetch(:tests_metric)
+    @description = params.fetch(:description, 'default_description')
+    @applies_to_principle = params.fetch(:applies_to_principle, 'some principle')
+    @version = params.fetch(:version, '0.1')
+    @organization = params.fetch(:organization, 'Some Organization')
+    @org_url = params.fetch(:org_url)
+    @responsible_develper = params.fetch(:responsible_developer, 'Some Person')
+    @email = params.fetch(:email)
+    @developer_ORCiD = params.fetch(:developer_ORCiD)
+    @host = params.fetch(:host)
+    @protocol = params.fetch(:protocol, 'https')
+    @basePath = params.fetch(:basePath)
+    @path = params.fetch(:path)
+    @response_description = params.fetch(:response_description)
+    @schemas = params.fetch(:schemas, [])
+    @comments = params.fetch(:comments, [])
+    @fairsharing_key_location = params.fetch(:fairsharing_key_location)
+    @score = params.fetch(:score, 0)
+    @testedGUID = params.fetch(:testedGUID, '')
+  end
+  def fairsharing_key
+    @fairsharing_key_location
+  end
+  def getSwagger
+    message = <<"EOF_EOF"
   swagger: '2.0'
   info:
    version: '#{@version}'
@@ -89,7 +64,7 @@ class Swagger
           $ref: '#/definitions/schemas'
      consumes:
        - application/json
-     produces:
+     produces:#{'  '}
        - application/json
      responses:
        "200":
@@ -98,127 +73,112 @@ class Swagger
   definitions:
     schemas:
       required:
-  EOF_EOF
-      self.schemas.keys.each do |key|
-        message += "     - #{key}\n"
+EOF_EOF
+    schemas.keys.each do |key|
+      message += "     - #{key}\n"
+    end
+    message += "    properties:\n"
+    schemas.keys.each do |key|
+      message += "        #{key}:\n"
+      message += "          type: #{schemas[key][0]}\n"
+      message += "          description: >-\n"
+      message += "            #{schemas[key][1]}\n"
+    end
+    message
+  end
+  # A utility function that SHOULD NOT BE CALLED EXTERNALLY
+  #
+  # @param s - subject node
+  # @param p - predicate node
+  # @param o - object node
+  # @param repo - an RDF::Graph object
+  def triplify(s, p, o, repo)
+    s = s.strip if s.instance_of?(String)
+    p = p.strip if p.instance_of?(String)
+    o = o.strip if o.instance_of?(String)
+    unless s.respond_to?('uri')
+      if s.to_s =~ %r{^\w+:/?/?[^\s]+}
+        s = RDF::URI.new(s.to_s)
+      else
+        debug and warn "Subject #{s} must be a URI-compatible thingy"
+        abort "Subject #{s} must be a URI-compatible thingy"
       end
-      message += "    properties:\n"
-      self.schemas.keys.each do |key|
-            message += "        #{key}:\n"
-            message += "          type: #{self.schemas[key][0]}\n"
-            message += "          description: >-\n"
-            message += "            #{self.schemas[key][1]}\n"
+    end
+    unless p.respond_to?('uri')
+      if p.to_s =~ %r{^\w+:/?/?[^\s]+}
+        p = RDF::URI.new(p.to_s)
+      else
+        debug and warn "Predicate #{p} must be a URI-compatible thingy"
+        abort "Predicate #{p} must be a URI-compatible thingy"
       end
-      return message
     end
-      # A utility function that SHOULD NOT BE CALLED EXTERNALLY
-      #
-      # @param s - subject node
-      # @param p - predicate node
-      # @param o - object node
-      # @param repo - an RDF::Graph object
-      def triplify(s, p, o, repo)
-        if s.class == String
-                s = s.strip
-        end
-        if p.class == String
-                p = p.strip
-        end
-        if o.class == String
-                o = o.strip
-        end
-        unless s.respond_to?('uri')
-          if s.to_s =~ /^\w+:\/?\/?[^\s]+/
-                  s = RDF::URI.new(s.to_s)
-          else
-            self.debug and $stderr.puts "Subject #{s.to_s} must be a URI-compatible thingy"
-            abort "Subject #{s.to_s} must be a URI-compatible thingy"
-          end
-        end
-        unless p.respond_to?('uri')
-          if p.to_s =~ /^\w+:\/?\/?[^\s]+/
-                  p = RDF::URI.new(p.to_s)
-          else
-            self.debug and $stderr.puts "Predicate #{p.to_s} must be a URI-compatible thingy"
-            abort "Predicate #{p.to_s} must be a URI-compatible thingy"
-          end
-        end
-        unless o.respond_to?('uri')
-          if o.to_s =~ /\A\w+:\/?\/?\w[^\s]+/
-                  o = RDF::URI.new(o.to_s)
+    unless o.respond_to?('uri')
+      o = if o.to_s =~ %r{\A\w+:/?/?\w[^\s]+}
+            RDF::URI.new(o.to_s)
           elsif o.to_s =~ /^\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d/
-                  o = RDF::Literal.new(o.to_s, :datatype => RDF::XSD.date)
+            RDF::Literal.new(o.to_s, datatype: RDF::XSD.date)
           elsif o.to_s =~ /^[+-]?\d+\.\d+/
-                  o = RDF::Literal.new(o.to_s, :datatype => RDF::XSD.float)
+            RDF::Literal.new(o.to_s, datatype: RDF::XSD.float)
           elsif o.to_s =~ /^[+-]?[0-9]+$/
-                  o = RDF::Literal.new(o.to_s, :datatype => RDF::XSD.int)
+            RDF::Literal.new(o.to_s, datatype: RDF::XSD.int)
           else
-                  o = RDF::Literal.new(o.to_s, :language => :en)
+            RDF::Literal.new(o.to_s, language: :en)
           end
-        end
-        self.debug and $stderr.puts("\n\ninserting #{s.to_s} #{p.to_s} #{o.to_s}\n\n")
-        triple = RDF::Statement(s, p, o)
-        repo.insert(triple)
-        return true
-      end
-      # A utility function that SHOULD NOT BE CALLED EXTERNALLY
-      #
-      # @param s - subject node
-      # @param p - predicate node
-      # @param o - object node
-      # @param repo - an RDF::Graph object
-      def Swagger.triplify(s, p, o, repo)
-        return triplify(s,p,o,repo)
-      end
-      def addComment(newcomment)
-            self.comments << newcomment.to_s
-            #return self.comments
-      end
-    def createEvaluationResponse
-      g = RDF::Graph.new
-      dt = Time.now.iso8601
-      uri = self.testedGUID
-      me = self.protocol + "://" + self.host + "/" + self.basePath + self.path
-       meURI  ="#{me}##{uri}/result-#{dt}"
-       meURI  =Addressable::URI.escape(meURI)
-      triplify(meURI, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://fairmetrics.org/resources/metric_evaluation_result", g );
-      triplify(meURI, "http://semanticscience.org/resource/SIO_000300", self.score, g )
-      triplify(meURI, "http://purl.obolibrary.org/obo/date", dt, g )
-      triplify(meURI, "http://schema.org/softwareVersion", VERSION, g )
-      triplify(meURI,"http://semanticscience.org/resource/SIO_000332", uri, g)
-      comments = "no comments received.  "
-      comments = self.comments.join("\n") if self.comments.size > 0
-      triplify(meURI, "http://schema.org/comment", comments, g)
-      return g.dump(:jsonld)
-    end
+    end
+    debug and warn("\n\ninserting #{s} #{p} #{o}\n\n")
+    triple = RDF::Statement(s, p, o)
+    repo.insert(triple)
+    true
+  end
+  # A utility function that SHOULD NOT BE CALLED EXTERNALLY
+  #
+  # @param s - subject node
+  # @param p - predicate node
+  # @param o - object node
+  # @param repo - an RDF::Graph object
+  def self.triplify(s, p, o, repo)
+    triplify(s, p, o, repo)
+  end
+  def addComment(newcomment)
+    comments << newcomment.to_s
+    # return self.comments
+  end
+  def createEvaluationResponse
+    g = RDF::Graph.new
+    dt = Time.now.iso8601
+    uri = testedGUID
+    me = protocol + '://' + host + '/' + basePath + path
+    meURI = "#{me}##{uri}/result-#{dt}"
+    meURI = Addressable::URI.escape(meURI)
+    triplify(meURI, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
+             'http://fairmetrics.org/resources/metric_evaluation_result', g)
+    triplify(meURI, 'http://semanticscience.org/resource/SIO_000300', score, g)
+    triplify(meURI, 'http://purl.obolibrary.org/obo/date', dt, g)
+    triplify(meURI, 'http://schema.org/softwareVersion', VERSION, g)
+    triplify(meURI, 'http://semanticscience.org/resource/SIO_000332', uri, g)
+    comments = 'no comments received.  '
+    comments = self.comments.join("\n") if self.comments.size > 0
+    triplify(meURI, 'http://schema.org/comment', comments, g)
+    g.dump(:jsonld)
   end
+end

data/lib/web_utils.rb CHANGED Viewed

@@ -1,7 +1,7 @@
-module FspHarvester
+module HarvesterTools
   class WebUtils
-    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
+    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
       warn 'In fetch routine now.  '
       begin

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fsp_harvester
 version: !ruby/object:Gem::Version
-  version: 0.1.11
+  version: 0.1.12
 platform: ruby
 authors:
 - Mark Wilkinson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-08-08 00:00:00.000000000 Z
+date: 2022-08-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json
@@ -182,15 +182,18 @@ files:
 - bin/setup
 - example_test.rb
 - launch.json
+- lib/config.conf
 - lib/config.conf_docker
 - lib/config.conf_local
 - lib/constants.rb
+- lib/external_tools.rb
 - lib/fsp_harvester.rb
 - lib/fsp_harvester/version.rb
-- lib/fsp_metadata_external_tools.rb
-- lib/fsp_metadata_harvester.rb
-- lib/fsp_metadata_parser.rb
+- lib/harvester.rb
+- lib/harvester_utils.rb
+- lib/metadata_harvester.rb
 - lib/metadata_object.rb
+- lib/metadata_parser.rb
 - lib/signposting_tests.rb
 - lib/swagger.rb
 - lib/warnings.json