RubyGems - fsp_harvester - Versions diffs - 0.1.7 → 0.1.9 - Mend

fsp_harvester 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +4 -4
data/.rspec_status +55 -0
data/Gemfile.lock +5 -4
data/launch.json +11 -0
data/lib/config.conf_docker +8 -0
data/lib/config.conf_local +8 -0
data/lib/constants.rb +12 -13
data/lib/fsp_harvester/version.rb +1 -1
data/lib/fsp_harvester.rb +30 -8
data/lib/fsp_metadata_external_tools.rb +82 -0
data/lib/fsp_metadata_harvester.rb +164 -0
data/lib/fsp_metadata_parser.rb +109 -0
data/lib/metadata_object.rb +96 -4
data/lib/signposting_tests.rb +87 -0
data/lib/warnings.json +38 -2
data/lib/web_utils.rb +13 -13
metadata +12 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7da32f3321193e93f64154c35db0c840f5e7f086451660f99b62d3f4c834e295
-  data.tar.gz: 5a0a1ff4ef6b2100accd8bab4f20d6842d25ebf88d5e600e646804da9ed24bd9
+  metadata.gz: 923ccb362ef4a71fa2f221b1b224dbd5d3ec78a14cc9da0e12a1e0df804162ff
+  data.tar.gz: e70a9a994c504a0867ab10e05c07d440a76677d7ff27e27b9bdb0c3338c02ffe
 SHA512:
-  metadata.gz: dad90f81b73489a151220c132d74508832573a352b209a537bbbaf6543a90b9e132cca80ecca33d15d02eca91841642d73e795113076f4564730194b5bf1fa53
-  data.tar.gz: 5d11c2f002f4e73a4971aec0d047cd14d35f6763155d0effa9c79f419ff6fd26f853158b290d2264edd148633570f5ecd50d4e1f6c7e9d136c7d2880517bdefc
+  metadata.gz: d819cb1c19d40f8a093b723cbeb4273d122d3207df2d874558327fa6a2622be1bf2c671cc9dfebee74c689722825c2dd1957f8be6bfcf2c14c09097c1dc05a5b
+  data.tar.gz: 508a484d93eab373d0389c11f4361068d5586759a73573901ea54f4d2a028f713f71bb492976785499f1074eb8b359d79be550c09b196eb82838a1e401398fc4

data/.rspec_status ADDED Viewed

@@ -0,0 +1,55 @@
+example_id                         | status | run_time        |
+---------------------------------- | ------ | --------------- |
+./spec/cite-as_spec.rb[1:1:1]      | passed | 1.17 seconds    |
+./spec/cite-as_spec.rb[1:1:2]      | passed | 0.98776 seconds |
+./spec/cite-as_spec.rb[1:1:3]      | passed | 0.69753 seconds |
+./spec/cite-as_spec.rb[1:1:4]      | passed | 1.31 seconds    |
+./spec/cite-as_spec.rb[1:1:5]      | passed | 2.07 seconds    |
+./spec/cite-as_spec.rb[1:1:6]      | passed | 1.45 seconds    |
+./spec/cite-as_spec.rb[1:1:7]      | passed | 2.75 seconds    |
+./spec/cite-as_spec.rb[1:1:8]      | passed | 1.83 seconds    |
+./spec/cite-as_spec.rb[1:1:9]      | passed | 2.51 seconds    |
+./spec/cite-as_spec.rb[1:1:10]     | passed | 1.73 seconds    |
+./spec/cite-as_spec.rb[1:1:11]     | passed | 2.35 seconds    |
+./spec/cite-as_spec.rb[1:1:12]     | passed | 2.01 seconds    |
+./spec/cite-as_spec.rb[1:1:13]     | passed | 2.56 seconds    |
+./spec/cite-as_spec.rb[1:1:14]     | passed | 1.68 seconds    |
+./spec/cite-as_spec.rb[1:1:15]     | passed | 1.06 seconds    |
+./spec/cite-as_spec.rb[1:1:16]     | passed | 1.03 seconds    |
+./spec/cite-as_spec.rb[1:1:17]     | passed | 0.94321 seconds |
+./spec/cite-as_spec.rb[1:1:18]     | passed | 1.1 seconds     |
+./spec/cite-as_spec.rb[1:1:19]     | passed | 1.45 seconds    |
+./spec/cite-as_spec.rb[1:1:20]     | passed | 1.53 seconds    |
+./spec/cite-as_spec.rb[1:1:21]     | passed | 1.64 seconds    |
+./spec/cite-as_spec.rb[1:1:22]     | passed | 1.01 seconds    |
+./spec/cite-as_spec.rb[1:1:23]     | passed | 1.09 seconds    |
+./spec/cite-as_spec.rb[1:1:24]     | failed | 1.22 seconds    |
+./spec/cite-as_spec.rb[1:1:25]     | passed | 0.38248 seconds |
+./spec/describedby_spec.rb[1:1:1]  | passed | 2.24 seconds    |
+./spec/describedby_spec.rb[1:1:2]  | passed | 1.08 seconds    |
+./spec/describedby_spec.rb[1:1:3]  | passed | 1 second        |
+./spec/describedby_spec.rb[1:1:4]  | passed | 1.14 seconds    |
+./spec/describedby_spec.rb[1:1:5]  | passed | 1.03 seconds    |
+./spec/describedby_spec.rb[1:1:6]  | passed | 0.81364 seconds |
+./spec/describedby_spec.rb[1:1:7]  | passed | 0.77543 seconds |
+./spec/describedby_spec.rb[1:1:8]  | passed | 2.01 seconds    |
+./spec/describedby_spec.rb[1:1:9]  | passed | 1.35 seconds    |
+./spec/describedby_spec.rb[1:1:10] | passed | 1.73 seconds    |
+./spec/describedby_spec.rb[1:1:11] | passed | 2.36 seconds    |
+./spec/describedby_spec.rb[1:1:12] | passed | 2.73 seconds    |
+./spec/describedby_spec.rb[1:1:13] | passed | 1.5 seconds     |
+./spec/describedby_spec.rb[1:1:14] | passed | 1.8 seconds     |
+./spec/describedby_spec.rb[1:1:15] | passed | 1.65 seconds    |
+./spec/fsp_harvester_spec.rb[1:1]  | passed | 0.00053 seconds |
+./spec/fsp_harvester_spec.rb[1:2]  | passed | 1.76 seconds    |
+./spec/item_spec.rb[1:1:1]         | passed | 2.08 seconds    |
+./spec/item_spec.rb[1:1:2]         | passed | 2.27 seconds    |
+./spec/item_spec.rb[1:1:3]         | passed | 1.22 seconds    |
+./spec/item_spec.rb[1:1:4]         | passed | 1.61 seconds    |
+./spec/item_spec.rb[1:1:5]         | passed | 1.74 seconds    |
+./spec/item_spec.rb[1:1:6]         | passed | 1.95 seconds    |
+./spec/item_spec.rb[1:1:7]         | passed | 3.59 seconds    |
+./spec/item_spec.rb[1:1:8]         | passed | 0.41001 seconds |
+./spec/type_spec.rb[1:1:1]         | passed | 1.14 seconds    |
+./spec/type_spec.rb[1:1:2]         | passed | 0.94799 seconds |
+./spec/type_spec.rb[1:1:3]         | passed | 1.04 seconds    |

data/Gemfile.lock CHANGED Viewed

@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    fsp_harvester (0.1.7)
+    fsp_harvester (0.1.9)
       json (~> 2.0)
       linkeddata (~> 3.2)
-      linkheaders-processor (~> 0.1.13)
+      linkheaders-processor (~> 0.1.15)
       metainspector (~> 5.11.2)
       parseconfig (~> 1.1)
       rake (~> 13.0)
@@ -126,10 +126,11 @@ GEM
       shex (~> 0.7)
       sparql (~> 3.2)
       sparql-client (~> 3.2)
-    linkheaders-processor (0.1.13)
+    linkheaders-processor (0.1.15)
       json (~> 2.0)
       json-ld (~> 3.2)
       json-ld-preloaded (~> 3.2)
+      link_header (~> 0.0.8)
       metainspector (~> 5.11.2)
       rest-client (~> 2.1)
       securerandom (~> 0.1.0)
@@ -248,7 +249,7 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.11.0)
     rspec-support (3.11.0)
-    rubocop (1.32.0)
+    rubocop (1.33.0)
       json (~> 2.3)
       parallel (~> 1.10)
       parser (>= 3.1.0.0)

data/launch.json ADDED Viewed

@@ -0,0 +1,11 @@
+{
+    "name": "RSpec - all",
+    "type": "Ruby",
+    "request": "launch",
+    "cwd": "${workspaceRoot}",
+    "program": "/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rspec",
+    "args": [
+      "-I",
+      "${workspaceRoot}"
+    ]
+  }

data/lib/config.conf_docker ADDED Viewed

@@ -0,0 +1,8 @@
+[extruct]
+command="extruct"
+[rdf]
+command="/usr/local/bundle/bin/rdf"
+[tika]
+command="http://tika:9998/meta"

data/lib/config.conf_local ADDED Viewed

@@ -0,0 +1,8 @@
+[extruct]
+command="extruct"
+[rdf]
+command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
+[tika]
+command="http://tika:9998/meta"

data/lib/constants.rb CHANGED Viewed

@@ -1,17 +1,20 @@
 ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
+ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
 TEXT_FORMATS = {
   'text' => ['text/plain']
 }
 RDF_FORMATS = {
-  'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'],  # NEW FOR DATACITE
+  'jsonld' => ['application/ld+json','application/x-ld+json', 'application/vnd.schemaorg.ld+json'],  # NEW FOR DATACITE
   'turtle' => ['text/turtle', 'application/n3', 'application/rdf+n3',
                'application/turtle', 'application/x-turtle', 'text/n3', 'text/turtle',
                'text/rdf+n3', 'text/rdf+turtle'],
   # 'rdfa'    => ['text/xhtml+xml', 'application/xhtml+xml'],
   'rdfxml' => ['application/rdf+xml'],
-  'triples' => ['application/n-triples', 'application/n-quads', 'application/trig']
+  'ntriples' => ['application/n-triples', 'application/trig'],
+  'nquads' => ['application/n-quads']
 }
 XML_FORMATS = {
@@ -73,12 +76,10 @@ GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
                'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
 CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
-if CONFIG['extruct'] && CONFIG['extruct']['command'] && !CONFIG['extruct']['command'].empty?
-  extruct = config['extruct']['command']
-end
-extruct = 'extruct' unless @extruct_command
+extruct = CONFIG.dig(:extruct, :command)
+extruct ||= 'extruct'
 extruct.strip!
-case @extruct
+case extruct
 when /[&|;`$\s]/
   abort 'The Extruct command in the config file appears to be subject to command injection.  I will not continue'
 when /echo/i
@@ -86,8 +87,8 @@ when /echo/i
 end
 EXTRUCT_COMMAND = extruct
-rdf_command = CONFIG['rdf']['command'] if CONFIG['rdf'] && CONFIG['rdf']['command'] && !CONFIG['rdf']['command'].empty?
-rdf_command = 'rdf' unless @rdf_command
+rdf_command = CONFIG.dig(:rdf, :command)
+rdf_command ||= 'rdf'
 rdf_command.strip
 case rdf_command
 when /[&|;`$\s]/
@@ -99,8 +100,6 @@ when !(/rdf$/ =~ $_)
 end
 RDF_COMMAND = rdf_command
-if CONFIG['tika'] && CONFIG['tika']['command'] && !CONFIG['tika']['command'].empty?
-  tika_command = CONFIG['tika']['command']
-end
-tika_command = 'http://localhost:9998/meta' unless @tika_command
+tika_command = CONFIG.dig(:tika, :command)
+tika_command ||= 'http://localhost:9998/meta'
 TIKA_COMMAND = tika_command

data/lib/fsp_harvester/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module FspHarvester
-  VERSION = "0.1.7"  # up to date
+  VERSION = "0.1.9"
 end

data/lib/fsp_harvester.rb CHANGED Viewed

@@ -20,6 +20,9 @@ require_relative './metadata_object'
 require_relative './constants'
 require_relative './web_utils'
 require_relative './signposting_tests'
+require_relative './fsp_metadata_harvester'
+require_relative './fsp_metadata_parser'
 module FspHarvester
   class Error < StandardError
@@ -32,11 +35,12 @@ module FspHarvester
     def self.resolve_guid(guid:)
       @meta = FspHarvester::MetadataObject.new
-      @meta.finalURI = [guid]
+      @meta.all_uris = [guid]
       type, url = convertToURL(guid: guid)
       links = Array.new
       if type
         links = resolve_url(url: url)
+        @meta.links << links
       else
         @meta.warnings << ['006', guid, '']
         @meta.comments << "FATAL: GUID type not recognized.\n"
@@ -44,6 +48,16 @@ module FspHarvester
       [links, @meta]
     end
+    def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
+      @meta = metadata
+      db = []
+      links.each do |l|
+        db << l if l.relation == 'describedby'
+      end
+      FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta)  # everything is gathered into the @meta metadata object
+      @meta
+    end
     def self.convertToURL(guid:)
       GUID_TYPES.each do |k, regex|
         if k == 'inchi' and regex.match(guid)
@@ -68,10 +82,10 @@ module FspHarvester
       false
     end
-    def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
+    def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
       @meta.guidtype = 'uri' if @meta.guidtype.nil?
       warn "\n\n FETCHING #{url} #{header}\n\n"
-      response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
+      response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
       warn "\n\n head #{response.headers.inspect}\n\n" if response
       unless response
@@ -80,7 +94,7 @@ module FspHarvester
         return []
       end
-      @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.finalURI.last}.  Using the output from this URL for the next few tests..."
+      @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}.  Using the output from this URL for the next few tests..."
       @meta.full_response << response.body
       links = process_link_headers(response: response) unless nolinkheaders
@@ -90,7 +104,7 @@ module FspHarvester
     def self.process_link_headers(response:)
       warn "\n\n parsing #{response.headers}\n\n"
-      parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
+      parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
       parser.extract_and_parse(response: response)
       factory = parser.factory # LinkHeaders::LinkFactory
@@ -105,6 +119,8 @@ module FspHarvester
       citeas = Array.new
       describedby = Array.new
       item = Array.new
+      types = Array.new
       factory.all_links.each do |l|
         case l.relation
         when 'cite-as'
@@ -113,23 +129,29 @@ module FspHarvester
           item << l
         when 'describedby'
           describedby << l
+        when 'type'
+          types << l
         end
       end
       check_describedby_rules(describedby: describedby)
       check_item_rules(item: item)
-      uniqueciteas = Array.new
       if citeas.length > 1
         warn "INFO: multiple cite-as links found. Checking for conflicts\n"
         @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
-        uniqueciteas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
+        citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
       end
-      unless uniqueciteas == 1 && describedby.length > 0
+      unless citeas.length == 1 && describedby.length > 0
         @meta.warnings << ['004', '', '']
         @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
       end
+      unless types.length >=1
+        @meta.warnings << ['015', '', '']
+        @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
+      end
     end
   end
 end

data/lib/fsp_metadata_external_tools.rb ADDED Viewed

@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+module FspHarvester
+  class Error < StandardError
+  end
+  class ExternalTools
+    # Creates a tool runner bound to a metadata object.
+    #
+    # @param metadata [FspHarvester::MetadataObject] receives the comments and
+    #   warnings produced by the tools; a new object is created when omitted.
+    def initialize(metadata: FspHarvester::MetadataObject.new)
+      @meta = metadata
+    end
+    def process_with_distiller(body:)
+      bhash = Digest::SHA256.hexdigest(body)
+      if @@distillerknown[bhash]
+        @meta.comments << "INFO: data is already parsed by distiller.\n"
+        #parse_rdf(body: body)
+      else
+        @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
+        file = Tempfile.new('foo', encoding: 'UTF-8')
+        body = body.force_encoding('UTF-8')
+        body.scrub!
+        body = body.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"') # a bug in distiller, apparently
+        file.write(body)
+        file.rewind
+        @meta.comments << "INFO: The message body is being examined by Distiller\n"
+        # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
+        command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
+        warn "distiller command: #{command}"
+        result, _stderr, _status = Open3.capture3(command)
+        warn ''
+        warn "distiller errors: #{stderr}"
+        file.close
+        file.unlink
+        result = result.force_encoding('UTF-8')
+        warn "DIST RESULT: #{result}"
+        if result !~ /@context/i # failure returns nil
+          @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
+          @meta.warnings << ['018', '', '']
+        else
+          @meta.comments << "INFO: The Distiller found parseable data.  Parsing as JSON-LD\n"
+          parse_rdf(result: result, content_type: "application/ld+json")
+        end
+        @@distillerknown[bhash] = true
+      end
+    end
+    def processs_with_extruct(uri:)
+      @meta.comments << "INFO:  Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
+      warn 'begin open3'
+      stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
+      warn "open3 status: #{status} #{stdout}"
+      result = stderr # absurd that the output comes over stderr!  LOL!
+      if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
+        @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
+        @meta.warnings << ['019', '', '']
+        if result.to_s.match(/(ValueError:.*?)\n/)
+          @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
+          @meta.warnings << ['019', '', '']
+        end
+      elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
+        json = JSON.parse result
+        @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
+        parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
+        @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
+        @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
+        @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
+        parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
+        @meta.merge_hash(json.first) if json.first.is_a? Hash
+      else
+        @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
+      end
+    end
+  end
+end

data/lib/fsp_metadata_harvester.rb ADDED Viewed

@@ -0,0 +1,164 @@
+# frozen_string_literal: true
+module FspHarvester
+  class Error < StandardError
+  end
+  class MetadataHarvester
+    def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
+      @meta = metadata
+      @meta.comments << 'INFO:  now collecting both linked data and hash-style data using the harvested links'
+      describedby = links.select { |l| l if l.relation == 'describedby' }
+      hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
+      describedby.each do |link|
+        accepttype = ACCEPT_STAR_HEADER
+        accept = link.respond_to?('type') ? link.type : nil
+        accepttype = { 'Accept' => accept } if accept
+        response = attempt_to_resolve(link: link, headers: accepttype)
+        abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
+        unless abbreviation
+          @meta.warnings << ['017', url, header]
+          @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized.  Processing will end now.\n"
+          next
+        end
+        # process according to detected type
+        case abbreviation
+        when 'html'
+          @meta.comments << 'INFO: Processing html'
+          hvst.process_html(body: response.body, uri: link)
+        when 'xml'
+          @meta.comments << 'INFO: Processing xml'
+          hvst.process_xml(body: response.body)
+        when 'json'
+          @meta.comments << 'INFO: Processing json'
+          hvst.process_json(body: response.body)
+        when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
+          @meta.comments << 'INFO: Processing linked data'
+          hvst.process_ld(body: response.body, content_type: content_type)
+        when 'specialist'
+          warn 'no specialized parsers so far'
+        end
+      end
+    end
+    def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
+      @meta.comments << "INFO:  link #{link.href} being processed"
+      if link.respond_to? 'type'
+        header = { 'Accept' => link.type }
+      else
+        @meta.comments << "INFO:  link #{link.href} has no MIME type, defaulting to */*"
+      end
+      url = link.href
+      response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
+      unless response
+        @meta.warnings << ['016', url, header]
+        @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
+      end
+      response
+    end
+    def self.attempt_to_detect_type(body:, headers:)
+      #  described by should be an html, xml, json, or linked data document
+      abbreviation = nil
+      content_type = nil
+      @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
+      if body =~ /^\s*<\?xml/
+        if body =~ /<HTML/i
+          abbreviation = 'html'
+          content_type = 'text/html'
+          @meta.comments << 'INFO: appears to be HTML\n'
+        elsif body =~ /<rdf:RDF/i
+          abbreviation = 'rdfxml'
+          content_type = 'application/rdf+xml'
+          @meta.comments << 'INFO: appears to be RDF-XML\n'
+        else
+          abbreviation = 'xml'
+          content_type = 'application/xml'
+          @meta.comments << 'INFO: appears to be XML\n'
+        end
+      else
+        abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
+        abbreviation, content_type = check_json(body: body) unless abbreviation
+      end
+      unless content_type
+        @meta.warnings << ['017', url, header]
+        @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized.  Processing will end now.\n"
+      end
+      [abbreviation, content_type]
+    end
+    def self.check_ld(body:, claimed_type:)
+      detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
+      unless detected_type
+        detected_type = RDF::Format.for({ sample: body[0..5000] })
+        @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
+      end
+      contenttype = ''
+      abbreviation = ''
+      if detected_type
+        contenttype = detected_type.content_type.first # comes back as array
+        abbreviation = abbreviate_type(contenttype: contenttype)
+        @meta.comments << "INFO: using content-type #{contenttype}.\n"
+      else
+        @meta.comments << "INFO: metadata does not appear to be in a linked data format.  Trying other options.\n"
+      end
+      [abbreviation, contenttype]
+    end
+    def self.ntriples_hack(body:)  # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
+      detected_type = nil
+      body.split.each do |line|
+        line.strip!
+        next if line.empty?
+        if line =~ %r{\s*<[^>]+>\s*<[^>]+>\s\S+}
+          @meta.comments << "INFO: running ntriples hack on  #{line + " ."}\n"
+          detected_type = RDF::Format.for({ sample: "#{line} ." })  # adding a period allows detection of ntriples by distiller
+          break
+        end
+      end
+      @meta.comments << "INFO: ntriples hack found: #{detected_type.to_s}\n"
+      if detected_type != RDF::NTriples::Format   # only return the hacky case
+        return nil
+      end
+      return detected_type
+    end
+    def self.check_json(body:)
+      abbreviation = nil
+      parsed = nil
+      begin
+        parsed = JSON.parse(body)
+      rescue StandardError
+        abbreviation = nil
+      end
+      if parsed
+        abbreviation = 'json'
+      else
+        @meta.comments << "INFO: metadata does not appear to be in JSON format.  No options left.\n"
+      end
+      [abbreviation, 'application/ld+json']
+    end
+    def self.abbreviate_type(contenttype:)
+      foundtype = nil
+      RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
+        warn "\n\ntype #{type}\nvals #{vals}\n\n"
+        @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
+        next unless vals.include? contenttype
+        foundtype = type
+        @meta.comments << "INFO: detected a #{type} MIME type"
+        break
+      end
+      foundtype
+    end
+  end
+end

data/lib/fsp_metadata_parser.rb ADDED Viewed

@@ -0,0 +1,109 @@
+# frozen_string_literal: true
+module FspHarvester
+  class Error < StandardError
+  end
+  class MetadataParser
+    # attr_accessor :distillerknown
+    @@distillerknown = {}
+    # Creates a parser bound to a metadata object.
+    #
+    # @param metadata_object [FspHarvester::MetadataObject] receives comments,
+    #   warnings and parsed metadata; a new object is created when omitted.
+    def initialize(metadata_object: FspHarvester::MetadataObject.new)
+      @meta = metadata_object
+    end
+    # Examines an HTML document with both external tools: the distiller is given
+    # the body, extruct is given the address.
+    #
+    # @param body [String] the HTML message body
+    # @param uri the address of the document, passed through to extruct
+    def process_html(body:, uri:)
+      tools = FspHarvester::ExternalTools.new(metadata: @meta)
+      tools.process_with_distiller(body: body)
+      tools.process_with_extruct(uri: uri)
+    end
+    def process_xml(body:)
+      begin
+        hash = XmlSimple.xml_in(body)
+      rescue
+        @meta.comments << "CRITICAL: Malformed XML detected.  Cannot process metadata.\n"
+        @meta.warnings << ['020', '', '']
+      end
+      @meta.comments << "INFO: The XML is being merged in the metadata object\n"
+      @meta.hash.merge hash
+    end
+    def process_json(body:)
+      begin
+        hash = JSON.parse(body)
+      rescue
+        @meta.comments << "CRITICAL: Malformed JSON detected.  Cannot process metadata.\n"
+        @meta.warnings << ['021', '', '']
+      end
+      @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
+      @meta.hash.merge hash
+    end
+    # Handles a linked data body by passing it straight to parse_rdf.
+    #
+    # @param body [String] the serialised linked data
+    # @param content_type [String] the MIME type used to select a parser
+    def process_ld(body:, content_type:)
+      parse_rdf(body: body, content_type: content_type)
+    end
+    def parse_rdf(body:, content_type:)
+      unless body
+        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+        @meta.warnings << ['018', '', '']
+        return
+      end
+      unless body.match(/\w/)
+        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+        @meta.warnings << ['018', '', '']
+        return
+      end
+      rdfformat = RDF::Format.for(content_type: content_type)
+      unless rdfformat
+        @meta.comments << "CRITICAL: Found what appears to be RDF (sample:  #{body[0..300].delete!("\n")}), but it could not find a parser.  Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
+        @meta.warnings << ['018', '', '']
+        return
+      end
+      graph = FspHarvester::Cache.checkRDFCache(body: body)
+      if graph.size > 0
+        warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
+        @meta.merge_rdf(graph.to_a)
+      else
+        warn "\n\n\nfound format #{rdfformat}\n\n"
+        @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
+        reader = ''
+        begin
+          reader = rdfformat.reader.new(body)
+        rescue Exception => e
+          @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}).  This likely indicates some syntax error in the data.  As a result, no metadata will be extracted from this message.\n"
+          @meta.warnings << ['018', '', '']
+          return
+        end
+        begin
+          if reader.size == 0
+            @meta.comments << "WARN: Though linked data was found, it failed to parse.  This likely indicates some syntax error in the data.  As a result, no metadata will be extracted from this message.\n"
+            return
+          end
+          reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
+          warn 'WRITING TO CACHE'
+          FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
+          warn 'WRITING DONE'
+          reader = rdfformat.reader.new(body)  # frustrating that we cannot rewind!
+          warn 'RE-READING DONE'
+          @meta.merge_rdf(reader.to_a)
+          warn 'MERGE DONE'
+        rescue RDF::ReaderError => e
+          @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} ||  (sample of what was parsed:  #{body[0..300].delete("\n")})\n"
+          warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} ||  (sample of what was parsed:  #{body[0..300].delete("\n")})\n"
+          @meta.warnings << ['018', '', '']
+        rescue Exception => e
+          meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed:  #{body[0..300].delete("\n")}).  Moving on...\n"
+          warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body:  #{body}).  Moving on...\n"
+          @meta.warnings << ['018', '', '']
+        end
+      end
+    end
+  end
+end

data/lib/metadata_object.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module FspHarvester
   class MetadataObject
-    attr_accessor :hash, :graph, :comments, :warnings, :guidtype, :full_response, :finalURI  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response
+    attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response
     def initialize(_params = {}) # get a name from the "new" call, or set a default
       @hash = {}
@@ -8,15 +8,16 @@ module FspHarvester
       @comments =  []
       @warnings =  []
       @full_response = []
-      @finalURI = []
+      @links = []
+      @all_uris = []
     end
     def merge_hash(hash)
-      # $stderr.puts "\n\n\nIncoming Hash #{hash.inspect}"
+      # warn "\n\n\nIncoming Hash #{hash.inspect}"
       self.hash = self.hash.merge(hash)
     end
-    def merge_rdf(triples)  # incoming list of triples
+    def merge_rdf(triples) # incoming list of triples
       graph << triples
       graph
     end
@@ -25,4 +26,95 @@ module FspHarvester
       graph
     end
   end
+  class Cache
+    def self.retrieveMetaObject(uri)
+      filename = (Digest::MD5.hexdigest uri) + '_meta'
+      warn "Checking Meta cache for #{filename}"
+      if File.exist?("/tmp/#{filename}")
+        warn 'FOUND Meta object in cache'
+        meta = Marshal.load(File.read("/tmp/#{filename}"))
+        warn 'Returning....'
+        return meta
+      end
+      warn 'Meta objectNot Found in Cache'
+      false
+    end
+    def self.cacheMetaObject(meta, uri)
+      filename = (Digest::MD5.hexdigest uri) + '_meta'
+      warn "in cacheMetaObject Writing to cache for #{filename}"
+      File.open("/tmp/#{filename}", 'wb') { |f| f.write(Marshal.dump(meta)) }
+    end
+    def self.checkRDFCache(body: )
+      fs = File.join('/tmp/', '*_graphbody')
+      bodies = Dir.glob(fs)
+      g = RDF::Graph.new
+      bodies.each do |bodyfile|
+        next unless File.size(bodyfile) == body.bytesize # compare body size
+        next unless bodyfile.match(/(.*)_graphbody$/) # continue if there's no match
+        filename = Regexp.last_match(1)
+        warn "Regexp match for #{filename} FOUND"
+        next unless File.exist?("#{filename}_graph") # @ get the associated graph file
+        warn "RDF Cache File #{filename} FOUND"
+        graph = Marshal.load(File.read("#{filename}_graph")) # unmarshal it
+        graph.each do |statement|
+          g << statement # need to do this because the unmarshalled object isn't entirely functional as an RDF::Graph object
+        end
+        warn "returning a graph of #{g.size}"
+        break
+      end
+      # return an empty graph otherwise
+      g
+    end
+    def self.writeRDFCache(reader:, body:)
+      filename = Digest::MD5.hexdigest body
+      graph = RDF::Graph.new
+      reader.each_statement { |s| graph << s }
+      warn "WRITING RDF TO CACHE #{filename}"
+      File.open("/tmp/#{filename}_graph", 'wb') { |f| f.write(Marshal.dump(graph)) }
+      File.open("/tmp/#{filename}_graphbody", 'wb') { |f| f.write(body) }
+      warn "wrote RDF filename: #{filename}"
+    end
+    def self.checkCache(uri, headers)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "Checking Error cache for #{filename}"
+      if File.exist?("/tmp/#{filename}_error")
+        warn 'Error file found in cache... returning'
+        return ['ERROR', nil, nil]
+      end
+      if File.exist?("/tmp/#{filename}_head") and File.exist?("/tmp/#{filename}_body")
+        warn 'FOUND data in cache'
+        head = Marshal.load(File.read("/tmp/#{filename}_head"))
+        body = Marshal.load(File.read("/tmp/#{filename}_body"))
+        all_uris = ''
+        all_uris = Marshal.load(File.read("/tmp/#{filename}_uri")) if File.exist?("/tmp/#{filename}_uri")
+        warn 'Returning....'
+        return [head, body, all_uris]
+      end
+      warn 'Not Found in Cache'
+    end
+    def self.writeToCache(uri, headers, head, body, all_uris)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "in writeToCache Writing to cache for #{filename}"
+      headfilename = filename + '_head'
+      bodyfilename = filename + '_body'
+      urifilename = filename + '_uri'
+      File.open("/tmp/#{headfilename}", 'wb') { |f| f.write(Marshal.dump(head)) }
+      File.open("/tmp/#{bodyfilename}", 'wb') { |f| f.write(Marshal.dump(body)) }
+      File.open("/tmp/#{urifilename}", 'wb') { |f| f.write(Marshal.dump(all_uris)) }
+    end
+    def self.writeErrorToCache(uri, headers)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "in writeErrorToCache Writing error to cache for #{filename}"
+      File.open("/tmp/#{filename}_error", 'wb') { |f| f.write('ERROR') }
+    end
+  end
 end

data/lib/signposting_tests.rb ADDED Viewed

@@ -0,0 +1,87 @@
+def check_for_citeas_conflicts(citeas: )
+  @meta.comments << 'INFO: checking for conflicting cite-as links'
+  citeas_hrefs = Hash.new
+  citeas.each do |link|
+    warn "INFO: Adding citeas #{link.href} to the testing queue."
+    @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
+    citeas_hrefs[link.href] = link
+  end
+  if citeas_hrefs.length > 1
+    @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
+    @meta.warnings << ['007', '', '']
+    @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
+  end
+  citeas_hrefs.values  # return list of unique links
+end
+def check_describedby_rules(describedby:)
+  describedby.each do |l|
+    unless l.respond_to? 'type'
+      @meta.warnings << ['005', l.href, '']
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
+    end
+    type = l.type if l.respond_to? 'type'
+    type ||= '*/*'
+    header = { accept: type }
+    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    if response
+      responsetype = response.headers[:content_type]
+      @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
+      if responsetype =~ %r{^(.*/[^;]+)}
+        responsetype = Regexp.last_match(1).to_s # remove the e.g. charset information
+      end
+      @meta.comments << "INFO: testing content type |#{responsetype}| against |#{type}|\n"
+      if type != '*/*'
+        if responsetype == type
+          @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
+        else
+          @meta.warnings << ['009', l.href, header]
+          @meta.comments << "WARN: Content type of returned describedby link #{responsetype}does not match the 'type' attribute #{type}\n"
+        end
+      else
+        @meta.warnings << ['010', l.href, header]
+        @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
+      end
+    else
+      @meta.warnings << ['008', l.href, header]
+      @meta.comments << "WARN: describedby link doesn't resolve\n"
+    end
+  end
+end
+def check_item_rules(item:)
+  item.each do |l| # l = LinkHeaders::Link
+    unless l.respond_to? 'type'
+      @meta.warnings << ['011', l.href, '']
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
+    end
+    type = l.type if l.respond_to? 'type'
+    type ||= '*/*' # this becomes a frozen string
+    header = { accept: type }
+    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    if response
+      if response.headers[:content_type] and type != '*/*'
+        rtype = type.gsub(%r{/}, "\/")   # because type is a frozen string
+        rtype = rtype.gsub(/\+/, '.')
+        typeregex = Regexp.new(type)
+        if response.headers[:content_type].match(typeregex)
+          warn response.headers[:content_type]
+          warn typeregex.inspect
+          @meta.comments << "INFO: item link responds according to Signposting specifications\n"
+        else
+          @meta.warnings << ['012', l.href, header]
+          @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
+        end
+      else
+        @meta.warnings << ['013', l.href, header]
+        @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
+      end
+    else
+      @meta.warnings << ['014', l.href, header]
+      @meta.comments << "WARN: item link doesn't resolve\n"
+    end
+  end
+end

data/lib/warnings.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
    "001": {
-      "message": "Unable to resolve guid using Accept headers for Linked Data",
+      "message": "Unable to resolve guid using default (*/*) Accept headers",
       "linkout": "",
       "severity": "WARN"
    },
@@ -68,7 +68,43 @@
       "message": "Item link does not resolve",
       "linkout": "",
       "severity": "WARN"
-   }
+   },
+   "015": {
+      "message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "016": {
+      "message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "017": {
+      "message": "Metadata format not recognized.",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "018": {
+      "message": "RDF parsing error - likely malformed RDF document.",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "019": {
+      "message": "HTML parsing error - unable to extract linked data from HTML.",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "020": {
+      "message": "XML parsing error - unable to process XML document.",
+      "linkout": "",
+      "severity": "WARN"
+   },
+   "021": {
+      "message": "JSON parsing error - unable to process JSON document.",
+      "linkout": "",
+      "severity": "WARN"
+   }
 }

data/lib/web_utils.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module FspHarvester
   class WebUtils
-    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
+    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
       warn 'In fetch routine now.  '
       begin
@@ -13,19 +13,19 @@ module FspHarvester
                                                 # password: pass,
                                                 headers: headers
                                               })
-        @meta.finalURI |= [response.request.url] if @meta  # it's possible to call this method without affecting the metadata object being created by the harvester
-        warn "There was a response to the call #{url}"
-        warn "There was a response to the call #{response.request.url}"
+        meta.all_uris |= [response.request.url]  # it's possible to call this method without affecting the metadata object being created by the harvester
+        warn "starting URL #{url}"
+        warn "final URL #{response.request.url}"
         warn "Response code #{response.code}"
-        if response.code == 203 && @meta
-          @meta.warnings << ["002", url, headers]
-          @meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}).  Headers may have been manipulated encountered when trying to resolve #{url}\n"
+        if response.code == 203
+          meta.warnings << ["002", url, headers]
+          meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}).  Headers may have been manipulated encountered when trying to resolve #{url}\n"
         end
         response
       rescue RestClient::ExceptionWithResponse => e
         warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
-        @meta.warnings << ["003", url, headers] if @meta
-        @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
         if (e.response.code == 500 or e.response.code == 404)
           return false
         else
@@ -34,14 +34,14 @@ module FspHarvester
         # now we are returning the headers and body that were returned
       rescue RestClient::Exception => e
         warn "EXCEPTION WITH NO RESPONSE! #{e}"
-        @meta.warnings << ["003", url, headers] if @meta
-        @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
         false
         # now we are returning 'False', and we will check that with an \"if\" statement in our main code
       rescue Exception => e
         warn "EXCEPTION UNKNOWN! #{e}"
-        @meta.warnings << ["003", url, headers] if @meta
-        @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
         false
         # now we are returning 'False', and we will check that with an \"if\" statement in our main code
       end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fsp_harvester
 version: !ruby/object:Gem::Version
-  version: 0.1.7
+  version: 0.1.9
 platform: ruby
 authors:
 - Mark Wilkinson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-08-02 00:00:00.000000000 Z
+date: 2022-08-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json
@@ -44,14 +44,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.13
+        version: 0.1.16
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.13
+        version: 0.1.16
 - !ruby/object:Gem::Dependency
   name: metainspector
   requirement: !ruby/object:Gem::Requirement
@@ -171,6 +171,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".rspec_status"
 - CHANGELOG.md
 - Gemfile
 - Gemfile.lock
@@ -180,10 +181,17 @@ files:
 - bin/console
 - bin/setup
 - example_test.rb
+- launch.json
+- lib/config.conf_docker
+- lib/config.conf_local
 - lib/constants.rb
 - lib/fsp_harvester.rb
 - lib/fsp_harvester/version.rb
+- lib/fsp_metadata_external_tools.rb
+- lib/fsp_metadata_harvester.rb
+- lib/fsp_metadata_parser.rb
 - lib/metadata_object.rb
+- lib/signposting_tests.rb
 - lib/swagger.rb
 - lib/warnings.json
 - lib/web_utils.rb