fsp_harvester 0.1.7 → 0.1.11
- checksums.yaml +4 -4
- data/.rspec_status +55 -0
- data/Gemfile.lock +9 -8
- data/launch.json +11 -0
- data/lib/config.conf_docker +8 -0
- data/lib/config.conf_local +8 -0
- data/lib/constants.rb +12 -13
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +33 -11
- data/lib/fsp_metadata_external_tools.rb +82 -0
- data/lib/fsp_metadata_harvester.rb +164 -0
- data/lib/fsp_metadata_parser.rb +109 -0
- data/lib/metadata_object.rb +109 -4
- data/lib/signposting_tests.rb +87 -0
- data/lib/warnings.json +36 -3
- data/lib/web_utils.rb +13 -13
- metadata +12 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 895567e9edd571dbca7dee89a0270d1c14342fed06c3eb81c81e06f3c07ddbed
+  data.tar.gz: 7eee65295c206d6cee7b4ef28830f64087ba172a294cde7401490bffa20dbe1a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0c7727598525cb55b6c2bfaf36d5ce3dda5da6efddf85888328b7c93b874c508989122627e5deaa5101fc0a20279432aa023ecefef112926219f267e3622234
+  data.tar.gz: 29f834c57ec73e27f988948893dc92fe56550b829585df390a9a1398770845115202289f6f9557c01eb2fc3eec218f863371db60649f6a3fef01da9457c2862e
data/.rspec_status
ADDED
@@ -0,0 +1,55 @@
+example_id | status | run_time |
+---------------------------------- | ------ | --------------- |
+./spec/cite-as_spec.rb[1:1:1] | passed | 1.61 seconds |
+./spec/cite-as_spec.rb[1:1:2] | passed | 1.18 seconds |
+./spec/cite-as_spec.rb[1:1:3] | passed | 1.02 seconds |
+./spec/cite-as_spec.rb[1:1:4] | passed | 1.6 seconds |
+./spec/cite-as_spec.rb[1:1:5] | passed | 2.78 seconds |
+./spec/cite-as_spec.rb[1:1:6] | passed | 2.09 seconds |
+./spec/cite-as_spec.rb[1:1:7] | passed | 2.98 seconds |
+./spec/cite-as_spec.rb[1:1:8] | passed | 2.2 seconds |
+./spec/cite-as_spec.rb[1:1:9] | passed | 2.87 seconds |
+./spec/cite-as_spec.rb[1:1:10] | passed | 2.18 seconds |
+./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
+./spec/cite-as_spec.rb[1:1:12] | passed | 2.36 seconds |
+./spec/cite-as_spec.rb[1:1:13] | passed | 2.89 seconds |
+./spec/cite-as_spec.rb[1:1:14] | passed | 2.13 seconds |
+./spec/cite-as_spec.rb[1:1:15] | passed | 1.18 seconds |
+./spec/cite-as_spec.rb[1:1:16] | passed | 1.3 seconds |
+./spec/cite-as_spec.rb[1:1:17] | passed | 1.17 seconds |
+./spec/cite-as_spec.rb[1:1:18] | passed | 1.2 seconds |
+./spec/cite-as_spec.rb[1:1:19] | passed | 1.71 seconds |
+./spec/cite-as_spec.rb[1:1:20] | passed | 1.69 seconds |
+./spec/cite-as_spec.rb[1:1:21] | passed | 2.22 seconds |
+./spec/cite-as_spec.rb[1:1:22] | passed | 1.09 seconds |
+./spec/cite-as_spec.rb[1:1:23] | passed | 1.17 seconds |
+./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
+./spec/cite-as_spec.rb[1:1:25] | passed | 0.48048 seconds |
+./spec/describedby_spec.rb[1:1:1] | passed | 2.12 seconds |
+./spec/describedby_spec.rb[1:1:2] | passed | 0.96254 seconds |
+./spec/describedby_spec.rb[1:1:3] | passed | 0.92669 seconds |
+./spec/describedby_spec.rb[1:1:4] | passed | 0.92801 seconds |
+./spec/describedby_spec.rb[1:1:5] | passed | 1 second |
+./spec/describedby_spec.rb[1:1:6] | passed | 0.66763 seconds |
+./spec/describedby_spec.rb[1:1:7] | passed | 0.66021 seconds |
+./spec/describedby_spec.rb[1:1:8] | passed | 1.89 seconds |
+./spec/describedby_spec.rb[1:1:9] | passed | 1.3 seconds |
+./spec/describedby_spec.rb[1:1:10] | passed | 1.7 seconds |
+./spec/describedby_spec.rb[1:1:11] | passed | 2.28 seconds |
+./spec/describedby_spec.rb[1:1:12] | passed | 2.27 seconds |
+./spec/describedby_spec.rb[1:1:13] | passed | 1.39 seconds |
+./spec/describedby_spec.rb[1:1:14] | passed | 1.65 seconds |
+./spec/describedby_spec.rb[1:1:15] | passed | 1.7 seconds |
+./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00215 seconds |
+./spec/fsp_harvester_spec.rb[1:2] | failed | 0.00021 seconds |
+./spec/item_spec.rb[1:1:1] | passed | 2.04 seconds |
+./spec/item_spec.rb[1:1:2] | passed | 2 seconds |
+./spec/item_spec.rb[1:1:3] | passed | 0.92924 seconds |
+./spec/item_spec.rb[1:1:4] | passed | 1.36 seconds |
+./spec/item_spec.rb[1:1:5] | passed | 1.71 seconds |
+./spec/item_spec.rb[1:1:6] | passed | 1.68 seconds |
+./spec/item_spec.rb[1:1:7] | passed | 2.37 seconds |
+./spec/item_spec.rb[1:1:8] | passed | 0.34241 seconds |
+./spec/type_spec.rb[1:1:1] | passed | 0.9855 seconds |
+./spec/type_spec.rb[1:1:2] | passed | 0.96202 seconds |
+./spec/type_spec.rb[1:1:3] | passed | 0.96005 seconds |
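The two failed examples above can be re-run in isolation. RSpec only writes a .rspec_status file when status persistence is turned on, so the gem's spec_helper.rb (not shown in this diff) presumably carries a configuration along the lines of the sketch below, which is what makes `rspec --only-failures` work:

    # spec/spec_helper.rb (assumed configuration; not part of this diff)
    RSpec.configure do |config|
      # persist pass/fail status so `rspec --only-failures` re-runs just the red examples
      config.example_status_persistence_file_path = '.rspec_status'
    end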
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    fsp_harvester (0.1.
+    fsp_harvester (0.1.11)
       json (~> 2.0)
       linkeddata (~> 3.2)
-      linkheaders-processor (~> 0.1.
+      linkheaders-processor (~> 0.1.16)
       metainspector (~> 5.11.2)
       parseconfig (~> 1.1)
       rake (~> 13.0)
@@ -36,7 +36,7 @@ GEM
       scanf (~> 1.0)
       sxp (~> 1.2)
       unicode-types (~> 1.7)
-    faraday (1.10.
+    faraday (1.10.1)
       faraday-em_http (~> 1.0)
       faraday-em_synchrony (~> 1.0)
       faraday-excon (~> 1.1)
@@ -82,13 +82,13 @@ GEM
       concurrent-ruby (~> 1.0)
     json (2.6.2)
     json-canonicalization (0.3.0)
-    json-ld (3.2.
+    json-ld (3.2.3)
      htmlentities (~> 4.3)
      json-canonicalization (~> 0.3)
      link_header (~> 0.0, >= 0.0.8)
      multi_json (~> 1.15)
      rack (~> 2.2)
-      rdf (~> 3.2)
+      rdf (~> 3.2, >= 3.2.9)
     json-ld-preloaded (3.2.0)
       json-ld (~> 3.2)
       rdf (~> 3.2)
@@ -126,10 +126,11 @@ GEM
       shex (~> 0.7)
       sparql (~> 3.2)
       sparql-client (~> 3.2)
-    linkheaders-processor (0.1.
+    linkheaders-processor (0.1.16)
       json (~> 2.0)
       json-ld (~> 3.2)
       json-ld-preloaded (~> 3.2)
+      link_header (~> 0.0.8)
       metainspector (~> 5.11.2)
       rest-client (~> 2.1)
       securerandom (~> 0.1.0)
@@ -165,7 +166,7 @@ GEM
     rack (2.2.4)
     rainbow (3.1.1)
     rake (13.0.6)
-    rdf (3.2.
+    rdf (3.2.9)
       link_header (~> 0.0, >= 0.0.8)
     rdf-aggregate-repo (3.2.1)
       rdf (~> 3.2)
@@ -248,7 +249,7 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.11.0)
     rspec-support (3.11.0)
-    rubocop (1.
+    rubocop (1.33.0)
       json (~> 2.3)
       parallel (~> 1.10)
       parser (>= 3.1.0.0)
data/launch.json
ADDED
data/lib/constants.rb
CHANGED
@@ -1,17 +1,20 @@
 ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
 
+ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
+
 TEXT_FORMATS = {
   'text' => ['text/plain']
 }
 
 RDF_FORMATS = {
-  'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
+  'jsonld' => ['application/ld+json','application/x-ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
   'turtle' => ['text/turtle', 'application/n3', 'application/rdf+n3',
                'application/turtle', 'application/x-turtle', 'text/n3', 'text/turtle',
                'text/rdf+n3', 'text/rdf+turtle'],
   # 'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
   'rdfxml' => ['application/rdf+xml'],
-  '
+  'ntriples' => ['application/n-triples', 'application/trig'],
+  'nquads' => ['application/n-quads']
 }
 
 XML_FORMATS = {
@@ -73,12 +76,10 @@ GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
                'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
 
 CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
-
-
-end
-extruct = 'extruct' unless @extruct_command
+extruct = CONFIG.dig(:extruct, :command)
+extruct ||= 'extruct'
 extruct.strip!
-case
+case extruct
 when /[&|;`$\s]/
   abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
 when /echo/i
@@ -86,8 +87,8 @@ when /echo/i
 end
 EXTRUCT_COMMAND = extruct
 
-rdf_command = CONFIG
-rdf_command
+rdf_command = CONFIG.dig(:rdf, :command)
+rdf_command ||= 'rdf'
 rdf_command.strip
 case rdf_command
 when /[&|;`$\s]/
@@ -99,8 +100,6 @@ when !(/rdf$/ =~ $_)
 end
 RDF_COMMAND = rdf_command
 
-
-
-end
-tika_command = 'http://localhost:9998/meta' unless @tika_command
+tika_command = CONFIG.dig(:tika, :command)
+tika_command ||= 'http://localhost:9998/meta'
 TIKA_COMMAND = tika_command
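The new ntriples and nquads entries matter because RDF_FORMATS doubles as a reverse lookup table from MIME type to format abbreviation (see abbreviate_type in the new fsp_metadata_harvester.rb further down). A minimal sketch of that lookup, assuming constants.rb is loaded:

    # find the abbreviation whose MIME list contains the detected content type
    abbreviation, _mimes = RDF_FORMATS.find { |_abbrev, mimes| mimes.include?('application/n-quads') }
    abbreviation  # => "nquads"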
data/lib/fsp_harvester.rb
CHANGED
@@ -20,6 +20,9 @@ require_relative './metadata_object'
 require_relative './constants'
 require_relative './web_utils'
 require_relative './signposting_tests'
+require_relative './fsp_metadata_harvester'
+require_relative './fsp_metadata_parser'
+
 
 module FspHarvester
   class Error < StandardError
@@ -32,18 +35,29 @@ module FspHarvester
 
   def self.resolve_guid(guid:)
     @meta = FspHarvester::MetadataObject.new
-    @meta.
+    @meta.all_uris = [guid]
     type, url = convertToURL(guid: guid)
     links = Array.new
     if type
       links = resolve_url(url: url)
+      @meta.links << links
     else
-      @meta.
+      @meta.add_warning(['006', guid, ''])
       @meta.comments << "FATAL: GUID type not recognized.\n"
     end
     [links, @meta]
   end
 
+  def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
+    @meta = metadata
+    db = []
+    links.each do |l|
+      db << l if l.relation == 'describedby'
+    end
+    FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
+    @meta
+  end
+
   def self.convertToURL(guid:)
     GUID_TYPES.each do |k, regex|
       if k == 'inchi' and regex.match(guid)
@@ -68,19 +82,19 @@ module FspHarvester
     false
   end
 
-  def self.resolve_url(url:, method: :get, nolinkheaders: false, header:
+  def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
     @meta.guidtype = 'uri' if @meta.guidtype.nil?
     warn "\n\n FETCHING #{url} #{header}\n\n"
-    response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
+    response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
     warn "\n\n head #{response.headers.inspect}\n\n" if response
 
     unless response
-      @meta.
+      @meta.add_warning(['001', url, header])
       @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
       return []
     end
 
-    @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.
+    @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
     @meta.full_response << response.body
 
     links = process_link_headers(response: response) unless nolinkheaders
@@ -90,7 +104,7 @@ module FspHarvester
   def self.process_link_headers(response:)
     warn "\n\n parsing #{response.headers}\n\n"
 
-    parser = LinkHeaders::Processor.new(default_anchor: @meta.
+    parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
     parser.extract_and_parse(response: response)
     factory = parser.factory # LinkHeaders::LinkFactory
 
@@ -105,6 +119,8 @@ module FspHarvester
     citeas = Array.new
     describedby = Array.new
     item = Array.new
+    types = Array.new
+
     factory.all_links.each do |l|
       case l.relation
       when 'cite-as'
@@ -113,23 +129,29 @@ module FspHarvester
         item << l
       when 'describedby'
         describedby << l
+      when 'type'
+        types << l
       end
     end
 
     check_describedby_rules(describedby: describedby)
     check_item_rules(item: item)
 
-    uniqueciteas = Array.new
     if citeas.length > 1
       warn "INFO: multiple cite-as links found. Checking for conflicts\n"
       @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
-
+      citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
     end
 
-    unless
-      @meta.
+    unless citeas.length == 1 && describedby.length > 0
+      @meta.add_warning(['004', '', ''])
       @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
     end
+
+    unless types.length >=1
+      @meta.add_warning(['015', '', ''])
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
+    end
   end
 end
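Taken together, resolve_guid and the new gather_metadata_from_describedby_links form the public entry point of this release. A usage sketch, assuming the gem is installed and that the GUID below (which is hypothetical) resolves and carries Signposting link headers:

    require 'fsp_harvester'

    links, meta = FspHarvester.resolve_guid(guid: 'https://example.org/dataset/1')  # hypothetical GUID
    meta = FspHarvester.gather_metadata_from_describedby_links(links: links, metadata: meta)
    puts meta.comments          # INFO/WARN trail of the checks
    puts meta.warnings.inspect  # structured warnings keyed by the IDs in warnings.json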
data/lib/fsp_metadata_external_tools.rb
ADDED
@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+
+module FspHarvester
+  class Error < StandardError
+  end
+
+  class ExternalTools
+
+    def initialize(metadata: FspHarvester::MetadataObject.new)
+      @meta = metadata
+    end
+
+    def process_with_distiller(body:)
+      bhash = Digest::SHA256.hexdigest(body)
+      if @@distillerknown[bhash]
+        @meta.comments << "INFO: data is already parsed by distiller.\n"
+        #parse_rdf(body: body)
+      else
+        @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
+        file = Tempfile.new('foo', encoding: 'UTF-8')
+        body = body.force_encoding('UTF-8')
+        body.scrub!
+        body = body.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"') # a bug in distiller, apparently
+        file.write(body)
+        file.rewind
+
+        @meta.comments << "INFO: The message body is being examined by Distiller\n"
+        # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
+        command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
+        warn "distiller command: #{command}"
+        result, _stderr, _status = Open3.capture3(command)
+        warn ''
+        warn "distiller errors: #{stderr}"
+        file.close
+        file.unlink
+
+        result = result.force_encoding('UTF-8')
+        warn "DIST RESULT: #{result}"
+        if result !~ /@context/i # failure returns nil
+          @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
+          @meta.add_warning(['018', '', ''])
+        else
+          @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
+          parse_rdf(result: result, content_type: "application/ld+json")
+        end
+        @@distillerknown[bhash] = true
+      end
+    end
+
+    def processs_with_extruct(uri:)
+      @meta.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
+      warn 'begin open3'
+      stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
+      warn "open3 status: #{status} #{stdout}"
+      result = stderr # absurd that the output comes over stderr! LOL!
+
+      if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
+        @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
+        @meta.add_warning(['019', '', ''])
+        if result.to_s.match(/(ValueError:.*?)\n/)
+          @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
+          @meta.add_warning(['019', '', ''])
+        end
+      elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
+        json = JSON.parse result
+        @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
+
+        parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
+        @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
+        @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
+        @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
+        parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
+
+        @meta.merge_hash(json.first) if json.first.is_a? Hash
+      else
+        @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
+      end
+    end
+  end
+end
data/lib/fsp_metadata_harvester.rb
ADDED
@@ -0,0 +1,164 @@
+# frozen_string_literal: true
+
+module FspHarvester
+  class Error < StandardError
+  end
+
+  class MetadataHarvester
+    def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
+      @meta = metadata
+      @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'
+
+      describedby = links.select { |l| l if l.relation == 'describedby' }
+
+      hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
+      describedby.each do |link|
+        accepttype = ACCEPT_STAR_HEADER
+        accept = link.respond_to?('type') ? link.type : nil
+        accepttype = { 'Accept' => accept } if accept
+
+        response = attempt_to_resolve(link: link, headers: accepttype)
+
+        abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
+        unless abbreviation
+          @meta.add_warning(['017', url, header])
+          @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
+          next
+        end
+
+        # process according to detected type
+        case abbreviation
+        when 'html'
+          @meta.comments << 'INFO: Processing html'
+          hvst.process_html(body: response.body, uri: link)
+        when 'xml'
+          @meta.comments << 'INFO: Processing xml'
+          hvst.process_xml(body: response.body)
+        when 'json'
+          @meta.comments << 'INFO: Processing json'
+          hvst.process_json(body: response.body)
+        when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
+          @meta.comments << 'INFO: Processing linked data'
+          hvst.process_ld(body: response.body, content_type: content_type)
+        when 'specialist'
+          warn 'no specialized parsers so far'
+        end
+      end
+    end
+
+    def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
+      @meta.comments << "INFO: link #{link.href} being processed"
+      if link.respond_to? 'type'
+        header = { 'Accept' => link.type }
+      else
+        @meta.comments << "INFO: link #{link.href} has no MIME type, defaulting to */*"
+      end
+      url = link.href
+      response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
+      unless response
+        @meta.add_warning(['016', url, header])
+        @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
+      end
+      response
+    end
+
+    def self.attempt_to_detect_type(body:, headers:)
+      # described by should be an html, xml, json, or linked data document
+      abbreviation = nil
+      content_type = nil
+      @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
+      if body =~ /^\s*<\?xml/
+        if body =~ /<HTML/i
+          abbreviation = 'html'
+          content_type = 'text/html'
+          @meta.comments << 'INFO: appears to be HTML\n'
+        elsif body =~ /<rdf:RDF/i
+          abbreviation = 'rdfxml'
+          content_type = 'application/rdf+xml'
+          @meta.comments << 'INFO: appears to be RDF-XML\n'
+        else
+          abbreviation = 'xml'
+          content_type = 'application/xml'
+          @meta.comments << 'INFO: appears to be XML\n'
+        end
+      else
+        abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
+        abbreviation, content_type = check_json(body: body) unless abbreviation
+      end
+
+      unless content_type
+        @meta.add_warning(['017', url, header])
+        @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
+      end
+      [abbreviation, content_type]
+    end
+
+    def self.check_ld(body:, claimed_type:)
+      detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
+      unless detected_type
+        detected_type = RDF::Format.for({ sample: body[0..5000] })
+        @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
+      end
+      contenttype = ''
+      abbreviation = ''
+      if detected_type
+        contenttype = detected_type.content_type.first # comes back as array
+        abbreviation = abbreviate_type(contenttype: contenttype)
+        @meta.comments << "INFO: using content-type #{contenttype}.\n"
+      else
+        @meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
+      end
+      [abbreviation, contenttype]
+    end
+
+    def self.ntriples_hack(body:) # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
+      detected_type = nil
+      body.split.each do |line|
+        line.strip!
+        next if line.empty?
+        if line =~ %r{\s*<[^>]+>\s*<[^>]+>\s\S+}
+          @meta.comments << "INFO: running ntriples hack on #{line + " ."}\n"
+          detected_type = RDF::Format.for({ sample: "#{line} ." }) # adding a period allows detection of ntriples by distiller
+          break
+        end
+      end
+      @meta.comments << "INFO: ntriples hack found: #{detected_type.to_s}\n"
+      if detected_type != RDF::NTriples::Format # only return the hacky case
+        return nil
+      end
+      return detected_type
+    end
+
+
+    def self.check_json(body:)
+      abbreviation = nil
+      parsed = nil
+      begin
+        parsed = JSON.parse(body)
+      rescue StandardError
+        abbreviation = nil
+      end
+
+      if parsed
+        abbreviation = 'json'
+      else
+        @meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
+      end
+      [abbreviation, 'application/ld+json']
+    end
+
+    def self.abbreviate_type(contenttype:)
+      foundtype = nil
+      RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
+        warn "\n\ntype #{type}\nvals #{vals}\n\n"
+        @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
+        next unless vals.include? contenttype
+
+        foundtype = type
+        @meta.comments << "INFO: detected a #{type} MIME type"
+        break
+      end
+      foundtype
+    end
+  end
+end
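extract_metadata only acts on links whose relation is 'describedby', and it negotiates content with the MIME type declared on the link. A sketch with a stand-in link object (the URL is hypothetical; any object responding to href, relation and type should do, and the link must actually resolve for the detection step to run):

    FakeLink = Struct.new(:href, :relation, :type)
    link = FakeLink.new('https://example.org/dataset/1.jsonld', 'describedby', 'application/ld+json')
    meta = FspHarvester::MetadataObject.new
    FspHarvester::MetadataHarvester.extract_metadata(links: [link], metadata: meta)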
data/lib/fsp_metadata_parser.rb
ADDED
@@ -0,0 +1,109 @@
+# frozen_string_literal: true
+
+module FspHarvester
+  class Error < StandardError
+  end
+
+  class MetadataParser
+    # attr_accessor :distillerknown
+
+    @@distillerknown = {}
+
+    def initialize(metadata_object: FspHarvester::MetadataObject.new)
+      @meta = metadata_object
+    end
+
+    def process_html(body:, uri:)
+      tools = FspHarvester::ExternalTools.new(metadata: @meta)
+      tools.process_with_distiller(body: body)
+      tools.process_with_extruct(uri: uri)
+    end
+
+    def process_xml(body:)
+      begin
+        hash = XmlSimple.xml_in(body)
+      rescue
+        @meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
+        @meta.add_warning(['020', '', ''])
+      end
+      @meta.comments << "INFO: The XML is being merged in the metadata object\n"
+      @meta.hash.merge hash
+    end
+
+    def process_json(body:)
+      begin
+        hash = JSON.parse(body)
+      rescue
+        @meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
+        @meta.add_warning(['021', '', ''])
+      end
+      @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
+      @meta.hash.merge hash
+    end
+
+    def process_ld(body:, content_type:)
+      parse_rdf(body: body, content_type: content_type)
+    end
+
+    def parse_rdf(body:, content_type:)
+      unless body
+        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+        @meta.add_warning(['018', '', ''])
+        return
+      end
+
+      unless body.match(/\w/)
+        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+        @meta.add_warning(['018', '', ''])
+        return
+      end
+
+      rdfformat = RDF::Format.for(content_type: content_type)
+      unless rdfformat
+        @meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
+        @meta.add_warning(['018', '', ''])
+        return
+      end
+
+      graph = FspHarvester::Cache.checkRDFCache(body: body)
+      if graph.size > 0
+        warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
+        @meta.merge_rdf(graph.to_a)
+      else
+        warn "\n\n\nfound format #{rdfformat}\n\n"
+        @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
+        reader = ''
+        begin
+          reader = rdfformat.reader.new(body)
+        rescue Exception => e
+          @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
+          @meta.add_warning(['018', '', ''])
+          return
+        end
+
+        begin
+          if reader.size == 0
+            @meta.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
+            return
+          end
+          reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
+          warn 'WRITING TO CACHE'
+          FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
+          warn 'WRITING DONE'
+          reader = rdfformat.reader.new(body) # frustrating that we cannot rewind!
+          warn 'RE-READING DONE'
+          @meta.merge_rdf(reader.to_a)
+          warn 'MERGE DONE'
+        rescue RDF::ReaderError => e
+          @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
+          warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
+          @meta.add_warning(['018', '', ''])
+        rescue Exception => e
+          meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
+          warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body}). Moving on...\n"
+          @meta.add_warning(['018', '', ''])
+        end
+      end
+    end
+  end
+end
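MetadataParser is normally driven by MetadataHarvester, but its per-format methods can also be exercised directly. A small sketch (note that MetadataObject.new fetches warnings.json from GitHub, so this needs network access):

    meta = FspHarvester::MetadataObject.new
    parser = FspHarvester::MetadataParser.new(metadata_object: meta)
    parser.process_json(body: '{"name": "example dataset"}')  # well-formed JSON is parsed and logged as INFO
    puts meta.comments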
data/lib/metadata_object.rb
CHANGED
@@ -1,6 +1,6 @@
 module FspHarvester
   class MetadataObject
-    attr_accessor :hash, :graph, :comments, :warnings, :guidtype, :full_response, :
+    attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
 
     def initialize(_params = {}) # get a name from the "new" call, or set a default
       @hash = {}
@@ -8,15 +8,19 @@ module FspHarvester
       @comments = []
       @warnings = []
       @full_response = []
-      @
+      @links = []
+      @all_uris = []
+      w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
+      #@warn = File.read("./lib/warnings.json")
+      @warn = JSON.parse(w)
     end
 
     def merge_hash(hash)
-      #
+      # warn "\n\n\nIncoming Hash #{hash.inspect}"
       self.hash = self.hash.merge(hash)
     end
 
-    def merge_rdf(triples)
+    def merge_rdf(triples) # incoming list of triples
       graph << triples
       graph
     end
@@ -24,5 +28,106 @@ module FspHarvester
     def rdf
       graph
     end
+
+    def add_warning(warning)
+      id = warning[0]
+      url = warning[1]
+      headers = warning[2]
+      message = @warn[id]['message']
+      linkout = @warn[id]['linkout']
+      severity = @warn[id]['severity']
+      self.warnings << {"id" => id, "message" => message, "severity" => severity, "linkout" => linkout, "processed_url" => url, "accept_headers": headers}
+    end
+  end
+
+  class Cache
+    def self.retrieveMetaObject(uri)
+      filename = (Digest::MD5.hexdigest uri) + '_meta'
+      warn "Checking Meta cache for #{filename}"
+      if File.exist?("/tmp/#{filename}")
+        warn 'FOUND Meta object in cache'
+        meta = Marshal.load(File.read("/tmp/#{filename}"))
+        warn 'Returning....'
+        return meta
+      end
+      warn 'Meta objectNot Found in Cache'
+      false
+    end
+
+    def self.cacheMetaObject(meta, uri)
+      filename = (Digest::MD5.hexdigest uri) + '_meta'
+      warn "in cacheMetaObject Writing to cache for #{filename}"
+      File.open("/tmp/#{filename}", 'wb') { |f| f.write(Marshal.dump(meta)) }
+    end
+
+    def self.checkRDFCache(body: )
+      fs = File.join('/tmp/', '*_graphbody')
+      bodies = Dir.glob(fs)
+      g = RDF::Graph.new
+      bodies.each do |bodyfile|
+        next unless File.size(bodyfile) == body.bytesize # compare body size
+        next unless bodyfile.match(/(.*)_graphbody$/) # continue if there's no match
+
+        filename = Regexp.last_match(1)
+        warn "Regexp match for #{filename} FOUND"
+        next unless File.exist?("#{filename}_graph") # @ get the associated graph file
+
+        warn "RDF Cache File #{filename} FOUND"
+        graph = Marshal.load(File.read("#{filename}_graph")) # unmarshal it
+        graph.each do |statement|
+          g << statement # need to do this because the unmarshalled object isn't entirely functional as an RDF::Graph object
+        end
+        warn "returning a graph of #{g.size}"
+        break
+      end
+      # return an empty graph otherwise
+      g
+    end
+
+    def self.writeRDFCache(reader:, body:)
+      filename = Digest::MD5.hexdigest body
+      graph = RDF::Graph.new
+      reader.each_statement { |s| graph << s }
+      warn "WRITING RDF TO CACHE #{filename}"
+      File.open("/tmp/#{filename}_graph", 'wb') { |f| f.write(Marshal.dump(graph)) }
+      File.open("/tmp/#{filename}_graphbody", 'wb') { |f| f.write(body) }
+      warn "wrote RDF filename: #{filename}"
+    end
+
+    def self.checkCache(uri, headers)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "Checking Error cache for #{filename}"
+      if File.exist?("/tmp/#{filename}_error")
+        warn 'Error file found in cache... returning'
+        return ['ERROR', nil, nil]
+      end
+      if File.exist?("/tmp/#{filename}_head") and File.exist?("/tmp/#{filename}_body")
+        warn 'FOUND data in cache'
+        head = Marshal.load(File.read("/tmp/#{filename}_head"))
+        body = Marshal.load(File.read("/tmp/#{filename}_body"))
+        all_uris = ''
+        all_uris = Marshal.load(File.read("/tmp/#{filename}_uri")) if File.exist?("/tmp/#{filename}_uri")
+        warn 'Returning....'
+        return [head, body, all_uris]
+      end
+      warn 'Not Found in Cache'
+    end
+
+    def self.writeToCache(uri, headers, head, body, all_uris)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "in writeToCache Writing to cache for #{filename}"
+      headfilename = filename + '_head'
+      bodyfilename = filename + '_body'
+      urifilename = filename + '_uri'
+      File.open("/tmp/#{headfilename}", 'wb') { |f| f.write(Marshal.dump(head)) }
+      File.open("/tmp/#{bodyfilename}", 'wb') { |f| f.write(Marshal.dump(body)) }
+      File.open("/tmp/#{urifilename}", 'wb') { |f| f.write(Marshal.dump(all_uris)) }
+    end
+
+    def self.writeErrorToCache(uri, headers)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "in writeErrorToCache Writing error to cache for #{filename}"
+      File.open("/tmp/#{filename}_error", 'wb') { |f| f.write('ERROR') }
+    end
   end
 end
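The new add_warning helper looks the ID up in warnings.json and appends a structured record to warnings, so callers only supply the ID plus the URL and headers in play. A sketch (the URL is hypothetical):

    meta = FspHarvester::MetadataObject.new   # the constructor now loads warnings.json from GitHub
    meta.add_warning(['001', 'https://example.org/record/1', { 'Accept' => '*/*' }])
    puts meta.warnings.first['message']       # => "Unable to resolve guid using default (*/*) Accept headers"
    puts meta.warnings.first['severity']      # => "WARN"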
data/lib/signposting_tests.rb
ADDED
@@ -0,0 +1,87 @@
+def check_for_citeas_conflicts(citeas: )
+  @meta.comments << 'INFO: checking for conflicting cite-as links'
+  citeas_hrefs = Hash.new
+  citeas.each do |link|
+    warn "INFO: Adding citeas #{link.href} to the testing queue."
+    @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
+    citeas_hrefs[link.href] = link
+  end
+
+  if citeas_hrefs.length > 1
+    @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
+    @meta.add_warning(['007', '', ''])
+    @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
+  end
+  citeas_hrefs.values # return list of unique links
+end
+
+
+def check_describedby_rules(describedby:)
+  describedby.each do |l|
+    unless l.respond_to? 'type'
+      @meta.add_warning(['005', l.href, ''])
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
+    end
+    type = l.type if l.respond_to? 'type'
+    type ||= '*/*'
+    header = { accept: type }
+    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    if response
+      responsetype = response.headers[:content_type]
+      @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
+      if responsetype =~ %r{^(.*/[^;]+)}
+        responsetype = Regexp.last_match(1).to_s # remove the e.g. charset information
+      end
+      @meta.comments << "INFO: testing content type |#{responsetype}| against |#{type}|\n"
+      if type != '*/*'
+        if responsetype == type
+          @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
+        else
+          @meta.add_warning(['009', l.href, header])
+          @meta.comments << "WARN: Content type of returned describedby link #{responsetype}does not match the 'type' attribute #{type}\n"
+        end
+      else
+        @meta.add_warning(['010', l.href, header])
+        @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
+      end
+    else
+      @meta.add_warning(['008', l.href, header])
+      @meta.comments << "WARN: describedby link doesn't resolve\n"
+    end
+  end
+end
+
+def check_item_rules(item:)
+  item.each do |l| # l = LinkHeaders::Link
+    unless l.respond_to? 'type'
+      @meta.add_warning(['011', l.href, ''])
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
+    end
+    type = l.type if l.respond_to? 'type'
+    type ||= '*/*' # this becomes a frozen string
+    header = { accept: type }
+    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+
+    if response
+      if response.headers[:content_type] and type != '*/*'
+        rtype = type.gsub(%r{/}, "\/") # because type is a frozen string
+        rtype = rtype.gsub(/\+/, '.')
+        typeregex = Regexp.new(type)
+        if response.headers[:content_type].match(typeregex)
+          warn response.headers[:content_type]
+          warn typeregex.inspect
+          @meta.comments << "INFO: item link responds according to Signposting specifications\n"
+        else
+          @meta.add_warning(['012', l.href, header])
+          @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
+        end
+      else
+        @meta.add_warning(['013', l.href, header])
+        @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
+      end
+    else
+      @meta.add_warning(['014', l.href, header])
+      @meta.comments << "WARN: item link doesn't resolve\n"
+    end
+  end
+end
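For orientation, these checks target HTTP responses whose Link headers follow the FAIR Signposting pattern of one cite-as link, describedby links carrying a type attribute, and a type link. An illustrative (hypothetical) header of that shape:

    Link: <https://doi.org/10.1234/example>; rel="cite-as",
          <https://example.org/record/1.jsonld>; rel="describedby"; type="application/ld+json",
          <https://schema.org/Dataset>; rel="type"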
data/lib/warnings.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "001": {
-    "message": "Unable to resolve guid using Accept headers
+    "message": "Unable to resolve guid using default (*/*) Accept headers",
     "linkout": "",
     "severity": "WARN"
   },
@@ -68,7 +68,40 @@
     "message": "Item link does not resolve",
     "linkout": "",
     "severity": "WARN"
+  },
+  "015": {
+    "message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "016": {
+    "message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "017": {
+    "message": "Metadata format not recognized.",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "018": {
+    "message": "RDF parsing error - likely malformed RDF document.",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "019": {
+    "message": "HTML parsing error - unable to extract linked data from HTML.",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "020": {
+    "message": "XML parsing error - unable to process XML document.",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "021": {
+    "message": "JSON parsing error - unable to process JSON document.",
+    "linkout": "",
+    "severity": "WARN"
   }
-
-
 }
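Each new entry is addressed by the ID that the code passes to MetadataObject#add_warning. For example (sketch, with a hypothetical URL):

    meta.add_warning(['017', 'https://example.org/record/1.jsonld', { 'Accept' => 'application/ld+json' }])
    # appends to meta.warnings:
    #   { "id" => "017", "message" => "Metadata format not recognized.", "severity" => "WARN",
    #     "linkout" => "", "processed_url" => "https://example.org/record/1.jsonld",
    #     :accept_headers => { "Accept" => "application/ld+json" } }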
data/lib/web_utils.rb
CHANGED
@@ -1,7 +1,7 @@
 module FspHarvester
 
   class WebUtils
-    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
+    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
       warn 'In fetch routine now. '
 
       begin
@@ -13,19 +13,19 @@ module FspHarvester
                                        # password: pass,
                                        headers: headers
                                      })
-
-        warn "
-        warn "
+        meta.all_uris |= [response.request.url] # it's possible to call this method without affecting the metadata object being created by the harvester
+        warn "starting URL #{url}"
+        warn "final URL #{response.request.url}"
         warn "Response code #{response.code}"
-        if response.code == 203
-
-
+        if response.code == 203
+          meta.warnings << ["002", url, headers]
+          meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
         end
         response
       rescue RestClient::ExceptionWithResponse => e
         warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
-
-
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
         if (e.response.code == 500 or e.response.code == 404)
           return false
         else
@@ -34,14 +34,14 @@ module FspHarvester
         # now we are returning the headers and body that were returned
       rescue RestClient::Exception => e
         warn "EXCEPTION WITH NO RESPONSE! #{e}"
-
-
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
        false
        # now we are returning 'False', and we will check that with an \"if\" statement in our main code
      rescue Exception => e
        warn "EXCEPTION UNKNOWN! #{e}"
-
-
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
        false
        # now we are returning 'False', and we will check that with an \"if\" statement in our main code
      end
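With the new meta: keyword, fspfetch records the post-redirect URL and any HTTP problems on the metadata object instead of discarding them. A sketch (hypothetical URL):

    meta = FspHarvester::MetadataObject.new
    response = FspHarvester::WebUtils.fspfetch(url: 'https://example.org/record/1',
                                               headers: { 'Accept' => '*/*' },
                                               method: :head,
                                               meta: meta)
    puts meta.all_uris.inspect  # final URL after redirects, when the request succeeds
    puts meta.comments          # WARN entries for non-authoritative responses or HTTP errors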
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fsp_harvester
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.11
 platform: ruby
 authors:
 - Mark Wilkinson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-08-
+date: 2022-08-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json
@@ -44,14 +44,14 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version: 0.1.
+      version: 0.1.16
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-      version: 0.1.
+      version: 0.1.16
 - !ruby/object:Gem::Dependency
   name: metainspector
   requirement: !ruby/object:Gem::Requirement
@@ -171,6 +171,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".rspec_status"
 - CHANGELOG.md
 - Gemfile
 - Gemfile.lock
@@ -180,10 +181,17 @@ files:
 - bin/console
 - bin/setup
 - example_test.rb
+- launch.json
+- lib/config.conf_docker
+- lib/config.conf_local
 - lib/constants.rb
 - lib/fsp_harvester.rb
 - lib/fsp_harvester/version.rb
+- lib/fsp_metadata_external_tools.rb
+- lib/fsp_metadata_harvester.rb
+- lib/fsp_metadata_parser.rb
 - lib/metadata_object.rb
+- lib/signposting_tests.rb
 - lib/swagger.rb
 - lib/warnings.json
 - lib/web_utils.rb