RubyGems - fsp_harvester - Versions diffs - 0.1.20 → 0.1.21 - Mend

fsp_harvester 0.1.20 → 0.1.21

Files changed (14) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 72df31c63580f2b47676bb719c860cd26cab4290346f20bd481f67d18b29f765
-  data.tar.gz: 477bfe524de0a1822790eac1caefb642a5e881734e8f1bc3c8f46c1a91b3e1e0
+  metadata.gz: 8498c33db9c350fec8ea4e734b31087f798a4f433f211115c69ded468dbcdb12
+  data.tar.gz: f3f408b24575f4f310c6f00ac0d42c3106f68fcd43199d3dbb73d8e4deb403fe
 SHA512:
-  metadata.gz: 328b1bf4531034b38f325ec7c2dfb682007ed8ef5fb4f9ea72a4776ffdb49bbdde280bd959f9adddcfb93f6a065b77af68ab6d5222942bc16b5d50901f771770
-  data.tar.gz: d5bc1e2e88be865c17aa12bca0a4308f3bc8e476bc0f49f40ca7a48a4e92142d8613c8e12823d9bb5e2735ea4d4cc492d6740481fdbebd513cb4f0be0c8114c8
+  metadata.gz: 263549dc8b8bf2fe8a4bc50289092ad2e55d9bbc05cabc509637786b4923948345220f0ee7a04fa5db497f670d9ab79d4e35e05648f09f189103ae869040baad
+  data.tar.gz: 6b35a320400ff37561ddf2cff506a3a4f385cd31933e4df8ceb4d436a4f97974e7122ab96962b8ce3cfc183144fcf41592fa417abcdb609fd44080061fc5e1a3

data/.rspec_status CHANGED Viewed

@@ -1,57 +1,60 @@
-example_id                         | status | run_time        |
----------------------------------- | ------ | --------------- |
-./spec/cite-as_spec.rb[1:1:1]      | passed | 1.36 seconds    |
-./spec/cite-as_spec.rb[1:1:2]      | passed | 1.31 seconds    |
-./spec/cite-as_spec.rb[1:1:3]      | passed | 1.53 seconds    |
-./spec/cite-as_spec.rb[1:1:4]      | passed | 1.84 seconds    |
-./spec/cite-as_spec.rb[1:1:5]      | passed | 2.77 seconds    |
-./spec/cite-as_spec.rb[1:1:6]      | passed | 2.06 seconds    |
-./spec/cite-as_spec.rb[1:1:7]      | passed | 2.96 seconds    |
-./spec/cite-as_spec.rb[1:1:8]      | passed | 2.28 seconds    |
-./spec/cite-as_spec.rb[1:1:9]      | passed | 2.83 seconds    |
-./spec/cite-as_spec.rb[1:1:10]     | passed | 2.14 seconds    |
-./spec/cite-as_spec.rb[1:1:11]     | passed | 3.19 seconds    |
-./spec/cite-as_spec.rb[1:1:12]     | passed | 3.06 seconds    |
-./spec/cite-as_spec.rb[1:1:13]     | passed | 2.77 seconds    |
-./spec/cite-as_spec.rb[1:1:14]     | passed | 2.2 seconds     |
-./spec/cite-as_spec.rb[1:1:15]     | passed | 1.19 seconds    |
-./spec/cite-as_spec.rb[1:1:16]     | passed | 1.1 seconds     |
-./spec/cite-as_spec.rb[1:1:17]     | passed | 1.31 seconds    |
-./spec/cite-as_spec.rb[1:1:18]     | passed | 1.14 seconds    |
-./spec/cite-as_spec.rb[1:1:19]     | passed | 1.68 seconds    |
-./spec/cite-as_spec.rb[1:1:20]     | passed | 1.69 seconds    |
-./spec/cite-as_spec.rb[1:1:21]     | passed | 2.35 seconds    |
-./spec/cite-as_spec.rb[1:1:22]     | passed | 1.12 seconds    |
-./spec/cite-as_spec.rb[1:1:23]     | passed | 1.16 seconds    |
-./spec/cite-as_spec.rb[1:1:24]     | failed | 1.45 seconds    |
-./spec/cite-as_spec.rb[1:1:25]     | passed | 0.72571 seconds |
-./spec/describedby_spec.rb[1:1:1]  | passed | 3.09 seconds    |
-./spec/describedby_spec.rb[1:1:2]  | passed | 1.13 seconds    |
-./spec/describedby_spec.rb[1:1:3]  | passed | 1.22 seconds    |
-./spec/describedby_spec.rb[1:1:4]  | passed | 1.11 seconds    |
-./spec/describedby_spec.rb[1:1:5]  | passed | 1.21 seconds    |
-./spec/describedby_spec.rb[1:1:6]  | passed | 1.24 seconds    |
-./spec/describedby_spec.rb[1:1:7]  | passed | 1.53 seconds    |
-./spec/describedby_spec.rb[1:1:8]  | passed | 2.53 seconds    |
-./spec/describedby_spec.rb[1:1:9]  | passed | 1.74 seconds    |
-./spec/describedby_spec.rb[1:1:10] | passed | 2.59 seconds    |
-./spec/describedby_spec.rb[1:1:11] | passed | 3.49 seconds    |
-./spec/describedby_spec.rb[1:1:12] | passed | 3.82 seconds    |
-./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds    |
-./spec/describedby_spec.rb[1:1:14] | passed | 2.19 seconds    |
-./spec/describedby_spec.rb[1:1:15] | passed | 2.16 seconds    |
-./spec/fsp_harvester_spec.rb[1:1]  | passed | 0.00015 seconds |
-./spec/fsp_harvester_spec.rb[1:2]  | passed | 2.49 seconds    |
-./spec/fsp_harvester_spec.rb[1:3]  | passed | 7.06 seconds    |
-./spec/fsp_harvester_spec.rb[1:4]  | passed | 2.74 seconds    |
-./spec/item_spec.rb[1:1:1]         | passed | 3.41 seconds    |
-./spec/item_spec.rb[1:1:2]         | passed | 2.84 seconds    |
-./spec/item_spec.rb[1:1:3]         | passed | 1.15 seconds    |
-./spec/item_spec.rb[1:1:4]         | passed | 1.74 seconds    |
-./spec/item_spec.rb[1:1:5]         | passed | 2.6 seconds     |
-./spec/item_spec.rb[1:1:6]         | passed | 2.32 seconds    |
-./spec/item_spec.rb[1:1:7]         | passed | 2.81 seconds    |
-./spec/item_spec.rb[1:1:8]         | passed | 0.49717 seconds |
-./spec/type_spec.rb[1:1:1]         | passed | 1.25 seconds    |
-./spec/type_spec.rb[1:1:2]         | passed | 1.18 seconds    |
-./spec/type_spec.rb[1:1:3]         | passed | 1.58 seconds    |
+example_id                         | status | run_time               |
+---------------------------------- | ------ | ---------------------- |
+./spec/cite-as_spec.rb[1:1:1]      | passed | 1.77 seconds           |
+./spec/cite-as_spec.rb[1:1:2]      | passed | 1.22 seconds           |
+./spec/cite-as_spec.rb[1:1:3]      | passed | 1.09 seconds           |
+./spec/cite-as_spec.rb[1:1:4]      | passed | 1.89 seconds           |
+./spec/cite-as_spec.rb[1:1:5]      | passed | 2.95 seconds           |
+./spec/cite-as_spec.rb[1:1:6]      | passed | 2.14 seconds           |
+./spec/cite-as_spec.rb[1:1:7]      | passed | 2.96 seconds           |
+./spec/cite-as_spec.rb[1:1:8]      | passed | 2.28 seconds           |
+./spec/cite-as_spec.rb[1:1:9]      | passed | 3.4 seconds            |
+./spec/cite-as_spec.rb[1:1:10]     | passed | 2.21 seconds           |
+./spec/cite-as_spec.rb[1:1:11]     | passed | 2.82 seconds           |
+./spec/cite-as_spec.rb[1:1:12]     | passed | 2.23 seconds           |
+./spec/cite-as_spec.rb[1:1:13]     | passed | 3.36 seconds           |
+./spec/cite-as_spec.rb[1:1:14]     | passed | 2.19 seconds           |
+./spec/cite-as_spec.rb[1:1:15]     | passed | 1.19 seconds           |
+./spec/cite-as_spec.rb[1:1:16]     | passed | 1.23 seconds           |
+./spec/cite-as_spec.rb[1:1:17]     | passed | 1.19 seconds           |
+./spec/cite-as_spec.rb[1:1:18]     | passed | 1.28 seconds           |
+./spec/cite-as_spec.rb[1:1:19]     | passed | 1.94 seconds           |
+./spec/cite-as_spec.rb[1:1:20]     | passed | 2.1 seconds            |
+./spec/cite-as_spec.rb[1:1:21]     | passed | 2.23 seconds           |
+./spec/cite-as_spec.rb[1:1:22]     | passed | 1.17 seconds           |
+./spec/cite-as_spec.rb[1:1:23]     | passed | 1.13 seconds           |
+./spec/cite-as_spec.rb[1:1:24]     | failed | 1.24 seconds           |
+./spec/cite-as_spec.rb[1:1:25]     | passed | 0.49678 seconds        |
+./spec/describedby_spec.rb[1:1:1]  | passed | 3.18 seconds           |
+./spec/describedby_spec.rb[1:1:2]  | passed | 1.34 seconds           |
+./spec/describedby_spec.rb[1:1:3]  | passed | 1.2 seconds            |
+./spec/describedby_spec.rb[1:1:4]  | passed | 1.14 seconds           |
+./spec/describedby_spec.rb[1:1:5]  | passed | 1.24 seconds           |
+./spec/describedby_spec.rb[1:1:6]  | passed | 1.04 seconds           |
+./spec/describedby_spec.rb[1:1:7]  | passed | 0.9844 seconds         |
+./spec/describedby_spec.rb[1:1:8]  | passed | 2.07 seconds           |
+./spec/describedby_spec.rb[1:1:9]  | passed | 2.16 seconds           |
+./spec/describedby_spec.rb[1:1:10] | passed | 2.36 seconds           |
+./spec/describedby_spec.rb[1:1:11] | passed | 2.91 seconds           |
+./spec/describedby_spec.rb[1:1:12] | passed | 2.93 seconds           |
+./spec/describedby_spec.rb[1:1:13] | passed | 1.79 seconds           |
+./spec/describedby_spec.rb[1:1:14] | passed | 2.5 seconds            |
+./spec/describedby_spec.rb[1:1:15] | passed | 2.24 seconds           |
+./spec/fsp_harvester_spec.rb[1:1]  | passed | 0.00102 seconds        |
+./spec/fsp_harvester_spec.rb[1:2]  | passed | 2.5 seconds            |
+./spec/fsp_harvester_spec.rb[1:3]  | passed | 29.49 seconds          |
+./spec/fsp_harvester_spec.rb[1:4]  | passed | 2.53 seconds           |
+./spec/fsp_harvester_spec.rb[1:5]  | passed | 2.65 seconds           |
+./spec/fsp_harvester_spec.rb[1:6]  | failed | 1 minute 24.1 seconds  |
+./spec/fsp_harvester_spec.rb[1:7]  | passed | 2 minutes 24.3 seconds |
+./spec/item_spec.rb[1:1:1]         | passed | 2.71 seconds           |
+./spec/item_spec.rb[1:1:2]         | passed | 2.98 seconds           |
+./spec/item_spec.rb[1:1:3]         | passed | 1.33 seconds           |
+./spec/item_spec.rb[1:1:4]         | passed | 1.81 seconds           |
+./spec/item_spec.rb[1:1:5]         | passed | 2.2 seconds            |
+./spec/item_spec.rb[1:1:6]         | passed | 2.25 seconds           |
+./spec/item_spec.rb[1:1:7]         | passed | 2.94 seconds           |
+./spec/item_spec.rb[1:1:8]         | passed | 0.62818 seconds        |
+./spec/type_spec.rb[1:1:1]         | passed | 1.33 seconds           |
+./spec/type_spec.rb[1:1:2]         | passed | 1.22 seconds           |
+./spec/type_spec.rb[1:1:3]         | passed | 1.61 seconds           |

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    fsp_harvester (0.1.20)
+    fsp_harvester (0.1.21)
       json (~> 2.0)
       linkeddata (~> 3.2)
       linkheaders-processor (~> 0.1.18)

data/lib/constants.rb CHANGED Viewed

@@ -1,4 +1,6 @@
-ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
+module FspHarvester
+ACCEPT_LD_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
 ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
@@ -77,6 +79,7 @@ GUID_TYPES = {
   'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
   'ark' => Regexp.new(%r{^ark:/[^\s]+$})
 }
+end
 # CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
 # extruct = CONFIG.dig(:extruct, :command)
@@ -88,7 +91,7 @@ when /[&|;`$\s]/
 when /echo/i
   abort 'The Extruct command appears to be subject to command injection.  I will not continue'
 end
-EXTRUCT_COMMAND = extruct
+FspHarvester::EXTRUCT_COMMAND = extruct
 # rdf_command = CONFIG.dig(:rdf, :command)
 rdf_command = ENV['RDF_COMMAND'] || 'rdf'
@@ -101,8 +104,8 @@ when /echo/i
 when !(/rdf$/ =~ $_)
   abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
 end
-RDF_COMMAND = rdf_command
+FspHarvester::RDF_COMMAND = rdf_command
 # tika_command = CONFIG.dig(:tika, :command)
 tika_command = ENV['TIKA_COMMAND'] || 'http://localhost:9998/meta'
-TIKA_COMMAND = tika_command
+FspHarvester::TIKA_COMMAND = tika_command

data/lib/external_tools.rb CHANGED Viewed

@@ -5,18 +5,21 @@ module HarvesterTools
   end
   class ExternalTools
+    attr_accessor :distillerknown, :extructknown
     def initialize(metadata: HarvesterTools::MetadataObject.new)
+      @distillerknown = {}
+      @extructknown = {}
       @meta = metadata
     end
-    def process_with_distiller(body:)
+    def process_with_distiller(body:, metadata:)
+      meta = metadata
       bhash = Digest::SHA256.hexdigest(body)
-      if @@distillerknown[bhash]
-        @meta.comments << "INFO: data is already parsed by distiller.\n"
-        #parse_rdf(body: body)
+      if distillerknown[bhash]
+        meta.comments << "INFO: data is already parsed by distiller.\n"
       else
-        @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
+        meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
         file = Tempfile.new('foo', encoding: 'UTF-8')
         body = body.force_encoding('UTF-8')
         body.scrub!
@@ -24,60 +27,73 @@ module HarvesterTools
         file.write(body)
         file.rewind
-        @meta.comments << "INFO: The message body is being examined by Distiller\n"
-        command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        meta.comments << "INFO: The message body is being examined by Distiller\n"
+        command = "LANG=en_US.UTF-8 #{FspHarvester::RDF_COMMAND} serialize --input-format rdfa --output-format jsonld #{file.path}"
         warn "distiller command: #{command}"
         result, _stderr, _status = Open3.capture3(command)
         warn ''
-        warn "distiller errors: #{stderr}"
+        warn "distiller errors: #{_stderr}" if _stderr
         file.close
         file.unlink
         result = result.force_encoding('UTF-8')
-        warn "DIST RESULT: #{result}"
+        # warn "DIST RESULT: #{result}"
         if result !~ /@context/i # failure returns nil
-          @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
-          @meta.add_warning(['018', '', ''])
-          result = "{}"
+          meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
+          meta.add_warning(['018', '', ''])
+          result = '{}'
         else
-          @meta.comments << "INFO: The Distiller found parseable data.  Parsing as JSON-LD\n"
+          meta.comments << "INFO: The Distiller found parseable data.  Parsing as JSON-LD\n"
         end
-        @@distillerknown[bhash] = true
+        distillerknown[bhash] = true
       end
       result
     end
-    def processs_with_extruct(uri:)
-      @meta.comments << "INFO:  Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
-      warn 'begin open3'
-      stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
-      warn "open3 status: #{status} #{stdout}"
-      result = stderr # absurd that the output comes over stderr!  LOL!
-      jsonld = {}
-      microdata = Hash.new
-      microformat = Hash.new
-      opengraph = Hash.new
-      rdfa = Hash.new
+    def process_with_extruct(uri:, metadata:)
+      bhash = Digest::SHA256.hexdigest(uri)
+      jsonld = '{}'
+      microdata = {}
+      microformat = {}
+      opengraph = {}
+      rdfa = '{}'
-      if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
-        @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
-        @meta.add_warning(['019', '', ''])
-        if result.to_s.match(/(ValueError:.*?)\n/)
-          @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
-          @meta.add_warning(['019', '', ''])
-        end
-      elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
-        json = JSON.parse result
-        @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
-        jsonld = json['json-ld'].to_json if json['json-ld'].any?
-        microdata = json['microdata'].first if json['microdata'].any
-        microformat = json['microformat'].first if json['microformat'].any?
-        opengraph = json['opengraph'].first if json['opengraph'].any?
-        rdfa = json['rdfa'].to_json if json['rdfa'].any?
-        # @meta.merge_hash(json.first) if json.first.is_a? Hash
+      if extructknown[bhash]
+        metadata.comments << "INFO: data is already parsed by extruct.\n"
       else
-        @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
+        metadata.comments << "INFO:  Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
+        warn 'begin open3'
+        stdout, stderr, status = Open3.capture3(FspHarvester::EXTRUCT_COMMAND + ' ' + uri)
+        warn "open3 status: #{status} #{stdout}"
+        result = stderr # absurd that the output comes over stderr!  LOL!
+        if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
+          metadata.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
+          metadata.add_warning(['019', '', ''])
+          if result.to_s.match(/(ValueError:.*?)\n/)
+            metadata.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
+            metadata.add_warning(['019', '', ''])
+          end
+        elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
+          begin
+            json = JSON.parse result
+          rescue StandardError
+            metadata.comments << "WARN: extruct threw an error when attempting to parse the extruct command return value from processing #{uri}.\n"
+            metadata.add_warning(['019', '', ''])
+            return [jsonld, microdata, microformat, opengraph, rdfa]
+          end
+          metadata.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
+          jsonld = json['json-ld'].to_json if json['json-ld'].any?
+          microdata = json['microdata'].first if json['microdata'].any?
+          microformat = json['microformat'].first if json['microformat'].any?
+          opengraph = json['opengraph'].first if json['opengraph'].any?
+          rdfa = json['rdfa'].to_json if json['rdfa'].any?
+          # @meta.merge_hash(json.first) if json.first.is_a? Hash
+        else
+          @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
+        end
       end
+      extructknown[bhash] = true
       [jsonld, microdata, microformat, opengraph, rdfa]
     end
   end

data/lib/fsp_harvester/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module FspHarvester
-  VERSION = "0.1.20"
+  VERSION = "0.1.21"
 end

data/lib/harvester_brute.rb CHANGED Viewed

@@ -3,45 +3,81 @@ module HarvesterTools
   end
   class BruteForce
-    def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
+    def self.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new)
       type, url = HarvesterTools::Utils.convertToURL(guid: guid)
       return false unless type
-      do_content_negotiation(url: url, metadata: metadata)
+      # TODO:  follow rel=alternate headers, if they are in LD or Hash format
+      do_content_negotiation(url: url, metadata: metadata, links: links)
       metadata
     end
-    def self.do_content_negotiation(url:, metadata:)
-      response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
+    def self.do_content_negotiation(url:, metadata:, links: [])
+      warn "\n\nINFO: entering content negotiation of #{url}\n\n"
+      metadata.comments << "INFO: entering content negotiation of #{url}.\n"
+      response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER)
       if response
         HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
       end
-      response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
+      response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_STAR_HEADER)
       if response
         HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
-        response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER) # now do content negotiation on the landing page
+        response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER) # now do content negotiation on the landing page
         if response
           HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
         end
       end
+      process_alternates(links: links, metadata: metadata)
+    end
+    def self.process_alternates(links: [], metadata:)
+      warn "\n\nINFO: entering content negotiation on link alternates\n\n"
+      metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
+      # process "alternate" links
+      links.each do |link|
+        next unless link.relation == "alternate"
+        url = link.href
+        headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
+        headers ||= FspHarvester::ACCEPT_STAR_HEADER
+        warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
+        metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
+        response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
+        if response
+          HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
+        end
+      end
     end
     def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
-      @meta = metadata
-      @meta.guidtype = 'uri' if @meta.guidtype.nil?
-      warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
-      response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: @meta)
+      cache_key = Digest::MD5.hexdigest url + headers.to_s
+      if metadata.url_header_hash[cache_key]
+        warn "Already processed #{url} - moving on"
+        metadata.comments << "INFO: Already processed #{url} - moving on.\n"
+        return false
+      end
+      metadata.guidtype = 'uri' if metadata.guidtype.nil?
+      warn "\n\n BRUTE FETCHING #{url} \nwith headers\n #{headers}\n\n"
+      response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: metadata)
       warn "\n\n head #{response.headers.inspect}\n\n" if response
       unless response
-        @meta.add_warning(['001', url, headers])
-        @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
-        @meta.full_response << [url, "No response"]
+        metadata.add_warning(['001', url, headers])
+        metadata.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
+        metadata.full_response << [url, "No response"]
         false
       end
-      @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}.  Using the output from this URL for the next few tests..."
-      @meta.full_response << [url, response.body]
+      metadata.comments << "INFO: following redirection using this header led to the following URL: #{metadata.all_uris.last}.  Using the output from this URL for the next few tests..."
+      metadata.full_response << [url, response.body]
+      metadata.url_header_hash[cache_key] = true
       response
     end
   end

data/lib/harvester_utils.rb CHANGED Viewed

@@ -20,7 +20,7 @@ module HarvesterTools
     end
     def self.convertToURL(guid:)
-      GUID_TYPES.each do |k, regex|
+      FspHarvester::GUID_TYPES.each do |k, regex|
         if k == 'inchi' and regex.match(guid)
           return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
         elsif k == 'handle1' and regex.match(guid)
@@ -39,13 +39,13 @@ module HarvesterTools
     end
     def self.typeit(guid:)
-      GUID_TYPES.each do |type, regex|
+      FspHarvester::GUID_TYPES.each do |type, regex|
         return type if regex.match(guid)
       end
       false
     end
-    def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
+    def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: FspHarvester::ACCEPT_STAR_HEADER)
       @meta = metadata
       @meta.guidtype = 'uri' if @meta.guidtype.nil?
       warn "\n\n FETCHING #{url} #{header}\n\n"
@@ -59,7 +59,7 @@ module HarvesterTools
       end
       @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}.  Using the output from this URL for the next few tests..."
-      @meta.full_response << response.body
+      @meta.full_response << [url, response.body]
       links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
       links

data/lib/metadata_harvester.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module HarvesterTools
       hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
       describedby.each do |link|
-        accepttype = ACCEPT_STAR_HEADER
+        accepttype = FspHarvester::ACCEPT_STAR_HEADER
         accept = link.respond_to?('type') ? link.type : nil
         accepttype = { 'Accept' => accept } if accept
@@ -38,9 +38,14 @@ module HarvesterTools
       abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
       unless abbreviation
         @meta.add_warning(['017', response.request.url, ''])
-        @meta.comments << "WARN: metadata format returned from #{response.request.url} is not recognized. Moving on.\n"
+        @meta.comments << "WARN: format returned from #{response.request.url} is not recognized. Moving on.\n"
         return
       end
+      request_content_types = response.request.headers["Accept"].split(/,\s*/)
+      unless (request_content_types.include? content_type) and !(request_content_types.include? "*/*") and (response.code != 406)
+        @meta.add_warning(['023', response.request.url, ''])
+        @meta.comments << "WARN: format returned from #{response.request.url} does not match request type.  This should result in a 406 error, but instead was accepted as a 200.\n"
+      end
       process_according_to_type(body: response.body, uri: response.request.url, metadata: @meta,
                                 abbreviation: abbreviation, content_type: content_type)
     end
@@ -65,7 +70,7 @@ module HarvesterTools
       end
     end
-    def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
+    def self.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER)
       @meta.comments << "INFO:  link #{link.href} being processed"
       if link.respond_to? 'type'
         header = { 'Accept' => link.type }
@@ -86,23 +91,37 @@ module HarvesterTools
       abbreviation = nil
       content_type = nil
       @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
+      claimed_type = headers[:content_type]
+      claimed_type.gsub!(/\s*;.*/, '')
       if body =~ /^\s*<\?xml/
-        if body =~ /<HTML/i
+        if body[0..1000] =~ /<HTML/i  # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
           abbreviation = 'html'
-          content_type = 'text/html'
+          content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
+          @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
+          content_type |= 'text/html'
           @meta.comments << 'INFO: appears to be HTML\n'
         elsif body =~ /<rdf:RDF/i
           abbreviation = 'rdfxml'
-          content_type = 'application/rdf+xml'
+          content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
+          @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
+          content_type |= 'application/rdf+xml'
           @meta.comments << 'INFO: appears to be RDF-XML\n'
         else
           abbreviation = 'xml'
-          content_type = 'application/xml'
+          content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
+          @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
+          content_type |= 'application/xml'
           @meta.comments << 'INFO: appears to be XML\n'
         end
+      elsif body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
+        abbreviation = 'html'
+        content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
+        @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
+        content_type ||= 'text/html'
+        @meta.comments << 'INFO: appears to be HTML\n'
       else
-        abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
-        abbreviation, content_type = check_json(body: body) unless abbreviation
+        abbreviation, content_type = check_ld(body: body, claimed_type: claimed_type)
+        abbreviation, content_type = check_json(body: body) unless abbreviation  # don't test if LD already found!
       end
       unless content_type
@@ -112,18 +131,46 @@ module HarvesterTools
       [abbreviation, content_type]
     end
+    def self.validate_claimed_type(abbreviation:, claimed_type:)
+        warn "\n\nclaimed type #{claimed_type}\nabbreviation #{abbreviation}\n\n"
+        claimed_type.gsub!(/\s*;.*/, '')
+        case abbreviation
+        when 'html'
+          return claimed_type if FspHarvester::HTML_FORMATS['html'].include? claimed_type
+        when 'xml'
+          return claimed_type if FspHarvester::XML_FORMATS['xml'].include? claimed_type
+        when 'json'
+          return claimed_type if FspHarvester::JSON_FORMATS['json'].include? claimed_type
+        when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
+          return claimed_type if FspHarvester::RDF_FORMATS.values.flatten.include? claimed_type
+        when 'specialist'
+          warn 'no specialized parsers so far'
+        end
+        return false
+    end
     def self.check_ld(body:, claimed_type:)
       detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
-      unless detected_type
+      unless detected_type  # see if distiller can detect a type
         detected_type = RDF::Format.for({ sample: body[0..5000] })
         @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
       end
+      # at this point, detected_type is something like RDF::Turtle::Format (or nil).  This will return a content-type
       contenttype = ''
       abbreviation = ''
       if detected_type
-        contenttype = detected_type.content_type.first # comes back as array
-        abbreviation = abbreviate_type(contenttype: contenttype)
-        @meta.comments << "INFO: using content-type #{contenttype}.\n"
+        detectedcontenttypes = detected_type.content_type # comes back as array of [application/x, application/y]
+        unless detectedcontenttypes.include? claimed_type
+          @meta.add_warning(['022', @meta.all_uris.last, "" ])
+          contenttype = detected_type.content_type.first  # just pick one arbitrarily, since it doesn't match the declared type anyway
+          abbreviation = abbreviate_type(contenttype: contenttype)
+          @meta.comments << "INFO: using content-type #{contenttype} even though there was a mismatch.\n"
+        else
+          contenttype = claimed_type  # the declared type is one of the detected format's content types, so keep it
+          abbreviation = abbreviate_type(contenttype: contenttype)
+          @meta.comments << "INFO: using content-type #{contenttype}.\n"
+        end
       else
         @meta.comments << "INFO: metadata does not appear to be in a linked data format.  Trying other options.\n"
       end
@@ -161,13 +208,14 @@ module HarvesterTools
         abbreviation = 'json'
       else
         @meta.comments << "INFO: metadata does not appear to be in JSON format.  No options left.\n"
+        return [nil, nil]
       end
-      [abbreviation, 'application/ld+json']
+      [abbreviation, 'application/json']
     end
     def self.abbreviate_type(contenttype:)
       foundtype = nil
-      RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
+      FspHarvester::RDF_FORMATS.merge(FspHarvester::XML_FORMATS).merge(FspHarvester::HTML_FORMATS).merge(FspHarvester::JSON_FORMATS).each do |type, vals|
         warn "\n\ntype #{type}\nvals #{vals}\n\n"
         @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
         next unless vals.include? contenttype

data/lib/metadata_object.rb CHANGED Viewed

@@ -1,8 +1,8 @@
 module HarvesterTools
   class MetadataObject
-    attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response
+    attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date, :url_header_hash  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response
-    def initialize(id: "unidentified_metadata") # get a name from the "new" call, or set a default
+    def initialize(id: "urn:local:unidentified_metadata") # get a name from the "new" call, or set a default
       @id = id
       @hash = {}
       @graph = RDF::Graph.new
@@ -16,6 +16,7 @@ module HarvesterTools
       @score = 0
       @version = '0.0'
       @date = Time.now.strftime('%Y-%m-%dT%H:%M:%S.%L%z')
+      @url_header_hash = Hash.new(false) # the combination of URL and the accept headers, sha1 hashed, for quick lookup if it has already been processed
       w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
       #@warn = File.read("./lib/warnings.json")
       @warn = JSON.parse(w)
@@ -37,6 +38,7 @@ module HarvesterTools
     def add_warning(warning)
       id = warning[0]
+      return unless @warn[id]   # if there's a mismatch between code and the warnings in github
       url = warning[1]
       headers = warning[2]
       message = @warn[id]['message']

data/lib/metadata_parser.rb CHANGED Viewed

@@ -13,17 +13,16 @@ module HarvesterTools
       @meta = metadata_object
     end
-    def process_html(body:, uri:, metadata:)
-      @meta = metadata
+    def process_html(body:, uri:, metadata: @meta)
       tools = HarvesterTools::ExternalTools.new(metadata: @meta)
-      result = tools.process_with_distiller(body: body)
+      tools.process_with_distiller(body: body, metadata: @meta) # adds to @meta
-      jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
-      parse_rdf(body: jsonld, content_type: 'application/ld+json')
+      jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri, metadata: @meta)
+      parse_rdf(body: jsonld, content_type: 'application/ld+json', metadata: metadata)
       @meta.merge_hash(microdata)
       @meta.merge_hash(microformat)
       @meta.merge_hash(opengraph)
-      parse_rdf(body: rdfa, content_type: 'application/ld+json')
+      parse_rdf(body: rdfa, content_type: 'application/ld+json', metadata: @meta)
     end
     def process_xml(body:, metadata:)

data/lib/warnings.json CHANGED Viewed

@@ -116,6 +116,18 @@
                   {"Validator": "https://jsononline.net/json-validator"}],
       "severity": "WARN"
    },
+   "022": {
+      "message": "Mismatch between the Content-type header and the content of the returned document.",
+      "linkout": [],
+      "severity": "WARN"
+   },
+   "023": {
+      "message": "Returned content-type is not compatible with any requested content-type, yet a HTTP 406 error was not returned",
+      "linkout": [],
+      "severity": "WARN"
+   },
    "600": {
       "message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
       "linkout": [],

data/lib/web_utils.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module HarvesterTools
   class WebUtils
-    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
+    def self.fspfetch(url:, headers: ACCEPT_STAR_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
       warn 'In fetch routine now.  '
       begin

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fsp_harvester
 version: !ruby/object:Gem::Version
-  version: 0.1.20
+  version: 0.1.21
 platform: ruby
 authors:
 - Mark Wilkinson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-08-17 00:00:00.000000000 Z
+date: 2022-08-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json