RubyGems - fsp_harvester - Versions diffs - 0.1.20 → 0.1.21 - Mend

fsp_harvester 0.1.20 → 0.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 72df31c63580f2b47676bb719c860cd26cab4290346f20bd481f67d18b29f765
-  data.tar.gz: 477bfe524de0a1822790eac1caefb642a5e881734e8f1bc3c8f46c1a91b3e1e0
+  metadata.gz: 8498c33db9c350fec8ea4e734b31087f798a4f433f211115c69ded468dbcdb12
+  data.tar.gz: f3f408b24575f4f310c6f00ac0d42c3106f68fcd43199d3dbb73d8e4deb403fe
 SHA512:
-  metadata.gz: 328b1bf4531034b38f325ec7c2dfb682007ed8ef5fb4f9ea72a4776ffdb49bbdde280bd959f9adddcfb93f6a065b77af68ab6d5222942bc16b5d50901f771770
-  data.tar.gz: d5bc1e2e88be865c17aa12bca0a4308f3bc8e476bc0f49f40ca7a48a4e92142d8613c8e12823d9bb5e2735ea4d4cc492d6740481fdbebd513cb4f0be0c8114c8
+  metadata.gz: 263549dc8b8bf2fe8a4bc50289092ad2e55d9bbc05cabc509637786b4923948345220f0ee7a04fa5db497f670d9ab79d4e35e05648f09f189103ae869040baad
+  data.tar.gz: 6b35a320400ff37561ddf2cff506a3a4f385cd31933e4df8ceb4d436a4f97974e7122ab96962b8ce3cfc183144fcf41592fa417abcdb609fd44080061fc5e1a3

data/.rspec_status CHANGED Viewed

@@ -1,57 +1,60 @@
-example_id                         | status | run_time        |
----------------------------------- | ------ | --------------- |
-./spec/cite-as_spec.rb[1:1:1]      | passed | 1.36 seconds    |
-./spec/cite-as_spec.rb[1:1:2]      | passed | 1.31 seconds    |
-./spec/cite-as_spec.rb[1:1:3]      | passed | 1.53 seconds    |
-./spec/cite-as_spec.rb[1:1:4]      | passed | 1.84 seconds    |
-./spec/cite-as_spec.rb[1:1:5]      | passed | 2.77 seconds    |
-./spec/cite-as_spec.rb[1:1:6]      | passed | 2.06 seconds    |
-./spec/cite-as_spec.rb[1:1:7]      | passed | 2.96 seconds    |
-./spec/cite-as_spec.rb[1:1:8]      | passed | 2.28 seconds    |
-./spec/cite-as_spec.rb[1:1:9]      | passed | 2.83 seconds    |
-./spec/cite-as_spec.rb[1:1:10]     | passed | 2.14 seconds    |
-./spec/cite-as_spec.rb[1:1:11]     | passed | 3.19 seconds    |
-./spec/cite-as_spec.rb[1:1:12]     | passed | 3.06 seconds    |
-./spec/cite-as_spec.rb[1:1:13]     | passed | 2.77 seconds    |
-./spec/cite-as_spec.rb[1:1:14]     | passed | 2.2 seconds     |
-./spec/cite-as_spec.rb[1:1:15]     | passed | 1.19 seconds    |
-./spec/cite-as_spec.rb[1:1:16]     | passed | 1.1 seconds     |
-./spec/cite-as_spec.rb[1:1:17]     | passed | 1.31 seconds    |
-./spec/cite-as_spec.rb[1:1:18]     | passed | 1.14 seconds    |
-./spec/cite-as_spec.rb[1:1:19]     | passed | 1.68 seconds    |
-./spec/cite-as_spec.rb[1:1:20]     | passed | 1.69 seconds    |
-./spec/cite-as_spec.rb[1:1:21]     | passed | 2.35 seconds    |
-./spec/cite-as_spec.rb[1:1:22]     | passed | 1.12 seconds    |
-./spec/cite-as_spec.rb[1:1:23]     | passed | 1.16 seconds    |
-./spec/cite-as_spec.rb[1:1:24]     | failed | 1.45 seconds    |
-./spec/cite-as_spec.rb[1:1:25]     | passed | 0.72571 seconds |
-./spec/describedby_spec.rb[1:1:1]  | passed | 3.09 seconds    |
-./spec/describedby_spec.rb[1:1:2]  | passed | 1.13 seconds    |
-./spec/describedby_spec.rb[1:1:3]  | passed | 1.22 seconds    |
-./spec/describedby_spec.rb[1:1:4]  | passed | 1.11 seconds    |
-./spec/describedby_spec.rb[1:1:5]  | passed | 1.21 seconds    |
-./spec/describedby_spec.rb[1:1:6]  | passed | 1.24 seconds    |
-./spec/describedby_spec.rb[1:1:7]  | passed | 1.53 seconds    |
-./spec/describedby_spec.rb[1:1:8]  | passed | 2.53 seconds    |
-./spec/describedby_spec.rb[1:1:9]  | passed | 1.74 seconds    |
-./spec/describedby_spec.rb[1:1:10] | passed | 2.59 seconds    |
-./spec/describedby_spec.rb[1:1:11] | passed | 3.49 seconds    |
-./spec/describedby_spec.rb[1:1:12] | passed | 3.82 seconds    |
-./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds    |
-./spec/describedby_spec.rb[1:1:14] | passed | 2.19 seconds    |
-./spec/describedby_spec.rb[1:1:15] | passed | 2.16 seconds    |
-./spec/fsp_harvester_spec.rb[1:1]  | passed | 0.00015 seconds |
-./spec/fsp_harvester_spec.rb[1:2]  | passed | 2.49 seconds    |
-./spec/fsp_harvester_spec.rb[1:3]  | passed | 7.06 seconds    |
-./spec/fsp_harvester_spec.rb[1:4]  | passed | 2.74 seconds    |
-./spec/item_spec.rb[1:1:1]         | passed | 3.41 seconds    |
-./spec/item_spec.rb[1:1:2]         | passed | 2.84 seconds    |
-./spec/item_spec.rb[1:1:3]         | passed | 1.15 seconds    |
-./spec/item_spec.rb[1:1:4]         | passed | 1.74 seconds    |
-./spec/item_spec.rb[1:1:5]         | passed | 2.6 seconds     |
-./spec/item_spec.rb[1:1:6]         | passed | 2.32 seconds    |
-./spec/item_spec.rb[1:1:7]         | passed | 2.81 seconds    |
-./spec/item_spec.rb[1:1:8]         | passed | 0.49717 seconds |
-./spec/type_spec.rb[1:1:1]         | passed | 1.25 seconds    |
-./spec/type_spec.rb[1:1:2]         | passed | 1.18 seconds    |
-./spec/type_spec.rb[1:1:3]         | passed | 1.58 seconds    |
+example_id                         | status | run_time               |
+---------------------------------- | ------ | ---------------------- |
+./spec/cite-as_spec.rb[1:1:1]      | passed | 1.77 seconds           |
+./spec/cite-as_spec.rb[1:1:2]      | passed | 1.22 seconds           |
+./spec/cite-as_spec.rb[1:1:3]      | passed | 1.09 seconds           |
+./spec/cite-as_spec.rb[1:1:4]      | passed | 1.89 seconds           |
+./spec/cite-as_spec.rb[1:1:5]      | passed | 2.95 seconds           |
+./spec/cite-as_spec.rb[1:1:6]      | passed | 2.14 seconds           |
+./spec/cite-as_spec.rb[1:1:7]      | passed | 2.96 seconds           |
+./spec/cite-as_spec.rb[1:1:8]      | passed | 2.28 seconds           |
+./spec/cite-as_spec.rb[1:1:9]      | passed | 3.4 seconds            |
+./spec/cite-as_spec.rb[1:1:10]     | passed | 2.21 seconds           |
+./spec/cite-as_spec.rb[1:1:11]     | passed | 2.82 seconds           |
+./spec/cite-as_spec.rb[1:1:12]     | passed | 2.23 seconds           |
+./spec/cite-as_spec.rb[1:1:13]     | passed | 3.36 seconds           |
+./spec/cite-as_spec.rb[1:1:14]     | passed | 2.19 seconds           |
+./spec/cite-as_spec.rb[1:1:15]     | passed | 1.19 seconds           |
+./spec/cite-as_spec.rb[1:1:16]     | passed | 1.23 seconds           |
+./spec/cite-as_spec.rb[1:1:17]     | passed | 1.19 seconds           |
+./spec/cite-as_spec.rb[1:1:18]     | passed | 1.28 seconds           |
+./spec/cite-as_spec.rb[1:1:19]     | passed | 1.94 seconds           |
+./spec/cite-as_spec.rb[1:1:20]     | passed | 2.1 seconds            |
+./spec/cite-as_spec.rb[1:1:21]     | passed | 2.23 seconds           |
+./spec/cite-as_spec.rb[1:1:22]     | passed | 1.17 seconds           |
+./spec/cite-as_spec.rb[1:1:23]     | passed | 1.13 seconds           |
+./spec/cite-as_spec.rb[1:1:24]     | failed | 1.24 seconds           |
+./spec/cite-as_spec.rb[1:1:25]     | passed | 0.49678 seconds        |
+./spec/describedby_spec.rb[1:1:1]  | passed | 3.18 seconds           |
+./spec/describedby_spec.rb[1:1:2]  | passed | 1.34 seconds           |
+./spec/describedby_spec.rb[1:1:3]  | passed | 1.2 seconds            |
+./spec/describedby_spec.rb[1:1:4]  | passed | 1.14 seconds           |
+./spec/describedby_spec.rb[1:1:5]  | passed | 1.24 seconds           |
+./spec/describedby_spec.rb[1:1:6]  | passed | 1.04 seconds           |
+./spec/describedby_spec.rb[1:1:7]  | passed | 0.9844 seconds         |
+./spec/describedby_spec.rb[1:1:8]  | passed | 2.07 seconds           |
+./spec/describedby_spec.rb[1:1:9]  | passed | 2.16 seconds           |
+./spec/describedby_spec.rb[1:1:10] | passed | 2.36 seconds           |
+./spec/describedby_spec.rb[1:1:11] | passed | 2.91 seconds           |
+./spec/describedby_spec.rb[1:1:12] | passed | 2.93 seconds           |
+./spec/describedby_spec.rb[1:1:13] | passed | 1.79 seconds           |
+./spec/describedby_spec.rb[1:1:14] | passed | 2.5 seconds            |
+./spec/describedby_spec.rb[1:1:15] | passed | 2.24 seconds           |
+./spec/fsp_harvester_spec.rb[1:1]  | passed | 0.00102 seconds        |
+./spec/fsp_harvester_spec.rb[1:2]  | passed | 2.5 seconds            |
+./spec/fsp_harvester_spec.rb[1:3]  | passed | 29.49 seconds          |
+./spec/fsp_harvester_spec.rb[1:4]  | passed | 2.53 seconds           |
+./spec/fsp_harvester_spec.rb[1:5]  | passed | 2.65 seconds           |
+./spec/fsp_harvester_spec.rb[1:6]  | failed | 1 minute 24.1 seconds  |
+./spec/fsp_harvester_spec.rb[1:7]  | passed | 2 minutes 24.3 seconds |
+./spec/item_spec.rb[1:1:1]         | passed | 2.71 seconds           |
+./spec/item_spec.rb[1:1:2]         | passed | 2.98 seconds           |
+./spec/item_spec.rb[1:1:3]         | passed | 1.33 seconds           |
+./spec/item_spec.rb[1:1:4]         | passed | 1.81 seconds           |
+./spec/item_spec.rb[1:1:5]         | passed | 2.2 seconds            |
+./spec/item_spec.rb[1:1:6]         | passed | 2.25 seconds           |
+./spec/item_spec.rb[1:1:7]         | passed | 2.94 seconds           |
+./spec/item_spec.rb[1:1:8]         | passed | 0.62818 seconds        |
+./spec/type_spec.rb[1:1:1]         | passed | 1.33 seconds           |
+./spec/type_spec.rb[1:1:2]         | passed | 1.22 seconds           |
+./spec/type_spec.rb[1:1:3]         | passed | 1.61 seconds           |

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    fsp_harvester (0.1.20)
+    fsp_harvester (0.1.21)
       json (~> 2.0)
       linkeddata (~> 3.2)
       linkheaders-processor (~> 0.1.18)

data/lib/constants.rb CHANGED Viewed

@@ -1,4 +1,6 @@
-ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
+module FspHarvester
+ACCEPT_LD_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
 ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
@@ -77,6 +79,7 @@ GUID_TYPES = {
   'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
   'ark' => Regexp.new(%r{^ark:/[^\s]+$})
 }
+end
 # CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
 # extruct = CONFIG.dig(:extruct, :command)
@@ -88,7 +91,7 @@ when /[&|;`$\s]/
 when /echo/i
   abort 'The Extruct command appears to be subject to command injection.  I will not continue'
 end
-EXTRUCT_COMMAND = extruct
+FspHarvester::EXTRUCT_COMMAND = extruct
 # rdf_command = CONFIG.dig(:rdf, :command)
 rdf_command = ENV['RDF_COMMAND'] || 'rdf'
@@ -101,8 +104,8 @@ when /echo/i
 when !(/rdf$/ =~ $_)
   abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
 end
-RDF_COMMAND = rdf_command
+FspHarvester::RDF_COMMAND = rdf_command
 # tika_command = CONFIG.dig(:tika, :command)
 tika_command = ENV['TIKA_COMMAND'] || 'http://localhost:9998/meta'
-TIKA_COMMAND = tika_command
+FspHarvester::TIKA_COMMAND = tika_command

data/lib/external_tools.rb CHANGED Viewed

@@ -5,18 +5,21 @@ module HarvesterTools
   end
   class ExternalTools
+    attr_accessor :distillerknown, :extructknown
     def initialize(metadata: HarvesterTools::MetadataObject.new)
+      @distillerknown = {}
+      @extructknown = {}
       @meta = metadata
     end
-    def process_with_distiller(body:)
+    def process_with_distiller(body:, metadata:)
+      meta = metadata
       bhash = Digest::SHA256.hexdigest(body)
-      if @@distillerknown[bhash]
-        @meta.comments << "INFO: data is already parsed by distiller.\n"
-        #parse_rdf(body: body)
+      if distillerknown[bhash]
+        meta.comments << "INFO: data is already parsed by distiller.\n"
       else
-        @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
+        meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
         file = Tempfile.new('foo', encoding: 'UTF-8')
         body = body.force_encoding('UTF-8')
         body.scrub!
@@ -24,60 +27,73 @@ module HarvesterTools
         file.write(body)
         file.rewind
-        @meta.comments << "INFO: The message body is being examined by Distiller\n"
-        command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        meta.comments << "INFO: The message body is being examined by Distiller\n"
+        command = "LANG=en_US.UTF-8 #{FspHarvester::RDF_COMMAND} serialize --input-format rdfa --output-format jsonld #{file.path}"
         warn "distiller command: #{command}"
         result, _stderr, _status = Open3.capture3(command)
         warn ''
-        warn "distiller errors: #{stderr}"
+        warn "distiller errors: #{_stderr}" if _stderr
         file.close
         file.unlink
         result = result.force_encoding('UTF-8')
-        warn "DIST RESULT: #{result}"
+        # warn "DIST RESULT: #{result}"
         if result !~ /@context/i # failure returns nil
-          @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
-          @meta.add_warning(['018', '', ''])
-          result = "{}"
+          meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
+          meta.add_warning(['018', '', ''])
+          result = '{}'
         else
-          @meta.comments << "INFO: The Distiller found parseable data.  Parsing as JSON-LD\n"
+          meta.comments << "INFO: The Distiller found parseable data.  Parsing as JSON-LD\n"
         end
-        @@distillerknown[bhash] = true
+        distillerknown[bhash] = true
       end
       result
     end
-    def processs_with_extruct(uri:)
-      @meta.comments << "INFO:  Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
-      warn 'begin open3'
-      stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
-      warn "open3 status: #{status} #{stdout}"
-      result = stderr # absurd that the output comes over stderr!  LOL!
-      jsonld = {}
-      microdata = Hash.new
-      microformat = Hash.new
-      opengraph = Hash.new
-      rdfa = Hash.new
+    def process_with_extruct(uri:, metadata:)
+      bhash = Digest::SHA256.hexdigest(uri)
+      jsonld = '{}'
+      microdata = {}
+      microformat = {}
+      opengraph = {}
+      rdfa = '{}'
-      if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
-        @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
-        @meta.add_warning(['019', '', ''])
-        if result.to_s.match(/(ValueError:.*?)\n/)
-          @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
-          @meta.add_warning(['019', '', ''])
-        end
-      elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
-        json = JSON.parse result
-        @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
-        jsonld = json['json-ld'].to_json if json['json-ld'].any?
-        microdata = json['microdata'].first if json['microdata'].any
-        microformat = json['microformat'].first if json['microformat'].any?
-        opengraph = json['opengraph'].first if json['opengraph'].any?
-        rdfa = json['rdfa'].to_json if json['rdfa'].any?
-        # @meta.merge_hash(json.first) if json.first.is_a? Hash
+      if extructknown[bhash]
+        metadata.comments << "INFO: data is already parsed by extruct.\n"
       else
-        @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
+        metadata.comments << "INFO:  Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
+        warn 'begin open3'
+        stdout, stderr, status = Open3.capture3(FspHarvester::EXTRUCT_COMMAND + ' ' + uri)
+        warn "open3 status: #{status} #{stdout}"
+        result = stderr # absurd that the output comes over stderr!  LOL!
+        if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
+          metadata.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
+          metadata.add_warning(['019', '', ''])
+          if result.to_s.match(/(ValueError:.*?)\n/)
+            metadata.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
+            metadata.add_warning(['019', '', ''])
+          end
+        elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
+          begin
+            json = JSON.parse result
+          rescue StandardError
+            metadata.comments << "WARN: extruct threw an error when attempting to parse the extruct command return value from processing #{uri}.\n"
+            metadata.add_warning(['019', '', ''])
+            return [jsonld, microdata, microformat, opengraph, rdfa]
+          end
+          metadata.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
+          jsonld = json['json-ld'].to_json if json['json-ld'].any?
+          microdata = json['microdata'].first if json['microdata'].any?
+          microformat = json['microformat'].first if json['microformat'].any?
+          opengraph = json['opengraph'].first if json['opengraph'].any?
+          rdfa = json['rdfa'].to_json if json['rdfa'].any?
+          # @meta.merge_hash(json.first) if json.first.is_a? Hash
+        else
+          @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
+        end
       end
+      extructknown[bhash] = true
       [jsonld, microdata, microformat, opengraph, rdfa]
     end
   end

data/lib/fsp_harvester/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module FspHarvester
-  VERSION = "0.1.20"
+  VERSION = "0.1.21"
 end

data/lib/harvester_brute.rb CHANGED Viewed

@@ -3,45 +3,81 @@ module HarvesterTools
   end
   class BruteForce
-    def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
+    def self.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new)
       type, url = HarvesterTools::Utils.convertToURL(guid: guid)
       return false unless type
-      do_content_negotiation(url: url, metadata: metadata)
+      # TODO:  follow rel=alternate headers, if they are in LD or Hash format
+      do_content_negotiation(url: url, metadata: metadata, links: links)
       metadata
     end
-    def self.do_content_negotiation(url:, metadata:)
-      response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
+    def self.do_content_negotiation(url:, metadata:, links: [])
+      warn "\n\nINFO: entering content negotiation of #{url}\n\n"
+      metadata.comments << "INFO: entering content negotiation of #{url}.\n"
+      response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER)
       if response
         HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
       end
-      response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
+      response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_STAR_HEADER)
       if response
         HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
-        response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER) # now do content negotiation on the landing page
+        response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER) # now do content negotiation on the landing page
         if response
           HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
         end
       end
+      process_alternates(links: links, metadata: metadata)
+    end
+    def self.process_alternates(links: [], metadata:)
+      warn "\n\nINFO: entering content negotiation on link alternates\n\n"
+      metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
+      # process "alternate" links
+      links.each do |link|
+        next unless link.relation == "alternate"
+        url = link.href
+        headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
+        headers ||= FspHarvester::ACCEPT_STAR_HEADER
+        warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
+        metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
+        response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
+        if response
+          HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
+        end
+      end
     end
     def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
-      @meta = metadata
-      @meta.guidtype = 'uri' if @meta.guidtype.nil?
-      warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
-      response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: @meta)
+      cache_key = Digest::MD5.hexdigest url + headers.to_s
+      if metadata.url_header_hash[cache_key]
+        warn "Already processed #{url} - moving on"
+        metadata.comments << "INFO: Already processed #{url} - moving on.\n"
+        return false
+      end
+      metadata.guidtype = 'uri' if metadata.guidtype.nil?
+      warn "\n\n BRUTE FETCHING #{url} \nwith headers\n #{headers}\n\n"
+      response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: metadata)
       warn "\n\n head #{response.headers.inspect}\n\n" if response
       unless response
-        @meta.add_warning(['001', url, headers])
-        @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
-        @meta.full_response << [url, "No response"]
+        metadata.add_warning(['001', url, headers])
+        metadata.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
+        metadata.full_response << [url, "No response"]
         false
       end
-      @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}.  Using the output from this URL for the next few tests..."
-      @meta.full_response << [url, response.body]
+      metadata.comments << "INFO: following redirection using this header led to the following URL: #{metadata.all_uris.last}.  Using the output from this URL for the next few tests..."
+      metadata.full_response << [url, response.body]
+      metadata.url_header_hash[cache_key] = true
       response
     end
   end

data/lib/harvester_utils.rb CHANGED Viewed

@@ -20,7 +20,7 @@ module HarvesterTools
     end
     def self.convertToURL(guid:)
-      GUID_TYPES.each do |k, regex|
+      FspHarvester::GUID_TYPES.each do |k, regex|
         if k == 'inchi' and regex.match(guid)
           return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
         elsif k == 'handle1' and regex.match(guid)
@@ -39,13 +39,13 @@ module HarvesterTools
     end
     def self.typeit(guid:)
-      GUID_TYPES.each do |type, regex|
+      FspHarvester::GUID_TYPES.each do |type, regex|
         return type if regex.match(guid)
       end
       false
     end
-    def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
+    def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: FspHarvester::ACCEPT_STAR_HEADER)
       @meta = metadata
       @meta.guidtype = 'uri' if @meta.guidtype.nil?
       warn "\n\n FETCHING #{url} #{header}\n\n"
@@ -59,7 +59,7 @@ module HarvesterTools
       end
       @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}.  Using the output from this URL for the next few tests..."
-      @meta.full_response << response.body
+      @meta.full_response << [url, response.body]
       links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
       links

data/lib/metadata_harvester.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module HarvesterTools
       hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
       describedby.each do |link|
-        accepttype = ACCEPT_STAR_HEADER
+        accepttype = FspHarvester::ACCEPT_STAR_HEADER
         accept = link.respond_to?('type') ? link.type : nil
         accepttype = { 'Accept' => accept } if accept
@@ -38,9 +38,14 @@ module HarvesterTools
       abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
       unless abbreviation
         @meta.add_warning(['017', response.request.url, ''])
-        @meta.comments << "WARN: metadata format returned from #{response.request.url} is not recognized. Moving on.\n"
+        @meta.comments << "WARN: format returned from #{response.request.url} is not recognized. Moving on.\n"
         return
       end
+      request_content_types = response.request.headers["Accept"].split(/,\s*/)
+      unless (request_content_types.include? content_type) and !(request_content_types.include? "*/*") and (response.code != 406)
+        @meta.add_warning(['023', response.request.url, ''])
+        @meta.comments << "WARN: format returned from #{response.request.url} does not match request type.  This should result in a 406 error, but instead was accepted as a 200.\n"
+      end
       process_according_to_type(body: response.body, uri: response.request.url, metadata: @meta,
                                 abbreviation: abbreviation, content_type: content_type)
     end
@@ -65,7 +70,7 @@ module HarvesterTools
       end
     end
-    def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
+    def self.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER)
       @meta.comments << "INFO:  link #{link.href} being processed"
       if link.respond_to? 'type'
         header = { 'Accept' => link.type }
@@ -86,23 +91,37 @@ module HarvesterTools
       abbreviation = nil
       content_type = nil
       @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
+      claimed_type = headers[:content_type]
+      claimed_type.gsub!(/\s*;.*/, '')
       if body =~ /^\s*<\?xml/
-        if body =~ /<HTML/i
+        if body[0..1000] =~ /<HTML/i  # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
           abbreviation = 'html'
-          content_type = 'text/html'
+          content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
+          @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
+          content_type |= 'text/html'
           @meta.comments << 'INFO: appears to be HTML\n'
         elsif body =~ /<rdf:RDF/i
           abbreviation = 'rdfxml'
-          content_type = 'application/rdf+xml'
+          content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
+          @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
+          content_type |= 'application/rdf+xml'
           @meta.comments << 'INFO: appears to be RDF-XML\n'
         else
           abbreviation = 'xml'
-          content_type = 'application/xml'
+          content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
+          @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
+          content_type |= 'application/xml'
           @meta.comments << 'INFO: appears to be XML\n'
         end
+      elsif body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
+        abbreviation = 'html'
+        content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
+        @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
+        content_type ||= 'text/html'
+        @meta.comments << 'INFO: appears to be HTML\n'
       else
-        abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
-        abbreviation, content_type = check_json(body: body) unless abbreviation
+        abbreviation, content_type = check_ld(body: body, claimed_type: claimed_type)
+        abbreviation, content_type = check_json(body: body) unless abbreviation  # don't test if LD already found!
       end
       unless content_type
@@ -112,18 +131,46 @@ module HarvesterTools
       [abbreviation, content_type]
     end
+    def self.validate_claimed_type(abbreviation:, claimed_type:)
+        warn "\n\nclaimed type #{claimed_type}\nabbreviation #{abbreviation}\n\n"
+        claimed_type.gsub!(/\s*;.*/, '')
+        case abbreviation
+        when 'html'
+          return claimed_type if FspHarvester::HTML_FORMATS['html'].include? claimed_type
+        when 'xml'
+          return claimed_type if FspHarvester::XML_FORMATS['xml'].include? claimed_type
+        when 'json'
+          return claimed_type if FspHarvester::JSON_FORMATS['json'].include? claimed_type
+        when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
+          return claimed_type if FspHarvester::RDF_FORMATS.values.flatten.include? claimed_type
+        when 'specialist'
+          warn 'no specialized parsers so far'
+        end
+        return false
+    end
     def self.check_ld(body:, claimed_type:)
       detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
-      unless detected_type
+      unless detected_type  # see if distiller can detect a type
         detected_type = RDF::Format.for({ sample: body[0..5000] })
         @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
       end
+      # at this point, detected_type is something like RDF::Turtle::Format (or nil).  This will return a content-type
       contenttype = ''
       abbreviation = ''
       if detected_type
-        contenttype = detected_type.content_type.first # comes back as array
-        abbreviation = abbreviate_type(contenttype: contenttype)
-        @meta.comments << "INFO: using content-type #{contenttype}.\n"
+        detectedcontenttypes = detected_type.content_type # comes back as array of [application/x, application/y]
+        unless detectedcontenttypes.include? claimed_type
+          @meta.add_warning(['022', @meta.all_uris.last, "" ])
+          contenttype = detected_type.content_type.first  # just pick one arbitrarily, since it doesn't match the declared type anyway
+          abbreviation = abbreviate_type(contenttype: contenttype)
+          @meta.comments << "INFO: using content-type #{contenttype} even though there was a mismatch.\n"
+        else
+          contenttype = claimed_type  # the claimed type is one of the detected format's content types, so use it as-is
+          abbreviation = abbreviate_type(contenttype: contenttype)
+          @meta.comments << "INFO: using content-type #{contenttype}.\n"
+        end
       else
         @meta.comments << "INFO: metadata does not appear to be in a linked data format.  Trying other options.\n"
       end
@@ -161,13 +208,14 @@ module HarvesterTools
         abbreviation = 'json'
       else
         @meta.comments << "INFO: metadata does not appear to be in JSON format.  No options left.\n"
+        return [nil, nil]
       end
-      [abbreviation, 'application/ld+json']
+      [abbreviation, 'application/json']
     end
     def self.abbreviate_type(contenttype:)
       foundtype = nil
-      RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
+      FspHarvester::RDF_FORMATS.merge(FspHarvester::XML_FORMATS).merge(FspHarvester::HTML_FORMATS).merge(FspHarvester::JSON_FORMATS).each do |type, vals|
         warn "\n\ntype #{type}\nvals #{vals}\n\n"
         @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
         next unless vals.include? contenttype

data/lib/metadata_object.rb CHANGED Viewed

@@ -1,8 +1,8 @@
 module HarvesterTools
   class MetadataObject
-    attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response
+    attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date, :url_header_hash  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response
-    def initialize(id: "unidentified_metadata") # get a name from the "new" call, or set a default
+    def initialize(id: "urn:local:unidentified_metadata") # get a name from the "new" call, or set a default
       @id = id
       @hash = {}
       @graph = RDF::Graph.new
@@ -16,6 +16,7 @@ module HarvesterTools
       @score = 0
       @version = '0.0'
       @date = Time.now.strftime('%Y-%m-%dT%H:%M:%S.%L%z')
+      @url_header_hash = Hash.new(false) # the combination of URL and the accept headers, sha1 hashed, for quick lookup if it has already been processed
       w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
       #@warn = File.read("./lib/warnings.json")
       @warn = JSON.parse(w)
@@ -37,6 +38,7 @@ module HarvesterTools
     def add_warning(warning)
       id = warning[0]
+      return unless @warn[id]   # if there's a mismatch between code and the warnings in github
       url = warning[1]
       headers = warning[2]
       message = @warn[id]['message']

data/lib/metadata_parser.rb CHANGED Viewed

@@ -13,17 +13,16 @@ module HarvesterTools
       @meta = metadata_object
     end
-    def process_html(body:, uri:, metadata:)
-      @meta = metadata
+    def process_html(body:, uri:, metadata: @meta)
       tools = HarvesterTools::ExternalTools.new(metadata: @meta)
-      result = tools.process_with_distiller(body: body)
+      tools.process_with_distiller(body: body, metadata: @meta) # adds to @meta
-      jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
-      parse_rdf(body: jsonld, content_type: 'application/ld+json')
+      jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri, metadata: @meta)
+      parse_rdf(body: jsonld, content_type: 'application/ld+json', metadata: metadata)
       @meta.merge_hash(microdata)
       @meta.merge_hash(microformat)
       @meta.merge_hash(opengraph)
-      parse_rdf(body: rdfa, content_type: 'application/ld+json')
+      parse_rdf(body: rdfa, content_type: 'application/ld+json', metadata: @meta)
     end
     def process_xml(body:, metadata:)

data/lib/warnings.json CHANGED Viewed

@@ -116,6 +116,18 @@
                   {"Validator": "https://jsononline.net/json-validator"}],
       "severity": "WARN"
    },
+   "022": {
+      "message": "Mismatch between the Content-type header and the content of the returned document.",
+      "linkout": [],
+      "severity": "WARN"
+   },
+   "023": {
+      "message": "Returned content-type is not compatible with any requested content-type, yet a HTTP 406 error was not returned",
+      "linkout": [],
+      "severity": "WARN"
+   },
    "600": {
       "message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
       "linkout": [],

data/lib/web_utils.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module HarvesterTools
   class WebUtils
-    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
+    def self.fspfetch(url:, headers: ACCEPT_STAR_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
       warn 'In fetch routine now.  '
       begin

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fsp_harvester
 version: !ruby/object:Gem::Version
-  version: 0.1.20
+  version: 0.1.21
 platform: ruby
 authors:
 - Mark Wilkinson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-08-17 00:00:00.000000000 Z
+date: 2022-08-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json