RubyGems - fsp_harvester - Versions diffs - 0.1.10 → 0.1.13 - Mend

fsp_harvester 0.1.10 → 0.1.13

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/.rspec_status +53 -53
data/Gemfile.lock +38 -30
data/lib/config.conf +8 -0
data/lib/constants.rb +8 -5
data/lib/{fsp_metadata_external_tools.rb → external_tools.rb} +17 -15
data/lib/fsp_harvester/version.rb +1 -1
data/lib/fsp_harvester.rb +8 -106
data/lib/harvester.rb +28 -0
data/lib/harvester_utils.rb +78 -0
data/lib/{fsp_metadata_harvester.rb → metadata_harvester.rb} +51 -33
data/lib/metadata_object.rb +4 -3
data/lib/{fsp_metadata_parser.rb → metadata_parser.rb} +28 -13
data/lib/signposting_tests.rb +9 -6
data/lib/warnings.json +33 -24
data/lib/web_utils.rb +3 -3
metadata +10 -8
data/lib/swagger.rb +0 -224

data/lib/{fsp_metadata_harvester.rb → metadata_harvester.rb} RENAMED Viewed

@@ -1,17 +1,17 @@
 # frozen_string_literal: true
-module FspHarvester
+module HarvesterTools
   class Error < StandardError
   end
   class MetadataHarvester
-    def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
+    def self.extract_metadata_from_links(links: [], metadata: HarvesterTools::MetadataObject.new)
       @meta = metadata
       @meta.comments << 'INFO:  now collecting both linked data and hash-style data using the harvested links'
       describedby = links.select { |l| l if l.relation == 'describedby' }
-      hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
+      hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
       describedby.each do |link|
         accepttype = ACCEPT_STAR_HEADER
         accept = link.respond_to?('type') ? link.type : nil
@@ -26,23 +26,42 @@ module FspHarvester
           next
         end
-        # process according to detected type
-        case abbreviation
-        when 'html'
-          @meta.comments << 'INFO: Processing html'
-          hvst.process_html(body: response.body, uri: link)
-        when 'xml'
-          @meta.comments << 'INFO: Processing xml'
-          hvst.process_xml(body: response.body)
-        when 'json'
-          @meta.comments << 'INFO: Processing json'
-          hvst.process_json(body: response.body)
-        when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
-          @meta.comments << 'INFO: Processing linked data'
-          hvst.process_ld(body: response.body, content_type: content_type)
-        when 'specialist'
-          warn 'no specialized parsers so far'
-        end
+        process_according_to_type(body: response.body, uri: link, metadata: @meta, abbreviation: abbreviation,
+                                  content_type: content_type, harvester: hvst)
+      end
+    end
+    def self.extract_metadata_from_body(response:, metadata: HarvesterTools::MetadataObject.new)
+      @meta = metadata
+      @meta.comments << 'INFO:  now collecting both linked data and hash-style data using the harvested links'
+      abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
+      unless abbreviation
+        @meta.add_warning(['017', response.request.url, ''])
+        @meta.comments << "WARN: metadata format returned from #{response.request.url} is not recognized. Moving on.\n"
+        return
+      end
+      process_according_to_type(body: response.body, uri: response.request.url, metadata: @meta,
+                                abbreviation: abbreviation, content_type: content_type)
+    end
+    def self.process_according_to_type(body:, uri:, abbreviation:, content_type:, metadata:,
+                                   harvester: HarvesterTools::MetadataParser.new(metadata_object: @meta))
+      case abbreviation
+      when 'html'
+        @meta.comments << 'INFO: Processing html'
+        harvester.process_html(body: body, uri: uri, metadata: @meta)
+      when 'xml'
+        @meta.comments << 'INFO: Processing xml'
+        harvester.process_xml(body: body, metadata: @meta)
+      when 'json'
+        @meta.comments << 'INFO: Processing json'
+        harvester.process_json(body: body, metadata: @meta)
+      when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
+        @meta.comments << 'INFO: Processing linked data'
+        harvester.process_ld(body: body, content_type: content_type, metadata: @meta)
+      when 'specialist'
+        warn 'no specialized parsers so far'
       end
     end
@@ -54,7 +73,7 @@ module FspHarvester
         @meta.comments << "INFO:  link #{link.href} has no MIME type, defaulting to */*"
       end
       url = link.href
-      response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
+      response = HarvesterTools::WebUtils.fspfetch(url: url, method: :get, headers: header)
       unless response
         @meta.add_warning(['016', url, header])
         @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
@@ -111,24 +130,23 @@ module FspHarvester
       [abbreviation, contenttype]
     end
-    def self.ntriples_hack(body:)  # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
+    def self.ntriples_hack(body:) # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
       detected_type = nil
       body.split.each do |line|
         line.strip!
         next if line.empty?
-        if line =~ %r{\s*<[^>]+>\s*<[^>]+>\s\S+}
-          @meta.comments << "INFO: running ntriples hack on  #{line + " ."}\n"
-          detected_type = RDF::Format.for({ sample: "#{line} ." })  # adding a period allows detection of ntriples by distiller
-          break
-        end
-      end
-      @meta.comments << "INFO: ntriples hack found: #{detected_type.to_s}\n"
-      if detected_type != RDF::NTriples::Format   # only return the hacky case
-        return nil
+        next unless line =~ /\s*<[^>]+>\s*<[^>]+>\s\S+/
+        @meta.comments << "INFO: running ntriples hack on  #{line + ' .'}\n"
+        detected_type = RDF::Format.for({ sample: "#{line} ." }) # adding a period allows detection of ntriples by distiller
+        break
       end
-      return detected_type
-    end
+      @meta.comments << "INFO: ntriples hack found: #{detected_type}\n"
+      return nil if detected_type != RDF::NTriples::Format # only return the hacky case
+      detected_type
+    end
     def self.check_json(body:)
       abbreviation = nil

data/lib/metadata_object.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-module FspHarvester
+module HarvesterTools
   class MetadataObject
     attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris  # a hash of metadata # a RDF.rb graph of metadata  # an array of comments  # the type of GUID that was detected # will be an array of Net::HTTP::Response
@@ -10,8 +10,9 @@ module FspHarvester
       @full_response = []
       @links = []
       @all_uris = []
-      @warn = File.read("./lib/warnings.json")
-      @warn = JSON.parse(@warn)
+      w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
+      #@warn = File.read("./lib/warnings.json")
+      @warn = JSON.parse(w)
     end
     def merge_hash(hash)

data/lib/{fsp_metadata_parser.rb → metadata_parser.rb} RENAMED Viewed

@@ -1,6 +1,6 @@
 # frozen_string_literal: true
-module FspHarvester
+module HarvesterTools
   class Error < StandardError
   end
@@ -9,17 +9,25 @@ module FspHarvester
     @@distillerknown = {}
-    def initialize(metadata_object: FspHarvester::MetadataObject.new)
+    def initialize(metadata_object: HarvesterTools::MetadataObject.new)
       @meta = metadata_object
     end
-    def process_html(body:, uri:)
-      tools = FspHarvester::ExternalTools.new(metadata: @meta)
-      tools.process_with_distiller(body: body)
-      tools.process_with_extruct(uri: uri)
+    def process_html(body:, uri:, metadata:)
+      @meta = metadata
+      tools = HarvesterTools::ExternalTools.new(metadata: @meta)
+      result = tools.process_with_distiller(body: body)
+      jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
+      parse_rdf(body: jsonld, content_type: 'application/ld+json')
+      @meta.merge_hash(microdata)
+      @meta.merge_hash(microformat)
+      @meta.merge_hash(opengraph)
+      parse_rdf(body: rdfa, content_type: 'application/ld+json')
     end
-    def process_xml(body:)
+    def process_xml(body:, metadata:)
+      @meta = metadata
       begin
         hash = XmlSimple.xml_in(body)
       rescue
@@ -30,7 +38,8 @@ module FspHarvester
       @meta.hash.merge hash
     end
-    def process_json(body:)
+    def process_json(body:, metadata:)
+      @meta = metadata
       begin
         hash = JSON.parse(body)
       rescue
@@ -41,11 +50,17 @@ module FspHarvester
       @meta.hash.merge hash
     end
-    def process_ld(body:, content_type:)
-      parse_rdf(body: body, content_type: content_type)
+    def process_ld(body:, content_type:, metadata:)
+      @meta = metadata
+      parse_rdf(body: body, content_type: content_type, metadata: @meta)
+    end
+    def parse_rdf(body:, content_type:, metadata:)
+      self.class.parse_rdf(body: body, content_type: content_type, metadata: metadata)
     end
-    def parse_rdf(body:, content_type:)
+    def self.parse_rdf(body:, content_type:, metadata:)
+      @meta = metadata
       unless body
         @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
         @meta.add_warning(['018', '', ''])
@@ -65,7 +80,7 @@ module FspHarvester
         return
       end
-      graph = FspHarvester::Cache.checkRDFCache(body: body)
+      graph = HarvesterTools::Cache.checkRDFCache(body: body)
       if graph.size > 0
         warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
         @meta.merge_rdf(graph.to_a)
@@ -88,7 +103,7 @@ module FspHarvester
           end
           reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
           warn 'WRITING TO CACHE'
-          FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
+          HarvesterTools::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
           warn 'WRITING DONE'
           reader = rdfformat.reader.new(body)  # frustrating that we cannot rewind!
           warn 'RE-READING DONE'

data/lib/signposting_tests.rb CHANGED Viewed

@@ -1,4 +1,5 @@
-def check_for_citeas_conflicts(citeas: )
+def check_for_citeas_conflicts(citeas:, metadata: )
+  @meta = metadata
   @meta.comments << 'INFO: checking for conflicting cite-as links'
   citeas_hrefs = Hash.new
   citeas.each do |link|
@@ -6,7 +7,7 @@ def check_for_citeas_conflicts(citeas: )
     @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
     citeas_hrefs[link.href] = link
   end
+#warn "finalhash #{citeas_hrefs}"
   if citeas_hrefs.length > 1
     @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
     @meta.add_warning(['007', '', ''])
@@ -16,7 +17,8 @@ def check_for_citeas_conflicts(citeas: )
 end
-def check_describedby_rules(describedby:)
+def check_describedby_rules(describedby:, metadata:)
+  @meta = metadata
   describedby.each do |l|
     unless l.respond_to? 'type'
       @meta.add_warning(['005', l.href, ''])
@@ -25,7 +27,7 @@ def check_describedby_rules(describedby:)
     type = l.type if l.respond_to? 'type'
     type ||= '*/*'
     header = { accept: type }
-    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
     if response
       responsetype = response.headers[:content_type]
       @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
@@ -51,7 +53,8 @@ def check_describedby_rules(describedby:)
   end
 end
-def check_item_rules(item:)
+def check_item_rules(item:, metadata:)
+  @meta = metadata
   item.each do |l| # l = LinkHeaders::Link
     unless l.respond_to? 'type'
       @meta.add_warning(['011', l.href, ''])
@@ -60,7 +63,7 @@ def check_item_rules(item:)
     type = l.type if l.respond_to? 'type'
     type ||= '*/*' # this becomes a frozen string
     header = { accept: type }
-    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
     if response
       if response.headers[:content_type] and type != '*/*'

data/lib/warnings.json CHANGED Viewed

@@ -1,110 +1,119 @@
 {
    "001": {
       "message": "Unable to resolve guid using default (*/*) Accept headers",
-      "linkout": "",
+      "linkout": [{"FAIR Principle": "https://www.go-fair.org/fair-principles/metadata-retrievable-identifier-standardised-communication-protocol/"},
+                  {"FAIRsharing": "https://doi.org/10.25504/FAIRsharing.cd2f9e"}
+               ],
       "severity": "WARN"
    },
    "002": {
       "message": "HTTP Response (203) is non-authoritative",
-      "linkout": "",
+      "linkout": [{"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/203"}],
       "severity": "WARN"
    },
    "003": {
       "message": "HTTP Response indicates failure (500-range)",
-      "linkout": "",
+      "linkout": [{"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500"}],
       "severity": "WARN"
    },
    "004": {
       "message": "The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
       "severity": "WARN"
    },
    "005": {
       "message": "The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute indicating the Accept headers that should be sent with the request",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
       "severity": "WARN"
    },
    "006": {
       "message": "GUID type not recognized",
-      "linkout": "",
+      "linkout": [{"FAIRsharing": "https://fairsharing.org/search?fairsharingRegistry=Standard&recordType=identifier_schema&page=1"}],
       "severity": "WARN"
    },
    "007": {
       "message": "Conflicting cite-as links",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
       "severity": "WARN"
    },
    "008": {
       "message": "describedby link does not resolve",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
       "severity": "WARN"
    },
    "009": {
       "message": "Content-type of described-by link does not match the type attribute in the link header itself",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"},
+                  {"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type"}],
       "severity": "WARN"
    },
    "010": {
       "message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
       "severity": "WARN"
    },
    "011": {
       "message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
       "severity": "WARN"
    },
    "012": {
       "message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"},
+                  {"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type"}],
       "severity": "WARN"
    },
    "013": {
       "message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
-      "linkout": "",
+      "linkout": [{"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type"}],
       "severity": "WARN"
    },
    "014": {
       "message": "Item link does not resolve",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
       "severity": "WARN"
    },
    "015": {
       "message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
-      "linkout": "",
+      "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
       "severity": "WARN"
    },
    "016": {
       "message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
-      "linkout": "",
+      "linkout": [{"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Content_negotiation"}],
       "severity": "WARN"
    },
    "017": {
       "message": "Metadata format not recognized.",
-      "linkout": "",
+      "linkout": [{"FAIRsharing": "https://fairsharing.org/search?subjects=Computer%2520Science,subject%2520agnostic&page=1&recordType=model_and_format"}],
       "severity": "WARN"
    },
    "018": {
       "message": "RDF parsing error - likely malformed RDF document.",
-      "linkout": "",
+      "linkout": [{"FAIRsharing": "https://fairsharing.org/FAIRsharing.p77ph9"},
+                  {"Documentation": "http://www.w3.org/TR/2014/REC-rdf11-concepts-20140225/"},
+                  {"Validator": "http://rdf.greggkellogg.net/distiller"}],
       "severity": "WARN"
    },
    "019": {
       "message": "HTML parsing error - unable to extract linked data from HTML.",
-      "linkout": "",
+      "linkout": [{"FAIRsharing": "https://fairsharing.org/FAIRsharing.YugnuL"},
+                  {"Documentation": "https://www.w3.org/TR/html53/"},
+                  {"validator": "https://validator.w3.org/"}],
       "severity": "WARN"
    },
    "020": {
       "message": "XML parsing error - unable to process XML document.",
-      "linkout": "",
+      "linkout": [{"FAIRsharing": "https://fairsharing.org/FAIRsharing.b5cc91"},
+                  {"Documentation": "https://www.w3.org/TR/xml/"},
+                  {"Validator": "https://www.xmlvalidation.com/"}],
       "severity": "WARN"
    },
    "021": {
       "message": "JSON parsing error - unable to process JSON document.",
-      "linkout": "",
+      "linkout": [{"FAIRsharing": "https://fairsharing.org/FAIRsharing.5bbab9"},
+                  {"Documentation": "http://dx.doi.org/10.17487/RFC8259"},
+                  {"Validator": "https://jsononline.net/json-validator"}],
       "severity": "WARN"
    }
 }

data/lib/web_utils.rb CHANGED Viewed

@@ -1,7 +1,7 @@
-module FspHarvester
+module HarvesterTools
   class WebUtils
-    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
+    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
       warn 'In fetch routine now.  '
       begin
@@ -23,7 +23,7 @@ module FspHarvester
         end
         response
       rescue RestClient::ExceptionWithResponse => e
-        warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
+        warn "EXCEPTION WITH RESPONSE! #{e.response.code} with response #{e.response}\nfailed response headers: #{e.response.headers}"
         meta.warnings << ["003", url, headers]
         meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
         if (e.response.code == 500 or e.response.code == 404)

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fsp_harvester
 version: !ruby/object:Gem::Version
-  version: 0.1.10
+  version: 0.1.13
 platform: ruby
 authors:
 - Mark Wilkinson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-08-08 00:00:00.000000000 Z
+date: 2022-08-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json
@@ -44,14 +44,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.16
+        version: 0.1.17
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.16
+        version: 0.1.17
 - !ruby/object:Gem::Dependency
   name: metainspector
   requirement: !ruby/object:Gem::Requirement
@@ -182,17 +182,19 @@ files:
 - bin/setup
 - example_test.rb
 - launch.json
+- lib/config.conf
 - lib/config.conf_docker
 - lib/config.conf_local
 - lib/constants.rb
+- lib/external_tools.rb
 - lib/fsp_harvester.rb
 - lib/fsp_harvester/version.rb
-- lib/fsp_metadata_external_tools.rb
-- lib/fsp_metadata_harvester.rb
-- lib/fsp_metadata_parser.rb
+- lib/harvester.rb
+- lib/harvester_utils.rb
+- lib/metadata_harvester.rb
 - lib/metadata_object.rb
+- lib/metadata_parser.rb
 - lib/signposting_tests.rb
-- lib/swagger.rb
 - lib/warnings.json
 - lib/web_utils.rb
 homepage: https://github.com/markwilkinson/FAIR-Signposting-Harvester