fsp_harvester 0.1.5 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 79507c31b14bab423d95a72fe441756551fa445caccea733ee75993fd7e0222c
4
- data.tar.gz: a18796aaff5e57940306fecd1d82df5c18d579ffe0d5fb1cd1948a9a29d1bb3b
3
+ metadata.gz: 923ccb362ef4a71fa2f221b1b224dbd5d3ec78a14cc9da0e12a1e0df804162ff
4
+ data.tar.gz: e70a9a994c504a0867ab10e05c07d440a76677d7ff27e27b9bdb0c3338c02ffe
5
5
  SHA512:
6
- metadata.gz: ed211e876c70b7c6bd3dad6dc9a7dada1e4e6d54f5c9a92286b24b9912b06b26f6b3c2fd3b22c8ac225ddb8ceaa3eb2b98d35a983f6be3fe78f4575450f8d857
7
- data.tar.gz: af6d5af7520061d418680b5b9a5f90e066b55be26322b7a4d9275bf74546eb56e80e538cd78443d45b28241246eb9329c84c08c53faee9db67e8b1c893507a54
6
+ metadata.gz: d819cb1c19d40f8a093b723cbeb4273d122d3207df2d874558327fa6a2622be1bf2c671cc9dfebee74c689722825c2dd1957f8be6bfcf2c14c09097c1dc05a5b
7
+ data.tar.gz: 508a484d93eab373d0389c11f4361068d5586759a73573901ea54f4d2a028f713f71bb492976785499f1074eb8b359d79be550c09b196eb82838a1e401398fc4
data/.rspec_status ADDED
@@ -0,0 +1,55 @@
1
+ example_id | status | run_time |
2
+ ---------------------------------- | ------ | --------------- |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.17 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 0.98776 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 0.69753 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 1.31 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 2.07 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 1.45 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 2.75 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 1.83 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 2.51 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 1.73 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 2.35 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.01 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 2.56 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 1.68 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.06 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.03 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 0.94321 seconds |
20
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.1 seconds |
21
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.45 seconds |
22
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 1.53 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 1.64 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.01 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.09 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.22 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.38248 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 2.24 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.08 seconds |
30
+ ./spec/describedby_spec.rb[1:1:3] | passed | 1 second |
31
+ ./spec/describedby_spec.rb[1:1:4] | passed | 1.14 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.03 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 0.81364 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 0.77543 seconds |
35
+ ./spec/describedby_spec.rb[1:1:8] | passed | 2.01 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 1.35 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 1.73 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.36 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.73 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.5 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 1.8 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 1.65 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00053 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | passed | 1.76 seconds |
45
+ ./spec/item_spec.rb[1:1:1] | passed | 2.08 seconds |
46
+ ./spec/item_spec.rb[1:1:2] | passed | 2.27 seconds |
47
+ ./spec/item_spec.rb[1:1:3] | passed | 1.22 seconds |
48
+ ./spec/item_spec.rb[1:1:4] | passed | 1.61 seconds |
49
+ ./spec/item_spec.rb[1:1:5] | passed | 1.74 seconds |
50
+ ./spec/item_spec.rb[1:1:6] | passed | 1.95 seconds |
51
+ ./spec/item_spec.rb[1:1:7] | passed | 3.59 seconds |
52
+ ./spec/item_spec.rb[1:1:8] | passed | 0.41001 seconds |
53
+ ./spec/type_spec.rb[1:1:1] | passed | 1.14 seconds |
54
+ ./spec/type_spec.rb[1:1:2] | passed | 0.94799 seconds |
55
+ ./spec/type_spec.rb[1:1:3] | passed | 1.04 seconds |
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.5)
4
+ fsp_harvester (0.1.9)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.11)
7
+ linkheaders-processor (~> 0.1.15)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -126,10 +126,11 @@ GEM
126
126
  shex (~> 0.7)
127
127
  sparql (~> 3.2)
128
128
  sparql-client (~> 3.2)
129
- linkheaders-processor (0.1.11)
129
+ linkheaders-processor (0.1.15)
130
130
  json (~> 2.0)
131
131
  json-ld (~> 3.2)
132
132
  json-ld-preloaded (~> 3.2)
133
+ link_header (~> 0.0.8)
133
134
  metainspector (~> 5.11.2)
134
135
  rest-client (~> 2.1)
135
136
  securerandom (~> 0.1.0)
@@ -248,7 +249,7 @@ GEM
248
249
  diff-lcs (>= 1.2.0, < 2.0)
249
250
  rspec-support (~> 3.11.0)
250
251
  rspec-support (3.11.0)
251
- rubocop (1.32.0)
252
+ rubocop (1.33.0)
252
253
  json (~> 2.3)
253
254
  parallel (~> 1.10)
254
255
  parser (>= 3.1.0.0)
data/Rakefile CHANGED
@@ -1,11 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "bundler/gem_tasks"
4
- require "rspec/core/rake_task"
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
5
 
6
6
  RSpec::Core::RakeTask.new(:spec)
7
7
 
8
- require "rubocop/rake_task"
8
+ require 'rubocop/rake_task'
9
9
 
10
10
  RuboCop::RakeTask.new
11
11
 
data/launch.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "name": "RSpec - all",
3
+ "type": "Ruby",
4
+ "request": "launch",
5
+ "cwd": "${workspaceRoot}",
6
+ "program": "/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rspec",
7
+ "args": [
8
+ "-I",
9
+ "${workspaceRoot}"
10
+ ]
11
+ }
@@ -0,0 +1,8 @@
1
+ [extruct]
2
+ command="extruct"
3
+
4
+ [rdf]
5
+ command="/usr/local/bundle/bin/rdf"
6
+
7
+ [tika]
8
+ command="http://tika:9998/meta"
@@ -0,0 +1,8 @@
1
+ [extruct]
2
+ command="extruct"
3
+
4
+ [rdf]
5
+ command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
6
+
7
+ [tika]
8
+ command="http://tika:9998/meta"
data/lib/constants.rb CHANGED
@@ -1,17 +1,20 @@
1
1
  ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
2
2
 
3
+ ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
4
+
3
5
  TEXT_FORMATS = {
4
6
  'text' => ['text/plain']
5
7
  }
6
8
 
7
9
  RDF_FORMATS = {
8
- 'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
10
+ 'jsonld' => ['application/ld+json','application/x-ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
9
11
  'turtle' => ['text/turtle', 'application/n3', 'application/rdf+n3',
10
12
  'application/turtle', 'application/x-turtle', 'text/n3', 'text/turtle',
11
13
  'text/rdf+n3', 'text/rdf+turtle'],
12
14
  # 'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
13
15
  'rdfxml' => ['application/rdf+xml'],
14
- 'triples' => ['application/n-triples', 'application/n-quads', 'application/trig']
16
+ 'ntriples' => ['application/n-triples', 'application/trig'],
17
+ 'nquads' => ['application/n-quads']
15
18
  }
16
19
 
17
20
  XML_FORMATS = {
@@ -73,12 +76,10 @@ GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
73
76
  'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
74
77
 
75
78
  CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
76
- if CONFIG['extruct'] && CONFIG['extruct']['command'] && !CONFIG['extruct']['command'].empty?
77
- extruct = config['extruct']['command']
78
- end
79
- extruct = 'extruct' unless @extruct_command
79
+ extruct = CONFIG.dig(:extruct, :command)
80
+ extruct ||= 'extruct'
80
81
  extruct.strip!
81
- case @extruct
82
+ case extruct
82
83
  when /[&|;`$\s]/
83
84
  abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
84
85
  when /echo/i
@@ -86,8 +87,8 @@ when /echo/i
86
87
  end
87
88
  EXTRUCT_COMMAND = extruct
88
89
 
89
- rdf_command = CONFIG['rdf']['command'] if CONFIG['rdf'] && CONFIG['rdf']['command'] && !CONFIG['rdf']['command'].empty?
90
- rdf_command = 'rdf' unless @rdf_command
90
+ rdf_command = CONFIG.dig(:rdf, :command)
91
+ rdf_command ||= 'rdf'
91
92
  rdf_command.strip
92
93
  case rdf_command
93
94
  when /[&|;`$\s]/
@@ -99,8 +100,6 @@ when !(/rdf$/ =~ $_)
99
100
  end
100
101
  RDF_COMMAND = rdf_command
101
102
 
102
- if CONFIG['tika'] && CONFIG['tika']['command'] && !CONFIG['tika']['command'].empty?
103
- tika_command = CONFIG['tika']['command']
104
- end
105
- tika_command = 'http://localhost:9998/meta' unless @tika_command
103
+ tika_command = CONFIG.dig(:tika, :command)
104
+ tika_command ||= 'http://localhost:9998/meta'
106
105
  TIKA_COMMAND = tika_command
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.5"
4
+ VERSION = "0.1.9"
5
5
  end
data/lib/fsp_harvester.rb CHANGED
@@ -1,24 +1,28 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "fsp_harvester/version"
4
- require "json/ld"
5
- require "json/ld/preloaded"
6
- require "json"
7
- require "linkheaders/processor"
8
- require "addressable"
9
- require "tempfile"
10
- require "xmlsimple"
11
- require "nokogiri"
12
- require "parseconfig"
13
- require "rest-client"
14
- require "cgi"
15
- require "digest"
16
- require "open3"
17
- require "metainspector"
18
- require "rdf/xsd"
19
- require_relative "./metadata_object"
20
- require_relative "./constants"
21
- require_relative "./web_utils"
3
+ require_relative 'fsp_harvester/version'
4
+ require 'json/ld'
5
+ require 'json/ld/preloaded'
6
+ require 'json'
7
+ require 'linkheaders/processor'
8
+ require 'addressable'
9
+ require 'tempfile'
10
+ require 'xmlsimple'
11
+ require 'nokogiri'
12
+ require 'parseconfig'
13
+ require 'rest-client'
14
+ require 'cgi'
15
+ require 'digest'
16
+ require 'open3'
17
+ require 'metainspector'
18
+ require 'rdf/xsd'
19
+ require_relative './metadata_object'
20
+ require_relative './constants'
21
+ require_relative './web_utils'
22
+ require_relative './signposting_tests'
23
+ require_relative './fsp_metadata_harvester'
24
+ require_relative './fsp_metadata_parser'
25
+
22
26
 
23
27
  module FspHarvester
24
28
  class Error < StandardError
@@ -27,33 +31,45 @@ module FspHarvester
27
31
  class Utils
28
32
  # @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
29
33
  # @warnings = JSON.parse(File.read("warnings.json"))
30
- @meta = FspHarvester::MetadataObject.new
34
+
31
35
 
32
36
  def self.resolve_guid(guid:)
33
- @meta.finalURI = [guid]
37
+ @meta = FspHarvester::MetadataObject.new
38
+ @meta.all_uris = [guid]
34
39
  type, url = convertToURL(guid: guid)
35
40
  links = Array.new
36
- unless type
37
- @meta.warnings << ["006", guid, ""]
38
- @meta.comments << "FATAL: GUID type not recognized.\n"
41
+ if type
42
+ links = resolve_url(url: url)
43
+ @meta.links << links
39
44
  else
40
- links, @meta = resolve_url(url: url)
45
+ @meta.warnings << ['006', guid, '']
46
+ @meta.comments << "FATAL: GUID type not recognized.\n"
41
47
  end
42
48
  [links, @meta]
43
49
  end
44
50
 
51
+ def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
52
+ @meta = metadata
53
+ db = []
54
+ links.each do |l|
55
+ db << l if l.relation == 'describedby'
56
+ end
57
+ FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
58
+ @meta
59
+ end
60
+
45
61
  def self.convertToURL(guid:)
46
62
  GUID_TYPES.each do |k, regex|
47
- if k == "inchi" and regex.match(guid)
48
- return "inchi", "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
49
- elsif k == "handle1" and regex.match(guid)
50
- return "handle", "http://hdl.handle.net/#{guid}"
51
- elsif k == "handle2" and regex.match(guid)
52
- return "handle", "http://hdl.handle.net/#{guid}"
53
- elsif k == "uri" and regex.match(guid)
54
- return "uri", guid
55
- elsif k == "doi" and regex.match(guid)
56
- return "doi", "https://doi.org/#{guid}"
63
+ if k == 'inchi' and regex.match(guid)
64
+ return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
65
+ elsif k == 'handle1' and regex.match(guid)
66
+ return 'handle', "http://hdl.handle.net/#{guid}"
67
+ elsif k == 'handle2' and regex.match(guid)
68
+ return 'handle', "http://hdl.handle.net/#{guid}"
69
+ elsif k == 'uri' and regex.match(guid)
70
+ return 'uri', guid
71
+ elsif k == 'doi' and regex.match(guid)
72
+ return 'doi', "https://doi.org/#{guid}"
57
73
  end
58
74
  end
59
75
  [nil, nil]
@@ -66,71 +82,75 @@ module FspHarvester
66
82
  false
67
83
  end
68
84
 
69
- def self.resolve_url(url:, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
70
- @meta.guidtype = "uri" if @meta.guidtype.nil?
85
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
86
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
71
87
  warn "\n\n FETCHING #{url} #{header}\n\n"
72
- response = FspHarvester::WebUtils.fspfetch(url: url, headers: header)
73
- warn "\n\n head #{response.headers.inspect}\n\n"
88
+ response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
89
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
74
90
 
75
91
  unless response
76
- @meta.warnings << ["001", url, header]
92
+ @meta.warnings << ['001', url, header]
77
93
  @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
78
- return [[], @meta]
94
+ return []
79
95
  end
80
96
 
81
- @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.finalURI.last}. Using the output from this URL for the next few tests..."
97
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
82
98
  @meta.full_response << response.body
83
99
 
84
100
  links = process_link_headers(response: response) unless nolinkheaders
85
- [links, @meta]
101
+ links
86
102
  end
87
103
 
88
104
  def self.process_link_headers(response:)
89
105
  warn "\n\n parsing #{response.headers}\n\n"
90
106
 
91
- parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
107
+ parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
92
108
  parser.extract_and_parse(response: response)
93
- factory = parser.factory # LinkHeaders::LinkFactory
109
+ factory = parser.factory # LinkHeaders::LinkFactory
110
+
111
+ warn "\n\n length bfore #{factory.all_links.length}\n\n"
112
+ signpostingcheck(factory: factory)
113
+ warn "\n\n length aftr #{factory.all_links.length}\n\n"
114
+ warn "\n\n links #{factory.all_links}\n\n"
115
+ factory.all_links
116
+ end
94
117
 
95
- citeas = 0
96
- describedby = 0
97
- warn "\n\n length #{factory.all_links.length}\n\n"
118
+ def self.signpostingcheck(factory:)
119
+ citeas = Array.new
120
+ describedby = Array.new
121
+ item = Array.new
122
+ types = Array.new
98
123
 
99
124
  factory.all_links.each do |l|
100
125
  case l.relation
101
- when "cite-as"
102
- citeas += 1
103
- when "describedby"
104
- describedby += 1
105
- unless l.respond_to? "type"
106
- @meta.warnings << ["005", l.href, ""]
107
- @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute\n"
108
- end
126
+ when 'cite-as'
127
+ citeas << l
128
+ when 'item'
129
+ item << l
130
+ when 'describedby'
131
+ describedby << l
132
+ when 'type'
133
+ types << l
109
134
  end
110
135
  end
111
- if citeas > 1
112
- self.check_for_conflicts(factory: factory) # this merelty adsds to the metadata objects if there are conflicts
136
+
137
+ check_describedby_rules(describedby: describedby)
138
+ check_item_rules(item: item)
139
+
140
+ if citeas.length > 1
141
+ warn "INFO: multiple cite-as links found. Checking for conflicts\n"
142
+ @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
143
+ citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
113
144
  end
114
145
 
115
- unless citeas == 1 && describedby > 0
116
- @meta.warnings << ["004", "", ""]
146
+ unless citeas.length == 1 && describedby.length > 0
147
+ @meta.warnings << ['004', '', '']
117
148
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
118
149
  end
119
- factory.all_links
120
- end
121
150
 
122
- def self.check_for_conflicts(factory:) # incoming: {"link1" => {"sectiontype1" => value, "sectiontype2" => value2}}
123
- @meta.comments << "INFO: checking for conflicting cite-as links"
124
- citeas = Array.new
125
- factory.all_links.each do |link|
126
- next unless link.relation == 'cite-as'
127
- citeas << link.href
128
- end
129
- unless citeas == citeas.uniq
130
- @meta.warnings << ["007", "", ""]
131
- @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers\n"
132
- else
133
- @meta.comments << "INFO: No conflicting cite-as links found."
151
+ unless types.length >=1
152
+ @meta.warnings << ['015', '', '']
153
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
134
154
  end
135
155
  end
136
156
  end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FspHarvester
4
+ class Error < StandardError
5
+ end
6
+
7
+ class ExternalTools
8
+
9
+ def initialize(metadata: FspHarvester::MetadataObject.new)
10
+ @meta = metadata
11
+ end
12
+
13
+ def process_with_distiller(body:)
14
+ bhash = Digest::SHA256.hexdigest(body)
15
+ if @@distillerknown[bhash]
16
+ @meta.comments << "INFO: data is already parsed by distiller.\n"
17
+ #parse_rdf(body: body)
18
+ else
19
+ @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
20
+ file = Tempfile.new('foo', encoding: 'UTF-8')
21
+ body = body.force_encoding('UTF-8')
22
+ body.scrub!
23
+ body = body.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"') # a bug in distiller, apparently
24
+ file.write(body)
25
+ file.rewind
26
+
27
+ @meta.comments << "INFO: The message body is being examined by Distiller\n"
28
+ # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
29
+ command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
30
+ # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
31
+ # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
32
+ warn "distiller command: #{command}"
33
+ result, _stderr, _status = Open3.capture3(command)
34
+ warn ''
35
+ warn "distiller errors: #{stderr}"
36
+ file.close
37
+ file.unlink
38
+
39
+ result = result.force_encoding('UTF-8')
40
+ warn "DIST RESULT: #{result}"
41
+ if result !~ /@context/i # failure returns nil
42
+ @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
43
+ @meta.warnings << ['018', '', '']
44
+ else
45
+ @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
46
+ parse_rdf(result: result, content_type: "application/ld+json")
47
+ end
48
+ @@distillerknown[bhash] = true
49
+ end
50
+ end
51
+
52
+ def processs_with_extruct(uri:)
53
+ @meta.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
54
+ warn 'begin open3'
55
+ stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
56
+ warn "open3 status: #{status} #{stdout}"
57
+ result = stderr # absurd that the output comes over stderr! LOL!
58
+
59
+ if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
60
+ @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
61
+ @meta.warnings << ['019', '', '']
62
+ if result.to_s.match(/(ValueError:.*?)\n/)
63
+ @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
64
+ @meta.warnings << ['019', '', '']
65
+ end
66
+ elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
67
+ json = JSON.parse result
68
+ @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
69
+
70
+ parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
71
+ @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
72
+ @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
73
+ @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
74
+ parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
75
+
76
+ @meta.merge_hash(json.first) if json.first.is_a? Hash
77
+ else
78
+ @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
79
+ end
80
+ end
81
+ end
82
+ end
@@ -0,0 +1,164 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FspHarvester
4
+ class Error < StandardError
5
+ end
6
+
7
+ class MetadataHarvester
8
+ def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
9
+ @meta = metadata
10
+ @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'
11
+
12
+ describedby = links.select { |l| l if l.relation == 'describedby' }
13
+
14
+ hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
15
+ describedby.each do |link|
16
+ accepttype = ACCEPT_STAR_HEADER
17
+ accept = link.respond_to?('type') ? link.type : nil
18
+ accepttype = { 'Accept' => accept } if accept
19
+
20
+ response = attempt_to_resolve(link: link, headers: accepttype)
21
+
22
+ abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
23
+ unless abbreviation
24
+ @meta.warnings << ['017', url, header]
25
+ @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
26
+ next
27
+ end
28
+
29
+ # process according to detected type
30
+ case abbreviation
31
+ when 'html'
32
+ @meta.comments << 'INFO: Processing html'
33
+ hvst.process_html(body: response.body, uri: link)
34
+ when 'xml'
35
+ @meta.comments << 'INFO: Processing xml'
36
+ hvst.process_xml(body: response.body)
37
+ when 'json'
38
+ @meta.comments << 'INFO: Processing json'
39
+ hvst.process_json(body: response.body)
40
+ when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
41
+ @meta.comments << 'INFO: Processing linked data'
42
+ hvst.process_ld(body: response.body, content_type: content_type)
43
+ when 'specialist'
44
+ warn 'no specialized parsers so far'
45
+ end
46
+ end
47
+ end
48
+
49
+ def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
50
+ @meta.comments << "INFO: link #{link.href} being processed"
51
+ if link.respond_to? 'type'
52
+ header = { 'Accept' => link.type }
53
+ else
54
+ @meta.comments << "INFO: link #{link.href} has no MIME type, defaulting to */*"
55
+ end
56
+ url = link.href
57
+ response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
58
+ unless response
59
+ @meta.warnings << ['016', url, header]
60
+ @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
61
+ end
62
+ response
63
+ end
64
+
65
+ def self.attempt_to_detect_type(body:, headers:)
66
+ # described by should be an html, xml, json, or linked data document
67
+ abbreviation = nil
68
+ content_type = nil
69
+ @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
70
+ if body =~ /^\s*<\?xml/
71
+ if body =~ /<HTML/i
72
+ abbreviation = 'html'
73
+ content_type = 'text/html'
74
+ @meta.comments << 'INFO: appears to be HTML\n'
75
+ elsif body =~ /<rdf:RDF/i
76
+ abbreviation = 'rdfxml'
77
+ content_type = 'application/rdf+xml'
78
+ @meta.comments << 'INFO: appears to be RDF-XML\n'
79
+ else
80
+ abbreviation = 'xml'
81
+ content_type = 'application/xml'
82
+ @meta.comments << 'INFO: appears to be XML\n'
83
+ end
84
+ else
85
+ abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
86
+ abbreviation, content_type = check_json(body: body) unless abbreviation
87
+ end
88
+
89
+ unless content_type
90
+ @meta.warnings << ['017', url, header]
91
+ @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
92
+ end
93
+ [abbreviation, content_type]
94
+ end
95
+
96
+ def self.check_ld(body:, claimed_type:)
97
+ detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
98
+ unless detected_type
99
+ detected_type = RDF::Format.for({ sample: body[0..5000] })
100
+ @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
101
+ end
102
+ contenttype = ''
103
+ abbreviation = ''
104
+ if detected_type
105
+ contenttype = detected_type.content_type.first # comes back as array
106
+ abbreviation = abbreviate_type(contenttype: contenttype)
107
+ @meta.comments << "INFO: using content-type #{contenttype}.\n"
108
+ else
109
+ @meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
110
+ end
111
+ [abbreviation, contenttype]
112
+ end
113
+
114
+ def self.ntriples_hack(body:) # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
115
+ detected_type = nil
116
+ body.split.each do |line|
117
+ line.strip!
118
+ next if line.empty?
119
+ if line =~ %r{\s*<[^>]+>\s*<[^>]+>\s\S+}
120
+ @meta.comments << "INFO: running ntriples hack on #{line + " ."}\n"
121
+ detected_type = RDF::Format.for({ sample: "#{line} ." }) # adding a period allows detection of ntriples by distiller
122
+ break
123
+ end
124
+ end
125
+ @meta.comments << "INFO: ntriples hack found: #{detected_type.to_s}\n"
126
+ if detected_type != RDF::NTriples::Format # only return the hacky case
127
+ return nil
128
+ end
129
+ return detected_type
130
+ end
131
+
132
+
133
+ def self.check_json(body:)
134
+ abbreviation = nil
135
+ parsed = nil
136
+ begin
137
+ parsed = JSON.parse(body)
138
+ rescue StandardError
139
+ abbreviation = nil
140
+ end
141
+
142
+ if parsed
143
+ abbreviation = 'json'
144
+ else
145
+ @meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
146
+ end
147
+ [abbreviation, 'application/ld+json']
148
+ end
149
+
150
+ def self.abbreviate_type(contenttype:)
151
+ foundtype = nil
152
+ RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
153
+ warn "\n\ntype #{type}\nvals #{vals}\n\n"
154
+ @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
155
+ next unless vals.include? contenttype
156
+
157
+ foundtype = type
158
+ @meta.comments << "INFO: detected a #{type} MIME type"
159
+ break
160
+ end
161
+ foundtype
162
+ end
163
+ end
164
+ end
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FspHarvester
4
+ class Error < StandardError
5
+ end
6
+
7
+ class MetadataParser
8
+ # attr_accessor :distillerknown
9
+
10
+ @@distillerknown = {}
11
+
12
+ def initialize(metadata_object: FspHarvester::MetadataObject.new)
13
+ @meta = metadata_object
14
+ end
15
+
16
+ def process_html(body:, uri:)
17
+ tools = FspHarvester::ExternalTools.new(metadata: @meta)
18
+ tools.process_with_distiller(body: body)
19
+ tools.process_with_extruct(uri: uri)
20
+ end
21
+
22
+ def process_xml(body:)
23
+ begin
24
+ hash = XmlSimple.xml_in(body)
25
+ rescue
26
+ @meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
27
+ @meta.warnings << ['020', '', '']
28
+ end
29
+ @meta.comments << "INFO: The XML is being merged in the metadata object\n"
30
+ @meta.hash.merge hash
31
+ end
32
+
33
+ def process_json(body:)
34
+ begin
35
+ hash = JSON.parse(body)
36
+ rescue
37
+ @meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
38
+ @meta.warnings << ['021', '', '']
39
+ end
40
+ @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
41
+ @meta.hash.merge hash
42
+ end
43
+
44
+ def process_ld(body:, content_type:)
45
+ parse_rdf(body: body, content_type: content_type)
46
+ end
47
+
48
+ def parse_rdf(body:, content_type:)
49
+ unless body
50
+ @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
51
+ @meta.warnings << ['018', '', '']
52
+ return
53
+ end
54
+
55
+ unless body.match(/\w/)
56
+ @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
57
+ @meta.warnings << ['018', '', '']
58
+ return
59
+ end
60
+
61
+ rdfformat = RDF::Format.for(content_type: content_type)
62
+ unless rdfformat
63
+ @meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
64
+ @meta.warnings << ['018', '', '']
65
+ return
66
+ end
67
+
68
+ graph = FspHarvester::Cache.checkRDFCache(body: body)
69
+ if graph.size > 0
70
+ warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
71
+ @meta.merge_rdf(graph.to_a)
72
+ else
73
+ warn "\n\n\nfound format #{rdfformat}\n\n"
74
+ @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
75
+ reader = ''
76
+ begin
77
+ reader = rdfformat.reader.new(body)
78
+ rescue Exception => e
79
+ @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
80
+ @meta.warnings << ['018', '', '']
81
+ return
82
+ end
83
+
84
+ begin
85
+ if reader.size == 0
86
+ @meta.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
87
+ return
88
+ end
89
+ reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
90
+ warn 'WRITING TO CACHE'
91
+ FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
92
+ warn 'WRITING DONE'
93
+ reader = rdfformat.reader.new(body) # frustrating that we cannot rewind!
94
+ warn 'RE-READING DONE'
95
+ @meta.merge_rdf(reader.to_a)
96
+ warn 'MERGE DONE'
97
+ rescue RDF::ReaderError => e
98
+ @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
99
+ warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
100
+ @meta.warnings << ['018', '', '']
101
+ rescue Exception => e
102
+ meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
103
+ warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body}). Moving on...\n"
104
+ @meta.warnings << ['018', '', '']
105
+ end
106
+ end
107
+ end
108
+ end
109
+ end
@@ -1,6 +1,6 @@
1
1
  module FspHarvester
2
2
  class MetadataObject
3
- attr_accessor :hash, :graph, :comments, :warnings, :guidtype, :full_response, :finalURI # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
3
+ attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
4
4
 
5
5
  def initialize(_params = {}) # get a name from the "new" call, or set a default
6
6
  @hash = {}
@@ -8,15 +8,16 @@ module FspHarvester
8
8
  @comments = []
9
9
  @warnings = []
10
10
  @full_response = []
11
- @finalURI = []
11
+ @links = []
12
+ @all_uris = []
12
13
  end
13
14
 
14
15
  def merge_hash(hash)
15
- # $stderr.puts "\n\n\nIncoming Hash #{hash.inspect}"
16
+ # warn "\n\n\nIncoming Hash #{hash.inspect}"
16
17
  self.hash = self.hash.merge(hash)
17
18
  end
18
19
 
19
- def merge_rdf(triples) # incoming list of triples
20
+ def merge_rdf(triples) # incoming list of triples
20
21
  graph << triples
21
22
  graph
22
23
  end
@@ -25,4 +26,95 @@ module FspHarvester
25
26
  graph
26
27
  end
27
28
  end
29
+
30
# File-backed cache for harvested metadata objects, parsed RDF graphs and raw
# HTTP responses.
#
# Every entry is a file inside CACHE_DIR whose name is an MD5 hex digest plus a
# suffix describing the payload:
#   <md5(uri)>_meta                      marshalled metadata object
#   <md5(body)>_graph / _graphbody       marshalled RDF graph / the raw body
#   <md5(uri + headers)>_head/_body/_uri marshalled HTTP response parts
#   <md5(uri + headers)>_error           marker for a request that failed
#
# SECURITY NOTE(review): entries are restored with Marshal.load from a
# directory that is normally world-writable. Marshal.load can instantiate
# arbitrary objects, so anyone able to write to CACHE_DIR can inject data into
# this process. Consider a private cache directory or a safer serialisation.
class Cache
  # Directory that holds every cache file.
  CACHE_DIR = '/tmp'.freeze

  # Build the absolute path of a cache entry from its file name.
  def self.cache_path(name)
    File.join(CACHE_DIR, name)
  end
  private_class_method :cache_path

  # Look up a previously cached metadata object.
  #
  # @param uri [String] the URI the metadata object was cached under
  # @return [Object, false] the unmarshalled object, or false on a cache miss
  def self.retrieveMetaObject(uri)
    filename = (Digest::MD5.hexdigest uri) + '_meta'
    warn "Checking Meta cache for #{filename}"
    path = cache_path(filename)
    if File.exist?(path)
      warn 'FOUND Meta object in cache'
      meta = Marshal.load(File.binread(path)) # binary read: Marshal data is not text
      warn 'Returning....'
      return meta
    end
    warn 'Meta objectNot Found in Cache'
    false
  end

  # Store a metadata object under the digest of +uri+.
  #
  # @param meta [Object] any Marshal-able object
  # @param uri [String] the key to cache it under
  def self.cacheMetaObject(meta, uri)
    filename = (Digest::MD5.hexdigest uri) + '_meta'
    warn "in cacheMetaObject Writing to cache for #{filename}"
    File.open(cache_path(filename), 'wb') { |f| f.write(Marshal.dump(meta)) }
  end

  # Return the cached graph for +body+, or an empty graph when there is none.
  #
  # The entry is located through the same MD5 key that writeRDFCache uses and
  # the stored body is compared byte-for-byte with +body+. Matching on file
  # size alone would hand back the graph of any unrelated document that happens
  # to have the same length.
  #
  # @param body [String] the raw RDF document
  # @return [RDF::Graph] the cached statements, or an empty graph
  def self.checkRDFCache(body: )
    g = RDF::Graph.new
    filename = cache_path(Digest::MD5.hexdigest(body))
    bodyfile = "#{filename}_graphbody"
    graphfile = "#{filename}_graph"
    return g unless File.exist?(bodyfile) && File.exist?(graphfile)
    # cheap size test first, then the authoritative content comparison
    return g unless File.size(bodyfile) == body.bytesize && File.binread(bodyfile) == body.b

    warn "RDF Cache File #{filename} FOUND"
    graph = Marshal.load(File.binread(graphfile)) # unmarshal it
    # copy into a fresh graph because the unmarshalled object isn't entirely
    # functional as an RDF::Graph object
    graph.each { |statement| g << statement }
    warn "returning a graph of #{g.size}"
    g
  end

  # Parse every statement out of +reader+ and cache the resulting graph,
  # together with the raw body, under the digest of +body+.
  #
  # @param reader [#each_statement] an RDF reader positioned at the start;
  #   it is consumed by this call
  # @param body [String] the raw document the reader was created from
  def self.writeRDFCache(reader:, body:)
    filename = Digest::MD5.hexdigest body
    graph = RDF::Graph.new
    reader.each_statement { |s| graph << s }
    warn "WRITING RDF TO CACHE #{filename}"
    File.open(cache_path("#{filename}_graph"), 'wb') { |f| f.write(Marshal.dump(graph)) }
    File.open(cache_path("#{filename}_graphbody"), 'wb') { |f| f.write(body) }
    warn "wrote RDF filename: #{filename}"
  end

  # Look up a cached HTTP exchange.
  #
  # @param uri [String] the requested URI
  # @param headers [#to_s] the request headers; part of the cache key
  # @return [Array, nil] ['ERROR', nil, nil] when a failure was cached,
  #   [head, body, all_uris] on a hit (all_uris is '' when no URI file
  #   exists), or nil on a miss
  def self.checkCache(uri, headers)
    filename = Digest::MD5.hexdigest uri + headers.to_s
    warn "Checking Error cache for #{filename}"
    if File.exist?(cache_path("#{filename}_error"))
      warn 'Error file found in cache... returning'
      return ['ERROR', nil, nil]
    end

    headfile = cache_path("#{filename}_head")
    bodyfile = cache_path("#{filename}_body")
    urifile = cache_path("#{filename}_uri")
    if File.exist?(headfile) && File.exist?(bodyfile)
      warn 'FOUND data in cache'
      head = Marshal.load(File.binread(headfile))
      body = Marshal.load(File.binread(bodyfile))
      all_uris = File.exist?(urifile) ? Marshal.load(File.binread(urifile)) : ''
      warn 'Returning....'
      return [head, body, all_uris]
    end
    warn 'Not Found in Cache'
    nil
  end

  # Cache the parts of a successful HTTP exchange.
  #
  # @param uri [String] the requested URI
  # @param headers [#to_s] the request headers; part of the cache key
  # @param head [Object] response headers (Marshal-able)
  # @param body [Object] response body (Marshal-able)
  # @param all_uris [Object] the URIs recorded for the request (Marshal-able)
  def self.writeToCache(uri, headers, head, body, all_uris)
    filename = Digest::MD5.hexdigest uri + headers.to_s
    warn "in writeToCache Writing to cache for #{filename}"
    { '_head' => head, '_body' => body, '_uri' => all_uris }.each do |suffix, payload|
      File.open(cache_path(filename + suffix), 'wb') { |f| f.write(Marshal.dump(payload)) }
    end
  end

  # Record that the request identified by +uri+ and +headers+ failed, so that
  # checkCache short-circuits the next attempt.
  def self.writeErrorToCache(uri, headers)
    filename = Digest::MD5.hexdigest uri + headers.to_s
    warn "in writeErrorToCache Writing error to cache for #{filename}"
    File.open(cache_path("#{filename}_error"), 'wb') { |f| f.write('ERROR') }
  end
end
28
120
  end
@@ -0,0 +1,87 @@
1
# Reduce the supplied cite-as links to one link per distinct href and record a
# warning when more than one distinct href is present.
#
# Each link is logged to stderr and to @meta.comments as it is queued. When
# several links share an href, the last one seen is the one kept.
#
# @param citeas [#each] links responding to #href
# @return [Array] one link per distinct href, in first-seen order
def check_for_citeas_conflicts(citeas: )
  @meta.comments << 'INFO: checking for conflicting cite-as links'

  links_by_href = {}
  citeas.each do |candidate|
    target = candidate.href
    announcement = "INFO: Adding citeas #{target} to the testing queue."
    warn announcement
    @meta.comments << announcement
    links_by_href[target] = candidate # a later duplicate replaces the earlier one
  end

  # more than one distinct target means the cite-as links contradict each other
  unless links_by_href.length <= 1
    @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
    @meta.warnings << ['007', '', '']
    @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
  end

  links_by_href.values # return list of unique links
end
17
+
18
+
19
# Check every "describedby" link against the FAIR Signposting rules and record
# the outcome in @meta.warnings / @meta.comments.
#
# For each link a HEAD request is issued with the link's declared type as the
# Accept header ('*/*' when the link declares none). Warning codes recorded:
#   005  the link has no 'type' attribute
#   008  the link does not resolve
#   009  the response Content-Type differs from the declared type
#   010  the two cannot be compared: the link declares no type, or the
#        response carries no Content-Type header
#
# @param describedby [#each] links responding to #href and, optionally, #type
# @return the +describedby+ collection, as returned by #each
def check_describedby_rules(describedby:)
  describedby.each do |l|
    unless l.respond_to? 'type'
      @meta.warnings << ['005', l.href, '']
      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
    end
    type = l.type if l.respond_to? 'type'
    type ||= '*/*'
    header = { accept: type }
    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)

    unless response
      @meta.warnings << ['008', l.href, header]
      @meta.comments << "WARN: describedby link doesn't resolve\n"
      next
    end

    responsetype = response.headers[:content_type]
    @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
    if responsetype =~ %r{^(.*/[^;]+)}
      responsetype = Regexp.last_match(1).to_s # remove the e.g. charset information
    end
    @meta.comments << "INFO: testing content type |#{responsetype}| against |#{type}|\n"

    if type == '*/*' || responsetype.to_s.empty?
      # nothing to compare: either side of the comparison is unspecified
      @meta.warnings << ['010', l.href, header]
      @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
    elsif responsetype == type
      @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
    else
      @meta.warnings << ['009', l.href, header]
      @meta.comments << "WARN: Content type of returned describedby link #{responsetype} does not match the 'type' attribute #{type}\n"
    end
  end
end
53
+
54
# Check every "item" link against the FAIR Signposting rules and record the
# outcome in @meta.warnings / @meta.comments.
#
# For each link a HEAD request is issued with the link's declared type as the
# Accept header ('*/*' when the link declares none). Warning codes recorded:
#   011  the link has no 'type' attribute
#   012  the response Content-Type does not contain the declared type
#   013  the two cannot be compared: the link declares no type, or the
#        response carries no Content-Type header
#   014  the link does not resolve
#
# @param item [#each] links responding to #href and, optionally, #type
# @return the +item+ collection, as returned by #each
def check_item_rules(item:)
  item.each do |l| # l = LinkHeaders::Link
    unless l.respond_to? 'type'
      @meta.warnings << ['011', l.href, '']
      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
    end
    type = l.type if l.respond_to? 'type'
    type ||= '*/*'
    header = { accept: type }
    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)

    unless response
      @meta.warnings << ['014', l.href, header]
      @meta.comments << "WARN: item link doesn't resolve\n"
      next
    end

    content_type = response.headers[:content_type]
    if content_type and type != '*/*'
      # Match the declared type literally. Without escaping, regex
      # metacharacters in a MIME type are interpreted as operators: the '+' of
      # "application/ld+json" becomes a quantifier and the type can never match
      # its own Content-Type header.
      typeregex = Regexp.new(Regexp.escape(type))
      if content_type.match(typeregex)
        warn content_type
        warn typeregex.inspect
        @meta.comments << "INFO: item link responds according to Signposting specifications\n"
      else
        @meta.warnings << ['012', l.href, header]
        @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
      end
    else
      @meta.warnings << ['013', l.href, header]
      @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
    end
  end
end
data/lib/warnings.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "001": {
3
- "message": "Unable to resolve guid using Accept headers for Linked Data",
3
+ "message": "Unable to resolve guid using default (*/*) Accept headers",
4
4
  "linkout": "",
5
5
  "severity": "WARN"
6
6
  },
@@ -28,6 +28,83 @@
28
28
  "message": "GUID type not recognized",
29
29
  "linkout": "",
30
30
  "severity": "WARN"
31
- }
31
+ },
32
+ "007": {
33
+ "message": "Conflicting cite-as links",
34
+ "linkout": "",
35
+ "severity": "WARN"
36
+ },
37
+ "008": {
38
+ "message": "describedby link does not resolve",
39
+ "linkout": "",
40
+ "severity": "WARN"
41
+ },
42
+ "009": {
43
+ "message": "Content-type of described-by link does not match the type attribute in the link header itself",
44
+ "linkout": "",
45
+ "severity": "WARN"
46
+ },
47
+ "010": {
48
+ "message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
49
+ "linkout": "",
50
+ "severity": "WARN"
51
+ },
52
+ "011": {
53
+ "message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
54
+ "linkout": "",
55
+ "severity": "WARN"
56
+ },
57
+ "012": {
58
+ "message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
59
+ "linkout": "",
60
+ "severity": "WARN"
61
+ },
62
+ "013": {
63
+ "message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
64
+ "linkout": "",
65
+ "severity": "WARN"
66
+ },
67
+ "014": {
68
+ "message": "Item link does not resolve",
69
+ "linkout": "",
70
+ "severity": "WARN"
71
+ },
72
+ "015": {
73
+ "message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
74
+ "linkout": "",
75
+ "severity": "WARN"
76
+ },
77
+ "016": {
78
+ "message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
79
+ "linkout": "",
80
+ "severity": "WARN"
81
+ },
82
+ "017": {
83
+ "message": "Metadata format not recognized.",
84
+ "linkout": "",
85
+ "severity": "WARN"
86
+ },
87
+ "018": {
88
+ "message": "RDF parsing error - likely malformed RDF document.",
89
+ "linkout": "",
90
+ "severity": "WARN"
91
+ },
92
+ "019": {
93
+ "message": "HTML parsing error - unable to extract linked data from HTML.",
94
+ "linkout": "",
95
+ "severity": "WARN"
96
+ },
97
+ "020": {
98
+ "message": "XML parsing error - unable to process XML document.",
99
+ "linkout": "",
100
+ "severity": "WARN"
101
+ },
102
+ "021": {
103
+ "message": "JSON parsing error - unable to process JSON document.",
104
+ "linkout": "",
105
+ "severity": "WARN"
106
+ }
107
+
108
+
32
109
 
33
110
  }
data/lib/web_utils.rb CHANGED
@@ -1,32 +1,32 @@
1
1
  module FspHarvester
2
2
 
3
3
  class WebUtils
4
- def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER) # we will try to retrieve turtle whenever possible
4
+ def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
5
5
  warn 'In fetch routine now. '
6
6
 
7
7
  begin
8
8
  warn "executing call over the Web to #{url}"
9
9
  response = RestClient::Request.execute({
10
- method: :get,
10
+ method: method,
11
11
  url: url.to_s,
12
12
  # user: user,
13
13
  # password: pass,
14
14
  headers: headers
15
15
  })
16
- @meta.finalURI |= [response.request.url] if @meta # it's possible to call this method without affecting the metadata object being created by the harvester
17
- warn "There was a response to the call #{url}"
18
- warn "There was a response to the call #{response.request.url}"
16
+ meta.all_uris |= [response.request.url] # it's possible to call this method without affecting the metadata object being created by the harvester
17
+ warn "starting URL #{url}"
18
+ warn "final URL #{response.request.url}"
19
19
  warn "Response code #{response.code}"
20
- if response.code == 203 && @meta
21
- @meta.warnings << ["002", url, headers]
22
- @meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
20
+ if response.code == 203
21
+ meta.warnings << ["002", url, headers]
22
+ meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
23
23
  end
24
24
  response
25
25
  rescue RestClient::ExceptionWithResponse => e
26
26
  warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
27
- @meta.warnings << ["003", url, headers] if @meta
28
- @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
29
- if e.response.code == 500
27
+ meta.warnings << ["003", url, headers]
28
+ meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
29
+ if (e.response.code == 500 or e.response.code == 404)
30
30
  return false
31
31
  else
32
32
  e.response
@@ -34,14 +34,14 @@ module FspHarvester
34
34
  # now we are returning the headers and body that were returned
35
35
  rescue RestClient::Exception => e
36
36
  warn "EXCEPTION WITH NO RESPONSE! #{e}"
37
- @meta.warnings << ["003", url, headers] if @meta
38
- @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
37
+ meta.warnings << ["003", url, headers]
38
+ meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
39
39
  false
40
40
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
41
41
  rescue Exception => e
42
42
  warn "EXCEPTION UNKNOWN! #{e}"
43
- @meta.warnings << ["003", url, headers] if @meta
44
- @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
43
+ meta.warnings << ["003", url, headers]
44
+ meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
45
45
  false
46
46
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
47
47
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-07-27 00:00:00.000000000 Z
11
+ date: 2022-08-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.1.11
47
+ version: 0.1.16
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.1.11
54
+ version: 0.1.16
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: metainspector
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -171,6 +171,7 @@ executables: []
171
171
  extensions: []
172
172
  extra_rdoc_files: []
173
173
  files:
174
+ - ".rspec_status"
174
175
  - CHANGELOG.md
175
176
  - Gemfile
176
177
  - Gemfile.lock
@@ -180,10 +181,17 @@ files:
180
181
  - bin/console
181
182
  - bin/setup
182
183
  - example_test.rb
184
+ - launch.json
185
+ - lib/config.conf_docker
186
+ - lib/config.conf_local
183
187
  - lib/constants.rb
184
188
  - lib/fsp_harvester.rb
185
189
  - lib/fsp_harvester/version.rb
190
+ - lib/fsp_metadata_external_tools.rb
191
+ - lib/fsp_metadata_harvester.rb
192
+ - lib/fsp_metadata_parser.rb
186
193
  - lib/metadata_object.rb
194
+ - lib/signposting_tests.rb
187
195
  - lib/swagger.rb
188
196
  - lib/warnings.json
189
197
  - lib/web_utils.rb