fsp_harvester 0.1.7 → 0.1.11

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7da32f3321193e93f64154c35db0c840f5e7f086451660f99b62d3f4c834e295
- data.tar.gz: 5a0a1ff4ef6b2100accd8bab4f20d6842d25ebf88d5e600e646804da9ed24bd9
+ metadata.gz: 895567e9edd571dbca7dee89a0270d1c14342fed06c3eb81c81e06f3c07ddbed
+ data.tar.gz: 7eee65295c206d6cee7b4ef28830f64087ba172a294cde7401490bffa20dbe1a
  SHA512:
- metadata.gz: dad90f81b73489a151220c132d74508832573a352b209a537bbbaf6543a90b9e132cca80ecca33d15d02eca91841642d73e795113076f4564730194b5bf1fa53
- data.tar.gz: 5d11c2f002f4e73a4971aec0d047cd14d35f6763155d0effa9c79f419ff6fd26f853158b290d2264edd148633570f5ecd50d4e1f6c7e9d136c7d2880517bdefc
+ metadata.gz: f0c7727598525cb55b6c2bfaf36d5ce3dda5da6efddf85888328b7c93b874c508989122627e5deaa5101fc0a20279432aa023ecefef112926219f267e3622234
+ data.tar.gz: 29f834c57ec73e27f988948893dc92fe56550b829585df390a9a1398770845115202289f6f9557c01eb2fc3eec218f863371db60649f6a3fef01da9457c2862e
data/.rspec_status ADDED
@@ -0,0 +1,55 @@
+ example_id | status | run_time |
+ ---------------------------------- | ------ | --------------- |
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.61 seconds |
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 1.18 seconds |
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 1.02 seconds |
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 1.6 seconds |
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 2.78 seconds |
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 2.09 seconds |
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 2.98 seconds |
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 2.2 seconds |
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 2.87 seconds |
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 2.18 seconds |
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.36 seconds |
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 2.89 seconds |
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 2.13 seconds |
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.18 seconds |
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.3 seconds |
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 1.17 seconds |
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.2 seconds |
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.71 seconds |
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 1.69 seconds |
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 2.22 seconds |
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.09 seconds |
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.17 seconds |
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.48048 seconds |
+ ./spec/describedby_spec.rb[1:1:1] | passed | 2.12 seconds |
+ ./spec/describedby_spec.rb[1:1:2] | passed | 0.96254 seconds |
+ ./spec/describedby_spec.rb[1:1:3] | passed | 0.92669 seconds |
+ ./spec/describedby_spec.rb[1:1:4] | passed | 0.92801 seconds |
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1 second |
+ ./spec/describedby_spec.rb[1:1:6] | passed | 0.66763 seconds |
+ ./spec/describedby_spec.rb[1:1:7] | passed | 0.66021 seconds |
+ ./spec/describedby_spec.rb[1:1:8] | passed | 1.89 seconds |
+ ./spec/describedby_spec.rb[1:1:9] | passed | 1.3 seconds |
+ ./spec/describedby_spec.rb[1:1:10] | passed | 1.7 seconds |
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.28 seconds |
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.27 seconds |
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.39 seconds |
+ ./spec/describedby_spec.rb[1:1:14] | passed | 1.65 seconds |
+ ./spec/describedby_spec.rb[1:1:15] | passed | 1.7 seconds |
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00215 seconds |
+ ./spec/fsp_harvester_spec.rb[1:2] | failed | 0.00021 seconds |
+ ./spec/item_spec.rb[1:1:1] | passed | 2.04 seconds |
+ ./spec/item_spec.rb[1:1:2] | passed | 2 seconds |
+ ./spec/item_spec.rb[1:1:3] | passed | 0.92924 seconds |
+ ./spec/item_spec.rb[1:1:4] | passed | 1.36 seconds |
+ ./spec/item_spec.rb[1:1:5] | passed | 1.71 seconds |
+ ./spec/item_spec.rb[1:1:6] | passed | 1.68 seconds |
+ ./spec/item_spec.rb[1:1:7] | passed | 2.37 seconds |
+ ./spec/item_spec.rb[1:1:8] | passed | 0.34241 seconds |
+ ./spec/type_spec.rb[1:1:1] | passed | 0.9855 seconds |
+ ./spec/type_spec.rb[1:1:2] | passed | 0.96202 seconds |
+ ./spec/type_spec.rb[1:1:3] | passed | 0.96005 seconds |
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
  PATH
  remote: .
  specs:
- fsp_harvester (0.1.7)
+ fsp_harvester (0.1.11)
  json (~> 2.0)
  linkeddata (~> 3.2)
- linkheaders-processor (~> 0.1.13)
+ linkheaders-processor (~> 0.1.16)
  metainspector (~> 5.11.2)
  parseconfig (~> 1.1)
  rake (~> 13.0)
@@ -36,7 +36,7 @@ GEM
  scanf (~> 1.0)
  sxp (~> 1.2)
  unicode-types (~> 1.7)
- faraday (1.10.0)
+ faraday (1.10.1)
  faraday-em_http (~> 1.0)
  faraday-em_synchrony (~> 1.0)
  faraday-excon (~> 1.1)
@@ -82,13 +82,13 @@ GEM
  concurrent-ruby (~> 1.0)
  json (2.6.2)
  json-canonicalization (0.3.0)
- json-ld (3.2.1)
+ json-ld (3.2.3)
  htmlentities (~> 4.3)
  json-canonicalization (~> 0.3)
  link_header (~> 0.0, >= 0.0.8)
  multi_json (~> 1.15)
  rack (~> 2.2)
- rdf (~> 3.2)
+ rdf (~> 3.2, >= 3.2.9)
  json-ld-preloaded (3.2.0)
  json-ld (~> 3.2)
  rdf (~> 3.2)
@@ -126,10 +126,11 @@ GEM
  shex (~> 0.7)
  sparql (~> 3.2)
  sparql-client (~> 3.2)
- linkheaders-processor (0.1.13)
+ linkheaders-processor (0.1.16)
  json (~> 2.0)
  json-ld (~> 3.2)
  json-ld-preloaded (~> 3.2)
+ link_header (~> 0.0.8)
  metainspector (~> 5.11.2)
  rest-client (~> 2.1)
  securerandom (~> 0.1.0)
@@ -165,7 +166,7 @@ GEM
  rack (2.2.4)
  rainbow (3.1.1)
  rake (13.0.6)
- rdf (3.2.8)
+ rdf (3.2.9)
  link_header (~> 0.0, >= 0.0.8)
  rdf-aggregate-repo (3.2.1)
  rdf (~> 3.2)
@@ -248,7 +249,7 @@ GEM
  diff-lcs (>= 1.2.0, < 2.0)
  rspec-support (~> 3.11.0)
  rspec-support (3.11.0)
- rubocop (1.32.0)
+ rubocop (1.33.0)
  json (~> 2.3)
  parallel (~> 1.10)
  parser (>= 3.1.0.0)
data/launch.json ADDED
@@ -0,0 +1,11 @@
+ {
+ "name": "RSpec - all",
+ "type": "Ruby",
+ "request": "launch",
+ "cwd": "${workspaceRoot}",
+ "program": "/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rspec",
+ "args": [
+ "-I",
+ "${workspaceRoot}"
+ ]
+ }
data/lib/config.conf_docker ADDED
@@ -0,0 +1,8 @@
+ [extruct]
+ command="extruct"
+
+ [rdf]
+ command="/usr/local/bundle/bin/rdf"
+
+ [tika]
+ command="http://tika:9998/meta"
data/lib/config.conf_local ADDED
@@ -0,0 +1,8 @@
+ [extruct]
+ command="extruct"
+
+ [rdf]
+ command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
+
+ [tika]
+ command="http://tika:9998/meta"
data/lib/constants.rb CHANGED
@@ -1,17 +1,20 @@
  ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }

+ ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
+
  TEXT_FORMATS = {
  'text' => ['text/plain']
  }

  RDF_FORMATS = {
- 'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
+ 'jsonld' => ['application/ld+json','application/x-ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
  'turtle' => ['text/turtle', 'application/n3', 'application/rdf+n3',
  'application/turtle', 'application/x-turtle', 'text/n3', 'text/turtle',
  'text/rdf+n3', 'text/rdf+turtle'],
  # 'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
  'rdfxml' => ['application/rdf+xml'],
- 'triples' => ['application/n-triples', 'application/n-quads', 'application/trig']
+ 'ntriples' => ['application/n-triples', 'application/trig'],
+ 'nquads' => ['application/n-quads']
  }

  XML_FORMATS = {
@@ -73,12 +76,10 @@ GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
  'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }

  CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
- if CONFIG['extruct'] && CONFIG['extruct']['command'] && !CONFIG['extruct']['command'].empty?
- extruct = config['extruct']['command']
- end
- extruct = 'extruct' unless @extruct_command
+ extruct = CONFIG.dig(:extruct, :command)
+ extruct ||= 'extruct'
  extruct.strip!
- case @extruct
+ case extruct
  when /[&|;`$\s]/
  abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
  when /echo/i
@@ -86,8 +87,8 @@ when /echo/i
  end
  EXTRUCT_COMMAND = extruct

- rdf_command = CONFIG['rdf']['command'] if CONFIG['rdf'] && CONFIG['rdf']['command'] && !CONFIG['rdf']['command'].empty?
- rdf_command = 'rdf' unless @rdf_command
+ rdf_command = CONFIG.dig(:rdf, :command)
+ rdf_command ||= 'rdf'
  rdf_command.strip
  case rdf_command
  when /[&|;`$\s]/
@@ -99,8 +100,6 @@ when !(/rdf$/ =~ $_)
  end
  RDF_COMMAND = rdf_command

- if CONFIG['tika'] && CONFIG['tika']['command'] && !CONFIG['tika']['command'].empty?
- tika_command = CONFIG['tika']['command']
- end
- tika_command = 'http://localhost:9998/meta' unless @tika_command
+ tika_command = CONFIG.dig(:tika, :command)
+ tika_command ||= 'http://localhost:9998/meta'
  TIKA_COMMAND = tika_command
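The refactor above replaces the nested presence checks with a dig-plus-default idiom. As a minimal sketch of the same pattern on a plain Hash (a stand-in for the parsed config; the real ParseConfig object may expose string rather than symbol keys), the lookup and fallback reduce to:

config = { extruct: { command: 'extruct' } }  # hypothetical stand-in for CONFIG
extruct = config.dig(:extruct, :command)      # nil when either key is absent
extruct ||= 'extruct'                         # fall back to the bare command name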
data/lib/fsp_harvester/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module FspHarvester
- VERSION = "0.1.7" # up to date
+ VERSION = "0.1.11"
  end
data/lib/fsp_harvester.rb CHANGED
@@ -20,6 +20,9 @@ require_relative './metadata_object'
  require_relative './constants'
  require_relative './web_utils'
  require_relative './signposting_tests'
+ require_relative './fsp_metadata_harvester'
+ require_relative './fsp_metadata_parser'
+

  module FspHarvester
  class Error < StandardError
@@ -32,18 +35,29 @@ module FspHarvester

  def self.resolve_guid(guid:)
  @meta = FspHarvester::MetadataObject.new
- @meta.finalURI = [guid]
+ @meta.all_uris = [guid]
  type, url = convertToURL(guid: guid)
  links = Array.new
  if type
  links = resolve_url(url: url)
+ @meta.links << links
  else
- @meta.warnings << ['006', guid, '']
+ @meta.add_warning(['006', guid, ''])
  @meta.comments << "FATAL: GUID type not recognized.\n"
  end
  [links, @meta]
  end

+ def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
+ @meta = metadata
+ db = []
+ links.each do |l|
+ db << l if l.relation == 'describedby'
+ end
+ FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
+ @meta
+ end
+
  def self.convertToURL(guid:)
  GUID_TYPES.each do |k, regex|
  if k == 'inchi' and regex.match(guid)
@@ -68,19 +82,19 @@ module FspHarvester
  false
  end

- def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
  @meta.guidtype = 'uri' if @meta.guidtype.nil?
  warn "\n\n FETCHING #{url} #{header}\n\n"
- response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
+ response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
  warn "\n\n head #{response.headers.inspect}\n\n" if response

  unless response
- @meta.warnings << ['001', url, header]
+ @meta.add_warning(['001', url, header])
  @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
  return []
  end

- @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.finalURI.last}. Using the output from this URL for the next few tests..."
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
  @meta.full_response << response.body

  links = process_link_headers(response: response) unless nolinkheaders
@@ -90,7 +104,7 @@ module FspHarvester
  def self.process_link_headers(response:)
  warn "\n\n parsing #{response.headers}\n\n"

- parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
+ parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
  parser.extract_and_parse(response: response)
  factory = parser.factory # LinkHeaders::LinkFactory

@@ -105,6 +119,8 @@ module FspHarvester
  citeas = Array.new
  describedby = Array.new
  item = Array.new
+ types = Array.new
+
  factory.all_links.each do |l|
  case l.relation
  when 'cite-as'
@@ -113,23 +129,29 @@ module FspHarvester
  item << l
  when 'describedby'
  describedby << l
+ when 'type'
+ types << l
  end
  end

  check_describedby_rules(describedby: describedby)
  check_item_rules(item: item)

- uniqueciteas = Array.new
  if citeas.length > 1
  warn "INFO: multiple cite-as links found. Checking for conflicts\n"
  @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
- uniqueciteas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
+ citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
  end

- unless uniqueciteas == 1 && describedby.length > 0
- @meta.warnings << ['004', '', '']
+ unless citeas.length == 1 && describedby.length > 0
+ @meta.add_warning(['004', '', ''])
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
  end
+
+ unless types.length >=1
+ @meta.add_warning(['015', '', ''])
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
+ end
  end
  end
  end
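As a quick orientation to how the new entry point above is meant to be chained after resolve_guid, here is a minimal usage sketch based only on the signatures shown in this diff; the DOI is a placeholder and no error handling is included:

require 'fsp_harvester'

# Resolve a GUID and collect its signposting link headers (placeholder DOI).
links, meta = FspHarvester.resolve_guid(guid: 'https://doi.org/10.1234/example')

# Follow only the 'describedby' links; everything accumulates on the same MetadataObject.
meta = FspHarvester.gather_metadata_from_describedby_links(links: links, metadata: meta)

puts meta.comments
meta.warnings.each { |w| warn w.inspect }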
data/lib/fsp_metadata_external_tools.rb ADDED
@@ -0,0 +1,82 @@
+ # frozen_string_literal: true
+
+ module FspHarvester
+ class Error < StandardError
+ end
+
+ class ExternalTools
+
+ def initialize(metadata: FspHarvester::MetadataObject.new)
+ @meta = metadata
+ end
+
+ def process_with_distiller(body:)
+ bhash = Digest::SHA256.hexdigest(body)
+ if @@distillerknown[bhash]
+ @meta.comments << "INFO: data is already parsed by distiller.\n"
+ #parse_rdf(body: body)
+ else
+ @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
+ file = Tempfile.new('foo', encoding: 'UTF-8')
+ body = body.force_encoding('UTF-8')
+ body.scrub!
+ body = body.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"') # a bug in distiller, apparently
+ file.write(body)
+ file.rewind
+
+ @meta.comments << "INFO: The message body is being examined by Distiller\n"
+ # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
+ command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
+ # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
+ # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
+ warn "distiller command: #{command}"
+ result, _stderr, _status = Open3.capture3(command)
+ warn ''
+ warn "distiller errors: #{stderr}"
+ file.close
+ file.unlink
+
+ result = result.force_encoding('UTF-8')
+ warn "DIST RESULT: #{result}"
+ if result !~ /@context/i # failure returns nil
+ @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
+ @meta.add_warning(['018', '', ''])
+ else
+ @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
+ parse_rdf(result: result, content_type: "application/ld+json")
+ end
+ @@distillerknown[bhash] = true
+ end
+ end
+
+ def processs_with_extruct(uri:)
+ @meta.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
+ warn 'begin open3'
+ stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
+ warn "open3 status: #{status} #{stdout}"
+ result = stderr # absurd that the output comes over stderr! LOL!
+
+ if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
+ @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
+ @meta.add_warning(['019', '', ''])
+ if result.to_s.match(/(ValueError:.*?)\n/)
+ @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
+ @meta.add_warning(['019', '', ''])
+ end
+ elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
+ json = JSON.parse result
+ @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
+
+ parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
+ @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
+ @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
+ @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
+ parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
+
+ @meta.merge_hash(json.first) if json.first.is_a? Hash
+ else
+ @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
+ end
+ end
+ end
data/lib/fsp_metadata_harvester.rb ADDED
@@ -0,0 +1,164 @@
+ # frozen_string_literal: true
+
+ module FspHarvester
+ class Error < StandardError
+ end
+
+ class MetadataHarvester
+ def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
+ @meta = metadata
+ @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'
+
+ describedby = links.select { |l| l if l.relation == 'describedby' }
+
+ hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
+ describedby.each do |link|
+ accepttype = ACCEPT_STAR_HEADER
+ accept = link.respond_to?('type') ? link.type : nil
+ accepttype = { 'Accept' => accept } if accept
+
+ response = attempt_to_resolve(link: link, headers: accepttype)
+
+ abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
+ unless abbreviation
+ @meta.add_warning(['017', url, header])
+ @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
+ next
+ end
+
+ # process according to detected type
+ case abbreviation
+ when 'html'
+ @meta.comments << 'INFO: Processing html'
+ hvst.process_html(body: response.body, uri: link)
+ when 'xml'
+ @meta.comments << 'INFO: Processing xml'
+ hvst.process_xml(body: response.body)
+ when 'json'
+ @meta.comments << 'INFO: Processing json'
+ hvst.process_json(body: response.body)
+ when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
+ @meta.comments << 'INFO: Processing linked data'
+ hvst.process_ld(body: response.body, content_type: content_type)
+ when 'specialist'
+ warn 'no specialized parsers so far'
+ end
+ end
+ end
+
+ def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
+ @meta.comments << "INFO: link #{link.href} being processed"
+ if link.respond_to? 'type'
+ header = { 'Accept' => link.type }
+ else
+ @meta.comments << "INFO: link #{link.href} has no MIME type, defaulting to */*"
+ end
+ url = link.href
+ response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
+ unless response
+ @meta.add_warning(['016', url, header])
+ @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
+ end
+ response
+ end
+
+ def self.attempt_to_detect_type(body:, headers:)
+ # described by should be an html, xml, json, or linked data document
+ abbreviation = nil
+ content_type = nil
+ @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
+ if body =~ /^\s*<\?xml/
+ if body =~ /<HTML/i
+ abbreviation = 'html'
+ content_type = 'text/html'
+ @meta.comments << 'INFO: appears to be HTML\n'
+ elsif body =~ /<rdf:RDF/i
+ abbreviation = 'rdfxml'
+ content_type = 'application/rdf+xml'
+ @meta.comments << 'INFO: appears to be RDF-XML\n'
+ else
+ abbreviation = 'xml'
+ content_type = 'application/xml'
+ @meta.comments << 'INFO: appears to be XML\n'
+ end
+ else
+ abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
+ abbreviation, content_type = check_json(body: body) unless abbreviation
+ end
+
+ unless content_type
+ @meta.add_warning(['017', url, header])
+ @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
+ end
+ [abbreviation, content_type]
+ end
+
+ def self.check_ld(body:, claimed_type:)
+ detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
+ unless detected_type
+ detected_type = RDF::Format.for({ sample: body[0..5000] })
+ @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
+ end
+ contenttype = ''
+ abbreviation = ''
+ if detected_type
+ contenttype = detected_type.content_type.first # comes back as array
+ abbreviation = abbreviate_type(contenttype: contenttype)
+ @meta.comments << "INFO: using content-type #{contenttype}.\n"
+ else
+ @meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
+ end
+ [abbreviation, contenttype]
+ end
+
+ def self.ntriples_hack(body:) # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
+ detected_type = nil
+ body.split.each do |line|
+ line.strip!
+ next if line.empty?
+ if line =~ %r{\s*<[^>]+>\s*<[^>]+>\s\S+}
+ @meta.comments << "INFO: running ntriples hack on #{line + " ."}\n"
+ detected_type = RDF::Format.for({ sample: "#{line} ." }) # adding a period allows detection of ntriples by distiller
+ break
+ end
+ end
+ @meta.comments << "INFO: ntriples hack found: #{detected_type.to_s}\n"
+ if detected_type != RDF::NTriples::Format # only return the hacky case
+ return nil
+ end
+ return detected_type
+ end
+
+
+ def self.check_json(body:)
+ abbreviation = nil
+ parsed = nil
+ begin
+ parsed = JSON.parse(body)
+ rescue StandardError
+ abbreviation = nil
+ end
+
+ if parsed
+ abbreviation = 'json'
+ else
+ @meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
+ end
+ [abbreviation, 'application/ld+json']
+ end
+
+ def self.abbreviate_type(contenttype:)
+ foundtype = nil
+ RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
+ warn "\n\ntype #{type}\nvals #{vals}\n\n"
+ @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
+ next unless vals.include? contenttype
+
+ foundtype = type
+ @meta.comments << "INFO: detected a #{type} MIME type"
+ break
+ end
+ foundtype
+ end
+ end
+ end
data/lib/fsp_metadata_parser.rb ADDED
@@ -0,0 +1,109 @@
+ # frozen_string_literal: true
+
+ module FspHarvester
+ class Error < StandardError
+ end
+
+ class MetadataParser
+ # attr_accessor :distillerknown
+
+ @@distillerknown = {}
+
+ def initialize(metadata_object: FspHarvester::MetadataObject.new)
+ @meta = metadata_object
+ end
+
+ def process_html(body:, uri:)
+ tools = FspHarvester::ExternalTools.new(metadata: @meta)
+ tools.process_with_distiller(body: body)
+ tools.process_with_extruct(uri: uri)
+ end
+
+ def process_xml(body:)
+ begin
+ hash = XmlSimple.xml_in(body)
+ rescue
+ @meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
+ @meta.add_warning(['020', '', ''])
+ end
+ @meta.comments << "INFO: The XML is being merged in the metadata object\n"
+ @meta.hash.merge hash
+ end
+
+ def process_json(body:)
+ begin
+ hash = JSON.parse(body)
+ rescue
+ @meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
+ @meta.add_warning(['021', '', ''])
+ end
+ @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
+ @meta.hash.merge hash
+ end
+
+ def process_ld(body:, content_type:)
+ parse_rdf(body: body, content_type: content_type)
+ end
+
+ def parse_rdf(body:, content_type:)
+ unless body
+ @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+ @meta.add_warning(['018', '', ''])
+ return
+ end
+
+ unless body.match(/\w/)
+ @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+ @meta.add_warning(['018', '', ''])
+ return
+ end
+
+ rdfformat = RDF::Format.for(content_type: content_type)
+ unless rdfformat
+ @meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
+ @meta.add_warning(['018', '', ''])
+ return
+ end
+
+ graph = FspHarvester::Cache.checkRDFCache(body: body)
+ if graph.size > 0
+ warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
+ @meta.merge_rdf(graph.to_a)
+ else
+ warn "\n\n\nfound format #{rdfformat}\n\n"
+ @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
+ reader = ''
+ begin
+ reader = rdfformat.reader.new(body)
+ rescue Exception => e
+ @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
+ @meta.add_warning(['018', '', ''])
+ return
+ end
+
+ begin
+ if reader.size == 0
+ @meta.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
+ return
+ end
+ reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
+ warn 'WRITING TO CACHE'
+ FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
+ warn 'WRITING DONE'
+ reader = rdfformat.reader.new(body) # frustrating that we cannot rewind!
+ warn 'RE-READING DONE'
+ @meta.merge_rdf(reader.to_a)
+ warn 'MERGE DONE'
+ rescue RDF::ReaderError => e
+ @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
+ warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
+ @meta.add_warning(['018', '', ''])
+ rescue Exception => e
+ meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
+ warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body}). Moving on...\n"
+ @meta.add_warning(['018', '', ''])
+ end
+ end
+ end
+ end
+ end
data/lib/metadata_object.rb CHANGED
@@ -1,6 +1,6 @@
  module FspHarvester
  class MetadataObject
- attr_accessor :hash, :graph, :comments, :warnings, :guidtype, :full_response, :finalURI # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
+ attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response

  def initialize(_params = {}) # get a name from the "new" call, or set a default
  @hash = {}
@@ -8,15 +8,19 @@ module FspHarvester
  @comments = []
  @warnings = []
  @full_response = []
- @finalURI = []
+ @links = []
+ @all_uris = []
+ w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
+ #@warn = File.read("./lib/warnings.json")
+ @warn = JSON.parse(w)
  end

  def merge_hash(hash)
- # $stderr.puts "\n\n\nIncoming Hash #{hash.inspect}"
+ # warn "\n\n\nIncoming Hash #{hash.inspect}"
  self.hash = self.hash.merge(hash)
  end

- def merge_rdf(triples) # incoming list of triples
+ def merge_rdf(triples) # incoming list of triples
  graph << triples
  graph
  end
@@ -24,5 +28,106 @@ module FspHarvester
  def rdf
  graph
  end
+
+ def add_warning(warning)
+ id = warning[0]
+ url = warning[1]
+ headers = warning[2]
+ message = @warn[id]['message']
+ linkout = @warn[id]['linkout']
+ severity = @warn[id]['severity']
+ self.warnings << {"id" => id, "message" => message, "severity" => severity, "linkout" => linkout, "processed_url" => url, "accept_headers": headers}
+ end
+ end
+
+ class Cache
+ def self.retrieveMetaObject(uri)
+ filename = (Digest::MD5.hexdigest uri) + '_meta'
+ warn "Checking Meta cache for #{filename}"
+ if File.exist?("/tmp/#{filename}")
+ warn 'FOUND Meta object in cache'
+ meta = Marshal.load(File.read("/tmp/#{filename}"))
+ warn 'Returning....'
+ return meta
+ end
+ warn 'Meta objectNot Found in Cache'
+ false
+ end
+
+ def self.cacheMetaObject(meta, uri)
+ filename = (Digest::MD5.hexdigest uri) + '_meta'
+ warn "in cacheMetaObject Writing to cache for #{filename}"
+ File.open("/tmp/#{filename}", 'wb') { |f| f.write(Marshal.dump(meta)) }
+ end
+
+ def self.checkRDFCache(body: )
+ fs = File.join('/tmp/', '*_graphbody')
+ bodies = Dir.glob(fs)
+ g = RDF::Graph.new
+ bodies.each do |bodyfile|
+ next unless File.size(bodyfile) == body.bytesize # compare body size
+ next unless bodyfile.match(/(.*)_graphbody$/) # continue if there's no match
+
+ filename = Regexp.last_match(1)
+ warn "Regexp match for #{filename} FOUND"
+ next unless File.exist?("#{filename}_graph") # @ get the associated graph file
+
+ warn "RDF Cache File #{filename} FOUND"
+ graph = Marshal.load(File.read("#{filename}_graph")) # unmarshal it
+ graph.each do |statement|
+ g << statement # need to do this because the unmarshalled object isn't entirely functional as an RDF::Graph object
+ end
+ warn "returning a graph of #{g.size}"
+ break
+ end
+ # return an empty graph otherwise
+ g
+ end
+
+ def self.writeRDFCache(reader:, body:)
+ filename = Digest::MD5.hexdigest body
+ graph = RDF::Graph.new
+ reader.each_statement { |s| graph << s }
+ warn "WRITING RDF TO CACHE #{filename}"
+ File.open("/tmp/#{filename}_graph", 'wb') { |f| f.write(Marshal.dump(graph)) }
+ File.open("/tmp/#{filename}_graphbody", 'wb') { |f| f.write(body) }
+ warn "wrote RDF filename: #{filename}"
+ end
+
+ def self.checkCache(uri, headers)
+ filename = Digest::MD5.hexdigest uri + headers.to_s
+ warn "Checking Error cache for #{filename}"
+ if File.exist?("/tmp/#{filename}_error")
+ warn 'Error file found in cache... returning'
+ return ['ERROR', nil, nil]
+ end
+ if File.exist?("/tmp/#{filename}_head") and File.exist?("/tmp/#{filename}_body")
+ warn 'FOUND data in cache'
+ head = Marshal.load(File.read("/tmp/#{filename}_head"))
+ body = Marshal.load(File.read("/tmp/#{filename}_body"))
+ all_uris = ''
+ all_uris = Marshal.load(File.read("/tmp/#{filename}_uri")) if File.exist?("/tmp/#{filename}_uri")
+ warn 'Returning....'
+ return [head, body, all_uris]
+ end
+ warn 'Not Found in Cache'
+ end
+
+ def self.writeToCache(uri, headers, head, body, all_uris)
+ filename = Digest::MD5.hexdigest uri + headers.to_s
+ warn "in writeToCache Writing to cache for #{filename}"
+ headfilename = filename + '_head'
+ bodyfilename = filename + '_body'
+ urifilename = filename + '_uri'
+ File.open("/tmp/#{headfilename}", 'wb') { |f| f.write(Marshal.dump(head)) }
+ File.open("/tmp/#{bodyfilename}", 'wb') { |f| f.write(Marshal.dump(body)) }
+ File.open("/tmp/#{urifilename}", 'wb') { |f| f.write(Marshal.dump(all_uris)) }
+ end
+
+ def self.writeErrorToCache(uri, headers)
+ filename = Digest::MD5.hexdigest uri + headers.to_s
+ warn "in writeErrorToCache Writing error to cache for #{filename}"
+ File.open("/tmp/#{filename}_error", 'wb') { |f| f.write('ERROR') }
+ end
  end
  end
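For reference, the add_warning helper introduced above looks up the warning text from the remote warnings.json and appends a hash to meta.warnings. A hedged sketch of the resulting entry, using the new warning "015" from this release (the URL is hypothetical and the message is shortened):

meta = FspHarvester::MetadataObject.new
meta.add_warning(['015', 'https://example.org/record/1', { 'Accept' => '*/*' }])
# meta.warnings.last is now roughly:
# { "id" => "015",
#   "message" => "Link headers do not include a link of type 'type', ...",
#   "severity" => "WARN",
#   "linkout" => "",
#   "processed_url" => "https://example.org/record/1",
#   :accept_headers => { "Accept" => "*/*" } }  # symbol key, from the `"accept_headers":` literal syntax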
data/lib/signposting_tests.rb ADDED
@@ -0,0 +1,87 @@
+ def check_for_citeas_conflicts(citeas: )
+ @meta.comments << 'INFO: checking for conflicting cite-as links'
+ citeas_hrefs = Hash.new
+ citeas.each do |link|
+ warn "INFO: Adding citeas #{link.href} to the testing queue."
+ @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
+ citeas_hrefs[link.href] = link
+ end
+
+ if citeas_hrefs.length > 1
+ @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
+ @meta.add_warning(['007', '', ''])
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
+ end
+ citeas_hrefs.values # return list of unique links
+ end
+
+
+ def check_describedby_rules(describedby:)
+ describedby.each do |l|
+ unless l.respond_to? 'type'
+ @meta.add_warning(['005', l.href, ''])
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
+ end
+ type = l.type if l.respond_to? 'type'
+ type ||= '*/*'
+ header = { accept: type }
+ response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+ if response
+ responsetype = response.headers[:content_type]
+ @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
+ if responsetype =~ %r{^(.*/[^;]+)}
+ responsetype = Regexp.last_match(1).to_s # remove the e.g. charset information
+ end
+ @meta.comments << "INFO: testing content type |#{responsetype}| against |#{type}|\n"
+ if type != '*/*'
+ if responsetype == type
+ @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
+ else
+ @meta.add_warning(['009', l.href, header])
+ @meta.comments << "WARN: Content type of returned describedby link #{responsetype}does not match the 'type' attribute #{type}\n"
+ end
+ else
+ @meta.add_warning(['010', l.href, header])
+ @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
+ end
+ else
+ @meta.add_warning(['008', l.href, header])
+ @meta.comments << "WARN: describedby link doesn't resolve\n"
+ end
+ end
+ end
+
+ def check_item_rules(item:)
+ item.each do |l| # l = LinkHeaders::Link
+ unless l.respond_to? 'type'
+ @meta.add_warning(['011', l.href, ''])
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
+ end
+ type = l.type if l.respond_to? 'type'
+ type ||= '*/*' # this becomes a frozen string
+ header = { accept: type }
+ response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+
+ if response
+ if response.headers[:content_type] and type != '*/*'
+ rtype = type.gsub(%r{/}, "\/") # because type is a frozen string
+ rtype = rtype.gsub(/\+/, '.')
+ typeregex = Regexp.new(type)
+ if response.headers[:content_type].match(typeregex)
+ warn response.headers[:content_type]
+ warn typeregex.inspect
+ @meta.comments << "INFO: item link responds according to Signposting specifications\n"
+ else
+ @meta.add_warning(['012', l.href, header])
+ @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
+ end
+ else
+ @meta.add_warning(['013', l.href, header])
+ @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
+ end
+ else
+ @meta.add_warning(['014', l.href, header])
+ @meta.comments << "WARN: item link doesn't resolve\n"
+ end
+ end
+ end
data/lib/warnings.json CHANGED
@@ -1,6 +1,6 @@
  {
  "001": {
- "message": "Unable to resolve guid using Accept headers for Linked Data",
+ "message": "Unable to resolve guid using default (*/*) Accept headers",
  "linkout": "",
  "severity": "WARN"
  },
@@ -68,7 +68,40 @@
  "message": "Item link does not resolve",
  "linkout": "",
  "severity": "WARN"
+ },
+ "015": {
+ "message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
+ "linkout": "",
+ "severity": "WARN"
+ },
+ "016": {
+ "message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
+ "linkout": "",
+ "severity": "WARN"
+ },
+ "017": {
+ "message": "Metadata format not recognized.",
+ "linkout": "",
+ "severity": "WARN"
+ },
+ "018": {
+ "message": "RDF parsing error - likely malformed RDF document.",
+ "linkout": "",
+ "severity": "WARN"
+ },
+ "019": {
+ "message": "HTML parsing error - unable to extract linked data from HTML.",
+ "linkout": "",
+ "severity": "WARN"
+ },
+ "020": {
+ "message": "XML parsing error - unable to process XML document.",
+ "linkout": "",
+ "severity": "WARN"
+ },
+ "021": {
+ "message": "JSON parsing error - unable to process JSON document.",
+ "linkout": "",
+ "severity": "WARN"
  }
-
-
  }
data/lib/web_utils.rb CHANGED
@@ -1,7 +1,7 @@
  module FspHarvester

  class WebUtils
- def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
+ def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
  warn 'In fetch routine now. '

  begin
@@ -13,19 +13,19 @@ module FspHarvester
  # password: pass,
  headers: headers
  })
- @meta.finalURI |= [response.request.url] if @meta # it's possible to call this method without affecting the metadata object being created by the harvester
- warn "There was a response to the call #{url}"
- warn "There was a response to the call #{response.request.url}"
+ meta.all_uris |= [response.request.url] # it's possible to call this method without affecting the metadata object being created by the harvester
+ warn "starting URL #{url}"
+ warn "final URL #{response.request.url}"
  warn "Response code #{response.code}"
- if response.code == 203 && @meta
- @meta.warnings << ["002", url, headers]
- @meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
+ if response.code == 203
+ meta.warnings << ["002", url, headers]
+ meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
  end
  response
  rescue RestClient::ExceptionWithResponse => e
  warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
- @meta.warnings << ["003", url, headers] if @meta
- @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
+ meta.warnings << ["003", url, headers]
+ meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
  if (e.response.code == 500 or e.response.code == 404)
  return false
  else
@@ -34,14 +34,14 @@ module FspHarvester
  # now we are returning the headers and body that were returned
  rescue RestClient::Exception => e
  warn "EXCEPTION WITH NO RESPONSE! #{e}"
- @meta.warnings << ["003", url, headers] if @meta
- @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
+ meta.warnings << ["003", url, headers]
+ meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
  false
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
  rescue Exception => e
  warn "EXCEPTION UNKNOWN! #{e}"
- @meta.warnings << ["003", url, headers] if @meta
- @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
+ meta.warnings << ["003", url, headers]
+ meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
  false
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
  end
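Because fspfetch now receives the metadata object explicitly instead of reading an instance variable, a standalone call might look like this sketch (the URL is a placeholder; any redirects end up recorded in meta.all_uris):

meta = FspHarvester::MetadataObject.new
response = FspHarvester::WebUtils.fspfetch(
  url: 'https://example.org/dataset/1',  # placeholder URL
  headers: { 'Accept' => '*/*' },
  method: :head,
  meta: meta
)
warn "resolved to #{meta.all_uris.last}" if response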
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: fsp_harvester
  version: !ruby/object:Gem::Version
- version: 0.1.7
+ version: 0.1.11
  platform: ruby
  authors:
  - Mark Wilkinson
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2022-08-02 00:00:00.000000000 Z
+ date: 2022-08-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: json
@@ -44,14 +44,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.1.13
+ version: 0.1.16
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.1.13
+ version: 0.1.16
  - !ruby/object:Gem::Dependency
  name: metainspector
  requirement: !ruby/object:Gem::Requirement
@@ -171,6 +171,7 @@ executables: []
  extensions: []
  extra_rdoc_files: []
  files:
+ - ".rspec_status"
  - CHANGELOG.md
  - Gemfile
  - Gemfile.lock
@@ -180,10 +181,17 @@ files:
  - bin/console
  - bin/setup
  - example_test.rb
+ - launch.json
+ - lib/config.conf_docker
+ - lib/config.conf_local
  - lib/constants.rb
  - lib/fsp_harvester.rb
  - lib/fsp_harvester/version.rb
+ - lib/fsp_metadata_external_tools.rb
+ - lib/fsp_metadata_harvester.rb
+ - lib/fsp_metadata_parser.rb
  - lib/metadata_object.rb
+ - lib/signposting_tests.rb
  - lib/swagger.rb
  - lib/warnings.json
  - lib/web_utils.rb