fsp_harvester 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7da32f3321193e93f64154c35db0c840f5e7f086451660f99b62d3f4c834e295
4
- data.tar.gz: 5a0a1ff4ef6b2100accd8bab4f20d6842d25ebf88d5e600e646804da9ed24bd9
3
+ metadata.gz: 923ccb362ef4a71fa2f221b1b224dbd5d3ec78a14cc9da0e12a1e0df804162ff
4
+ data.tar.gz: e70a9a994c504a0867ab10e05c07d440a76677d7ff27e27b9bdb0c3338c02ffe
5
5
  SHA512:
6
- metadata.gz: dad90f81b73489a151220c132d74508832573a352b209a537bbbaf6543a90b9e132cca80ecca33d15d02eca91841642d73e795113076f4564730194b5bf1fa53
7
- data.tar.gz: 5d11c2f002f4e73a4971aec0d047cd14d35f6763155d0effa9c79f419ff6fd26f853158b290d2264edd148633570f5ecd50d4e1f6c7e9d136c7d2880517bdefc
6
+ metadata.gz: d819cb1c19d40f8a093b723cbeb4273d122d3207df2d874558327fa6a2622be1bf2c671cc9dfebee74c689722825c2dd1957f8be6bfcf2c14c09097c1dc05a5b
7
+ data.tar.gz: 508a484d93eab373d0389c11f4361068d5586759a73573901ea54f4d2a028f713f71bb492976785499f1074eb8b359d79be550c09b196eb82838a1e401398fc4
data/.rspec_status ADDED
@@ -0,0 +1,55 @@
1
+ example_id | status | run_time |
2
+ ---------------------------------- | ------ | --------------- |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.17 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 0.98776 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 0.69753 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 1.31 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 2.07 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 1.45 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 2.75 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 1.83 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 2.51 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 1.73 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 2.35 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.01 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 2.56 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 1.68 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.06 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.03 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 0.94321 seconds |
20
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.1 seconds |
21
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.45 seconds |
22
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 1.53 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 1.64 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.01 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.09 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.22 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.38248 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 2.24 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.08 seconds |
30
+ ./spec/describedby_spec.rb[1:1:3] | passed | 1 second |
31
+ ./spec/describedby_spec.rb[1:1:4] | passed | 1.14 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.03 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 0.81364 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 0.77543 seconds |
35
+ ./spec/describedby_spec.rb[1:1:8] | passed | 2.01 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 1.35 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 1.73 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.36 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.73 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.5 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 1.8 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 1.65 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00053 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | passed | 1.76 seconds |
45
+ ./spec/item_spec.rb[1:1:1] | passed | 2.08 seconds |
46
+ ./spec/item_spec.rb[1:1:2] | passed | 2.27 seconds |
47
+ ./spec/item_spec.rb[1:1:3] | passed | 1.22 seconds |
48
+ ./spec/item_spec.rb[1:1:4] | passed | 1.61 seconds |
49
+ ./spec/item_spec.rb[1:1:5] | passed | 1.74 seconds |
50
+ ./spec/item_spec.rb[1:1:6] | passed | 1.95 seconds |
51
+ ./spec/item_spec.rb[1:1:7] | passed | 3.59 seconds |
52
+ ./spec/item_spec.rb[1:1:8] | passed | 0.41001 seconds |
53
+ ./spec/type_spec.rb[1:1:1] | passed | 1.14 seconds |
54
+ ./spec/type_spec.rb[1:1:2] | passed | 0.94799 seconds |
55
+ ./spec/type_spec.rb[1:1:3] | passed | 1.04 seconds |
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.7)
4
+ fsp_harvester (0.1.9)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.13)
7
+ linkheaders-processor (~> 0.1.15)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -126,10 +126,11 @@ GEM
126
126
  shex (~> 0.7)
127
127
  sparql (~> 3.2)
128
128
  sparql-client (~> 3.2)
129
- linkheaders-processor (0.1.13)
129
+ linkheaders-processor (0.1.15)
130
130
  json (~> 2.0)
131
131
  json-ld (~> 3.2)
132
132
  json-ld-preloaded (~> 3.2)
133
+ link_header (~> 0.0.8)
133
134
  metainspector (~> 5.11.2)
134
135
  rest-client (~> 2.1)
135
136
  securerandom (~> 0.1.0)
@@ -248,7 +249,7 @@ GEM
248
249
  diff-lcs (>= 1.2.0, < 2.0)
249
250
  rspec-support (~> 3.11.0)
250
251
  rspec-support (3.11.0)
251
- rubocop (1.32.0)
252
+ rubocop (1.33.0)
252
253
  json (~> 2.3)
253
254
  parallel (~> 1.10)
254
255
  parser (>= 3.1.0.0)
data/launch.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "name": "RSpec - all",
3
+ "type": "Ruby",
4
+ "request": "launch",
5
+ "cwd": "${workspaceRoot}",
6
+ "program": "/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rspec",
7
+ "args": [
8
+ "-I",
9
+ "${workspaceRoot}"
10
+ ]
11
+ }
@@ -0,0 +1,8 @@
1
+ [extruct]
2
+ command="extruct"
3
+
4
+ [rdf]
5
+ command="/usr/local/bundle/bin/rdf"
6
+
7
+ [tika]
8
+ command="http://tika:9998/meta"
@@ -0,0 +1,8 @@
1
+ [extruct]
2
+ command="extruct"
3
+
4
+ [rdf]
5
+ command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
6
+
7
+ [tika]
8
+ command="http://tika:9998/meta"
data/lib/constants.rb CHANGED
@@ -1,17 +1,20 @@
1
1
  ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
2
2
 
3
+ ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
4
+
3
5
  TEXT_FORMATS = {
4
6
  'text' => ['text/plain']
5
7
  }
6
8
 
7
9
  RDF_FORMATS = {
8
- 'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
10
+ 'jsonld' => ['application/ld+json','application/x-ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
9
11
  'turtle' => ['text/turtle', 'application/n3', 'application/rdf+n3',
10
12
  'application/turtle', 'application/x-turtle', 'text/n3', 'text/turtle',
11
13
  'text/rdf+n3', 'text/rdf+turtle'],
12
14
  # 'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
13
15
  'rdfxml' => ['application/rdf+xml'],
14
- 'triples' => ['application/n-triples', 'application/n-quads', 'application/trig']
16
+ 'ntriples' => ['application/n-triples', 'application/trig'],
17
+ 'nquads' => ['application/n-quads']
15
18
  }
16
19
 
17
20
  XML_FORMATS = {
@@ -73,12 +76,10 @@ GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
73
76
  'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
74
77
 
75
78
  CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
76
- if CONFIG['extruct'] && CONFIG['extruct']['command'] && !CONFIG['extruct']['command'].empty?
77
- extruct = config['extruct']['command']
78
- end
79
- extruct = 'extruct' unless @extruct_command
79
+ extruct = CONFIG.dig(:extruct, :command)
80
+ extruct ||= 'extruct'
80
81
  extruct.strip!
81
- case @extruct
82
+ case extruct
82
83
  when /[&|;`$\s]/
83
84
  abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
84
85
  when /echo/i
@@ -86,8 +87,8 @@ when /echo/i
86
87
  end
87
88
  EXTRUCT_COMMAND = extruct
88
89
 
89
- rdf_command = CONFIG['rdf']['command'] if CONFIG['rdf'] && CONFIG['rdf']['command'] && !CONFIG['rdf']['command'].empty?
90
- rdf_command = 'rdf' unless @rdf_command
90
+ rdf_command = CONFIG.dig(:rdf, :command)
91
+ rdf_command ||= 'rdf'
91
92
  rdf_command.strip
92
93
  case rdf_command
93
94
  when /[&|;`$\s]/
@@ -99,8 +100,6 @@ when !(/rdf$/ =~ $_)
99
100
  end
100
101
  RDF_COMMAND = rdf_command
101
102
 
102
- if CONFIG['tika'] && CONFIG['tika']['command'] && !CONFIG['tika']['command'].empty?
103
- tika_command = CONFIG['tika']['command']
104
- end
105
- tika_command = 'http://localhost:9998/meta' unless @tika_command
103
+ tika_command = CONFIG.dig(:tika, :command)
104
+ tika_command ||= 'http://localhost:9998/meta'
106
105
  TIKA_COMMAND = tika_command
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.7" # up to date
4
+ VERSION = "0.1.9"
5
5
  end
data/lib/fsp_harvester.rb CHANGED
@@ -20,6 +20,9 @@ require_relative './metadata_object'
20
20
  require_relative './constants'
21
21
  require_relative './web_utils'
22
22
  require_relative './signposting_tests'
23
+ require_relative './fsp_metadata_harvester'
24
+ require_relative './fsp_metadata_parser'
25
+
23
26
 
24
27
  module FspHarvester
25
28
  class Error < StandardError
@@ -32,11 +35,12 @@ module FspHarvester
32
35
 
33
36
  def self.resolve_guid(guid:)
34
37
  @meta = FspHarvester::MetadataObject.new
35
- @meta.finalURI = [guid]
38
+ @meta.all_uris = [guid]
36
39
  type, url = convertToURL(guid: guid)
37
40
  links = Array.new
38
41
  if type
39
42
  links = resolve_url(url: url)
43
+ @meta.links << links
40
44
  else
41
45
  @meta.warnings << ['006', guid, '']
42
46
  @meta.comments << "FATAL: GUID type not recognized.\n"
@@ -44,6 +48,16 @@ module FspHarvester
44
48
  [links, @meta]
45
49
  end
46
50
 
51
# Harvest metadata from the 'describedby' links among +links+.
#
# links    - link objects responding to #relation (as produced by resolve_guid)
# metadata - the metadata object to fill; normally the one created by
#            resolve_guid, but a fresh one is built when none is supplied
#
# Returns the metadata object after extraction has run.
def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new)
  @meta = metadata
  describedby_only = links.select { |candidate| candidate.relation == 'describedby' }
  # everything that is found is accumulated into the @meta metadata object
  FspHarvester::MetadataHarvester.extract_metadata(links: describedby_only, metadata: @meta)
  @meta
end
60
+
47
61
  def self.convertToURL(guid:)
48
62
  GUID_TYPES.each do |k, regex|
49
63
  if k == 'inchi' and regex.match(guid)
@@ -68,10 +82,10 @@ module FspHarvester
68
82
  false
69
83
  end
70
84
 
71
- def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
85
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
72
86
  @meta.guidtype = 'uri' if @meta.guidtype.nil?
73
87
  warn "\n\n FETCHING #{url} #{header}\n\n"
74
- response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
88
+ response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
75
89
  warn "\n\n head #{response.headers.inspect}\n\n" if response
76
90
 
77
91
  unless response
@@ -80,7 +94,7 @@ module FspHarvester
80
94
  return []
81
95
  end
82
96
 
83
- @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.finalURI.last}. Using the output from this URL for the next few tests..."
97
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
84
98
  @meta.full_response << response.body
85
99
 
86
100
  links = process_link_headers(response: response) unless nolinkheaders
@@ -90,7 +104,7 @@ module FspHarvester
90
104
  def self.process_link_headers(response:)
91
105
  warn "\n\n parsing #{response.headers}\n\n"
92
106
 
93
- parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
107
+ parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
94
108
  parser.extract_and_parse(response: response)
95
109
  factory = parser.factory # LinkHeaders::LinkFactory
96
110
 
@@ -105,6 +119,8 @@ module FspHarvester
105
119
  citeas = Array.new
106
120
  describedby = Array.new
107
121
  item = Array.new
122
+ types = Array.new
123
+
108
124
  factory.all_links.each do |l|
109
125
  case l.relation
110
126
  when 'cite-as'
@@ -113,23 +129,29 @@ module FspHarvester
113
129
  item << l
114
130
  when 'describedby'
115
131
  describedby << l
132
+ when 'type'
133
+ types << l
116
134
  end
117
135
  end
118
136
 
119
137
  check_describedby_rules(describedby: describedby)
120
138
  check_item_rules(item: item)
121
139
 
122
- uniqueciteas = Array.new
123
140
  if citeas.length > 1
124
141
  warn "INFO: multiple cite-as links found. Checking for conflicts\n"
125
142
  @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
126
- uniqueciteas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
143
+ citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
127
144
  end
128
145
 
129
- unless uniqueciteas == 1 && describedby.length > 0
146
+ unless citeas.length == 1 && describedby.length > 0
130
147
  @meta.warnings << ['004', '', '']
131
148
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
132
149
  end
150
+
151
+ unless types.length >=1
152
+ @meta.warnings << ['015', '', '']
153
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
154
+ end
133
155
  end
134
156
  end
135
157
  end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
module FspHarvester
  class Error < StandardError
  end

  # Runs the external command-line extractors (the RDF.rb "distiller" and
  # extruct) against harvested content and records the outcome (comments,
  # warnings, merged data) on a metadata object.
  class ExternalTools
    # SHA256 digests of message bodies that have already been sent through the
    # distiller; shared by every instance of this class.
    @@distillerknown = {}

    # metadata - object that receives comments, warnings and merged data
    def initialize(metadata: FspHarvester::MetadataObject.new)
      @meta = metadata
    end

    # Extract RDFa from +body+ with the `rdf serialize` command and hand the
    # resulting JSON-LD to the RDF parser. A body that was already processed
    # (same SHA256 digest) is skipped.
    #
    # body - String containing the message body; it is not modified
    def process_with_distiller(body:)
      bhash = Digest::SHA256.hexdigest(body)
      if @@distillerknown[bhash]
        @meta.comments << "INFO: data is already parsed by distiller.\n"
        return
      end

      @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
      # work on a copy so the caller's string is never re-encoded in place
      text = body.dup.force_encoding('UTF-8').scrub
      text = text.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"') # a bug in distiller, apparently

      @meta.comments << "INFO: The message body is being examined by Distiller\n"
      result, stderr = run_distiller(text)
      warn ''
      warn "distiller errors: #{stderr}"

      result = result.to_s.dup.force_encoding('UTF-8')
      warn "DIST RESULT: #{result}"
      if result !~ /@context/i # failure returns nil
        @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
        @meta.warnings << ['018', '', '']
      else
        @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
        # parse_rdf is implemented by MetadataParser, not by this class
        FspHarvester::MetadataParser.new(metadata_object: @meta)
                                    .parse_rdf(body: result, content_type: 'application/ld+json')
      end
      @@distillerknown[bhash] = true
    end

    # Run the extruct command against +uri+ and merge whatever it reports
    # (JSON-LD, microdata, microformat, opengraph, RDFa) into the metadata.
    #
    # uri - a URL String, or a link object responding to #href
    def process_with_extruct(uri:)
      target = uri.respond_to?(:href) ? uri.href.to_s : uri.to_s
      @meta.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{target}.\n"
      warn 'begin open3'
      begin
        # argv form: the URL is passed as one argument and never interpreted by a shell
        stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND, target)
      rescue SystemCallError => e
        @meta.comments << "WARN: extruct threw an error #{e.message} when attempting to parse return value (message body) of #{target}.\n"
        @meta.warnings << ['019', '', '']
        return
      end
      warn "open3 status: #{status} #{stdout}"
      result = stderr.to_s # absurd that the output comes over stderr! LOL!

      if (failure = result.match(/(Failed\sto\sextract.*?)\n/))
        @meta.comments << "WARN: extruct threw an error #{failure[1]} when attempting to parse return value (message body) of #{target}.\n"
        @meta.warnings << ['019', '', '']
        if (value_error = result.match(/(ValueError:.*?)\n/))
          @meta.comments << "WARN: extruct error was #{value_error[1]}\n"
          @meta.warnings << ['019', '', '']
        end
      elsif result.match(/^\s+?\{/) or result.match(/^\s+\[/) # this is JSON
        begin
          json = JSON.parse(result)
        rescue JSON::ParserError
          @meta.comments << "WARN: the extruct tool failed to find parseable data at #{target}\n"
          return
        end
        @meta.comments << "INFO: the extruct tool found parseable data at #{target}\n"
        merge_extruct_output(json)
      else
        @meta.comments << "WARN: the extruct tool failed to find parseable data at #{target}\n"
      end
    end
    # historical (misspelled) name of process_with_extruct, kept for existing callers
    alias processs_with_extruct process_with_extruct

    private

    # Write +text+ to a temporary file, run the rdf command over it and return
    # [stdout, stderr]. The temporary file is removed even when the command fails.
    def run_distiller(text)
      file = Tempfile.new('foo', encoding: 'UTF-8')
      begin
        file.write(text)
        file.flush # make sure the external process sees the full content
        argv = [RDF_COMMAND, 'serialize', '--input-format', 'rdfa', '--output-format', 'jsonld', file.path]
        warn "distiller command: LANG=en_US.UTF-8 #{argv.join(' ')}"
        stdout, stderr, _status = Open3.capture3({ 'LANG' => 'en_US.UTF-8' }, *argv)
        [stdout, stderr]
      rescue SystemCallError => e
        # e.g. the command is not installed; reported like any other distiller failure
        [nil, e.message]
      ensure
        file.close
        file.unlink
      end
    end

    # Merge the parsed extruct output into the metadata object. Sections that
    # are missing or empty are skipped.
    def merge_extruct_output(json)
      unless json.is_a?(Hash)
        # a top-level JSON array: merge its first element when it is hash-shaped
        @meta.merge_hash(json.first) if json.is_a?(Array) && json.first.is_a?(Hash)
        return
      end

      filled = ->(section) { section.respond_to?(:any?) && section.any? }
      rdf_parser = FspHarvester::MetadataParser.new(metadata_object: @meta)

      rdf_parser.parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if filled.call(json['json-ld']) # RDF
      %w[microdata microformat opengraph].each do |section|
        next unless filled.call(json[section])

        first = json[section].first
        @meta.merge_hash(first) if first.is_a?(Hash)
      end
      rdf_parser.parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if filled.call(json['rdfa']) # RDF
    end
  end
end
@@ -0,0 +1,164 @@
1
+ # frozen_string_literal: true
2
+
3
module FspHarvester
  class Error < StandardError
  end

  # Resolves 'describedby' links, works out what kind of document each one
  # returns, and hands the body to the matching MetadataParser routine.
  # All state is kept on the metadata object stored in @meta.
  class MetadataHarvester
    # Fetch and process every 'describedby' link in +links+.
    #
    # links    - link objects responding to #relation and #href (and optionally #type)
    # metadata - metadata object that accumulates comments, warnings and data
    def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
      @meta = metadata
      @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'

      describedby = links.select { |l| l.relation == 'describedby' }

      hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
      describedby.each do |link|
        accept = link.respond_to?('type') ? link.type : nil
        accepttype = accept ? { 'Accept' => accept } : ACCEPT_STAR_HEADER

        response = attempt_to_resolve(link: link, headers: accepttype)
        next unless response # the failure (016) was already recorded by attempt_to_resolve

        abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
        unless abbreviation
          @meta.warnings << ['017', link.href, accepttype]
          @meta.comments << "WARN: metadata format returned from #{link.href} using Accept header #{accepttype} is not recognized. Processing will end now.\n"
          next
        end

        # process according to detected type
        case abbreviation
        when 'html'
          @meta.comments << 'INFO: Processing html'
          hvst.process_html(body: response.body, uri: link.href)
        when 'xml'
          @meta.comments << 'INFO: Processing xml'
          hvst.process_xml(body: response.body)
        when 'json'
          @meta.comments << 'INFO: Processing json'
          hvst.process_json(body: response.body)
        when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
          @meta.comments << 'INFO: Processing linked data'
          hvst.process_ld(body: response.body, content_type: content_type)
        when 'specialist'
          warn 'no specialized parsers so far'
        end
      end
    end

    # GET the link target. The link's own 'type' attribute is used as the
    # Accept header when present; otherwise +headers+ is sent.
    #
    # Returns the response, or nil/false when the fetch failed (warning 016 is recorded).
    def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
      @meta.comments << "INFO: link #{link.href} being processed"
      declared = link.respond_to?('type') ? link.type : nil
      if declared
        header = { 'Accept' => declared }
      else
        header = headers
        @meta.comments << "INFO: link #{link.href} has no MIME type, defaulting to */*"
      end
      url = link.href
      response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
      unless response
        @meta.warnings << ['016', url, header]
        @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
      end
      response
    end

    # Inspect +body+ and decide which family of format it belongs to.
    #
    # body    - the message body (nil is treated as empty)
    # headers - response headers; only headers[:content_type] is read
    #
    # Returns [abbreviation, content_type]; both are nil when nothing matched.
    # Recording the '017' warning is left to the caller, which knows the URL.
    def self.attempt_to_detect_type(body:, headers:)
      # described by should be an html, xml, json, or linked data document
      body = body.to_s
      abbreviation = nil
      content_type = nil
      @meta.comments << "INFO: Testing metadata format for html, xml, and linked data formats\n"
      if body =~ /^\s*<\?xml/
        if body =~ /<HTML/i
          abbreviation = 'html'
          content_type = 'text/html'
          @meta.comments << "INFO: appears to be HTML\n"
        elsif body =~ /<rdf:RDF/i
          abbreviation = 'rdfxml'
          content_type = 'application/rdf+xml'
          @meta.comments << "INFO: appears to be RDF-XML\n"
        else
          abbreviation = 'xml'
          content_type = 'application/xml'
          @meta.comments << "INFO: appears to be XML\n"
        end
      else
        abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
        abbreviation, content_type = check_json(body: body) unless abbreviation
      end

      @meta.comments << "WARN: metadata format could not be detected from the message body.\n" unless content_type
      [abbreviation, content_type]
    end

    # Ask RDF.rb to recognise +body+ as a linked-data serialization.
    # +claimed_type+ is accepted for interface compatibility and is not consulted.
    #
    # Returns [abbreviation, content_type]; the abbreviation is nil when the
    # body is not linked data or its MIME type is not in the known tables.
    def self.check_ld(body:, claimed_type:)
      detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
      unless detected_type
        detected_type = RDF::Format.for({ sample: body[0..5000] })
        @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
      end
      # nil (not '') so that callers testing truthiness fall through to the JSON check
      contenttype = nil
      abbreviation = nil
      if detected_type
        contenttype = detected_type.content_type.first # comes back as array
        abbreviation = abbreviate_type(contenttype: contenttype)
        @meta.comments << "INFO: using content-type #{contenttype}.\n"
      else
        @meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
      end
      [abbreviation, contenttype]
    end

    # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
    #
    # Looks at the first line shaped like "<s> <p> object", appends " ." and
    # asks RDF.rb to detect it. Returns RDF::NTriples::Format or nil.
    def self.ntriples_hack(body:)
      detected_type = nil
      # iterate over lines: splitting on whitespace would break every triple
      # into tokens that can never match the pattern below
      body.to_s.each_line do |raw|
        line = raw.strip
        next if line.empty?
        next unless line =~ %r{\s*<[^>]+>\s*<[^>]+>\s\S+}

        @meta.comments << "INFO: running ntriples hack on #{line + " ."}\n"
        detected_type = RDF::Format.for({ sample: "#{line} ." }) # adding a period allows detection of ntriples by distiller
        break
      end
      @meta.comments << "INFO: ntriples hack found: #{detected_type.to_s}\n"
      return nil if detected_type != RDF::NTriples::Format # only return the hacky case

      detected_type
    end

    # Decide whether +body+ is JSON.
    #
    # Returns ['json', 'application/ld+json'] when the body parses, [nil, nil] otherwise.
    # NOTE(review): 'application/ld+json' is reported for plain JSON as in the
    # original code — confirm whether 'application/json' was intended.
    def self.check_json(body:)
      parsed = begin
        JSON.parse(body)
      rescue JSON::ParserError, TypeError
        nil
      end

      if parsed
        ['json', 'application/ld+json']
      else
        @meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
        [nil, nil]
      end
    end

    # Map a MIME type onto the short format name used as a key in the
    # RDF/XML/HTML/JSON format tables. Returns nil when no table lists it.
    def self.abbreviate_type(contenttype:)
      foundtype = nil
      RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
        warn "\n\ntype #{type}\nvals #{vals}\n\n"
        @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
        next unless vals.include? contenttype

        foundtype = type
        @meta.comments << "INFO: detected a #{type} MIME type"
        break
      end
      foundtype
    end
  end
end
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
module FspHarvester
  class Error < StandardError
  end

  # Turns a fetched message body (HTML, XML, JSON or linked data) into
  # entries on a metadata object: hash-style data via merge_hash and
  # RDF statements via merge_rdf.
  class MetadataParser
    # attr_accessor :distillerknown

    @@distillerknown = {}

    # metadata_object - object receiving comments, warnings, hash data and triples
    def initialize(metadata_object: FspHarvester::MetadataObject.new)
      @meta = metadata_object
    end

    # Run both external extractors (distiller, extruct) over an HTML document.
    def process_html(body:, uri:)
      tools = FspHarvester::ExternalTools.new(metadata: @meta)
      tools.process_with_distiller(body: body)
      tools.process_with_extruct(uri: uri)
    end

    # Parse +body+ as XML and merge the resulting hash into the metadata.
    # Malformed XML records warning 020 and leaves the metadata untouched.
    def process_xml(body:)
      begin
        hash = XmlSimple.xml_in(body)
      rescue StandardError
        @meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
        @meta.warnings << ['020', '', '']
        return
      end
      merge_parsed(hash, 'XML')
    end

    # Parse +body+ as JSON and merge the resulting hash into the metadata.
    # Malformed JSON records warning 021 and leaves the metadata untouched.
    def process_json(body:)
      begin
        hash = JSON.parse(body)
      rescue StandardError
        @meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
        @meta.warnings << ['021', '', '']
        return
      end
      merge_parsed(hash, 'JSON')
    end

    # Linked data of any serialization goes straight to the RDF parser.
    def process_ld(body:, content_type:)
      parse_rdf(body: body, content_type: content_type)
    end

    # Parse +body+ with the RDF.rb reader registered for +content_type+ and
    # merge the statements into the metadata graph. A previously cached graph
    # for the same body is reused; a fresh parse is written to the cache.
    # Every failure mode records warning 018 plus an explanatory comment
    # (except an empty parse result, which only adds a comment).
    def parse_rdf(body:, content_type:)
      if body.nil? || !body.match(/\w/)
        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
        @meta.warnings << ['018', '', '']
        return
      end

      # non-bang delete: delete! would return nil for a sample without newlines
      sample = body[0..300].delete("\n")

      rdfformat = RDF::Format.for(content_type: content_type)
      unless rdfformat
        @meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{sample}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
        @meta.warnings << ['018', '', '']
        return
      end

      graph = FspHarvester::Cache.checkRDFCache(body: body)
      if graph.size > 0
        warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
        @meta.merge_rdf(graph.to_a)
        return
      end

      warn "\n\n\nfound format #{rdfformat}\n\n"
      @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
      begin
        reader = rdfformat.reader.new(body)
      rescue StandardError => e
        @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
        @meta.warnings << ['018', '', '']
        return
      end

      begin
        if reader.size == 0
          @meta.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
          return
        end
        reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
        warn 'WRITING TO CACHE'
        FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
        warn 'WRITING DONE'
        reader = rdfformat.reader.new(body) # frustrating that we cannot rewind!
        warn 'RE-READING DONE'
        @meta.merge_rdf(reader.to_a)
        warn 'MERGE DONE'
      rescue RDF::ReaderError => e
        @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{sample})\n"
        warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{sample})\n"
        @meta.warnings << ['018', '', '']
      rescue StandardError => e
        @meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{sample}). Moving on...\n"
        warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body}). Moving on...\n"
        @meta.warnings << ['018', '', '']
      end
    end

    private

    # Store +parsed+ on the metadata object through merge_hash (Hash#merge on
    # @meta.hash would build a new hash and discard it). Only hash-shaped
    # documents can be merged; anything else is reported in a comment.
    def merge_parsed(parsed, label)
      unless parsed.is_a?(Hash)
        @meta.comments << "WARN: The #{label} did not contain a top-level object, so it cannot be merged in the metadata object\n"
        return
      end
      @meta.comments << "INFO: The #{label} is being merged in the metadata object\n"
      @meta.merge_hash(parsed)
    end
  end
end
@@ -1,6 +1,6 @@
1
1
  module FspHarvester
2
2
  class MetadataObject
3
- attr_accessor :hash, :graph, :comments, :warnings, :guidtype, :full_response, :finalURI # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
3
+ attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
4
4
 
5
5
  def initialize(_params = {}) # get a name from the "new" call, or set a default
6
6
  @hash = {}
@@ -8,15 +8,16 @@ module FspHarvester
8
8
  @comments = []
9
9
  @warnings = []
10
10
  @full_response = []
11
- @finalURI = []
11
+ @links = []
12
+ @all_uris = []
12
13
  end
13
14
 
14
15
  def merge_hash(hash)
15
- # $stderr.puts "\n\n\nIncoming Hash #{hash.inspect}"
16
+ # warn "\n\n\nIncoming Hash #{hash.inspect}"
16
17
  self.hash = self.hash.merge(hash)
17
18
  end
18
19
 
19
- def merge_rdf(triples) # incoming list of triples
20
+ def merge_rdf(triples) # incoming list of triples
20
21
  graph << triples
21
22
  graph
22
23
  end
@@ -25,4 +26,95 @@ module FspHarvester
25
26
  graph
26
27
  end
27
28
  end
29
+
30
# File-based cache kept in CACHE_DIR. Entries are named after an MD5 digest
# plus a suffix that tells what is stored:
#   <md5(uri)>_meta                 marshalled metadata object
#   <md5(body)>_graph / _graphbody  marshalled RDF graph and the body it came from
#   <md5(uri+headers)>_head / _body / _uri / _error   cached HTTP results
#
# SECURITY NOTE(review): entries are restored with Marshal.load from a
# directory that is typically writable by every local user. Marshal must not
# be used on data an attacker could have written; consider a private cache
# directory or a safer serialization.
class Cache
  CACHE_DIR = '/tmp'

  # Returns the cached metadata object for +uri+, or false when none exists.
  def self.retrieveMetaObject(uri)
    filename = (Digest::MD5.hexdigest uri) + '_meta'
    warn "Checking Meta cache for #{filename}"
    path = File.join(CACHE_DIR, filename)
    if File.exist?(path)
      warn 'FOUND Meta object in cache'
      meta = Marshal.load(File.binread(path))
      warn 'Returning....'
      return meta
    end
    warn 'Meta objectNot Found in Cache'
    false
  end

  # Stores +meta+ under the digest of +uri+.
  def self.cacheMetaObject(meta, uri)
    filename = (Digest::MD5.hexdigest uri) + '_meta'
    warn "in cacheMetaObject Writing to cache for #{filename}"
    File.binwrite(File.join(CACHE_DIR, filename), Marshal.dump(meta))
  end

  # Returns an RDF::Graph holding the statements cached for exactly this
  # +body+, or an empty graph when nothing is cached.
  #
  # The entry is located through the MD5 of the body — the same key
  # writeRDFCache uses — so a different body that merely has the same
  # length can no longer be served somebody else's graph.
  def self.checkRDFCache(body: )
    g = RDF::Graph.new
    base = File.join(CACHE_DIR, Digest::MD5.hexdigest(body))
    graphfile = "#{base}_graph"
    bodyfile = "#{base}_graphbody"
    return g unless File.exist?(graphfile) && File.exist?(bodyfile)
    return g unless File.size(bodyfile) == body.bytesize # cheap sanity check on the stored body

    warn "RDF Cache File #{base} FOUND"
    graph = Marshal.load(File.binread(graphfile)) # unmarshal it
    graph.each do |statement|
      g << statement # need to do this because the unmarshalled object isn't entirely functional as an RDF::Graph object
    end
    warn "returning a graph of #{g.size}"
    g
  end

  # Reads every statement from +reader+ and caches the resulting graph,
  # together with +body+, under the MD5 of the body.
  def self.writeRDFCache(reader:, body:)
    filename = Digest::MD5.hexdigest body
    graph = RDF::Graph.new
    reader.each_statement { |s| graph << s }
    warn "WRITING RDF TO CACHE #{filename}"
    File.binwrite(File.join(CACHE_DIR, "#{filename}_graph"), Marshal.dump(graph))
    File.binwrite(File.join(CACHE_DIR, "#{filename}_graphbody"), body)
    warn "wrote RDF filename: #{filename}"
  end

  # Looks up a cached HTTP result for +uri+ fetched with +headers+.
  #
  # Returns ['ERROR', nil, nil] when a failure was cached,
  # [head, body, all_uris] when a response was cached, and nil otherwise.
  def self.checkCache(uri, headers)
    filename = Digest::MD5.hexdigest uri + headers.to_s
    base = File.join(CACHE_DIR, filename)
    warn "Checking Error cache for #{filename}"
    if File.exist?("#{base}_error")
      warn 'Error file found in cache... returning'
      return ['ERROR', nil, nil]
    end
    if File.exist?("#{base}_head") and File.exist?("#{base}_body")
      warn 'FOUND data in cache'
      head = Marshal.load(File.binread("#{base}_head"))
      body = Marshal.load(File.binread("#{base}_body"))
      all_uris = ''
      all_uris = Marshal.load(File.binread("#{base}_uri")) if File.exist?("#{base}_uri")
      warn 'Returning....'
      return [head, body, all_uris]
    end
    warn 'Not Found in Cache'
    nil
  end

  # Caches the response head, body and list of visited URIs for +uri+/+headers+.
  def self.writeToCache(uri, headers, head, body, all_uris)
    filename = Digest::MD5.hexdigest uri + headers.to_s
    warn "in writeToCache Writing to cache for #{filename}"
    base = File.join(CACHE_DIR, filename)
    File.binwrite("#{base}_head", Marshal.dump(head))
    File.binwrite("#{base}_body", Marshal.dump(body))
    File.binwrite("#{base}_uri", Marshal.dump(all_uris))
  end

  # Remembers that fetching +uri+ with +headers+ failed.
  def self.writeErrorToCache(uri, headers)
    filename = Digest::MD5.hexdigest uri + headers.to_s
    warn "in writeErrorToCache Writing error to cache for #{filename}"
    File.binwrite(File.join(CACHE_DIR, "#{filename}_error"), 'ERROR')
  end
end
28
120
  end
@@ -0,0 +1,87 @@
1
# Collapse the cite-as links to one per distinct href and flag conflicts.
#
# Links are keyed by `href`; when several links share an href the last one
# wins. If more than one distinct href remains, warning '007' and explanatory
# comments are appended to @meta.
#
# @param citeas [#each] link objects responding to `href`
# @return [Array] the links that remain after de-duplication by href
def check_for_citeas_conflicts(citeas:)
  @meta.comments << 'INFO: checking for conflicting cite-as links'
  links_by_href = {}
  citeas.each do |candidate|
    warn "INFO: Adding citeas #{candidate.href} to the testing queue."
    @meta.comments << "INFO: Adding citeas #{candidate.href} to the testing queue."
    links_by_href[candidate.href] = candidate
  end

  unless links_by_href.size <= 1
    @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
    @meta.warnings << ['007', '', '']
    @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
  end
  links_by_href.values
end
17
+
18
+
19
# Check each describedby link against the FAIR Signposting expectations.
#
# For every link:
# * warning '005' if the link object has no `type` attribute;
# * a HEAD request is issued with `Accept` set to the link's type ('*/*' if none);
# * warning '008' if the request yields no response;
# * warning '010' if the response has no Content-Type header, or the link
#   declared no type to compare against;
# * warning '009' if the response Content-Type (parameters such as charset
#   removed) differs from the declared type.
#
# Media types are compared case-insensitively and with surrounding whitespace
# removed, since type/subtype names are case-insensitive.
#
# @param describedby [#each] link objects responding to `href` (and optionally `type`)
def check_describedby_rules(describedby:)
  describedby.each do |l|
    unless l.respond_to? 'type'
      @meta.warnings << ['005', l.href, '']
      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
    end
    type = l.type if l.respond_to? 'type'
    type ||= '*/*'
    header = { accept: type }
    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
    unless response
      @meta.warnings << ['008', l.href, header]
      @meta.comments << "WARN: describedby link doesn't resolve\n"
      next
    end

    responsetype = response.headers[:content_type]
    @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
    if responsetype =~ %r{^(.*/[^;]+)}
      # keep only the media type, dropping e.g. "; charset=utf-8"
      responsetype = Regexp.last_match(1).to_s.strip
    end
    @meta.comments << "INFO: testing content type |#{responsetype}| against |#{type}|\n"

    if responsetype.nil? || responsetype.to_s.empty? || type == '*/*'
      # Nothing to compare: either the server sent no Content-Type or the link
      # declared no type (same rule as the item-link check).
      @meta.warnings << ['010', l.href, header]
      @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
    elsif responsetype.to_s.casecmp?(type.to_s.strip)
      @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
    else
      @meta.warnings << ['009', l.href, header]
      @meta.comments << "WARN: Content type of returned describedby link #{responsetype} does not match the 'type' attribute #{type}\n"
    end
  end
end
53
+
54
# Check each item link against the FAIR Signposting expectations.
#
# For every link:
# * warning '011' if the link object has no `type` attribute;
# * a HEAD request is issued with `Accept` set to the link's type ('*/*' if none);
# * warning '014' if the request yields no response;
# * warning '013' if the response has no Content-Type header, or the link
#   declared no type to compare against;
# * warning '012' if the declared type does not occur in the response
#   Content-Type.
#
# @param item [#each] link objects responding to `href` (and optionally `type`)
def check_item_rules(item:)
  item.each do |l| # l = LinkHeaders::Link
    unless l.respond_to? 'type'
      @meta.warnings << ['011', l.href, '']
      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
    end
    type = l.type if l.respond_to? 'type'
    type ||= '*/*'
    header = { accept: type }
    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)

    if response
      content_type = response.headers[:content_type]
      if content_type and type != '*/*'
        # The declared type must be matched literally: characters such as '+'
        # and '.' (e.g. "application/ld+json") are regex metacharacters and
        # would otherwise prevent a type from matching itself. Media types are
        # case-insensitive, hence IGNORECASE.
        typeregex = Regexp.new(Regexp.escape(type.to_s), Regexp::IGNORECASE)
        if content_type.match(typeregex)
          warn content_type
          warn typeregex.inspect
          @meta.comments << "INFO: item link responds according to Signposting specifications\n"
        else
          @meta.warnings << ['012', l.href, header]
          @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
        end
      else
        @meta.warnings << ['013', l.href, header]
        @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
      end
    else
      @meta.warnings << ['014', l.href, header]
      @meta.comments << "WARN: item link doesn't resolve\n"
    end
  end
end
data/lib/warnings.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "001": {
3
- "message": "Unable to resolve guid using Accept headers for Linked Data",
3
+ "message": "Unable to resolve guid using default (*/*) Accept headers",
4
4
  "linkout": "",
5
5
  "severity": "WARN"
6
6
  },
@@ -68,7 +68,43 @@
68
68
  "message": "Item link does not resolve",
69
69
  "linkout": "",
70
70
  "severity": "WARN"
71
- }
71
+ },
72
+ "015": {
73
+ "message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
74
+ "linkout": "",
75
+ "severity": "WARN"
76
+ },
77
+ "016": {
78
+ "message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
79
+ "linkout": "",
80
+ "severity": "WARN"
81
+ },
82
+ "017": {
83
+ "message": "Metadata format not recognized.",
84
+ "linkout": "",
85
+ "severity": "WARN"
86
+ },
87
+ "018": {
88
+ "message": "RDF parsing error - likely malformed RDF document.",
89
+ "linkout": "",
90
+ "severity": "WARN"
91
+ },
92
+ "019": {
93
+ "message": "HTML parsing error - unable to extract linked data from HTML.",
94
+ "linkout": "",
95
+ "severity": "WARN"
96
+ },
97
+ "020": {
98
+ "message": "XML parsing error - unable to process XML document.",
99
+ "linkout": "",
100
+ "severity": "WARN"
101
+ },
102
+ "021": {
103
+ "message": "JSON parsing error - unable to process JSON document.",
104
+ "linkout": "",
105
+ "severity": "WARN"
106
+ }
107
+
72
108
 
73
109
 
74
110
  }
data/lib/web_utils.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module FspHarvester
2
2
 
3
3
  class WebUtils
4
- def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
4
+ def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
5
5
  warn 'In fetch routine now. '
6
6
 
7
7
  begin
@@ -13,19 +13,19 @@ module FspHarvester
13
13
  # password: pass,
14
14
  headers: headers
15
15
  })
16
- @meta.finalURI |= [response.request.url] if @meta # it's possible to call this method without affecting the metadata object being created by the harvester
17
- warn "There was a response to the call #{url}"
18
- warn "There was a response to the call #{response.request.url}"
16
+ meta.all_uris |= [response.request.url] # it's possible to call this method without affecting the metadata object being created by the harvester
17
+ warn "starting URL #{url}"
18
+ warn "final URL #{response.request.url}"
19
19
  warn "Response code #{response.code}"
20
- if response.code == 203 && @meta
21
- @meta.warnings << ["002", url, headers]
22
- @meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
20
+ if response.code == 203
21
+ meta.warnings << ["002", url, headers]
22
+ meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
23
23
  end
24
24
  response
25
25
  rescue RestClient::ExceptionWithResponse => e
26
26
  warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
27
- @meta.warnings << ["003", url, headers] if @meta
28
- @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
27
+ meta.warnings << ["003", url, headers]
28
+ meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
29
29
  if (e.response.code == 500 or e.response.code == 404)
30
30
  return false
31
31
  else
@@ -34,14 +34,14 @@ module FspHarvester
34
34
  # now we are returning the headers and body that were returned
35
35
  rescue RestClient::Exception => e
36
36
  warn "EXCEPTION WITH NO RESPONSE! #{e}"
37
- @meta.warnings << ["003", url, headers] if @meta
38
- @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
37
+ meta.warnings << ["003", url, headers]
38
+ meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
39
39
  false
40
40
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
41
41
  rescue Exception => e
42
42
  warn "EXCEPTION UNKNOWN! #{e}"
43
- @meta.warnings << ["003", url, headers] if @meta
44
- @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
43
+ meta.warnings << ["003", url, headers]
44
+ meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
45
45
  false
46
46
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
47
47
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-08-02 00:00:00.000000000 Z
11
+ date: 2022-08-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.1.13
47
+ version: 0.1.16
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.1.13
54
+ version: 0.1.16
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: metainspector
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -171,6 +171,7 @@ executables: []
171
171
  extensions: []
172
172
  extra_rdoc_files: []
173
173
  files:
174
+ - ".rspec_status"
174
175
  - CHANGELOG.md
175
176
  - Gemfile
176
177
  - Gemfile.lock
@@ -180,10 +181,17 @@ files:
180
181
  - bin/console
181
182
  - bin/setup
182
183
  - example_test.rb
184
+ - launch.json
185
+ - lib/config.conf_docker
186
+ - lib/config.conf_local
183
187
  - lib/constants.rb
184
188
  - lib/fsp_harvester.rb
185
189
  - lib/fsp_harvester/version.rb
190
+ - lib/fsp_metadata_external_tools.rb
191
+ - lib/fsp_metadata_harvester.rb
192
+ - lib/fsp_metadata_parser.rb
186
193
  - lib/metadata_object.rb
194
+ - lib/signposting_tests.rb
187
195
  - lib/swagger.rb
188
196
  - lib/warnings.json
189
197
  - lib/web_utils.rb