fsp_harvester 0.1.7 → 0.1.11
- checksums.yaml +4 -4
- data/.rspec_status +55 -0
- data/Gemfile.lock +9 -8
- data/launch.json +11 -0
- data/lib/config.conf_docker +8 -0
- data/lib/config.conf_local +8 -0
- data/lib/constants.rb +12 -13
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +33 -11
- data/lib/fsp_metadata_external_tools.rb +82 -0
- data/lib/fsp_metadata_harvester.rb +164 -0
- data/lib/fsp_metadata_parser.rb +109 -0
- data/lib/metadata_object.rb +109 -4
- data/lib/signposting_tests.rb +87 -0
- data/lib/warnings.json +36 -3
- data/lib/web_utils.rb +13 -13
- metadata +12 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 895567e9edd571dbca7dee89a0270d1c14342fed06c3eb81c81e06f3c07ddbed
+  data.tar.gz: 7eee65295c206d6cee7b4ef28830f64087ba172a294cde7401490bffa20dbe1a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0c7727598525cb55b6c2bfaf36d5ce3dda5da6efddf85888328b7c93b874c508989122627e5deaa5101fc0a20279432aa023ecefef112926219f267e3622234
+  data.tar.gz: 29f834c57ec73e27f988948893dc92fe56550b829585df390a9a1398770845115202289f6f9557c01eb2fc3eec218f863371db60649f6a3fef01da9457c2862e
data/.rspec_status
ADDED
@@ -0,0 +1,55 @@
+example_id | status | run_time |
+---------------------------------- | ------ | --------------- |
+./spec/cite-as_spec.rb[1:1:1] | passed | 1.61 seconds |
+./spec/cite-as_spec.rb[1:1:2] | passed | 1.18 seconds |
+./spec/cite-as_spec.rb[1:1:3] | passed | 1.02 seconds |
+./spec/cite-as_spec.rb[1:1:4] | passed | 1.6 seconds |
+./spec/cite-as_spec.rb[1:1:5] | passed | 2.78 seconds |
+./spec/cite-as_spec.rb[1:1:6] | passed | 2.09 seconds |
+./spec/cite-as_spec.rb[1:1:7] | passed | 2.98 seconds |
+./spec/cite-as_spec.rb[1:1:8] | passed | 2.2 seconds |
+./spec/cite-as_spec.rb[1:1:9] | passed | 2.87 seconds |
+./spec/cite-as_spec.rb[1:1:10] | passed | 2.18 seconds |
+./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
+./spec/cite-as_spec.rb[1:1:12] | passed | 2.36 seconds |
+./spec/cite-as_spec.rb[1:1:13] | passed | 2.89 seconds |
+./spec/cite-as_spec.rb[1:1:14] | passed | 2.13 seconds |
+./spec/cite-as_spec.rb[1:1:15] | passed | 1.18 seconds |
+./spec/cite-as_spec.rb[1:1:16] | passed | 1.3 seconds |
+./spec/cite-as_spec.rb[1:1:17] | passed | 1.17 seconds |
+./spec/cite-as_spec.rb[1:1:18] | passed | 1.2 seconds |
+./spec/cite-as_spec.rb[1:1:19] | passed | 1.71 seconds |
+./spec/cite-as_spec.rb[1:1:20] | passed | 1.69 seconds |
+./spec/cite-as_spec.rb[1:1:21] | passed | 2.22 seconds |
+./spec/cite-as_spec.rb[1:1:22] | passed | 1.09 seconds |
+./spec/cite-as_spec.rb[1:1:23] | passed | 1.17 seconds |
+./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
+./spec/cite-as_spec.rb[1:1:25] | passed | 0.48048 seconds |
+./spec/describedby_spec.rb[1:1:1] | passed | 2.12 seconds |
+./spec/describedby_spec.rb[1:1:2] | passed | 0.96254 seconds |
+./spec/describedby_spec.rb[1:1:3] | passed | 0.92669 seconds |
+./spec/describedby_spec.rb[1:1:4] | passed | 0.92801 seconds |
+./spec/describedby_spec.rb[1:1:5] | passed | 1 second |
+./spec/describedby_spec.rb[1:1:6] | passed | 0.66763 seconds |
+./spec/describedby_spec.rb[1:1:7] | passed | 0.66021 seconds |
+./spec/describedby_spec.rb[1:1:8] | passed | 1.89 seconds |
+./spec/describedby_spec.rb[1:1:9] | passed | 1.3 seconds |
+./spec/describedby_spec.rb[1:1:10] | passed | 1.7 seconds |
+./spec/describedby_spec.rb[1:1:11] | passed | 2.28 seconds |
+./spec/describedby_spec.rb[1:1:12] | passed | 2.27 seconds |
+./spec/describedby_spec.rb[1:1:13] | passed | 1.39 seconds |
+./spec/describedby_spec.rb[1:1:14] | passed | 1.65 seconds |
+./spec/describedby_spec.rb[1:1:15] | passed | 1.7 seconds |
+./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00215 seconds |
+./spec/fsp_harvester_spec.rb[1:2] | failed | 0.00021 seconds |
+./spec/item_spec.rb[1:1:1] | passed | 2.04 seconds |
+./spec/item_spec.rb[1:1:2] | passed | 2 seconds |
+./spec/item_spec.rb[1:1:3] | passed | 0.92924 seconds |
+./spec/item_spec.rb[1:1:4] | passed | 1.36 seconds |
+./spec/item_spec.rb[1:1:5] | passed | 1.71 seconds |
+./spec/item_spec.rb[1:1:6] | passed | 1.68 seconds |
+./spec/item_spec.rb[1:1:7] | passed | 2.37 seconds |
+./spec/item_spec.rb[1:1:8] | passed | 0.34241 seconds |
+./spec/type_spec.rb[1:1:1] | passed | 0.9855 seconds |
+./spec/type_spec.rb[1:1:2] | passed | 0.96202 seconds |
+./spec/type_spec.rb[1:1:3] | passed | 0.96005 seconds |
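The two failed examples above can be re-run in isolation. RSpec only writes a .rspec_status file when status persistence is turned on, so the gem's spec_helper.rb (not shown in this diff) presumably carries a configuration along the lines of the sketch below, which is what makes `rspec --only-failures` work:

    # spec/spec_helper.rb (assumed configuration; not part of this diff)
    RSpec.configure do |config|
      # persist pass/fail status so `rspec --only-failures` re-runs just the red examples
      config.example_status_persistence_file_path = '.rspec_status'
    end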
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    fsp_harvester (0.1.
+    fsp_harvester (0.1.11)
       json (~> 2.0)
       linkeddata (~> 3.2)
-      linkheaders-processor (~> 0.1.
+      linkheaders-processor (~> 0.1.16)
       metainspector (~> 5.11.2)
       parseconfig (~> 1.1)
       rake (~> 13.0)
@@ -36,7 +36,7 @@ GEM
       scanf (~> 1.0)
       sxp (~> 1.2)
       unicode-types (~> 1.7)
-    faraday (1.10.
+    faraday (1.10.1)
       faraday-em_http (~> 1.0)
       faraday-em_synchrony (~> 1.0)
       faraday-excon (~> 1.1)
@@ -82,13 +82,13 @@ GEM
       concurrent-ruby (~> 1.0)
     json (2.6.2)
     json-canonicalization (0.3.0)
-    json-ld (3.2.
+    json-ld (3.2.3)
      htmlentities (~> 4.3)
      json-canonicalization (~> 0.3)
      link_header (~> 0.0, >= 0.0.8)
      multi_json (~> 1.15)
      rack (~> 2.2)
-      rdf (~> 3.2)
+      rdf (~> 3.2, >= 3.2.9)
     json-ld-preloaded (3.2.0)
       json-ld (~> 3.2)
       rdf (~> 3.2)
@@ -126,10 +126,11 @@ GEM
       shex (~> 0.7)
       sparql (~> 3.2)
       sparql-client (~> 3.2)
-    linkheaders-processor (0.1.
+    linkheaders-processor (0.1.16)
       json (~> 2.0)
       json-ld (~> 3.2)
       json-ld-preloaded (~> 3.2)
+      link_header (~> 0.0.8)
       metainspector (~> 5.11.2)
       rest-client (~> 2.1)
       securerandom (~> 0.1.0)
@@ -165,7 +166,7 @@ GEM
     rack (2.2.4)
     rainbow (3.1.1)
     rake (13.0.6)
-    rdf (3.2.
+    rdf (3.2.9)
       link_header (~> 0.0, >= 0.0.8)
     rdf-aggregate-repo (3.2.1)
       rdf (~> 3.2)
@@ -248,7 +249,7 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.11.0)
     rspec-support (3.11.0)
-    rubocop (1.
+    rubocop (1.33.0)
       json (~> 2.3)
       parallel (~> 1.10)
       parser (>= 3.1.0.0)
data/launch.json
ADDED
data/lib/constants.rb
CHANGED
@@ -1,17 +1,20 @@
 ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
 
+ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
+
 TEXT_FORMATS = {
   'text' => ['text/plain']
 }
 
 RDF_FORMATS = {
-  'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
+  'jsonld' => ['application/ld+json','application/x-ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
   'turtle' => ['text/turtle', 'application/n3', 'application/rdf+n3',
                'application/turtle', 'application/x-turtle', 'text/n3', 'text/turtle',
                'text/rdf+n3', 'text/rdf+turtle'],
   # 'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
   'rdfxml' => ['application/rdf+xml'],
-  '
+  'ntriples' => ['application/n-triples', 'application/trig'],
+  'nquads' => ['application/n-quads']
 }
 
 XML_FORMATS = {
@@ -73,12 +76,10 @@ GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
                'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
 
 CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
-
-
-end
-extruct = 'extruct' unless @extruct_command
+extruct = CONFIG.dig(:extruct, :command)
+extruct ||= 'extruct'
 extruct.strip!
-case
+case extruct
 when /[&|;`$\s]/
   abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
 when /echo/i
@@ -86,8 +87,8 @@ when /echo/i
 end
 EXTRUCT_COMMAND = extruct
 
-rdf_command = CONFIG
-rdf_command
+rdf_command = CONFIG.dig(:rdf, :command)
+rdf_command ||= 'rdf'
 rdf_command.strip
 case rdf_command
 when /[&|;`$\s]/
@@ -99,8 +100,6 @@ when !(/rdf$/ =~ $_)
 end
 RDF_COMMAND = rdf_command
 
-
-
-end
-tika_command = 'http://localhost:9998/meta' unless @tika_command
+tika_command = CONFIG.dig(:tika, :command)
+tika_command ||= 'http://localhost:9998/meta'
 TIKA_COMMAND = tika_command
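The new ntriples and nquads entries matter because RDF_FORMATS doubles as a reverse lookup table from MIME type to format abbreviation (see abbreviate_type in the new fsp_metadata_harvester.rb further down). A minimal sketch of that lookup, assuming constants.rb is loaded:

    # find the abbreviation whose MIME list contains the detected content type
    abbreviation, _mimes = RDF_FORMATS.find { |_abbrev, mimes| mimes.include?('application/n-quads') }
    abbreviation  # => "nquads"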
data/lib/fsp_harvester.rb
CHANGED
@@ -20,6 +20,9 @@ require_relative './metadata_object'
 require_relative './constants'
 require_relative './web_utils'
 require_relative './signposting_tests'
+require_relative './fsp_metadata_harvester'
+require_relative './fsp_metadata_parser'
+
 
 module FspHarvester
   class Error < StandardError
@@ -32,18 +35,29 @@ module FspHarvester
 
   def self.resolve_guid(guid:)
     @meta = FspHarvester::MetadataObject.new
-    @meta.
+    @meta.all_uris = [guid]
     type, url = convertToURL(guid: guid)
     links = Array.new
     if type
       links = resolve_url(url: url)
+      @meta.links << links
     else
-      @meta.
+      @meta.add_warning(['006', guid, ''])
       @meta.comments << "FATAL: GUID type not recognized.\n"
     end
     [links, @meta]
   end
 
+  def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
+    @meta = metadata
+    db = []
+    links.each do |l|
+      db << l if l.relation == 'describedby'
+    end
+    FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
+    @meta
+  end
+
   def self.convertToURL(guid:)
     GUID_TYPES.each do |k, regex|
       if k == 'inchi' and regex.match(guid)
@@ -68,19 +82,19 @@ module FspHarvester
     false
   end
 
-  def self.resolve_url(url:, method: :get, nolinkheaders: false, header:
+  def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
     @meta.guidtype = 'uri' if @meta.guidtype.nil?
     warn "\n\n FETCHING #{url} #{header}\n\n"
-    response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
+    response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
     warn "\n\n head #{response.headers.inspect}\n\n" if response
 
     unless response
-      @meta.
+      @meta.add_warning(['001', url, header])
       @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
       return []
     end
 
-    @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.
+    @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
     @meta.full_response << response.body
 
     links = process_link_headers(response: response) unless nolinkheaders
@@ -90,7 +104,7 @@ module FspHarvester
   def self.process_link_headers(response:)
     warn "\n\n parsing #{response.headers}\n\n"
 
-    parser = LinkHeaders::Processor.new(default_anchor: @meta.
+    parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
     parser.extract_and_parse(response: response)
     factory = parser.factory # LinkHeaders::LinkFactory
 
@@ -105,6 +119,8 @@ module FspHarvester
     citeas = Array.new
     describedby = Array.new
     item = Array.new
+    types = Array.new
+
     factory.all_links.each do |l|
       case l.relation
       when 'cite-as'
@@ -113,23 +129,29 @@ module FspHarvester
         item << l
       when 'describedby'
         describedby << l
+      when 'type'
+        types << l
       end
     end
 
     check_describedby_rules(describedby: describedby)
     check_item_rules(item: item)
 
-    uniqueciteas = Array.new
     if citeas.length > 1
       warn "INFO: multiple cite-as links found. Checking for conflicts\n"
       @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
-
+      citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
     end
 
-    unless
-      @meta.
+    unless citeas.length == 1 && describedby.length > 0
+      @meta.add_warning(['004', '', ''])
       @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
     end
+
+    unless types.length >=1
+      @meta.add_warning(['015', '', ''])
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
+    end
   end
 end
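Taken together, resolve_guid and the new gather_metadata_from_describedby_links form the public entry point of this release. A usage sketch, assuming the gem is installed and that the GUID below (which is hypothetical) resolves and carries Signposting link headers:

    require 'fsp_harvester'

    links, meta = FspHarvester.resolve_guid(guid: 'https://example.org/dataset/1')  # hypothetical GUID
    meta = FspHarvester.gather_metadata_from_describedby_links(links: links, metadata: meta)
    puts meta.comments          # INFO/WARN trail of the checks
    puts meta.warnings.inspect  # structured warnings keyed by the IDs in warnings.json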
data/lib/fsp_metadata_external_tools.rb
ADDED
@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+
+module FspHarvester
+  class Error < StandardError
+  end
+
+  class ExternalTools
+
+    def initialize(metadata: FspHarvester::MetadataObject.new)
+      @meta = metadata
+    end
+
+    def process_with_distiller(body:)
+      bhash = Digest::SHA256.hexdigest(body)
+      if @@distillerknown[bhash]
+        @meta.comments << "INFO: data is already parsed by distiller.\n"
+        #parse_rdf(body: body)
+      else
+        @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
+        file = Tempfile.new('foo', encoding: 'UTF-8')
+        body = body.force_encoding('UTF-8')
+        body.scrub!
+        body = body.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"') # a bug in distiller, apparently
+        file.write(body)
+        file.rewind
+
+        @meta.comments << "INFO: The message body is being examined by Distiller\n"
+        # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
+        command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
+        # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
+        warn "distiller command: #{command}"
+        result, _stderr, _status = Open3.capture3(command)
+        warn ''
+        warn "distiller errors: #{stderr}"
+        file.close
+        file.unlink
+
+        result = result.force_encoding('UTF-8')
+        warn "DIST RESULT: #{result}"
+        if result !~ /@context/i # failure returns nil
+          @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
+          @meta.add_warning(['018', '', ''])
+        else
+          @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
+          parse_rdf(result: result, content_type: "application/ld+json")
+        end
+        @@distillerknown[bhash] = true
+      end
+    end
+
+    def processs_with_extruct(uri:)
+      @meta.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
+      warn 'begin open3'
+      stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
+      warn "open3 status: #{status} #{stdout}"
+      result = stderr # absurd that the output comes over stderr! LOL!
+
+      if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
+        @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
+        @meta.add_warning(['019', '', ''])
+        if result.to_s.match(/(ValueError:.*?)\n/)
+          @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
+          @meta.add_warning(['019', '', ''])
+        end
+      elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
+        json = JSON.parse result
+        @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
+
+        parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
+        @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
+        @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
+        @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
+        parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
+
+        @meta.merge_hash(json.first) if json.first.is_a? Hash
+      else
+        @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
+      end
+    end
+  end
+end
data/lib/fsp_metadata_harvester.rb
ADDED
@@ -0,0 +1,164 @@
+# frozen_string_literal: true
+
+module FspHarvester
+  class Error < StandardError
+  end
+
+  class MetadataHarvester
+    def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
+      @meta = metadata
+      @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'
+
+      describedby = links.select { |l| l if l.relation == 'describedby' }
+
+      hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
+      describedby.each do |link|
+        accepttype = ACCEPT_STAR_HEADER
+        accept = link.respond_to?('type') ? link.type : nil
+        accepttype = { 'Accept' => accept } if accept
+
+        response = attempt_to_resolve(link: link, headers: accepttype)
+
+        abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
+        unless abbreviation
+          @meta.add_warning(['017', url, header])
+          @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
+          next
+        end
+
+        # process according to detected type
+        case abbreviation
+        when 'html'
+          @meta.comments << 'INFO: Processing html'
+          hvst.process_html(body: response.body, uri: link)
+        when 'xml'
+          @meta.comments << 'INFO: Processing xml'
+          hvst.process_xml(body: response.body)
+        when 'json'
+          @meta.comments << 'INFO: Processing json'
+          hvst.process_json(body: response.body)
+        when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
+          @meta.comments << 'INFO: Processing linked data'
+          hvst.process_ld(body: response.body, content_type: content_type)
+        when 'specialist'
+          warn 'no specialized parsers so far'
+        end
+      end
+    end
+
+    def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
+      @meta.comments << "INFO: link #{link.href} being processed"
+      if link.respond_to? 'type'
+        header = { 'Accept' => link.type }
+      else
+        @meta.comments << "INFO: link #{link.href} has no MIME type, defaulting to */*"
+      end
+      url = link.href
+      response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
+      unless response
+        @meta.add_warning(['016', url, header])
+        @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
+      end
+      response
+    end
+
+    def self.attempt_to_detect_type(body:, headers:)
+      # described by should be an html, xml, json, or linked data document
+      abbreviation = nil
+      content_type = nil
+      @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
+      if body =~ /^\s*<\?xml/
+        if body =~ /<HTML/i
+          abbreviation = 'html'
+          content_type = 'text/html'
+          @meta.comments << 'INFO: appears to be HTML\n'
+        elsif body =~ /<rdf:RDF/i
+          abbreviation = 'rdfxml'
+          content_type = 'application/rdf+xml'
+          @meta.comments << 'INFO: appears to be RDF-XML\n'
+        else
+          abbreviation = 'xml'
+          content_type = 'application/xml'
+          @meta.comments << 'INFO: appears to be XML\n'
+        end
+      else
+        abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
+        abbreviation, content_type = check_json(body: body) unless abbreviation
+      end
+
+      unless content_type
+        @meta.add_warning(['017', url, header])
+        @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
+      end
+      [abbreviation, content_type]
+    end
+
+    def self.check_ld(body:, claimed_type:)
+      detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
+      unless detected_type
+        detected_type = RDF::Format.for({ sample: body[0..5000] })
+        @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
+      end
+      contenttype = ''
+      abbreviation = ''
+      if detected_type
+        contenttype = detected_type.content_type.first # comes back as array
+        abbreviation = abbreviate_type(contenttype: contenttype)
+        @meta.comments << "INFO: using content-type #{contenttype}.\n"
+      else
+        @meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
+      end
+      [abbreviation, contenttype]
+    end
+
+    def self.ntriples_hack(body:) # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
+      detected_type = nil
+      body.split.each do |line|
+        line.strip!
+        next if line.empty?
+        if line =~ %r{\s*<[^>]+>\s*<[^>]+>\s\S+}
+          @meta.comments << "INFO: running ntriples hack on #{line + " ."}\n"
+          detected_type = RDF::Format.for({ sample: "#{line} ." }) # adding a period allows detection of ntriples by distiller
+          break
+        end
+      end
+      @meta.comments << "INFO: ntriples hack found: #{detected_type.to_s}\n"
+      if detected_type != RDF::NTriples::Format # only return the hacky case
+        return nil
+      end
+      return detected_type
+    end
+
+
+    def self.check_json(body:)
+      abbreviation = nil
+      parsed = nil
+      begin
+        parsed = JSON.parse(body)
+      rescue StandardError
+        abbreviation = nil
+      end
+
+      if parsed
+        abbreviation = 'json'
+      else
+        @meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
+      end
+      [abbreviation, 'application/ld+json']
+    end
+
+    def self.abbreviate_type(contenttype:)
+      foundtype = nil
+      RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
+        warn "\n\ntype #{type}\nvals #{vals}\n\n"
+        @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
+        next unless vals.include? contenttype
+
+        foundtype = type
+        @meta.comments << "INFO: detected a #{type} MIME type"
+        break
+      end
+      foundtype
+    end
+  end
+end
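extract_metadata only acts on links whose relation is 'describedby', and it negotiates content with the MIME type declared on the link. A sketch with a stand-in link object (the URL is hypothetical; any object responding to href, relation and type should do, and the link must actually resolve for the detection step to run):

    FakeLink = Struct.new(:href, :relation, :type)
    link = FakeLink.new('https://example.org/dataset/1.jsonld', 'describedby', 'application/ld+json')
    meta = FspHarvester::MetadataObject.new
    FspHarvester::MetadataHarvester.extract_metadata(links: [link], metadata: meta)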
data/lib/fsp_metadata_parser.rb
ADDED
@@ -0,0 +1,109 @@
+# frozen_string_literal: true
+
+module FspHarvester
+  class Error < StandardError
+  end
+
+  class MetadataParser
+    # attr_accessor :distillerknown
+
+    @@distillerknown = {}
+
+    def initialize(metadata_object: FspHarvester::MetadataObject.new)
+      @meta = metadata_object
+    end
+
+    def process_html(body:, uri:)
+      tools = FspHarvester::ExternalTools.new(metadata: @meta)
+      tools.process_with_distiller(body: body)
+      tools.process_with_extruct(uri: uri)
+    end
+
+    def process_xml(body:)
+      begin
+        hash = XmlSimple.xml_in(body)
+      rescue
+        @meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
+        @meta.add_warning(['020', '', ''])
+      end
+      @meta.comments << "INFO: The XML is being merged in the metadata object\n"
+      @meta.hash.merge hash
+    end
+
+    def process_json(body:)
+      begin
+        hash = JSON.parse(body)
+      rescue
+        @meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
+        @meta.add_warning(['021', '', ''])
+      end
+      @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
+      @meta.hash.merge hash
+    end
+
+    def process_ld(body:, content_type:)
+      parse_rdf(body: body, content_type: content_type)
+    end
+
+    def parse_rdf(body:, content_type:)
+      unless body
+        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+        @meta.add_warning(['018', '', ''])
+        return
+      end
+
+      unless body.match(/\w/)
+        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
+        @meta.add_warning(['018', '', ''])
+        return
+      end
+
+      rdfformat = RDF::Format.for(content_type: content_type)
+      unless rdfformat
+        @meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
+        @meta.add_warning(['018', '', ''])
+        return
+      end
+
+      graph = FspHarvester::Cache.checkRDFCache(body: body)
+      if graph.size > 0
+        warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
+        @meta.merge_rdf(graph.to_a)
+      else
+        warn "\n\n\nfound format #{rdfformat}\n\n"
+        @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
+        reader = ''
+        begin
+          reader = rdfformat.reader.new(body)
+        rescue Exception => e
+          @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
+          @meta.add_warning(['018', '', ''])
+          return
+        end
+
+        begin
+          if reader.size == 0
+            @meta.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
+            return
+          end
+          reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
+          warn 'WRITING TO CACHE'
+          FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
+          warn 'WRITING DONE'
+          reader = rdfformat.reader.new(body) # frustrating that we cannot rewind!
+          warn 'RE-READING DONE'
+          @meta.merge_rdf(reader.to_a)
+          warn 'MERGE DONE'
+        rescue RDF::ReaderError => e
+          @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
+          warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
+          @meta.add_warning(['018', '', ''])
+        rescue Exception => e
+          meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
+          warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body}). Moving on...\n"
+          @meta.add_warning(['018', '', ''])
+        end
+      end
+    end
+  end
+end
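MetadataParser is normally driven by MetadataHarvester, but its per-format methods can also be exercised directly. A small sketch (note that MetadataObject.new fetches warnings.json from GitHub, so this needs network access):

    meta = FspHarvester::MetadataObject.new
    parser = FspHarvester::MetadataParser.new(metadata_object: meta)
    parser.process_json(body: '{"name": "example dataset"}')  # well-formed JSON is parsed and logged as INFO
    puts meta.comments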
data/lib/metadata_object.rb
CHANGED
@@ -1,6 +1,6 @@
 module FspHarvester
   class MetadataObject
-    attr_accessor :hash, :graph, :comments, :warnings, :guidtype, :full_response, :
+    attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
 
     def initialize(_params = {}) # get a name from the "new" call, or set a default
       @hash = {}
@@ -8,15 +8,19 @@ module FspHarvester
       @comments = []
       @warnings = []
       @full_response = []
-      @
+      @links = []
+      @all_uris = []
+      w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
+      #@warn = File.read("./lib/warnings.json")
+      @warn = JSON.parse(w)
     end
 
     def merge_hash(hash)
-      #
+      # warn "\n\n\nIncoming Hash #{hash.inspect}"
       self.hash = self.hash.merge(hash)
     end
 
-    def merge_rdf(triples)
+    def merge_rdf(triples) # incoming list of triples
       graph << triples
       graph
     end
@@ -24,5 +28,106 @@ module FspHarvester
     def rdf
       graph
     end
+
+    def add_warning(warning)
+      id = warning[0]
+      url = warning[1]
+      headers = warning[2]
+      message = @warn[id]['message']
+      linkout = @warn[id]['linkout']
+      severity = @warn[id]['severity']
+      self.warnings << {"id" => id, "message" => message, "severity" => severity, "linkout" => linkout, "processed_url" => url, "accept_headers": headers}
+    end
+  end
+
+  class Cache
+    def self.retrieveMetaObject(uri)
+      filename = (Digest::MD5.hexdigest uri) + '_meta'
+      warn "Checking Meta cache for #{filename}"
+      if File.exist?("/tmp/#{filename}")
+        warn 'FOUND Meta object in cache'
+        meta = Marshal.load(File.read("/tmp/#{filename}"))
+        warn 'Returning....'
+        return meta
+      end
+      warn 'Meta objectNot Found in Cache'
+      false
+    end
+
+    def self.cacheMetaObject(meta, uri)
+      filename = (Digest::MD5.hexdigest uri) + '_meta'
+      warn "in cacheMetaObject Writing to cache for #{filename}"
+      File.open("/tmp/#{filename}", 'wb') { |f| f.write(Marshal.dump(meta)) }
+    end
+
+    def self.checkRDFCache(body: )
+      fs = File.join('/tmp/', '*_graphbody')
+      bodies = Dir.glob(fs)
+      g = RDF::Graph.new
+      bodies.each do |bodyfile|
+        next unless File.size(bodyfile) == body.bytesize # compare body size
+        next unless bodyfile.match(/(.*)_graphbody$/) # continue if there's no match
+
+        filename = Regexp.last_match(1)
+        warn "Regexp match for #{filename} FOUND"
+        next unless File.exist?("#{filename}_graph") # @ get the associated graph file
+
+        warn "RDF Cache File #{filename} FOUND"
+        graph = Marshal.load(File.read("#{filename}_graph")) # unmarshal it
+        graph.each do |statement|
+          g << statement # need to do this because the unmarshalled object isn't entirely functional as an RDF::Graph object
+        end
+        warn "returning a graph of #{g.size}"
+        break
+      end
+      # return an empty graph otherwise
+      g
+    end
+
+    def self.writeRDFCache(reader:, body:)
+      filename = Digest::MD5.hexdigest body
+      graph = RDF::Graph.new
+      reader.each_statement { |s| graph << s }
+      warn "WRITING RDF TO CACHE #{filename}"
+      File.open("/tmp/#{filename}_graph", 'wb') { |f| f.write(Marshal.dump(graph)) }
+      File.open("/tmp/#{filename}_graphbody", 'wb') { |f| f.write(body) }
+      warn "wrote RDF filename: #{filename}"
+    end
+
+    def self.checkCache(uri, headers)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "Checking Error cache for #{filename}"
+      if File.exist?("/tmp/#{filename}_error")
+        warn 'Error file found in cache... returning'
+        return ['ERROR', nil, nil]
+      end
+      if File.exist?("/tmp/#{filename}_head") and File.exist?("/tmp/#{filename}_body")
+        warn 'FOUND data in cache'
+        head = Marshal.load(File.read("/tmp/#{filename}_head"))
+        body = Marshal.load(File.read("/tmp/#{filename}_body"))
+        all_uris = ''
+        all_uris = Marshal.load(File.read("/tmp/#{filename}_uri")) if File.exist?("/tmp/#{filename}_uri")
+        warn 'Returning....'
+        return [head, body, all_uris]
+      end
+      warn 'Not Found in Cache'
+    end
+
+    def self.writeToCache(uri, headers, head, body, all_uris)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "in writeToCache Writing to cache for #{filename}"
+      headfilename = filename + '_head'
+      bodyfilename = filename + '_body'
+      urifilename = filename + '_uri'
+      File.open("/tmp/#{headfilename}", 'wb') { |f| f.write(Marshal.dump(head)) }
+      File.open("/tmp/#{bodyfilename}", 'wb') { |f| f.write(Marshal.dump(body)) }
+      File.open("/tmp/#{urifilename}", 'wb') { |f| f.write(Marshal.dump(all_uris)) }
+    end
+
+    def self.writeErrorToCache(uri, headers)
+      filename = Digest::MD5.hexdigest uri + headers.to_s
+      warn "in writeErrorToCache Writing error to cache for #{filename}"
+      File.open("/tmp/#{filename}_error", 'wb') { |f| f.write('ERROR') }
+    end
   end
 end
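The new add_warning helper looks the ID up in warnings.json and appends a structured record to warnings, so callers only supply the ID plus the URL and headers in play. A sketch (the URL is hypothetical):

    meta = FspHarvester::MetadataObject.new   # the constructor now loads warnings.json from GitHub
    meta.add_warning(['001', 'https://example.org/record/1', { 'Accept' => '*/*' }])
    puts meta.warnings.first['message']       # => "Unable to resolve guid using default (*/*) Accept headers"
    puts meta.warnings.first['severity']      # => "WARN"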
data/lib/signposting_tests.rb
ADDED
@@ -0,0 +1,87 @@
+def check_for_citeas_conflicts(citeas: )
+  @meta.comments << 'INFO: checking for conflicting cite-as links'
+  citeas_hrefs = Hash.new
+  citeas.each do |link|
+    warn "INFO: Adding citeas #{link.href} to the testing queue."
+    @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
+    citeas_hrefs[link.href] = link
+  end
+
+  if citeas_hrefs.length > 1
+    @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
+    @meta.add_warning(['007', '', ''])
+    @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
+  end
+  citeas_hrefs.values # return list of unique links
+end
+
+
+def check_describedby_rules(describedby:)
+  describedby.each do |l|
+    unless l.respond_to? 'type'
+      @meta.add_warning(['005', l.href, ''])
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
+    end
+    type = l.type if l.respond_to? 'type'
+    type ||= '*/*'
+    header = { accept: type }
+    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+    if response
+      responsetype = response.headers[:content_type]
+      @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
+      if responsetype =~ %r{^(.*/[^;]+)}
+        responsetype = Regexp.last_match(1).to_s # remove the e.g. charset information
+      end
+      @meta.comments << "INFO: testing content type |#{responsetype}| against |#{type}|\n"
+      if type != '*/*'
+        if responsetype == type
+          @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
+        else
+          @meta.add_warning(['009', l.href, header])
+          @meta.comments << "WARN: Content type of returned describedby link #{responsetype}does not match the 'type' attribute #{type}\n"
+        end
+      else
+        @meta.add_warning(['010', l.href, header])
+        @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
+      end
+    else
+      @meta.add_warning(['008', l.href, header])
+      @meta.comments << "WARN: describedby link doesn't resolve\n"
+    end
+  end
+end
+
+def check_item_rules(item:)
+  item.each do |l| # l = LinkHeaders::Link
+    unless l.respond_to? 'type'
+      @meta.add_warning(['011', l.href, ''])
+      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
+    end
+    type = l.type if l.respond_to? 'type'
+    type ||= '*/*' # this becomes a frozen string
+    header = { accept: type }
+    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
+
+    if response
+      if response.headers[:content_type] and type != '*/*'
+        rtype = type.gsub(%r{/}, "\/") # because type is a frozen string
+        rtype = rtype.gsub(/\+/, '.')
+        typeregex = Regexp.new(type)
+        if response.headers[:content_type].match(typeregex)
+          warn response.headers[:content_type]
+          warn typeregex.inspect
+          @meta.comments << "INFO: item link responds according to Signposting specifications\n"
+        else
+          @meta.add_warning(['012', l.href, header])
+          @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
+        end
+      else
+        @meta.add_warning(['013', l.href, header])
+        @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
+      end
+    else
+      @meta.add_warning(['014', l.href, header])
+      @meta.comments << "WARN: item link doesn't resolve\n"
+    end
+  end
+end
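For orientation, these checks target HTTP responses whose Link headers follow the FAIR Signposting pattern of one cite-as link, describedby links carrying a type attribute, and a type link. An illustrative (hypothetical) header of that shape:

    Link: <https://doi.org/10.1234/example>; rel="cite-as",
          <https://example.org/record/1.jsonld>; rel="describedby"; type="application/ld+json",
          <https://schema.org/Dataset>; rel="type"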
data/lib/warnings.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "001": {
-    "message": "Unable to resolve guid using Accept headers
+    "message": "Unable to resolve guid using default (*/*) Accept headers",
     "linkout": "",
     "severity": "WARN"
   },
@@ -68,7 +68,40 @@
     "message": "Item link does not resolve",
     "linkout": "",
     "severity": "WARN"
+  },
+  "015": {
+    "message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "016": {
+    "message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "017": {
+    "message": "Metadata format not recognized.",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "018": {
+    "message": "RDF parsing error - likely malformed RDF document.",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "019": {
+    "message": "HTML parsing error - unable to extract linked data from HTML.",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "020": {
+    "message": "XML parsing error - unable to process XML document.",
+    "linkout": "",
+    "severity": "WARN"
+  },
+  "021": {
+    "message": "JSON parsing error - unable to process JSON document.",
+    "linkout": "",
+    "severity": "WARN"
   }
-
-
 }
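Each new entry is addressed by the ID that the code passes to MetadataObject#add_warning. For example (sketch, with a hypothetical URL):

    meta.add_warning(['017', 'https://example.org/record/1.jsonld', { 'Accept' => 'application/ld+json' }])
    # appends to meta.warnings:
    #   { "id" => "017", "message" => "Metadata format not recognized.", "severity" => "WARN",
    #     "linkout" => "", "processed_url" => "https://example.org/record/1.jsonld",
    #     :accept_headers => { "Accept" => "application/ld+json" } }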
data/lib/web_utils.rb
CHANGED
@@ -1,7 +1,7 @@
 module FspHarvester
 
   class WebUtils
-    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
+    def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
       warn 'In fetch routine now. '
 
       begin
@@ -13,19 +13,19 @@ module FspHarvester
                                        # password: pass,
                                        headers: headers
                                      })
-
-        warn "
-        warn "
+        meta.all_uris |= [response.request.url] # it's possible to call this method without affecting the metadata object being created by the harvester
+        warn "starting URL #{url}"
+        warn "final URL #{response.request.url}"
         warn "Response code #{response.code}"
-        if response.code == 203
-
-
+        if response.code == 203
+          meta.warnings << ["002", url, headers]
+          meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
         end
         response
       rescue RestClient::ExceptionWithResponse => e
         warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
-
-
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
         if (e.response.code == 500 or e.response.code == 404)
           return false
         else
@@ -34,14 +34,14 @@ module FspHarvester
         # now we are returning the headers and body that were returned
       rescue RestClient::Exception => e
         warn "EXCEPTION WITH NO RESPONSE! #{e}"
-
-
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
        false
        # now we are returning 'False', and we will check that with an \"if\" statement in our main code
      rescue Exception => e
        warn "EXCEPTION UNKNOWN! #{e}"
-
-
+        meta.warnings << ["003", url, headers]
+        meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
        false
        # now we are returning 'False', and we will check that with an \"if\" statement in our main code
      end
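With the new meta: keyword, fspfetch records the post-redirect URL and any HTTP problems on the metadata object instead of discarding them. A sketch (hypothetical URL):

    meta = FspHarvester::MetadataObject.new
    response = FspHarvester::WebUtils.fspfetch(url: 'https://example.org/record/1',
                                               headers: { 'Accept' => '*/*' },
                                               method: :head,
                                               meta: meta)
    puts meta.all_uris.inspect  # final URL after redirects, when the request succeeds
    puts meta.comments          # WARN entries for non-authoritative responses or HTTP errors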
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fsp_harvester
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.11
 platform: ruby
 authors:
 - Mark Wilkinson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-08-
+date: 2022-08-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json
@@ -44,14 +44,14 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version: 0.1.
+      version: 0.1.16
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-      version: 0.1.
+      version: 0.1.16
 - !ruby/object:Gem::Dependency
   name: metainspector
   requirement: !ruby/object:Gem::Requirement
@@ -171,6 +171,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".rspec_status"
 - CHANGELOG.md
 - Gemfile
 - Gemfile.lock
@@ -180,10 +181,17 @@ files:
 - bin/console
 - bin/setup
 - example_test.rb
+- launch.json
+- lib/config.conf_docker
+- lib/config.conf_local
 - lib/constants.rb
 - lib/fsp_harvester.rb
 - lib/fsp_harvester/version.rb
+- lib/fsp_metadata_external_tools.rb
+- lib/fsp_metadata_harvester.rb
+- lib/fsp_metadata_parser.rb
 - lib/metadata_object.rb
+- lib/signposting_tests.rb
 - lib/swagger.rb
 - lib/warnings.json
 - lib/web_utils.rb