fsp_harvester 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec_status +55 -0
- data/Gemfile.lock +5 -4
- data/launch.json +11 -0
- data/lib/config.conf_docker +8 -0
- data/lib/config.conf_local +8 -0
- data/lib/constants.rb +12 -13
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +30 -8
- data/lib/fsp_metadata_external_tools.rb +82 -0
- data/lib/fsp_metadata_harvester.rb +164 -0
- data/lib/fsp_metadata_parser.rb +109 -0
- data/lib/metadata_object.rb +96 -4
- data/lib/signposting_tests.rb +87 -0
- data/lib/warnings.json +38 -2
- data/lib/web_utils.rb +13 -13
- metadata +12 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 923ccb362ef4a71fa2f221b1b224dbd5d3ec78a14cc9da0e12a1e0df804162ff
|
|
4
|
+
data.tar.gz: e70a9a994c504a0867ab10e05c07d440a76677d7ff27e27b9bdb0c3338c02ffe
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d819cb1c19d40f8a093b723cbeb4273d122d3207df2d874558327fa6a2622be1bf2c671cc9dfebee74c689722825c2dd1957f8be6bfcf2c14c09097c1dc05a5b
|
|
7
|
+
data.tar.gz: 508a484d93eab373d0389c11f4361068d5586759a73573901ea54f4d2a028f713f71bb492976785499f1074eb8b359d79be550c09b196eb82838a1e401398fc4
|
data/.rspec_status
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
example_id | status | run_time |
|
|
2
|
+
---------------------------------- | ------ | --------------- |
|
|
3
|
+
./spec/cite-as_spec.rb[1:1:1] | passed | 1.17 seconds |
|
|
4
|
+
./spec/cite-as_spec.rb[1:1:2] | passed | 0.98776 seconds |
|
|
5
|
+
./spec/cite-as_spec.rb[1:1:3] | passed | 0.69753 seconds |
|
|
6
|
+
./spec/cite-as_spec.rb[1:1:4] | passed | 1.31 seconds |
|
|
7
|
+
./spec/cite-as_spec.rb[1:1:5] | passed | 2.07 seconds |
|
|
8
|
+
./spec/cite-as_spec.rb[1:1:6] | passed | 1.45 seconds |
|
|
9
|
+
./spec/cite-as_spec.rb[1:1:7] | passed | 2.75 seconds |
|
|
10
|
+
./spec/cite-as_spec.rb[1:1:8] | passed | 1.83 seconds |
|
|
11
|
+
./spec/cite-as_spec.rb[1:1:9] | passed | 2.51 seconds |
|
|
12
|
+
./spec/cite-as_spec.rb[1:1:10] | passed | 1.73 seconds |
|
|
13
|
+
./spec/cite-as_spec.rb[1:1:11] | passed | 2.35 seconds |
|
|
14
|
+
./spec/cite-as_spec.rb[1:1:12] | passed | 2.01 seconds |
|
|
15
|
+
./spec/cite-as_spec.rb[1:1:13] | passed | 2.56 seconds |
|
|
16
|
+
./spec/cite-as_spec.rb[1:1:14] | passed | 1.68 seconds |
|
|
17
|
+
./spec/cite-as_spec.rb[1:1:15] | passed | 1.06 seconds |
|
|
18
|
+
./spec/cite-as_spec.rb[1:1:16] | passed | 1.03 seconds |
|
|
19
|
+
./spec/cite-as_spec.rb[1:1:17] | passed | 0.94321 seconds |
|
|
20
|
+
./spec/cite-as_spec.rb[1:1:18] | passed | 1.1 seconds |
|
|
21
|
+
./spec/cite-as_spec.rb[1:1:19] | passed | 1.45 seconds |
|
|
22
|
+
./spec/cite-as_spec.rb[1:1:20] | passed | 1.53 seconds |
|
|
23
|
+
./spec/cite-as_spec.rb[1:1:21] | passed | 1.64 seconds |
|
|
24
|
+
./spec/cite-as_spec.rb[1:1:22] | passed | 1.01 seconds |
|
|
25
|
+
./spec/cite-as_spec.rb[1:1:23] | passed | 1.09 seconds |
|
|
26
|
+
./spec/cite-as_spec.rb[1:1:24] | failed | 1.22 seconds |
|
|
27
|
+
./spec/cite-as_spec.rb[1:1:25] | passed | 0.38248 seconds |
|
|
28
|
+
./spec/describedby_spec.rb[1:1:1] | passed | 2.24 seconds |
|
|
29
|
+
./spec/describedby_spec.rb[1:1:2] | passed | 1.08 seconds |
|
|
30
|
+
./spec/describedby_spec.rb[1:1:3] | passed | 1 second |
|
|
31
|
+
./spec/describedby_spec.rb[1:1:4] | passed | 1.14 seconds |
|
|
32
|
+
./spec/describedby_spec.rb[1:1:5] | passed | 1.03 seconds |
|
|
33
|
+
./spec/describedby_spec.rb[1:1:6] | passed | 0.81364 seconds |
|
|
34
|
+
./spec/describedby_spec.rb[1:1:7] | passed | 0.77543 seconds |
|
|
35
|
+
./spec/describedby_spec.rb[1:1:8] | passed | 2.01 seconds |
|
|
36
|
+
./spec/describedby_spec.rb[1:1:9] | passed | 1.35 seconds |
|
|
37
|
+
./spec/describedby_spec.rb[1:1:10] | passed | 1.73 seconds |
|
|
38
|
+
./spec/describedby_spec.rb[1:1:11] | passed | 2.36 seconds |
|
|
39
|
+
./spec/describedby_spec.rb[1:1:12] | passed | 2.73 seconds |
|
|
40
|
+
./spec/describedby_spec.rb[1:1:13] | passed | 1.5 seconds |
|
|
41
|
+
./spec/describedby_spec.rb[1:1:14] | passed | 1.8 seconds |
|
|
42
|
+
./spec/describedby_spec.rb[1:1:15] | passed | 1.65 seconds |
|
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00053 seconds |
|
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | passed | 1.76 seconds |
|
|
45
|
+
./spec/item_spec.rb[1:1:1] | passed | 2.08 seconds |
|
|
46
|
+
./spec/item_spec.rb[1:1:2] | passed | 2.27 seconds |
|
|
47
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.22 seconds |
|
|
48
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.61 seconds |
|
|
49
|
+
./spec/item_spec.rb[1:1:5] | passed | 1.74 seconds |
|
|
50
|
+
./spec/item_spec.rb[1:1:6] | passed | 1.95 seconds |
|
|
51
|
+
./spec/item_spec.rb[1:1:7] | passed | 3.59 seconds |
|
|
52
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.41001 seconds |
|
|
53
|
+
./spec/type_spec.rb[1:1:1] | passed | 1.14 seconds |
|
|
54
|
+
./spec/type_spec.rb[1:1:2] | passed | 0.94799 seconds |
|
|
55
|
+
./spec/type_spec.rb[1:1:3] | passed | 1.04 seconds |
|
data/Gemfile.lock
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
fsp_harvester (0.1.
|
|
4
|
+
fsp_harvester (0.1.9)
|
|
5
5
|
json (~> 2.0)
|
|
6
6
|
linkeddata (~> 3.2)
|
|
7
|
-
linkheaders-processor (~> 0.1.
|
|
7
|
+
linkheaders-processor (~> 0.1.15)
|
|
8
8
|
metainspector (~> 5.11.2)
|
|
9
9
|
parseconfig (~> 1.1)
|
|
10
10
|
rake (~> 13.0)
|
|
@@ -126,10 +126,11 @@ GEM
|
|
|
126
126
|
shex (~> 0.7)
|
|
127
127
|
sparql (~> 3.2)
|
|
128
128
|
sparql-client (~> 3.2)
|
|
129
|
-
linkheaders-processor (0.1.
|
|
129
|
+
linkheaders-processor (0.1.15)
|
|
130
130
|
json (~> 2.0)
|
|
131
131
|
json-ld (~> 3.2)
|
|
132
132
|
json-ld-preloaded (~> 3.2)
|
|
133
|
+
link_header (~> 0.0.8)
|
|
133
134
|
metainspector (~> 5.11.2)
|
|
134
135
|
rest-client (~> 2.1)
|
|
135
136
|
securerandom (~> 0.1.0)
|
|
@@ -248,7 +249,7 @@ GEM
|
|
|
248
249
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
249
250
|
rspec-support (~> 3.11.0)
|
|
250
251
|
rspec-support (3.11.0)
|
|
251
|
-
rubocop (1.
|
|
252
|
+
rubocop (1.33.0)
|
|
252
253
|
json (~> 2.3)
|
|
253
254
|
parallel (~> 1.10)
|
|
254
255
|
parser (>= 3.1.0.0)
|
data/launch.json
ADDED
data/lib/constants.rb
CHANGED
|
@@ -1,17 +1,20 @@
|
|
|
1
1
|
ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
|
|
2
2
|
|
|
3
|
+
ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
|
|
4
|
+
|
|
3
5
|
TEXT_FORMATS = {
|
|
4
6
|
'text' => ['text/plain']
|
|
5
7
|
}
|
|
6
8
|
|
|
7
9
|
RDF_FORMATS = {
|
|
8
|
-
'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
|
|
10
|
+
'jsonld' => ['application/ld+json','application/x-ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
|
|
9
11
|
'turtle' => ['text/turtle', 'application/n3', 'application/rdf+n3',
|
|
10
12
|
'application/turtle', 'application/x-turtle', 'text/n3', 'text/turtle',
|
|
11
13
|
'text/rdf+n3', 'text/rdf+turtle'],
|
|
12
14
|
# 'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
|
|
13
15
|
'rdfxml' => ['application/rdf+xml'],
|
|
14
|
-
'
|
|
16
|
+
'ntriples' => ['application/n-triples', 'application/trig'],
|
|
17
|
+
'nquads' => ['application/n-quads']
|
|
15
18
|
}
|
|
16
19
|
|
|
17
20
|
XML_FORMATS = {
|
|
@@ -73,12 +76,10 @@ GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
|
|
|
73
76
|
'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
|
|
74
77
|
|
|
75
78
|
CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
end
|
|
79
|
-
extruct = 'extruct' unless @extruct_command
|
|
79
|
+
extruct = CONFIG.dig(:extruct, :command)
|
|
80
|
+
extruct ||= 'extruct'
|
|
80
81
|
extruct.strip!
|
|
81
|
-
case
|
|
82
|
+
case extruct
|
|
82
83
|
when /[&|;`$\s]/
|
|
83
84
|
abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
|
|
84
85
|
when /echo/i
|
|
@@ -86,8 +87,8 @@ when /echo/i
|
|
|
86
87
|
end
|
|
87
88
|
EXTRUCT_COMMAND = extruct
|
|
88
89
|
|
|
89
|
-
rdf_command = CONFIG
|
|
90
|
-
rdf_command
|
|
90
|
+
rdf_command = CONFIG.dig(:rdf, :command)
|
|
91
|
+
rdf_command ||= 'rdf'
|
|
91
92
|
rdf_command.strip
|
|
92
93
|
case rdf_command
|
|
93
94
|
when /[&|;`$\s]/
|
|
@@ -99,8 +100,6 @@ when !(/rdf$/ =~ $_)
|
|
|
99
100
|
end
|
|
100
101
|
RDF_COMMAND = rdf_command
|
|
101
102
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
end
|
|
105
|
-
tika_command = 'http://localhost:9998/meta' unless @tika_command
|
|
103
|
+
tika_command = CONFIG.dig(:tika, :command)
|
|
104
|
+
tika_command ||= 'http://localhost:9998/meta'
|
|
106
105
|
TIKA_COMMAND = tika_command
|
data/lib/fsp_harvester.rb
CHANGED
|
@@ -20,6 +20,9 @@ require_relative './metadata_object'
|
|
|
20
20
|
require_relative './constants'
|
|
21
21
|
require_relative './web_utils'
|
|
22
22
|
require_relative './signposting_tests'
|
|
23
|
+
require_relative './fsp_metadata_harvester'
|
|
24
|
+
require_relative './fsp_metadata_parser'
|
|
25
|
+
|
|
23
26
|
|
|
24
27
|
module FspHarvester
|
|
25
28
|
class Error < StandardError
|
|
@@ -32,11 +35,12 @@ module FspHarvester
|
|
|
32
35
|
|
|
33
36
|
def self.resolve_guid(guid:)
|
|
34
37
|
@meta = FspHarvester::MetadataObject.new
|
|
35
|
-
@meta.
|
|
38
|
+
@meta.all_uris = [guid]
|
|
36
39
|
type, url = convertToURL(guid: guid)
|
|
37
40
|
links = Array.new
|
|
38
41
|
if type
|
|
39
42
|
links = resolve_url(url: url)
|
|
43
|
+
@meta.links << links
|
|
40
44
|
else
|
|
41
45
|
@meta.warnings << ['006', guid, '']
|
|
42
46
|
@meta.comments << "FATAL: GUID type not recognized.\n"
|
|
@@ -44,6 +48,16 @@ module FspHarvester
|
|
|
44
48
|
[links, @meta]
|
|
45
49
|
end
|
|
46
50
|
|
|
51
|
+
# Harvests metadata from every 'describedby' link in +links+.
#
# The links are filtered down to those whose +relation+ is 'describedby' and
# handed to FspHarvester::MetadataHarvester.extract_metadata, which gathers
# everything into +metadata+.
#
# @param links [Array, nil] link objects responding to +relation+. Nested
#   arrays are accepted and flattened, because resolve_guid stores each batch
#   of links as one element of @meta.links; nil is treated as "no links".
# @param metadata [FspHarvester::MetadataObject] the object to gather into
# @return [FspHarvester::MetadataObject] the same +metadata+ object
def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
  @meta = metadata
  # [links].flatten copes with nil, a single link, a flat list, or a list of lists
  db = [links].flatten.compact.select { |l| l.relation == 'describedby' }
  FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
  @meta
end
|
|
60
|
+
|
|
47
61
|
def self.convertToURL(guid:)
|
|
48
62
|
GUID_TYPES.each do |k, regex|
|
|
49
63
|
if k == 'inchi' and regex.match(guid)
|
|
@@ -68,10 +82,10 @@ module FspHarvester
|
|
|
68
82
|
false
|
|
69
83
|
end
|
|
70
84
|
|
|
71
|
-
def self.resolve_url(url:, method: :get, nolinkheaders: false, header:
|
|
85
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
|
|
72
86
|
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
|
73
87
|
warn "\n\n FETCHING #{url} #{header}\n\n"
|
|
74
|
-
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
|
|
88
|
+
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
|
|
75
89
|
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
|
76
90
|
|
|
77
91
|
unless response
|
|
@@ -80,7 +94,7 @@ module FspHarvester
|
|
|
80
94
|
return []
|
|
81
95
|
end
|
|
82
96
|
|
|
83
|
-
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.
|
|
97
|
+
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
|
84
98
|
@meta.full_response << response.body
|
|
85
99
|
|
|
86
100
|
links = process_link_headers(response: response) unless nolinkheaders
|
|
@@ -90,7 +104,7 @@ module FspHarvester
|
|
|
90
104
|
def self.process_link_headers(response:)
|
|
91
105
|
warn "\n\n parsing #{response.headers}\n\n"
|
|
92
106
|
|
|
93
|
-
parser = LinkHeaders::Processor.new(default_anchor: @meta.
|
|
107
|
+
parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
|
|
94
108
|
parser.extract_and_parse(response: response)
|
|
95
109
|
factory = parser.factory # LinkHeaders::LinkFactory
|
|
96
110
|
|
|
@@ -105,6 +119,8 @@ module FspHarvester
|
|
|
105
119
|
citeas = Array.new
|
|
106
120
|
describedby = Array.new
|
|
107
121
|
item = Array.new
|
|
122
|
+
types = Array.new
|
|
123
|
+
|
|
108
124
|
factory.all_links.each do |l|
|
|
109
125
|
case l.relation
|
|
110
126
|
when 'cite-as'
|
|
@@ -113,23 +129,29 @@ module FspHarvester
|
|
|
113
129
|
item << l
|
|
114
130
|
when 'describedby'
|
|
115
131
|
describedby << l
|
|
132
|
+
when 'type'
|
|
133
|
+
types << l
|
|
116
134
|
end
|
|
117
135
|
end
|
|
118
136
|
|
|
119
137
|
check_describedby_rules(describedby: describedby)
|
|
120
138
|
check_item_rules(item: item)
|
|
121
139
|
|
|
122
|
-
uniqueciteas = Array.new
|
|
123
140
|
if citeas.length > 1
|
|
124
141
|
warn "INFO: multiple cite-as links found. Checking for conflicts\n"
|
|
125
142
|
@meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
|
|
126
|
-
|
|
143
|
+
citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
|
127
144
|
end
|
|
128
145
|
|
|
129
|
-
unless
|
|
146
|
+
unless citeas.length == 1 && describedby.length > 0
|
|
130
147
|
@meta.warnings << ['004', '', '']
|
|
131
148
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
|
|
132
149
|
end
|
|
150
|
+
|
|
151
|
+
unless types.length >=1
|
|
152
|
+
@meta.warnings << ['015', '', '']
|
|
153
|
+
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
|
|
154
|
+
end
|
|
133
155
|
end
|
|
134
156
|
end
|
|
135
157
|
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FspHarvester
  class Error < StandardError
  end

  # Drives the external command-line extractors and feeds what they find into
  # the shared metadata object.
  #
  # Two tools are used: the `rdf` command (RDF_COMMAND, called the "distiller"
  # here), which serialises RDFa found in a message body to JSON-LD, and
  # `extruct` (EXTRUCT_COMMAND), which is pointed at a URI.
  class ExternalTools
    # SHA256 digests of the message bodies that have already been run through
    # the distiller, shared by all instances in this process.
    @distiller_known = {}

    class << self
      attr_reader :distiller_known
    end

    # @param metadata [FspHarvester::MetadataObject] receives comments,
    #   warnings and any harvested metadata
    def initialize(metadata: FspHarvester::MetadataObject.new)
      @meta = metadata
    end

    # Runs the distiller over +body+ and parses its JSON-LD output.
    #
    # A body whose digest has been seen before is not processed again; only an
    # INFO comment is recorded. Failure to find parseable data is recorded as
    # warning '018'.
    #
    # @param body [String] the message body to examine (not modified)
    def process_with_distiller(body:)
      bhash = Digest::SHA256.hexdigest(body)
      if ExternalTools.distiller_known[bhash]
        @meta.comments << "INFO: data is already parsed by distiller.\n"
      else
        @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
        result, errors = run_distiller(distiller_input(body))
        warn ''
        warn "distiller errors: #{errors}"

        # scrub so that the regexp match below cannot fail on invalid bytes
        result = result.force_encoding('UTF-8').scrub
        warn "DIST RESULT: #{result}"
        if result !~ /@context/i # no JSON-LD context in the output means nothing was extracted
          @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
          @meta.warnings << ['018', '', '']
        else
          @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
          parse_rdf(body: result, content_type: 'application/ld+json')
        end
        ExternalTools.distiller_known[bhash] = true
      end
    end

    # Runs extruct against +uri+ and merges whatever it reports.
    #
    # extruct's report is read from stderr. An extraction failure is recorded
    # as warning '019'; output that is not JSON is recorded as a WARN comment.
    #
    # @param uri [#to_s] the address handed to extruct
    def process_with_extruct(uri:)
      @meta.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
      warn 'begin open3'
      # Command and argument are passed separately so that no shell is
      # involved and characters in the URI cannot be interpreted as shell syntax.
      stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND, uri.to_s)
      warn "open3 status: #{status} #{stdout}"
      result = stderr.to_s # absurd that the output comes over stderr! LOL!

      if (failure = result[/(Failed\sto\sextract.*?)\n/, 1])
        @meta.comments << "WARN: extruct threw an error #{failure} when attempting to parse return value (message body) of #{uri}.\n"
        @meta.warnings << ['019', '', '']
        if (detail = result[/(ValueError:.*?)\n/, 1])
          @meta.comments << "WARN: extruct error was #{detail}\n"
          @meta.warnings << ['019', '', '']
        end
      elsif result.match(/^\s+?\{/) || result.match(/^\s+\[/) # this is JSON
        begin
          json = JSON.parse(result)
        rescue JSON::ParserError
          @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
          return
        end
        @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
        merge_extruct_output(json)
      else
        @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
      end
    end

    # The method was first released under this misspelled name; keep it callable.
    alias processs_with_extruct process_with_extruct

    private

    # Returns a UTF-8 copy of +body+ with invalid bytes replaced and a bare
    # schema.org @context rewritten to the explicit context document.
    def distiller_input(body)
      text = body.dup.force_encoding('UTF-8')
      text.scrub!
      text.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"') # a bug in distiller, apparently
    end

    # Writes +text+ to a temporary file, runs the rdf command on it and
    # returns [stdout, stderr]. The temporary file is always removed.
    def run_distiller(text)
      file = Tempfile.new('foo', encoding: 'UTF-8')
      begin
        file.write(text)
        file.rewind

        @meta.comments << "INFO: The message body is being examined by Distiller\n"
        command = [RDF_COMMAND, 'serialize', '--input-format', 'rdfa', '--output-format', 'jsonld', file.path]
        warn "distiller command: LANG=en_US.UTF-8 #{command.join(' ')}"
        stdout, stderr, _status = Open3.capture3({ 'LANG' => 'en_US.UTF-8' }, *command)
        [stdout, stderr]
      ensure
        file.close
        file.unlink
      end
    end

    # Merges one parsed extruct report into the metadata object: the
    # 'json-ld' and 'rdfa' sections are parsed as JSON-LD, while the first
    # entry of 'microdata', 'microformat' and 'opengraph' is merged as a hash.
    def merge_extruct_output(json)
      unless json.is_a?(Hash)
        # a top-level list has no named sections; merge its first entry if it is a hash
        @meta.merge_hash(json.first) if json.is_a?(Array) && json.first.is_a?(Hash)
        return
      end

      parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if filled?(json['json-ld']) # RDF
      %w[microdata microformat opengraph].each do |section|
        entries = json[section]
        @meta.merge_hash(entries.first) if filled?(entries) && entries.first.is_a?(Hash)
      end
      parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if filled?(json['rdfa']) # RDF
    end

    # True when +value+ is a collection with at least one truthy entry
    # (a missing section is nil and counts as empty).
    def filled?(value)
      value.respond_to?(:any?) && value.any?
    end

    # RDF parsing lives in MetadataParser; delegate to it with the same metadata object.
    def parse_rdf(body:, content_type:)
      FspHarvester::MetadataParser.new(metadata_object: @meta).parse_rdf(body: body, content_type: content_type)
    end
  end
end
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FspHarvester
  class Error < StandardError
  end

  # Resolves 'describedby' links, works out what format each response is in,
  # and hands the body to the matching MetadataParser routine.
  #
  # All methods are class methods that share the metadata object stored in
  # @meta by extract_metadata.
  class MetadataHarvester
    # Fetches and processes every 'describedby' link.
    #
    # A link that cannot be resolved is skipped (attempt_to_resolve records
    # warning '016'); a response whose format cannot be detected is skipped
    # with warning '017'.
    #
    # @param links [Array] link objects responding to +relation+ and +href+
    #   (and optionally +type+)
    # @param metadata [FspHarvester::MetadataObject] gathers comments,
    #   warnings and harvested metadata
    # @return [Array] the 'describedby' links that were considered
    def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
      @meta = metadata
      @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'

      describedby = links.select { |l| l.relation == 'describedby' }

      hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
      describedby.each do |link|
        accept = link.respond_to?(:type) ? link.type : nil
        accepttype = accept ? { 'Accept' => accept } : ACCEPT_STAR_HEADER

        response = attempt_to_resolve(link: link, headers: accepttype)
        next unless response # nothing to examine; the failure has already been recorded

        abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
        unless abbreviation
          @meta.warnings << ['017', link.href, accepttype]
          @meta.comments << "WARN: metadata format returned from #{link.href} using Accept header #{accepttype} is not recognized. Processing will end now.\n"
          next
        end

        # process according to detected type
        case abbreviation
        when 'html'
          @meta.comments << 'INFO: Processing html'
          hvst.process_html(body: response.body, uri: link.href) # extruct needs the address, not the link object
        when 'xml'
          @meta.comments << 'INFO: Processing xml'
          hvst.process_xml(body: response.body)
        when 'json'
          @meta.comments << 'INFO: Processing json'
          hvst.process_json(body: response.body)
        when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
          @meta.comments << 'INFO: Processing linked data'
          hvst.process_ld(body: response.body, content_type: content_type)
        when 'specialist'
          warn 'no specialized parsers so far'
        end
      end
    end

    # Fetches the target of +link+.
    #
    # The Accept header is the link's own MIME type when it declares one,
    # otherwise +headers+. A failed fetch is recorded as warning '016'.
    #
    # @param link [#href] the link to resolve (may also respond to +type+)
    # @param headers [Hash] request headers used when the link has no type
    # @return the value returned by WebUtils.fspfetch (falsy on failure)
    def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
      @meta.comments << "INFO: link #{link.href} being processed"
      type = link.respond_to?(:type) ? link.type : nil
      if type
        header = { 'Accept' => type }
      else
        header = headers
        @meta.comments << "INFO: link #{link.href} has no MIME type, defaulting to */*"
      end
      url = link.href
      # meta: is passed the same way resolve_url does in fsp_harvester.rb
      response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header, meta: @meta)
      unless response
        @meta.warnings << ['016', url, header]
        @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
      end
      response
    end

    # Works out the format of a response body.
    #
    # A body starting with an XML declaration is classified as html, rdfxml
    # or xml; anything else is tested as linked data and then as JSON.
    #
    # @param body [String] the response body
    # @param headers [Hash] response headers; +:content_type+ is passed to check_ld
    # @return [Array(String, String), Array(nil, nil)] abbreviation and
    #   content type, or [nil, nil] when the format is not recognised. The
    #   warning for an unrecognised format is left to the caller, which knows
    #   the URL and Accept header involved.
    def self.attempt_to_detect_type(body:, headers:)
      # described by should be an html, xml, json, or linked data document
      body = body.to_s
      abbreviation = nil
      content_type = nil
      @meta.comments << "INFO: Testing metadata format for html, xml, and linked data formats\n"
      if body =~ /^\s*<\?xml/
        if body =~ /<HTML/i
          abbreviation = 'html'
          content_type = 'text/html'
          @meta.comments << "INFO: appears to be HTML\n"
        elsif body =~ /<rdf:RDF/i
          abbreviation = 'rdfxml'
          content_type = 'application/rdf+xml'
          @meta.comments << "INFO: appears to be RDF-XML\n"
        else
          abbreviation = 'xml'
          content_type = 'application/xml'
          @meta.comments << "INFO: appears to be XML\n"
        end
      else
        claimed = headers ? headers[:content_type] : nil
        abbreviation, content_type = check_ld(body: body, claimed_type: claimed)
        abbreviation, content_type = check_json(body: body) unless abbreviation
      end

      [abbreviation, content_type]
    end

    # Tests whether +body+ is in a linked data format.
    #
    # @param body [String] the response body
    # @param claimed_type [String, nil] the content type the server claimed (currently unused)
    # @return [Array] [abbreviation, content type]; both nil when no linked
    #   data format is detected, abbreviation nil when the detected content
    #   type is not one of the known MIME types
    def self.check_ld(body:, claimed_type:)
      detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
      unless detected_type
        detected_type = RDF::Format.for({ sample: body[0..5000] })
        @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
      end
      # nil (not '') so that callers can test the result for truthiness
      contenttype = nil
      abbreviation = nil
      if detected_type
        contenttype = detected_type.content_type.first # comes back as array
        abbreviation = abbreviate_type(contenttype: contenttype)
        @meta.comments << "INFO: using content-type #{contenttype}.\n"
      else
        @meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
      end
      [abbreviation, contenttype]
    end

    # Detects N-Triples that lack the final period.
    #
    # The first line that looks like a triple is given a trailing ' .' (unless
    # it already ends with a period) and passed to RDF::Format.for.
    #
    # @param body [String] the response body
    # @return [Class, nil] RDF::NTriples::Format when that is what was
    #   detected, otherwise nil
    def self.ntriples_hack(body:) # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
      detected_type = nil
      # iterate over lines: splitting on whitespace would break every triple into separate terms
      body.each_line do |raw|
        line = raw.strip
        next if line.empty?
        next unless line =~ /\s*<[^>]+>\s*<[^>]+>\s\S+/

        sample = line.end_with?('.') ? line : "#{line} ."
        @meta.comments << "INFO: running ntriples hack on #{sample}\n"
        detected_type = RDF::Format.for({ sample: sample }) # adding a period allows detection of ntriples by distiller
        break
      end
      @meta.comments << "INFO: ntriples hack found: #{detected_type}\n"
      # only return the hacky case
      detected_type == RDF::NTriples::Format ? detected_type : nil
    end

    # Tests whether +body+ parses as JSON.
    #
    # @param body [String] the response body
    # @return [Array] ['json', 'application/ld+json'] when the body parses to
    #   a truthy value, otherwise [nil, nil]
    def self.check_json(body:)
      parsed =
        begin
          JSON.parse(body)
        rescue StandardError
          nil
        end

      unless parsed
        @meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
        return [nil, nil]
      end
      # NOTE(review): plain JSON is reported with the JSON-LD content type, as before — confirm intended
      ['json', 'application/ld+json']
    end

    # Maps a MIME type to the abbreviation it is filed under in RDF_FORMATS,
    # XML_FORMATS, HTML_FORMATS or JSON_FORMATS.
    #
    # @param contenttype [String] the MIME type to look up
    # @return [String, nil] the abbreviation, or nil when the type is unknown
    def self.abbreviate_type(contenttype:)
      foundtype = nil
      RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
        warn "\n\ntype #{type}\nvals #{vals}\n\n"
        @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
        next unless vals.include? contenttype

        foundtype = type
        @meta.comments << "INFO: detected a #{type} MIME type"
        break
      end
      foundtype
    end
  end
end
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FspHarvester
  class Error < StandardError
  end

  # Turns harvested message bodies (HTML, XML, JSON, linked data) into
  # entries in the metadata object's hash or RDF graph, recording comments
  # and warnings along the way.
  class MetadataParser
    # attr_accessor :distillerknown

    # Not read by any method in this class; kept so existing references still resolve.
    @@distillerknown = {}

    # @param metadata_object [FspHarvester::MetadataObject] receives
    #   comments, warnings and the parsed metadata
    def initialize(metadata_object: FspHarvester::MetadataObject.new)
      @meta = metadata_object
    end

    # Runs both external tools over an HTML document.
    #
    # @param body [String] the HTML, given to the distiller
    # @param uri the address of the document, given to extruct
    def process_html(body:, uri:)
      tools = FspHarvester::ExternalTools.new(metadata: @meta)
      tools.process_with_distiller(body: body)
      tools.process_with_extruct(uri: uri)
    end

    # Parses +body+ with XmlSimple and merges the result into the metadata
    # hash. Malformed XML is recorded as warning '020' and nothing is merged.
    #
    # @param body [String] the XML document
    def process_xml(body:)
      begin
        hash = XmlSimple.xml_in(body)
      rescue StandardError
        @meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
        @meta.warnings << ['020', '', '']
        return # nothing was parsed, so there is nothing to merge
      end
      @meta.comments << "INFO: The XML is being merged in the metadata object\n"
      @meta.merge_hash(hash) # merge_hash stores the result; Hash#merge alone would discard it
    end

    # Parses +body+ as JSON and merges the result into the metadata hash.
    # Malformed JSON is recorded as warning '021' and nothing is merged; JSON
    # that is not an object cannot be merged and is only commented on.
    #
    # @param body [String] the JSON document
    def process_json(body:)
      begin
        hash = JSON.parse(body)
      rescue StandardError
        @meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
        @meta.warnings << ['021', '', '']
        return # nothing was parsed, so there is nothing to merge
      end
      unless hash.is_a?(Hash)
        @meta.comments << "WARN: The JSON is not a key/value object, so it cannot be merged in the metadata object\n"
        return
      end
      @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
      @meta.merge_hash(hash) # merge_hash stores the result; Hash#merge alone would discard it
    end

    # Parses a linked data body; a thin wrapper around parse_rdf.
    def process_ld(body:, content_type:)
      parse_rdf(body: body, content_type: content_type)
    end

    # Parses +body+ as RDF of the given +content_type+ and merges the
    # statements into the metadata graph.
    #
    # A graph found by FspHarvester::Cache.checkRDFCache is used instead of
    # re-parsing; a freshly parsed body is written to that cache. Every
    # failure (empty body, no reader for the content type, parse errors) is
    # recorded as warning '018', except an empty parse result, which is only
    # commented on.
    #
    # @param body [String, nil] the serialised RDF
    # @param content_type [String] MIME type used to choose the reader
    def parse_rdf(body:, content_type:)
      if body.nil? || !body.match?(/\w/)
        @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
        @meta.warnings << ['018', '', '']
        return
      end

      rdfformat = RDF::Format.for(content_type: content_type)
      unless rdfformat
        # String#delete (not delete!) so the sample survives when it has no newline
        @meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
        @meta.warnings << ['018', '', '']
        return
      end

      graph = FspHarvester::Cache.checkRDFCache(body: body)
      if graph.size > 0
        warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
        @meta.merge_rdf(graph.to_a)
      else
        warn "\n\n\nfound format #{rdfformat}\n\n"
        @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
        reader = ''
        begin
          reader = rdfformat.reader.new(body)
        rescue StandardError => e
          @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
          @meta.warnings << ['018', '', '']
          return
        end

        begin
          if reader.size == 0
            @meta.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
            return
          end
          reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
          warn 'WRITING TO CACHE'
          FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
          warn 'WRITING DONE'
          reader = rdfformat.reader.new(body) # frustrating that we cannot rewind!
          warn 'RE-READING DONE'
          @meta.merge_rdf(reader.to_a)
          warn 'MERGE DONE'
        rescue RDF::ReaderError => e
          @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
          warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
          @meta.warnings << ['018', '', '']
        rescue StandardError => e
          @meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
          warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body}). Moving on...\n"
          @meta.warnings << ['018', '', '']
        end
      end
    end
  end
end
|
data/lib/metadata_object.rb
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
module FspHarvester
|
|
2
2
|
class MetadataObject
|
|
3
|
-
attr_accessor :hash, :graph, :comments, :warnings, :guidtype, :full_response, :
|
|
3
|
+
attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
|
4
4
|
|
|
5
5
|
def initialize(_params = {}) # get a name from the "new" call, or set a default
|
|
6
6
|
@hash = {}
|
|
@@ -8,15 +8,16 @@ module FspHarvester
|
|
|
8
8
|
@comments = []
|
|
9
9
|
@warnings = []
|
|
10
10
|
@full_response = []
|
|
11
|
-
@
|
|
11
|
+
@links = []
|
|
12
|
+
@all_uris = []
|
|
12
13
|
end
|
|
13
14
|
|
|
14
15
|
def merge_hash(hash)
|
|
15
|
-
#
|
|
16
|
+
# warn "\n\n\nIncoming Hash #{hash.inspect}"
|
|
16
17
|
self.hash = self.hash.merge(hash)
|
|
17
18
|
end
|
|
18
19
|
|
|
19
|
-
# Appends the incoming list of triples to this object's graph.
#
# @param triples [Object] incoming list of triples, handed to the graph's +<<+
# @return [Object] the graph held by this object (not the result of +<<+)
def merge_rdf(triples)
  target = graph
  target << triples
  target
end
|
|
@@ -25,4 +26,95 @@ module FspHarvester
|
|
|
25
26
|
graph
|
|
26
27
|
end
|
|
27
28
|
end
|
|
29
|
+
|
|
30
|
+
# File-backed cache for harvested metadata objects, parsed RDF graphs and raw
# HTTP responses. Every entry is a file in CACHE_DIR whose name starts with an
# MD5 digest of the lookup key and ends with a suffix naming its role
# (_meta, _graph, _graphbody, _head, _body, _uri, _error).
#
# NOTE(review): entries are restored with Marshal.load from a directory that
# is typically shared between users and uses predictable file names.
# Marshal.load must only be fed trusted data; consider a private cache
# directory or a safer serialisation format.
class Cache
  # Directory that holds every cache file.
  CACHE_DIR = '/tmp'

  # Looks up a previously cached metadata object.
  #
  # @param uri [String] the URI the metadata object was harvested from
  # @return [Object, false] the unmarshalled object, or false on a cache miss
  def self.retrieveMetaObject(uri)
    filename = "#{Digest::MD5.hexdigest(uri)}_meta"
    warn "Checking Meta cache for #{filename}"
    path = cache_path(filename)
    if File.exist?(path)
      warn 'FOUND Meta object in cache'
      meta = Marshal.load(File.binread(path)) # binary read: Marshal data is not text
      warn 'Returning....'
      return meta
    end
    warn 'Meta objectNot Found in Cache'
    false
  end

  # Stores a metadata object under the digest of +uri+.
  #
  # @param meta [Object] any Marshal-able object
  # @param uri [String] the URI used as the cache key
  # @return [Integer] number of bytes written
  def self.cacheMetaObject(meta, uri)
    filename = "#{Digest::MD5.hexdigest(uri)}_meta"
    warn "in cacheMetaObject Writing to cache for #{filename}"
    File.binwrite(cache_path(filename), Marshal.dump(meta))
  end

  # Returns the cached graph for +body+, or an empty graph when there is none.
  #
  # The entry is located through the MD5 digest of +body+ (the same key
  # writeRDFCache uses) and the stored body is compared byte-for-byte, so a
  # different document that merely has the same length can no longer be
  # answered with somebody else's graph.
  #
  # @param body [String] the raw document whose parsed graph is wanted
  # @return [RDF::Graph] the cached statements, or an empty graph on a miss
  def self.checkRDFCache(body:)
    g = RDF::Graph.new
    base = cache_path(Digest::MD5.hexdigest(body))
    bodyfile = "#{base}_graphbody"
    graphfile = "#{base}_graph"
    return g unless File.exist?(bodyfile) && File.exist?(graphfile)
    # compare raw bytes; body.b avoids encoding-tag mismatches in String#==
    return g unless File.binread(bodyfile) == body.b

    warn "RDF Cache File #{base} FOUND"
    cached = Marshal.load(File.binread(graphfile)) # unmarshal it
    # copy statement by statement because the unmarshalled object isn't
    # entirely functional as an RDF::Graph object
    cached.each { |statement| g << statement }
    warn "returning a graph of #{g.size}"
    g
  end

  # Parses nothing itself: drains +reader+ into a graph and stores both the
  # graph and the raw +body+ under the MD5 digest of +body+.
  #
  # @param reader [#each_statement] source of the statements to cache
  # @param body [String] the raw document the statements were parsed from
  # @return [nil]
  def self.writeRDFCache(reader:, body:)
    filename = Digest::MD5.hexdigest(body)
    graph = RDF::Graph.new
    reader.each_statement { |s| graph << s }
    warn "WRITING RDF TO CACHE #{filename}"
    File.binwrite(cache_path("#{filename}_graph"), Marshal.dump(graph))
    File.binwrite(cache_path("#{filename}_graphbody"), body)
    warn "wrote RDF filename: #{filename}"
  end

  # Looks up a cached HTTP response for the +uri+ / +headers+ pair.
  #
  # @param uri [String] the requested URI
  # @param headers [#to_s] the request headers that were sent
  # @return [Array, nil] ['ERROR', nil, nil] when an error was cached,
  #   [head, body, all_uris] on a hit (all_uris is '' when no _uri file
  #   exists), or nil on a miss
  def self.checkCache(uri, headers)
    filename = Digest::MD5.hexdigest(uri + headers.to_s)
    warn "Checking Error cache for #{filename}"
    base = cache_path(filename)
    if File.exist?("#{base}_error")
      warn 'Error file found in cache... returning'
      return ['ERROR', nil, nil]
    end
    if File.exist?("#{base}_head") && File.exist?("#{base}_body")
      warn 'FOUND data in cache'
      head = Marshal.load(File.binread("#{base}_head"))
      body = Marshal.load(File.binread("#{base}_body"))
      all_uris = File.exist?("#{base}_uri") ? Marshal.load(File.binread("#{base}_uri")) : ''
      warn 'Returning....'
      return [head, body, all_uris]
    end
    warn 'Not Found in Cache'
    nil
  end

  # Stores the parts of an HTTP response under the digest of +uri+ + +headers+.
  #
  # @param uri [String] the requested URI
  # @param headers [#to_s] the request headers that were sent
  # @param head [Object] Marshal-able response headers
  # @param body [Object] Marshal-able response body
  # @param all_uris [Object] Marshal-able list of URIs seen while resolving
  # @return [Integer] number of bytes written for the last file
  def self.writeToCache(uri, headers, head, body, all_uris)
    filename = Digest::MD5.hexdigest(uri + headers.to_s)
    warn "in writeToCache Writing to cache for #{filename}"
    base = cache_path(filename)
    File.binwrite("#{base}_head", Marshal.dump(head))
    File.binwrite("#{base}_body", Marshal.dump(body))
    File.binwrite("#{base}_uri", Marshal.dump(all_uris))
  end

  # Records that resolving +uri+ with +headers+ failed.
  #
  # @param uri [String] the requested URI
  # @param headers [#to_s] the request headers that were sent
  # @return [Integer] number of bytes written
  def self.writeErrorToCache(uri, headers)
    filename = Digest::MD5.hexdigest(uri + headers.to_s)
    warn "in writeErrorToCache Writing error to cache for #{filename}"
    File.binwrite(cache_path("#{filename}_error"), 'ERROR')
  end

  # Absolute path of the cache file called +name+.
  def self.cache_path(name)
    File.join(CACHE_DIR, name)
  end
  private_class_method :cache_path
end
|
|
28
120
|
end
|
|
data/lib/signposting_tests.rb
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Collapses the given cite-as links to one link per distinct href and records
# a warning ('007') on @meta when more than one distinct href is present.
# When several links share an href, the last one in +citeas+ is kept.
#
# @param citeas [Enumerable] link objects responding to +href+
# @return [Array] the de-duplicated links, in first-seen href order
def check_for_citeas_conflicts(citeas:)
  @meta.comments << 'INFO: checking for conflicting cite-as links'

  by_href = citeas.each_with_object({}) do |candidate, seen|
    warn "INFO: Adding citeas #{candidate.href} to the testing queue."
    @meta.comments << "INFO: Adding citeas #{candidate.href} to the testing queue."
    seen[candidate.href] = candidate
  end

  unless by_href.size <= 1
    @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
    @meta.warnings << ['007', '', '']
    @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
  end

  by_href.values # return list of unique links
end
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Checks every describedby link against the FAIR Signposting rules and
# records the outcome on @meta (warnings are [code, href, headers] triples).
#
# For each link:
# * no +type+ attribute                       -> warning '005'
# * HEAD request (Accept: type) gets no reply -> warning '008'
# * link has no type, or the response carries
#   no Content-Type header                    -> warning '010'
# * Content-Type differs from the link type   -> warning '009'
#
# @param describedby [Enumerable] link objects responding to +href+ and,
#   optionally, +type+
# @return [Object] the result of +describedby.each+
def check_describedby_rules(describedby:)
  describedby.each do |l|
    unless l.respond_to? 'type'
      @meta.warnings << ['005', l.href, '']
      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
    end
    type = l.type if l.respond_to? 'type'
    type ||= '*/*'
    header = { accept: type }
    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)

    unless response
      @meta.warnings << ['008', l.href, header]
      @meta.comments << "WARN: describedby link doesn't resolve\n"
      next
    end

    responsetype = response.headers[:content_type]
    @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
    # keep only the media type: drop e.g. charset parameters and any
    # whitespace that preceded the ';'
    responsetype = Regexp.last_match(1).to_s.strip if responsetype =~ %r{^(.*/[^;]+)}
    @meta.comments << "INFO: testing content type |#{responsetype}| against |#{type}|\n"

    if responsetype.nil? || type == '*/*'
      # nothing to compare: either side of the comparison is missing
      @meta.warnings << ['010', l.href, header]
      @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
    elsif responsetype == type
      @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
    else
      @meta.warnings << ['009', l.href, header]
      @meta.comments << "WARN: Content type of returned describedby link #{responsetype} does not match the 'type' attribute #{type}\n"
    end
  end
end
|
|
53
|
+
|
|
54
|
+
# Checks every item link against the FAIR Signposting rules and records the
# outcome on @meta (warnings are [code, href, headers] triples).
#
# For each link:
# * no +type+ attribute                       -> warning '011'
# * HEAD request (Accept: type) gets no reply -> warning '014'
# * link has no type, or the response carries
#   no Content-Type header                    -> warning '013'
# * Content-Type does not contain the type    -> warning '012'
#
# @param item [Enumerable] link objects (LinkHeaders::Link) responding to
#   +href+ and, optionally, +type+
# @return [Object] the result of +item.each+
def check_item_rules(item:)
  item.each do |l| # l = LinkHeaders::Link
    unless l.respond_to? 'type'
      @meta.warnings << ['011', l.href, '']
      @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
    end
    type = l.type if l.respond_to? 'type'
    type ||= '*/*'
    header = { accept: type }
    response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)

    unless response
      @meta.warnings << ['014', l.href, header]
      @meta.comments << "WARN: item link doesn't resolve\n"
      next
    end

    content_type = response.headers[:content_type]
    unless content_type && type != '*/*'
      @meta.warnings << ['013', l.href, header]
      @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
      next
    end

    # The media type must be matched literally: characters such as '+' and
    # '.' (application/ld+json, application/vnd.x) are regexp operators, so
    # an unescaped pattern would fail to match its own media type.
    typeregex = Regexp.new(Regexp.escape(type))
    if content_type.match(typeregex)
      warn content_type
      warn typeregex.inspect
      @meta.comments << "INFO: item link responds according to Signposting specifications\n"
    else
      @meta.warnings << ['012', l.href, header]
      @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
    end
  end
end
|
data/lib/warnings.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"001": {
|
|
3
|
-
"message": "Unable to resolve guid using Accept headers
|
|
3
|
+
"message": "Unable to resolve guid using default (*/*) Accept headers",
|
|
4
4
|
"linkout": "",
|
|
5
5
|
"severity": "WARN"
|
|
6
6
|
},
|
|
@@ -68,7 +68,43 @@
|
|
|
68
68
|
"message": "Item link does not resolve",
|
|
69
69
|
"linkout": "",
|
|
70
70
|
"severity": "WARN"
|
|
71
|
-
}
|
|
71
|
+
},
|
|
72
|
+
"015": {
|
|
73
|
+
"message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
|
|
74
|
+
"linkout": "",
|
|
75
|
+
"severity": "WARN"
|
|
76
|
+
},
|
|
77
|
+
"016": {
|
|
78
|
+
"message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
|
|
79
|
+
"linkout": "",
|
|
80
|
+
"severity": "WARN"
|
|
81
|
+
},
|
|
82
|
+
"017": {
|
|
83
|
+
"message": "Metadata format not recognized.",
|
|
84
|
+
"linkout": "",
|
|
85
|
+
"severity": "WARN"
|
|
86
|
+
},
|
|
87
|
+
"018": {
|
|
88
|
+
"message": "RDF parsing error - likely malformed RDF document.",
|
|
89
|
+
"linkout": "",
|
|
90
|
+
"severity": "WARN"
|
|
91
|
+
},
|
|
92
|
+
"019": {
|
|
93
|
+
"message": "HTML parsing error - unable to extract linked data from HTML.",
|
|
94
|
+
"linkout": "",
|
|
95
|
+
"severity": "WARN"
|
|
96
|
+
},
|
|
97
|
+
"020": {
|
|
98
|
+
"message": "XML parsing error - unable to process XML document.",
|
|
99
|
+
"linkout": "",
|
|
100
|
+
"severity": "WARN"
|
|
101
|
+
},
|
|
102
|
+
"021": {
|
|
103
|
+
"message": "JSON parsing error - unable to process JSON document.",
|
|
104
|
+
"linkout": "",
|
|
105
|
+
"severity": "WARN"
|
|
106
|
+
}
|
|
107
|
+
|
|
72
108
|
|
|
73
109
|
|
|
74
110
|
}
|
data/lib/web_utils.rb
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module FspHarvester
|
|
2
2
|
|
|
3
3
|
class WebUtils
|
|
4
|
-
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
|
|
4
|
+
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
|
|
5
5
|
warn 'In fetch routine now. '
|
|
6
6
|
|
|
7
7
|
begin
|
|
@@ -13,19 +13,19 @@ module FspHarvester
|
|
|
13
13
|
# password: pass,
|
|
14
14
|
headers: headers
|
|
15
15
|
})
|
|
16
|
-
|
|
17
|
-
warn "
|
|
18
|
-
warn "
|
|
16
|
+
meta.all_uris |= [response.request.url] # it's possible to call this method without affecting the metadata object being created by the harvester
|
|
17
|
+
warn "starting URL #{url}"
|
|
18
|
+
warn "final URL #{response.request.url}"
|
|
19
19
|
warn "Response code #{response.code}"
|
|
20
|
-
if response.code == 203
|
|
21
|
-
|
|
22
|
-
|
|
20
|
+
if response.code == 203
|
|
21
|
+
meta.warnings << ["002", url, headers]
|
|
22
|
+
meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
|
|
23
23
|
end
|
|
24
24
|
response
|
|
25
25
|
rescue RestClient::ExceptionWithResponse => e
|
|
26
26
|
warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
meta.warnings << ["003", url, headers]
|
|
28
|
+
meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
|
|
29
29
|
if (e.response.code == 500 or e.response.code == 404)
|
|
30
30
|
return false
|
|
31
31
|
else
|
|
@@ -34,14 +34,14 @@ module FspHarvester
|
|
|
34
34
|
# now we are returning the headers and body that were returned
|
|
35
35
|
rescue RestClient::Exception => e
|
|
36
36
|
warn "EXCEPTION WITH NO RESPONSE! #{e}"
|
|
37
|
-
|
|
38
|
-
|
|
37
|
+
meta.warnings << ["003", url, headers]
|
|
38
|
+
meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
|
|
39
39
|
false
|
|
40
40
|
# now we are returning 'False', and we will check that with an \"if\" statement in our main code
|
|
41
41
|
rescue Exception => e
|
|
42
42
|
warn "EXCEPTION UNKNOWN! #{e}"
|
|
43
|
-
|
|
44
|
-
|
|
43
|
+
meta.warnings << ["003", url, headers]
|
|
44
|
+
meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
|
|
45
45
|
false
|
|
46
46
|
# now we are returning 'False', and we will check that with an \"if\" statement in our main code
|
|
47
47
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: fsp_harvester
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.9
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Mark Wilkinson
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2022-08-
|
|
11
|
+
date: 2022-08-05 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: json
|
|
@@ -44,14 +44,14 @@ dependencies:
|
|
|
44
44
|
requirements:
|
|
45
45
|
- - "~>"
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
|
-
version: 0.1.
|
|
47
|
+
version: 0.1.16
|
|
48
48
|
type: :runtime
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
52
|
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
|
-
version: 0.1.
|
|
54
|
+
version: 0.1.16
|
|
55
55
|
- !ruby/object:Gem::Dependency
|
|
56
56
|
name: metainspector
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -171,6 +171,7 @@ executables: []
|
|
|
171
171
|
extensions: []
|
|
172
172
|
extra_rdoc_files: []
|
|
173
173
|
files:
|
|
174
|
+
- ".rspec_status"
|
|
174
175
|
- CHANGELOG.md
|
|
175
176
|
- Gemfile
|
|
176
177
|
- Gemfile.lock
|
|
@@ -180,10 +181,17 @@ files:
|
|
|
180
181
|
- bin/console
|
|
181
182
|
- bin/setup
|
|
182
183
|
- example_test.rb
|
|
184
|
+
- launch.json
|
|
185
|
+
- lib/config.conf_docker
|
|
186
|
+
- lib/config.conf_local
|
|
183
187
|
- lib/constants.rb
|
|
184
188
|
- lib/fsp_harvester.rb
|
|
185
189
|
- lib/fsp_harvester/version.rb
|
|
190
|
+
- lib/fsp_metadata_external_tools.rb
|
|
191
|
+
- lib/fsp_metadata_harvester.rb
|
|
192
|
+
- lib/fsp_metadata_parser.rb
|
|
186
193
|
- lib/metadata_object.rb
|
|
194
|
+
- lib/signposting_tests.rb
|
|
187
195
|
- lib/swagger.rb
|
|
188
196
|
- lib/warnings.json
|
|
189
197
|
- lib/web_utils.rb
|