fsp_harvester 0.1.9 → 0.1.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec_status +53 -53
- data/Gemfile.lock +7 -7
- data/lib/config.conf +8 -0
- data/lib/{fsp_metadata_external_tools.rb → external_tools.rb} +20 -18
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +10 -108
- data/lib/harvester.rb +27 -0
- data/lib/harvester_utils.rb +75 -0
- data/lib/{fsp_metadata_harvester.rb → metadata_harvester.rb} +11 -11
- data/lib/metadata_object.rb +14 -1
- data/lib/{fsp_metadata_parser.rb → metadata_parser.rb} +36 -21
- data/lib/signposting_tests.rb +18 -15
- data/lib/swagger.rb +137 -177
- data/lib/warnings.json +1 -4
- data/lib/web_utils.rb +2 -2
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b38eea15fa26a3fe07290024342f8b6121dbd78c3cd2dd3496ca118fca22f6d4
|
4
|
+
data.tar.gz: a25ea37ecd78b2ef8dc41dca391c161ba4b262d910dbadcf34f07f0cd8e54af5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 83b766f2896a0776ed75ab3fd1e235d2c80173d3ffc0c22aea80b234856392497daf312db36461f03d0ca168228e6e385edf6d789b42dc4b43fcd6a073cda234
|
7
|
+
data.tar.gz: 16d74c199a138db0225e88c0e092a5272a518785c397d27da74c511a69260444e6c1c440021e08413c131d672cd2d528e8d19dbd8a20ca1bff97dad38600c995
|
data/.rspec_status
CHANGED
@@ -1,55 +1,55 @@
|
|
1
1
|
example_id | status | run_time |
|
2
2
|
---------------------------------- | ------ | --------------- |
|
3
|
-
./spec/cite-as_spec.rb[1:1:1] | passed | 1.
|
4
|
-
./spec/cite-as_spec.rb[1:1:2] | passed |
|
5
|
-
./spec/cite-as_spec.rb[1:1:3] | passed |
|
6
|
-
./spec/cite-as_spec.rb[1:1:4] | passed | 1.
|
7
|
-
./spec/cite-as_spec.rb[1:1:5] | passed | 2.
|
8
|
-
./spec/cite-as_spec.rb[1:1:6] | passed |
|
9
|
-
./spec/cite-as_spec.rb[1:1:7] | passed |
|
10
|
-
./spec/cite-as_spec.rb[1:1:8] | passed |
|
11
|
-
./spec/cite-as_spec.rb[1:1:9] | passed | 2.
|
12
|
-
./spec/cite-as_spec.rb[1:1:10] | passed |
|
13
|
-
./spec/cite-as_spec.rb[1:1:11] | passed |
|
14
|
-
./spec/cite-as_spec.rb[1:1:12] | passed | 2.
|
15
|
-
./spec/cite-as_spec.rb[1:1:13] | passed | 2.
|
16
|
-
./spec/cite-as_spec.rb[1:1:14] | passed |
|
17
|
-
./spec/cite-as_spec.rb[1:1:15] | passed | 1.
|
18
|
-
./spec/cite-as_spec.rb[1:1:16] | passed | 1.
|
19
|
-
./spec/cite-as_spec.rb[1:1:17] | passed |
|
20
|
-
./spec/cite-as_spec.rb[1:1:18] | passed | 1.
|
21
|
-
./spec/cite-as_spec.rb[1:1:19] | passed | 1.
|
22
|
-
./spec/cite-as_spec.rb[1:1:20] | passed | 1.
|
23
|
-
./spec/cite-as_spec.rb[1:1:21] | passed |
|
24
|
-
./spec/cite-as_spec.rb[1:1:22] | passed | 1.
|
25
|
-
./spec/cite-as_spec.rb[1:1:23] | passed | 1.
|
26
|
-
./spec/cite-as_spec.rb[1:1:24] | failed | 1.
|
27
|
-
./spec/cite-as_spec.rb[1:1:25] | passed | 0.
|
28
|
-
./spec/describedby_spec.rb[1:1:1] | passed | 2.
|
29
|
-
./spec/describedby_spec.rb[1:1:2] | passed | 1.
|
30
|
-
./spec/describedby_spec.rb[1:1:3] | passed | 1
|
31
|
-
./spec/describedby_spec.rb[1:1:4] | passed | 1.
|
32
|
-
./spec/describedby_spec.rb[1:1:5] | passed | 1.
|
33
|
-
./spec/describedby_spec.rb[1:1:6] | passed |
|
34
|
-
./spec/describedby_spec.rb[1:1:7] | passed | 0.
|
35
|
-
./spec/describedby_spec.rb[1:1:8] | passed | 2.
|
36
|
-
./spec/describedby_spec.rb[1:1:9] | passed | 1.
|
37
|
-
./spec/describedby_spec.rb[1:1:10] | passed |
|
38
|
-
./spec/describedby_spec.rb[1:1:11] | passed |
|
39
|
-
./spec/describedby_spec.rb[1:1:12] | passed |
|
40
|
-
./spec/describedby_spec.rb[1:1:13] | passed | 1.
|
41
|
-
./spec/describedby_spec.rb[1:1:14] | passed |
|
42
|
-
./spec/describedby_spec.rb[1:1:15] | passed |
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] |
|
45
|
-
./spec/item_spec.rb[1:1:1] | passed |
|
46
|
-
./spec/item_spec.rb[1:1:2] | passed | 2.
|
47
|
-
./spec/item_spec.rb[1:1:3] | passed | 1.
|
48
|
-
./spec/item_spec.rb[1:1:4] | passed | 1.
|
49
|
-
./spec/item_spec.rb[1:1:5] | passed |
|
50
|
-
./spec/item_spec.rb[1:1:6] | passed |
|
51
|
-
./spec/item_spec.rb[1:1:7] | passed | 3.
|
52
|
-
./spec/item_spec.rb[1:1:8] | passed | 0.
|
53
|
-
./spec/type_spec.rb[1:1:1] | passed | 1.
|
54
|
-
./spec/type_spec.rb[1:1:2] | passed |
|
55
|
-
./spec/type_spec.rb[1:1:3] | passed | 1.
|
3
|
+
./spec/cite-as_spec.rb[1:1:1] | passed | 1.3 seconds |
|
4
|
+
./spec/cite-as_spec.rb[1:1:2] | passed | 1.21 seconds |
|
5
|
+
./spec/cite-as_spec.rb[1:1:3] | passed | 1.09 seconds |
|
6
|
+
./spec/cite-as_spec.rb[1:1:4] | passed | 1.69 seconds |
|
7
|
+
./spec/cite-as_spec.rb[1:1:5] | passed | 2.72 seconds |
|
8
|
+
./spec/cite-as_spec.rb[1:1:6] | passed | 2.3 seconds |
|
9
|
+
./spec/cite-as_spec.rb[1:1:7] | passed | 3.36 seconds |
|
10
|
+
./spec/cite-as_spec.rb[1:1:8] | passed | 2.26 seconds |
|
11
|
+
./spec/cite-as_spec.rb[1:1:9] | passed | 2.82 seconds |
|
12
|
+
./spec/cite-as_spec.rb[1:1:10] | passed | 2.3 seconds |
|
13
|
+
./spec/cite-as_spec.rb[1:1:11] | passed | 3.37 seconds |
|
14
|
+
./spec/cite-as_spec.rb[1:1:12] | passed | 2.2 seconds |
|
15
|
+
./spec/cite-as_spec.rb[1:1:13] | passed | 2.94 seconds |
|
16
|
+
./spec/cite-as_spec.rb[1:1:14] | passed | 2.44 seconds |
|
17
|
+
./spec/cite-as_spec.rb[1:1:15] | passed | 1.54 seconds |
|
18
|
+
./spec/cite-as_spec.rb[1:1:16] | passed | 1.29 seconds |
|
19
|
+
./spec/cite-as_spec.rb[1:1:17] | passed | 1.25 seconds |
|
20
|
+
./spec/cite-as_spec.rb[1:1:18] | passed | 1.15 seconds |
|
21
|
+
./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
|
22
|
+
./spec/cite-as_spec.rb[1:1:20] | passed | 1.66 seconds |
|
23
|
+
./spec/cite-as_spec.rb[1:1:21] | passed | 2.41 seconds |
|
24
|
+
./spec/cite-as_spec.rb[1:1:22] | passed | 1.64 seconds |
|
25
|
+
./spec/cite-as_spec.rb[1:1:23] | passed | 1.35 seconds |
|
26
|
+
./spec/cite-as_spec.rb[1:1:24] | failed | 1.25 seconds |
|
27
|
+
./spec/cite-as_spec.rb[1:1:25] | passed | 0.51152 seconds |
|
28
|
+
./spec/describedby_spec.rb[1:1:1] | passed | 2.71 seconds |
|
29
|
+
./spec/describedby_spec.rb[1:1:2] | passed | 1.25 seconds |
|
30
|
+
./spec/describedby_spec.rb[1:1:3] | passed | 1.22 seconds |
|
31
|
+
./spec/describedby_spec.rb[1:1:4] | passed | 1.22 seconds |
|
32
|
+
./spec/describedby_spec.rb[1:1:5] | passed | 1.21 seconds |
|
33
|
+
./spec/describedby_spec.rb[1:1:6] | passed | 1.02 seconds |
|
34
|
+
./spec/describedby_spec.rb[1:1:7] | passed | 0.99175 seconds |
|
35
|
+
./spec/describedby_spec.rb[1:1:8] | passed | 2.44 seconds |
|
36
|
+
./spec/describedby_spec.rb[1:1:9] | passed | 1.72 seconds |
|
37
|
+
./spec/describedby_spec.rb[1:1:10] | passed | 2.15 seconds |
|
38
|
+
./spec/describedby_spec.rb[1:1:11] | passed | 3.17 seconds |
|
39
|
+
./spec/describedby_spec.rb[1:1:12] | passed | 3.1 seconds |
|
40
|
+
./spec/describedby_spec.rb[1:1:13] | passed | 1.7 seconds |
|
41
|
+
./spec/describedby_spec.rb[1:1:14] | passed | 2.21 seconds |
|
42
|
+
./spec/describedby_spec.rb[1:1:15] | passed | 2.18 seconds |
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
|
45
|
+
./spec/item_spec.rb[1:1:1] | passed | 3.09 seconds |
|
46
|
+
./spec/item_spec.rb[1:1:2] | passed | 2.92 seconds |
|
47
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.12 seconds |
|
48
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.7 seconds |
|
49
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.24 seconds |
|
50
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.87 seconds |
|
51
|
+
./spec/item_spec.rb[1:1:7] | passed | 3.03 seconds |
|
52
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.52338 seconds |
|
53
|
+
./spec/type_spec.rb[1:1:1] | passed | 1.42 seconds |
|
54
|
+
./spec/type_spec.rb[1:1:2] | passed | 1.28 seconds |
|
55
|
+
./spec/type_spec.rb[1:1:3] | passed | 1.52 seconds |
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fsp_harvester (0.1.
|
4
|
+
fsp_harvester (0.1.12)
|
5
5
|
json (~> 2.0)
|
6
6
|
linkeddata (~> 3.2)
|
7
|
-
linkheaders-processor (~> 0.1.
|
7
|
+
linkheaders-processor (~> 0.1.16)
|
8
8
|
metainspector (~> 5.11.2)
|
9
9
|
parseconfig (~> 1.1)
|
10
10
|
rake (~> 13.0)
|
@@ -36,7 +36,7 @@ GEM
|
|
36
36
|
scanf (~> 1.0)
|
37
37
|
sxp (~> 1.2)
|
38
38
|
unicode-types (~> 1.7)
|
39
|
-
faraday (1.10.
|
39
|
+
faraday (1.10.1)
|
40
40
|
faraday-em_http (~> 1.0)
|
41
41
|
faraday-em_synchrony (~> 1.0)
|
42
42
|
faraday-excon (~> 1.1)
|
@@ -82,13 +82,13 @@ GEM
|
|
82
82
|
concurrent-ruby (~> 1.0)
|
83
83
|
json (2.6.2)
|
84
84
|
json-canonicalization (0.3.0)
|
85
|
-
json-ld (3.2.
|
85
|
+
json-ld (3.2.3)
|
86
86
|
htmlentities (~> 4.3)
|
87
87
|
json-canonicalization (~> 0.3)
|
88
88
|
link_header (~> 0.0, >= 0.0.8)
|
89
89
|
multi_json (~> 1.15)
|
90
90
|
rack (~> 2.2)
|
91
|
-
rdf (~> 3.2)
|
91
|
+
rdf (~> 3.2, >= 3.2.9)
|
92
92
|
json-ld-preloaded (3.2.0)
|
93
93
|
json-ld (~> 3.2)
|
94
94
|
rdf (~> 3.2)
|
@@ -126,7 +126,7 @@ GEM
|
|
126
126
|
shex (~> 0.7)
|
127
127
|
sparql (~> 3.2)
|
128
128
|
sparql-client (~> 3.2)
|
129
|
-
linkheaders-processor (0.1.
|
129
|
+
linkheaders-processor (0.1.16)
|
130
130
|
json (~> 2.0)
|
131
131
|
json-ld (~> 3.2)
|
132
132
|
json-ld-preloaded (~> 3.2)
|
@@ -166,7 +166,7 @@ GEM
|
|
166
166
|
rack (2.2.4)
|
167
167
|
rainbow (3.1.1)
|
168
168
|
rake (13.0.6)
|
169
|
-
rdf (3.2.
|
169
|
+
rdf (3.2.9)
|
170
170
|
link_header (~> 0.0, >= 0.0.8)
|
171
171
|
rdf-aggregate-repo (3.2.1)
|
172
172
|
rdf (~> 3.2)
|
data/lib/config.conf
ADDED
@@ -1,12 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
module
|
3
|
+
module HarvesterTools
|
4
4
|
class Error < StandardError
|
5
5
|
end
|
6
6
|
|
7
7
|
class ExternalTools
|
8
8
|
|
9
|
-
def initialize(metadata:
|
9
|
+
def initialize(metadata: HarvesterTools::MetadataObject.new)
|
10
10
|
@meta = metadata
|
11
11
|
end
|
12
12
|
|
@@ -25,10 +25,7 @@ module FspHarvester
|
|
25
25
|
file.rewind
|
26
26
|
|
27
27
|
@meta.comments << "INFO: The message body is being examined by Distiller\n"
|
28
|
-
|
29
|
-
command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
30
|
-
# command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
31
|
-
# command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
|
28
|
+
command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
32
29
|
warn "distiller command: #{command}"
|
33
30
|
result, _stderr, _status = Open3.capture3(command)
|
34
31
|
warn ''
|
@@ -40,13 +37,14 @@ module FspHarvester
|
|
40
37
|
warn "DIST RESULT: #{result}"
|
41
38
|
if result !~ /@context/i # failure returns nil
|
42
39
|
@meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
|
43
|
-
@meta.
|
40
|
+
@meta.add_warning(['018', '', ''])
|
41
|
+
result = "{}"
|
44
42
|
else
|
45
43
|
@meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
|
46
|
-
parse_rdf(result: result, content_type: "application/ld+json")
|
47
44
|
end
|
48
45
|
@@distillerknown[bhash] = true
|
49
46
|
end
|
47
|
+
result
|
50
48
|
end
|
51
49
|
|
52
50
|
def processs_with_extruct(uri:)
|
@@ -55,28 +53,32 @@ module FspHarvester
|
|
55
53
|
stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
|
56
54
|
warn "open3 status: #{status} #{stdout}"
|
57
55
|
result = stderr # absurd that the output comes over stderr! LOL!
|
56
|
+
jsonld = {}
|
57
|
+
microdata = Hash.new
|
58
|
+
microformat = Hash.new
|
59
|
+
opengraph = Hash.new
|
60
|
+
rdfa = Hash.new
|
58
61
|
|
59
62
|
if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
|
60
63
|
@meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
|
61
|
-
@meta.
|
64
|
+
@meta.add_warning(['019', '', ''])
|
62
65
|
if result.to_s.match(/(ValueError:.*?)\n/)
|
63
66
|
@meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
|
64
|
-
@meta.
|
67
|
+
@meta.add_warning(['019', '', ''])
|
65
68
|
end
|
66
69
|
elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
|
67
70
|
json = JSON.parse result
|
68
71
|
@meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
@meta.merge_hash(json.first) if json.first.is_a? Hash
|
72
|
+
jsonld = json['json-ld'].to_json if json['json-ld'].any?
|
73
|
+
microdata = json['microdata'].first if json['microdata'].any
|
74
|
+
microformat = json['microformat'].first if json['microformat'].any?
|
75
|
+
opengraph = json['opengraph'].first if json['opengraph'].any?
|
76
|
+
rdfa = json['rdfa'].to_json if json['rdfa'].any?
|
77
|
+
# @meta.merge_hash(json.first) if json.first.is_a? Hash
|
77
78
|
else
|
78
79
|
@meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
|
79
80
|
end
|
81
|
+
[jsonld, microdata, microformat, opengraph, rdfa]
|
80
82
|
end
|
81
83
|
end
|
82
84
|
end
|
data/lib/fsp_harvester.rb
CHANGED
@@ -1,121 +1,23 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require_relative 'fsp_harvester/version'
|
4
|
-
require 'json/ld'
|
5
|
-
require 'json/ld/preloaded'
|
6
|
-
require 'json'
|
7
|
-
require 'linkheaders/processor'
|
8
|
-
require 'addressable'
|
9
|
-
require 'tempfile'
|
10
|
-
require 'xmlsimple'
|
11
|
-
require 'nokogiri'
|
12
|
-
require 'parseconfig'
|
13
|
-
require 'rest-client'
|
14
|
-
require 'cgi'
|
15
|
-
require 'digest'
|
16
|
-
require 'open3'
|
17
|
-
require 'metainspector'
|
18
|
-
require 'rdf/xsd'
|
19
|
-
require_relative './metadata_object'
|
20
|
-
require_relative './constants'
|
21
|
-
require_relative './web_utils'
|
22
|
-
require_relative './signposting_tests'
|
23
|
-
require_relative './fsp_metadata_harvester'
|
24
|
-
require_relative './fsp_metadata_parser'
|
25
|
-
|
26
1
|
|
2
|
+
require_relative 'harvester'
|
27
3
|
module FspHarvester
|
28
4
|
class Error < StandardError
|
29
5
|
end
|
30
6
|
|
31
7
|
class Utils
|
32
|
-
# @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
|
33
|
-
# @warnings = JSON.parse(File.read("warnings.json"))
|
34
|
-
|
35
|
-
|
36
|
-
def self.resolve_guid(guid:)
|
37
|
-
@meta = FspHarvester::MetadataObject.new
|
38
|
-
@meta.all_uris = [guid]
|
39
|
-
type, url = convertToURL(guid: guid)
|
40
|
-
links = Array.new
|
41
|
-
if type
|
42
|
-
links = resolve_url(url: url)
|
43
|
-
@meta.links << links
|
44
|
-
else
|
45
|
-
@meta.warnings << ['006', guid, '']
|
46
|
-
@meta.comments << "FATAL: GUID type not recognized.\n"
|
47
|
-
end
|
48
|
-
[links, @meta]
|
49
|
-
end
|
50
8
|
|
51
|
-
def self.gather_metadata_from_describedby_links(links: [], metadata:
|
9
|
+
def self.gather_metadata_from_describedby_links(links: [], metadata: HarvesterTools::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
|
52
10
|
@meta = metadata
|
53
11
|
db = []
|
54
12
|
links.each do |l|
|
55
13
|
db << l if l.relation == 'describedby'
|
56
14
|
end
|
57
|
-
|
15
|
+
HarvesterTools::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
|
58
16
|
@meta
|
59
17
|
end
|
60
18
|
|
61
|
-
def self.
|
62
|
-
|
63
|
-
if k == 'inchi' and regex.match(guid)
|
64
|
-
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
65
|
-
elsif k == 'handle1' and regex.match(guid)
|
66
|
-
return 'handle', "http://hdl.handle.net/#{guid}"
|
67
|
-
elsif k == 'handle2' and regex.match(guid)
|
68
|
-
return 'handle', "http://hdl.handle.net/#{guid}"
|
69
|
-
elsif k == 'uri' and regex.match(guid)
|
70
|
-
return 'uri', guid
|
71
|
-
elsif k == 'doi' and regex.match(guid)
|
72
|
-
return 'doi', "https://doi.org/#{guid}"
|
73
|
-
end
|
74
|
-
end
|
75
|
-
[nil, nil]
|
76
|
-
end
|
77
|
-
|
78
|
-
def self.typeit(guid:)
|
79
|
-
Utils::GUID_TYPES.each do |type, regex|
|
80
|
-
return type if regex.match(guid)
|
81
|
-
end
|
82
|
-
false
|
83
|
-
end
|
84
|
-
|
85
|
-
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
|
86
|
-
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
87
|
-
warn "\n\n FETCHING #{url} #{header}\n\n"
|
88
|
-
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
|
89
|
-
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
90
|
-
|
91
|
-
unless response
|
92
|
-
@meta.warnings << ['001', url, header]
|
93
|
-
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
94
|
-
return []
|
95
|
-
end
|
96
|
-
|
97
|
-
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
98
|
-
@meta.full_response << response.body
|
99
|
-
|
100
|
-
links = process_link_headers(response: response) unless nolinkheaders
|
101
|
-
links
|
102
|
-
end
|
103
|
-
|
104
|
-
def self.process_link_headers(response:)
|
105
|
-
warn "\n\n parsing #{response.headers}\n\n"
|
106
|
-
|
107
|
-
parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
|
108
|
-
parser.extract_and_parse(response: response)
|
109
|
-
factory = parser.factory # LinkHeaders::LinkFactory
|
110
|
-
|
111
|
-
warn "\n\n length bfore #{factory.all_links.length}\n\n"
|
112
|
-
signpostingcheck(factory: factory)
|
113
|
-
warn "\n\n length aftr #{factory.all_links.length}\n\n"
|
114
|
-
warn "\n\n links #{factory.all_links}\n\n"
|
115
|
-
factory.all_links
|
116
|
-
end
|
117
|
-
|
118
|
-
def self.signpostingcheck(factory:)
|
19
|
+
def self.signpostingcheck(factory:, metadata: HarvesterTools::MetadataObject.new)
|
20
|
+
@meta = metadata
|
119
21
|
citeas = Array.new
|
120
22
|
describedby = Array.new
|
121
23
|
item = Array.new
|
@@ -134,22 +36,22 @@ module FspHarvester
|
|
134
36
|
end
|
135
37
|
end
|
136
38
|
|
137
|
-
check_describedby_rules(describedby: describedby)
|
138
|
-
check_item_rules(item: item)
|
39
|
+
check_describedby_rules(describedby: describedby, metadata: @meta)
|
40
|
+
check_item_rules(item: item, metadata: @meta)
|
139
41
|
|
140
42
|
if citeas.length > 1
|
141
43
|
warn "INFO: multiple cite-as links found. Checking for conflicts\n"
|
142
44
|
@meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
|
143
|
-
citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
45
|
+
citeas = check_for_citeas_conflicts(citeas: citeas, metadata: @meta) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
144
46
|
end
|
145
47
|
|
146
48
|
unless citeas.length == 1 && describedby.length > 0
|
147
|
-
@meta.
|
49
|
+
@meta.add_warning(['004', '', ''])
|
148
50
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
|
149
51
|
end
|
150
52
|
|
151
53
|
unless types.length >=1
|
152
|
-
@meta.
|
54
|
+
@meta.add_warning(['015', '', ''])
|
153
55
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
|
154
56
|
end
|
155
57
|
end
|
data/lib/harvester.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#require_relative 'fsp_harvester/version'
|
4
|
+
require 'json/ld'
|
5
|
+
require 'json/ld/preloaded'
|
6
|
+
require 'json'
|
7
|
+
require 'linkheaders/processor'
|
8
|
+
require 'addressable'
|
9
|
+
require 'tempfile'
|
10
|
+
require 'xmlsimple'
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'parseconfig'
|
13
|
+
require 'rest-client'
|
14
|
+
require 'cgi'
|
15
|
+
require 'digest'
|
16
|
+
require 'open3'
|
17
|
+
require 'metainspector'
|
18
|
+
require 'rdf/xsd'
|
19
|
+
require_relative './metadata_object'
|
20
|
+
require_relative './constants'
|
21
|
+
require_relative './web_utils'
|
22
|
+
require_relative './signposting_tests'
|
23
|
+
require_relative './metadata_harvester'
|
24
|
+
require_relative './fsp_harvester'
|
25
|
+
require_relative './harvester_utils'
|
26
|
+
require_relative './external_tools'
|
27
|
+
require_relative './metadata_parser'
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module HarvesterTools
|
2
|
+
class Error < StandardError
|
3
|
+
end
|
4
|
+
|
5
|
+
class Utils
|
6
|
+
|
7
|
+
def self.resolve_guid(guid:)
|
8
|
+
@meta = HarvesterTools::MetadataObject.new
|
9
|
+
@meta.all_uris = [guid]
|
10
|
+
type, url = convertToURL(guid: guid)
|
11
|
+
links = Array.new
|
12
|
+
if type
|
13
|
+
links = resolve_url(url: url)
|
14
|
+
@meta.links = @meta.links | links
|
15
|
+
else
|
16
|
+
@meta.add_warning(['006', guid, ''])
|
17
|
+
@meta.comments << "FATAL: GUID type not recognized.\n"
|
18
|
+
end
|
19
|
+
[links, @meta]
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.convertToURL(guid:)
|
23
|
+
GUID_TYPES.each do |k, regex|
|
24
|
+
if k == 'inchi' and regex.match(guid)
|
25
|
+
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
26
|
+
elsif k == 'handle1' and regex.match(guid)
|
27
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
28
|
+
elsif k == 'handle2' and regex.match(guid)
|
29
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
30
|
+
elsif k == 'uri' and regex.match(guid)
|
31
|
+
return 'uri', guid
|
32
|
+
elsif k == 'doi' and regex.match(guid)
|
33
|
+
return 'doi', "https://doi.org/#{guid}"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
[nil, nil]
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.typeit(guid:)
|
40
|
+
GUID_TYPES.each do |type, regex|
|
41
|
+
return type if regex.match(guid)
|
42
|
+
end
|
43
|
+
false
|
44
|
+
end
|
45
|
+
|
46
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
|
47
|
+
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
48
|
+
warn "\n\n FETCHING #{url} #{header}\n\n"
|
49
|
+
response = HarvesterTools::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
|
50
|
+
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
51
|
+
|
52
|
+
unless response
|
53
|
+
@meta.add_warning(['001', url, header])
|
54
|
+
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
55
|
+
return []
|
56
|
+
end
|
57
|
+
|
58
|
+
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
59
|
+
@meta.full_response << response.body
|
60
|
+
|
61
|
+
links = process_link_headers(response: response) unless nolinkheaders
|
62
|
+
links
|
63
|
+
end
|
64
|
+
|
65
|
+
def self.process_link_headers(response:)
|
66
|
+
warn "\n\n parsing #{response.headers}\n\n"
|
67
|
+
|
68
|
+
parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
|
69
|
+
parser.extract_and_parse(response: response)
|
70
|
+
factory = parser.factory # LinkHeaders::LinkFactory
|
71
|
+
FspHarvester::Utils.signpostingcheck(factory: factory, metadata: @meta)
|
72
|
+
factory.all_links
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -1,17 +1,17 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
module
|
3
|
+
module HarvesterTools
|
4
4
|
class Error < StandardError
|
5
5
|
end
|
6
6
|
|
7
7
|
class MetadataHarvester
|
8
|
-
def self.extract_metadata(links: [], metadata:
|
8
|
+
def self.extract_metadata(links: [], metadata: HarvesterTools::MetadataObject.new)
|
9
9
|
@meta = metadata
|
10
10
|
@meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'
|
11
11
|
|
12
12
|
describedby = links.select { |l| l if l.relation == 'describedby' }
|
13
13
|
|
14
|
-
hvst =
|
14
|
+
hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
|
15
15
|
describedby.each do |link|
|
16
16
|
accepttype = ACCEPT_STAR_HEADER
|
17
17
|
accept = link.respond_to?('type') ? link.type : nil
|
@@ -21,7 +21,7 @@ module FspHarvester
|
|
21
21
|
|
22
22
|
abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
|
23
23
|
unless abbreviation
|
24
|
-
@meta.
|
24
|
+
@meta.add_warning(['017', url, header])
|
25
25
|
@meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
|
26
26
|
next
|
27
27
|
end
|
@@ -30,16 +30,16 @@ module FspHarvester
|
|
30
30
|
case abbreviation
|
31
31
|
when 'html'
|
32
32
|
@meta.comments << 'INFO: Processing html'
|
33
|
-
hvst.process_html(body: response.body, uri: link)
|
33
|
+
hvst.process_html(body: response.body, uri: link, metadata: @meta)
|
34
34
|
when 'xml'
|
35
35
|
@meta.comments << 'INFO: Processing xml'
|
36
|
-
hvst.process_xml(body: response.body)
|
36
|
+
hvst.process_xml(body: response.body, metadata: @meta)
|
37
37
|
when 'json'
|
38
38
|
@meta.comments << 'INFO: Processing json'
|
39
|
-
hvst.process_json(body: response.body)
|
39
|
+
hvst.process_json(body: response.body, metadata: @meta)
|
40
40
|
when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
|
41
41
|
@meta.comments << 'INFO: Processing linked data'
|
42
|
-
hvst.process_ld(body: response.body, content_type: content_type)
|
42
|
+
hvst.process_ld(body: response.body, content_type: content_type, metadata: @meta)
|
43
43
|
when 'specialist'
|
44
44
|
warn 'no specialized parsers so far'
|
45
45
|
end
|
@@ -54,9 +54,9 @@ module FspHarvester
|
|
54
54
|
@meta.comments << "INFO: link #{link.href} has no MIME type, defaulting to */*"
|
55
55
|
end
|
56
56
|
url = link.href
|
57
|
-
response =
|
57
|
+
response = HarvesterTools::WebUtils.fspfetch(url: url, method: :get, headers: header)
|
58
58
|
unless response
|
59
|
-
@meta.
|
59
|
+
@meta.add_warning(['016', url, header])
|
60
60
|
@meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
|
61
61
|
end
|
62
62
|
response
|
@@ -87,7 +87,7 @@ module FspHarvester
|
|
87
87
|
end
|
88
88
|
|
89
89
|
unless content_type
|
90
|
-
@meta.
|
90
|
+
@meta.add_warning(['017', url, header])
|
91
91
|
@meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
|
92
92
|
end
|
93
93
|
[abbreviation, content_type]
|
data/lib/metadata_object.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
module
|
1
|
+
module HarvesterTools
|
2
2
|
class MetadataObject
|
3
3
|
attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
4
4
|
|
@@ -10,6 +10,9 @@ module FspHarvester
|
|
10
10
|
@full_response = []
|
11
11
|
@links = []
|
12
12
|
@all_uris = []
|
13
|
+
w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
|
14
|
+
#@warn = File.read("./lib/warnings.json")
|
15
|
+
@warn = JSON.parse(w)
|
13
16
|
end
|
14
17
|
|
15
18
|
def merge_hash(hash)
|
@@ -25,6 +28,16 @@ module FspHarvester
|
|
25
28
|
def rdf
|
26
29
|
graph
|
27
30
|
end
|
31
|
+
|
32
|
+
def add_warning(warning)
|
33
|
+
id = warning[0]
|
34
|
+
url = warning[1]
|
35
|
+
headers = warning[2]
|
36
|
+
message = @warn[id]['message']
|
37
|
+
linkout = @warn[id]['linkout']
|
38
|
+
severity = @warn[id]['severity']
|
39
|
+
self.warnings << {"id" => id, "message" => message, "severity" => severity, "linkout" => linkout, "processed_url" => url, "accept_headers": headers}
|
40
|
+
end
|
28
41
|
end
|
29
42
|
|
30
43
|
class Cache
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
module
|
3
|
+
module HarvesterTools
|
4
4
|
class Error < StandardError
|
5
5
|
end
|
6
6
|
|
@@ -9,63 +9,78 @@ module FspHarvester
|
|
9
9
|
|
10
10
|
@@distillerknown = {}
|
11
11
|
|
12
|
-
def initialize(metadata_object:
|
12
|
+
def initialize(metadata_object: HarvesterTools::MetadataObject.new)
|
13
13
|
@meta = metadata_object
|
14
14
|
end
|
15
15
|
|
16
|
-
def process_html(body:, uri:)
|
17
|
-
|
18
|
-
tools.
|
19
|
-
tools.
|
16
|
+
def process_html(body:, uri:, metadata:)
|
17
|
+
@meta = metadata
|
18
|
+
tools = HarvesterTools::ExternalTools.new(metadata: @meta)
|
19
|
+
result = tools.process_with_distiller(body: body)
|
20
|
+
|
21
|
+
jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
|
22
|
+
parse_rdf(body: jsonld, content_type: 'application/ld+json')
|
23
|
+
@meta.merge_hash(microdata)
|
24
|
+
@meta.merge_hash(microformat)
|
25
|
+
@meta.merge_hash(opengraph)
|
26
|
+
parse_rdf(body: rdfa, content_type: 'application/ld+json')
|
20
27
|
end
|
21
28
|
|
22
|
-
def process_xml(body:)
|
29
|
+
def process_xml(body:, metadata:)
|
30
|
+
@meta = metadata
|
23
31
|
begin
|
24
32
|
hash = XmlSimple.xml_in(body)
|
25
33
|
rescue
|
26
34
|
@meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
|
27
|
-
@meta.
|
35
|
+
@meta.add_warning(['020', '', ''])
|
28
36
|
end
|
29
37
|
@meta.comments << "INFO: The XML is being merged in the metadata object\n"
|
30
38
|
@meta.hash.merge hash
|
31
39
|
end
|
32
40
|
|
33
|
-
def process_json(body:)
|
41
|
+
def process_json(body:, metadata:)
|
42
|
+
@meta = metadata
|
34
43
|
begin
|
35
44
|
hash = JSON.parse(body)
|
36
45
|
rescue
|
37
46
|
@meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
|
38
|
-
@meta.
|
47
|
+
@meta.add_warning(['021', '', ''])
|
39
48
|
end
|
40
49
|
@meta.comments << "INFO: The JSON is being merged in the metadata object\n"
|
41
50
|
@meta.hash.merge hash
|
42
51
|
end
|
43
52
|
|
44
|
-
def process_ld(body:, content_type:)
|
45
|
-
|
53
|
+
def process_ld(body:, content_type:, metadata:)
|
54
|
+
@meta = metadata
|
55
|
+
parse_rdf(body: body, content_type: content_type, metadata: @meta)
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse_rdf(body:, content_type:, metadata:)
|
59
|
+
self.class.parse_rdf(body: body, content_type: content_type, metadata: metadata)
|
46
60
|
end
|
47
61
|
|
48
|
-
def parse_rdf(body:, content_type:)
|
62
|
+
def self.parse_rdf(body:, content_type:, metadata:)
|
63
|
+
@meta = metadata
|
49
64
|
unless body
|
50
65
|
@meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
|
51
|
-
@meta.
|
66
|
+
@meta.add_warning(['018', '', ''])
|
52
67
|
return
|
53
68
|
end
|
54
69
|
|
55
70
|
unless body.match(/\w/)
|
56
71
|
@meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
|
57
|
-
@meta.
|
72
|
+
@meta.add_warning(['018', '', ''])
|
58
73
|
return
|
59
74
|
end
|
60
75
|
|
61
76
|
rdfformat = RDF::Format.for(content_type: content_type)
|
62
77
|
unless rdfformat
|
63
78
|
@meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
|
64
|
-
@meta.
|
79
|
+
@meta.add_warning(['018', '', ''])
|
65
80
|
return
|
66
81
|
end
|
67
82
|
|
68
|
-
graph =
|
83
|
+
graph = HarvesterTools::Cache.checkRDFCache(body: body)
|
69
84
|
if graph.size > 0
|
70
85
|
warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
|
71
86
|
@meta.merge_rdf(graph.to_a)
|
@@ -77,7 +92,7 @@ module FspHarvester
|
|
77
92
|
reader = rdfformat.reader.new(body)
|
78
93
|
rescue Exception => e
|
79
94
|
@meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
|
80
|
-
@meta.
|
95
|
+
@meta.add_warning(['018', '', ''])
|
81
96
|
return
|
82
97
|
end
|
83
98
|
|
@@ -88,7 +103,7 @@ module FspHarvester
|
|
88
103
|
end
|
89
104
|
reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
|
90
105
|
warn 'WRITING TO CACHE'
|
91
|
-
|
106
|
+
HarvesterTools::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
|
92
107
|
warn 'WRITING DONE'
|
93
108
|
reader = rdfformat.reader.new(body) # frustrating that we cannot rewind!
|
94
109
|
warn 'RE-READING DONE'
|
@@ -97,11 +112,11 @@ module FspHarvester
|
|
97
112
|
rescue RDF::ReaderError => e
|
98
113
|
@meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
|
99
114
|
warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
|
100
|
-
@meta.
|
115
|
+
@meta.add_warning(['018', '', ''])
|
101
116
|
rescue Exception => e
|
102
117
|
meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
|
103
118
|
warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body}). Moving on...\n"
|
104
|
-
@meta.
|
119
|
+
@meta.add_warning(['018', '', ''])
|
105
120
|
end
|
106
121
|
end
|
107
122
|
end
|
data/lib/signposting_tests.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
def check_for_citeas_conflicts(citeas: )
|
1
|
+
def check_for_citeas_conflicts(citeas:, metadata: )
|
2
|
+
@meta = metadata
|
2
3
|
@meta.comments << 'INFO: checking for conflicting cite-as links'
|
3
4
|
citeas_hrefs = Hash.new
|
4
5
|
citeas.each do |link|
|
@@ -6,26 +7,27 @@ def check_for_citeas_conflicts(citeas: )
|
|
6
7
|
@meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
|
7
8
|
citeas_hrefs[link.href] = link
|
8
9
|
end
|
9
|
-
|
10
|
+
#warn "finalhash #{citeas_hrefs}"
|
10
11
|
if citeas_hrefs.length > 1
|
11
12
|
@meta.comments << 'INFO: Found multiple non-identical cite-as links.'
|
12
|
-
@meta.
|
13
|
+
@meta.add_warning(['007', '', ''])
|
13
14
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
|
14
15
|
end
|
15
16
|
citeas_hrefs.values # return list of unique links
|
16
17
|
end
|
17
18
|
|
18
19
|
|
19
|
-
def check_describedby_rules(describedby:)
|
20
|
+
def check_describedby_rules(describedby:, metadata:)
|
21
|
+
@meta = metadata
|
20
22
|
describedby.each do |l|
|
21
23
|
unless l.respond_to? 'type'
|
22
|
-
@meta.
|
24
|
+
@meta.add_warning(['005', l.href, ''])
|
23
25
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
|
24
26
|
end
|
25
27
|
type = l.type if l.respond_to? 'type'
|
26
28
|
type ||= '*/*'
|
27
29
|
header = { accept: type }
|
28
|
-
response =
|
30
|
+
response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
|
29
31
|
if response
|
30
32
|
responsetype = response.headers[:content_type]
|
31
33
|
@meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
|
@@ -37,30 +39,31 @@ def check_describedby_rules(describedby:)
|
|
37
39
|
if responsetype == type
|
38
40
|
@meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
|
39
41
|
else
|
40
|
-
@meta.
|
42
|
+
@meta.add_warning(['009', l.href, header])
|
41
43
|
@meta.comments << "WARN: Content type of returned describedby link #{responsetype}does not match the 'type' attribute #{type}\n"
|
42
44
|
end
|
43
45
|
else
|
44
|
-
@meta.
|
46
|
+
@meta.add_warning(['010', l.href, header])
|
45
47
|
@meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
|
46
48
|
end
|
47
49
|
else
|
48
|
-
@meta.
|
50
|
+
@meta.add_warning(['008', l.href, header])
|
49
51
|
@meta.comments << "WARN: describedby link doesn't resolve\n"
|
50
52
|
end
|
51
53
|
end
|
52
54
|
end
|
53
55
|
|
54
|
-
def check_item_rules(item:)
|
56
|
+
def check_item_rules(item:, metadata:)
|
57
|
+
@meta = metadata
|
55
58
|
item.each do |l| # l = LinkHeaders::Link
|
56
59
|
unless l.respond_to? 'type'
|
57
|
-
@meta.
|
60
|
+
@meta.add_warning(['011', l.href, ''])
|
58
61
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
|
59
62
|
end
|
60
63
|
type = l.type if l.respond_to? 'type'
|
61
64
|
type ||= '*/*' # this becomes a frozen string
|
62
65
|
header = { accept: type }
|
63
|
-
response =
|
66
|
+
response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
|
64
67
|
|
65
68
|
if response
|
66
69
|
if response.headers[:content_type] and type != '*/*'
|
@@ -72,15 +75,15 @@ def check_item_rules(item:)
|
|
72
75
|
warn typeregex.inspect
|
73
76
|
@meta.comments << "INFO: item link responds according to Signposting specifications\n"
|
74
77
|
else
|
75
|
-
@meta.
|
78
|
+
@meta.add_warning(['012', l.href, header])
|
76
79
|
@meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
|
77
80
|
end
|
78
81
|
else
|
79
|
-
@meta.
|
82
|
+
@meta.add_warning(['013', l.href, header])
|
80
83
|
@meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
|
81
84
|
end
|
82
85
|
else
|
83
|
-
@meta.
|
86
|
+
@meta.add_warning(['014', l.href, header])
|
84
87
|
@meta.comments << "WARN: item link doesn't resolve\n"
|
85
88
|
end
|
86
89
|
end
|
data/lib/swagger.rb
CHANGED
@@ -1,64 +1,39 @@
|
|
1
|
-
class Swagger
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
@protocol = params.fetch(:protocol, "https")
|
38
|
-
@basePath = params.fetch(:basePath)
|
39
|
-
@path = params.fetch(:path)
|
40
|
-
@response_description = params.fetch(:response_description)
|
41
|
-
@schemas = params.fetch(:schemas, [])
|
42
|
-
@comments = params.fetch(:comments, [])
|
43
|
-
@fairsharing_key_location = params.fetch(:fairsharing_key_location)
|
44
|
-
@score = params.fetch(:score, 0)
|
45
|
-
@testedGUID = params.fetch(:testedGUID, "")
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
def fairsharing_key
|
54
|
-
return @fairsharing_key_location
|
55
|
-
end
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
def getSwagger
|
60
|
-
|
61
|
-
message = <<"EOF_EOF"
|
1
|
+
class Swagger
|
2
|
+
attr_accessor :debug, :title, :tests_metric, :description, :applies_to_principle, :organization, :org_url,
|
3
|
+
:responsible_developer, :email, :developer_ORCiD, :protocol, :host, :basePath, :path,
|
4
|
+
:response_description, :schemas, :comments, :fairsharing_key_location, :score, :testedGUID
|
5
|
+
|
6
|
+
def initialize(params = {})
|
7
|
+
@debug = params.fetch(:debug, false)
|
8
|
+
|
9
|
+
@title = params.fetch(:title, 'unnamed')
|
10
|
+
@tests_metric = params.fetch(:tests_metric)
|
11
|
+
@description = params.fetch(:description, 'default_description')
|
12
|
+
@applies_to_principle = params.fetch(:applies_to_principle, 'some principle')
|
13
|
+
@version = params.fetch(:version, '0.1')
|
14
|
+
@organization = params.fetch(:organization, 'Some Organization')
|
15
|
+
@org_url = params.fetch(:org_url)
|
16
|
+
@responsible_develper = params.fetch(:responsible_developer, 'Some Person')
|
17
|
+
@email = params.fetch(:email)
|
18
|
+
@developer_ORCiD = params.fetch(:developer_ORCiD)
|
19
|
+
@host = params.fetch(:host)
|
20
|
+
@protocol = params.fetch(:protocol, 'https')
|
21
|
+
@basePath = params.fetch(:basePath)
|
22
|
+
@path = params.fetch(:path)
|
23
|
+
@response_description = params.fetch(:response_description)
|
24
|
+
@schemas = params.fetch(:schemas, [])
|
25
|
+
@comments = params.fetch(:comments, [])
|
26
|
+
@fairsharing_key_location = params.fetch(:fairsharing_key_location)
|
27
|
+
@score = params.fetch(:score, 0)
|
28
|
+
@testedGUID = params.fetch(:testedGUID, '')
|
29
|
+
end
|
30
|
+
|
31
|
+
def fairsharing_key
|
32
|
+
@fairsharing_key_location
|
33
|
+
end
|
34
|
+
|
35
|
+
def getSwagger
|
36
|
+
message = <<"EOF_EOF"
|
62
37
|
swagger: '2.0'
|
63
38
|
info:
|
64
39
|
version: '#{@version}'
|
@@ -89,7 +64,7 @@ class Swagger
|
|
89
64
|
$ref: '#/definitions/schemas'
|
90
65
|
consumes:
|
91
66
|
- application/json
|
92
|
-
produces
|
67
|
+
produces:#{' '}
|
93
68
|
- application/json
|
94
69
|
responses:
|
95
70
|
"200":
|
@@ -98,127 +73,112 @@ class Swagger
|
|
98
73
|
definitions:
|
99
74
|
schemas:
|
100
75
|
required:
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
76
|
+
EOF_EOF
|
77
|
+
|
78
|
+
schemas.keys.each do |key|
|
79
|
+
message += " - #{key}\n"
|
80
|
+
end
|
81
|
+
message += " properties:\n"
|
82
|
+
schemas.keys.each do |key|
|
83
|
+
message += " #{key}:\n"
|
84
|
+
message += " type: #{schemas[key][0]}\n"
|
85
|
+
message += " description: >-\n"
|
86
|
+
message += " #{schemas[key][1]}\n"
|
87
|
+
end
|
88
|
+
|
89
|
+
message
|
90
|
+
end
|
91
|
+
|
92
|
+
# A utility function that SHOULD NOT BE CALLED EXTERNALLY
|
93
|
+
#
|
94
|
+
# @param s - subject node
|
95
|
+
# @param p - predicate node
|
96
|
+
# @param o - object node
|
97
|
+
# @param repo - an RDF::Graph object
|
98
|
+
def triplify(s, p, o, repo)
|
99
|
+
s = s.strip if s.instance_of?(String)
|
100
|
+
p = p.strip if p.instance_of?(String)
|
101
|
+
o = o.strip if o.instance_of?(String)
|
102
|
+
|
103
|
+
unless s.respond_to?('uri')
|
104
|
+
|
105
|
+
if s.to_s =~ %r{^\w+:/?/?[^\s]+}
|
106
|
+
s = RDF::URI.new(s.to_s)
|
107
|
+
else
|
108
|
+
debug and warn "Subject #{s} must be a URI-compatible thingy"
|
109
|
+
abort "Subject #{s} must be a URI-compatible thingy"
|
107
110
|
end
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
111
|
+
end
|
112
|
+
|
113
|
+
unless p.respond_to?('uri')
|
114
|
+
|
115
|
+
if p.to_s =~ %r{^\w+:/?/?[^\s]+}
|
116
|
+
p = RDF::URI.new(p.to_s)
|
117
|
+
else
|
118
|
+
debug and warn "Predicate #{p} must be a URI-compatible thingy"
|
119
|
+
abort "Predicate #{p} must be a URI-compatible thingy"
|
114
120
|
end
|
115
|
-
|
116
|
-
return message
|
117
121
|
end
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
#
|
123
|
-
# @param s - subject node
|
124
|
-
# @param p - predicate node
|
125
|
-
# @param o - object node
|
126
|
-
# @param repo - an RDF::Graph object
|
127
|
-
def triplify(s, p, o, repo)
|
128
|
-
|
129
|
-
if s.class == String
|
130
|
-
s = s.strip
|
131
|
-
end
|
132
|
-
if p.class == String
|
133
|
-
p = p.strip
|
134
|
-
end
|
135
|
-
if o.class == String
|
136
|
-
o = o.strip
|
137
|
-
end
|
138
|
-
|
139
|
-
unless s.respond_to?('uri')
|
140
|
-
|
141
|
-
if s.to_s =~ /^\w+:\/?\/?[^\s]+/
|
142
|
-
s = RDF::URI.new(s.to_s)
|
143
|
-
else
|
144
|
-
self.debug and $stderr.puts "Subject #{s.to_s} must be a URI-compatible thingy"
|
145
|
-
abort "Subject #{s.to_s} must be a URI-compatible thingy"
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
unless p.respond_to?('uri')
|
150
|
-
|
151
|
-
if p.to_s =~ /^\w+:\/?\/?[^\s]+/
|
152
|
-
p = RDF::URI.new(p.to_s)
|
153
|
-
else
|
154
|
-
self.debug and $stderr.puts "Predicate #{p.to_s} must be a URI-compatible thingy"
|
155
|
-
abort "Predicate #{p.to_s} must be a URI-compatible thingy"
|
156
|
-
end
|
157
|
-
end
|
158
|
-
|
159
|
-
unless o.respond_to?('uri')
|
160
|
-
if o.to_s =~ /\A\w+:\/?\/?\w[^\s]+/
|
161
|
-
o = RDF::URI.new(o.to_s)
|
122
|
+
|
123
|
+
unless o.respond_to?('uri')
|
124
|
+
o = if o.to_s =~ %r{\A\w+:/?/?\w[^\s]+}
|
125
|
+
RDF::URI.new(o.to_s)
|
162
126
|
elsif o.to_s =~ /^\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d/
|
163
|
-
|
127
|
+
RDF::Literal.new(o.to_s, datatype: RDF::XSD.date)
|
164
128
|
elsif o.to_s =~ /^[+-]?\d+\.\d+/
|
165
|
-
|
129
|
+
RDF::Literal.new(o.to_s, datatype: RDF::XSD.float)
|
166
130
|
elsif o.to_s =~ /^[+-]?[0-9]+$/
|
167
|
-
|
131
|
+
RDF::Literal.new(o.to_s, datatype: RDF::XSD.int)
|
168
132
|
else
|
169
|
-
|
133
|
+
RDF::Literal.new(o.to_s, language: :en)
|
170
134
|
end
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
return g.dump(:jsonld)
|
220
|
-
end
|
221
|
-
|
135
|
+
end
|
136
|
+
|
137
|
+
debug and warn("\n\ninserting #{s} #{p} #{o}\n\n")
|
138
|
+
triple = RDF::Statement(s, p, o)
|
139
|
+
repo.insert(triple)
|
140
|
+
|
141
|
+
true
|
142
|
+
end
|
143
|
+
|
144
|
+
# A utility function that SHOULD NOT BE CALLED EXTERNALLY
|
145
|
+
#
|
146
|
+
# @param s - subject node
|
147
|
+
# @param p - predicate node
|
148
|
+
# @param o - object node
|
149
|
+
# @param repo - an RDF::Graph object
|
150
|
+
def self.triplify(s, p, o, repo)
|
151
|
+
triplify(s, p, o, repo)
|
152
|
+
end
|
153
|
+
|
154
|
+
def addComment(newcomment)
|
155
|
+
comments << newcomment.to_s
|
156
|
+
# return self.comments
|
157
|
+
end
|
158
|
+
|
159
|
+
def createEvaluationResponse
|
160
|
+
g = RDF::Graph.new
|
161
|
+
|
162
|
+
dt = Time.now.iso8601
|
163
|
+
uri = testedGUID
|
164
|
+
|
165
|
+
me = protocol + '://' + host + '/' + basePath + path
|
166
|
+
|
167
|
+
meURI = "#{me}##{uri}/result-#{dt}"
|
168
|
+
meURI = Addressable::URI.escape(meURI)
|
169
|
+
|
170
|
+
triplify(meURI, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
|
171
|
+
'http://fairmetrics.org/resources/metric_evaluation_result', g)
|
172
|
+
triplify(meURI, 'http://semanticscience.org/resource/SIO_000300', score, g)
|
173
|
+
triplify(meURI, 'http://purl.obolibrary.org/obo/date', dt, g)
|
174
|
+
triplify(meURI, 'http://schema.org/softwareVersion', VERSION, g)
|
175
|
+
triplify(meURI, 'http://semanticscience.org/resource/SIO_000332', uri, g)
|
176
|
+
|
177
|
+
comments = 'no comments received. '
|
178
|
+
|
179
|
+
comments = self.comments.join("\n") if self.comments.size > 0
|
180
|
+
triplify(meURI, 'http://schema.org/comment', comments, g)
|
181
|
+
|
182
|
+
g.dump(:jsonld)
|
222
183
|
end
|
223
|
-
|
224
|
-
|
184
|
+
end
|
data/lib/warnings.json
CHANGED
data/lib/web_utils.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
module
|
1
|
+
module HarvesterTools
|
2
2
|
|
3
3
|
class WebUtils
|
4
|
-
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta:
|
4
|
+
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
|
5
5
|
warn 'In fetch routine now. '
|
6
6
|
|
7
7
|
begin
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fsp_harvester
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.1.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-08-
|
11
|
+
date: 2022-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -182,15 +182,18 @@ files:
|
|
182
182
|
- bin/setup
|
183
183
|
- example_test.rb
|
184
184
|
- launch.json
|
185
|
+
- lib/config.conf
|
185
186
|
- lib/config.conf_docker
|
186
187
|
- lib/config.conf_local
|
187
188
|
- lib/constants.rb
|
189
|
+
- lib/external_tools.rb
|
188
190
|
- lib/fsp_harvester.rb
|
189
191
|
- lib/fsp_harvester/version.rb
|
190
|
-
- lib/fsp_metadata_external_tools.rb
|
191
|
-
- lib/fsp_metadata_harvester.rb
|
192
|
-
- lib/fsp_metadata_parser.rb
|
192
|
+
- lib/harvester.rb
|
193
|
+
- lib/harvester_utils.rb
|
194
|
+
- lib/metadata_harvester.rb
|
193
195
|
- lib/metadata_object.rb
|
196
|
+
- lib/metadata_parser.rb
|
194
197
|
- lib/signposting_tests.rb
|
195
198
|
- lib/swagger.rb
|
196
199
|
- lib/warnings.json
|