fsp_harvester 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec_status +53 -53
- data/Gemfile.lock +1 -1
- data/lib/config.conf +8 -0
- data/lib/{fsp_metadata_external_tools.rb → external_tools.rb} +17 -15
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +8 -106
- data/lib/harvester.rb +27 -0
- data/lib/harvester_utils.rb +75 -0
- data/lib/{fsp_metadata_harvester.rb → metadata_harvester.rb} +8 -8
- data/lib/metadata_object.rb +1 -1
- data/lib/{fsp_metadata_parser.rb → metadata_parser.rb} +28 -13
- data/lib/signposting_tests.rb +9 -6
- data/lib/swagger.rb +137 -177
- data/lib/web_utils.rb +2 -2
- metadata +8 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b38eea15fa26a3fe07290024342f8b6121dbd78c3cd2dd3496ca118fca22f6d4
|
|
4
|
+
data.tar.gz: a25ea37ecd78b2ef8dc41dca391c161ba4b262d910dbadcf34f07f0cd8e54af5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 83b766f2896a0776ed75ab3fd1e235d2c80173d3ffc0c22aea80b234856392497daf312db36461f03d0ca168228e6e385edf6d789b42dc4b43fcd6a073cda234
|
|
7
|
+
data.tar.gz: 16d74c199a138db0225e88c0e092a5272a518785c397d27da74c511a69260444e6c1c440021e08413c131d672cd2d528e8d19dbd8a20ca1bff97dad38600c995
|
data/.rspec_status
CHANGED
|
@@ -1,55 +1,55 @@
|
|
|
1
1
|
example_id | status | run_time |
|
|
2
2
|
---------------------------------- | ------ | --------------- |
|
|
3
|
-
./spec/cite-as_spec.rb[1:1:1] | passed | 1.
|
|
4
|
-
./spec/cite-as_spec.rb[1:1:2] | passed | 1.
|
|
5
|
-
./spec/cite-as_spec.rb[1:1:3] | passed | 1.
|
|
6
|
-
./spec/cite-as_spec.rb[1:1:4] | passed | 1.
|
|
7
|
-
./spec/cite-as_spec.rb[1:1:5] | passed | 2.
|
|
8
|
-
./spec/cite-as_spec.rb[1:1:6] | passed | 2.
|
|
9
|
-
./spec/cite-as_spec.rb[1:1:7] | passed |
|
|
10
|
-
./spec/cite-as_spec.rb[1:1:8] | passed | 2.
|
|
11
|
-
./spec/cite-as_spec.rb[1:1:9] | passed | 2.
|
|
12
|
-
./spec/cite-as_spec.rb[1:1:10] | passed | 2.
|
|
13
|
-
./spec/cite-as_spec.rb[1:1:11] | passed | 3.
|
|
14
|
-
./spec/cite-as_spec.rb[1:1:12] | passed | 2.
|
|
15
|
-
./spec/cite-as_spec.rb[1:1:13] | passed | 2.
|
|
16
|
-
./spec/cite-as_spec.rb[1:1:14] | passed | 2.
|
|
17
|
-
./spec/cite-as_spec.rb[1:1:15] | passed | 1.
|
|
18
|
-
./spec/cite-as_spec.rb[1:1:16] | passed | 1.
|
|
19
|
-
./spec/cite-as_spec.rb[1:1:17] | passed | 1.
|
|
20
|
-
./spec/cite-as_spec.rb[1:1:18] | passed | 1.
|
|
21
|
-
./spec/cite-as_spec.rb[1:1:19] | passed | 1.
|
|
22
|
-
./spec/cite-as_spec.rb[1:1:20] | passed | 1.
|
|
23
|
-
./spec/cite-as_spec.rb[1:1:21] | passed | 2.
|
|
24
|
-
./spec/cite-as_spec.rb[1:1:22] | passed | 1.
|
|
25
|
-
./spec/cite-as_spec.rb[1:1:23] | passed | 1.
|
|
26
|
-
./spec/cite-as_spec.rb[1:1:24] | failed | 1.
|
|
27
|
-
./spec/cite-as_spec.rb[1:1:25] | passed | 0.
|
|
28
|
-
./spec/describedby_spec.rb[1:1:1] | passed | 2.
|
|
29
|
-
./spec/describedby_spec.rb[1:1:2] | passed |
|
|
30
|
-
./spec/describedby_spec.rb[1:1:3] | passed |
|
|
31
|
-
./spec/describedby_spec.rb[1:1:4] | passed |
|
|
32
|
-
./spec/describedby_spec.rb[1:1:5] | passed | 1
|
|
33
|
-
./spec/describedby_spec.rb[1:1:6] | passed |
|
|
34
|
-
./spec/describedby_spec.rb[1:1:7] | passed | 0.
|
|
35
|
-
./spec/describedby_spec.rb[1:1:8] | passed |
|
|
36
|
-
./spec/describedby_spec.rb[1:1:9] | passed | 1.
|
|
37
|
-
./spec/describedby_spec.rb[1:1:10] | passed |
|
|
38
|
-
./spec/describedby_spec.rb[1:1:11] | passed |
|
|
39
|
-
./spec/describedby_spec.rb[1:1:12] | passed |
|
|
40
|
-
./spec/describedby_spec.rb[1:1:13] | passed | 1.
|
|
41
|
-
./spec/describedby_spec.rb[1:1:14] | passed |
|
|
42
|
-
./spec/describedby_spec.rb[1:1:15] | passed |
|
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] | failed |
|
|
45
|
-
./spec/item_spec.rb[1:1:1] | passed |
|
|
46
|
-
./spec/item_spec.rb[1:1:2] | passed | 2 seconds
|
|
47
|
-
./spec/item_spec.rb[1:1:3] | passed |
|
|
48
|
-
./spec/item_spec.rb[1:1:4] | passed | 1.
|
|
49
|
-
./spec/item_spec.rb[1:1:5] | passed |
|
|
50
|
-
./spec/item_spec.rb[1:1:6] | passed |
|
|
51
|
-
./spec/item_spec.rb[1:1:7] | passed |
|
|
52
|
-
./spec/item_spec.rb[1:1:8] | passed | 0.
|
|
53
|
-
./spec/type_spec.rb[1:1:1] | passed |
|
|
54
|
-
./spec/type_spec.rb[1:1:2] | passed |
|
|
55
|
-
./spec/type_spec.rb[1:1:3] | passed |
|
|
3
|
+
./spec/cite-as_spec.rb[1:1:1] | passed | 1.3 seconds |
|
|
4
|
+
./spec/cite-as_spec.rb[1:1:2] | passed | 1.21 seconds |
|
|
5
|
+
./spec/cite-as_spec.rb[1:1:3] | passed | 1.09 seconds |
|
|
6
|
+
./spec/cite-as_spec.rb[1:1:4] | passed | 1.69 seconds |
|
|
7
|
+
./spec/cite-as_spec.rb[1:1:5] | passed | 2.72 seconds |
|
|
8
|
+
./spec/cite-as_spec.rb[1:1:6] | passed | 2.3 seconds |
|
|
9
|
+
./spec/cite-as_spec.rb[1:1:7] | passed | 3.36 seconds |
|
|
10
|
+
./spec/cite-as_spec.rb[1:1:8] | passed | 2.26 seconds |
|
|
11
|
+
./spec/cite-as_spec.rb[1:1:9] | passed | 2.82 seconds |
|
|
12
|
+
./spec/cite-as_spec.rb[1:1:10] | passed | 2.3 seconds |
|
|
13
|
+
./spec/cite-as_spec.rb[1:1:11] | passed | 3.37 seconds |
|
|
14
|
+
./spec/cite-as_spec.rb[1:1:12] | passed | 2.2 seconds |
|
|
15
|
+
./spec/cite-as_spec.rb[1:1:13] | passed | 2.94 seconds |
|
|
16
|
+
./spec/cite-as_spec.rb[1:1:14] | passed | 2.44 seconds |
|
|
17
|
+
./spec/cite-as_spec.rb[1:1:15] | passed | 1.54 seconds |
|
|
18
|
+
./spec/cite-as_spec.rb[1:1:16] | passed | 1.29 seconds |
|
|
19
|
+
./spec/cite-as_spec.rb[1:1:17] | passed | 1.25 seconds |
|
|
20
|
+
./spec/cite-as_spec.rb[1:1:18] | passed | 1.15 seconds |
|
|
21
|
+
./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
|
|
22
|
+
./spec/cite-as_spec.rb[1:1:20] | passed | 1.66 seconds |
|
|
23
|
+
./spec/cite-as_spec.rb[1:1:21] | passed | 2.41 seconds |
|
|
24
|
+
./spec/cite-as_spec.rb[1:1:22] | passed | 1.64 seconds |
|
|
25
|
+
./spec/cite-as_spec.rb[1:1:23] | passed | 1.35 seconds |
|
|
26
|
+
./spec/cite-as_spec.rb[1:1:24] | failed | 1.25 seconds |
|
|
27
|
+
./spec/cite-as_spec.rb[1:1:25] | passed | 0.51152 seconds |
|
|
28
|
+
./spec/describedby_spec.rb[1:1:1] | passed | 2.71 seconds |
|
|
29
|
+
./spec/describedby_spec.rb[1:1:2] | passed | 1.25 seconds |
|
|
30
|
+
./spec/describedby_spec.rb[1:1:3] | passed | 1.22 seconds |
|
|
31
|
+
./spec/describedby_spec.rb[1:1:4] | passed | 1.22 seconds |
|
|
32
|
+
./spec/describedby_spec.rb[1:1:5] | passed | 1.21 seconds |
|
|
33
|
+
./spec/describedby_spec.rb[1:1:6] | passed | 1.02 seconds |
|
|
34
|
+
./spec/describedby_spec.rb[1:1:7] | passed | 0.99175 seconds |
|
|
35
|
+
./spec/describedby_spec.rb[1:1:8] | passed | 2.44 seconds |
|
|
36
|
+
./spec/describedby_spec.rb[1:1:9] | passed | 1.72 seconds |
|
|
37
|
+
./spec/describedby_spec.rb[1:1:10] | passed | 2.15 seconds |
|
|
38
|
+
./spec/describedby_spec.rb[1:1:11] | passed | 3.17 seconds |
|
|
39
|
+
./spec/describedby_spec.rb[1:1:12] | passed | 3.1 seconds |
|
|
40
|
+
./spec/describedby_spec.rb[1:1:13] | passed | 1.7 seconds |
|
|
41
|
+
./spec/describedby_spec.rb[1:1:14] | passed | 2.21 seconds |
|
|
42
|
+
./spec/describedby_spec.rb[1:1:15] | passed | 2.18 seconds |
|
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
|
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
|
|
45
|
+
./spec/item_spec.rb[1:1:1] | passed | 3.09 seconds |
|
|
46
|
+
./spec/item_spec.rb[1:1:2] | passed | 2.92 seconds |
|
|
47
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.12 seconds |
|
|
48
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.7 seconds |
|
|
49
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.24 seconds |
|
|
50
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.87 seconds |
|
|
51
|
+
./spec/item_spec.rb[1:1:7] | passed | 3.03 seconds |
|
|
52
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.52338 seconds |
|
|
53
|
+
./spec/type_spec.rb[1:1:1] | passed | 1.42 seconds |
|
|
54
|
+
./spec/type_spec.rb[1:1:2] | passed | 1.28 seconds |
|
|
55
|
+
./spec/type_spec.rb[1:1:3] | passed | 1.52 seconds |
|
data/Gemfile.lock
CHANGED
data/lib/config.conf
ADDED
data/lib/{fsp_metadata_external_tools.rb → external_tools.rb}
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
module
|
|
3
|
+
module HarvesterTools
|
|
4
4
|
class Error < StandardError
|
|
5
5
|
end
|
|
6
6
|
|
|
7
7
|
class ExternalTools
|
|
8
8
|
|
|
9
|
-
def initialize(metadata:
|
|
9
|
+
def initialize(metadata: HarvesterTools::MetadataObject.new)
|
|
10
10
|
@meta = metadata
|
|
11
11
|
end
|
|
12
12
|
|
|
@@ -25,10 +25,7 @@ module FspHarvester
|
|
|
25
25
|
file.rewind
|
|
26
26
|
|
|
27
27
|
@meta.comments << "INFO: The message body is being examined by Distiller\n"
|
|
28
|
-
|
|
29
|
-
command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
|
30
|
-
# command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
|
31
|
-
# command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
|
|
28
|
+
command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
|
32
29
|
warn "distiller command: #{command}"
|
|
33
30
|
result, _stderr, _status = Open3.capture3(command)
|
|
34
31
|
warn ''
|
|
@@ -41,12 +38,13 @@ module FspHarvester
|
|
|
41
38
|
if result !~ /@context/i # failure returns nil
|
|
42
39
|
@meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
|
|
43
40
|
@meta.add_warning(['018', '', ''])
|
|
41
|
+
result = "{}"
|
|
44
42
|
else
|
|
45
43
|
@meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
|
|
46
|
-
parse_rdf(result: result, content_type: "application/ld+json")
|
|
47
44
|
end
|
|
48
45
|
@@distillerknown[bhash] = true
|
|
49
46
|
end
|
|
47
|
+
result
|
|
50
48
|
end
|
|
51
49
|
|
|
52
50
|
def processs_with_extruct(uri:)
|
|
@@ -55,6 +53,11 @@ module FspHarvester
|
|
|
55
53
|
stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
|
|
56
54
|
warn "open3 status: #{status} #{stdout}"
|
|
57
55
|
result = stderr # absurd that the output comes over stderr! LOL!
|
|
56
|
+
jsonld = {}
|
|
57
|
+
microdata = Hash.new
|
|
58
|
+
microformat = Hash.new
|
|
59
|
+
opengraph = Hash.new
|
|
60
|
+
rdfa = Hash.new
|
|
58
61
|
|
|
59
62
|
if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
|
|
60
63
|
@meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
|
|
@@ -66,17 +69,16 @@ module FspHarvester
|
|
|
66
69
|
elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
|
|
67
70
|
json = JSON.parse result
|
|
68
71
|
@meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
@meta.merge_hash(json.first) if json.first.is_a? Hash
|
|
72
|
+
jsonld = json['json-ld'].to_json if json['json-ld'].any?
|
|
73
|
+
microdata = json['microdata'].first if json['microdata'].any
|
|
74
|
+
microformat = json['microformat'].first if json['microformat'].any?
|
|
75
|
+
opengraph = json['opengraph'].first if json['opengraph'].any?
|
|
76
|
+
rdfa = json['rdfa'].to_json if json['rdfa'].any?
|
|
77
|
+
# @meta.merge_hash(json.first) if json.first.is_a? Hash
|
|
77
78
|
else
|
|
78
79
|
@meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
|
|
79
80
|
end
|
|
81
|
+
[jsonld, microdata, microformat, opengraph, rdfa]
|
|
80
82
|
end
|
|
81
83
|
end
|
|
82
84
|
end
|
data/lib/fsp_harvester.rb
CHANGED
|
@@ -1,121 +1,23 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative 'fsp_harvester/version'
|
|
4
|
-
require 'json/ld'
|
|
5
|
-
require 'json/ld/preloaded'
|
|
6
|
-
require 'json'
|
|
7
|
-
require 'linkheaders/processor'
|
|
8
|
-
require 'addressable'
|
|
9
|
-
require 'tempfile'
|
|
10
|
-
require 'xmlsimple'
|
|
11
|
-
require 'nokogiri'
|
|
12
|
-
require 'parseconfig'
|
|
13
|
-
require 'rest-client'
|
|
14
|
-
require 'cgi'
|
|
15
|
-
require 'digest'
|
|
16
|
-
require 'open3'
|
|
17
|
-
require 'metainspector'
|
|
18
|
-
require 'rdf/xsd'
|
|
19
|
-
require_relative './metadata_object'
|
|
20
|
-
require_relative './constants'
|
|
21
|
-
require_relative './web_utils'
|
|
22
|
-
require_relative './signposting_tests'
|
|
23
|
-
require_relative './fsp_metadata_harvester'
|
|
24
|
-
require_relative './fsp_metadata_parser'
|
|
25
|
-
|
|
26
1
|
|
|
2
|
+
require_relative 'harvester'
|
|
27
3
|
module FspHarvester
|
|
28
4
|
class Error < StandardError
|
|
29
5
|
end
|
|
30
6
|
|
|
31
7
|
class Utils
|
|
32
|
-
# @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
|
|
33
|
-
# @warnings = JSON.parse(File.read("warnings.json"))
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def self.resolve_guid(guid:)
|
|
37
|
-
@meta = FspHarvester::MetadataObject.new
|
|
38
|
-
@meta.all_uris = [guid]
|
|
39
|
-
type, url = convertToURL(guid: guid)
|
|
40
|
-
links = Array.new
|
|
41
|
-
if type
|
|
42
|
-
links = resolve_url(url: url)
|
|
43
|
-
@meta.links << links
|
|
44
|
-
else
|
|
45
|
-
@meta.add_warning(['006', guid, ''])
|
|
46
|
-
@meta.comments << "FATAL: GUID type not recognized.\n"
|
|
47
|
-
end
|
|
48
|
-
[links, @meta]
|
|
49
|
-
end
|
|
50
8
|
|
|
51
|
-
def self.gather_metadata_from_describedby_links(links: [], metadata:
|
|
9
|
+
def self.gather_metadata_from_describedby_links(links: [], metadata: HarvesterTools::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
|
|
52
10
|
@meta = metadata
|
|
53
11
|
db = []
|
|
54
12
|
links.each do |l|
|
|
55
13
|
db << l if l.relation == 'describedby'
|
|
56
14
|
end
|
|
57
|
-
|
|
15
|
+
HarvesterTools::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
|
|
58
16
|
@meta
|
|
59
17
|
end
|
|
60
18
|
|
|
61
|
-
def self.
|
|
62
|
-
|
|
63
|
-
if k == 'inchi' and regex.match(guid)
|
|
64
|
-
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
|
65
|
-
elsif k == 'handle1' and regex.match(guid)
|
|
66
|
-
return 'handle', "http://hdl.handle.net/#{guid}"
|
|
67
|
-
elsif k == 'handle2' and regex.match(guid)
|
|
68
|
-
return 'handle', "http://hdl.handle.net/#{guid}"
|
|
69
|
-
elsif k == 'uri' and regex.match(guid)
|
|
70
|
-
return 'uri', guid
|
|
71
|
-
elsif k == 'doi' and regex.match(guid)
|
|
72
|
-
return 'doi', "https://doi.org/#{guid}"
|
|
73
|
-
end
|
|
74
|
-
end
|
|
75
|
-
[nil, nil]
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
def self.typeit(guid:)
|
|
79
|
-
Utils::GUID_TYPES.each do |type, regex|
|
|
80
|
-
return type if regex.match(guid)
|
|
81
|
-
end
|
|
82
|
-
false
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
|
|
86
|
-
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
|
87
|
-
warn "\n\n FETCHING #{url} #{header}\n\n"
|
|
88
|
-
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
|
|
89
|
-
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
|
90
|
-
|
|
91
|
-
unless response
|
|
92
|
-
@meta.add_warning(['001', url, header])
|
|
93
|
-
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
|
94
|
-
return []
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
|
98
|
-
@meta.full_response << response.body
|
|
99
|
-
|
|
100
|
-
links = process_link_headers(response: response) unless nolinkheaders
|
|
101
|
-
links
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
def self.process_link_headers(response:)
|
|
105
|
-
warn "\n\n parsing #{response.headers}\n\n"
|
|
106
|
-
|
|
107
|
-
parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
|
|
108
|
-
parser.extract_and_parse(response: response)
|
|
109
|
-
factory = parser.factory # LinkHeaders::LinkFactory
|
|
110
|
-
|
|
111
|
-
warn "\n\n length bfore #{factory.all_links.length}\n\n"
|
|
112
|
-
signpostingcheck(factory: factory)
|
|
113
|
-
warn "\n\n length aftr #{factory.all_links.length}\n\n"
|
|
114
|
-
warn "\n\n links #{factory.all_links}\n\n"
|
|
115
|
-
factory.all_links
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
def self.signpostingcheck(factory:)
|
|
19
|
+
def self.signpostingcheck(factory:, metadata: HarvesterTools::MetadataObject.new)
|
|
20
|
+
@meta = metadata
|
|
119
21
|
citeas = Array.new
|
|
120
22
|
describedby = Array.new
|
|
121
23
|
item = Array.new
|
|
@@ -134,13 +36,13 @@ module FspHarvester
|
|
|
134
36
|
end
|
|
135
37
|
end
|
|
136
38
|
|
|
137
|
-
check_describedby_rules(describedby: describedby)
|
|
138
|
-
check_item_rules(item: item)
|
|
39
|
+
check_describedby_rules(describedby: describedby, metadata: @meta)
|
|
40
|
+
check_item_rules(item: item, metadata: @meta)
|
|
139
41
|
|
|
140
42
|
if citeas.length > 1
|
|
141
43
|
warn "INFO: multiple cite-as links found. Checking for conflicts\n"
|
|
142
44
|
@meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
|
|
143
|
-
citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
|
45
|
+
citeas = check_for_citeas_conflicts(citeas: citeas, metadata: @meta) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
|
144
46
|
end
|
|
145
47
|
|
|
146
48
|
unless citeas.length == 1 && describedby.length > 0
|
data/lib/harvester.rb
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
#require_relative 'fsp_harvester/version'
|
|
4
|
+
require 'json/ld'
|
|
5
|
+
require 'json/ld/preloaded'
|
|
6
|
+
require 'json'
|
|
7
|
+
require 'linkheaders/processor'
|
|
8
|
+
require 'addressable'
|
|
9
|
+
require 'tempfile'
|
|
10
|
+
require 'xmlsimple'
|
|
11
|
+
require 'nokogiri'
|
|
12
|
+
require 'parseconfig'
|
|
13
|
+
require 'rest-client'
|
|
14
|
+
require 'cgi'
|
|
15
|
+
require 'digest'
|
|
16
|
+
require 'open3'
|
|
17
|
+
require 'metainspector'
|
|
18
|
+
require 'rdf/xsd'
|
|
19
|
+
require_relative './metadata_object'
|
|
20
|
+
require_relative './constants'
|
|
21
|
+
require_relative './web_utils'
|
|
22
|
+
require_relative './signposting_tests'
|
|
23
|
+
require_relative './metadata_harvester'
|
|
24
|
+
require_relative './fsp_harvester'
|
|
25
|
+
require_relative './harvester_utils'
|
|
26
|
+
require_relative './external_tools'
|
|
27
|
+
require_relative './metadata_parser'
|
data/lib/harvester_utils.rb
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
module HarvesterTools
|
|
2
|
+
class Error < StandardError
|
|
3
|
+
end
|
|
4
|
+
|
|
5
|
+
class Utils
|
|
6
|
+
|
|
7
|
+
def self.resolve_guid(guid:)
|
|
8
|
+
@meta = HarvesterTools::MetadataObject.new
|
|
9
|
+
@meta.all_uris = [guid]
|
|
10
|
+
type, url = convertToURL(guid: guid)
|
|
11
|
+
links = Array.new
|
|
12
|
+
if type
|
|
13
|
+
links = resolve_url(url: url)
|
|
14
|
+
@meta.links = @meta.links | links
|
|
15
|
+
else
|
|
16
|
+
@meta.add_warning(['006', guid, ''])
|
|
17
|
+
@meta.comments << "FATAL: GUID type not recognized.\n"
|
|
18
|
+
end
|
|
19
|
+
[links, @meta]
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def self.convertToURL(guid:)
|
|
23
|
+
GUID_TYPES.each do |k, regex|
|
|
24
|
+
if k == 'inchi' and regex.match(guid)
|
|
25
|
+
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
|
26
|
+
elsif k == 'handle1' and regex.match(guid)
|
|
27
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
|
28
|
+
elsif k == 'handle2' and regex.match(guid)
|
|
29
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
|
30
|
+
elsif k == 'uri' and regex.match(guid)
|
|
31
|
+
return 'uri', guid
|
|
32
|
+
elsif k == 'doi' and regex.match(guid)
|
|
33
|
+
return 'doi', "https://doi.org/#{guid}"
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
[nil, nil]
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def self.typeit(guid:)
|
|
40
|
+
GUID_TYPES.each do |type, regex|
|
|
41
|
+
return type if regex.match(guid)
|
|
42
|
+
end
|
|
43
|
+
false
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
|
|
47
|
+
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
|
48
|
+
warn "\n\n FETCHING #{url} #{header}\n\n"
|
|
49
|
+
response = HarvesterTools::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
|
|
50
|
+
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
|
51
|
+
|
|
52
|
+
unless response
|
|
53
|
+
@meta.add_warning(['001', url, header])
|
|
54
|
+
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
|
55
|
+
return []
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
|
59
|
+
@meta.full_response << response.body
|
|
60
|
+
|
|
61
|
+
links = process_link_headers(response: response) unless nolinkheaders
|
|
62
|
+
links
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def self.process_link_headers(response:)
|
|
66
|
+
warn "\n\n parsing #{response.headers}\n\n"
|
|
67
|
+
|
|
68
|
+
parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
|
|
69
|
+
parser.extract_and_parse(response: response)
|
|
70
|
+
factory = parser.factory # LinkHeaders::LinkFactory
|
|
71
|
+
FspHarvester::Utils.signpostingcheck(factory: factory, metadata: @meta)
|
|
72
|
+
factory.all_links
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
data/lib/{fsp_metadata_harvester.rb → metadata_harvester.rb}
CHANGED
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
module
|
|
3
|
+
module HarvesterTools
|
|
4
4
|
class Error < StandardError
|
|
5
5
|
end
|
|
6
6
|
|
|
7
7
|
class MetadataHarvester
|
|
8
|
-
def self.extract_metadata(links: [], metadata:
|
|
8
|
+
def self.extract_metadata(links: [], metadata: HarvesterTools::MetadataObject.new)
|
|
9
9
|
@meta = metadata
|
|
10
10
|
@meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'
|
|
11
11
|
|
|
12
12
|
describedby = links.select { |l| l if l.relation == 'describedby' }
|
|
13
13
|
|
|
14
|
-
hvst =
|
|
14
|
+
hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
|
|
15
15
|
describedby.each do |link|
|
|
16
16
|
accepttype = ACCEPT_STAR_HEADER
|
|
17
17
|
accept = link.respond_to?('type') ? link.type : nil
|
|
@@ -30,16 +30,16 @@ module FspHarvester
|
|
|
30
30
|
case abbreviation
|
|
31
31
|
when 'html'
|
|
32
32
|
@meta.comments << 'INFO: Processing html'
|
|
33
|
-
hvst.process_html(body: response.body, uri: link)
|
|
33
|
+
hvst.process_html(body: response.body, uri: link, metadata: @meta)
|
|
34
34
|
when 'xml'
|
|
35
35
|
@meta.comments << 'INFO: Processing xml'
|
|
36
|
-
hvst.process_xml(body: response.body)
|
|
36
|
+
hvst.process_xml(body: response.body, metadata: @meta)
|
|
37
37
|
when 'json'
|
|
38
38
|
@meta.comments << 'INFO: Processing json'
|
|
39
|
-
hvst.process_json(body: response.body)
|
|
39
|
+
hvst.process_json(body: response.body, metadata: @meta)
|
|
40
40
|
when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
|
|
41
41
|
@meta.comments << 'INFO: Processing linked data'
|
|
42
|
-
hvst.process_ld(body: response.body, content_type: content_type)
|
|
42
|
+
hvst.process_ld(body: response.body, content_type: content_type, metadata: @meta)
|
|
43
43
|
when 'specialist'
|
|
44
44
|
warn 'no specialized parsers so far'
|
|
45
45
|
end
|
|
@@ -54,7 +54,7 @@ module FspHarvester
|
|
|
54
54
|
@meta.comments << "INFO: link #{link.href} has no MIME type, defaulting to */*"
|
|
55
55
|
end
|
|
56
56
|
url = link.href
|
|
57
|
-
response =
|
|
57
|
+
response = HarvesterTools::WebUtils.fspfetch(url: url, method: :get, headers: header)
|
|
58
58
|
unless response
|
|
59
59
|
@meta.add_warning(['016', url, header])
|
|
60
60
|
@meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
|
data/lib/metadata_object.rb
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
module
|
|
1
|
+
module HarvesterTools
|
|
2
2
|
class MetadataObject
|
|
3
3
|
attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
|
4
4
|
|
data/lib/{fsp_metadata_parser.rb → metadata_parser.rb}
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
module
|
|
3
|
+
module HarvesterTools
|
|
4
4
|
class Error < StandardError
|
|
5
5
|
end
|
|
6
6
|
|
|
@@ -9,17 +9,25 @@ module FspHarvester
|
|
|
9
9
|
|
|
10
10
|
@@distillerknown = {}
|
|
11
11
|
|
|
12
|
-
def initialize(metadata_object:
|
|
12
|
+
def initialize(metadata_object: HarvesterTools::MetadataObject.new)
|
|
13
13
|
@meta = metadata_object
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
-
def process_html(body:, uri:)
|
|
17
|
-
|
|
18
|
-
tools.
|
|
19
|
-
tools.
|
|
16
|
+
def process_html(body:, uri:, metadata:)
|
|
17
|
+
@meta = metadata
|
|
18
|
+
tools = HarvesterTools::ExternalTools.new(metadata: @meta)
|
|
19
|
+
result = tools.process_with_distiller(body: body)
|
|
20
|
+
|
|
21
|
+
jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
|
|
22
|
+
parse_rdf(body: jsonld, content_type: 'application/ld+json')
|
|
23
|
+
@meta.merge_hash(microdata)
|
|
24
|
+
@meta.merge_hash(microformat)
|
|
25
|
+
@meta.merge_hash(opengraph)
|
|
26
|
+
parse_rdf(body: rdfa, content_type: 'application/ld+json')
|
|
20
27
|
end
|
|
21
28
|
|
|
22
|
-
def process_xml(body:)
|
|
29
|
+
def process_xml(body:, metadata:)
|
|
30
|
+
@meta = metadata
|
|
23
31
|
begin
|
|
24
32
|
hash = XmlSimple.xml_in(body)
|
|
25
33
|
rescue
|
|
@@ -30,7 +38,8 @@ module FspHarvester
|
|
|
30
38
|
@meta.hash.merge hash
|
|
31
39
|
end
|
|
32
40
|
|
|
33
|
-
def process_json(body:)
|
|
41
|
+
def process_json(body:, metadata:)
|
|
42
|
+
@meta = metadata
|
|
34
43
|
begin
|
|
35
44
|
hash = JSON.parse(body)
|
|
36
45
|
rescue
|
|
@@ -41,11 +50,17 @@ module FspHarvester
|
|
|
41
50
|
@meta.hash.merge hash
|
|
42
51
|
end
|
|
43
52
|
|
|
44
|
-
def process_ld(body:, content_type:)
|
|
45
|
-
|
|
53
|
+
def process_ld(body:, content_type:, metadata:)
|
|
54
|
+
@meta = metadata
|
|
55
|
+
parse_rdf(body: body, content_type: content_type, metadata: @meta)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def parse_rdf(body:, content_type:, metadata:)
|
|
59
|
+
self.class.parse_rdf(body: body, content_type: content_type, metadata: metadata)
|
|
46
60
|
end
|
|
47
61
|
|
|
48
|
-
def parse_rdf(body:, content_type:)
|
|
62
|
+
def self.parse_rdf(body:, content_type:, metadata:)
|
|
63
|
+
@meta = metadata
|
|
49
64
|
unless body
|
|
50
65
|
@meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
|
|
51
66
|
@meta.add_warning(['018', '', ''])
|
|
@@ -65,7 +80,7 @@ module FspHarvester
|
|
|
65
80
|
return
|
|
66
81
|
end
|
|
67
82
|
|
|
68
|
-
graph =
|
|
83
|
+
graph = HarvesterTools::Cache.checkRDFCache(body: body)
|
|
69
84
|
if graph.size > 0
|
|
70
85
|
warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
|
|
71
86
|
@meta.merge_rdf(graph.to_a)
|
|
@@ -88,7 +103,7 @@ module FspHarvester
|
|
|
88
103
|
end
|
|
89
104
|
reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
|
|
90
105
|
warn 'WRITING TO CACHE'
|
|
91
|
-
|
|
106
|
+
HarvesterTools::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
|
|
92
107
|
warn 'WRITING DONE'
|
|
93
108
|
reader = rdfformat.reader.new(body) # frustrating that we cannot rewind!
|
|
94
109
|
warn 'RE-READING DONE'
|
data/lib/signposting_tests.rb
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
def check_for_citeas_conflicts(citeas: )
|
|
1
|
+
def check_for_citeas_conflicts(citeas:, metadata: )
|
|
2
|
+
@meta = metadata
|
|
2
3
|
@meta.comments << 'INFO: checking for conflicting cite-as links'
|
|
3
4
|
citeas_hrefs = Hash.new
|
|
4
5
|
citeas.each do |link|
|
|
@@ -6,7 +7,7 @@ def check_for_citeas_conflicts(citeas: )
|
|
|
6
7
|
@meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
|
|
7
8
|
citeas_hrefs[link.href] = link
|
|
8
9
|
end
|
|
9
|
-
|
|
10
|
+
#warn "finalhash #{citeas_hrefs}"
|
|
10
11
|
if citeas_hrefs.length > 1
|
|
11
12
|
@meta.comments << 'INFO: Found multiple non-identical cite-as links.'
|
|
12
13
|
@meta.add_warning(['007', '', ''])
|
|
@@ -16,7 +17,8 @@ def check_for_citeas_conflicts(citeas: )
|
|
|
16
17
|
end
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
def check_describedby_rules(describedby:)
|
|
20
|
+
def check_describedby_rules(describedby:, metadata:)
|
|
21
|
+
@meta = metadata
|
|
20
22
|
describedby.each do |l|
|
|
21
23
|
unless l.respond_to? 'type'
|
|
22
24
|
@meta.add_warning(['005', l.href, ''])
|
|
@@ -25,7 +27,7 @@ def check_describedby_rules(describedby:)
|
|
|
25
27
|
type = l.type if l.respond_to? 'type'
|
|
26
28
|
type ||= '*/*'
|
|
27
29
|
header = { accept: type }
|
|
28
|
-
response =
|
|
30
|
+
response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
|
|
29
31
|
if response
|
|
30
32
|
responsetype = response.headers[:content_type]
|
|
31
33
|
@meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
|
|
@@ -51,7 +53,8 @@ def check_describedby_rules(describedby:)
|
|
|
51
53
|
end
|
|
52
54
|
end
|
|
53
55
|
|
|
54
|
-
def check_item_rules(item:)
|
|
56
|
+
def check_item_rules(item:, metadata:)
|
|
57
|
+
@meta = metadata
|
|
55
58
|
item.each do |l| # l = LinkHeaders::Link
|
|
56
59
|
unless l.respond_to? 'type'
|
|
57
60
|
@meta.add_warning(['011', l.href, ''])
|
|
@@ -60,7 +63,7 @@ def check_item_rules(item:)
|
|
|
60
63
|
type = l.type if l.respond_to? 'type'
|
|
61
64
|
type ||= '*/*' # this becomes a frozen string
|
|
62
65
|
header = { accept: type }
|
|
63
|
-
response =
|
|
66
|
+
response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
|
|
64
67
|
|
|
65
68
|
if response
|
|
66
69
|
if response.headers[:content_type] and type != '*/*'
|
data/lib/swagger.rb
CHANGED
|
@@ -1,64 +1,39 @@
|
|
|
1
|
-
class Swagger
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
@protocol = params.fetch(:protocol, "https")
|
|
38
|
-
@basePath = params.fetch(:basePath)
|
|
39
|
-
@path = params.fetch(:path)
|
|
40
|
-
@response_description = params.fetch(:response_description)
|
|
41
|
-
@schemas = params.fetch(:schemas, [])
|
|
42
|
-
@comments = params.fetch(:comments, [])
|
|
43
|
-
@fairsharing_key_location = params.fetch(:fairsharing_key_location)
|
|
44
|
-
@score = params.fetch(:score, 0)
|
|
45
|
-
@testedGUID = params.fetch(:testedGUID, "")
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def fairsharing_key
|
|
54
|
-
return @fairsharing_key_location
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def getSwagger
|
|
60
|
-
|
|
61
|
-
message = <<"EOF_EOF"
|
|
1
|
+
class Swagger
|
|
2
|
+
attr_accessor :debug, :title, :tests_metric, :description, :applies_to_principle, :organization, :org_url,
|
|
3
|
+
:responsible_developer, :email, :developer_ORCiD, :protocol, :host, :basePath, :path,
|
|
4
|
+
:response_description, :schemas, :comments, :fairsharing_key_location, :score, :testedGUID
|
|
5
|
+
|
|
6
|
+
def initialize(params = {})
|
|
7
|
+
@debug = params.fetch(:debug, false)
|
|
8
|
+
|
|
9
|
+
@title = params.fetch(:title, 'unnamed')
|
|
10
|
+
@tests_metric = params.fetch(:tests_metric)
|
|
11
|
+
@description = params.fetch(:description, 'default_description')
|
|
12
|
+
@applies_to_principle = params.fetch(:applies_to_principle, 'some principle')
|
|
13
|
+
@version = params.fetch(:version, '0.1')
|
|
14
|
+
@organization = params.fetch(:organization, 'Some Organization')
|
|
15
|
+
@org_url = params.fetch(:org_url)
|
|
16
|
+
@responsible_develper = params.fetch(:responsible_developer, 'Some Person')
|
|
17
|
+
@email = params.fetch(:email)
|
|
18
|
+
@developer_ORCiD = params.fetch(:developer_ORCiD)
|
|
19
|
+
@host = params.fetch(:host)
|
|
20
|
+
@protocol = params.fetch(:protocol, 'https')
|
|
21
|
+
@basePath = params.fetch(:basePath)
|
|
22
|
+
@path = params.fetch(:path)
|
|
23
|
+
@response_description = params.fetch(:response_description)
|
|
24
|
+
@schemas = params.fetch(:schemas, [])
|
|
25
|
+
@comments = params.fetch(:comments, [])
|
|
26
|
+
@fairsharing_key_location = params.fetch(:fairsharing_key_location)
|
|
27
|
+
@score = params.fetch(:score, 0)
|
|
28
|
+
@testedGUID = params.fetch(:testedGUID, '')
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def fairsharing_key
|
|
32
|
+
@fairsharing_key_location
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def getSwagger
|
|
36
|
+
message = <<"EOF_EOF"
|
|
62
37
|
swagger: '2.0'
|
|
63
38
|
info:
|
|
64
39
|
version: '#{@version}'
|
|
@@ -89,7 +64,7 @@ class Swagger
|
|
|
89
64
|
$ref: '#/definitions/schemas'
|
|
90
65
|
consumes:
|
|
91
66
|
- application/json
|
|
92
|
-
produces
|
|
67
|
+
produces:#{' '}
|
|
93
68
|
- application/json
|
|
94
69
|
responses:
|
|
95
70
|
"200":
|
|
@@ -98,127 +73,112 @@ class Swagger
|
|
|
98
73
|
definitions:
|
|
99
74
|
schemas:
|
|
100
75
|
required:
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
76
|
+
EOF_EOF
|
|
77
|
+
|
|
78
|
+
schemas.keys.each do |key|
|
|
79
|
+
message += " - #{key}\n"
|
|
80
|
+
end
|
|
81
|
+
message += " properties:\n"
|
|
82
|
+
schemas.keys.each do |key|
|
|
83
|
+
message += " #{key}:\n"
|
|
84
|
+
message += " type: #{schemas[key][0]}\n"
|
|
85
|
+
message += " description: >-\n"
|
|
86
|
+
message += " #{schemas[key][1]}\n"
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
message
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
# A utility function that SHOULD NOT BE CALLED EXTERNALLY
|
|
93
|
+
#
|
|
94
|
+
# @param s - subject node
|
|
95
|
+
# @param p - predicate node
|
|
96
|
+
# @param o - object node
|
|
97
|
+
# @param repo - an RDF::Graph object
|
|
98
|
+
def triplify(s, p, o, repo)
|
|
99
|
+
s = s.strip if s.instance_of?(String)
|
|
100
|
+
p = p.strip if p.instance_of?(String)
|
|
101
|
+
o = o.strip if o.instance_of?(String)
|
|
102
|
+
|
|
103
|
+
unless s.respond_to?('uri')
|
|
104
|
+
|
|
105
|
+
if s.to_s =~ %r{^\w+:/?/?[^\s]+}
|
|
106
|
+
s = RDF::URI.new(s.to_s)
|
|
107
|
+
else
|
|
108
|
+
debug and warn "Subject #{s} must be a URI-compatible thingy"
|
|
109
|
+
abort "Subject #{s} must be a URI-compatible thingy"
|
|
107
110
|
end
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
unless p.respond_to?('uri')
|
|
114
|
+
|
|
115
|
+
if p.to_s =~ %r{^\w+:/?/?[^\s]+}
|
|
116
|
+
p = RDF::URI.new(p.to_s)
|
|
117
|
+
else
|
|
118
|
+
debug and warn "Predicate #{p} must be a URI-compatible thingy"
|
|
119
|
+
abort "Predicate #{p} must be a URI-compatible thingy"
|
|
114
120
|
end
|
|
115
|
-
|
|
116
|
-
return message
|
|
117
121
|
end
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
#
|
|
123
|
-
# @param s - subject node
|
|
124
|
-
# @param p - predicate node
|
|
125
|
-
# @param o - object node
|
|
126
|
-
# @param repo - an RDF::Graph object
|
|
127
|
-
def triplify(s, p, o, repo)
|
|
128
|
-
|
|
129
|
-
if s.class == String
|
|
130
|
-
s = s.strip
|
|
131
|
-
end
|
|
132
|
-
if p.class == String
|
|
133
|
-
p = p.strip
|
|
134
|
-
end
|
|
135
|
-
if o.class == String
|
|
136
|
-
o = o.strip
|
|
137
|
-
end
|
|
138
|
-
|
|
139
|
-
unless s.respond_to?('uri')
|
|
140
|
-
|
|
141
|
-
if s.to_s =~ /^\w+:\/?\/?[^\s]+/
|
|
142
|
-
s = RDF::URI.new(s.to_s)
|
|
143
|
-
else
|
|
144
|
-
self.debug and $stderr.puts "Subject #{s.to_s} must be a URI-compatible thingy"
|
|
145
|
-
abort "Subject #{s.to_s} must be a URI-compatible thingy"
|
|
146
|
-
end
|
|
147
|
-
end
|
|
148
|
-
|
|
149
|
-
unless p.respond_to?('uri')
|
|
150
|
-
|
|
151
|
-
if p.to_s =~ /^\w+:\/?\/?[^\s]+/
|
|
152
|
-
p = RDF::URI.new(p.to_s)
|
|
153
|
-
else
|
|
154
|
-
self.debug and $stderr.puts "Predicate #{p.to_s} must be a URI-compatible thingy"
|
|
155
|
-
abort "Predicate #{p.to_s} must be a URI-compatible thingy"
|
|
156
|
-
end
|
|
157
|
-
end
|
|
158
|
-
|
|
159
|
-
unless o.respond_to?('uri')
|
|
160
|
-
if o.to_s =~ /\A\w+:\/?\/?\w[^\s]+/
|
|
161
|
-
o = RDF::URI.new(o.to_s)
|
|
122
|
+
|
|
123
|
+
unless o.respond_to?('uri')
|
|
124
|
+
o = if o.to_s =~ %r{\A\w+:/?/?\w[^\s]+}
|
|
125
|
+
RDF::URI.new(o.to_s)
|
|
162
126
|
elsif o.to_s =~ /^\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d/
|
|
163
|
-
|
|
127
|
+
RDF::Literal.new(o.to_s, datatype: RDF::XSD.date)
|
|
164
128
|
elsif o.to_s =~ /^[+-]?\d+\.\d+/
|
|
165
|
-
|
|
129
|
+
RDF::Literal.new(o.to_s, datatype: RDF::XSD.float)
|
|
166
130
|
elsif o.to_s =~ /^[+-]?[0-9]+$/
|
|
167
|
-
|
|
131
|
+
RDF::Literal.new(o.to_s, datatype: RDF::XSD.int)
|
|
168
132
|
else
|
|
169
|
-
|
|
133
|
+
RDF::Literal.new(o.to_s, language: :en)
|
|
170
134
|
end
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
return g.dump(:jsonld)
|
|
220
|
-
end
|
|
221
|
-
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
debug and warn("\n\ninserting #{s} #{p} #{o}\n\n")
|
|
138
|
+
triple = RDF::Statement(s, p, o)
|
|
139
|
+
repo.insert(triple)
|
|
140
|
+
|
|
141
|
+
true
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
# A utility function that SHOULD NOT BE CALLED EXTERNALLY
|
|
145
|
+
#
|
|
146
|
+
# @param s - subject node
|
|
147
|
+
# @param p - predicate node
|
|
148
|
+
# @param o - object node
|
|
149
|
+
# @param repo - an RDF::Graph object
|
|
150
|
+
def self.triplify(s, p, o, repo)
|
|
151
|
+
triplify(s, p, o, repo)
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
def addComment(newcomment)
|
|
155
|
+
comments << newcomment.to_s
|
|
156
|
+
# return self.comments
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
def createEvaluationResponse
|
|
160
|
+
g = RDF::Graph.new
|
|
161
|
+
|
|
162
|
+
dt = Time.now.iso8601
|
|
163
|
+
uri = testedGUID
|
|
164
|
+
|
|
165
|
+
me = protocol + '://' + host + '/' + basePath + path
|
|
166
|
+
|
|
167
|
+
meURI = "#{me}##{uri}/result-#{dt}"
|
|
168
|
+
meURI = Addressable::URI.escape(meURI)
|
|
169
|
+
|
|
170
|
+
triplify(meURI, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
|
|
171
|
+
'http://fairmetrics.org/resources/metric_evaluation_result', g)
|
|
172
|
+
triplify(meURI, 'http://semanticscience.org/resource/SIO_000300', score, g)
|
|
173
|
+
triplify(meURI, 'http://purl.obolibrary.org/obo/date', dt, g)
|
|
174
|
+
triplify(meURI, 'http://schema.org/softwareVersion', VERSION, g)
|
|
175
|
+
triplify(meURI, 'http://semanticscience.org/resource/SIO_000332', uri, g)
|
|
176
|
+
|
|
177
|
+
comments = 'no comments received. '
|
|
178
|
+
|
|
179
|
+
comments = self.comments.join("\n") if self.comments.size > 0
|
|
180
|
+
triplify(meURI, 'http://schema.org/comment', comments, g)
|
|
181
|
+
|
|
182
|
+
g.dump(:jsonld)
|
|
222
183
|
end
|
|
223
|
-
|
|
224
|
-
|
|
184
|
+
end
|
data/lib/web_utils.rb
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
module
|
|
1
|
+
module HarvesterTools
|
|
2
2
|
|
|
3
3
|
class WebUtils
|
|
4
|
-
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta:
|
|
4
|
+
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
|
|
5
5
|
warn 'In fetch routine now. '
|
|
6
6
|
|
|
7
7
|
begin
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: fsp_harvester
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.12
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Mark Wilkinson
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2022-08-
|
|
11
|
+
date: 2022-08-11 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: json
|
|
@@ -182,15 +182,18 @@ files:
|
|
|
182
182
|
- bin/setup
|
|
183
183
|
- example_test.rb
|
|
184
184
|
- launch.json
|
|
185
|
+
- lib/config.conf
|
|
185
186
|
- lib/config.conf_docker
|
|
186
187
|
- lib/config.conf_local
|
|
187
188
|
- lib/constants.rb
|
|
189
|
+
- lib/external_tools.rb
|
|
188
190
|
- lib/fsp_harvester.rb
|
|
189
191
|
- lib/fsp_harvester/version.rb
|
|
190
|
-
- lib/
|
|
191
|
-
- lib/
|
|
192
|
-
- lib/
|
|
192
|
+
- lib/harvester.rb
|
|
193
|
+
- lib/harvester_utils.rb
|
|
194
|
+
- lib/metadata_harvester.rb
|
|
193
195
|
- lib/metadata_object.rb
|
|
196
|
+
- lib/metadata_parser.rb
|
|
194
197
|
- lib/signposting_tests.rb
|
|
195
198
|
- lib/swagger.rb
|
|
196
199
|
- lib/warnings.json
|