fsp_harvester 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +121 -56
- data/lib/warnings.json +41 -0
- data/lib/web_utils.rb +3 -3
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f24e34b5239426a8555e5d893cb3692a1f03442f6b4af3c03c5751c975a7871b
|
4
|
+
data.tar.gz: d372b73eb7693e5a4c9a2f78e20d02b62f3c195ed4185db7532018a45a694570
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e85c8ba90bee37156e8a4d8e98ec0d8c2148ffc86e24ac3faf0adde1146efaafa2333e7e25acf2b5b4d05aa1f2a9a411deb18890175e93bd1b8d0980773e42c2
|
7
|
+
data.tar.gz: f20559315f1b9aff81978600f50743fdee2d3b200eae0dc375ccf267fda90909cb16a6e43686a5efea0f6c583c90c4511c7390702ad7f9e9704f80f829da128b
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fsp_harvester (0.1.
|
4
|
+
fsp_harvester (0.1.6)
|
5
5
|
json (~> 2.0)
|
6
6
|
linkeddata (~> 3.2)
|
7
|
-
linkheaders-processor (~> 0.1.
|
7
|
+
linkheaders-processor (~> 0.1.12)
|
8
8
|
metainspector (~> 5.11.2)
|
9
9
|
parseconfig (~> 1.1)
|
10
10
|
rake (~> 13.0)
|
@@ -126,7 +126,7 @@ GEM
|
|
126
126
|
shex (~> 0.7)
|
127
127
|
sparql (~> 3.2)
|
128
128
|
sparql-client (~> 3.2)
|
129
|
-
linkheaders-processor (0.1.
|
129
|
+
linkheaders-processor (0.1.12)
|
130
130
|
json (~> 2.0)
|
131
131
|
json-ld (~> 3.2)
|
132
132
|
json-ld-preloaded (~> 3.2)
|
data/lib/fsp_harvester.rb
CHANGED
@@ -1,24 +1,24 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
-
require
|
10
|
-
require
|
11
|
-
require
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
15
|
-
require
|
16
|
-
require
|
17
|
-
require
|
18
|
-
require
|
19
|
-
require_relative
|
20
|
-
require_relative
|
21
|
-
require_relative
|
3
|
+
require_relative 'fsp_harvester/version'
|
4
|
+
require 'json/ld'
|
5
|
+
require 'json/ld/preloaded'
|
6
|
+
require 'json'
|
7
|
+
require 'linkheaders/processor'
|
8
|
+
require 'addressable'
|
9
|
+
require 'tempfile'
|
10
|
+
require 'xmlsimple'
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'parseconfig'
|
13
|
+
require 'rest-client'
|
14
|
+
require 'cgi'
|
15
|
+
require 'digest'
|
16
|
+
require 'open3'
|
17
|
+
require 'metainspector'
|
18
|
+
require 'rdf/xsd'
|
19
|
+
require_relative './metadata_object'
|
20
|
+
require_relative './constants'
|
21
|
+
require_relative './web_utils'
|
22
22
|
|
23
23
|
module FspHarvester
|
24
24
|
class Error < StandardError
|
@@ -32,28 +32,28 @@ module FspHarvester
|
|
32
32
|
def self.resolve_guid(guid:)
|
33
33
|
@meta.finalURI = [guid]
|
34
34
|
type, url = convertToURL(guid: guid)
|
35
|
-
links =
|
36
|
-
|
37
|
-
@meta.warnings << ["006", guid, ""]
|
38
|
-
@meta.comments << "FATAL: GUID type not recognized.\n"
|
39
|
-
else
|
35
|
+
links = []
|
36
|
+
if type
|
40
37
|
links, @meta = resolve_url(url: url)
|
38
|
+
else
|
39
|
+
@meta.warnings << ['006', guid, '']
|
40
|
+
@meta.comments << "FATAL: GUID type not recognized.\n"
|
41
41
|
end
|
42
42
|
[links, @meta]
|
43
43
|
end
|
44
44
|
|
45
45
|
def self.convertToURL(guid:)
|
46
46
|
GUID_TYPES.each do |k, regex|
|
47
|
-
if k ==
|
48
|
-
return
|
49
|
-
elsif k ==
|
50
|
-
return
|
51
|
-
elsif k ==
|
52
|
-
return
|
53
|
-
elsif k ==
|
54
|
-
return
|
55
|
-
elsif k ==
|
56
|
-
return
|
47
|
+
if k == 'inchi' and regex.match(guid)
|
48
|
+
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
49
|
+
elsif k == 'handle1' and regex.match(guid)
|
50
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
51
|
+
elsif k == 'handle2' and regex.match(guid)
|
52
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
53
|
+
elsif k == 'uri' and regex.match(guid)
|
54
|
+
return 'uri', guid
|
55
|
+
elsif k == 'doi' and regex.match(guid)
|
56
|
+
return 'doi', "https://doi.org/#{guid}"
|
57
57
|
end
|
58
58
|
end
|
59
59
|
[nil, nil]
|
@@ -66,14 +66,14 @@ module FspHarvester
|
|
66
66
|
false
|
67
67
|
end
|
68
68
|
|
69
|
-
def self.resolve_url(url:, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
|
70
|
-
@meta.guidtype =
|
69
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
|
70
|
+
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
71
71
|
warn "\n\n FETCHING #{url} #{header}\n\n"
|
72
|
-
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header)
|
73
|
-
warn "\n\n head #{response.headers.inspect}\n\n"
|
72
|
+
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
|
73
|
+
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
74
74
|
|
75
75
|
unless response
|
76
|
-
@meta.warnings << [
|
76
|
+
@meta.warnings << ['001', url, header]
|
77
77
|
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
78
78
|
return [[], @meta]
|
79
79
|
end
|
@@ -90,48 +90,113 @@ module FspHarvester
|
|
90
90
|
|
91
91
|
parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
|
92
92
|
parser.extract_and_parse(response: response)
|
93
|
-
factory = parser.factory
|
93
|
+
factory = parser.factory # LinkHeaders::LinkFactory
|
94
94
|
|
95
|
-
citeas = 0
|
96
|
-
describedby = 0
|
97
95
|
warn "\n\n length #{factory.all_links.length}\n\n"
|
96
|
+
signpostingcheck(factory: factory)
|
97
|
+
end
|
98
98
|
|
99
|
+
def self.signpostingcheck(factory:)
|
100
|
+
citeas = 0
|
101
|
+
describedby = 0
|
99
102
|
factory.all_links.each do |l|
|
100
103
|
case l.relation
|
101
|
-
when
|
104
|
+
when 'cite-as'
|
102
105
|
citeas += 1
|
103
|
-
when
|
106
|
+
when 'item'
|
107
|
+
if !(l.respond_to? 'type')
|
108
|
+
@meta.warnings << ['011', l.href, '']
|
109
|
+
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
|
110
|
+
end
|
111
|
+
type = l.type if l.respond_to? 'type'
|
112
|
+
type = '*/*' unless type # this becomes a frozen string
|
113
|
+
header = { accept: type }
|
114
|
+
response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
|
115
|
+
|
116
|
+
if response
|
117
|
+
if response.headers[:content_type] and !(type == '*/*')
|
118
|
+
rtype = type.gsub(%r{/}, "\/") # because type is a frozen string
|
119
|
+
rtype = rtype.gsub(/\+/, '.')
|
120
|
+
typeregex = Regexp.new(type)
|
121
|
+
if response.headers[:content_type].match(typeregex)
|
122
|
+
warn response.headers[:content_type]
|
123
|
+
warn typeregex.inspect
|
124
|
+
@meta.comments << "INFO: item link responds according to Signposting specifications\n"
|
125
|
+
else
|
126
|
+
@meta.warnings << ['012', l.href, header]
|
127
|
+
@meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
|
128
|
+
end
|
129
|
+
else
|
130
|
+
@meta.warnings << ['013', l.href, header]
|
131
|
+
@meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
|
132
|
+
end
|
133
|
+
else
|
134
|
+
@meta.warnings << ['014', l.href, header]
|
135
|
+
@meta.comments << "WARN: item link doesn't resolve\n"
|
136
|
+
end
|
137
|
+
|
138
|
+
when 'describedby'
|
104
139
|
describedby += 1
|
105
|
-
|
106
|
-
@meta.warnings << [
|
107
|
-
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute
|
140
|
+
if !(l.respond_to? 'type')
|
141
|
+
@meta.warnings << ['005', l.href, '']
|
142
|
+
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
|
143
|
+
end
|
144
|
+
type = l.type if l.respond_to? 'type'
|
145
|
+
type = '*/*' unless type
|
146
|
+
header = { accept: type }
|
147
|
+
response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
|
148
|
+
if response
|
149
|
+
if response.headers[:content_type] and !(type == '*/*')
|
150
|
+
rtype = type.gsub(%r{/}, "\/")
|
151
|
+
rtype = rtype.gsub(/\+/, '.')
|
152
|
+
typeregex = Regexp.new(rtype)
|
153
|
+
if response.headers[:content_type].match(typeregex)
|
154
|
+
warn response.headers[:content_type]
|
155
|
+
warn typeregex.inspect
|
156
|
+
@meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
|
157
|
+
else
|
158
|
+
@meta.warnings << ['009', l.href, header]
|
159
|
+
@meta.comments << "WARN: Content type of returned describedby link does not match the 'type' attribute\n"
|
160
|
+
end
|
161
|
+
else
|
162
|
+
@meta.warnings << ['010', l.href, header]
|
163
|
+
@meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
|
164
|
+
end
|
165
|
+
else
|
166
|
+
@meta.warnings << ['008', l.href, header]
|
167
|
+
@meta.comments << "WARN: describedby link doesn't resolve\n"
|
108
168
|
end
|
109
169
|
end
|
110
170
|
end
|
111
171
|
if citeas > 1
|
112
|
-
|
172
|
+
@meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
|
173
|
+
citeas = check_for_citeas_conflicts(factory: factory) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
113
174
|
end
|
114
175
|
|
115
176
|
unless citeas == 1 && describedby > 0
|
116
|
-
@meta.warnings << [
|
177
|
+
@meta.warnings << ['004', '', '']
|
117
178
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
|
118
179
|
end
|
119
180
|
factory.all_links
|
120
181
|
end
|
121
182
|
|
122
|
-
def self.
|
123
|
-
@meta.comments <<
|
124
|
-
citeas =
|
183
|
+
def self.check_for_citeas_conflicts(factory:)
|
184
|
+
@meta.comments << 'INFO: checking for conflicting cite-as links'
|
185
|
+
citeas = []
|
125
186
|
factory.all_links.each do |link|
|
126
187
|
next unless link.relation == 'cite-as'
|
188
|
+
|
189
|
+
@meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
|
127
190
|
citeas << link.href
|
128
191
|
end
|
129
|
-
|
130
|
-
|
192
|
+
|
193
|
+
if citeas.uniq.length == 1
|
194
|
+
@meta.comments << 'INFO: No conflicting cite-as links found.'
|
195
|
+
else # only one allowed!
|
196
|
+
@meta.warnings << ['007', '', '']
|
131
197
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers\n"
|
132
|
-
else
|
133
|
-
@meta.comments << "INFO: No conflicting cite-as links found."
|
134
198
|
end
|
199
|
+
citeas.uniq
|
135
200
|
end
|
136
201
|
end
|
137
202
|
end
|
data/lib/warnings.json
CHANGED
@@ -28,6 +28,47 @@
|
|
28
28
|
"message": "GUID type not recognized",
|
29
29
|
"linkout": "",
|
30
30
|
"severity": "WARN"
|
31
|
+
},
|
32
|
+
"007": {
|
33
|
+
"message": "Conflicting cite-as links",
|
34
|
+
"linkout": "",
|
35
|
+
"severity": "WARN"
|
36
|
+
},
|
37
|
+
"008": {
|
38
|
+
"message": "describedby link does not resolve",
|
39
|
+
"linkout": "",
|
40
|
+
"severity": "WARN"
|
41
|
+
},
|
42
|
+
"009": {
|
43
|
+
"message": "Content-type of described-by link does not match the type attribute in the link header itself",
|
44
|
+
"linkout": "",
|
45
|
+
"severity": "WARN"
|
46
|
+
},
|
47
|
+
"010": {
|
48
|
+
"message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
|
49
|
+
"linkout": "",
|
50
|
+
"severity": "WARN"
|
51
|
+
},
|
52
|
+
"011": {
|
53
|
+
"message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
|
54
|
+
"linkout": "",
|
55
|
+
"severity": "WARN"
|
56
|
+
},
|
57
|
+
"012": {
|
58
|
+
"message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
|
59
|
+
"linkout": "",
|
60
|
+
"severity": "WARN"
|
61
|
+
},
|
62
|
+
"013": {
|
63
|
+
"message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
|
64
|
+
"linkout": "",
|
65
|
+
"severity": "WARN"
|
66
|
+
},
|
67
|
+
"014": {
|
68
|
+
"message": "Item link does not resolve",
|
69
|
+
"linkout": "",
|
70
|
+
"severity": "WARN"
|
31
71
|
}
|
72
|
+
|
32
73
|
|
33
74
|
}
|
data/lib/web_utils.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
module FspHarvester
|
2
2
|
|
3
3
|
class WebUtils
|
4
|
-
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER
|
4
|
+
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
|
5
5
|
warn 'In fetch routine now. '
|
6
6
|
|
7
7
|
begin
|
8
8
|
warn "executing call over the Web to #{url}"
|
9
9
|
response = RestClient::Request.execute({
|
10
|
-
method:
|
10
|
+
method: method,
|
11
11
|
url: url.to_s,
|
12
12
|
# user: user,
|
13
13
|
# password: pass,
|
@@ -26,7 +26,7 @@ module FspHarvester
|
|
26
26
|
warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
|
27
27
|
@meta.warnings << ["003", url, headers] if @meta
|
28
28
|
@meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
|
29
|
-
if e.response.code == 500
|
29
|
+
if (e.response.code == 500 or e.response.code == 404)
|
30
30
|
return false
|
31
31
|
else
|
32
32
|
e.response
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fsp_harvester
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-07-
|
11
|
+
date: 2022-07-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 0.1.
|
47
|
+
version: 0.1.12
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 0.1.
|
54
|
+
version: 0.1.12
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: metainspector
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|