fsp_harvester 0.1.3 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +121 -56
- data/lib/warnings.json +41 -0
- data/lib/web_utils.rb +3 -3
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f24e34b5239426a8555e5d893cb3692a1f03442f6b4af3c03c5751c975a7871b
|
4
|
+
data.tar.gz: d372b73eb7693e5a4c9a2f78e20d02b62f3c195ed4185db7532018a45a694570
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e85c8ba90bee37156e8a4d8e98ec0d8c2148ffc86e24ac3faf0adde1146efaafa2333e7e25acf2b5b4d05aa1f2a9a411deb18890175e93bd1b8d0980773e42c2
|
7
|
+
data.tar.gz: f20559315f1b9aff81978600f50743fdee2d3b200eae0dc375ccf267fda90909cb16a6e43686a5efea0f6c583c90c4511c7390702ad7f9e9704f80f829da128b
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fsp_harvester (0.1.3)
|
4
|
+
fsp_harvester (0.1.6)
|
5
5
|
json (~> 2.0)
|
6
6
|
linkeddata (~> 3.2)
|
7
|
-
linkheaders-processor (~> 0.1.
|
7
|
+
linkheaders-processor (~> 0.1.12)
|
8
8
|
metainspector (~> 5.11.2)
|
9
9
|
parseconfig (~> 1.1)
|
10
10
|
rake (~> 13.0)
|
@@ -126,7 +126,7 @@ GEM
|
|
126
126
|
shex (~> 0.7)
|
127
127
|
sparql (~> 3.2)
|
128
128
|
sparql-client (~> 3.2)
|
129
|
-
linkheaders-processor (0.1.
|
129
|
+
linkheaders-processor (0.1.12)
|
130
130
|
json (~> 2.0)
|
131
131
|
json-ld (~> 3.2)
|
132
132
|
json-ld-preloaded (~> 3.2)
|
data/lib/fsp_harvester.rb
CHANGED
@@ -1,24 +1,24 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
-
require
|
10
|
-
require
|
11
|
-
require
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
15
|
-
require
|
16
|
-
require
|
17
|
-
require
|
18
|
-
require
|
19
|
-
require_relative
|
20
|
-
require_relative
|
21
|
-
require_relative
|
3
|
+
require_relative 'fsp_harvester/version'
|
4
|
+
require 'json/ld'
|
5
|
+
require 'json/ld/preloaded'
|
6
|
+
require 'json'
|
7
|
+
require 'linkheaders/processor'
|
8
|
+
require 'addressable'
|
9
|
+
require 'tempfile'
|
10
|
+
require 'xmlsimple'
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'parseconfig'
|
13
|
+
require 'rest-client'
|
14
|
+
require 'cgi'
|
15
|
+
require 'digest'
|
16
|
+
require 'open3'
|
17
|
+
require 'metainspector'
|
18
|
+
require 'rdf/xsd'
|
19
|
+
require_relative './metadata_object'
|
20
|
+
require_relative './constants'
|
21
|
+
require_relative './web_utils'
|
22
22
|
|
23
23
|
module FspHarvester
|
24
24
|
class Error < StandardError
|
@@ -32,28 +32,28 @@ module FspHarvester
|
|
32
32
|
def self.resolve_guid(guid:)
|
33
33
|
@meta.finalURI = [guid]
|
34
34
|
type, url = convertToURL(guid: guid)
|
35
|
-
links =
|
36
|
-
|
37
|
-
@meta.warnings << ["006", guid, ""]
|
38
|
-
@meta.comments << "FATAL: GUID type not recognized.\n"
|
39
|
-
else
|
35
|
+
links = []
|
36
|
+
if type
|
40
37
|
links, @meta = resolve_url(url: url)
|
38
|
+
else
|
39
|
+
@meta.warnings << ['006', guid, '']
|
40
|
+
@meta.comments << "FATAL: GUID type not recognized.\n"
|
41
41
|
end
|
42
42
|
[links, @meta]
|
43
43
|
end
|
44
44
|
|
45
45
|
def self.convertToURL(guid:)
|
46
46
|
GUID_TYPES.each do |k, regex|
|
47
|
-
if k ==
|
48
|
-
return
|
49
|
-
elsif k ==
|
50
|
-
return
|
51
|
-
elsif k ==
|
52
|
-
return
|
53
|
-
elsif k ==
|
54
|
-
return
|
55
|
-
elsif k ==
|
56
|
-
return
|
47
|
+
if k == 'inchi' and regex.match(guid)
|
48
|
+
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
49
|
+
elsif k == 'handle1' and regex.match(guid)
|
50
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
51
|
+
elsif k == 'handle2' and regex.match(guid)
|
52
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
53
|
+
elsif k == 'uri' and regex.match(guid)
|
54
|
+
return 'uri', guid
|
55
|
+
elsif k == 'doi' and regex.match(guid)
|
56
|
+
return 'doi', "https://doi.org/#{guid}"
|
57
57
|
end
|
58
58
|
end
|
59
59
|
[nil, nil]
|
@@ -66,14 +66,14 @@ module FspHarvester
|
|
66
66
|
false
|
67
67
|
end
|
68
68
|
|
69
|
-
def self.resolve_url(url:, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
|
70
|
-
@meta.guidtype =
|
69
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
|
70
|
+
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
71
71
|
warn "\n\n FETCHING #{url} #{header}\n\n"
|
72
|
-
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header)
|
73
|
-
warn "\n\n head #{response.headers.inspect}\n\n"
|
72
|
+
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
|
73
|
+
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
74
74
|
|
75
75
|
unless response
|
76
|
-
@meta.warnings << [
|
76
|
+
@meta.warnings << ['001', url, header]
|
77
77
|
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
78
78
|
return [[], @meta]
|
79
79
|
end
|
@@ -90,48 +90,113 @@ module FspHarvester
|
|
90
90
|
|
91
91
|
parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
|
92
92
|
parser.extract_and_parse(response: response)
|
93
|
-
factory = parser.factory
|
93
|
+
factory = parser.factory # LinkHeaders::LinkFactory
|
94
94
|
|
95
|
-
citeas = 0
|
96
|
-
describedby = 0
|
97
95
|
warn "\n\n length #{factory.all_links.length}\n\n"
|
96
|
+
signpostingcheck(factory: factory)
|
97
|
+
end
|
98
98
|
|
99
|
+
def self.signpostingcheck(factory:)
|
100
|
+
citeas = 0
|
101
|
+
describedby = 0
|
99
102
|
factory.all_links.each do |l|
|
100
103
|
case l.relation
|
101
|
-
when
|
104
|
+
when 'cite-as'
|
102
105
|
citeas += 1
|
103
|
-
when
|
106
|
+
when 'item'
|
107
|
+
if !(l.respond_to? 'type')
|
108
|
+
@meta.warnings << ['011', l.href, '']
|
109
|
+
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
|
110
|
+
end
|
111
|
+
type = l.type if l.respond_to? 'type'
|
112
|
+
type = '*/*' unless type # this becomes a frozen string
|
113
|
+
header = { accept: type }
|
114
|
+
response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
|
115
|
+
|
116
|
+
if response
|
117
|
+
if response.headers[:content_type] and !(type == '*/*')
|
118
|
+
rtype = type.gsub(%r{/}, "\/") # because type is a frozen string
|
119
|
+
rtype = rtype.gsub(/\+/, '.')
|
120
|
+
typeregex = Regexp.new(type)
|
121
|
+
if response.headers[:content_type].match(typeregex)
|
122
|
+
warn response.headers[:content_type]
|
123
|
+
warn typeregex.inspect
|
124
|
+
@meta.comments << "INFO: item link responds according to Signposting specifications\n"
|
125
|
+
else
|
126
|
+
@meta.warnings << ['012', l.href, header]
|
127
|
+
@meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
|
128
|
+
end
|
129
|
+
else
|
130
|
+
@meta.warnings << ['013', l.href, header]
|
131
|
+
@meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
|
132
|
+
end
|
133
|
+
else
|
134
|
+
@meta.warnings << ['014', l.href, header]
|
135
|
+
@meta.comments << "WARN: item link doesn't resolve\n"
|
136
|
+
end
|
137
|
+
|
138
|
+
when 'describedby'
|
104
139
|
describedby += 1
|
105
|
-
|
106
|
-
@meta.warnings << [
|
107
|
-
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute
|
140
|
+
if !(l.respond_to? 'type')
|
141
|
+
@meta.warnings << ['005', l.href, '']
|
142
|
+
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
|
143
|
+
end
|
144
|
+
type = l.type if l.respond_to? 'type'
|
145
|
+
type = '*/*' unless type
|
146
|
+
header = { accept: type }
|
147
|
+
response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
|
148
|
+
if response
|
149
|
+
if response.headers[:content_type] and !(type == '*/*')
|
150
|
+
rtype = type.gsub(%r{/}, "\/")
|
151
|
+
rtype = rtype.gsub(/\+/, '.')
|
152
|
+
typeregex = Regexp.new(rtype)
|
153
|
+
if response.headers[:content_type].match(typeregex)
|
154
|
+
warn response.headers[:content_type]
|
155
|
+
warn typeregex.inspect
|
156
|
+
@meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
|
157
|
+
else
|
158
|
+
@meta.warnings << ['009', l.href, header]
|
159
|
+
@meta.comments << "WARN: Content type of returned describedby link does not match the 'type' attribute\n"
|
160
|
+
end
|
161
|
+
else
|
162
|
+
@meta.warnings << ['010', l.href, header]
|
163
|
+
@meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
|
164
|
+
end
|
165
|
+
else
|
166
|
+
@meta.warnings << ['008', l.href, header]
|
167
|
+
@meta.comments << "WARN: describedby link doesn't resolve\n"
|
108
168
|
end
|
109
169
|
end
|
110
170
|
end
|
111
171
|
if citeas > 1
|
112
|
-
|
172
|
+
@meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
|
173
|
+
citeas = check_for_citeas_conflicts(factory: factory) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
113
174
|
end
|
114
175
|
|
115
176
|
unless citeas == 1 && describedby > 0
|
116
|
-
@meta.warnings << [
|
177
|
+
@meta.warnings << ['004', '', '']
|
117
178
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
|
118
179
|
end
|
119
180
|
factory.all_links
|
120
181
|
end
|
121
182
|
|
122
|
-
def self.
|
123
|
-
@meta.comments <<
|
124
|
-
citeas =
|
183
|
+
def self.check_for_citeas_conflicts(factory:)
|
184
|
+
@meta.comments << 'INFO: checking for conflicting cite-as links'
|
185
|
+
citeas = []
|
125
186
|
factory.all_links.each do |link|
|
126
187
|
next unless link.relation == 'cite-as'
|
188
|
+
|
189
|
+
@meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
|
127
190
|
citeas << link.href
|
128
191
|
end
|
129
|
-
|
130
|
-
|
192
|
+
|
193
|
+
if citeas.uniq.length == 1
|
194
|
+
@meta.comments << 'INFO: No conflicting cite-as links found.'
|
195
|
+
else # only one allowed!
|
196
|
+
@meta.warnings << ['007', '', '']
|
131
197
|
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers\n"
|
132
|
-
else
|
133
|
-
@meta.comments << "INFO: No conflicting cite-as links found."
|
134
198
|
end
|
199
|
+
citeas.uniq
|
135
200
|
end
|
136
201
|
end
|
137
202
|
end
|
data/lib/warnings.json
CHANGED
@@ -28,6 +28,47 @@
|
|
28
28
|
"message": "GUID type not recognized",
|
29
29
|
"linkout": "",
|
30
30
|
"severity": "WARN"
|
31
|
+
},
|
32
|
+
"007": {
|
33
|
+
"message": "Conflicting cite-as links",
|
34
|
+
"linkout": "",
|
35
|
+
"severity": "WARN"
|
36
|
+
},
|
37
|
+
"008": {
|
38
|
+
"message": "describedby link does not resolve",
|
39
|
+
"linkout": "",
|
40
|
+
"severity": "WARN"
|
41
|
+
},
|
42
|
+
"009": {
|
43
|
+
"message": "Content-type of described-by link does not match the type attribute in the link header itself",
|
44
|
+
"linkout": "",
|
45
|
+
"severity": "WARN"
|
46
|
+
},
|
47
|
+
"010": {
|
48
|
+
"message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
|
49
|
+
"linkout": "",
|
50
|
+
"severity": "WARN"
|
51
|
+
},
|
52
|
+
"011": {
|
53
|
+
"message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
|
54
|
+
"linkout": "",
|
55
|
+
"severity": "WARN"
|
56
|
+
},
|
57
|
+
"012": {
|
58
|
+
"message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
|
59
|
+
"linkout": "",
|
60
|
+
"severity": "WARN"
|
61
|
+
},
|
62
|
+
"013": {
|
63
|
+
"message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
|
64
|
+
"linkout": "",
|
65
|
+
"severity": "WARN"
|
66
|
+
},
|
67
|
+
"014": {
|
68
|
+
"message": "Item link does not resolve",
|
69
|
+
"linkout": "",
|
70
|
+
"severity": "WARN"
|
31
71
|
}
|
72
|
+
|
32
73
|
|
33
74
|
}
|
data/lib/web_utils.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
module FspHarvester
|
2
2
|
|
3
3
|
class WebUtils
|
4
|
-
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER
|
4
|
+
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
|
5
5
|
warn 'In fetch routine now. '
|
6
6
|
|
7
7
|
begin
|
8
8
|
warn "executing call over the Web to #{url}"
|
9
9
|
response = RestClient::Request.execute({
|
10
|
-
method:
|
10
|
+
method: method,
|
11
11
|
url: url.to_s,
|
12
12
|
# user: user,
|
13
13
|
# password: pass,
|
@@ -26,7 +26,7 @@ module FspHarvester
|
|
26
26
|
warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
|
27
27
|
@meta.warnings << ["003", url, headers] if @meta
|
28
28
|
@meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
|
29
|
-
if e.response.code == 500
|
29
|
+
if (e.response.code == 500 or e.response.code == 404)
|
30
30
|
return false
|
31
31
|
else
|
32
32
|
e.response
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fsp_harvester
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-07-
|
11
|
+
date: 2022-07-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 0.1.
|
47
|
+
version: 0.1.12
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 0.1.
|
54
|
+
version: 0.1.12
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: metainspector
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|