fsp_harvester 0.1.4 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/Rakefile +3 -3
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +71 -73
- data/lib/warnings.json +41 -0
- data/lib/web_utils.rb +3 -3
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7da32f3321193e93f64154c35db0c840f5e7f086451660f99b62d3f4c834e295
|
4
|
+
data.tar.gz: 5a0a1ff4ef6b2100accd8bab4f20d6842d25ebf88d5e600e646804da9ed24bd9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dad90f81b73489a151220c132d74508832573a352b209a537bbbaf6543a90b9e132cca80ecca33d15d02eca91841642d73e795113076f4564730194b5bf1fa53
|
7
|
+
data.tar.gz: 5d11c2f002f4e73a4971aec0d047cd14d35f6763155d0effa9c79f419ff6fd26f853158b290d2264edd148633570f5ecd50d4e1f6c7e9d136c7d2880517bdefc
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fsp_harvester (0.1.4)
|
4
|
+
fsp_harvester (0.1.7)
|
5
5
|
json (~> 2.0)
|
6
6
|
linkeddata (~> 3.2)
|
7
|
-
linkheaders-processor (~> 0.1.
|
7
|
+
linkheaders-processor (~> 0.1.13)
|
8
8
|
metainspector (~> 5.11.2)
|
9
9
|
parseconfig (~> 1.1)
|
10
10
|
rake (~> 13.0)
|
@@ -126,7 +126,7 @@ GEM
|
|
126
126
|
shex (~> 0.7)
|
127
127
|
sparql (~> 3.2)
|
128
128
|
sparql-client (~> 3.2)
|
129
|
-
linkheaders-processor (0.1.
|
129
|
+
linkheaders-processor (0.1.13)
|
130
130
|
json (~> 2.0)
|
131
131
|
json-ld (~> 3.2)
|
132
132
|
json-ld-preloaded (~> 3.2)
|
data/Rakefile
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rspec/core/rake_task'
|
5
5
|
|
6
6
|
RSpec::Core::RakeTask.new(:spec)
|
7
7
|
|
8
|
-
require
|
8
|
+
require 'rubocop/rake_task'
|
9
9
|
|
10
10
|
RuboCop::RakeTask.new
|
11
11
|
|
data/lib/fsp_harvester.rb
CHANGED
@@ -1,24 +1,25 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
-
require
|
10
|
-
require
|
11
|
-
require
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
15
|
-
require
|
16
|
-
require
|
17
|
-
require
|
18
|
-
require
|
19
|
-
require_relative
|
20
|
-
require_relative
|
21
|
-
require_relative
|
3
|
+
require_relative 'fsp_harvester/version'
|
4
|
+
require 'json/ld'
|
5
|
+
require 'json/ld/preloaded'
|
6
|
+
require 'json'
|
7
|
+
require 'linkheaders/processor'
|
8
|
+
require 'addressable'
|
9
|
+
require 'tempfile'
|
10
|
+
require 'xmlsimple'
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'parseconfig'
|
13
|
+
require 'rest-client'
|
14
|
+
require 'cgi'
|
15
|
+
require 'digest'
|
16
|
+
require 'open3'
|
17
|
+
require 'metainspector'
|
18
|
+
require 'rdf/xsd'
|
19
|
+
require_relative './metadata_object'
|
20
|
+
require_relative './constants'
|
21
|
+
require_relative './web_utils'
|
22
|
+
require_relative './signposting_tests'
|
22
23
|
|
23
24
|
module FspHarvester
|
24
25
|
class Error < StandardError
|
@@ -27,33 +28,34 @@ module FspHarvester
|
|
27
28
|
class Utils
|
28
29
|
# @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
|
29
30
|
# @warnings = JSON.parse(File.read("warnings.json"))
|
30
|
-
|
31
|
+
|
31
32
|
|
32
33
|
def self.resolve_guid(guid:)
|
34
|
+
@meta = FspHarvester::MetadataObject.new
|
33
35
|
@meta.finalURI = [guid]
|
34
36
|
type, url = convertToURL(guid: guid)
|
35
37
|
links = Array.new
|
36
|
-
|
37
|
-
|
38
|
-
@meta.comments << "FATAL: GUID type not recognized.\n"
|
38
|
+
if type
|
39
|
+
links = resolve_url(url: url)
|
39
40
|
else
|
40
|
-
|
41
|
+
@meta.warnings << ['006', guid, '']
|
42
|
+
@meta.comments << "FATAL: GUID type not recognized.\n"
|
41
43
|
end
|
42
44
|
[links, @meta]
|
43
45
|
end
|
44
46
|
|
45
47
|
def self.convertToURL(guid:)
|
46
48
|
GUID_TYPES.each do |k, regex|
|
47
|
-
if k ==
|
48
|
-
return
|
49
|
-
elsif k ==
|
50
|
-
return
|
51
|
-
elsif k ==
|
52
|
-
return
|
53
|
-
elsif k ==
|
54
|
-
return
|
55
|
-
elsif k ==
|
56
|
-
return
|
49
|
+
if k == 'inchi' and regex.match(guid)
|
50
|
+
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
51
|
+
elsif k == 'handle1' and regex.match(guid)
|
52
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
53
|
+
elsif k == 'handle2' and regex.match(guid)
|
54
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
55
|
+
elsif k == 'uri' and regex.match(guid)
|
56
|
+
return 'uri', guid
|
57
|
+
elsif k == 'doi' and regex.match(guid)
|
58
|
+
return 'doi', "https://doi.org/#{guid}"
|
57
59
|
end
|
58
60
|
end
|
59
61
|
[nil, nil]
|
@@ -66,23 +68,23 @@ module FspHarvester
|
|
66
68
|
false
|
67
69
|
end
|
68
70
|
|
69
|
-
def self.resolve_url(url:, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
|
70
|
-
@meta.guidtype =
|
71
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
|
72
|
+
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
71
73
|
warn "\n\n FETCHING #{url} #{header}\n\n"
|
72
|
-
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header)
|
73
|
-
warn "\n\n head #{response.headers.inspect}\n\n"
|
74
|
+
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
|
75
|
+
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
74
76
|
|
75
77
|
unless response
|
76
|
-
@meta.warnings << [
|
78
|
+
@meta.warnings << ['001', url, header]
|
77
79
|
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
78
|
-
return [
|
80
|
+
return []
|
79
81
|
end
|
80
82
|
|
81
83
|
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.finalURI.last}. Using the output from this URL for the next few tests..."
|
82
84
|
@meta.full_response << response.body
|
83
85
|
|
84
86
|
links = process_link_headers(response: response) unless nolinkheaders
|
85
|
-
|
87
|
+
links
|
86
88
|
end
|
87
89
|
|
88
90
|
def self.process_link_headers(response:)
|
@@ -90,47 +92,43 @@ module FspHarvester
|
|
90
92
|
|
91
93
|
parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
|
92
94
|
parser.extract_and_parse(response: response)
|
93
|
-
factory = parser.factory
|
95
|
+
factory = parser.factory # LinkHeaders::LinkFactory
|
94
96
|
|
95
|
-
|
96
|
-
|
97
|
-
warn "\n\n length #{factory.all_links.length}\n\n"
|
97
|
+
warn "\n\n length bfore #{factory.all_links.length}\n\n"
|
98
|
+
signpostingcheck(factory: factory)
|
99
|
+
warn "\n\n length aftr #{factory.all_links.length}\n\n"
|
100
|
+
warn "\n\n links #{factory.all_links}\n\n"
|
101
|
+
factory.all_links
|
102
|
+
end
|
98
103
|
|
104
|
+
def self.signpostingcheck(factory:)
|
105
|
+
citeas = Array.new
|
106
|
+
describedby = Array.new
|
107
|
+
item = Array.new
|
99
108
|
factory.all_links.each do |l|
|
100
109
|
case l.relation
|
101
|
-
when
|
102
|
-
citeas
|
103
|
-
when
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute\n"
|
108
|
-
end
|
110
|
+
when 'cite-as'
|
111
|
+
citeas << l
|
112
|
+
when 'item'
|
113
|
+
item << l
|
114
|
+
when 'describedby'
|
115
|
+
describedby << l
|
109
116
|
end
|
110
117
|
end
|
111
|
-
if citeas > 1
|
112
|
-
self.check_for_conflicts(factory: factory) # this merelty adsds to the metadata objects if there are conflicts
|
113
|
-
end
|
114
118
|
|
115
|
-
|
116
|
-
|
117
|
-
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
|
118
|
-
end
|
119
|
-
factory.all_links
|
120
|
-
end
|
119
|
+
check_describedby_rules(describedby: describedby)
|
120
|
+
check_item_rules(item: item)
|
121
121
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
citeas << link.href
|
122
|
+
uniqueciteas = Array.new
|
123
|
+
if citeas.length > 1
|
124
|
+
warn "INFO: multiple cite-as links found. Checking for conflicts\n"
|
125
|
+
@meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
|
126
|
+
uniqueciteas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
128
127
|
end
|
129
|
-
|
130
|
-
|
131
|
-
@meta.
|
132
|
-
|
133
|
-
@meta.comments << "INFO: No conflicting cite-as links found."
|
128
|
+
|
129
|
+
unless uniqueciteas == 1 && describedby.length > 0
|
130
|
+
@meta.warnings << ['004', '', '']
|
131
|
+
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
|
134
132
|
end
|
135
133
|
end
|
136
134
|
end
|
data/lib/warnings.json
CHANGED
@@ -28,6 +28,47 @@
|
|
28
28
|
"message": "GUID type not recognized",
|
29
29
|
"linkout": "",
|
30
30
|
"severity": "WARN"
|
31
|
+
},
|
32
|
+
"007": {
|
33
|
+
"message": "Conflicting cite-as links",
|
34
|
+
"linkout": "",
|
35
|
+
"severity": "WARN"
|
36
|
+
},
|
37
|
+
"008": {
|
38
|
+
"message": "describedby link does not resolve",
|
39
|
+
"linkout": "",
|
40
|
+
"severity": "WARN"
|
41
|
+
},
|
42
|
+
"009": {
|
43
|
+
"message": "Content-type of described-by link does not match the type attribute in the link header itself",
|
44
|
+
"linkout": "",
|
45
|
+
"severity": "WARN"
|
46
|
+
},
|
47
|
+
"010": {
|
48
|
+
"message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
|
49
|
+
"linkout": "",
|
50
|
+
"severity": "WARN"
|
51
|
+
},
|
52
|
+
"011": {
|
53
|
+
"message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
|
54
|
+
"linkout": "",
|
55
|
+
"severity": "WARN"
|
56
|
+
},
|
57
|
+
"012": {
|
58
|
+
"message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
|
59
|
+
"linkout": "",
|
60
|
+
"severity": "WARN"
|
61
|
+
},
|
62
|
+
"013": {
|
63
|
+
"message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
|
64
|
+
"linkout": "",
|
65
|
+
"severity": "WARN"
|
66
|
+
},
|
67
|
+
"014": {
|
68
|
+
"message": "Item link does not resolve",
|
69
|
+
"linkout": "",
|
70
|
+
"severity": "WARN"
|
31
71
|
}
|
72
|
+
|
32
73
|
|
33
74
|
}
|
data/lib/web_utils.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
module FspHarvester
|
2
2
|
|
3
3
|
class WebUtils
|
4
|
-
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER
|
4
|
+
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
|
5
5
|
warn 'In fetch routine now. '
|
6
6
|
|
7
7
|
begin
|
8
8
|
warn "executing call over the Web to #{url}"
|
9
9
|
response = RestClient::Request.execute({
|
10
|
-
method:
|
10
|
+
method: method,
|
11
11
|
url: url.to_s,
|
12
12
|
# user: user,
|
13
13
|
# password: pass,
|
@@ -26,7 +26,7 @@ module FspHarvester
|
|
26
26
|
warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
|
27
27
|
@meta.warnings << ["003", url, headers] if @meta
|
28
28
|
@meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
|
29
|
-
if e.response.code == 500
|
29
|
+
if (e.response.code == 500 or e.response.code == 404)
|
30
30
|
return false
|
31
31
|
else
|
32
32
|
e.response
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fsp_harvester
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 0.1.
|
47
|
+
version: 0.1.13
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 0.1.
|
54
|
+
version: 0.1.13
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: metainspector
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|