fsp_harvester 0.1.4 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/Rakefile +3 -3
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +71 -73
- data/lib/warnings.json +41 -0
- data/lib/web_utils.rb +3 -3
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7da32f3321193e93f64154c35db0c840f5e7f086451660f99b62d3f4c834e295
|
4
|
+
data.tar.gz: 5a0a1ff4ef6b2100accd8bab4f20d6842d25ebf88d5e600e646804da9ed24bd9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dad90f81b73489a151220c132d74508832573a352b209a537bbbaf6543a90b9e132cca80ecca33d15d02eca91841642d73e795113076f4564730194b5bf1fa53
|
7
|
+
data.tar.gz: 5d11c2f002f4e73a4971aec0d047cd14d35f6763155d0effa9c79f419ff6fd26f853158b290d2264edd148633570f5ecd50d4e1f6c7e9d136c7d2880517bdefc
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fsp_harvester (0.1.4)
|
4
|
+
fsp_harvester (0.1.7)
|
5
5
|
json (~> 2.0)
|
6
6
|
linkeddata (~> 3.2)
|
7
|
-
linkheaders-processor (~> 0.1.
|
7
|
+
linkheaders-processor (~> 0.1.13)
|
8
8
|
metainspector (~> 5.11.2)
|
9
9
|
parseconfig (~> 1.1)
|
10
10
|
rake (~> 13.0)
|
@@ -126,7 +126,7 @@ GEM
|
|
126
126
|
shex (~> 0.7)
|
127
127
|
sparql (~> 3.2)
|
128
128
|
sparql-client (~> 3.2)
|
129
|
-
linkheaders-processor (0.1.
|
129
|
+
linkheaders-processor (0.1.13)
|
130
130
|
json (~> 2.0)
|
131
131
|
json-ld (~> 3.2)
|
132
132
|
json-ld-preloaded (~> 3.2)
|
data/Rakefile
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rspec/core/rake_task'
|
5
5
|
|
6
6
|
RSpec::Core::RakeTask.new(:spec)
|
7
7
|
|
8
|
-
require
|
8
|
+
require 'rubocop/rake_task'
|
9
9
|
|
10
10
|
RuboCop::RakeTask.new
|
11
11
|
|
data/lib/fsp_harvester.rb
CHANGED
@@ -1,24 +1,25 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
-
require
|
10
|
-
require
|
11
|
-
require
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
15
|
-
require
|
16
|
-
require
|
17
|
-
require
|
18
|
-
require
|
19
|
-
require_relative
|
20
|
-
require_relative
|
21
|
-
require_relative
|
3
|
+
require_relative 'fsp_harvester/version'
|
4
|
+
require 'json/ld'
|
5
|
+
require 'json/ld/preloaded'
|
6
|
+
require 'json'
|
7
|
+
require 'linkheaders/processor'
|
8
|
+
require 'addressable'
|
9
|
+
require 'tempfile'
|
10
|
+
require 'xmlsimple'
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'parseconfig'
|
13
|
+
require 'rest-client'
|
14
|
+
require 'cgi'
|
15
|
+
require 'digest'
|
16
|
+
require 'open3'
|
17
|
+
require 'metainspector'
|
18
|
+
require 'rdf/xsd'
|
19
|
+
require_relative './metadata_object'
|
20
|
+
require_relative './constants'
|
21
|
+
require_relative './web_utils'
|
22
|
+
require_relative './signposting_tests'
|
22
23
|
|
23
24
|
module FspHarvester
|
24
25
|
class Error < StandardError
|
@@ -27,33 +28,34 @@ module FspHarvester
|
|
27
28
|
class Utils
|
28
29
|
# @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
|
29
30
|
# @warnings = JSON.parse(File.read("warnings.json"))
|
30
|
-
|
31
|
+
|
31
32
|
|
32
33
|
def self.resolve_guid(guid:)
|
34
|
+
@meta = FspHarvester::MetadataObject.new
|
33
35
|
@meta.finalURI = [guid]
|
34
36
|
type, url = convertToURL(guid: guid)
|
35
37
|
links = Array.new
|
36
|
-
|
37
|
-
|
38
|
-
@meta.comments << "FATAL: GUID type not recognized.\n"
|
38
|
+
if type
|
39
|
+
links = resolve_url(url: url)
|
39
40
|
else
|
40
|
-
|
41
|
+
@meta.warnings << ['006', guid, '']
|
42
|
+
@meta.comments << "FATAL: GUID type not recognized.\n"
|
41
43
|
end
|
42
44
|
[links, @meta]
|
43
45
|
end
|
44
46
|
|
45
47
|
def self.convertToURL(guid:)
|
46
48
|
GUID_TYPES.each do |k, regex|
|
47
|
-
if k ==
|
48
|
-
return
|
49
|
-
elsif k ==
|
50
|
-
return
|
51
|
-
elsif k ==
|
52
|
-
return
|
53
|
-
elsif k ==
|
54
|
-
return
|
55
|
-
elsif k ==
|
56
|
-
return
|
49
|
+
if k == 'inchi' and regex.match(guid)
|
50
|
+
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
51
|
+
elsif k == 'handle1' and regex.match(guid)
|
52
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
53
|
+
elsif k == 'handle2' and regex.match(guid)
|
54
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
55
|
+
elsif k == 'uri' and regex.match(guid)
|
56
|
+
return 'uri', guid
|
57
|
+
elsif k == 'doi' and regex.match(guid)
|
58
|
+
return 'doi', "https://doi.org/#{guid}"
|
57
59
|
end
|
58
60
|
end
|
59
61
|
[nil, nil]
|
@@ -66,23 +68,23 @@ module FspHarvester
|
|
66
68
|
false
|
67
69
|
end
|
68
70
|
|
69
|
-
def self.resolve_url(url:, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
|
70
|
-
@meta.guidtype =
|
71
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
|
72
|
+
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
71
73
|
warn "\n\n FETCHING #{url} #{header}\n\n"
|
72
|
-
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header)
|
73
|
-
warn "\n\n head #{response.headers.inspect}\n\n"
|
74
|
+
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
|
75
|
+
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
74
76
|
|
75
77
|
unless response
|
76
|
-
@meta.warnings << [
|
78
|
+
@meta.warnings << ['001', url, header]
|
77
79
|
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
78
|
-
return [
|
80
|
+
return []
|
79
81
|
end
|
80
82
|
|
81
83
|
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.finalURI.last}. Using the output from this URL for the next few tests..."
|
82
84
|
@meta.full_response << response.body
|
83
85
|
|
84
86
|
links = process_link_headers(response: response) unless nolinkheaders
|
85
|
-
|
87
|
+
links
|
86
88
|
end
|
87
89
|
|
88
90
|
def self.process_link_headers(response:)
|
@@ -90,47 +92,43 @@ module FspHarvester
|
|
90
92
|
|
91
93
|
parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
|
92
94
|
parser.extract_and_parse(response: response)
|
93
|
-
factory = parser.factory
|
95
|
+
factory = parser.factory # LinkHeaders::LinkFactory
|
94
96
|
|
95
|
-
|
96
|
-
|
97
|
-
warn "\n\n length #{factory.all_links.length}\n\n"
|
97
|
+
warn "\n\n length bfore #{factory.all_links.length}\n\n"
|
98
|
+
signpostingcheck(factory: factory)
|
99
|
+
warn "\n\n length aftr #{factory.all_links.length}\n\n"
|
100
|
+
warn "\n\n links #{factory.all_links}\n\n"
|
101
|
+
factory.all_links
|
102
|
+
end
|
98
103
|
|
104
|
+
def self.signpostingcheck(factory:)
|
105
|
+
citeas = Array.new
|
106
|
+
describedby = Array.new
|
107
|
+
item = Array.new
|
99
108
|
factory.all_links.each do |l|
|
100
109
|
case l.relation
|
101
|
-
when
|
102
|
-
citeas
|
103
|
-
when
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute\n"
|
108
|
-
end
|
110
|
+
when 'cite-as'
|
111
|
+
citeas << l
|
112
|
+
when 'item'
|
113
|
+
item << l
|
114
|
+
when 'describedby'
|
115
|
+
describedby << l
|
109
116
|
end
|
110
117
|
end
|
111
|
-
if citeas > 1
|
112
|
-
self.check_for_conflicts(factory: factory) # this merelty adsds to the metadata objects if there are conflicts
|
113
|
-
end
|
114
118
|
|
115
|
-
|
116
|
-
|
117
|
-
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
|
118
|
-
end
|
119
|
-
factory.all_links
|
120
|
-
end
|
119
|
+
check_describedby_rules(describedby: describedby)
|
120
|
+
check_item_rules(item: item)
|
121
121
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
citeas << link.href
|
122
|
+
uniqueciteas = Array.new
|
123
|
+
if citeas.length > 1
|
124
|
+
warn "INFO: multiple cite-as links found. Checking for conflicts\n"
|
125
|
+
@meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
|
126
|
+
uniqueciteas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
128
127
|
end
|
129
|
-
|
130
|
-
|
131
|
-
@meta.
|
132
|
-
|
133
|
-
@meta.comments << "INFO: No conflicting cite-as links found."
|
128
|
+
|
129
|
+
unless uniqueciteas == 1 && describedby.length > 0
|
130
|
+
@meta.warnings << ['004', '', '']
|
131
|
+
@meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
|
134
132
|
end
|
135
133
|
end
|
136
134
|
end
|
data/lib/warnings.json
CHANGED
@@ -28,6 +28,47 @@
|
|
28
28
|
"message": "GUID type not recognized",
|
29
29
|
"linkout": "",
|
30
30
|
"severity": "WARN"
|
31
|
+
},
|
32
|
+
"007": {
|
33
|
+
"message": "Conflicting cite-as links",
|
34
|
+
"linkout": "",
|
35
|
+
"severity": "WARN"
|
36
|
+
},
|
37
|
+
"008": {
|
38
|
+
"message": "describedby link does not resolve",
|
39
|
+
"linkout": "",
|
40
|
+
"severity": "WARN"
|
41
|
+
},
|
42
|
+
"009": {
|
43
|
+
"message": "Content-type of described-by link does not match the type attribute in the link header itself",
|
44
|
+
"linkout": "",
|
45
|
+
"severity": "WARN"
|
46
|
+
},
|
47
|
+
"010": {
|
48
|
+
"message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
|
49
|
+
"linkout": "",
|
50
|
+
"severity": "WARN"
|
51
|
+
},
|
52
|
+
"011": {
|
53
|
+
"message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
|
54
|
+
"linkout": "",
|
55
|
+
"severity": "WARN"
|
56
|
+
},
|
57
|
+
"012": {
|
58
|
+
"message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
|
59
|
+
"linkout": "",
|
60
|
+
"severity": "WARN"
|
61
|
+
},
|
62
|
+
"013": {
|
63
|
+
"message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
|
64
|
+
"linkout": "",
|
65
|
+
"severity": "WARN"
|
66
|
+
},
|
67
|
+
"014": {
|
68
|
+
"message": "Item link does not resolve",
|
69
|
+
"linkout": "",
|
70
|
+
"severity": "WARN"
|
31
71
|
}
|
72
|
+
|
32
73
|
|
33
74
|
}
|
data/lib/web_utils.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
module FspHarvester
|
2
2
|
|
3
3
|
class WebUtils
|
4
|
-
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER
|
4
|
+
def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
|
5
5
|
warn 'In fetch routine now. '
|
6
6
|
|
7
7
|
begin
|
8
8
|
warn "executing call over the Web to #{url}"
|
9
9
|
response = RestClient::Request.execute({
|
10
|
-
method:
|
10
|
+
method: method,
|
11
11
|
url: url.to_s,
|
12
12
|
# user: user,
|
13
13
|
# password: pass,
|
@@ -26,7 +26,7 @@ module FspHarvester
|
|
26
26
|
warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
|
27
27
|
@meta.warnings << ["003", url, headers] if @meta
|
28
28
|
@meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
|
29
|
-
if e.response.code == 500
|
29
|
+
if (e.response.code == 500 or e.response.code == 404)
|
30
30
|
return false
|
31
31
|
else
|
32
32
|
e.response
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fsp_harvester
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 0.1.
|
47
|
+
version: 0.1.13
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 0.1.
|
54
|
+
version: 0.1.13
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: metainspector
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|