fsp_harvester 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 79507c31b14bab423d95a72fe441756551fa445caccea733ee75993fd7e0222c
4
- data.tar.gz: a18796aaff5e57940306fecd1d82df5c18d579ffe0d5fb1cd1948a9a29d1bb3b
3
+ metadata.gz: f24e34b5239426a8555e5d893cb3692a1f03442f6b4af3c03c5751c975a7871b
4
+ data.tar.gz: d372b73eb7693e5a4c9a2f78e20d02b62f3c195ed4185db7532018a45a694570
5
5
  SHA512:
6
- metadata.gz: ed211e876c70b7c6bd3dad6dc9a7dada1e4e6d54f5c9a92286b24b9912b06b26f6b3c2fd3b22c8ac225ddb8ceaa3eb2b98d35a983f6be3fe78f4575450f8d857
7
- data.tar.gz: af6d5af7520061d418680b5b9a5f90e066b55be26322b7a4d9275bf74546eb56e80e538cd78443d45b28241246eb9329c84c08c53faee9db67e8b1c893507a54
6
+ metadata.gz: e85c8ba90bee37156e8a4d8e98ec0d8c2148ffc86e24ac3faf0adde1146efaafa2333e7e25acf2b5b4d05aa1f2a9a411deb18890175e93bd1b8d0980773e42c2
7
+ data.tar.gz: f20559315f1b9aff81978600f50743fdee2d3b200eae0dc375ccf267fda90909cb16a6e43686a5efea0f6c583c90c4511c7390702ad7f9e9704f80f829da128b
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.5)
4
+ fsp_harvester (0.1.6)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.11)
7
+ linkheaders-processor (~> 0.1.12)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -126,7 +126,7 @@ GEM
126
126
  shex (~> 0.7)
127
127
  sparql (~> 3.2)
128
128
  sparql-client (~> 3.2)
129
- linkheaders-processor (0.1.11)
129
+ linkheaders-processor (0.1.12)
130
130
  json (~> 2.0)
131
131
  json-ld (~> 3.2)
132
132
  json-ld-preloaded (~> 3.2)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.5"
4
+ VERSION = "0.1.6"
5
5
  end
data/lib/fsp_harvester.rb CHANGED
@@ -1,24 +1,24 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "fsp_harvester/version"
4
- require "json/ld"
5
- require "json/ld/preloaded"
6
- require "json"
7
- require "linkheaders/processor"
8
- require "addressable"
9
- require "tempfile"
10
- require "xmlsimple"
11
- require "nokogiri"
12
- require "parseconfig"
13
- require "rest-client"
14
- require "cgi"
15
- require "digest"
16
- require "open3"
17
- require "metainspector"
18
- require "rdf/xsd"
19
- require_relative "./metadata_object"
20
- require_relative "./constants"
21
- require_relative "./web_utils"
3
+ require_relative 'fsp_harvester/version'
4
+ require 'json/ld'
5
+ require 'json/ld/preloaded'
6
+ require 'json'
7
+ require 'linkheaders/processor'
8
+ require 'addressable'
9
+ require 'tempfile'
10
+ require 'xmlsimple'
11
+ require 'nokogiri'
12
+ require 'parseconfig'
13
+ require 'rest-client'
14
+ require 'cgi'
15
+ require 'digest'
16
+ require 'open3'
17
+ require 'metainspector'
18
+ require 'rdf/xsd'
19
+ require_relative './metadata_object'
20
+ require_relative './constants'
21
+ require_relative './web_utils'
22
22
 
23
23
  module FspHarvester
24
24
  class Error < StandardError
@@ -32,28 +32,28 @@ module FspHarvester
32
32
  def self.resolve_guid(guid:)
33
33
  @meta.finalURI = [guid]
34
34
  type, url = convertToURL(guid: guid)
35
- links = Array.new
36
- unless type
37
- @meta.warnings << ["006", guid, ""]
38
- @meta.comments << "FATAL: GUID type not recognized.\n"
39
- else
35
+ links = []
36
+ if type
40
37
  links, @meta = resolve_url(url: url)
38
+ else
39
+ @meta.warnings << ['006', guid, '']
40
+ @meta.comments << "FATAL: GUID type not recognized.\n"
41
41
  end
42
42
  [links, @meta]
43
43
  end
44
44
 
45
45
  def self.convertToURL(guid:)
46
46
  GUID_TYPES.each do |k, regex|
47
- if k == "inchi" and regex.match(guid)
48
- return "inchi", "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
49
- elsif k == "handle1" and regex.match(guid)
50
- return "handle", "http://hdl.handle.net/#{guid}"
51
- elsif k == "handle2" and regex.match(guid)
52
- return "handle", "http://hdl.handle.net/#{guid}"
53
- elsif k == "uri" and regex.match(guid)
54
- return "uri", guid
55
- elsif k == "doi" and regex.match(guid)
56
- return "doi", "https://doi.org/#{guid}"
47
+ if k == 'inchi' and regex.match(guid)
48
+ return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
49
+ elsif k == 'handle1' and regex.match(guid)
50
+ return 'handle', "http://hdl.handle.net/#{guid}"
51
+ elsif k == 'handle2' and regex.match(guid)
52
+ return 'handle', "http://hdl.handle.net/#{guid}"
53
+ elsif k == 'uri' and regex.match(guid)
54
+ return 'uri', guid
55
+ elsif k == 'doi' and regex.match(guid)
56
+ return 'doi', "https://doi.org/#{guid}"
57
57
  end
58
58
  end
59
59
  [nil, nil]
@@ -66,14 +66,14 @@ module FspHarvester
66
66
  false
67
67
  end
68
68
 
69
- def self.resolve_url(url:, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
70
- @meta.guidtype = "uri" if @meta.guidtype.nil?
69
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
70
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
71
71
  warn "\n\n FETCHING #{url} #{header}\n\n"
72
- response = FspHarvester::WebUtils.fspfetch(url: url, headers: header)
73
- warn "\n\n head #{response.headers.inspect}\n\n"
72
+ response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
73
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
74
74
 
75
75
  unless response
76
- @meta.warnings << ["001", url, header]
76
+ @meta.warnings << ['001', url, header]
77
77
  @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
78
78
  return [[], @meta]
79
79
  end
@@ -90,48 +90,113 @@ module FspHarvester
90
90
 
91
91
  parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
92
92
  parser.extract_and_parse(response: response)
93
- factory = parser.factory # LinkHeaders::LinkFactory
93
+ factory = parser.factory # LinkHeaders::LinkFactory
94
94
 
95
- citeas = 0
96
- describedby = 0
97
95
  warn "\n\n length #{factory.all_links.length}\n\n"
96
+ signpostingcheck(factory: factory)
97
+ end
98
98
 
99
+ def self.signpostingcheck(factory:)
100
+ citeas = 0
101
+ describedby = 0
99
102
  factory.all_links.each do |l|
100
103
  case l.relation
101
- when "cite-as"
104
+ when 'cite-as'
102
105
  citeas += 1
103
- when "describedby"
106
+ when 'item'
107
+ if !(l.respond_to? 'type')
108
+ @meta.warnings << ['011', l.href, '']
109
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
110
+ end
111
+ type = l.type if l.respond_to? 'type'
112
+ type = '*/*' unless type # this becomes a frozen string
113
+ header = { accept: type }
114
+ response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
115
+
116
+ if response
117
+ if response.headers[:content_type] and !(type == '*/*')
118
+ rtype = type.gsub(%r{/}, "\/") # because type is a frozen string
119
+ rtype = rtype.gsub(/\+/, '.')
120
+ typeregex = Regexp.new(type)
121
+ if response.headers[:content_type].match(typeregex)
122
+ warn response.headers[:content_type]
123
+ warn typeregex.inspect
124
+ @meta.comments << "INFO: item link responds according to Signposting specifications\n"
125
+ else
126
+ @meta.warnings << ['012', l.href, header]
127
+ @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
128
+ end
129
+ else
130
+ @meta.warnings << ['013', l.href, header]
131
+ @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
132
+ end
133
+ else
134
+ @meta.warnings << ['014', l.href, header]
135
+ @meta.comments << "WARN: item link doesn't resolve\n"
136
+ end
137
+
138
+ when 'describedby'
104
139
  describedby += 1
105
- unless l.respond_to? "type"
106
- @meta.warnings << ["005", l.href, ""]
107
- @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute\n"
140
+ if !(l.respond_to? 'type')
141
+ @meta.warnings << ['005', l.href, '']
142
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
143
+ end
144
+ type = l.type if l.respond_to? 'type'
145
+ type = '*/*' unless type
146
+ header = { accept: type }
147
+ response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
148
+ if response
149
+ if response.headers[:content_type] and !(type == '*/*')
150
+ rtype = type.gsub(%r{/}, "\/")
151
+ rtype = rtype.gsub(/\+/, '.')
152
+ typeregex = Regexp.new(rtype)
153
+ if response.headers[:content_type].match(typeregex)
154
+ warn response.headers[:content_type]
155
+ warn typeregex.inspect
156
+ @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
157
+ else
158
+ @meta.warnings << ['009', l.href, header]
159
+ @meta.comments << "WARN: Content type of returned describedby link does not match the 'type' attribute\n"
160
+ end
161
+ else
162
+ @meta.warnings << ['010', l.href, header]
163
+ @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
164
+ end
165
+ else
166
+ @meta.warnings << ['008', l.href, header]
167
+ @meta.comments << "WARN: describedby link doesn't resolve\n"
108
168
  end
109
169
  end
110
170
  end
111
171
  if citeas > 1
112
- self.check_for_conflicts(factory: factory) # this merelty adsds to the metadata objects if there are conflicts
172
+ @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
173
+ citeas = check_for_citeas_conflicts(factory: factory) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
113
174
  end
114
175
 
115
176
  unless citeas == 1 && describedby > 0
116
- @meta.warnings << ["004", "", ""]
177
+ @meta.warnings << ['004', '', '']
117
178
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
118
179
  end
119
180
  factory.all_links
120
181
  end
121
182
 
122
- def self.check_for_conflicts(factory:) # incoming: {"link1" => {"sectiontype1" => value, "sectiontype2" => value2}}
123
- @meta.comments << "INFO: checking for conflicting cite-as links"
124
- citeas = Array.new
183
+ def self.check_for_citeas_conflicts(factory:)
184
+ @meta.comments << 'INFO: checking for conflicting cite-as links'
185
+ citeas = []
125
186
  factory.all_links.each do |link|
126
187
  next unless link.relation == 'cite-as'
188
+
189
+ @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
127
190
  citeas << link.href
128
191
  end
129
- unless citeas == citeas.uniq
130
- @meta.warnings << ["007", "", ""]
192
+
193
+ if citeas.uniq.length == 1
194
+ @meta.comments << 'INFO: No conflicting cite-as links found.'
195
+ else # only one allowed!
196
+ @meta.warnings << ['007', '', '']
131
197
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers\n"
132
- else
133
- @meta.comments << "INFO: No conflicting cite-as links found."
134
198
  end
199
+ citeas.uniq
135
200
  end
136
201
  end
137
202
  end
data/lib/warnings.json CHANGED
@@ -28,6 +28,47 @@
28
28
  "message": "GUID type not recognized",
29
29
  "linkout": "",
30
30
  "severity": "WARN"
31
+ },
32
+ "007": {
33
+ "message": "Conflicting cite-as links",
34
+ "linkout": "",
35
+ "severity": "WARN"
36
+ },
37
+ "008": {
38
+ "message": "describedby link does not resolve",
39
+ "linkout": "",
40
+ "severity": "WARN"
41
+ },
42
+ "009": {
43
+ "message": "Content-type of described-by link does not match the type attribute in the link header itself",
44
+ "linkout": "",
45
+ "severity": "WARN"
46
+ },
47
+ "010": {
48
+ "message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
49
+ "linkout": "",
50
+ "severity": "WARN"
51
+ },
52
+ "011": {
53
+ "message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
54
+ "linkout": "",
55
+ "severity": "WARN"
56
+ },
57
+ "012": {
58
+ "message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
59
+ "linkout": "",
60
+ "severity": "WARN"
61
+ },
62
+ "013": {
63
+ "message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
64
+ "linkout": "",
65
+ "severity": "WARN"
66
+ },
67
+ "014": {
68
+ "message": "Item link does not resolve",
69
+ "linkout": "",
70
+ "severity": "WARN"
31
71
  }
72
+
32
73
 
33
74
  }
data/lib/web_utils.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  module FspHarvester
2
2
 
3
3
  class WebUtils
4
- def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER) # we will try to retrieve turtle whenever possible
4
+ def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
5
5
  warn 'In fetch routine now. '
6
6
 
7
7
  begin
8
8
  warn "executing call over the Web to #{url}"
9
9
  response = RestClient::Request.execute({
10
- method: :get,
10
+ method: method,
11
11
  url: url.to_s,
12
12
  # user: user,
13
13
  # password: pass,
@@ -26,7 +26,7 @@ module FspHarvester
26
26
  warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
27
27
  @meta.warnings << ["003", url, headers] if @meta
28
28
  @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
29
- if e.response.code == 500
29
+ if (e.response.code == 500 or e.response.code == 404)
30
30
  return false
31
31
  else
32
32
  e.response
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-07-27 00:00:00.000000000 Z
11
+ date: 2022-07-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.1.11
47
+ version: 0.1.12
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.1.11
54
+ version: 0.1.12
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: metainspector
57
57
  requirement: !ruby/object:Gem::Requirement