fsp_harvester 0.1.5 → 0.1.6

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 79507c31b14bab423d95a72fe441756551fa445caccea733ee75993fd7e0222c
4
- data.tar.gz: a18796aaff5e57940306fecd1d82df5c18d579ffe0d5fb1cd1948a9a29d1bb3b
3
+ metadata.gz: f24e34b5239426a8555e5d893cb3692a1f03442f6b4af3c03c5751c975a7871b
4
+ data.tar.gz: d372b73eb7693e5a4c9a2f78e20d02b62f3c195ed4185db7532018a45a694570
5
5
  SHA512:
6
- metadata.gz: ed211e876c70b7c6bd3dad6dc9a7dada1e4e6d54f5c9a92286b24b9912b06b26f6b3c2fd3b22c8ac225ddb8ceaa3eb2b98d35a983f6be3fe78f4575450f8d857
7
- data.tar.gz: af6d5af7520061d418680b5b9a5f90e066b55be26322b7a4d9275bf74546eb56e80e538cd78443d45b28241246eb9329c84c08c53faee9db67e8b1c893507a54
6
+ metadata.gz: e85c8ba90bee37156e8a4d8e98ec0d8c2148ffc86e24ac3faf0adde1146efaafa2333e7e25acf2b5b4d05aa1f2a9a411deb18890175e93bd1b8d0980773e42c2
7
+ data.tar.gz: f20559315f1b9aff81978600f50743fdee2d3b200eae0dc375ccf267fda90909cb16a6e43686a5efea0f6c583c90c4511c7390702ad7f9e9704f80f829da128b
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.5)
4
+ fsp_harvester (0.1.6)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.11)
7
+ linkheaders-processor (~> 0.1.12)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -126,7 +126,7 @@ GEM
126
126
  shex (~> 0.7)
127
127
  sparql (~> 3.2)
128
128
  sparql-client (~> 3.2)
129
- linkheaders-processor (0.1.11)
129
+ linkheaders-processor (0.1.12)
130
130
  json (~> 2.0)
131
131
  json-ld (~> 3.2)
132
132
  json-ld-preloaded (~> 3.2)
data/lib/fsp_harvester/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.5"
4
+ VERSION = "0.1.6"
5
5
  end
data/lib/fsp_harvester.rb CHANGED
@@ -1,24 +1,24 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "fsp_harvester/version"
4
- require "json/ld"
5
- require "json/ld/preloaded"
6
- require "json"
7
- require "linkheaders/processor"
8
- require "addressable"
9
- require "tempfile"
10
- require "xmlsimple"
11
- require "nokogiri"
12
- require "parseconfig"
13
- require "rest-client"
14
- require "cgi"
15
- require "digest"
16
- require "open3"
17
- require "metainspector"
18
- require "rdf/xsd"
19
- require_relative "./metadata_object"
20
- require_relative "./constants"
21
- require_relative "./web_utils"
3
+ require_relative 'fsp_harvester/version'
4
+ require 'json/ld'
5
+ require 'json/ld/preloaded'
6
+ require 'json'
7
+ require 'linkheaders/processor'
8
+ require 'addressable'
9
+ require 'tempfile'
10
+ require 'xmlsimple'
11
+ require 'nokogiri'
12
+ require 'parseconfig'
13
+ require 'rest-client'
14
+ require 'cgi'
15
+ require 'digest'
16
+ require 'open3'
17
+ require 'metainspector'
18
+ require 'rdf/xsd'
19
+ require_relative './metadata_object'
20
+ require_relative './constants'
21
+ require_relative './web_utils'
22
22
 
23
23
  module FspHarvester
24
24
  class Error < StandardError
@@ -32,28 +32,28 @@ module FspHarvester
32
32
  def self.resolve_guid(guid:)
33
33
  @meta.finalURI = [guid]
34
34
  type, url = convertToURL(guid: guid)
35
- links = Array.new
36
- unless type
37
- @meta.warnings << ["006", guid, ""]
38
- @meta.comments << "FATAL: GUID type not recognized.\n"
39
- else
35
+ links = []
36
+ if type
40
37
  links, @meta = resolve_url(url: url)
38
+ else
39
+ @meta.warnings << ['006', guid, '']
40
+ @meta.comments << "FATAL: GUID type not recognized.\n"
41
41
  end
42
42
  [links, @meta]
43
43
  end
44
44
 
45
45
  def self.convertToURL(guid:)
46
46
  GUID_TYPES.each do |k, regex|
47
- if k == "inchi" and regex.match(guid)
48
- return "inchi", "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
49
- elsif k == "handle1" and regex.match(guid)
50
- return "handle", "http://hdl.handle.net/#{guid}"
51
- elsif k == "handle2" and regex.match(guid)
52
- return "handle", "http://hdl.handle.net/#{guid}"
53
- elsif k == "uri" and regex.match(guid)
54
- return "uri", guid
55
- elsif k == "doi" and regex.match(guid)
56
- return "doi", "https://doi.org/#{guid}"
47
+ if k == 'inchi' and regex.match(guid)
48
+ return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
49
+ elsif k == 'handle1' and regex.match(guid)
50
+ return 'handle', "http://hdl.handle.net/#{guid}"
51
+ elsif k == 'handle2' and regex.match(guid)
52
+ return 'handle', "http://hdl.handle.net/#{guid}"
53
+ elsif k == 'uri' and regex.match(guid)
54
+ return 'uri', guid
55
+ elsif k == 'doi' and regex.match(guid)
56
+ return 'doi', "https://doi.org/#{guid}"
57
57
  end
58
58
  end
59
59
  [nil, nil]
@@ -66,14 +66,14 @@ module FspHarvester
66
66
  false
67
67
  end
68
68
 
69
- def self.resolve_url(url:, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
70
- @meta.guidtype = "uri" if @meta.guidtype.nil?
69
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
70
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
71
71
  warn "\n\n FETCHING #{url} #{header}\n\n"
72
- response = FspHarvester::WebUtils.fspfetch(url: url, headers: header)
73
- warn "\n\n head #{response.headers.inspect}\n\n"
72
+ response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
73
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
74
74
 
75
75
  unless response
76
- @meta.warnings << ["001", url, header]
76
+ @meta.warnings << ['001', url, header]
77
77
  @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
78
78
  return [[], @meta]
79
79
  end
@@ -90,48 +90,113 @@ module FspHarvester
90
90
 
91
91
  parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
92
92
  parser.extract_and_parse(response: response)
93
- factory = parser.factory # LinkHeaders::LinkFactory
93
+ factory = parser.factory # LinkHeaders::LinkFactory
94
94
 
95
- citeas = 0
96
- describedby = 0
97
95
  warn "\n\n length #{factory.all_links.length}\n\n"
96
+ signpostingcheck(factory: factory)
97
+ end
98
98
 
99
+ def self.signpostingcheck(factory:)
100
+ citeas = 0
101
+ describedby = 0
99
102
  factory.all_links.each do |l|
100
103
  case l.relation
101
- when "cite-as"
104
+ when 'cite-as'
102
105
  citeas += 1
103
- when "describedby"
106
+ when 'item'
107
+ if !(l.respond_to? 'type')
108
+ @meta.warnings << ['011', l.href, '']
109
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
110
+ end
111
+ type = l.type if l.respond_to? 'type'
112
+ type = '*/*' unless type # this becomes a frozen string
113
+ header = { accept: type }
114
+ response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
115
+
116
+ if response
117
+ if response.headers[:content_type] and !(type == '*/*')
118
+ rtype = type.gsub(%r{/}, "\/") # because type is a frozen string
119
+ rtype = rtype.gsub(/\+/, '.')
120
+ typeregex = Regexp.new(type)
121
+ if response.headers[:content_type].match(typeregex)
122
+ warn response.headers[:content_type]
123
+ warn typeregex.inspect
124
+ @meta.comments << "INFO: item link responds according to Signposting specifications\n"
125
+ else
126
+ @meta.warnings << ['012', l.href, header]
127
+ @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
128
+ end
129
+ else
130
+ @meta.warnings << ['013', l.href, header]
131
+ @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
132
+ end
133
+ else
134
+ @meta.warnings << ['014', l.href, header]
135
+ @meta.comments << "WARN: item link doesn't resolve\n"
136
+ end
137
+
138
+ when 'describedby'
104
139
  describedby += 1
105
- unless l.respond_to? "type"
106
- @meta.warnings << ["005", l.href, ""]
107
- @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute\n"
140
+ if !(l.respond_to? 'type')
141
+ @meta.warnings << ['005', l.href, '']
142
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
143
+ end
144
+ type = l.type if l.respond_to? 'type'
145
+ type = '*/*' unless type
146
+ header = { accept: type }
147
+ response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
148
+ if response
149
+ if response.headers[:content_type] and !(type == '*/*')
150
+ rtype = type.gsub(%r{/}, "\/")
151
+ rtype = rtype.gsub(/\+/, '.')
152
+ typeregex = Regexp.new(rtype)
153
+ if response.headers[:content_type].match(typeregex)
154
+ warn response.headers[:content_type]
155
+ warn typeregex.inspect
156
+ @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
157
+ else
158
+ @meta.warnings << ['009', l.href, header]
159
+ @meta.comments << "WARN: Content type of returned describedby link does not match the 'type' attribute\n"
160
+ end
161
+ else
162
+ @meta.warnings << ['010', l.href, header]
163
+ @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
164
+ end
165
+ else
166
+ @meta.warnings << ['008', l.href, header]
167
+ @meta.comments << "WARN: describedby link doesn't resolve\n"
108
168
  end
109
169
  end
110
170
  end
111
171
  if citeas > 1
112
- self.check_for_conflicts(factory: factory) # this merelty adsds to the metadata objects if there are conflicts
172
+ @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
173
+ citeas = check_for_citeas_conflicts(factory: factory) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
113
174
  end
114
175
 
115
176
  unless citeas == 1 && describedby > 0
116
- @meta.warnings << ["004", "", ""]
177
+ @meta.warnings << ['004', '', '']
117
178
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
118
179
  end
119
180
  factory.all_links
120
181
  end
121
182
 
122
- def self.check_for_conflicts(factory:) # incoming: {"link1" => {"sectiontype1" => value, "sectiontype2" => value2}}
123
- @meta.comments << "INFO: checking for conflicting cite-as links"
124
- citeas = Array.new
183
+ def self.check_for_citeas_conflicts(factory:)
184
+ @meta.comments << 'INFO: checking for conflicting cite-as links'
185
+ citeas = []
125
186
  factory.all_links.each do |link|
126
187
  next unless link.relation == 'cite-as'
188
+
189
+ @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
127
190
  citeas << link.href
128
191
  end
129
- unless citeas == citeas.uniq
130
- @meta.warnings << ["007", "", ""]
192
+
193
+ if citeas.uniq.length == 1
194
+ @meta.comments << 'INFO: No conflicting cite-as links found.'
195
+ else # only one allowed!
196
+ @meta.warnings << ['007', '', '']
131
197
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers\n"
132
- else
133
- @meta.comments << "INFO: No conflicting cite-as links found."
134
198
  end
199
+ citeas.uniq
135
200
  end
136
201
  end
137
202
  end
data/lib/warnings.json CHANGED
@@ -28,6 +28,47 @@
28
28
  "message": "GUID type not recognized",
29
29
  "linkout": "",
30
30
  "severity": "WARN"
31
+ },
32
+ "007": {
33
+ "message": "Conflicting cite-as links",
34
+ "linkout": "",
35
+ "severity": "WARN"
36
+ },
37
+ "008": {
38
+ "message": "describedby link does not resolve",
39
+ "linkout": "",
40
+ "severity": "WARN"
41
+ },
42
+ "009": {
43
+ "message": "Content-type of described-by link does not match the type attribute in the link header itself",
44
+ "linkout": "",
45
+ "severity": "WARN"
46
+ },
47
+ "010": {
48
+ "message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
49
+ "linkout": "",
50
+ "severity": "WARN"
51
+ },
52
+ "011": {
53
+ "message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
54
+ "linkout": "",
55
+ "severity": "WARN"
56
+ },
57
+ "012": {
58
+ "message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
59
+ "linkout": "",
60
+ "severity": "WARN"
61
+ },
62
+ "013": {
63
+ "message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
64
+ "linkout": "",
65
+ "severity": "WARN"
66
+ },
67
+ "014": {
68
+ "message": "Item link does not resolve",
69
+ "linkout": "",
70
+ "severity": "WARN"
31
71
  }
72
+
32
73
 
33
74
  }
data/lib/web_utils.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  module FspHarvester
2
2
 
3
3
  class WebUtils
4
- def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER) # we will try to retrieve turtle whenever possible
4
+ def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
5
5
  warn 'In fetch routine now. '
6
6
 
7
7
  begin
8
8
  warn "executing call over the Web to #{url}"
9
9
  response = RestClient::Request.execute({
10
- method: :get,
10
+ method: method,
11
11
  url: url.to_s,
12
12
  # user: user,
13
13
  # password: pass,
@@ -26,7 +26,7 @@ module FspHarvester
26
26
  warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
27
27
  @meta.warnings << ["003", url, headers] if @meta
28
28
  @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
29
- if e.response.code == 500
29
+ if (e.response.code == 500 or e.response.code == 404)
30
30
  return false
31
31
  else
32
32
  e.response
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-07-27 00:00:00.000000000 Z
11
+ date: 2022-07-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.1.11
47
+ version: 0.1.12
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.1.11
54
+ version: 0.1.12
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: metainspector
57
57
  requirement: !ruby/object:Gem::Requirement