fsp_harvester 0.1.4 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ceb83f45b1094afb1bab24e948cf863c764cb9f045f42706017dd36c7507aa1f
4
- data.tar.gz: 41fdf2bfb9166cfaff70058523f442630967721541cb9edc1fc147932f0229b3
3
+ metadata.gz: 7da32f3321193e93f64154c35db0c840f5e7f086451660f99b62d3f4c834e295
4
+ data.tar.gz: 5a0a1ff4ef6b2100accd8bab4f20d6842d25ebf88d5e600e646804da9ed24bd9
5
5
  SHA512:
6
- metadata.gz: 7db52cf3b6fcfc8b4ab9a9f5b470ab97d24e192130eb54c3850a5b0c46a4dd4595748266f50322775d3078f7e372c0ba00c4a414342b1b7b82268565b230a173
7
- data.tar.gz: c2f2bc421c0c0dca304d4ed7e9393b023b9d1af4fc6e0457b334296010d0e08132f44fed64202afb52df913ba6a8b69e1c91b0309b6020f8d79a3f7cb7de2a03
6
+ metadata.gz: dad90f81b73489a151220c132d74508832573a352b209a537bbbaf6543a90b9e132cca80ecca33d15d02eca91841642d73e795113076f4564730194b5bf1fa53
7
+ data.tar.gz: 5d11c2f002f4e73a4971aec0d047cd14d35f6763155d0effa9c79f419ff6fd26f853158b290d2264edd148633570f5ecd50d4e1f6c7e9d136c7d2880517bdefc
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.4)
4
+ fsp_harvester (0.1.7)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.11)
7
+ linkheaders-processor (~> 0.1.13)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -126,7 +126,7 @@ GEM
126
126
  shex (~> 0.7)
127
127
  sparql (~> 3.2)
128
128
  sparql-client (~> 3.2)
129
- linkheaders-processor (0.1.11)
129
+ linkheaders-processor (0.1.13)
130
130
  json (~> 2.0)
131
131
  json-ld (~> 3.2)
132
132
  json-ld-preloaded (~> 3.2)
data/Rakefile CHANGED
@@ -1,11 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "bundler/gem_tasks"
4
- require "rspec/core/rake_task"
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
5
 
6
6
  RSpec::Core::RakeTask.new(:spec)
7
7
 
8
- require "rubocop/rake_task"
8
+ require 'rubocop/rake_task'
9
9
 
10
10
  RuboCop::RakeTask.new
11
11
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.4"
4
+ VERSION = "0.1.7" # up to date
5
5
  end
data/lib/fsp_harvester.rb CHANGED
@@ -1,24 +1,25 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "fsp_harvester/version"
4
- require "json/ld"
5
- require "json/ld/preloaded"
6
- require "json"
7
- require "linkheaders/processor"
8
- require "addressable"
9
- require "tempfile"
10
- require "xmlsimple"
11
- require "nokogiri"
12
- require "parseconfig"
13
- require "rest-client"
14
- require "cgi"
15
- require "digest"
16
- require "open3"
17
- require "metainspector"
18
- require "rdf/xsd"
19
- require_relative "./metadata_object"
20
- require_relative "./constants"
21
- require_relative "./web_utils"
3
+ require_relative 'fsp_harvester/version'
4
+ require 'json/ld'
5
+ require 'json/ld/preloaded'
6
+ require 'json'
7
+ require 'linkheaders/processor'
8
+ require 'addressable'
9
+ require 'tempfile'
10
+ require 'xmlsimple'
11
+ require 'nokogiri'
12
+ require 'parseconfig'
13
+ require 'rest-client'
14
+ require 'cgi'
15
+ require 'digest'
16
+ require 'open3'
17
+ require 'metainspector'
18
+ require 'rdf/xsd'
19
+ require_relative './metadata_object'
20
+ require_relative './constants'
21
+ require_relative './web_utils'
22
+ require_relative './signposting_tests'
22
23
 
23
24
  module FspHarvester
24
25
  class Error < StandardError
@@ -27,33 +28,34 @@ module FspHarvester
27
28
  class Utils
28
29
  # @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
29
30
  # @warnings = JSON.parse(File.read("warnings.json"))
30
- @meta = FspHarvester::MetadataObject.new
31
+
31
32
 
32
33
  def self.resolve_guid(guid:)
34
+ @meta = FspHarvester::MetadataObject.new
33
35
  @meta.finalURI = [guid]
34
36
  type, url = convertToURL(guid: guid)
35
37
  links = Array.new
36
- unless type
37
- @meta.warnings << ["006", guid, ""]
38
- @meta.comments << "FATAL: GUID type not recognized.\n"
38
+ if type
39
+ links = resolve_url(url: url)
39
40
  else
40
- links, @meta = resolve_url(url: url)
41
+ @meta.warnings << ['006', guid, '']
42
+ @meta.comments << "FATAL: GUID type not recognized.\n"
41
43
  end
42
44
  [links, @meta]
43
45
  end
44
46
 
45
47
  def self.convertToURL(guid:)
46
48
  GUID_TYPES.each do |k, regex|
47
- if k == "inchi" and regex.match(guid)
48
- return "inchi", "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
49
- elsif k == "handle1" and regex.match(guid)
50
- return "handle", "http://hdl.handle.net/#{guid}"
51
- elsif k == "handle2" and regex.match(guid)
52
- return "handle", "http://hdl.handle.net/#{guid}"
53
- elsif k == "uri" and regex.match(guid)
54
- return "uri", guid
55
- elsif k == "doi" and regex.match(guid)
56
- return "doi", "https://doi.org/#{guid}"
49
+ if k == 'inchi' and regex.match(guid)
50
+ return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
51
+ elsif k == 'handle1' and regex.match(guid)
52
+ return 'handle', "http://hdl.handle.net/#{guid}"
53
+ elsif k == 'handle2' and regex.match(guid)
54
+ return 'handle', "http://hdl.handle.net/#{guid}"
55
+ elsif k == 'uri' and regex.match(guid)
56
+ return 'uri', guid
57
+ elsif k == 'doi' and regex.match(guid)
58
+ return 'doi', "https://doi.org/#{guid}"
57
59
  end
58
60
  end
59
61
  [nil, nil]
@@ -66,23 +68,23 @@ module FspHarvester
66
68
  false
67
69
  end
68
70
 
69
- def self.resolve_url(url:, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
70
- @meta.guidtype = "uri" if @meta.guidtype.nil?
71
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_ALL_HEADER)
72
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
71
73
  warn "\n\n FETCHING #{url} #{header}\n\n"
72
- response = FspHarvester::WebUtils.fspfetch(url: url, headers: header)
73
- warn "\n\n head #{response.headers.inspect}\n\n"
74
+ response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method)
75
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
74
76
 
75
77
  unless response
76
- @meta.warnings << ["001", url, header]
78
+ @meta.warnings << ['001', url, header]
77
79
  @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
78
- return [[], @meta]
80
+ return []
79
81
  end
80
82
 
81
83
  @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.finalURI.last}. Using the output from this URL for the next few tests..."
82
84
  @meta.full_response << response.body
83
85
 
84
86
  links = process_link_headers(response: response) unless nolinkheaders
85
- [links, @meta]
87
+ links
86
88
  end
87
89
 
88
90
  def self.process_link_headers(response:)
@@ -90,47 +92,43 @@ module FspHarvester
90
92
 
91
93
  parser = LinkHeaders::Processor.new(default_anchor: @meta.finalURI.last)
92
94
  parser.extract_and_parse(response: response)
93
- factory = parser.factory # LinkHeaders::LinkFactory
95
+ factory = parser.factory # LinkHeaders::LinkFactory
94
96
 
95
- citeas = 0
96
- describedby = 0
97
- warn "\n\n length #{factory.all_links.length}\n\n"
97
+ warn "\n\n length bfore #{factory.all_links.length}\n\n"
98
+ signpostingcheck(factory: factory)
99
+ warn "\n\n length aftr #{factory.all_links.length}\n\n"
100
+ warn "\n\n links #{factory.all_links}\n\n"
101
+ factory.all_links
102
+ end
98
103
 
104
+ def self.signpostingcheck(factory:)
105
+ citeas = Array.new
106
+ describedby = Array.new
107
+ item = Array.new
99
108
  factory.all_links.each do |l|
100
109
  case l.relation
101
- when "cite-as"
102
- citeas += 1
103
- when "describedby"
104
- describedby += 1
105
- unless l.respond_to? "type"
106
- @meta.warnings << ["005", l.url, ""]
107
- @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute\n"
108
- end
110
+ when 'cite-as'
111
+ citeas << l
112
+ when 'item'
113
+ item << l
114
+ when 'describedby'
115
+ describedby << l
109
116
  end
110
117
  end
111
- if citeas > 1
112
- self.check_for_conflicts(factory: factory) # this merelty adsds to the metadata objects if there are conflicts
113
- end
114
118
 
115
- unless citeas == 1 && describedby > 0
116
- @meta.warnings << ["004", "", ""]
117
- @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
118
- end
119
- factory.all_links
120
- end
119
+ check_describedby_rules(describedby: describedby)
120
+ check_item_rules(item: item)
121
121
 
122
- def self.check_for_conflicts(factory:) # incoming: {"link1" => {"sectiontype1" => value, "sectiontype2" => value2}}
123
- @meta.comments << "INFO: checking for conflicting cite-as links"
124
- citeas = Array.new
125
- factory.all_links.each do |link|
126
- next unless link.relation == 'cite-as'
127
- citeas << link.href
122
+ uniqueciteas = Array.new
123
+ if citeas.length > 1
124
+ warn "INFO: multiple cite-as links found. Checking for conflicts\n"
125
+ @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
126
+ uniqueciteas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
128
127
  end
129
- unless citeas == citeas.uniq
130
- @meta.warnings << ["007", url, header]
131
- @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers\n"
132
- else
133
- @meta.comments << "INFO: No conflicting cite-as links found."
128
+
129
+ unless uniqueciteas == 1 && describedby.length > 0
130
+ @meta.warnings << ['004', '', '']
131
+ @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
134
132
  end
135
133
  end
136
134
  end
data/lib/warnings.json CHANGED
@@ -28,6 +28,47 @@
28
28
  "message": "GUID type not recognized",
29
29
  "linkout": "",
30
30
  "severity": "WARN"
31
+ },
32
+ "007": {
33
+ "message": "Conflicting cite-as links",
34
+ "linkout": "",
35
+ "severity": "WARN"
36
+ },
37
+ "008": {
38
+ "message": "describedby link does not resolve",
39
+ "linkout": "",
40
+ "severity": "WARN"
41
+ },
42
+ "009": {
43
+ "message": "Content-type of described-by link does not match the type attribute in the link header itself",
44
+ "linkout": "",
45
+ "severity": "WARN"
46
+ },
47
+ "010": {
48
+ "message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
49
+ "linkout": "",
50
+ "severity": "WARN"
51
+ },
52
+ "011": {
53
+ "message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
54
+ "linkout": "",
55
+ "severity": "WARN"
56
+ },
57
+ "012": {
58
+ "message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
59
+ "linkout": "",
60
+ "severity": "WARN"
61
+ },
62
+ "013": {
63
+ "message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
64
+ "linkout": "",
65
+ "severity": "WARN"
66
+ },
67
+ "014": {
68
+ "message": "Item link does not resolve",
69
+ "linkout": "",
70
+ "severity": "WARN"
31
71
  }
72
+
32
73
 
33
74
  }
data/lib/web_utils.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  module FspHarvester
2
2
 
3
3
  class WebUtils
4
- def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER) # we will try to retrieve turtle whenever possible
4
+ def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get)
5
5
  warn 'In fetch routine now. '
6
6
 
7
7
  begin
8
8
  warn "executing call over the Web to #{url}"
9
9
  response = RestClient::Request.execute({
10
- method: :get,
10
+ method: method,
11
11
  url: url.to_s,
12
12
  # user: user,
13
13
  # password: pass,
@@ -26,7 +26,7 @@ module FspHarvester
26
26
  warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
27
27
  @meta.warnings << ["003", url, headers] if @meta
28
28
  @meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n" if @meta
29
- if e.response.code == 500
29
+ if (e.response.code == 500 or e.response.code == 404)
30
30
  return false
31
31
  else
32
32
  e.response
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-07-27 00:00:00.000000000 Z
11
+ date: 2022-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.1.11
47
+ version: 0.1.13
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.1.11
54
+ version: 0.1.13
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: metainspector
57
57
  requirement: !ruby/object:Gem::Requirement