fsp_harvester 0.1.9 → 0.1.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 923ccb362ef4a71fa2f221b1b224dbd5d3ec78a14cc9da0e12a1e0df804162ff
4
- data.tar.gz: e70a9a994c504a0867ab10e05c07d440a76677d7ff27e27b9bdb0c3338c02ffe
3
+ metadata.gz: b38eea15fa26a3fe07290024342f8b6121dbd78c3cd2dd3496ca118fca22f6d4
4
+ data.tar.gz: a25ea37ecd78b2ef8dc41dca391c161ba4b262d910dbadcf34f07f0cd8e54af5
5
5
  SHA512:
6
- metadata.gz: d819cb1c19d40f8a093b723cbeb4273d122d3207df2d874558327fa6a2622be1bf2c671cc9dfebee74c689722825c2dd1957f8be6bfcf2c14c09097c1dc05a5b
7
- data.tar.gz: 508a484d93eab373d0389c11f4361068d5586759a73573901ea54f4d2a028f713f71bb492976785499f1074eb8b359d79be550c09b196eb82838a1e401398fc4
6
+ metadata.gz: 83b766f2896a0776ed75ab3fd1e235d2c80173d3ffc0c22aea80b234856392497daf312db36461f03d0ca168228e6e385edf6d789b42dc4b43fcd6a073cda234
7
+ data.tar.gz: 16d74c199a138db0225e88c0e092a5272a518785c397d27da74c511a69260444e6c1c440021e08413c131d672cd2d528e8d19dbd8a20ca1bff97dad38600c995
data/.rspec_status CHANGED
@@ -1,55 +1,55 @@
1
1
  example_id | status | run_time |
2
2
  ---------------------------------- | ------ | --------------- |
3
- ./spec/cite-as_spec.rb[1:1:1] | passed | 1.17 seconds |
4
- ./spec/cite-as_spec.rb[1:1:2] | passed | 0.98776 seconds |
5
- ./spec/cite-as_spec.rb[1:1:3] | passed | 0.69753 seconds |
6
- ./spec/cite-as_spec.rb[1:1:4] | passed | 1.31 seconds |
7
- ./spec/cite-as_spec.rb[1:1:5] | passed | 2.07 seconds |
8
- ./spec/cite-as_spec.rb[1:1:6] | passed | 1.45 seconds |
9
- ./spec/cite-as_spec.rb[1:1:7] | passed | 2.75 seconds |
10
- ./spec/cite-as_spec.rb[1:1:8] | passed | 1.83 seconds |
11
- ./spec/cite-as_spec.rb[1:1:9] | passed | 2.51 seconds |
12
- ./spec/cite-as_spec.rb[1:1:10] | passed | 1.73 seconds |
13
- ./spec/cite-as_spec.rb[1:1:11] | passed | 2.35 seconds |
14
- ./spec/cite-as_spec.rb[1:1:12] | passed | 2.01 seconds |
15
- ./spec/cite-as_spec.rb[1:1:13] | passed | 2.56 seconds |
16
- ./spec/cite-as_spec.rb[1:1:14] | passed | 1.68 seconds |
17
- ./spec/cite-as_spec.rb[1:1:15] | passed | 1.06 seconds |
18
- ./spec/cite-as_spec.rb[1:1:16] | passed | 1.03 seconds |
19
- ./spec/cite-as_spec.rb[1:1:17] | passed | 0.94321 seconds |
20
- ./spec/cite-as_spec.rb[1:1:18] | passed | 1.1 seconds |
21
- ./spec/cite-as_spec.rb[1:1:19] | passed | 1.45 seconds |
22
- ./spec/cite-as_spec.rb[1:1:20] | passed | 1.53 seconds |
23
- ./spec/cite-as_spec.rb[1:1:21] | passed | 1.64 seconds |
24
- ./spec/cite-as_spec.rb[1:1:22] | passed | 1.01 seconds |
25
- ./spec/cite-as_spec.rb[1:1:23] | passed | 1.09 seconds |
26
- ./spec/cite-as_spec.rb[1:1:24] | failed | 1.22 seconds |
27
- ./spec/cite-as_spec.rb[1:1:25] | passed | 0.38248 seconds |
28
- ./spec/describedby_spec.rb[1:1:1] | passed | 2.24 seconds |
29
- ./spec/describedby_spec.rb[1:1:2] | passed | 1.08 seconds |
30
- ./spec/describedby_spec.rb[1:1:3] | passed | 1 second |
31
- ./spec/describedby_spec.rb[1:1:4] | passed | 1.14 seconds |
32
- ./spec/describedby_spec.rb[1:1:5] | passed | 1.03 seconds |
33
- ./spec/describedby_spec.rb[1:1:6] | passed | 0.81364 seconds |
34
- ./spec/describedby_spec.rb[1:1:7] | passed | 0.77543 seconds |
35
- ./spec/describedby_spec.rb[1:1:8] | passed | 2.01 seconds |
36
- ./spec/describedby_spec.rb[1:1:9] | passed | 1.35 seconds |
37
- ./spec/describedby_spec.rb[1:1:10] | passed | 1.73 seconds |
38
- ./spec/describedby_spec.rb[1:1:11] | passed | 2.36 seconds |
39
- ./spec/describedby_spec.rb[1:1:12] | passed | 2.73 seconds |
40
- ./spec/describedby_spec.rb[1:1:13] | passed | 1.5 seconds |
41
- ./spec/describedby_spec.rb[1:1:14] | passed | 1.8 seconds |
42
- ./spec/describedby_spec.rb[1:1:15] | passed | 1.65 seconds |
43
- ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00053 seconds |
44
- ./spec/fsp_harvester_spec.rb[1:2] | passed | 1.76 seconds |
45
- ./spec/item_spec.rb[1:1:1] | passed | 2.08 seconds |
46
- ./spec/item_spec.rb[1:1:2] | passed | 2.27 seconds |
47
- ./spec/item_spec.rb[1:1:3] | passed | 1.22 seconds |
48
- ./spec/item_spec.rb[1:1:4] | passed | 1.61 seconds |
49
- ./spec/item_spec.rb[1:1:5] | passed | 1.74 seconds |
50
- ./spec/item_spec.rb[1:1:6] | passed | 1.95 seconds |
51
- ./spec/item_spec.rb[1:1:7] | passed | 3.59 seconds |
52
- ./spec/item_spec.rb[1:1:8] | passed | 0.41001 seconds |
53
- ./spec/type_spec.rb[1:1:1] | passed | 1.14 seconds |
54
- ./spec/type_spec.rb[1:1:2] | passed | 0.94799 seconds |
55
- ./spec/type_spec.rb[1:1:3] | passed | 1.04 seconds |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.3 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 1.21 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 1.09 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 1.69 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 2.72 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 2.3 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 3.36 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 2.26 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 2.82 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 2.3 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 3.37 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.2 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 2.94 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 2.44 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.54 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.29 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 1.25 seconds |
20
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.15 seconds |
21
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
22
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 1.66 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 2.41 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.64 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.35 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.25 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.51152 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 2.71 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.25 seconds |
30
+ ./spec/describedby_spec.rb[1:1:3] | passed | 1.22 seconds |
31
+ ./spec/describedby_spec.rb[1:1:4] | passed | 1.22 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.21 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 1.02 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 0.99175 seconds |
35
+ ./spec/describedby_spec.rb[1:1:8] | passed | 2.44 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 1.72 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 2.15 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 3.17 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 3.1 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.7 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 2.21 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 2.18 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
45
+ ./spec/item_spec.rb[1:1:1] | passed | 3.09 seconds |
46
+ ./spec/item_spec.rb[1:1:2] | passed | 2.92 seconds |
47
+ ./spec/item_spec.rb[1:1:3] | passed | 1.12 seconds |
48
+ ./spec/item_spec.rb[1:1:4] | passed | 1.7 seconds |
49
+ ./spec/item_spec.rb[1:1:5] | passed | 2.24 seconds |
50
+ ./spec/item_spec.rb[1:1:6] | passed | 2.87 seconds |
51
+ ./spec/item_spec.rb[1:1:7] | passed | 3.03 seconds |
52
+ ./spec/item_spec.rb[1:1:8] | passed | 0.52338 seconds |
53
+ ./spec/type_spec.rb[1:1:1] | passed | 1.42 seconds |
54
+ ./spec/type_spec.rb[1:1:2] | passed | 1.28 seconds |
55
+ ./spec/type_spec.rb[1:1:3] | passed | 1.52 seconds |
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.9)
4
+ fsp_harvester (0.1.12)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.15)
7
+ linkheaders-processor (~> 0.1.16)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -36,7 +36,7 @@ GEM
36
36
  scanf (~> 1.0)
37
37
  sxp (~> 1.2)
38
38
  unicode-types (~> 1.7)
39
- faraday (1.10.0)
39
+ faraday (1.10.1)
40
40
  faraday-em_http (~> 1.0)
41
41
  faraday-em_synchrony (~> 1.0)
42
42
  faraday-excon (~> 1.1)
@@ -82,13 +82,13 @@ GEM
82
82
  concurrent-ruby (~> 1.0)
83
83
  json (2.6.2)
84
84
  json-canonicalization (0.3.0)
85
- json-ld (3.2.1)
85
+ json-ld (3.2.3)
86
86
  htmlentities (~> 4.3)
87
87
  json-canonicalization (~> 0.3)
88
88
  link_header (~> 0.0, >= 0.0.8)
89
89
  multi_json (~> 1.15)
90
90
  rack (~> 2.2)
91
- rdf (~> 3.2)
91
+ rdf (~> 3.2, >= 3.2.9)
92
92
  json-ld-preloaded (3.2.0)
93
93
  json-ld (~> 3.2)
94
94
  rdf (~> 3.2)
@@ -126,7 +126,7 @@ GEM
126
126
  shex (~> 0.7)
127
127
  sparql (~> 3.2)
128
128
  sparql-client (~> 3.2)
129
- linkheaders-processor (0.1.15)
129
+ linkheaders-processor (0.1.16)
130
130
  json (~> 2.0)
131
131
  json-ld (~> 3.2)
132
132
  json-ld-preloaded (~> 3.2)
@@ -166,7 +166,7 @@ GEM
166
166
  rack (2.2.4)
167
167
  rainbow (3.1.1)
168
168
  rake (13.0.6)
169
- rdf (3.2.8)
169
+ rdf (3.2.9)
170
170
  link_header (~> 0.0, >= 0.0.8)
171
171
  rdf-aggregate-repo (3.2.1)
172
172
  rdf (~> 3.2)
data/lib/config.conf ADDED
@@ -0,0 +1,8 @@
1
+ [extruct]
2
+ command="extruct"
3
+
4
+ [rdf]
5
+ command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
6
+
7
+ [tika]
8
+ command="http://tika:9998/meta"
@@ -1,12 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module FspHarvester
3
+ module HarvesterTools
4
4
  class Error < StandardError
5
5
  end
6
6
 
7
7
  class ExternalTools
8
8
 
9
- def initialize(metadata: FspHarvester::MetadataObject.new)
9
+ def initialize(metadata: HarvesterTools::MetadataObject.new)
10
10
  @meta = metadata
11
11
  end
12
12
 
@@ -25,10 +25,7 @@ module FspHarvester
25
25
  file.rewind
26
26
 
27
27
  @meta.comments << "INFO: The message body is being examined by Distiller\n"
28
- # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
29
- command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
30
- # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
31
- # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
28
+ command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
32
29
  warn "distiller command: #{command}"
33
30
  result, _stderr, _status = Open3.capture3(command)
34
31
  warn ''
@@ -40,13 +37,14 @@ module FspHarvester
40
37
  warn "DIST RESULT: #{result}"
41
38
  if result !~ /@context/i # failure returns nil
42
39
  @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
43
- @meta.warnings << ['018', '', '']
40
+ @meta.add_warning(['018', '', ''])
41
+ result = "{}"
44
42
  else
45
43
  @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
46
- parse_rdf(result: result, content_type: "application/ld+json")
47
44
  end
48
45
  @@distillerknown[bhash] = true
49
46
  end
47
+ result
50
48
  end
51
49
 
52
50
  def processs_with_extruct(uri:)
@@ -55,28 +53,32 @@ module FspHarvester
55
53
  stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
56
54
  warn "open3 status: #{status} #{stdout}"
57
55
  result = stderr # absurd that the output comes over stderr! LOL!
56
+ jsonld = {}
57
+ microdata = Hash.new
58
+ microformat = Hash.new
59
+ opengraph = Hash.new
60
+ rdfa = Hash.new
58
61
 
59
62
  if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
60
63
  @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
61
- @meta.warnings << ['019', '', '']
64
+ @meta.add_warning(['019', '', ''])
62
65
  if result.to_s.match(/(ValueError:.*?)\n/)
63
66
  @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
64
- @meta.warnings << ['019', '', '']
67
+ @meta.add_warning(['019', '', ''])
65
68
  end
66
69
  elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
67
70
  json = JSON.parse result
68
71
  @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
69
-
70
- parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
71
- @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
72
- @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
73
- @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
74
- parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
75
-
76
- @meta.merge_hash(json.first) if json.first.is_a? Hash
72
+ jsonld = json['json-ld'].to_json if json['json-ld'].any?
73
+ microdata = json['microdata'].first if json['microdata'].any
74
+ microformat = json['microformat'].first if json['microformat'].any?
75
+ opengraph = json['opengraph'].first if json['opengraph'].any?
76
+ rdfa = json['rdfa'].to_json if json['rdfa'].any?
77
+ # @meta.merge_hash(json.first) if json.first.is_a? Hash
77
78
  else
78
79
  @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
79
80
  end
81
+ [jsonld, microdata, microformat, opengraph, rdfa]
80
82
  end
81
83
  end
82
84
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.9"
4
+ VERSION = "0.1.12"
5
5
  end
data/lib/fsp_harvester.rb CHANGED
@@ -1,121 +1,23 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative 'fsp_harvester/version'
4
- require 'json/ld'
5
- require 'json/ld/preloaded'
6
- require 'json'
7
- require 'linkheaders/processor'
8
- require 'addressable'
9
- require 'tempfile'
10
- require 'xmlsimple'
11
- require 'nokogiri'
12
- require 'parseconfig'
13
- require 'rest-client'
14
- require 'cgi'
15
- require 'digest'
16
- require 'open3'
17
- require 'metainspector'
18
- require 'rdf/xsd'
19
- require_relative './metadata_object'
20
- require_relative './constants'
21
- require_relative './web_utils'
22
- require_relative './signposting_tests'
23
- require_relative './fsp_metadata_harvester'
24
- require_relative './fsp_metadata_parser'
25
-
26
1
 
2
+ require_relative 'harvester'
27
3
  module FspHarvester
28
4
  class Error < StandardError
29
5
  end
30
6
 
31
7
  class Utils
32
- # @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
33
- # @warnings = JSON.parse(File.read("warnings.json"))
34
-
35
-
36
- def self.resolve_guid(guid:)
37
- @meta = FspHarvester::MetadataObject.new
38
- @meta.all_uris = [guid]
39
- type, url = convertToURL(guid: guid)
40
- links = Array.new
41
- if type
42
- links = resolve_url(url: url)
43
- @meta.links << links
44
- else
45
- @meta.warnings << ['006', guid, '']
46
- @meta.comments << "FATAL: GUID type not recognized.\n"
47
- end
48
- [links, @meta]
49
- end
50
8
 
51
- def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
9
+ def self.gather_metadata_from_describedby_links(links: [], metadata: HarvesterTools::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
52
10
  @meta = metadata
53
11
  db = []
54
12
  links.each do |l|
55
13
  db << l if l.relation == 'describedby'
56
14
  end
57
- FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
15
+ HarvesterTools::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
58
16
  @meta
59
17
  end
60
18
 
61
- def self.convertToURL(guid:)
62
- GUID_TYPES.each do |k, regex|
63
- if k == 'inchi' and regex.match(guid)
64
- return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
65
- elsif k == 'handle1' and regex.match(guid)
66
- return 'handle', "http://hdl.handle.net/#{guid}"
67
- elsif k == 'handle2' and regex.match(guid)
68
- return 'handle', "http://hdl.handle.net/#{guid}"
69
- elsif k == 'uri' and regex.match(guid)
70
- return 'uri', guid
71
- elsif k == 'doi' and regex.match(guid)
72
- return 'doi', "https://doi.org/#{guid}"
73
- end
74
- end
75
- [nil, nil]
76
- end
77
-
78
- def self.typeit(guid:)
79
- Utils::GUID_TYPES.each do |type, regex|
80
- return type if regex.match(guid)
81
- end
82
- false
83
- end
84
-
85
- def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
86
- @meta.guidtype = 'uri' if @meta.guidtype.nil?
87
- warn "\n\n FETCHING #{url} #{header}\n\n"
88
- response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
89
- warn "\n\n head #{response.headers.inspect}\n\n" if response
90
-
91
- unless response
92
- @meta.warnings << ['001', url, header]
93
- @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
94
- return []
95
- end
96
-
97
- @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
98
- @meta.full_response << response.body
99
-
100
- links = process_link_headers(response: response) unless nolinkheaders
101
- links
102
- end
103
-
104
- def self.process_link_headers(response:)
105
- warn "\n\n parsing #{response.headers}\n\n"
106
-
107
- parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
108
- parser.extract_and_parse(response: response)
109
- factory = parser.factory # LinkHeaders::LinkFactory
110
-
111
- warn "\n\n length bfore #{factory.all_links.length}\n\n"
112
- signpostingcheck(factory: factory)
113
- warn "\n\n length aftr #{factory.all_links.length}\n\n"
114
- warn "\n\n links #{factory.all_links}\n\n"
115
- factory.all_links
116
- end
117
-
118
- def self.signpostingcheck(factory:)
19
+ def self.signpostingcheck(factory:, metadata: HarvesterTools::MetadataObject.new)
20
+ @meta = metadata
119
21
  citeas = Array.new
120
22
  describedby = Array.new
121
23
  item = Array.new
@@ -134,22 +36,22 @@ module FspHarvester
134
36
  end
135
37
  end
136
38
 
137
- check_describedby_rules(describedby: describedby)
138
- check_item_rules(item: item)
39
+ check_describedby_rules(describedby: describedby, metadata: @meta)
40
+ check_item_rules(item: item, metadata: @meta)
139
41
 
140
42
  if citeas.length > 1
141
43
  warn "INFO: multiple cite-as links found. Checking for conflicts\n"
142
44
  @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
143
- citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
45
+ citeas = check_for_citeas_conflicts(citeas: citeas, metadata: @meta) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
144
46
  end
145
47
 
146
48
  unless citeas.length == 1 && describedby.length > 0
147
- @meta.warnings << ['004', '', '']
49
+ @meta.add_warning(['004', '', ''])
148
50
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header\n"
149
51
  end
150
52
 
151
53
  unless types.length >=1
152
- @meta.warnings << ['015', '', '']
54
+ @meta.add_warning(['015', '', ''])
153
55
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires one or two 'type' link headers\n"
154
56
  end
155
57
  end
data/lib/harvester.rb ADDED
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ #require_relative 'fsp_harvester/version'
4
+ require 'json/ld'
5
+ require 'json/ld/preloaded'
6
+ require 'json'
7
+ require 'linkheaders/processor'
8
+ require 'addressable'
9
+ require 'tempfile'
10
+ require 'xmlsimple'
11
+ require 'nokogiri'
12
+ require 'parseconfig'
13
+ require 'rest-client'
14
+ require 'cgi'
15
+ require 'digest'
16
+ require 'open3'
17
+ require 'metainspector'
18
+ require 'rdf/xsd'
19
+ require_relative './metadata_object'
20
+ require_relative './constants'
21
+ require_relative './web_utils'
22
+ require_relative './signposting_tests'
23
+ require_relative './metadata_harvester'
24
+ require_relative './fsp_harvester'
25
+ require_relative './harvester_utils'
26
+ require_relative './external_tools'
27
+ require_relative './metadata_parser'
@@ -0,0 +1,75 @@
1
+ module HarvesterTools
2
+ class Error < StandardError
3
+ end
4
+
5
+ class Utils
6
+
7
+ def self.resolve_guid(guid:)
8
+ @meta = HarvesterTools::MetadataObject.new
9
+ @meta.all_uris = [guid]
10
+ type, url = convertToURL(guid: guid)
11
+ links = Array.new
12
+ if type
13
+ links = resolve_url(url: url)
14
+ @meta.links = @meta.links | links
15
+ else
16
+ @meta.add_warning(['006', guid, ''])
17
+ @meta.comments << "FATAL: GUID type not recognized.\n"
18
+ end
19
+ [links, @meta]
20
+ end
21
+
22
+ def self.convertToURL(guid:)
23
+ GUID_TYPES.each do |k, regex|
24
+ if k == 'inchi' and regex.match(guid)
25
+ return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
26
+ elsif k == 'handle1' and regex.match(guid)
27
+ return 'handle', "http://hdl.handle.net/#{guid}"
28
+ elsif k == 'handle2' and regex.match(guid)
29
+ return 'handle', "http://hdl.handle.net/#{guid}"
30
+ elsif k == 'uri' and regex.match(guid)
31
+ return 'uri', guid
32
+ elsif k == 'doi' and regex.match(guid)
33
+ return 'doi', "https://doi.org/#{guid}"
34
+ end
35
+ end
36
+ [nil, nil]
37
+ end
38
+
39
+ def self.typeit(guid:)
40
+ GUID_TYPES.each do |type, regex|
41
+ return type if regex.match(guid)
42
+ end
43
+ false
44
+ end
45
+
46
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
47
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
48
+ warn "\n\n FETCHING #{url} #{header}\n\n"
49
+ response = HarvesterTools::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
50
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
51
+
52
+ unless response
53
+ @meta.add_warning(['001', url, header])
54
+ @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
55
+ return []
56
+ end
57
+
58
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
59
+ @meta.full_response << response.body
60
+
61
+ links = process_link_headers(response: response) unless nolinkheaders
62
+ links
63
+ end
64
+
65
+ def self.process_link_headers(response:)
66
+ warn "\n\n parsing #{response.headers}\n\n"
67
+
68
+ parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
69
+ parser.extract_and_parse(response: response)
70
+ factory = parser.factory # LinkHeaders::LinkFactory
71
+ FspHarvester::Utils.signpostingcheck(factory: factory, metadata: @meta)
72
+ factory.all_links
73
+ end
74
+ end
75
+ end
@@ -1,17 +1,17 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module FspHarvester
3
+ module HarvesterTools
4
4
  class Error < StandardError
5
5
  end
6
6
 
7
7
  class MetadataHarvester
8
- def self.extract_metadata(links: [], metadata: FspHarvester::MetadataObject.new)
8
+ def self.extract_metadata(links: [], metadata: HarvesterTools::MetadataObject.new)
9
9
  @meta = metadata
10
10
  @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'
11
11
 
12
12
  describedby = links.select { |l| l if l.relation == 'describedby' }
13
13
 
14
- hvst = FspHarvester::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
14
+ hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
15
15
  describedby.each do |link|
16
16
  accepttype = ACCEPT_STAR_HEADER
17
17
  accept = link.respond_to?('type') ? link.type : nil
@@ -21,7 +21,7 @@ module FspHarvester
21
21
 
22
22
  abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
23
23
  unless abbreviation
24
- @meta.warnings << ['017', url, header]
24
+ @meta.add_warning(['017', url, header])
25
25
  @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
26
26
  next
27
27
  end
@@ -30,16 +30,16 @@ module FspHarvester
30
30
  case abbreviation
31
31
  when 'html'
32
32
  @meta.comments << 'INFO: Processing html'
33
- hvst.process_html(body: response.body, uri: link)
33
+ hvst.process_html(body: response.body, uri: link, metadata: @meta)
34
34
  when 'xml'
35
35
  @meta.comments << 'INFO: Processing xml'
36
- hvst.process_xml(body: response.body)
36
+ hvst.process_xml(body: response.body, metadata: @meta)
37
37
  when 'json'
38
38
  @meta.comments << 'INFO: Processing json'
39
- hvst.process_json(body: response.body)
39
+ hvst.process_json(body: response.body, metadata: @meta)
40
40
  when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
41
41
  @meta.comments << 'INFO: Processing linked data'
42
- hvst.process_ld(body: response.body, content_type: content_type)
42
+ hvst.process_ld(body: response.body, content_type: content_type, metadata: @meta)
43
43
  when 'specialist'
44
44
  warn 'no specialized parsers so far'
45
45
  end
@@ -54,9 +54,9 @@ module FspHarvester
54
54
  @meta.comments << "INFO: link #{link.href} has no MIME type, defaulting to */*"
55
55
  end
56
56
  url = link.href
57
- response = FspHarvester::WebUtils.fspfetch(url: url, method: :get, headers: header)
57
+ response = HarvesterTools::WebUtils.fspfetch(url: url, method: :get, headers: header)
58
58
  unless response
59
- @meta.warnings << ['016', url, header]
59
+ @meta.add_warning(['016', url, header])
60
60
  @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
61
61
  end
62
62
  response
@@ -87,7 +87,7 @@ module FspHarvester
87
87
  end
88
88
 
89
89
  unless content_type
90
- @meta.warnings << ['017', url, header]
90
+ @meta.add_warning(['017', url, header])
91
91
  @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized. Processing will end now.\n"
92
92
  end
93
93
  [abbreviation, content_type]
@@ -1,4 +1,4 @@
1
- module FspHarvester
1
+ module HarvesterTools
2
2
  class MetadataObject
3
3
  attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
4
4
 
@@ -10,6 +10,9 @@ module FspHarvester
10
10
  @full_response = []
11
11
  @links = []
12
12
  @all_uris = []
13
+ w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
14
+ #@warn = File.read("./lib/warnings.json")
15
+ @warn = JSON.parse(w)
13
16
  end
14
17
 
15
18
  def merge_hash(hash)
@@ -25,6 +28,16 @@ module FspHarvester
25
28
  def rdf
26
29
  graph
27
30
  end
31
+
32
+ def add_warning(warning)
33
+ id = warning[0]
34
+ url = warning[1]
35
+ headers = warning[2]
36
+ message = @warn[id]['message']
37
+ linkout = @warn[id]['linkout']
38
+ severity = @warn[id]['severity']
39
+ self.warnings << {"id" => id, "message" => message, "severity" => severity, "linkout" => linkout, "processed_url" => url, "accept_headers": headers}
40
+ end
28
41
  end
29
42
 
30
43
  class Cache
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module FspHarvester
3
+ module HarvesterTools
4
4
  class Error < StandardError
5
5
  end
6
6
 
@@ -9,63 +9,78 @@ module FspHarvester
9
9
 
10
10
  @@distillerknown = {}
11
11
 
12
- def initialize(metadata_object: FspHarvester::MetadataObject.new)
12
+ def initialize(metadata_object: HarvesterTools::MetadataObject.new)
13
13
  @meta = metadata_object
14
14
  end
15
15
 
16
- def process_html(body:, uri:)
17
- tools = FspHarvester::ExternalTools.new(metadata: @meta)
18
- tools.process_with_distiller(body: body)
19
- tools.process_with_extruct(uri: uri)
16
+ def process_html(body:, uri:, metadata:)
17
+ @meta = metadata
18
+ tools = HarvesterTools::ExternalTools.new(metadata: @meta)
19
+ result = tools.process_with_distiller(body: body)
20
+
21
+ jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
22
+ parse_rdf(body: jsonld, content_type: 'application/ld+json')
23
+ @meta.merge_hash(microdata)
24
+ @meta.merge_hash(microformat)
25
+ @meta.merge_hash(opengraph)
26
+ parse_rdf(body: rdfa, content_type: 'application/ld+json')
20
27
  end
21
28
 
22
- def process_xml(body:)
29
+ def process_xml(body:, metadata:)
30
+ @meta = metadata
23
31
  begin
24
32
  hash = XmlSimple.xml_in(body)
25
33
  rescue
26
34
  @meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
27
- @meta.warnings << ['020', '', '']
35
+ @meta.add_warning(['020', '', ''])
28
36
  end
29
37
  @meta.comments << "INFO: The XML is being merged in the metadata object\n"
30
38
  @meta.hash.merge hash
31
39
  end
32
40
 
33
- def process_json(body:)
41
+ def process_json(body:, metadata:)
42
+ @meta = metadata
34
43
  begin
35
44
  hash = JSON.parse(body)
36
45
  rescue
37
46
  @meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
38
- @meta.warnings << ['021', '', '']
47
+ @meta.add_warning(['021', '', ''])
39
48
  end
40
49
  @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
41
50
  @meta.hash.merge hash
42
51
  end
43
52
 
44
- def process_ld(body:, content_type:)
45
- parse_rdf(body: body, content_type: content_type)
53
+ def process_ld(body:, content_type:, metadata:)
54
+ @meta = metadata
55
+ parse_rdf(body: body, content_type: content_type, metadata: @meta)
56
+ end
57
+
58
+ def parse_rdf(body:, content_type:, metadata:)
59
+ self.class.parse_rdf(body: body, content_type: content_type, metadata: metadata)
46
60
  end
47
61
 
48
- def parse_rdf(body:, content_type:)
62
+ def self.parse_rdf(body:, content_type:, metadata:)
63
+ @meta = metadata
49
64
  unless body
50
65
  @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
51
- @meta.warnings << ['018', '', '']
66
+ @meta.add_warning(['018', '', ''])
52
67
  return
53
68
  end
54
69
 
55
70
  unless body.match(/\w/)
56
71
  @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
57
- @meta.warnings << ['018', '', '']
72
+ @meta.add_warning(['018', '', ''])
58
73
  return
59
74
  end
60
75
 
61
76
  rdfformat = RDF::Format.for(content_type: content_type)
62
77
  unless rdfformat
63
78
  @meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
64
- @meta.warnings << ['018', '', '']
79
+ @meta.add_warning(['018', '', ''])
65
80
  return
66
81
  end
67
82
 
68
- graph = FspHarvester::Cache.checkRDFCache(body: body)
83
+ graph = HarvesterTools::Cache.checkRDFCache(body: body)
69
84
  if graph.size > 0
70
85
  warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
71
86
  @meta.merge_rdf(graph.to_a)
@@ -77,7 +92,7 @@ module FspHarvester
77
92
  reader = rdfformat.reader.new(body)
78
93
  rescue Exception => e
79
94
  @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
80
- @meta.warnings << ['018', '', '']
95
+ @meta.add_warning(['018', '', ''])
81
96
  return
82
97
  end
83
98
 
@@ -88,7 +103,7 @@ module FspHarvester
88
103
  end
89
104
  reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
90
105
  warn 'WRITING TO CACHE'
91
- FspHarvester::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
106
+ HarvesterTools::Cache.writeRDFCache(reader: reader, body: body) # write to the special RDF graph cache
92
107
  warn 'WRITING DONE'
93
108
  reader = rdfformat.reader.new(body) # frustrating that we cannot rewind!
94
109
  warn 'RE-READING DONE'
@@ -97,11 +112,11 @@ module FspHarvester
97
112
  rescue RDF::ReaderError => e
98
113
  @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
99
114
  warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
100
- @meta.warnings << ['018', '', '']
115
+ @meta.add_warning(['018', '', ''])
101
116
  rescue Exception => e
102
117
  meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
103
118
  warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body}). Moving on...\n"
104
- @meta.warnings << ['018', '', '']
119
+ @meta.add_warning(['018', '', ''])
105
120
  end
106
121
  end
107
122
  end
@@ -1,4 +1,5 @@
1
- def check_for_citeas_conflicts(citeas: )
1
+ def check_for_citeas_conflicts(citeas:, metadata: )
2
+ @meta = metadata
2
3
  @meta.comments << 'INFO: checking for conflicting cite-as links'
3
4
  citeas_hrefs = Hash.new
4
5
  citeas.each do |link|
@@ -6,26 +7,27 @@ def check_for_citeas_conflicts(citeas: )
6
7
  @meta.comments << "INFO: Adding citeas #{link.href} to the testing queue."
7
8
  citeas_hrefs[link.href] = link
8
9
  end
9
-
10
+ #warn "finalhash #{citeas_hrefs}"
10
11
  if citeas_hrefs.length > 1
11
12
  @meta.comments << 'INFO: Found multiple non-identical cite-as links.'
12
- @meta.warnings << ['007', '', '']
13
+ @meta.add_warning(['007', '', ''])
13
14
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard: Found conflicting cite-as link headers.\n"
14
15
  end
15
16
  citeas_hrefs.values # return list of unique links
16
17
  end
17
18
 
18
19
 
19
- def check_describedby_rules(describedby:)
20
+ def check_describedby_rules(describedby:, metadata:)
21
+ @meta = metadata
20
22
  describedby.each do |l|
21
23
  unless l.respond_to? 'type'
22
- @meta.warnings << ['005', l.href, '']
24
+ @meta.add_warning(['005', l.href, ''])
23
25
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute.\n"
24
26
  end
25
27
  type = l.type if l.respond_to? 'type'
26
28
  type ||= '*/*'
27
29
  header = { accept: type }
28
- response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
30
+ response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
29
31
  if response
30
32
  responsetype = response.headers[:content_type]
31
33
  @meta.comments << "INFO: describedby link responds with content type #{responsetype}\n"
@@ -37,30 +39,31 @@ def check_describedby_rules(describedby:)
37
39
  if responsetype == type
38
40
  @meta.comments << "INFO: describedby link responds according to Signposting specifications\n"
39
41
  else
40
- @meta.warnings << ['009', l.href, header]
42
+ @meta.add_warning(['009', l.href, header])
41
43
  @meta.comments << "WARN: Content type of returned describedby link #{responsetype}does not match the 'type' attribute #{type}\n"
42
44
  end
43
45
  else
44
- @meta.warnings << ['010', l.href, header]
46
+ @meta.add_warning(['010', l.href, header])
45
47
  @meta.comments << "WARN: Content type of returned describedby link is not specified in response headers or cannot be matched against accept headers\n"
46
48
  end
47
49
  else
48
- @meta.warnings << ['008', l.href, header]
50
+ @meta.add_warning(['008', l.href, header])
49
51
  @meta.comments << "WARN: describedby link doesn't resolve\n"
50
52
  end
51
53
  end
52
54
  end
53
55
 
54
- def check_item_rules(item:)
56
+ def check_item_rules(item:, metadata:)
57
+ @meta = metadata
55
58
  item.each do |l| # l = LinkHeaders::Link
56
59
  unless l.respond_to? 'type'
57
- @meta.warnings << ['011', l.href, '']
60
+ @meta.add_warning(['011', l.href, ''])
58
61
  @meta.comments << "WARN: The resource does not follow the FAIR Signposting standard, which encourages any item links to also have a 'type' attribute.\n"
59
62
  end
60
63
  type = l.type if l.respond_to? 'type'
61
64
  type ||= '*/*' # this becomes a frozen string
62
65
  header = { accept: type }
63
- response = FspHarvester::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
66
+ response = HarvesterTools::WebUtils.fspfetch(url: l.href, headers: header, method: :head)
64
67
 
65
68
  if response
66
69
  if response.headers[:content_type] and type != '*/*'
@@ -72,15 +75,15 @@ def check_item_rules(item:)
72
75
  warn typeregex.inspect
73
76
  @meta.comments << "INFO: item link responds according to Signposting specifications\n"
74
77
  else
75
- @meta.warnings << ['012', l.href, header]
78
+ @meta.add_warning(['012', l.href, header])
76
79
  @meta.comments << "WARN: Content type of returned item link does not match the 'type' attribute\n"
77
80
  end
78
81
  else
79
- @meta.warnings << ['013', l.href, header]
82
+ @meta.add_warning(['013', l.href, header])
80
83
  @meta.comments << "WARN: Content type of returned item link is not specified in response headers or cannot be matched against accept headers\n"
81
84
  end
82
85
  else
83
- @meta.warnings << ['014', l.href, header]
86
+ @meta.add_warning(['014', l.href, header])
84
87
  @meta.comments << "WARN: item link doesn't resolve\n"
85
88
  end
86
89
  end
data/lib/swagger.rb CHANGED
@@ -1,64 +1,39 @@
1
- class Swagger
2
- attr_accessor :debug
3
- attr_accessor :title
4
- attr_accessor :tests_metric
5
- attr_accessor :description
6
- attr_accessor :applies_to_principle
7
- attr_accessor :organization
8
- attr_accessor :org_url
9
- attr_accessor :responsible_developer
10
- attr_accessor :email
11
- attr_accessor :developer_ORCiD
12
- attr_accessor :protocol
13
- attr_accessor :host
14
- attr_accessor :basePath
15
- attr_accessor :path
16
- attr_accessor :response_description
17
- attr_accessor :schemas
18
- attr_accessor :comments
19
- attr_accessor :fairsharing_key_location
20
- attr_accessor :score
21
- attr_accessor :testedGUID
22
-
23
- def initialize(params = {})
24
- @debug = params.fetch(:debug, false)
25
-
26
- @title = params.fetch(:title, 'unnamed')
27
- @tests_metric = params.fetch(:tests_metric)
28
- @description = params.fetch(:description, 'default_description')
29
- @applies_to_principle = params.fetch(:applies_to_principle, 'some principle')
30
- @version = params.fetch(:version, "0.1")
31
- @organization = params.fetch(:organization, 'Some Organization')
32
- @org_url = params.fetch(:org_url)
33
- @responsible_develper = params.fetch(:responsible_developer, 'Some Person')
34
- @email = params.fetch(:email)
35
- @developer_ORCiD = params.fetch(:developer_ORCiD)
36
- @host = params.fetch(:host)
37
- @protocol = params.fetch(:protocol, "https")
38
- @basePath = params.fetch(:basePath)
39
- @path = params.fetch(:path)
40
- @response_description = params.fetch(:response_description)
41
- @schemas = params.fetch(:schemas, [])
42
- @comments = params.fetch(:comments, [])
43
- @fairsharing_key_location = params.fetch(:fairsharing_key_location)
44
- @score = params.fetch(:score, 0)
45
- @testedGUID = params.fetch(:testedGUID, "")
46
-
47
-
48
-
49
- end
50
-
51
-
52
-
53
- def fairsharing_key
54
- return @fairsharing_key_location
55
- end
56
-
57
-
58
-
59
- def getSwagger
60
-
61
- message = <<"EOF_EOF"
1
+ class Swagger
2
+ attr_accessor :debug, :title, :tests_metric, :description, :applies_to_principle, :organization, :org_url,
3
+ :responsible_developer, :email, :developer_ORCiD, :protocol, :host, :basePath, :path,
4
+ :response_description, :schemas, :comments, :fairsharing_key_location, :score, :testedGUID
5
+
6
+ def initialize(params = {})
7
+ @debug = params.fetch(:debug, false)
8
+
9
+ @title = params.fetch(:title, 'unnamed')
10
+ @tests_metric = params.fetch(:tests_metric)
11
+ @description = params.fetch(:description, 'default_description')
12
+ @applies_to_principle = params.fetch(:applies_to_principle, 'some principle')
13
+ @version = params.fetch(:version, '0.1')
14
+ @organization = params.fetch(:organization, 'Some Organization')
15
+ @org_url = params.fetch(:org_url)
16
+ @responsible_develper = params.fetch(:responsible_developer, 'Some Person')
17
+ @email = params.fetch(:email)
18
+ @developer_ORCiD = params.fetch(:developer_ORCiD)
19
+ @host = params.fetch(:host)
20
+ @protocol = params.fetch(:protocol, 'https')
21
+ @basePath = params.fetch(:basePath)
22
+ @path = params.fetch(:path)
23
+ @response_description = params.fetch(:response_description)
24
+ @schemas = params.fetch(:schemas, [])
25
+ @comments = params.fetch(:comments, [])
26
+ @fairsharing_key_location = params.fetch(:fairsharing_key_location)
27
+ @score = params.fetch(:score, 0)
28
+ @testedGUID = params.fetch(:testedGUID, '')
29
+ end
30
+
31
+ def fairsharing_key
32
+ @fairsharing_key_location
33
+ end
34
+
35
+ def getSwagger
36
+ message = <<"EOF_EOF"
62
37
  swagger: '2.0'
63
38
  info:
64
39
  version: '#{@version}'
@@ -89,7 +64,7 @@ class Swagger
89
64
  $ref: '#/definitions/schemas'
90
65
  consumes:
91
66
  - application/json
92
- produces:
67
+ produces:#{' '}
93
68
  - application/json
94
69
  responses:
95
70
  "200":
@@ -98,127 +73,112 @@ class Swagger
98
73
  definitions:
99
74
  schemas:
100
75
  required:
101
- EOF_EOF
102
-
103
-
104
-
105
- self.schemas.keys.each do |key|
106
- message += " - #{key}\n"
76
+ EOF_EOF
77
+
78
+ schemas.keys.each do |key|
79
+ message += " - #{key}\n"
80
+ end
81
+ message += " properties:\n"
82
+ schemas.keys.each do |key|
83
+ message += " #{key}:\n"
84
+ message += " type: #{schemas[key][0]}\n"
85
+ message += " description: >-\n"
86
+ message += " #{schemas[key][1]}\n"
87
+ end
88
+
89
+ message
90
+ end
91
+
92
+ # A utility function that SHOULD NOT BE CALLED EXTERNALLY
93
+ #
94
+ # @param s - subject node
95
+ # @param p - predicate node
96
+ # @param o - object node
97
+ # @param repo - an RDF::Graph object
98
+ def triplify(s, p, o, repo)
99
+ s = s.strip if s.instance_of?(String)
100
+ p = p.strip if p.instance_of?(String)
101
+ o = o.strip if o.instance_of?(String)
102
+
103
+ unless s.respond_to?('uri')
104
+
105
+ if s.to_s =~ %r{^\w+:/?/?[^\s]+}
106
+ s = RDF::URI.new(s.to_s)
107
+ else
108
+ debug and warn "Subject #{s} must be a URI-compatible thingy"
109
+ abort "Subject #{s} must be a URI-compatible thingy"
107
110
  end
108
- message += " properties:\n"
109
- self.schemas.keys.each do |key|
110
- message += " #{key}:\n"
111
- message += " type: #{self.schemas[key][0]}\n"
112
- message += " description: >-\n"
113
- message += " #{self.schemas[key][1]}\n"
111
+ end
112
+
113
+ unless p.respond_to?('uri')
114
+
115
+ if p.to_s =~ %r{^\w+:/?/?[^\s]+}
116
+ p = RDF::URI.new(p.to_s)
117
+ else
118
+ debug and warn "Predicate #{p} must be a URI-compatible thingy"
119
+ abort "Predicate #{p} must be a URI-compatible thingy"
114
120
  end
115
-
116
- return message
117
121
  end
118
-
119
-
120
-
121
- # A utility function that SHOULD NOT BE CALLED EXTERNALLY
122
- #
123
- # @param s - subject node
124
- # @param p - predicate node
125
- # @param o - object node
126
- # @param repo - an RDF::Graph object
127
- def triplify(s, p, o, repo)
128
-
129
- if s.class == String
130
- s = s.strip
131
- end
132
- if p.class == String
133
- p = p.strip
134
- end
135
- if o.class == String
136
- o = o.strip
137
- end
138
-
139
- unless s.respond_to?('uri')
140
-
141
- if s.to_s =~ /^\w+:\/?\/?[^\s]+/
142
- s = RDF::URI.new(s.to_s)
143
- else
144
- self.debug and $stderr.puts "Subject #{s.to_s} must be a URI-compatible thingy"
145
- abort "Subject #{s.to_s} must be a URI-compatible thingy"
146
- end
147
- end
148
-
149
- unless p.respond_to?('uri')
150
-
151
- if p.to_s =~ /^\w+:\/?\/?[^\s]+/
152
- p = RDF::URI.new(p.to_s)
153
- else
154
- self.debug and $stderr.puts "Predicate #{p.to_s} must be a URI-compatible thingy"
155
- abort "Predicate #{p.to_s} must be a URI-compatible thingy"
156
- end
157
- end
158
-
159
- unless o.respond_to?('uri')
160
- if o.to_s =~ /\A\w+:\/?\/?\w[^\s]+/
161
- o = RDF::URI.new(o.to_s)
122
+
123
+ unless o.respond_to?('uri')
124
+ o = if o.to_s =~ %r{\A\w+:/?/?\w[^\s]+}
125
+ RDF::URI.new(o.to_s)
162
126
  elsif o.to_s =~ /^\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d/
163
- o = RDF::Literal.new(o.to_s, :datatype => RDF::XSD.date)
127
+ RDF::Literal.new(o.to_s, datatype: RDF::XSD.date)
164
128
  elsif o.to_s =~ /^[+-]?\d+\.\d+/
165
- o = RDF::Literal.new(o.to_s, :datatype => RDF::XSD.float)
129
+ RDF::Literal.new(o.to_s, datatype: RDF::XSD.float)
166
130
  elsif o.to_s =~ /^[+-]?[0-9]+$/
167
- o = RDF::Literal.new(o.to_s, :datatype => RDF::XSD.int)
131
+ RDF::Literal.new(o.to_s, datatype: RDF::XSD.int)
168
132
  else
169
- o = RDF::Literal.new(o.to_s, :language => :en)
133
+ RDF::Literal.new(o.to_s, language: :en)
170
134
  end
171
- end
172
-
173
- self.debug and $stderr.puts("\n\ninserting #{s.to_s} #{p.to_s} #{o.to_s}\n\n")
174
- triple = RDF::Statement(s, p, o)
175
- repo.insert(triple)
176
-
177
- return true
178
- end
179
-
180
-
181
- # A utility function that SHOULD NOT BE CALLED EXTERNALLY
182
- #
183
- # @param s - subject node
184
- # @param p - predicate node
185
- # @param o - object node
186
- # @param repo - an RDF::Graph object
187
- def Swagger.triplify(s, p, o, repo)
188
- return triplify(s,p,o,repo)
189
- end
190
-
191
- def addComment(newcomment)
192
- self.comments << newcomment.to_s
193
- #return self.comments
194
- end
195
-
196
- def createEvaluationResponse
197
-
198
- g = RDF::Graph.new
199
-
200
- dt = Time.now.iso8601
201
- uri = self.testedGUID
202
-
203
- me = self.protocol + "://" + self.host + "/" + self.basePath + self.path
204
-
205
- meURI ="#{me}##{uri}/result-#{dt}"
206
- meURI =Addressable::URI.escape(meURI)
207
-
208
- triplify(meURI, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://fairmetrics.org/resources/metric_evaluation_result", g );
209
- triplify(meURI, "http://semanticscience.org/resource/SIO_000300", self.score, g )
210
- triplify(meURI, "http://purl.obolibrary.org/obo/date", dt, g )
211
- triplify(meURI, "http://schema.org/softwareVersion", VERSION, g )
212
- triplify(meURI,"http://semanticscience.org/resource/SIO_000332", uri, g)
213
-
214
- comments = "no comments received. "
215
-
216
- comments = self.comments.join("\n") if self.comments.size > 0
217
- triplify(meURI, "http://schema.org/comment", comments, g)
218
-
219
- return g.dump(:jsonld)
220
- end
221
-
135
+ end
136
+
137
+ debug and warn("\n\ninserting #{s} #{p} #{o}\n\n")
138
+ triple = RDF::Statement(s, p, o)
139
+ repo.insert(triple)
140
+
141
+ true
142
+ end
143
+
144
+ # A utility function that SHOULD NOT BE CALLED EXTERNALLY
145
+ #
146
+ # @param s - subject node
147
+ # @param p - predicate node
148
+ # @param o - object node
149
+ # @param repo - an RDF::Graph object
150
+ def self.triplify(s, p, o, repo)
151
+ triplify(s, p, o, repo)
152
+ end
153
+
154
+ def addComment(newcomment)
155
+ comments << newcomment.to_s
156
+ # return self.comments
157
+ end
158
+
159
+ def createEvaluationResponse
160
+ g = RDF::Graph.new
161
+
162
+ dt = Time.now.iso8601
163
+ uri = testedGUID
164
+
165
+ me = protocol + '://' + host + '/' + basePath + path
166
+
167
+ meURI = "#{me}##{uri}/result-#{dt}"
168
+ meURI = Addressable::URI.escape(meURI)
169
+
170
+ triplify(meURI, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
171
+ 'http://fairmetrics.org/resources/metric_evaluation_result', g)
172
+ triplify(meURI, 'http://semanticscience.org/resource/SIO_000300', score, g)
173
+ triplify(meURI, 'http://purl.obolibrary.org/obo/date', dt, g)
174
+ triplify(meURI, 'http://schema.org/softwareVersion', VERSION, g)
175
+ triplify(meURI, 'http://semanticscience.org/resource/SIO_000332', uri, g)
176
+
177
+ comments = 'no comments received. '
178
+
179
+ comments = self.comments.join("\n") if self.comments.size > 0
180
+ triplify(meURI, 'http://schema.org/comment', comments, g)
181
+
182
+ g.dump(:jsonld)
222
183
  end
223
-
224
-
184
+ end
data/lib/warnings.json CHANGED
@@ -103,8 +103,5 @@
103
103
  "message": "JSON parsing error - unable to process JSON document.",
104
104
  "linkout": "",
105
105
  "severity": "WARN"
106
- },
107
-
108
-
109
-
106
+ }
110
107
  }
data/lib/web_utils.rb CHANGED
@@ -1,7 +1,7 @@
1
- module FspHarvester
1
+ module HarvesterTools
2
2
 
3
3
  class WebUtils
4
- def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: FspHarvester::MetadataObject.new)
4
+ def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
5
5
  warn 'In fetch routine now. '
6
6
 
7
7
  begin
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-08-05 00:00:00.000000000 Z
11
+ date: 2022-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -182,15 +182,18 @@ files:
182
182
  - bin/setup
183
183
  - example_test.rb
184
184
  - launch.json
185
+ - lib/config.conf
185
186
  - lib/config.conf_docker
186
187
  - lib/config.conf_local
187
188
  - lib/constants.rb
189
+ - lib/external_tools.rb
188
190
  - lib/fsp_harvester.rb
189
191
  - lib/fsp_harvester/version.rb
190
- - lib/fsp_metadata_external_tools.rb
191
- - lib/fsp_metadata_harvester.rb
192
- - lib/fsp_metadata_parser.rb
192
+ - lib/harvester.rb
193
+ - lib/harvester_utils.rb
194
+ - lib/metadata_harvester.rb
193
195
  - lib/metadata_object.rb
196
+ - lib/metadata_parser.rb
194
197
  - lib/signposting_tests.rb
195
198
  - lib/swagger.rb
196
199
  - lib/warnings.json