fsp_harvester 0.1.12 → 0.1.15

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b38eea15fa26a3fe07290024342f8b6121dbd78c3cd2dd3496ca118fca22f6d4
4
- data.tar.gz: a25ea37ecd78b2ef8dc41dca391c161ba4b262d910dbadcf34f07f0cd8e54af5
3
+ metadata.gz: 9e0ffd5048e360ce8e8cced890a586664af797065d2c8d6312927d694835e84b
4
+ data.tar.gz: 840269a8b28da70bed8c5e46674ff3730cbee66f624064cab84f98d5b3a2ff00
5
5
  SHA512:
6
- metadata.gz: 83b766f2896a0776ed75ab3fd1e235d2c80173d3ffc0c22aea80b234856392497daf312db36461f03d0ca168228e6e385edf6d789b42dc4b43fcd6a073cda234
7
- data.tar.gz: 16d74c199a138db0225e88c0e092a5272a518785c397d27da74c511a69260444e6c1c440021e08413c131d672cd2d528e8d19dbd8a20ca1bff97dad38600c995
6
+ metadata.gz: 4c01cc88a8f57e024c7aeed89a8251d97b130bca987dc14d914e87fa87ea744d3de7ab11ca340b0456f295edafdd872d4f63d0f0ef23dbe9c3cc8ebc97a64ae5
7
+ data.tar.gz: 2c274758ec874bb1c25ebd5286ecbc2b7e91205430a94cf3ada9c7350511fe362532f2c6d213a6fd6657ccdf7184df9c0eaf9c2461c0d25dc87da00b2aded390
data/.rspec_status CHANGED
@@ -1,55 +1,56 @@
1
1
  example_id | status | run_time |
2
2
  ---------------------------------- | ------ | --------------- |
3
- ./spec/cite-as_spec.rb[1:1:1] | passed | 1.3 seconds |
4
- ./spec/cite-as_spec.rb[1:1:2] | passed | 1.21 seconds |
5
- ./spec/cite-as_spec.rb[1:1:3] | passed | 1.09 seconds |
6
- ./spec/cite-as_spec.rb[1:1:4] | passed | 1.69 seconds |
7
- ./spec/cite-as_spec.rb[1:1:5] | passed | 2.72 seconds |
8
- ./spec/cite-as_spec.rb[1:1:6] | passed | 2.3 seconds |
9
- ./spec/cite-as_spec.rb[1:1:7] | passed | 3.36 seconds |
10
- ./spec/cite-as_spec.rb[1:1:8] | passed | 2.26 seconds |
11
- ./spec/cite-as_spec.rb[1:1:9] | passed | 2.82 seconds |
12
- ./spec/cite-as_spec.rb[1:1:10] | passed | 2.3 seconds |
13
- ./spec/cite-as_spec.rb[1:1:11] | passed | 3.37 seconds |
14
- ./spec/cite-as_spec.rb[1:1:12] | passed | 2.2 seconds |
15
- ./spec/cite-as_spec.rb[1:1:13] | passed | 2.94 seconds |
16
- ./spec/cite-as_spec.rb[1:1:14] | passed | 2.44 seconds |
17
- ./spec/cite-as_spec.rb[1:1:15] | passed | 1.54 seconds |
18
- ./spec/cite-as_spec.rb[1:1:16] | passed | 1.29 seconds |
19
- ./spec/cite-as_spec.rb[1:1:17] | passed | 1.25 seconds |
20
- ./spec/cite-as_spec.rb[1:1:18] | passed | 1.15 seconds |
21
- ./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.66 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 1.13 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 1.08 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 1.68 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 2.86 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 2.11 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 3.07 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 2.13 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 2.73 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 2.64 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 3.36 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.26 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 2.9 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 2.31 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.47 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.22 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 1.23 seconds |
20
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.2 seconds |
21
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.71 seconds |
22
22
  ./spec/cite-as_spec.rb[1:1:20] | passed | 1.66 seconds |
23
- ./spec/cite-as_spec.rb[1:1:21] | passed | 2.41 seconds |
24
- ./spec/cite-as_spec.rb[1:1:22] | passed | 1.64 seconds |
25
- ./spec/cite-as_spec.rb[1:1:23] | passed | 1.35 seconds |
26
- ./spec/cite-as_spec.rb[1:1:24] | failed | 1.25 seconds |
27
- ./spec/cite-as_spec.rb[1:1:25] | passed | 0.51152 seconds |
28
- ./spec/describedby_spec.rb[1:1:1] | passed | 2.71 seconds |
29
- ./spec/describedby_spec.rb[1:1:2] | passed | 1.25 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 2.5 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.54 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.25 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.35 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.50811 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 3.45 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.3 seconds |
30
30
  ./spec/describedby_spec.rb[1:1:3] | passed | 1.22 seconds |
31
31
  ./spec/describedby_spec.rb[1:1:4] | passed | 1.22 seconds |
32
- ./spec/describedby_spec.rb[1:1:5] | passed | 1.21 seconds |
33
- ./spec/describedby_spec.rb[1:1:6] | passed | 1.02 seconds |
34
- ./spec/describedby_spec.rb[1:1:7] | passed | 0.99175 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.15 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 1.04 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 1.12 seconds |
35
35
  ./spec/describedby_spec.rb[1:1:8] | passed | 2.44 seconds |
36
- ./spec/describedby_spec.rb[1:1:9] | passed | 1.72 seconds |
37
- ./spec/describedby_spec.rb[1:1:10] | passed | 2.15 seconds |
38
- ./spec/describedby_spec.rb[1:1:11] | passed | 3.17 seconds |
39
- ./spec/describedby_spec.rb[1:1:12] | passed | 3.1 seconds |
40
- ./spec/describedby_spec.rb[1:1:13] | passed | 1.7 seconds |
41
- ./spec/describedby_spec.rb[1:1:14] | passed | 2.21 seconds |
42
- ./spec/describedby_spec.rb[1:1:15] | passed | 2.18 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 2.15 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 2.19 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.98 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.87 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.74 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
43
43
  ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
44
44
  ./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
45
- ./spec/item_spec.rb[1:1:1] | passed | 3.09 seconds |
46
- ./spec/item_spec.rb[1:1:2] | passed | 2.92 seconds |
47
- ./spec/item_spec.rb[1:1:3] | passed | 1.12 seconds |
48
- ./spec/item_spec.rb[1:1:4] | passed | 1.7 seconds |
49
- ./spec/item_spec.rb[1:1:5] | passed | 2.24 seconds |
50
- ./spec/item_spec.rb[1:1:6] | passed | 2.87 seconds |
51
- ./spec/item_spec.rb[1:1:7] | passed | 3.03 seconds |
52
- ./spec/item_spec.rb[1:1:8] | passed | 0.52338 seconds |
53
- ./spec/type_spec.rb[1:1:1] | passed | 1.42 seconds |
54
- ./spec/type_spec.rb[1:1:2] | passed | 1.28 seconds |
55
- ./spec/type_spec.rb[1:1:3] | passed | 1.52 seconds |
45
+ ./spec/fsp_harvester_spec.rb[1:3] | passed | 6.87 seconds |
46
+ ./spec/item_spec.rb[1:1:1] | passed | 3.19 seconds |
47
+ ./spec/item_spec.rb[1:1:2] | passed | 2.81 seconds |
48
+ ./spec/item_spec.rb[1:1:3] | passed | 1.27 seconds |
49
+ ./spec/item_spec.rb[1:1:4] | passed | 1.76 seconds |
50
+ ./spec/item_spec.rb[1:1:5] | passed | 2.43 seconds |
51
+ ./spec/item_spec.rb[1:1:6] | passed | 2.23 seconds |
52
+ ./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
53
+ ./spec/item_spec.rb[1:1:8] | passed | 0.52517 seconds |
54
+ ./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
55
+ ./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
56
+ ./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.12)
4
+ fsp_harvester (0.1.15)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.16)
7
+ linkheaders-processor (~> 0.1.17)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -56,7 +56,7 @@ GEM
56
56
  faraday-encoding (0.0.5)
57
57
  faraday
58
58
  faraday-excon (1.1.0)
59
- faraday-http-cache (2.4.0)
59
+ faraday-http-cache (2.4.1)
60
60
  faraday (>= 0.8)
61
61
  faraday-httpclient (1.0.1)
62
62
  faraday-multipart (1.0.4)
@@ -99,34 +99,35 @@ GEM
99
99
  sparql (~> 3.2)
100
100
  sxp (~> 1.2)
101
101
  link_header (0.0.8)
102
- linkeddata (3.2.0)
103
- json-ld (~> 3.2)
102
+ linkeddata (3.2.1)
103
+ json-ld (~> 3.2, >= 3.2.3)
104
104
  json-ld-preloaded (~> 3.2)
105
105
  ld-patch (~> 3.2)
106
- nokogiri (~> 1.12, >= 1.12.5)
107
- rdf (~> 3.2)
108
- rdf-aggregate-repo (~> 3.2)
106
+ nokogiri (~> 1.13, >= 1.13.8)
107
+ rdf (~> 3.2, >= 3.2.9)
108
+ rdf-aggregate-repo (~> 3.2, >= 3.2.1)
109
109
  rdf-hamster-repo (~> 3.2)
110
- rdf-isomorphic (~> 3.2)
110
+ rdf-isomorphic (~> 3.2, >= 3.2.1)
111
111
  rdf-json (~> 3.2)
112
- rdf-microdata (~> 3.2)
113
- rdf-n3 (~> 3.2)
112
+ rdf-microdata (~> 3.2, >= 3.2.1)
113
+ rdf-n3 (~> 3.2, >= 3.2.1)
114
114
  rdf-normalize (~> 0.5)
115
- rdf-ordered-repo (~> 3.2)
115
+ rdf-ordered-repo (~> 3.2, >= 3.2.1)
116
116
  rdf-rdfa (~> 3.2)
117
117
  rdf-rdfxml (~> 3.2)
118
118
  rdf-reasoner (~> 0.8)
119
- rdf-tabular (~> 3.2)
119
+ rdf-tabular (~> 3.2, >= 3.2.1)
120
120
  rdf-trig (~> 3.2)
121
121
  rdf-trix (~> 3.2)
122
- rdf-turtle (~> 3.2)
123
- rdf-vocab (~> 3.2)
124
- rdf-xsd (~> 3.2)
125
- shacl (~> 0.2)
126
- shex (~> 0.7)
127
- sparql (~> 3.2)
128
- sparql-client (~> 3.2)
129
- linkheaders-processor (0.1.16)
122
+ rdf-turtle (~> 3.2, >= 3.2.1)
123
+ rdf-vocab (~> 3.2, >= 3.2.1)
124
+ rdf-xsd (~> 3.2, >= 3.2.1)
125
+ shacl (~> 0.2, >= 0.2.1)
126
+ shex (~> 0.7, >= 0.7.1)
127
+ sparql (~> 3.2, >= 3.2.4)
128
+ sparql-client (~> 3.2, >= 3.2.1)
129
+ yaml-ld (~> 0.0)
130
+ linkheaders-processor (0.1.17)
130
131
  json (~> 2.0)
131
132
  json-ld (~> 3.2)
132
133
  json-ld-preloaded (~> 3.2)
@@ -159,8 +160,10 @@ GEM
159
160
  racc (~> 1.4)
160
161
  parallel (1.22.1)
161
162
  parseconfig (1.1.2)
162
- parser (3.1.2.0)
163
+ parser (3.1.2.1)
163
164
  ast (~> 2.4.1)
165
+ psych (4.0.4)
166
+ stringio
164
167
  public_suffix (4.0.7)
165
168
  racc (1.6.0)
166
169
  rack (2.2.4)
@@ -249,17 +252,17 @@ GEM
249
252
  diff-lcs (>= 1.2.0, < 2.0)
250
253
  rspec-support (~> 3.11.0)
251
254
  rspec-support (3.11.0)
252
- rubocop (1.33.0)
255
+ rubocop (1.34.1)
253
256
  json (~> 2.3)
254
257
  parallel (~> 1.10)
255
- parser (>= 3.1.0.0)
258
+ parser (>= 3.1.2.1)
256
259
  rainbow (>= 2.2.2, < 4.0)
257
260
  regexp_parser (>= 1.8, < 3.0)
258
261
  rexml (>= 3.2.5, < 4.0)
259
- rubocop-ast (>= 1.19.1, < 2.0)
262
+ rubocop-ast (>= 1.20.0, < 2.0)
260
263
  ruby-progressbar (~> 1.7)
261
264
  unicode-display_width (>= 1.4.0, < 3.0)
262
- rubocop-ast (1.19.1)
265
+ rubocop-ast (1.21.0)
263
266
  parser (>= 3.1.1.0)
264
267
  ruby-progressbar (1.11.0)
265
268
  ruby2_keywords (0.0.5)
@@ -291,6 +294,7 @@ GEM
291
294
  sparql-client (3.2.1)
292
295
  net-http-persistent (~> 4.0, >= 4.0.1)
293
296
  rdf (~> 3.2, >= 3.2.6)
297
+ stringio (3.0.2)
294
298
  sxp (1.2.2)
295
299
  matrix
296
300
  rdf (~> 3.2)
@@ -303,6 +307,10 @@ GEM
303
307
  unicode-types (1.7.0)
304
308
  xml-simple (1.1.9)
305
309
  rexml
310
+ yaml-ld (0.0.1)
311
+ json-ld (~> 3.2, >= 3.2.2)
312
+ psych (~> 4.0)
313
+ rdf (~> 3.2)
306
314
 
307
315
  PLATFORMS
308
316
  x86_64-linux
data/lib/constants.rb CHANGED
@@ -69,11 +69,14 @@ SELF_IDENTIFIER_PREDICATES = [
69
69
  'https://schema.org/identifier'
70
70
  ]
71
71
 
72
- GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
73
- 'doi' => Regexp.new(%r{^10.\d{4,9}/[-._;()/:A-Z0-9]+$}i),
74
- 'handle1' => Regexp.new(%r{^[^/]+/[^/]+$}i),
75
- 'handle2' => Regexp.new(%r{^\d{4,5}/[-._;()/:A-Z0-9]+$}i), # legacy style 12345/AGB47A
76
- 'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
72
+ GUID_TYPES = {
73
+ 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
74
+ 'doi' => Regexp.new(%r{^10.\d{4,9}/[-._;()/:A-Z0-9]+$}i),
75
+ 'handle1' => Regexp.new(%r{^[^/]+/[^/]+$}i),
76
+ 'handle2' => Regexp.new(%r{^\d{4,5}/[-._;()/:A-Z0-9]+$}i), # legacy style 12345/AGB47A
77
+ 'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
78
+ 'ark' => Regexp.new(%r{^ark:/[^\s]+$})
79
+ }
77
80
 
78
81
  CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
79
82
  extruct = CONFIG.dig(:extruct, :command)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.12"
4
+ VERSION = "0.1.15"
5
5
  end
data/lib/fsp_harvester.rb CHANGED
@@ -12,7 +12,7 @@ module FspHarvester
12
12
  links.each do |l|
13
13
  db << l if l.relation == 'describedby'
14
14
  end
15
- HarvesterTools::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
15
+ HarvesterTools::MetadataHarvester.extract_metadata_from_links(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
16
16
  @meta
17
17
  end
18
18
 
data/lib/harvester.rb CHANGED
@@ -23,5 +23,6 @@ require_relative './signposting_tests'
23
23
  require_relative './metadata_harvester'
24
24
  require_relative './fsp_harvester'
25
25
  require_relative './harvester_utils'
26
+ require_relative './harvester_brute'
26
27
  require_relative './external_tools'
27
28
  require_relative './metadata_parser'
@@ -0,0 +1,48 @@
1
+ module HarvesterTools
2
+ class Error < StandardError
3
+ end
4
+
5
+ class BruteForce
6
+ def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
7
+ type, url = HarvesterTools::Utils.convertToURL(guid: guid)
8
+ return false unless type
9
+
10
+ do_content_negotiation(url: url, metadata: metadata)
11
+ metadata
12
+ end
13
+
14
+ def self.do_content_negotiation(url:, metadata:)
15
+ response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
16
+ if response
17
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
18
+ end
19
+ response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
20
+ if response
21
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
22
+ response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER) # now do content negotiation on the landing page
23
+ if response
24
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
25
+ end
26
+ end
27
+ end
28
+
29
+ def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
30
+ @meta = metadata
31
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
32
+ warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
33
+ response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: @meta)
34
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
35
+
36
+ unless response
37
+ @meta.add_warning(['001', url, headers])
38
+ @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
39
+ @meta.full_response << [url, "No response"]
40
+ false
41
+ end
42
+
43
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
44
+ @meta.full_response << [url, response.body]
45
+ response
46
+ end
47
+ end
48
+ end
@@ -10,7 +10,7 @@ module HarvesterTools
10
10
  type, url = convertToURL(guid: guid)
11
11
  links = Array.new
12
12
  if type
13
- links = resolve_url(url: url)
13
+ links = resolve_url(url: url, metadata: @meta)
14
14
  @meta.links = @meta.links | links
15
15
  else
16
16
  @meta.add_warning(['006', guid, ''])
@@ -31,6 +31,8 @@ module HarvesterTools
31
31
  return 'uri', guid
32
32
  elsif k == 'doi' and regex.match(guid)
33
33
  return 'doi', "https://doi.org/#{guid}"
34
+ elsif k == 'ark' and regex.match(guid)
35
+ return 'ark', "https://n2t.net/#{guid}"
34
36
  end
35
37
  end
36
38
  [nil, nil]
@@ -43,7 +45,8 @@ module HarvesterTools
43
45
  false
44
46
  end
45
47
 
46
- def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
48
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
49
+ @meta = metadata
47
50
  @meta.guidtype = 'uri' if @meta.guidtype.nil?
48
51
  warn "\n\n FETCHING #{url} #{header}\n\n"
49
52
  response = HarvesterTools::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
@@ -58,17 +61,17 @@ module HarvesterTools
58
61
  @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
59
62
  @meta.full_response << response.body
60
63
 
61
- links = process_link_headers(response: response) unless nolinkheaders
64
+ links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
62
65
  links
63
66
  end
64
67
 
65
- def self.process_link_headers(response:)
68
+ def self.process_link_headers(response:, metadata:)
66
69
  warn "\n\n parsing #{response.headers}\n\n"
67
-
68
- parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
70
+
71
+ parser = LinkHeaders::Processor.new(default_anchor: metadata.all_uris.last)
69
72
  parser.extract_and_parse(response: response)
70
73
  factory = parser.factory # LinkHeaders::LinkFactory
71
- FspHarvester::Utils.signpostingcheck(factory: factory, metadata: @meta)
74
+ FspHarvester::Utils.signpostingcheck(factory: factory, metadata: metadata)
72
75
  factory.all_links
73
76
  end
74
77
  end
@@ -5,7 +5,7 @@ module HarvesterTools
5
5
  end
6
6
 
7
7
  class MetadataHarvester
8
- def self.extract_metadata(links: [], metadata: HarvesterTools::MetadataObject.new)
8
+ def self.extract_metadata_from_links(links: [], metadata: HarvesterTools::MetadataObject.new)
9
9
  @meta = metadata
10
10
  @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'
11
11
 
@@ -26,23 +26,42 @@ module HarvesterTools
26
26
  next
27
27
  end
28
28
 
29
- # process according to detected type
30
- case abbreviation
31
- when 'html'
32
- @meta.comments << 'INFO: Processing html'
33
- hvst.process_html(body: response.body, uri: link, metadata: @meta)
34
- when 'xml'
35
- @meta.comments << 'INFO: Processing xml'
36
- hvst.process_xml(body: response.body, metadata: @meta)
37
- when 'json'
38
- @meta.comments << 'INFO: Processing json'
39
- hvst.process_json(body: response.body, metadata: @meta)
40
- when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
41
- @meta.comments << 'INFO: Processing linked data'
42
- hvst.process_ld(body: response.body, content_type: content_type, metadata: @meta)
43
- when 'specialist'
44
- warn 'no specialized parsers so far'
45
- end
29
+ process_according_to_type(body: response.body, uri: link, metadata: @meta, abbreviation: abbreviation,
30
+ content_type: content_type, harvester: hvst)
31
+ end
32
+ end
33
+
34
+ def self.extract_metadata_from_body(response:, metadata: HarvesterTools::MetadataObject.new)
35
+ @meta = metadata
36
+ @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'
37
+
38
+ abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
39
+ unless abbreviation
40
+ @meta.add_warning(['017', response.request.url, ''])
41
+ @meta.comments << "WARN: metadata format returned from #{response.request.url} is not recognized. Moving on.\n"
42
+ return
43
+ end
44
+ process_according_to_type(body: response.body, uri: response.request.url, metadata: @meta,
45
+ abbreviation: abbreviation, content_type: content_type)
46
+ end
47
+
48
+ def self.process_according_to_type(body:, uri:, abbreviation:, content_type:, metadata:,
49
+ harvester: HarvesterTools::MetadataParser.new(metadata_object: @meta))
50
+ case abbreviation
51
+ when 'html'
52
+ @meta.comments << 'INFO: Processing html'
53
+ harvester.process_html(body: body, uri: uri, metadata: @meta)
54
+ when 'xml'
55
+ @meta.comments << 'INFO: Processing xml'
56
+ harvester.process_xml(body: body, metadata: @meta)
57
+ when 'json'
58
+ @meta.comments << 'INFO: Processing json'
59
+ harvester.process_json(body: body, metadata: @meta)
60
+ when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
61
+ @meta.comments << 'INFO: Processing linked data'
62
+ harvester.process_ld(body: body, content_type: content_type, metadata: @meta)
63
+ when 'specialist'
64
+ warn 'no specialized parsers so far'
46
65
  end
47
66
  end
48
67
 
@@ -111,24 +130,23 @@ module HarvesterTools
111
130
  [abbreviation, contenttype]
112
131
  end
113
132
 
114
- def self.ntriples_hack(body:) # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
133
+ def self.ntriples_hack(body:) # distriller cannot recognize single-line ntriples unless they end with a period, which is not required by the spec... so hack it!
115
134
  detected_type = nil
116
135
  body.split.each do |line|
117
136
  line.strip!
118
137
  next if line.empty?
119
- if line =~ %r{\s*<[^>]+>\s*<[^>]+>\s\S+}
120
- @meta.comments << "INFO: running ntriples hack on #{line + " ."}\n"
121
- detected_type = RDF::Format.for({ sample: "#{line} ." }) # adding a period allows detection of ntriples by distiller
122
- break
123
- end
124
- end
125
- @meta.comments << "INFO: ntriples hack found: #{detected_type.to_s}\n"
126
- if detected_type != RDF::NTriples::Format # only return the hacky case
127
- return nil
138
+
139
+ next unless line =~ /\s*<[^>]+>\s*<[^>]+>\s\S+/
140
+
141
+ @meta.comments << "INFO: running ntriples hack on #{line + ' .'}\n"
142
+ detected_type = RDF::Format.for({ sample: "#{line} ." }) # adding a period allows detection of ntriples by distiller
143
+ break
128
144
  end
129
- return detected_type
130
- end
145
+ @meta.comments << "INFO: ntriples hack found: #{detected_type}\n"
146
+ return nil if detected_type != RDF::NTriples::Format # only return the hacky case
131
147
 
148
+ detected_type
149
+ end
132
150
 
133
151
  def self.check_json(body:)
134
152
  abbreviation = nil
data/lib/warnings.json CHANGED
@@ -1,107 +1,119 @@
1
1
  {
2
2
  "001": {
3
3
  "message": "Unable to resolve guid using default (*/*) Accept headers",
4
- "linkout": "",
4
+ "linkout": [{"FAIR Principle": "https://www.go-fair.org/fair-principles/metadata-retrievable-identifier-standardised-communication-protocol/"},
5
+ {"FAIRsharing": "https://doi.org/10.25504/FAIRsharing.cd2f9e"}
6
+ ],
5
7
  "severity": "WARN"
6
8
  },
7
9
  "002": {
8
10
  "message": "HTTP Response (203) is non-authoritative",
9
- "linkout": "",
11
+ "linkout": [{"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/203"}],
10
12
  "severity": "WARN"
11
13
  },
12
14
  "003": {
13
15
  "message": "HTTP Response indicates failure (500-range)",
14
- "linkout": "",
16
+ "linkout": [{"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500"}],
15
17
  "severity": "WARN"
16
18
  },
17
19
  "004": {
18
20
  "message": "The resource does not follow the FAIR Signposting standard, which requires exactly one cite-as header, and at least one describedby header",
19
- "linkout": "",
21
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
20
22
  "severity": "WARN"
21
23
  },
22
24
  "005": {
23
25
  "message": "The resource does not follow the FAIR Signposting standard, which requires any describedby links to also have a 'type' attribute indicating the Accept headers that should be sent with the request",
24
- "linkout": "",
26
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
25
27
  "severity": "WARN"
26
28
  },
27
29
  "006": {
28
30
  "message": "GUID type not recognized",
29
- "linkout": "",
31
+ "linkout": [{"FAIRsharing": "https://fairsharing.org/search?fairsharingRegistry=Standard&recordType=identifier_schema&page=1"}],
30
32
  "severity": "WARN"
31
33
  },
32
34
  "007": {
33
35
  "message": "Conflicting cite-as links",
34
- "linkout": "",
36
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
35
37
  "severity": "WARN"
36
38
  },
37
39
  "008": {
38
40
  "message": "describedby link does not resolve",
39
- "linkout": "",
41
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
40
42
  "severity": "WARN"
41
43
  },
42
44
  "009": {
43
45
  "message": "Content-type of described-by link does not match the type attribute in the link header itself",
44
- "linkout": "",
46
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"},
47
+ {"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type"}],
45
48
  "severity": "WARN"
46
49
  },
47
50
  "010": {
48
51
  "message": "Content-type of response from described-by link is undefined or cannot be compared to the link type",
49
- "linkout": "",
52
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
50
53
  "severity": "WARN"
51
54
  },
52
55
  "011": {
53
56
  "message": "The resource does not follow the FAIR Signposting standard, which encourages any item links to have a type attribute",
54
- "linkout": "",
57
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
55
58
  "severity": "WARN"
56
59
  },
57
60
  "012": {
58
61
  "message": "Content-type of response from resolving an item doesn't match the item type attribute in the link header",
59
- "linkout": "",
62
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"},
63
+ {"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type"}],
60
64
  "severity": "WARN"
61
65
  },
62
66
  "013": {
63
67
  "message": "Content-type of response from resolving an item is undefined or cannot be compared to the link type",
64
- "linkout": "",
68
+ "linkout": [{"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type"}],
65
69
  "severity": "WARN"
66
70
  },
67
71
  "014": {
68
72
  "message": "Item link does not resolve",
69
- "linkout": "",
73
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
70
74
  "severity": "WARN"
71
75
  },
72
76
  "015": {
73
77
  "message": "Link headers do not include a link of type 'type', as required by the FAIR Signposting specification",
74
- "linkout": "",
78
+ "linkout": [{"Documentation": "http://www.signposting.org/FAIR/#level1"}],
75
79
  "severity": "WARN"
76
80
  },
77
81
  "016": {
78
82
  "message": "Unable to resolve describedby link using Accept headers with the MIME type indicated in the link",
79
- "linkout": "",
83
+ "linkout": [{"Documentation": "https://developer.mozilla.org/en-US/docs/Web/HTTP/Content_negotiation"}],
80
84
  "severity": "WARN"
81
85
  },
82
86
  "017": {
83
87
  "message": "Metadata format not recognized.",
84
- "linkout": "",
88
+ "linkout": [{"FAIRsharing": "https://fairsharing.org/search?subjects=Computer%2520Science,subject%2520agnostic&page=1&recordType=model_and_format"}],
85
89
  "severity": "WARN"
86
90
  },
87
91
  "018": {
88
92
  "message": "RDF parsing error - likely malformed RDF document.",
89
- "linkout": "",
93
+ "linkout": [{"FAIRsharing": "https://fairsharing.org/FAIRsharing.p77ph9"},
94
+ {"Documentation": "http://www.w3.org/TR/2014/REC-rdf11-concepts-20140225/"},
95
+ {"Validator": "http://rdf.greggkellogg.net/distiller"}],
90
96
  "severity": "WARN"
91
97
  },
92
98
  "019": {
93
99
  "message": "HTML parsing error - unable to extract linked data from HTML.",
94
- "linkout": "",
100
+ "linkout": [{"FAIRsharing": "https://fairsharing.org/FAIRsharing.YugnuL"},
101
+ {"Documentation": "https://www.w3.org/TR/html53/"},
102
+ {"validator": "https://validator.w3.org/"}],
95
103
  "severity": "WARN"
96
104
  },
97
105
  "020": {
98
106
  "message": "XML parsing error - unable to process XML document.",
99
- "linkout": "",
107
+ "linkout": [{"FAIRsharing": "https://fairsharing.org/FAIRsharing.b5cc91"},
108
+ {"Documentation": "https://www.w3.org/TR/xml/"},
109
+ {"Validator": "https://www.xmlvalidation.com/"}],
100
110
  "severity": "WARN"
101
111
  },
102
112
  "021": {
103
113
  "message": "JSON parsing error - unable to process JSON document.",
104
- "linkout": "",
114
+ "linkout": [{"FAIRsharing": "https://fairsharing.org/FAIRsharing.5bbab9"},
115
+ {"Documentation": "http://dx.doi.org/10.17487/RFC8259"},
116
+ {"Validator": "https://jsononline.net/json-validator"}],
105
117
  "severity": "WARN"
106
118
  }
107
119
  }
data/lib/web_utils.rb CHANGED
@@ -18,13 +18,13 @@ module HarvesterTools
18
18
  warn "final URL #{response.request.url}"
19
19
  warn "Response code #{response.code}"
20
20
  if response.code == 203
21
- meta.warnings << ["002", url, headers]
21
+ meta.add_warning(["002", url, headers])
22
22
  meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
23
23
  end
24
24
  response
25
25
  rescue RestClient::ExceptionWithResponse => e
26
- warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response.headers}"
27
- meta.warnings << ["003", url, headers]
26
+ warn "EXCEPTION WITH RESPONSE! #{e.response.code} with response #{e.response}\nfailed response headers: #{e.response.headers}"
27
+ meta.add_warning(["003", url, headers])
28
28
  meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
29
29
  if (e.response.code == 500 or e.response.code == 404)
30
30
  return false
@@ -34,13 +34,13 @@ module HarvesterTools
34
34
  # now we are returning the headers and body that were returned
35
35
  rescue RestClient::Exception => e
36
36
  warn "EXCEPTION WITH NO RESPONSE! #{e}"
37
- meta.warnings << ["003", url, headers]
37
+ meta.add_warning(["003", url, headers])
38
38
  meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
39
39
  false
40
40
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
41
41
  rescue Exception => e
42
42
  warn "EXCEPTION UNKNOWN! #{e}"
43
- meta.warnings << ["003", url, headers]
43
+ meta.add_warning(["003", url, headers])
44
44
  meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
45
45
  false
46
46
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.12
4
+ version: 0.1.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-08-11 00:00:00.000000000 Z
11
+ date: 2022-08-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.1.16
47
+ version: 0.1.17
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.1.16
54
+ version: 0.1.17
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: metainspector
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -190,12 +190,12 @@ files:
190
190
  - lib/fsp_harvester.rb
191
191
  - lib/fsp_harvester/version.rb
192
192
  - lib/harvester.rb
193
+ - lib/harvester_brute.rb
193
194
  - lib/harvester_utils.rb
194
195
  - lib/metadata_harvester.rb
195
196
  - lib/metadata_object.rb
196
197
  - lib/metadata_parser.rb
197
198
  - lib/signposting_tests.rb
198
- - lib/swagger.rb
199
199
  - lib/warnings.json
200
200
  - lib/web_utils.rb
201
201
  homepage: https://github.com/markwilkinson/FAIR-Signposting-Harvester
data/lib/swagger.rb DELETED
@@ -1,184 +0,0 @@
1
- class Swagger
2
- attr_accessor :debug, :title, :tests_metric, :description, :applies_to_principle, :organization, :org_url,
3
- :responsible_developer, :email, :developer_ORCiD, :protocol, :host, :basePath, :path,
4
- :response_description, :schemas, :comments, :fairsharing_key_location, :score, :testedGUID
5
-
6
- def initialize(params = {})
7
- @debug = params.fetch(:debug, false)
8
-
9
- @title = params.fetch(:title, 'unnamed')
10
- @tests_metric = params.fetch(:tests_metric)
11
- @description = params.fetch(:description, 'default_description')
12
- @applies_to_principle = params.fetch(:applies_to_principle, 'some principle')
13
- @version = params.fetch(:version, '0.1')
14
- @organization = params.fetch(:organization, 'Some Organization')
15
- @org_url = params.fetch(:org_url)
16
- @responsible_develper = params.fetch(:responsible_developer, 'Some Person')
17
- @email = params.fetch(:email)
18
- @developer_ORCiD = params.fetch(:developer_ORCiD)
19
- @host = params.fetch(:host)
20
- @protocol = params.fetch(:protocol, 'https')
21
- @basePath = params.fetch(:basePath)
22
- @path = params.fetch(:path)
23
- @response_description = params.fetch(:response_description)
24
- @schemas = params.fetch(:schemas, [])
25
- @comments = params.fetch(:comments, [])
26
- @fairsharing_key_location = params.fetch(:fairsharing_key_location)
27
- @score = params.fetch(:score, 0)
28
- @testedGUID = params.fetch(:testedGUID, '')
29
- end
30
-
31
- def fairsharing_key
32
- @fairsharing_key_location
33
- end
34
-
35
- def getSwagger
36
- message = <<"EOF_EOF"
37
- swagger: '2.0'
38
- info:
39
- version: '#{@version}'
40
- title: "#{@title}"
41
- x-tests_metric: '#{@tests_metric}'
42
- description: >-
43
- #{@description}
44
- x-applies_to_principle: "#{@applies_to_principle}"
45
- contact:
46
- x-organization: "#{@organization}"
47
- url: "#{@org_url}"
48
- name: '#{@responsible_develper}'
49
- x-role: "responsible developer"
50
- email: #{@email}
51
- x-id: '#{developer_ORCiD}'
52
- host: #{@host}
53
- basePath: #{@basePath}
54
- schemes:
55
- - #{@protocol}
56
- paths:
57
- #{@path}:
58
- post:
59
- parameters:
60
- - name: content
61
- in: body
62
- required: true
63
- schema:
64
- $ref: '#/definitions/schemas'
65
- consumes:
66
- - application/json
67
- produces:#{' '}
68
- - application/json
69
- responses:
70
- "200":
71
- description: >-
72
- #{@response_description}
73
- definitions:
74
- schemas:
75
- required:
76
- EOF_EOF
77
-
78
- schemas.keys.each do |key|
79
- message += " - #{key}\n"
80
- end
81
- message += " properties:\n"
82
- schemas.keys.each do |key|
83
- message += " #{key}:\n"
84
- message += " type: #{schemas[key][0]}\n"
85
- message += " description: >-\n"
86
- message += " #{schemas[key][1]}\n"
87
- end
88
-
89
- message
90
- end
91
-
92
- # A utility function that SHOULD NOT BE CALLED EXTERNALLY
93
- #
94
- # @param s - subject node
95
- # @param p - predicate node
96
- # @param o - object node
97
- # @param repo - an RDF::Graph object
98
- def triplify(s, p, o, repo)
99
- s = s.strip if s.instance_of?(String)
100
- p = p.strip if p.instance_of?(String)
101
- o = o.strip if o.instance_of?(String)
102
-
103
- unless s.respond_to?('uri')
104
-
105
- if s.to_s =~ %r{^\w+:/?/?[^\s]+}
106
- s = RDF::URI.new(s.to_s)
107
- else
108
- debug and warn "Subject #{s} must be a URI-compatible thingy"
109
- abort "Subject #{s} must be a URI-compatible thingy"
110
- end
111
- end
112
-
113
- unless p.respond_to?('uri')
114
-
115
- if p.to_s =~ %r{^\w+:/?/?[^\s]+}
116
- p = RDF::URI.new(p.to_s)
117
- else
118
- debug and warn "Predicate #{p} must be a URI-compatible thingy"
119
- abort "Predicate #{p} must be a URI-compatible thingy"
120
- end
121
- end
122
-
123
- unless o.respond_to?('uri')
124
- o = if o.to_s =~ %r{\A\w+:/?/?\w[^\s]+}
125
- RDF::URI.new(o.to_s)
126
- elsif o.to_s =~ /^\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d/
127
- RDF::Literal.new(o.to_s, datatype: RDF::XSD.date)
128
- elsif o.to_s =~ /^[+-]?\d+\.\d+/
129
- RDF::Literal.new(o.to_s, datatype: RDF::XSD.float)
130
- elsif o.to_s =~ /^[+-]?[0-9]+$/
131
- RDF::Literal.new(o.to_s, datatype: RDF::XSD.int)
132
- else
133
- RDF::Literal.new(o.to_s, language: :en)
134
- end
135
- end
136
-
137
- debug and warn("\n\ninserting #{s} #{p} #{o}\n\n")
138
- triple = RDF::Statement(s, p, o)
139
- repo.insert(triple)
140
-
141
- true
142
- end
143
-
144
- # A utility function that SHOULD NOT BE CALLED EXTERNALLY
145
- #
146
- # @param s - subject node
147
- # @param p - predicate node
148
- # @param o - object node
149
- # @param repo - an RDF::Graph object
150
- def self.triplify(s, p, o, repo)
151
- triplify(s, p, o, repo)
152
- end
153
-
154
- def addComment(newcomment)
155
- comments << newcomment.to_s
156
- # return self.comments
157
- end
158
-
159
- def createEvaluationResponse
160
- g = RDF::Graph.new
161
-
162
- dt = Time.now.iso8601
163
- uri = testedGUID
164
-
165
- me = protocol + '://' + host + '/' + basePath + path
166
-
167
- meURI = "#{me}##{uri}/result-#{dt}"
168
- meURI = Addressable::URI.escape(meURI)
169
-
170
- triplify(meURI, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
171
- 'http://fairmetrics.org/resources/metric_evaluation_result', g)
172
- triplify(meURI, 'http://semanticscience.org/resource/SIO_000300', score, g)
173
- triplify(meURI, 'http://purl.obolibrary.org/obo/date', dt, g)
174
- triplify(meURI, 'http://schema.org/softwareVersion', VERSION, g)
175
- triplify(meURI, 'http://semanticscience.org/resource/SIO_000332', uri, g)
176
-
177
- comments = 'no comments received. '
178
-
179
- comments = self.comments.join("\n") if self.comments.size > 0
180
- triplify(meURI, 'http://schema.org/comment', comments, g)
181
-
182
- g.dump(:jsonld)
183
- end
184
- end