fsp_harvester 0.1.18 → 0.1.21

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3f6bed703ae1a03ff30a1abb88e54b033a6ffbd24df5cacc70b2e0662af7e1be
4
- data.tar.gz: 49370b82123eb0b7b6c92fd603996cec6909becf35fc789885de218aae0fb446
3
+ metadata.gz: 8498c33db9c350fec8ea4e734b31087f798a4f433f211115c69ded468dbcdb12
4
+ data.tar.gz: f3f408b24575f4f310c6f00ac0d42c3106f68fcd43199d3dbb73d8e4deb403fe
5
5
  SHA512:
6
- metadata.gz: 2abbbfba153e08b83e832640942c978f9612437d1ee7fc3891122be96bfd8da442c2460d8a7b6f303ecc469b6de568907ed3e592a359513d8ab23966fad39786
7
- data.tar.gz: 146004aae9a8495523b2c2578a84a8268b1398f005fdd98788b816f256c98128c34a537e68c078fa681e262bec3a2d0890e9226168b85735c95742f88848a0d0
6
+ metadata.gz: 263549dc8b8bf2fe8a4bc50289092ad2e55d9bbc05cabc509637786b4923948345220f0ee7a04fa5db497f670d9ab79d4e35e05648f09f189103ae869040baad
7
+ data.tar.gz: 6b35a320400ff37561ddf2cff506a3a4f385cd31933e4df8ceb4d436a4f97974e7122ab96962b8ce3cfc183144fcf41592fa417abcdb609fd44080061fc5e1a3
data/.rspec_status CHANGED
@@ -1,57 +1,60 @@
1
- example_id | status | run_time |
2
- ---------------------------------- | ------ | --------------- |
3
- ./spec/cite-as_spec.rb[1:1:1] | passed | 1.66 seconds |
4
- ./spec/cite-as_spec.rb[1:1:2] | passed | 1.13 seconds |
5
- ./spec/cite-as_spec.rb[1:1:3] | passed | 1.08 seconds |
6
- ./spec/cite-as_spec.rb[1:1:4] | passed | 1.68 seconds |
7
- ./spec/cite-as_spec.rb[1:1:5] | passed | 2.86 seconds |
8
- ./spec/cite-as_spec.rb[1:1:6] | passed | 2.11 seconds |
9
- ./spec/cite-as_spec.rb[1:1:7] | passed | 3.07 seconds |
10
- ./spec/cite-as_spec.rb[1:1:8] | passed | 2.13 seconds |
11
- ./spec/cite-as_spec.rb[1:1:9] | passed | 2.73 seconds |
12
- ./spec/cite-as_spec.rb[1:1:10] | passed | 2.64 seconds |
13
- ./spec/cite-as_spec.rb[1:1:11] | passed | 3.36 seconds |
14
- ./spec/cite-as_spec.rb[1:1:12] | passed | 2.26 seconds |
15
- ./spec/cite-as_spec.rb[1:1:13] | passed | 2.9 seconds |
16
- ./spec/cite-as_spec.rb[1:1:14] | passed | 2.31 seconds |
17
- ./spec/cite-as_spec.rb[1:1:15] | passed | 1.47 seconds |
18
- ./spec/cite-as_spec.rb[1:1:16] | passed | 1.22 seconds |
19
- ./spec/cite-as_spec.rb[1:1:17] | passed | 1.23 seconds |
20
- ./spec/cite-as_spec.rb[1:1:18] | passed | 1.2 seconds |
21
- ./spec/cite-as_spec.rb[1:1:19] | passed | 1.71 seconds |
22
- ./spec/cite-as_spec.rb[1:1:20] | passed | 1.66 seconds |
23
- ./spec/cite-as_spec.rb[1:1:21] | passed | 2.5 seconds |
24
- ./spec/cite-as_spec.rb[1:1:22] | passed | 1.54 seconds |
25
- ./spec/cite-as_spec.rb[1:1:23] | passed | 1.25 seconds |
26
- ./spec/cite-as_spec.rb[1:1:24] | failed | 1.35 seconds |
27
- ./spec/cite-as_spec.rb[1:1:25] | passed | 0.50811 seconds |
28
- ./spec/describedby_spec.rb[1:1:1] | passed | 4.39 seconds |
29
- ./spec/describedby_spec.rb[1:1:2] | passed | 1.63 seconds |
30
- ./spec/describedby_spec.rb[1:1:3] | passed | 1.49 seconds |
31
- ./spec/describedby_spec.rb[1:1:4] | passed | 1.21 seconds |
32
- ./spec/describedby_spec.rb[1:1:5] | passed | 1.17 seconds |
33
- ./spec/describedby_spec.rb[1:1:6] | passed | 1.11 seconds |
34
- ./spec/describedby_spec.rb[1:1:7] | passed | 1.11 seconds |
35
- ./spec/describedby_spec.rb[1:1:8] | passed | 2.34 seconds |
36
- ./spec/describedby_spec.rb[1:1:9] | passed | 2.09 seconds |
37
- ./spec/describedby_spec.rb[1:1:10] | passed | 2.69 seconds |
38
- ./spec/describedby_spec.rb[1:1:11] | passed | 3.02 seconds |
39
- ./spec/describedby_spec.rb[1:1:12] | passed | 2.9 seconds |
40
- ./spec/describedby_spec.rb[1:1:13] | passed | 2.27 seconds |
41
- ./spec/describedby_spec.rb[1:1:14] | passed | 2.53 seconds |
42
- ./spec/describedby_spec.rb[1:1:15] | passed | 2.33 seconds |
43
- ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00101 seconds |
44
- ./spec/fsp_harvester_spec.rb[1:2] | passed | 3.15 seconds |
45
- ./spec/fsp_harvester_spec.rb[1:3] | passed | 7.1 seconds |
46
- ./spec/fsp_harvester_spec.rb[1:4] | passed | 2.47 seconds |
47
- ./spec/item_spec.rb[1:1:1] | passed | 3.19 seconds |
48
- ./spec/item_spec.rb[1:1:2] | passed | 2.81 seconds |
49
- ./spec/item_spec.rb[1:1:3] | passed | 1.27 seconds |
50
- ./spec/item_spec.rb[1:1:4] | passed | 1.76 seconds |
51
- ./spec/item_spec.rb[1:1:5] | passed | 2.43 seconds |
52
- ./spec/item_spec.rb[1:1:6] | passed | 2.23 seconds |
53
- ./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
54
- ./spec/item_spec.rb[1:1:8] | passed | 0.52517 seconds |
55
- ./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
56
- ./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
57
- ./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
1
+ example_id | status | run_time |
2
+ ---------------------------------- | ------ | ---------------------- |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.77 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 1.22 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 1.09 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 1.89 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 2.95 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 2.14 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 2.96 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 2.28 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 3.4 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 2.21 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 2.82 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 3.36 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 2.19 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.19 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.23 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
20
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.28 seconds |
21
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.94 seconds |
22
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 2.1 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 2.23 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.17 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.13 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.24 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.49678 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 3.18 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.34 seconds |
30
+ ./spec/describedby_spec.rb[1:1:3] | passed | 1.2 seconds |
31
+ ./spec/describedby_spec.rb[1:1:4] | passed | 1.14 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 1.04 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 0.9844 seconds |
35
+ ./spec/describedby_spec.rb[1:1:8] | passed | 2.07 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 2.16 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 2.36 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.91 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.93 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.79 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 2.5 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 2.24 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00102 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | passed | 2.5 seconds |
45
+ ./spec/fsp_harvester_spec.rb[1:3] | passed | 29.49 seconds |
46
+ ./spec/fsp_harvester_spec.rb[1:4] | passed | 2.53 seconds |
47
+ ./spec/fsp_harvester_spec.rb[1:5] | passed | 2.65 seconds |
48
+ ./spec/fsp_harvester_spec.rb[1:6] | failed | 1 minute 24.1 seconds |
49
+ ./spec/fsp_harvester_spec.rb[1:7] | passed | 2 minutes 24.3 seconds |
50
+ ./spec/item_spec.rb[1:1:1] | passed | 2.71 seconds |
51
+ ./spec/item_spec.rb[1:1:2] | passed | 2.98 seconds |
52
+ ./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
53
+ ./spec/item_spec.rb[1:1:4] | passed | 1.81 seconds |
54
+ ./spec/item_spec.rb[1:1:5] | passed | 2.2 seconds |
55
+ ./spec/item_spec.rb[1:1:6] | passed | 2.25 seconds |
56
+ ./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
57
+ ./spec/item_spec.rb[1:1:8] | passed | 0.62818 seconds |
58
+ ./spec/type_spec.rb[1:1:1] | passed | 1.33 seconds |
59
+ ./spec/type_spec.rb[1:1:2] | passed | 1.22 seconds |
60
+ ./spec/type_spec.rb[1:1:3] | passed | 1.61 seconds |
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.18)
4
+ fsp_harvester (0.1.21)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.17)
7
+ linkheaders-processor (~> 0.1.18)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -127,7 +127,7 @@ GEM
127
127
  sparql (~> 3.2, >= 3.2.4)
128
128
  sparql-client (~> 3.2, >= 3.2.1)
129
129
  yaml-ld (~> 0.0)
130
- linkheaders-processor (0.1.17)
130
+ linkheaders-processor (0.1.18)
131
131
  json (~> 2.0)
132
132
  json-ld (~> 3.2)
133
133
  json-ld-preloaded (~> 3.2)
@@ -252,14 +252,14 @@ GEM
252
252
  diff-lcs (>= 1.2.0, < 2.0)
253
253
  rspec-support (~> 3.11.0)
254
254
  rspec-support (3.11.0)
255
- rubocop (1.34.1)
255
+ rubocop (1.35.0)
256
256
  json (~> 2.3)
257
257
  parallel (~> 1.10)
258
258
  parser (>= 3.1.2.1)
259
259
  rainbow (>= 2.2.2, < 4.0)
260
260
  regexp_parser (>= 1.8, < 3.0)
261
261
  rexml (>= 3.2.5, < 4.0)
262
- rubocop-ast (>= 1.20.0, < 2.0)
262
+ rubocop-ast (>= 1.20.1, < 2.0)
263
263
  ruby-progressbar (~> 1.7)
264
264
  unicode-display_width (>= 1.4.0, < 3.0)
265
265
  rubocop-ast (1.21.0)
data/lib/constants.rb CHANGED
@@ -1,4 +1,6 @@
1
- ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
1
+ module FspHarvester
2
+
3
+ ACCEPT_LD_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
2
4
 
3
5
  ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
4
6
 
@@ -77,6 +79,7 @@ GUID_TYPES = {
77
79
  'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
78
80
  'ark' => Regexp.new(%r{^ark:/[^\s]+$})
79
81
  }
82
+ end
80
83
 
81
84
  # CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
82
85
  # extruct = CONFIG.dig(:extruct, :command)
@@ -88,7 +91,7 @@ when /[&|;`$\s]/
88
91
  when /echo/i
89
92
  abort 'The Extruct command appears to be subject to command injection. I will not continue'
90
93
  end
91
- EXTRUCT_COMMAND = extruct
94
+ FspHarvester::EXTRUCT_COMMAND = extruct
92
95
 
93
96
  # rdf_command = CONFIG.dig(:rdf, :command)
94
97
  rdf_command = ENV['RDF_COMMAND'] || 'rdf'
@@ -101,8 +104,8 @@ when /echo/i
101
104
  when !(/rdf$/ =~ $_)
102
105
  abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
103
106
  end
104
- RDF_COMMAND = rdf_command
107
+ FspHarvester::RDF_COMMAND = rdf_command
105
108
 
106
109
  # tika_command = CONFIG.dig(:tika, :command)
107
110
  tika_command = ENV['TIKA_COMMAND'] || 'http://localhost:9998/meta'
108
- TIKA_COMMAND = tika_command
111
+ FspHarvester::TIKA_COMMAND = tika_command
@@ -5,18 +5,21 @@ module HarvesterTools
5
5
  end
6
6
 
7
7
  class ExternalTools
8
+ attr_accessor :distillerknown, :extructknown
8
9
 
9
10
  def initialize(metadata: HarvesterTools::MetadataObject.new)
11
+ @distillerknown = {}
12
+ @extructknown = {}
10
13
  @meta = metadata
11
14
  end
12
15
 
13
- def process_with_distiller(body:)
16
+ def process_with_distiller(body:, metadata:)
17
+ meta = metadata
14
18
  bhash = Digest::SHA256.hexdigest(body)
15
- if @@distillerknown[bhash]
16
- @meta.comments << "INFO: data is already parsed by distiller.\n"
17
- #parse_rdf(body: body)
19
+ if distillerknown[bhash]
20
+ meta.comments << "INFO: data is already parsed by distiller.\n"
18
21
  else
19
- @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
22
+ meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
20
23
  file = Tempfile.new('foo', encoding: 'UTF-8')
21
24
  body = body.force_encoding('UTF-8')
22
25
  body.scrub!
@@ -24,60 +27,73 @@ module HarvesterTools
24
27
  file.write(body)
25
28
  file.rewind
26
29
 
27
- @meta.comments << "INFO: The message body is being examined by Distiller\n"
28
- command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
30
+ meta.comments << "INFO: The message body is being examined by Distiller\n"
31
+ command = "LANG=en_US.UTF-8 #{FspHarvester::RDF_COMMAND} serialize --input-format rdfa --output-format jsonld #{file.path}"
29
32
  warn "distiller command: #{command}"
30
33
  result, _stderr, _status = Open3.capture3(command)
31
34
  warn ''
32
- warn "distiller errors: #{stderr}"
35
+ warn "distiller errors: #{_stderr}" if _stderr
33
36
  file.close
34
37
  file.unlink
35
38
 
36
39
  result = result.force_encoding('UTF-8')
37
- warn "DIST RESULT: #{result}"
40
+ # warn "DIST RESULT: #{result}"
38
41
  if result !~ /@context/i # failure returns nil
39
- @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
40
- @meta.add_warning(['018', '', ''])
41
- result = "{}"
42
+ meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
43
+ meta.add_warning(['018', '', ''])
44
+ result = '{}'
42
45
  else
43
- @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
46
+ meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
44
47
  end
45
- @@distillerknown[bhash] = true
48
+ distillerknown[bhash] = true
46
49
  end
47
50
  result
48
51
  end
49
52
 
50
- def processs_with_extruct(uri:)
51
- @meta.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
52
- warn 'begin open3'
53
- stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
54
- warn "open3 status: #{status} #{stdout}"
55
- result = stderr # absurd that the output comes over stderr! LOL!
56
- jsonld = {}
57
- microdata = Hash.new
58
- microformat = Hash.new
59
- opengraph = Hash.new
60
- rdfa = Hash.new
53
+ def process_with_extruct(uri:, metadata:)
54
+ bhash = Digest::SHA256.hexdigest(uri)
55
+ jsonld = '{}'
56
+ microdata = {}
57
+ microformat = {}
58
+ opengraph = {}
59
+ rdfa = '{}'
61
60
 
62
- if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
63
- @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
64
- @meta.add_warning(['019', '', ''])
65
- if result.to_s.match(/(ValueError:.*?)\n/)
66
- @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
67
- @meta.add_warning(['019', '', ''])
68
- end
69
- elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
70
- json = JSON.parse result
71
- @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
72
- jsonld = json['json-ld'].to_json if json['json-ld'].any?
73
- microdata = json['microdata'].first if json['microdata'].any
74
- microformat = json['microformat'].first if json['microformat'].any?
75
- opengraph = json['opengraph'].first if json['opengraph'].any?
76
- rdfa = json['rdfa'].to_json if json['rdfa'].any?
77
- # @meta.merge_hash(json.first) if json.first.is_a? Hash
61
+ if extructknown[bhash]
62
+ metadata.comments << "INFO: data is already parsed by extruct.\n"
78
63
  else
79
- @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
64
+ metadata.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
65
+ warn 'begin open3'
66
+ stdout, stderr, status = Open3.capture3(FspHarvester::EXTRUCT_COMMAND + ' ' + uri)
67
+ warn "open3 status: #{status} #{stdout}"
68
+ result = stderr # absurd that the output comes over stderr! LOL!
69
+
70
+ if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
71
+ metadata.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
72
+ metadata.add_warning(['019', '', ''])
73
+ if result.to_s.match(/(ValueError:.*?)\n/)
74
+ metadata.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
75
+ metadata.add_warning(['019', '', ''])
76
+ end
77
+ elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
78
+ begin
79
+ json = JSON.parse result
80
+ rescue StandardError
81
+ metadata.comments << "WARN: extruct threw an error when attempting to parse the extruct command return value from processing #{uri}.\n"
82
+ metadata.add_warning(['019', '', ''])
83
+ return [jsonld, microdata, microformat, opengraph, rdfa]
84
+ end
85
+ metadata.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
86
+ jsonld = json['json-ld'].to_json if json['json-ld'].any?
87
+ microdata = json['microdata'].first if json['microdata'].any?
88
+ microformat = json['microformat'].first if json['microformat'].any?
89
+ opengraph = json['opengraph'].first if json['opengraph'].any?
90
+ rdfa = json['rdfa'].to_json if json['rdfa'].any?
91
+ # @meta.merge_hash(json.first) if json.first.is_a? Hash
92
+ else
93
+ @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
94
+ end
80
95
  end
96
+ extructknown[bhash] = true
81
97
  [jsonld, microdata, microformat, opengraph, rdfa]
82
98
  end
83
99
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.18"
4
+ VERSION = "0.1.21"
5
5
  end
data/lib/harvester.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- #require_relative 'fsp_harvester/version'
3
+ require_relative './fsp_harvester/version'
4
4
  require 'json/ld'
5
5
  require 'json/ld/preloaded'
6
6
  require 'json'
@@ -3,45 +3,81 @@ module HarvesterTools
3
3
  end
4
4
 
5
5
  class BruteForce
6
- def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
6
+ def self.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new)
7
7
  type, url = HarvesterTools::Utils.convertToURL(guid: guid)
8
8
  return false unless type
9
9
 
10
- do_content_negotiation(url: url, metadata: metadata)
10
+ # TODO: follow rel=alternate headers, if they are in LD or Hash format
11
+ do_content_negotiation(url: url, metadata: metadata, links: links)
11
12
  metadata
12
13
  end
13
14
 
14
- def self.do_content_negotiation(url:, metadata:)
15
- response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
15
+ def self.do_content_negotiation(url:, metadata:, links: [])
16
+ warn "\n\nINFO: entering content negotiation of #{url}\n\n"
17
+ metadata.comments << "INFO: entering content negotiation of #{url}.\n"
18
+
19
+
20
+ response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER)
16
21
  if response
17
22
  HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
18
23
  end
19
- response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
24
+ response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_STAR_HEADER)
20
25
  if response
21
26
  HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
22
- response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER) # now do content negotiation on the landing page
27
+ response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER) # now do content negotiation on the landing page
23
28
  if response
24
29
  HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
25
30
  end
26
31
  end
32
+
33
+ process_alternates(links: links, metadata: metadata)
34
+ end
35
+
36
+ def self.process_alternates(links: [], metadata:)
37
+ warn "\n\nINFO: entering content negotiation on link alternates\n\n"
38
+ metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
39
+ # process "alternate" links
40
+ links.each do |link|
41
+ next unless link.relation == "alternate"
42
+
43
+ url = link.href
44
+ headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
45
+ headers ||= FspHarvester::ACCEPT_STAR_HEADER
46
+ warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
47
+ metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
48
+ response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
49
+ if response
50
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
51
+ end
52
+ end
53
+
27
54
  end
28
55
 
56
+
29
57
  def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
30
- @meta = metadata
31
- @meta.guidtype = 'uri' if @meta.guidtype.nil?
32
- warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
33
- response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: @meta)
58
+
59
+ cache_key = Digest::MD5.hexdigest url + headers.to_s
60
+ if metadata.url_header_hash[cache_key]
61
+ warn "Already processed #{url} - moving on"
62
+ metadata.comments << "INFO: Already processed #{url} - moving on.\n"
63
+ return false
64
+ end
65
+
66
+ metadata.guidtype = 'uri' if metadata.guidtype.nil?
67
+ warn "\n\n BRUTE FETCHING #{url} \nwith headers\n #{headers}\n\n"
68
+ response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: metadata)
34
69
  warn "\n\n head #{response.headers.inspect}\n\n" if response
35
70
 
36
71
  unless response
37
- @meta.add_warning(['001', url, headers])
38
- @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
39
- @meta.full_response << [url, "No response"]
72
+ metadata.add_warning(['001', url, headers])
73
+ metadata.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
74
+ metadata.full_response << [url, "No response"]
40
75
  false
41
76
  end
42
77
 
43
- @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
44
- @meta.full_response << [url, response.body]
78
+ metadata.comments << "INFO: following redirection using this header led to the following URL: #{metadata.all_uris.last}. Using the output from this URL for the next few tests..."
79
+ metadata.full_response << [url, response.body]
80
+ metadata.url_header_hash[cache_key] = true
45
81
  response
46
82
  end
47
83
  end
@@ -11,7 +11,7 @@ module HarvesterTools
11
11
  links = Array.new
12
12
  if type
13
13
  links = resolve_url(url: url, metadata: @meta)
14
- @meta.links = @meta.links | links
14
+ @meta.links = @meta.links.append(*links)
15
15
  else
16
16
  @meta.add_warning(['006', guid, ''])
17
17
  @meta.comments << "FATAL: GUID type not recognized.\n"
@@ -20,7 +20,7 @@ module HarvesterTools
20
20
  end
21
21
 
22
22
  def self.convertToURL(guid:)
23
- GUID_TYPES.each do |k, regex|
23
+ FspHarvester::GUID_TYPES.each do |k, regex|
24
24
  if k == 'inchi' and regex.match(guid)
25
25
  return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
26
26
  elsif k == 'handle1' and regex.match(guid)
@@ -39,13 +39,13 @@ module HarvesterTools
39
39
  end
40
40
 
41
41
  def self.typeit(guid:)
42
- GUID_TYPES.each do |type, regex|
42
+ FspHarvester::GUID_TYPES.each do |type, regex|
43
43
  return type if regex.match(guid)
44
44
  end
45
45
  false
46
46
  end
47
47
 
48
- def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
48
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: FspHarvester::ACCEPT_STAR_HEADER)
49
49
  @meta = metadata
50
50
  @meta.guidtype = 'uri' if @meta.guidtype.nil?
51
51
  warn "\n\n FETCHING #{url} #{header}\n\n"
@@ -59,7 +59,7 @@ module HarvesterTools
59
59
  end
60
60
 
61
61
  @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
62
- @meta.full_response << response.body
62
+ @meta.full_response << [url, response.body]
63
63
 
64
64
  links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
65
65
  links
@@ -13,7 +13,7 @@ module HarvesterTools
13
13
 
14
14
  hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
15
15
  describedby.each do |link|
16
- accepttype = ACCEPT_STAR_HEADER
16
+ accepttype = FspHarvester::ACCEPT_STAR_HEADER
17
17
  accept = link.respond_to?('type') ? link.type : nil
18
18
  accepttype = { 'Accept' => accept } if accept
19
19
 
@@ -38,9 +38,14 @@ module HarvesterTools
38
38
  abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
39
39
  unless abbreviation
40
40
  @meta.add_warning(['017', response.request.url, ''])
41
- @meta.comments << "WARN: metadata format returned from #{response.request.url} is not recognized. Moving on.\n"
41
+ @meta.comments << "WARN: format returned from #{response.request.url} is not recognized. Moving on.\n"
42
42
  return
43
43
  end
44
+ request_content_types = response.request.headers["Accept"].split(/,\s*/)
45
+ unless (request_content_types.include? content_type) and !(request_content_types.include? "*/*") and (response.code != 406)
46
+ @meta.add_warning(['023', response.request.url, ''])
47
+ @meta.comments << "WARN: format returned from #{response.request.url} does not match request type. This should result in a 406 error, but instead was accepted as a 200.\n"
48
+ end
44
49
  process_according_to_type(body: response.body, uri: response.request.url, metadata: @meta,
45
50
  abbreviation: abbreviation, content_type: content_type)
46
51
  end
@@ -65,7 +70,7 @@ module HarvesterTools
65
70
  end
66
71
  end
67
72
 
68
- def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
73
+ def self.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER)
69
74
  @meta.comments << "INFO: link #{link.href} being processed"
70
75
  if link.respond_to? 'type'
71
76
  header = { 'Accept' => link.type }
@@ -86,23 +91,37 @@ module HarvesterTools
86
91
  abbreviation = nil
87
92
  content_type = nil
88
93
  @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
94
+ claimed_type = headers[:content_type]
95
+ claimed_type.gsub!(/\s*;.*/, '')
89
96
  if body =~ /^\s*<\?xml/
90
- if body =~ /<HTML/i
97
+ if body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
91
98
  abbreviation = 'html'
92
- content_type = 'text/html'
99
+ content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
100
+ @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
101
+ content_type |= 'text/html'
93
102
  @meta.comments << 'INFO: appears to be HTML\n'
94
103
  elsif body =~ /<rdf:RDF/i
95
104
  abbreviation = 'rdfxml'
96
- content_type = 'application/rdf+xml'
105
+ content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
106
+ @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
107
+ content_type |= 'application/rdf+xml'
97
108
  @meta.comments << 'INFO: appears to be RDF-XML\n'
98
109
  else
99
110
  abbreviation = 'xml'
100
- content_type = 'application/xml'
111
+ content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
112
+ @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
113
+ content_type |= 'application/xml'
101
114
  @meta.comments << 'INFO: appears to be XML\n'
102
115
  end
116
+ elsif body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
117
+ abbreviation = 'html'
118
+ content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
119
+ @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
120
+ content_type ||= 'text/html'
121
+ @meta.comments << 'INFO: appears to be HTML\n'
103
122
  else
104
- abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
105
- abbreviation, content_type = check_json(body: body) unless abbreviation
123
+ abbreviation, content_type = check_ld(body: body, claimed_type: claimed_type)
124
+ abbreviation, content_type = check_json(body: body) unless abbreviation # don't test if LD already found!
106
125
  end
107
126
 
108
127
  unless content_type
@@ -112,18 +131,46 @@ module HarvesterTools
112
131
  [abbreviation, content_type]
113
132
  end
114
133
 
134
+ def self.validate_claimed_type(abbreviation:, claimed_type:)
135
+ warn "\n\nclaimed type #{claimed_type}\nabbreviation #{abbreviation}\n\n"
136
+ claimed_type.gsub!(/\s*;.*/, '')
137
+
138
+ case abbreviation
139
+ when 'html'
140
+ return claimed_type if FspHarvester::HTML_FORMATS['html'].include? claimed_type
141
+ when 'xml'
142
+ return claimed_type if FspHarvester::XML_FORMATS['xml'].include? claimed_type
143
+ when 'json'
144
+ return claimed_type if FspHarvester::JSON_FORMATS['json'].include? claimed_type
145
+ when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
146
+ return claimed_type if FspHarvester::RDF_FORMATS.values.flatten.include? claimed_type
147
+ when 'specialist'
148
+ warn 'no specialized parsers so far'
149
+ end
150
+ return false
151
+ end
152
+
115
153
  def self.check_ld(body:, claimed_type:)
116
154
  detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
117
- unless detected_type
155
+ unless detected_type # see if distiller can detect a type
118
156
  detected_type = RDF::Format.for({ sample: body[0..5000] })
119
157
  @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
120
158
  end
159
+ # at this point, detected_type is something like RDF::Turtle::Format (or nil). This will return a content-type
121
160
  contenttype = ''
122
161
  abbreviation = ''
123
162
  if detected_type
124
- contenttype = detected_type.content_type.first # comes back as array
125
- abbreviation = abbreviate_type(contenttype: contenttype)
126
- @meta.comments << "INFO: using content-type #{contenttype}.\n"
163
+ detectedcontenttypes = detected_type.content_type # comes back as array of [application/x, application/y]
164
+ unless detectedcontenttypes.include? claimed_type
165
+ @meta.add_warning(['022', @meta.all_uris.last, "" ])
166
+ contenttype = detected_type.content_type.first # just pick one arbitrarily, since it doesn't match thedeclared type anyway
167
+ abbreviation = abbreviate_type(contenttype: contenttype)
168
+ @meta.comments << "INFO: using content-type #{contenttype} even though there was a mismatch.\n"
169
+ else
170
+ contenttype = claimed_type # just pick one arbitrarily, since it doesn't match thedeclared type anyway
171
+ abbreviation = abbreviate_type(contenttype: contenttype)
172
+ @meta.comments << "INFO: using content-type #{contenttype}.\n"
173
+ end
127
174
  else
128
175
  @meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
129
176
  end
@@ -161,13 +208,14 @@ module HarvesterTools
161
208
  abbreviation = 'json'
162
209
  else
163
210
  @meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
211
+ return [nil, nil]
164
212
  end
165
- [abbreviation, 'application/ld+json']
213
+ [abbreviation, 'application/json']
166
214
  end
167
215
 
168
216
  def self.abbreviate_type(contenttype:)
169
217
  foundtype = nil
170
- RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
218
+ FspHarvester::RDF_FORMATS.merge(FspHarvester::XML_FORMATS).merge(FspHarvester::HTML_FORMATS).merge(FspHarvester::JSON_FORMATS).each do |type, vals|
171
219
  warn "\n\ntype #{type}\nvals #{vals}\n\n"
172
220
  @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
173
221
  next unless vals.include? contenttype
@@ -1,19 +1,22 @@
1
1
  module HarvesterTools
2
2
  class MetadataObject
3
- attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :guid, :score, :version, :date # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
3
+ attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date, :url_header_hash # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
4
4
 
5
- def initialize() # get a name from the "new" call, or set a default
5
+ def initialize(id: "urn:local:unidentified_metadata") # get a name from the "new" call, or set a default
6
+ @id = id
6
7
  @hash = {}
7
8
  @graph = RDF::Graph.new
8
9
  @comments = []
9
10
  @warnings = []
10
11
  @full_response = []
11
12
  @links = []
13
+ @guidtype = ""
12
14
  @all_uris = []
13
- @guid = ""
15
+ @tested_guid = ""
14
16
  @score = 0
15
17
  @version = '0.0'
16
18
  @date = Time.now.strftime('%Y-%m-%dT%H:%M:%S.%L%z')
19
+ @url_header_hash = Hash.new(false) # the combination of URL and the accept headers, sha1 hashed, for quick lookup if it has already been processed
17
20
  w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
18
21
  #@warn = File.read("./lib/warnings.json")
19
22
  @warn = JSON.parse(w)
@@ -35,6 +38,7 @@ module HarvesterTools
35
38
 
36
39
  def add_warning(warning)
37
40
  id = warning[0]
41
+ return unless @warn[id] # if there's a mismatch between code and the warnings in github
38
42
  url = warning[1]
39
43
  headers = warning[2]
40
44
  message = @warn[id]['message']
@@ -13,17 +13,16 @@ module HarvesterTools
13
13
  @meta = metadata_object
14
14
  end
15
15
 
16
- def process_html(body:, uri:, metadata:)
17
- @meta = metadata
16
+ def process_html(body:, uri:, metadata: @meta)
18
17
  tools = HarvesterTools::ExternalTools.new(metadata: @meta)
19
- result = tools.process_with_distiller(body: body)
18
+ tools.process_with_distiller(body: body, metadata: @meta) # adds to @meta
20
19
 
21
- jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
22
- parse_rdf(body: jsonld, content_type: 'application/ld+json')
20
+ jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri, metadata: @meta)
21
+ parse_rdf(body: jsonld, content_type: 'application/ld+json', metadata: metadata)
23
22
  @meta.merge_hash(microdata)
24
23
  @meta.merge_hash(microformat)
25
24
  @meta.merge_hash(opengraph)
26
- parse_rdf(body: rdfa, content_type: 'application/ld+json')
25
+ parse_rdf(body: rdfa, content_type: 'application/ld+json', metadata: @meta)
27
26
  end
28
27
 
29
28
  def process_xml(body:, metadata:)
data/lib/warnings.json CHANGED
@@ -116,6 +116,18 @@
116
116
  {"Validator": "https://jsononline.net/json-validator"}],
117
117
  "severity": "WARN"
118
118
  },
119
+ "022": {
120
+ "message": "Mismatch between the Content-type header and the content of the returned document.",
121
+ "linkout": [],
122
+ "severity": "WARN"
123
+ },
124
+ "023": {
125
+ "message": "Returned content-type is not compatible with any requested content-type, yet a HTTP 406 error was not returned",
126
+ "linkout": [],
127
+ "severity": "WARN"
128
+ },
129
+
130
+
119
131
  "600": {
120
132
  "message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
121
133
  "linkout": [],
data/lib/web_utils.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module HarvesterTools
2
2
 
3
3
  class WebUtils
4
- def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
4
+ def self.fspfetch(url:, headers: ACCEPT_STAR_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
5
5
  warn 'In fetch routine now. '
6
6
 
7
7
  begin
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.18
4
+ version: 0.1.21
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-08-16 00:00:00.000000000 Z
11
+ date: 2022-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.1.17
47
+ version: 0.1.18
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.1.17
54
+ version: 0.1.18
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: metainspector
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -180,7 +180,6 @@ files:
180
180
  - Rakefile
181
181
  - bin/console
182
182
  - bin/setup
183
- - example_test.rb
184
183
  - launch.json
185
184
  - lib/constants.rb
186
185
  - lib/external_tools.rb
data/example_test.rb DELETED
@@ -1,24 +0,0 @@
1
- # frozen string literal = false
2
- require 'cgi'
3
- require 'json'
4
- require 'uri'
5
- require 'rdf'
6
- require 'rdf/turtle'
7
- require 'sparql'
8
- require 'fsp_harvester'
9
-
10
- def test_guid(guid:)
11
- _links, metadata = FspHarvester::Utils.resolve_guid(guid: guid) # [LinkHeader::Link], FspHarvester::MetadataObject
12
-
13
- metadata.comments << if metadata.guidtype == 'unknown'
14
- "FAILURE: The identifier #{guid} did not match any known identification system.\n"
15
- else
16
- "SUCCESS: The identifier #{guid} matched known GUID type system #{metadata.guidtype}.\n"
17
- end
18
- metadata.comments
19
- end
20
-
21
- guid = ARGV[0] || 'https://s11.no/2022/a2a-fair-metrics/07-http-describedby-citeas-linkset-json/'
22
- response = test_guid(guid: guid)
23
-
24
- puts response