fsp_harvester 0.1.20 → 0.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 72df31c63580f2b47676bb719c860cd26cab4290346f20bd481f67d18b29f765
4
- data.tar.gz: 477bfe524de0a1822790eac1caefb642a5e881734e8f1bc3c8f46c1a91b3e1e0
3
+ metadata.gz: 8498c33db9c350fec8ea4e734b31087f798a4f433f211115c69ded468dbcdb12
4
+ data.tar.gz: f3f408b24575f4f310c6f00ac0d42c3106f68fcd43199d3dbb73d8e4deb403fe
5
5
  SHA512:
6
- metadata.gz: 328b1bf4531034b38f325ec7c2dfb682007ed8ef5fb4f9ea72a4776ffdb49bbdde280bd959f9adddcfb93f6a065b77af68ab6d5222942bc16b5d50901f771770
7
- data.tar.gz: d5bc1e2e88be865c17aa12bca0a4308f3bc8e476bc0f49f40ca7a48a4e92142d8613c8e12823d9bb5e2735ea4d4cc492d6740481fdbebd513cb4f0be0c8114c8
6
+ metadata.gz: 263549dc8b8bf2fe8a4bc50289092ad2e55d9bbc05cabc509637786b4923948345220f0ee7a04fa5db497f670d9ab79d4e35e05648f09f189103ae869040baad
7
+ data.tar.gz: 6b35a320400ff37561ddf2cff506a3a4f385cd31933e4df8ceb4d436a4f97974e7122ab96962b8ce3cfc183144fcf41592fa417abcdb609fd44080061fc5e1a3
data/.rspec_status CHANGED
@@ -1,57 +1,60 @@
1
- example_id | status | run_time |
2
- ---------------------------------- | ------ | --------------- |
3
- ./spec/cite-as_spec.rb[1:1:1] | passed | 1.36 seconds |
4
- ./spec/cite-as_spec.rb[1:1:2] | passed | 1.31 seconds |
5
- ./spec/cite-as_spec.rb[1:1:3] | passed | 1.53 seconds |
6
- ./spec/cite-as_spec.rb[1:1:4] | passed | 1.84 seconds |
7
- ./spec/cite-as_spec.rb[1:1:5] | passed | 2.77 seconds |
8
- ./spec/cite-as_spec.rb[1:1:6] | passed | 2.06 seconds |
9
- ./spec/cite-as_spec.rb[1:1:7] | passed | 2.96 seconds |
10
- ./spec/cite-as_spec.rb[1:1:8] | passed | 2.28 seconds |
11
- ./spec/cite-as_spec.rb[1:1:9] | passed | 2.83 seconds |
12
- ./spec/cite-as_spec.rb[1:1:10] | passed | 2.14 seconds |
13
- ./spec/cite-as_spec.rb[1:1:11] | passed | 3.19 seconds |
14
- ./spec/cite-as_spec.rb[1:1:12] | passed | 3.06 seconds |
15
- ./spec/cite-as_spec.rb[1:1:13] | passed | 2.77 seconds |
16
- ./spec/cite-as_spec.rb[1:1:14] | passed | 2.2 seconds |
17
- ./spec/cite-as_spec.rb[1:1:15] | passed | 1.19 seconds |
18
- ./spec/cite-as_spec.rb[1:1:16] | passed | 1.1 seconds |
19
- ./spec/cite-as_spec.rb[1:1:17] | passed | 1.31 seconds |
20
- ./spec/cite-as_spec.rb[1:1:18] | passed | 1.14 seconds |
21
- ./spec/cite-as_spec.rb[1:1:19] | passed | 1.68 seconds |
22
- ./spec/cite-as_spec.rb[1:1:20] | passed | 1.69 seconds |
23
- ./spec/cite-as_spec.rb[1:1:21] | passed | 2.35 seconds |
24
- ./spec/cite-as_spec.rb[1:1:22] | passed | 1.12 seconds |
25
- ./spec/cite-as_spec.rb[1:1:23] | passed | 1.16 seconds |
26
- ./spec/cite-as_spec.rb[1:1:24] | failed | 1.45 seconds |
27
- ./spec/cite-as_spec.rb[1:1:25] | passed | 0.72571 seconds |
28
- ./spec/describedby_spec.rb[1:1:1] | passed | 3.09 seconds |
29
- ./spec/describedby_spec.rb[1:1:2] | passed | 1.13 seconds |
30
- ./spec/describedby_spec.rb[1:1:3] | passed | 1.22 seconds |
31
- ./spec/describedby_spec.rb[1:1:4] | passed | 1.11 seconds |
32
- ./spec/describedby_spec.rb[1:1:5] | passed | 1.21 seconds |
33
- ./spec/describedby_spec.rb[1:1:6] | passed | 1.24 seconds |
34
- ./spec/describedby_spec.rb[1:1:7] | passed | 1.53 seconds |
35
- ./spec/describedby_spec.rb[1:1:8] | passed | 2.53 seconds |
36
- ./spec/describedby_spec.rb[1:1:9] | passed | 1.74 seconds |
37
- ./spec/describedby_spec.rb[1:1:10] | passed | 2.59 seconds |
38
- ./spec/describedby_spec.rb[1:1:11] | passed | 3.49 seconds |
39
- ./spec/describedby_spec.rb[1:1:12] | passed | 3.82 seconds |
40
- ./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds |
41
- ./spec/describedby_spec.rb[1:1:14] | passed | 2.19 seconds |
42
- ./spec/describedby_spec.rb[1:1:15] | passed | 2.16 seconds |
43
- ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00015 seconds |
44
- ./spec/fsp_harvester_spec.rb[1:2] | passed | 2.49 seconds |
45
- ./spec/fsp_harvester_spec.rb[1:3] | passed | 7.06 seconds |
46
- ./spec/fsp_harvester_spec.rb[1:4] | passed | 2.74 seconds |
47
- ./spec/item_spec.rb[1:1:1] | passed | 3.41 seconds |
48
- ./spec/item_spec.rb[1:1:2] | passed | 2.84 seconds |
49
- ./spec/item_spec.rb[1:1:3] | passed | 1.15 seconds |
50
- ./spec/item_spec.rb[1:1:4] | passed | 1.74 seconds |
51
- ./spec/item_spec.rb[1:1:5] | passed | 2.6 seconds |
52
- ./spec/item_spec.rb[1:1:6] | passed | 2.32 seconds |
53
- ./spec/item_spec.rb[1:1:7] | passed | 2.81 seconds |
54
- ./spec/item_spec.rb[1:1:8] | passed | 0.49717 seconds |
55
- ./spec/type_spec.rb[1:1:1] | passed | 1.25 seconds |
56
- ./spec/type_spec.rb[1:1:2] | passed | 1.18 seconds |
57
- ./spec/type_spec.rb[1:1:3] | passed | 1.58 seconds |
1
+ example_id | status | run_time |
2
+ ---------------------------------- | ------ | ---------------------- |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.77 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 1.22 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 1.09 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 1.89 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 2.95 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 2.14 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 2.96 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 2.28 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 3.4 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 2.21 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 2.82 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 3.36 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 2.19 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.19 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.23 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
20
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.28 seconds |
21
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.94 seconds |
22
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 2.1 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 2.23 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.17 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.13 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.24 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.49678 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 3.18 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.34 seconds |
30
+ ./spec/describedby_spec.rb[1:1:3] | passed | 1.2 seconds |
31
+ ./spec/describedby_spec.rb[1:1:4] | passed | 1.14 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 1.04 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 0.9844 seconds |
35
+ ./spec/describedby_spec.rb[1:1:8] | passed | 2.07 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 2.16 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 2.36 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.91 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.93 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.79 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 2.5 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 2.24 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00102 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | passed | 2.5 seconds |
45
+ ./spec/fsp_harvester_spec.rb[1:3] | passed | 29.49 seconds |
46
+ ./spec/fsp_harvester_spec.rb[1:4] | passed | 2.53 seconds |
47
+ ./spec/fsp_harvester_spec.rb[1:5] | passed | 2.65 seconds |
48
+ ./spec/fsp_harvester_spec.rb[1:6] | failed | 1 minute 24.1 seconds |
49
+ ./spec/fsp_harvester_spec.rb[1:7] | passed | 2 minutes 24.3 seconds |
50
+ ./spec/item_spec.rb[1:1:1] | passed | 2.71 seconds |
51
+ ./spec/item_spec.rb[1:1:2] | passed | 2.98 seconds |
52
+ ./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
53
+ ./spec/item_spec.rb[1:1:4] | passed | 1.81 seconds |
54
+ ./spec/item_spec.rb[1:1:5] | passed | 2.2 seconds |
55
+ ./spec/item_spec.rb[1:1:6] | passed | 2.25 seconds |
56
+ ./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
57
+ ./spec/item_spec.rb[1:1:8] | passed | 0.62818 seconds |
58
+ ./spec/type_spec.rb[1:1:1] | passed | 1.33 seconds |
59
+ ./spec/type_spec.rb[1:1:2] | passed | 1.22 seconds |
60
+ ./spec/type_spec.rb[1:1:3] | passed | 1.61 seconds |
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.20)
4
+ fsp_harvester (0.1.21)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
7
  linkheaders-processor (~> 0.1.18)
data/lib/constants.rb CHANGED
@@ -1,4 +1,6 @@
1
- ACCEPT_ALL_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
1
+ module FspHarvester
2
+
3
+ ACCEPT_LD_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
2
4
 
3
5
  ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
4
6
 
@@ -77,6 +79,7 @@ GUID_TYPES = {
77
79
  'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
78
80
  'ark' => Regexp.new(%r{^ark:/[^\s]+$})
79
81
  }
82
+ end
80
83
 
81
84
  # CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
82
85
  # extruct = CONFIG.dig(:extruct, :command)
@@ -88,7 +91,7 @@ when /[&|;`$\s]/
88
91
  when /echo/i
89
92
  abort 'The Extruct command appears to be subject to command injection. I will not continue'
90
93
  end
91
- EXTRUCT_COMMAND = extruct
94
+ FspHarvester::EXTRUCT_COMMAND = extruct
92
95
 
93
96
  # rdf_command = CONFIG.dig(:rdf, :command)
94
97
  rdf_command = ENV['RDF_COMMAND'] || 'rdf'
@@ -101,8 +104,8 @@ when /echo/i
101
104
  when !(/rdf$/ =~ $_)
102
105
  abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
103
106
  end
104
- RDF_COMMAND = rdf_command
107
+ FspHarvester::RDF_COMMAND = rdf_command
105
108
 
106
109
  # tika_command = CONFIG.dig(:tika, :command)
107
110
  tika_command = ENV['TIKA_COMMAND'] || 'http://localhost:9998/meta'
108
- TIKA_COMMAND = tika_command
111
+ FspHarvester::TIKA_COMMAND = tika_command
@@ -5,18 +5,21 @@ module HarvesterTools
5
5
  end
6
6
 
7
7
  class ExternalTools
8
+ attr_accessor :distillerknown, :extructknown
8
9
 
9
10
  def initialize(metadata: HarvesterTools::MetadataObject.new)
11
+ @distillerknown = {}
12
+ @extructknown = {}
10
13
  @meta = metadata
11
14
  end
12
15
 
13
- def process_with_distiller(body:)
16
+ def process_with_distiller(body:, metadata:)
17
+ meta = metadata
14
18
  bhash = Digest::SHA256.hexdigest(body)
15
- if @@distillerknown[bhash]
16
- @meta.comments << "INFO: data is already parsed by distiller.\n"
17
- #parse_rdf(body: body)
19
+ if distillerknown[bhash]
20
+ meta.comments << "INFO: data is already parsed by distiller.\n"
18
21
  else
19
- @meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
22
+ meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
20
23
  file = Tempfile.new('foo', encoding: 'UTF-8')
21
24
  body = body.force_encoding('UTF-8')
22
25
  body.scrub!
@@ -24,60 +27,73 @@ module HarvesterTools
24
27
  file.write(body)
25
28
  file.rewind
26
29
 
27
- @meta.comments << "INFO: The message body is being examined by Distiller\n"
28
- command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
30
+ meta.comments << "INFO: The message body is being examined by Distiller\n"
31
+ command = "LANG=en_US.UTF-8 #{FspHarvester::RDF_COMMAND} serialize --input-format rdfa --output-format jsonld #{file.path}"
29
32
  warn "distiller command: #{command}"
30
33
  result, _stderr, _status = Open3.capture3(command)
31
34
  warn ''
32
- warn "distiller errors: #{stderr}"
35
+ warn "distiller errors: #{_stderr}" if _stderr
33
36
  file.close
34
37
  file.unlink
35
38
 
36
39
  result = result.force_encoding('UTF-8')
37
- warn "DIST RESULT: #{result}"
40
+ # warn "DIST RESULT: #{result}"
38
41
  if result !~ /@context/i # failure returns nil
39
- @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
40
- @meta.add_warning(['018', '', ''])
41
- result = "{}"
42
+ meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
43
+ meta.add_warning(['018', '', ''])
44
+ result = '{}'
42
45
  else
43
- @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
46
+ meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
44
47
  end
45
- @@distillerknown[bhash] = true
48
+ distillerknown[bhash] = true
46
49
  end
47
50
  result
48
51
  end
49
52
 
50
- def processs_with_extruct(uri:)
51
- @meta.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
52
- warn 'begin open3'
53
- stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
54
- warn "open3 status: #{status} #{stdout}"
55
- result = stderr # absurd that the output comes over stderr! LOL!
56
- jsonld = {}
57
- microdata = Hash.new
58
- microformat = Hash.new
59
- opengraph = Hash.new
60
- rdfa = Hash.new
53
+ def process_with_extruct(uri:, metadata:)
54
+ bhash = Digest::SHA256.hexdigest(uri)
55
+ jsonld = '{}'
56
+ microdata = {}
57
+ microformat = {}
58
+ opengraph = {}
59
+ rdfa = '{}'
61
60
 
62
- if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
63
- @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
64
- @meta.add_warning(['019', '', ''])
65
- if result.to_s.match(/(ValueError:.*?)\n/)
66
- @meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
67
- @meta.add_warning(['019', '', ''])
68
- end
69
- elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
70
- json = JSON.parse result
71
- @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
72
- jsonld = json['json-ld'].to_json if json['json-ld'].any?
73
- microdata = json['microdata'].first if json['microdata'].any
74
- microformat = json['microformat'].first if json['microformat'].any?
75
- opengraph = json['opengraph'].first if json['opengraph'].any?
76
- rdfa = json['rdfa'].to_json if json['rdfa'].any?
77
- # @meta.merge_hash(json.first) if json.first.is_a? Hash
61
+ if extructknown[bhash]
62
+ metadata.comments << "INFO: data is already parsed by extruct.\n"
78
63
  else
79
- @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
64
+ metadata.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
65
+ warn 'begin open3'
66
+ stdout, stderr, status = Open3.capture3(FspHarvester::EXTRUCT_COMMAND + ' ' + uri)
67
+ warn "open3 status: #{status} #{stdout}"
68
+ result = stderr # absurd that the output comes over stderr! LOL!
69
+
70
+ if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
71
+ metadata.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
72
+ metadata.add_warning(['019', '', ''])
73
+ if result.to_s.match(/(ValueError:.*?)\n/)
74
+ metadata.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
75
+ metadata.add_warning(['019', '', ''])
76
+ end
77
+ elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
78
+ begin
79
+ json = JSON.parse result
80
+ rescue StandardError
81
+ metadata.comments << "WARN: extruct threw an error when attempting to parse the extruct command return value from processing #{uri}.\n"
82
+ metadata.add_warning(['019', '', ''])
83
+ return [jsonld, microdata, microformat, opengraph, rdfa]
84
+ end
85
+ metadata.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
86
+ jsonld = json['json-ld'].to_json if json['json-ld'].any?
87
+ microdata = json['microdata'].first if json['microdata'].any?
88
+ microformat = json['microformat'].first if json['microformat'].any?
89
+ opengraph = json['opengraph'].first if json['opengraph'].any?
90
+ rdfa = json['rdfa'].to_json if json['rdfa'].any?
91
+ # @meta.merge_hash(json.first) if json.first.is_a? Hash
92
+ else
93
+ @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
94
+ end
80
95
  end
96
+ extructknown[bhash] = true
81
97
  [jsonld, microdata, microformat, opengraph, rdfa]
82
98
  end
83
99
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.20"
4
+ VERSION = "0.1.21"
5
5
  end
@@ -3,45 +3,81 @@ module HarvesterTools
3
3
  end
4
4
 
5
5
  class BruteForce
6
- def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
6
+ def self.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new)
7
7
  type, url = HarvesterTools::Utils.convertToURL(guid: guid)
8
8
  return false unless type
9
9
 
10
- do_content_negotiation(url: url, metadata: metadata)
10
+ # TODO: follow rel=alternate headers, if they are in LD or Hash format
11
+ do_content_negotiation(url: url, metadata: metadata, links: links)
11
12
  metadata
12
13
  end
13
14
 
14
- def self.do_content_negotiation(url:, metadata:)
15
- response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
15
+ def self.do_content_negotiation(url:, metadata:, links: [])
16
+ warn "\n\nINFO: entering content negotiation of #{url}\n\n"
17
+ metadata.comments << "INFO: entering content negotiation of #{url}.\n"
18
+
19
+
20
+ response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER)
16
21
  if response
17
22
  HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
18
23
  end
19
- response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
24
+ response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_STAR_HEADER)
20
25
  if response
21
26
  HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
22
- response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER) # now do content negotiation on the landing page
27
+ response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER) # now do content negotiation on the landing page
23
28
  if response
24
29
  HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
25
30
  end
26
31
  end
32
+
33
+ process_alternates(links: links, metadata: metadata)
34
+ end
35
+
36
+ def self.process_alternates(links: [], metadata:)
37
+ warn "\n\nINFO: entering content negotiation on link alternates\n\n"
38
+ metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
39
+ # process "alternate" links
40
+ links.each do |link|
41
+ next unless link.relation == "alternate"
42
+
43
+ url = link.href
44
+ headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
45
+ headers ||= FspHarvester::ACCEPT_STAR_HEADER
46
+ warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
47
+ metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
48
+ response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
49
+ if response
50
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
51
+ end
52
+ end
53
+
27
54
  end
28
55
 
56
+
29
57
  def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
30
- @meta = metadata
31
- @meta.guidtype = 'uri' if @meta.guidtype.nil?
32
- warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
33
- response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: @meta)
58
+
59
+ cache_key = Digest::MD5.hexdigest url + headers.to_s
60
+ if metadata.url_header_hash[cache_key]
61
+ warn "Already processed #{url} - moving on"
62
+ metadata.comments << "INFO: Already processed #{url} - moving on.\n"
63
+ return false
64
+ end
65
+
66
+ metadata.guidtype = 'uri' if metadata.guidtype.nil?
67
+ warn "\n\n BRUTE FETCHING #{url} \nwith headers\n #{headers}\n\n"
68
+ response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: metadata)
34
69
  warn "\n\n head #{response.headers.inspect}\n\n" if response
35
70
 
36
71
  unless response
37
- @meta.add_warning(['001', url, headers])
38
- @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
39
- @meta.full_response << [url, "No response"]
72
+ metadata.add_warning(['001', url, headers])
73
+ metadata.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
74
+ metadata.full_response << [url, "No response"]
40
75
  false
41
76
  end
42
77
 
43
- @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
44
- @meta.full_response << [url, response.body]
78
+ metadata.comments << "INFO: following redirection using this header led to the following URL: #{metadata.all_uris.last}. Using the output from this URL for the next few tests..."
79
+ metadata.full_response << [url, response.body]
80
+ metadata.url_header_hash[cache_key] = true
45
81
  response
46
82
  end
47
83
  end
@@ -20,7 +20,7 @@ module HarvesterTools
20
20
  end
21
21
 
22
22
  def self.convertToURL(guid:)
23
- GUID_TYPES.each do |k, regex|
23
+ FspHarvester::GUID_TYPES.each do |k, regex|
24
24
  if k == 'inchi' and regex.match(guid)
25
25
  return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
26
26
  elsif k == 'handle1' and regex.match(guid)
@@ -39,13 +39,13 @@ module HarvesterTools
39
39
  end
40
40
 
41
41
  def self.typeit(guid:)
42
- GUID_TYPES.each do |type, regex|
42
+ FspHarvester::GUID_TYPES.each do |type, regex|
43
43
  return type if regex.match(guid)
44
44
  end
45
45
  false
46
46
  end
47
47
 
48
- def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
48
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: FspHarvester::ACCEPT_STAR_HEADER)
49
49
  @meta = metadata
50
50
  @meta.guidtype = 'uri' if @meta.guidtype.nil?
51
51
  warn "\n\n FETCHING #{url} #{header}\n\n"
@@ -59,7 +59,7 @@ module HarvesterTools
59
59
  end
60
60
 
61
61
  @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
62
- @meta.full_response << response.body
62
+ @meta.full_response << [url, response.body]
63
63
 
64
64
  links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
65
65
  links
@@ -13,7 +13,7 @@ module HarvesterTools
13
13
 
14
14
  hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
15
15
  describedby.each do |link|
16
- accepttype = ACCEPT_STAR_HEADER
16
+ accepttype = FspHarvester::ACCEPT_STAR_HEADER
17
17
  accept = link.respond_to?('type') ? link.type : nil
18
18
  accepttype = { 'Accept' => accept } if accept
19
19
 
@@ -38,9 +38,14 @@ module HarvesterTools
38
38
  abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
39
39
  unless abbreviation
40
40
  @meta.add_warning(['017', response.request.url, ''])
41
- @meta.comments << "WARN: metadata format returned from #{response.request.url} is not recognized. Moving on.\n"
41
+ @meta.comments << "WARN: format returned from #{response.request.url} is not recognized. Moving on.\n"
42
42
  return
43
43
  end
44
+ request_content_types = response.request.headers["Accept"].split(/,\s*/)
45
+ unless (request_content_types.include? content_type) and !(request_content_types.include? "*/*") and (response.code != 406)
46
+ @meta.add_warning(['023', response.request.url, ''])
47
+ @meta.comments << "WARN: format returned from #{response.request.url} does not match request type. This should result in a 406 error, but instead was accepted as a 200.\n"
48
+ end
44
49
  process_according_to_type(body: response.body, uri: response.request.url, metadata: @meta,
45
50
  abbreviation: abbreviation, content_type: content_type)
46
51
  end
@@ -65,7 +70,7 @@ module HarvesterTools
65
70
  end
66
71
  end
67
72
 
68
- def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
73
+ def self.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER)
69
74
  @meta.comments << "INFO: link #{link.href} being processed"
70
75
  if link.respond_to? 'type'
71
76
  header = { 'Accept' => link.type }
@@ -86,23 +91,37 @@ module HarvesterTools
86
91
  abbreviation = nil
87
92
  content_type = nil
88
93
  @meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
94
+ claimed_type = headers[:content_type]
95
+ claimed_type.gsub!(/\s*;.*/, '')
89
96
  if body =~ /^\s*<\?xml/
90
- if body =~ /<HTML/i
97
+ if body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
91
98
  abbreviation = 'html'
92
- content_type = 'text/html'
99
+ content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
100
+ @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
101
+ content_type |= 'text/html'
93
102
  @meta.comments << 'INFO: appears to be HTML\n'
94
103
  elsif body =~ /<rdf:RDF/i
95
104
  abbreviation = 'rdfxml'
96
- content_type = 'application/rdf+xml'
105
+ content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
106
+ @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
107
+ content_type |= 'application/rdf+xml'
97
108
  @meta.comments << 'INFO: appears to be RDF-XML\n'
98
109
  else
99
110
  abbreviation = 'xml'
100
- content_type = 'application/xml'
111
+ content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
112
+ @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
113
+ content_type |= 'application/xml'
101
114
  @meta.comments << 'INFO: appears to be XML\n'
102
115
  end
116
+ elsif body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
117
+ abbreviation = 'html'
118
+ content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
119
+ @meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
120
+ content_type ||= 'text/html'
121
+ @meta.comments << 'INFO: appears to be HTML\n'
103
122
  else
104
- abbreviation, content_type = check_ld(body: body, claimed_type: headers[:content_type])
105
- abbreviation, content_type = check_json(body: body) unless abbreviation
123
+ abbreviation, content_type = check_ld(body: body, claimed_type: claimed_type)
124
+ abbreviation, content_type = check_json(body: body) unless abbreviation # don't test if LD already found!
106
125
  end
107
126
 
108
127
  unless content_type
@@ -112,18 +131,46 @@ module HarvesterTools
112
131
  [abbreviation, content_type]
113
132
  end
114
133
 
134
+ def self.validate_claimed_type(abbreviation:, claimed_type:)
135
+ warn "\n\nclaimed type #{claimed_type}\nabbreviation #{abbreviation}\n\n"
136
+ claimed_type.gsub!(/\s*;.*/, '')
137
+
138
+ case abbreviation
139
+ when 'html'
140
+ return claimed_type if FspHarvester::HTML_FORMATS['html'].include? claimed_type
141
+ when 'xml'
142
+ return claimed_type if FspHarvester::XML_FORMATS['xml'].include? claimed_type
143
+ when 'json'
144
+ return claimed_type if FspHarvester::JSON_FORMATS['json'].include? claimed_type
145
+ when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
146
+ return claimed_type if FspHarvester::RDF_FORMATS.values.flatten.include? claimed_type
147
+ when 'specialist'
148
+ warn 'no specialized parsers so far'
149
+ end
150
+ return false
151
+ end
152
+
115
153
  def self.check_ld(body:, claimed_type:)
116
154
  detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
117
- unless detected_type
155
+ unless detected_type # see if distiller can detect a type
118
156
  detected_type = RDF::Format.for({ sample: body[0..5000] })
119
157
  @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
120
158
  end
159
+ # at this point, detected_type is something like RDF::Turtle::Format (or nil). This will return a content-type
121
160
  contenttype = ''
122
161
  abbreviation = ''
123
162
  if detected_type
124
- contenttype = detected_type.content_type.first # comes back as array
125
- abbreviation = abbreviate_type(contenttype: contenttype)
126
- @meta.comments << "INFO: using content-type #{contenttype}.\n"
163
+ detectedcontenttypes = detected_type.content_type # comes back as array of [application/x, application/y]
164
+ unless detectedcontenttypes.include? claimed_type
165
+ @meta.add_warning(['022', @meta.all_uris.last, "" ])
166
+ contenttype = detected_type.content_type.first # just pick one arbitrarily, since it doesn't match the declared type anyway
167
+ abbreviation = abbreviate_type(contenttype: contenttype)
168
+ @meta.comments << "INFO: using content-type #{contenttype} even though there was a mismatch.\n"
169
+ else
170
+ contenttype = claimed_type # the claimed type is among the detected content types, so keep it
171
+ abbreviation = abbreviate_type(contenttype: contenttype)
172
+ @meta.comments << "INFO: using content-type #{contenttype}.\n"
173
+ end
127
174
  else
128
175
  @meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
129
176
  end
@@ -161,13 +208,14 @@ module HarvesterTools
161
208
  abbreviation = 'json'
162
209
  else
163
210
  @meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
211
+ return [nil, nil]
164
212
  end
165
- [abbreviation, 'application/ld+json']
213
+ [abbreviation, 'application/json']
166
214
  end
167
215
 
168
216
  def self.abbreviate_type(contenttype:)
169
217
  foundtype = nil
170
- RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
218
+ FspHarvester::RDF_FORMATS.merge(FspHarvester::XML_FORMATS).merge(FspHarvester::HTML_FORMATS).merge(FspHarvester::JSON_FORMATS).each do |type, vals|
171
219
  warn "\n\ntype #{type}\nvals #{vals}\n\n"
172
220
  @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
173
221
  next unless vals.include? contenttype
@@ -1,8 +1,8 @@
1
1
  module HarvesterTools
2
2
  class MetadataObject
3
- attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
3
+ attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date, :url_header_hash # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
4
4
 
5
- def initialize(id: "unidentified_metadata") # get a name from the "new" call, or set a default
5
+ def initialize(id: "urn:local:unidentified_metadata") # get a name from the "new" call, or set a default
6
6
  @id = id
7
7
  @hash = {}
8
8
  @graph = RDF::Graph.new
@@ -16,6 +16,7 @@ module HarvesterTools
16
16
  @score = 0
17
17
  @version = '0.0'
18
18
  @date = Time.now.strftime('%Y-%m-%dT%H:%M:%S.%L%z')
19
+ @url_header_hash = Hash.new(false) # the combination of URL and the accept headers, sha1 hashed, for quick lookup of whether it has already been processed
19
20
  w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
20
21
  #@warn = File.read("./lib/warnings.json")
21
22
  @warn = JSON.parse(w)
@@ -37,6 +38,7 @@ module HarvesterTools
37
38
 
38
39
  def add_warning(warning)
39
40
  id = warning[0]
41
+ return unless @warn[id] # if there's a mismatch between code and the warnings in github
40
42
  url = warning[1]
41
43
  headers = warning[2]
42
44
  message = @warn[id]['message']
@@ -13,17 +13,16 @@ module HarvesterTools
13
13
  @meta = metadata_object
14
14
  end
15
15
 
16
- def process_html(body:, uri:, metadata:)
17
- @meta = metadata
16
+ def process_html(body:, uri:, metadata: @meta)
18
17
  tools = HarvesterTools::ExternalTools.new(metadata: @meta)
19
- result = tools.process_with_distiller(body: body)
18
+ tools.process_with_distiller(body: body, metadata: @meta) # adds to @meta
20
19
 
21
- jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
22
- parse_rdf(body: jsonld, content_type: 'application/ld+json')
20
+ jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri, metadata: @meta)
21
+ parse_rdf(body: jsonld, content_type: 'application/ld+json', metadata: metadata)
23
22
  @meta.merge_hash(microdata)
24
23
  @meta.merge_hash(microformat)
25
24
  @meta.merge_hash(opengraph)
26
- parse_rdf(body: rdfa, content_type: 'application/ld+json')
25
+ parse_rdf(body: rdfa, content_type: 'application/ld+json', metadata: @meta)
27
26
  end
28
27
 
29
28
  def process_xml(body:, metadata:)
data/lib/warnings.json CHANGED
@@ -116,6 +116,18 @@
116
116
  {"Validator": "https://jsononline.net/json-validator"}],
117
117
  "severity": "WARN"
118
118
  },
119
+ "022": {
120
+ "message": "Mismatch between the Content-type header and the content of the returned document.",
121
+ "linkout": [],
122
+ "severity": "WARN"
123
+ },
124
+ "023": {
125
+ "message": "Returned content-type is not compatible with any requested content-type, yet a HTTP 406 error was not returned",
126
+ "linkout": [],
127
+ "severity": "WARN"
128
+ },
129
+
130
+
119
131
  "600": {
120
132
  "message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
121
133
  "linkout": [],
data/lib/web_utils.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module HarvesterTools
2
2
 
3
3
  class WebUtils
4
- def self.fspfetch(url:, headers: ACCEPT_ALL_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
4
+ def self.fspfetch(url:, headers: ACCEPT_STAR_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
5
5
  warn 'In fetch routine now. '
6
6
 
7
7
  begin
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.20
4
+ version: 0.1.21
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-08-17 00:00:00.000000000 Z
11
+ date: 2022-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json