fsp_harvester 0.1.20 → 0.1.21
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec_status +60 -57
- data/Gemfile.lock +1 -1
- data/lib/constants.rb +7 -4
- data/lib/external_tools.rb +58 -42
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/harvester_brute.rb +51 -15
- data/lib/harvester_utils.rb +4 -4
- data/lib/metadata_harvester.rb +63 -15
- data/lib/metadata_object.rb +4 -2
- data/lib/metadata_parser.rb +5 -6
- data/lib/warnings.json +12 -0
- data/lib/web_utils.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8498c33db9c350fec8ea4e734b31087f798a4f433f211115c69ded468dbcdb12
|
|
4
|
+
data.tar.gz: f3f408b24575f4f310c6f00ac0d42c3106f68fcd43199d3dbb73d8e4deb403fe
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 263549dc8b8bf2fe8a4bc50289092ad2e55d9bbc05cabc509637786b4923948345220f0ee7a04fa5db497f670d9ab79d4e35e05648f09f189103ae869040baad
|
|
7
|
+
data.tar.gz: 6b35a320400ff37561ddf2cff506a3a4f385cd31933e4df8ceb4d436a4f97974e7122ab96962b8ce3cfc183144fcf41592fa417abcdb609fd44080061fc5e1a3
|
data/.rspec_status
CHANGED
|
@@ -1,57 +1,60 @@
|
|
|
1
|
-
example_id | status | run_time
|
|
2
|
-
---------------------------------- | ------ |
|
|
3
|
-
./spec/cite-as_spec.rb[1:1:1] | passed | 1.
|
|
4
|
-
./spec/cite-as_spec.rb[1:1:2] | passed | 1.
|
|
5
|
-
./spec/cite-as_spec.rb[1:1:3] | passed | 1.
|
|
6
|
-
./spec/cite-as_spec.rb[1:1:4] | passed | 1.
|
|
7
|
-
./spec/cite-as_spec.rb[1:1:5] | passed | 2.
|
|
8
|
-
./spec/cite-as_spec.rb[1:1:6] | passed | 2.
|
|
9
|
-
./spec/cite-as_spec.rb[1:1:7] | passed | 2.96 seconds
|
|
10
|
-
./spec/cite-as_spec.rb[1:1:8] | passed | 2.28 seconds
|
|
11
|
-
./spec/cite-as_spec.rb[1:1:9] | passed |
|
|
12
|
-
./spec/cite-as_spec.rb[1:1:10] | passed | 2.
|
|
13
|
-
./spec/cite-as_spec.rb[1:1:11] | passed |
|
|
14
|
-
./spec/cite-as_spec.rb[1:1:12] | passed |
|
|
15
|
-
./spec/cite-as_spec.rb[1:1:13] | passed |
|
|
16
|
-
./spec/cite-as_spec.rb[1:1:14] | passed | 2.
|
|
17
|
-
./spec/cite-as_spec.rb[1:1:15] | passed | 1.19 seconds
|
|
18
|
-
./spec/cite-as_spec.rb[1:1:16] | passed | 1.
|
|
19
|
-
./spec/cite-as_spec.rb[1:1:17] | passed | 1.
|
|
20
|
-
./spec/cite-as_spec.rb[1:1:18] | passed | 1.
|
|
21
|
-
./spec/cite-as_spec.rb[1:1:19] | passed | 1.
|
|
22
|
-
./spec/cite-as_spec.rb[1:1:20] | passed | 1
|
|
23
|
-
./spec/cite-as_spec.rb[1:1:21] | passed | 2.
|
|
24
|
-
./spec/cite-as_spec.rb[1:1:22] | passed | 1.
|
|
25
|
-
./spec/cite-as_spec.rb[1:1:23] | passed | 1.
|
|
26
|
-
./spec/cite-as_spec.rb[1:1:24] | failed | 1.
|
|
27
|
-
./spec/cite-as_spec.rb[1:1:25] | passed | 0.
|
|
28
|
-
./spec/describedby_spec.rb[1:1:1] | passed | 3.
|
|
29
|
-
./spec/describedby_spec.rb[1:1:2] | passed | 1.
|
|
30
|
-
./spec/describedby_spec.rb[1:1:3] | passed | 1.
|
|
31
|
-
./spec/describedby_spec.rb[1:1:4] | passed | 1.
|
|
32
|
-
./spec/describedby_spec.rb[1:1:5] | passed | 1.
|
|
33
|
-
./spec/describedby_spec.rb[1:1:6] | passed | 1.
|
|
34
|
-
./spec/describedby_spec.rb[1:1:7] | passed |
|
|
35
|
-
./spec/describedby_spec.rb[1:1:8] | passed | 2.
|
|
36
|
-
./spec/describedby_spec.rb[1:1:9] | passed |
|
|
37
|
-
./spec/describedby_spec.rb[1:1:10] | passed | 2.
|
|
38
|
-
./spec/describedby_spec.rb[1:1:11] | passed |
|
|
39
|
-
./spec/describedby_spec.rb[1:1:12] | passed |
|
|
40
|
-
./spec/describedby_spec.rb[1:1:13] | passed | 1.
|
|
41
|
-
./spec/describedby_spec.rb[1:1:14] | passed | 2.
|
|
42
|
-
./spec/describedby_spec.rb[1:1:15] | passed | 2.
|
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] | passed | 2.
|
|
45
|
-
./spec/fsp_harvester_spec.rb[1:3] | passed |
|
|
46
|
-
./spec/fsp_harvester_spec.rb[1:4] | passed | 2.
|
|
47
|
-
./spec/
|
|
48
|
-
./spec/
|
|
49
|
-
./spec/
|
|
50
|
-
./spec/item_spec.rb[1:1:
|
|
51
|
-
./spec/item_spec.rb[1:1:
|
|
52
|
-
./spec/item_spec.rb[1:1:
|
|
53
|
-
./spec/item_spec.rb[1:1:
|
|
54
|
-
./spec/item_spec.rb[1:1:
|
|
55
|
-
./spec/
|
|
56
|
-
./spec/
|
|
57
|
-
./spec/
|
|
1
|
+
example_id | status | run_time |
|
|
2
|
+
---------------------------------- | ------ | ---------------------- |
|
|
3
|
+
./spec/cite-as_spec.rb[1:1:1] | passed | 1.77 seconds |
|
|
4
|
+
./spec/cite-as_spec.rb[1:1:2] | passed | 1.22 seconds |
|
|
5
|
+
./spec/cite-as_spec.rb[1:1:3] | passed | 1.09 seconds |
|
|
6
|
+
./spec/cite-as_spec.rb[1:1:4] | passed | 1.89 seconds |
|
|
7
|
+
./spec/cite-as_spec.rb[1:1:5] | passed | 2.95 seconds |
|
|
8
|
+
./spec/cite-as_spec.rb[1:1:6] | passed | 2.14 seconds |
|
|
9
|
+
./spec/cite-as_spec.rb[1:1:7] | passed | 2.96 seconds |
|
|
10
|
+
./spec/cite-as_spec.rb[1:1:8] | passed | 2.28 seconds |
|
|
11
|
+
./spec/cite-as_spec.rb[1:1:9] | passed | 3.4 seconds |
|
|
12
|
+
./spec/cite-as_spec.rb[1:1:10] | passed | 2.21 seconds |
|
|
13
|
+
./spec/cite-as_spec.rb[1:1:11] | passed | 2.82 seconds |
|
|
14
|
+
./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
|
|
15
|
+
./spec/cite-as_spec.rb[1:1:13] | passed | 3.36 seconds |
|
|
16
|
+
./spec/cite-as_spec.rb[1:1:14] | passed | 2.19 seconds |
|
|
17
|
+
./spec/cite-as_spec.rb[1:1:15] | passed | 1.19 seconds |
|
|
18
|
+
./spec/cite-as_spec.rb[1:1:16] | passed | 1.23 seconds |
|
|
19
|
+
./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
|
|
20
|
+
./spec/cite-as_spec.rb[1:1:18] | passed | 1.28 seconds |
|
|
21
|
+
./spec/cite-as_spec.rb[1:1:19] | passed | 1.94 seconds |
|
|
22
|
+
./spec/cite-as_spec.rb[1:1:20] | passed | 2.1 seconds |
|
|
23
|
+
./spec/cite-as_spec.rb[1:1:21] | passed | 2.23 seconds |
|
|
24
|
+
./spec/cite-as_spec.rb[1:1:22] | passed | 1.17 seconds |
|
|
25
|
+
./spec/cite-as_spec.rb[1:1:23] | passed | 1.13 seconds |
|
|
26
|
+
./spec/cite-as_spec.rb[1:1:24] | failed | 1.24 seconds |
|
|
27
|
+
./spec/cite-as_spec.rb[1:1:25] | passed | 0.49678 seconds |
|
|
28
|
+
./spec/describedby_spec.rb[1:1:1] | passed | 3.18 seconds |
|
|
29
|
+
./spec/describedby_spec.rb[1:1:2] | passed | 1.34 seconds |
|
|
30
|
+
./spec/describedby_spec.rb[1:1:3] | passed | 1.2 seconds |
|
|
31
|
+
./spec/describedby_spec.rb[1:1:4] | passed | 1.14 seconds |
|
|
32
|
+
./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
|
|
33
|
+
./spec/describedby_spec.rb[1:1:6] | passed | 1.04 seconds |
|
|
34
|
+
./spec/describedby_spec.rb[1:1:7] | passed | 0.9844 seconds |
|
|
35
|
+
./spec/describedby_spec.rb[1:1:8] | passed | 2.07 seconds |
|
|
36
|
+
./spec/describedby_spec.rb[1:1:9] | passed | 2.16 seconds |
|
|
37
|
+
./spec/describedby_spec.rb[1:1:10] | passed | 2.36 seconds |
|
|
38
|
+
./spec/describedby_spec.rb[1:1:11] | passed | 2.91 seconds |
|
|
39
|
+
./spec/describedby_spec.rb[1:1:12] | passed | 2.93 seconds |
|
|
40
|
+
./spec/describedby_spec.rb[1:1:13] | passed | 1.79 seconds |
|
|
41
|
+
./spec/describedby_spec.rb[1:1:14] | passed | 2.5 seconds |
|
|
42
|
+
./spec/describedby_spec.rb[1:1:15] | passed | 2.24 seconds |
|
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00102 seconds |
|
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | passed | 2.5 seconds |
|
|
45
|
+
./spec/fsp_harvester_spec.rb[1:3] | passed | 29.49 seconds |
|
|
46
|
+
./spec/fsp_harvester_spec.rb[1:4] | passed | 2.53 seconds |
|
|
47
|
+
./spec/fsp_harvester_spec.rb[1:5] | passed | 2.65 seconds |
|
|
48
|
+
./spec/fsp_harvester_spec.rb[1:6] | failed | 1 minute 24.1 seconds |
|
|
49
|
+
./spec/fsp_harvester_spec.rb[1:7] | passed | 2 minutes 24.3 seconds |
|
|
50
|
+
./spec/item_spec.rb[1:1:1] | passed | 2.71 seconds |
|
|
51
|
+
./spec/item_spec.rb[1:1:2] | passed | 2.98 seconds |
|
|
52
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
|
|
53
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.81 seconds |
|
|
54
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.2 seconds |
|
|
55
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.25 seconds |
|
|
56
|
+
./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
|
|
57
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.62818 seconds |
|
|
58
|
+
./spec/type_spec.rb[1:1:1] | passed | 1.33 seconds |
|
|
59
|
+
./spec/type_spec.rb[1:1:2] | passed | 1.22 seconds |
|
|
60
|
+
./spec/type_spec.rb[1:1:3] | passed | 1.61 seconds |
|
data/Gemfile.lock
CHANGED
data/lib/constants.rb
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
module FspHarvester
|
|
2
|
+
|
|
3
|
+
ACCEPT_LD_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
|
|
2
4
|
|
|
3
5
|
ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
|
|
4
6
|
|
|
@@ -77,6 +79,7 @@ GUID_TYPES = {
|
|
|
77
79
|
'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
|
|
78
80
|
'ark' => Regexp.new(%r{^ark:/[^\s]+$})
|
|
79
81
|
}
|
|
82
|
+
end
|
|
80
83
|
|
|
81
84
|
# CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
|
|
82
85
|
# extruct = CONFIG.dig(:extruct, :command)
|
|
@@ -88,7 +91,7 @@ when /[&|;`$\s]/
|
|
|
88
91
|
when /echo/i
|
|
89
92
|
abort 'The Extruct command appears to be subject to command injection. I will not continue'
|
|
90
93
|
end
|
|
91
|
-
EXTRUCT_COMMAND = extruct
|
|
94
|
+
FspHarvester::EXTRUCT_COMMAND = extruct
|
|
92
95
|
|
|
93
96
|
# rdf_command = CONFIG.dig(:rdf, :command)
|
|
94
97
|
rdf_command = ENV['RDF_COMMAND'] || 'rdf'
|
|
@@ -101,8 +104,8 @@ when /echo/i
|
|
|
101
104
|
when !(/rdf$/ =~ $_)
|
|
102
105
|
abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
|
|
103
106
|
end
|
|
104
|
-
RDF_COMMAND = rdf_command
|
|
107
|
+
FspHarvester::RDF_COMMAND = rdf_command
|
|
105
108
|
|
|
106
109
|
# tika_command = CONFIG.dig(:tika, :command)
|
|
107
110
|
tika_command = ENV['TIKA_COMMAND'] || 'http://localhost:9998/meta'
|
|
108
|
-
TIKA_COMMAND = tika_command
|
|
111
|
+
FspHarvester::TIKA_COMMAND = tika_command
|
data/lib/external_tools.rb
CHANGED
|
@@ -5,18 +5,21 @@ module HarvesterTools
|
|
|
5
5
|
end
|
|
6
6
|
|
|
7
7
|
class ExternalTools
|
|
8
|
+
attr_accessor :distillerknown, :extructknown
|
|
8
9
|
|
|
9
10
|
def initialize(metadata: HarvesterTools::MetadataObject.new)
|
|
11
|
+
@distillerknown = {}
|
|
12
|
+
@extructknown = {}
|
|
10
13
|
@meta = metadata
|
|
11
14
|
end
|
|
12
15
|
|
|
13
|
-
def process_with_distiller(body:)
|
|
16
|
+
def process_with_distiller(body:, metadata:)
|
|
17
|
+
meta = metadata
|
|
14
18
|
bhash = Digest::SHA256.hexdigest(body)
|
|
15
|
-
if
|
|
16
|
-
|
|
17
|
-
#parse_rdf(body: body)
|
|
19
|
+
if distillerknown[bhash]
|
|
20
|
+
meta.comments << "INFO: data is already parsed by distiller.\n"
|
|
18
21
|
else
|
|
19
|
-
|
|
22
|
+
meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
|
|
20
23
|
file = Tempfile.new('foo', encoding: 'UTF-8')
|
|
21
24
|
body = body.force_encoding('UTF-8')
|
|
22
25
|
body.scrub!
|
|
@@ -24,60 +27,73 @@ module HarvesterTools
|
|
|
24
27
|
file.write(body)
|
|
25
28
|
file.rewind
|
|
26
29
|
|
|
27
|
-
|
|
28
|
-
command = "LANG=en_US.UTF-8 #{
|
|
30
|
+
meta.comments << "INFO: The message body is being examined by Distiller\n"
|
|
31
|
+
command = "LANG=en_US.UTF-8 #{FspHarvester::RDF_COMMAND} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
|
29
32
|
warn "distiller command: #{command}"
|
|
30
33
|
result, _stderr, _status = Open3.capture3(command)
|
|
31
34
|
warn ''
|
|
32
|
-
warn "distiller errors: #{
|
|
35
|
+
warn "distiller errors: #{_stderr}" if _stderr
|
|
33
36
|
file.close
|
|
34
37
|
file.unlink
|
|
35
38
|
|
|
36
39
|
result = result.force_encoding('UTF-8')
|
|
37
|
-
warn "DIST RESULT: #{result}"
|
|
40
|
+
# warn "DIST RESULT: #{result}"
|
|
38
41
|
if result !~ /@context/i # failure returns nil
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
result =
|
|
42
|
+
meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
|
|
43
|
+
meta.add_warning(['018', '', ''])
|
|
44
|
+
result = '{}'
|
|
42
45
|
else
|
|
43
|
-
|
|
46
|
+
meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
|
|
44
47
|
end
|
|
45
|
-
|
|
48
|
+
distillerknown[bhash] = true
|
|
46
49
|
end
|
|
47
50
|
result
|
|
48
51
|
end
|
|
49
52
|
|
|
50
|
-
def
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
microdata = Hash.new
|
|
58
|
-
microformat = Hash.new
|
|
59
|
-
opengraph = Hash.new
|
|
60
|
-
rdfa = Hash.new
|
|
53
|
+
def process_with_extruct(uri:, metadata:)
|
|
54
|
+
bhash = Digest::SHA256.hexdigest(uri)
|
|
55
|
+
jsonld = '{}'
|
|
56
|
+
microdata = {}
|
|
57
|
+
microformat = {}
|
|
58
|
+
opengraph = {}
|
|
59
|
+
rdfa = '{}'
|
|
61
60
|
|
|
62
|
-
if
|
|
63
|
-
|
|
64
|
-
@meta.add_warning(['019', '', ''])
|
|
65
|
-
if result.to_s.match(/(ValueError:.*?)\n/)
|
|
66
|
-
@meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
|
|
67
|
-
@meta.add_warning(['019', '', ''])
|
|
68
|
-
end
|
|
69
|
-
elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
|
|
70
|
-
json = JSON.parse result
|
|
71
|
-
@meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
|
|
72
|
-
jsonld = json['json-ld'].to_json if json['json-ld'].any?
|
|
73
|
-
microdata = json['microdata'].first if json['microdata'].any
|
|
74
|
-
microformat = json['microformat'].first if json['microformat'].any?
|
|
75
|
-
opengraph = json['opengraph'].first if json['opengraph'].any?
|
|
76
|
-
rdfa = json['rdfa'].to_json if json['rdfa'].any?
|
|
77
|
-
# @meta.merge_hash(json.first) if json.first.is_a? Hash
|
|
61
|
+
if extructknown[bhash]
|
|
62
|
+
metadata.comments << "INFO: data is already parsed by extruct.\n"
|
|
78
63
|
else
|
|
79
|
-
|
|
64
|
+
metadata.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
|
|
65
|
+
warn 'begin open3'
|
|
66
|
+
stdout, stderr, status = Open3.capture3(FspHarvester::EXTRUCT_COMMAND + ' ' + uri)
|
|
67
|
+
warn "open3 status: #{status} #{stdout}"
|
|
68
|
+
result = stderr # absurd that the output comes over stderr! LOL!
|
|
69
|
+
|
|
70
|
+
if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
|
|
71
|
+
metadata.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
|
|
72
|
+
metadata.add_warning(['019', '', ''])
|
|
73
|
+
if result.to_s.match(/(ValueError:.*?)\n/)
|
|
74
|
+
metadata.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
|
|
75
|
+
metadata.add_warning(['019', '', ''])
|
|
76
|
+
end
|
|
77
|
+
elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
|
|
78
|
+
begin
|
|
79
|
+
json = JSON.parse result
|
|
80
|
+
rescue StandardError
|
|
81
|
+
metadata.comments << "WARN: extruct threw an error when attempting to parse the extruct command return value from processing #{uri}.\n"
|
|
82
|
+
metadata.add_warning(['019', '', ''])
|
|
83
|
+
return [jsonld, microdata, microformat, opengraph, rdfa]
|
|
84
|
+
end
|
|
85
|
+
metadata.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
|
|
86
|
+
jsonld = json['json-ld'].to_json if json['json-ld'].any?
|
|
87
|
+
microdata = json['microdata'].first if json['microdata'].any?
|
|
88
|
+
microformat = json['microformat'].first if json['microformat'].any?
|
|
89
|
+
opengraph = json['opengraph'].first if json['opengraph'].any?
|
|
90
|
+
rdfa = json['rdfa'].to_json if json['rdfa'].any?
|
|
91
|
+
# @meta.merge_hash(json.first) if json.first.is_a? Hash
|
|
92
|
+
else
|
|
93
|
+
@meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
|
|
94
|
+
end
|
|
80
95
|
end
|
|
96
|
+
extructknown[bhash] = true
|
|
81
97
|
[jsonld, microdata, microformat, opengraph, rdfa]
|
|
82
98
|
end
|
|
83
99
|
end
|
data/lib/harvester_brute.rb
CHANGED
|
@@ -3,45 +3,81 @@ module HarvesterTools
|
|
|
3
3
|
end
|
|
4
4
|
|
|
5
5
|
class BruteForce
|
|
6
|
-
def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
|
|
6
|
+
def self.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new)
|
|
7
7
|
type, url = HarvesterTools::Utils.convertToURL(guid: guid)
|
|
8
8
|
return false unless type
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
# TODO: follow rel=alternate headers, if they are in LD or Hash format
|
|
11
|
+
do_content_negotiation(url: url, metadata: metadata, links: links)
|
|
11
12
|
metadata
|
|
12
13
|
end
|
|
13
14
|
|
|
14
|
-
def self.do_content_negotiation(url:, metadata:)
|
|
15
|
-
|
|
15
|
+
def self.do_content_negotiation(url:, metadata:, links: [])
|
|
16
|
+
warn "\n\nINFO: entering content negotiation of #{url}\n\n"
|
|
17
|
+
metadata.comments << "INFO: entering content negotiation of #{url}.\n"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER)
|
|
16
21
|
if response
|
|
17
22
|
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
|
|
18
23
|
end
|
|
19
|
-
response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
|
|
24
|
+
response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_STAR_HEADER)
|
|
20
25
|
if response
|
|
21
26
|
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
|
|
22
|
-
response = resolve_url_brute(url: response.request.url, metadata: metadata, headers:
|
|
27
|
+
response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER) # now do content negotiation on the landing page
|
|
23
28
|
if response
|
|
24
29
|
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
|
|
25
30
|
end
|
|
26
31
|
end
|
|
32
|
+
|
|
33
|
+
process_alternates(links: links, metadata: metadata)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def self.process_alternates(links: [], metadata:)
|
|
37
|
+
warn "\n\nINFO: entering content negotiation on link alternates\n\n"
|
|
38
|
+
metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
|
|
39
|
+
# process "alternate" links
|
|
40
|
+
links.each do |link|
|
|
41
|
+
next unless link.relation == "alternate"
|
|
42
|
+
|
|
43
|
+
url = link.href
|
|
44
|
+
headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
|
|
45
|
+
headers ||= FspHarvester::ACCEPT_STAR_HEADER
|
|
46
|
+
warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
|
|
47
|
+
metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
|
|
48
|
+
response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
|
|
49
|
+
if response
|
|
50
|
+
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
27
54
|
end
|
|
28
55
|
|
|
56
|
+
|
|
29
57
|
def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
58
|
+
|
|
59
|
+
cache_key = Digest::MD5.hexdigest url + headers.to_s
|
|
60
|
+
if metadata.url_header_hash[cache_key]
|
|
61
|
+
warn "Already processed #{url} - moving on"
|
|
62
|
+
metadata.comments << "INFO: Already processed #{url} - moving on.\n"
|
|
63
|
+
return false
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
metadata.guidtype = 'uri' if metadata.guidtype.nil?
|
|
67
|
+
warn "\n\n BRUTE FETCHING #{url} \nwith headers\n #{headers}\n\n"
|
|
68
|
+
response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: metadata)
|
|
34
69
|
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
|
35
70
|
|
|
36
71
|
unless response
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
72
|
+
metadata.add_warning(['001', url, headers])
|
|
73
|
+
metadata.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
|
|
74
|
+
metadata.full_response << [url, "No response"]
|
|
40
75
|
false
|
|
41
76
|
end
|
|
42
77
|
|
|
43
|
-
|
|
44
|
-
|
|
78
|
+
metadata.comments << "INFO: following redirection using this header led to the following URL: #{metadata.all_uris.last}. Using the output from this URL for the next few tests..."
|
|
79
|
+
metadata.full_response << [url, response.body]
|
|
80
|
+
metadata.url_header_hash[cache_key] = true
|
|
45
81
|
response
|
|
46
82
|
end
|
|
47
83
|
end
|
data/lib/harvester_utils.rb
CHANGED
|
@@ -20,7 +20,7 @@ module HarvesterTools
|
|
|
20
20
|
end
|
|
21
21
|
|
|
22
22
|
def self.convertToURL(guid:)
|
|
23
|
-
GUID_TYPES.each do |k, regex|
|
|
23
|
+
FspHarvester::GUID_TYPES.each do |k, regex|
|
|
24
24
|
if k == 'inchi' and regex.match(guid)
|
|
25
25
|
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
|
26
26
|
elsif k == 'handle1' and regex.match(guid)
|
|
@@ -39,13 +39,13 @@ module HarvesterTools
|
|
|
39
39
|
end
|
|
40
40
|
|
|
41
41
|
def self.typeit(guid:)
|
|
42
|
-
GUID_TYPES.each do |type, regex|
|
|
42
|
+
FspHarvester::GUID_TYPES.each do |type, regex|
|
|
43
43
|
return type if regex.match(guid)
|
|
44
44
|
end
|
|
45
45
|
false
|
|
46
46
|
end
|
|
47
47
|
|
|
48
|
-
def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
|
|
48
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: FspHarvester::ACCEPT_STAR_HEADER)
|
|
49
49
|
@meta = metadata
|
|
50
50
|
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
|
51
51
|
warn "\n\n FETCHING #{url} #{header}\n\n"
|
|
@@ -59,7 +59,7 @@ module HarvesterTools
|
|
|
59
59
|
end
|
|
60
60
|
|
|
61
61
|
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
|
62
|
-
@meta.full_response << response.body
|
|
62
|
+
@meta.full_response << [url, response.body]
|
|
63
63
|
|
|
64
64
|
links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
|
|
65
65
|
links
|
data/lib/metadata_harvester.rb
CHANGED
|
@@ -13,7 +13,7 @@ module HarvesterTools
|
|
|
13
13
|
|
|
14
14
|
hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
|
|
15
15
|
describedby.each do |link|
|
|
16
|
-
accepttype = ACCEPT_STAR_HEADER
|
|
16
|
+
accepttype = FspHarvester::ACCEPT_STAR_HEADER
|
|
17
17
|
accept = link.respond_to?('type') ? link.type : nil
|
|
18
18
|
accepttype = { 'Accept' => accept } if accept
|
|
19
19
|
|
|
@@ -38,9 +38,14 @@ module HarvesterTools
|
|
|
38
38
|
abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
|
|
39
39
|
unless abbreviation
|
|
40
40
|
@meta.add_warning(['017', response.request.url, ''])
|
|
41
|
-
@meta.comments << "WARN:
|
|
41
|
+
@meta.comments << "WARN: format returned from #{response.request.url} is not recognized. Moving on.\n"
|
|
42
42
|
return
|
|
43
43
|
end
|
|
44
|
+
request_content_types = response.request.headers["Accept"].split(/,\s*/)
|
|
45
|
+
unless (request_content_types.include? content_type) and !(request_content_types.include? "*/*") and (response.code != 406)
|
|
46
|
+
@meta.add_warning(['023', response.request.url, ''])
|
|
47
|
+
@meta.comments << "WARN: format returned from #{response.request.url} does not match request type. This should result in a 406 error, but instead was accepted as a 200.\n"
|
|
48
|
+
end
|
|
44
49
|
process_according_to_type(body: response.body, uri: response.request.url, metadata: @meta,
|
|
45
50
|
abbreviation: abbreviation, content_type: content_type)
|
|
46
51
|
end
|
|
@@ -65,7 +70,7 @@ module HarvesterTools
|
|
|
65
70
|
end
|
|
66
71
|
end
|
|
67
72
|
|
|
68
|
-
def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
|
|
73
|
+
def self.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER)
|
|
69
74
|
@meta.comments << "INFO: link #{link.href} being processed"
|
|
70
75
|
if link.respond_to? 'type'
|
|
71
76
|
header = { 'Accept' => link.type }
|
|
@@ -86,23 +91,37 @@ module HarvesterTools
|
|
|
86
91
|
abbreviation = nil
|
|
87
92
|
content_type = nil
|
|
88
93
|
@meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
|
|
94
|
+
claimed_type = headers[:content_type]
|
|
95
|
+
claimed_type.gsub!(/\s*;.*/, '')
|
|
89
96
|
if body =~ /^\s*<\?xml/
|
|
90
|
-
if body =~ /<HTML/i
|
|
97
|
+
if body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
|
|
91
98
|
abbreviation = 'html'
|
|
92
|
-
content_type =
|
|
99
|
+
content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
|
|
100
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
|
|
101
|
+
content_type |= 'text/html'
|
|
93
102
|
@meta.comments << 'INFO: appears to be HTML\n'
|
|
94
103
|
elsif body =~ /<rdf:RDF/i
|
|
95
104
|
abbreviation = 'rdfxml'
|
|
96
|
-
content_type =
|
|
105
|
+
content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
|
|
106
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
|
|
107
|
+
content_type |= 'application/rdf+xml'
|
|
97
108
|
@meta.comments << 'INFO: appears to be RDF-XML\n'
|
|
98
109
|
else
|
|
99
110
|
abbreviation = 'xml'
|
|
100
|
-
content_type =
|
|
111
|
+
content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
|
|
112
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
|
|
113
|
+
content_type |= 'application/xml'
|
|
101
114
|
@meta.comments << 'INFO: appears to be XML\n'
|
|
102
115
|
end
|
|
116
|
+
elsif body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
|
|
117
|
+
abbreviation = 'html'
|
|
118
|
+
content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
|
|
119
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
|
|
120
|
+
content_type ||= 'text/html'
|
|
121
|
+
@meta.comments << 'INFO: appears to be HTML\n'
|
|
103
122
|
else
|
|
104
|
-
abbreviation, content_type = check_ld(body: body, claimed_type:
|
|
105
|
-
abbreviation, content_type = check_json(body: body) unless abbreviation
|
|
123
|
+
abbreviation, content_type = check_ld(body: body, claimed_type: claimed_type)
|
|
124
|
+
abbreviation, content_type = check_json(body: body) unless abbreviation # don't test if LD already found!
|
|
106
125
|
end
|
|
107
126
|
|
|
108
127
|
unless content_type
|
|
@@ -112,18 +131,46 @@ module HarvesterTools
|
|
|
112
131
|
[abbreviation, content_type]
|
|
113
132
|
end
|
|
114
133
|
|
|
134
|
+
def self.validate_claimed_type(abbreviation:, claimed_type:)
|
|
135
|
+
warn "\n\nclaimed type #{claimed_type}\nabbreviation #{abbreviation}\n\n"
|
|
136
|
+
claimed_type.gsub!(/\s*;.*/, '')
|
|
137
|
+
|
|
138
|
+
case abbreviation
|
|
139
|
+
when 'html'
|
|
140
|
+
return claimed_type if FspHarvester::HTML_FORMATS['html'].include? claimed_type
|
|
141
|
+
when 'xml'
|
|
142
|
+
return claimed_type if FspHarvester::XML_FORMATS['xml'].include? claimed_type
|
|
143
|
+
when 'json'
|
|
144
|
+
return claimed_type if FspHarvester::JSON_FORMATS['json'].include? claimed_type
|
|
145
|
+
when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
|
|
146
|
+
return claimed_type if FspHarvester::RDF_FORMATS.values.flatten.include? claimed_type
|
|
147
|
+
when 'specialist'
|
|
148
|
+
warn 'no specialized parsers so far'
|
|
149
|
+
end
|
|
150
|
+
return false
|
|
151
|
+
end
|
|
152
|
+
|
|
115
153
|
def self.check_ld(body:, claimed_type:)
|
|
116
154
|
detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
|
|
117
|
-
unless detected_type
|
|
155
|
+
unless detected_type # see if distiller can detect a type
|
|
118
156
|
detected_type = RDF::Format.for({ sample: body[0..5000] })
|
|
119
157
|
@meta.comments << "INFO: Auto-detected type #{detected_type}\n"
|
|
120
158
|
end
|
|
159
|
+
# at this point, detected_type is something like RDF::Turtle::Format (or nil). This will return a content-type
|
|
121
160
|
contenttype = ''
|
|
122
161
|
abbreviation = ''
|
|
123
162
|
if detected_type
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
163
|
+
detectedcontenttypes = detected_type.content_type # comes back as array of [application/x, application/y]
|
|
164
|
+
unless detectedcontenttypes.include? claimed_type
|
|
165
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ])
|
|
166
|
+
contenttype = detected_type.content_type.first # just pick one arbitrarily, since it doesn't match thedeclared type anyway
|
|
167
|
+
abbreviation = abbreviate_type(contenttype: contenttype)
|
|
168
|
+
@meta.comments << "INFO: using content-type #{contenttype} even though there was a mismatch.\n"
|
|
169
|
+
else
|
|
170
|
+
contenttype = claimed_type # just pick one arbitrarily, since it doesn't match thedeclared type anyway
|
|
171
|
+
abbreviation = abbreviate_type(contenttype: contenttype)
|
|
172
|
+
@meta.comments << "INFO: using content-type #{contenttype}.\n"
|
|
173
|
+
end
|
|
127
174
|
else
|
|
128
175
|
@meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
|
|
129
176
|
end
|
|
@@ -161,13 +208,14 @@ module HarvesterTools
|
|
|
161
208
|
abbreviation = 'json'
|
|
162
209
|
else
|
|
163
210
|
@meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
|
|
211
|
+
return [nil, nil]
|
|
164
212
|
end
|
|
165
|
-
[abbreviation, 'application/
|
|
213
|
+
[abbreviation, 'application/json']
|
|
166
214
|
end
|
|
167
215
|
|
|
168
216
|
def self.abbreviate_type(contenttype:)
|
|
169
217
|
foundtype = nil
|
|
170
|
-
RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
|
|
218
|
+
FspHarvester::RDF_FORMATS.merge(FspHarvester::XML_FORMATS).merge(FspHarvester::HTML_FORMATS).merge(FspHarvester::JSON_FORMATS).each do |type, vals|
|
|
171
219
|
warn "\n\ntype #{type}\nvals #{vals}\n\n"
|
|
172
220
|
@meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
|
|
173
221
|
next unless vals.include? contenttype
|
data/lib/metadata_object.rb
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
module HarvesterTools
|
|
2
2
|
class MetadataObject
|
|
3
|
-
attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
|
3
|
+
attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date, :url_header_hash # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
|
4
4
|
|
|
5
|
-
def initialize(id: "unidentified_metadata") # get a name from the "new" call, or set a default
|
|
5
|
+
def initialize(id: "urn:local:unidentified_metadata") # get a name from the "new" call, or set a default
|
|
6
6
|
@id = id
|
|
7
7
|
@hash = {}
|
|
8
8
|
@graph = RDF::Graph.new
|
|
@@ -16,6 +16,7 @@ module HarvesterTools
|
|
|
16
16
|
@score = 0
|
|
17
17
|
@version = '0.0'
|
|
18
18
|
@date = Time.now.strftime('%Y-%m-%dT%H:%M:%S.%L%z')
|
|
19
|
+
@url_header_hash = Hash.new(false) # the combinarion of URL and the accept headers, sha1 hashed, for quick lookup if it has already been processed
|
|
19
20
|
w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
|
|
20
21
|
#@warn = File.read("./lib/warnings.json")
|
|
21
22
|
@warn = JSON.parse(w)
|
|
@@ -37,6 +38,7 @@ module HarvesterTools
|
|
|
37
38
|
|
|
38
39
|
def add_warning(warning)
|
|
39
40
|
id = warning[0]
|
|
41
|
+
return unless @warn[id] # if there's a mismatch between code and the warnings in github
|
|
40
42
|
url = warning[1]
|
|
41
43
|
headers = warning[2]
|
|
42
44
|
message = @warn[id]['message']
|
data/lib/metadata_parser.rb
CHANGED
|
@@ -13,17 +13,16 @@ module HarvesterTools
|
|
|
13
13
|
@meta = metadata_object
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
-
def process_html(body:, uri:, metadata:)
|
|
17
|
-
@meta = metadata
|
|
16
|
+
def process_html(body:, uri:, metadata: @meta)
|
|
18
17
|
tools = HarvesterTools::ExternalTools.new(metadata: @meta)
|
|
19
|
-
|
|
18
|
+
tools.process_with_distiller(body: body, metadata: @meta) # adds to @meta
|
|
20
19
|
|
|
21
|
-
jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
|
|
22
|
-
parse_rdf(body: jsonld, content_type: 'application/ld+json')
|
|
20
|
+
jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri, metadata: @meta)
|
|
21
|
+
parse_rdf(body: jsonld, content_type: 'application/ld+json', metadata: metadata)
|
|
23
22
|
@meta.merge_hash(microdata)
|
|
24
23
|
@meta.merge_hash(microformat)
|
|
25
24
|
@meta.merge_hash(opengraph)
|
|
26
|
-
parse_rdf(body: rdfa, content_type: 'application/ld+json')
|
|
25
|
+
parse_rdf(body: rdfa, content_type: 'application/ld+json', metadata: @meta)
|
|
27
26
|
end
|
|
28
27
|
|
|
29
28
|
def process_xml(body:, metadata:)
|
data/lib/warnings.json
CHANGED
|
@@ -116,6 +116,18 @@
|
|
|
116
116
|
{"Validator": "https://jsononline.net/json-validator"}],
|
|
117
117
|
"severity": "WARN"
|
|
118
118
|
},
|
|
119
|
+
"022": {
|
|
120
|
+
"message": "Mismatch between the Content-type header and the content of the returned document.",
|
|
121
|
+
"linkout": [],
|
|
122
|
+
"severity": "WARN"
|
|
123
|
+
},
|
|
124
|
+
"023": {
|
|
125
|
+
"message": "Returned content-type is not compatible with any requested content-type, yet a HTTP 406 error was not returned",
|
|
126
|
+
"linkout": [],
|
|
127
|
+
"severity": "WARN"
|
|
128
|
+
},
|
|
129
|
+
|
|
130
|
+
|
|
119
131
|
"600": {
|
|
120
132
|
"message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
|
|
121
133
|
"linkout": [],
|
data/lib/web_utils.rb
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module HarvesterTools
|
|
2
2
|
|
|
3
3
|
class WebUtils
|
|
4
|
-
def self.fspfetch(url:, headers:
|
|
4
|
+
def self.fspfetch(url:, headers: ACCEPT_STAR_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
|
|
5
5
|
warn 'In fetch routine now. '
|
|
6
6
|
|
|
7
7
|
begin
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: fsp_harvester
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.21
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Mark Wilkinson
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2022-08-
|
|
11
|
+
date: 2022-08-18 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: json
|