fsp_harvester 0.1.20 → 0.1.21
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec_status +60 -57
- data/Gemfile.lock +1 -1
- data/lib/constants.rb +7 -4
- data/lib/external_tools.rb +58 -42
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/harvester_brute.rb +51 -15
- data/lib/harvester_utils.rb +4 -4
- data/lib/metadata_harvester.rb +63 -15
- data/lib/metadata_object.rb +4 -2
- data/lib/metadata_parser.rb +5 -6
- data/lib/warnings.json +12 -0
- data/lib/web_utils.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8498c33db9c350fec8ea4e734b31087f798a4f433f211115c69ded468dbcdb12
|
4
|
+
data.tar.gz: f3f408b24575f4f310c6f00ac0d42c3106f68fcd43199d3dbb73d8e4deb403fe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 263549dc8b8bf2fe8a4bc50289092ad2e55d9bbc05cabc509637786b4923948345220f0ee7a04fa5db497f670d9ab79d4e35e05648f09f189103ae869040baad
|
7
|
+
data.tar.gz: 6b35a320400ff37561ddf2cff506a3a4f385cd31933e4df8ceb4d436a4f97974e7122ab96962b8ce3cfc183144fcf41592fa417abcdb609fd44080061fc5e1a3
|
data/.rspec_status
CHANGED
@@ -1,57 +1,60 @@
|
|
1
|
-
example_id | status | run_time
|
2
|
-
---------------------------------- | ------ |
|
3
|
-
./spec/cite-as_spec.rb[1:1:1] | passed | 1.
|
4
|
-
./spec/cite-as_spec.rb[1:1:2] | passed | 1.
|
5
|
-
./spec/cite-as_spec.rb[1:1:3] | passed | 1.
|
6
|
-
./spec/cite-as_spec.rb[1:1:4] | passed | 1.
|
7
|
-
./spec/cite-as_spec.rb[1:1:5] | passed | 2.
|
8
|
-
./spec/cite-as_spec.rb[1:1:6] | passed | 2.
|
9
|
-
./spec/cite-as_spec.rb[1:1:7] | passed | 2.96 seconds
|
10
|
-
./spec/cite-as_spec.rb[1:1:8] | passed | 2.28 seconds
|
11
|
-
./spec/cite-as_spec.rb[1:1:9] | passed |
|
12
|
-
./spec/cite-as_spec.rb[1:1:10] | passed | 2.
|
13
|
-
./spec/cite-as_spec.rb[1:1:11] | passed |
|
14
|
-
./spec/cite-as_spec.rb[1:1:12] | passed |
|
15
|
-
./spec/cite-as_spec.rb[1:1:13] | passed |
|
16
|
-
./spec/cite-as_spec.rb[1:1:14] | passed | 2.
|
17
|
-
./spec/cite-as_spec.rb[1:1:15] | passed | 1.19 seconds
|
18
|
-
./spec/cite-as_spec.rb[1:1:16] | passed | 1.
|
19
|
-
./spec/cite-as_spec.rb[1:1:17] | passed | 1.
|
20
|
-
./spec/cite-as_spec.rb[1:1:18] | passed | 1.
|
21
|
-
./spec/cite-as_spec.rb[1:1:19] | passed | 1.
|
22
|
-
./spec/cite-as_spec.rb[1:1:20] | passed | 1
|
23
|
-
./spec/cite-as_spec.rb[1:1:21] | passed | 2.
|
24
|
-
./spec/cite-as_spec.rb[1:1:22] | passed | 1.
|
25
|
-
./spec/cite-as_spec.rb[1:1:23] | passed | 1.
|
26
|
-
./spec/cite-as_spec.rb[1:1:24] | failed | 1.
|
27
|
-
./spec/cite-as_spec.rb[1:1:25] | passed | 0.
|
28
|
-
./spec/describedby_spec.rb[1:1:1] | passed | 3.
|
29
|
-
./spec/describedby_spec.rb[1:1:2] | passed | 1.
|
30
|
-
./spec/describedby_spec.rb[1:1:3] | passed | 1.
|
31
|
-
./spec/describedby_spec.rb[1:1:4] | passed | 1.
|
32
|
-
./spec/describedby_spec.rb[1:1:5] | passed | 1.
|
33
|
-
./spec/describedby_spec.rb[1:1:6] | passed | 1.
|
34
|
-
./spec/describedby_spec.rb[1:1:7] | passed |
|
35
|
-
./spec/describedby_spec.rb[1:1:8] | passed | 2.
|
36
|
-
./spec/describedby_spec.rb[1:1:9] | passed |
|
37
|
-
./spec/describedby_spec.rb[1:1:10] | passed | 2.
|
38
|
-
./spec/describedby_spec.rb[1:1:11] | passed |
|
39
|
-
./spec/describedby_spec.rb[1:1:12] | passed |
|
40
|
-
./spec/describedby_spec.rb[1:1:13] | passed | 1.
|
41
|
-
./spec/describedby_spec.rb[1:1:14] | passed | 2.
|
42
|
-
./spec/describedby_spec.rb[1:1:15] | passed | 2.
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] | passed | 2.
|
45
|
-
./spec/fsp_harvester_spec.rb[1:3] | passed |
|
46
|
-
./spec/fsp_harvester_spec.rb[1:4] | passed | 2.
|
47
|
-
./spec/
|
48
|
-
./spec/
|
49
|
-
./spec/
|
50
|
-
./spec/item_spec.rb[1:1:
|
51
|
-
./spec/item_spec.rb[1:1:
|
52
|
-
./spec/item_spec.rb[1:1:
|
53
|
-
./spec/item_spec.rb[1:1:
|
54
|
-
./spec/item_spec.rb[1:1:
|
55
|
-
./spec/
|
56
|
-
./spec/
|
57
|
-
./spec/
|
1
|
+
example_id | status | run_time |
|
2
|
+
---------------------------------- | ------ | ---------------------- |
|
3
|
+
./spec/cite-as_spec.rb[1:1:1] | passed | 1.77 seconds |
|
4
|
+
./spec/cite-as_spec.rb[1:1:2] | passed | 1.22 seconds |
|
5
|
+
./spec/cite-as_spec.rb[1:1:3] | passed | 1.09 seconds |
|
6
|
+
./spec/cite-as_spec.rb[1:1:4] | passed | 1.89 seconds |
|
7
|
+
./spec/cite-as_spec.rb[1:1:5] | passed | 2.95 seconds |
|
8
|
+
./spec/cite-as_spec.rb[1:1:6] | passed | 2.14 seconds |
|
9
|
+
./spec/cite-as_spec.rb[1:1:7] | passed | 2.96 seconds |
|
10
|
+
./spec/cite-as_spec.rb[1:1:8] | passed | 2.28 seconds |
|
11
|
+
./spec/cite-as_spec.rb[1:1:9] | passed | 3.4 seconds |
|
12
|
+
./spec/cite-as_spec.rb[1:1:10] | passed | 2.21 seconds |
|
13
|
+
./spec/cite-as_spec.rb[1:1:11] | passed | 2.82 seconds |
|
14
|
+
./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
|
15
|
+
./spec/cite-as_spec.rb[1:1:13] | passed | 3.36 seconds |
|
16
|
+
./spec/cite-as_spec.rb[1:1:14] | passed | 2.19 seconds |
|
17
|
+
./spec/cite-as_spec.rb[1:1:15] | passed | 1.19 seconds |
|
18
|
+
./spec/cite-as_spec.rb[1:1:16] | passed | 1.23 seconds |
|
19
|
+
./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
|
20
|
+
./spec/cite-as_spec.rb[1:1:18] | passed | 1.28 seconds |
|
21
|
+
./spec/cite-as_spec.rb[1:1:19] | passed | 1.94 seconds |
|
22
|
+
./spec/cite-as_spec.rb[1:1:20] | passed | 2.1 seconds |
|
23
|
+
./spec/cite-as_spec.rb[1:1:21] | passed | 2.23 seconds |
|
24
|
+
./spec/cite-as_spec.rb[1:1:22] | passed | 1.17 seconds |
|
25
|
+
./spec/cite-as_spec.rb[1:1:23] | passed | 1.13 seconds |
|
26
|
+
./spec/cite-as_spec.rb[1:1:24] | failed | 1.24 seconds |
|
27
|
+
./spec/cite-as_spec.rb[1:1:25] | passed | 0.49678 seconds |
|
28
|
+
./spec/describedby_spec.rb[1:1:1] | passed | 3.18 seconds |
|
29
|
+
./spec/describedby_spec.rb[1:1:2] | passed | 1.34 seconds |
|
30
|
+
./spec/describedby_spec.rb[1:1:3] | passed | 1.2 seconds |
|
31
|
+
./spec/describedby_spec.rb[1:1:4] | passed | 1.14 seconds |
|
32
|
+
./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
|
33
|
+
./spec/describedby_spec.rb[1:1:6] | passed | 1.04 seconds |
|
34
|
+
./spec/describedby_spec.rb[1:1:7] | passed | 0.9844 seconds |
|
35
|
+
./spec/describedby_spec.rb[1:1:8] | passed | 2.07 seconds |
|
36
|
+
./spec/describedby_spec.rb[1:1:9] | passed | 2.16 seconds |
|
37
|
+
./spec/describedby_spec.rb[1:1:10] | passed | 2.36 seconds |
|
38
|
+
./spec/describedby_spec.rb[1:1:11] | passed | 2.91 seconds |
|
39
|
+
./spec/describedby_spec.rb[1:1:12] | passed | 2.93 seconds |
|
40
|
+
./spec/describedby_spec.rb[1:1:13] | passed | 1.79 seconds |
|
41
|
+
./spec/describedby_spec.rb[1:1:14] | passed | 2.5 seconds |
|
42
|
+
./spec/describedby_spec.rb[1:1:15] | passed | 2.24 seconds |
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00102 seconds |
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | passed | 2.5 seconds |
|
45
|
+
./spec/fsp_harvester_spec.rb[1:3] | passed | 29.49 seconds |
|
46
|
+
./spec/fsp_harvester_spec.rb[1:4] | passed | 2.53 seconds |
|
47
|
+
./spec/fsp_harvester_spec.rb[1:5] | passed | 2.65 seconds |
|
48
|
+
./spec/fsp_harvester_spec.rb[1:6] | failed | 1 minute 24.1 seconds |
|
49
|
+
./spec/fsp_harvester_spec.rb[1:7] | passed | 2 minutes 24.3 seconds |
|
50
|
+
./spec/item_spec.rb[1:1:1] | passed | 2.71 seconds |
|
51
|
+
./spec/item_spec.rb[1:1:2] | passed | 2.98 seconds |
|
52
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
|
53
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.81 seconds |
|
54
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.2 seconds |
|
55
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.25 seconds |
|
56
|
+
./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
|
57
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.62818 seconds |
|
58
|
+
./spec/type_spec.rb[1:1:1] | passed | 1.33 seconds |
|
59
|
+
./spec/type_spec.rb[1:1:2] | passed | 1.22 seconds |
|
60
|
+
./spec/type_spec.rb[1:1:3] | passed | 1.61 seconds |
|
data/Gemfile.lock
CHANGED
data/lib/constants.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
module FspHarvester
|
2
|
+
|
3
|
+
ACCEPT_LD_HEADER = { 'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
|
2
4
|
|
3
5
|
ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
|
4
6
|
|
@@ -77,6 +79,7 @@ GUID_TYPES = {
|
|
77
79
|
'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
|
78
80
|
'ark' => Regexp.new(%r{^ark:/[^\s]+$})
|
79
81
|
}
|
82
|
+
end
|
80
83
|
|
81
84
|
# CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
|
82
85
|
# extruct = CONFIG.dig(:extruct, :command)
|
@@ -88,7 +91,7 @@ when /[&|;`$\s]/
|
|
88
91
|
when /echo/i
|
89
92
|
abort 'The Extruct command appears to be subject to command injection. I will not continue'
|
90
93
|
end
|
91
|
-
EXTRUCT_COMMAND = extruct
|
94
|
+
FspHarvester::EXTRUCT_COMMAND = extruct
|
92
95
|
|
93
96
|
# rdf_command = CONFIG.dig(:rdf, :command)
|
94
97
|
rdf_command = ENV['RDF_COMMAND'] || 'rdf'
|
@@ -101,8 +104,8 @@ when /echo/i
|
|
101
104
|
when !(/rdf$/ =~ $_)
|
102
105
|
abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
|
103
106
|
end
|
104
|
-
RDF_COMMAND = rdf_command
|
107
|
+
FspHarvester::RDF_COMMAND = rdf_command
|
105
108
|
|
106
109
|
# tika_command = CONFIG.dig(:tika, :command)
|
107
110
|
tika_command = ENV['TIKA_COMMAND'] || 'http://localhost:9998/meta'
|
108
|
-
TIKA_COMMAND = tika_command
|
111
|
+
FspHarvester::TIKA_COMMAND = tika_command
|
data/lib/external_tools.rb
CHANGED
@@ -5,18 +5,21 @@ module HarvesterTools
|
|
5
5
|
end
|
6
6
|
|
7
7
|
class ExternalTools
|
8
|
+
attr_accessor :distillerknown, :extructknown
|
8
9
|
|
9
10
|
def initialize(metadata: HarvesterTools::MetadataObject.new)
|
11
|
+
@distillerknown = {}
|
12
|
+
@extructknown = {}
|
10
13
|
@meta = metadata
|
11
14
|
end
|
12
15
|
|
13
|
-
def process_with_distiller(body:)
|
16
|
+
def process_with_distiller(body:, metadata:)
|
17
|
+
meta = metadata
|
14
18
|
bhash = Digest::SHA256.hexdigest(body)
|
15
|
-
if
|
16
|
-
|
17
|
-
#parse_rdf(body: body)
|
19
|
+
if distillerknown[bhash]
|
20
|
+
meta.comments << "INFO: data is already parsed by distiller.\n"
|
18
21
|
else
|
19
|
-
|
22
|
+
meta.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
|
20
23
|
file = Tempfile.new('foo', encoding: 'UTF-8')
|
21
24
|
body = body.force_encoding('UTF-8')
|
22
25
|
body.scrub!
|
@@ -24,60 +27,73 @@ module HarvesterTools
|
|
24
27
|
file.write(body)
|
25
28
|
file.rewind
|
26
29
|
|
27
|
-
|
28
|
-
command = "LANG=en_US.UTF-8 #{
|
30
|
+
meta.comments << "INFO: The message body is being examined by Distiller\n"
|
31
|
+
command = "LANG=en_US.UTF-8 #{FspHarvester::RDF_COMMAND} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
29
32
|
warn "distiller command: #{command}"
|
30
33
|
result, _stderr, _status = Open3.capture3(command)
|
31
34
|
warn ''
|
32
|
-
warn "distiller errors: #{
|
35
|
+
warn "distiller errors: #{_stderr}" if _stderr
|
33
36
|
file.close
|
34
37
|
file.unlink
|
35
38
|
|
36
39
|
result = result.force_encoding('UTF-8')
|
37
|
-
warn "DIST RESULT: #{result}"
|
40
|
+
# warn "DIST RESULT: #{result}"
|
38
41
|
if result !~ /@context/i # failure returns nil
|
39
|
-
|
40
|
-
|
41
|
-
result =
|
42
|
+
meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
|
43
|
+
meta.add_warning(['018', '', ''])
|
44
|
+
result = '{}'
|
42
45
|
else
|
43
|
-
|
46
|
+
meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
|
44
47
|
end
|
45
|
-
|
48
|
+
distillerknown[bhash] = true
|
46
49
|
end
|
47
50
|
result
|
48
51
|
end
|
49
52
|
|
50
|
-
def
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
microdata = Hash.new
|
58
|
-
microformat = Hash.new
|
59
|
-
opengraph = Hash.new
|
60
|
-
rdfa = Hash.new
|
53
|
+
def process_with_extruct(uri:, metadata:)
|
54
|
+
bhash = Digest::SHA256.hexdigest(uri)
|
55
|
+
jsonld = '{}'
|
56
|
+
microdata = {}
|
57
|
+
microformat = {}
|
58
|
+
opengraph = {}
|
59
|
+
rdfa = '{}'
|
61
60
|
|
62
|
-
if
|
63
|
-
|
64
|
-
@meta.add_warning(['019', '', ''])
|
65
|
-
if result.to_s.match(/(ValueError:.*?)\n/)
|
66
|
-
@meta.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
|
67
|
-
@meta.add_warning(['019', '', ''])
|
68
|
-
end
|
69
|
-
elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
|
70
|
-
json = JSON.parse result
|
71
|
-
@meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
|
72
|
-
jsonld = json['json-ld'].to_json if json['json-ld'].any?
|
73
|
-
microdata = json['microdata'].first if json['microdata'].any
|
74
|
-
microformat = json['microformat'].first if json['microformat'].any?
|
75
|
-
opengraph = json['opengraph'].first if json['opengraph'].any?
|
76
|
-
rdfa = json['rdfa'].to_json if json['rdfa'].any?
|
77
|
-
# @meta.merge_hash(json.first) if json.first.is_a? Hash
|
61
|
+
if extructknown[bhash]
|
62
|
+
metadata.comments << "INFO: data is already parsed by extruct.\n"
|
78
63
|
else
|
79
|
-
|
64
|
+
metadata.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
|
65
|
+
warn 'begin open3'
|
66
|
+
stdout, stderr, status = Open3.capture3(FspHarvester::EXTRUCT_COMMAND + ' ' + uri)
|
67
|
+
warn "open3 status: #{status} #{stdout}"
|
68
|
+
result = stderr # absurd that the output comes over stderr! LOL!
|
69
|
+
|
70
|
+
if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
|
71
|
+
metadata.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
|
72
|
+
metadata.add_warning(['019', '', ''])
|
73
|
+
if result.to_s.match(/(ValueError:.*?)\n/)
|
74
|
+
metadata.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
|
75
|
+
metadata.add_warning(['019', '', ''])
|
76
|
+
end
|
77
|
+
elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
|
78
|
+
begin
|
79
|
+
json = JSON.parse result
|
80
|
+
rescue StandardError
|
81
|
+
metadata.comments << "WARN: extruct threw an error when attempting to parse the extruct command return value from processing #{uri}.\n"
|
82
|
+
metadata.add_warning(['019', '', ''])
|
83
|
+
return [jsonld, microdata, microformat, opengraph, rdfa]
|
84
|
+
end
|
85
|
+
metadata.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
|
86
|
+
jsonld = json['json-ld'].to_json if json['json-ld'].any?
|
87
|
+
microdata = json['microdata'].first if json['microdata'].any?
|
88
|
+
microformat = json['microformat'].first if json['microformat'].any?
|
89
|
+
opengraph = json['opengraph'].first if json['opengraph'].any?
|
90
|
+
rdfa = json['rdfa'].to_json if json['rdfa'].any?
|
91
|
+
# @meta.merge_hash(json.first) if json.first.is_a? Hash
|
92
|
+
else
|
93
|
+
@meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
|
94
|
+
end
|
80
95
|
end
|
96
|
+
extructknown[bhash] = true
|
81
97
|
[jsonld, microdata, microformat, opengraph, rdfa]
|
82
98
|
end
|
83
99
|
end
|
data/lib/harvester_brute.rb
CHANGED
@@ -3,45 +3,81 @@ module HarvesterTools
|
|
3
3
|
end
|
4
4
|
|
5
5
|
class BruteForce
|
6
|
-
def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
|
6
|
+
def self.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new)
|
7
7
|
type, url = HarvesterTools::Utils.convertToURL(guid: guid)
|
8
8
|
return false unless type
|
9
9
|
|
10
|
-
|
10
|
+
# TODO: follow rel=alternate headers, if they are in LD or Hash format
|
11
|
+
do_content_negotiation(url: url, metadata: metadata, links: links)
|
11
12
|
metadata
|
12
13
|
end
|
13
14
|
|
14
|
-
def self.do_content_negotiation(url:, metadata:)
|
15
|
-
|
15
|
+
def self.do_content_negotiation(url:, metadata:, links: [])
|
16
|
+
warn "\n\nINFO: entering content negotiation of #{url}\n\n"
|
17
|
+
metadata.comments << "INFO: entering content negotiation of #{url}.\n"
|
18
|
+
|
19
|
+
|
20
|
+
response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER)
|
16
21
|
if response
|
17
22
|
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
|
18
23
|
end
|
19
|
-
response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
|
24
|
+
response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_STAR_HEADER)
|
20
25
|
if response
|
21
26
|
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
|
22
|
-
response = resolve_url_brute(url: response.request.url, metadata: metadata, headers:
|
27
|
+
response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER) # now do content negotiation on the landing page
|
23
28
|
if response
|
24
29
|
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
|
25
30
|
end
|
26
31
|
end
|
32
|
+
|
33
|
+
process_alternates(links: links, metadata: metadata)
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.process_alternates(links: [], metadata:)
|
37
|
+
warn "\n\nINFO: entering content negotiation on link alternates\n\n"
|
38
|
+
metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
|
39
|
+
# process "alternate" links
|
40
|
+
links.each do |link|
|
41
|
+
next unless link.relation == "alternate"
|
42
|
+
|
43
|
+
url = link.href
|
44
|
+
headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
|
45
|
+
headers ||= FspHarvester::ACCEPT_STAR_HEADER
|
46
|
+
warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
|
47
|
+
metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
|
48
|
+
response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
|
49
|
+
if response
|
50
|
+
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
27
54
|
end
|
28
55
|
|
56
|
+
|
29
57
|
def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
58
|
+
|
59
|
+
cache_key = Digest::MD5.hexdigest url + headers.to_s
|
60
|
+
if metadata.url_header_hash[cache_key]
|
61
|
+
warn "Already processed #{url} - moving on"
|
62
|
+
metadata.comments << "INFO: Already processed #{url} - moving on.\n"
|
63
|
+
return false
|
64
|
+
end
|
65
|
+
|
66
|
+
metadata.guidtype = 'uri' if metadata.guidtype.nil?
|
67
|
+
warn "\n\n BRUTE FETCHING #{url} \nwith headers\n #{headers}\n\n"
|
68
|
+
response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: metadata)
|
34
69
|
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
35
70
|
|
36
71
|
unless response
|
37
|
-
|
38
|
-
|
39
|
-
|
72
|
+
metadata.add_warning(['001', url, headers])
|
73
|
+
metadata.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
|
74
|
+
metadata.full_response << [url, "No response"]
|
40
75
|
false
|
41
76
|
end
|
42
77
|
|
43
|
-
|
44
|
-
|
78
|
+
metadata.comments << "INFO: following redirection using this header led to the following URL: #{metadata.all_uris.last}. Using the output from this URL for the next few tests..."
|
79
|
+
metadata.full_response << [url, response.body]
|
80
|
+
metadata.url_header_hash[cache_key] = true
|
45
81
|
response
|
46
82
|
end
|
47
83
|
end
|
data/lib/harvester_utils.rb
CHANGED
@@ -20,7 +20,7 @@ module HarvesterTools
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def self.convertToURL(guid:)
|
23
|
-
GUID_TYPES.each do |k, regex|
|
23
|
+
FspHarvester::GUID_TYPES.each do |k, regex|
|
24
24
|
if k == 'inchi' and regex.match(guid)
|
25
25
|
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
26
26
|
elsif k == 'handle1' and regex.match(guid)
|
@@ -39,13 +39,13 @@ module HarvesterTools
|
|
39
39
|
end
|
40
40
|
|
41
41
|
def self.typeit(guid:)
|
42
|
-
GUID_TYPES.each do |type, regex|
|
42
|
+
FspHarvester::GUID_TYPES.each do |type, regex|
|
43
43
|
return type if regex.match(guid)
|
44
44
|
end
|
45
45
|
false
|
46
46
|
end
|
47
47
|
|
48
|
-
def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
|
48
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: FspHarvester::ACCEPT_STAR_HEADER)
|
49
49
|
@meta = metadata
|
50
50
|
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
51
51
|
warn "\n\n FETCHING #{url} #{header}\n\n"
|
@@ -59,7 +59,7 @@ module HarvesterTools
|
|
59
59
|
end
|
60
60
|
|
61
61
|
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
62
|
-
@meta.full_response << response.body
|
62
|
+
@meta.full_response << [url, response.body]
|
63
63
|
|
64
64
|
links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
|
65
65
|
links
|
data/lib/metadata_harvester.rb
CHANGED
@@ -13,7 +13,7 @@ module HarvesterTools
|
|
13
13
|
|
14
14
|
hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
|
15
15
|
describedby.each do |link|
|
16
|
-
accepttype = ACCEPT_STAR_HEADER
|
16
|
+
accepttype = FspHarvester::ACCEPT_STAR_HEADER
|
17
17
|
accept = link.respond_to?('type') ? link.type : nil
|
18
18
|
accepttype = { 'Accept' => accept } if accept
|
19
19
|
|
@@ -38,9 +38,14 @@ module HarvesterTools
|
|
38
38
|
abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
|
39
39
|
unless abbreviation
|
40
40
|
@meta.add_warning(['017', response.request.url, ''])
|
41
|
-
@meta.comments << "WARN:
|
41
|
+
@meta.comments << "WARN: format returned from #{response.request.url} is not recognized. Moving on.\n"
|
42
42
|
return
|
43
43
|
end
|
44
|
+
request_content_types = response.request.headers["Accept"].split(/,\s*/)
|
45
|
+
unless (request_content_types.include? content_type) and !(request_content_types.include? "*/*") and (response.code != 406)
|
46
|
+
@meta.add_warning(['023', response.request.url, ''])
|
47
|
+
@meta.comments << "WARN: format returned from #{response.request.url} does not match request type. This should result in a 406 error, but instead was accepted as a 200.\n"
|
48
|
+
end
|
44
49
|
process_according_to_type(body: response.body, uri: response.request.url, metadata: @meta,
|
45
50
|
abbreviation: abbreviation, content_type: content_type)
|
46
51
|
end
|
@@ -65,7 +70,7 @@ module HarvesterTools
|
|
65
70
|
end
|
66
71
|
end
|
67
72
|
|
68
|
-
def self.attempt_to_resolve(link:, headers: ACCEPT_STAR_HEADER)
|
73
|
+
def self.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER)
|
69
74
|
@meta.comments << "INFO: link #{link.href} being processed"
|
70
75
|
if link.respond_to? 'type'
|
71
76
|
header = { 'Accept' => link.type }
|
@@ -86,23 +91,37 @@ module HarvesterTools
|
|
86
91
|
abbreviation = nil
|
87
92
|
content_type = nil
|
88
93
|
@meta.comments << 'INFO: Testing metadata format for html, xml, and linked data formats\n'
|
94
|
+
claimed_type = headers[:content_type]
|
95
|
+
claimed_type.gsub!(/\s*;.*/, '')
|
89
96
|
if body =~ /^\s*<\?xml/
|
90
|
-
if body =~ /<HTML/i
|
97
|
+
if body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
|
91
98
|
abbreviation = 'html'
|
92
|
-
content_type =
|
99
|
+
content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
|
100
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
|
101
|
+
content_type |= 'text/html'
|
93
102
|
@meta.comments << 'INFO: appears to be HTML\n'
|
94
103
|
elsif body =~ /<rdf:RDF/i
|
95
104
|
abbreviation = 'rdfxml'
|
96
|
-
content_type =
|
105
|
+
content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
|
106
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
|
107
|
+
content_type |= 'application/rdf+xml'
|
97
108
|
@meta.comments << 'INFO: appears to be RDF-XML\n'
|
98
109
|
else
|
99
110
|
abbreviation = 'xml'
|
100
|
-
content_type =
|
111
|
+
content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
|
112
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
|
113
|
+
content_type |= 'application/xml'
|
101
114
|
@meta.comments << 'INFO: appears to be XML\n'
|
102
115
|
end
|
116
|
+
elsif body[0..1000] =~ /<HTML/i # take a sample, it should appear quite early (it will appear in other places in e.g. tutorial documents)
|
117
|
+
abbreviation = 'html'
|
118
|
+
content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
|
119
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ]) unless content_type
|
120
|
+
content_type ||= 'text/html'
|
121
|
+
@meta.comments << 'INFO: appears to be HTML\n'
|
103
122
|
else
|
104
|
-
abbreviation, content_type = check_ld(body: body, claimed_type:
|
105
|
-
abbreviation, content_type = check_json(body: body) unless abbreviation
|
123
|
+
abbreviation, content_type = check_ld(body: body, claimed_type: claimed_type)
|
124
|
+
abbreviation, content_type = check_json(body: body) unless abbreviation # don't test if LD already found!
|
106
125
|
end
|
107
126
|
|
108
127
|
unless content_type
|
@@ -112,18 +131,46 @@ module HarvesterTools
|
|
112
131
|
[abbreviation, content_type]
|
113
132
|
end
|
114
133
|
|
134
|
+
def self.validate_claimed_type(abbreviation:, claimed_type:)
|
135
|
+
warn "\n\nclaimed type #{claimed_type}\nabbreviation #{abbreviation}\n\n"
|
136
|
+
claimed_type.gsub!(/\s*;.*/, '')
|
137
|
+
|
138
|
+
case abbreviation
|
139
|
+
when 'html'
|
140
|
+
return claimed_type if FspHarvester::HTML_FORMATS['html'].include? claimed_type
|
141
|
+
when 'xml'
|
142
|
+
return claimed_type if FspHarvester::XML_FORMATS['xml'].include? claimed_type
|
143
|
+
when 'json'
|
144
|
+
return claimed_type if FspHarvester::JSON_FORMATS['json'].include? claimed_type
|
145
|
+
when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
|
146
|
+
return claimed_type if FspHarvester::RDF_FORMATS.values.flatten.include? claimed_type
|
147
|
+
when 'specialist'
|
148
|
+
warn 'no specialized parsers so far'
|
149
|
+
end
|
150
|
+
return false
|
151
|
+
end
|
152
|
+
|
115
153
|
def self.check_ld(body:, claimed_type:)
|
116
154
|
detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
|
117
|
-
unless detected_type
|
155
|
+
unless detected_type # see if distiller can detect a type
|
118
156
|
detected_type = RDF::Format.for({ sample: body[0..5000] })
|
119
157
|
@meta.comments << "INFO: Auto-detected type #{detected_type}\n"
|
120
158
|
end
|
159
|
+
# at this point, detected_type is something like RDF::Turtle::Format (or nil). This will return a content-type
|
121
160
|
contenttype = ''
|
122
161
|
abbreviation = ''
|
123
162
|
if detected_type
|
124
|
-
|
125
|
-
|
126
|
-
|
163
|
+
detectedcontenttypes = detected_type.content_type # comes back as array of [application/x, application/y]
|
164
|
+
unless detectedcontenttypes.include? claimed_type
|
165
|
+
@meta.add_warning(['022', @meta.all_uris.last, "" ])
|
166
|
+
contenttype = detected_type.content_type.first # just pick one arbitrarily, since it doesn't match thedeclared type anyway
|
167
|
+
abbreviation = abbreviate_type(contenttype: contenttype)
|
168
|
+
@meta.comments << "INFO: using content-type #{contenttype} even though there was a mismatch.\n"
|
169
|
+
else
|
170
|
+
contenttype = claimed_type # just pick one arbitrarily, since it doesn't match thedeclared type anyway
|
171
|
+
abbreviation = abbreviate_type(contenttype: contenttype)
|
172
|
+
@meta.comments << "INFO: using content-type #{contenttype}.\n"
|
173
|
+
end
|
127
174
|
else
|
128
175
|
@meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
|
129
176
|
end
|
@@ -161,13 +208,14 @@ module HarvesterTools
|
|
161
208
|
abbreviation = 'json'
|
162
209
|
else
|
163
210
|
@meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
|
211
|
+
return [nil, nil]
|
164
212
|
end
|
165
|
-
[abbreviation, 'application/
|
213
|
+
[abbreviation, 'application/json']
|
166
214
|
end
|
167
215
|
|
168
216
|
def self.abbreviate_type(contenttype:)
|
169
217
|
foundtype = nil
|
170
|
-
RDF_FORMATS.merge(XML_FORMATS).merge(HTML_FORMATS).merge(JSON_FORMATS).each do |type, vals|
|
218
|
+
FspHarvester::RDF_FORMATS.merge(FspHarvester::XML_FORMATS).merge(FspHarvester::HTML_FORMATS).merge(FspHarvester::JSON_FORMATS).each do |type, vals|
|
171
219
|
warn "\n\ntype #{type}\nvals #{vals}\n\n"
|
172
220
|
@meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
|
173
221
|
next unless vals.include? contenttype
|
data/lib/metadata_object.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module HarvesterTools
|
2
2
|
class MetadataObject
|
3
|
-
attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
3
|
+
attr_accessor :id, :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :tested_guid, :score, :version, :date, :url_header_hash # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
4
4
|
|
5
|
-
def initialize(id: "unidentified_metadata") # get a name from the "new" call, or set a default
|
5
|
+
def initialize(id: "urn:local:unidentified_metadata") # get a name from the "new" call, or set a default
|
6
6
|
@id = id
|
7
7
|
@hash = {}
|
8
8
|
@graph = RDF::Graph.new
|
@@ -16,6 +16,7 @@ module HarvesterTools
|
|
16
16
|
@score = 0
|
17
17
|
@version = '0.0'
|
18
18
|
@date = Time.now.strftime('%Y-%m-%dT%H:%M:%S.%L%z')
|
19
|
+
@url_header_hash = Hash.new(false) # the combinarion of URL and the accept headers, sha1 hashed, for quick lookup if it has already been processed
|
19
20
|
w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
|
20
21
|
#@warn = File.read("./lib/warnings.json")
|
21
22
|
@warn = JSON.parse(w)
|
@@ -37,6 +38,7 @@ module HarvesterTools
|
|
37
38
|
|
38
39
|
def add_warning(warning)
|
39
40
|
id = warning[0]
|
41
|
+
return unless @warn[id] # if there's a mismatch between code and the warnings in github
|
40
42
|
url = warning[1]
|
41
43
|
headers = warning[2]
|
42
44
|
message = @warn[id]['message']
|
data/lib/metadata_parser.rb
CHANGED
@@ -13,17 +13,16 @@ module HarvesterTools
|
|
13
13
|
@meta = metadata_object
|
14
14
|
end
|
15
15
|
|
16
|
-
def process_html(body:, uri:, metadata:)
|
17
|
-
@meta = metadata
|
16
|
+
def process_html(body:, uri:, metadata: @meta)
|
18
17
|
tools = HarvesterTools::ExternalTools.new(metadata: @meta)
|
19
|
-
|
18
|
+
tools.process_with_distiller(body: body, metadata: @meta) # adds to @meta
|
20
19
|
|
21
|
-
jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri)
|
22
|
-
parse_rdf(body: jsonld, content_type: 'application/ld+json')
|
20
|
+
jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri, metadata: @meta)
|
21
|
+
parse_rdf(body: jsonld, content_type: 'application/ld+json', metadata: metadata)
|
23
22
|
@meta.merge_hash(microdata)
|
24
23
|
@meta.merge_hash(microformat)
|
25
24
|
@meta.merge_hash(opengraph)
|
26
|
-
parse_rdf(body: rdfa, content_type: 'application/ld+json')
|
25
|
+
parse_rdf(body: rdfa, content_type: 'application/ld+json', metadata: @meta)
|
27
26
|
end
|
28
27
|
|
29
28
|
def process_xml(body:, metadata:)
|
data/lib/warnings.json
CHANGED
@@ -116,6 +116,18 @@
|
|
116
116
|
{"Validator": "https://jsononline.net/json-validator"}],
|
117
117
|
"severity": "WARN"
|
118
118
|
},
|
119
|
+
"022": {
|
120
|
+
"message": "Mismatch between the Content-type header and the content of the returned document.",
|
121
|
+
"linkout": [],
|
122
|
+
"severity": "WARN"
|
123
|
+
},
|
124
|
+
"023": {
|
125
|
+
"message": "Returned content-type is not compatible with any requested content-type, yet a HTTP 406 error was not returned",
|
126
|
+
"linkout": [],
|
127
|
+
"severity": "WARN"
|
128
|
+
},
|
129
|
+
|
130
|
+
|
119
131
|
"600": {
|
120
132
|
"message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
|
121
133
|
"linkout": [],
|
data/lib/web_utils.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module HarvesterTools
|
2
2
|
|
3
3
|
class WebUtils
|
4
|
-
def self.fspfetch(url:, headers:
|
4
|
+
def self.fspfetch(url:, headers: ACCEPT_STAR_HEADER, method: :get, meta: HarvesterTools::MetadataObject.new)
|
5
5
|
warn 'In fetch routine now. '
|
6
6
|
|
7
7
|
begin
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fsp_harvester
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.21
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-08-
|
11
|
+
date: 2022-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|