fsp_harvester 0.1.22 → 0.1.23
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec_status +60 -60
- data/Gemfile.lock +1 -1
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/harvester_brute.rb +24 -1
- data/lib/metadata_parser.rb +27 -29
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 34f1b3296ceae51a5d5c5e2e17123fab158a3218bbb5abab80be04fbccbd21fa
|
4
|
+
data.tar.gz: 6f59acbe4999636685f612b09822ecf11e42234bfafdf9290112a83e2782b4d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ac498d2cf39d739a53251e7c9c338c3d19070df5950eaae8fb6068820a66590f6ed78d998c97498284e791e3bb84fcf45515d221d7a4d4ca9cf73b213d3c69f2
|
7
|
+
data.tar.gz: bb432a89bf2066063b96162bdfb0560d75d5d26fc041b49c532bdcca5aefb4c4250b6c37f62627eabb446440f34f202f66bf88f608190a27fe0c573757b00886
|
data/.rspec_status
CHANGED
@@ -1,60 +1,60 @@
|
|
1
|
-
example_id | status | run_time
|
2
|
-
---------------------------------- | ------ |
|
3
|
-
./spec/cite-as_spec.rb[1:1:1] | passed | 1.87 seconds
|
4
|
-
./spec/cite-as_spec.rb[1:1:2] | passed | 1.3 seconds
|
5
|
-
./spec/cite-as_spec.rb[1:1:3] | passed | 1.53 seconds
|
6
|
-
./spec/cite-as_spec.rb[1:1:4] | passed | 2.09 seconds
|
7
|
-
./spec/cite-as_spec.rb[1:1:5] | passed | 8.09 seconds
|
8
|
-
./spec/cite-as_spec.rb[1:1:6] | passed | 2.63 seconds
|
9
|
-
./spec/cite-as_spec.rb[1:1:7] | passed | 2.9 seconds
|
10
|
-
./spec/cite-as_spec.rb[1:1:8] | passed | 2.21 seconds
|
11
|
-
./spec/cite-as_spec.rb[1:1:9] | passed | 2.85 seconds
|
12
|
-
./spec/cite-as_spec.rb[1:1:10] | passed | 2.89 seconds
|
13
|
-
./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds
|
14
|
-
./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds
|
15
|
-
./spec/cite-as_spec.rb[1:1:13] | passed | 2.92 seconds
|
16
|
-
./spec/cite-as_spec.rb[1:1:14] | passed | 2.8 seconds
|
17
|
-
./spec/cite-as_spec.rb[1:1:15] | passed | 1.21 seconds
|
18
|
-
./spec/cite-as_spec.rb[1:1:16] | passed | 1.28 seconds
|
19
|
-
./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds
|
20
|
-
./spec/cite-as_spec.rb[1:1:18] | passed | 1.24 seconds
|
21
|
-
./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds
|
22
|
-
./spec/cite-as_spec.rb[1:1:20] | passed | 1.74 seconds
|
23
|
-
./spec/cite-as_spec.rb[1:1:21] | passed | 2.75 seconds
|
24
|
-
./spec/cite-as_spec.rb[1:1:22] | passed | 1.35 seconds
|
25
|
-
./spec/cite-as_spec.rb[1:1:23] | passed | 1.19 seconds
|
26
|
-
./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds
|
27
|
-
./spec/cite-as_spec.rb[1:1:25] | passed | 0.60282 seconds
|
28
|
-
./spec/describedby_spec.rb[1:1:1] | passed | 3.23 seconds
|
29
|
-
./spec/describedby_spec.rb[1:1:2] | passed | 1.43 seconds
|
30
|
-
./spec/describedby_spec.rb[1:1:3] | passed | 1.31 seconds
|
31
|
-
./spec/describedby_spec.rb[1:1:4] | passed | 1.37 seconds
|
32
|
-
./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds
|
33
|
-
./spec/describedby_spec.rb[1:1:6] | passed | 1.09 seconds
|
34
|
-
./spec/describedby_spec.rb[1:1:7] | passed | 1.03 seconds
|
35
|
-
./spec/describedby_spec.rb[1:1:8] | passed | 2.28 seconds
|
36
|
-
./spec/describedby_spec.rb[1:1:9] | passed | 1.84 seconds
|
37
|
-
./spec/describedby_spec.rb[1:1:10] | passed | 2.23 seconds
|
38
|
-
./spec/describedby_spec.rb[1:1:11] | passed | 2.97 seconds
|
39
|
-
./spec/describedby_spec.rb[1:1:12] | passed | 2.97 seconds
|
40
|
-
./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds
|
41
|
-
./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds
|
42
|
-
./spec/describedby_spec.rb[1:1:15] | passed | 2.53 seconds
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] | passed |
|
45
|
-
./spec/fsp_harvester_spec.rb[1:3] | passed |
|
46
|
-
./spec/fsp_harvester_spec.rb[1:4] | passed | 2.
|
47
|
-
./spec/fsp_harvester_spec.rb[1:5] | passed | 2.
|
48
|
-
./spec/fsp_harvester_spec.rb[1:6] |
|
49
|
-
./spec/fsp_harvester_spec.rb[1:7] | passed | 1 minute
|
50
|
-
./spec/item_spec.rb[1:1:1] | passed | 3.8 seconds
|
51
|
-
./spec/item_spec.rb[1:1:2] | passed | 3.3 seconds
|
52
|
-
./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds
|
53
|
-
./spec/item_spec.rb[1:1:4] | passed | 1.68 seconds
|
54
|
-
./spec/item_spec.rb[1:1:5] | passed | 2.44 seconds
|
55
|
-
./spec/item_spec.rb[1:1:6] | passed | 2.64 seconds
|
56
|
-
./spec/item_spec.rb[1:1:7] | passed | 3.02 seconds
|
57
|
-
./spec/item_spec.rb[1:1:8] | passed | 0.49403 seconds
|
58
|
-
./spec/type_spec.rb[1:1:1] | passed | 1.23 seconds
|
59
|
-
./spec/type_spec.rb[1:1:2] | passed | 1.25 seconds
|
60
|
-
./spec/type_spec.rb[1:1:3] | passed | 1.57 seconds
|
1
|
+
example_id | status | run_time |
|
2
|
+
---------------------------------- | ------ | --------------------- |
|
3
|
+
./spec/cite-as_spec.rb[1:1:1] | passed | 1.87 seconds |
|
4
|
+
./spec/cite-as_spec.rb[1:1:2] | passed | 1.3 seconds |
|
5
|
+
./spec/cite-as_spec.rb[1:1:3] | passed | 1.53 seconds |
|
6
|
+
./spec/cite-as_spec.rb[1:1:4] | passed | 2.09 seconds |
|
7
|
+
./spec/cite-as_spec.rb[1:1:5] | passed | 8.09 seconds |
|
8
|
+
./spec/cite-as_spec.rb[1:1:6] | passed | 2.63 seconds |
|
9
|
+
./spec/cite-as_spec.rb[1:1:7] | passed | 2.9 seconds |
|
10
|
+
./spec/cite-as_spec.rb[1:1:8] | passed | 2.21 seconds |
|
11
|
+
./spec/cite-as_spec.rb[1:1:9] | passed | 2.85 seconds |
|
12
|
+
./spec/cite-as_spec.rb[1:1:10] | passed | 2.89 seconds |
|
13
|
+
./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
|
14
|
+
./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
|
15
|
+
./spec/cite-as_spec.rb[1:1:13] | passed | 2.92 seconds |
|
16
|
+
./spec/cite-as_spec.rb[1:1:14] | passed | 2.8 seconds |
|
17
|
+
./spec/cite-as_spec.rb[1:1:15] | passed | 1.21 seconds |
|
18
|
+
./spec/cite-as_spec.rb[1:1:16] | passed | 1.28 seconds |
|
19
|
+
./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
|
20
|
+
./spec/cite-as_spec.rb[1:1:18] | passed | 1.24 seconds |
|
21
|
+
./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
|
22
|
+
./spec/cite-as_spec.rb[1:1:20] | passed | 1.74 seconds |
|
23
|
+
./spec/cite-as_spec.rb[1:1:21] | passed | 2.75 seconds |
|
24
|
+
./spec/cite-as_spec.rb[1:1:22] | passed | 1.35 seconds |
|
25
|
+
./spec/cite-as_spec.rb[1:1:23] | passed | 1.19 seconds |
|
26
|
+
./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
|
27
|
+
./spec/cite-as_spec.rb[1:1:25] | passed | 0.60282 seconds |
|
28
|
+
./spec/describedby_spec.rb[1:1:1] | passed | 3.23 seconds |
|
29
|
+
./spec/describedby_spec.rb[1:1:2] | passed | 1.43 seconds |
|
30
|
+
./spec/describedby_spec.rb[1:1:3] | passed | 1.31 seconds |
|
31
|
+
./spec/describedby_spec.rb[1:1:4] | passed | 1.37 seconds |
|
32
|
+
./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
|
33
|
+
./spec/describedby_spec.rb[1:1:6] | passed | 1.09 seconds |
|
34
|
+
./spec/describedby_spec.rb[1:1:7] | passed | 1.03 seconds |
|
35
|
+
./spec/describedby_spec.rb[1:1:8] | passed | 2.28 seconds |
|
36
|
+
./spec/describedby_spec.rb[1:1:9] | passed | 1.84 seconds |
|
37
|
+
./spec/describedby_spec.rb[1:1:10] | passed | 2.23 seconds |
|
38
|
+
./spec/describedby_spec.rb[1:1:11] | passed | 2.97 seconds |
|
39
|
+
./spec/describedby_spec.rb[1:1:12] | passed | 2.97 seconds |
|
40
|
+
./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds |
|
41
|
+
./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
|
42
|
+
./spec/describedby_spec.rb[1:1:15] | passed | 2.53 seconds |
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00178 seconds |
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | passed | 5.49 seconds |
|
45
|
+
./spec/fsp_harvester_spec.rb[1:3] | passed | 39.87 seconds |
|
46
|
+
./spec/fsp_harvester_spec.rb[1:4] | passed | 2.62 seconds |
|
47
|
+
./spec/fsp_harvester_spec.rb[1:5] | passed | 2.61 seconds |
|
48
|
+
./spec/fsp_harvester_spec.rb[1:6] | failed | 54.05 seconds |
|
49
|
+
./spec/fsp_harvester_spec.rb[1:7] | passed | 1 minute 26.9 seconds |
|
50
|
+
./spec/item_spec.rb[1:1:1] | passed | 3.8 seconds |
|
51
|
+
./spec/item_spec.rb[1:1:2] | passed | 3.3 seconds |
|
52
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
|
53
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.68 seconds |
|
54
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.44 seconds |
|
55
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.64 seconds |
|
56
|
+
./spec/item_spec.rb[1:1:7] | passed | 3.02 seconds |
|
57
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.49403 seconds |
|
58
|
+
./spec/type_spec.rb[1:1:1] | passed | 1.23 seconds |
|
59
|
+
./spec/type_spec.rb[1:1:2] | passed | 1.25 seconds |
|
60
|
+
./spec/type_spec.rb[1:1:3] | passed | 1.57 seconds |
|
data/Gemfile.lock
CHANGED
data/lib/harvester_brute.rb
CHANGED
@@ -39,12 +39,13 @@ module HarvesterTools
|
|
39
39
|
# process "alternate" links
|
40
40
|
links.each do |link|
|
41
41
|
next unless link.relation == "alternate"
|
42
|
+
next unless sanity_check_alternate(link: link, metadata: metadata) # don't try to process zip files! LOL!
|
42
43
|
|
43
44
|
url = link.href
|
44
45
|
headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
|
45
46
|
headers ||= FspHarvester::ACCEPT_STAR_HEADER
|
46
47
|
warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
|
47
|
-
metadata.comments << "
|
48
|
+
metadata.comments << "INFO: entering content negotiation on link alternates.\n"
|
48
49
|
response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
|
49
50
|
if response
|
50
51
|
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
|
@@ -53,6 +54,28 @@ module HarvesterTools
|
|
53
54
|
|
54
55
|
end
|
55
56
|
|
57
|
+
def self.sanity_check_alternate(link:, metadata:)
|
58
|
+
type = link.type if link.respond_to?('type')
|
59
|
+
href = link.href
|
60
|
+
unless type # we're gonna have to check extensions...
|
61
|
+
m = href.match(/.*\.[\w\-]+/)
|
62
|
+
extension = m[1]
|
63
|
+
unless %w[json jsonld rdf ttl turtle n3 triples ntriples txt html xhtml nq xml].include? extension
|
64
|
+
warn "\n\nINFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
|
65
|
+
metadata.comments << "INFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
|
66
|
+
return false
|
67
|
+
end
|
68
|
+
return true
|
69
|
+
end
|
70
|
+
type.gsub!(/;.*/, '') # remove any UTF8 blah blah
|
71
|
+
abbrev = HarvesterTools::MetadataHarvester.abbreviate_type(contenttype: type)
|
72
|
+
unless abbrev
|
73
|
+
warn "\n\nINFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
|
74
|
+
metadata.comments << "INFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
|
75
|
+
return false
|
76
|
+
end
|
77
|
+
true
|
78
|
+
end
|
56
79
|
|
57
80
|
def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
|
58
81
|
|
data/lib/metadata_parser.rb
CHANGED
@@ -26,32 +26,30 @@ module HarvesterTools
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def process_xml(body:, metadata:)
|
29
|
-
|
29
|
+
|
30
30
|
begin
|
31
31
|
hash = XmlSimple.xml_in(body)
|
32
32
|
rescue
|
33
|
-
|
34
|
-
|
33
|
+
metadata.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
|
34
|
+
metadata.add_warning(['020', '', ''])
|
35
35
|
end
|
36
|
-
|
37
|
-
|
36
|
+
metadata.comments << "INFO: The XML is being merged in the metadata object\n"
|
37
|
+
metadata.hash.merge hash
|
38
38
|
end
|
39
39
|
|
40
40
|
def process_json(body:, metadata:)
|
41
|
-
@meta = metadata
|
42
41
|
begin
|
43
42
|
hash = JSON.parse(body)
|
44
43
|
rescue
|
45
|
-
|
46
|
-
|
44
|
+
metadata.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
|
45
|
+
metadata.add_warning(['021', '', ''])
|
47
46
|
end
|
48
|
-
|
49
|
-
|
47
|
+
metadata.comments << "INFO: The JSON is being merged in the metadata object\n"
|
48
|
+
metadata.hash.merge hash
|
50
49
|
end
|
51
50
|
|
52
51
|
def process_ld(body:, content_type:, metadata:)
|
53
|
-
|
54
|
-
parse_rdf(body: body, content_type: content_type, metadata: @meta)
|
52
|
+
parse_rdf(body: body, content_type: content_type, metadata: metadata)
|
55
53
|
end
|
56
54
|
|
57
55
|
def parse_rdf(body:, content_type:, metadata:)
|
@@ -61,43 +59,43 @@ module HarvesterTools
|
|
61
59
|
def self.parse_rdf(body:, content_type:, metadata:)
|
62
60
|
@meta = metadata
|
63
61
|
unless body
|
64
|
-
|
65
|
-
|
62
|
+
metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
|
63
|
+
metadata.add_warning(['018', '', ''])
|
66
64
|
return
|
67
65
|
end
|
68
66
|
|
69
67
|
unless body.match(/\w/)
|
70
|
-
|
71
|
-
|
68
|
+
metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
|
69
|
+
metadata.add_warning(['018', '', ''])
|
72
70
|
return
|
73
71
|
end
|
74
72
|
|
75
73
|
rdfformat = RDF::Format.for(content_type: content_type)
|
76
74
|
unless rdfformat
|
77
|
-
|
78
|
-
|
75
|
+
metadata.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
|
76
|
+
metadata.add_warning(['018', '', ''])
|
79
77
|
return
|
80
78
|
end
|
81
79
|
|
82
80
|
graph = HarvesterTools::Cache.checkRDFCache(body: body)
|
83
81
|
if graph.size > 0
|
84
82
|
warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
|
85
|
-
|
83
|
+
metadata.merge_rdf(graph.to_a)
|
86
84
|
else
|
87
85
|
warn "\n\n\nfound format #{rdfformat}\n\n"
|
88
|
-
|
86
|
+
metadata.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
|
89
87
|
reader = ''
|
90
88
|
begin
|
91
89
|
reader = rdfformat.reader.new(body.force_encoding('UTF-8'))
|
92
90
|
rescue Exception => e
|
93
|
-
|
94
|
-
|
91
|
+
metadata.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
|
92
|
+
metadata.add_warning(['018', '', ''])
|
95
93
|
return
|
96
94
|
end
|
97
95
|
|
98
96
|
begin
|
99
|
-
if reader.size
|
100
|
-
|
97
|
+
if reader.size.zero?
|
98
|
+
metadata.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
|
101
99
|
return
|
102
100
|
end
|
103
101
|
reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
|
@@ -106,16 +104,16 @@ module HarvesterTools
|
|
106
104
|
warn 'WRITING DONE'
|
107
105
|
reader = rdfformat.reader.new(body.force_encoding('UTF-8')) # frustrating that we cannot rewind!
|
108
106
|
warn 'RE-READING DONE'
|
109
|
-
|
107
|
+
metadata.merge_rdf(reader.to_a)
|
110
108
|
warn 'MERGE DONE'
|
111
109
|
rescue RDF::ReaderError => e
|
112
|
-
|
110
|
+
metadata.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
|
113
111
|
warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
|
114
|
-
|
112
|
+
metadata.add_warning(['018', '', ''])
|
115
113
|
rescue Exception => e
|
116
|
-
|
114
|
+
metadata.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
|
117
115
|
warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body.force_encoding('UTF-8')}). Moving on...\n"
|
118
|
-
|
116
|
+
metadata.add_warning(['018', '', ''])
|
119
117
|
end
|
120
118
|
end
|
121
119
|
end
|