fsp_harvester 0.1.22 → 0.1.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec_status +60 -60
- data/Gemfile.lock +1 -1
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/harvester_brute.rb +24 -1
- data/lib/metadata_parser.rb +27 -29
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 34f1b3296ceae51a5d5c5e2e17123fab158a3218bbb5abab80be04fbccbd21fa
|
|
4
|
+
data.tar.gz: 6f59acbe4999636685f612b09822ecf11e42234bfafdf9290112a83e2782b4d7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ac498d2cf39d739a53251e7c9c338c3d19070df5950eaae8fb6068820a66590f6ed78d998c97498284e791e3bb84fcf45515d221d7a4d4ca9cf73b213d3c69f2
|
|
7
|
+
data.tar.gz: bb432a89bf2066063b96162bdfb0560d75d5d26fc041b49c532bdcca5aefb4c4250b6c37f62627eabb446440f34f202f66bf88f608190a27fe0c573757b00886
|
data/.rspec_status
CHANGED
|
@@ -1,60 +1,60 @@
|
|
|
1
|
-
example_id | status | run_time
|
|
2
|
-
---------------------------------- | ------ |
|
|
3
|
-
./spec/cite-as_spec.rb[1:1:1] | passed | 1.87 seconds
|
|
4
|
-
./spec/cite-as_spec.rb[1:1:2] | passed | 1.3 seconds
|
|
5
|
-
./spec/cite-as_spec.rb[1:1:3] | passed | 1.53 seconds
|
|
6
|
-
./spec/cite-as_spec.rb[1:1:4] | passed | 2.09 seconds
|
|
7
|
-
./spec/cite-as_spec.rb[1:1:5] | passed | 8.09 seconds
|
|
8
|
-
./spec/cite-as_spec.rb[1:1:6] | passed | 2.63 seconds
|
|
9
|
-
./spec/cite-as_spec.rb[1:1:7] | passed | 2.9 seconds
|
|
10
|
-
./spec/cite-as_spec.rb[1:1:8] | passed | 2.21 seconds
|
|
11
|
-
./spec/cite-as_spec.rb[1:1:9] | passed | 2.85 seconds
|
|
12
|
-
./spec/cite-as_spec.rb[1:1:10] | passed | 2.89 seconds
|
|
13
|
-
./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds
|
|
14
|
-
./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds
|
|
15
|
-
./spec/cite-as_spec.rb[1:1:13] | passed | 2.92 seconds
|
|
16
|
-
./spec/cite-as_spec.rb[1:1:14] | passed | 2.8 seconds
|
|
17
|
-
./spec/cite-as_spec.rb[1:1:15] | passed | 1.21 seconds
|
|
18
|
-
./spec/cite-as_spec.rb[1:1:16] | passed | 1.28 seconds
|
|
19
|
-
./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds
|
|
20
|
-
./spec/cite-as_spec.rb[1:1:18] | passed | 1.24 seconds
|
|
21
|
-
./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds
|
|
22
|
-
./spec/cite-as_spec.rb[1:1:20] | passed | 1.74 seconds
|
|
23
|
-
./spec/cite-as_spec.rb[1:1:21] | passed | 2.75 seconds
|
|
24
|
-
./spec/cite-as_spec.rb[1:1:22] | passed | 1.35 seconds
|
|
25
|
-
./spec/cite-as_spec.rb[1:1:23] | passed | 1.19 seconds
|
|
26
|
-
./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds
|
|
27
|
-
./spec/cite-as_spec.rb[1:1:25] | passed | 0.60282 seconds
|
|
28
|
-
./spec/describedby_spec.rb[1:1:1] | passed | 3.23 seconds
|
|
29
|
-
./spec/describedby_spec.rb[1:1:2] | passed | 1.43 seconds
|
|
30
|
-
./spec/describedby_spec.rb[1:1:3] | passed | 1.31 seconds
|
|
31
|
-
./spec/describedby_spec.rb[1:1:4] | passed | 1.37 seconds
|
|
32
|
-
./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds
|
|
33
|
-
./spec/describedby_spec.rb[1:1:6] | passed | 1.09 seconds
|
|
34
|
-
./spec/describedby_spec.rb[1:1:7] | passed | 1.03 seconds
|
|
35
|
-
./spec/describedby_spec.rb[1:1:8] | passed | 2.28 seconds
|
|
36
|
-
./spec/describedby_spec.rb[1:1:9] | passed | 1.84 seconds
|
|
37
|
-
./spec/describedby_spec.rb[1:1:10] | passed | 2.23 seconds
|
|
38
|
-
./spec/describedby_spec.rb[1:1:11] | passed | 2.97 seconds
|
|
39
|
-
./spec/describedby_spec.rb[1:1:12] | passed | 2.97 seconds
|
|
40
|
-
./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds
|
|
41
|
-
./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds
|
|
42
|
-
./spec/describedby_spec.rb[1:1:15] | passed | 2.53 seconds
|
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] | passed |
|
|
45
|
-
./spec/fsp_harvester_spec.rb[1:3] | passed |
|
|
46
|
-
./spec/fsp_harvester_spec.rb[1:4] | passed | 2.
|
|
47
|
-
./spec/fsp_harvester_spec.rb[1:5] | passed | 2.
|
|
48
|
-
./spec/fsp_harvester_spec.rb[1:6] |
|
|
49
|
-
./spec/fsp_harvester_spec.rb[1:7] | passed | 1 minute
|
|
50
|
-
./spec/item_spec.rb[1:1:1] | passed | 3.8 seconds
|
|
51
|
-
./spec/item_spec.rb[1:1:2] | passed | 3.3 seconds
|
|
52
|
-
./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds
|
|
53
|
-
./spec/item_spec.rb[1:1:4] | passed | 1.68 seconds
|
|
54
|
-
./spec/item_spec.rb[1:1:5] | passed | 2.44 seconds
|
|
55
|
-
./spec/item_spec.rb[1:1:6] | passed | 2.64 seconds
|
|
56
|
-
./spec/item_spec.rb[1:1:7] | passed | 3.02 seconds
|
|
57
|
-
./spec/item_spec.rb[1:1:8] | passed | 0.49403 seconds
|
|
58
|
-
./spec/type_spec.rb[1:1:1] | passed | 1.23 seconds
|
|
59
|
-
./spec/type_spec.rb[1:1:2] | passed | 1.25 seconds
|
|
60
|
-
./spec/type_spec.rb[1:1:3] | passed | 1.57 seconds
|
|
1
|
+
example_id | status | run_time |
|
|
2
|
+
---------------------------------- | ------ | --------------------- |
|
|
3
|
+
./spec/cite-as_spec.rb[1:1:1] | passed | 1.87 seconds |
|
|
4
|
+
./spec/cite-as_spec.rb[1:1:2] | passed | 1.3 seconds |
|
|
5
|
+
./spec/cite-as_spec.rb[1:1:3] | passed | 1.53 seconds |
|
|
6
|
+
./spec/cite-as_spec.rb[1:1:4] | passed | 2.09 seconds |
|
|
7
|
+
./spec/cite-as_spec.rb[1:1:5] | passed | 8.09 seconds |
|
|
8
|
+
./spec/cite-as_spec.rb[1:1:6] | passed | 2.63 seconds |
|
|
9
|
+
./spec/cite-as_spec.rb[1:1:7] | passed | 2.9 seconds |
|
|
10
|
+
./spec/cite-as_spec.rb[1:1:8] | passed | 2.21 seconds |
|
|
11
|
+
./spec/cite-as_spec.rb[1:1:9] | passed | 2.85 seconds |
|
|
12
|
+
./spec/cite-as_spec.rb[1:1:10] | passed | 2.89 seconds |
|
|
13
|
+
./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
|
|
14
|
+
./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
|
|
15
|
+
./spec/cite-as_spec.rb[1:1:13] | passed | 2.92 seconds |
|
|
16
|
+
./spec/cite-as_spec.rb[1:1:14] | passed | 2.8 seconds |
|
|
17
|
+
./spec/cite-as_spec.rb[1:1:15] | passed | 1.21 seconds |
|
|
18
|
+
./spec/cite-as_spec.rb[1:1:16] | passed | 1.28 seconds |
|
|
19
|
+
./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
|
|
20
|
+
./spec/cite-as_spec.rb[1:1:18] | passed | 1.24 seconds |
|
|
21
|
+
./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
|
|
22
|
+
./spec/cite-as_spec.rb[1:1:20] | passed | 1.74 seconds |
|
|
23
|
+
./spec/cite-as_spec.rb[1:1:21] | passed | 2.75 seconds |
|
|
24
|
+
./spec/cite-as_spec.rb[1:1:22] | passed | 1.35 seconds |
|
|
25
|
+
./spec/cite-as_spec.rb[1:1:23] | passed | 1.19 seconds |
|
|
26
|
+
./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
|
|
27
|
+
./spec/cite-as_spec.rb[1:1:25] | passed | 0.60282 seconds |
|
|
28
|
+
./spec/describedby_spec.rb[1:1:1] | passed | 3.23 seconds |
|
|
29
|
+
./spec/describedby_spec.rb[1:1:2] | passed | 1.43 seconds |
|
|
30
|
+
./spec/describedby_spec.rb[1:1:3] | passed | 1.31 seconds |
|
|
31
|
+
./spec/describedby_spec.rb[1:1:4] | passed | 1.37 seconds |
|
|
32
|
+
./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
|
|
33
|
+
./spec/describedby_spec.rb[1:1:6] | passed | 1.09 seconds |
|
|
34
|
+
./spec/describedby_spec.rb[1:1:7] | passed | 1.03 seconds |
|
|
35
|
+
./spec/describedby_spec.rb[1:1:8] | passed | 2.28 seconds |
|
|
36
|
+
./spec/describedby_spec.rb[1:1:9] | passed | 1.84 seconds |
|
|
37
|
+
./spec/describedby_spec.rb[1:1:10] | passed | 2.23 seconds |
|
|
38
|
+
./spec/describedby_spec.rb[1:1:11] | passed | 2.97 seconds |
|
|
39
|
+
./spec/describedby_spec.rb[1:1:12] | passed | 2.97 seconds |
|
|
40
|
+
./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds |
|
|
41
|
+
./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
|
|
42
|
+
./spec/describedby_spec.rb[1:1:15] | passed | 2.53 seconds |
|
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00178 seconds |
|
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | passed | 5.49 seconds |
|
|
45
|
+
./spec/fsp_harvester_spec.rb[1:3] | passed | 39.87 seconds |
|
|
46
|
+
./spec/fsp_harvester_spec.rb[1:4] | passed | 2.62 seconds |
|
|
47
|
+
./spec/fsp_harvester_spec.rb[1:5] | passed | 2.61 seconds |
|
|
48
|
+
./spec/fsp_harvester_spec.rb[1:6] | failed | 54.05 seconds |
|
|
49
|
+
./spec/fsp_harvester_spec.rb[1:7] | passed | 1 minute 26.9 seconds |
|
|
50
|
+
./spec/item_spec.rb[1:1:1] | passed | 3.8 seconds |
|
|
51
|
+
./spec/item_spec.rb[1:1:2] | passed | 3.3 seconds |
|
|
52
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
|
|
53
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.68 seconds |
|
|
54
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.44 seconds |
|
|
55
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.64 seconds |
|
|
56
|
+
./spec/item_spec.rb[1:1:7] | passed | 3.02 seconds |
|
|
57
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.49403 seconds |
|
|
58
|
+
./spec/type_spec.rb[1:1:1] | passed | 1.23 seconds |
|
|
59
|
+
./spec/type_spec.rb[1:1:2] | passed | 1.25 seconds |
|
|
60
|
+
./spec/type_spec.rb[1:1:3] | passed | 1.57 seconds |
|
data/Gemfile.lock
CHANGED
data/lib/harvester_brute.rb
CHANGED
|
@@ -39,12 +39,13 @@ module HarvesterTools
|
|
|
39
39
|
# process "alternate" links
|
|
40
40
|
links.each do |link|
|
|
41
41
|
next unless link.relation == "alternate"
|
|
42
|
+
next unless sanity_check_alternate(link: link, metadata: metadata) # don't try to process zip files! LOL!
|
|
42
43
|
|
|
43
44
|
url = link.href
|
|
44
45
|
headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
|
|
45
46
|
headers ||= FspHarvester::ACCEPT_STAR_HEADER
|
|
46
47
|
warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
|
|
47
|
-
metadata.comments << "
|
|
48
|
+
metadata.comments << "INFO: entering content negotiation on link alternates.\n"
|
|
48
49
|
response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
|
|
49
50
|
if response
|
|
50
51
|
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
|
|
@@ -53,6 +54,28 @@ module HarvesterTools
|
|
|
53
54
|
|
|
54
55
|
end
|
|
55
56
|
|
|
57
|
+
# Decide whether a rel="alternate" link is worth dereferencing as metadata.
#
# If the link declares a type, that type (with any ";charset=..."-style
# parameters removed) is handed to
# HarvesterTools::MetadataHarvester.abbreviate_type, and the link is accepted
# only when that call returns a truthy value. If no type is declared, the file
# extension of the href is compared against a fixed allow-list instead.
# Every rejection is reported through warn and appended to metadata.comments.
#
# @param link [#href] link object; #type is consulted only if the link responds to it
# @param metadata [#comments] object whose comments collection receives the INFO notes
# @return [Boolean] true when the alternate should be processed, false otherwise
def self.sanity_check_alternate(link:, metadata:)
  type = link.type if link.respond_to?('type')
  href = link.href

  unless type # we're gonna have to check extensions...
    # Ignore any query string or fragment, then capture what follows the final
    # dot of the last path segment. The capture group matters: without it the
    # extension is always nil. A href with no extension yields "" (rejected).
    path = href.to_s.sub(/[?#].*\z/m, '')
    extension = path[/\.([\w\-]+)\z/, 1].to_s.downcase
    return true if %w[json jsonld rdf ttl turtle n3 triples ntriples txt html xhtml nq xml].include?(extension)

    warn "\n\nINFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
    metadata.comments << "INFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
    return false
  end

  # Work on a copy so the link's own type string is never mutated (the caller
  # reads link.type again afterwards to build its Accept header).
  type = type.to_s.sub(/;.*/m, '').strip # remove any UTF8 blah blah
  return true if HarvesterTools::MetadataHarvester.abbreviate_type(contenttype: type)

  warn "\n\nINFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
  metadata.comments << "INFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
  false
end
|
|
56
79
|
|
|
57
80
|
def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
|
|
58
81
|
|
data/lib/metadata_parser.rb
CHANGED
|
@@ -26,32 +26,30 @@ module HarvesterTools
|
|
|
26
26
|
end
|
|
27
27
|
|
|
28
28
|
# Parse an XML body with XmlSimple and merge the result into metadata.hash.
#
# On a parse failure a CRITICAL comment and warning code '020' are recorded
# and the method returns nil without attempting a merge (merging the nil left
# behind by a failed parse would raise TypeError).
#
# @param body [String] raw XML text
# @param metadata [#comments, #add_warning, #hash] harvest accumulator
#   NOTE(review): assumes metadata.hash is a Hash accessor - confirm against the metadata class
# @return [Hash, nil] the updated metadata.hash, or nil when the XML could not be parsed
def process_xml(body:, metadata:)
  begin
    parsed = XmlSimple.xml_in(body)
  rescue StandardError
    metadata.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
    metadata.add_warning(['020', '', ''])
    return
  end

  metadata.comments << "INFO: The XML is being merged in the metadata object\n"
  # merge! (not merge): the non-destructive form builds a new Hash and would
  # leave metadata.hash untouched.
  metadata.hash.merge!(parsed)
end
|
|
39
39
|
|
|
40
40
|
# Parse a JSON body and merge the result into metadata.hash.
#
# On a parse failure a CRITICAL comment and warning code '021' are recorded
# and the method returns nil without attempting a merge (merging the nil left
# behind by a failed parse would raise TypeError).
#
# @param body [String] raw JSON text
# @param metadata [#comments, #add_warning, #hash] harvest accumulator
#   NOTE(review): assumes metadata.hash is a Hash accessor - confirm against the metadata class
# @return [Hash, nil] the updated metadata.hash, or nil when the JSON could not be parsed
def process_json(body:, metadata:)
  begin
    parsed = JSON.parse(body)
  rescue StandardError
    metadata.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
    metadata.add_warning(['021', '', ''])
    return
  end

  metadata.comments << "INFO: The JSON is being merged in the metadata object\n"
  # merge! (not merge): the non-destructive form builds a new Hash and would
  # leave metadata.hash untouched.
  # NOTE(review): a document whose top level is not a JSON object (e.g. an
  # array) still raises TypeError here - decide how that should be reported.
  metadata.hash.merge!(parsed)
end
|
|
51
50
|
|
|
52
51
|
# Hand a linked-data body to parse_rdf, forwarding the body, its content type
# and the caller-supplied metadata object unchanged.
def process_ld(body:, content_type:, metadata:)
  forwarded = { body: body, content_type: content_type, metadata: metadata }
  parse_rdf(**forwarded)
end
|
|
56
54
|
|
|
57
55
|
def parse_rdf(body:, content_type:, metadata:)
|
|
@@ -61,43 +59,43 @@ module HarvesterTools
|
|
|
61
59
|
def self.parse_rdf(body:, content_type:, metadata:)
|
|
62
60
|
@meta = metadata
|
|
63
61
|
unless body
|
|
64
|
-
|
|
65
|
-
|
|
62
|
+
metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
|
|
63
|
+
metadata.add_warning(['018', '', ''])
|
|
66
64
|
return
|
|
67
65
|
end
|
|
68
66
|
|
|
69
67
|
unless body.match(/\w/)
|
|
70
|
-
|
|
71
|
-
|
|
68
|
+
metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
|
|
69
|
+
metadata.add_warning(['018', '', ''])
|
|
72
70
|
return
|
|
73
71
|
end
|
|
74
72
|
|
|
75
73
|
rdfformat = RDF::Format.for(content_type: content_type)
|
|
76
74
|
unless rdfformat
|
|
77
|
-
|
|
78
|
-
|
|
75
|
+
metadata.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
|
|
76
|
+
metadata.add_warning(['018', '', ''])
|
|
79
77
|
return
|
|
80
78
|
end
|
|
81
79
|
|
|
82
80
|
graph = HarvesterTools::Cache.checkRDFCache(body: body)
|
|
83
81
|
if graph.size > 0
|
|
84
82
|
warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
|
|
85
|
-
|
|
83
|
+
metadata.merge_rdf(graph.to_a)
|
|
86
84
|
else
|
|
87
85
|
warn "\n\n\nfound format #{rdfformat}\n\n"
|
|
88
|
-
|
|
86
|
+
metadata.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
|
|
89
87
|
reader = ''
|
|
90
88
|
begin
|
|
91
89
|
reader = rdfformat.reader.new(body.force_encoding('UTF-8'))
|
|
92
90
|
rescue Exception => e
|
|
93
|
-
|
|
94
|
-
|
|
91
|
+
metadata.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
|
|
92
|
+
metadata.add_warning(['018', '', ''])
|
|
95
93
|
return
|
|
96
94
|
end
|
|
97
95
|
|
|
98
96
|
begin
|
|
99
|
-
if reader.size
|
|
100
|
-
|
|
97
|
+
if reader.size.zero?
|
|
98
|
+
metadata.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
|
|
101
99
|
return
|
|
102
100
|
end
|
|
103
101
|
reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
|
|
@@ -106,16 +104,16 @@ module HarvesterTools
|
|
|
106
104
|
warn 'WRITING DONE'
|
|
107
105
|
reader = rdfformat.reader.new(body.force_encoding('UTF-8')) # frustrating that we cannot rewind!
|
|
108
106
|
warn 'RE-READING DONE'
|
|
109
|
-
|
|
107
|
+
metadata.merge_rdf(reader.to_a)
|
|
110
108
|
warn 'MERGE DONE'
|
|
111
109
|
rescue RDF::ReaderError => e
|
|
112
|
-
|
|
110
|
+
metadata.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
|
|
113
111
|
warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
|
|
114
|
-
|
|
112
|
+
metadata.add_warning(['018', '', ''])
|
|
115
113
|
rescue Exception => e
|
|
116
|
-
|
|
114
|
+
metadata.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
|
|
117
115
|
warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body.force_encoding('UTF-8')}). Moving on...\n"
|
|
118
|
-
|
|
116
|
+
metadata.add_warning(['018', '', ''])
|
|
119
117
|
end
|
|
120
118
|
end
|
|
121
119
|
end
|