fsp_harvester 0.1.22 → 0.1.23

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dfe28e2fc429fd0c550539b356e325a1735d323e72fa9c4ee502fbedb1c818df
4
- data.tar.gz: b1ec033372645ca2129f44c4faad380b690527ab2f0823583985b12328bdce54
3
+ metadata.gz: 34f1b3296ceae51a5d5c5e2e17123fab158a3218bbb5abab80be04fbccbd21fa
4
+ data.tar.gz: 6f59acbe4999636685f612b09822ecf11e42234bfafdf9290112a83e2782b4d7
5
5
  SHA512:
6
- metadata.gz: 11b5ce8b8368d70171e3e376ee75275e1c2892ec58be07976fa225a2df1841ae11b06c335105ab1915a001fb1f0e6247fadf00e3d06f33b21918593c3ada5fc0
7
- data.tar.gz: b0c82ea9e81183789227a22eb3c160bfad5d4aa346f0aebb349b383f3d920877c28addb6f936a8a4f5b59d352bcecd313984f7ef7bb90aa4b5e93653e22fd176
6
+ metadata.gz: ac498d2cf39d739a53251e7c9c338c3d19070df5950eaae8fb6068820a66590f6ed78d998c97498284e791e3bb84fcf45515d221d7a4d4ca9cf73b213d3c69f2
7
+ data.tar.gz: bb432a89bf2066063b96162bdfb0560d75d5d26fc041b49c532bdcca5aefb4c4250b6c37f62627eabb446440f34f202f66bf88f608190a27fe0c573757b00886
data/.rspec_status CHANGED
@@ -1,60 +1,60 @@
1
- example_id | status | run_time |
2
- ---------------------------------- | ------ | ---------------------- |
3
- ./spec/cite-as_spec.rb[1:1:1] | passed | 1.87 seconds |
4
- ./spec/cite-as_spec.rb[1:1:2] | passed | 1.3 seconds |
5
- ./spec/cite-as_spec.rb[1:1:3] | passed | 1.53 seconds |
6
- ./spec/cite-as_spec.rb[1:1:4] | passed | 2.09 seconds |
7
- ./spec/cite-as_spec.rb[1:1:5] | passed | 8.09 seconds |
8
- ./spec/cite-as_spec.rb[1:1:6] | passed | 2.63 seconds |
9
- ./spec/cite-as_spec.rb[1:1:7] | passed | 2.9 seconds |
10
- ./spec/cite-as_spec.rb[1:1:8] | passed | 2.21 seconds |
11
- ./spec/cite-as_spec.rb[1:1:9] | passed | 2.85 seconds |
12
- ./spec/cite-as_spec.rb[1:1:10] | passed | 2.89 seconds |
13
- ./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
14
- ./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
15
- ./spec/cite-as_spec.rb[1:1:13] | passed | 2.92 seconds |
16
- ./spec/cite-as_spec.rb[1:1:14] | passed | 2.8 seconds |
17
- ./spec/cite-as_spec.rb[1:1:15] | passed | 1.21 seconds |
18
- ./spec/cite-as_spec.rb[1:1:16] | passed | 1.28 seconds |
19
- ./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
20
- ./spec/cite-as_spec.rb[1:1:18] | passed | 1.24 seconds |
21
- ./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
22
- ./spec/cite-as_spec.rb[1:1:20] | passed | 1.74 seconds |
23
- ./spec/cite-as_spec.rb[1:1:21] | passed | 2.75 seconds |
24
- ./spec/cite-as_spec.rb[1:1:22] | passed | 1.35 seconds |
25
- ./spec/cite-as_spec.rb[1:1:23] | passed | 1.19 seconds |
26
- ./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
27
- ./spec/cite-as_spec.rb[1:1:25] | passed | 0.60282 seconds |
28
- ./spec/describedby_spec.rb[1:1:1] | passed | 3.23 seconds |
29
- ./spec/describedby_spec.rb[1:1:2] | passed | 1.43 seconds |
30
- ./spec/describedby_spec.rb[1:1:3] | passed | 1.31 seconds |
31
- ./spec/describedby_spec.rb[1:1:4] | passed | 1.37 seconds |
32
- ./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
33
- ./spec/describedby_spec.rb[1:1:6] | passed | 1.09 seconds |
34
- ./spec/describedby_spec.rb[1:1:7] | passed | 1.03 seconds |
35
- ./spec/describedby_spec.rb[1:1:8] | passed | 2.28 seconds |
36
- ./spec/describedby_spec.rb[1:1:9] | passed | 1.84 seconds |
37
- ./spec/describedby_spec.rb[1:1:10] | passed | 2.23 seconds |
38
- ./spec/describedby_spec.rb[1:1:11] | passed | 2.97 seconds |
39
- ./spec/describedby_spec.rb[1:1:12] | passed | 2.97 seconds |
40
- ./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds |
41
- ./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
42
- ./spec/describedby_spec.rb[1:1:15] | passed | 2.53 seconds |
43
- ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00025 seconds |
44
- ./spec/fsp_harvester_spec.rb[1:2] | passed | 2.69 seconds |
45
- ./spec/fsp_harvester_spec.rb[1:3] | passed | 45.63 seconds |
46
- ./spec/fsp_harvester_spec.rb[1:4] | passed | 2.86 seconds |
47
- ./spec/fsp_harvester_spec.rb[1:5] | passed | 2.67 seconds |
48
- ./spec/fsp_harvester_spec.rb[1:6] | passed | 2 minutes 1.6 seconds |
49
- ./spec/fsp_harvester_spec.rb[1:7] | passed | 1 minute 28.63 seconds |
50
- ./spec/item_spec.rb[1:1:1] | passed | 3.8 seconds |
51
- ./spec/item_spec.rb[1:1:2] | passed | 3.3 seconds |
52
- ./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
53
- ./spec/item_spec.rb[1:1:4] | passed | 1.68 seconds |
54
- ./spec/item_spec.rb[1:1:5] | passed | 2.44 seconds |
55
- ./spec/item_spec.rb[1:1:6] | passed | 2.64 seconds |
56
- ./spec/item_spec.rb[1:1:7] | passed | 3.02 seconds |
57
- ./spec/item_spec.rb[1:1:8] | passed | 0.49403 seconds |
58
- ./spec/type_spec.rb[1:1:1] | passed | 1.23 seconds |
59
- ./spec/type_spec.rb[1:1:2] | passed | 1.25 seconds |
60
- ./spec/type_spec.rb[1:1:3] | passed | 1.57 seconds |
1
+ example_id | status | run_time |
2
+ ---------------------------------- | ------ | --------------------- |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.87 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 1.3 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 1.53 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 2.09 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 8.09 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 2.63 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 2.9 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 2.21 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 2.85 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 2.89 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 2.92 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 2.8 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.21 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.28 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
20
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.24 seconds |
21
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
22
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 1.74 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 2.75 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.35 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.19 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.60282 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 3.23 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.43 seconds |
30
+ ./spec/describedby_spec.rb[1:1:3] | passed | 1.31 seconds |
31
+ ./spec/describedby_spec.rb[1:1:4] | passed | 1.37 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 1.09 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 1.03 seconds |
35
+ ./spec/describedby_spec.rb[1:1:8] | passed | 2.28 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 1.84 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 2.23 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.97 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.97 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 2.53 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00178 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | passed | 5.49 seconds |
45
+ ./spec/fsp_harvester_spec.rb[1:3] | passed | 39.87 seconds |
46
+ ./spec/fsp_harvester_spec.rb[1:4] | passed | 2.62 seconds |
47
+ ./spec/fsp_harvester_spec.rb[1:5] | passed | 2.61 seconds |
48
+ ./spec/fsp_harvester_spec.rb[1:6] | failed | 54.05 seconds |
49
+ ./spec/fsp_harvester_spec.rb[1:7] | passed | 1 minute 26.9 seconds |
50
+ ./spec/item_spec.rb[1:1:1] | passed | 3.8 seconds |
51
+ ./spec/item_spec.rb[1:1:2] | passed | 3.3 seconds |
52
+ ./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
53
+ ./spec/item_spec.rb[1:1:4] | passed | 1.68 seconds |
54
+ ./spec/item_spec.rb[1:1:5] | passed | 2.44 seconds |
55
+ ./spec/item_spec.rb[1:1:6] | passed | 2.64 seconds |
56
+ ./spec/item_spec.rb[1:1:7] | passed | 3.02 seconds |
57
+ ./spec/item_spec.rb[1:1:8] | passed | 0.49403 seconds |
58
+ ./spec/type_spec.rb[1:1:1] | passed | 1.23 seconds |
59
+ ./spec/type_spec.rb[1:1:2] | passed | 1.25 seconds |
60
+ ./spec/type_spec.rb[1:1:3] | passed | 1.57 seconds |
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.22)
4
+ fsp_harvester (0.1.23)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
7
  linkheaders-processor (~> 0.1.18)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.22"
4
+ VERSION = "0.1.23"
5
5
  end
@@ -39,12 +39,13 @@ module HarvesterTools
39
39
  # process "alternate" links
40
40
  links.each do |link|
41
41
  next unless link.relation == "alternate"
42
+ next unless sanity_check_alternate(link: link, metadata: metadata) # don't try to process zip files! LOL!
42
43
 
43
44
  url = link.href
44
45
  headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
45
46
  headers ||= FspHarvester::ACCEPT_STAR_HEADER
46
47
  warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
47
- metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
48
+ metadata.comments << "INFO: entering content negotiation on link alternates.\n"
48
49
  response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
49
50
  if response
50
51
  HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
@@ -53,6 +54,28 @@ module HarvesterTools
53
54
 
54
55
  end
55
56
 
57
# Decide whether a rel="alternate" link is worth resolving as a metadata source.
#
# If the link exposes a non-nil +type+, the decision is delegated to
# HarvesterTools::MetadataHarvester.abbreviate_type (after stripping any
# ";parameter" suffix such as a charset). Otherwise the file extension of
# +link.href+ is compared against a whitelist of known metadata formats.
#
# link     - object responding to +href+ and, optionally, +type+.
# metadata - object whose +comments+ collection accepts << ; a human-readable
#            note is appended whenever a link is rejected.
#
# Returns true when the link should be processed, false otherwise.
def self.sanity_check_alternate(link:, metadata:)
  type = link.type if link.respond_to?('type')
  href = link.href

  unless type # no declared type, so we have to judge by the file extension
    trusted_extensions = %w[json jsonld rdf ttl turtle n3 triples ntriples txt html xhtml nq xml]

    # Ignore any query string or fragment, then take what follows the last dot.
    # (The previous regex had no capture group, so the extension was always nil
    # and an href without a dot raised NoMethodError.)
    path = href.to_s.sub(/[?#].*\z/m, '')
    extension = File.extname(path).sub(/\A\./, '').downcase

    return true if trusted_extensions.include?(extension)

    # An empty extension (no dot in the last path segment) is rejected as well:
    # only whitelisted extensions are trusted.
    warn "\n\nINFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
    metadata.comments << "INFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
    return false
  end

  # NOTE(review): gsub! edits the String returned by link.type in place. If that
  # is the link's own stored string, the caller will see the parameter-stripped
  # value afterwards; kept as-is so that behaviour does not change.
  type.gsub!(/;.*/, '') # remove any UTF8 blah blah
  abbrev = HarvesterTools::MetadataHarvester.abbreviate_type(contenttype: type)
  unless abbrev
    warn "\n\nINFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
    metadata.comments << "INFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
    return false
  end
  true
end
56
79
 
57
80
  def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
58
81
 
@@ -26,32 +26,30 @@ module HarvesterTools
26
26
  end
27
27
 
28
28
  def process_xml(body:, metadata:)
29
- @meta = metadata
29
+
30
30
  begin
31
31
  hash = XmlSimple.xml_in(body)
32
32
  rescue
33
- @meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
34
- @meta.add_warning(['020', '', ''])
33
+ metadata.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
34
+ metadata.add_warning(['020', '', ''])
35
35
  end
36
- @meta.comments << "INFO: The XML is being merged in the metadata object\n"
37
- @meta.hash.merge hash
36
+ metadata.comments << "INFO: The XML is being merged in the metadata object\n"
37
+ metadata.hash.merge hash
38
38
  end
39
39
 
40
40
  def process_json(body:, metadata:)
41
- @meta = metadata
42
41
  begin
43
42
  hash = JSON.parse(body)
44
43
  rescue
45
- @meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
46
- @meta.add_warning(['021', '', ''])
44
+ metadata.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
45
+ metadata.add_warning(['021', '', ''])
47
46
  end
48
- @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
49
- @meta.hash.merge hash
47
+ metadata.comments << "INFO: The JSON is being merged in the metadata object\n"
48
+ metadata.hash.merge hash
50
49
  end
51
50
 
52
51
  def process_ld(body:, content_type:, metadata:)
53
- @meta = metadata
54
- parse_rdf(body: body, content_type: content_type, metadata: @meta)
52
+ parse_rdf(body: body, content_type: content_type, metadata: metadata)
55
53
  end
56
54
 
57
55
  def parse_rdf(body:, content_type:, metadata:)
@@ -61,43 +59,43 @@ module HarvesterTools
61
59
  def self.parse_rdf(body:, content_type:, metadata:)
62
60
  @meta = metadata
63
61
  unless body
64
- @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
65
- @meta.add_warning(['018', '', ''])
62
+ metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
63
+ metadata.add_warning(['018', '', ''])
66
64
  return
67
65
  end
68
66
 
69
67
  unless body.match(/\w/)
70
- @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
71
- @meta.add_warning(['018', '', ''])
68
+ metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
69
+ metadata.add_warning(['018', '', ''])
72
70
  return
73
71
  end
74
72
 
75
73
  rdfformat = RDF::Format.for(content_type: content_type)
76
74
  unless rdfformat
77
- @meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
78
- @meta.add_warning(['018', '', ''])
75
+ metadata.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
76
+ metadata.add_warning(['018', '', ''])
79
77
  return
80
78
  end
81
79
 
82
80
  graph = HarvesterTools::Cache.checkRDFCache(body: body)
83
81
  if graph.size > 0
84
82
  warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
85
- @meta.merge_rdf(graph.to_a)
83
+ metadata.merge_rdf(graph.to_a)
86
84
  else
87
85
  warn "\n\n\nfound format #{rdfformat}\n\n"
88
- @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
86
+ metadata.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
89
87
  reader = ''
90
88
  begin
91
89
  reader = rdfformat.reader.new(body.force_encoding('UTF-8'))
92
90
  rescue Exception => e
93
- @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
94
- @meta.add_warning(['018', '', ''])
91
+ metadata.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
92
+ metadata.add_warning(['018', '', ''])
95
93
  return
96
94
  end
97
95
 
98
96
  begin
99
- if reader.size == 0
100
- @meta.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
97
+ if reader.size.zero?
98
+ metadata.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
101
99
  return
102
100
  end
103
101
  reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
@@ -106,16 +104,16 @@ module HarvesterTools
106
104
  warn 'WRITING DONE'
107
105
  reader = rdfformat.reader.new(body.force_encoding('UTF-8')) # frustrating that we cannot rewind!
108
106
  warn 'RE-READING DONE'
109
- @meta.merge_rdf(reader.to_a)
107
+ metadata.merge_rdf(reader.to_a)
110
108
  warn 'MERGE DONE'
111
109
  rescue RDF::ReaderError => e
112
- @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
110
+ metadata.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
113
111
  warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
114
- @meta.add_warning(['018', '', ''])
112
+ metadata.add_warning(['018', '', ''])
115
113
  rescue Exception => e
116
- meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
114
+ metadata.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
117
115
  warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body.force_encoding('UTF-8')}). Moving on...\n"
118
- @meta.add_warning(['018', '', ''])
116
+ metadata.add_warning(['018', '', ''])
119
117
  end
120
118
  end
121
119
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.22
4
+ version: 0.1.23
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson