fsp_harvester 0.1.22 → 0.1.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dfe28e2fc429fd0c550539b356e325a1735d323e72fa9c4ee502fbedb1c818df
4
- data.tar.gz: b1ec033372645ca2129f44c4faad380b690527ab2f0823583985b12328bdce54
3
+ metadata.gz: 34f1b3296ceae51a5d5c5e2e17123fab158a3218bbb5abab80be04fbccbd21fa
4
+ data.tar.gz: 6f59acbe4999636685f612b09822ecf11e42234bfafdf9290112a83e2782b4d7
5
5
  SHA512:
6
- metadata.gz: 11b5ce8b8368d70171e3e376ee75275e1c2892ec58be07976fa225a2df1841ae11b06c335105ab1915a001fb1f0e6247fadf00e3d06f33b21918593c3ada5fc0
7
- data.tar.gz: b0c82ea9e81183789227a22eb3c160bfad5d4aa346f0aebb349b383f3d920877c28addb6f936a8a4f5b59d352bcecd313984f7ef7bb90aa4b5e93653e22fd176
6
+ metadata.gz: ac498d2cf39d739a53251e7c9c338c3d19070df5950eaae8fb6068820a66590f6ed78d998c97498284e791e3bb84fcf45515d221d7a4d4ca9cf73b213d3c69f2
7
+ data.tar.gz: bb432a89bf2066063b96162bdfb0560d75d5d26fc041b49c532bdcca5aefb4c4250b6c37f62627eabb446440f34f202f66bf88f608190a27fe0c573757b00886
data/.rspec_status CHANGED
@@ -1,60 +1,60 @@
1
- example_id | status | run_time |
2
- ---------------------------------- | ------ | ---------------------- |
3
- ./spec/cite-as_spec.rb[1:1:1] | passed | 1.87 seconds |
4
- ./spec/cite-as_spec.rb[1:1:2] | passed | 1.3 seconds |
5
- ./spec/cite-as_spec.rb[1:1:3] | passed | 1.53 seconds |
6
- ./spec/cite-as_spec.rb[1:1:4] | passed | 2.09 seconds |
7
- ./spec/cite-as_spec.rb[1:1:5] | passed | 8.09 seconds |
8
- ./spec/cite-as_spec.rb[1:1:6] | passed | 2.63 seconds |
9
- ./spec/cite-as_spec.rb[1:1:7] | passed | 2.9 seconds |
10
- ./spec/cite-as_spec.rb[1:1:8] | passed | 2.21 seconds |
11
- ./spec/cite-as_spec.rb[1:1:9] | passed | 2.85 seconds |
12
- ./spec/cite-as_spec.rb[1:1:10] | passed | 2.89 seconds |
13
- ./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
14
- ./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
15
- ./spec/cite-as_spec.rb[1:1:13] | passed | 2.92 seconds |
16
- ./spec/cite-as_spec.rb[1:1:14] | passed | 2.8 seconds |
17
- ./spec/cite-as_spec.rb[1:1:15] | passed | 1.21 seconds |
18
- ./spec/cite-as_spec.rb[1:1:16] | passed | 1.28 seconds |
19
- ./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
20
- ./spec/cite-as_spec.rb[1:1:18] | passed | 1.24 seconds |
21
- ./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
22
- ./spec/cite-as_spec.rb[1:1:20] | passed | 1.74 seconds |
23
- ./spec/cite-as_spec.rb[1:1:21] | passed | 2.75 seconds |
24
- ./spec/cite-as_spec.rb[1:1:22] | passed | 1.35 seconds |
25
- ./spec/cite-as_spec.rb[1:1:23] | passed | 1.19 seconds |
26
- ./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
27
- ./spec/cite-as_spec.rb[1:1:25] | passed | 0.60282 seconds |
28
- ./spec/describedby_spec.rb[1:1:1] | passed | 3.23 seconds |
29
- ./spec/describedby_spec.rb[1:1:2] | passed | 1.43 seconds |
30
- ./spec/describedby_spec.rb[1:1:3] | passed | 1.31 seconds |
31
- ./spec/describedby_spec.rb[1:1:4] | passed | 1.37 seconds |
32
- ./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
33
- ./spec/describedby_spec.rb[1:1:6] | passed | 1.09 seconds |
34
- ./spec/describedby_spec.rb[1:1:7] | passed | 1.03 seconds |
35
- ./spec/describedby_spec.rb[1:1:8] | passed | 2.28 seconds |
36
- ./spec/describedby_spec.rb[1:1:9] | passed | 1.84 seconds |
37
- ./spec/describedby_spec.rb[1:1:10] | passed | 2.23 seconds |
38
- ./spec/describedby_spec.rb[1:1:11] | passed | 2.97 seconds |
39
- ./spec/describedby_spec.rb[1:1:12] | passed | 2.97 seconds |
40
- ./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds |
41
- ./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
42
- ./spec/describedby_spec.rb[1:1:15] | passed | 2.53 seconds |
43
- ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00025 seconds |
44
- ./spec/fsp_harvester_spec.rb[1:2] | passed | 2.69 seconds |
45
- ./spec/fsp_harvester_spec.rb[1:3] | passed | 45.63 seconds |
46
- ./spec/fsp_harvester_spec.rb[1:4] | passed | 2.86 seconds |
47
- ./spec/fsp_harvester_spec.rb[1:5] | passed | 2.67 seconds |
48
- ./spec/fsp_harvester_spec.rb[1:6] | passed | 2 minutes 1.6 seconds |
49
- ./spec/fsp_harvester_spec.rb[1:7] | passed | 1 minute 28.63 seconds |
50
- ./spec/item_spec.rb[1:1:1] | passed | 3.8 seconds |
51
- ./spec/item_spec.rb[1:1:2] | passed | 3.3 seconds |
52
- ./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
53
- ./spec/item_spec.rb[1:1:4] | passed | 1.68 seconds |
54
- ./spec/item_spec.rb[1:1:5] | passed | 2.44 seconds |
55
- ./spec/item_spec.rb[1:1:6] | passed | 2.64 seconds |
56
- ./spec/item_spec.rb[1:1:7] | passed | 3.02 seconds |
57
- ./spec/item_spec.rb[1:1:8] | passed | 0.49403 seconds |
58
- ./spec/type_spec.rb[1:1:1] | passed | 1.23 seconds |
59
- ./spec/type_spec.rb[1:1:2] | passed | 1.25 seconds |
60
- ./spec/type_spec.rb[1:1:3] | passed | 1.57 seconds |
1
+ example_id | status | run_time |
2
+ ---------------------------------- | ------ | --------------------- |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.87 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 1.3 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 1.53 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 2.09 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 8.09 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 2.63 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 2.9 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 2.21 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 2.85 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 2.89 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.23 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 2.92 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 2.8 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.21 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.28 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 1.19 seconds |
20
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.24 seconds |
21
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.7 seconds |
22
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 1.74 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 2.75 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.35 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.19 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.60282 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 3.23 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.43 seconds |
30
+ ./spec/describedby_spec.rb[1:1:3] | passed | 1.31 seconds |
31
+ ./spec/describedby_spec.rb[1:1:4] | passed | 1.37 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.24 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 1.09 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 1.03 seconds |
35
+ ./spec/describedby_spec.rb[1:1:8] | passed | 2.28 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 1.84 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 2.23 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.97 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.97 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.65 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 2.53 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00178 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | passed | 5.49 seconds |
45
+ ./spec/fsp_harvester_spec.rb[1:3] | passed | 39.87 seconds |
46
+ ./spec/fsp_harvester_spec.rb[1:4] | passed | 2.62 seconds |
47
+ ./spec/fsp_harvester_spec.rb[1:5] | passed | 2.61 seconds |
48
+ ./spec/fsp_harvester_spec.rb[1:6] | failed | 54.05 seconds |
49
+ ./spec/fsp_harvester_spec.rb[1:7] | passed | 1 minute 26.9 seconds |
50
+ ./spec/item_spec.rb[1:1:1] | passed | 3.8 seconds |
51
+ ./spec/item_spec.rb[1:1:2] | passed | 3.3 seconds |
52
+ ./spec/item_spec.rb[1:1:3] | passed | 1.33 seconds |
53
+ ./spec/item_spec.rb[1:1:4] | passed | 1.68 seconds |
54
+ ./spec/item_spec.rb[1:1:5] | passed | 2.44 seconds |
55
+ ./spec/item_spec.rb[1:1:6] | passed | 2.64 seconds |
56
+ ./spec/item_spec.rb[1:1:7] | passed | 3.02 seconds |
57
+ ./spec/item_spec.rb[1:1:8] | passed | 0.49403 seconds |
58
+ ./spec/type_spec.rb[1:1:1] | passed | 1.23 seconds |
59
+ ./spec/type_spec.rb[1:1:2] | passed | 1.25 seconds |
60
+ ./spec/type_spec.rb[1:1:3] | passed | 1.57 seconds |
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.22)
4
+ fsp_harvester (0.1.23)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
7
  linkheaders-processor (~> 0.1.18)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.22"
4
+ VERSION = "0.1.23"
5
5
  end
@@ -39,12 +39,13 @@ module HarvesterTools
39
39
  # process "alternate" links
40
40
  links.each do |link|
41
41
  next unless link.relation == "alternate"
42
+ next unless sanity_check_alternate(link: link, metadata: metadata) # don't try to process zip files! LOL!
42
43
 
43
44
  url = link.href
44
45
  headers = {'Accept' => "#{link.type}"} if link.respond_to?("type")
45
46
  headers ||= FspHarvester::ACCEPT_STAR_HEADER
46
47
  warn "\n\nINFO: resolving alternate #{url} with headers #{headers.to_s}\n\n"
47
- metadata.comments << "IINFO: entering content negotiation on link alternates.\n"
48
+ metadata.comments << "INFO: entering content negotiation on link alternates.\n"
48
49
  response = resolve_url_brute(url: url, metadata: metadata, headers: headers) # now do content negotiation on the link
49
50
  if response
50
51
  HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from alternate link
@@ -53,6 +54,28 @@ module HarvesterTools
53
54
 
54
55
  end
55
56
 
57
+ def self.sanity_check_alternate(link:, metadata:)
58
+ type = link.type if link.respond_to?('type')
59
+ href = link.href
60
+ unless type # we're gonna have to check extensions...
61
+ m = href.match(/.*\.[\w\-]+/)
62
+ extension = m[1]
63
+ unless %w[json jsonld rdf ttl turtle n3 triples ntriples txt html xhtml nq xml].include? extension
64
+ warn "\n\nINFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
65
+ metadata.comments << "INFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
66
+ return false
67
+ end
68
+ return true
69
+ end
70
+ type.gsub!(/;.*/, '') # remove any UTF8 blah blah
71
+ abbrev = HarvesterTools::MetadataHarvester.abbreviate_type(contenttype: type)
72
+ unless abbrev
73
+ warn "\n\nINFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
74
+ metadata.comments << "INFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
75
+ return false
76
+ end
77
+ true
78
+ end
56
79
 
57
80
  def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
58
81
 
@@ -26,32 +26,30 @@ module HarvesterTools
26
26
  end
27
27
 
28
28
  def process_xml(body:, metadata:)
29
- @meta = metadata
29
+
30
30
  begin
31
31
  hash = XmlSimple.xml_in(body)
32
32
  rescue
33
- @meta.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
34
- @meta.add_warning(['020', '', ''])
33
+ metadata.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
34
+ metadata.add_warning(['020', '', ''])
35
35
  end
36
- @meta.comments << "INFO: The XML is being merged in the metadata object\n"
37
- @meta.hash.merge hash
36
+ metadata.comments << "INFO: The XML is being merged in the metadata object\n"
37
+ metadata.hash.merge hash
38
38
  end
39
39
 
40
40
  def process_json(body:, metadata:)
41
- @meta = metadata
42
41
  begin
43
42
  hash = JSON.parse(body)
44
43
  rescue
45
- @meta.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
46
- @meta.add_warning(['021', '', ''])
44
+ metadata.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
45
+ metadata.add_warning(['021', '', ''])
47
46
  end
48
- @meta.comments << "INFO: The JSON is being merged in the metadata object\n"
49
- @meta.hash.merge hash
47
+ metadata.comments << "INFO: The JSON is being merged in the metadata object\n"
48
+ metadata.hash.merge hash
50
49
  end
51
50
 
52
51
  def process_ld(body:, content_type:, metadata:)
53
- @meta = metadata
54
- parse_rdf(body: body, content_type: content_type, metadata: @meta)
52
+ parse_rdf(body: body, content_type: content_type, metadata: metadata)
55
53
  end
56
54
 
57
55
  def parse_rdf(body:, content_type:, metadata:)
@@ -61,43 +59,43 @@ module HarvesterTools
61
59
  def self.parse_rdf(body:, content_type:, metadata:)
62
60
  @meta = metadata
63
61
  unless body
64
- @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
65
- @meta.add_warning(['018', '', ''])
62
+ metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
63
+ metadata.add_warning(['018', '', ''])
66
64
  return
67
65
  end
68
66
 
69
67
  unless body.match(/\w/)
70
- @meta.comments << "CRITICAL: The response message body component appears to have no content.\n"
71
- @meta.add_warning(['018', '', ''])
68
+ metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
69
+ metadata.add_warning(['018', '', ''])
72
70
  return
73
71
  end
74
72
 
75
73
  rdfformat = RDF::Format.for(content_type: content_type)
76
74
  unless rdfformat
77
- @meta.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
78
- @meta.add_warning(['018', '', ''])
75
+ metadata.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete!("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
76
+ metadata.add_warning(['018', '', ''])
79
77
  return
80
78
  end
81
79
 
82
80
  graph = HarvesterTools::Cache.checkRDFCache(body: body)
83
81
  if graph.size > 0
84
82
  warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
85
- @meta.merge_rdf(graph.to_a)
83
+ metadata.merge_rdf(graph.to_a)
86
84
  else
87
85
  warn "\n\n\nfound format #{rdfformat}\n\n"
88
- @meta.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
86
+ metadata.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
89
87
  reader = ''
90
88
  begin
91
89
  reader = rdfformat.reader.new(body.force_encoding('UTF-8'))
92
90
  rescue Exception => e
93
- @meta.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
94
- @meta.add_warning(['018', '', ''])
91
+ metadata.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
92
+ metadata.add_warning(['018', '', ''])
95
93
  return
96
94
  end
97
95
 
98
96
  begin
99
- if reader.size == 0
100
- @meta.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
97
+ if reader.size.zero?
98
+ metadata.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
101
99
  return
102
100
  end
103
101
  reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
@@ -106,16 +104,16 @@ module HarvesterTools
106
104
  warn 'WRITING DONE'
107
105
  reader = rdfformat.reader.new(body.force_encoding('UTF-8')) # frustrating that we cannot rewind!
108
106
  warn 'RE-READING DONE'
109
- @meta.merge_rdf(reader.to_a)
107
+ metadata.merge_rdf(reader.to_a)
110
108
  warn 'MERGE DONE'
111
109
  rescue RDF::ReaderError => e
112
- @meta.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
110
+ metadata.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
113
111
  warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
114
- @meta.add_warning(['018', '', ''])
112
+ metadata.add_warning(['018', '', ''])
115
113
  rescue Exception => e
116
- meta.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
114
+ metadata.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
117
115
  warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body.force_encoding('UTF-8')}). Moving on...\n"
118
- @meta.add_warning(['018', '', ''])
116
+ metadata.add_warning(['018', '', ''])
119
117
  end
120
118
  end
121
119
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.22
4
+ version: 0.1.23
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson