linkheaders-processor 0.1.17 → 0.1.19

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b10f24c0498058f393a2142465f0ff2e559dc2a8721ea8cf6a90178c8ff21789
4
- data.tar.gz: 5cd760d37f6e82f63cb8271375bec72364264173010fda86e1ae32f1a424642d
3
+ metadata.gz: 65747de845763341178717385337c65ed5d78a7df57bccecee17f392d8330b12
4
+ data.tar.gz: 1e2c5ae203200e40e8fc238b211f63491bc2ba4756d3bd807ccb916a7c9b270f
5
5
  SHA512:
6
- metadata.gz: e696571f2c9da932ff461af46740824de0728a6accd327ab6e109ab84530d313868d120cff5a62f670655efe6531069d7b6a358f1dcfcb94560cdd647614ff35
7
- data.tar.gz: cf056d618d352bbcfaa43e59317fbc6f739186ac3e7ae26534e8f12721dec0d7ab2e8edfa07a2434e881a37d1cd18bf30e84f6cfac402fcc74ecd8c86170fa73
6
+ metadata.gz: a92597e6f649e5abdc524862c051a0a2d442c753976ea5cea70ce0ce4b1c30261d0f0f3e9e3e577ad55c396cad7a780c806f068012ef26a3ae91a7cc8fbb109e
7
+ data.tar.gz: bdd96b086e950c0b427ec8d15ed151723639d72b81d9f3a15adfca22bf7326c51ee9711cf0aa451a3e1d472cb14d8c46d6ced944627058d5e87fa695dbcde6f6
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- linkheaders-processor (0.1.17)
4
+ linkheaders-processor (0.1.19)
5
5
  json (~> 2.0)
6
6
  json-ld (~> 3.2)
7
7
  json-ld-preloaded (~> 3.2)
@@ -13,8 +13,8 @@ PATH
13
13
  GEM
14
14
  remote: https://rubygems.org/
15
15
  specs:
16
- addressable (2.8.0)
17
- public_suffix (>= 2.0.2, < 5.0)
16
+ addressable (2.8.5)
17
+ public_suffix (>= 2.0.2, < 6.0)
18
18
  ast (2.4.2)
19
19
  diff-lcs (1.5.0)
20
20
  domain_name (0.5.20190701)
@@ -39,7 +39,7 @@ GEM
39
39
  faraday-encoding (0.0.5)
40
40
  faraday
41
41
  faraday-excon (1.1.0)
42
- faraday-http-cache (2.4.0)
42
+ faraday-http-cache (2.5.0)
43
43
  faraday (>= 0.8)
44
44
  faraday-httpclient (1.0.1)
45
45
  faraday-multipart (1.0.4)
@@ -51,21 +51,21 @@ GEM
51
51
  faraday-retry (1.0.3)
52
52
  faraday_middleware (1.2.0)
53
53
  faraday (~> 1.0)
54
- fastimage (2.2.6)
54
+ fastimage (2.2.7)
55
55
  htmlentities (4.3.4)
56
56
  http-accept (1.7.0)
57
57
  http-cookie (1.0.5)
58
58
  domain_name (~> 0.5)
59
59
  json (2.6.2)
60
- json-canonicalization (0.3.0)
61
- json-ld (3.2.3)
60
+ json-canonicalization (0.3.2)
61
+ json-ld (3.2.5)
62
62
  htmlentities (~> 4.3)
63
- json-canonicalization (~> 0.3)
63
+ json-canonicalization (~> 0.3, >= 0.3.2)
64
64
  link_header (~> 0.0, >= 0.0.8)
65
65
  multi_json (~> 1.15)
66
- rack (~> 2.2)
67
- rdf (~> 3.2, >= 3.2.9)
68
- json-ld-preloaded (3.2.0)
66
+ rack (>= 2.2, < 4)
67
+ rdf (~> 3.2, >= 3.2.10)
68
+ json-ld-preloaded (3.2.2)
69
69
  json-ld (~> 3.2)
70
70
  rdf (~> 3.2)
71
71
  link_header (0.0.8)
@@ -79,24 +79,24 @@ GEM
79
79
  fastimage (~> 2.2)
80
80
  nesty (~> 1.0)
81
81
  nokogiri (~> 1.11)
82
- mime-types (3.4.1)
82
+ mime-types (3.5.1)
83
83
  mime-types-data (~> 3.2015)
84
- mime-types-data (3.2022.0105)
84
+ mime-types-data (3.2023.0808)
85
85
  multi_json (1.15.0)
86
86
  multipart-post (2.2.3)
87
87
  nesty (1.0.2)
88
88
  netrc (0.11.0)
89
- nokogiri (1.13.8-x86_64-linux)
89
+ nokogiri (1.15.4-x86_64-linux)
90
90
  racc (~> 1.4)
91
91
  parallel (1.22.1)
92
92
  parser (3.1.2.0)
93
93
  ast (~> 2.4.1)
94
- public_suffix (4.0.7)
95
- racc (1.6.0)
96
- rack (2.2.4)
94
+ public_suffix (5.0.3)
95
+ racc (1.7.1)
96
+ rack (3.0.8)
97
97
  rainbow (3.1.1)
98
98
  rake (13.0.6)
99
- rdf (3.2.9)
99
+ rdf (3.2.11)
100
100
  link_header (~> 0.0, >= 0.0.8)
101
101
  regexp_parser (2.5.0)
102
102
  rest-client (2.1.0)
@@ -127,7 +127,7 @@ module LinkHeaders
127
127
  if l.relation != link.relation
128
128
  @warnings |= ['WARN: Found identical hrefs with different relation types. This may be suspicious. Both have been retained']
129
129
  else
130
- @warnings |= ['WARN: found apparent duplicate. Ignoring and returning known link']
130
+ @warnings |= ["WARN: found apparent duplicate #{l.relation} #{l.href} EQUALS#{link.href}. Ignoring and returning known link #{l.relation} #{l.href}"]
131
131
  link = l
132
132
  end
133
133
  end
@@ -3,6 +3,6 @@
3
3
 
4
4
  module LinkHeaders
5
5
  class Processor
6
- VERSION = "0.1.17"
6
+ VERSION = "0.1.19"
7
7
  end
8
8
  end
@@ -59,14 +59,14 @@ module LinkHeaders
59
59
  return [[], []]
60
60
  end
61
61
 
62
- newlinks = parse_http_link_headers(head) # pass guid to check against anchors in linksets
63
- warn "HTTPlinks #{newlinks.inspect}"
62
+ _newlinks = parse_http_link_headers(head)
63
+ # warn "HTTPlinks #{newlinks.inspect}"
64
64
 
65
- ['text/html','text/xhtml+xml', 'application/xhtml+xml'].each do |format|
65
+ ['text/html', 'text/xhtml+xml', 'application/xhtml+xml'].each do |format|
66
66
  if head[:content_type] and head[:content_type].match(format)
67
67
  warn "found #{format} content - parsing"
68
- htmllinks = parse_html_link_headers(body: body, anchor: default_anchor) # pass html body to find HTML link headers
69
- warn "htmllinks #{htmllinks.inspect}"
68
+ _htmllinks = parse_html_link_headers(body: body, anchor: default_anchor) # pass html body to find HTML link headers
69
+ # warn "htmllinks #{htmllinks.inspect}"
70
70
  end
71
71
  end
72
72
  end
@@ -99,10 +99,12 @@ module LinkHeaders
99
99
  # warn "link is: #{part}"
100
100
 
101
101
  section = part.split(';') # ["<https://example.one.com>", "rel='preconnect'"]
102
- # warn section
102
+ warn section
103
103
  next unless section[0]
104
104
 
105
105
  href = section[0][/<(.*)>/, 1]
106
+ next unless href # this is mandatory!
107
+
106
108
  next unless section[1]
107
109
 
108
110
  sections = {}
@@ -123,10 +125,11 @@ module LinkHeaders
123
125
  relation = sections['rel']
124
126
  sections.delete('rel')
125
127
  relations = relation.split(/\s+/) # handle the multiple relation case
126
- warn "RELATIONS #{relations}"
128
+ # warn "HEADERS RELATIONS #{relations}"
127
129
 
128
130
  relations.each do |rel|
129
131
  next unless rel.match?(/\w/)
132
+ puts "LICENCE is #{href}\n\n" if rel == "license"
130
133
  newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **sections) # parsed['https://example.one.com'][:rel] = "preconnect"
131
134
  end
132
135
  end
@@ -143,7 +146,6 @@ module LinkHeaders
143
146
  # an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
144
147
  newlinks = Array.new
145
148
  m.head_links.each do |l|
146
- warn "HTML head link is: #{l.inspect}"
147
149
  next unless l[:href] and l[:rel] # required
148
150
 
149
151
  anchor = l[:anchor] || default_anchor
@@ -154,7 +156,7 @@ module LinkHeaders
154
156
  l.delete(:href)
155
157
 
156
158
  relations = relation.split(/\s+/) # handle the multiple relation case
157
- warn "RELATIONS #{relations}"
159
+ # warn "BODY RELATIONS #{relations}"
158
160
 
159
161
  relations.each do |rel|
160
162
  next unless rel.match?(/\w/)
@@ -189,9 +191,10 @@ module LinkHeaders
189
191
 
190
192
  def processJSONLinkset(href:)
191
193
  _headers, linkset = lhfetch(href, { 'Accept' => 'application/linkset+json' })
192
- # warn "Linkset body #{linkset.inspect}"
194
+ # warn "Linkset body #{linkset.inspect}\n\nLinkset headers #{_headers}\n\n"
193
195
  newlinks = Array.new
194
196
  return nil unless linkset
197
+ # warn "linkset #{linkset}"
195
198
 
196
199
  # linkset = '{ "linkset":
197
200
  # [
@@ -208,20 +211,28 @@ module LinkHeaders
208
211
  # }'
209
212
 
210
213
  linkset = JSON.parse(linkset)
214
+ # warn "linkset #{linkset}"
215
+ if linkset['data'] and linkset['data']['linkset']
216
+ linkset['linkset'] = linkset['data']['linkset']
217
+ end
218
+ return nil unless linkset['linkset'].first
211
219
  linkset['linkset'].each do |ls|
212
220
  # warn ls.inspect, "\n"
213
221
  anchor = ls['anchor'] || @default_anchor
214
- ls.delete('anchor') if ls['anchor'] # we need to delete since all others have a list as a value
222
+ ls.delete('anchor') if ls['anchor'] # we need to delete since almost all others have a list as a value
215
223
  attrhash = {}
216
224
  # warn ls.keys, "\n"
217
225
 
218
- ls.each_key do |relation| # key = e.g. "item", "described-by". "cite"
219
- # warn reltype, "\n"
226
+ ls.each_key do |relation| # relation = e.g. "item", "described-by", "cite"
227
+ href = ""
228
+ # warn relation
220
229
  # warn ls[reltype], "\n"
230
+ ls[relation] = [ls[relation]] unless ls[relation].is_a? Array # force it to be a list, if it isn't
221
231
  ls[relation].each do |attrs| # attrs = e.g. {"href": "http://example.com/foo1", "type": "text/html"}
232
+ # warn "ATTR: #{attrs}"
222
233
  next unless attrs['href'] # this is a required attribute of a linkset relation
223
-
224
234
  href = attrs['href']
235
+ attrs.delete("href")
225
236
  # now go through the other attributes of that relation
226
237
  attrs.each do |attr, val| # attr = e.g. "type"; val = "text/html"
227
238
  attrhash[attr.to_sym] = val
@@ -229,7 +240,6 @@ module LinkHeaders
229
240
  end
230
241
 
231
242
  relations = relation.split(/\s+/) # handle the multiple relation case
232
-
233
243
  relations.each do |rel|
234
244
  next unless rel.match?(/\w/)
235
245
  newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **attrhash) # parsed['https://example.one.com'][:rel] = "preconnect"
@@ -245,13 +255,13 @@ module LinkHeaders
245
255
  # warn "linkset body #{linkset.inspect}"
246
256
  return {} unless linkset
247
257
 
248
- links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
258
+ # links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
259
+ links = linkset.split(/,\n*/) # split on each comma plus any newlines that immediately follow it
249
260
  # warn "Links found #{links}"
250
261
 
251
262
  links.each do |ls|
252
- # warn "workking on link #{ls}"
253
- ls = ls.first # ls is a single element array
254
- elements = ls.split(';') # semicolon delimited fields
263
+ # warn "working on link #{ls}"
264
+ elements = ls.split(';').map {|element| element.strip!} # semicolon delimited fields
255
265
  # ["<https://w3id.org/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/>", "anchor=\"https://s11.no/2022/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/\"", "rel=\"cite-as\""]
256
266
  href = elements.shift # first element is always the link url
257
267
  # warn "working on link href #{href}"
@@ -259,8 +269,6 @@ module LinkHeaders
259
269
  attrhash = {}
260
270
  elements.each do |e|
261
271
  key, val = e.split('=')
262
- key.strip!
263
- val.strip!
264
272
  val.delete_prefix!('"').delete_suffix!('"') # strip the leading and trailing double quotes
265
273
  attrhash[key.to_sym] = val # split on key=val and make key a symbol
266
274
  end
@@ -12,26 +12,53 @@ factory = p.factory # LinkHeaders::LinkFactory
12
12
 
13
13
  RSpec.describe LinkHeaders::Processor do
14
14
 
15
- it 'has a version number' do
15
+ it 'Benchmark: has a version number' do
16
16
  expect(LinkHeaders::Processor::VERSION).not_to be nil
17
17
  end
18
18
 
19
- it "should find PURL citeas which has described-by and cite-as in mixed HTTP and HTML headers" do
20
- expect(factory.all_links.length).to eq 5
19
+ it "Benchmark: should find 8 links in total" do
20
+ expect(factory.all_links.length).to eq 8
21
21
  end
22
- it "should find find href on all links" do
23
- expect(factory.all_links.select{|l| l.href}.length).to eq 5
22
+ it "Benchmark: should find find href on all links" do
23
+ expect(factory.all_links.select{|l| l.href}.length).to eq 8
24
24
  end
25
- it "should find find href on all links" do
26
- expect(factory.all_links.select{|l| l.anchor}.length).to eq 5
25
+ it "Benchmark: should find find anchor on all links" do
26
+ expect(factory.all_links.select{|l| l.anchor}.length).to eq 8
27
27
  end
28
- it "should find 5 links in mixed HTTP and HTML headers" do
29
- expect(factory.all_links.select{|l| l.relation}.length).to eq 5
28
+ it "Benchmark: should find 5 links in mixed HTTP and HTML headers" do
29
+ expect(factory.all_links.select{|l| l.relation}.length).to eq 8
30
30
  end
31
- it "should find one citeas in mixed HTTP and HTML headers" do
31
+ it "Benchmark: should find one citeas in mixed HTTP and HTML headers" do
32
32
  expect(factory.all_links.select{|l| l.relation == 'cite-as'}.length).to eq 1
33
33
  end
34
- it "should find described-by in mixed HTTP and HTML headers" do
34
+ it "Benchmark: should find described-by in mixed HTTP and HTML headers" do
35
35
  expect(factory.all_links.select{|l| l.relation == 'describedby'}.length).to eq 1
36
36
  end
37
+
38
+ url2 = "https://doi.org/10.7910/DVN/Z2JD58"
39
+ p2 = LinkHeaders::Processor.new(default_anchor: url2)
40
+ r2 = RestClient.get(url2)
41
+ p2.extract_and_parse(response: r2)
42
+ factory2 = p2.factory # LinkHeaders::LinkFactory
43
+
44
+ it "Dataverse: should find 29 links in total" do
45
+ expect(factory2.all_links.length).to eq 28
46
+ end
47
+ it "Dataverse: should find find href on all links" do
48
+ expect(factory2.all_links.select{|l| l.href}.length).to eq 28
49
+ end
50
+ it "Dataverse: should find find anchor on all links" do
51
+ expect(factory2.all_links.select{|l| l.anchor}.length).to eq 28
52
+ end
53
+ it "Dataverse: should find one citeas" do
54
+ expect(factory2.all_links.select{|l| l.relation == 'cite-as'}.length).to eq 1
55
+ end
56
+ it "Dataverse: should find 2 described-by" do
57
+ expect(factory2.all_links.select{|l| l.relation == 'describedby'}.length).to eq 2
58
+ end
59
+ it "Dataverse: should find 1 license" do
60
+ expect(factory2.all_links.select{|l| l.relation == 'license'}.length).to eq 1
61
+ end
62
+
63
+
37
64
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkheaders-processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.17
4
+ version: 0.1.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-12 00:00:00.000000000 Z
11
+ date: 2023-09-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -168,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
168
168
  - !ruby/object:Gem::Version
169
169
  version: '0'
170
170
  requirements: []
171
- rubygems_version: 3.2.28
171
+ rubygems_version: 3.3.23
172
172
  signing_key:
173
173
  specification_version: 4
174
174
  summary: A parser/processor for Link Headers and Linksets in both JSON and Text formats.