linkheaders-processor 0.1.17 → 0.1.19
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -18
- data/lib/linkheaders/link.rb +1 -1
- data/lib/linkheaders/processor/version.rb +1 -1
- data/lib/linkheaders/processor.rb +29 -21
- data/spec/linkheader/parser_spec.rb +38 -11
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 65747de845763341178717385337c65ed5d78a7df57bccecee17f392d8330b12
|
4
|
+
data.tar.gz: 1e2c5ae203200e40e8fc238b211f63491bc2ba4756d3bd807ccb916a7c9b270f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a92597e6f649e5abdc524862c051a0a2d442c753976ea5cea70ce0ce4b1c30261d0f0f3e9e3e577ad55c396cad7a780c806f068012ef26a3ae91a7cc8fbb109e
|
7
|
+
data.tar.gz: bdd96b086e950c0b427ec8d15ed151723639d72b81d9f3a15adfca22bf7326c51ee9711cf0aa451a3e1d472cb14d8c46d6ced944627058d5e87fa695dbcde6f6
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
linkheaders-processor (0.1.17)
|
4
|
+
linkheaders-processor (0.1.19)
|
5
5
|
json (~> 2.0)
|
6
6
|
json-ld (~> 3.2)
|
7
7
|
json-ld-preloaded (~> 3.2)
|
@@ -13,8 +13,8 @@ PATH
|
|
13
13
|
GEM
|
14
14
|
remote: https://rubygems.org/
|
15
15
|
specs:
|
16
|
-
addressable (2.8.
|
17
|
-
public_suffix (>= 2.0.2, <
|
16
|
+
addressable (2.8.5)
|
17
|
+
public_suffix (>= 2.0.2, < 6.0)
|
18
18
|
ast (2.4.2)
|
19
19
|
diff-lcs (1.5.0)
|
20
20
|
domain_name (0.5.20190701)
|
@@ -39,7 +39,7 @@ GEM
|
|
39
39
|
faraday-encoding (0.0.5)
|
40
40
|
faraday
|
41
41
|
faraday-excon (1.1.0)
|
42
|
-
faraday-http-cache (2.
|
42
|
+
faraday-http-cache (2.5.0)
|
43
43
|
faraday (>= 0.8)
|
44
44
|
faraday-httpclient (1.0.1)
|
45
45
|
faraday-multipart (1.0.4)
|
@@ -51,21 +51,21 @@ GEM
|
|
51
51
|
faraday-retry (1.0.3)
|
52
52
|
faraday_middleware (1.2.0)
|
53
53
|
faraday (~> 1.0)
|
54
|
-
fastimage (2.2.
|
54
|
+
fastimage (2.2.7)
|
55
55
|
htmlentities (4.3.4)
|
56
56
|
http-accept (1.7.0)
|
57
57
|
http-cookie (1.0.5)
|
58
58
|
domain_name (~> 0.5)
|
59
59
|
json (2.6.2)
|
60
|
-
json-canonicalization (0.3.
|
61
|
-
json-ld (3.2.
|
60
|
+
json-canonicalization (0.3.2)
|
61
|
+
json-ld (3.2.5)
|
62
62
|
htmlentities (~> 4.3)
|
63
|
-
json-canonicalization (~> 0.3)
|
63
|
+
json-canonicalization (~> 0.3, >= 0.3.2)
|
64
64
|
link_header (~> 0.0, >= 0.0.8)
|
65
65
|
multi_json (~> 1.15)
|
66
|
-
rack (
|
67
|
-
rdf (~> 3.2, >= 3.2.
|
68
|
-
json-ld-preloaded (3.2.
|
66
|
+
rack (>= 2.2, < 4)
|
67
|
+
rdf (~> 3.2, >= 3.2.10)
|
68
|
+
json-ld-preloaded (3.2.2)
|
69
69
|
json-ld (~> 3.2)
|
70
70
|
rdf (~> 3.2)
|
71
71
|
link_header (0.0.8)
|
@@ -79,24 +79,24 @@ GEM
|
|
79
79
|
fastimage (~> 2.2)
|
80
80
|
nesty (~> 1.0)
|
81
81
|
nokogiri (~> 1.11)
|
82
|
-
mime-types (3.
|
82
|
+
mime-types (3.5.1)
|
83
83
|
mime-types-data (~> 3.2015)
|
84
|
-
mime-types-data (3.
|
84
|
+
mime-types-data (3.2023.0808)
|
85
85
|
multi_json (1.15.0)
|
86
86
|
multipart-post (2.2.3)
|
87
87
|
nesty (1.0.2)
|
88
88
|
netrc (0.11.0)
|
89
|
-
nokogiri (1.
|
89
|
+
nokogiri (1.15.4-x86_64-linux)
|
90
90
|
racc (~> 1.4)
|
91
91
|
parallel (1.22.1)
|
92
92
|
parser (3.1.2.0)
|
93
93
|
ast (~> 2.4.1)
|
94
|
-
public_suffix (
|
95
|
-
racc (1.
|
96
|
-
rack (
|
94
|
+
public_suffix (5.0.3)
|
95
|
+
racc (1.7.1)
|
96
|
+
rack (3.0.8)
|
97
97
|
rainbow (3.1.1)
|
98
98
|
rake (13.0.6)
|
99
|
-
rdf (3.2.
|
99
|
+
rdf (3.2.11)
|
100
100
|
link_header (~> 0.0, >= 0.0.8)
|
101
101
|
regexp_parser (2.5.0)
|
102
102
|
rest-client (2.1.0)
|
data/lib/linkheaders/link.rb
CHANGED
@@ -127,7 +127,7 @@ module LinkHeaders
|
|
127
127
|
if l.relation != link.relation
|
128
128
|
@warnings |= ['WARN: Found identical hrefs with different relation types. This may be suspicious. Both have been retained']
|
129
129
|
else
|
130
|
-
@warnings |= [
|
130
|
+
@warnings |= ["WARN: found apparent duplicate #{l.relation} #{l.href} EQUALS#{link.href}. Ignoring and returning known link #{l.relation} #{l.href}"]
|
131
131
|
link = l
|
132
132
|
end
|
133
133
|
end
|
@@ -59,14 +59,14 @@ module LinkHeaders
|
|
59
59
|
return [[], []]
|
60
60
|
end
|
61
61
|
|
62
|
-
|
63
|
-
warn "HTTPlinks #{newlinks.inspect}"
|
62
|
+
_newlinks = parse_http_link_headers(head)
|
63
|
+
# warn "HTTPlinks #{newlinks.inspect}"
|
64
64
|
|
65
|
-
['text/html','text/xhtml+xml', 'application/xhtml+xml'].each do |format|
|
65
|
+
['text/html', 'text/xhtml+xml', 'application/xhtml+xml'].each do |format|
|
66
66
|
if head[:content_type] and head[:content_type].match(format)
|
67
67
|
warn "found #{format} content - parsing"
|
68
|
-
|
69
|
-
warn "htmllinks #{htmllinks.inspect}"
|
68
|
+
_htmllinks = parse_html_link_headers(body: body, anchor: default_anchor) # pass html body to find HTML link headers
|
69
|
+
# warn "htmllinks #{htmllinks.inspect}"
|
70
70
|
end
|
71
71
|
end
|
72
72
|
end
|
@@ -99,10 +99,12 @@ module LinkHeaders
|
|
99
99
|
# warn "link is: #{part}"
|
100
100
|
|
101
101
|
section = part.split(';') # ["<https://example.one.com>", "rel='preconnect'"]
|
102
|
-
|
102
|
+
warn section
|
103
103
|
next unless section[0]
|
104
104
|
|
105
105
|
href = section[0][/<(.*)>/, 1]
|
106
|
+
next unless href # this is mandatory!
|
107
|
+
|
106
108
|
next unless section[1]
|
107
109
|
|
108
110
|
sections = {}
|
@@ -123,10 +125,11 @@ module LinkHeaders
|
|
123
125
|
relation = sections['rel']
|
124
126
|
sections.delete('rel')
|
125
127
|
relations = relation.split(/\s+/) # handle the multiple relation case
|
126
|
-
warn "RELATIONS #{relations}"
|
128
|
+
# warn "HEADERS RELATIONS #{relations}"
|
127
129
|
|
128
130
|
relations.each do |rel|
|
129
131
|
next unless rel.match?(/\w/)
|
132
|
+
puts "LICENCE is #{href}\n\n" if rel == "license"
|
130
133
|
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **sections) # parsed['https://example.one.com'][:rel] = "preconnect"
|
131
134
|
end
|
132
135
|
end
|
@@ -143,7 +146,6 @@ module LinkHeaders
|
|
143
146
|
# an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
|
144
147
|
newlinks = Array.new
|
145
148
|
m.head_links.each do |l|
|
146
|
-
warn "HTML head link is: #{l.inspect}"
|
147
149
|
next unless l[:href] and l[:rel] # required
|
148
150
|
|
149
151
|
anchor = l[:anchor] || default_anchor
|
@@ -154,7 +156,7 @@ module LinkHeaders
|
|
154
156
|
l.delete(:href)
|
155
157
|
|
156
158
|
relations = relation.split(/\s+/) # handle the multiple relation case
|
157
|
-
warn "RELATIONS #{relations}"
|
159
|
+
# warn "BODY RELATIONS #{relations}"
|
158
160
|
|
159
161
|
relations.each do |rel|
|
160
162
|
next unless rel.match?(/\w/)
|
@@ -189,9 +191,10 @@ module LinkHeaders
|
|
189
191
|
|
190
192
|
def processJSONLinkset(href:)
|
191
193
|
_headers, linkset = lhfetch(href, { 'Accept' => 'application/linkset+json' })
|
192
|
-
# warn "Linkset body #{linkset.inspect}"
|
194
|
+
# warn "Linkset body #{linkset.inspect}\n\nLinkset headers #{_headers}\n\n"
|
193
195
|
newlinks = Array.new
|
194
196
|
return nil unless linkset
|
197
|
+
# warn "linkset #{linkset}"
|
195
198
|
|
196
199
|
# linkset = '{ "linkset":
|
197
200
|
# [
|
@@ -208,20 +211,28 @@ module LinkHeaders
|
|
208
211
|
# }'
|
209
212
|
|
210
213
|
linkset = JSON.parse(linkset)
|
214
|
+
# warn "linkset #{linkset}"
|
215
|
+
if linkset['data'] and linkset['data']['linkset']
|
216
|
+
linkset['linkset'] = linkset['data']['linkset']
|
217
|
+
end
|
218
|
+
return nil unless linkset['linkset'].first
|
211
219
|
linkset['linkset'].each do |ls|
|
212
220
|
# warn ls.inspect, "\n"
|
213
221
|
anchor = ls['anchor'] || @default_anchor
|
214
|
-
ls.delete('anchor') if ls['anchor'] # we need to delete since all others have a list as a value
|
222
|
+
ls.delete('anchor') if ls['anchor'] # we need to delete since almost all others have a list as a value
|
215
223
|
attrhash = {}
|
216
224
|
# warn ls.keys, "\n"
|
217
225
|
|
218
|
-
ls.each_key do |relation| #
|
219
|
-
|
226
|
+
ls.each_key do |relation| # relation = e.g. "item", "described-by". "cite"
|
227
|
+
href = ""
|
228
|
+
# warn relation
|
220
229
|
# warn ls[reltype], "\n"
|
230
|
+
ls[relation] = [ls[relation]] unless ls[relation].is_a? Array # force it to be a list, if it isn't
|
221
231
|
ls[relation].each do |attrs| # attr = e.g. {"href": "http://example.com/foo1", "type": "text/html"}
|
232
|
+
# warn "ATTR: #{attrs}"
|
222
233
|
next unless attrs['href'] # this is a required attribute of a linkset relation
|
223
|
-
|
224
234
|
href = attrs['href']
|
235
|
+
attrs.delete("href")
|
225
236
|
# now go through the other attributes of that relation
|
226
237
|
attrs.each do |attr, val| # attr = e.g. "type"; val = "text/html"
|
227
238
|
attrhash[attr.to_sym] = val
|
@@ -229,7 +240,6 @@ module LinkHeaders
|
|
229
240
|
end
|
230
241
|
|
231
242
|
relations = relation.split(/\s+/) # handle the multiple relation case
|
232
|
-
|
233
243
|
relations.each do |rel|
|
234
244
|
next unless rel.match?(/\w/)
|
235
245
|
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **attrhash) # parsed['https://example.one.com'][:rel] = "preconnect"
|
@@ -245,13 +255,13 @@ module LinkHeaders
|
|
245
255
|
# warn "linkset body #{linkset.inspect}"
|
246
256
|
return {} unless linkset
|
247
257
|
|
248
|
-
links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
|
258
|
+
# links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
|
259
|
+
links = linkset.split(/,\n*/) # split on the comma+newline
|
249
260
|
# warn "Links found #{links}"
|
250
261
|
|
251
262
|
links.each do |ls|
|
252
|
-
# warn "
|
253
|
-
|
254
|
-
elements = ls.split(';') # semicolon delimited fields
|
263
|
+
# warn "working on link #{ls}"
|
264
|
+
elements = ls.split(';').map {|element| element.strip!} # semicolon delimited fields
|
255
265
|
# ["<https://w3id.org/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/>", "anchor=\"https://s11.no/2022/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/\"", "rel=\"cite-as\""]
|
256
266
|
href = elements.shift # first element is always the link url
|
257
267
|
# warn "working on link href #{href}"
|
@@ -259,8 +269,6 @@ module LinkHeaders
|
|
259
269
|
attrhash = {}
|
260
270
|
elements.each do |e|
|
261
271
|
key, val = e.split('=')
|
262
|
-
key.strip!
|
263
|
-
val.strip!
|
264
272
|
val.delete_prefix!('"').delete_suffix!('"') # get rid of newlines and start/end quotes
|
265
273
|
attrhash[key.to_sym] = val # split on key=val and make key a symbol
|
266
274
|
end
|
@@ -12,26 +12,53 @@ factory = p.factory # LinkHeaders::LinkFactory
|
|
12
12
|
|
13
13
|
RSpec.describe LinkHeaders::Processor do
|
14
14
|
|
15
|
-
it 'has a version number' do
|
15
|
+
it 'Benchmark: has a version number' do
|
16
16
|
expect(LinkHeaders::Processor::VERSION).not_to be nil
|
17
17
|
end
|
18
18
|
|
19
|
-
it "should find
|
20
|
-
expect(factory.all_links.length).to eq
|
19
|
+
it "Benchmark: should find 8 links in total" do
|
20
|
+
expect(factory.all_links.length).to eq 8
|
21
21
|
end
|
22
|
-
it "should find find href on all links" do
|
23
|
-
expect(factory.all_links.select{|l| l.href}.length).to eq
|
22
|
+
it "Benchmark: should find find href on all links" do
|
23
|
+
expect(factory.all_links.select{|l| l.href}.length).to eq 8
|
24
24
|
end
|
25
|
-
it "should find find
|
26
|
-
expect(factory.all_links.select{|l| l.anchor}.length).to eq
|
25
|
+
it "Benchmark: should find find anchor on all links" do
|
26
|
+
expect(factory.all_links.select{|l| l.anchor}.length).to eq 8
|
27
27
|
end
|
28
|
-
it "should find 5 links in mixed HTTP and HTML headers" do
|
29
|
-
expect(factory.all_links.select{|l| l.relation}.length).to eq
|
28
|
+
it "Benchmark: should find 5 links in mixed HTTP and HTML headers" do
|
29
|
+
expect(factory.all_links.select{|l| l.relation}.length).to eq 8
|
30
30
|
end
|
31
|
-
it "should find one citeas in mixed HTTP and HTML headers" do
|
31
|
+
it "Benchmark: should find one citeas in mixed HTTP and HTML headers" do
|
32
32
|
expect(factory.all_links.select{|l| l.relation == 'cite-as'}.length).to eq 1
|
33
33
|
end
|
34
|
-
it "should find described-by in mixed HTTP and HTML headers" do
|
34
|
+
it "Benchmark: should find described-by in mixed HTTP and HTML headers" do
|
35
35
|
expect(factory.all_links.select{|l| l.relation == 'describedby'}.length).to eq 1
|
36
36
|
end
|
37
|
+
|
38
|
+
url2 = "https://doi.org/10.7910/DVN/Z2JD58"
|
39
|
+
p2 = LinkHeaders::Processor.new(default_anchor: url2)
|
40
|
+
r2 = RestClient.get(url2)
|
41
|
+
p2.extract_and_parse(response: r2)
|
42
|
+
factory2 = p2.factory # LinkHeaders::LinkFactory
|
43
|
+
|
44
|
+
it "Dataverse: should find 29 links in total" do
|
45
|
+
expect(factory2.all_links.length).to eq 28
|
46
|
+
end
|
47
|
+
it "Dataverse: should find find href on all links" do
|
48
|
+
expect(factory2.all_links.select{|l| l.href}.length).to eq 28
|
49
|
+
end
|
50
|
+
it "Dataverse: should find find anchor on all links" do
|
51
|
+
expect(factory2.all_links.select{|l| l.anchor}.length).to eq 28
|
52
|
+
end
|
53
|
+
it "Dataverse: should find one citeas" do
|
54
|
+
expect(factory2.all_links.select{|l| l.relation == 'cite-as'}.length).to eq 1
|
55
|
+
end
|
56
|
+
it "Dataverse: should find 2 described-by" do
|
57
|
+
expect(factory2.all_links.select{|l| l.relation == 'describedby'}.length).to eq 2
|
58
|
+
end
|
59
|
+
it "Dataverse: should find 1 license" do
|
60
|
+
expect(factory2.all_links.select{|l| l.relation == 'license'}.length).to eq 1
|
61
|
+
end
|
62
|
+
|
63
|
+
|
37
64
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkheaders-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.17
|
4
|
+
version: 0.1.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-09-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -168,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
168
168
|
- !ruby/object:Gem::Version
|
169
169
|
version: '0'
|
170
170
|
requirements: []
|
171
|
-
rubygems_version: 3.
|
171
|
+
rubygems_version: 3.3.23
|
172
172
|
signing_key:
|
173
173
|
specification_version: 4
|
174
174
|
summary: A parser/processor for Link Headers and Linksets in both JSON and Text formats.
|