linkheaders-processor 0.1.8 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -2
- data/Gemfile.lock +2 -2
- data/README.md +13 -7
- data/lib/linkheaders/link.rb +18 -10
- data/lib/linkheaders/processor/version.rb +1 -1
- data/lib/linkheaders/processor.rb +58 -23
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e88dd164547a9a21ce0f1a3ffa85f2af4190ea6a588da445387cdfa2dca7e25d
|
4
|
+
data.tar.gz: f800677c8d4cb18e274defb5dbda5d2f58431cab40899ae15cfb2f866fcf8644
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad4b8814c9ace9def1edd94e53e890c8d534a40d7cad4d55f7dbd0426e1310d29277a1849fd63ebada06fa3c9812a9d189c86ef2635ffe846fe52ff5f4864e2e
|
7
|
+
data.tar.gz: 99573e84fa6eb0412a5223cb188f333c37a8dbb3c877fcf5ea403a71bfc18d57d477920c6000b280fbb8cc920548ddd11ea95dcdb0df4f0a896e30a68fe84d9b
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
linkheaders-processor (0.1.
|
4
|
+
linkheaders-processor (0.1.13)
|
5
5
|
json (~> 2.0)
|
6
6
|
json-ld (~> 3.2)
|
7
7
|
json-ld-preloaded (~> 3.2)
|
@@ -143,7 +143,7 @@ PLATFORMS
|
|
143
143
|
DEPENDENCIES
|
144
144
|
linkheaders-processor!
|
145
145
|
rake (~> 13.0)
|
146
|
-
rspec (~> 3.
|
146
|
+
rspec (~> 3.11)
|
147
147
|
rubocop (~> 1.21)
|
148
148
|
|
149
149
|
BUNDLED WITH
|
data/README.md
CHANGED
@@ -2,24 +2,24 @@
|
|
2
2
|
|
3
3
|
A gem to extract Link Headers from Web responses.
|
4
4
|
|
5
|
-
This module handles HTTP Link Headers, HTML Link Headers, and auto-follows links to LinkSets in both JSON and Text format, and processes them also.
|
5
|
+
This module handles HTTP Link Headers, HTML Link Headers, and auto-follows links to LinkSets in both JSON and Text format, and processes them also. It also handles some unusual cases, such as multiple relation types in a single link, or 204 or 410 responses where there is no message body.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
9
9
|
Install the gem and add to the application's Gemfile by executing:
|
10
10
|
|
11
|
-
$ bundle add
|
11
|
+
$ bundle add linkheaders-processor
|
12
12
|
|
13
13
|
If bundler is not being used to manage dependencies, install the gem by executing:
|
14
14
|
|
15
|
-
$ gem install
|
15
|
+
$ gem install linkheaders-processor
|
16
16
|
|
17
17
|
## Usage
|
18
18
|
|
19
19
|
|
20
20
|
```
|
21
21
|
|
22
|
-
require '
|
22
|
+
require 'linkheaders/processor'
|
23
23
|
require 'rest-client'
|
24
24
|
|
25
25
|
# url1 has http link headers, and a reference to a linkset in json format
|
@@ -28,27 +28,33 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
28
28
|
# url2 has http link headers, with a reference to a linkset in legacy text format
|
29
29
|
url2 = "https://s11.no/2022/a2a-fair-metrics/28-http-linkset-txt-only/"
|
30
30
|
|
31
|
-
p =
|
31
|
+
p = LinkHeaders::Processor.new(default_anchor: url1)
|
32
32
|
r = RestClient.get(url1)
|
33
33
|
|
34
34
|
p.extract_and_parse(response: r)
|
35
|
-
factory = p.factory #
|
35
|
+
factory = p.factory # LinkHeaders::LinkFactory
|
36
36
|
|
37
37
|
factory.all_links.each do |l|
|
38
38
|
puts l.href
|
39
39
|
puts l.relation
|
40
40
|
puts l.responsepart
|
41
41
|
|
42
|
+
# Additional properties are added as other instance methods
|
43
|
+
# you can access them as follows:
|
44
|
+
|
42
45
|
puts l.linkmethods # returns list of instance methods beyond href and relation, that are attributes of the link
|
43
46
|
l.linkmethods.each do |method|
|
44
47
|
puts "#{method}=" + l.send(method)
|
45
48
|
end
|
49
|
+
# or
|
50
|
+
puts l.type if l.respond_to? 'type'
|
46
51
|
puts
|
52
|
+
|
47
53
|
end
|
48
54
|
|
49
55
|
|
50
56
|
|
51
|
-
p =
|
57
|
+
p = LinkHeaders::Processor.new(default_anchor: url2)
|
52
58
|
r = RestClient.get(url2)
|
53
59
|
|
54
60
|
p.extract_and_parse(response: r)
|
data/lib/linkheaders/link.rb
CHANGED
@@ -5,7 +5,7 @@ module LinkHeaders
|
|
5
5
|
attr_accessor :default_anchor
|
6
6
|
# @return [Array] An array of strings containing any warnings that were encountered when creating the link (e.g. duplicate cite-as but non-identical URLs)
|
7
7
|
attr_accessor :warnings
|
8
|
-
|
8
|
+
attr_accessor :all_links
|
9
9
|
|
10
10
|
#
|
11
11
|
# Create the LinkFactory Object
|
@@ -15,8 +15,10 @@ module LinkHeaders
|
|
15
15
|
def initialize(default_anchor: 'https://example.org/')
|
16
16
|
@default_anchor = default_anchor
|
17
17
|
@warnings = Array.new
|
18
|
+
@all_links = Array.new
|
18
19
|
end
|
19
20
|
|
21
|
+
|
20
22
|
#
|
21
23
|
# Create a new LinkHeaders::Link object
|
22
24
|
#
|
@@ -30,9 +32,13 @@ module LinkHeaders
|
|
30
32
|
#
|
31
33
|
def new_link(responsepart:, href:, relation:, anchor: @default_anchor, **kwargs)
|
32
34
|
# warn "creating new link with kw #{kwargs}"
|
33
|
-
|
35
|
+
if relation.split(/\s/).length > 1
|
36
|
+
@warnings |= ['WARN: the link relation contains spaces. This is allowed by the standard to indicate multiple relations for the same link, but this MUST be processed before creating a LinkHeaders::Link object!']
|
37
|
+
end
|
38
|
+
|
39
|
+
link = LinkHeaders::Link.new(responsepart: responsepart, factory: self, href: href, anchor: anchor, relation: relation, **kwargs)
|
34
40
|
link = sanitycheck(link) # this will add warnings if the link already exists and has a conflict. returns the original of a duplicate
|
35
|
-
|
41
|
+
self.all_links |= [link]
|
36
42
|
return link
|
37
43
|
end
|
38
44
|
|
@@ -42,7 +48,7 @@ module LinkHeaders
|
|
42
48
|
# @return [Array] Array of all LinkHeaders::Link objects created by the factory so far
|
43
49
|
#
|
44
50
|
def all_links
|
45
|
-
|
51
|
+
@all_links
|
46
52
|
end
|
47
53
|
|
48
54
|
#
|
@@ -106,19 +112,21 @@ module LinkHeaders
|
|
106
112
|
end
|
107
113
|
|
108
114
|
def sanitycheck(link)
|
109
|
-
|
115
|
+
if link.relation == "describedby" and !(link.respond_to? 'type')
|
116
|
+
@warnings |= ['WARN: A describedby link should include a "type" attribute, to know the MIME type of the addressed description']
|
117
|
+
end
|
118
|
+
|
110
119
|
self.all_links.each do |l|
|
111
120
|
if l.relation == "cite-as" and link.relation == "cite-as"
|
112
121
|
if l.href != link.href
|
113
|
-
@warnings
|
122
|
+
@warnings |= ['WARN: Found conflicting cite-as relations. This should never happen']
|
114
123
|
end
|
115
124
|
end
|
116
125
|
if l.href == link.href
|
117
126
|
if l.relation != link.relation
|
118
|
-
@warnings
|
119
|
-
|
120
|
-
|
121
|
-
@warnings << 'WARN: found apparent duplicate. Ignoring and returning known link'
|
127
|
+
@warnings |= ['WARN: Found identical hrefs with different relation types. This may be suspicious. Both have been retained']
|
128
|
+
else
|
129
|
+
@warnings |= ['WARN: found apparent duplicate. Ignoring and returning known link']
|
122
130
|
link = l
|
123
131
|
end
|
124
132
|
end
|
@@ -17,7 +17,7 @@ module LinkHeaders
|
|
17
17
|
#
|
18
18
|
# Works for both HTML and HTTP links, and handles references to Linksets of either JSON or Text types
|
19
19
|
#
|
20
|
-
class
|
20
|
+
class Processor
|
21
21
|
# @return [<Type>] <description>
|
22
22
|
attr_accessor :default_anchor, :factory
|
23
23
|
|
@@ -28,7 +28,7 @@ module LinkHeaders
|
|
28
28
|
#
|
29
29
|
def initialize(default_anchor: 'https://default.anchor.org/')
|
30
30
|
@default_anchor = default_anchor
|
31
|
-
@factory =
|
31
|
+
@factory = LinkHeaders::LinkFactory.new(default_anchor: @default_anchor)
|
32
32
|
end
|
33
33
|
|
34
34
|
#
|
@@ -60,10 +60,14 @@ module LinkHeaders
|
|
60
60
|
return [[], []]
|
61
61
|
end
|
62
62
|
|
63
|
-
parse_http_link_headers(head) # pass guid to check against anchors in linksets
|
63
|
+
newlinks = parse_http_link_headers(head) # pass guid to check against anchors in linksets
|
64
|
+
warn "HTTPlinks #{newlinks.inspect}"
|
65
|
+
|
64
66
|
HTML_FORMATS['html'].each do |format|
|
65
67
|
if head[:content_type] and head[:content_type].match(format)
|
68
|
+
warn "found #{format} content - parsing"
|
66
69
|
htmllinks = parse_html_link_headers(body) # pass html body to find HTML link headers
|
70
|
+
warn "htmllinks #{htmllinks.inspect}"
|
67
71
|
end
|
68
72
|
end
|
69
73
|
end
|
@@ -75,7 +79,7 @@ module LinkHeaders
|
|
75
79
|
#
|
76
80
|
#
|
77
81
|
def parse_http_link_headers(headers)
|
78
|
-
|
82
|
+
newlinks = Array.new
|
79
83
|
# Link: <https://example.one.com>; rel="preconnect", <https://example.two.com>; rel="preconnect", <https://example.three.com>; rel="preconnect"
|
80
84
|
links = headers[:link]
|
81
85
|
return [] unless links
|
@@ -85,11 +89,13 @@ module LinkHeaders
|
|
85
89
|
# warn parts
|
86
90
|
|
87
91
|
# Parse each part into a named link
|
88
|
-
|
89
|
-
check_for_linkset(responsepart: :header) # all links are held in the Linkset::LinkFactory object (factory variable here). This scans the links for a linkset link to follow
|
92
|
+
newlinks << split_http_link_headers_and_process(parts) # creates links from the split headers and adds to factory.all_links
|
93
|
+
newlinks << check_for_linkset(responsepart: :header) # all links are held in the Linkset::LinkFactory object (factory variable here). This scans the links for a linkset link to follow
|
94
|
+
newlinks
|
90
95
|
end
|
91
96
|
|
92
|
-
def
|
97
|
+
def split_http_link_headers_and_process(parts)
|
98
|
+
newlinks = Array.new
|
93
99
|
parts.each do |part, _index|
|
94
100
|
# warn "link is: #{part}"
|
95
101
|
|
@@ -117,9 +123,15 @@ module LinkHeaders
|
|
117
123
|
sections.delete('anchor')
|
118
124
|
relation = sections['rel']
|
119
125
|
sections.delete('rel')
|
126
|
+
relations = relation.split(/\s+/) # handle the multiple relation case
|
127
|
+
$stderr.puts "RELATIONS #{relations}"
|
120
128
|
|
121
|
-
|
129
|
+
relations.each do |rel|
|
130
|
+
next unless rel.match?(/\w/)
|
131
|
+
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **sections) # parsed['https://example.one.com'][:rel] = "preconnect"
|
132
|
+
end
|
122
133
|
end
|
134
|
+
newlinks
|
123
135
|
end
|
124
136
|
|
125
137
|
#
|
@@ -130,9 +142,9 @@ module LinkHeaders
|
|
130
142
|
def parse_html_link_headers(body)
|
131
143
|
m = MetaInspector.new('http://example.org', document: body)
|
132
144
|
# an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
|
133
|
-
|
145
|
+
newlinks = Array.new
|
134
146
|
m.head_links.each do |l|
|
135
|
-
|
147
|
+
warn "HTML head link is: #{l.inspect}"
|
136
148
|
next unless l[:href] and l[:rel] # required
|
137
149
|
|
138
150
|
anchor = l[:anchor] || default_anchor
|
@@ -140,14 +152,23 @@ module LinkHeaders
|
|
140
152
|
relation = l[:rel]
|
141
153
|
l.delete(:rel)
|
142
154
|
href = l[:href]
|
143
|
-
l.delete(:href)
|
144
|
-
|
155
|
+
l.delete(:href)
|
156
|
+
|
157
|
+
relations = relation.split(/\s+/) # handle the multiple relation case
|
158
|
+
$stderr.puts "RELATIONS #{relations}"
|
159
|
+
|
160
|
+
relations.each do |rel|
|
161
|
+
next unless rel.match?(/\w/)
|
162
|
+
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **l) # parsed['https://example.one.com'][:rel] = "preconnect"
|
163
|
+
end
|
145
164
|
end
|
146
|
-
check_for_linkset(responsepart: :body)
|
165
|
+
newlinks << check_for_linkset(responsepart: :body)
|
166
|
+
newlinks
|
147
167
|
end
|
148
168
|
|
149
169
|
def check_for_linkset(responsepart:)
|
150
|
-
|
170
|
+
warn "looking for a linkset"
|
171
|
+
newlinks = Array.new
|
151
172
|
factory.linksets.each do |linkset|
|
152
173
|
# warn "found #{linkset.methods- Object.new.methods}"
|
153
174
|
# warn "inspect #{linkset.inspect}"
|
@@ -156,20 +177,21 @@ module LinkHeaders
|
|
156
177
|
case linkset.type
|
157
178
|
when 'application/linkset+json'
|
158
179
|
# warn "found a json linkset"
|
159
|
-
processJSONLinkset(href: linkset.href)
|
180
|
+
newlinks << processJSONLinkset(href: linkset.href)
|
160
181
|
when 'application/linkset'
|
161
182
|
# warn "found a text linkset"
|
162
|
-
processTextLinkset(href:linkset.href)
|
183
|
+
newlinks << processTextLinkset(href:linkset.href)
|
163
184
|
else
|
164
185
|
warn "the linkset #{linkset} was not typed as 'application/linkset+json' or 'application/linkset', and it should be! (found #{linkset.type}) Ignoring..."
|
165
186
|
end
|
166
187
|
end
|
188
|
+
newlinks
|
167
189
|
end
|
168
190
|
|
169
191
|
def processJSONLinkset(href:)
|
170
192
|
_headers, linkset = fetch(href, { 'Accept' => 'application/linkset+json' })
|
171
193
|
# warn "Linkset body #{linkset.inspect}"
|
172
|
-
|
194
|
+
newlinks = Array.new
|
173
195
|
return nil unless linkset
|
174
196
|
|
175
197
|
# linkset = '{ "linkset":
|
@@ -194,10 +216,10 @@ module LinkHeaders
|
|
194
216
|
attrhash = {}
|
195
217
|
# warn ls.keys, "\n"
|
196
218
|
|
197
|
-
ls.each_key do |
|
219
|
+
ls.each_key do |relation| # key = e.g. "item", "described-by". "cite"
|
198
220
|
# warn reltype, "\n"
|
199
221
|
# warn ls[reltype], "\n"
|
200
|
-
ls[
|
222
|
+
ls[relation].each do |attrs| # attr = e.g. {"href": "http://example.com/foo1", "type": "text/html"}
|
201
223
|
next unless attrs['href'] # this is a required attribute of a linkset relation
|
202
224
|
|
203
225
|
href = attrs['href']
|
@@ -206,12 +228,20 @@ module LinkHeaders
|
|
206
228
|
attrhash[attr.to_sym] = val
|
207
229
|
end
|
208
230
|
end
|
209
|
-
|
231
|
+
|
232
|
+
relations = relation.split(/\s+/) # handle the multiple relation case
|
233
|
+
|
234
|
+
relations.each do |rel|
|
235
|
+
next unless rel.match?(/\w/)
|
236
|
+
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **attrhash) # parsed['https://example.one.com'][:rel] = "preconnect"
|
237
|
+
end
|
210
238
|
end
|
211
239
|
end
|
240
|
+
newlinks
|
212
241
|
end
|
213
242
|
|
214
243
|
def processTextLinkset(href:)
|
244
|
+
newlinks = Array.new
|
215
245
|
headers, linkset = fetch(href, { 'Accept' => 'application/linkset' })
|
216
246
|
# warn "linkset body #{linkset.inspect}"
|
217
247
|
return {} unless linkset
|
@@ -237,14 +267,19 @@ module LinkHeaders
|
|
237
267
|
end
|
238
268
|
warn "No link relation type... this is bad! Skipping" unless attrhash[:rel]
|
239
269
|
next unless attrhash[:rel]
|
240
|
-
|
270
|
+
relation = attrhash[:rel]
|
241
271
|
attrhash.delete(:rel)
|
242
272
|
anchor = attrhash[:anchor] || @default_anchor
|
243
273
|
attrhash.delete(:anchor)
|
244
274
|
|
245
|
-
|
246
|
-
|
275
|
+
relations = relation.split(/\s+/) # handle the multiple relation case
|
276
|
+
#$stderr.puts "RELATIONS #{relations}"
|
277
|
+
relations.each do |rel|
|
278
|
+
next unless rel.match?(/\w/)
|
279
|
+
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **attrhash) # parsed['https://example.one.com'][:rel] = "preconnect"
|
280
|
+
end
|
247
281
|
end
|
282
|
+
newlinks
|
248
283
|
end
|
249
284
|
end
|
250
285
|
end
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkheaders-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '3.11'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '3.11'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rest-client
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|