linkheaders-processor 0.1.8 → 0.1.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -2
- data/Gemfile.lock +2 -2
- data/README.md +13 -7
- data/lib/linkheaders/link.rb +18 -10
- data/lib/linkheaders/processor/version.rb +1 -1
- data/lib/linkheaders/processor.rb +58 -23
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e88dd164547a9a21ce0f1a3ffa85f2af4190ea6a588da445387cdfa2dca7e25d
|
4
|
+
data.tar.gz: f800677c8d4cb18e274defb5dbda5d2f58431cab40899ae15cfb2f866fcf8644
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad4b8814c9ace9def1edd94e53e890c8d534a40d7cad4d55f7dbd0426e1310d29277a1849fd63ebada06fa3c9812a9d189c86ef2635ffe846fe52ff5f4864e2e
|
7
|
+
data.tar.gz: 99573e84fa6eb0412a5223cb188f333c37a8dbb3c877fcf5ea403a71bfc18d57d477920c6000b280fbb8cc920548ddd11ea95dcdb0df4f0a896e30a68fe84d9b
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
linkheaders-processor (0.1.
|
4
|
+
linkheaders-processor (0.1.13)
|
5
5
|
json (~> 2.0)
|
6
6
|
json-ld (~> 3.2)
|
7
7
|
json-ld-preloaded (~> 3.2)
|
@@ -143,7 +143,7 @@ PLATFORMS
|
|
143
143
|
DEPENDENCIES
|
144
144
|
linkheaders-processor!
|
145
145
|
rake (~> 13.0)
|
146
|
-
rspec (~> 3.
|
146
|
+
rspec (~> 3.11)
|
147
147
|
rubocop (~> 1.21)
|
148
148
|
|
149
149
|
BUNDLED WITH
|
data/README.md
CHANGED
@@ -2,24 +2,24 @@
|
|
2
2
|
|
3
3
|
A gem to extract Link Headers from Web responses.
|
4
4
|
|
5
|
-
This module handles HTTP Link Headers, HTML Link Headers, and auto-follows links to LinkSets in both JSON and Text format, and processes them also.
|
5
|
+
This module handles HTTP Link Headers, HTML Link Headers, and auto-follows links to LinkSets in both JSON and Text format, and processes them also. It also handles some unusual cases, such as multiple relation types in a single link, or 204 and 410 responses, where there is no message body.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
9
9
|
Install the gem and add to the application's Gemfile by executing:
|
10
10
|
|
11
|
-
$ bundle add
|
11
|
+
$ bundle add linkheaders-processor
|
12
12
|
|
13
13
|
If bundler is not being used to manage dependencies, install the gem by executing:
|
14
14
|
|
15
|
-
$ gem install
|
15
|
+
$ gem install linkheaders-processor
|
16
16
|
|
17
17
|
## Usage
|
18
18
|
|
19
19
|
|
20
20
|
```
|
21
21
|
|
22
|
-
require '
|
22
|
+
require 'linkheaders/processor'
|
23
23
|
require 'rest-client'
|
24
24
|
|
25
25
|
# url1 has http link headers, and a reference to a linkset in json format
|
@@ -28,27 +28,33 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
28
28
|
# url2 has http link headers, with a reference to a linkset in legacy text format
|
29
29
|
url2 = "https://s11.no/2022/a2a-fair-metrics/28-http-linkset-txt-only/"
|
30
30
|
|
31
|
-
p =
|
31
|
+
p = LinkHeaders::Processor.new(default_anchor: url1)
|
32
32
|
r = RestClient.get(url1)
|
33
33
|
|
34
34
|
p.extract_and_parse(response: r)
|
35
|
-
factory = p.factory #
|
35
|
+
factory = p.factory # LinkHeaders::LinkFactory
|
36
36
|
|
37
37
|
factory.all_links.each do |l|
|
38
38
|
puts l.href
|
39
39
|
puts l.relation
|
40
40
|
puts l.responsepart
|
41
41
|
|
42
|
+
# Additional properties are added as other instance methods
|
43
|
+
# you can access them as follows:
|
44
|
+
|
42
45
|
puts l.linkmethods # returns list of instance methods beyond href and relation, that are attributes of the link
|
43
46
|
l.linkmethods.each do |method|
|
44
47
|
puts "#{method}=" + l.send(method)
|
45
48
|
end
|
49
|
+
# or
|
50
|
+
puts l.type if l.respond_to? 'type'
|
46
51
|
puts
|
52
|
+
|
47
53
|
end
|
48
54
|
|
49
55
|
|
50
56
|
|
51
|
-
p =
|
57
|
+
p = LinkHeaders::Processor.new(default_anchor: url2)
|
52
58
|
r = RestClient.get(url2)
|
53
59
|
|
54
60
|
p.extract_and_parse(response: r)
|
data/lib/linkheaders/link.rb
CHANGED
@@ -5,7 +5,7 @@ module LinkHeaders
|
|
5
5
|
attr_accessor :default_anchor
|
6
6
|
# @return [Array] An array of strings containing any warnings that were encountered when creating the link (e.g. duplicate cite-as but non-identical URLs)
|
7
7
|
attr_accessor :warnings
|
8
|
-
|
8
|
+
attr_accessor :all_links
|
9
9
|
|
10
10
|
#
|
11
11
|
# Create the LinkFactory Object
|
@@ -15,8 +15,10 @@ module LinkHeaders
|
|
15
15
|
def initialize(default_anchor: 'https://example.org/')
|
16
16
|
@default_anchor = default_anchor
|
17
17
|
@warnings = Array.new
|
18
|
+
@all_links = Array.new
|
18
19
|
end
|
19
20
|
|
21
|
+
|
20
22
|
#
|
21
23
|
# Create a new LinkHeader::Link object
|
22
24
|
#
|
@@ -30,9 +32,13 @@ module LinkHeaders
|
|
30
32
|
#
|
31
33
|
def new_link(responsepart:, href:, relation:, anchor: @default_anchor, **kwargs)
|
32
34
|
# warn "creating new link with kw #{kwargs}"
|
33
|
-
|
35
|
+
if relation.split(/\s/).length > 1
|
36
|
+
@warnings |= ['WARN: the link relation contains spaces. This is allowed by the standard to indicate multiple relations for the same link, but this MUST be processed before creating a LinkHeaders::Link object!']
|
37
|
+
end
|
38
|
+
|
39
|
+
link = LinkHeaders::Link.new(responsepart: responsepart, factory: self, href: href, anchor: anchor, relation: relation, **kwargs)
|
34
40
|
link = sanitycheck(link) # this will add warnings if the link already exists and has a conflict. returns the original of a duplicate
|
35
|
-
|
41
|
+
self.all_links |= [link]
|
36
42
|
return link
|
37
43
|
end
|
38
44
|
|
@@ -42,7 +48,7 @@ module LinkHeaders
|
|
42
48
|
# @return [Array] Array of all LinkHeader::Link objects created by the factory so far
|
43
49
|
#
|
44
50
|
def all_links
|
45
|
-
|
51
|
+
@all_links
|
46
52
|
end
|
47
53
|
|
48
54
|
#
|
@@ -106,19 +112,21 @@ module LinkHeaders
|
|
106
112
|
end
|
107
113
|
|
108
114
|
def sanitycheck(link)
|
109
|
-
|
115
|
+
if link.relation == "describedby" and !(link.respond_to? 'type')
|
116
|
+
@warnings |= ['WARN: A describedby link should include a "type" attribute, to know the MIME type of the addressed description']
|
117
|
+
end
|
118
|
+
|
110
119
|
self.all_links.each do |l|
|
111
120
|
if l.relation == "cite-as" and link.relation == "cite-as"
|
112
121
|
if l.href != link.href
|
113
|
-
@warnings
|
122
|
+
@warnings |= ['WARN: Found conflicting cite-as relations. This should never happen']
|
114
123
|
end
|
115
124
|
end
|
116
125
|
if l.href == link.href
|
117
126
|
if l.relation != link.relation
|
118
|
-
@warnings
|
119
|
-
|
120
|
-
|
121
|
-
@warnings << 'WARN: found apparent duplicate. Ignoring and returning known link'
|
127
|
+
@warnings |= ['WARN: Found identical hrefs with different relation types. This may be suspicious. Both have been retained']
|
128
|
+
else
|
129
|
+
@warnings |= ['WARN: found apparent duplicate. Ignoring and returning known link']
|
122
130
|
link = l
|
123
131
|
end
|
124
132
|
end
|
@@ -17,7 +17,7 @@ module LinkHeaders
|
|
17
17
|
#
|
18
18
|
# Works for both HTML and HTTP links, and handles references to Linksets of either JSON or Text types
|
19
19
|
#
|
20
|
-
class
|
20
|
+
class Processor
|
21
21
|
# @return [<Type>] <description>
|
22
22
|
attr_accessor :default_anchor, :factory
|
23
23
|
|
@@ -28,7 +28,7 @@ module LinkHeaders
|
|
28
28
|
#
|
29
29
|
def initialize(default_anchor: 'https://default.anchor.org/')
|
30
30
|
@default_anchor = default_anchor
|
31
|
-
@factory =
|
31
|
+
@factory = LinkHeaders::LinkFactory.new(default_anchor: @default_anchor)
|
32
32
|
end
|
33
33
|
|
34
34
|
#
|
@@ -60,10 +60,14 @@ module LinkHeaders
|
|
60
60
|
return [[], []]
|
61
61
|
end
|
62
62
|
|
63
|
-
parse_http_link_headers(head) # pass guid to check against anchors in linksets
|
63
|
+
newlinks = parse_http_link_headers(head) # pass guid to check against anchors in linksets
|
64
|
+
warn "HTTPlinks #{newlinks.inspect}"
|
65
|
+
|
64
66
|
HTML_FORMATS['html'].each do |format|
|
65
67
|
if head[:content_type] and head[:content_type].match(format)
|
68
|
+
warn "found #{format} content - parsing"
|
66
69
|
htmllinks = parse_html_link_headers(body) # pass html body to find HTML link headers
|
70
|
+
warn "htmllinks #{htmllinks.inspect}"
|
67
71
|
end
|
68
72
|
end
|
69
73
|
end
|
@@ -75,7 +79,7 @@ module LinkHeaders
|
|
75
79
|
#
|
76
80
|
#
|
77
81
|
def parse_http_link_headers(headers)
|
78
|
-
|
82
|
+
newlinks = Array.new
|
79
83
|
# Link: <https://example.one.com>; rel="preconnect", <https://example.two.com>; rel="preconnect", <https://example.three.com>; rel="preconnect"
|
80
84
|
links = headers[:link]
|
81
85
|
return [] unless links
|
@@ -85,11 +89,13 @@ module LinkHeaders
|
|
85
89
|
# warn parts
|
86
90
|
|
87
91
|
# Parse each part into a named link
|
88
|
-
|
89
|
-
check_for_linkset(responsepart: :header) # all links are held in the Linkset::LinkFactory object (factory variable here). This scans the links for a linkset link to follow
|
92
|
+
newlinks << split_http_link_headers_and_process(parts) # creates links from the split headers and adds to factory.all_links
|
93
|
+
newlinks << check_for_linkset(responsepart: :header) # all links are held in the Linkset::LinkFactory object (factory variable here). This scans the links for a linkset link to follow
|
94
|
+
newlinks
|
90
95
|
end
|
91
96
|
|
92
|
-
def
|
97
|
+
def split_http_link_headers_and_process(parts)
|
98
|
+
newlinks = Array.new
|
93
99
|
parts.each do |part, _index|
|
94
100
|
# warn "link is: #{part}"
|
95
101
|
|
@@ -117,9 +123,15 @@ module LinkHeaders
|
|
117
123
|
sections.delete('anchor')
|
118
124
|
relation = sections['rel']
|
119
125
|
sections.delete('rel')
|
126
|
+
relations = relation.split(/\s+/) # handle the multiple relation case
|
127
|
+
$stderr.puts "RELATIONS #{relations}"
|
120
128
|
|
121
|
-
|
129
|
+
relations.each do |rel|
|
130
|
+
next unless rel.match?(/\w/)
|
131
|
+
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **sections) # parsed['https://example.one.com'][:rel] = "preconnect"
|
132
|
+
end
|
122
133
|
end
|
134
|
+
newlinks
|
123
135
|
end
|
124
136
|
|
125
137
|
#
|
@@ -130,9 +142,9 @@ module LinkHeaders
|
|
130
142
|
def parse_html_link_headers(body)
|
131
143
|
m = MetaInspector.new('http://example.org', document: body)
|
132
144
|
# an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
|
133
|
-
|
145
|
+
newlinks = Array.new
|
134
146
|
m.head_links.each do |l|
|
135
|
-
|
147
|
+
warn "HTML head link is: #{l.inspect}"
|
136
148
|
next unless l[:href] and l[:rel] # required
|
137
149
|
|
138
150
|
anchor = l[:anchor] || default_anchor
|
@@ -140,14 +152,23 @@ module LinkHeaders
|
|
140
152
|
relation = l[:rel]
|
141
153
|
l.delete(:rel)
|
142
154
|
href = l[:href]
|
143
|
-
l.delete(:href)
|
144
|
-
|
155
|
+
l.delete(:href)
|
156
|
+
|
157
|
+
relations = relation.split(/\s+/) # handle the multiple relation case
|
158
|
+
$stderr.puts "RELATIONS #{relations}"
|
159
|
+
|
160
|
+
relations.each do |rel|
|
161
|
+
next unless rel.match?(/\w/)
|
162
|
+
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **l) # parsed['https://example.one.com'][:rel] = "preconnect"
|
163
|
+
end
|
145
164
|
end
|
146
|
-
check_for_linkset(responsepart: :body)
|
165
|
+
newlinks << check_for_linkset(responsepart: :body)
|
166
|
+
newlinks
|
147
167
|
end
|
148
168
|
|
149
169
|
def check_for_linkset(responsepart:)
|
150
|
-
|
170
|
+
warn "looking for a linkset"
|
171
|
+
newlinks = Array.new
|
151
172
|
factory.linksets.each do |linkset|
|
152
173
|
# warn "found #{linkset.methods- Object.new.methods}"
|
153
174
|
# warn "inspect #{linkset.inspect}"
|
@@ -156,20 +177,21 @@ module LinkHeaders
|
|
156
177
|
case linkset.type
|
157
178
|
when 'application/linkset+json'
|
158
179
|
# warn "found a json linkset"
|
159
|
-
processJSONLinkset(href: linkset.href)
|
180
|
+
newlinks << processJSONLinkset(href: linkset.href)
|
160
181
|
when 'application/linkset'
|
161
182
|
# warn "found a text linkset"
|
162
|
-
processTextLinkset(href:linkset.href)
|
183
|
+
newlinks << processTextLinkset(href:linkset.href)
|
163
184
|
else
|
164
185
|
warn "the linkset #{linkset} was not typed as 'application/linkset+json' or 'application/linkset', and it should be! (found #{linkset.type}) Ignoring..."
|
165
186
|
end
|
166
187
|
end
|
188
|
+
newlinks
|
167
189
|
end
|
168
190
|
|
169
191
|
def processJSONLinkset(href:)
|
170
192
|
_headers, linkset = fetch(href, { 'Accept' => 'application/linkset+json' })
|
171
193
|
# warn "Linkset body #{linkset.inspect}"
|
172
|
-
|
194
|
+
newlinks = Array.new
|
173
195
|
return nil unless linkset
|
174
196
|
|
175
197
|
# linkset = '{ "linkset":
|
@@ -194,10 +216,10 @@ module LinkHeaders
|
|
194
216
|
attrhash = {}
|
195
217
|
# warn ls.keys, "\n"
|
196
218
|
|
197
|
-
ls.each_key do |
|
219
|
+
ls.each_key do |relation| # key = e.g. "item", "described-by". "cite"
|
198
220
|
# warn reltype, "\n"
|
199
221
|
# warn ls[reltype], "\n"
|
200
|
-
ls[
|
222
|
+
ls[relation].each do |attrs| # attr = e.g. {"href": "http://example.com/foo1", "type": "text/html"}
|
201
223
|
next unless attrs['href'] # this is a required attribute of a linkset relation
|
202
224
|
|
203
225
|
href = attrs['href']
|
@@ -206,12 +228,20 @@ module LinkHeaders
|
|
206
228
|
attrhash[attr.to_sym] = val
|
207
229
|
end
|
208
230
|
end
|
209
|
-
|
231
|
+
|
232
|
+
relations = relation.split(/\s+/) # handle the multiple relation case
|
233
|
+
|
234
|
+
relations.each do |rel|
|
235
|
+
next unless rel.match?(/\w/)
|
236
|
+
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **attrhash) # parsed['https://example.one.com'][:rel] = "preconnect"
|
237
|
+
end
|
210
238
|
end
|
211
239
|
end
|
240
|
+
newlinks
|
212
241
|
end
|
213
242
|
|
214
243
|
def processTextLinkset(href:)
|
244
|
+
newlinks = Array.new
|
215
245
|
headers, linkset = fetch(href, { 'Accept' => 'application/linkset' })
|
216
246
|
# warn "linkset body #{linkset.inspect}"
|
217
247
|
return {} unless linkset
|
@@ -237,14 +267,19 @@ module LinkHeaders
|
|
237
267
|
end
|
238
268
|
warn "No link relation type... this is bad! Skipping" unless attrhash[:rel]
|
239
269
|
next unless attrhash[:rel]
|
240
|
-
|
270
|
+
relation = attrhash[:rel]
|
241
271
|
attrhash.delete(:rel)
|
242
272
|
anchor = attrhash[:anchor] || @default_anchor
|
243
273
|
attrhash.delete(:anchor)
|
244
274
|
|
245
|
-
|
246
|
-
|
275
|
+
relations = relation.split(/\s+/) # handle the multiple relation case
|
276
|
+
#$stderr.puts "RELATIONS #{relations}"
|
277
|
+
relations.each do |rel|
|
278
|
+
next unless rel.match?(/\w/)
|
279
|
+
newlinks << factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: rel, **attrhash) # parsed['https://example.one.com'][:rel] = "preconnect"
|
280
|
+
end
|
247
281
|
end
|
282
|
+
newlinks
|
248
283
|
end
|
249
284
|
end
|
250
285
|
end
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkheaders-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '3.11'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '3.11'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rest-client
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|