linkheaders-processor 0.1.14 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -7
- data/lib/linkheaders/processor/version.rb +1 -1
- data/lib/linkheaders/processor.rb +10 -13
- data/lib/linkheaders/web_utils.rb +1 -1
- data/spec/linkheader/parser_spec.rb +30 -0
- metadata +2 -3
- data/lib/linkheaders/constants.rb +0 -29
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c8eb95bf3880ef8dba373d47230b512d5209bcde19581c1064c2cb703bab3abf
|
|
4
|
+
data.tar.gz: 3ac9096ab4487e30f5e8a78a18cd982f47e232f4dc7e8b9d9573b73ee96b63ad
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6de8bcfd72fb78d76483fe9473ece432ccfc6de1cd68eb1ceda5509ec7324e9c010f60a8c996c335c4164cc920a23b046c8ea1a0f9b6edd08f55acfe17e7caca
|
|
7
|
+
data.tar.gz: 557b9ff9c6f9da8a7f28d3f01e9079f14caf205fe14b017bcc0aa2902b80a89cd32207e98e4ec2642e9d08aae112c22c6835e0821fb7acc1282a3e620771f0cf
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
linkheaders-processor (0.1.
|
|
4
|
+
linkheaders-processor (0.1.18)
|
|
5
5
|
json (~> 2.0)
|
|
6
6
|
json-ld (~> 3.2)
|
|
7
7
|
json-ld-preloaded (~> 3.2)
|
|
@@ -19,7 +19,7 @@ GEM
|
|
|
19
19
|
diff-lcs (1.5.0)
|
|
20
20
|
domain_name (0.5.20190701)
|
|
21
21
|
unf (>= 0.0.5, < 1.0.0)
|
|
22
|
-
faraday (1.10.
|
|
22
|
+
faraday (1.10.1)
|
|
23
23
|
faraday-em_http (~> 1.0)
|
|
24
24
|
faraday-em_synchrony (~> 1.0)
|
|
25
25
|
faraday-excon (~> 1.1)
|
|
@@ -39,7 +39,7 @@ GEM
|
|
|
39
39
|
faraday-encoding (0.0.5)
|
|
40
40
|
faraday
|
|
41
41
|
faraday-excon (1.1.0)
|
|
42
|
-
faraday-http-cache (2.4.
|
|
42
|
+
faraday-http-cache (2.4.1)
|
|
43
43
|
faraday (>= 0.8)
|
|
44
44
|
faraday-httpclient (1.0.1)
|
|
45
45
|
faraday-multipart (1.0.4)
|
|
@@ -58,13 +58,13 @@ GEM
|
|
|
58
58
|
domain_name (~> 0.5)
|
|
59
59
|
json (2.6.2)
|
|
60
60
|
json-canonicalization (0.3.0)
|
|
61
|
-
json-ld (3.2.
|
|
61
|
+
json-ld (3.2.3)
|
|
62
62
|
htmlentities (~> 4.3)
|
|
63
63
|
json-canonicalization (~> 0.3)
|
|
64
64
|
link_header (~> 0.0, >= 0.0.8)
|
|
65
65
|
multi_json (~> 1.15)
|
|
66
66
|
rack (~> 2.2)
|
|
67
|
-
rdf (~> 3.2)
|
|
67
|
+
rdf (~> 3.2, >= 3.2.9)
|
|
68
68
|
json-ld-preloaded (3.2.0)
|
|
69
69
|
json-ld (~> 3.2)
|
|
70
70
|
rdf (~> 3.2)
|
|
@@ -96,7 +96,7 @@ GEM
|
|
|
96
96
|
rack (2.2.4)
|
|
97
97
|
rainbow (3.1.1)
|
|
98
98
|
rake (13.0.6)
|
|
99
|
-
rdf (3.2.
|
|
99
|
+
rdf (3.2.9)
|
|
100
100
|
link_header (~> 0.0, >= 0.0.8)
|
|
101
101
|
regexp_parser (2.5.0)
|
|
102
102
|
rest-client (2.1.0)
|
|
@@ -118,7 +118,7 @@ GEM
|
|
|
118
118
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
119
119
|
rspec-support (~> 3.11.0)
|
|
120
120
|
rspec-support (3.11.0)
|
|
121
|
-
rubocop (1.
|
|
121
|
+
rubocop (1.33.0)
|
|
122
122
|
json (~> 2.3)
|
|
123
123
|
parallel (~> 1.10)
|
|
124
124
|
parser (>= 3.1.0.0)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative 'processor/version'
|
|
4
|
-
require_relative 'constants'
|
|
5
4
|
require_relative 'link'
|
|
6
5
|
require_relative 'web_utils'
|
|
7
6
|
require 'link_header'
|
|
@@ -63,10 +62,10 @@ module LinkHeaders
|
|
|
63
62
|
newlinks = parse_http_link_headers(head) # pass guid to check against anchors in linksets
|
|
64
63
|
warn "HTTPlinks #{newlinks.inspect}"
|
|
65
64
|
|
|
66
|
-
|
|
65
|
+
['text/html','text/xhtml+xml', 'application/xhtml+xml'].each do |format|
|
|
67
66
|
if head[:content_type] and head[:content_type].match(format)
|
|
68
67
|
warn "found #{format} content - parsing"
|
|
69
|
-
htmllinks = parse_html_link_headers(body) # pass html body to find HTML link headers
|
|
68
|
+
htmllinks = parse_html_link_headers(body: body, anchor: default_anchor) # pass html body to find HTML link headers
|
|
70
69
|
warn "htmllinks #{htmllinks.inspect}"
|
|
71
70
|
end
|
|
72
71
|
end
|
|
@@ -124,7 +123,7 @@ module LinkHeaders
|
|
|
124
123
|
relation = sections['rel']
|
|
125
124
|
sections.delete('rel')
|
|
126
125
|
relations = relation.split(/\s+/) # handle the multiple relation case
|
|
127
|
-
|
|
126
|
+
warn "RELATIONS #{relations}"
|
|
128
127
|
|
|
129
128
|
relations.each do |rel|
|
|
130
129
|
next unless rel.match?(/\w/)
|
|
@@ -139,8 +138,8 @@ module LinkHeaders
|
|
|
139
138
|
#
|
|
140
139
|
# @param [String] body The HTML of the page containing HTML Link headers
|
|
141
140
|
#
|
|
142
|
-
def parse_html_link_headers(body)
|
|
143
|
-
m = MetaInspector.new(
|
|
141
|
+
def parse_html_link_headers(body:, anchor: '')
|
|
142
|
+
m = MetaInspector.new(anchor, document: body)
|
|
144
143
|
# an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
|
|
145
144
|
newlinks = Array.new
|
|
146
145
|
m.head_links.each do |l|
|
|
@@ -155,7 +154,7 @@ module LinkHeaders
|
|
|
155
154
|
l.delete(:href)
|
|
156
155
|
|
|
157
156
|
relations = relation.split(/\s+/) # handle the multiple relation case
|
|
158
|
-
|
|
157
|
+
warn "RELATIONS #{relations}"
|
|
159
158
|
|
|
160
159
|
relations.each do |rel|
|
|
161
160
|
next unless rel.match?(/\w/)
|
|
@@ -246,13 +245,13 @@ module LinkHeaders
|
|
|
246
245
|
# warn "linkset body #{linkset.inspect}"
|
|
247
246
|
return {} unless linkset
|
|
248
247
|
|
|
249
|
-
links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
|
|
248
|
+
# links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
|
|
249
|
+
links = linkset.split(/,\n*/) # split on the comma+newline
|
|
250
250
|
# warn "Links found #{links}"
|
|
251
251
|
|
|
252
252
|
links.each do |ls|
|
|
253
|
-
# warn "
|
|
254
|
-
|
|
255
|
-
elements = ls.split(';') # semicolon delimited fields
|
|
253
|
+
# warn "working on link #{ls}"
|
|
254
|
+
elements = ls.split(';').map {|element| element.strip!} # semicolon delimited fields
|
|
256
255
|
# ["<https://w3id.org/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/>", "anchor=\"https://s11.no/2022/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/\"", "rel=\"cite-as\""]
|
|
257
256
|
href = elements.shift # first element is always the link url
|
|
258
257
|
# warn "working on link href #{href}"
|
|
@@ -260,8 +259,6 @@ module LinkHeaders
|
|
|
260
259
|
attrhash = {}
|
|
261
260
|
elements.each do |e|
|
|
262
261
|
key, val = e.split('=')
|
|
263
|
-
key.strip!
|
|
264
|
-
val.strip!
|
|
265
262
|
val.delete_prefix!('"').delete_suffix!('"') # get rid of newlines and start/end quotes
|
|
266
263
|
attrhash[key.to_sym] = val # split on key=val and make key a symbol
|
|
267
264
|
end
|
|
@@ -1,7 +1,37 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
|
+
require_relative '../../lib/linkheaders/processor'
|
|
3
|
+
require 'rest-client'
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
url1 = "https://w3id.org/a2a-fair-metrics/22-http-html-citeas-describedby-mixed/"
|
|
7
|
+
p = LinkHeaders::Processor.new(default_anchor: url1)
|
|
8
|
+
r = RestClient.get(url1)
|
|
9
|
+
p.extract_and_parse(response: r)
|
|
10
|
+
factory = p.factory # LinkHeaders::LinkFactory
|
|
11
|
+
|
|
2
12
|
|
|
3
13
|
RSpec.describe LinkHeaders::Processor do
|
|
14
|
+
|
|
4
15
|
it 'has a version number' do
|
|
5
16
|
expect(LinkHeaders::Processor::VERSION).not_to be nil
|
|
6
17
|
end
|
|
18
|
+
|
|
19
|
+
it "should find PURL citeas which has described-by and cite-as in mixed HTTP and HTML headers" do
|
|
20
|
+
expect(factory.all_links.length).to eq 5
|
|
21
|
+
end
|
|
22
|
+
it "should find find href on all links" do
|
|
23
|
+
expect(factory.all_links.select{|l| l.href}.length).to eq 5
|
|
24
|
+
end
|
|
25
|
+
it "should find find href on all links" do
|
|
26
|
+
expect(factory.all_links.select{|l| l.anchor}.length).to eq 5
|
|
27
|
+
end
|
|
28
|
+
it "should find 5 links in mixed HTTP and HTML headers" do
|
|
29
|
+
expect(factory.all_links.select{|l| l.relation}.length).to eq 5
|
|
30
|
+
end
|
|
31
|
+
it "should find one citeas in mixed HTTP and HTML headers" do
|
|
32
|
+
expect(factory.all_links.select{|l| l.relation == 'cite-as'}.length).to eq 1
|
|
33
|
+
end
|
|
34
|
+
it "should find described-by in mixed HTTP and HTML headers" do
|
|
35
|
+
expect(factory.all_links.select{|l| l.relation == 'describedby'}.length).to eq 1
|
|
36
|
+
end
|
|
7
37
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: linkheaders-processor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.18
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Mark Wilkinson
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2022-08-
|
|
11
|
+
date: 2022-08-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rspec
|
|
@@ -137,7 +137,6 @@ files:
|
|
|
137
137
|
- README.md
|
|
138
138
|
- Rakefile
|
|
139
139
|
- launch.json
|
|
140
|
-
- lib/linkheaders/constants.rb
|
|
141
140
|
- lib/linkheaders/link.rb
|
|
142
141
|
- lib/linkheaders/processor.rb
|
|
143
142
|
- lib/linkheaders/processor/version.rb
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
ACCEPT_ALL_HEADER = {'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
|
|
2
|
-
ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
|
|
3
|
-
|
|
4
|
-
TEXT_FORMATS = {
|
|
5
|
-
'text' => ['text/plain',],
|
|
6
|
-
}
|
|
7
|
-
|
|
8
|
-
RDF_FORMATS = {
|
|
9
|
-
'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
|
|
10
|
-
'turtle' => ['text/turtle','application/n3','application/rdf+n3',
|
|
11
|
-
'application/turtle', 'application/x-turtle','text/n3','text/turtle',
|
|
12
|
-
'text/rdf+n3', 'text/rdf+turtle'],
|
|
13
|
-
#'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
|
|
14
|
-
'rdfxml' => ['application/rdf+xml'],
|
|
15
|
-
'triples' => ['application/n-triples','application/n-quads', 'application/trig']
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
XML_FORMATS = {
|
|
19
|
-
'xml' => ['text/xhtml','text/xml',]
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
HTML_FORMATS = {
|
|
23
|
-
'html' => ['text/html','text/xhtml+xml', 'application/xhtml+xml']
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
JSON_FORMATS = {
|
|
27
|
-
'json' => ['application/json',]
|
|
28
|
-
}
|
|
29
|
-
|