linkheaders-processor 0.1.14 → 0.1.18
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -7
- data/lib/linkheaders/processor/version.rb +1 -1
- data/lib/linkheaders/processor.rb +10 -13
- data/lib/linkheaders/web_utils.rb +1 -1
- data/spec/linkheader/parser_spec.rb +30 -0
- metadata +2 -3
- data/lib/linkheaders/constants.rb +0 -29
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c8eb95bf3880ef8dba373d47230b512d5209bcde19581c1064c2cb703bab3abf
|
4
|
+
data.tar.gz: 3ac9096ab4487e30f5e8a78a18cd982f47e232f4dc7e8b9d9573b73ee96b63ad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6de8bcfd72fb78d76483fe9473ece432ccfc6de1cd68eb1ceda5509ec7324e9c010f60a8c996c335c4164cc920a23b046c8ea1a0f9b6edd08f55acfe17e7caca
|
7
|
+
data.tar.gz: 557b9ff9c6f9da8a7f28d3f01e9079f14caf205fe14b017bcc0aa2902b80a89cd32207e98e4ec2642e9d08aae112c22c6835e0821fb7acc1282a3e620771f0cf
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
linkheaders-processor (0.1.14)
|
4
|
+
linkheaders-processor (0.1.18)
|
5
5
|
json (~> 2.0)
|
6
6
|
json-ld (~> 3.2)
|
7
7
|
json-ld-preloaded (~> 3.2)
|
@@ -19,7 +19,7 @@ GEM
|
|
19
19
|
diff-lcs (1.5.0)
|
20
20
|
domain_name (0.5.20190701)
|
21
21
|
unf (>= 0.0.5, < 1.0.0)
|
22
|
-
faraday (1.10.
|
22
|
+
faraday (1.10.1)
|
23
23
|
faraday-em_http (~> 1.0)
|
24
24
|
faraday-em_synchrony (~> 1.0)
|
25
25
|
faraday-excon (~> 1.1)
|
@@ -39,7 +39,7 @@ GEM
|
|
39
39
|
faraday-encoding (0.0.5)
|
40
40
|
faraday
|
41
41
|
faraday-excon (1.1.0)
|
42
|
-
faraday-http-cache (2.4.
|
42
|
+
faraday-http-cache (2.4.1)
|
43
43
|
faraday (>= 0.8)
|
44
44
|
faraday-httpclient (1.0.1)
|
45
45
|
faraday-multipart (1.0.4)
|
@@ -58,13 +58,13 @@ GEM
|
|
58
58
|
domain_name (~> 0.5)
|
59
59
|
json (2.6.2)
|
60
60
|
json-canonicalization (0.3.0)
|
61
|
-
json-ld (3.2.
|
61
|
+
json-ld (3.2.3)
|
62
62
|
htmlentities (~> 4.3)
|
63
63
|
json-canonicalization (~> 0.3)
|
64
64
|
link_header (~> 0.0, >= 0.0.8)
|
65
65
|
multi_json (~> 1.15)
|
66
66
|
rack (~> 2.2)
|
67
|
-
rdf (~> 3.2)
|
67
|
+
rdf (~> 3.2, >= 3.2.9)
|
68
68
|
json-ld-preloaded (3.2.0)
|
69
69
|
json-ld (~> 3.2)
|
70
70
|
rdf (~> 3.2)
|
@@ -96,7 +96,7 @@ GEM
|
|
96
96
|
rack (2.2.4)
|
97
97
|
rainbow (3.1.1)
|
98
98
|
rake (13.0.6)
|
99
|
-
rdf (3.2.
|
99
|
+
rdf (3.2.9)
|
100
100
|
link_header (~> 0.0, >= 0.0.8)
|
101
101
|
regexp_parser (2.5.0)
|
102
102
|
rest-client (2.1.0)
|
@@ -118,7 +118,7 @@ GEM
|
|
118
118
|
diff-lcs (>= 1.2.0, < 2.0)
|
119
119
|
rspec-support (~> 3.11.0)
|
120
120
|
rspec-support (3.11.0)
|
121
|
-
rubocop (1.
|
121
|
+
rubocop (1.33.0)
|
122
122
|
json (~> 2.3)
|
123
123
|
parallel (~> 1.10)
|
124
124
|
parser (>= 3.1.0.0)
|
@@ -1,7 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative 'processor/version'
|
4
|
-
require_relative 'constants'
|
5
4
|
require_relative 'link'
|
6
5
|
require_relative 'web_utils'
|
7
6
|
require 'link_header'
|
@@ -63,10 +62,10 @@ module LinkHeaders
|
|
63
62
|
newlinks = parse_http_link_headers(head) # pass guid to check against anchors in linksets
|
64
63
|
warn "HTTPlinks #{newlinks.inspect}"
|
65
64
|
|
66
|
-
|
65
|
+
['text/html','text/xhtml+xml', 'application/xhtml+xml'].each do |format|
|
67
66
|
if head[:content_type] and head[:content_type].match(format)
|
68
67
|
warn "found #{format} content - parsing"
|
69
|
-
htmllinks = parse_html_link_headers(body) # pass html body to find HTML link headers
|
68
|
+
htmllinks = parse_html_link_headers(body: body, anchor: default_anchor) # pass html body to find HTML link headers
|
70
69
|
warn "htmllinks #{htmllinks.inspect}"
|
71
70
|
end
|
72
71
|
end
|
@@ -124,7 +123,7 @@ module LinkHeaders
|
|
124
123
|
relation = sections['rel']
|
125
124
|
sections.delete('rel')
|
126
125
|
relations = relation.split(/\s+/) # handle the multiple relation case
|
127
|
-
|
126
|
+
warn "RELATIONS #{relations}"
|
128
127
|
|
129
128
|
relations.each do |rel|
|
130
129
|
next unless rel.match?(/\w/)
|
@@ -139,8 +138,8 @@ module LinkHeaders
|
|
139
138
|
#
|
140
139
|
# @param [String] body The HTML of the page containing HTML Link headers
|
141
140
|
#
|
142
|
-
def parse_html_link_headers(body)
|
143
|
-
m = MetaInspector.new(
|
141
|
+
def parse_html_link_headers(body:, anchor: '')
|
142
|
+
m = MetaInspector.new(anchor, document: body)
|
144
143
|
# an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
|
145
144
|
newlinks = Array.new
|
146
145
|
m.head_links.each do |l|
|
@@ -155,7 +154,7 @@ module LinkHeaders
|
|
155
154
|
l.delete(:href)
|
156
155
|
|
157
156
|
relations = relation.split(/\s+/) # handle the multiple relation case
|
158
|
-
|
157
|
+
warn "RELATIONS #{relations}"
|
159
158
|
|
160
159
|
relations.each do |rel|
|
161
160
|
next unless rel.match?(/\w/)
|
@@ -246,13 +245,13 @@ module LinkHeaders
|
|
246
245
|
# warn "linkset body #{linkset.inspect}"
|
247
246
|
return {} unless linkset
|
248
247
|
|
249
|
-
links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
|
248
|
+
# links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
|
249
|
+
links = linkset.split(/,\n*/) # split on the comma+newline
|
250
250
|
# warn "Links found #{links}"
|
251
251
|
|
252
252
|
links.each do |ls|
|
253
|
-
# warn "
|
254
|
-
|
255
|
-
elements = ls.split(';') # semicolon delimited fields
|
253
|
+
# warn "working on link #{ls}"
|
254
|
+
elements = ls.split(';').map {|element| element.strip!} # semicolon delimited fields
|
256
255
|
# ["<https://w3id.org/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/>", "anchor=\"https://s11.no/2022/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/\"", "rel=\"cite-as\""]
|
257
256
|
href = elements.shift # first element is always the link url
|
258
257
|
# warn "working on link href #{href}"
|
@@ -260,8 +259,6 @@ module LinkHeaders
|
|
260
259
|
attrhash = {}
|
261
260
|
elements.each do |e|
|
262
261
|
key, val = e.split('=')
|
263
|
-
key.strip!
|
264
|
-
val.strip!
|
265
262
|
val.delete_prefix!('"').delete_suffix!('"') # get rid of newlines and start/end quotes
|
266
263
|
attrhash[key.to_sym] = val # split on key=val and make key a symbol
|
267
264
|
end
|
@@ -1,7 +1,37 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
require_relative '../../lib/linkheaders/processor'
|
3
|
+
require 'rest-client'
|
4
|
+
|
5
|
+
|
6
|
+
url1 = "https://w3id.org/a2a-fair-metrics/22-http-html-citeas-describedby-mixed/"
|
7
|
+
p = LinkHeaders::Processor.new(default_anchor: url1)
|
8
|
+
r = RestClient.get(url1)
|
9
|
+
p.extract_and_parse(response: r)
|
10
|
+
factory = p.factory # LinkHeaders::LinkFactory
|
11
|
+
|
2
12
|
|
3
13
|
RSpec.describe LinkHeaders::Processor do
|
14
|
+
|
4
15
|
it 'has a version number' do
|
5
16
|
expect(LinkHeaders::Processor::VERSION).not_to be nil
|
6
17
|
end
|
18
|
+
|
19
|
+
it "should find PURL citeas which has described-by and cite-as in mixed HTTP and HTML headers" do
|
20
|
+
expect(factory.all_links.length).to eq 5
|
21
|
+
end
|
22
|
+
it "should find find href on all links" do
|
23
|
+
expect(factory.all_links.select{|l| l.href}.length).to eq 5
|
24
|
+
end
|
25
|
+
it "should find find href on all links" do
|
26
|
+
expect(factory.all_links.select{|l| l.anchor}.length).to eq 5
|
27
|
+
end
|
28
|
+
it "should find 5 links in mixed HTTP and HTML headers" do
|
29
|
+
expect(factory.all_links.select{|l| l.relation}.length).to eq 5
|
30
|
+
end
|
31
|
+
it "should find one citeas in mixed HTTP and HTML headers" do
|
32
|
+
expect(factory.all_links.select{|l| l.relation == 'cite-as'}.length).to eq 1
|
33
|
+
end
|
34
|
+
it "should find described-by in mixed HTTP and HTML headers" do
|
35
|
+
expect(factory.all_links.select{|l| l.relation == 'describedby'}.length).to eq 1
|
36
|
+
end
|
7
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkheaders-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.14
|
4
|
+
version: 0.1.18
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-08-
|
11
|
+
date: 2022-08-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -137,7 +137,6 @@ files:
|
|
137
137
|
- README.md
|
138
138
|
- Rakefile
|
139
139
|
- launch.json
|
140
|
-
- lib/linkheaders/constants.rb
|
141
140
|
- lib/linkheaders/link.rb
|
142
141
|
- lib/linkheaders/processor.rb
|
143
142
|
- lib/linkheaders/processor/version.rb
|
@@ -1,29 +0,0 @@
|
|
1
|
-
ACCEPT_ALL_HEADER = {'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
|
2
|
-
ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
|
3
|
-
|
4
|
-
TEXT_FORMATS = {
|
5
|
-
'text' => ['text/plain',],
|
6
|
-
}
|
7
|
-
|
8
|
-
RDF_FORMATS = {
|
9
|
-
'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
|
10
|
-
'turtle' => ['text/turtle','application/n3','application/rdf+n3',
|
11
|
-
'application/turtle', 'application/x-turtle','text/n3','text/turtle',
|
12
|
-
'text/rdf+n3', 'text/rdf+turtle'],
|
13
|
-
#'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
|
14
|
-
'rdfxml' => ['application/rdf+xml'],
|
15
|
-
'triples' => ['application/n-triples','application/n-quads', 'application/trig']
|
16
|
-
}
|
17
|
-
|
18
|
-
XML_FORMATS = {
|
19
|
-
'xml' => ['text/xhtml','text/xml',]
|
20
|
-
}
|
21
|
-
|
22
|
-
HTML_FORMATS = {
|
23
|
-
'html' => ['text/html','text/xhtml+xml', 'application/xhtml+xml']
|
24
|
-
}
|
25
|
-
|
26
|
-
JSON_FORMATS = {
|
27
|
-
'json' => ['application/json',]
|
28
|
-
}
|
29
|
-
|