linkheaders-processor 0.1.14 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4df2bc9ea84550e0de7be8110f49121ea84962ac797b01860858a445213787d5
4
- data.tar.gz: ea148084a6ee71bb8e5509ae94c1dde4714dc66e609527b8c186362cc71c7c76
3
+ metadata.gz: c8eb95bf3880ef8dba373d47230b512d5209bcde19581c1064c2cb703bab3abf
4
+ data.tar.gz: 3ac9096ab4487e30f5e8a78a18cd982f47e232f4dc7e8b9d9573b73ee96b63ad
5
5
  SHA512:
6
- metadata.gz: 9b66ae4ca8d0b48f71d4d33f2bccc1575abc5efe65ebe7bc756cab9fb47836d06fe9dc011ae0e55b0a3be6db331e3018e62d006bb8140b181538b65dc66cbab1
7
- data.tar.gz: 8c2ae044bd1a111f915ea419932b431227b344386ad1824d4e1fc6f86269c2c513d72918d693addf0fa6e0ae0a3fcc2cabb9fa9c27542b3f5bed4d1d5edbf3d9
6
+ metadata.gz: 6de8bcfd72fb78d76483fe9473ece432ccfc6de1cd68eb1ceda5509ec7324e9c010f60a8c996c335c4164cc920a23b046c8ea1a0f9b6edd08f55acfe17e7caca
7
+ data.tar.gz: 557b9ff9c6f9da8a7f28d3f01e9079f14caf205fe14b017bcc0aa2902b80a89cd32207e98e4ec2642e9d08aae112c22c6835e0821fb7acc1282a3e620771f0cf
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- linkheaders-processor (0.1.14)
4
+ linkheaders-processor (0.1.18)
5
5
  json (~> 2.0)
6
6
  json-ld (~> 3.2)
7
7
  json-ld-preloaded (~> 3.2)
@@ -19,7 +19,7 @@ GEM
19
19
  diff-lcs (1.5.0)
20
20
  domain_name (0.5.20190701)
21
21
  unf (>= 0.0.5, < 1.0.0)
22
- faraday (1.10.0)
22
+ faraday (1.10.1)
23
23
  faraday-em_http (~> 1.0)
24
24
  faraday-em_synchrony (~> 1.0)
25
25
  faraday-excon (~> 1.1)
@@ -39,7 +39,7 @@ GEM
39
39
  faraday-encoding (0.0.5)
40
40
  faraday
41
41
  faraday-excon (1.1.0)
42
- faraday-http-cache (2.4.0)
42
+ faraday-http-cache (2.4.1)
43
43
  faraday (>= 0.8)
44
44
  faraday-httpclient (1.0.1)
45
45
  faraday-multipart (1.0.4)
@@ -58,13 +58,13 @@ GEM
58
58
  domain_name (~> 0.5)
59
59
  json (2.6.2)
60
60
  json-canonicalization (0.3.0)
61
- json-ld (3.2.1)
61
+ json-ld (3.2.3)
62
62
  htmlentities (~> 4.3)
63
63
  json-canonicalization (~> 0.3)
64
64
  link_header (~> 0.0, >= 0.0.8)
65
65
  multi_json (~> 1.15)
66
66
  rack (~> 2.2)
67
- rdf (~> 3.2)
67
+ rdf (~> 3.2, >= 3.2.9)
68
68
  json-ld-preloaded (3.2.0)
69
69
  json-ld (~> 3.2)
70
70
  rdf (~> 3.2)
@@ -96,7 +96,7 @@ GEM
96
96
  rack (2.2.4)
97
97
  rainbow (3.1.1)
98
98
  rake (13.0.6)
99
- rdf (3.2.8)
99
+ rdf (3.2.9)
100
100
  link_header (~> 0.0, >= 0.0.8)
101
101
  regexp_parser (2.5.0)
102
102
  rest-client (2.1.0)
@@ -118,7 +118,7 @@ GEM
118
118
  diff-lcs (>= 1.2.0, < 2.0)
119
119
  rspec-support (~> 3.11.0)
120
120
  rspec-support (3.11.0)
121
- rubocop (1.32.0)
121
+ rubocop (1.33.0)
122
122
  json (~> 2.3)
123
123
  parallel (~> 1.10)
124
124
  parser (>= 3.1.0.0)
@@ -3,6 +3,6 @@
3
3
 
4
4
  module LinkHeaders
5
5
  class Processor
6
- VERSION = "0.1.14"
6
+ VERSION = "0.1.18"
7
7
  end
8
8
  end
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'processor/version'
4
- require_relative 'constants'
5
4
  require_relative 'link'
6
5
  require_relative 'web_utils'
7
6
  require 'link_header'
@@ -63,10 +62,10 @@ module LinkHeaders
63
62
  newlinks = parse_http_link_headers(head) # pass guid to check against anchors in linksets
64
63
  warn "HTTPlinks #{newlinks.inspect}"
65
64
 
66
- HTML_FORMATS['html'].each do |format|
65
+ ['text/html','text/xhtml+xml', 'application/xhtml+xml'].each do |format|
67
66
  if head[:content_type] and head[:content_type].match(format)
68
67
  warn "found #{format} content - parsing"
69
- htmllinks = parse_html_link_headers(body) # pass html body to find HTML link headers
68
+ htmllinks = parse_html_link_headers(body: body, anchor: default_anchor) # pass html body to find HTML link headers
70
69
  warn "htmllinks #{htmllinks.inspect}"
71
70
  end
72
71
  end
@@ -124,7 +123,7 @@ module LinkHeaders
124
123
  relation = sections['rel']
125
124
  sections.delete('rel')
126
125
  relations = relation.split(/\s+/) # handle the multiple relation case
127
- $stderr.puts "RELATIONS #{relations}"
126
+ warn "RELATIONS #{relations}"
128
127
 
129
128
  relations.each do |rel|
130
129
  next unless rel.match?(/\w/)
@@ -139,8 +138,8 @@ module LinkHeaders
139
138
  #
140
139
  # @param [String] body The HTML of the page containing HTML Link headers
141
140
  #
142
- def parse_html_link_headers(body)
143
- m = MetaInspector.new('http://example.org', document: body)
141
+ def parse_html_link_headers(body:, anchor: '')
142
+ m = MetaInspector.new(anchor, document: body)
144
143
  # an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
145
144
  newlinks = Array.new
146
145
  m.head_links.each do |l|
@@ -155,7 +154,7 @@ module LinkHeaders
155
154
  l.delete(:href)
156
155
 
157
156
  relations = relation.split(/\s+/) # handle the multiple relation case
158
- $stderr.puts "RELATIONS #{relations}"
157
+ warn "RELATIONS #{relations}"
159
158
 
160
159
  relations.each do |rel|
161
160
  next unless rel.match?(/\w/)
@@ -246,13 +245,13 @@ module LinkHeaders
246
245
  # warn "linkset body #{linkset.inspect}"
247
246
  return {} unless linkset
248
247
 
249
- links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
248
+ # links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
249
+ links = linkset.split(/,\n*/) # split on the comma+newline
250
250
  # warn "Links found #{links}"
251
251
 
252
252
  links.each do |ls|
253
- # warn "workking on link #{ls}"
254
- ls = ls.first # ls is a single element array
255
- elements = ls.split(';') # semicolon delimited fields
253
+ # warn "working on link #{ls}"
254
+ elements = ls.split(';').map {|element| element.strip!} # semicolon delimited fields
256
255
  # ["<https://w3id.org/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/>", "anchor=\"https://s11.no/2022/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/\"", "rel=\"cite-as\""]
257
256
  href = elements.shift # first element is always the link url
258
257
  # warn "working on link href #{href}"
@@ -260,8 +259,6 @@ module LinkHeaders
260
259
  attrhash = {}
261
260
  elements.each do |e|
262
261
  key, val = e.split('=')
263
- key.strip!
264
- val.strip!
265
262
  val.delete_prefix!('"').delete_suffix!('"') # get rid of newlines and start/end quotes
266
263
  attrhash[key.to_sym] = val # split on key=val and make key a symbol
267
264
  end
@@ -1,4 +1,4 @@
1
- def lhfetch(url, headers = ACCEPT_STAR_HEADER)
1
+ def lhfetch(url, headers = {accept: "*/*"})
2
2
  # warn "In fetch routine now. "
3
3
 
4
4
  # warn "executing call over the Web to #{url.to_s}"
@@ -1,7 +1,37 @@
1
1
  # frozen_string_literal: true
2
+ require_relative '../../lib/linkheaders/processor'
3
+ require 'rest-client'
4
+
5
+
6
+ url1 = "https://w3id.org/a2a-fair-metrics/22-http-html-citeas-describedby-mixed/"
7
+ p = LinkHeaders::Processor.new(default_anchor: url1)
8
+ r = RestClient.get(url1)
9
+ p.extract_and_parse(response: r)
10
+ factory = p.factory # LinkHeaders::LinkFactory
11
+
2
12
 
3
13
  RSpec.describe LinkHeaders::Processor do
14
+
4
15
  it 'has a version number' do
5
16
  expect(LinkHeaders::Processor::VERSION).not_to be nil
6
17
  end
18
+
19
+ it "should find PURL citeas which has described-by and cite-as in mixed HTTP and HTML headers" do
20
+ expect(factory.all_links.length).to eq 5
21
+ end
22
+ it "should find find href on all links" do
23
+ expect(factory.all_links.select{|l| l.href}.length).to eq 5
24
+ end
25
+ it "should find find href on all links" do
26
+ expect(factory.all_links.select{|l| l.anchor}.length).to eq 5
27
+ end
28
+ it "should find 5 links in mixed HTTP and HTML headers" do
29
+ expect(factory.all_links.select{|l| l.relation}.length).to eq 5
30
+ end
31
+ it "should find one citeas in mixed HTTP and HTML headers" do
32
+ expect(factory.all_links.select{|l| l.relation == 'cite-as'}.length).to eq 1
33
+ end
34
+ it "should find described-by in mixed HTTP and HTML headers" do
35
+ expect(factory.all_links.select{|l| l.relation == 'describedby'}.length).to eq 1
36
+ end
7
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkheaders-processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.14
4
+ version: 0.1.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-04 00:00:00.000000000 Z
11
+ date: 2022-08-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -137,7 +137,6 @@ files:
137
137
  - README.md
138
138
  - Rakefile
139
139
  - launch.json
140
- - lib/linkheaders/constants.rb
141
140
  - lib/linkheaders/link.rb
142
141
  - lib/linkheaders/processor.rb
143
142
  - lib/linkheaders/processor/version.rb
@@ -1,29 +0,0 @@
1
- ACCEPT_ALL_HEADER = {'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
2
- ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
3
-
4
- TEXT_FORMATS = {
5
- 'text' => ['text/plain',],
6
- }
7
-
8
- RDF_FORMATS = {
9
- 'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
10
- 'turtle' => ['text/turtle','application/n3','application/rdf+n3',
11
- 'application/turtle', 'application/x-turtle','text/n3','text/turtle',
12
- 'text/rdf+n3', 'text/rdf+turtle'],
13
- #'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
14
- 'rdfxml' => ['application/rdf+xml'],
15
- 'triples' => ['application/n-triples','application/n-quads', 'application/trig']
16
- }
17
-
18
- XML_FORMATS = {
19
- 'xml' => ['text/xhtml','text/xml',]
20
- }
21
-
22
- HTML_FORMATS = {
23
- 'html' => ['text/html','text/xhtml+xml', 'application/xhtml+xml']
24
- }
25
-
26
- JSON_FORMATS = {
27
- 'json' => ['application/json',]
28
- }
29
-