linkheaders-processor 0.1.14 → 0.1.18

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4df2bc9ea84550e0de7be8110f49121ea84962ac797b01860858a445213787d5
4
- data.tar.gz: ea148084a6ee71bb8e5509ae94c1dde4714dc66e609527b8c186362cc71c7c76
3
+ metadata.gz: c8eb95bf3880ef8dba373d47230b512d5209bcde19581c1064c2cb703bab3abf
4
+ data.tar.gz: 3ac9096ab4487e30f5e8a78a18cd982f47e232f4dc7e8b9d9573b73ee96b63ad
5
5
  SHA512:
6
- metadata.gz: 9b66ae4ca8d0b48f71d4d33f2bccc1575abc5efe65ebe7bc756cab9fb47836d06fe9dc011ae0e55b0a3be6db331e3018e62d006bb8140b181538b65dc66cbab1
7
- data.tar.gz: 8c2ae044bd1a111f915ea419932b431227b344386ad1824d4e1fc6f86269c2c513d72918d693addf0fa6e0ae0a3fcc2cabb9fa9c27542b3f5bed4d1d5edbf3d9
6
+ metadata.gz: 6de8bcfd72fb78d76483fe9473ece432ccfc6de1cd68eb1ceda5509ec7324e9c010f60a8c996c335c4164cc920a23b046c8ea1a0f9b6edd08f55acfe17e7caca
7
+ data.tar.gz: 557b9ff9c6f9da8a7f28d3f01e9079f14caf205fe14b017bcc0aa2902b80a89cd32207e98e4ec2642e9d08aae112c22c6835e0821fb7acc1282a3e620771f0cf
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- linkheaders-processor (0.1.14)
4
+ linkheaders-processor (0.1.18)
5
5
  json (~> 2.0)
6
6
  json-ld (~> 3.2)
7
7
  json-ld-preloaded (~> 3.2)
@@ -19,7 +19,7 @@ GEM
19
19
  diff-lcs (1.5.0)
20
20
  domain_name (0.5.20190701)
21
21
  unf (>= 0.0.5, < 1.0.0)
22
- faraday (1.10.0)
22
+ faraday (1.10.1)
23
23
  faraday-em_http (~> 1.0)
24
24
  faraday-em_synchrony (~> 1.0)
25
25
  faraday-excon (~> 1.1)
@@ -39,7 +39,7 @@ GEM
39
39
  faraday-encoding (0.0.5)
40
40
  faraday
41
41
  faraday-excon (1.1.0)
42
- faraday-http-cache (2.4.0)
42
+ faraday-http-cache (2.4.1)
43
43
  faraday (>= 0.8)
44
44
  faraday-httpclient (1.0.1)
45
45
  faraday-multipart (1.0.4)
@@ -58,13 +58,13 @@ GEM
58
58
  domain_name (~> 0.5)
59
59
  json (2.6.2)
60
60
  json-canonicalization (0.3.0)
61
- json-ld (3.2.1)
61
+ json-ld (3.2.3)
62
62
  htmlentities (~> 4.3)
63
63
  json-canonicalization (~> 0.3)
64
64
  link_header (~> 0.0, >= 0.0.8)
65
65
  multi_json (~> 1.15)
66
66
  rack (~> 2.2)
67
- rdf (~> 3.2)
67
+ rdf (~> 3.2, >= 3.2.9)
68
68
  json-ld-preloaded (3.2.0)
69
69
  json-ld (~> 3.2)
70
70
  rdf (~> 3.2)
@@ -96,7 +96,7 @@ GEM
96
96
  rack (2.2.4)
97
97
  rainbow (3.1.1)
98
98
  rake (13.0.6)
99
- rdf (3.2.8)
99
+ rdf (3.2.9)
100
100
  link_header (~> 0.0, >= 0.0.8)
101
101
  regexp_parser (2.5.0)
102
102
  rest-client (2.1.0)
@@ -118,7 +118,7 @@ GEM
118
118
  diff-lcs (>= 1.2.0, < 2.0)
119
119
  rspec-support (~> 3.11.0)
120
120
  rspec-support (3.11.0)
121
- rubocop (1.32.0)
121
+ rubocop (1.33.0)
122
122
  json (~> 2.3)
123
123
  parallel (~> 1.10)
124
124
  parser (>= 3.1.0.0)
@@ -3,6 +3,6 @@
3
3
 
4
4
  module LinkHeaders
5
5
  class Processor
6
- VERSION = "0.1.14"
6
+ VERSION = "0.1.18"
7
7
  end
8
8
  end
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'processor/version'
4
- require_relative 'constants'
5
4
  require_relative 'link'
6
5
  require_relative 'web_utils'
7
6
  require 'link_header'
@@ -63,10 +62,10 @@ module LinkHeaders
63
62
  newlinks = parse_http_link_headers(head) # pass guid to check against anchors in linksets
64
63
  warn "HTTPlinks #{newlinks.inspect}"
65
64
 
66
- HTML_FORMATS['html'].each do |format|
65
+ ['text/html','text/xhtml+xml', 'application/xhtml+xml'].each do |format|
67
66
  if head[:content_type] and head[:content_type].match(format)
68
67
  warn "found #{format} content - parsing"
69
- htmllinks = parse_html_link_headers(body) # pass html body to find HTML link headers
68
+ htmllinks = parse_html_link_headers(body: body, anchor: default_anchor) # pass html body to find HTML link headers
70
69
  warn "htmllinks #{htmllinks.inspect}"
71
70
  end
72
71
  end
@@ -124,7 +123,7 @@ module LinkHeaders
124
123
  relation = sections['rel']
125
124
  sections.delete('rel')
126
125
  relations = relation.split(/\s+/) # handle the multiple relation case
127
- $stderr.puts "RELATIONS #{relations}"
126
+ warn "RELATIONS #{relations}"
128
127
 
129
128
  relations.each do |rel|
130
129
  next unless rel.match?(/\w/)
@@ -139,8 +138,8 @@ module LinkHeaders
139
138
  #
140
139
  # @param [String] body The HTML of the page containing HTML Link headers
141
140
  #
142
- def parse_html_link_headers(body)
143
- m = MetaInspector.new('http://example.org', document: body)
141
+ def parse_html_link_headers(body:, anchor: '')
142
+ m = MetaInspector.new(anchor, document: body)
144
143
  # an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
145
144
  newlinks = Array.new
146
145
  m.head_links.each do |l|
@@ -155,7 +154,7 @@ module LinkHeaders
155
154
  l.delete(:href)
156
155
 
157
156
  relations = relation.split(/\s+/) # handle the multiple relation case
158
- $stderr.puts "RELATIONS #{relations}"
157
+ warn "RELATIONS #{relations}"
159
158
 
160
159
  relations.each do |rel|
161
160
  next unless rel.match?(/\w/)
@@ -246,13 +245,13 @@ module LinkHeaders
246
245
  # warn "linkset body #{linkset.inspect}"
247
246
  return {} unless linkset
248
247
 
249
- links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
248
+ # links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
249
+ links = linkset.split(/,\n*/) # split on the comma+newline
250
250
  # warn "Links found #{links}"
251
251
 
252
252
  links.each do |ls|
253
- # warn "workking on link #{ls}"
254
- ls = ls.first # ls is a single element array
255
- elements = ls.split(';') # semicolon delimited fields
253
+ # warn "working on link #{ls}"
254
+ elements = ls.split(';').map {|element| element.strip!} # semicolon delimited fields
256
255
  # ["<https://w3id.org/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/>", "anchor=\"https://s11.no/2022/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/\"", "rel=\"cite-as\""]
257
256
  href = elements.shift # first element is always the link url
258
257
  # warn "working on link href #{href}"
@@ -260,8 +259,6 @@ module LinkHeaders
260
259
  attrhash = {}
261
260
  elements.each do |e|
262
261
  key, val = e.split('=')
263
- key.strip!
264
- val.strip!
265
262
  val.delete_prefix!('"').delete_suffix!('"') # get rid of newlines and start/end quotes
266
263
  attrhash[key.to_sym] = val # split on key=val and make key a symbol
267
264
  end
@@ -1,4 +1,4 @@
1
- def lhfetch(url, headers = ACCEPT_STAR_HEADER)
1
+ def lhfetch(url, headers = {accept: "*/*"})
2
2
  # warn "In fetch routine now. "
3
3
 
4
4
  # warn "executing call over the Web to #{url.to_s}"
@@ -1,7 +1,37 @@
1
1
  # frozen_string_literal: true
2
+ require_relative '../../lib/linkheaders/processor'
3
+ require 'rest-client'
4
+
5
+
6
+ url1 = "https://w3id.org/a2a-fair-metrics/22-http-html-citeas-describedby-mixed/"
7
+ p = LinkHeaders::Processor.new(default_anchor: url1)
8
+ r = RestClient.get(url1)
9
+ p.extract_and_parse(response: r)
10
+ factory = p.factory # LinkHeaders::LinkFactory
11
+
2
12
 
3
13
  RSpec.describe LinkHeaders::Processor do
14
+
4
15
  it 'has a version number' do
5
16
  expect(LinkHeaders::Processor::VERSION).not_to be nil
6
17
  end
18
+
19
+ it "should find PURL citeas which has described-by and cite-as in mixed HTTP and HTML headers" do
20
+ expect(factory.all_links.length).to eq 5
21
+ end
22
+ it "should find find href on all links" do
23
+ expect(factory.all_links.select{|l| l.href}.length).to eq 5
24
+ end
25
+ it "should find find href on all links" do
26
+ expect(factory.all_links.select{|l| l.anchor}.length).to eq 5
27
+ end
28
+ it "should find 5 links in mixed HTTP and HTML headers" do
29
+ expect(factory.all_links.select{|l| l.relation}.length).to eq 5
30
+ end
31
+ it "should find one citeas in mixed HTTP and HTML headers" do
32
+ expect(factory.all_links.select{|l| l.relation == 'cite-as'}.length).to eq 1
33
+ end
34
+ it "should find described-by in mixed HTTP and HTML headers" do
35
+ expect(factory.all_links.select{|l| l.relation == 'describedby'}.length).to eq 1
36
+ end
7
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkheaders-processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.14
4
+ version: 0.1.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-04 00:00:00.000000000 Z
11
+ date: 2022-08-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -137,7 +137,6 @@ files:
137
137
  - README.md
138
138
  - Rakefile
139
139
  - launch.json
140
- - lib/linkheaders/constants.rb
141
140
  - lib/linkheaders/link.rb
142
141
  - lib/linkheaders/processor.rb
143
142
  - lib/linkheaders/processor/version.rb
@@ -1,29 +0,0 @@
1
- ACCEPT_ALL_HEADER = {'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
2
- ACCEPT_STAR_HEADER = {'Accept' => '*/*'}
3
-
4
- TEXT_FORMATS = {
5
- 'text' => ['text/plain',],
6
- }
7
-
8
- RDF_FORMATS = {
9
- 'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
10
- 'turtle' => ['text/turtle','application/n3','application/rdf+n3',
11
- 'application/turtle', 'application/x-turtle','text/n3','text/turtle',
12
- 'text/rdf+n3', 'text/rdf+turtle'],
13
- #'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
14
- 'rdfxml' => ['application/rdf+xml'],
15
- 'triples' => ['application/n-triples','application/n-quads', 'application/trig']
16
- }
17
-
18
- XML_FORMATS = {
19
- 'xml' => ['text/xhtml','text/xml',]
20
- }
21
-
22
- HTML_FORMATS = {
23
- 'html' => ['text/html','text/xhtml+xml', 'application/xhtml+xml']
24
- }
25
-
26
- JSON_FORMATS = {
27
- 'json' => ['application/json',]
28
- }
29
-