linkheaders-processor 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: af390c80d1304df2d885e4bb19ad6be8a95e695b7b88ac2a59762e8f11d17dff
4
+ data.tar.gz: f3e90daa90734be50afb722f6023ecb6594c778776c987c0a4bf9b42e4d3aeaa
5
+ SHA512:
6
+ metadata.gz: 100903ef954dc3b40aaea1f97b285bb5dce59703a968905c9a9a7933416c1e4b847de32d0c03de5c646b3d0ae8d6d9a73ae003c315ee335aac10a956cfc38bfb
7
+ data.tar.gz: 71b0b8b7ad489ee6f3db7787fa6de0da3b3bbc40c2f16c53f16586be88233f3c88693c848624bd592d1a923ea9d90e535eaf6841547cade3e96664bdde6cdba4
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2022-04-27
4
+
5
+ - Initial release
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in linkheaders-processor.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+
10
+ gem "rspec", "~> 3.0"
11
+
12
+ gem "rubocop", "~> 1.21"
data/Gemfile.lock ADDED
@@ -0,0 +1,150 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ linkheaders-processor (0.1.8)
5
+ json (~> 2.0)
6
+ json-ld (~> 3.2)
7
+ json-ld-preloaded (~> 3.2)
8
+ metainspector (~> 5.11.2)
9
+ rest-client (~> 2.1)
10
+ securerandom (~> 0.1.0)
11
+
12
+ GEM
13
+ remote: https://rubygems.org/
14
+ specs:
15
+ addressable (2.8.0)
16
+ public_suffix (>= 2.0.2, < 5.0)
17
+ ast (2.4.2)
18
+ diff-lcs (1.5.0)
19
+ domain_name (0.5.20190701)
20
+ unf (>= 0.0.5, < 1.0.0)
21
+ faraday (1.10.0)
22
+ faraday-em_http (~> 1.0)
23
+ faraday-em_synchrony (~> 1.0)
24
+ faraday-excon (~> 1.1)
25
+ faraday-httpclient (~> 1.0)
26
+ faraday-multipart (~> 1.0)
27
+ faraday-net_http (~> 1.0)
28
+ faraday-net_http_persistent (~> 1.0)
29
+ faraday-patron (~> 1.0)
30
+ faraday-rack (~> 1.0)
31
+ faraday-retry (~> 1.0)
32
+ ruby2_keywords (>= 0.0.4)
33
+ faraday-cookie_jar (0.0.7)
34
+ faraday (>= 0.8.0)
35
+ http-cookie (~> 1.0.0)
36
+ faraday-em_http (1.0.0)
37
+ faraday-em_synchrony (1.0.0)
38
+ faraday-encoding (0.0.5)
39
+ faraday
40
+ faraday-excon (1.1.0)
41
+ faraday-http-cache (2.4.0)
42
+ faraday (>= 0.8)
43
+ faraday-httpclient (1.0.1)
44
+ faraday-multipart (1.0.4)
45
+ multipart-post (~> 2)
46
+ faraday-net_http (1.0.1)
47
+ faraday-net_http_persistent (1.2.0)
48
+ faraday-patron (1.0.0)
49
+ faraday-rack (1.0.0)
50
+ faraday-retry (1.0.3)
51
+ faraday_middleware (1.2.0)
52
+ faraday (~> 1.0)
53
+ fastimage (2.2.6)
54
+ htmlentities (4.3.4)
55
+ http-accept (1.7.0)
56
+ http-cookie (1.0.5)
57
+ domain_name (~> 0.5)
58
+ json (2.6.2)
59
+ json-canonicalization (0.3.0)
60
+ json-ld (3.2.1)
61
+ htmlentities (~> 4.3)
62
+ json-canonicalization (~> 0.3)
63
+ link_header (~> 0.0, >= 0.0.8)
64
+ multi_json (~> 1.15)
65
+ rack (~> 2.2)
66
+ rdf (~> 3.2)
67
+ json-ld-preloaded (3.2.0)
68
+ json-ld (~> 3.2)
69
+ rdf (~> 3.2)
70
+ link_header (0.0.8)
71
+ metainspector (5.11.2)
72
+ addressable (~> 2.7)
73
+ faraday (~> 1.4)
74
+ faraday-cookie_jar (~> 0.0)
75
+ faraday-encoding (~> 0.0)
76
+ faraday-http-cache (~> 2.2)
77
+ faraday_middleware (~> 1.0)
78
+ fastimage (~> 2.2)
79
+ nesty (~> 1.0)
80
+ nokogiri (~> 1.11)
81
+ mime-types (3.4.1)
82
+ mime-types-data (~> 3.2015)
83
+ mime-types-data (3.2022.0105)
84
+ multi_json (1.15.0)
85
+ multipart-post (2.2.3)
86
+ nesty (1.0.2)
87
+ netrc (0.11.0)
88
+ nokogiri (1.13.8-x86_64-linux)
89
+ racc (~> 1.4)
90
+ parallel (1.22.1)
91
+ parser (3.1.2.0)
92
+ ast (~> 2.4.1)
93
+ public_suffix (4.0.7)
94
+ racc (1.6.0)
95
+ rack (2.2.4)
96
+ rainbow (3.1.1)
97
+ rake (13.0.6)
98
+ rdf (3.2.8)
99
+ link_header (~> 0.0, >= 0.0.8)
100
+ regexp_parser (2.5.0)
101
+ rest-client (2.1.0)
102
+ http-accept (>= 1.7.0, < 2.0)
103
+ http-cookie (>= 1.0.2, < 2.0)
104
+ mime-types (>= 1.16, < 4.0)
105
+ netrc (~> 0.8)
106
+ rexml (3.2.5)
107
+ rspec (3.11.0)
108
+ rspec-core (~> 3.11.0)
109
+ rspec-expectations (~> 3.11.0)
110
+ rspec-mocks (~> 3.11.0)
111
+ rspec-core (3.11.0)
112
+ rspec-support (~> 3.11.0)
113
+ rspec-expectations (3.11.0)
114
+ diff-lcs (>= 1.2.0, < 2.0)
115
+ rspec-support (~> 3.11.0)
116
+ rspec-mocks (3.11.1)
117
+ diff-lcs (>= 1.2.0, < 2.0)
118
+ rspec-support (~> 3.11.0)
119
+ rspec-support (3.11.0)
120
+ rubocop (1.32.0)
121
+ json (~> 2.3)
122
+ parallel (~> 1.10)
123
+ parser (>= 3.1.0.0)
124
+ rainbow (>= 2.2.2, < 4.0)
125
+ regexp_parser (>= 1.8, < 3.0)
126
+ rexml (>= 3.2.5, < 4.0)
127
+ rubocop-ast (>= 1.19.1, < 2.0)
128
+ ruby-progressbar (~> 1.7)
129
+ unicode-display_width (>= 1.4.0, < 3.0)
130
+ rubocop-ast (1.19.1)
131
+ parser (>= 3.1.1.0)
132
+ ruby-progressbar (1.11.0)
133
+ ruby2_keywords (0.0.5)
134
+ securerandom (0.1.1)
135
+ unf (0.1.4)
136
+ unf_ext
137
+ unf_ext (0.0.8.2)
138
+ unicode-display_width (2.2.0)
139
+
140
+ PLATFORMS
141
+ x86_64-linux
142
+
143
+ DEPENDENCIES
144
+ linkheaders-processor!
145
+ rake (~> 13.0)
146
+ rspec (~> 3.0)
147
+ rubocop (~> 1.21)
148
+
149
+ BUNDLED WITH
150
+ 2.3.12
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Mark Wilkinson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,77 @@
1
+ # LinkHeaders::Parser
2
+
3
+ A gem to extract Link Headers from Web responses.
4
+
5
+ This module parses HTTP Link headers and HTML `<link>` headers. It also automatically follows any links to LinkSets (in both JSON and text formats) and processes the links they contain.
6
+
7
+ ## Installation
8
+
9
+ Install the gem and add to the application's Gemfile by executing:
10
+
11
+ $ bundle add linkheaders-processor
12
+
13
+ If bundler is not being used to manage dependencies, install the gem by executing:
14
+
15
+ $ gem install linkheaders-processor
16
+
17
+ ## Usage
18
+
19
+
20
+ ```
21
+
22
+ require 'linkheaders/processor'
23
+ require 'rest-client'
24
+
25
+ # url1 has http link headers, and a reference to a linkset in json format
26
+ url1 = "https://s11.no/2022/a2a-fair-metrics/07-http-describedby-citeas-linkset-json/"
27
+
28
+ # url2 has http link headers, with a reference to a linkset in legacy text format
29
+ url2 = "https://s11.no/2022/a2a-fair-metrics/28-http-linkset-txt-only/"
30
+
31
+ p = LinkHeaders::Parser.new(default_anchor: url1)
32
+ r = RestClient.get(url1)
33
+
34
+ p.extract_and_parse(response: r)
35
+ factory = p.factory # LinkHeader::LinkFactory
36
+
37
+ factory.all_links.each do |l|
38
+ puts l.href
39
+ puts l.relation
40
+ puts l.responsepart
41
+
42
+ puts l.linkmethods # returns list of instance methods beyond href and relation, that are attributes of the link
43
+ l.linkmethods.each do |method|
44
+ puts "#{method}=" + l.send(method)
45
+ end
46
+ puts
47
+ end
48
+
49
+
50
+
51
+ p = LinkHeaders::Parser.new(default_anchor: url2)
52
+ r = RestClient.get(url2)
53
+
54
+ p.extract_and_parse(response: r)
55
+ factory = p.factory
56
+
57
+ factory.all_links.each do |l|
58
+ puts l.href
59
+ puts l.relation
60
+ puts l.responsepart
61
+ puts
62
+ puts
63
+ end
64
+
65
+ ```
66
+
67
+
68
+ ## Development
69
+
70
+
71
+ ## Contributing
72
+
73
+ Bug reports and pull requests are welcome on GitHub at https://github.com/markwilkinson/linkheader-processor.
74
+
75
+ ## License
76
+
77
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "rubocop/rake_task"
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %i[spec rubocop]
data/launch.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "name": "RSpec - all",
3
+ "type": "Ruby",
4
+ "request": "launch",
5
+ "cwd": "${workspaceRoot}",
6
+ "program": "/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rspec",
7
+ "args": [
8
+ "-I",
9
+ "${workspaceRoot}"
10
+ ]
11
+ }
@@ -0,0 +1,28 @@
1
# Default Accept header used when dereferencing a URL: asks for RDF
# serialisations first. Each media type is listed exactly once.
ACCEPT_ALL_HEADER = {
  'Accept' => %w[
    text/turtle
    application/ld+json
    application/rdf+xml
    text/xhtml+xml
    application/n3
    application/rdf+n3
    application/turtle
    application/x-turtle
    text/n3
    text/rdf+n3
    text/rdf+turtle
    application/n-triples
  ].join(', ')
}.freeze

# Media types treated as plain text, keyed by a short format name.
TEXT_FORMATS = {
  'text' => %w[text/plain].freeze
}.freeze

# Media types treated as RDF, keyed by a short serialisation name.
RDF_FORMATS = {
  'jsonld' => %w[application/ld+json application/vnd.schemaorg.ld+json].freeze, # NEW FOR DATACITE
  'turtle' => %w[
    text/turtle
    application/n3
    application/rdf+n3
    application/turtle
    application/x-turtle
    text/n3
    text/rdf+n3
    text/rdf+turtle
  ].freeze,
  # 'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
  'rdfxml' => %w[application/rdf+xml].freeze,
  'triples' => %w[application/n-triples application/n-quads application/trig].freeze
}.freeze

# Media types treated as generic XML.
XML_FORMATS = {
  'xml' => %w[text/xhtml text/xml].freeze
}.freeze

# Media types treated as HTML (these bodies are scanned for <link> elements).
HTML_FORMATS = {
  'html' => %w[text/html text/xhtml+xml application/xhtml+xml].freeze
}.freeze

# Media types treated as JSON.
JSON_FORMATS = {
  'json' => %w[application/json].freeze
}.freeze
28
+
@@ -0,0 +1,187 @@
1
+ module LinkHeaders
2
#
# Creates Link objects and keeps track of every link it has created.
#
# Each factory owns its own list of links, so two factories (and therefore
# two parsers) never see each other's links.
#
class LinkFactory
  # @return [String] the HTTP anchor used by default for implicit Links
  attr_accessor :default_anchor
  # @return [Array<String>] warnings encountered while creating links
  #   (e.g. duplicate cite-as relations pointing at non-identical URLs)
  attr_accessor :warnings

  #
  # Create the LinkFactory object
  #
  # @param [String] default_anchor The URL to be used as the default anchor for a link when it isn't specified
  #
  def initialize(default_anchor: 'https://example.org/')
    @default_anchor = default_anchor
    @warnings = []
    @all_links = []
  end

  #
  # Create a new Link object and register it with this factory
  #
  # @param [Symbol] responsepart either :header, :body, or :linkset as the original location of this Link
  # @param [String] href the URL of the link
  # @param [String] relation the string of the relation type (e.g. "cite-as" or "described-by")
  # @param [String] anchor The URL of the anchor. Defaults to the default anchor of the factory
  # @param [Hash] **kwargs All other facets of the link. e.g. 'type' => 'text/html',...
  #
  # @return [Link] the Link just created, or the already-known Link when the
  #   new one duplicates it (same href and same relation)
  #
  def new_link(responsepart:, href:, relation:, anchor: @default_anchor, **kwargs)
    candidate = Link.new(responsepart: responsepart, factory: self, href: href,
                         anchor: anchor, relation: relation, **kwargs)
    link = sanitycheck(candidate) # records warnings; returns the known link for a duplicate
    @all_links |= [link]
    link
  end

  #
  # Retrieve all known Link objects
  #
  # @return [Array] Array of all Link objects created by this factory so far
  #
  def all_links
    @all_links
  end

  #
  # Extracts the links whose relation is exactly "linkset"
  #
  # @return [Array] Array of Link objects that represent URLs of LinkSets.
  #
  def linksets
    all_links.select { |link| link.relation == 'linkset' }
  end

  #
  # Extracts the Link objects that originated in the HTTP headers
  #
  # @return [Array] Array of Link objects
  #
  def headlinks
    all_links.select { |link| link.responsepart == :header }
  end

  #
  # Extracts the Link objects that originated in the HTML link headers
  #
  # @return [Array] Array of Link objects
  #
  def bodylinks
    all_links.select { |link| link.responsepart == :body }
  end

  #
  # Extracts the Link objects that originated from a LinkSet
  #
  # @return [Array] Array of Link objects
  #
  def linksetlinks
    all_links.select { |link| link.responsepart == :linkset }
  end

  #
  # Compare a candidate link with the links already known to the factory.
  #
  # Adds a warning when two cite-as links point at different URLs, or when the
  # same URL is used with different relation types. The comparison never
  # modifies the known links.
  #
  # @param [Link] link the candidate link
  #
  # @return [Link] the already-known link when the candidate duplicates it
  #   (same href and relation); otherwise the candidate itself
  #
  def sanitycheck(link)
    all_links.each do |known|
      if known.relation == 'cite-as' && link.relation == 'cite-as' && known.href != link.href
        @warnings << 'WARN: Found conflicting cite-as relations. This should never happen'
      end
      next unless known.href == link.href

      if known.relation == link.relation
        @warnings << 'WARN: found apparent duplicate. Ignoring and returning known link'
        return known
      end

      @warnings << 'WARN: Found identical hrefs with different relation types. This may be suspicious. Both have been retained'
    end
    link
  end
end
129
+
130
#
# LinkHeaders::Link represents an HTTP Link Header, an HTML LinkHeader, or a LinkSet Link.
#
# #anchor, #href, and #relation are always available. Other methods are created
# dynamically from the extra key/value pairs of the link: for example, if
# "'type': 'text/html'" exists in the link description, then #type and #type=
# are available on that Link object.
#
class Link
  # @return [String] URL of the Link anchor
  attr_accessor :anchor
  # @return [String] URL of the Link
  attr_accessor :href
  # @return [String] What is the relation? (e.g. "cite-as")
  attr_accessor :relation
  # @return [LinkFactory] The factory that made the Link
  attr_accessor :factory
  # @return [Symbol] :header, :body, or :linkset indicating the place the Link object originated
  attr_accessor :responsepart
  # @return [Array] names of the auto-generated accessor methods (one per extra
  #   key/value pair of the link, e.g. "type"), in the form they were supplied
  attr_accessor :linkmethods

  #
  # Create the Link object
  #
  # @param [Symbol] responsepart :header, :body, :linkset
  # @param [LinkFactory] factory the factory that made the link
  # @param [String] href The URL of the Link
  # @param [String] anchor The URL of the anchor
  # @param [String] relation the Link relation (e.g. "cite-as")
  # @param [Hash] **kwargs The remaining facets of the link (e.g. type => 'text/html')
  #
  def initialize(responsepart:, factory:, href:, anchor:, relation:, **kwargs)
    @href = href
    @anchor = anchor
    @relation = relation
    @factory = factory
    @responsepart = responsepart
    @linkmethods = []
    @attributes = {}

    kwargs.each { |name, value| define_link_attribute(name, value) }
  end

  private

  # Stores one extra link attribute and exposes it through a reader and a
  # writer defined on this object only.
  #
  # Values live in a Hash rather than in instance variables because attribute
  # names such as "data-foo" or "title*" are not legal instance variable names
  # (instance_variable_set raises NameError for them). Such accessors are still
  # reachable with #send / #public_send.
  def define_link_attribute(name, value)
    key = name.to_s
    @linkmethods << name
    @attributes[key] = value
    define_singleton_method(key) { @attributes[key] }
    define_singleton_method("#{key}=") { |val| @attributes[key] = val }
  end
end
187
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+
4
module LinkHeaders
  # Holder for the gem's release number.
  class Processor
    # Release number of the linkheaders-processor gem.
    VERSION = "0.1.8"
  end
end
@@ -0,0 +1,250 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'processor/version'
4
+ require_relative 'constants'
5
+ require_relative 'link'
6
+ require_relative 'web_utils'
7
+
8
+ require 'json'
9
+ require 'rest-client'
10
+ require 'securerandom'
11
+ require 'metainspector'
12
+
13
+ module LinkHeaders
14
+ class Error < StandardError; end
15
+
16
# A Link Header parser
#
# Works for both HTML and HTTP links, and handles references to Linksets of either JSON or Text types.
# Every link that is discovered is registered with the parser's LinkFactory (see #factory).
#
class Parser
  # Matches one ";name=value" link parameter. The value is either a quoted
  # string (which may contain ';' and ',') or a bare token.
  LINK_PARAM = /;\s*(\w+)\s*=\s*(?:"([^"]*)"|([^;,]*))/.freeze

  # @return [String] anchor used for links that do not declare one explicitly
  attr_accessor :default_anchor
  # @return [LinkFactory] the factory containing the links that have been created so far
  attr_accessor :factory

  #
  # Create the Link Headers Parser and its Link factory
  #
  # @param [String] default_anchor Link relations always have an anchor, but it is sometimes implicit. This value will be used in implicit cases.
  #
  def initialize(default_anchor: 'https://default.anchor.org/')
    @default_anchor = default_anchor
    @factory = LinkFactory.new(default_anchor: @default_anchor)
  end

  #
  # Parses a RestClient::Response
  #
  # The HTTP headers are parsed for Links and if those links contain a Linkset, that is retrieved and parsed.
  # If the Response is of some HTML form, the body is also parsed for Link headers and Linkset links.
  # All discovered links end up in the factory (self.factory); the return value carries no information
  # except in the failure case below.
  #
  # @param [RestClient::Response] response The full response object from an HTTP 2** successful call
  #
  # @return [Array] [[], []] when the response has no headers
  #
  def extract_and_parse(response: RestClient::Response.new)
    head = response.headers
    body = response.body

    unless head
      warn "WARNING: This doesn't seem to be a RestClient response message.\nReturning blank"
      return [[], []]
    end

    parse_http_link_headers(head)

    # Plain substring test: using the media type as a regular expression would
    # turn the '+' of "application/xhtml+xml" into a quantifier and never match.
    content_type = head[:content_type].to_s
    parse_html_link_headers(body) if HTML_FORMATS['html'].any? { |format| content_type.include?(format) }
  end

  #
  # Consume the Link header and parse it into individual links. Will automatically retrieve and process any LinkSet references found.
  #
  # @param [Hash] headers the headers of a RestClient::Response. headers[:link] is expected to look like
  #   '<https://example.one.com>; rel="preconnect", <https://example.two.com>; rel="preconnect"'
  #
  # @return [Array] [] when there is no Link header
  #
  def parse_http_link_headers(headers)
    links = headers[:link]
    return [] unless links

    # A new link always starts with '<', so only split on commas that precede
    # one; commas inside a URL or a parameter value stay intact.
    parts = Array(links).join(', ').split(/,\s*(?=<)/)

    split_http_link_headers(parts) # creates links and adds them to the factory
    check_for_linkset(responsepart: :header)
  end

  #
  # Turn each '<url>; name="value"; ...' fragment into a link in the factory.
  # Fragments without a target URL or without a rel parameter are skipped.
  # A rel value naming several relation types is kept as one relation string.
  #
  # @param [Array<String>] parts one string per link
  #
  def split_http_link_headers(parts)
    parts.each do |part|
      target = part.match(/<([^>]*)>(.*)/m)
      next unless target

      sections = parse_link_params(target[2])
      next unless sections['rel'] # the relation is required!

      anchor = sections.delete('anchor') || default_anchor
      relation = sections.delete('rel')

      factory.new_link(responsepart: :header, anchor: anchor, href: target[1], relation: relation, **sections)
    end
  end

  #
  # Parses the link headers out of an HTML body, and adds links to the factory. Will automatically retrieve and process any LinkSet references found
  #
  # @param [String] body The HTML of the page containing HTML Link headers
  #
  def parse_html_link_headers(body)
    page = MetaInspector.new('http://example.org', document: body)
    # head_links is an array of hashes such as
    # {:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}
    page.head_links.each do |attributes|
      next unless attributes[:href] && attributes[:rel] # required

      extras = attributes.dup
      anchor = extras.delete(:anchor) || default_anchor
      relation = extras.delete(:rel)
      href = extras.delete(:href)
      factory.new_link(responsepart: :body, anchor: anchor, href: href, relation: relation, **extras)
    end
    check_for_linkset(responsepart: :body)
  end

  #
  # Follow the linkset links that were found in the given part of the response.
  #
  # Only links originating from +responsepart+ are followed, so a linkset
  # announced in the HTTP headers is not fetched a second time when the HTML
  # body is processed.
  #
  # @param [Symbol] responsepart :header or :body
  #
  def check_for_linkset(responsepart:)
    factory.linksets.each do |linkset|
      next unless linkset.responsepart == responsepart
      next unless linkset.respond_to? 'type'

      case linkset.type
      when 'application/linkset+json'
        processJSONLinkset(href: linkset.href)
      when 'application/linkset'
        processTextLinkset(href: linkset.href)
      else
        warn "the linkset #{linkset} was not typed as 'application/linkset+json' or 'application/linkset', and it should be! (found #{linkset.type}) Ignoring..."
      end
    end
  end

  #
  # Retrieve a JSON linkset and create one link per target object.
  #
  # Expected document shape:
  #   { "linkset": [
  #       { "anchor": "http://example.net/bar",
  #         "item": [ {"href": "http://example.com/foo1", "type": "text/html"},
  #                   {"href": "http://example.com/foo2"} ],
  #         "next": [ {"href": "http://the.next/"} ] } ] }
  #
  # @param [String] href URL of the linkset
  #
  # @return [nil] when nothing could be retrieved or the body is not a linkset document
  #
  def processJSONLinkset(href:)
    _headers, linkset = fetch(href, { 'Accept' => 'application/linkset+json' })
    return nil unless linkset

    begin
      document = JSON.parse(linkset)
    rescue JSON::ParserError => e
      warn "WARN: the linkset at #{href} could not be parsed as JSON (#{e.message}). Ignoring..."
      return nil
    end

    contexts = document.is_a?(Hash) ? document['linkset'] : nil
    return nil unless contexts.is_a?(Array)

    contexts.each do |context|
      next unless context.is_a?(Hash)

      anchor = context['anchor'] || @default_anchor
      context.each do |reltype, targets| # reltype = e.g. "item", "described-by", "cite-as"
        next if reltype == 'anchor' # the only key whose value is not a list of targets
        next unless targets.is_a?(Array)

        targets.each do |target| # e.g. {"href": "http://example.com/foo1", "type": "text/html"}
          next unless target.is_a?(Hash) && target['href'] # href is required

          # attributes are collected per target so they never leak between links
          attrhash = target.reject { |name, _value| name == 'href' }.transform_keys(&:to_sym)
          factory.new_link(responsepart: :linkset, href: target['href'], relation: reltype, anchor: anchor, **attrhash)
        end
      end
    end
  end

  #
  # Retrieve a text linkset (same syntax as the HTTP Link header) and create
  # one link per entry. Entries without a rel parameter are skipped with a warning.
  #
  # @param [String] href URL of the linkset
  #
  # @return [Hash] {} when nothing could be retrieved
  #
  def processTextLinkset(href:)
    _headers, linkset = fetch(href, { 'Accept' => 'application/linkset' })
    return {} unless linkset

    # an opening angle bracket indicates the start of a new link
    linkset.scan(/<.*?>[^<]+/).each do |entry|
      target = entry.match(/<([^>]+)>(.*)/m)
      next unless target

      attrhash = parse_link_params(target[2]).transform_keys(&:to_sym)
      reltype = attrhash.delete(:rel)
      unless reltype
        warn 'No link relation type... this is bad! Skipping'
        next
      end
      anchor = attrhash.delete(:anchor) || @default_anchor

      factory.new_link(responsepart: :linkset, href: target[1], relation: reltype, anchor: anchor, **attrhash)
    end
  end

  private

  # Parses the ';name="value"' parameters that follow a link target.
  # Returns a Hash with String keys; parameters with an empty value are dropped
  # and, when a name is repeated, the last occurrence wins.
  def parse_link_params(text)
    text.to_s.scan(LINK_PARAM).each_with_object({}) do |(name, quoted, bare), params|
      value = (quoted || bare.delete('"')).strip
      params[name] = value unless value.empty?
    end
  end
end
250
+ end
@@ -0,0 +1,39 @@
1
#
# Dereference a URL with an HTTP GET.
#
# @param [#to_s] url the URL to retrieve
# @param [Hash] headers request headers; by default we try to retrieve RDF whenever possible
#
# @return [Array] [response headers, response body] when a response was received
#   (including non-500 HTTP error responses), or [false, false] when the server
#   answered 500, no response was received, or any other error occurred.
#   Callers must check for false before using either element.
#
def fetch(url, headers = ACCEPT_ALL_HEADER)
  response = RestClient::Request.execute({
                                           method: :get,
                                           url: url.to_s,
                                           headers: headers
                                         })

  if response.code == 203
    warn "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
  end
  [response.headers, response.body]
rescue RestClient::ExceptionWithResponse => e
  # The exception may carry no response at all, so guard against nil
  # before touching its headers or body.
  failed = e.response
  warn "EXCEPTION WITH RESPONSE! #{failed}\n#{failed&.headers}"
  warn "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
  return [false, false] if failed.nil? || failed.code == 500

  # for every other status we hand back the headers and body that were returned
  [failed.headers, failed.body]
rescue RestClient::Exception => e
  warn "EXCEPTION WITH NO RESPONSE! #{e}"
  warn "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
  [false, false]
rescue StandardError => e
  # StandardError rather than Exception, so Interrupt and SystemExit are not swallowed
  warn "EXCEPTION UNKNOWN! #{e}"
  warn "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
  [false, false]
end
@@ -0,0 +1,6 @@
1
# Signature for the version constant defined in lib/linkheaders/processor/version.rb
module LinkHeaders
  class Processor
    VERSION: String
    # See the writing guide of rbs: https://github.com/ruby/rbs#guides
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
# The version constant lives on LinkHeaders::Processor (see processor/version.rb).
RSpec.describe LinkHeaders::Processor do
  it 'has a version number' do
    expect(LinkHeaders::Processor::VERSION).not_to be nil
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
# Load the library under test (shipped as lib/linkheaders/processor.rb).
require "linkheaders/processor"

RSpec.configure do |config|
  # Enable flags like --only-failures and --next-failure
  config.example_status_persistence_file_path = ".rspec_status"

  # Disable RSpec exposing methods globally on `Module` and `main`
  config.disable_monkey_patching!

  config.expect_with :rspec do |c|
    c.syntax = :expect
  end
end
data/testme.rb ADDED
@@ -0,0 +1,37 @@
1
# Manual smoke test: fetches two live example pages and prints every link found.
require 'linkheaders/processor'
# require_relative 'lib/linkheaders/processor'
require 'rest-client'

urls = [
  # HTTP link headers plus a reference to a linkset in JSON format
  "https://s11.no/2022/a2a-fair-metrics/07-http-describedby-citeas-linkset-json/",
  # HTTP link headers with a reference to a linkset in legacy text format
  "https://s11.no/2022/a2a-fair-metrics/28-http-linkset-txt-only/"
]

urls.each do |url|
  # named `parser` rather than `p` so Kernel#p is not shadowed
  parser = LinkHeaders::Parser.new(default_anchor: url)
  response = RestClient.get(url)

  parser.extract_and_parse(response: response)

  parser.factory.all_links.each do |l|
    puts l.href
    puts l.relation
    puts l.responsepart
    puts
    puts
  end
end
37
+
metadata ADDED
@@ -0,0 +1,164 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: linkheaders-processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.8
5
+ platform: ruby
6
+ authors:
7
+ - Mark Wilkinson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-07-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rest-client
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: json
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: json-ld
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.2'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: json-ld-preloaded
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.2'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.2'
83
+ - !ruby/object:Gem::Dependency
84
+ name: securerandom
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.1.0
97
+ - !ruby/object:Gem::Dependency
98
+ name: metainspector
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 5.11.2
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 5.11.2
111
+ description: A parser/processor for Link Headers and Linksets in both JSON and Text
112
+ formats.
113
+ email:
114
+ - markw@illuminae.com
115
+ executables: []
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - CHANGELOG.md
120
+ - Gemfile
121
+ - Gemfile.lock
122
+ - LICENSE
123
+ - README.md
124
+ - Rakefile
125
+ - launch.json
126
+ - lib/linkheaders/constants.rb
127
+ - lib/linkheaders/link.rb
128
+ - lib/linkheaders/processor.rb
129
+ - lib/linkheaders/processor/version.rb
130
+ - lib/linkheaders/web_utils.rb
131
+ - sig/linkheader/parser.rbs
132
+ - spec/linkheader/parser_spec.rb
133
+ - spec/spec_helper.rb
134
+ - testme.rb
135
+ homepage: https://github.com/markwilkinson/linkheader-processor
136
+ licenses:
137
+ - MIT
138
+ metadata:
139
+ allowed_push_host: https://rubygems.org
140
+ homepage_uri: https://github.com/markwilkinson/linkheader-processor
141
+ source_code_uri: https://github.com/markwilkinson/linkheader-processor
142
+ changelog_uri: https://github.com/markwilkinson/linkheader-processor/blob/master/CHANGELOG.md
143
+ post_install_message:
144
+ rdoc_options: []
145
+ require_paths:
146
+ - lib
147
+ required_ruby_version: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: 3.0.0
152
+ required_rubygems_version: !ruby/object:Gem::Requirement
153
+ requirements:
154
+ - - ">="
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ requirements: []
158
+ rubygems_version: 3.2.28
159
+ signing_key:
160
+ specification_version: 4
161
+ summary: A parser/processor for Link Headers and Linksets in both JSON and Text formats.
162
+ test_files:
163
+ - spec/linkheader/parser_spec.rb
164
+ - spec/spec_helper.rb