linkheaders-processor 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: af390c80d1304df2d885e4bb19ad6be8a95e695b7b88ac2a59762e8f11d17dff
4
+ data.tar.gz: f3e90daa90734be50afb722f6023ecb6594c778776c987c0a4bf9b42e4d3aeaa
5
+ SHA512:
6
+ metadata.gz: 100903ef954dc3b40aaea1f97b285bb5dce59703a968905c9a9a7933416c1e4b847de32d0c03de5c646b3d0ae8d6d9a73ae003c315ee335aac10a956cfc38bfb
7
+ data.tar.gz: 71b0b8b7ad489ee6f3db7787fa6de0da3b3bbc40c2f16c53f16586be88233f3c88693c848624bd592d1a923ea9d90e535eaf6841547cade3e96664bdde6cdba4
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2022-04-27
4
+
5
+ - Initial release
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in linkset-parser.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+
10
+ gem "rspec", "~> 3.0"
11
+
12
+ gem "rubocop", "~> 1.21"
data/Gemfile.lock ADDED
@@ -0,0 +1,150 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ linkheaders-processor (0.1.8)
5
+ json (~> 2.0)
6
+ json-ld (~> 3.2)
7
+ json-ld-preloaded (~> 3.2)
8
+ metainspector (~> 5.11.2)
9
+ rest-client (~> 2.1)
10
+ securerandom (~> 0.1.0)
11
+
12
+ GEM
13
+ remote: https://rubygems.org/
14
+ specs:
15
+ addressable (2.8.0)
16
+ public_suffix (>= 2.0.2, < 5.0)
17
+ ast (2.4.2)
18
+ diff-lcs (1.5.0)
19
+ domain_name (0.5.20190701)
20
+ unf (>= 0.0.5, < 1.0.0)
21
+ faraday (1.10.0)
22
+ faraday-em_http (~> 1.0)
23
+ faraday-em_synchrony (~> 1.0)
24
+ faraday-excon (~> 1.1)
25
+ faraday-httpclient (~> 1.0)
26
+ faraday-multipart (~> 1.0)
27
+ faraday-net_http (~> 1.0)
28
+ faraday-net_http_persistent (~> 1.0)
29
+ faraday-patron (~> 1.0)
30
+ faraday-rack (~> 1.0)
31
+ faraday-retry (~> 1.0)
32
+ ruby2_keywords (>= 0.0.4)
33
+ faraday-cookie_jar (0.0.7)
34
+ faraday (>= 0.8.0)
35
+ http-cookie (~> 1.0.0)
36
+ faraday-em_http (1.0.0)
37
+ faraday-em_synchrony (1.0.0)
38
+ faraday-encoding (0.0.5)
39
+ faraday
40
+ faraday-excon (1.1.0)
41
+ faraday-http-cache (2.4.0)
42
+ faraday (>= 0.8)
43
+ faraday-httpclient (1.0.1)
44
+ faraday-multipart (1.0.4)
45
+ multipart-post (~> 2)
46
+ faraday-net_http (1.0.1)
47
+ faraday-net_http_persistent (1.2.0)
48
+ faraday-patron (1.0.0)
49
+ faraday-rack (1.0.0)
50
+ faraday-retry (1.0.3)
51
+ faraday_middleware (1.2.0)
52
+ faraday (~> 1.0)
53
+ fastimage (2.2.6)
54
+ htmlentities (4.3.4)
55
+ http-accept (1.7.0)
56
+ http-cookie (1.0.5)
57
+ domain_name (~> 0.5)
58
+ json (2.6.2)
59
+ json-canonicalization (0.3.0)
60
+ json-ld (3.2.1)
61
+ htmlentities (~> 4.3)
62
+ json-canonicalization (~> 0.3)
63
+ link_header (~> 0.0, >= 0.0.8)
64
+ multi_json (~> 1.15)
65
+ rack (~> 2.2)
66
+ rdf (~> 3.2)
67
+ json-ld-preloaded (3.2.0)
68
+ json-ld (~> 3.2)
69
+ rdf (~> 3.2)
70
+ link_header (0.0.8)
71
+ metainspector (5.11.2)
72
+ addressable (~> 2.7)
73
+ faraday (~> 1.4)
74
+ faraday-cookie_jar (~> 0.0)
75
+ faraday-encoding (~> 0.0)
76
+ faraday-http-cache (~> 2.2)
77
+ faraday_middleware (~> 1.0)
78
+ fastimage (~> 2.2)
79
+ nesty (~> 1.0)
80
+ nokogiri (~> 1.11)
81
+ mime-types (3.4.1)
82
+ mime-types-data (~> 3.2015)
83
+ mime-types-data (3.2022.0105)
84
+ multi_json (1.15.0)
85
+ multipart-post (2.2.3)
86
+ nesty (1.0.2)
87
+ netrc (0.11.0)
88
+ nokogiri (1.13.8-x86_64-linux)
89
+ racc (~> 1.4)
90
+ parallel (1.22.1)
91
+ parser (3.1.2.0)
92
+ ast (~> 2.4.1)
93
+ public_suffix (4.0.7)
94
+ racc (1.6.0)
95
+ rack (2.2.4)
96
+ rainbow (3.1.1)
97
+ rake (13.0.6)
98
+ rdf (3.2.8)
99
+ link_header (~> 0.0, >= 0.0.8)
100
+ regexp_parser (2.5.0)
101
+ rest-client (2.1.0)
102
+ http-accept (>= 1.7.0, < 2.0)
103
+ http-cookie (>= 1.0.2, < 2.0)
104
+ mime-types (>= 1.16, < 4.0)
105
+ netrc (~> 0.8)
106
+ rexml (3.2.5)
107
+ rspec (3.11.0)
108
+ rspec-core (~> 3.11.0)
109
+ rspec-expectations (~> 3.11.0)
110
+ rspec-mocks (~> 3.11.0)
111
+ rspec-core (3.11.0)
112
+ rspec-support (~> 3.11.0)
113
+ rspec-expectations (3.11.0)
114
+ diff-lcs (>= 1.2.0, < 2.0)
115
+ rspec-support (~> 3.11.0)
116
+ rspec-mocks (3.11.1)
117
+ diff-lcs (>= 1.2.0, < 2.0)
118
+ rspec-support (~> 3.11.0)
119
+ rspec-support (3.11.0)
120
+ rubocop (1.32.0)
121
+ json (~> 2.3)
122
+ parallel (~> 1.10)
123
+ parser (>= 3.1.0.0)
124
+ rainbow (>= 2.2.2, < 4.0)
125
+ regexp_parser (>= 1.8, < 3.0)
126
+ rexml (>= 3.2.5, < 4.0)
127
+ rubocop-ast (>= 1.19.1, < 2.0)
128
+ ruby-progressbar (~> 1.7)
129
+ unicode-display_width (>= 1.4.0, < 3.0)
130
+ rubocop-ast (1.19.1)
131
+ parser (>= 3.1.1.0)
132
+ ruby-progressbar (1.11.0)
133
+ ruby2_keywords (0.0.5)
134
+ securerandom (0.1.1)
135
+ unf (0.1.4)
136
+ unf_ext
137
+ unf_ext (0.0.8.2)
138
+ unicode-display_width (2.2.0)
139
+
140
+ PLATFORMS
141
+ x86_64-linux
142
+
143
+ DEPENDENCIES
144
+ linkheaders-processor!
145
+ rake (~> 13.0)
146
+ rspec (~> 3.0)
147
+ rubocop (~> 1.21)
148
+
149
+ BUNDLED WITH
150
+ 2.3.12
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Mark Wilkinson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,77 @@
1
+ # LinkHeaders::Parser
2
+
3
+ A gem to extract Link Headers from Web responses.
4
+
5
+ This module handles HTTP Link Headers, HTML Link Headers, and auto-follows links to LinkSets in both JSON and Text format, and processes them also.
6
+
7
+ ## Installation
8
+
9
+ Install the gem and add to the application's Gemfile by executing:
10
+
11
+     $ bundle add linkheaders-processor
12
+
13
+ If bundler is not being used to manage dependencies, install the gem by executing:
14
+
15
+     $ gem install linkheaders-processor
16
+
17
+ ## Usage
18
+
19
+
20
+ ```
21
+
22
+ require 'linkheaders/processor'
23
+ require 'rest-client'
24
+
25
+ # url1 has http link headers, and a reference to a linkset in json format
26
+ url1 = "https://s11.no/2022/a2a-fair-metrics/07-http-describedby-citeas-linkset-json/"
27
+
28
+ # url2 has http link headers, with a reference to a linkset in legacy text format
29
+ url2 = "https://s11.no/2022/a2a-fair-metrics/28-http-linkset-txt-only/"
30
+
31
+ p = LinkHeaders::Parser.new(default_anchor: url1)
32
+ r = RestClient.get(url1)
33
+
34
+ p.extract_and_parse(response: r)
35
+ factory = p.factory # LinkHeaders::LinkFactory
36
+
37
+ factory.all_links.each do |l|
38
+ puts l.href
39
+ puts l.relation
40
+ puts l.responsepart
41
+
42
+ puts l.linkmethods # returns list of instance methods beyond href and relation, that are attributes of the link
43
+ l.linkmethods.each do |method|
44
+ puts "#{method}=" + l.send(method)
45
+ end
46
+ puts
47
+ end
48
+
49
+
50
+
51
+ p = LinkHeaders::Parser.new(default_anchor: url2)
52
+ r = RestClient.get(url2)
53
+
54
+ p.extract_and_parse(response: r)
55
+ factory = p.factory
56
+
57
+ factory.all_links.each do |l|
58
+ puts l.href
59
+ puts l.relation
60
+ puts l.responsepart
61
+ puts
62
+ puts
63
+ end
64
+
65
+ ```
66
+
67
+
68
+ ## Development
69
+
70
+
71
+ ## Contributing
72
+
73
+ Bug reports and pull requests are welcome on GitHub at https://github.com/markwilkinson/linkheader-parser.
74
+
75
+ ## License
76
+
77
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "rubocop/rake_task"
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %i[spec rubocop]
data/launch.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "name": "RSpec - all",
3
+ "type": "Ruby",
4
+ "request": "launch",
5
+ "cwd": "${workspaceRoot}",
6
+ "program": "/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rspec",
7
+ "args": [
8
+ "-I",
9
+ "${workspaceRoot}"
10
+ ]
11
+ }
@@ -0,0 +1,28 @@
1
+ ACCEPT_ALL_HEADER = {'Accept' => 'text/turtle, application/ld+json, application/rdf+xml, text/xhtml+xml, application/n3, application/rdf+n3, application/turtle, application/x-turtle, text/n3, text/turtle, text/rdf+n3, text/rdf+turtle, application/n-triples' }
2
+
3
+ TEXT_FORMATS = {
4
+ 'text' => ['text/plain',],
5
+ }
6
+
7
+ RDF_FORMATS = {
8
+ 'jsonld' => ['application/ld+json', 'application/vnd.schemaorg.ld+json'], # NEW FOR DATACITE
9
+ 'turtle' => ['text/turtle','application/n3','application/rdf+n3',
10
+ 'application/turtle', 'application/x-turtle','text/n3','text/turtle',
11
+ 'text/rdf+n3', 'text/rdf+turtle'],
12
+ #'rdfa' => ['text/xhtml+xml', 'application/xhtml+xml'],
13
+ 'rdfxml' => ['application/rdf+xml'],
14
+ 'triples' => ['application/n-triples','application/n-quads', 'application/trig']
15
+ }
16
+
17
+ XML_FORMATS = {
18
+ 'xml' => ['text/xhtml','text/xml',]
19
+ }
20
+
21
+ HTML_FORMATS = {
22
+ 'html' => ['text/html','text/xhtml+xml', 'application/xhtml+xml']
23
+ }
24
+
25
+ JSON_FORMATS = {
26
+ 'json' => ['application/json',]
27
+ }
28
+
@@ -0,0 +1,187 @@
1
+ module LinkHeaders
2
+ class LinkFactory
3
+
4
+ # @return [<String>] the HTTP anchor used by default for implicit Links
5
+ attr_accessor :default_anchor
6
+ # @return [Array] An array of strings containing any warnings that were encountered when creating the link (e.g. duplicate cite-as but non-identical URLs)
7
+ attr_accessor :warnings
8
+ @@all_links = Array.new
9
+
10
+ #
11
+     # Create the LinkFactory Object
12
+ #
13
+ # @param [String] default_anchor The URL to be used as the default anchor for a link when it isn't specified
14
+ #
15
+ def initialize(default_anchor: 'https://example.org/')
16
+ @default_anchor = default_anchor
17
+ @warnings = Array.new
18
+ end
19
+
20
+ #
21
+ # Create a new LinkHeader::Link object
22
+ #
23
+ # @param [Symbol] responsepart either :header, :body, or :linkset as the original location of this Link
24
+ # @param [String] href the URL of the link
25
+ # @param [String] relation the string of the relation type (e.g. "cite-as" or "described-by")
26
+ # @param [String] anchor The URL of the anchor. Defaults to the default anchor of the LinkHeader factory
27
+ # @param [Hash] **kwargs All other facets of the link. e.g. 'type' => 'text/html',...
28
+ #
29
+ # @return [LinkHeader::Link] The Link object just created
30
+ #
31
+ def new_link(responsepart:, href:, relation:, anchor: @default_anchor, **kwargs)
32
+ # warn "creating new link with kw #{kwargs}"
33
+ link = LinkHeader::Link.new(responsepart: responsepart, factory: self, href: href, anchor: anchor, relation: relation, **kwargs)
34
+ link = sanitycheck(link) # this will add warnings if the link already exists and has a conflict. returns the original of a duplicate
35
+ @@all_links |= [link]
36
+ return link
37
+ end
38
+
39
+ #
40
+ # retrieve all known LinkHeader::Link objects
41
+ #
42
+ # @return [Array] Array of all LinkHeader::Link objects created by the factory so far
43
+ #
44
+ def all_links
45
+ @@all_links
46
+ end
47
+
48
+ #
49
+ # Extracts Linkset type links from a list of LinkHeader::Link objects
50
+ #
51
+ # @return [Array] Array of LinkHeader::Link objects that represent URLs of LinkSets.
52
+ #
53
+ def linksets
54
+ links = Array.new
55
+ self.all_links.each do |link|
56
+ # warn "found #{link.relation}"
57
+ next unless link.relation == 'linkset'
58
+ links << link
59
+ end
60
+ links
61
+ end
62
+
63
+ #
64
+     # Extracts the LinkHeader::Link objects that originated in the HTTP Headers
65
+ #
66
+ # @return [Array] Array of LinkHeader::Link objects
67
+ #
68
+ def headlinks
69
+ links = Array.new
70
+ self.all_links.each do |link|
71
+ # warn "found #{link.relation}"
72
+ next unless link.responsepart == :header
73
+ links << link
74
+ end
75
+ links
76
+ end
77
+
78
+ #
79
+     # Extracts the LinkHeader::Link objects that originated in the HTML Link Headers
80
+ #
81
+ # @return [Array] Array of LinkHeader::Link objects
82
+ #
83
+ def bodylinks
84
+ links = Array.new
85
+ self.all_links.each do |link|
86
+ # warn "found #{link.relation}"
87
+ next unless link.responsepart == :body
88
+ links << link
89
+ end
90
+ links
91
+ end
92
+
93
+ #
94
+     # Extracts the LinkHeader::Link objects that originated from a LinkSet
95
+ #
96
+ # @return [Array] Array of LinkHeader::Link objects
97
+ #
98
+ def linksetlinks
99
+ links = Array.new
100
+ self.all_links.each do |link|
101
+ # warn "found #{link.relation}"
102
+ next unless link.responsepart == :linkset
103
+ links << link
104
+ end
105
+ links
106
+ end
107
+
108
+ def sanitycheck(link)
109
+ flag = true
110
+ self.all_links.each do |l|
111
+ if l.relation == "cite-as" and link.relation == "cite-as"
112
+ if l.href != link.href
113
+ @warnings << 'WARN: Found conflicting cite-as relations. This should never happen'
114
+ end
115
+ end
116
+ if l.href == link.href
117
+ if l.relation != link.relation
118
+ @warnings << 'WARN: Found identical hrefs with different relation types. This may be suspicious. Both have been retained'
119
+ end
120
+ if l.relation = link.relation
121
+ @warnings << 'WARN: found apparent duplicate. Ignoring and returning known link'
122
+ link = l
123
+ end
124
+ end
125
+ end
126
+ link
127
+ end
128
+ end
129
+
130
+ #
131
+   # LinkHeader::Link represents an HTTP Link Header, an HTML LinkHeader, or a LinkSet Link.
132
+ #
133
+ # #anchor, #href, and #relation are all guaranteed to return a value. Other methods are dynamically created based on what key/value pairs exist in the link
134
+ # for example, if "'type': 'text/html'" exists in the link description, then the method #type will be available on the Link object.
135
+ #
136
+ class Link
137
+ # @return [String] URL of the Link anchor
138
+ attr_accessor :anchor
139
+ # @return [String] URL of the Link
140
+ attr_accessor :href
141
+ # @return [String] What is the relation? (e.g. "cite-as")
142
+ attr_accessor :relation
143
+ # @return [LinkHeader::LinkFactory] The factory that made the Link
144
+ attr_accessor :factory
145
+ # @return [Symbol] :header, :body, or :linkset indicating the place the Link object originated
146
+ attr_accessor :responsepart
147
+ # @return [String] the list of instance method names auto-generated by the various key/value pairs in the link header. e.g. "type"
148
+ attr_accessor :linkmethods
149
+
150
+
151
+ #
152
+ # Create the Link object
153
+ #
154
+ # @param [Symbol] responsepart :header, :body, :linkset
155
+ # @param [LinkHeader::LinkFactory] factory the factory that made the link
156
+ # @param [String] href The URL of the Link
157
+ # @param [String] anchor The URL of the anchor
158
+ # @param [String] relation the Link relation (e.g. "cite-as")
159
+ # @param [hash] **kwargs The remaining facets of the link (e.g. type => 'text/html')
160
+ #
161
+ def initialize(responsepart:, factory:, href:, anchor:, relation:, **kwargs)
162
+ # warn "incoming kw args #{kwargs}"
163
+ @href = href
164
+ @anchor = anchor
165
+ @relation = relation
166
+ @factory = factory
167
+ @responsepart = responsepart
168
+ @linkmethods = Array.new
169
+
170
+ kwargs.each do |k, v|
171
+ # warn "key #{k} val #{v}"
172
+
173
+ @linkmethods << k
174
+ define_singleton_method(k.to_sym) {
175
+ value = instance_variable_get("@#{k}")
176
+ return value
177
+ }
178
+ define_singleton_method "#{k}=".to_sym do |val|
179
+ instance_variable_set("@#{k}", val)
180
+ return "@#{k}".to_sym
181
+ end
182
+ # warn "methods: #{self.methods - Object.new.methods}"
183
+ self.send("#{k}=", v)
184
+ end
185
+ end
186
+ end
187
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+
4
+ module LinkHeaders
5
+ class Processor
6
+ VERSION = "0.1.8"
7
+ end
8
+ end
@@ -0,0 +1,250 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'processor/version'
4
+ require_relative 'constants'
5
+ require_relative 'link'
6
+ require_relative 'web_utils'
7
+
8
+ require 'json'
9
+ require 'rest-client'
10
+ require 'securerandom'
11
+ require 'metainspector'
12
+
13
+ module LinkHeaders
14
+ class Error < StandardError; end
15
+
16
+ # A Link Header parser
17
+ #
18
+ # Works for both HTML and HTTP links, and handles references to Linksets of either JSON or Text types
19
+ #
20
+ class Parser
21
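+     # @return [String, LinkHeaders::LinkFactory] default_anchor is the anchor URL used when a link has none; factory is the link factory created by this parser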
22
+ attr_accessor :default_anchor, :factory
23
+
24
+ #
25
+ # Create the Link Headers Parser and its Link factory
26
+ #
27
+ # @param [<String>] default_anchor Link relations always have an anchor, but it is sometimes implicit. This value will be used in implicit cases.
28
+ #
29
+ def initialize(default_anchor: 'https://default.anchor.org/')
30
+ @default_anchor = default_anchor
31
+ @factory = LinkHeader::LinkFactory.new(default_anchor: @default_anchor)
32
+ end
33
+
34
+ #
35
+ # Get the parser factory that contains all the links
36
+ #
37
+ # @return [<LinkHeader::LinkFactory>] The factory containing the links (LinkHeader::Link) that have been created so far
38
+ #
39
+ def factory
40
+ @factory
41
+ end
42
+
43
+ #
44
+ # Parses a RestClient::Response
45
+ #
46
+ # The HTTP headers are parsed for Links and if those links contain a Linkset, that is retrieved and parsed
47
+ # If the Response is of some HTML form, this is also parsed for Link headers and Linkset links
48
+ # All discovered links end up in a LinkHeader::LinkFactory object (self.factory)
49
+ #
50
+     # @param [<RestClient::Response>] response The full response object from an HTTP 2** successful call
51
+ #
52
+ #
53
+ def extract_and_parse(response: RestClient::Response.new)
54
+ head = response.headers
55
+ body = response.body
56
+ # warn "\n\n head #{head.inspect}\n\n"
57
+
58
+ unless head
59
+ warn "WARNING: This doesn't seem to be a RestClient response message.\nReturning blank"
60
+ return [[], []]
61
+ end
62
+
63
+ parse_http_link_headers(head) # pass guid to check against anchors in linksets
64
+ HTML_FORMATS['html'].each do |format|
65
+ if head[:content_type] and head[:content_type].match(format)
66
+ htmllinks = parse_html_link_headers(body) # pass html body to find HTML link headers
67
+ end
68
+ end
69
+ end
70
+
71
+ #
72
+ # Consume a String of the Link Headers and parse it into individual links. Will automatically retrieve and process any LinkSet references found. All LinkHeader::Link objects end up in the LinkHeader::LinkFactory object (self.factory)
73
+ #
74
+     # @param [RestClient::Response::Header] headers the Headers of a RestClient::Response.  Calls headers[:link] to retrieve '<https://example.one.com>; rel="preconnect", <https://example.two.com>; rel="preconnect",  <https://example.three.com>; rel="preconnect"'
75
+ #
76
+ #
77
+ def parse_http_link_headers(headers)
78
+
79
+ # Link: <https://example.one.com>; rel="preconnect", <https://example.two.com>; rel="preconnect", <https://example.three.com>; rel="preconnect"
80
+ links = headers[:link]
81
+ return [] unless links
82
+
83
+ # warn links.inspect
84
+ parts = links.split(',') # ["<https://example.one.com>; rel='preconnect'", "<https://example.two.com>; rel="preconnect"".....]
85
+ # warn parts
86
+
87
+ # Parse each part into a named link
88
+ split_http_link_headers(parts) # creates links from the split headers and adds to factory.all_links
89
+ check_for_linkset(responsepart: :header) # all links are held in the Linkset::LinkFactory object (factory variable here). This scans the links for a linkset link to follow
90
+ end
91
+
92
+ def split_http_link_headers(parts)
93
+ parts.each do |part, _index|
94
+ # warn "link is: #{part}"
95
+
96
+ section = part.split(';') # ["<https://example.one.com>", "rel='preconnect'"]
97
+ # warn section
98
+ next unless section[0]
99
+
100
+ href = section[0][/<(.*)>/, 1]
101
+ next unless section[1]
102
+
103
+ sections = {}
104
+ section[1..].each do |s| # can be more than one link property "rel='preconnect'"
105
+ s.strip!
106
+ unless m = s.match(%r{(\w+?)="?([\w:\d.,\#\-+/\s]+)"?})
107
+ next
108
+ end # can be rel="cite-as describedby" --> two relations in one! or "linkset+json"
109
+
110
+ relation = m[1] # rel"
111
+ value = m[2] # "preconnect"
112
+ sections[relation] = value # value could hold multiple relation types sections[:rel] = "preconnect"
113
+ end
114
+ next unless sections['rel'] # the relation is required!
115
+
116
+ anchor = sections['anchor'] || default_anchor
117
+ sections.delete('anchor')
118
+ relation = sections['rel']
119
+ sections.delete('rel')
120
+
121
+ factory.new_link(responsepart: :header, anchor: anchor, href: href, relation: relation, **sections) # parsed['https://example.one.com'][:rel] = "preconnect"
122
+ end
123
+ end
124
+
125
+ #
126
+ # Parses the link headers out of an HTML body, and adds links to the LinkHeader::LinkFactory object. Will automatically retrieve and process any LinkSet references found
127
+ #
128
+ # @param [String] body The HTML of the page containing HTML Link headers
129
+ #
130
+ def parse_html_link_headers(body)
131
+ m = MetaInspector.new('http://example.org', document: body)
132
+ # an array of elements that look like this: [{:rel=>"alternate", :type=>"application/ld+json", :href=>"http://scidata.vitk.lv/dataset/303.jsonld"}]
133
+
134
+ m.head_links.each do |l|
135
+ # warn "link is: #{l}"
136
+ next unless l[:href] and l[:rel] # required
137
+
138
+ anchor = l[:anchor] || default_anchor
139
+ l.delete(:anchor)
140
+ relation = l[:rel]
141
+ l.delete(:rel)
142
+ href = l[:href]
143
+ l.delete(:href)
144
+ factory.new_link(responsepart: :body, anchor: anchor, href: href, relation: relation, **l)
145
+ end
146
+ check_for_linkset(responsepart: :body)
147
+ end
148
+
149
+ def check_for_linkset(responsepart:)
150
+ # warn "looking for a linkset"
151
+ factory.linksets.each do |linkset|
152
+ # warn "found #{linkset.methods- Object.new.methods}"
153
+ # warn "inspect #{linkset.inspect}"
154
+ next unless linkset.respond_to? 'type'
155
+ # warn "responds #{linkset.type} "
156
+ case linkset.type
157
+ when 'application/linkset+json'
158
+ # warn "found a json linkset"
159
+ processJSONLinkset(href: linkset.href)
160
+ when 'application/linkset'
161
+ # warn "found a text linkset"
162
+ processTextLinkset(href:linkset.href)
163
+ else
164
+ warn "the linkset #{linkset} was not typed as 'application/linkset+json' or 'application/linkset', and it should be! (found #{linkset.type}) Ignoring..."
165
+ end
166
+ end
167
+ end
168
+
169
+ def processJSONLinkset(href:)
170
+ _headers, linkset = fetch(href, { 'Accept' => 'application/linkset+json' })
171
+ # warn "Linkset body #{linkset.inspect}"
172
+
173
+ return nil unless linkset
174
+
175
+ # linkset = '{ "linkset":
176
+ # [
177
+ # { "anchor": "http://example.net/bar",
178
+ # "item": [
179
+ # {"href": "http://example.com/foo1", "type": "text/html"},
180
+ # {"href": "http://example.com/foo2"}
181
+ # ],
182
+ # "next": [
183
+ # {"href": "http://the.next/"}
184
+ # ]
185
+ # }
186
+ # ]
187
+ # }'
188
+
189
+ linkset = JSON.parse(linkset)
190
+ linkset['linkset'].each do |ls|
191
+ # warn ls.inspect, "\n"
192
+ anchor = ls['anchor'] || @default_anchor
193
+ ls.delete('anchor') if ls['anchor'] # we need to delete since all others have a list as a value
194
+ attrhash = {}
195
+ # warn ls.keys, "\n"
196
+
197
+ ls.each_key do |reltype| # key = e.g. "item", "described-by". "cite"
198
+ # warn reltype, "\n"
199
+ # warn ls[reltype], "\n"
200
+ ls[reltype].each do |attrs| # attr = e.g. {"href": "http://example.com/foo1", "type": "text/html"}
201
+ next unless attrs['href'] # this is a required attribute of a linkset relation
202
+
203
+ href = attrs['href']
204
+ # now go through the other attributes of that relation
205
+ attrs.each do |attr, val| # attr = e.g. "type"; val = "text/html"
206
+ attrhash[attr.to_sym] = val
207
+ end
208
+ end
209
+ factory.new_link(responsepart: :linkset, href: href, relation: reltype, anchor: anchor, **attrhash)
210
+ end
211
+ end
212
+ end
213
+
214
+ def processTextLinkset(href:)
215
+ headers, linkset = fetch(href, { 'Accept' => 'application/linkset' })
216
+ # warn "linkset body #{linkset.inspect}"
217
+ return {} unless linkset
218
+
219
+ links = linkset.scan(/(<.*?>[^<]+)/) # split on the open angle bracket, which indicates a new link
220
+ # warn "Links found #{links}"
221
+
222
+ links.each do |ls|
223
+ # warn "workking on link #{ls}"
224
+ ls = ls.first # ls is a single element array
225
+ elements = ls.split(';') # semicolon delimited fields
226
+ # ["<https://w3id.org/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/>", "anchor=\"https://s11.no/2022/a2a-fair-metrics/08-http-describedby-citeas-linkset-txt/\"", "rel=\"cite-as\""]
227
+ href = elements.shift # first element is always the link url
228
+ # warn "working on link href #{href}"
229
+ href = href.match(/<([^>]+)>/)[1]
230
+ attrhash = {}
231
+ elements.each do |e|
232
+ key, val = e.split('=')
233
+ key.strip!
234
+ val.strip!
235
+ val.delete_prefix!('"').delete_suffix!('"') # get rid of newlines and start/end quotes
236
+ attrhash[key.to_sym] = val # split on key=val and make key a symbol
237
+ end
238
+ warn "No link relation type... this is bad! Skipping" unless attrhash[:rel]
239
+ next unless attrhash[:rel]
240
+ reltype = attrhash[:rel]
241
+ attrhash.delete(:rel)
242
+ anchor = attrhash[:anchor] || @default_anchor
243
+ attrhash.delete(:anchor)
244
+
245
+ factory.new_link(responsepart: :linkset, href: href, relation: reltype, anchor: anchor, **attrhash)
246
+ # warn "created #{[href, reltype, anchor, **attrhash]}"
247
+ end
248
+ end
249
+ end
250
+ end
@@ -0,0 +1,39 @@
1
+ def fetch(url, headers = ACCEPT_ALL_HEADER) # we will try to retrieve turtle whenever possible
2
+   # Performs an HTTP GET on url and returns [headers, body]; returns [false, false] when no usable response exists
3
+
4
+   # warn "executing call over the Web to #{url.to_s}"
5
+   response = RestClient::Request.execute({
6
+                                              method: :get,
7
+                                              url: url.to_s,
8
+                                              # user: user,
9
+                                              # password: pass,
10
+                                              headers: headers
11
+                                            })
12
+
13
+   # warn "There was a response to the call #{url.to_s}"
14
+   # warn "Response code #{response.code}"
15
+   if response.code == 203
16
+     warn "WARN: Response is non-authoritative (HTTP response code: #{response.code}).  Headers may have been manipulated encountered when trying to resolve #{url}\n"
17
+   end
18
+   [response.headers, response.body]
19
+ rescue RestClient::ExceptionWithResponse => e
20
+   warn "EXCEPTION WITH RESPONSE! #{e.response}\n#{e.response&.headers}"
21
+   warn "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
22
+   if e.response.nil? || e.response.code == 500 # the response can be nil (e.g. on a timeout), so guard before reading it
23
+     [false, false]
24
+   else
25
+     [e.response.headers, e.response.body]
26
+   end
27
+   # now we are returning the headers and body that were returned
28
+ rescue RestClient::Exception => e
29
+   warn "EXCEPTION WITH NO RESPONSE! #{e}"
30
+   warn "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
31
+   [false, false]
32
+   # now we are returning 'False', and we will check that with an \"if\" statement in our main code
33
+ rescue StandardError => e # StandardError, not Exception, so Interrupt and SystemExit still propagate
34
+   warn "EXCEPTION UNKNOWN! #{e}"
35
+   warn "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
36
+   [false, false]
37
+   # now we are returning 'False', and we will check that with an \"if\" statement in our main code
38
+   # you can capture the Exception and do something useful with it!\n",
39
+ end
@@ -0,0 +1,6 @@
1
+ module LinkHeaders
2
+   class Processor
3
+     VERSION: String
4
+     # See the writing guide of rbs: https://github.com/ruby/rbs#guides
5
+   end
6
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe LinkHeader::Parser do
4
+ it 'has a version number' do
5
+ expect(LinkHeader::Parser::VERSION).not_to be nil
6
+ end
7
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "linkset/parser"
4
+
5
+ RSpec.configure do |config|
6
+ # Enable flags like --only-failures and --next-failure
7
+ config.example_status_persistence_file_path = ".rspec_status"
8
+
9
+ # Disable RSpec exposing methods globally on `Module` and `main`
10
+ config.disable_monkey_patching!
11
+
12
+ config.expect_with :rspec do |c|
13
+ c.syntax = :expect
14
+ end
15
+ end
data/testme.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'linkheader/processor'
2
+ #require_relative 'lib/linkheader/processor'
3
+ require 'rest-client'
4
+
5
+ url1 = "https://s11.no/2022/a2a-fair-metrics/07-http-describedby-citeas-linkset-json/"
6
+ url2 = "https://s11.no/2022/a2a-fair-metrics/28-http-linkset-txt-only/"
7
+
8
+ p = LinkHeader::Parser.new(default_anchor: url1)
9
+ r = RestClient.get(url1)
10
+
11
+ p.extract_and_parse(response: r)
12
+ factory = p.factory
13
+
14
+ factory.all_links.each do |l|
15
+ puts l.href
16
+ puts l.relation
17
+ puts l.responsepart
18
+ puts
19
+ puts
20
+ end
21
+
22
+
23
+
24
+ p = LinkHeader::Parser.new(default_anchor: url2)
25
+ r = RestClient.get(url2)
26
+
27
+ p.extract_and_parse(response: r)
28
+ factory = p.factory
29
+
30
+ factory.all_links.each do |l|
31
+ puts l.href
32
+ puts l.relation
33
+ puts l.responsepart
34
+ puts
35
+ puts
36
+ end
37
+
metadata ADDED
@@ -0,0 +1,164 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: linkheaders-processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.8
5
+ platform: ruby
6
+ authors:
7
+ - Mark Wilkinson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-07-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rest-client
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: json
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: json-ld
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.2'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: json-ld-preloaded
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.2'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.2'
83
+ - !ruby/object:Gem::Dependency
84
+ name: securerandom
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.1.0
97
+ - !ruby/object:Gem::Dependency
98
+ name: metainspector
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 5.11.2
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 5.11.2
111
+ description: A parser/processor for Link Headers and Linksets in both JSON and Text
112
+ formats.
113
+ email:
114
+ - markw@illuminae.com
115
+ executables: []
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - CHANGELOG.md
120
+ - Gemfile
121
+ - Gemfile.lock
122
+ - LICENSE
123
+ - README.md
124
+ - Rakefile
125
+ - launch.json
126
+ - lib/linkheaders/constants.rb
127
+ - lib/linkheaders/link.rb
128
+ - lib/linkheaders/processor.rb
129
+ - lib/linkheaders/processor/version.rb
130
+ - lib/linkheaders/web_utils.rb
131
+ - sig/linkheader/parser.rbs
132
+ - spec/linkheader/parser_spec.rb
133
+ - spec/spec_helper.rb
134
+ - testme.rb
135
+ homepage: https://github.com/markwilkinson/linkheader-processor
136
+ licenses:
137
+ - MIT
138
+ metadata:
139
+ allowed_push_host: https://rubygems.org
140
+ homepage_uri: https://github.com/markwilkinson/linkheader-processor
141
+ source_code_uri: https://github.com/markwilkinson/linkheader-processor
142
+ changelog_uri: https://github.com/markwilkinson/linkheader-processor/blob/master/CHANGELOG.md
143
+ post_install_message:
144
+ rdoc_options: []
145
+ require_paths:
146
+ - lib
147
+ required_ruby_version: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: 3.0.0
152
+ required_rubygems_version: !ruby/object:Gem::Requirement
153
+ requirements:
154
+ - - ">="
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ requirements: []
158
+ rubygems_version: 3.2.28
159
+ signing_key:
160
+ specification_version: 4
161
+ summary: A parser/processor for Link Headers and Linksets in both JSON and Text formats.
162
+ test_files:
163
+ - spec/linkheader/parser_spec.rb
164
+ - spec/spec_helper.rb