nokogiri-html-ext 1.2.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +7 -3
- data/lib/nokogiri/html-ext.rb +1 -1
- data/lib/nokogiri/html_ext/document.rb +29 -47
- data/nokogiri-html-ext.gemspec +2 -1
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 81844b7536086078b24d1600282bb324c1325cc62b527b45bf81a13090ff413a
|
4
|
+
data.tar.gz: fdda5725873079052a6315c6d19a388dce12586c54ed88cb42321f215f2826fe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 307a4c317c840578338d253d1c6c8586ab863ec97f32cde0339069c8a096220c6863d718d305f000357cee68fdfb4e5b36e7cc8a91eb27e7bcd55045b7cf7a77
|
7
|
+
data.tar.gz: d1701b55fe58d9b80378b2a9b09a5c73a0c528895869aa99ecd01ce6d4b291c6108708b5eb6ea55c1201e545c975e85af14489b58272282473451b27a4c2e929
|
data/README.md
CHANGED
@@ -79,11 +79,12 @@ doc.at_css("base").to_s
|
|
79
79
|
|
80
80
|
nokogiri-html-ext will resolve a document's relative URLs against a provided source URL. The source URL _should_ be an absolute URL (e.g. `https://jgarber.example`) representing the location of the document being parsed. The source URL _may_ be any `String` (or any Ruby object that responds to `#to_s`).
|
81
81
|
|
82
|
-
nokogiri-html-ext takes advantage of [the `Nokogiri::XML::Document.parse` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/xml/document.rb#L48)'s second positional argument to set the parsed document's URL.Nokogiri's source code is _very_ complex, but in short: [the `Nokogiri::HTML` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/html.rb#L7-L8) is an alias to [the `Nokogiri::HTML4` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/html4.rb#
|
82
|
+
nokogiri-html-ext takes advantage of [the `Nokogiri::XML::Document.parse` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/xml/document.rb#L48)'s second positional argument to set the parsed document's URL. Nokogiri's source code is _very_ complex, but in short: [the `Nokogiri::HTML` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/html.rb#L7-L8) is an alias to [the `Nokogiri::HTML4` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/html4.rb#L6-L9) which eventually winds its way to the aforementioned `Nokogiri::XML::Document.parse` method. _Phew._ 🥵
|
83
83
|
|
84
|
-
URL resolution uses
|
84
|
+
URL resolution uses [the Addressable gem](https://rubygems.org/gems/addressable)'s URL parsing and normalizing capabilities. Absolute URLs will remain unmodified.
|
85
85
|
|
86
|
-
|
86
|
+
> [!NOTE]
|
87
|
+
> If the document's markup includes a `<base>` element whose `href` attribute is an absolute URL, _that_ URL will take precedence when performing URL resolution.
|
87
88
|
|
88
89
|
An abbreviated example:
|
89
90
|
|
@@ -133,6 +134,9 @@ doc.resolve_relative_url("biz/baz")
|
|
133
134
|
#=> "https://jgarber.example/foo/biz/baz"
|
134
135
|
```
|
135
136
|
|
137
|
+
> [!NOTE]
|
138
|
+
> Nokogiri's default `Nokogiri::HTML` method returns a `Nokogiri::HTML4::Document` which will encode URLs with non-ASCII characters. For example, `☠️.example` will be encoded as `%25E2%2598%25A0%25EF%25B8%258F.example`. For a more consistent experience, use the `Nokogiri::HTML5` method which does not encode URLs in this manner.
|
139
|
+
|
136
140
|
## Acknowledgments
|
137
141
|
|
138
142
|
nokogiri-html-ext wouldn't exist without the [Nokogiri](https://nokogiri.org) project and its [community](https://github.com/sparklemotion/nokogiri).
|
data/lib/nokogiri/html-ext.rb
CHANGED
@@ -7,13 +7,11 @@ module Nokogiri
|
|
7
7
|
#
|
8
8
|
# @see https://html.spec.whatwg.org/#srcset-attributes
|
9
9
|
# @see https://html.spec.whatwg.org/#attributes-3
|
10
|
-
|
10
|
+
SRCSET_ATTRIBUTES_MAP = {
|
11
11
|
"imagesrcset" => ["link"],
|
12
12
|
"srcset" => ["img", "source"],
|
13
13
|
}.freeze
|
14
14
|
|
15
|
-
private_constant :IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP
|
16
|
-
|
17
15
|
# A map of HTML URL attributes and their associated element names.
|
18
16
|
#
|
19
17
|
# @see https://html.spec.whatwg.org/#attributes-3
|
@@ -28,17 +26,16 @@ module Nokogiri
|
|
28
26
|
"src" => ["audio", "embed", "iframe", "img", "input", "script", "source", "track", "video"],
|
29
27
|
}.freeze
|
30
28
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
private_constant :URI_PARSER
|
29
|
+
ATTRIBUTES_XPATHS =
|
30
|
+
URL_ATTRIBUTES_MAP.merge(SRCSET_ATTRIBUTES_MAP).flat_map do |attribute, names|
|
31
|
+
names.map { |name| ".//#{name} / @#{attribute}" }
|
32
|
+
end
|
36
33
|
|
37
34
|
# Get the +<base>+ element's HREF attribute value.
|
38
35
|
#
|
39
36
|
# @return [String, nil]
|
40
37
|
def base_href
|
41
|
-
|
38
|
+
at_xpath("//base / @href")&.value&.strip
|
42
39
|
end
|
43
40
|
|
44
41
|
# Set the +<base>+ element's HREF attribute value.
|
@@ -65,15 +62,10 @@ module Nokogiri
|
|
65
62
|
#
|
66
63
|
# @return [String]
|
67
64
|
def resolve_relative_url(url)
|
68
|
-
strs = [
|
69
|
-
|
70
|
-
strs.compact!
|
71
|
-
strs.map! { |str| URI_PARSER.escape(str) }
|
65
|
+
strs = [document.url, base_href, url].compact
|
72
66
|
|
73
|
-
|
74
|
-
|
75
|
-
URI_PARSER.unescape(URI_PARSER.join(*strs).normalize.to_s)
|
76
|
-
rescue URI::InvalidComponentError, URI::InvalidURIError
|
67
|
+
Addressable::URI.join(*strs).to_s
|
68
|
+
rescue Addressable::URI::InvalidURIError
|
77
69
|
url
|
78
70
|
end
|
79
71
|
|
@@ -81,16 +73,15 @@ module Nokogiri
|
|
81
73
|
#
|
82
74
|
# @return [self]
|
83
75
|
def resolve_relative_urls!
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
candidates.join(", ")
|
76
|
+
xpath(*ATTRIBUTES_XPATHS).each do |attr_node|
|
77
|
+
stripped_value = attr_node.value.strip
|
78
|
+
|
79
|
+
attr_node.value =
|
80
|
+
if SRCSET_ATTRIBUTES_MAP.key?(attr_node.name)
|
81
|
+
resolve_srcset_attributes(stripped_value.split(/\s*,\s*/))
|
82
|
+
else
|
83
|
+
resolve_relative_url(stripped_value)
|
84
|
+
end
|
94
85
|
end
|
95
86
|
|
96
87
|
self
|
@@ -98,29 +89,20 @@ module Nokogiri
|
|
98
89
|
|
99
90
|
private
|
100
91
|
|
101
|
-
# +
|
102
|
-
#
|
103
|
-
# is returned as +"https%3A//%25E2%2598%25A0%25EF%25B8%258F.example+.
|
92
|
+
# Resolve a set of +String+s that represent +srcset+ attribute image
|
93
|
+
# candidate strings.
|
104
94
|
#
|
105
|
-
# @
|
106
|
-
def doc_url_str
|
107
|
-
@doc_url_str ||= URI_PARSER.unescape(URI_PARSER.unescape(document.url)).strip
|
108
|
-
end
|
109
|
-
|
110
|
-
# @param attribute [String]
|
111
|
-
# @param names [Array<String>]
|
95
|
+
# @param srcset_attributes [Array<String>]
|
112
96
|
#
|
113
|
-
# @return [
|
114
|
-
def
|
115
|
-
|
116
|
-
|
97
|
+
# @return [String]
|
98
|
+
def resolve_srcset_attributes(srcset_attributes)
|
99
|
+
srcset_attributes.map! do |candidate_string|
|
100
|
+
# rubocop:disable Style/PerlBackrefs
|
101
|
+
candidate_string.sub(/^(.+?)(\s+.+)?$/) { "#{resolve_relative_url($1)}#{$2}" }
|
102
|
+
# rubocop:enable Style/PerlBackrefs
|
103
|
+
end
|
117
104
|
|
118
|
-
|
119
|
-
attributes_map
|
120
|
-
.map { |attribute, names| node_sets_from(attribute, names) }
|
121
|
-
.each do |attribute, node_set|
|
122
|
-
node_set.each { |node| node[attribute] = yield node[attribute] }
|
123
|
-
end
|
105
|
+
srcset_attributes.join(", ")
|
124
106
|
end
|
125
107
|
end
|
126
108
|
end
|
data/nokogiri-html-ext.gemspec
CHANGED
@@ -4,7 +4,7 @@ Gem::Specification.new do |spec|
|
|
4
4
|
spec.required_ruby_version = ">= 2.7"
|
5
5
|
|
6
6
|
spec.name = "nokogiri-html-ext"
|
7
|
-
spec.version = "1.
|
7
|
+
spec.version = "1.4.0"
|
8
8
|
spec.authors = ["Jason Garber"]
|
9
9
|
spec.email = ["jason@sixtwothree.org"]
|
10
10
|
|
@@ -28,5 +28,6 @@ Gem::Specification.new do |spec|
|
|
28
28
|
"source_code_uri" => "#{spec.homepage}/src/tag/v#{spec.version}",
|
29
29
|
}
|
30
30
|
|
31
|
+
spec.add_dependency "addressable", "~> 2.8.7"
|
31
32
|
spec.add_dependency "nokogiri", "~> 1.14"
|
32
33
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogiri-html-ext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jason Garber
|
@@ -9,6 +9,20 @@ bindir: bin
|
|
9
9
|
cert_chain: []
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: addressable
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - "~>"
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: 2.8.7
|
19
|
+
type: :runtime
|
20
|
+
prerelease: false
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - "~>"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: 2.8.7
|
12
26
|
- !ruby/object:Gem::Dependency
|
13
27
|
name: nokogiri
|
14
28
|
requirement: !ruby/object:Gem::Requirement
|
@@ -41,11 +55,11 @@ licenses:
|
|
41
55
|
- MIT
|
42
56
|
metadata:
|
43
57
|
bug_tracker_uri: https://codeberg.org/jgarber/nokogiri-html-ext/issues
|
44
|
-
changelog_uri: https://codeberg.org/jgarber/nokogiri-html-ext/releases/tag/v1.
|
45
|
-
documentation_uri: https://rubydoc.info/gems/nokogiri-html-ext/1.
|
58
|
+
changelog_uri: https://codeberg.org/jgarber/nokogiri-html-ext/releases/tag/v1.4.0
|
59
|
+
documentation_uri: https://rubydoc.info/gems/nokogiri-html-ext/1.4.0
|
46
60
|
homepage_uri: https://codeberg.org/jgarber/nokogiri-html-ext
|
47
61
|
rubygems_mfa_required: 'true'
|
48
|
-
source_code_uri: https://codeberg.org/jgarber/nokogiri-html-ext/src/tag/v1.
|
62
|
+
source_code_uri: https://codeberg.org/jgarber/nokogiri-html-ext/src/tag/v1.4.0
|
49
63
|
rdoc_options: []
|
50
64
|
require_paths:
|
51
65
|
- lib
|