nokogiri-html-ext 1.2.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 516557e7b4a1a13c0e5b4e45afa3f5af1bd76bbe16fa7166336f5e0860ee5395
4
- data.tar.gz: 99f1f8d2be3b02c5db36e24885f57688c47269715c25485b95559eea6942b5da
3
+ metadata.gz: 81844b7536086078b24d1600282bb324c1325cc62b527b45bf81a13090ff413a
4
+ data.tar.gz: fdda5725873079052a6315c6d19a388dce12586c54ed88cb42321f215f2826fe
5
5
  SHA512:
6
- metadata.gz: 6d5a5a8271ec9024f795abf5aeae30b815ce37af003c5017986799009c967e8211a5cb35ab5ee359146bc361a025c895dc39e0d536d4d524a5378d8160460f13
7
- data.tar.gz: 1a0bbb92cd2dae7ee7cde736a73a98619d1c5f298b2c89906889d445ef6528d6e3ddca7d4bbd907b69df1dd99d6aeb67b7965c9c334c01de7ad97fe54411818a
6
+ metadata.gz: 307a4c317c840578338d253d1c6c8586ab863ec97f32cde0339069c8a096220c6863d718d305f000357cee68fdfb4e5b36e7cc8a91eb27e7bcd55045b7cf7a77
7
+ data.tar.gz: d1701b55fe58d9b80378b2a9b09a5c73a0c528895869aa99ecd01ce6d4b291c6108708b5eb6ea55c1201e545c975e85af14489b58272282473451b27a4c2e929
data/README.md CHANGED
@@ -79,11 +79,12 @@ doc.at_css("base").to_s
79
79
 
80
80
  nokogiri-html-ext will resolve a document's relative URLs against a provided source URL. The source URL _should_ be an absolute URL (e.g. `https://jgarber.example`) representing the location of the document being parsed. The source URL _may_ be any `String` (or any Ruby object that responds to `#to_s`).
81
81
 
82
- nokogiri-html-ext takes advantage of [the `Nokogiri::XML::Document.parse` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/xml/document.rb#L48)'s second positional argument to set the parsed document's URL.Nokogiri's source code is _very_ complex, but in short: [the `Nokogiri::HTML` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/html.rb#L7-L8) is an alias to [the `Nokogiri::HTML4` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/html4.rb#L10-L12) which eventually winds its way to the aforementioned `Nokogiri::XML::Document.parse` method. _Phew._ 🥵
82
+ nokogiri-html-ext takes advantage of [the `Nokogiri::XML::Document.parse` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/xml/document.rb#L48)'s second positional argument to set the parsed document's URL. Nokogiri's source code is _very_ complex, but in short: [the `Nokogiri::HTML` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/html.rb#L7-L8) is an alias to [the `Nokogiri::HTML4` method](https://github.com/sparklemotion/nokogiri/blob/main/lib/nokogiri/html4.rb#L6-L9) which eventually winds its way to the aforementioned `Nokogiri::XML::Document.parse` method. _Phew._ 🥵
83
83
 
84
- URL resolution uses Ruby's built-in URL parsing and normalizing capabilities. Absolute URLs will remain unmodified.
84
+ URL resolution uses [the Addressable gem](https://rubygems.org/gems/addressable)'s URL parsing and normalizing capabilities. Absolute URLs will remain unmodified.
85
85
 
86
- **Note:** If the document's markup includes a `<base>` element whose `href` attribute is an absolute URL, _that_ URL will take precedence when performing URL resolution.
86
+ > [!NOTE]
87
+ > If the document's markup includes a `<base>` element whose `href` attribute is an absolute URL, _that_ URL will take precedence when performing URL resolution.
87
88
 
88
89
  An abbreviated example:
89
90
 
@@ -133,6 +134,9 @@ doc.resolve_relative_url("biz/baz")
133
134
  #=> "https://jgarber.example/foo/biz/baz"
134
135
  ```
135
136
 
137
+ > [!NOTE]
138
+ > Nokogiri's default `Nokogiri::HTML` method returns a `Nokogiri::HTML4::Document` which will encode URLs with non-ASCII characters. For example, `☠️.example` will be encoded as `%25E2%2598%25A0%25EF%25B8%258F.example`. For a more consistent experience, use the `Nokogiri::HTML5` method which does not encode URLs in this manner.
139
+
136
140
  ## Acknowledgments
137
141
 
138
142
  nokogiri-html-ext wouldn't exist without the [Nokogiri](https://nokogiri.org) project and its [community](https://github.com/sparklemotion/nokogiri).
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "addressable/uri"
3
4
  require "nokogiri"
4
- require "uri"
5
5
 
6
6
  require_relative "html_ext/document"
@@ -7,13 +7,11 @@ module Nokogiri
7
7
  #
8
8
  # @see https://html.spec.whatwg.org/#srcset-attributes
9
9
  # @see https://html.spec.whatwg.org/#attributes-3
10
- IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP = {
10
+ SRCSET_ATTRIBUTES_MAP = {
11
11
  "imagesrcset" => ["link"],
12
12
  "srcset" => ["img", "source"],
13
13
  }.freeze
14
14
 
15
- private_constant :IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP
16
-
17
15
  # A map of HTML URL attributes and their associated element names.
18
16
  #
19
17
  # @see https://html.spec.whatwg.org/#attributes-3
@@ -28,17 +26,16 @@ module Nokogiri
28
26
  "src" => ["audio", "embed", "iframe", "img", "input", "script", "source", "track", "video"],
29
27
  }.freeze
30
28
 
31
- private_constant :URL_ATTRIBUTES_MAP
32
-
33
- URI_PARSER = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::Generic::DEFAULT_PARSER
34
-
35
- private_constant :URI_PARSER
29
+ ATTRIBUTES_XPATHS =
30
+ URL_ATTRIBUTES_MAP.merge(SRCSET_ATTRIBUTES_MAP).flat_map do |attribute, names|
31
+ names.map { |name| ".//#{name} / @#{attribute}" }
32
+ end
36
33
 
37
34
  # Get the +<base>+ element's HREF attribute value.
38
35
  #
39
36
  # @return [String, nil]
40
37
  def base_href
41
- (base = at_xpath("//base[@href]")) && base["href"].strip
38
+ at_xpath("//base / @href")&.value&.strip
42
39
  end
43
40
 
44
41
  # Set the +<base>+ element's HREF attribute value.
@@ -65,15 +62,10 @@ module Nokogiri
65
62
  #
66
63
  # @return [String]
67
64
  def resolve_relative_url(url)
68
- strs = [doc_url_str, base_href, url]
69
-
70
- strs.compact!
71
- strs.map! { |str| URI_PARSER.escape(str) }
65
+ strs = [document.url, base_href, url].compact
72
66
 
73
- # Escape each component before joining (Ruby's +URI.parse+ only likes
74
- # ASCII) and subsequently unescaping.
75
- URI_PARSER.unescape(URI_PARSER.join(*strs).normalize.to_s)
76
- rescue URI::InvalidComponentError, URI::InvalidURIError
67
+ Addressable::URI.join(*strs).to_s
68
+ rescue Addressable::URI::InvalidURIError
77
69
  url
78
70
  end
79
71
 
@@ -81,16 +73,15 @@ module Nokogiri
81
73
  #
82
74
  # @return [self]
83
75
  def resolve_relative_urls!
84
- resolve_relative_urls_for(URL_ATTRIBUTES_MAP) { |attribute| resolve_relative_url(attribute.strip) }
85
-
86
- resolve_relative_urls_for(IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP) do |attribute|
87
- candidates = attribute.split(/\s*,\s*/)
88
-
89
- # rubocop:disable Style/PerlBackrefs
90
- candidates.map! { |candidate| candidate.sub(/^(.+?)(\s+.+)?$/) { "#{resolve_relative_url($1)}#{$2}" } }
91
- # rubocop:enable Style/PerlBackrefs
92
-
93
- candidates.join(", ")
76
+ xpath(*ATTRIBUTES_XPATHS).each do |attr_node|
77
+ stripped_value = attr_node.value.strip
78
+
79
+ attr_node.value =
80
+ if SRCSET_ATTRIBUTES_MAP.key?(attr_node.name)
81
+ resolve_srcset_attributes(stripped_value.split(/\s*,\s*/))
82
+ else
83
+ resolve_relative_url(stripped_value)
84
+ end
94
85
  end
95
86
 
96
87
  self
@@ -98,29 +89,20 @@ module Nokogiri
98
89
 
99
90
  private
100
91
 
101
- # +Nokogiri::HTML4::Document#url+ may be double-escaped if the parser
102
- # detects non-ASCII characters. For example, +https://[skull emoji].example+
103
- # is returned as +"https%3A//%25E2%2598%25A0%25EF%25B8%258F.example+.
92
+ # Resolve a set of +String+s that represent +srcset+ attribute image
93
+ # candidate strings.
104
94
  #
105
- # @return [String]
106
- def doc_url_str
107
- @doc_url_str ||= URI_PARSER.unescape(URI_PARSER.unescape(document.url)).strip
108
- end
109
-
110
- # @param attribute [String]
111
- # @param names [Array<String>]
95
+ # @param srcset_attributes [Array<String>]
112
96
  #
113
- # @return [Array<String, Nokogiri::XML::NodeSet>]
114
- def node_sets_from(attribute, names)
115
- [attribute, xpath(*names.map { |name| "//#{name}[@#{attribute}]" })]
116
- end
97
+ # @return [String]
98
+ def resolve_srcset_attributes(srcset_attributes)
99
+ srcset_attributes.map! do |candidate_string|
100
+ # rubocop:disable Style/PerlBackrefs
101
+ candidate_string.sub(/^(.+?)(\s+.+)?$/) { "#{resolve_relative_url($1)}#{$2}" }
102
+ # rubocop:enable Style/PerlBackrefs
103
+ end
117
104
 
118
- def resolve_relative_urls_for(attributes_map)
119
- attributes_map
120
- .map { |attribute, names| node_sets_from(attribute, names) }
121
- .each do |attribute, node_set|
122
- node_set.each { |node| node[attribute] = yield node[attribute] }
123
- end
105
+ srcset_attributes.join(", ")
124
106
  end
125
107
  end
126
108
  end
@@ -4,7 +4,7 @@ Gem::Specification.new do |spec|
4
4
  spec.required_ruby_version = ">= 2.7"
5
5
 
6
6
  spec.name = "nokogiri-html-ext"
7
- spec.version = "1.2.1"
7
+ spec.version = "1.4.0"
8
8
  spec.authors = ["Jason Garber"]
9
9
  spec.email = ["jason@sixtwothree.org"]
10
10
 
@@ -28,5 +28,6 @@ Gem::Specification.new do |spec|
28
28
  "source_code_uri" => "#{spec.homepage}/src/tag/v#{spec.version}",
29
29
  }
30
30
 
31
+ spec.add_dependency "addressable", "~> 2.8.7"
31
32
  spec.add_dependency "nokogiri", "~> 1.14"
32
33
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogiri-html-ext
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jason Garber
@@ -9,6 +9,20 @@ bindir: bin
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: addressable
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: 2.8.7
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: 2.8.7
12
26
  - !ruby/object:Gem::Dependency
13
27
  name: nokogiri
14
28
  requirement: !ruby/object:Gem::Requirement
@@ -41,11 +55,11 @@ licenses:
41
55
  - MIT
42
56
  metadata:
43
57
  bug_tracker_uri: https://codeberg.org/jgarber/nokogiri-html-ext/issues
44
- changelog_uri: https://codeberg.org/jgarber/nokogiri-html-ext/releases/tag/v1.2.1
45
- documentation_uri: https://rubydoc.info/gems/nokogiri-html-ext/1.2.1
58
+ changelog_uri: https://codeberg.org/jgarber/nokogiri-html-ext/releases/tag/v1.4.0
59
+ documentation_uri: https://rubydoc.info/gems/nokogiri-html-ext/1.4.0
46
60
  homepage_uri: https://codeberg.org/jgarber/nokogiri-html-ext
47
61
  rubygems_mfa_required: 'true'
48
- source_code_uri: https://codeberg.org/jgarber/nokogiri-html-ext/src/tag/v1.2.1
62
+ source_code_uri: https://codeberg.org/jgarber/nokogiri-html-ext/src/tag/v1.4.0
49
63
  rdoc_options: []
50
64
  require_paths:
51
65
  - lib