nokogiri 1.16.8 → 1.17.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +11 -21
  3. data/README.md +4 -0
  4. data/dependencies.yml +6 -6
  5. data/ext/nokogiri/extconf.rb +191 -137
  6. data/ext/nokogiri/gumbo.c +69 -53
  7. data/ext/nokogiri/html4_document.c +10 -4
  8. data/ext/nokogiri/html4_element_description.c +18 -18
  9. data/ext/nokogiri/html4_sax_parser.c +40 -0
  10. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  11. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  12. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  13. data/ext/nokogiri/nokogiri.c +9 -2
  14. data/ext/nokogiri/nokogiri.h +18 -33
  15. data/ext/nokogiri/xml_attr.c +1 -1
  16. data/ext/nokogiri/xml_cdata.c +2 -10
  17. data/ext/nokogiri/xml_comment.c +3 -8
  18. data/ext/nokogiri/xml_document.c +163 -156
  19. data/ext/nokogiri/xml_document_fragment.c +10 -25
  20. data/ext/nokogiri/xml_dtd.c +1 -1
  21. data/ext/nokogiri/xml_element_content.c +9 -9
  22. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  23. data/ext/nokogiri/xml_namespace.c +6 -6
  24. data/ext/nokogiri/xml_node.c +134 -103
  25. data/ext/nokogiri/xml_node_set.c +46 -44
  26. data/ext/nokogiri/xml_reader.c +54 -58
  27. data/ext/nokogiri/xml_relax_ng.c +35 -56
  28. data/ext/nokogiri/xml_sax_parser.c +156 -88
  29. data/ext/nokogiri/xml_sax_parser_context.c +213 -131
  30. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  31. data/ext/nokogiri/xml_schema.c +50 -85
  32. data/ext/nokogiri/xml_syntax_error.c +19 -11
  33. data/ext/nokogiri/xml_text.c +2 -4
  34. data/ext/nokogiri/xml_xpath_context.c +2 -2
  35. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  36. data/gumbo-parser/src/error.c +76 -48
  37. data/gumbo-parser/src/error.h +5 -1
  38. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  39. data/gumbo-parser/src/parser.c +61 -23
  40. data/gumbo-parser/src/tokenizer.c +6 -6
  41. data/lib/nokogiri/class_resolver.rb +1 -1
  42. data/lib/nokogiri/css/node.rb +6 -2
  43. data/lib/nokogiri/css/parser.rb +6 -4
  44. data/lib/nokogiri/css/parser.y +2 -2
  45. data/lib/nokogiri/css/parser_extras.rb +6 -66
  46. data/lib/nokogiri/css/selector_cache.rb +38 -0
  47. data/lib/nokogiri/css/tokenizer.rb +4 -4
  48. data/lib/nokogiri/css/tokenizer.rex +9 -8
  49. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  50. data/lib/nokogiri/css.rb +86 -20
  51. data/lib/nokogiri/decorators/slop.rb +3 -5
  52. data/lib/nokogiri/encoding_handler.rb +2 -2
  53. data/lib/nokogiri/html4/document.rb +44 -23
  54. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  55. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  56. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  57. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  58. data/lib/nokogiri/html4.rb +9 -14
  59. data/lib/nokogiri/html5/builder.rb +40 -0
  60. data/lib/nokogiri/html5/document.rb +61 -30
  61. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  62. data/lib/nokogiri/html5/node.rb +4 -4
  63. data/lib/nokogiri/html5.rb +114 -72
  64. data/lib/nokogiri/version/constant.rb +1 -1
  65. data/lib/nokogiri/xml/builder.rb +8 -1
  66. data/lib/nokogiri/xml/document.rb +70 -26
  67. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  68. data/lib/nokogiri/xml/node.rb +82 -11
  69. data/lib/nokogiri/xml/node_set.rb +9 -7
  70. data/lib/nokogiri/xml/parse_options.rb +1 -1
  71. data/lib/nokogiri/xml/pp/node.rb +6 -1
  72. data/lib/nokogiri/xml/reader.rb +46 -13
  73. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  74. data/lib/nokogiri/xml/sax/document.rb +174 -83
  75. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  76. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  77. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  78. data/lib/nokogiri/xml/sax.rb +48 -0
  79. data/lib/nokogiri/xml/schema.rb +112 -45
  80. data/lib/nokogiri/xml/searchable.rb +6 -8
  81. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  82. data/lib/nokogiri/xml.rb +13 -24
  83. data/lib/nokogiri/xslt.rb +3 -9
  84. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  85. data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
  86. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  87. metadata +10 -7
  88. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  89. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  90. data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
  91. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
@@ -63,13 +63,13 @@ class Tokenizer
63
63
  when (text = @ss.scan(/has\([\s]*/))
64
64
  action { [:HAS, text] }
65
65
 
66
- when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
66
+ when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
67
67
  action { [:FUNCTION, text] }
68
68
 
69
- when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
69
+ when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
70
70
  action { [:IDENT, text] }
71
71
 
72
- when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
72
+ when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
73
73
  action { [:HASH, text] }
74
74
 
75
75
  when (text = @ss.scan(/[\s]*~=[\s]*/))
@@ -132,7 +132,7 @@ class Tokenizer
132
132
  when (text = @ss.scan(/[\s]+/))
133
133
  action { [:S, text] }
134
134
 
135
- when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*'/))
135
+ when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
136
136
  action { [:STRING, text] }
137
137
 
138
138
  when (text = @ss.scan(/./))
@@ -4,20 +4,21 @@ module CSS
4
4
  class Tokenizer
5
5
 
6
6
  macro
7
- nl \n|\r\n|\r|\f
7
+ nl (\n|\r\n|\r|\f)
8
8
  w [\s]*
9
9
  nonascii [^\0-\177]
10
10
  num -?([0-9]+|[0-9]*\.[0-9]+)
11
11
  unicode \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
12
12
 
13
- escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
14
- nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
15
- nmstart [_A-Za-z]|{nonascii}|{escape}
16
- ident -?({nmstart})({nmchar})*
17
- name ({nmchar})+
13
+ escape ({unicode}|\\[^\n\r\f0-9A-Fa-f])
14
+ nmchar ([_A-Za-z0-9-]|{nonascii}|{escape})
15
+ nmstart ([_A-Za-z]|{nonascii}|{escape})
16
+ name {nmstart}{nmchar}*
17
+ ident -?{name}
18
+ charref {nmchar}+
18
19
  string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
19
20
  string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
20
- string {string1}|{string2}
21
+ string ({string1}|{string2})
21
22
 
22
23
  rule
23
24
 
@@ -26,7 +27,7 @@ rule
26
27
  has\({w} { [:HAS, text] }
27
28
  {ident}\({w} { [:FUNCTION, text] }
28
29
  {ident} { [:IDENT, text] }
29
- \#{name} { [:HASH, text] }
30
+ \#{charref} { [:HASH, text] }
30
31
  {w}~={w} { [:INCLUDES, text] }
31
32
  {w}\|={w} { [:DASHMATCH, text] }
32
33
  {w}\^={w} { [:PREFIXMATCH, text] }
@@ -44,6 +44,18 @@ module Nokogiri
44
44
  VALUES = [XML, HTML4, HTML5]
45
45
  end
46
46
 
47
+ # The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
48
+ attr_reader :builtins
49
+
50
+ # The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
51
+ attr_reader :doctype
52
+
53
+ # The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
54
+ attr_reader :prefix
55
+
56
+ # The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
57
+ attr_reader :namespaces
58
+
47
59
  # :call-seq:
48
60
  # new() → XPathVisitor
49
61
  # new(builtins:, doctype:) → XPathVisitor
@@ -54,7 +66,12 @@ module Nokogiri
54
66
  #
55
67
  # [Returns] XPathVisitor
56
68
  #
57
- def initialize(builtins: BuiltinsConfig::NEVER, doctype: DoctypeConfig::XML)
69
+ def initialize(
70
+ builtins: BuiltinsConfig::NEVER,
71
+ doctype: DoctypeConfig::XML,
72
+ prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
73
+ namespaces: nil
74
+ )
58
75
  unless BuiltinsConfig::VALUES.include?(builtins)
59
76
  raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter")
60
77
  end
@@ -64,6 +81,8 @@ module Nokogiri
64
81
 
65
82
  @builtins = builtins
66
83
  @doctype = doctype
84
+ @prefix = prefix
85
+ @namespaces = namespaces
67
86
  end
68
87
 
69
88
  # :call-seq: config() → Hash
@@ -72,7 +91,7 @@ module Nokogiri
72
91
  # a Hash representing the configuration of the XPathVisitor, suitable for use as
73
92
  # part of the CSS cache key.
74
93
  def config
75
- { builtins: @builtins, doctype: @doctype }
94
+ { builtins: @builtins, doctype: @doctype, prefix: @prefix, namespaces: @namespaces }
76
95
  end
77
96
 
78
97
  # :stopdoc:
@@ -128,6 +147,8 @@ module Nokogiri
128
147
  is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
129
148
  ".#{"//" unless is_direct}#{node.value[1].accept(self)}"
130
149
  else
150
+ validate_xpath_function_name(node.value.first)
151
+
131
152
  # xpath function call, let's marshal those arguments
132
153
  args = ["."]
133
154
  args += node.value[1..-1].map do |n|
@@ -207,6 +228,7 @@ module Nokogiri
207
228
  when "parent" then "node()"
208
229
  when "root" then "not(parent::*)"
209
230
  else
231
+ validate_xpath_function_name(node.value.first)
210
232
  "nokogiri:#{node.value.first}(.)"
211
233
  end
212
234
  end
@@ -255,6 +277,14 @@ module Nokogiri
255
277
  else
256
278
  "*[local-name()='#{node.value.first}']"
257
279
  end
280
+ elsif node.value.length == 2 # has a namespace prefix
281
+ if node.value.first.nil? # namespace prefix is empty
282
+ node.value.last
283
+ else
284
+ node.value.join(":")
285
+ end
286
+ elsif @namespaces&.key?("xmlns") # apply the default namespace if it's declared
287
+ "xmlns:#{node.value.first}"
258
288
  else
259
289
  node.value.first
260
290
  end
@@ -270,11 +300,17 @@ module Nokogiri
270
300
 
271
301
  private
272
302
 
303
+ def validate_xpath_function_name(name)
304
+ if name.start_with?("-")
305
+ raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
306
+ end
307
+ end
308
+
273
309
  def html5_element_name_needs_namespace_handling(node)
274
- # if this is the wildcard selector "*", use it as normal
275
- node.value.first != "*" &&
276
- # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
277
- !node.value.first.include?(":")
310
+ # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
311
+ node.value.length == 1 &&
312
+ # if this is the wildcard selector "*", use it as normal
313
+ node.value.first != "*"
278
314
  end
279
315
 
280
316
  def nth(node, options = {})
data/lib/nokogiri/css.rb CHANGED
@@ -8,53 +8,119 @@ module Nokogiri
8
8
  # TODO: Deprecate this method ahead of 2.0 and delete it in 2.0.
9
9
  # It is not used by Nokogiri and shouldn't be part of the public API.
10
10
  def parse(selector) # :nodoc:
11
+ warn("Nokogiri::CSS.parse is deprecated and will be removed in a future version of Nokogiri. Use Nokogiri::CSS::Parser#parse instead.", uplevel: 1, category: :deprecated)
11
12
  Parser.new.parse(selector)
12
13
  end
13
14
 
14
15
  # :call-seq:
15
- # xpath_for(selector) → String
16
- # xpath_for(selector [, prefix:] [, visitor:] [, ns:]) → String
16
+ # xpath_for(selector_list) → Array<String>
17
+ # xpath_for(selector_list [, prefix:] [, ns:] [, visitor:] [, cache:]) → Array<String>
17
18
  #
18
- # Translate a CSS selector to the equivalent XPath query.
19
+ # Translate a CSS selector list to the equivalent XPath expressions.
20
+ #
21
+ # 💡 Note that translated queries are cached by default for performance concerns.
22
+ #
23
+ # ⚠ Users should prefer Nokogiri::XML::Searchable#css, which is mixed into all document and
24
+ # node classes, for querying documents with CSS selectors. This method is the underlying
25
+ # mechanism used by XML::Searchable and is provided solely for advanced users to translate
26
+ # \CSS selectors to XPath directly.
27
+ #
28
+ # Also see Nokogiri::XML::Searchable#css for documentation on supported CSS selector features,
29
+ # some extended syntax that Nokogiri supports, and advanced CSS features like pseudo-class
30
+ # functions.
19
31
  #
20
32
  # [Parameters]
21
- # - +selector+ (String) The CSS selector to be translated into XPath
33
+ # - +selector_list+ (String)
22
34
  #
35
+ # The CSS selector to be translated into XPath. This is always a String, but that string
36
+ # value may be a {selector list}[https://www.w3.org/TR/selectors-4/#grouping] (see
37
+ # examples).
38
+ #
39
+ # [Keyword arguments]
23
40
  # - +prefix:+ (String)
24
41
  #
25
- # The XPath prefix for the query, see Nokogiri::XML::XPath for some options. Default is
26
- # +XML::XPath::GLOBAL_SEARCH_PREFIX+.
42
+ # The XPath expression prefix which determines the search context. See Nokogiri::XML::XPath
43
+ # for standard options. Default is +XPath::GLOBAL_SEARCH_PREFIX+.
44
+ #
45
+ # - +ns:+ (Hash<String ⇒ String>, nil)
46
+ #
47
+ # Namespaces that are referenced in the query, if any. This is a hash where the keys are the
48
+ # namespace prefix and the values are the namespace URIs. Default is +nil+ indicating an
49
+ # empty set of namespaces.
27
50
  #
28
51
  # - +visitor:+ (Nokogiri::CSS::XPathVisitor)
29
52
  #
30
- # The visitor class to use to transform the AST into XPath. Default is
31
- # +Nokogiri::CSS::XPathVisitor.new+.
53
+ # Use this XPathVisitor object to transform the CSS AST into XPath expressions. See
54
+ # Nokogiri::CSS::XPathVisitor for more information on some of the complex behavior that can
55
+ # be customized for your document type. Default is +Nokogiri::CSS::XPathVisitor.new+.
56
+ #
57
+ # ⚠ Note that this option is mutually exclusive with +prefix+ and +ns+. If +visitor+ is
58
+ # provided, +prefix+ and +ns+ must not be present.
59
+ #
60
+ # - +cache:+ (Boolean)
61
+ #
62
+ # Whether to use the SelectorCache for the translated query to ensure that repeated queries
63
+ # don't incur the overhead of re-parsing the selector. Default is +true+.
32
64
  #
33
- # - +ns:+ (Hash<String ⇒ String>)
65
+ # [Returns] (Array<String>) The equivalent set of XPath expressions for +selector_list+
34
66
  #
35
- # The namespaces that are referenced in the query, if any. This is a hash where the keys are
36
- # the namespace prefix and the values are the namespace URIs. Default is an empty Hash.
67
+ # *Example* with a simple selector:
37
68
  #
38
- # [Returns] (String) The equivalent XPath query for +selector+
69
+ # Nokogiri::CSS.xpath_for("div") # => ["//div"]
39
70
  #
40
- # 💡 Note that translated queries are cached for performance concerns.
71
+ # *Example* with a compound selector:
41
72
  #
42
- def xpath_for(selector, options = {})
43
- raise TypeError, "no implicit conversion of #{selector.inspect} to String" unless selector.respond_to?(:to_str)
73
+ # Nokogiri::CSS.xpath_for("div.xl") # => ["//div[contains(concat(' ',normalize-space(@class),' '),' xl ')]"]
74
+ #
75
+ # *Example* with a complex selector:
76
+ #
77
+ # Nokogiri::CSS.xpath_for("h1 + div") # => ["//h1/following-sibling::*[1]/self::div"]
78
+ #
79
+ # *Example* with a selector list:
80
+ #
81
+ # Nokogiri::CSS.xpath_for("h1, h2, h3") # => ["//h1", "//h2", "//h3"]
82
+ #
83
+ def xpath_for(
84
+ selector, options = nil,
85
+ prefix: options&.delete(:prefix),
86
+ visitor: options&.delete(:visitor),
87
+ ns: options&.delete(:ns),
88
+ cache: true
89
+ )
90
+ unless options.nil?
91
+ warn("Nokogiri::CSS.xpath_for: Passing options as an explicit hash is deprecated. Use keyword arguments instead. This will become an error in a future release.", uplevel: 1, category: :deprecated)
92
+ end
93
+
94
+ raise(TypeError, "no implicit conversion of #{selector.inspect} to String") unless selector.respond_to?(:to_str)
44
95
 
45
96
  selector = selector.to_str
46
- raise Nokogiri::CSS::SyntaxError, "empty CSS selector" if selector.empty?
97
+ raise(Nokogiri::CSS::SyntaxError, "empty CSS selector") if selector.empty?
98
+
99
+ if visitor
100
+ raise ArgumentError, "cannot provide both :prefix and :visitor" if prefix
101
+ raise ArgumentError, "cannot provide both :ns and :visitor" if ns
102
+ end
103
+
104
+ visitor ||= begin
105
+ visitor_kw = {}
106
+ visitor_kw[:prefix] = prefix if prefix
107
+ visitor_kw[:namespaces] = ns if ns
47
108
 
48
- prefix = options.fetch(:prefix, Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX)
49
- visitor = options.fetch(:visitor) { Nokogiri::CSS::XPathVisitor.new }
50
- ns = options.fetch(:ns, {})
109
+ Nokogiri::CSS::XPathVisitor.new(**visitor_kw)
110
+ end
51
111
 
52
- Parser.new(ns).xpath_for(selector, prefix, visitor)
112
+ if cache
113
+ key = SelectorCache.key(selector: selector, visitor: visitor)
114
+ SelectorCache[key] ||= Parser.new.xpath_for(selector, visitor)
115
+ else
116
+ Parser.new.xpath_for(selector, visitor)
117
+ end
53
118
  end
54
119
  end
55
120
  end
56
121
  end
57
122
 
123
+ require_relative "css/selector_cache"
58
124
  require_relative "css/node"
59
125
  require_relative "css/xpath_visitor"
60
126
  x = $-w
@@ -23,11 +23,9 @@ module Nokogiri
23
23
  list = xpath("#{XPATH_PREFIX}#{name}[#{conds}]")
24
24
  end
25
25
  else
26
- CSS::Parser.without_cache do
27
- list = xpath(
28
- *CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX),
29
- )
30
- end
26
+ list = xpath(
27
+ *CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX, cache: false),
28
+ )
31
29
  end
32
30
 
33
31
  super if list.empty?
@@ -6,9 +6,9 @@ module Nokogiri
6
6
  # Popular encoding aliases not known by all iconv implementations that Nokogiri should support.
7
7
  USEFUL_ALIASES = {
8
8
  # alias_name => true_name
9
- "NOKOGIRI-SENTINEL" => "UTF-8", # indicating the Nokogiri has installed aliases
9
+ "ISO-2022-JP" => "ISO-2022-JP", # only for JRuby tests, this is a no-op in CRuby
10
+ "NOKOGIRI-SENTINEL" => "ISO-2022-JP", # indicating the Nokogiri has installed aliases
10
11
  "Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
11
- "UTF-8" => "UTF-8", # for JRuby tests, this is a no-op in CRuby
12
12
  }
13
13
 
14
14
  class << self
@@ -161,52 +161,73 @@ module Nokogiri
161
161
  end
162
162
 
163
163
  class << self
164
- ###
165
- # Parse HTML. +string_or_io+ may be a String, or any object that
166
- # responds to _read_ and _close_ such as an IO, or StringIO.
167
- # +url+ is resource where this document is located. +encoding+ is the
168
- # encoding that should be used when processing the document. +options+
169
- # is a number that sets options in the parser, such as
170
- # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
171
- # Nokogiri::XML::ParseOptions.
172
- def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
164
+ # :call-seq:
165
+ # parse(input) { |options| ... } => Nokogiri::HTML4::Document
166
+ # parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document
167
+ #
168
+ # Parse \HTML4 input from a String or IO object, and return a new HTML4::Document.
169
+ #
170
+ # [Required Parameters]
171
+ # - +input+ (String | IO) The content to be parsed.
172
+ #
173
+ # [Optional Keyword Arguments]
174
+ # - +url:+ (String) The base URI for this document.
175
+ #
176
+ # - +encoding:+ (String) The name of the encoding that should be used when processing the
177
+ # document. When not provided, the encoding will be determined based on the document
178
+ # content.
179
+ #
180
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
181
+ # behaviors during parsing. See ParseOptions for more information. The default value is
182
+ # +ParseOptions::DEFAULT_HTML+.
183
+ #
184
+ # [Yields]
185
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
186
+ # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
187
+ #
188
+ # [Returns] Nokogiri::HTML4::Document
189
+ def parse(
190
+ input,
191
+ url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
192
+ url: url_, encoding: encoding_, options: options_
193
+ )
173
194
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
174
195
  yield options if block_given?
175
196
 
176
- url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
197
+ url ||= input.respond_to?(:path) ? input.path : nil
177
198
 
178
- if string_or_io.respond_to?(:encoding)
179
- unless string_or_io.encoding == Encoding::ASCII_8BIT
180
- encoding ||= string_or_io.encoding.name
199
+ if input.respond_to?(:encoding)
200
+ unless input.encoding == Encoding::ASCII_8BIT
201
+ encoding ||= input.encoding.name
181
202
  end
182
203
  end
183
204
 
184
- if string_or_io.respond_to?(:read)
185
- if string_or_io.is_a?(Pathname)
205
+ if input.respond_to?(:read)
206
+ if input.is_a?(Pathname)
186
207
  # resolve the Pathname to the file and open it as an IO object, see #2110
187
- string_or_io = string_or_io.expand_path.open
188
- url ||= string_or_io.path
208
+ input = input.expand_path.open
209
+ url ||= input.path
189
210
  end
190
211
 
191
212
  unless encoding
192
- string_or_io = EncodingReader.new(string_or_io)
213
+ input = EncodingReader.new(input)
193
214
  begin
194
- return read_io(string_or_io, url, encoding, options.to_i)
215
+ return read_io(input, url, encoding, options.to_i)
195
216
  rescue EncodingReader::EncodingFound => e
196
217
  encoding = e.found_encoding
197
218
  end
198
219
  end
199
- return read_io(string_or_io, url, encoding, options.to_i)
220
+ return read_io(input, url, encoding, options.to_i)
200
221
  end
201
222
 
202
223
  # read_memory pukes on empty docs
203
- if string_or_io.nil? || string_or_io.empty?
224
+ if input.nil? || input.empty?
204
225
  return encoding ? new.tap { |i| i.encoding = encoding } : new
205
226
  end
206
227
 
207
- encoding ||= EncodingReader.detect_encoding(string_or_io)
228
+ encoding ||= EncodingReader.detect_encoding(input)
208
229
 
209
- read_memory(string_or_io, url, encoding, options.to_i)
230
+ read_memory(input, url, encoding, options.to_i)
210
231
  end
211
232
  end
212
233
  end
@@ -3,13 +3,83 @@
3
3
  module Nokogiri
4
4
  module HTML4
5
5
  class DocumentFragment < Nokogiri::XML::DocumentFragment
6
- ####
7
- # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
8
- def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
6
+ #
7
+ # :call-seq:
8
+ # parse(input) { |options| ... } → HTML4::DocumentFragment
9
+ # parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment
10
+ #
11
+ # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment. This
12
+ # method creates a new, empty HTML4::Document to contain the fragment.
13
+ #
14
+ # [Required Parameters]
15
+ # - +input+ (String | IO) The content to be parsed.
16
+ #
17
+ # [Optional Keyword Arguments]
18
+ # - +encoding:+ (String) The name of the encoding that should be used when processing the
19
+ # document. When not provided, the encoding will be determined based on the document
20
+ # content.
21
+ #
22
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
23
+ # behaviors during parsing. See ParseOptions for more information. The default value is
24
+ # +ParseOptions::DEFAULT_HTML+.
25
+ #
26
+ # [Yields]
27
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
28
+ # can be configured before parsing. See ParseOptions for more information.
29
+ #
30
+ # [Returns] HTML4::DocumentFragment
31
+ #
32
+ # *Example:* Parsing a string
33
+ #
34
+ # fragment = HTML4::DocumentFragment.parse("<div>Hello World</div>")
35
+ #
36
+ # *Example:* Parsing an IO
37
+ #
38
+ # fragment = File.open("fragment.html") do |file|
39
+ # HTML4::DocumentFragment.parse(file)
40
+ # end
41
+ #
42
+ # *Example:* Specifying encoding
43
+ #
44
+ # fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP")
45
+ #
46
+ # *Example:* Setting parse options dynamically
47
+ #
48
+ # HTML4::DocumentFragment.parse("<div>Hello World") do |options|
49
+ # options.huge.pedantic
50
+ # end
51
+ #
52
+ def self.parse(
53
+ input,
54
+ encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
55
+ encoding: encoding_, options: options_,
56
+ &block
57
+ )
58
+ # TODO: this method should take a context node.
9
59
  doc = HTML4::Document.new
10
60
 
11
- encoding ||= if tags.respond_to?(:encoding)
12
- encoding = tags.encoding
61
+ if input.respond_to?(:read)
62
+ # Handle IO-like objects (IO, File, StringIO, etc.)
63
+ # The _read_ method of these objects doesn't accept an +encoding+ parameter.
64
+ # Encoding is usually set when the IO object is created or opened,
65
+ # or by using the _set_encoding_ method.
66
+ #
67
+ # 1. If +encoding+ is provided and the object supports _set_encoding_,
68
+ # set the encoding before reading.
69
+ # 2. Read the content from the IO-like object.
70
+ #
71
+ # Note: After reading, the content's encoding will be:
72
+ # - The encoding set by _set_encoding_ if it was called
73
+ # - The default encoding of the IO object otherwise
74
+ #
75
+ # For StringIO specifically, _set_encoding_ affects only the internal string,
76
+ # not how the data is read out.
77
+ input.set_encoding(encoding) if encoding && input.respond_to?(:set_encoding)
78
+ input = input.read
79
+ end
80
+
81
+ encoding ||= if input.respond_to?(:encoding)
82
+ encoding = input.encoding
13
83
  if encoding == ::Encoding::ASCII_8BIT
14
84
  "UTF-8"
15
85
  else
@@ -21,29 +91,71 @@ module Nokogiri
21
91
 
22
92
  doc.encoding = encoding
23
93
 
24
- new(doc, tags, nil, options, &block)
94
+ new(doc, input, options: options, &block)
25
95
  end
26
96
 
27
- def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper
28
- return self unless tags
97
+ #
98
+ # :call-seq:
99
+ # new(document) { |options| ... } → HTML4::DocumentFragment
100
+ # new(document, input) { |options| ... } → HTML4::DocumentFragment
101
+ # new(document, input, context:, options:) { |options| ... } → HTML4::DocumentFragment
102
+ #
103
+ # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment.
104
+ #
105
+ # 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather
106
+ # than call this method directly.
107
+ #
108
+ # [Required Parameters]
109
+ # - +document+ (HTML4::Document) The parent document to associate the returned fragment with.
110
+ #
111
+ # [Optional Parameters]
112
+ # - +input+ (String) The content to be parsed.
113
+ #
114
+ # [Optional Keyword Arguments]
115
+ # - +context:+ (Nokogiri::XML::Node) The <b>context node</b> for the subtree created. See
116
+ # below for more information.
117
+ #
118
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
119
+ # behaviors during parsing. See ParseOptions for more information. The default value is
120
+ # +ParseOptions::DEFAULT_HTML+.
121
+ #
122
+ # [Yields]
123
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
124
+ # can be configured before parsing. See ParseOptions for more information.
125
+ #
126
+ # [Returns] HTML4::DocumentFragment
127
+ #
128
+ # === Context \Node
129
+ #
130
+ # If a context node is specified using +context:+, then the fragment will be created by
131
+ # calling XML::Node#parse on that node, so the parser will behave as if that Node is the
132
+ # parent of the fragment subtree.
133
+ #
134
+ def initialize(
135
+ document, input = nil,
136
+ context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
137
+ context: context_, options: options_
138
+ ) # rubocop:disable Lint/MissingSuper
139
+ return self unless input
29
140
 
30
141
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
142
+ @parse_options = options
31
143
  yield options if block_given?
32
144
 
33
- if ctx
145
+ if context
34
146
  preexisting_errors = document.errors.dup
35
- node_set = ctx.parse("<div>#{tags}</div>", options)
147
+ node_set = context.parse("<div>#{input}</div>", options)
36
148
  node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
37
149
  self.errors = document.errors - preexisting_errors
38
150
  else
39
151
  # This is a horrible hack, but I don't care
40
- path = if /^\s*?<body/i.match?(tags)
152
+ path = if /^\s*?<body/i.match?(input)
41
153
  "/html/body"
42
154
  else
43
155
  "/html/body/node()"
44
156
  end
45
157
 
46
- temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
158
+ temp_doc = HTML4::Document.parse("<html><body>#{input}", nil, document.encoding, options)
47
159
  temp_doc.xpath(path).each { |child| child.parent = self }
48
160
  self.errors = temp_doc.errors
49
161
  end
@@ -26,7 +26,7 @@ module Nokogiri
26
26
 
27
27
  def initialize
28
28
  @encoding = nil
29
- super()
29
+ super
30
30
  end
31
31
 
32
32
  def start_element(name, attrs = [])