nokogiri 1.16.3 → 1.18.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (95) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +14 -22
  3. data/LICENSE-DEPENDENCIES.md +6 -6
  4. data/README.md +8 -5
  5. data/dependencies.yml +9 -9
  6. data/ext/nokogiri/extconf.rb +188 -142
  7. data/ext/nokogiri/gumbo.c +69 -53
  8. data/ext/nokogiri/html4_document.c +10 -4
  9. data/ext/nokogiri/html4_element_description.c +18 -18
  10. data/ext/nokogiri/html4_sax_parser.c +40 -0
  11. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  12. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  13. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  14. data/ext/nokogiri/nokogiri.c +9 -2
  15. data/ext/nokogiri/nokogiri.h +18 -33
  16. data/ext/nokogiri/xml_attr.c +1 -1
  17. data/ext/nokogiri/xml_cdata.c +2 -10
  18. data/ext/nokogiri/xml_comment.c +3 -8
  19. data/ext/nokogiri/xml_document.c +163 -156
  20. data/ext/nokogiri/xml_document_fragment.c +10 -25
  21. data/ext/nokogiri/xml_dtd.c +1 -1
  22. data/ext/nokogiri/xml_element_content.c +9 -9
  23. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  24. data/ext/nokogiri/xml_namespace.c +6 -6
  25. data/ext/nokogiri/xml_node.c +141 -104
  26. data/ext/nokogiri/xml_node_set.c +46 -44
  27. data/ext/nokogiri/xml_reader.c +54 -58
  28. data/ext/nokogiri/xml_relax_ng.c +35 -56
  29. data/ext/nokogiri/xml_sax_parser.c +156 -88
  30. data/ext/nokogiri/xml_sax_parser_context.c +219 -131
  31. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  32. data/ext/nokogiri/xml_schema.c +50 -85
  33. data/ext/nokogiri/xml_syntax_error.c +19 -11
  34. data/ext/nokogiri/xml_text.c +2 -4
  35. data/ext/nokogiri/xml_xpath_context.c +103 -100
  36. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  37. data/gumbo-parser/src/ascii.c +2 -2
  38. data/gumbo-parser/src/error.c +76 -48
  39. data/gumbo-parser/src/error.h +5 -1
  40. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  41. data/gumbo-parser/src/parser.c +63 -25
  42. data/gumbo-parser/src/tokenizer.c +6 -6
  43. data/lib/nokogiri/class_resolver.rb +1 -1
  44. data/lib/nokogiri/css/node.rb +6 -2
  45. data/lib/nokogiri/css/parser.rb +6 -4
  46. data/lib/nokogiri/css/parser.y +2 -2
  47. data/lib/nokogiri/css/parser_extras.rb +6 -66
  48. data/lib/nokogiri/css/selector_cache.rb +38 -0
  49. data/lib/nokogiri/css/tokenizer.rb +4 -4
  50. data/lib/nokogiri/css/tokenizer.rex +9 -8
  51. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  52. data/lib/nokogiri/css.rb +86 -20
  53. data/lib/nokogiri/decorators/slop.rb +3 -5
  54. data/lib/nokogiri/encoding_handler.rb +2 -2
  55. data/lib/nokogiri/html4/document.rb +44 -23
  56. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  57. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  58. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  59. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  60. data/lib/nokogiri/html4.rb +9 -14
  61. data/lib/nokogiri/html5/builder.rb +40 -0
  62. data/lib/nokogiri/html5/document.rb +61 -30
  63. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  64. data/lib/nokogiri/html5/node.rb +4 -4
  65. data/lib/nokogiri/html5.rb +114 -72
  66. data/lib/nokogiri/version/constant.rb +1 -1
  67. data/lib/nokogiri/xml/builder.rb +8 -1
  68. data/lib/nokogiri/xml/document.rb +70 -26
  69. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  70. data/lib/nokogiri/xml/node.rb +82 -11
  71. data/lib/nokogiri/xml/node_set.rb +9 -7
  72. data/lib/nokogiri/xml/parse_options.rb +1 -1
  73. data/lib/nokogiri/xml/pp/node.rb +6 -1
  74. data/lib/nokogiri/xml/reader.rb +46 -13
  75. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  76. data/lib/nokogiri/xml/sax/document.rb +174 -83
  77. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  78. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  79. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  80. data/lib/nokogiri/xml/sax.rb +48 -0
  81. data/lib/nokogiri/xml/schema.rb +112 -45
  82. data/lib/nokogiri/xml/searchable.rb +38 -42
  83. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  84. data/lib/nokogiri/xml/xpath_context.rb +14 -3
  85. data/lib/nokogiri/xml.rb +13 -24
  86. data/lib/nokogiri/xslt.rb +3 -9
  87. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  88. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  89. data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
  90. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  91. metadata +13 -14
  92. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  93. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  94. data/ports/archives/libxml2-2.12.6.tar.xz +0 -0
  95. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
data/lib/nokogiri/css.rb CHANGED
@@ -8,53 +8,119 @@ module Nokogiri
8
8
  # TODO: Deprecate this method ahead of 2.0 and delete it in 2.0.
9
9
  # It is not used by Nokogiri and shouldn't be part of the public API.
10
10
  def parse(selector) # :nodoc:
11
+ warn("Nokogiri::CSS.parse is deprecated and will be removed in a future version of Nokogiri. Use Nokogiri::CSS::Parser#parse instead.", uplevel: 1, category: :deprecated)
11
12
  Parser.new.parse(selector)
12
13
  end
13
14
 
14
15
  # :call-seq:
15
- # xpath_for(selector) → String
16
- # xpath_for(selector [, prefix:] [, visitor:] [, ns:]) → String
16
+ # xpath_for(selector_list) → Array<String>
17
+ # xpath_for(selector_list [, prefix:] [, ns:] [, visitor:] [, cache:]) → Array<String>
17
18
  #
18
- # Translate a CSS selector to the equivalent XPath query.
19
+ # Translate a CSS selector list to the equivalent XPath expressions.
20
+ #
21
+ # 💡 Note that translated queries are cached by default for performance concerns.
22
+ #
23
+ # ⚠ Users should prefer Nokogiri::XML::Searchable#css, which is mixed into all document and
24
+ # node classes, for querying documents with CSS selectors. This method is the underlying
25
+ # mechanism used by XML::Searchable and is provided solely for advanced users to translate
26
+ # \CSS selectors to XPath directly.
27
+ #
28
+ # Also see Nokogiri::XML::Searchable#css for documentation on supported CSS selector features,
29
+ # some extended syntax that Nokogiri supports, and advanced CSS features like pseudo-class
30
+ # functions.
19
31
  #
20
32
  # [Parameters]
21
- # - +selector+ (String) The CSS selector to be translated into XPath
33
+ # - +selector_list+ (String)
22
34
  #
35
+ # The CSS selector to be translated into XPath. This is always a String, but that string
36
+ # value may be a {selector list}[https://www.w3.org/TR/selectors-4/#grouping] (see
37
+ # examples).
38
+ #
39
+ # [Keyword arguments]
23
40
  # - +prefix:+ (String)
24
41
  #
25
- # The XPath prefix for the query, see Nokogiri::XML::XPath for some options. Default is
26
- # +XML::XPath::GLOBAL_SEARCH_PREFIX+.
42
+ # The XPath expression prefix which determines the search context. See Nokogiri::XML::XPath
43
+ # for standard options. Default is +XPath::GLOBAL_SEARCH_PREFIX+.
44
+ #
45
+ # - +ns:+ (Hash<String ⇒ String>, nil)
46
+ #
47
+ # Namespaces that are referenced in the query, if any. This is a hash where the keys are the
48
+ # namespace prefix and the values are the namespace URIs. Default is +nil+ indicating an
49
+ # empty set of namespaces.
27
50
  #
28
51
  # - +visitor:+ (Nokogiri::CSS::XPathVisitor)
29
52
  #
30
- # The visitor class to use to transform the AST into XPath. Default is
31
- # +Nokogiri::CSS::XPathVisitor.new+.
53
+ # Use this XPathVisitor object to transform the CSS AST into XPath expressions. See
54
+ # Nokogiri::CSS::XPathVisitor for more information on some of the complex behavior that can
55
+ # be customized for your document type. Default is +Nokogiri::CSS::XPathVisitor.new+.
56
+ #
57
+ # ⚠ Note that this option is mutually exclusive with +prefix+ and +ns+. If +visitor+ is
58
+ # provided, +prefix+ and +ns+ must not be present.
59
+ #
60
+ # - +cache:+ (Boolean)
61
+ #
62
+ # Whether to use the SelectorCache for the translated query to ensure that repeated queries
63
+ # don't incur the overhead of re-parsing the selector. Default is +true+.
32
64
  #
33
- # - +ns:+ (Hash<String ⇒ String>)
65
+ # [Returns] (Array<String>) The equivalent set of XPath expressions for +selector_list+
34
66
  #
35
- # The namespaces that are referenced in the query, if any. This is a hash where the keys are
36
- # the namespace prefix and the values are the namespace URIs. Default is an empty Hash.
67
+ # *Example* with a simple selector:
37
68
  #
38
- # [Returns] (String) The equivalent XPath query for +selector+
69
+ # Nokogiri::CSS.xpath_for("div") # => ["//div"]
39
70
  #
40
- # 💡 Note that translated queries are cached for performance concerns.
71
+ # *Example* with a compound selector:
41
72
  #
42
- def xpath_for(selector, options = {})
43
- raise TypeError, "no implicit conversion of #{selector.inspect} to String" unless selector.respond_to?(:to_str)
73
+ # Nokogiri::CSS.xpath_for("div.xl") # => ["//div[contains(concat(' ',normalize-space(@class),' '),' xl ')]"]
74
+ #
75
+ # *Example* with a complex selector:
76
+ #
77
+ # Nokogiri::CSS.xpath_for("h1 + div") # => ["//h1/following-sibling::*[1]/self::div"]
78
+ #
79
+ # *Example* with a selector list:
80
+ #
81
+ # Nokogiri::CSS.xpath_for("h1, h2, h3") # => ["//h1", "//h2", "//h3"]
82
+ #
83
+ def xpath_for(
84
+ selector, options = nil,
85
+ prefix: options&.delete(:prefix),
86
+ visitor: options&.delete(:visitor),
87
+ ns: options&.delete(:ns),
88
+ cache: true
89
+ )
90
+ unless options.nil?
91
+ warn("Nokogiri::CSS.xpath_for: Passing options as an explicit hash is deprecated. Use keyword arguments instead. This will become an error in a future release.", uplevel: 1, category: :deprecated)
92
+ end
93
+
94
+ raise(TypeError, "no implicit conversion of #{selector.inspect} to String") unless selector.respond_to?(:to_str)
44
95
 
45
96
  selector = selector.to_str
46
- raise Nokogiri::CSS::SyntaxError, "empty CSS selector" if selector.empty?
97
+ raise(Nokogiri::CSS::SyntaxError, "empty CSS selector") if selector.empty?
98
+
99
+ if visitor
100
+ raise ArgumentError, "cannot provide both :prefix and :visitor" if prefix
101
+ raise ArgumentError, "cannot provide both :ns and :visitor" if ns
102
+ end
103
+
104
+ visitor ||= begin
105
+ visitor_kw = {}
106
+ visitor_kw[:prefix] = prefix if prefix
107
+ visitor_kw[:namespaces] = ns if ns
47
108
 
48
- prefix = options.fetch(:prefix, Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX)
49
- visitor = options.fetch(:visitor) { Nokogiri::CSS::XPathVisitor.new }
50
- ns = options.fetch(:ns, {})
109
+ Nokogiri::CSS::XPathVisitor.new(**visitor_kw)
110
+ end
51
111
 
52
- Parser.new(ns).xpath_for(selector, prefix, visitor)
112
+ if cache
113
+ key = SelectorCache.key(selector: selector, visitor: visitor)
114
+ SelectorCache[key] ||= Parser.new.xpath_for(selector, visitor)
115
+ else
116
+ Parser.new.xpath_for(selector, visitor)
117
+ end
53
118
  end
54
119
  end
55
120
  end
56
121
  end
57
122
 
123
+ require_relative "css/selector_cache"
58
124
  require_relative "css/node"
59
125
  require_relative "css/xpath_visitor"
60
126
  x = $-w
@@ -23,11 +23,9 @@ module Nokogiri
23
23
  list = xpath("#{XPATH_PREFIX}#{name}[#{conds}]")
24
24
  end
25
25
  else
26
- CSS::Parser.without_cache do
27
- list = xpath(
28
- *CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX),
29
- )
30
- end
26
+ list = xpath(
27
+ *CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX, cache: false),
28
+ )
31
29
  end
32
30
 
33
31
  super if list.empty?
@@ -6,9 +6,9 @@ module Nokogiri
6
6
  # Popular encoding aliases not known by all iconv implementations that Nokogiri should support.
7
7
  USEFUL_ALIASES = {
8
8
  # alias_name => true_name
9
- "NOKOGIRI-SENTINEL" => "UTF-8", # indicating the Nokogiri has installed aliases
9
+ "ISO-2022-JP" => "ISO-2022-JP", # only for JRuby tests, this is a no-op in CRuby
10
+ "NOKOGIRI-SENTINEL" => "ISO-2022-JP", # indicating that Nokogiri has installed aliases
10
11
  "Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
11
- "UTF-8" => "UTF-8", # for JRuby tests, this is a no-op in CRuby
12
12
  }
13
13
 
14
14
  class << self
@@ -161,52 +161,73 @@ module Nokogiri
161
161
  end
162
162
 
163
163
  class << self
164
- ###
165
- # Parse HTML. +string_or_io+ may be a String, or any object that
166
- # responds to _read_ and _close_ such as an IO, or StringIO.
167
- # +url+ is resource where this document is located. +encoding+ is the
168
- # encoding that should be used when processing the document. +options+
169
- # is a number that sets options in the parser, such as
170
- # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
171
- # Nokogiri::XML::ParseOptions.
172
- def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
164
+ # :call-seq:
165
+ # parse(input) { |options| ... } => Nokogiri::HTML4::Document
166
+ # parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document
167
+ #
168
+ # Parse \HTML4 input from a String or IO object, and return a new HTML4::Document.
169
+ #
170
+ # [Required Parameters]
171
+ # - +input+ (String | IO) The content to be parsed.
172
+ #
173
+ # [Optional Keyword Arguments]
174
+ # - +url:+ (String) The base URI for this document.
175
+ #
176
+ # - +encoding:+ (String) The name of the encoding that should be used when processing the
177
+ # document. When not provided, the encoding will be determined based on the document
178
+ # content.
179
+ #
180
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
181
+ # behaviors during parsing. See ParseOptions for more information. The default value is
182
+ # +ParseOptions::DEFAULT_HTML+.
183
+ #
184
+ # [Yields]
185
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
186
+ # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
187
+ #
188
+ # [Returns] Nokogiri::HTML4::Document
189
+ def parse(
190
+ input,
191
+ url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
192
+ url: url_, encoding: encoding_, options: options_
193
+ )
173
194
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
174
195
  yield options if block_given?
175
196
 
176
- url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
197
+ url ||= input.respond_to?(:path) ? input.path : nil
177
198
 
178
- if string_or_io.respond_to?(:encoding)
179
- unless string_or_io.encoding == Encoding::ASCII_8BIT
180
- encoding ||= string_or_io.encoding.name
199
+ if input.respond_to?(:encoding)
200
+ unless input.encoding == Encoding::ASCII_8BIT
201
+ encoding ||= input.encoding.name
181
202
  end
182
203
  end
183
204
 
184
- if string_or_io.respond_to?(:read)
185
- if string_or_io.is_a?(Pathname)
205
+ if input.respond_to?(:read)
206
+ if input.is_a?(Pathname)
186
207
  # resolve the Pathname to the file and open it as an IO object, see #2110
187
- string_or_io = string_or_io.expand_path.open
188
- url ||= string_or_io.path
208
+ input = input.expand_path.open
209
+ url ||= input.path
189
210
  end
190
211
 
191
212
  unless encoding
192
- string_or_io = EncodingReader.new(string_or_io)
213
+ input = EncodingReader.new(input)
193
214
  begin
194
- return read_io(string_or_io, url, encoding, options.to_i)
215
+ return read_io(input, url, encoding, options.to_i)
195
216
  rescue EncodingReader::EncodingFound => e
196
217
  encoding = e.found_encoding
197
218
  end
198
219
  end
199
- return read_io(string_or_io, url, encoding, options.to_i)
220
+ return read_io(input, url, encoding, options.to_i)
200
221
  end
201
222
 
202
223
  # read_memory pukes on empty docs
203
- if string_or_io.nil? || string_or_io.empty?
224
+ if input.nil? || input.empty?
204
225
  return encoding ? new.tap { |i| i.encoding = encoding } : new
205
226
  end
206
227
 
207
- encoding ||= EncodingReader.detect_encoding(string_or_io)
228
+ encoding ||= EncodingReader.detect_encoding(input)
208
229
 
209
- read_memory(string_or_io, url, encoding, options.to_i)
230
+ read_memory(input, url, encoding, options.to_i)
210
231
  end
211
232
  end
212
233
  end
@@ -3,13 +3,83 @@
3
3
  module Nokogiri
4
4
  module HTML4
5
5
  class DocumentFragment < Nokogiri::XML::DocumentFragment
6
- ####
7
- # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
8
- def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
6
+ #
7
+ # :call-seq:
8
+ # parse(input) { |options| ... } → HTML4::DocumentFragment
9
+ # parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment
10
+ #
11
+ # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment. This
12
+ # method creates a new, empty HTML4::Document to contain the fragment.
13
+ #
14
+ # [Required Parameters]
15
+ # - +input+ (String | IO) The content to be parsed.
16
+ #
17
+ # [Optional Keyword Arguments]
18
+ # - +encoding:+ (String) The name of the encoding that should be used when processing the
19
+ # document. When not provided, the encoding will be determined based on the document
20
+ # content.
21
+ #
22
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
23
+ # behaviors during parsing. See ParseOptions for more information. The default value is
24
+ # +ParseOptions::DEFAULT_HTML+.
25
+ #
26
+ # [Yields]
27
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
28
+ # can be configured before parsing. See ParseOptions for more information.
29
+ #
30
+ # [Returns] HTML4::DocumentFragment
31
+ #
32
+ # *Example:* Parsing a string
33
+ #
34
+ # fragment = HTML4::DocumentFragment.parse("<div>Hello World</div>")
35
+ #
36
+ # *Example:* Parsing an IO
37
+ #
38
+ # fragment = File.open("fragment.html") do |file|
39
+ # HTML4::DocumentFragment.parse(file)
40
+ # end
41
+ #
42
+ # *Example:* Specifying encoding
43
+ #
44
+ # fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP")
45
+ #
46
+ # *Example:* Setting parse options dynamically
47
+ #
48
+ # HTML4::DocumentFragment.parse("<div>Hello World") do |options|
49
+ # options.huge.pedantic
50
+ # end
51
+ #
52
+ def self.parse(
53
+ input,
54
+ encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
55
+ encoding: encoding_, options: options_,
56
+ &block
57
+ )
58
+ # TODO: this method should take a context node.
9
59
  doc = HTML4::Document.new
10
60
 
11
- encoding ||= if tags.respond_to?(:encoding)
12
- encoding = tags.encoding
61
+ if input.respond_to?(:read)
62
+ # Handle IO-like objects (IO, File, StringIO, etc.)
63
+ # The _read_ method of these objects doesn't accept an +encoding+ parameter.
64
+ # Encoding is usually set when the IO object is created or opened,
65
+ # or by using the _set_encoding_ method.
66
+ #
67
+ # 1. If +encoding+ is provided and the object supports _set_encoding_,
68
+ # set the encoding before reading.
69
+ # 2. Read the content from the IO-like object.
70
+ #
71
+ # Note: After reading, the content's encoding will be:
72
+ # - The encoding set by _set_encoding_ if it was called
73
+ # - The default encoding of the IO object otherwise
74
+ #
75
+ # For StringIO specifically, _set_encoding_ affects only the internal string,
76
+ # not how the data is read out.
77
+ input.set_encoding(encoding) if encoding && input.respond_to?(:set_encoding)
78
+ input = input.read
79
+ end
80
+
81
+ encoding ||= if input.respond_to?(:encoding)
82
+ encoding = input.encoding
13
83
  if encoding == ::Encoding::ASCII_8BIT
14
84
  "UTF-8"
15
85
  else
@@ -21,29 +91,71 @@ module Nokogiri
21
91
 
22
92
  doc.encoding = encoding
23
93
 
24
- new(doc, tags, nil, options, &block)
94
+ new(doc, input, options: options, &block)
25
95
  end
26
96
 
27
- def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper
28
- return self unless tags
97
+ #
98
+ # :call-seq:
99
+ # new(document) { |options| ... } → HTML4::DocumentFragment
100
+ # new(document, input) { |options| ... } → HTML4::DocumentFragment
101
+ # new(document, input, context:, options:) { |options| ... } → HTML4::DocumentFragment
102
+ #
103
+ # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment.
104
+ #
105
+ # 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather
106
+ # than call this method directly.
107
+ #
108
+ # [Required Parameters]
109
+ # - +document+ (HTML4::Document) The parent document to associate the returned fragment with.
110
+ #
111
+ # [Optional Parameters]
112
+ # - +input+ (String) The content to be parsed.
113
+ #
114
+ # [Optional Keyword Arguments]
115
+ # - +context:+ (Nokogiri::XML::Node) The <b>context node</b> for the subtree created. See
116
+ # below for more information.
117
+ #
118
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
119
+ # behaviors during parsing. See ParseOptions for more information. The default value is
120
+ # +ParseOptions::DEFAULT_HTML+.
121
+ #
122
+ # [Yields]
123
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
124
+ # can be configured before parsing. See ParseOptions for more information.
125
+ #
126
+ # [Returns] HTML4::DocumentFragment
127
+ #
128
+ # === Context \Node
129
+ #
130
+ # If a context node is specified using +context:+, then the fragment will be created by
131
+ # calling XML::Node#parse on that node, so the parser will behave as if that Node is the
132
+ # parent of the fragment subtree.
133
+ #
134
+ def initialize(
135
+ document, input = nil,
136
+ context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
137
+ context: context_, options: options_
138
+ ) # rubocop:disable Lint/MissingSuper
139
+ return self unless input
29
140
 
30
141
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
142
+ @parse_options = options
31
143
  yield options if block_given?
32
144
 
33
- if ctx
145
+ if context
34
146
  preexisting_errors = document.errors.dup
35
- node_set = ctx.parse("<div>#{tags}</div>", options)
147
+ node_set = context.parse("<div>#{input}</div>", options)
36
148
  node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
37
149
  self.errors = document.errors - preexisting_errors
38
150
  else
39
151
  # This is a horrible hack, but I don't care
40
- path = if /^\s*?<body/i.match?(tags)
152
+ path = if /^\s*?<body/i.match?(input)
41
153
  "/html/body"
42
154
  else
43
155
  "/html/body/node()"
44
156
  end
45
157
 
46
- temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
158
+ temp_doc = HTML4::Document.parse("<html><body>#{input}", nil, document.encoding, options)
47
159
  temp_doc.xpath(path).each { |child| child.parent = self }
48
160
  self.errors = temp_doc.errors
49
161
  end
@@ -26,7 +26,7 @@ module Nokogiri
26
26
 
27
27
  def initialize
28
28
  @encoding = nil
29
- super()
29
+ super
30
30
  end
31
31
 
32
32
  def start_element(name, attrs = [])
@@ -3,60 +3,45 @@
3
3
  module Nokogiri
4
4
  module HTML4
5
5
  ###
6
- # Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
6
+ # Nokogiri provides a SAX parser to process HTML4 which will provide HTML recovery
7
+ # ("autocorrection") features.
7
8
  #
8
9
  # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
9
10
  #
10
11
  # For more information on SAX parsers, see Nokogiri::XML::SAX
12
+ #
11
13
  module SAX
12
14
  ###
13
- # This class lets you perform SAX style parsing on HTML with HTML error correction.
15
+ # This parser is a SAX style parser that reads its input as it deems necessary. The parser
16
+ # takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an HTML input, sends
17
+ # messages to the Nokogiri::XML::SAX::Document.
18
+ #
19
+ # ⚠ This is an HTML4 parser and so may not support some HTML5 features and behaviors.
14
20
  #
15
21
  # Here is a basic usage example:
16
22
  #
17
- # class MyDoc < Nokogiri::XML::SAX::Document
23
+ # class MyHandler < Nokogiri::XML::SAX::Document
18
24
  # def start_element name, attributes = []
19
25
  # puts "found a #{name}"
20
26
  # end
21
27
  # end
22
28
  #
23
- # parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
24
- # parser.parse(File.read(ARGV[0], mode: 'rb'))
29
+ # parser = Nokogiri::HTML4::SAX::Parser.new(MyHandler.new)
30
+ #
31
+ # # Hand an IO object to the parser, which will read the HTML from the IO.
32
+ # File.open(path_to_html) do |f|
33
+ # parser.parse(f)
34
+ # end
35
+ #
36
+ # For more information on \SAX parsers, see Nokogiri::XML::SAX or the parent class
37
+ # Nokogiri::XML::SAX::Parser.
38
+ #
39
+ # Also see Nokogiri::XML::SAX::Document for the available events.
25
40
  #
26
- # For more information on SAX parsers, see Nokogiri::XML::SAX
27
41
  class Parser < Nokogiri::XML::SAX::Parser
28
- ###
29
- # Parse html stored in +data+ using +encoding+
30
- def parse_memory(data, encoding = "UTF-8")
31
- raise TypeError unless String === data
32
- return if data.empty?
33
-
34
- ctx = ParserContext.memory(data, encoding)
35
- yield ctx if block_given?
36
- ctx.parse_with(self)
37
- end
38
-
39
- ###
40
- # Parse given +io+
41
- def parse_io(io, encoding = "UTF-8")
42
- check_encoding(encoding)
43
- @encoding = encoding
44
- ctx = ParserContext.io(io, ENCODINGS[encoding])
45
- yield ctx if block_given?
46
- ctx.parse_with(self)
47
- end
48
-
49
- ###
50
- # Parse a file with +filename+
51
- def parse_file(filename, encoding = "UTF-8")
52
- raise ArgumentError unless filename
53
- raise Errno::ENOENT unless File.exist?(filename)
54
- raise Errno::EISDIR if File.directory?(filename)
55
-
56
- ctx = ParserContext.file(filename, encoding)
57
- yield ctx if block_given?
58
- ctx.parse_with(self)
59
- end
42
+ # This class inherits its behavior from Nokogiri::XML::SAX::Parser, but note that the superclass
43
+ # uses Nokogiri::ClassResolver to use HTML4::SAX::ParserContext as the context class for
44
+ # this class, which is where the real behavioral differences are implemented.
60
45
  end
61
46
  end
62
47
  end
@@ -4,16 +4,11 @@ module Nokogiri
4
4
  module HTML4
5
5
  module SAX
6
6
  ###
7
- # Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
8
- # you should be looking at Nokogiri::HTML4::SAX::Parser
7
+ # Context object to invoke the HTML4 SAX parser on the SAX::Document handler.
8
+ #
9
+ # 💡 This class is usually not instantiated by the user. Use Nokogiri::HTML4::SAX::Parser
10
+ # instead.
9
11
  class ParserContext < Nokogiri::XML::SAX::ParserContext
10
- def self.new(thing, encoding = "UTF-8")
11
- if [:read, :close].all? { |x| thing.respond_to?(x) }
12
- super
13
- else
14
- memory(thing, encoding)
15
- end
16
- end
17
12
  end
18
13
  end
19
14
  end
@@ -3,12 +3,9 @@
3
3
 
4
4
  module Nokogiri
5
5
  class << self
6
- # :call-seq:
7
- # HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
8
- #
9
- # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
10
- def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
11
- Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
6
+ # Convenience method for Nokogiri::HTML4::Document.parse
7
+ def HTML4(...)
8
+ Nokogiri::HTML4::Document.parse(...)
12
9
  end
13
10
  end
14
11
 
@@ -18,16 +15,14 @@ module Nokogiri
18
15
  # for parsing HTML.
19
16
  module HTML4
20
17
  class << self
21
- ###
22
- # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
23
- def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
24
- Document.parse(input, url, encoding, options, &block)
18
+ # Convenience method for Nokogiri::HTML4::Document.parse
19
+ def parse(...)
20
+ Document.parse(...)
25
21
  end
26
22
 
27
- ####
28
- # Parse a fragment from +string+ in to a NodeSet.
29
- def fragment(string, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
30
- HTML4::DocumentFragment.parse(string, encoding, options, &block)
23
+ # Convenience method for Nokogiri::HTML4::DocumentFragment.parse
24
+ def fragment(...)
25
+ HTML4::DocumentFragment.parse(...)
31
26
  end
32
27
  end
33
28
 
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML5
5
+ ###
6
+ # Nokogiri HTML5 builder is used for building HTML documents. It is very similar to the
7
+ # Nokogiri::XML::Builder. In fact, you should go read the documentation for
8
+ # Nokogiri::XML::Builder before reading this documentation.
9
+ #
10
+ # The construction behavior is identical to HTML4::Builder, but HTML5 documents implement the
11
+ # [HTML5 standard's serialization
12
+ # algorithm](https://www.w3.org/TR/2008/WD-html5-20080610/serializing.html).
13
+ #
14
+ # == Synopsis:
15
+ #
16
+ # Create an HTML5 document with a body that has an onload attribute, and a
17
+ # span tag with a class of "bold" that has content of "Hello world".
18
+ #
19
+ # builder = Nokogiri::HTML5::Builder.new do |doc|
20
+ # doc.html {
21
+ # doc.body(:onload => 'some_func();') {
22
+ # doc.span.bold {
23
+ # doc.text "Hello world"
24
+ # }
25
+ # }
26
+ # }
27
+ # end
28
+ # puts builder.to_html
29
+ #
30
+ # The HTML5 builder inherits from the XML builder, so make sure to read the
31
+ # Nokogiri::XML::Builder documentation.
32
+ class Builder < Nokogiri::XML::Builder
33
+ ###
34
+ # Convert the builder to HTML
35
+ def to_html
36
+ @doc.to_html
37
+ end
38
+ end
39
+ end
40
+ end