nokogiri 1.16.5 → 1.18.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +11 -21
  3. data/LICENSE-DEPENDENCIES.md +6 -6
  4. data/README.md +8 -5
  5. data/dependencies.yml +6 -6
  6. data/ext/nokogiri/extconf.rb +188 -142
  7. data/ext/nokogiri/gumbo.c +69 -53
  8. data/ext/nokogiri/html4_document.c +10 -4
  9. data/ext/nokogiri/html4_element_description.c +18 -18
  10. data/ext/nokogiri/html4_sax_parser.c +40 -0
  11. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  12. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  13. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  14. data/ext/nokogiri/nokogiri.c +9 -2
  15. data/ext/nokogiri/nokogiri.h +18 -33
  16. data/ext/nokogiri/xml_attr.c +1 -1
  17. data/ext/nokogiri/xml_cdata.c +2 -10
  18. data/ext/nokogiri/xml_comment.c +3 -8
  19. data/ext/nokogiri/xml_document.c +163 -156
  20. data/ext/nokogiri/xml_document_fragment.c +10 -25
  21. data/ext/nokogiri/xml_dtd.c +1 -1
  22. data/ext/nokogiri/xml_element_content.c +9 -9
  23. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  24. data/ext/nokogiri/xml_namespace.c +6 -6
  25. data/ext/nokogiri/xml_node.c +141 -104
  26. data/ext/nokogiri/xml_node_set.c +46 -44
  27. data/ext/nokogiri/xml_reader.c +54 -58
  28. data/ext/nokogiri/xml_relax_ng.c +35 -56
  29. data/ext/nokogiri/xml_sax_parser.c +156 -88
  30. data/ext/nokogiri/xml_sax_parser_context.c +219 -131
  31. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  32. data/ext/nokogiri/xml_schema.c +50 -85
  33. data/ext/nokogiri/xml_syntax_error.c +19 -11
  34. data/ext/nokogiri/xml_text.c +2 -4
  35. data/ext/nokogiri/xml_xpath_context.c +103 -100
  36. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  37. data/gumbo-parser/src/ascii.c +2 -2
  38. data/gumbo-parser/src/error.c +76 -48
  39. data/gumbo-parser/src/error.h +5 -1
  40. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  41. data/gumbo-parser/src/parser.c +63 -25
  42. data/gumbo-parser/src/tokenizer.c +6 -6
  43. data/lib/nokogiri/class_resolver.rb +1 -1
  44. data/lib/nokogiri/css/node.rb +6 -2
  45. data/lib/nokogiri/css/parser.rb +6 -4
  46. data/lib/nokogiri/css/parser.y +2 -2
  47. data/lib/nokogiri/css/parser_extras.rb +6 -66
  48. data/lib/nokogiri/css/selector_cache.rb +38 -0
  49. data/lib/nokogiri/css/tokenizer.rb +4 -4
  50. data/lib/nokogiri/css/tokenizer.rex +9 -8
  51. data/lib/nokogiri/css/xpath_visitor.rb +43 -6
  52. data/lib/nokogiri/css.rb +86 -20
  53. data/lib/nokogiri/decorators/slop.rb +3 -5
  54. data/lib/nokogiri/encoding_handler.rb +2 -2
  55. data/lib/nokogiri/html4/document.rb +44 -23
  56. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  57. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  58. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  59. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  60. data/lib/nokogiri/html4.rb +9 -14
  61. data/lib/nokogiri/html5/builder.rb +40 -0
  62. data/lib/nokogiri/html5/document.rb +61 -30
  63. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  64. data/lib/nokogiri/html5/node.rb +4 -4
  65. data/lib/nokogiri/html5.rb +114 -72
  66. data/lib/nokogiri/version/constant.rb +1 -1
  67. data/lib/nokogiri/xml/builder.rb +8 -1
  68. data/lib/nokogiri/xml/document.rb +70 -26
  69. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  70. data/lib/nokogiri/xml/node.rb +82 -11
  71. data/lib/nokogiri/xml/node_set.rb +9 -7
  72. data/lib/nokogiri/xml/parse_options.rb +1 -1
  73. data/lib/nokogiri/xml/pp/node.rb +6 -1
  74. data/lib/nokogiri/xml/reader.rb +46 -13
  75. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  76. data/lib/nokogiri/xml/sax/document.rb +174 -83
  77. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  78. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  79. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  80. data/lib/nokogiri/xml/sax.rb +48 -0
  81. data/lib/nokogiri/xml/schema.rb +112 -45
  82. data/lib/nokogiri/xml/searchable.rb +38 -42
  83. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  84. data/lib/nokogiri/xml/xpath_context.rb +14 -3
  85. data/lib/nokogiri/xml.rb +13 -24
  86. data/lib/nokogiri/xslt.rb +3 -9
  87. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  88. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  89. data/ports/archives/libxml2-2.13.6.tar.xz +0 -0
  90. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  91. metadata +13 -14
  92. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  93. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  94. data/ports/archives/libxml2-2.12.7.tar.xz +0 -0
  95. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
data/lib/nokogiri/css.rb CHANGED
@@ -8,53 +8,119 @@ module Nokogiri
8
8
  # TODO: Deprecate this method ahead of 2.0 and delete it in 2.0.
9
9
  # It is not used by Nokogiri and shouldn't be part of the public API.
10
10
  def parse(selector) # :nodoc:
11
+ warn("Nokogiri::CSS.parse is deprecated and will be removed in a future version of Nokogiri. Use Nokogiri::CSS::Parser#parse instead.", uplevel: 1, category: :deprecated)
11
12
  Parser.new.parse(selector)
12
13
  end
13
14
 
14
15
  # :call-seq:
15
- # xpath_for(selector) → String
16
- # xpath_for(selector [, prefix:] [, visitor:] [, ns:]) → String
16
+ # xpath_for(selector_list) → Array<String>
17
+ # xpath_for(selector_list [, prefix:] [, ns:] [, visitor:] [, cache:]) → Array<String>
17
18
  #
18
- # Translate a CSS selector to the equivalent XPath query.
19
+ # Translate a CSS selector list to the equivalent XPath expressions.
20
+ #
21
+ # 💡 Note that translated queries are cached by default for performance reasons.
22
+ #
23
+ # ⚠ Users should prefer Nokogiri::XML::Searchable#css, which is mixed into all document and
24
+ # node classes, for querying documents with CSS selectors. This method is the underlying
25
+ # mechanism used by XML::Searchable and is provided solely for advanced users to translate
26
+ # \CSS selectors to XPath directly.
27
+ #
28
+ # Also see Nokogiri::XML::Searchable#css for documentation on supported CSS selector features,
29
+ # some extended syntax that Nokogiri supports, and advanced CSS features like pseudo-class
30
+ # functions.
19
31
  #
20
32
  # [Parameters]
21
- # - +selector+ (String) The CSS selector to be translated into XPath
33
+ # - +selector_list+ (String)
22
34
  #
35
+ # The CSS selector to be translated into XPath. This is always a String, but that string
36
+ # value may be a {selector list}[https://www.w3.org/TR/selectors-4/#grouping] (see
37
+ # examples).
38
+ #
39
+ # [Keyword arguments]
23
40
  # - +prefix:+ (String)
24
41
  #
25
- # The XPath prefix for the query, see Nokogiri::XML::XPath for some options. Default is
26
- # +XML::XPath::GLOBAL_SEARCH_PREFIX+.
42
+ # The XPath expression prefix which determines the search context. See Nokogiri::XML::XPath
43
+ # for standard options. Default is +XPath::GLOBAL_SEARCH_PREFIX+.
44
+ #
45
+ # - +ns:+ (Hash<String ⇒ String>, nil)
46
+ #
47
+ # Namespaces that are referenced in the query, if any. This is a hash where the keys are the
48
+ # namespace prefix and the values are the namespace URIs. Default is +nil+ indicating an
49
+ # empty set of namespaces.
27
50
  #
28
51
  # - +visitor:+ (Nokogiri::CSS::XPathVisitor)
29
52
  #
30
- # The visitor class to use to transform the AST into XPath. Default is
31
- # +Nokogiri::CSS::XPathVisitor.new+.
53
+ # Use this XPathVisitor object to transform the CSS AST into XPath expressions. See
54
+ # Nokogiri::CSS::XPathVisitor for more information on some of the complex behavior that can
55
+ # be customized for your document type. Default is +Nokogiri::CSS::XPathVisitor.new+.
56
+ #
57
+ # ⚠ Note that this option is mutually exclusive with +prefix+ and +ns+. If +visitor+ is
58
+ # provided, +prefix+ and +ns+ must not be present.
59
+ #
60
+ # - +cache:+ (Boolean)
61
+ #
62
+ # Whether to use the SelectorCache for the translated query to ensure that repeated queries
63
+ # don't incur the overhead of re-parsing the selector. Default is +true+.
32
64
  #
33
- # - +ns:+ (Hash<String ⇒ String>)
65
+ # [Returns] (Array<String>) The equivalent set of XPath expressions for +selector_list+
34
66
  #
35
- # The namespaces that are referenced in the query, if any. This is a hash where the keys are
36
- # the namespace prefix and the values are the namespace URIs. Default is an empty Hash.
67
+ # *Example* with a simple selector:
37
68
  #
38
- # [Returns] (String) The equivalent XPath query for +selector+
69
+ # Nokogiri::CSS.xpath_for("div") # => ["//div"]
39
70
  #
40
- # 💡 Note that translated queries are cached for performance concerns.
71
+ # *Example* with a compound selector:
41
72
  #
42
- def xpath_for(selector, options = {})
43
- raise TypeError, "no implicit conversion of #{selector.inspect} to String" unless selector.respond_to?(:to_str)
73
+ # Nokogiri::CSS.xpath_for("div.xl") # => ["//div[contains(concat(' ',normalize-space(@class),' '),' xl ')]"]
74
+ #
75
+ # *Example* with a complex selector:
76
+ #
77
+ # Nokogiri::CSS.xpath_for("h1 + div") # => ["//h1/following-sibling::*[1]/self::div"]
78
+ #
79
+ # *Example* with a selector list:
80
+ #
81
+ # Nokogiri::CSS.xpath_for("h1, h2, h3") # => ["//h1", "//h2", "//h3"]
82
+ #
83
+ def xpath_for(
84
+ selector, options = nil,
85
+ prefix: options&.delete(:prefix),
86
+ visitor: options&.delete(:visitor),
87
+ ns: options&.delete(:ns),
88
+ cache: true
89
+ )
90
+ unless options.nil?
91
+ warn("Nokogiri::CSS.xpath_for: Passing options as an explicit hash is deprecated. Use keyword arguments instead. This will become an error in a future release.", uplevel: 1, category: :deprecated)
92
+ end
93
+
94
+ raise(TypeError, "no implicit conversion of #{selector.inspect} to String") unless selector.respond_to?(:to_str)
44
95
 
45
96
  selector = selector.to_str
46
- raise Nokogiri::CSS::SyntaxError, "empty CSS selector" if selector.empty?
97
+ raise(Nokogiri::CSS::SyntaxError, "empty CSS selector") if selector.empty?
98
+
99
+ if visitor
100
+ raise ArgumentError, "cannot provide both :prefix and :visitor" if prefix
101
+ raise ArgumentError, "cannot provide both :ns and :visitor" if ns
102
+ end
103
+
104
+ visitor ||= begin
105
+ visitor_kw = {}
106
+ visitor_kw[:prefix] = prefix if prefix
107
+ visitor_kw[:namespaces] = ns if ns
47
108
 
48
- prefix = options.fetch(:prefix, Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX)
49
- visitor = options.fetch(:visitor) { Nokogiri::CSS::XPathVisitor.new }
50
- ns = options.fetch(:ns, {})
109
+ Nokogiri::CSS::XPathVisitor.new(**visitor_kw)
110
+ end
51
111
 
52
- Parser.new(ns).xpath_for(selector, prefix, visitor)
112
+ if cache
113
+ key = SelectorCache.key(selector: selector, visitor: visitor)
114
+ SelectorCache[key] ||= Parser.new.xpath_for(selector, visitor)
115
+ else
116
+ Parser.new.xpath_for(selector, visitor)
117
+ end
53
118
  end
54
119
  end
55
120
  end
56
121
  end
57
122
 
123
+ require_relative "css/selector_cache"
58
124
  require_relative "css/node"
59
125
  require_relative "css/xpath_visitor"
60
126
  x = $-w
@@ -23,11 +23,9 @@ module Nokogiri
23
23
  list = xpath("#{XPATH_PREFIX}#{name}[#{conds}]")
24
24
  end
25
25
  else
26
- CSS::Parser.without_cache do
27
- list = xpath(
28
- *CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX),
29
- )
30
- end
26
+ list = xpath(
27
+ *CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX, cache: false),
28
+ )
31
29
  end
32
30
 
33
31
  super if list.empty?
@@ -6,9 +6,9 @@ module Nokogiri
6
6
  # Popular encoding aliases not known by all iconv implementations that Nokogiri should support.
7
7
  USEFUL_ALIASES = {
8
8
  # alias_name => true_name
9
- "NOKOGIRI-SENTINEL" => "UTF-8", # indicating the Nokogiri has installed aliases
9
+ "ISO-2022-JP" => "ISO-2022-JP", # only for JRuby tests, this is a no-op in CRuby
10
+ "NOKOGIRI-SENTINEL" => "ISO-2022-JP", # indicating the Nokogiri has installed aliases
10
11
  "Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
11
- "UTF-8" => "UTF-8", # for JRuby tests, this is a no-op in CRuby
12
12
  }
13
13
 
14
14
  class << self
@@ -161,52 +161,73 @@ module Nokogiri
161
161
  end
162
162
 
163
163
  class << self
164
- ###
165
- # Parse HTML. +string_or_io+ may be a String, or any object that
166
- # responds to _read_ and _close_ such as an IO, or StringIO.
167
- # +url+ is resource where this document is located. +encoding+ is the
168
- # encoding that should be used when processing the document. +options+
169
- # is a number that sets options in the parser, such as
170
- # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
171
- # Nokogiri::XML::ParseOptions.
172
- def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
164
+ # :call-seq:
165
+ # parse(input) { |options| ... } => Nokogiri::HTML4::Document
166
+ # parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document
167
+ #
168
+ # Parse \HTML4 input from a String or IO object, and return a new HTML4::Document.
169
+ #
170
+ # [Required Parameters]
171
+ # - +input+ (String | IO) The content to be parsed.
172
+ #
173
+ # [Optional Keyword Arguments]
174
+ # - +url:+ (String) The base URI for this document.
175
+ #
176
+ # - +encoding:+ (String) The name of the encoding that should be used when processing the
177
+ # document. When not provided, the encoding will be determined based on the document
178
+ # content.
179
+ #
180
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
181
+ # behaviors during parsing. See ParseOptions for more information. The default value is
182
+ # +ParseOptions::DEFAULT_HTML+.
183
+ #
184
+ # [Yields]
185
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
186
+ # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
187
+ #
188
+ # [Returns] Nokogiri::HTML4::Document
189
+ def parse(
190
+ input,
191
+ url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
192
+ url: url_, encoding: encoding_, options: options_
193
+ )
173
194
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
174
195
  yield options if block_given?
175
196
 
176
- url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
197
+ url ||= input.respond_to?(:path) ? input.path : nil
177
198
 
178
- if string_or_io.respond_to?(:encoding)
179
- unless string_or_io.encoding == Encoding::ASCII_8BIT
180
- encoding ||= string_or_io.encoding.name
199
+ if input.respond_to?(:encoding)
200
+ unless input.encoding == Encoding::ASCII_8BIT
201
+ encoding ||= input.encoding.name
181
202
  end
182
203
  end
183
204
 
184
- if string_or_io.respond_to?(:read)
185
- if string_or_io.is_a?(Pathname)
205
+ if input.respond_to?(:read)
206
+ if input.is_a?(Pathname)
186
207
  # resolve the Pathname to the file and open it as an IO object, see #2110
187
- string_or_io = string_or_io.expand_path.open
188
- url ||= string_or_io.path
208
+ input = input.expand_path.open
209
+ url ||= input.path
189
210
  end
190
211
 
191
212
  unless encoding
192
- string_or_io = EncodingReader.new(string_or_io)
213
+ input = EncodingReader.new(input)
193
214
  begin
194
- return read_io(string_or_io, url, encoding, options.to_i)
215
+ return read_io(input, url, encoding, options.to_i)
195
216
  rescue EncodingReader::EncodingFound => e
196
217
  encoding = e.found_encoding
197
218
  end
198
219
  end
199
- return read_io(string_or_io, url, encoding, options.to_i)
220
+ return read_io(input, url, encoding, options.to_i)
200
221
  end
201
222
 
202
223
  # read_memory pukes on empty docs
203
- if string_or_io.nil? || string_or_io.empty?
224
+ if input.nil? || input.empty?
204
225
  return encoding ? new.tap { |i| i.encoding = encoding } : new
205
226
  end
206
227
 
207
- encoding ||= EncodingReader.detect_encoding(string_or_io)
228
+ encoding ||= EncodingReader.detect_encoding(input)
208
229
 
209
- read_memory(string_or_io, url, encoding, options.to_i)
230
+ read_memory(input, url, encoding, options.to_i)
210
231
  end
211
232
  end
212
233
  end
@@ -3,13 +3,83 @@
3
3
  module Nokogiri
4
4
  module HTML4
5
5
  class DocumentFragment < Nokogiri::XML::DocumentFragment
6
- ####
7
- # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
8
- def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
6
+ #
7
+ # :call-seq:
8
+ # parse(input) { |options| ... } → HTML4::DocumentFragment
9
+ # parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment
10
+ #
11
+ # Parse \HTML4 fragment input from a String or IO object, and return a new HTML4::DocumentFragment. This
12
+ # method creates a new, empty HTML4::Document to contain the fragment.
13
+ #
14
+ # [Required Parameters]
15
+ # - +input+ (String | IO) The content to be parsed.
16
+ #
17
+ # [Optional Keyword Arguments]
18
+ # - +encoding:+ (String) The name of the encoding that should be used when processing the
19
+ # document. When not provided, the encoding will be determined based on the document
20
+ # content.
21
+ #
22
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
23
+ # behaviors during parsing. See ParseOptions for more information. The default value is
24
+ # +ParseOptions::DEFAULT_HTML+.
25
+ #
26
+ # [Yields]
27
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
28
+ # can be configured before parsing. See ParseOptions for more information.
29
+ #
30
+ # [Returns] HTML4::DocumentFragment
31
+ #
32
+ # *Example:* Parsing a string
33
+ #
34
+ # fragment = HTML4::DocumentFragment.parse("<div>Hello World</div>")
35
+ #
36
+ # *Example:* Parsing an IO
37
+ #
38
+ # fragment = File.open("fragment.html") do |file|
39
+ # HTML4::DocumentFragment.parse(file)
40
+ # end
41
+ #
42
+ # *Example:* Specifying encoding
43
+ #
44
+ # fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP")
45
+ #
46
+ # *Example:* Setting parse options dynamically
47
+ #
48
+ # HTML4::DocumentFragment.parse("<div>Hello World") do |options|
49
+ # options.huge.pedantic
50
+ # end
51
+ #
52
+ def self.parse(
53
+ input,
54
+ encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
55
+ encoding: encoding_, options: options_,
56
+ &block
57
+ )
58
+ # TODO: this method should take a context node.
9
59
  doc = HTML4::Document.new
10
60
 
11
- encoding ||= if tags.respond_to?(:encoding)
12
- encoding = tags.encoding
61
+ if input.respond_to?(:read)
62
+ # Handle IO-like objects (IO, File, StringIO, etc.)
63
+ # The _read_ method of these objects doesn't accept an +encoding+ parameter.
64
+ # Encoding is usually set when the IO object is created or opened,
65
+ # or by using the _set_encoding_ method.
66
+ #
67
+ # 1. If +encoding+ is provided and the object supports _set_encoding_,
68
+ # set the encoding before reading.
69
+ # 2. Read the content from the IO-like object.
70
+ #
71
+ # Note: After reading, the content's encoding will be:
72
+ # - The encoding set by _set_encoding_ if it was called
73
+ # - The default encoding of the IO object otherwise
74
+ #
75
+ # For StringIO specifically, _set_encoding_ affects only the internal string,
76
+ # not how the data is read out.
77
+ input.set_encoding(encoding) if encoding && input.respond_to?(:set_encoding)
78
+ input = input.read
79
+ end
80
+
81
+ encoding ||= if input.respond_to?(:encoding)
82
+ encoding = input.encoding
13
83
  if encoding == ::Encoding::ASCII_8BIT
14
84
  "UTF-8"
15
85
  else
@@ -21,29 +91,71 @@ module Nokogiri
21
91
 
22
92
  doc.encoding = encoding
23
93
 
24
- new(doc, tags, nil, options, &block)
94
+ new(doc, input, options: options, &block)
25
95
  end
26
96
 
27
- def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper
28
- return self unless tags
97
+ #
98
+ # :call-seq:
99
+ # new(document) { |options| ... } → HTML4::DocumentFragment
100
+ # new(document, input) { |options| ... } → HTML4::DocumentFragment
101
+ # new(document, input, context:, options:) { |options| ... } → HTML4::DocumentFragment
102
+ #
103
+ # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment.
104
+ #
105
+ # 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather
106
+ # than call this method directly.
107
+ #
108
+ # [Required Parameters]
109
+ # - +document+ (HTML4::Document) The parent document to associate the returned fragment with.
110
+ #
111
+ # [Optional Parameters]
112
+ # - +input+ (String) The content to be parsed.
113
+ #
114
+ # [Optional Keyword Arguments]
115
+ # - +context:+ (Nokogiri::XML::Node) The <b>context node</b> for the subtree created. See
116
+ # below for more information.
117
+ #
118
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
119
+ # behaviors during parsing. See ParseOptions for more information. The default value is
120
+ # +ParseOptions::DEFAULT_HTML+.
121
+ #
122
+ # [Yields]
123
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
124
+ # can be configured before parsing. See ParseOptions for more information.
125
+ #
126
+ # [Returns] HTML4::DocumentFragment
127
+ #
128
+ # === Context \Node
129
+ #
130
+ # If a context node is specified using +context:+, then the fragment will be created by
131
+ # calling XML::Node#parse on that node, so the parser will behave as if that Node is the
132
+ # parent of the fragment subtree.
133
+ #
134
+ def initialize(
135
+ document, input = nil,
136
+ context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
137
+ context: context_, options: options_
138
+ ) # rubocop:disable Lint/MissingSuper
139
+ return self unless input
29
140
 
30
141
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
142
+ @parse_options = options
31
143
  yield options if block_given?
32
144
 
33
- if ctx
145
+ if context
34
146
  preexisting_errors = document.errors.dup
35
- node_set = ctx.parse("<div>#{tags}</div>", options)
147
+ node_set = context.parse("<div>#{input}</div>", options)
36
148
  node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
37
149
  self.errors = document.errors - preexisting_errors
38
150
  else
39
151
  # This is a horrible hack, but I don't care
40
- path = if /^\s*?<body/i.match?(tags)
152
+ path = if /^\s*?<body/i.match?(input)
41
153
  "/html/body"
42
154
  else
43
155
  "/html/body/node()"
44
156
  end
45
157
 
46
- temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
158
+ temp_doc = HTML4::Document.parse("<html><body>#{input}", nil, document.encoding, options)
47
159
  temp_doc.xpath(path).each { |child| child.parent = self }
48
160
  self.errors = temp_doc.errors
49
161
  end
@@ -26,7 +26,7 @@ module Nokogiri
26
26
 
27
27
  def initialize
28
28
  @encoding = nil
29
- super()
29
+ super
30
30
  end
31
31
 
32
32
  def start_element(name, attrs = [])
@@ -3,60 +3,45 @@
3
3
  module Nokogiri
4
4
  module HTML4
5
5
  ###
6
- # Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
6
+ # Nokogiri provides a SAX parser to process HTML4 which will provide HTML recovery
7
+ # ("autocorrection") features.
7
8
  #
8
9
  # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
9
10
  #
10
11
  # For more information on SAX parsers, see Nokogiri::XML::SAX
12
+ #
11
13
  module SAX
12
14
  ###
13
- # This class lets you perform SAX style parsing on HTML with HTML error correction.
15
+ # This parser is a SAX style parser that reads its input as it deems necessary. The parser
16
+ # takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an HTML input, sends
17
+ # messages to the Nokogiri::XML::SAX::Document.
18
+ #
19
+ # ⚠ This is an HTML4 parser and so may not support some HTML5 features and behaviors.
14
20
  #
15
21
  # Here is a basic usage example:
16
22
  #
17
- # class MyDoc < Nokogiri::XML::SAX::Document
23
+ # class MyHandler < Nokogiri::XML::SAX::Document
18
24
  # def start_element name, attributes = []
19
25
  # puts "found a #{name}"
20
26
  # end
21
27
  # end
22
28
  #
23
- # parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
24
- # parser.parse(File.read(ARGV[0], mode: 'rb'))
29
+ # parser = Nokogiri::HTML4::SAX::Parser.new(MyHandler.new)
30
+ #
31
+ # # Hand an IO object to the parser, which will read the HTML from the IO.
32
+ # File.open(path_to_html) do |f|
33
+ # parser.parse(f)
34
+ # end
35
+ #
36
+ # For more information on \SAX parsers, see Nokogiri::XML::SAX or the parent class
37
+ # Nokogiri::XML::SAX::Parser.
38
+ #
39
+ # Also see Nokogiri::XML::SAX::Document for the available events.
25
40
  #
26
- # For more information on SAX parsers, see Nokogiri::XML::SAX
27
41
  class Parser < Nokogiri::XML::SAX::Parser
28
- ###
29
- # Parse html stored in +data+ using +encoding+
30
- def parse_memory(data, encoding = "UTF-8")
31
- raise TypeError unless String === data
32
- return if data.empty?
33
-
34
- ctx = ParserContext.memory(data, encoding)
35
- yield ctx if block_given?
36
- ctx.parse_with(self)
37
- end
38
-
39
- ###
40
- # Parse given +io+
41
- def parse_io(io, encoding = "UTF-8")
42
- check_encoding(encoding)
43
- @encoding = encoding
44
- ctx = ParserContext.io(io, ENCODINGS[encoding])
45
- yield ctx if block_given?
46
- ctx.parse_with(self)
47
- end
48
-
49
- ###
50
- # Parse a file with +filename+
51
- def parse_file(filename, encoding = "UTF-8")
52
- raise ArgumentError unless filename
53
- raise Errno::ENOENT unless File.exist?(filename)
54
- raise Errno::EISDIR if File.directory?(filename)
55
-
56
- ctx = ParserContext.file(filename, encoding)
57
- yield ctx if block_given?
58
- ctx.parse_with(self)
59
- end
42
+ # This class inherits its behavior from Nokogiri::XML::SAX::Parser, but note that the superclass
43
+ # uses Nokogiri::ClassResolver to use HTML4::SAX::ParserContext as the context class for
44
+ # this class, which is where the real behavioral differences are implemented.
60
45
  end
61
46
  end
62
47
  end
@@ -4,16 +4,11 @@ module Nokogiri
4
4
  module HTML4
5
5
  module SAX
6
6
  ###
7
- # Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
8
- # you should be looking at Nokogiri::HTML4::SAX::Parser
7
+ # Context object to invoke the HTML4 SAX parser on the SAX::Document handler.
8
+ #
9
+ # 💡 This class is usually not instantiated by the user. Use Nokogiri::HTML4::SAX::Parser
10
+ # instead.
9
11
  class ParserContext < Nokogiri::XML::SAX::ParserContext
10
- def self.new(thing, encoding = "UTF-8")
11
- if [:read, :close].all? { |x| thing.respond_to?(x) }
12
- super
13
- else
14
- memory(thing, encoding)
15
- end
16
- end
17
12
  end
18
13
  end
19
14
  end
@@ -3,12 +3,9 @@
3
3
 
4
4
  module Nokogiri
5
5
  class << self
6
- # :call-seq:
7
- # HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
8
- #
9
- # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
10
- def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
11
- Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
6
+ # Convenience method for Nokogiri::HTML4::Document.parse
7
+ def HTML4(...)
8
+ Nokogiri::HTML4::Document.parse(...)
12
9
  end
13
10
  end
14
11
 
@@ -18,16 +15,14 @@ module Nokogiri
18
15
  # for parsing HTML.
19
16
  module HTML4
20
17
  class << self
21
- ###
22
- # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
23
- def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
24
- Document.parse(input, url, encoding, options, &block)
18
+ # Convenience method for Nokogiri::HTML4::Document.parse
19
+ def parse(...)
20
+ Document.parse(...)
25
21
  end
26
22
 
27
- ####
28
- # Parse a fragment from +string+ in to a NodeSet.
29
- def fragment(string, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
30
- HTML4::DocumentFragment.parse(string, encoding, options, &block)
23
+ # Convenience method for Nokogiri::HTML4::DocumentFragment.parse
24
+ def fragment(...)
25
+ HTML4::DocumentFragment.parse(...)
31
26
  end
32
27
  end
33
28
 
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML5
5
+ ###
6
+ # Nokogiri HTML5 builder is used for building HTML documents. It is very similar to the
7
+ # Nokogiri::XML::Builder. In fact, you should go read the documentation for
8
+ # Nokogiri::XML::Builder before reading this documentation.
9
+ #
10
+ # The construction behavior is identical to HTML4::Builder, but HTML5 documents implement the
11
+ # [HTML5 standard's serialization
12
+ # algorithm](https://www.w3.org/TR/2008/WD-html5-20080610/serializing.html).
13
+ #
14
+ # == Synopsis:
15
+ #
16
+ # Create an HTML5 document with a body that has an onload attribute, and a
17
+ # span tag with a class of "bold" that has content of "Hello world".
18
+ #
19
+ # builder = Nokogiri::HTML5::Builder.new do |doc|
20
+ # doc.html {
21
+ # doc.body(:onload => 'some_func();') {
22
+ # doc.span.bold {
23
+ # doc.text "Hello world"
24
+ # }
25
+ # }
26
+ # }
27
+ # end
28
+ # puts builder.to_html
29
+ #
30
+ # The HTML5 builder inherits from the XML builder, so make sure to read the
31
+ # Nokogiri::XML::Builder documentation.
32
+ class Builder < Nokogiri::XML::Builder
33
+ ###
34
+ # Convert the builder to HTML
35
+ def to_html
36
+ @doc.to_html
37
+ end
38
+ end
39
+ end
40
+ end