nokogiri 1.13.10-java → 1.14.0-java

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (119) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +33 -0
  3. data/LICENSE-DEPENDENCIES.md +830 -509
  4. data/LICENSE.md +1 -1
  5. data/README.md +18 -11
  6. data/dependencies.yml +25 -7
  7. data/ext/java/nokogiri/Html4Document.java +2 -0
  8. data/ext/java/nokogiri/Html4ElementDescription.java +9 -9
  9. data/ext/java/nokogiri/Html4EntityLookup.java +14 -3
  10. data/ext/java/nokogiri/Html4SaxParserContext.java +2 -2
  11. data/ext/java/nokogiri/Html4SaxPushParser.java +3 -0
  12. data/ext/java/nokogiri/NokogiriService.java +1 -24
  13. data/ext/java/nokogiri/XmlAttr.java +1 -1
  14. data/ext/java/nokogiri/XmlAttributeDecl.java +2 -1
  15. data/ext/java/nokogiri/XmlCdata.java +2 -1
  16. data/ext/java/nokogiri/XmlComment.java +2 -1
  17. data/ext/java/nokogiri/XmlDocument.java +5 -6
  18. data/ext/java/nokogiri/XmlDocumentFragment.java +2 -1
  19. data/ext/java/nokogiri/XmlDtd.java +4 -3
  20. data/ext/java/nokogiri/XmlElement.java +1 -0
  21. data/ext/java/nokogiri/XmlElementContent.java +4 -1
  22. data/ext/java/nokogiri/XmlElementDecl.java +3 -1
  23. data/ext/java/nokogiri/XmlEntityDecl.java +2 -0
  24. data/ext/java/nokogiri/XmlEntityReference.java +1 -0
  25. data/ext/java/nokogiri/XmlNamespace.java +2 -0
  26. data/ext/java/nokogiri/XmlNode.java +39 -24
  27. data/ext/java/nokogiri/XmlNodeSet.java +10 -7
  28. data/ext/java/nokogiri/XmlProcessingInstruction.java +1 -0
  29. data/ext/java/nokogiri/XmlReader.java +4 -3
  30. data/ext/java/nokogiri/XmlRelaxng.java +1 -0
  31. data/ext/java/nokogiri/XmlSaxParserContext.java +1 -0
  32. data/ext/java/nokogiri/XmlSaxPushParser.java +3 -0
  33. data/ext/java/nokogiri/XmlSchema.java +4 -2
  34. data/ext/java/nokogiri/XmlSyntaxError.java +1 -0
  35. data/ext/java/nokogiri/XmlText.java +1 -0
  36. data/ext/java/nokogiri/XmlXpathContext.java +2 -0
  37. data/ext/java/nokogiri/XsltStylesheet.java +16 -13
  38. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +3 -2
  39. data/ext/java/nokogiri/internals/NokogiriHandler.java +2 -2
  40. data/ext/java/nokogiri/internals/NokogiriHelpers.java +4 -5
  41. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +3 -3
  42. data/ext/java/nokogiri/internals/ParserContext.java +2 -0
  43. data/ext/java/nokogiri/internals/ReaderNode.java +1 -1
  44. data/ext/java/nokogiri/internals/SaveContextVisitor.java +4 -2
  45. data/ext/java/nokogiri/internals/SchemaErrorHandler.java +2 -2
  46. data/ext/java/nokogiri/internals/XmlDomParserContext.java +2 -1
  47. data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +1 -0
  48. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +5 -4
  49. data/ext/nokogiri/extconf.rb +80 -21
  50. data/ext/nokogiri/gumbo.c +19 -9
  51. data/ext/nokogiri/html4_document.c +1 -1
  52. data/ext/nokogiri/html4_entity_lookup.c +1 -1
  53. data/ext/nokogiri/html4_sax_parser_context.c +0 -5
  54. data/ext/nokogiri/nokogiri.c +33 -51
  55. data/ext/nokogiri/xml_attribute_decl.c +1 -1
  56. data/ext/nokogiri/xml_cdata.c +1 -1
  57. data/ext/nokogiri/xml_document.c +16 -11
  58. data/ext/nokogiri/xml_element_content.c +2 -2
  59. data/ext/nokogiri/xml_element_decl.c +1 -1
  60. data/ext/nokogiri/xml_encoding_handler.c +2 -2
  61. data/ext/nokogiri/xml_namespace.c +38 -8
  62. data/ext/nokogiri/xml_node.c +286 -26
  63. data/ext/nokogiri/xml_node_set.c +0 -2
  64. data/ext/nokogiri/xml_reader.c +40 -20
  65. data/ext/nokogiri/xml_relax_ng.c +0 -2
  66. data/ext/nokogiri/xml_sax_parser.c +22 -16
  67. data/ext/nokogiri/xml_sax_parser_context.c +0 -5
  68. data/ext/nokogiri/xml_sax_push_parser.c +0 -2
  69. data/ext/nokogiri/xml_schema.c +0 -2
  70. data/ext/nokogiri/xml_xpath_context.c +87 -83
  71. data/ext/nokogiri/xslt_stylesheet.c +14 -13
  72. data/gumbo-parser/Makefile +10 -0
  73. data/lib/nokogiri/css/node.rb +2 -2
  74. data/lib/nokogiri/css/xpath_visitor.rb +5 -3
  75. data/lib/nokogiri/css.rb +6 -0
  76. data/lib/nokogiri/encoding_handler.rb +57 -0
  77. data/lib/nokogiri/extension.rb +3 -2
  78. data/lib/nokogiri/html4/document.rb +2 -121
  79. data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
  80. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  81. data/lib/nokogiri/html4.rb +1 -0
  82. data/lib/nokogiri/html5/document.rb +113 -36
  83. data/lib/nokogiri/html5/document_fragment.rb +9 -2
  84. data/lib/nokogiri/html5/node.rb +3 -5
  85. data/lib/nokogiri/html5.rb +127 -216
  86. data/lib/nokogiri/jruby/dependencies.rb +1 -19
  87. data/lib/{isorelax.jar → nokogiri/jruby/isorelax/isorelax/20030108/isorelax-20030108.jar} +0 -0
  88. data/lib/nokogiri/jruby/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar +0 -0
  89. data/lib/nokogiri/jruby/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar +0 -0
  90. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  91. data/lib/nokogiri/jruby/nu/validator/jing/20200702VNU/jing-20200702VNU.jar +0 -0
  92. data/lib/nokogiri/jruby/org/nokogiri/nekodtd/0.1.11.noko1/nekodtd-0.1.11.noko1.jar +0 -0
  93. data/lib/{serializer.jar → nokogiri/jruby/xalan/serializer/2.7.2/serializer-2.7.2.jar} +0 -0
  94. data/lib/{xalan.jar → nokogiri/jruby/xalan/xalan/2.7.2/xalan-2.7.2.jar} +0 -0
  95. data/lib/{xercesImpl.jar → nokogiri/jruby/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar} +0 -0
  96. data/lib/{xml-apis.jar → nokogiri/jruby/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar} +0 -0
  97. data/lib/nokogiri/nokogiri.jar +0 -0
  98. data/lib/nokogiri/version/constant.rb +1 -1
  99. data/lib/nokogiri/version/info.rb +11 -10
  100. data/lib/nokogiri/xml/attr.rb +49 -0
  101. data/lib/nokogiri/xml/builder.rb +1 -1
  102. data/lib/nokogiri/xml/document.rb +102 -54
  103. data/lib/nokogiri/xml/document_fragment.rb +49 -6
  104. data/lib/nokogiri/xml/namespace.rb +42 -0
  105. data/lib/nokogiri/xml/node/save_options.rb +6 -4
  106. data/lib/nokogiri/xml/node.rb +190 -35
  107. data/lib/nokogiri/xml/node_set.rb +87 -9
  108. data/lib/nokogiri/xml/parse_options.rb +129 -50
  109. data/lib/nokogiri/xml/pp/node.rb +6 -4
  110. data/lib/nokogiri/xml/processing_instruction.rb +2 -1
  111. data/lib/nokogiri/xml/sax/parser.rb +2 -3
  112. data/lib/nokogiri/xslt.rb +1 -1
  113. data/lib/nokogiri.rb +3 -11
  114. data/lib/xsd/xmlparser/nokogiri.rb +3 -1
  115. metadata +60 -272
  116. data/ext/java/nokogiri/EncodingHandler.java +0 -111
  117. data/lib/jing.jar +0 -0
  118. data/lib/nekodtd.jar +0 -0
  119. data/lib/nekohtml.jar +0 -0
@@ -176,7 +176,7 @@ module Nokogiri
176
176
  url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
177
177
 
178
178
  if string_or_io.respond_to?(:encoding)
179
- unless string_or_io.encoding.name == "ASCII-8BIT"
179
+ unless string_or_io.encoding == Encoding::ASCII_8BIT
180
180
  encoding ||= string_or_io.encoding.name
181
181
  end
182
182
  end
@@ -189,21 +189,10 @@ module Nokogiri
189
189
  end
190
190
 
191
191
  unless encoding
192
- # Libxml2's parser has poor support for encoding
193
- # detection. First, it does not recognize the HTML5
194
- # style meta charset declaration. Secondly, even if it
195
- # successfully detects an encoding hint, it does not
196
- # re-decode or re-parse the preceding part which may be
197
- # garbled.
198
- #
199
- # EncodingReader aims to perform advanced encoding
200
- # detection beyond what Libxml2 does, and to emulate
201
- # rewinding of a stream and make Libxml2 redo parsing
202
- # from the start when an encoding hint is found.
203
192
  string_or_io = EncodingReader.new(string_or_io)
204
193
  begin
205
194
  return read_io(string_or_io, url, encoding, options.to_i)
206
- rescue EncodingFound => e
195
+ rescue EncodingReader::EncodingFound => e
207
196
  encoding = e.found_encoding
208
197
  end
209
198
  end
@@ -220,114 +209,6 @@ module Nokogiri
220
209
  read_memory(string_or_io, url, encoding, options.to_i)
221
210
  end
222
211
  end
223
-
224
- class EncodingFound < StandardError # :nodoc: all
225
- attr_reader :found_encoding
226
-
227
- def initialize(encoding)
228
- @found_encoding = encoding
229
- super(format("encoding found: %s", encoding))
230
- end
231
- end
232
-
233
- # :nodoc: all
234
- class EncodingReader
235
- class SAXHandler < Nokogiri::XML::SAX::Document
236
- attr_reader :encoding
237
-
238
- def initialize
239
- @encoding = nil
240
- super()
241
- end
242
-
243
- def start_element(name, attrs = [])
244
- return unless name == "meta"
245
-
246
- attr = Hash[attrs]
247
- (charset = attr["charset"]) &&
248
- (@encoding = charset)
249
- (http_equiv = attr["http-equiv"]) &&
250
- http_equiv.match(/\AContent-Type\z/i) &&
251
- (content = attr["content"]) &&
252
- (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
253
- (@encoding = m[1])
254
- end
255
- end
256
-
257
- class JumpSAXHandler < SAXHandler
258
- def initialize(jumptag)
259
- @jumptag = jumptag
260
- super()
261
- end
262
-
263
- def start_element(name, attrs = [])
264
- super
265
- throw(@jumptag, @encoding) if @encoding
266
- throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
267
- end
268
- end
269
-
270
- def self.detect_encoding(chunk)
271
- (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
272
- (return Nokogiri.XML(m[1]).encoding)
273
-
274
- if Nokogiri.jruby?
275
- (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
276
- (return m[4])
277
- catch(:encoding_found) do
278
- Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
279
- nil
280
- end
281
- else
282
- handler = SAXHandler.new
283
- parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
284
- begin
285
- parser << chunk
286
- rescue
287
- Nokogiri::SyntaxError
288
- end
289
- handler.encoding
290
- end
291
- end
292
-
293
- def initialize(io)
294
- @io = io
295
- @firstchunk = nil
296
- @encoding_found = nil
297
- end
298
-
299
- # This method is used by the C extension so that
300
- # Nokogiri::HTML4::Document#read_io() does not leak memory when
301
- # EncodingFound is raised.
302
- attr_reader :encoding_found
303
-
304
- def read(len)
305
- # no support for a call without len
306
-
307
- unless @firstchunk
308
- (@firstchunk = @io.read(len)) || (return nil)
309
-
310
- # This implementation expects that the first call from
311
- # htmlReadIO() is made with a length long enough (~1KB) to
312
- # achieve advanced encoding detection.
313
- if (encoding = EncodingReader.detect_encoding(@firstchunk))
314
- # The first chunk is stored for the next read in retry.
315
- raise @encoding_found = EncodingFound.new(encoding)
316
- end
317
- end
318
- @encoding_found = nil
319
-
320
- ret = @firstchunk.slice!(0, len)
321
- if (len -= ret.length) > 0
322
- (rest = @io.read(len)) && ret << (rest)
323
- end
324
- if ret.empty?
325
- nil
326
- else
327
- ret
328
- end
329
- end
330
- end
331
212
  end
332
213
  end
333
214
  end
@@ -25,43 +25,37 @@ module Nokogiri
25
25
 
26
26
  unless method_defined?(:implied_start_tag?)
27
27
  def implied_start_tag?
28
- d = default_desc
29
- d ? d.startTag : nil
28
+ default_desc&.startTag
30
29
  end
31
30
  end
32
31
 
33
32
  unless method_defined?(:implied_end_tag?)
34
33
  def implied_end_tag?
35
- d = default_desc
36
- d ? d.endTag : nil
34
+ default_desc&.endTag
37
35
  end
38
36
  end
39
37
 
40
38
  unless method_defined?(:save_end_tag?)
41
39
  def save_end_tag?
42
- d = default_desc
43
- d ? d.saveEndTag : nil
40
+ default_desc&.saveEndTag
44
41
  end
45
42
  end
46
43
 
47
44
  unless method_defined?(:deprecated?)
48
45
  def deprecated?
49
- d = default_desc
50
- d ? d.depr : nil
46
+ default_desc&.depr
51
47
  end
52
48
  end
53
49
 
54
50
  unless method_defined?(:description)
55
51
  def description
56
- d = default_desc
57
- d ? d.desc : nil
52
+ default_desc&.desc
58
53
  end
59
54
  end
60
55
 
61
56
  unless method_defined?(:default_sub_element)
62
57
  def default_sub_element
63
- d = default_desc
64
- d ? d.defaultsubelt : nil
58
+ default_desc&.defaultsubelt
65
59
  end
66
60
  end
67
61
 
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ # Libxml2's parser has poor support for encoding detection. First, it does not recognize the
6
+ # HTML5 style meta charset declaration. Secondly, even if it successfully detects an encoding
7
+ # hint, it does not re-decode or re-parse the preceding part which may be garbled.
8
+ #
9
+ # EncodingReader aims to perform advanced encoding detection beyond what Libxml2 does, and to
10
+ # emulate rewinding of a stream and make Libxml2 redo parsing from the start when an encoding
11
+ # hint is found.
12
+
13
+ # :nodoc: all
14
+ class EncodingReader
15
+ class EncodingFound < StandardError
16
+ attr_reader :found_encoding
17
+
18
+ def initialize(encoding)
19
+ @found_encoding = encoding
20
+ super(format("encoding found: %s", encoding))
21
+ end
22
+ end
23
+
24
+ class SAXHandler < Nokogiri::XML::SAX::Document
25
+ attr_reader :encoding
26
+
27
+ def initialize
28
+ @encoding = nil
29
+ super()
30
+ end
31
+
32
+ def start_element(name, attrs = [])
33
+ return unless name == "meta"
34
+
35
+ attr = Hash[attrs]
36
+ (charset = attr["charset"]) &&
37
+ (@encoding = charset)
38
+ (http_equiv = attr["http-equiv"]) &&
39
+ http_equiv.match(/\AContent-Type\z/i) &&
40
+ (content = attr["content"]) &&
41
+ (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
42
+ (@encoding = m[1])
43
+ end
44
+ end
45
+
46
+ class JumpSAXHandler < SAXHandler
47
+ def initialize(jumptag)
48
+ @jumptag = jumptag
49
+ super()
50
+ end
51
+
52
+ def start_element(name, attrs = [])
53
+ super
54
+ throw(@jumptag, @encoding) if @encoding
55
+ throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
56
+ end
57
+ end
58
+
59
+ def self.detect_encoding(chunk)
60
+ (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
61
+ (return Nokogiri.XML(m[1]).encoding)
62
+
63
+ if Nokogiri.jruby?
64
+ (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
65
+ (return m[4])
66
+ catch(:encoding_found) do
67
+ Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
68
+ nil
69
+ end
70
+ else
71
+ handler = SAXHandler.new
72
+ parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
73
+ begin
74
+ parser << chunk
75
+ rescue
76
+ Nokogiri::SyntaxError
77
+ end
78
+ handler.encoding
79
+ end
80
+ end
81
+
82
+ def initialize(io)
83
+ @io = io
84
+ @firstchunk = nil
85
+ @encoding_found = nil
86
+ end
87
+
88
+ # This method is used by the C extension so that
89
+ # Nokogiri::HTML4::Document#read_io() does not leak memory when
90
+ # EncodingFound is raised.
91
+ attr_reader :encoding_found
92
+
93
+ def read(len)
94
+ # no support for a call without len
95
+
96
+ unless @firstchunk
97
+ (@firstchunk = @io.read(len)) || (return nil)
98
+
99
+ # This implementation expects that the first call from
100
+ # htmlReadIO() is made with a length long enough (~1KB) to
101
+ # achieve advanced encoding detection.
102
+ if (encoding = EncodingReader.detect_encoding(@firstchunk))
103
+ # The first chunk is stored for the next read in retry.
104
+ raise @encoding_found = EncodingFound.new(encoding)
105
+ end
106
+ end
107
+ @encoding_found = nil
108
+
109
+ ret = @firstchunk.slice!(0, len)
110
+ if (len -= ret.length) > 0
111
+ (rest = @io.read(len)) && ret << (rest)
112
+ end
113
+ if ret.empty?
114
+ nil
115
+ else
116
+ ret
117
+ end
118
+ end
119
+ end
120
+ end
121
+ end
@@ -39,6 +39,7 @@ end
39
39
  require_relative "html4/entity_lookup"
40
40
  require_relative "html4/document"
41
41
  require_relative "html4/document_fragment"
42
+ require_relative "html4/encoding_reader"
42
43
  require_relative "html4/sax/parser_context"
43
44
  require_relative "html4/sax/parser"
44
45
  require_relative "html4/sax/push_parser"
@@ -21,48 +21,137 @@ require_relative "../html4/document"
21
21
 
22
22
  module Nokogiri
23
23
  module HTML5
24
+ # Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
25
+ #
26
+ # See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
27
+ # mode.
28
+ #
29
+ # Since v1.14.0
30
+ module QuirksMode
31
+ NO_QUIRKS = 0 # The document was parsed in "no-quirks" mode
32
+ QUIRKS = 1 # The document was parsed in "quirks" mode
33
+ LIMITED_QUIRKS = 2 # The document was parsed in "limited-quirks" mode
34
+ end
35
+
24
36
  # Since v1.12.0
25
37
  #
26
38
  # 💡 HTML5 functionality is not available when running JRuby.
27
39
  class Document < Nokogiri::HTML4::Document
28
- def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
29
- yield options if block
30
- string_or_io = "" unless string_or_io
40
+ # Get the url name for this document, as passed into Document.parse, Document.read_io, or
41
+ # Document.read_memory
42
+ attr_reader :url
31
43
 
32
- if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
33
- encoding ||= string_or_io.encoding.name
34
- end
44
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
45
+ #
46
+ # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::Document.new`).
47
+ #
48
+ # Since v1.14.0
49
+ attr_reader :quirks_mode
35
50
 
36
- if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
37
- url ||= string_or_io.path
51
+ class << self
52
+ # :call-seq:
53
+ # parse(input)
54
+ # parse(input, url=nil, encoding=nil, **options)
55
+ # parse(input, url=nil, encoding=nil) { |options| ... }
56
+ #
57
+ # Parse HTML5 input.
58
+ #
59
+ # [Parameters]
60
+ # - +input+ may be a String, or any object that responds to _read_ and _close_ such as an
61
+ # IO, or StringIO.
62
+ #
63
+ # - +url+ (optional) is a String indicating the canonical URI where this document is located.
64
+ #
65
+ # - +encoding+ (optional) is the encoding that should be used when processing
66
+ # the document.
67
+ #
68
+ # - +options+ (optional) is a configuration Hash (or keyword arguments) to set options
69
+ # during parsing. The three currently supported options are +:max_errors+,
70
+ # +:max_tree_depth+ and +:max_attributes+, described at Nokogiri::HTML5.
71
+ #
72
+ # ⚠ Note that these options are different than those made available by
73
+ # Nokogiri::XML::Document and Nokogiri::HTML4::Document.
74
+ #
75
+ # - +block+ (optional) is passed a configuration Hash on which parse options may be set. See
76
+ # Nokogiri::HTML5 for more information and usage.
77
+ #
78
+ # [Returns] Nokogiri::HTML5::Document
79
+ #
80
+ def parse(string_or_io, url = nil, encoding = nil, **options, &block)
81
+ yield options if block
82
+ string_or_io = "" unless string_or_io
83
+
84
+ if string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
85
+ encoding ||= string_or_io.encoding.name
86
+ end
87
+
88
+ if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
89
+ url ||= string_or_io.path
90
+ end
91
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
92
+ raise ArgumentError, "not a string or IO object"
93
+ end
94
+
95
+ do_parse(string_or_io, url, encoding, options)
38
96
  end
39
- unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
40
- raise ArgumentError, "not a string or IO object"
97
+
98
+ # Create a new document from an IO object.
99
+ #
100
+ # 💡 Most users should prefer Document.parse to this method.
101
+ def read_io(io, url = nil, encoding = nil, **options)
102
+ raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
103
+
104
+ do_parse(io, url, encoding, options)
41
105
  end
42
106
 
43
- do_parse(string_or_io, url, encoding, options)
44
- end
107
+ # Create a new document from a String.
108
+ #
109
+ # 💡 Most users should prefer Document.parse to this method.
110
+ def read_memory(string, url = nil, encoding = nil, **options)
111
+ raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
45
112
 
46
- def self.read_io(io, url = nil, encoding = nil, **options)
47
- raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
113
+ do_parse(string, url, encoding, options)
114
+ end
48
115
 
49
- do_parse(io, url, encoding, options)
50
- end
116
+ private
51
117
 
52
- def self.read_memory(string, url = nil, encoding = nil, **options)
53
- raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
118
+ def do_parse(string_or_io, url, encoding, options)
119
+ string = HTML5.read_and_encode(string_or_io, encoding)
120
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
121
+ max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
122
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
123
+ doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
124
+ doc.encoding = "UTF-8"
125
+ doc
126
+ end
127
+ end
54
128
 
55
- do_parse(string, url, encoding, options)
129
+ def initialize(*args) # :nodoc:
130
+ super
131
+ @url = nil
132
+ @quirks_mode = nil
56
133
  end
57
134
 
58
- def fragment(tags = nil)
59
- DocumentFragment.new(self, tags, root)
135
+ # :call-seq:
136
+ # fragment() → Nokogiri::HTML5::DocumentFragment
137
+ # fragment(markup) → Nokogiri::HTML5::DocumentFragment
138
+ #
139
+ # Parse an HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
140
+ #
141
+ # [Properties]
142
+ # - +markup+ (String) The HTML5 markup fragment to be parsed
143
+ #
144
+ # [Returns]
145
+ # Nokogiri::HTML5::DocumentFragment. This object's children will be empty if `markup` is not passed, is empty, or is `nil`.
146
+ #
147
+ def fragment(markup = nil)
148
+ DocumentFragment.new(self, markup)
60
149
  end
61
150
 
62
- def to_xml(options = {}, &block)
151
+ def to_xml(options = {}, &block) # :nodoc:
63
152
  # Bypass XML::Document#to_xml which doesn't add
64
153
  # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
65
- XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
154
+ XML::Node.instance_method(:to_xml).bind_call(self, options, &block)
66
155
  end
67
156
 
68
157
  # :call-seq:
@@ -70,22 +159,10 @@ module Nokogiri
70
159
  #
71
160
  # [Returns] The document type which determines CSS-to-XPath translation.
72
161
  #
73
- # See XPathVisitor for more information.
162
+ # See CSS::XPathVisitor for more information.
74
163
  def xpath_doctype
75
164
  Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
76
165
  end
77
-
78
- private
79
-
80
- def self.do_parse(string_or_io, url, encoding, options)
81
- string = HTML5.read_and_encode(string_or_io, encoding)
82
- max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
83
- max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
84
- max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
85
- doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
86
- doc.encoding = "UTF-8"
87
- doc
88
- end
89
166
  end
90
167
  end
91
168
  end
@@ -28,6 +28,13 @@ module Nokogiri
28
28
  attr_accessor :document
29
29
  attr_accessor :errors
30
30
 
31
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
32
+ #
33
+ # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::DocumentFragment.new(doc)`).
34
+ #
35
+ # Since v1.14.0
36
+ attr_reader :quirks_mode
37
+
31
38
  # Create a document fragment.
32
39
  def initialize(doc, tags = nil, ctx = nil, options = {})
33
40
  self.document = doc
@@ -41,10 +48,10 @@ module Nokogiri
41
48
  Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
42
49
  end
43
50
 
44
- def serialize(options = {}, &block)
51
+ def serialize(options = {}, &block) # :nodoc:
45
52
  # Bypass XML::Document.serialize which doesn't support options even
46
53
  # though XML::Node.serialize does!
47
- XML::Node.instance_method(:serialize).bind(self).call(options, &block)
54
+ XML::Node.instance_method(:serialize).bind_call(self, options, &block)
48
55
  end
49
56
 
50
57
  # Parse a document fragment from +tags+, returning a Nodeset.
@@ -28,7 +28,7 @@ module Nokogiri
28
28
  def inner_html(options = {})
29
29
  return super(options) unless document.is_a?(HTML5::Document)
30
30
 
31
- result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? +"\n" : +""
31
+ result = options[:preserve_newline] && prepend_newline? ? +"\n" : +""
32
32
  result << children.map { |child| child.to_html(options) }.join
33
33
  result
34
34
  end
@@ -56,11 +56,9 @@ module Nokogiri
56
56
  native_write_to(io, encoding, indent_string, config_options)
57
57
  else
58
58
  # Serialize including the current node.
59
+ html = html_standard_serialize(options[:preserve_newline] || false)
59
60
  encoding ||= document.encoding || Encoding::UTF_8
60
- internal_ops = {
61
- preserve_newline: options[:preserve_newline] || false,
62
- }
63
- HTML5.serialize_node_internal(self, io, encoding, internal_ops)
61
+ io << html.encode(encoding, fallback: lambda { |c| "&#x#{c.ord.to_s(16)};" })
64
62
  end
65
63
  end
66
64