nokogiri 1.12.5 → 1.14.3

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (156) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +41 -0
  3. data/LICENSE-DEPENDENCIES.md +830 -509
  4. data/LICENSE.md +1 -1
  5. data/README.md +23 -14
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +33 -66
  8. data/ext/nokogiri/extconf.rb +159 -63
  9. data/ext/nokogiri/gumbo.c +21 -11
  10. data/ext/nokogiri/html4_document.c +2 -2
  11. data/ext/nokogiri/html4_element_description.c +1 -1
  12. data/ext/nokogiri/html4_entity_lookup.c +2 -2
  13. data/ext/nokogiri/html4_sax_parser_context.c +3 -9
  14. data/ext/nokogiri/html4_sax_push_parser.c +1 -1
  15. data/ext/nokogiri/nokogiri.c +38 -51
  16. data/ext/nokogiri/nokogiri.h +26 -14
  17. data/ext/nokogiri/test_global_handlers.c +1 -1
  18. data/ext/nokogiri/xml_attr.c +3 -3
  19. data/ext/nokogiri/xml_attribute_decl.c +5 -5
  20. data/ext/nokogiri/xml_cdata.c +3 -3
  21. data/ext/nokogiri/xml_comment.c +1 -1
  22. data/ext/nokogiri/xml_document.c +53 -44
  23. data/ext/nokogiri/xml_document_fragment.c +1 -3
  24. data/ext/nokogiri/xml_dtd.c +11 -11
  25. data/ext/nokogiri/xml_element_content.c +3 -3
  26. data/ext/nokogiri/xml_element_decl.c +5 -5
  27. data/ext/nokogiri/xml_encoding_handler.c +28 -14
  28. data/ext/nokogiri/xml_entity_decl.c +6 -6
  29. data/ext/nokogiri/xml_entity_reference.c +1 -1
  30. data/ext/nokogiri/xml_namespace.c +80 -14
  31. data/ext/nokogiri/xml_node.c +982 -396
  32. data/ext/nokogiri/xml_node_set.c +4 -6
  33. data/ext/nokogiri/xml_processing_instruction.c +1 -1
  34. data/ext/nokogiri/xml_reader.c +133 -32
  35. data/ext/nokogiri/xml_relax_ng.c +1 -3
  36. data/ext/nokogiri/xml_sax_parser.c +23 -17
  37. data/ext/nokogiri/xml_sax_parser_context.c +11 -9
  38. data/ext/nokogiri/xml_sax_push_parser.c +1 -3
  39. data/ext/nokogiri/xml_schema.c +4 -6
  40. data/ext/nokogiri/xml_syntax_error.c +1 -1
  41. data/ext/nokogiri/xml_text.c +2 -2
  42. data/ext/nokogiri/xml_xpath_context.c +144 -114
  43. data/ext/nokogiri/xslt_stylesheet.c +122 -23
  44. data/gumbo-parser/Makefile +10 -0
  45. data/gumbo-parser/src/attribute.h +1 -1
  46. data/gumbo-parser/src/error.c +2 -2
  47. data/gumbo-parser/src/error.h +1 -1
  48. data/gumbo-parser/src/foreign_attrs.c +2 -2
  49. data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
  50. data/gumbo-parser/src/parser.c +8 -16
  51. data/gumbo-parser/src/replacement.h +1 -1
  52. data/gumbo-parser/src/string_buffer.h +1 -1
  53. data/gumbo-parser/src/string_piece.c +1 -1
  54. data/gumbo-parser/src/svg_attrs.c +2 -2
  55. data/gumbo-parser/src/svg_tags.c +2 -2
  56. data/gumbo-parser/src/tag.c +2 -1
  57. data/gumbo-parser/src/tag_lookup.c +7 -7
  58. data/gumbo-parser/src/tag_lookup.gperf +1 -0
  59. data/gumbo-parser/src/tag_lookup.h +1 -1
  60. data/gumbo-parser/src/token_buffer.h +1 -1
  61. data/gumbo-parser/src/tokenizer.c +1 -1
  62. data/gumbo-parser/src/tokenizer.h +1 -1
  63. data/gumbo-parser/src/utf8.c +1 -1
  64. data/gumbo-parser/src/utf8.h +1 -1
  65. data/gumbo-parser/src/util.c +1 -3
  66. data/gumbo-parser/src/util.h +4 -0
  67. data/gumbo-parser/src/vector.h +1 -1
  68. data/lib/nokogiri/class_resolver.rb +67 -0
  69. data/lib/nokogiri/css/node.rb +9 -8
  70. data/lib/nokogiri/css/parser.rb +360 -341
  71. data/lib/nokogiri/css/parser.y +249 -244
  72. data/lib/nokogiri/css/parser_extras.rb +22 -20
  73. data/lib/nokogiri/css/syntax_error.rb +1 -0
  74. data/lib/nokogiri/css/tokenizer.rb +4 -3
  75. data/lib/nokogiri/css/tokenizer.rex +3 -2
  76. data/lib/nokogiri/css/xpath_visitor.rb +184 -85
  77. data/lib/nokogiri/css.rb +44 -6
  78. data/lib/nokogiri/decorators/slop.rb +8 -7
  79. data/lib/nokogiri/encoding_handler.rb +57 -0
  80. data/lib/nokogiri/extension.rb +4 -3
  81. data/lib/nokogiri/gumbo.rb +1 -0
  82. data/lib/nokogiri/html.rb +16 -10
  83. data/lib/nokogiri/html4/builder.rb +1 -0
  84. data/lib/nokogiri/html4/document.rb +56 -164
  85. data/lib/nokogiri/html4/document_fragment.rb +11 -7
  86. data/lib/nokogiri/html4/element_description.rb +1 -0
  87. data/lib/nokogiri/html4/element_description_defaults.rb +432 -532
  88. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  89. data/lib/nokogiri/html4/entity_lookup.rb +2 -1
  90. data/lib/nokogiri/html4/sax/parser.rb +5 -2
  91. data/lib/nokogiri/html4/sax/parser_context.rb +1 -0
  92. data/lib/nokogiri/html4/sax/push_parser.rb +7 -7
  93. data/lib/nokogiri/html4.rb +12 -5
  94. data/lib/nokogiri/html5/document.rb +126 -32
  95. data/lib/nokogiri/html5/document_fragment.rb +14 -4
  96. data/lib/nokogiri/html5/node.rb +12 -7
  97. data/lib/nokogiri/html5.rb +138 -222
  98. data/lib/nokogiri/jruby/dependencies.rb +2 -19
  99. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  100. data/lib/nokogiri/syntax_error.rb +1 -0
  101. data/lib/nokogiri/version/constant.rb +2 -1
  102. data/lib/nokogiri/version/info.rb +32 -24
  103. data/lib/nokogiri/version.rb +1 -0
  104. data/lib/nokogiri/xml/attr.rb +54 -3
  105. data/lib/nokogiri/xml/attribute_decl.rb +2 -1
  106. data/lib/nokogiri/xml/builder.rb +35 -33
  107. data/lib/nokogiri/xml/cdata.rb +2 -1
  108. data/lib/nokogiri/xml/character_data.rb +1 -0
  109. data/lib/nokogiri/xml/document.rb +232 -143
  110. data/lib/nokogiri/xml/document_fragment.rb +88 -42
  111. data/lib/nokogiri/xml/dtd.rb +3 -2
  112. data/lib/nokogiri/xml/element_content.rb +1 -0
  113. data/lib/nokogiri/xml/element_decl.rb +2 -1
  114. data/lib/nokogiri/xml/entity_decl.rb +3 -2
  115. data/lib/nokogiri/xml/entity_reference.rb +1 -0
  116. data/lib/nokogiri/xml/namespace.rb +44 -0
  117. data/lib/nokogiri/xml/node/save_options.rb +14 -8
  118. data/lib/nokogiri/xml/node.rb +708 -383
  119. data/lib/nokogiri/xml/node_set.rb +134 -59
  120. data/lib/nokogiri/xml/notation.rb +12 -0
  121. data/lib/nokogiri/xml/parse_options.rb +140 -56
  122. data/lib/nokogiri/xml/pp/character_data.rb +8 -6
  123. data/lib/nokogiri/xml/pp/node.rb +26 -26
  124. data/lib/nokogiri/xml/pp.rb +1 -0
  125. data/lib/nokogiri/xml/processing_instruction.rb +3 -1
  126. data/lib/nokogiri/xml/reader.rb +20 -24
  127. data/lib/nokogiri/xml/relax_ng.rb +1 -0
  128. data/lib/nokogiri/xml/sax/document.rb +20 -19
  129. data/lib/nokogiri/xml/sax/parser.rb +38 -36
  130. data/lib/nokogiri/xml/sax/parser_context.rb +7 -3
  131. data/lib/nokogiri/xml/sax/push_parser.rb +5 -5
  132. data/lib/nokogiri/xml/sax.rb +1 -0
  133. data/lib/nokogiri/xml/schema.rb +7 -6
  134. data/lib/nokogiri/xml/searchable.rb +93 -62
  135. data/lib/nokogiri/xml/syntax_error.rb +5 -4
  136. data/lib/nokogiri/xml/text.rb +1 -0
  137. data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
  138. data/lib/nokogiri/xml/xpath.rb +12 -0
  139. data/lib/nokogiri/xml/xpath_context.rb +2 -3
  140. data/lib/nokogiri/xml.rb +4 -3
  141. data/lib/nokogiri/xslt/stylesheet.rb +1 -0
  142. data/lib/nokogiri/xslt.rb +21 -13
  143. data/lib/nokogiri.rb +22 -27
  144. data/lib/xsd/xmlparser/nokogiri.rb +28 -25
  145. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  146. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2445 -1919
  147. data/ports/archives/libxml2-2.10.4.tar.xz +0 -0
  148. data/ports/archives/libxslt-1.1.37.tar.xz +0 -0
  149. metadata +20 -171
  150. data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
  151. data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
  152. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -2511
  153. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +0 -31
  154. data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +0 -19
  155. data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
  156. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ # Libxml2's parser has poor support for encoding detection. First, it does not recognize the
6
+ # HTML5 style meta charset declaration. Secondly, even if it successfully detects an encoding
7
+ # hint, it does not re-decode or re-parse the preceding part which may be garbled.
8
+ #
9
+ # EncodingReader aims to perform advanced encoding detection beyond what Libxml2 does, and to
10
+ # emulate rewinding of a stream and make Libxml2 redo parsing from the start when an encoding
11
+ # hint is found.
12
+
13
+ # :nodoc: all
14
+ class EncodingReader
15
+ class EncodingFound < StandardError
16
+ attr_reader :found_encoding
17
+
18
+ def initialize(encoding)
19
+ @found_encoding = encoding
20
+ super(format("encoding found: %s", encoding))
21
+ end
22
+ end
23
+
24
+ class SAXHandler < Nokogiri::XML::SAX::Document
25
+ attr_reader :encoding
26
+
27
+ def initialize
28
+ @encoding = nil
29
+ super()
30
+ end
31
+
32
+ def start_element(name, attrs = [])
33
+ return unless name == "meta"
34
+
35
+ attr = Hash[attrs]
36
+ (charset = attr["charset"]) &&
37
+ (@encoding = charset)
38
+ (http_equiv = attr["http-equiv"]) &&
39
+ http_equiv.match(/\AContent-Type\z/i) &&
40
+ (content = attr["content"]) &&
41
+ (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
42
+ (@encoding = m[1])
43
+ end
44
+ end
45
+
46
+ class JumpSAXHandler < SAXHandler
47
+ def initialize(jumptag)
48
+ @jumptag = jumptag
49
+ super()
50
+ end
51
+
52
+ def start_element(name, attrs = [])
53
+ super
54
+ throw(@jumptag, @encoding) if @encoding
55
+ throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
56
+ end
57
+ end
58
+
59
+ def self.detect_encoding(chunk)
60
+ (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
61
+ (return Nokogiri.XML(m[1]).encoding)
62
+
63
+ if Nokogiri.jruby?
64
+ (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
65
+ (return m[4])
66
+ catch(:encoding_found) do
67
+ Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
68
+ nil
69
+ end
70
+ else
71
+ handler = SAXHandler.new
72
+ parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
73
+ begin
74
+ parser << chunk
75
+ rescue
76
+ Nokogiri::SyntaxError
77
+ end
78
+ handler.encoding
79
+ end
80
+ end
81
+
82
+ def initialize(io)
83
+ @io = io
84
+ @firstchunk = nil
85
+ @encoding_found = nil
86
+ end
87
+
88
+ # This method is used by the C extension so that
89
+ # Nokogiri::HTML4::Document#read_io() does not leak memory when
90
+ # EncodingFound is raised.
91
+ attr_reader :encoding_found
92
+
93
+ def read(len)
94
+ # no support for a call without len
95
+
96
+ unless @firstchunk
97
+ (@firstchunk = @io.read(len)) || (return nil)
98
+
99
+ # This implementation expects that the first call from
100
+ # htmlReadIO() is made with a length long enough (~1KB) to
101
+ # achieve advanced encoding detection.
102
+ if (encoding = EncodingReader.detect_encoding(@firstchunk))
103
+ # The first chunk is stored for the next read in retry.
104
+ raise @encoding_found = EncodingFound.new(encoding)
105
+ end
106
+ end
107
+ @encoding_found = nil
108
+
109
+ ret = @firstchunk.slice!(0, len)
110
+ if (len -= ret.length) > 0
111
+ (rest = @io.read(len)) && ret << (rest)
112
+ end
113
+ if ret.empty?
114
+ nil
115
+ else
116
+ ret
117
+ end
118
+ end
119
+ end
120
+ end
121
+ end
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  module HTML4
4
5
  class EntityDescription < Struct.new(:value, :name, :description); end
@@ -6,7 +7,7 @@ module Nokogiri
6
7
  class EntityLookup
7
8
  ###
8
9
  # Look up entity with +name+
9
- def [] name
10
+ def [](name)
10
11
  (val = get(name)) && val.value
11
12
  end
12
13
  end
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  module HTML4
4
5
  ###
@@ -27,8 +28,9 @@ module Nokogiri
27
28
  ###
28
29
  # Parse html stored in +data+ using +encoding+
29
30
  def parse_memory(data, encoding = "UTF-8")
30
- raise ArgumentError unless data
31
- return unless data.length > 0
31
+ raise TypeError unless String === data
32
+ return if data.empty?
33
+
32
34
  ctx = ParserContext.memory(data, encoding)
33
35
  yield ctx if block_given?
34
36
  ctx.parse_with(self)
@@ -50,6 +52,7 @@ module Nokogiri
50
52
  raise ArgumentError unless filename
51
53
  raise Errno::ENOENT unless File.exist?(filename)
52
54
  raise Errno::EISDIR if File.directory?(filename)
55
+
53
56
  ctx = ParserContext.file(filename, encoding)
54
57
  yield ctx if block_given?
55
58
  ctx.parse_with(self)
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  module HTML4
4
5
  module SAX
@@ -1,14 +1,14 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  module HTML4
4
5
  module SAX
5
6
  class PushParser
6
-
7
7
  # The Nokogiri::HTML4::SAX::Document on which the PushParser will be
8
8
  # operating
9
9
  attr_accessor :document
10
-
11
- def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
10
+
11
+ def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = "UTF-8")
12
12
  @document = doc
13
13
  @encoding = encoding
14
14
  @sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
@@ -16,20 +16,20 @@ module Nokogiri
16
16
  ## Create our push parser context
17
17
  initialize_native(@sax_parser, file_name, encoding)
18
18
  end
19
-
19
+
20
20
  ###
21
21
  # Write a +chunk+ of HTML to the PushParser. Any callback methods
22
22
  # that can be called will be called immediately.
23
- def write chunk, last_chunk = false
23
+ def write(chunk, last_chunk = false)
24
24
  native_write(chunk, last_chunk)
25
25
  end
26
- alias :<< :write
26
+ alias_method :<<, :write
27
27
 
28
28
  ###
29
29
  # Finish the parsing. This method is only necessary for
30
30
  # Nokogiri::HTML4::SAX::Document#end_document to be called.
31
31
  def finish
32
- write '', true
32
+ write("", true)
33
33
  end
34
34
  end
35
35
  end
@@ -1,15 +1,21 @@
1
+ # coding: utf-8
1
2
  # frozen_string_literal: true
3
+
2
4
  module Nokogiri
3
5
  class << self
4
- ###
6
+ # :call-seq:
7
+ # HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
8
+ #
5
9
  # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
6
10
  def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
7
11
  Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
8
12
  end
9
13
  end
10
14
 
11
- # @since v1.12.0
12
- # @note Before v1.12.0, {Nokogiri::HTML4} did not exist, and {Nokogiri::HTML} was the module/namespace for parsing HTML.
15
+ # Since v1.12.0
16
+ #
17
+ # 💡 Before v1.12.0, Nokogiri::HTML4 did not exist, and Nokogiri::HTML was the module/namespace
18
+ # for parsing HTML.
13
19
  module HTML4
14
20
  class << self
15
21
  ###
@@ -20,8 +26,8 @@ module Nokogiri
20
26
 
21
27
  ####
22
28
  # Parse a fragment from +string+ in to a NodeSet.
23
- def fragment(string, encoding = nil)
24
- HTML4::DocumentFragment.parse(string, encoding)
29
+ def fragment(string, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
30
+ HTML4::DocumentFragment.parse(string, encoding, options, &block)
25
31
  end
26
32
  end
27
33
 
@@ -33,6 +39,7 @@ end
33
39
  require_relative "html4/entity_lookup"
34
40
  require_relative "html4/document"
35
41
  require_relative "html4/document_fragment"
42
+ require_relative "html4/encoding_reader"
36
43
  require_relative "html4/sax/parser_context"
37
44
  require_relative "html4/sax/parser"
38
45
  require_relative "html4/sax/push_parser"
@@ -1,4 +1,6 @@
1
+ # coding: utf-8
1
2
  # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
6
  #
@@ -19,55 +21,147 @@ require_relative "../html4/document"
19
21
 
20
22
  module Nokogiri
21
23
  module HTML5
22
- # @since v1.12.0
23
- # @note HTML5 functionality is not available when running JRuby.
24
+ # Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
25
+ #
26
+ # See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
27
+ # mode.
28
+ #
29
+ # Since v1.14.0
30
+ module QuirksMode
31
+ NO_QUIRKS = 0 # The document was parsed in "no-quirks" mode
32
+ QUIRKS = 1 # The document was parsed in "quirks" mode
33
+ LIMITED_QUIRKS = 2 # The document was parsed in "limited-quirks" mode
34
+ end
35
+
36
+ # Since v1.12.0
37
+ #
38
+ # 💡 HTML5 functionality is not available when running JRuby.
24
39
  class Document < Nokogiri::HTML4::Document
25
- def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
26
- yield options if block_given?
27
- string_or_io = '' unless string_or_io
40
+ # Get the url name for this document, as passed into Document.parse, Document.read_io, or
41
+ # Document.read_memory
42
+ attr_reader :url
43
+
44
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
45
+ #
46
+ # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::Document.new`).
47
+ #
48
+ # Since v1.14.0
49
+ attr_reader :quirks_mode
50
+
51
+ class << self
52
+ # :call-seq:
53
+ # parse(input)
54
+ # parse(input, url=nil, encoding=nil, **options)
55
+ # parse(input, url=nil, encoding=nil) { |options| ... }
56
+ #
57
+ # Parse HTML5 input.
58
+ #
59
+ # [Parameters]
60
+ # - +input+ may be a String, or any object that responds to _read_ and _close_ such as an
61
+ # IO, or StringIO.
62
+ #
63
+ # - +url+ (optional) is a String indicating the canonical URI where this document is located.
64
+ #
65
+ # - +encoding+ (optional) is the encoding that should be used when processing
66
+ # the document.
67
+ #
68
+ # - +options+ (optional) is a configuration Hash (or keyword arguments) to set options
69
+ # during parsing. The three currently supported options are +:max_errors+,
70
+ # +:max_tree_depth+ and +:max_attributes+, described at Nokogiri::HTML5.
71
+ #
72
+ # ⚠ Note that these options are different than those made available by
73
+ # Nokogiri::XML::Document and Nokogiri::HTML4::Document.
74
+ #
75
+ # - +block+ (optional) is passed a configuration Hash on which parse options may be set. See
76
+ # Nokogiri::HTML5 for more information and usage.
77
+ #
78
+ # [Returns] Nokogiri::HTML5::Document
79
+ #
80
+ def parse(string_or_io, url = nil, encoding = nil, **options, &block)
81
+ yield options if block
82
+ string_or_io = "" unless string_or_io
83
+
84
+ if string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
85
+ encoding ||= string_or_io.encoding.name
86
+ end
87
+
88
+ if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
89
+ url ||= string_or_io.path
90
+ end
91
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
92
+ raise ArgumentError, "not a string or IO object"
93
+ end
28
94
 
29
- if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
30
- encoding ||= string_or_io.encoding.name
95
+ do_parse(string_or_io, url, encoding, options)
31
96
  end
32
97
 
33
- if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
34
- url ||= string_or_io.path
98
+ # Create a new document from an IO object.
99
+ #
100
+ # 💡 Most users should prefer Document.parse to this method.
101
+ def read_io(io, url = nil, encoding = nil, **options)
102
+ raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
103
+
104
+ do_parse(io, url, encoding, options)
35
105
  end
36
- unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
37
- raise ArgumentError.new("not a string or IO object")
106
+
107
+ # Create a new document from a String.
108
+ #
109
+ # 💡 Most users should prefer Document.parse to this method.
110
+ def read_memory(string, url = nil, encoding = nil, **options)
111
+ raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
112
+
113
+ do_parse(string, url, encoding, options)
38
114
  end
39
- do_parse(string_or_io, url, encoding, options)
40
- end
41
115
 
42
- def self.read_io(io, url = nil, encoding = nil, **options)
43
- raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
44
- do_parse(io, url, encoding, options)
116
+ private
117
+
118
+ def do_parse(string_or_io, url, encoding, options)
119
+ string = HTML5.read_and_encode(string_or_io, encoding)
120
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
121
+ max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
122
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
123
+ doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
124
+ doc.encoding = "UTF-8"
125
+ doc
126
+ end
45
127
  end
46
128
 
47
- def self.read_memory(string, url = nil, encoding = nil, **options)
48
- raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
49
- do_parse(string, url, encoding, options)
129
+ def initialize(*args) # :nodoc:
130
+ super
131
+ @url = nil
132
+ @quirks_mode = nil
50
133
  end
51
134
 
52
- def fragment(tags = nil)
53
- DocumentFragment.new(self, tags, self.root)
135
+ # :call-seq:
136
+ # fragment() → Nokogiri::HTML5::DocumentFragment
137
+ # fragment(markup) → Nokogiri::HTML5::DocumentFragment
138
+ #
139
+ # Parse a HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
140
+ #
141
+ # [Properties]
142
+ # - +markup+ (String) The HTML5 markup fragment to be parsed
143
+ #
144
+ # [Returns]
145
+ # Nokogiri::HTML5::DocumentFragment. This object's children will be empty if `markup` is not passed, is empty, or is `nil`.
146
+ #
147
+ def fragment(markup = nil)
148
+ DocumentFragment.new(self, markup)
54
149
  end
55
150
 
56
- def to_xml(options = {}, &block)
151
+ def to_xml(options = {}, &block) # :nodoc:
57
152
  # Bypass XML::Document#to_xml which doesn't add
58
153
  # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
59
- XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
154
+ XML::Node.instance_method(:to_xml).bind_call(self, options, &block)
60
155
  end
61
156
 
62
- private
63
- def self.do_parse(string_or_io, url, encoding, options)
64
- string = HTML5.read_and_encode(string_or_io, encoding)
65
- max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
66
- max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
67
- max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
68
- doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
69
- doc.encoding = 'UTF-8'
70
- doc
157
+ # :call-seq:
158
+ # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
159
+ #
160
+ # [Returns] The document type which determines CSS-to-XPath translation.
161
+ #
162
+ # See CSS::XPathVisitor for more information.
163
+ def xpath_doctype
164
+ Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
71
165
  end
72
166
  end
73
167
  end
@@ -1,4 +1,6 @@
1
+ # coding: utf-8
1
2
  # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
6
  #
@@ -19,12 +21,20 @@ require_relative "../html4/document_fragment"
19
21
 
20
22
  module Nokogiri
21
23
  module HTML5
22
- # @since v1.12.0
23
- # @note HTML5 functionality is not available when running JRuby.
24
+ # Since v1.12.0
25
+ #
26
+ # 💡 HTML5 functionality is not available when running JRuby.
24
27
  class DocumentFragment < Nokogiri::HTML4::DocumentFragment
25
28
  attr_accessor :document
26
29
  attr_accessor :errors
27
30
 
31
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
32
+ #
33
+ # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::DocumentFragment.new(doc)`).
34
+ #
35
+ # Since v1.14.0
36
+ attr_reader :quirks_mode
37
+
28
38
  # Create a document fragment.
29
39
  def initialize(doc, tags = nil, ctx = nil, options = {})
30
40
  self.document = doc
@@ -38,10 +48,10 @@ module Nokogiri
38
48
  Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
39
49
  end
40
50
 
41
- def serialize(options = {}, &block)
51
+ def serialize(options = {}, &block) # :nodoc:
42
52
  # Bypass XML::Document.serialize which doesn't support options even
43
53
  # though XML::Node.serialize does!
44
- XML::Node.instance_method(:serialize).bind(self).call(options, &block)
54
+ XML::Node.instance_method(:serialize).bind_call(self, options, &block)
45
55
  end
46
56
 
47
57
  # Parse a document fragment from +tags+, returning a Nodeset.
@@ -1,4 +1,6 @@
1
+ # coding: utf-8
1
2
  # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
6
  #
@@ -19,18 +21,21 @@ require_relative "../xml/node"
19
21
 
20
22
  module Nokogiri
21
23
  module HTML5
22
- # @since v1.12.0
23
- # @note HTML5 functionality is not available when running JRuby.
24
+ # Since v1.12.0
25
+ #
26
+ # 💡 HTML5 functionality is not available when running JRuby.
24
27
  module Node
25
28
  def inner_html(options = {})
26
29
  return super(options) unless document.is_a?(HTML5::Document)
27
- result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? String.new("\n") : String.new
30
+
31
+ result = options[:preserve_newline] && prepend_newline? ? +"\n" : +""
28
32
  result << children.map { |child| child.to_html(options) }.join
29
33
  result
30
34
  end
31
35
 
32
36
  def write_to(io, *options)
33
37
  return super(io, *options) unless document.is_a?(HTML5::Document)
38
+
34
39
  options = options.first.is_a?(Hash) ? options.shift : {}
35
40
  encoding = options[:encoding] || options[0]
36
41
  if Nokogiri.jruby?
@@ -51,16 +56,15 @@ module Nokogiri
51
56
  native_write_to(io, encoding, indent_string, config_options)
52
57
  else
53
58
  # Serialize including the current node.
59
+ html = html_standard_serialize(options[:preserve_newline] || false)
54
60
  encoding ||= document.encoding || Encoding::UTF_8
55
- internal_ops = {
56
- preserve_newline: options[:preserve_newline] || false,
57
- }
58
- HTML5.serialize_node_internal(self, io, encoding, internal_ops)
61
+ io << html.encode(encoding, fallback: lambda { |c| "&#x#{c.ord.to_s(16)};" })
59
62
  end
60
63
  end
61
64
 
62
65
  def fragment(tags)
63
66
  return super(tags) unless document.is_a?(HTML5::Document)
67
+
64
68
  DocumentFragment.new(document, tags, self)
65
69
  end
66
70
 
@@ -73,6 +77,7 @@ module Nokogiri
73
77
  # actually create the xml namespace if it doesn't exist already.
74
78
  def add_child_node_and_reparent_attrs(node)
75
79
  return super(node) unless document.is_a?(HTML5::Document)
80
+
76
81
  # I'm not sure what this method is supposed to do. Reparenting
77
82
  # namespaces is handled by libxml2, including child namespaces which
78
83
  # this method wouldn't handle.