nokogiri 1.16.8-x86_64-darwin → 1.17.0-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +11 -21
  3. data/README.md +4 -0
  4. data/dependencies.yml +6 -6
  5. data/ext/nokogiri/extconf.rb +191 -137
  6. data/ext/nokogiri/gumbo.c +69 -53
  7. data/ext/nokogiri/html4_document.c +10 -4
  8. data/ext/nokogiri/html4_element_description.c +18 -18
  9. data/ext/nokogiri/html4_sax_parser.c +40 -0
  10. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  11. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  12. data/ext/nokogiri/include/libexslt/exsltconfig.h +3 -3
  13. data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +12 -19
  14. data/ext/nokogiri/include/libxml2/libxml/c14n.h +1 -12
  15. data/ext/nokogiri/include/libxml2/libxml/debugXML.h +1 -1
  16. data/ext/nokogiri/include/libxml2/libxml/encoding.h +9 -0
  17. data/ext/nokogiri/include/libxml2/libxml/entities.h +12 -1
  18. data/ext/nokogiri/include/libxml2/libxml/hash.h +19 -0
  19. data/ext/nokogiri/include/libxml2/libxml/list.h +2 -2
  20. data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +17 -0
  21. data/ext/nokogiri/include/libxml2/libxml/parser.h +60 -54
  22. data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +9 -1
  23. data/ext/nokogiri/include/libxml2/libxml/pattern.h +6 -0
  24. data/ext/nokogiri/include/libxml2/libxml/tree.h +32 -12
  25. data/ext/nokogiri/include/libxml2/libxml/uri.h +11 -0
  26. data/ext/nokogiri/include/libxml2/libxml/valid.h +29 -2
  27. data/ext/nokogiri/include/libxml2/libxml/xinclude.h +7 -0
  28. data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +21 -4
  29. data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +14 -0
  30. data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +111 -15
  31. data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +8 -45
  32. data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +2 -0
  33. data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +5 -0
  34. data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +165 -1
  35. data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +7 -171
  36. data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +1 -0
  37. data/ext/nokogiri/include/libxml2/libxml/xpath.h +4 -0
  38. data/ext/nokogiri/include/libxslt/xsltInternals.h +3 -0
  39. data/ext/nokogiri/include/libxslt/xsltconfig.h +4 -37
  40. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  41. data/ext/nokogiri/nokogiri.c +9 -2
  42. data/ext/nokogiri/nokogiri.h +18 -33
  43. data/ext/nokogiri/xml_attr.c +1 -1
  44. data/ext/nokogiri/xml_cdata.c +2 -10
  45. data/ext/nokogiri/xml_comment.c +3 -8
  46. data/ext/nokogiri/xml_document.c +163 -156
  47. data/ext/nokogiri/xml_document_fragment.c +10 -25
  48. data/ext/nokogiri/xml_dtd.c +1 -1
  49. data/ext/nokogiri/xml_element_content.c +9 -9
  50. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  51. data/ext/nokogiri/xml_namespace.c +6 -6
  52. data/ext/nokogiri/xml_node.c +130 -104
  53. data/ext/nokogiri/xml_node_set.c +46 -44
  54. data/ext/nokogiri/xml_reader.c +54 -58
  55. data/ext/nokogiri/xml_relax_ng.c +35 -56
  56. data/ext/nokogiri/xml_sax_parser.c +156 -88
  57. data/ext/nokogiri/xml_sax_parser_context.c +213 -131
  58. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  59. data/ext/nokogiri/xml_schema.c +50 -85
  60. data/ext/nokogiri/xml_syntax_error.c +19 -11
  61. data/ext/nokogiri/xml_text.c +2 -4
  62. data/ext/nokogiri/xml_xpath_context.c +2 -2
  63. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  64. data/lib/nokogiri/3.0/nokogiri.bundle +0 -0
  65. data/lib/nokogiri/3.1/nokogiri.bundle +0 -0
  66. data/lib/nokogiri/3.2/nokogiri.bundle +0 -0
  67. data/lib/nokogiri/3.3/nokogiri.bundle +0 -0
  68. data/lib/nokogiri/class_resolver.rb +1 -1
  69. data/lib/nokogiri/css/node.rb +6 -2
  70. data/lib/nokogiri/css/parser.rb +6 -4
  71. data/lib/nokogiri/css/parser.y +2 -2
  72. data/lib/nokogiri/css/parser_extras.rb +6 -66
  73. data/lib/nokogiri/css/selector_cache.rb +38 -0
  74. data/lib/nokogiri/css/tokenizer.rb +4 -4
  75. data/lib/nokogiri/css/tokenizer.rex +9 -8
  76. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  77. data/lib/nokogiri/css.rb +86 -20
  78. data/lib/nokogiri/decorators/slop.rb +3 -5
  79. data/lib/nokogiri/encoding_handler.rb +2 -2
  80. data/lib/nokogiri/html4/document.rb +44 -23
  81. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  82. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  83. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  84. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  85. data/lib/nokogiri/html4.rb +9 -14
  86. data/lib/nokogiri/html5/builder.rb +40 -0
  87. data/lib/nokogiri/html5/document.rb +61 -30
  88. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  89. data/lib/nokogiri/html5/node.rb +4 -4
  90. data/lib/nokogiri/html5.rb +114 -72
  91. data/lib/nokogiri/version/constant.rb +1 -1
  92. data/lib/nokogiri/xml/builder.rb +8 -1
  93. data/lib/nokogiri/xml/document.rb +70 -26
  94. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  95. data/lib/nokogiri/xml/node.rb +82 -11
  96. data/lib/nokogiri/xml/node_set.rb +9 -7
  97. data/lib/nokogiri/xml/parse_options.rb +1 -1
  98. data/lib/nokogiri/xml/pp/node.rb +6 -1
  99. data/lib/nokogiri/xml/reader.rb +46 -13
  100. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  101. data/lib/nokogiri/xml/sax/document.rb +174 -83
  102. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  103. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  104. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  105. data/lib/nokogiri/xml/sax.rb +48 -0
  106. data/lib/nokogiri/xml/schema.rb +112 -45
  107. data/lib/nokogiri/xml/searchable.rb +6 -8
  108. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  109. data/lib/nokogiri/xml.rb +13 -24
  110. data/lib/nokogiri/xslt.rb +3 -9
  111. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  112. metadata +8 -4
  113. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
@@ -3,13 +3,83 @@
3
3
  module Nokogiri
4
4
  module HTML4
5
5
  class DocumentFragment < Nokogiri::XML::DocumentFragment
6
- ####
7
- # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
8
- def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
6
+ #
7
+ # :call-seq:
8
+ # parse(input) { |options| ... } → HTML4::DocumentFragment
9
+ # parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment
10
+ #
11
+ # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment. This
12
+ # method creates a new, empty HTML4::Document to contain the fragment.
13
+ #
14
+ # [Required Parameters]
15
+ # - +input+ (String | IO) The content to be parsed.
16
+ #
17
+ # [Optional Keyword Arguments]
18
+ # - +encoding:+ (String) The name of the encoding that should be used when processing the
19
+ # document. When not provided, the encoding will be determined based on the document
20
+ # content.
21
+ #
22
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
23
+ # behaviors during parsing. See ParseOptions for more information. The default value is
24
+ # +ParseOptions::DEFAULT_HTML+.
25
+ #
26
+ # [Yields]
27
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
28
+ # can be configured before parsing. See ParseOptions for more information.
29
+ #
30
+ # [Returns] HTML4::DocumentFragment
31
+ #
32
+ # *Example:* Parsing a string
33
+ #
34
+ # fragment = HTML4::DocumentFragment.parse("<div>Hello World</div>")
35
+ #
36
+ # *Example:* Parsing an IO
37
+ #
38
+ # fragment = File.open("fragment.html") do |file|
39
+ # HTML4::DocumentFragment.parse(file)
40
+ # end
41
+ #
42
+ # *Example:* Specifying encoding
43
+ #
44
+ # fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP")
45
+ #
46
+ # *Example:* Setting parse options dynamically
47
+ #
48
+ # HTML4::DocumentFragment.parse("<div>Hello World") do |options|
49
+ # options.huge.pedantic
50
+ # end
51
+ #
52
+ def self.parse(
53
+ input,
54
+ encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
55
+ encoding: encoding_, options: options_,
56
+ &block
57
+ )
58
+ # TODO: this method should take a context node.
9
59
  doc = HTML4::Document.new
10
60
 
11
- encoding ||= if tags.respond_to?(:encoding)
12
- encoding = tags.encoding
61
+ if input.respond_to?(:read)
62
+ # Handle IO-like objects (IO, File, StringIO, etc.)
63
+ # The _read_ method of these objects doesn't accept an +encoding+ parameter.
64
+ # Encoding is usually set when the IO object is created or opened,
65
+ # or by using the _set_encoding_ method.
66
+ #
67
+ # 1. If +encoding+ is provided and the object supports _set_encoding_,
68
+ # set the encoding before reading.
69
+ # 2. Read the content from the IO-like object.
70
+ #
71
+ # Note: After reading, the content's encoding will be:
72
+ # - The encoding set by _set_encoding_ if it was called
73
+ # - The default encoding of the IO object otherwise
74
+ #
75
+ # For StringIO specifically, _set_encoding_ affects only the internal string,
76
+ # not how the data is read out.
77
+ input.set_encoding(encoding) if encoding && input.respond_to?(:set_encoding)
78
+ input = input.read
79
+ end
80
+
81
+ encoding ||= if input.respond_to?(:encoding)
82
+ encoding = input.encoding
13
83
  if encoding == ::Encoding::ASCII_8BIT
14
84
  "UTF-8"
15
85
  else
@@ -21,29 +91,71 @@ module Nokogiri
21
91
 
22
92
  doc.encoding = encoding
23
93
 
24
- new(doc, tags, nil, options, &block)
94
+ new(doc, input, options: options, &block)
25
95
  end
26
96
 
27
- def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper
28
- return self unless tags
97
+ #
98
+ # :call-seq:
99
+ # new(document) { |options| ... } → HTML4::DocumentFragment
100
+ # new(document, input) { |options| ... } → HTML4::DocumentFragment
101
+ # new(document, input, context:, options:) { |options| ... } → HTML4::DocumentFragment
102
+ #
103
+ # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment.
104
+ #
105
+ # 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather
106
+ # than call this method directly.
107
+ #
108
+ # [Required Parameters]
109
+ # - +document+ (HTML4::Document) The parent document to associate the returned fragment with.
110
+ #
111
+ # [Optional Parameters]
112
+ # - +input+ (String) The content to be parsed.
113
+ #
114
+ # [Optional Keyword Arguments]
115
+ # - +context:+ (Nokogiri::XML::Node) The <b>context node</b> for the subtree created. See
116
+ # below for more information.
117
+ #
118
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
119
+ # behaviors during parsing. See ParseOptions for more information. The default value is
120
+ # +ParseOptions::DEFAULT_HTML+.
121
+ #
122
+ # [Yields]
123
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
124
+ # can be configured before parsing. See ParseOptions for more information.
125
+ #
126
+ # [Returns] HTML4::DocumentFragment
127
+ #
128
+ # === Context \Node
129
+ #
130
+ # If a context node is specified using +context:+, then the fragment will be created by
131
+ # calling XML::Node#parse on that node, so the parser will behave as if that Node is the
132
+ # parent of the fragment subtree.
133
+ #
134
+ def initialize(
135
+ document, input = nil,
136
+ context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
137
+ context: context_, options: options_
138
+ ) # rubocop:disable Lint/MissingSuper
139
+ return self unless input
29
140
 
30
141
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
142
+ @parse_options = options
31
143
  yield options if block_given?
32
144
 
33
- if ctx
145
+ if context
34
146
  preexisting_errors = document.errors.dup
35
- node_set = ctx.parse("<div>#{tags}</div>", options)
147
+ node_set = context.parse("<div>#{input}</div>", options)
36
148
  node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
37
149
  self.errors = document.errors - preexisting_errors
38
150
  else
39
151
  # This is a horrible hack, but I don't care
40
- path = if /^\s*?<body/i.match?(tags)
152
+ path = if /^\s*?<body/i.match?(input)
41
153
  "/html/body"
42
154
  else
43
155
  "/html/body/node()"
44
156
  end
45
157
 
46
- temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
158
+ temp_doc = HTML4::Document.parse("<html><body>#{input}", nil, document.encoding, options)
47
159
  temp_doc.xpath(path).each { |child| child.parent = self }
48
160
  self.errors = temp_doc.errors
49
161
  end
@@ -26,7 +26,7 @@ module Nokogiri
26
26
 
27
27
  def initialize
28
28
  @encoding = nil
29
- super()
29
+ super
30
30
  end
31
31
 
32
32
  def start_element(name, attrs = [])
@@ -3,60 +3,45 @@
3
3
  module Nokogiri
4
4
  module HTML4
5
5
  ###
6
- # Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
6
+ # Nokogiri provides a SAX parser to process HTML4 which will provide HTML recovery
7
+ # ("autocorrection") features.
7
8
  #
8
9
  # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
9
10
  #
10
11
  # For more information on SAX parsers, see Nokogiri::XML::SAX
12
+ #
11
13
  module SAX
12
14
  ###
13
- # This class lets you perform SAX style parsing on HTML with HTML error correction.
15
+ # This parser is a SAX style parser that reads its input as it deems necessary. The parser
16
+ # takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an HTML input, sends
17
+ # messages to the Nokogiri::XML::SAX::Document.
18
+ #
19
+ # ⚠ This is an HTML4 parser and so may not support some HTML5 features and behaviors.
14
20
  #
15
21
  # Here is a basic usage example:
16
22
  #
17
- # class MyDoc < Nokogiri::XML::SAX::Document
23
+ # class MyHandler < Nokogiri::XML::SAX::Document
18
24
  # def start_element name, attributes = []
19
25
  # puts "found a #{name}"
20
26
  # end
21
27
  # end
22
28
  #
23
- # parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
24
- # parser.parse(File.read(ARGV[0], mode: 'rb'))
29
+ # parser = Nokogiri::HTML4::SAX::Parser.new(MyHandler.new)
30
+ #
31
+ # # Hand an IO object to the parser, which will read the HTML from the IO.
32
+ # File.open(path_to_html) do |f|
33
+ # parser.parse(f)
34
+ # end
35
+ #
36
+ # For more information on \SAX parsers, see Nokogiri::XML::SAX or the parent class
37
+ # Nokogiri::XML::SAX::Parser.
38
+ #
39
+ # Also see Nokogiri::XML::SAX::Document for the available events.
25
40
  #
26
- # For more information on SAX parsers, see Nokogiri::XML::SAX
27
41
  class Parser < Nokogiri::XML::SAX::Parser
28
- ###
29
- # Parse html stored in +data+ using +encoding+
30
- def parse_memory(data, encoding = "UTF-8")
31
- raise TypeError unless String === data
32
- return if data.empty?
33
-
34
- ctx = ParserContext.memory(data, encoding)
35
- yield ctx if block_given?
36
- ctx.parse_with(self)
37
- end
38
-
39
- ###
40
- # Parse given +io+
41
- def parse_io(io, encoding = "UTF-8")
42
- check_encoding(encoding)
43
- @encoding = encoding
44
- ctx = ParserContext.io(io, ENCODINGS[encoding])
45
- yield ctx if block_given?
46
- ctx.parse_with(self)
47
- end
48
-
49
- ###
50
- # Parse a file with +filename+
51
- def parse_file(filename, encoding = "UTF-8")
52
- raise ArgumentError unless filename
53
- raise Errno::ENOENT unless File.exist?(filename)
54
- raise Errno::EISDIR if File.directory?(filename)
55
-
56
- ctx = ParserContext.file(filename, encoding)
57
- yield ctx if block_given?
58
- ctx.parse_with(self)
59
- end
42
+ # This class inherits its behavior from Nokogiri::XML::SAX::Parser, but note that the superclass
43
+ # uses Nokogiri::ClassResolver to use HTML4::SAX::ParserContext as the context class for
44
+ # this class, which is where the real behavioral differences are implemented.
60
45
  end
61
46
  end
62
47
  end
@@ -4,16 +4,11 @@ module Nokogiri
4
4
  module HTML4
5
5
  module SAX
6
6
  ###
7
- # Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
8
- # you should be looking at Nokogiri::HTML4::SAX::Parser
7
+ # Context object to invoke the HTML4 SAX parser on the SAX::Document handler.
8
+ #
9
+ # 💡 This class is usually not instantiated by the user. Use Nokogiri::HTML4::SAX::Parser
10
+ # instead.
9
11
  class ParserContext < Nokogiri::XML::SAX::ParserContext
10
- def self.new(thing, encoding = "UTF-8")
11
- if [:read, :close].all? { |x| thing.respond_to?(x) }
12
- super
13
- else
14
- memory(thing, encoding)
15
- end
16
- end
17
12
  end
18
13
  end
19
14
  end
@@ -3,12 +3,9 @@
3
3
 
4
4
  module Nokogiri
5
5
  class << self
6
- # :call-seq:
7
- # HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
8
- #
9
- # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
10
- def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
11
- Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
6
+ # Convenience method for Nokogiri::HTML4::Document.parse
7
+ def HTML4(...)
8
+ Nokogiri::HTML4::Document.parse(...)
12
9
  end
13
10
  end
14
11
 
@@ -18,16 +15,14 @@ module Nokogiri
18
15
  # for parsing HTML.
19
16
  module HTML4
20
17
  class << self
21
- ###
22
- # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
23
- def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
24
- Document.parse(input, url, encoding, options, &block)
18
+ # Convenience method for Nokogiri::HTML4::Document.parse
19
+ def parse(...)
20
+ Document.parse(...)
25
21
  end
26
22
 
27
- ####
28
- # Parse a fragment from +string+ in to a NodeSet.
29
- def fragment(string, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
30
- HTML4::DocumentFragment.parse(string, encoding, options, &block)
23
+ # Convenience method for Nokogiri::HTML4::DocumentFragment.parse
24
+ def fragment(...)
25
+ HTML4::DocumentFragment.parse(...)
31
26
  end
32
27
  end
33
28
 
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML5
5
+ ###
6
+ # Nokogiri HTML5 builder is used for building HTML documents. It is very similar to the
7
+ # Nokogiri::XML::Builder. In fact, you should go read the documentation for
8
+ # Nokogiri::XML::Builder before reading this documentation.
9
+ #
10
+ # The construction behavior is identical to HTML4::Builder, but HTML5 documents implement the
11
+ # [HTML5 standard's serialization
12
+ # algorithm](https://www.w3.org/TR/2008/WD-html5-20080610/serializing.html).
13
+ #
14
+ # == Synopsis:
15
+ #
16
+ # Create an HTML5 document with a body that has an onload attribute, and a
17
+ # span tag with a class of "bold" that has content of "Hello world".
18
+ #
19
+ # builder = Nokogiri::HTML5::Builder.new do |doc|
20
+ # doc.html {
21
+ # doc.body(:onload => 'some_func();') {
22
+ # doc.span.bold {
23
+ # doc.text "Hello world"
24
+ # }
25
+ # }
26
+ # }
27
+ # end
28
+ # puts builder.to_html
29
+ #
30
+ # The HTML5 builder inherits from the XML builder, so make sure to read the
31
+ # Nokogiri::XML::Builder documentation.
32
+ class Builder < Nokogiri::XML::Builder
33
+ ###
34
+ # Convert the builder to HTML
35
+ def to_html
36
+ @doc.to_html
37
+ end
38
+ end
39
+ end
40
+ end
@@ -43,41 +43,69 @@ module Nokogiri
43
43
 
44
44
  # Get the parser's quirks mode value. See HTML5::QuirksMode.
45
45
  #
46
- # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::Document.new`).
46
+ # This method returns +nil+ if the parser was not invoked (e.g., Nokogiri::HTML5::Document.new).
47
47
  #
48
48
  # Since v1.14.0
49
49
  attr_reader :quirks_mode
50
50
 
51
51
  class << self
52
52
  # :call-seq:
53
- # parse(input)
54
- # parse(input, url=nil, encoding=nil, **options)
55
- # parse(input, url=nil, encoding=nil) { |options| ... }
53
+ # parse(input) { |options| ... } → HTML5::Document
54
+ # parse(input, url:, encoding:) { |options| ... } → HTML5::Document
55
+ # parse(input, **options) → HTML5::Document
56
56
  #
57
- # Parse HTML5 input.
57
+ # Parse \HTML input with a parser compliant with the HTML5 spec. This method uses the
58
+ # encoding of +input+ if it can be determined, or else falls back to the +encoding:+
59
+ # parameter.
58
60
  #
59
- # [Parameters]
60
- # - +input+ may be a String, or any object that responds to _read_ and _close_ such as an
61
- # IO, or StringIO.
61
+ # [Required Parameters]
62
+ # - +input+ (String | IO) the \HTML content to be parsed.
62
63
  #
63
- # - +url+ (optional) is a String indicating the canonical URI where this document is located.
64
+ # [Optional Parameters]
65
+ # - +url:+ (String) the base URI of the document.
64
66
  #
65
- # - +encoding+ (optional) is the encoding that should be used when processing
66
- # the document.
67
+ # [Optional Keyword Arguments]
68
+ # - +encoding:+ (Encoding) The name of the encoding that should be used when processing the
69
+ # document. When not provided, the encoding will be determined based on the document
70
+ # content.
67
71
  #
68
- # - +options+ (optional) is a configuration Hash (or keyword arguments) to set options
69
- # during parsing. The three currently supported options are +:max_errors+,
70
- # +:max_tree_depth+ and +:max_attributes+, described at Nokogiri::HTML5.
72
+ # - +max_errors:+ (Integer) The maximum number of parse errors to record. (default
73
+ # +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0)
71
74
  #
72
- # Note that these options are different than those made available by
73
- # Nokogiri::XML::Document and Nokogiri::HTML4::Document.
75
+ # - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default
76
+ # +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+)
74
77
  #
75
- # - +block+ (optional) is passed a configuration Hash on which parse options may be set. See
76
- # Nokogiri::HTML5 for more information and usage.
78
+ # - +max_attributes:+ (Integer) The maximum number of attributes allowed on an
79
+ # element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+)
80
+ #
81
+ # - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+
82
+ # elements as text. (default +false+)
83
+ #
84
+ # See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options.
85
+ #
86
+ # [Yields]
87
+ # If present, the block will be passed a Hash object to modify with parse options before the
88
+ # input is parsed. See rdoc-ref:HTML5@Parsing+options for a list of available options.
89
+ #
90
+ # ⚠ Note that +url:+ and +encoding:+ cannot be set by the configuration block.
77
91
  #
78
92
  # [Returns] Nokogiri::HTML5::Document
79
93
  #
80
- def parse(string_or_io, url = nil, encoding = nil, **options, &block)
94
+ # *Example:* Parse input from an IO with a specific encoding and custom max errors limit.
95
+ #
96
+ # Nokogiri::HTML5::Document.parse(socket, encoding: "ISO-8859-1", max_errors: 10)
97
+ #
98
+ # *Example:* Parse a string setting the +:parse_noscript_content_as_text+ option using the
99
+ # configuration block parameter.
100
+ #
101
+ # Nokogiri::HTML5::Document.parse(input) { |c| c[:parse_noscript_content_as_text] = true }
102
+ #
103
+ def parse(
104
+ string_or_io,
105
+ url_ = nil, encoding_ = nil,
106
+ url: url_, encoding: encoding_,
107
+ **options, &block
108
+ )
81
109
  yield options if block
82
110
  string_or_io = "" unless string_or_io
83
111
 
@@ -92,35 +120,37 @@ module Nokogiri
92
120
  raise ArgumentError, "not a string or IO object"
93
121
  end
94
122
 
95
- do_parse(string_or_io, url, encoding, options)
123
+ do_parse(string_or_io, url, encoding, **options)
96
124
  end
97
125
 
98
126
  # Create a new document from an IO object.
99
127
  #
100
128
  # 💡 Most users should prefer Document.parse to this method.
101
- def read_io(io, url = nil, encoding = nil, **options)
129
+ def read_io(io, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
102
130
  raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
103
131
 
104
- do_parse(io, url, encoding, options)
132
+ do_parse(io, url, encoding, **options)
105
133
  end
106
134
 
107
135
  # Create a new document from a String.
108
136
  #
109
137
  # 💡 Most users should prefer Document.parse to this method.
110
- def read_memory(string, url = nil, encoding = nil, **options)
138
+ def read_memory(string, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
111
139
  raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
112
140
 
113
- do_parse(string, url, encoding, options)
141
+ do_parse(string, url, encoding, **options)
114
142
  end
115
143
 
116
144
  private
117
145
 
118
- def do_parse(string_or_io, url, encoding, options)
146
+ def do_parse(string_or_io, url, encoding, **options)
119
147
  string = HTML5.read_and_encode(string_or_io, encoding)
120
- max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
121
- max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
122
- max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
123
- doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
148
+
149
+ options[:max_attributes] ||= Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
150
+ options[:max_errors] ||= options.delete(:max_parse_errors) || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
151
+ options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
152
+
153
+ doc = Nokogiri::Gumbo.parse(string, url, self, **options)
124
154
  doc.encoding = "UTF-8"
125
155
  doc
126
156
  end
@@ -142,7 +172,8 @@ module Nokogiri
142
172
  # - +markup+ (String) The HTML5 markup fragment to be parsed
143
173
  #
144
174
  # [Returns]
145
- # Nokogiri::HTML5::DocumentFragment. This object's children will be empty if `markup` is not passed, is empty, or is `nil`.
175
+ # Nokogiri::HTML5::DocumentFragment. This object's children will be empty if +markup+ is not
176
+ # passed, is empty, or is +nil+.
146
177
  #
147
178
  def fragment(markup = nil)
148
179
  DocumentFragment.new(self, markup)