nokogiri 1.11.7 → 1.12.0.rc1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE-DEPENDENCIES.md +243 -22
  3. data/LICENSE.md +1 -1
  4. data/README.md +6 -5
  5. data/ext/nokogiri/depend +35 -34
  6. data/ext/nokogiri/extconf.rb +181 -103
  7. data/ext/nokogiri/gumbo.c +611 -0
  8. data/ext/nokogiri/{html_document.c → html4_document.c} +8 -8
  9. data/ext/nokogiri/{html_element_description.c → html4_element_description.c} +20 -18
  10. data/ext/nokogiri/{html_entity_lookup.c → html4_entity_lookup.c} +7 -7
  11. data/ext/nokogiri/{html_sax_parser_context.c → html4_sax_parser_context.c} +5 -5
  12. data/ext/nokogiri/{html_sax_push_parser.c → html4_sax_push_parser.c} +4 -4
  13. data/ext/nokogiri/libxml2_backwards_compat.c +30 -30
  14. data/ext/nokogiri/nokogiri.c +51 -38
  15. data/ext/nokogiri/nokogiri.h +16 -9
  16. data/ext/nokogiri/xml_document.c +13 -13
  17. data/ext/nokogiri/xml_element_content.c +2 -0
  18. data/ext/nokogiri/xml_encoding_handler.c +11 -6
  19. data/ext/nokogiri/xml_namespace.c +2 -0
  20. data/ext/nokogiri/xml_node.c +102 -102
  21. data/ext/nokogiri/xml_node_set.c +20 -20
  22. data/ext/nokogiri/xml_reader.c +2 -0
  23. data/ext/nokogiri/xml_sax_parser.c +6 -6
  24. data/ext/nokogiri/xml_sax_parser_context.c +2 -0
  25. data/ext/nokogiri/xml_schema.c +2 -0
  26. data/ext/nokogiri/xml_xpath_context.c +67 -65
  27. data/ext/nokogiri/xslt_stylesheet.c +2 -1
  28. data/gumbo-parser/CHANGES.md +63 -0
  29. data/gumbo-parser/Makefile +101 -0
  30. data/gumbo-parser/THANKS +27 -0
  31. data/gumbo-parser/src/Makefile +17 -0
  32. data/gumbo-parser/src/README.md +41 -0
  33. data/gumbo-parser/src/ascii.c +75 -0
  34. data/gumbo-parser/src/ascii.h +115 -0
  35. data/gumbo-parser/src/attribute.c +42 -0
  36. data/gumbo-parser/src/attribute.h +17 -0
  37. data/gumbo-parser/src/char_ref.c +22225 -0
  38. data/gumbo-parser/src/char_ref.h +29 -0
  39. data/gumbo-parser/src/char_ref.rl +2154 -0
  40. data/gumbo-parser/src/error.c +626 -0
  41. data/gumbo-parser/src/error.h +148 -0
  42. data/gumbo-parser/src/foreign_attrs.c +104 -0
  43. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  44. data/gumbo-parser/src/gumbo.h +943 -0
  45. data/gumbo-parser/src/insertion_mode.h +33 -0
  46. data/gumbo-parser/src/macros.h +91 -0
  47. data/gumbo-parser/src/parser.c +4886 -0
  48. data/gumbo-parser/src/parser.h +41 -0
  49. data/gumbo-parser/src/replacement.h +33 -0
  50. data/gumbo-parser/src/string_buffer.c +103 -0
  51. data/gumbo-parser/src/string_buffer.h +68 -0
  52. data/gumbo-parser/src/string_piece.c +48 -0
  53. data/gumbo-parser/src/svg_attrs.c +174 -0
  54. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  55. data/gumbo-parser/src/svg_tags.c +137 -0
  56. data/gumbo-parser/src/svg_tags.gperf +55 -0
  57. data/gumbo-parser/src/tag.c +222 -0
  58. data/gumbo-parser/src/tag_lookup.c +382 -0
  59. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  60. data/gumbo-parser/src/tag_lookup.h +13 -0
  61. data/gumbo-parser/src/token_buffer.c +79 -0
  62. data/gumbo-parser/src/token_buffer.h +71 -0
  63. data/gumbo-parser/src/token_type.h +17 -0
  64. data/gumbo-parser/src/tokenizer.c +3463 -0
  65. data/gumbo-parser/src/tokenizer.h +112 -0
  66. data/gumbo-parser/src/tokenizer_states.h +339 -0
  67. data/gumbo-parser/src/utf8.c +245 -0
  68. data/gumbo-parser/src/utf8.h +164 -0
  69. data/gumbo-parser/src/util.c +68 -0
  70. data/gumbo-parser/src/util.h +30 -0
  71. data/gumbo-parser/src/vector.c +111 -0
  72. data/gumbo-parser/src/vector.h +45 -0
  73. data/lib/nokogiri.rb +31 -29
  74. data/lib/nokogiri/css.rb +14 -14
  75. data/lib/nokogiri/css/parser.rb +1 -1
  76. data/lib/nokogiri/css/parser.y +1 -1
  77. data/lib/nokogiri/css/syntax_error.rb +1 -1
  78. data/lib/nokogiri/extension.rb +2 -2
  79. data/lib/nokogiri/gumbo.rb +14 -0
  80. data/lib/nokogiri/html.rb +31 -27
  81. data/lib/nokogiri/html4.rb +40 -0
  82. data/lib/nokogiri/{html → html4}/builder.rb +2 -2
  83. data/lib/nokogiri/{html → html4}/document.rb +4 -4
  84. data/lib/nokogiri/{html → html4}/document_fragment.rb +3 -3
  85. data/lib/nokogiri/{html → html4}/element_description.rb +1 -1
  86. data/lib/nokogiri/{html → html4}/element_description_defaults.rb +1 -1
  87. data/lib/nokogiri/{html → html4}/entity_lookup.rb +1 -1
  88. data/lib/nokogiri/{html → html4}/sax/parser.rb +11 -14
  89. data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
  90. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +5 -5
  91. data/lib/nokogiri/html5.rb +473 -0
  92. data/lib/nokogiri/html5/document.rb +74 -0
  93. data/lib/nokogiri/html5/document_fragment.rb +80 -0
  94. data/lib/nokogiri/html5/node.rb +93 -0
  95. data/lib/nokogiri/version/constant.rb +1 -1
  96. data/lib/nokogiri/version/info.rb +11 -2
  97. data/lib/nokogiri/xml.rb +35 -36
  98. data/lib/nokogiri/xml/node.rb +6 -5
  99. data/lib/nokogiri/xml/parse_options.rb +2 -0
  100. data/lib/nokogiri/xml/pp.rb +2 -2
  101. data/lib/nokogiri/xml/sax.rb +4 -4
  102. data/lib/nokogiri/xml/sax/document.rb +24 -30
  103. data/lib/nokogiri/xml/xpath.rb +2 -2
  104. data/lib/nokogiri/xslt.rb +16 -16
  105. data/lib/nokogiri/xslt/stylesheet.rb +1 -1
  106. metadata +102 -60
  107. data/lib/nokogiri/html/sax/parser_context.rb +0 -17
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ class << self
4
+ ###
5
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
6
+ def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
7
+ Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
8
+ end
9
+ end
10
+
11
+ # @since v1.12.0
12
+ # @note Before v1.12.0, {Nokogiri::HTML4} did not exist, and {Nokogiri::HTML} was the module/namespace for parsing HTML.
13
+ module HTML4
14
+ class << self
15
+ ###
16
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
17
+ def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
18
+ Document.parse(input, url, encoding, options, &block)
19
+ end
20
+
21
+ ####
22
+ # Parse a fragment from +string+ into a Nokogiri::HTML4::DocumentFragment.
23
+ def fragment(string, encoding = nil)
24
+ HTML4::DocumentFragment.parse(string, encoding)
25
+ end
26
+ end
27
+
28
+ # Instance of Nokogiri::HTML4::EntityLookup
29
+ NamedCharacters = EntityLookup.new
30
+ end
31
+ end
32
+
33
+ require_relative "html4/entity_lookup"
34
+ require_relative "html4/document"
35
+ require_relative "html4/document_fragment"
36
+ require_relative "html4/sax/parser_context"
37
+ require_relative "html4/sax/parser"
38
+ require_relative "html4/sax/push_parser"
39
+ require_relative "html4/element_description"
40
+ require_relative "html4/element_description_defaults"
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  ###
5
5
  # Nokogiri HTML builder is used for building HTML documents. It is very
6
6
  # similar to the Nokogiri::XML::Builder. In fact, you should go read the
@@ -12,7 +12,7 @@ module Nokogiri
12
12
  # Create an HTML document with a body that has an onload attribute, and a
13
13
  # span tag with a class of "bold" that has content of "Hello world".
14
14
  #
15
- # builder = Nokogiri::HTML::Builder.new do |doc|
15
+ # builder = Nokogiri::HTML4::Builder.new do |doc|
16
16
  # doc.html {
17
17
  # doc.body(:onload => 'some_func();') {
18
18
  # doc.span.bold {
@@ -3,7 +3,7 @@
3
3
  require 'pathname'
4
4
 
5
5
  module Nokogiri
6
- module HTML
6
+ module HTML4
7
7
  class Document < Nokogiri::XML::Document
8
8
  ###
9
9
  # Get the meta tag encoding for this document. If there is no meta tag,
@@ -268,12 +268,12 @@ module Nokogiri
268
268
  m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
269
269
  return m[4]
270
270
  catch(:encoding_found) {
271
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
271
+ Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
272
272
  nil
273
273
  }
274
274
  else
275
275
  handler = SAXHandler.new
276
- parser = Nokogiri::HTML::SAX::PushParser.new(handler)
276
+ parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
277
277
  parser << chunk rescue Nokogiri::SyntaxError
278
278
  handler.encoding
279
279
  end
@@ -286,7 +286,7 @@ module Nokogiri
286
286
  end
287
287
 
288
288
  # This method is used by the C extension so that
289
- # Nokogiri::HTML::Document#read_io() does not leak memory when
289
+ # Nokogiri::HTML4::Document#read_io() does not leak memory when
290
290
  # EncodingFound is raised.
291
291
  attr_reader :encoding_found
292
292
 
@@ -1,11 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  class DocumentFragment < Nokogiri::XML::DocumentFragment
5
5
  ####
6
6
  # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
7
7
  def self.parse(tags, encoding = nil)
8
- doc = HTML::Document.new
8
+ doc = HTML4::Document.new
9
9
 
10
10
  encoding ||= if tags.respond_to?(:encoding)
11
11
  encoding = tags.encoding
@@ -39,7 +39,7 @@ module Nokogiri
39
39
  "/html/body/node()"
40
40
  end
41
41
 
42
- temp_doc = HTML::Document.parse("<html><body>#{tags}", nil, document.encoding)
42
+ temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding)
43
43
  temp_doc.xpath(path).each { |child| child.parent = self }
44
44
  self.errors = temp_doc.errors
45
45
  end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  class ElementDescription
5
5
  ###
6
6
  # Is this element a block element?
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  class ElementDescription
5
5
 
6
6
  # Methods are defined protected by method_defined? because at
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  class EntityDescription < Struct.new(:value, :name, :description); end
5
5
 
6
6
  class EntityLookup
@@ -1,18 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  ###
5
- # Nokogiri lets you write a SAX parser to process HTML but get HTML
6
- # correction features.
5
+ # Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
7
6
  #
8
- # See Nokogiri::HTML::SAX::Parser for a basic example of using a
9
- # SAX parser with HTML.
7
+ # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
10
8
  #
11
9
  # For more information on SAX parsers, see Nokogiri::XML::SAX
12
10
  module SAX
13
11
  ###
14
- # This class lets you perform SAX style parsing on HTML with HTML
15
- # error correction.
12
+ # This class lets you perform SAX style parsing on HTML with HTML error correction.
16
13
  #
17
14
  # Here is a basic usage example:
18
15
  #
@@ -22,40 +19,40 @@ module Nokogiri
22
19
  # end
23
20
  # end
24
21
  #
25
- # parser = Nokogiri::HTML::SAX::Parser.new(MyDoc.new)
22
+ # parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
26
23
  # parser.parse(File.read(ARGV[0], mode: 'rb'))
27
24
  #
28
25
  # For more information on SAX parsers, see Nokogiri::XML::SAX
29
26
  class Parser < Nokogiri::XML::SAX::Parser
30
27
  ###
31
28
  # Parse html stored in +data+ using +encoding+
32
- def parse_memory data, encoding = 'UTF-8'
29
+ def parse_memory(data, encoding = "UTF-8")
33
30
  raise ArgumentError unless data
34
31
  return unless data.length > 0
35
32
  ctx = ParserContext.memory(data, encoding)
36
33
  yield ctx if block_given?
37
- ctx.parse_with self
34
+ ctx.parse_with(self)
38
35
  end
39
36
 
40
37
  ###
41
38
  # Parse given +io+
42
- def parse_io io, encoding = 'UTF-8'
39
+ def parse_io(io, encoding = "UTF-8")
43
40
  check_encoding(encoding)
44
41
  @encoding = encoding
45
42
  ctx = ParserContext.io(io, ENCODINGS[encoding])
46
43
  yield ctx if block_given?
47
- ctx.parse_with self
44
+ ctx.parse_with(self)
48
45
  end
49
46
 
50
47
  ###
51
48
  # Parse a file with +filename+
52
- def parse_file filename, encoding = 'UTF-8'
49
+ def parse_file(filename, encoding = "UTF-8")
53
50
  raise ArgumentError unless filename
54
51
  raise Errno::ENOENT unless File.exist?(filename)
55
52
  raise Errno::EISDIR if File.directory?(filename)
56
53
  ctx = ParserContext.file(filename, encoding)
57
54
  yield ctx if block_given?
58
- ctx.parse_with self
55
+ ctx.parse_with(self)
59
56
  end
60
57
  end
61
58
  end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ module HTML4
4
+ module SAX
5
+ ###
6
+ # Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
7
+ # you should be looking at Nokogiri::HTML4::SAX::Parser
8
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
9
+ def self.new(thing, encoding = "UTF-8")
10
+ if [:read, :close].all? { |x| thing.respond_to?(x) }
11
+ super
12
+ else
13
+ memory(thing, encoding)
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -1,17 +1,17 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  module SAX
5
5
  class PushParser
6
6
 
7
- # The Nokogiri::HTML::SAX::Document on which the PushParser will be
7
+ # The Nokogiri::HTML4::SAX::Document on which the PushParser will be
8
8
  # operating
9
9
  attr_accessor :document
10
10
 
11
- def initialize(doc = HTML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
11
+ def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
12
12
  @document = doc
13
13
  @encoding = encoding
14
- @sax_parser = HTML::SAX::Parser.new(doc, @encoding)
14
+ @sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
15
15
 
16
16
  ## Create our push parser context
17
17
  initialize_native(@sax_parser, file_name, encoding)
@@ -27,7 +27,7 @@ module Nokogiri
27
27
 
28
28
  ###
29
29
  # Finish the parsing. This method is only necessary for
30
- # Nokogiri::HTML::SAX::Document#end_document to be called.
30
+ # Nokogiri::HTML4::SAX::Document#end_document to be called.
31
31
  def finish
32
32
  write '', true
33
33
  end
@@ -0,0 +1,473 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+ #
4
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ #
18
+
19
+ require_relative 'html5/document'
20
+ require_relative 'html5/document_fragment'
21
+ require_relative 'html5/node'
22
+
23
+ module Nokogiri
24
+ # @since v1.12.0
25
+ # @note HTML5 functionality is not available when running JRuby.
26
+ # Parse an HTML5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
27
+ def self.HTML5(input, url = nil, encoding = nil, **options, &block)
28
+ Nokogiri::HTML5::Document.parse(input, url, encoding, **options, &block)
29
+ end
30
+
31
+ # == Usage
32
+ #
33
+ # Parse an HTML5 document:
34
+ #
35
+ # doc = Nokogiri.HTML5(string)
36
+ #
37
+ # Parse an HTML5 fragment:
38
+ #
39
+ # fragment = Nokogiri::HTML5.fragment(string)
40
+ #
41
+ # == Parsing options
42
+ #
43
+ # The document and fragment parsing methods support options that are different from Nokogiri's.
44
+ #
45
+ # - <tt>Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})</tt>
46
+ # - <tt>Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})</tt>
47
+ # - <tt>Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})</tt>
48
+ # - <tt>Nokogiri::HTML5.fragment(html, encoding = nil, options = {})</tt>
49
+ # - <tt>Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})</tt>
50
+ #
51
+ # The three currently supported options are +:max_errors+, +:max_tree_depth+ and
52
+ # +:max_attributes+, described below.
53
+ #
54
+ # === Error reporting
55
+ #
56
+ # Nokogumbo contains an experimental parse error reporting facility. By default, no parse errors
57
+ # are reported but this can be configured by passing the +:max_errors+ option to {HTML5.parse} or
58
+ # {HTML5.fragment}.
59
+ #
60
+ # For example, this script:
61
+ #
62
+ # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
63
+ # doc.errors.each do |err|
64
+ # puts(err)
65
+ # end
66
+ #
67
+ # Emits:
68
+ #
69
+ # 1:1: ERROR: Expected a doctype token
70
+ # <span/>Hi there!</span foo=bar />
71
+ # ^
72
+ # 1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'.
73
+ # <span/>Hi there!</span foo=bar />
74
+ # ^
75
+ # 1:17: ERROR: End tag ends with '/>', use '>'.
76
+ # <span/>Hi there!</span foo=bar />
77
+ # ^
78
+ # 1:17: ERROR: End tag contains attributes.
79
+ # <span/>Hi there!</span foo=bar />
80
+ # ^
81
+ #
82
+ # Using <tt>max_errors: -1</tt> results in an unlimited number of errors being returned.
83
+ #
84
+ # The errors returned by {HTML5::Document#errors} are instances of {Nokogiri::XML::SyntaxError}.
85
+ #
86
+ # The {https://html.spec.whatwg.org/multipage/parsing.html#parse-errors HTML standard} defines a
87
+ # number of standard parse error codes. These error codes only cover the "tokenization" stage of
88
+ # parsing HTML. The parse errors in the "tree construction" stage do not have standardized error
89
+ # codes (yet).
90
+ #
91
+ # As a convenience to Nokogumbo users, the defined error codes are available via
92
+ # {Nokogiri::XML::SyntaxError#str1} method.
93
+ #
94
+ # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
95
+ # doc.errors.each do |err|
96
+ # puts("#{err.line}:#{err.column}: #{err.str1}")
97
+ # end
98
+ # # => 1:1: generic-parser
99
+ # # 1:1: non-void-html-element-start-tag-with-trailing-solidus
100
+ # # 1:17: end-tag-with-trailing-solidus
101
+ # # 1:17: end-tag-with-attributes
102
+ #
103
+ # Note that the first error is +generic-parser+ because it's an error from the tree construction
104
+ # stage and doesn't have a standardized error code.
105
+ #
106
+ # For the purposes of semantic versioning, the error messages, error locations, and error codes
107
+ # are not part of Nokogumbo's public API. That is, these are subject to change without Nokogumbo's
108
+ # major version number changing. These may be stabilized in the future.
109
+ #
110
+ # === Maximum tree depth
111
+ #
112
+ # The maximum depth of the DOM tree parsed by the various parsing methods is configurable by the
113
+ # +:max_tree_depth+ option. If the depth of the tree would exceed this limit, then an
114
+ # {::ArgumentError} is raised.
115
+ #
116
+ # This limit (which defaults to <tt>Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400</tt>) can be removed
117
+ # by giving the option <tt>max_tree_depth: -1</tt>.
118
+ #
119
+ # html = '<!DOCTYPE html>' + '<div>' * 1000
120
+ # doc = Nokogiri.HTML5(html)
121
+ # # raises ArgumentError: Document tree depth limit exceeded
122
+ # doc = Nokogiri.HTML5(html, max_tree_depth: -1)
123
+ #
124
+ # === Attribute limit per element
125
+ #
126
+ # The maximum number of attributes per DOM element is configurable by the +:max_attributes+
127
+ # option. If a given element would exceed this limit, then an {::ArgumentError} is raised.
128
+ #
129
+ # This limit (which defaults to <tt>Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400</tt>) can be removed
130
+ # by giving the option <tt>max_attributes: -1</tt>.
131
+ #
132
+ # html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
133
+ # # "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
134
+ # doc = Nokogiri.HTML5(html)
135
+ # # raises ArgumentError: Attributes per element limit exceeded
136
+ # doc = Nokogiri.HTML5(html, max_attributes: -1)
137
+ #
138
+ # == HTML Serialization
139
+ #
140
+ # After parsing HTML, it may be serialized using any of the {Nokogiri::XML::Node} serialization
141
+ # methods. In particular, {XML::Node#serialize}, {XML::Node#to_html}, and {XML::Node#to_s} will
142
+ # serialize a given node and its children. (This is the equivalent of JavaScript's
143
+ # +Element.outerHTML+.) Similarly, {XML::Node#inner_html} will serialize the children of a given
144
+ # node. (This is the equivalent of JavaScript's +Element.innerHTML+.)
145
+ #
146
+ # doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
147
+ # puts doc.serialize
148
+ # # => <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
149
+ #
150
+ # Due to quirks in how HTML is parsed and serialized, it's possible for a DOM tree to be
151
+ # serialized and then re-parsed, resulting in a different DOM. Mostly, this happens with DOMs
152
+ # produced from invalid HTML. Unfortunately, even valid HTML may not survive serialization and
153
+ # re-parsing.
154
+ #
155
+ # In particular, a newline at the start of +pre+, +listing+, and +textarea+ elements is ignored by
156
+ # the parser.
157
+ #
158
+ # doc = Nokogiri::HTML5(<<-EOF)
159
+ # <!DOCTYPE html>
160
+ # <pre>
161
+ # Content</pre>
162
+ # EOF
163
+ # puts doc.at('/html/body/pre').serialize
164
+ # # => <pre>Content</pre>
165
+ #
166
+ # In this case, the original HTML is semantically equivalent to the serialized version. If the
167
+ # +pre+, +listing+, or +textarea+ content starts with two newlines, the first newline will be
168
+ # stripped on the first parse and the second newline will be stripped on the second, leading to
169
+ # semantically different DOMs. Passing the parameter <tt>preserve_newline: true</tt> will cause
170
+ # two or more newlines to be preserved. (A single leading newline will still be removed.)
171
+ #
172
+ # doc = Nokogiri::HTML5(<<-EOF)
173
+ # <!DOCTYPE html>
174
+ # <listing>
175
+ #
176
+ # Content</listing>
177
+ # EOF
178
+ # puts doc.at('/html/body/listing').serialize(preserve_newline: true)
179
+ # # => <listing>
180
+ # #
181
+ # # Content</listing>
182
+ #
183
+ # == Encodings
184
+ #
185
+ # Nokogumbo always parses HTML using {https://en.wikipedia.org/wiki/UTF-8 UTF-8}; however, the
186
+ # encoding of the input can be explicitly selected via the optional +encoding+ parameter. This is
187
+ # most useful when the input comes not from a string but from an IO object.
188
+ #
189
+ # When serializing a document or node, the encoding of the output string can be specified via the
190
+ # +:encoding+ option. Characters that cannot be encoded in the selected encoding will be encoded
191
+ # as {https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references HTML numeric
192
+ # entities}.
193
+ #
194
+ # frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
195
+ # html = frag.serialize(encoding: 'US-ASCII')
196
+ # puts html
197
+ # # => <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
198
+ # frag = Nokogiri::HTML5.fragment(html)
199
+ # puts frag.serialize
200
+ # # => <span>아는 길도 물어가라</span>
201
+ #
202
+ # (There's a {https://bugs.ruby-lang.org/issues/15033 bug} in all current versions of Ruby that
203
+ # can cause the entity encoding to fail. Of the mandated supported encodings for HTML, the only
204
+ # encoding I'm aware of that has this bug is <tt>'ISO-2022-JP'</tt>. We recommend avoiding this
205
+ # encoding.)
206
+ #
207
+ # == Notes
208
+ #
209
+ # * The {Nokogiri::HTML5.fragment} function takes a string and parses it
210
+ # as an HTML5 document. The +<html>+, +<head>+, and +<body>+ elements are
211
+ # removed from this document, and any children of these elements that remain
212
+ # are returned as a {Nokogiri::HTML5::DocumentFragment}.
213
+ #
214
+ # * The {Nokogiri::HTML5.parse} function takes a string and passes it to the
215
+ # <code>gumbo_parse_with_options</code> method, using the default options.
216
+ # The resulting Gumbo parse tree is then walked.
217
+ #
218
+ # * Instead of uppercase element names, lowercase element names are produced.
219
+ #
220
+ # * Instead of returning +unknown+ as the element name for unknown tags, the
221
+ # original tag name is returned verbatim.
222
+ #
223
+ # @since v1.12.0
224
+ # @note HTML5 functionality is not available when running JRuby.
225
+ module HTML5
226
+ # HTML uses the XHTML namespace.
227
+ HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'.freeze
228
+ MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'.freeze
229
+ SVG_NAMESPACE = 'http://www.w3.org/2000/svg'.freeze
230
+ XLINK_NAMESPACE = 'http://www.w3.org/1999/xlink'.freeze
231
+ XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'.freeze
232
+ XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'.freeze
233
+
234
+ # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
235
+ def self.parse(string, url = nil, encoding = nil, **options, &block)
236
+ Document.parse(string, url, encoding, **options, &block)
237
+ end
238
+
239
+ # Parse a fragment from +string+. Convenience method for
240
+ # {Nokogiri::HTML5::DocumentFragment.parse}.
241
+ def self.fragment(string, encoding = nil, **options)
242
+ DocumentFragment.parse(string, encoding, options)
243
+ end
244
+
245
+ # Fetch and parse an HTML document from the web, following redirects,
246
+ # handling https, and determining the character encoding using HTML5
247
+ # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
248
+ # http headers and special options. Everything which is not a
249
+ # special option is considered a header. Special options include:
250
+ # * :follow_limit => number of redirects which are followed
251
+ # * :basic_auth => [username, password]
252
+ def self.get(uri, options={})
253
+ warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
254
+ uplevel: 1, category: :deprecated)
255
+ get_impl(uri, options)
256
+ end
257
+
258
+ private
259
+
260
+ def self.get_impl(uri, options={})
261
+ headers = options.clone
262
+ headers = {:follow_limit => headers} if Numeric === headers # deprecated
263
+ limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
264
+
265
+ require 'net/http'
266
+ uri = URI(uri) unless URI === uri
267
+
268
+ http = Net::HTTP.new(uri.host, uri.port)
269
+
270
+ # TLS / SSL support
271
+ http.use_ssl = true if uri.scheme == 'https'
272
+
273
+ # Pass through Net::HTTP override values, which currently include:
274
+ # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
275
+ # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
276
+ # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
277
+ # :verify_callback, :verify_depth, :verify_mode
278
+ options.each do |key, value|
279
+ http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
280
+ end
281
+
282
+ request = Net::HTTP::Get.new(uri.request_uri)
283
+
284
+ # basic authentication
285
+ auth = headers.delete(:basic_auth)
286
+ auth ||= [uri.user, uri.password] if uri.user && uri.password
287
+ request.basic_auth auth.first, auth.last if auth
288
+
289
+ # remaining options are treated as headers
290
+ headers.each {|key, value| request[key.to_s] = value.to_s}
291
+
292
+ response = http.request(request)
293
+
294
+ case response
295
+ when Net::HTTPSuccess
296
+ doc = parse(reencode(response.body, response['content-type']), options)
297
+ doc.instance_variable_set('@response', response)
298
+ doc.class.send(:attr_reader, :response)
299
+ doc
300
+ when Net::HTTPRedirection
301
+ response.value if limit <= 1
302
+ location = URI.join(uri, response['location'])
303
+ get_impl(location, options.merge(:follow_limit => limit-1))
304
+ else
305
+ response.value
306
+ end
307
+ end
308
+
309
+ def self.read_and_encode(string, encoding)
310
+ # Read the string with the given encoding.
311
+ if string.respond_to?(:read)
312
+ if encoding.nil?
313
+ string = string.read
314
+ else
315
+ string = string.read(encoding: encoding)
316
+ end
317
+ else
318
+ # Otherwise the string has the given encoding.
319
+ string = string.to_s
320
+ if encoding
321
+ string = string.dup
322
+ string.force_encoding(encoding)
323
+ end
324
+ end
325
+
326
+ # convert to UTF-8
327
+ if string.encoding != Encoding::UTF_8
328
+ string = reencode(string)
329
+ end
330
+ string
331
+ end
332
+
333
+ # Charset sniffing is a complex and controversial topic that understandably isn't done _by
334
+ # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
335
+ # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
336
+ # the Gumbo parser *only* supports utf-8.
337
+ #
338
+ # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
339
+ # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
340
+ # the HTML5 standard.
341
+ #
342
+ # http://bugs.ruby-lang.org/issues/2567
343
+ # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
344
+ #
345
+ def self.reencode(body, content_type=nil)
346
+ if body.encoding == Encoding::ASCII_8BIT
347
+ encoding = nil
348
+
349
+ # look for a Byte Order Mark (BOM)
350
+ initial_bytes = body[0..2].bytes
351
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
352
+ encoding = Encoding::UTF_8
353
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
354
+ encoding = Encoding::UTF_16BE
355
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
356
+ encoding = Encoding::UTF_16LE
357
+ end
358
+
359
+ # look for a charset in the Content-Type header
360
+ if content_type
361
+ encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
362
+ end
363
+
364
+ # look for a charset in a meta tag in the first 1024 bytes
365
+ if not encoding
366
+ data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
367
+ data.scan(/<meta.*?>/m).each do |meta|
368
+ encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
369
+ end
370
+ end
371
+
372
+ # if all else fails, default to the official default encoding for HTML
373
+ encoding ||= Encoding::ISO_8859_1
374
+
375
+ # change the encoding to match the detected or inferred encoding
376
+ body = body.dup
377
+ begin
378
+ body.force_encoding(encoding)
379
+ rescue ArgumentError
380
+ body.force_encoding(Encoding::ISO_8859_1)
381
+ end
382
+ end
383
+
384
+ body.encode(Encoding::UTF_8)
385
+ end
386
+
387
+ def self.serialize_node_internal(current_node, io, encoding, options)
388
+ case current_node.type
389
+ when XML::Node::ELEMENT_NODE
390
+ ns = current_node.namespace
391
+ ns_uri = ns.nil? ? nil : ns.href
392
+ # XXX(sfc): attach namespaces to all nodes, even html?
393
+ if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
394
+ tagname = current_node.name
395
+ else
396
+ tagname = "#{ns.prefix}:#{current_node.name}"
397
+ end
398
+ io << '<' << tagname
399
+ current_node.attribute_nodes.each do |attr|
400
+ attr_ns = attr.namespace
401
+ if attr_ns.nil?
402
+ attr_name = attr.name
403
+ else
404
+ ns_uri = attr_ns.href
405
+ if ns_uri == XML_NAMESPACE
406
+ attr_name = 'xml:' + attr.name.sub(/^[^:]*:/, '')
407
+ elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, '') == 'xmlns'
408
+ attr_name = 'xmlns'
409
+ elsif ns_uri == XMLNS_NAMESPACE
410
+ attr_name = 'xmlns:' + attr.name.sub(/^[^:]*:/, '')
411
+ elsif ns_uri == XLINK_NAMESPACE
412
+ attr_name = 'xlink:' + attr.name.sub(/^[^:]*:/, '')
413
+ else
414
+ attr_name = "#{attr_ns.prefix}:#{attr.name}"
415
+ end
416
+ end
417
+ io << ' ' << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
418
+ end
419
+ io << '>'
420
+ if !%w[area base basefont bgsound br col embed frame hr img input keygen
421
+ link meta param source track wbr].include?(current_node.name)
422
+ io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
423
+ current_node.children.each do |child|
424
+ # XXX(sfc): Templates handled specially?
425
+ serialize_node_internal(child, io, encoding, options)
426
+ end
427
+ io << '</' << tagname << '>'
428
+ end
429
+ when XML::Node::TEXT_NODE
430
+ parent = current_node.parent
431
+ if parent.element? && %w[style script xmp iframe noembed noframes plaintext noscript].include?(parent.name)
432
+ io << current_node.content
433
+ else
434
+ io << escape_text(current_node.content, encoding, false)
435
+ end
436
+ when XML::Node::CDATA_SECTION_NODE
437
+ io << '<![CDATA[' << current_node.content << ']]>'
438
+ when XML::Node::COMMENT_NODE
439
+ io << '<!--' << current_node.content << '-->'
440
+ when XML::Node::PI_NODE
441
+ io << '<?' << current_node.content << '>'
442
+ when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
443
+ io << '<!DOCTYPE ' << current_node.name << '>'
444
+ when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
445
+ current_node.children.each do |child|
446
+ serialize_node_internal(child, io, encoding, options)
447
+ end
448
+ else
449
+ raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
450
+ end
451
+ end
452
+
453
+ def self.escape_text(text, encoding, attribute_mode)
454
+ if attribute_mode
455
+ text = text.gsub(/[&\u00a0"]/,
456
+ '&' => '&amp;', "\u00a0" => '&nbsp;', '"' => '&quot;')
457
+ else
458
+ text = text.gsub(/[&\u00a0<>]/,
459
+ '&' => '&amp;', "\u00a0" => '&nbsp;', '<' => '&lt;', '>' => '&gt;')
460
+ end
461
+ # Not part of the standard
462
+ text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
463
+ end
464
+
465
+ def self.prepend_newline?(node)
466
+ return false unless %w[pre textarea listing].include?(node.name) && !node.children.empty?
467
+ first_child = node.children[0]
468
+ first_child.text? && first_child.content.start_with?("\n")
469
+ end
470
+ end
471
+ end
472
+
473
+ require_relative 'gumbo'