nokogiri 1.11.4 → 1.12.5

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (111) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE-DEPENDENCIES.md +243 -22
  3. data/LICENSE.md +1 -1
  4. data/README.md +6 -5
  5. data/ext/nokogiri/depend +35 -34
  6. data/ext/nokogiri/extconf.rb +185 -103
  7. data/ext/nokogiri/gumbo.c +584 -0
  8. data/ext/nokogiri/{html_document.c → html4_document.c} +8 -8
  9. data/ext/nokogiri/{html_element_description.c → html4_element_description.c} +21 -19
  10. data/ext/nokogiri/{html_entity_lookup.c → html4_entity_lookup.c} +7 -7
  11. data/ext/nokogiri/{html_sax_parser_context.c → html4_sax_parser_context.c} +6 -5
  12. data/ext/nokogiri/{html_sax_push_parser.c → html4_sax_push_parser.c} +4 -4
  13. data/ext/nokogiri/libxml2_backwards_compat.c +30 -30
  14. data/ext/nokogiri/nokogiri.c +70 -38
  15. data/ext/nokogiri/nokogiri.h +19 -9
  16. data/ext/nokogiri/xml_document.c +14 -14
  17. data/ext/nokogiri/xml_element_content.c +2 -0
  18. data/ext/nokogiri/xml_encoding_handler.c +11 -6
  19. data/ext/nokogiri/xml_namespace.c +4 -2
  20. data/ext/nokogiri/xml_node.c +123 -108
  21. data/ext/nokogiri/xml_node_set.c +20 -20
  22. data/ext/nokogiri/xml_reader.c +2 -0
  23. data/ext/nokogiri/xml_sax_parser.c +6 -6
  24. data/ext/nokogiri/xml_sax_parser_context.c +2 -0
  25. data/ext/nokogiri/xml_schema.c +2 -0
  26. data/ext/nokogiri/xml_xpath_context.c +67 -65
  27. data/ext/nokogiri/xslt_stylesheet.c +2 -1
  28. data/gumbo-parser/CHANGES.md +63 -0
  29. data/gumbo-parser/Makefile +101 -0
  30. data/gumbo-parser/THANKS +27 -0
  31. data/gumbo-parser/src/Makefile +34 -0
  32. data/gumbo-parser/src/README.md +41 -0
  33. data/gumbo-parser/src/ascii.c +75 -0
  34. data/gumbo-parser/src/ascii.h +115 -0
  35. data/gumbo-parser/src/attribute.c +42 -0
  36. data/gumbo-parser/src/attribute.h +17 -0
  37. data/gumbo-parser/src/char_ref.c +22225 -0
  38. data/gumbo-parser/src/char_ref.h +29 -0
  39. data/gumbo-parser/src/char_ref.rl +2154 -0
  40. data/gumbo-parser/src/error.c +626 -0
  41. data/gumbo-parser/src/error.h +148 -0
  42. data/gumbo-parser/src/foreign_attrs.c +104 -0
  43. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  44. data/gumbo-parser/src/gumbo.h +943 -0
  45. data/gumbo-parser/src/insertion_mode.h +33 -0
  46. data/gumbo-parser/src/macros.h +91 -0
  47. data/gumbo-parser/src/parser.c +4886 -0
  48. data/gumbo-parser/src/parser.h +41 -0
  49. data/gumbo-parser/src/replacement.h +33 -0
  50. data/gumbo-parser/src/string_buffer.c +103 -0
  51. data/gumbo-parser/src/string_buffer.h +68 -0
  52. data/gumbo-parser/src/string_piece.c +48 -0
  53. data/gumbo-parser/src/svg_attrs.c +174 -0
  54. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  55. data/gumbo-parser/src/svg_tags.c +137 -0
  56. data/gumbo-parser/src/svg_tags.gperf +55 -0
  57. data/gumbo-parser/src/tag.c +222 -0
  58. data/gumbo-parser/src/tag_lookup.c +382 -0
  59. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  60. data/gumbo-parser/src/tag_lookup.h +13 -0
  61. data/gumbo-parser/src/token_buffer.c +79 -0
  62. data/gumbo-parser/src/token_buffer.h +71 -0
  63. data/gumbo-parser/src/token_type.h +17 -0
  64. data/gumbo-parser/src/tokenizer.c +3463 -0
  65. data/gumbo-parser/src/tokenizer.h +112 -0
  66. data/gumbo-parser/src/tokenizer_states.h +339 -0
  67. data/gumbo-parser/src/utf8.c +245 -0
  68. data/gumbo-parser/src/utf8.h +164 -0
  69. data/gumbo-parser/src/util.c +68 -0
  70. data/gumbo-parser/src/util.h +30 -0
  71. data/gumbo-parser/src/vector.c +111 -0
  72. data/gumbo-parser/src/vector.h +45 -0
  73. data/lib/nokogiri/css/parser.rb +1 -1
  74. data/lib/nokogiri/css/parser.y +1 -1
  75. data/lib/nokogiri/css/syntax_error.rb +1 -1
  76. data/lib/nokogiri/css.rb +14 -14
  77. data/lib/nokogiri/extension.rb +7 -2
  78. data/lib/nokogiri/gumbo.rb +14 -0
  79. data/lib/nokogiri/html.rb +31 -27
  80. data/lib/nokogiri/{html → html4}/builder.rb +2 -2
  81. data/lib/nokogiri/{html → html4}/document.rb +4 -4
  82. data/lib/nokogiri/{html → html4}/document_fragment.rb +3 -3
  83. data/lib/nokogiri/{html → html4}/element_description.rb +1 -1
  84. data/lib/nokogiri/{html → html4}/element_description_defaults.rb +1 -1
  85. data/lib/nokogiri/{html → html4}/entity_lookup.rb +1 -1
  86. data/lib/nokogiri/{html → html4}/sax/parser.rb +11 -14
  87. data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
  88. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +5 -5
  89. data/lib/nokogiri/html4.rb +40 -0
  90. data/lib/nokogiri/html5/document.rb +74 -0
  91. data/lib/nokogiri/html5/document_fragment.rb +80 -0
  92. data/lib/nokogiri/html5/node.rb +93 -0
  93. data/lib/nokogiri/html5.rb +473 -0
  94. data/lib/nokogiri/version/constant.rb +1 -1
  95. data/lib/nokogiri/version/info.rb +12 -2
  96. data/lib/nokogiri/xml/builder.rb +38 -0
  97. data/lib/nokogiri/xml/document.rb +46 -0
  98. data/lib/nokogiri/xml/node/save_options.rb +1 -1
  99. data/lib/nokogiri/xml/node.rb +6 -5
  100. data/lib/nokogiri/xml/parse_options.rb +2 -0
  101. data/lib/nokogiri/xml/pp.rb +2 -2
  102. data/lib/nokogiri/xml/sax/document.rb +24 -30
  103. data/lib/nokogiri/xml/sax.rb +4 -4
  104. data/lib/nokogiri/xml/xpath.rb +2 -2
  105. data/lib/nokogiri/xml.rb +35 -36
  106. data/lib/nokogiri/xslt/stylesheet.rb +1 -1
  107. data/lib/nokogiri/xslt.rb +16 -16
  108. data/lib/nokogiri.rb +31 -29
  109. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +31 -0
  110. metadata +101 -58
  111. data/lib/nokogiri/html/sax/parser_context.rb +0 -17
data/lib/nokogiri/css.rb CHANGED
@@ -1,28 +1,28 @@
1
1
  # frozen_string_literal: true
2
- require 'nokogiri/css/node'
3
- require 'nokogiri/css/xpath_visitor'
4
- x = $-w
5
- $-w = false
6
- require 'nokogiri/css/parser'
7
- $-w = x
8
-
9
- require 'nokogiri/css/tokenizer'
10
- require 'nokogiri/css/syntax_error'
11
-
12
2
  module Nokogiri
13
3
  module CSS
14
4
  class << self
15
5
  ###
16
6
  # Parse this CSS selector in +selector+. Returns an AST.
17
- def parse selector
18
- Parser.new.parse selector
7
+ def parse(selector)
8
+ Parser.new.parse(selector)
19
9
  end
20
10
 
21
11
  ###
22
12
  # Get the XPath for +selector+.
23
- def xpath_for selector, options={}
24
- Parser.new(options[:ns] || {}).xpath_for selector, options
13
+ def xpath_for(selector, options = {})
14
+ Parser.new(options[:ns] || {}).xpath_for(selector, options)
25
15
  end
26
16
  end
27
17
  end
28
18
  end
19
+
20
+ require_relative "css/node"
21
+ require_relative "css/xpath_visitor"
22
+ x = $-w
23
+ $-w = false
24
+ require_relative "css/parser"
25
+ $-w = x
26
+
27
+ require_relative "css/tokenizer"
28
+ require_relative "css/syntax_error"
@@ -2,8 +2,9 @@
2
2
 
3
3
  # load the C or Java extension
4
4
  begin
5
+ # native precompiled gems package shared libraries in <gem_dir>/lib/nokogiri/<ruby_version>
5
6
  ::RUBY_VERSION =~ /(\d+\.\d+)/
6
- require "nokogiri/#{Regexp.last_match(1)}/nokogiri"
7
+ require_relative "#{Regexp.last_match(1)}/nokogiri"
7
8
  rescue LoadError => e
8
9
  if e.message =~ /GLIBC/
9
10
  warn(<<~EOM)
@@ -22,5 +23,9 @@ rescue LoadError => e
22
23
  EOM
23
24
  raise e
24
25
  end
25
- require 'nokogiri/nokogiri'
26
+
27
+ # use "require" instead of "require_relative" because non-native gems will place C extension files
28
+ # in Gem::BasicSpecification#extension_dir after compilation (during normal installation), which
29
+ # is in $LOAD_PATH but not necessarily relative to this file (see #2300)
30
+ require "nokogiri/nokogiri"
26
31
  end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ module Gumbo
4
+ # The default maximum number of attributes per element.
5
+ DEFAULT_MAX_ATTRIBUTES = 400
6
+
7
+ # The default maximum number of errors for parsing a document or a fragment.
8
+ DEFAULT_MAX_ERRORS = 0
9
+
10
+ # The default maximum depth of the DOM tree produced by parsing a document
11
+ # or fragment.
12
+ DEFAULT_MAX_TREE_DEPTH = 400
13
+ end
14
+ end
data/lib/nokogiri/html.rb CHANGED
@@ -1,38 +1,42 @@
1
1
  # frozen_string_literal: true
2
- require 'nokogiri/html/entity_lookup'
3
- require 'nokogiri/html/document'
4
- require 'nokogiri/html/document_fragment'
5
- require 'nokogiri/html/sax/parser_context'
6
- require 'nokogiri/html/sax/parser'
7
- require 'nokogiri/html/sax/push_parser'
8
- require 'nokogiri/html/element_description'
9
- require 'nokogiri/html/element_description_defaults'
2
+ require_relative "html4"
10
3
 
11
4
  module Nokogiri
12
- class << self
13
- ###
14
- # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse
15
- def HTML thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block
16
- Nokogiri::HTML::Document.parse(thing, url, encoding, options, &block)
17
- end
18
- end
5
+ HTML = Nokogiri::HTML4
6
+
7
+ # @!method HTML(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
8
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
9
+ # @!scope class
10
+ define_singleton_method(:HTML, Nokogiri.method(:HTML4))
19
11
 
12
+ # @note This module/namespace is an alias for {Nokogiri::HTML4} as of v1.12.0. Before v1.12.0,
13
+ # {Nokogiri::HTML4} did not exist, and this was the module/namespace for all HTML-related
14
+ # classes.
20
15
  module HTML
21
- class << self
22
- ###
23
- # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse
24
- def parse thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block
25
- Document.parse(thing, url, encoding, options, &block)
16
+ # @note This class is an alias for {Nokogiri::HTML4::Document} as of v1.12.0.
17
+ class Document < Nokogiri::XML::Document
18
+ end
19
+
20
+ # @note This class is an alias for {Nokogiri::HTML4::DocumentFragment} as of v1.12.0.
21
+ class DocumentFragment < Nokogiri::XML::DocumentFragment
22
+ end
23
+
24
+ # @note This class is an alias for {Nokogiri::HTML4::Builder} as of v1.12.0.
25
+ class Builder < Nokogiri::XML::Builder
26
+ end
27
+
28
+ module SAX
29
+ # @note This class is an alias for {Nokogiri::HTML4::SAX::Parser} as of v1.12.0.
30
+ class Parser < Nokogiri::XML::SAX::Parser
26
31
  end
27
32
 
28
- ####
29
- # Parse a fragment from +string+ in to a NodeSet.
30
- def fragment string, encoding = nil
31
- HTML::DocumentFragment.parse string, encoding
33
+ # @note This class is an alias for {Nokogiri::HTML4::SAX::ParserContext} as of v1.12.0.
34
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
32
35
  end
33
- end
34
36
 
35
- # Instance of Nokogiri::HTML::EntityLookup
36
- NamedCharacters = EntityLookup.new
37
+ # @note This class is an alias for {Nokogiri::HTML4::SAX::PushParser} as of v1.12.0.
38
+ class PushParser
39
+ end
40
+ end
37
41
  end
38
42
  end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  ###
5
5
  # Nokogiri HTML builder is used for building HTML documents. It is very
6
6
  # similar to the Nokogiri::XML::Builder. In fact, you should go read the
@@ -12,7 +12,7 @@ module Nokogiri
12
12
  # Create an HTML document with a body that has an onload attribute, and a
13
13
  # span tag with a class of "bold" that has content of "Hello world".
14
14
  #
15
- # builder = Nokogiri::HTML::Builder.new do |doc|
15
+ # builder = Nokogiri::HTML4::Builder.new do |doc|
16
16
  # doc.html {
17
17
  # doc.body(:onload => 'some_func();') {
18
18
  # doc.span.bold {
@@ -3,7 +3,7 @@
3
3
  require 'pathname'
4
4
 
5
5
  module Nokogiri
6
- module HTML
6
+ module HTML4
7
7
  class Document < Nokogiri::XML::Document
8
8
  ###
9
9
  # Get the meta tag encoding for this document. If there is no meta tag,
@@ -268,12 +268,12 @@ module Nokogiri
268
268
  m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
269
269
  return m[4]
270
270
  catch(:encoding_found) {
271
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
271
+ Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
272
272
  nil
273
273
  }
274
274
  else
275
275
  handler = SAXHandler.new
276
- parser = Nokogiri::HTML::SAX::PushParser.new(handler)
276
+ parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
277
277
  parser << chunk rescue Nokogiri::SyntaxError
278
278
  handler.encoding
279
279
  end
@@ -286,7 +286,7 @@ module Nokogiri
286
286
  end
287
287
 
288
288
  # This method is used by the C extension so that
289
- # Nokogiri::HTML::Document#read_io() does not leak memory when
289
+ # Nokogiri::HTML4::Document#read_io() does not leak memory when
290
290
  # EncodingFound is raised.
291
291
  attr_reader :encoding_found
292
292
 
@@ -1,11 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  class DocumentFragment < Nokogiri::XML::DocumentFragment
5
5
  ####
6
6
  # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
7
7
  def self.parse(tags, encoding = nil)
8
- doc = HTML::Document.new
8
+ doc = HTML4::Document.new
9
9
 
10
10
  encoding ||= if tags.respond_to?(:encoding)
11
11
  encoding = tags.encoding
@@ -39,7 +39,7 @@ module Nokogiri
39
39
  "/html/body/node()"
40
40
  end
41
41
 
42
- temp_doc = HTML::Document.parse("<html><body>#{tags}", nil, document.encoding)
42
+ temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding)
43
43
  temp_doc.xpath(path).each { |child| child.parent = self }
44
44
  self.errors = temp_doc.errors
45
45
  end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  class ElementDescription
5
5
  ###
6
6
  # Is this element a block element?
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  class ElementDescription
5
5
 
6
6
  # Methods are defined protected by method_defined? because at
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  class EntityDescription < Struct.new(:value, :name, :description); end
5
5
 
6
6
  class EntityLookup
@@ -1,18 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  ###
5
- # Nokogiri lets you write a SAX parser to process HTML but get HTML
6
- # correction features.
5
+ # Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
7
6
  #
8
- # See Nokogiri::HTML::SAX::Parser for a basic example of using a
9
- # SAX parser with HTML.
7
+ # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
10
8
  #
11
9
  # For more information on SAX parsers, see Nokogiri::XML::SAX
12
10
  module SAX
13
11
  ###
14
- # This class lets you perform SAX style parsing on HTML with HTML
15
- # error correction.
12
+ # This class lets you perform SAX style parsing on HTML with HTML error correction.
16
13
  #
17
14
  # Here is a basic usage example:
18
15
  #
@@ -22,40 +19,40 @@ module Nokogiri
22
19
  # end
23
20
  # end
24
21
  #
25
- # parser = Nokogiri::HTML::SAX::Parser.new(MyDoc.new)
22
+ # parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
26
23
  # parser.parse(File.read(ARGV[0], mode: 'rb'))
27
24
  #
28
25
  # For more information on SAX parsers, see Nokogiri::XML::SAX
29
26
  class Parser < Nokogiri::XML::SAX::Parser
30
27
  ###
31
28
  # Parse html stored in +data+ using +encoding+
32
- def parse_memory data, encoding = 'UTF-8'
29
+ def parse_memory(data, encoding = "UTF-8")
33
30
  raise ArgumentError unless data
34
31
  return unless data.length > 0
35
32
  ctx = ParserContext.memory(data, encoding)
36
33
  yield ctx if block_given?
37
- ctx.parse_with self
34
+ ctx.parse_with(self)
38
35
  end
39
36
 
40
37
  ###
41
38
  # Parse given +io+
42
- def parse_io io, encoding = 'UTF-8'
39
+ def parse_io(io, encoding = "UTF-8")
43
40
  check_encoding(encoding)
44
41
  @encoding = encoding
45
42
  ctx = ParserContext.io(io, ENCODINGS[encoding])
46
43
  yield ctx if block_given?
47
- ctx.parse_with self
44
+ ctx.parse_with(self)
48
45
  end
49
46
 
50
47
  ###
51
48
  # Parse a file with +filename+
52
- def parse_file filename, encoding = 'UTF-8'
49
+ def parse_file(filename, encoding = "UTF-8")
53
50
  raise ArgumentError unless filename
54
51
  raise Errno::ENOENT unless File.exist?(filename)
55
52
  raise Errno::EISDIR if File.directory?(filename)
56
53
  ctx = ParserContext.file(filename, encoding)
57
54
  yield ctx if block_given?
58
- ctx.parse_with self
55
+ ctx.parse_with(self)
59
56
  end
60
57
  end
61
58
  end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ module HTML4
4
+ module SAX
5
+ ###
6
+ # Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
7
+ # you should be looking at Nokogiri::HTML4::SAX::Parser
8
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
9
+ def self.new(thing, encoding = "UTF-8")
10
+ if [:read, :close].all? { |x| thing.respond_to?(x) }
11
+ super
12
+ else
13
+ memory(thing, encoding)
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -1,17 +1,17 @@
1
1
  # frozen_string_literal: true
2
2
  module Nokogiri
3
- module HTML
3
+ module HTML4
4
4
  module SAX
5
5
  class PushParser
6
6
 
7
- # The Nokogiri::HTML::SAX::Document on which the PushParser will be
7
+ # The Nokogiri::HTML4::SAX::Document on which the PushParser will be
8
8
  # operating
9
9
  attr_accessor :document
10
10
 
11
- def initialize(doc = HTML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
11
+ def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
12
12
  @document = doc
13
13
  @encoding = encoding
14
- @sax_parser = HTML::SAX::Parser.new(doc, @encoding)
14
+ @sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
15
15
 
16
16
  ## Create our push parser context
17
17
  initialize_native(@sax_parser, file_name, encoding)
@@ -27,7 +27,7 @@ module Nokogiri
27
27
 
28
28
  ###
29
29
  # Finish the parsing. This method is only necessary for
30
- # Nokogiri::HTML::SAX::Document#end_document to be called.
30
+ # Nokogiri::HTML4::SAX::Document#end_document to be called.
31
31
  def finish
32
32
  write '', true
33
33
  end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ class << self
4
+ ###
5
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
6
+ def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
7
+ Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
8
+ end
9
+ end
10
+
11
+ # @since v1.12.0
12
+ # @note Before v1.12.0, {Nokogiri::HTML4} did not exist, and {Nokogiri::HTML} was the module/namespace for parsing HTML.
13
+ module HTML4
14
+ class << self
15
+ ###
16
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
17
+ def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
18
+ Document.parse(input, url, encoding, options, &block)
19
+ end
20
+
21
+ ####
22
+ # Parse a fragment from +string+ in to a NodeSet.
23
+ def fragment(string, encoding = nil)
24
+ HTML4::DocumentFragment.parse(string, encoding)
25
+ end
26
+ end
27
+
28
+ # Instance of Nokogiri::HTML4::EntityLookup
29
+ NamedCharacters = EntityLookup.new
30
+ end
31
+ end
32
+
33
+ require_relative "html4/entity_lookup"
34
+ require_relative "html4/document"
35
+ require_relative "html4/document_fragment"
36
+ require_relative "html4/sax/parser_context"
37
+ require_relative "html4/sax/parser"
38
+ require_relative "html4/sax/push_parser"
39
+ require_relative "html4/element_description"
40
+ require_relative "html4/element_description_defaults"
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+ #
3
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ require_relative "../html4/document"
19
+
20
+ module Nokogiri
21
+ module HTML5
22
+ # @since v1.12.0
23
+ # @note HTML5 functionality is not available when running JRuby.
24
+ class Document < Nokogiri::HTML4::Document
25
+ def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
26
+ yield options if block_given?
27
+ string_or_io = '' unless string_or_io
28
+
29
+ if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
30
+ encoding ||= string_or_io.encoding.name
31
+ end
32
+
33
+ if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
34
+ url ||= string_or_io.path
35
+ end
36
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
37
+ raise ArgumentError.new("not a string or IO object")
38
+ end
39
+ do_parse(string_or_io, url, encoding, options)
40
+ end
41
+
42
+ def self.read_io(io, url = nil, encoding = nil, **options)
43
+ raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
44
+ do_parse(io, url, encoding, options)
45
+ end
46
+
47
+ def self.read_memory(string, url = nil, encoding = nil, **options)
48
+ raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
49
+ do_parse(string, url, encoding, options)
50
+ end
51
+
52
+ def fragment(tags = nil)
53
+ DocumentFragment.new(self, tags, self.root)
54
+ end
55
+
56
+ def to_xml(options = {}, &block)
57
+ # Bypass XML::Document#to_xml which doesn't add
58
+ # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
59
+ XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
60
+ end
61
+
62
+ private
63
+ def self.do_parse(string_or_io, url, encoding, options)
64
+ string = HTML5.read_and_encode(string_or_io, encoding)
65
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
66
+ max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
67
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
68
+ doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
69
+ doc.encoding = 'UTF-8'
70
+ doc
71
+ end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+ #
3
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ require_relative "../html4/document_fragment"
19
+
20
+ module Nokogiri
21
+ module HTML5
22
+ # @since v1.12.0
23
+ # @note HTML5 functionality is not available when running JRuby.
24
+ class DocumentFragment < Nokogiri::HTML4::DocumentFragment
25
+ attr_accessor :document
26
+ attr_accessor :errors
27
+
28
+ # Create a document fragment.
29
+ def initialize(doc, tags = nil, ctx = nil, options = {})
30
+ self.document = doc
31
+ self.errors = []
32
+ return self unless tags
33
+
34
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
35
+ max_errors = options[:max_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
36
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
37
+ tags = Nokogiri::HTML5.read_and_encode(tags, nil)
38
+ Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
39
+ end
40
+
41
+ def serialize(options = {}, &block)
42
+ # Bypass XML::Document.serialize which doesn't support options even
43
+ # though XML::Node.serialize does!
44
+ XML::Node.instance_method(:serialize).bind(self).call(options, &block)
45
+ end
46
+
47
+ # Parse a document fragment from +tags+, returning a Nodeset.
48
+ def self.parse(tags, encoding = nil, options = {})
49
+ doc = HTML5::Document.new
50
+ tags = HTML5.read_and_encode(tags, encoding)
51
+ doc.encoding = "UTF-8"
52
+ new(doc, tags, nil, options)
53
+ end
54
+
55
+ def extract_params(params) # :nodoc:
56
+ handler = params.find do |param|
57
+ ![Hash, String, Symbol].include?(param.class)
58
+ end
59
+ params -= [handler] if handler
60
+
61
+ hashes = []
62
+ while Hash === params.last || params.last.nil?
63
+ hashes << params.pop
64
+ break if params.empty?
65
+ end
66
+ ns, binds = hashes.reverse
67
+
68
+ ns ||=
69
+ begin
70
+ ns = {}
71
+ children.each { |child| ns.merge!(child.namespaces) }
72
+ ns
73
+ end
74
+
75
+ [params, handler, ns, binds]
76
+ end
77
+ end
78
+ end
79
+ end
80
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+ #
3
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ require_relative "../xml/node"
19
+
20
+ module Nokogiri
21
+ module HTML5
22
+ # @since v1.12.0
23
+ # @note HTML5 functionality is not available when running JRuby.
24
+ module Node
25
+ def inner_html(options = {})
26
+ return super(options) unless document.is_a?(HTML5::Document)
27
+ result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? String.new("\n") : String.new
28
+ result << children.map { |child| child.to_html(options) }.join
29
+ result
30
+ end
31
+
32
+ def write_to(io, *options)
33
+ return super(io, *options) unless document.is_a?(HTML5::Document)
34
+ options = options.first.is_a?(Hash) ? options.shift : {}
35
+ encoding = options[:encoding] || options[0]
36
+ if Nokogiri.jruby?
37
+ save_options = options[:save_with] || options[1]
38
+ indent_times = options[:indent] || 0
39
+ else
40
+ save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
41
+ indent_times = options[:indent] || 2
42
+ end
43
+ indent_string = (options[:indent_text] || " ") * indent_times
44
+
45
+ config = XML::Node::SaveOptions.new(save_options.to_i)
46
+ yield config if block_given?
47
+
48
+ config_options = config.options
49
+ if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0
50
+ # Use Nokogiri's serializing code.
51
+ native_write_to(io, encoding, indent_string, config_options)
52
+ else
53
+ # Serialize including the current node.
54
+ encoding ||= document.encoding || Encoding::UTF_8
55
+ internal_ops = {
56
+ preserve_newline: options[:preserve_newline] || false,
57
+ }
58
+ HTML5.serialize_node_internal(self, io, encoding, internal_ops)
59
+ end
60
+ end
61
+
62
+ def fragment(tags)
63
+ return super(tags) unless document.is_a?(HTML5::Document)
64
+ DocumentFragment.new(document, tags, self)
65
+ end
66
+
67
+ private
68
+
69
+ # HTML elements can have attributes that contain colons.
70
+ # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
71
+ # and tries to create an attribute in a namespace. This is especially
72
+ # annoying with attribute names like xml:lang since libxml2 will
73
+ # actually create the xml namespace if it doesn't exist already.
74
+ def add_child_node_and_reparent_attrs(node)
75
+ return super(node) unless document.is_a?(HTML5::Document)
76
+ # I'm not sure what this method is supposed to do. Reparenting
77
+ # namespaces is handled by libxml2, including child namespaces which
78
+ # this method wouldn't handle.
79
+ # https://github.com/sparklemotion/nokogiri/issues/1790
80
+ add_child_node(node)
81
+ # node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
82
+ # attr.remove
83
+ # ns = attr.namespace
84
+ # a["#{ns.prefix}:#{attr.name}"] = attr.value
85
+ # end
86
+ end
87
+ end
88
+ # Monkey patch
89
+ XML::Node.prepend(HTML5::Node)
90
+ end
91
+ end
92
+
93
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: