nokogiri 1.10.7 → 1.16.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (224)
  1. checksums.yaml +4 -4
  2. data/Gemfile +42 -0
  3. data/LICENSE-DEPENDENCIES.md +1632 -1022
  4. data/LICENSE.md +1 -1
  5. data/README.md +188 -96
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +34 -66
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +862 -421
  10. data/ext/nokogiri/gumbo.c +594 -0
  11. data/ext/nokogiri/html4_document.c +165 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +108 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +251 -105
  18. data/ext/nokogiri/nokogiri.h +222 -90
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +17 -17
  21. data/ext/nokogiri/xml_attribute_decl.c +22 -22
  22. data/ext/nokogiri/xml_cdata.c +39 -31
  23. data/ext/nokogiri/xml_comment.c +20 -27
  24. data/ext/nokogiri/xml_document.c +408 -243
  25. data/ext/nokogiri/xml_document_fragment.c +13 -17
  26. data/ext/nokogiri/xml_dtd.c +64 -58
  27. data/ext/nokogiri/xml_element_content.c +63 -55
  28. data/ext/nokogiri/xml_element_decl.c +31 -31
  29. data/ext/nokogiri/xml_encoding_handler.c +54 -21
  30. data/ext/nokogiri/xml_entity_decl.c +37 -35
  31. data/ext/nokogiri/xml_entity_reference.c +17 -19
  32. data/ext/nokogiri/xml_namespace.c +131 -61
  33. data/ext/nokogiri/xml_node.c +1343 -674
  34. data/ext/nokogiri/xml_node_set.c +246 -216
  35. data/ext/nokogiri/xml_processing_instruction.c +18 -20
  36. data/ext/nokogiri/xml_reader.c +305 -213
  37. data/ext/nokogiri/xml_relax_ng.c +87 -78
  38. data/ext/nokogiri/xml_sax_parser.c +149 -124
  39. data/ext/nokogiri/xml_sax_parser_context.c +149 -103
  40. data/ext/nokogiri/xml_sax_push_parser.c +65 -37
  41. data/ext/nokogiri/xml_schema.c +138 -82
  42. data/ext/nokogiri/xml_syntax_error.c +42 -21
  43. data/ext/nokogiri/xml_text.c +35 -26
  44. data/ext/nokogiri/xml_xpath_context.c +363 -178
  45. data/ext/nokogiri/xslt_stylesheet.c +335 -189
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +126 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +630 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +103 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/insertion_mode.h +33 -0
  63. data/gumbo-parser/src/macros.h +91 -0
  64. data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
  65. data/gumbo-parser/src/parser.c +4891 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +223 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3464 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +66 -0
  88. data/gumbo-parser/src/util.h +34 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/class_resolver.rb +67 -0
  92. data/lib/nokogiri/css/node.rb +10 -8
  93. data/lib/nokogiri/css/parser.rb +397 -377
  94. data/lib/nokogiri/css/parser.y +250 -245
  95. data/lib/nokogiri/css/parser_extras.rb +54 -49
  96. data/lib/nokogiri/css/syntax_error.rb +3 -1
  97. data/lib/nokogiri/css/tokenizer.rb +5 -3
  98. data/lib/nokogiri/css/tokenizer.rex +3 -2
  99. data/lib/nokogiri/css/xpath_visitor.rb +205 -96
  100. data/lib/nokogiri/css.rb +56 -17
  101. data/lib/nokogiri/decorators/slop.rb +9 -7
  102. data/lib/nokogiri/encoding_handler.rb +57 -0
  103. data/lib/nokogiri/extension.rb +32 -0
  104. data/lib/nokogiri/gumbo.rb +15 -0
  105. data/lib/nokogiri/html.rb +38 -27
  106. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  107. data/lib/nokogiri/html4/document.rb +214 -0
  108. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  109. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  110. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  111. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  112. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  113. data/lib/nokogiri/{html → html4}/sax/parser.rb +17 -16
  114. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  115. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
  116. data/lib/nokogiri/html4.rb +47 -0
  117. data/lib/nokogiri/html5/document.rb +168 -0
  118. data/lib/nokogiri/html5/document_fragment.rb +90 -0
  119. data/lib/nokogiri/html5/node.rb +103 -0
  120. data/lib/nokogiri/html5.rb +326 -0
  121. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  122. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  123. data/lib/nokogiri/syntax_error.rb +2 -0
  124. data/lib/nokogiri/version/constant.rb +6 -0
  125. data/lib/nokogiri/version/info.rb +224 -0
  126. data/lib/nokogiri/version.rb +3 -108
  127. data/lib/nokogiri/xml/attr.rb +55 -3
  128. data/lib/nokogiri/xml/attribute_decl.rb +6 -2
  129. data/lib/nokogiri/xml/builder.rb +75 -34
  130. data/lib/nokogiri/xml/cdata.rb +3 -1
  131. data/lib/nokogiri/xml/character_data.rb +2 -0
  132. data/lib/nokogiri/xml/document.rb +312 -127
  133. data/lib/nokogiri/xml/document_fragment.rb +93 -48
  134. data/lib/nokogiri/xml/dtd.rb +4 -2
  135. data/lib/nokogiri/xml/element_content.rb +12 -2
  136. data/lib/nokogiri/xml/element_decl.rb +6 -2
  137. data/lib/nokogiri/xml/entity_decl.rb +7 -3
  138. data/lib/nokogiri/xml/entity_reference.rb +2 -0
  139. data/lib/nokogiri/xml/namespace.rb +44 -0
  140. data/lib/nokogiri/xml/node/save_options.rb +23 -8
  141. data/lib/nokogiri/xml/node.rb +1096 -419
  142. data/lib/nokogiri/xml/node_set.rb +137 -61
  143. data/lib/nokogiri/xml/notation.rb +13 -0
  144. data/lib/nokogiri/xml/parse_options.rb +145 -52
  145. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  146. data/lib/nokogiri/xml/pp/node.rb +42 -30
  147. data/lib/nokogiri/xml/pp.rb +4 -2
  148. data/lib/nokogiri/xml/processing_instruction.rb +4 -1
  149. data/lib/nokogiri/xml/reader.rb +21 -28
  150. data/lib/nokogiri/xml/relax_ng.rb +8 -2
  151. data/lib/nokogiri/xml/sax/document.rb +45 -49
  152. data/lib/nokogiri/xml/sax/parser.rb +39 -36
  153. data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
  154. data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
  155. data/lib/nokogiri/xml/sax.rb +6 -4
  156. data/lib/nokogiri/xml/schema.rb +19 -9
  157. data/lib/nokogiri/xml/searchable.rb +120 -72
  158. data/lib/nokogiri/xml/syntax_error.rb +7 -5
  159. data/lib/nokogiri/xml/text.rb +2 -0
  160. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  161. data/lib/nokogiri/xml/xpath.rb +15 -4
  162. data/lib/nokogiri/xml/xpath_context.rb +3 -3
  163. data/lib/nokogiri/xml.rb +39 -38
  164. data/lib/nokogiri/xslt/stylesheet.rb +3 -1
  165. data/lib/nokogiri/xslt.rb +101 -22
  166. data/lib/nokogiri.rb +59 -75
  167. data/lib/xsd/xmlparser/nokogiri.rb +29 -25
  168. data/patches/libxml2/{0004-libxml2.la-is-in-top_builddir.patch → 0003-libxml2.la-is-in-top_builddir.patch} +1 -1
  169. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  170. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  171. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  172. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  173. data/ports/archives/libxml2-2.12.3.tar.xz +0 -0
  174. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
  175. metadata +121 -291
  176. data/ext/nokogiri/html_document.c +0 -170
  177. data/ext/nokogiri/html_document.h +0 -10
  178. data/ext/nokogiri/html_element_description.c +0 -279
  179. data/ext/nokogiri/html_element_description.h +0 -10
  180. data/ext/nokogiri/html_entity_lookup.c +0 -32
  181. data/ext/nokogiri/html_entity_lookup.h +0 -8
  182. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  183. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  184. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  185. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  186. data/ext/nokogiri/xml_attr.h +0 -9
  187. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  188. data/ext/nokogiri/xml_cdata.h +0 -9
  189. data/ext/nokogiri/xml_comment.h +0 -9
  190. data/ext/nokogiri/xml_document.h +0 -23
  191. data/ext/nokogiri/xml_document_fragment.h +0 -10
  192. data/ext/nokogiri/xml_dtd.h +0 -10
  193. data/ext/nokogiri/xml_element_content.h +0 -10
  194. data/ext/nokogiri/xml_element_decl.h +0 -9
  195. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  196. data/ext/nokogiri/xml_entity_decl.h +0 -10
  197. data/ext/nokogiri/xml_entity_reference.h +0 -9
  198. data/ext/nokogiri/xml_io.c +0 -61
  199. data/ext/nokogiri/xml_io.h +0 -11
  200. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  201. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  202. data/ext/nokogiri/xml_namespace.h +0 -14
  203. data/ext/nokogiri/xml_node.h +0 -13
  204. data/ext/nokogiri/xml_node_set.h +0 -12
  205. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  206. data/ext/nokogiri/xml_reader.h +0 -10
  207. data/ext/nokogiri/xml_relax_ng.h +0 -9
  208. data/ext/nokogiri/xml_sax_parser.h +0 -39
  209. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  210. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  211. data/ext/nokogiri/xml_schema.h +0 -9
  212. data/ext/nokogiri/xml_syntax_error.h +0 -13
  213. data/ext/nokogiri/xml_text.h +0 -9
  214. data/ext/nokogiri/xml_xpath_context.h +0 -10
  215. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  216. data/lib/nokogiri/html/document.rb +0 -335
  217. data/lib/nokogiri/html/document_fragment.rb +0 -49
  218. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  219. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  220. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  221. data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
  222. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  223. data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
  224. data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ # Libxml2's parser has poor support for encoding detection. First, it does not recognize the
6
+ # HTML5 style meta charset declaration. Secondly, even if it successfully detects an encoding
7
+ # hint, it does not re-decode or re-parse the preceding part which may be garbled.
8
+ #
9
+ # EncodingReader aims to perform advanced encoding detection beyond what Libxml2 does, and to
10
+ # emulate rewinding of a stream and make Libxml2 redo parsing from the start when an encoding
11
+ # hint is found.
12
+
13
+ # :nodoc: all
14
+ class EncodingReader
15
+ class EncodingFound < StandardError
16
+ attr_reader :found_encoding
17
+
18
+ def initialize(encoding)
19
+ @found_encoding = encoding
20
+ super(format("encoding found: %s", encoding))
21
+ end
22
+ end
23
+
24
+ class SAXHandler < Nokogiri::XML::SAX::Document
25
+ attr_reader :encoding
26
+
27
+ def initialize
28
+ @encoding = nil
29
+ super()
30
+ end
31
+
32
+ def start_element(name, attrs = [])
33
+ return unless name == "meta"
34
+
35
+ attr = Hash[attrs]
36
+ (charset = attr["charset"]) &&
37
+ (@encoding = charset)
38
+ (http_equiv = attr["http-equiv"]) &&
39
+ http_equiv.match(/\AContent-Type\z/i) &&
40
+ (content = attr["content"]) &&
41
+ (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
42
+ (@encoding = m[1])
43
+ end
44
+ end
45
+
46
+ class JumpSAXHandler < SAXHandler
47
+ def initialize(jumptag)
48
+ @jumptag = jumptag
49
+ super()
50
+ end
51
+
52
+ def start_element(name, attrs = [])
53
+ super
54
+ throw(@jumptag, @encoding) if @encoding
55
+ throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
56
+ end
57
+ end
58
+
59
+ def self.detect_encoding(chunk)
60
+ (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
61
+ (return Nokogiri.XML(m[1]).encoding)
62
+
63
+ if Nokogiri.jruby?
64
+ (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
65
+ (return m[4])
66
+ catch(:encoding_found) do
67
+ Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
68
+ nil
69
+ end
70
+ else
71
+ handler = SAXHandler.new
72
+ parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
73
+ begin
74
+ parser << chunk
75
+ rescue
76
+ Nokogiri::SyntaxError
77
+ end
78
+ handler.encoding
79
+ end
80
+ end
81
+
82
+ def initialize(io)
83
+ @io = io
84
+ @firstchunk = nil
85
+ @encoding_found = nil
86
+ end
87
+
88
+ # This method is used by the C extension so that
89
+ # Nokogiri::HTML4::Document#read_io() does not leak memory when
90
+ # EncodingFound is raised.
91
+ attr_reader :encoding_found
92
+
93
+ def read(len)
94
+ # no support for a call without len
95
+
96
+ unless @firstchunk
97
+ (@firstchunk = @io.read(len)) || return
98
+
99
+ # This implementation expects that the first call from
100
+ # htmlReadIO() is made with a length long enough (~1KB) to
101
+ # achieve advanced encoding detection.
102
+ if (encoding = EncodingReader.detect_encoding(@firstchunk))
103
+ # The first chunk is stored for the next read in retry.
104
+ raise @encoding_found = EncodingFound.new(encoding)
105
+ end
106
+ end
107
+ @encoding_found = nil
108
+
109
+ ret = @firstchunk.slice!(0, len)
110
+ if (len -= ret.length) > 0
111
+ (rest = @io.read(len)) && ret << (rest)
112
+ end
113
+ if ret.empty?
114
+ nil
115
+ else
116
+ ret
117
+ end
118
+ end
119
+ end
120
+ end
121
+ end
@@ -1,11 +1,13 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Nokogiri
2
- module HTML
4
+ module HTML4
3
5
  class EntityDescription < Struct.new(:value, :name, :description); end
4
6
 
5
7
  class EntityLookup
6
8
  ###
7
9
  # Look up entity with +name+
8
- def [] name
10
+ def [](name)
9
11
  (val = get(name)) && val.value
10
12
  end
11
13
  end
@@ -1,17 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Nokogiri
2
- module HTML
4
+ module HTML4
3
5
  ###
4
- # Nokogiri lets you write a SAX parser to process HTML but get HTML
5
- # correction features.
6
+ # Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
6
7
  #
7
- # See Nokogiri::HTML::SAX::Parser for a basic example of using a
8
- # SAX parser with HTML.
8
+ # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
9
9
  #
10
10
  # For more information on SAX parsers, see Nokogiri::XML::SAX
11
11
  module SAX
12
12
  ###
13
- # This class lets you perform SAX style parsing on HTML with HTML
14
- # error correction.
13
+ # This class lets you perform SAX style parsing on HTML with HTML error correction.
15
14
  #
16
15
  # Here is a basic usage example:
17
16
  #
@@ -21,40 +20,42 @@ module Nokogiri
21
20
  # end
22
21
  # end
23
22
  #
24
- # parser = Nokogiri::HTML::SAX::Parser.new(MyDoc.new)
23
+ # parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
25
24
  # parser.parse(File.read(ARGV[0], mode: 'rb'))
26
25
  #
27
26
  # For more information on SAX parsers, see Nokogiri::XML::SAX
28
27
  class Parser < Nokogiri::XML::SAX::Parser
29
28
  ###
30
29
  # Parse html stored in +data+ using +encoding+
31
- def parse_memory data, encoding = 'UTF-8'
32
- raise ArgumentError unless data
33
- return unless data.length > 0
30
+ def parse_memory(data, encoding = "UTF-8")
31
+ raise TypeError unless String === data
32
+ return if data.empty?
33
+
34
34
  ctx = ParserContext.memory(data, encoding)
35
35
  yield ctx if block_given?
36
- ctx.parse_with self
36
+ ctx.parse_with(self)
37
37
  end
38
38
 
39
39
  ###
40
40
  # Parse given +io+
41
- def parse_io io, encoding = 'UTF-8'
41
+ def parse_io(io, encoding = "UTF-8")
42
42
  check_encoding(encoding)
43
43
  @encoding = encoding
44
44
  ctx = ParserContext.io(io, ENCODINGS[encoding])
45
45
  yield ctx if block_given?
46
- ctx.parse_with self
46
+ ctx.parse_with(self)
47
47
  end
48
48
 
49
49
  ###
50
50
  # Parse a file with +filename+
51
- def parse_file filename, encoding = 'UTF-8'
51
+ def parse_file(filename, encoding = "UTF-8")
52
52
  raise ArgumentError unless filename
53
53
  raise Errno::ENOENT unless File.exist?(filename)
54
54
  raise Errno::EISDIR if File.directory?(filename)
55
+
55
56
  ctx = ParserContext.file(filename, encoding)
56
57
  yield ctx if block_given?
57
- ctx.parse_with self
58
+ ctx.parse_with(self)
58
59
  end
59
60
  end
60
61
  end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ module SAX
6
+ ###
7
+ # Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
8
+ # you should be looking at Nokogiri::HTML4::SAX::Parser
9
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
10
+ def self.new(thing, encoding = "UTF-8")
11
+ if [:read, :close].all? { |x| thing.respond_to?(x) }
12
+ super
13
+ else
14
+ memory(thing, encoding)
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -1,34 +1,35 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Nokogiri
2
- module HTML
4
+ module HTML4
3
5
  module SAX
4
6
  class PushParser
5
-
6
- # The Nokogiri::HTML::SAX::Document on which the PushParser will be
7
+ # The Nokogiri::HTML4::SAX::Document on which the PushParser will be
7
8
  # operating
8
9
  attr_accessor :document
9
-
10
- def initialize(doc = HTML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
10
+
11
+ def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = "UTF-8")
11
12
  @document = doc
12
13
  @encoding = encoding
13
- @sax_parser = HTML::SAX::Parser.new(doc, @encoding)
14
+ @sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
14
15
 
15
16
  ## Create our push parser context
16
17
  initialize_native(@sax_parser, file_name, encoding)
17
18
  end
18
-
19
+
19
20
  ###
20
21
  # Write a +chunk+ of HTML to the PushParser. Any callback methods
21
22
  # that can be called will be called immediately.
22
- def write chunk, last_chunk = false
23
+ def write(chunk, last_chunk = false)
23
24
  native_write(chunk, last_chunk)
24
25
  end
25
- alias :<< :write
26
+ alias_method :<<, :write
26
27
 
27
28
  ###
28
29
  # Finish the parsing. This method is only necessary for
29
- # Nokogiri::HTML::SAX::Document#end_document to be called.
30
+ # Nokogiri::HTML4::SAX::Document#end_document to be called.
30
31
  def finish
31
- write '', true
32
+ write("", true)
32
33
  end
33
34
  end
34
35
  end
@@ -0,0 +1,47 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module Nokogiri
5
+ class << self
6
+ # :call-seq:
7
+ # HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
8
+ #
9
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
10
+ def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
11
+ Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
12
+ end
13
+ end
14
+
15
+ # Since v1.12.0
16
+ #
17
+ # 💡 Before v1.12.0, Nokogiri::HTML4 did not exist, and Nokogiri::HTML was the module/namespace
18
+ # for parsing HTML.
19
+ module HTML4
20
+ class << self
21
+ ###
22
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
23
+ def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
24
+ Document.parse(input, url, encoding, options, &block)
25
+ end
26
+
27
+ ####
28
+ # Parse a fragment from +string+ into a NodeSet.
29
+ def fragment(string, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
30
+ HTML4::DocumentFragment.parse(string, encoding, options, &block)
31
+ end
32
+ end
33
+
34
+ # Instance of Nokogiri::HTML4::EntityLookup
35
+ NamedCharacters = EntityLookup.new
36
+ end
37
+ end
38
+
39
+ require_relative "html4/entity_lookup"
40
+ require_relative "html4/document"
41
+ require_relative "html4/document_fragment"
42
+ require_relative "html4/encoding_reader"
43
+ require_relative "html4/sax/parser_context"
44
+ require_relative "html4/sax/parser"
45
+ require_relative "html4/sax/push_parser"
46
+ require_relative "html4/element_description"
47
+ require_relative "html4/element_description_defaults"
@@ -0,0 +1,168 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ require_relative "../html4/document"
21
+
22
+ module Nokogiri
23
+ module HTML5
24
+ # Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
25
+ #
26
+ # See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
27
+ # mode.
28
+ #
29
+ # Since v1.14.0
30
+ module QuirksMode
31
+ NO_QUIRKS = 0 # The document was parsed in "no-quirks" mode
32
+ QUIRKS = 1 # The document was parsed in "quirks" mode
33
+ LIMITED_QUIRKS = 2 # The document was parsed in "limited-quirks" mode
34
+ end
35
+
36
+ # Since v1.12.0
37
+ #
38
+ # 💡 HTML5 functionality is not available when running JRuby.
39
+ class Document < Nokogiri::HTML4::Document
40
+ # Get the url name for this document, as passed into Document.parse, Document.read_io, or
41
+ # Document.read_memory
42
+ attr_reader :url
43
+
44
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
45
+ #
46
+ # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::Document.new`).
47
+ #
48
+ # Since v1.14.0
49
+ attr_reader :quirks_mode
50
+
51
+ class << self
52
+ # :call-seq:
53
+ # parse(input)
54
+ # parse(input, url=nil, encoding=nil, **options)
55
+ # parse(input, url=nil, encoding=nil) { |options| ... }
56
+ #
57
+ # Parse HTML5 input.
58
+ #
59
+ # [Parameters]
60
+ # - +input+ may be a String, or any object that responds to _read_ and _close_ such as an
61
+ # IO, or StringIO.
62
+ #
63
+ # - +url+ (optional) is a String indicating the canonical URI where this document is located.
64
+ #
65
+ # - +encoding+ (optional) is the encoding that should be used when processing
66
+ # the document.
67
+ #
68
+ # - +options+ (optional) is a configuration Hash (or keyword arguments) to set options
69
+ # during parsing. The three currently supported options are +:max_errors+,
70
+ # +:max_tree_depth+ and +:max_attributes+, described at Nokogiri::HTML5.
71
+ #
72
+ # ⚠ Note that these options are different than those made available by
73
+ # Nokogiri::XML::Document and Nokogiri::HTML4::Document.
74
+ #
75
+ # - +block+ (optional) is passed a configuration Hash on which parse options may be set. See
76
+ # Nokogiri::HTML5 for more information and usage.
77
+ #
78
+ # [Returns] Nokogiri::HTML5::Document
79
+ #
80
+ def parse(string_or_io, url = nil, encoding = nil, **options, &block)
81
+ yield options if block
82
+ string_or_io = "" unless string_or_io
83
+
84
+ if string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
85
+ encoding ||= string_or_io.encoding.name
86
+ end
87
+
88
+ if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
89
+ url ||= string_or_io.path
90
+ end
91
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
92
+ raise ArgumentError, "not a string or IO object"
93
+ end
94
+
95
+ do_parse(string_or_io, url, encoding, options)
96
+ end
97
+
98
+ # Create a new document from an IO object.
99
+ #
100
+ # 💡 Most users should prefer Document.parse to this method.
101
+ def read_io(io, url = nil, encoding = nil, **options)
102
+ raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
103
+
104
+ do_parse(io, url, encoding, options)
105
+ end
106
+
107
+ # Create a new document from a String.
108
+ #
109
+ # 💡 Most users should prefer Document.parse to this method.
110
+ def read_memory(string, url = nil, encoding = nil, **options)
111
+ raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
112
+
113
+ do_parse(string, url, encoding, options)
114
+ end
115
+
116
+ private
117
+
118
+ def do_parse(string_or_io, url, encoding, options)
119
+ string = HTML5.read_and_encode(string_or_io, encoding)
120
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
121
+ max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
122
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
123
+ doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
124
+ doc.encoding = "UTF-8"
125
+ doc
126
+ end
127
+ end
128
+
129
+ def initialize(*args) # :nodoc:
130
+ super
131
+ @url = nil
132
+ @quirks_mode = nil
133
+ end
134
+
135
+ # :call-seq:
136
+ # fragment() → Nokogiri::HTML5::DocumentFragment
137
+ # fragment(markup) → Nokogiri::HTML5::DocumentFragment
138
+ #
139
+ # Parse a HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
140
+ #
141
+ # [Properties]
142
+ # - +markup+ (String) The HTML5 markup fragment to be parsed
143
+ #
144
+ # [Returns]
145
+ # Nokogiri::HTML5::DocumentFragment. This object's children will be empty if `markup` is not passed, is empty, or is `nil`.
146
+ #
147
+ def fragment(markup = nil)
148
+ DocumentFragment.new(self, markup)
149
+ end
150
+
151
+ def to_xml(options = {}, &block) # :nodoc:
152
+ # Bypass XML::Document#to_xml which doesn't add
153
+ # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
154
+ XML::Node.instance_method(:to_xml).bind_call(self, options, &block)
155
+ end
156
+
157
+ # :call-seq:
158
+ # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
159
+ #
160
+ # [Returns] The document type which determines CSS-to-XPath translation.
161
+ #
162
+ # See CSS::XPathVisitor for more information.
163
+ def xpath_doctype
164
+ Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
165
+ end
166
+ end
167
+ end
168
+ end
@@ -0,0 +1,90 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ require_relative "../html4/document_fragment"
21
+
22
+ module Nokogiri
23
+ module HTML5
24
+ # Since v1.12.0
25
+ #
26
+ # 💡 HTML5 functionality is not available when running JRuby.
27
+ class DocumentFragment < Nokogiri::HTML4::DocumentFragment
28
+ attr_accessor :document
29
+ attr_accessor :errors
30
+
31
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
32
+ #
33
+ # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::DocumentFragment.new(doc)`).
34
+ #
35
+ # Since v1.14.0
36
+ attr_reader :quirks_mode
37
+
38
+ # Create a document fragment.
39
+ def initialize(doc, tags = nil, ctx = nil, options = {}) # rubocop:disable Lint/MissingSuper
40
+ self.document = doc
41
+ self.errors = []
42
+ return self unless tags
43
+
44
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
45
+ max_errors = options[:max_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
46
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
47
+ tags = Nokogiri::HTML5.read_and_encode(tags, nil)
48
+ Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
49
+ end
50
+
51
+ def serialize(options = {}, &block) # :nodoc:
52
+ # Bypass XML::Document.serialize which doesn't support options even
53
+ # though XML::Node.serialize does!
54
+ XML::Node.instance_method(:serialize).bind_call(self, options, &block)
55
+ end
56
+
57
+ # Parse a document fragment from +tags+, returning a Nokogiri::HTML5::DocumentFragment.
58
+ def self.parse(tags, encoding = nil, options = {})
59
+ doc = HTML5::Document.new
60
+ tags = HTML5.read_and_encode(tags, encoding)
61
+ doc.encoding = "UTF-8"
62
+ new(doc, tags, nil, options)
63
+ end
64
+
65
+ def extract_params(params) # :nodoc:
66
+ handler = params.find do |param|
67
+ ![Hash, String, Symbol].include?(param.class)
68
+ end
69
+ params -= [handler] if handler
70
+
71
+ hashes = []
72
+ while Hash === params.last || params.last.nil?
73
+ hashes << params.pop
74
+ break if params.empty?
75
+ end
76
+ ns, binds = hashes.reverse
77
+
78
+ ns ||=
79
+ begin
80
+ ns = {}
81
+ children.each { |child| ns.merge!(child.namespaces) }
82
+ ns
83
+ end
84
+
85
+ [params, handler, ns, binds]
86
+ end
87
+ end
88
+ end
89
+ end
90
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -0,0 +1,103 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ #
21
+ # TODO: this whole file should go away. maybe make it a decorator?
22
+ #
23
+ require_relative "../xml/node"
24
+
25
+ module Nokogiri
26
+ module HTML5
27
+ # Since v1.12.0
28
+ #
29
+ # 💡 HTML5 functionality is not available when running JRuby.
30
+ module Node
31
+ def inner_html(options = {})
32
+ return super(options) unless document.is_a?(HTML5::Document)
33
+
34
+ result = options[:preserve_newline] && prepend_newline? ? +"\n" : +""
35
+ result << children.map { |child| child.to_html(options) }.join
36
+ result
37
+ end
38
+
39
+ def write_to(io, *options)
40
+ return super(io, *options) unless document.is_a?(HTML5::Document)
41
+
42
+ options = options.first.is_a?(Hash) ? options.shift : {}
43
+ encoding = options[:encoding] || options[0]
44
+ if Nokogiri.jruby?
45
+ save_options = options[:save_with] || options[1]
46
+ indent_times = options[:indent] || 0
47
+ else
48
+ save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
49
+ indent_times = options[:indent] || 2
50
+ end
51
+ indent_string = (options[:indent_text] || " ") * indent_times
52
+
53
+ config = XML::Node::SaveOptions.new(save_options.to_i)
54
+ yield config if block_given?
55
+
56
+ encoding = encoding.is_a?(Encoding) ? encoding.name : encoding
57
+
58
+ config_options = config.options
59
+ if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0
60
+ # Use Nokogiri's serializing code.
61
+ native_write_to(io, encoding, indent_string, config_options)
62
+ else
63
+ # Serialize including the current node.
64
+ html = html_standard_serialize(options[:preserve_newline] || false)
65
+ encoding ||= document.encoding || Encoding::UTF_8
66
+ io << html.encode(encoding, fallback: lambda { |c| "&#x#{c.ord.to_s(16)};" })
67
+ end
68
+ end
69
+
70
+ def fragment(tags)
71
+ return super(tags) unless document.is_a?(HTML5::Document)
72
+
73
+ DocumentFragment.new(document, tags, self)
74
+ end
75
+
76
+ private
77
+
78
+ # HTML elements can have attributes that contain colons.
79
+ # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
80
+ # and tries to create an attribute in a namespace. This is especially
81
+ # annoying with attribute names like xml:lang since libxml2 will
82
+ # actually create the xml namespace if it doesn't exist already.
83
+ def add_child_node_and_reparent_attrs(node)
84
+ return super(node) unless document.is_a?(HTML5::Document)
85
+
86
+ # I'm not sure what this method is supposed to do. Reparenting
87
+ # namespaces is handled by libxml2, including child namespaces which
88
+ # this method wouldn't handle.
89
+ # https://github.com/sparklemotion/nokogiri/issues/1790
90
+ add_child_node(node)
91
+ # node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
92
+ # attr.remove
93
+ # ns = attr.namespace
94
+ # a["#{ns.prefix}:#{attr.name}"] = attr.value
95
+ # end
96
+ end
97
+ end
98
+ # Monkey patch
99
+ XML::Node.prepend(HTML5::Node)
100
+ end
101
+ end
102
+
103
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: