nokogiri 1.10.3 → 1.12.5

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (218)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/LICENSE-DEPENDENCIES.md +1173 -884
  4. data/LICENSE.md +1 -1
  5. data/README.md +176 -96
  6. data/dependencies.yml +28 -26
  7. data/ext/nokogiri/depend +38 -358
  8. data/ext/nokogiri/extconf.rb +716 -414
  9. data/ext/nokogiri/gumbo.c +584 -0
  10. data/ext/nokogiri/html4_document.c +166 -0
  11. data/ext/nokogiri/html4_element_description.c +294 -0
  12. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  13. data/ext/nokogiri/html4_sax_parser_context.c +120 -0
  14. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  15. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  16. data/ext/nokogiri/nokogiri.c +228 -91
  17. data/ext/nokogiri/nokogiri.h +191 -89
  18. data/ext/nokogiri/test_global_handlers.c +40 -0
  19. data/ext/nokogiri/xml_attr.c +15 -15
  20. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  21. data/ext/nokogiri/xml_cdata.c +13 -18
  22. data/ext/nokogiri/xml_comment.c +19 -26
  23. data/ext/nokogiri/xml_document.c +267 -195
  24. data/ext/nokogiri/xml_document_fragment.c +13 -15
  25. data/ext/nokogiri/xml_dtd.c +54 -48
  26. data/ext/nokogiri/xml_element_content.c +31 -26
  27. data/ext/nokogiri/xml_element_decl.c +22 -22
  28. data/ext/nokogiri/xml_encoding_handler.c +28 -17
  29. data/ext/nokogiri/xml_entity_decl.c +32 -30
  30. data/ext/nokogiri/xml_entity_reference.c +16 -18
  31. data/ext/nokogiri/xml_namespace.c +60 -51
  32. data/ext/nokogiri/xml_node.c +493 -407
  33. data/ext/nokogiri/xml_node_set.c +174 -162
  34. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  35. data/ext/nokogiri/xml_reader.c +197 -172
  36. data/ext/nokogiri/xml_relax_ng.c +52 -28
  37. data/ext/nokogiri/xml_sax_parser.c +112 -112
  38. data/ext/nokogiri/xml_sax_parser_context.c +105 -86
  39. data/ext/nokogiri/xml_sax_push_parser.c +36 -27
  40. data/ext/nokogiri/xml_schema.c +112 -33
  41. data/ext/nokogiri/xml_syntax_error.c +42 -21
  42. data/ext/nokogiri/xml_text.c +13 -17
  43. data/ext/nokogiri/xml_xpath_context.c +158 -73
  44. data/ext/nokogiri/xslt_stylesheet.c +158 -164
  45. data/gumbo-parser/CHANGES.md +63 -0
  46. data/gumbo-parser/Makefile +101 -0
  47. data/gumbo-parser/THANKS +27 -0
  48. data/gumbo-parser/src/Makefile +34 -0
  49. data/gumbo-parser/src/README.md +41 -0
  50. data/gumbo-parser/src/ascii.c +75 -0
  51. data/gumbo-parser/src/ascii.h +115 -0
  52. data/gumbo-parser/src/attribute.c +42 -0
  53. data/gumbo-parser/src/attribute.h +17 -0
  54. data/gumbo-parser/src/char_ref.c +22225 -0
  55. data/gumbo-parser/src/char_ref.h +29 -0
  56. data/gumbo-parser/src/char_ref.rl +2154 -0
  57. data/gumbo-parser/src/error.c +626 -0
  58. data/gumbo-parser/src/error.h +148 -0
  59. data/gumbo-parser/src/foreign_attrs.c +104 -0
  60. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  61. data/gumbo-parser/src/gumbo.h +943 -0
  62. data/gumbo-parser/src/insertion_mode.h +33 -0
  63. data/gumbo-parser/src/macros.h +91 -0
  64. data/gumbo-parser/src/parser.c +4886 -0
  65. data/gumbo-parser/src/parser.h +41 -0
  66. data/gumbo-parser/src/replacement.h +33 -0
  67. data/gumbo-parser/src/string_buffer.c +103 -0
  68. data/gumbo-parser/src/string_buffer.h +68 -0
  69. data/gumbo-parser/src/string_piece.c +48 -0
  70. data/gumbo-parser/src/svg_attrs.c +174 -0
  71. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  72. data/gumbo-parser/src/svg_tags.c +137 -0
  73. data/gumbo-parser/src/svg_tags.gperf +55 -0
  74. data/gumbo-parser/src/tag.c +222 -0
  75. data/gumbo-parser/src/tag_lookup.c +382 -0
  76. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  77. data/gumbo-parser/src/tag_lookup.h +13 -0
  78. data/gumbo-parser/src/token_buffer.c +79 -0
  79. data/gumbo-parser/src/token_buffer.h +71 -0
  80. data/gumbo-parser/src/token_type.h +17 -0
  81. data/gumbo-parser/src/tokenizer.c +3463 -0
  82. data/gumbo-parser/src/tokenizer.h +112 -0
  83. data/gumbo-parser/src/tokenizer_states.h +339 -0
  84. data/gumbo-parser/src/utf8.c +245 -0
  85. data/gumbo-parser/src/utf8.h +164 -0
  86. data/gumbo-parser/src/util.c +68 -0
  87. data/gumbo-parser/src/util.h +30 -0
  88. data/gumbo-parser/src/vector.c +111 -0
  89. data/gumbo-parser/src/vector.h +45 -0
  90. data/lib/nokogiri/css/node.rb +1 -0
  91. data/lib/nokogiri/css/parser.rb +64 -63
  92. data/lib/nokogiri/css/parser.y +3 -3
  93. data/lib/nokogiri/css/parser_extras.rb +39 -36
  94. data/lib/nokogiri/css/syntax_error.rb +2 -1
  95. data/lib/nokogiri/css/tokenizer.rb +105 -103
  96. data/lib/nokogiri/css/xpath_visitor.rb +73 -43
  97. data/lib/nokogiri/css.rb +15 -14
  98. data/lib/nokogiri/decorators/slop.rb +1 -0
  99. data/lib/nokogiri/extension.rb +31 -0
  100. data/lib/nokogiri/gumbo.rb +14 -0
  101. data/lib/nokogiri/html.rb +32 -27
  102. data/lib/nokogiri/{html → html4}/builder.rb +3 -2
  103. data/lib/nokogiri/{html → html4}/document.rb +17 -30
  104. data/lib/nokogiri/{html → html4}/document_fragment.rb +18 -17
  105. data/lib/nokogiri/{html → html4}/element_description.rb +2 -1
  106. data/lib/nokogiri/{html → html4}/element_description_defaults.rb +2 -1
  107. data/lib/nokogiri/{html → html4}/entity_lookup.rb +2 -1
  108. data/lib/nokogiri/{html → html4}/sax/parser.rb +12 -14
  109. data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
  110. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +6 -5
  111. data/lib/nokogiri/html4.rb +40 -0
  112. data/lib/nokogiri/html5/document.rb +74 -0
  113. data/lib/nokogiri/html5/document_fragment.rb +80 -0
  114. data/lib/nokogiri/html5/node.rb +93 -0
  115. data/lib/nokogiri/html5.rb +473 -0
  116. data/lib/nokogiri/jruby/dependencies.rb +20 -0
  117. data/lib/nokogiri/syntax_error.rb +1 -0
  118. data/lib/nokogiri/version/constant.rb +5 -0
  119. data/lib/nokogiri/version/info.rb +215 -0
  120. data/lib/nokogiri/version.rb +3 -109
  121. data/lib/nokogiri/xml/attr.rb +1 -0
  122. data/lib/nokogiri/xml/attribute_decl.rb +1 -0
  123. data/lib/nokogiri/xml/builder.rb +74 -32
  124. data/lib/nokogiri/xml/cdata.rb +1 -0
  125. data/lib/nokogiri/xml/character_data.rb +1 -0
  126. data/lib/nokogiri/xml/document.rb +138 -41
  127. data/lib/nokogiri/xml/document_fragment.rb +5 -6
  128. data/lib/nokogiri/xml/dtd.rb +1 -0
  129. data/lib/nokogiri/xml/element_content.rb +1 -0
  130. data/lib/nokogiri/xml/element_decl.rb +1 -0
  131. data/lib/nokogiri/xml/entity_decl.rb +1 -0
  132. data/lib/nokogiri/xml/entity_reference.rb +1 -0
  133. data/lib/nokogiri/xml/namespace.rb +1 -0
  134. data/lib/nokogiri/xml/node/save_options.rb +2 -1
  135. data/lib/nokogiri/xml/node.rb +629 -293
  136. data/lib/nokogiri/xml/node_set.rb +1 -0
  137. data/lib/nokogiri/xml/notation.rb +1 -0
  138. data/lib/nokogiri/xml/parse_options.rb +12 -3
  139. data/lib/nokogiri/xml/pp/character_data.rb +1 -0
  140. data/lib/nokogiri/xml/pp/node.rb +1 -0
  141. data/lib/nokogiri/xml/pp.rb +3 -2
  142. data/lib/nokogiri/xml/processing_instruction.rb +1 -0
  143. data/lib/nokogiri/xml/reader.rb +9 -12
  144. data/lib/nokogiri/xml/relax_ng.rb +7 -2
  145. data/lib/nokogiri/xml/sax/document.rb +25 -30
  146. data/lib/nokogiri/xml/sax/parser.rb +1 -0
  147. data/lib/nokogiri/xml/sax/parser_context.rb +1 -0
  148. data/lib/nokogiri/xml/sax/push_parser.rb +1 -0
  149. data/lib/nokogiri/xml/sax.rb +5 -4
  150. data/lib/nokogiri/xml/schema.rb +13 -4
  151. data/lib/nokogiri/xml/searchable.rb +25 -16
  152. data/lib/nokogiri/xml/syntax_error.rb +1 -0
  153. data/lib/nokogiri/xml/text.rb +1 -0
  154. data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
  155. data/lib/nokogiri/xml/xpath.rb +4 -5
  156. data/lib/nokogiri/xml/xpath_context.rb +1 -0
  157. data/lib/nokogiri/xml.rb +36 -36
  158. data/lib/nokogiri/xslt/stylesheet.rb +2 -1
  159. data/lib/nokogiri/xslt.rb +17 -16
  160. data/lib/nokogiri.rb +32 -51
  161. data/lib/xsd/xmlparser/nokogiri.rb +1 -0
  162. data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
  163. data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
  164. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
  165. data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
  166. data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
  167. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2511 -0
  168. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +31 -0
  169. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2511 -0
  170. data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +19 -0
  171. data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
  172. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  173. metadata +151 -153
  174. data/ext/nokogiri/html_document.c +0 -170
  175. data/ext/nokogiri/html_document.h +0 -10
  176. data/ext/nokogiri/html_element_description.c +0 -279
  177. data/ext/nokogiri/html_element_description.h +0 -10
  178. data/ext/nokogiri/html_entity_lookup.c +0 -32
  179. data/ext/nokogiri/html_entity_lookup.h +0 -8
  180. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  181. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  182. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  183. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  184. data/ext/nokogiri/xml_attr.h +0 -9
  185. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  186. data/ext/nokogiri/xml_cdata.h +0 -9
  187. data/ext/nokogiri/xml_comment.h +0 -9
  188. data/ext/nokogiri/xml_document.h +0 -23
  189. data/ext/nokogiri/xml_document_fragment.h +0 -10
  190. data/ext/nokogiri/xml_dtd.h +0 -10
  191. data/ext/nokogiri/xml_element_content.h +0 -10
  192. data/ext/nokogiri/xml_element_decl.h +0 -9
  193. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  194. data/ext/nokogiri/xml_entity_decl.h +0 -10
  195. data/ext/nokogiri/xml_entity_reference.h +0 -9
  196. data/ext/nokogiri/xml_io.c +0 -61
  197. data/ext/nokogiri/xml_io.h +0 -11
  198. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  199. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  200. data/ext/nokogiri/xml_namespace.h +0 -14
  201. data/ext/nokogiri/xml_node.h +0 -13
  202. data/ext/nokogiri/xml_node_set.h +0 -12
  203. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  204. data/ext/nokogiri/xml_reader.h +0 -10
  205. data/ext/nokogiri/xml_relax_ng.h +0 -9
  206. data/ext/nokogiri/xml_sax_parser.h +0 -39
  207. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  208. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  209. data/ext/nokogiri/xml_schema.h +0 -9
  210. data/ext/nokogiri/xml_syntax_error.h +0 -13
  211. data/ext/nokogiri/xml_text.h +0 -9
  212. data/ext/nokogiri/xml_xpath_context.h +0 -10
  213. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  214. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  215. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  216. data/patches/libxslt/0001-Fix-security-framework-bypass.patch +0 -120
  217. data/ports/archives/libxml2-2.9.9.tar.gz +0 -0
  218. data/ports/archives/libxslt-1.1.33.tar.gz +0 -0
@@ -1,5 +1,6 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  ###
4
5
  # Nokogiri HTML builder is used for building HTML documents. It is very
5
6
  # similar to the Nokogiri::XML::Builder. In fact, you should go read the
@@ -11,7 +12,7 @@ module Nokogiri
11
12
  # Create an HTML document with a body that has an onload attribute, and a
12
13
  # span tag with a class of "bold" that has content of "Hello world".
13
14
  #
14
- # builder = Nokogiri::HTML::Builder.new do |doc|
15
+ # builder = Nokogiri::HTML4::Builder.new do |doc|
15
16
  # doc.html {
16
17
  # doc.body(:onload => 'some_func();') {
17
18
  # doc.span.bold {
@@ -1,5 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+
1
5
  module Nokogiri
2
- module HTML
6
+ module HTML4
3
7
  class Document < Nokogiri::XML::Document
4
8
  ###
5
9
  # Get the meta tag encoding for this document. If there is no meta tag,
@@ -160,11 +164,12 @@ module Nokogiri
160
164
  # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
161
165
  # Nokogiri::XML::ParseOptions.
162
166
  def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
163
-
164
167
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
165
- # Give the options to the user
168
+
166
169
  yield options if block_given?
167
170
 
171
+ url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
172
+
168
173
  if string_or_io.respond_to?(:encoding)
169
174
  unless string_or_io.encoding.name == "ASCII-8BIT"
170
175
  encoding ||= string_or_io.encoding.name
@@ -172,7 +177,12 @@ module Nokogiri
172
177
  end
173
178
 
174
179
  if string_or_io.respond_to?(:read)
175
- url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
180
+ if string_or_io.is_a?(Pathname)
181
+ # resolve the Pathname to the file and open it as an IO object, see #2110
182
+ string_or_io = string_or_io.expand_path.open
183
+ url ||= string_or_io.path
184
+ end
185
+
176
186
  unless encoding
177
187
  # Libxml2's parser has poor support for encoding
178
188
  # detection. First, it does not recognize the HTML5
@@ -251,9 +261,6 @@ module Nokogiri
251
261
  end
252
262
 
253
263
  def self.detect_encoding(chunk)
254
- if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
255
- return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
256
- end
257
264
  m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
258
265
  return Nokogiri.XML(m[1]).encoding
259
266
 
@@ -261,37 +268,17 @@ module Nokogiri
261
268
  m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
262
269
  return m[4]
263
270
  catch(:encoding_found) {
264
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
271
+ Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
265
272
  nil
266
273
  }
267
274
  else
268
275
  handler = SAXHandler.new
269
- parser = Nokogiri::HTML::SAX::PushParser.new(handler)
276
+ parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
270
277
  parser << chunk rescue Nokogiri::SyntaxError
271
278
  handler.encoding
272
279
  end
273
280
  end
274
281
 
275
- def self.is_jruby_without_fix?
276
- JRUBY_VERSION.split('.').join.to_i < 165
277
- end
278
-
279
- def self.detect_encoding_for_jruby_without_fix(chunk)
280
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
281
- return Nokogiri.XML(m[1]).encoding
282
-
283
- m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
284
- return m[4]
285
-
286
- catch(:encoding_found) {
287
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
288
- nil
289
- }
290
- rescue Nokogiri::SyntaxError, RuntimeError
291
- # Ignore parser errors that nokogiri may raise
292
- nil
293
- end
294
-
295
282
  def initialize(io)
296
283
  @io = io
297
284
  @firstchunk = nil
@@ -299,7 +286,7 @@ module Nokogiri
299
286
  end
300
287
 
301
288
  # This method is used by the C extension so that
302
- # Nokogiri::HTML::Document#read_io() does not leak memory when
289
+ # Nokogiri::HTML4::Document#read_io() does not leak memory when
303
290
  # EncodingFound is raised.
304
291
  attr_reader :encoding_found
305
292
 
@@ -1,28 +1,29 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  class DocumentFragment < Nokogiri::XML::DocumentFragment
4
5
  ####
5
6
  # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
6
- def self.parse tags, encoding = nil
7
- doc = HTML::Document.new
7
+ def self.parse(tags, encoding = nil)
8
+ doc = HTML4::Document.new
8
9
 
9
10
  encoding ||= if tags.respond_to?(:encoding)
10
- encoding = tags.encoding
11
- if encoding == ::Encoding::ASCII_8BIT
12
- 'UTF-8'
13
- else
14
- encoding.name
15
- end
16
- else
17
- 'UTF-8'
18
- end
11
+ encoding = tags.encoding
12
+ if encoding == ::Encoding::ASCII_8BIT
13
+ 'UTF-8'
14
+ else
15
+ encoding.name
16
+ end
17
+ else
18
+ 'UTF-8'
19
+ end
19
20
 
20
21
  doc.encoding = encoding
21
22
 
22
23
  new(doc, tags)
23
24
  end
24
25
 
25
- def initialize document, tags = nil, ctx = nil
26
+ def initialize(document, tags = nil, ctx = nil)
26
27
  return self unless tags
27
28
 
28
29
  if ctx
@@ -32,13 +33,13 @@ module Nokogiri
32
33
  self.errors = document.errors - preexisting_errors
33
34
  else
34
35
  # This is a horrible hack, but I don't care
35
- if tags.strip =~ /^<body/i
36
- path = "/html/body"
36
+ path = if /^\s*?<body/i.match?(tags)
37
+ "/html/body"
37
38
  else
38
- path = "/html/body/node()"
39
+ "/html/body/node()"
39
40
  end
40
41
 
41
- temp_doc = HTML::Document.parse "<html><body>#{tags}", nil, document.encoding
42
+ temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding)
42
43
  temp_doc.xpath(path).each { |child| child.parent = self }
43
44
  self.errors = temp_doc.errors
44
45
  end
@@ -1,5 +1,6 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  class ElementDescription
4
5
  ###
5
6
  # Is this element a block element?
@@ -1,5 +1,6 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  class ElementDescription
4
5
 
5
6
  # Methods are defined protected by method_defined? because at
@@ -1,5 +1,6 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  class EntityDescription < Struct.new(:value, :name, :description); end
4
5
 
5
6
  class EntityLookup
@@ -1,17 +1,15 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  ###
4
- # Nokogiri lets you write a SAX parser to process HTML but get HTML
5
- # correction features.
5
+ # Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
6
6
  #
7
- # See Nokogiri::HTML::SAX::Parser for a basic example of using a
8
- # SAX parser with HTML.
7
+ # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
9
8
  #
10
9
  # For more information on SAX parsers, see Nokogiri::XML::SAX
11
10
  module SAX
12
11
  ###
13
- # This class lets you perform SAX style parsing on HTML with HTML
14
- # error correction.
12
+ # This class lets you perform SAX style parsing on HTML with HTML error correction.
15
13
  #
16
14
  # Here is a basic usage example:
17
15
  #
@@ -21,40 +19,40 @@ module Nokogiri
21
19
  # end
22
20
  # end
23
21
  #
24
- # parser = Nokogiri::HTML::SAX::Parser.new(MyDoc.new)
22
+ # parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
25
23
  # parser.parse(File.read(ARGV[0], mode: 'rb'))
26
24
  #
27
25
  # For more information on SAX parsers, see Nokogiri::XML::SAX
28
26
  class Parser < Nokogiri::XML::SAX::Parser
29
27
  ###
30
28
  # Parse html stored in +data+ using +encoding+
31
- def parse_memory data, encoding = 'UTF-8'
29
+ def parse_memory(data, encoding = "UTF-8")
32
30
  raise ArgumentError unless data
33
31
  return unless data.length > 0
34
32
  ctx = ParserContext.memory(data, encoding)
35
33
  yield ctx if block_given?
36
- ctx.parse_with self
34
+ ctx.parse_with(self)
37
35
  end
38
36
 
39
37
  ###
40
38
  # Parse given +io+
41
- def parse_io io, encoding = 'UTF-8'
39
+ def parse_io(io, encoding = "UTF-8")
42
40
  check_encoding(encoding)
43
41
  @encoding = encoding
44
42
  ctx = ParserContext.io(io, ENCODINGS[encoding])
45
43
  yield ctx if block_given?
46
- ctx.parse_with self
44
+ ctx.parse_with(self)
47
45
  end
48
46
 
49
47
  ###
50
48
  # Parse a file with +filename+
51
- def parse_file filename, encoding = 'UTF-8'
49
+ def parse_file(filename, encoding = "UTF-8")
52
50
  raise ArgumentError unless filename
53
51
  raise Errno::ENOENT unless File.exist?(filename)
54
52
  raise Errno::EISDIR if File.directory?(filename)
55
53
  ctx = ParserContext.file(filename, encoding)
56
54
  yield ctx if block_given?
57
- ctx.parse_with self
55
+ ctx.parse_with(self)
58
56
  end
59
57
  end
60
58
  end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ module HTML4
4
+ module SAX
5
+ ###
6
+ # Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
7
+ # you should be looking at Nokogiri::HTML4::SAX::Parser
8
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
9
+ def self.new(thing, encoding = "UTF-8")
10
+ if [:read, :close].all? { |x| thing.respond_to?(x) }
11
+ super
12
+ else
13
+ memory(thing, encoding)
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -1,16 +1,17 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  module SAX
4
5
  class PushParser
5
6
 
6
- # The Nokogiri::HTML::SAX::Document on which the PushParser will be
7
+ # The Nokogiri::HTML4::SAX::Document on which the PushParser will be
7
8
  # operating
8
9
  attr_accessor :document
9
10
 
10
- def initialize(doc = HTML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
11
+ def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
11
12
  @document = doc
12
13
  @encoding = encoding
13
- @sax_parser = HTML::SAX::Parser.new(doc, @encoding)
14
+ @sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
14
15
 
15
16
  ## Create our push parser context
16
17
  initialize_native(@sax_parser, file_name, encoding)
@@ -26,7 +27,7 @@ module Nokogiri
26
27
 
27
28
  ###
28
29
  # Finish the parsing. This method is only necessary for
29
- # Nokogiri::HTML::SAX::Document#end_document to be called.
30
+ # Nokogiri::HTML4::SAX::Document#end_document to be called.
30
31
  def finish
31
32
  write '', true
32
33
  end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ class << self
4
+ ###
5
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
6
+ def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
7
+ Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
8
+ end
9
+ end
10
+
11
+ # @since v1.12.0
12
+ # @note Before v1.12.0, {Nokogiri::HTML4} did not exist, and {Nokogiri::HTML} was the module/namespace for parsing HTML.
13
+ module HTML4
14
+ class << self
15
+ ###
16
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
17
+ def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
18
+ Document.parse(input, url, encoding, options, &block)
19
+ end
20
+
21
+ ####
22
+ # Parse a fragment from +string+ into a NodeSet.
23
+ def fragment(string, encoding = nil)
24
+ HTML4::DocumentFragment.parse(string, encoding)
25
+ end
26
+ end
27
+
28
+ # Instance of Nokogiri::HTML4::EntityLookup
29
+ NamedCharacters = EntityLookup.new
30
+ end
31
+ end
32
+
33
+ require_relative "html4/entity_lookup"
34
+ require_relative "html4/document"
35
+ require_relative "html4/document_fragment"
36
+ require_relative "html4/sax/parser_context"
37
+ require_relative "html4/sax/parser"
38
+ require_relative "html4/sax/push_parser"
39
+ require_relative "html4/element_description"
40
+ require_relative "html4/element_description_defaults"
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+ #
3
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ require_relative "../html4/document"
19
+
20
+ module Nokogiri
21
+ module HTML5
22
+ # @since v1.12.0
23
+ # @note HTML5 functionality is not available when running JRuby.
24
+ class Document < Nokogiri::HTML4::Document
25
+ def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
26
+ yield options if block_given?
27
+ string_or_io = '' unless string_or_io
28
+
29
+ if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
30
+ encoding ||= string_or_io.encoding.name
31
+ end
32
+
33
+ if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
34
+ url ||= string_or_io.path
35
+ end
36
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
37
+ raise ArgumentError.new("not a string or IO object")
38
+ end
39
+ do_parse(string_or_io, url, encoding, options)
40
+ end
41
+
42
+ def self.read_io(io, url = nil, encoding = nil, **options)
43
+ raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
44
+ do_parse(io, url, encoding, options)
45
+ end
46
+
47
+ def self.read_memory(string, url = nil, encoding = nil, **options)
48
+ raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
49
+ do_parse(string, url, encoding, options)
50
+ end
51
+
52
+ def fragment(tags = nil)
53
+ DocumentFragment.new(self, tags, self.root)
54
+ end
55
+
56
+ def to_xml(options = {}, &block)
57
+ # Bypass XML::Document#to_xml which doesn't add
58
+ # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
59
+ XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
60
+ end
61
+
62
+ private
63
+ def self.do_parse(string_or_io, url, encoding, options)
64
+ string = HTML5.read_and_encode(string_or_io, encoding)
65
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
66
+ max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
67
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
68
+ doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
69
+ doc.encoding = 'UTF-8'
70
+ doc
71
+ end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+ #
3
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ require_relative "../html4/document_fragment"
19
+
20
+ module Nokogiri
21
+ module HTML5
22
+ # @since v1.12.0
23
+ # @note HTML5 functionality is not available when running JRuby.
24
+ class DocumentFragment < Nokogiri::HTML4::DocumentFragment
25
+ attr_accessor :document
26
+ attr_accessor :errors
27
+
28
+ # Create a document fragment.
29
+ def initialize(doc, tags = nil, ctx = nil, options = {})
30
+ self.document = doc
31
+ self.errors = []
32
+ return self unless tags
33
+
34
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
35
+ max_errors = options[:max_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
36
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
37
+ tags = Nokogiri::HTML5.read_and_encode(tags, nil)
38
+ Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
39
+ end
40
+
41
+ def serialize(options = {}, &block)
42
+ # Bypass XML::Document.serialize which doesn't support options even
43
+ # though XML::Node.serialize does!
44
+ XML::Node.instance_method(:serialize).bind(self).call(options, &block)
45
+ end
46
+
47
+ # Parse a document fragment from +tags+, returning a NodeSet.
48
+ def self.parse(tags, encoding = nil, options = {})
49
+ doc = HTML5::Document.new
50
+ tags = HTML5.read_and_encode(tags, encoding)
51
+ doc.encoding = "UTF-8"
52
+ new(doc, tags, nil, options)
53
+ end
54
+
55
+ def extract_params(params) # :nodoc:
56
+ handler = params.find do |param|
57
+ ![Hash, String, Symbol].include?(param.class)
58
+ end
59
+ params -= [handler] if handler
60
+
61
+ hashes = []
62
+ while Hash === params.last || params.last.nil?
63
+ hashes << params.pop
64
+ break if params.empty?
65
+ end
66
+ ns, binds = hashes.reverse
67
+
68
+ ns ||=
69
+ begin
70
+ ns = {}
71
+ children.each { |child| ns.merge!(child.namespaces) }
72
+ ns
73
+ end
74
+
75
+ [params, handler, ns, binds]
76
+ end
77
+ end
78
+ end
79
+ end
80
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+ #
3
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ require_relative "../xml/node"
19
+
20
+ module Nokogiri
21
+ module HTML5
22
+ # @since v1.12.0
23
+ # @note HTML5 functionality is not available when running JRuby.
24
+ module Node
25
+ def inner_html(options = {})
26
+ return super(options) unless document.is_a?(HTML5::Document)
27
+ result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? String.new("\n") : String.new
28
+ result << children.map { |child| child.to_html(options) }.join
29
+ result
30
+ end
31
+
32
+ def write_to(io, *options)
33
+ return super(io, *options) unless document.is_a?(HTML5::Document)
34
+ options = options.first.is_a?(Hash) ? options.shift : {}
35
+ encoding = options[:encoding] || options[0]
36
+ if Nokogiri.jruby?
37
+ save_options = options[:save_with] || options[1]
38
+ indent_times = options[:indent] || 0
39
+ else
40
+ save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
41
+ indent_times = options[:indent] || 2
42
+ end
43
+ indent_string = (options[:indent_text] || " ") * indent_times
44
+
45
+ config = XML::Node::SaveOptions.new(save_options.to_i)
46
+ yield config if block_given?
47
+
48
+ config_options = config.options
49
+ if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0
50
+ # Use Nokogiri's serializing code.
51
+ native_write_to(io, encoding, indent_string, config_options)
52
+ else
53
+ # Serialize including the current node.
54
+ encoding ||= document.encoding || Encoding::UTF_8
55
+ internal_ops = {
56
+ preserve_newline: options[:preserve_newline] || false,
57
+ }
58
+ HTML5.serialize_node_internal(self, io, encoding, internal_ops)
59
+ end
60
+ end
61
+
62
+ def fragment(tags)
63
+ return super(tags) unless document.is_a?(HTML5::Document)
64
+ DocumentFragment.new(document, tags, self)
65
+ end
66
+
67
+ private
68
+
69
+ # HTML elements can have attributes that contain colons.
70
+ # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
71
+ # and tries to create an attribute in a namespace. This is especially
72
+ # annoying with attribute names like xml:lang since libxml2 will
73
+ # actually create the xml namespace if it doesn't exist already.
74
+ def add_child_node_and_reparent_attrs(node)
75
+ return super(node) unless document.is_a?(HTML5::Document)
76
+ # I'm not sure what this method is supposed to do. Reparenting
77
+ # namespaces is handled by libxml2, including child namespaces which
78
+ # this method wouldn't handle.
79
+ # https://github.com/sparklemotion/nokogiri/issues/1790
80
+ add_child_node(node)
81
+ # node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
82
+ # attr.remove
83
+ # ns = attr.namespace
84
+ # a["#{ns.prefix}:#{attr.name}"] = attr.value
85
+ # end
86
+ end
87
+ end
88
+ # Monkey patch
89
+ XML::Node.prepend(HTML5::Node)
90
+ end
91
+ end
92
+
93
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: