nokogiri 1.11.7-x86-linux → 1.12.2-x86-linux

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE-DEPENDENCIES.md +243 -22
  3. data/LICENSE.md +1 -1
  4. data/README.md +6 -5
  5. data/ext/nokogiri/depend +35 -34
  6. data/ext/nokogiri/extconf.rb +181 -103
  7. data/ext/nokogiri/gumbo.c +584 -0
  8. data/ext/nokogiri/{html_document.c → html4_document.c} +8 -8
  9. data/ext/nokogiri/{html_element_description.c → html4_element_description.c} +20 -18
  10. data/ext/nokogiri/{html_entity_lookup.c → html4_entity_lookup.c} +7 -7
  11. data/ext/nokogiri/{html_sax_parser_context.c → html4_sax_parser_context.c} +5 -5
  12. data/ext/nokogiri/{html_sax_push_parser.c → html4_sax_push_parser.c} +4 -4
  13. data/ext/nokogiri/libxml2_backwards_compat.c +30 -30
  14. data/ext/nokogiri/nokogiri.c +51 -38
  15. data/ext/nokogiri/nokogiri.h +16 -9
  16. data/ext/nokogiri/xml_document.c +13 -13
  17. data/ext/nokogiri/xml_element_content.c +2 -0
  18. data/ext/nokogiri/xml_encoding_handler.c +11 -6
  19. data/ext/nokogiri/xml_namespace.c +2 -0
  20. data/ext/nokogiri/xml_node.c +102 -102
  21. data/ext/nokogiri/xml_node_set.c +20 -20
  22. data/ext/nokogiri/xml_reader.c +2 -0
  23. data/ext/nokogiri/xml_sax_parser.c +6 -6
  24. data/ext/nokogiri/xml_sax_parser_context.c +2 -0
  25. data/ext/nokogiri/xml_schema.c +2 -0
  26. data/ext/nokogiri/xml_xpath_context.c +67 -65
  27. data/ext/nokogiri/xslt_stylesheet.c +2 -1
  28. data/gumbo-parser/CHANGES.md +63 -0
  29. data/gumbo-parser/Makefile +101 -0
  30. data/gumbo-parser/THANKS +27 -0
  31. data/lib/nokogiri.rb +31 -29
  32. data/lib/nokogiri/2.5/nokogiri.so +0 -0
  33. data/lib/nokogiri/2.6/nokogiri.so +0 -0
  34. data/lib/nokogiri/2.7/nokogiri.so +0 -0
  35. data/lib/nokogiri/3.0/nokogiri.so +0 -0
  36. data/lib/nokogiri/css.rb +14 -14
  37. data/lib/nokogiri/css/parser.rb +1 -1
  38. data/lib/nokogiri/css/parser.y +1 -1
  39. data/lib/nokogiri/css/syntax_error.rb +1 -1
  40. data/lib/nokogiri/extension.rb +7 -2
  41. data/lib/nokogiri/gumbo.rb +14 -0
  42. data/lib/nokogiri/html.rb +31 -27
  43. data/lib/nokogiri/html4.rb +40 -0
  44. data/lib/nokogiri/{html → html4}/builder.rb +2 -2
  45. data/lib/nokogiri/{html → html4}/document.rb +4 -4
  46. data/lib/nokogiri/{html → html4}/document_fragment.rb +3 -3
  47. data/lib/nokogiri/{html → html4}/element_description.rb +1 -1
  48. data/lib/nokogiri/{html → html4}/element_description_defaults.rb +1 -1
  49. data/lib/nokogiri/{html → html4}/entity_lookup.rb +1 -1
  50. data/lib/nokogiri/{html → html4}/sax/parser.rb +11 -14
  51. data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
  52. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +5 -5
  53. data/lib/nokogiri/html5.rb +473 -0
  54. data/lib/nokogiri/html5/document.rb +74 -0
  55. data/lib/nokogiri/html5/document_fragment.rb +80 -0
  56. data/lib/nokogiri/html5/node.rb +93 -0
  57. data/lib/nokogiri/version/constant.rb +1 -1
  58. data/lib/nokogiri/version/info.rb +11 -2
  59. data/lib/nokogiri/xml.rb +35 -36
  60. data/lib/nokogiri/xml/node.rb +6 -5
  61. data/lib/nokogiri/xml/parse_options.rb +2 -0
  62. data/lib/nokogiri/xml/pp.rb +2 -2
  63. data/lib/nokogiri/xml/sax.rb +4 -4
  64. data/lib/nokogiri/xml/sax/document.rb +24 -30
  65. data/lib/nokogiri/xml/xpath.rb +2 -2
  66. data/lib/nokogiri/xslt.rb +16 -16
  67. data/lib/nokogiri/xslt/stylesheet.rb +1 -1
  68. metadata +35 -35
  69. data/lib/nokogiri/html/sax/parser_context.rb +0 -17
@@ -0,0 +1,473 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+ #
4
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ #
18
+
19
+ require_relative 'html5/document'
20
+ require_relative 'html5/document_fragment'
21
+ require_relative 'html5/node'
22
+
23
+ module Nokogiri
24
# Parse an HTML5 document from +input+.
#
# Thin convenience wrapper: every argument, the keyword options and the
# optional block are forwarded unchanged to {Nokogiri::HTML5::Document.parse},
# and its return value is returned.
#
# @since v1.12.0
# @note HTML5 functionality is not available when running JRuby.
def self.HTML5(input, url = nil, encoding = nil, **options, &block)
  Nokogiri::HTML5::Document.parse(input, url, encoding, **options, &block)
end
30
+
31
+ # == Usage
32
+ #
33
+ # Parse an HTML5 document:
34
+ #
35
+ # doc = Nokogiri.HTML5(string)
36
+ #
37
+ # Parse an HTML5 fragment:
38
+ #
39
+ # fragment = Nokogiri::HTML5.fragment(string)
40
+ #
41
+ # == Parsing options
42
+ #
43
+ # The document and fragment parsing methods support options that are different from Nokogiri's.
44
+ #
45
+ # - <tt>Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})</tt>
46
+ # - <tt>Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})</tt>
47
+ # - <tt>Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})</tt>
48
+ # - <tt>Nokogiri::HTML5.fragment(html, encoding = nil, options = {})</tt>
49
+ # - <tt>Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})</tt>
50
+ #
51
+ # The three currently supported options are +:max_errors+, +:max_tree_depth+ and
52
+ # +:max_attributes+, described below.
53
+ #
54
+ # === Error reporting
55
+ #
56
+ # Nokogiri contains an experimental HTML5 parse error reporting facility. By default, no parse
57
+ # errors are reported but this can be configured by passing the +:max_errors+ option to
58
+ # {HTML5.parse} or {HTML5.fragment}.
59
+ #
60
+ # For example, this script:
61
+ #
62
+ # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
63
+ # doc.errors.each do |err|
64
+ # puts(err)
65
+ # end
66
+ #
67
+ # Emits:
68
+ #
69
+ # 1:1: ERROR: Expected a doctype token
70
+ # <span/>Hi there!</span foo=bar />
71
+ # ^
72
+ # 1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'.
73
+ # <span/>Hi there!</span foo=bar />
74
+ # ^
75
+ # 1:17: ERROR: End tag ends with '/>', use '>'.
76
+ # <span/>Hi there!</span foo=bar />
77
+ # ^
78
+ # 1:17: ERROR: End tag contains attributes.
79
+ # <span/>Hi there!</span foo=bar />
80
+ # ^
81
+ #
82
+ # Using <tt>max_errors: -1</tt> results in an unlimited number of errors being returned.
83
+ #
84
+ # The errors returned by {HTML5::Document#errors} are instances of {Nokogiri::XML::SyntaxError}.
85
+ #
86
+ # The {https://html.spec.whatwg.org/multipage/parsing.html#parse-errors HTML standard} defines a
87
+ # number of standard parse error codes. These error codes only cover the "tokenization" stage of
88
+ # parsing HTML. The parse errors in the "tree construction" stage do not have standardized error
89
+ # codes (yet).
90
+ #
91
+ # As a convenience to Nokogiri users, the defined error codes are available via
92
+ # {Nokogiri::XML::SyntaxError#str1} method.
93
+ #
94
+ # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
95
+ # doc.errors.each do |err|
96
+ # puts("#{err.line}:#{err.column}: #{err.str1}")
97
+ # end
98
+ # # => 1:1: generic-parser
99
+ # # 1:1: non-void-html-element-start-tag-with-trailing-solidus
100
+ # # 1:17: end-tag-with-trailing-solidus
101
+ # # 1:17: end-tag-with-attributes
102
+ #
103
+ # Note that the first error is +generic-parser+ because it's an error from the tree construction
104
+ # stage and doesn't have a standardized error code.
105
+ #
106
+ # For the purposes of semantic versioning, the error messages, error locations, and error codes
107
+ # are not part of Nokogiri's public API. That is, these are subject to change without Nokogiri's
108
+ # major version number changing. These may be stabilized in the future.
109
+ #
110
+ # === Maximum tree depth
111
+ #
112
+ # The maximum depth of the DOM tree parsed by the various parsing methods is configurable by the
113
+ # +:max_tree_depth+ option. If the depth of the tree would exceed this limit, then an
114
+ # {::ArgumentError} is thrown.
115
+ #
116
+ # This limit (which defaults to <tt>Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH = 400</tt>) can be
117
+ # removed by giving the option <tt>max_tree_depth: -1</tt>.
118
+ #
119
+ # html = '<!DOCTYPE html>' + '<div>' * 1000
120
+ # doc = Nokogiri.HTML5(html)
121
+ # # raises ArgumentError: Document tree depth limit exceeded
122
+ # doc = Nokogiri.HTML5(html, max_tree_depth: -1)
123
+ #
124
+ # === Attribute limit per element
125
+ #
126
+ # The maximum number of attributes per DOM element is configurable by the +:max_attributes+
127
+ # option. If a given element would exceed this limit, then an {::ArgumentError} is thrown.
128
+ #
129
+ # This limit (which defaults to <tt>Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES = 400</tt>) can be
130
+ # removed by giving the option <tt>max_attributes: -1</tt>.
131
+ #
132
+ # html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
133
+ # # "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
134
+ # doc = Nokogiri.HTML5(html)
135
+ # # raises ArgumentError: Attributes per element limit exceeded
136
+ # doc = Nokogiri.HTML5(html, max_attributes: -1)
137
+ #
138
+ # == HTML Serialization
139
+ #
140
+ # After parsing HTML, it may be serialized using any of the {Nokogiri::XML::Node} serialization
141
+ # methods. In particular, {XML::Node#serialize}, {XML::Node#to_html}, and {XML::Node#to_s} will
142
+ # serialize a given node and its children. (This is the equivalent of JavaScript's
143
+ # +Element.outerHTML+.) Similarly, {XML::Node#inner_html} will serialize the children of a given
144
+ # node. (This is the equivalent of JavaScript's +Element.innerHTML+.)
145
+ #
146
+ # doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
147
+ # puts doc.serialize
148
+ # # => <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
149
+ #
150
+ # Due to quirks in how HTML is parsed and serialized, it's possible for a DOM tree to be
151
+ # serialized and then re-parsed, resulting in a different DOM. Mostly, this happens with DOMs
152
+ # produced from invalid HTML. Unfortunately, even valid HTML may not survive serialization and
153
+ # re-parsing.
154
+ #
155
+ # In particular, a newline at the start of +pre+, +listing+, and +textarea+ elements is ignored by
156
+ # the parser.
157
+ #
158
+ # doc = Nokogiri::HTML5(<<-EOF)
159
+ # <!DOCTYPE html>
160
+ # <pre>
161
+ # Content</pre>
162
+ # EOF
163
+ # puts doc.at('/html/body/pre').serialize
164
+ # # => <pre>Content</pre>
165
+ #
166
+ # In this case, the original HTML is semantically equivalent to the serialized version. If the
167
+ # +pre+, +listing+, or +textarea+ content starts with two newlines, the first newline will be
168
+ # stripped on the first parse and the second newline will be stripped on the second, leading to
169
+ # semantically different DOMs. Passing the parameter <tt>preserve_newline: true</tt> will cause
170
+ # two or more newlines to be preserved. (A single leading newline will still be removed.)
171
+ #
172
+ # doc = Nokogiri::HTML5(<<-EOF)
173
+ # <!DOCTYPE html>
174
+ # <listing>
175
+ #
176
+ # Content</listing>
177
+ # EOF
178
+ # puts doc.at('/html/body/listing').serialize(preserve_newline: true)
179
+ # # => <listing>
180
+ # #
181
+ # # Content</listing>
182
+ #
183
+ # == Encodings
184
+ #
185
+ # Nokogiri always parses HTML5 using {https://en.wikipedia.org/wiki/UTF-8 UTF-8}; however, the
186
+ # encoding of the input can be explicitly selected via the optional +encoding+ parameter. This is
187
+ # most useful when the input comes not from a string but from an IO object.
188
+ #
189
+ # When serializing a document or node, the encoding of the output string can be specified via the
190
+ # +:encoding+ options. Characters that cannot be encoded in the selected encoding will be encoded
191
+ # as {https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references HTML numeric
192
+ # entities}.
193
+ #
194
+ # frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
195
+ # html = frag.serialize(encoding: 'US-ASCII')
196
+ # puts html
197
+ # # => <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
198
+ # frag = Nokogiri::HTML5.fragment(html)
199
+ # puts frag.serialize
200
+ # # => <span>아는 길도 물어가라</span>
201
+ #
202
+ # (There's a {https://bugs.ruby-lang.org/issues/15033 bug} in all current versions of Ruby that
203
+ # can cause the entity encoding to fail. Of the mandated supported encodings for HTML, the only
204
+ # encoding we're aware of that has this bug is <tt>'ISO-2022-JP'</tt>. We recommend avoiding this
205
+ # encoding.)
206
+ #
207
+ # == Notes
208
+ #
209
+ # * The {Nokogiri::HTML5.fragment} function takes a string and parses it
210
+ # as an HTML5 document. The +<html>+, +<head>+, and +<body>+ elements are
211
+ # removed from this document, and any children of these elements that remain
212
+ # are returned as a {Nokogiri::HTML5::DocumentFragment}.
213
+ #
214
+ # * The {Nokogiri::HTML5.parse} function takes a string and passes it to the
215
+ # <code>gumbo_parse_with_options</code> method, using the default options.
216
+ # The resulting Gumbo parse tree is then walked.
217
+ #
218
+ # * Instead of uppercase element names, lowercase element names are produced.
219
+ #
220
+ # * Instead of returning +unknown+ as the element name for unknown tags, the
221
+ # original tag name is returned verbatim.
222
+ #
223
+ # @since v1.12.0
224
+ # @note HTML5 functionality is not available when running JRuby.
225
+ module HTML5
226
# Well-known namespace URIs. Note that HTML elements use the XHTML namespace.
HTML_NAMESPACE   = 'http://www.w3.org/1999/xhtml'.freeze
MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'.freeze
SVG_NAMESPACE    = 'http://www.w3.org/2000/svg'.freeze
XLINK_NAMESPACE  = 'http://www.w3.org/1999/xlink'.freeze
XML_NAMESPACE    = 'http://www.w3.org/XML/1998/namespace'.freeze
XMLNS_NAMESPACE  = 'http://www.w3.org/2000/xmlns/'.freeze
233
+
234
# Parse an HTML5 document from +string+.
#
# Convenience method: all arguments, keyword options and the optional block
# are passed straight through to {Nokogiri::HTML5::Document.parse}.
def self.parse(string, url = nil, encoding = nil, **options, &block)
  Document.parse(string, url, encoding, **options, &block)
end
238
+
239
# Parse an HTML5 fragment from +string+.
#
# Convenience method for {Nokogiri::HTML5::DocumentFragment.parse}. The keyword
# options are collected into a Hash and handed over as its third positional
# argument.
def self.fragment(string, encoding = nil, **options)
  DocumentFragment.parse(string, encoding, options)
end
244
+
245
# Fetch and parse a HTML document from the web, following redirects,
# handling https, and determining the character encoding using HTML5
# rules. +uri+ may be a +String+ or a +URI+. +options+ contains
# http headers and special options. Everything which is not a
# special option is considered a header. Special options include:
#  * :follow_limit => number of redirects which are followed
#  * :basic_auth => [username, password]
#
# Deprecated: emits a deprecation warning on every call, then delegates to
# +get_impl+.
def self.get(uri, options = {})
  message = "Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri."
  # Kernel#warn only accepts the +category:+ keyword from Ruby 3.0 on; on older
  # Rubies passing it raises ArgumentError, so fall back to a plain warning.
  if RUBY_VERSION >= "3.0"
    warn(message, uplevel: 1, category: :deprecated)
  else
    warn(message, uplevel: 1)
  end
  get_impl(uri, options)
end
257
+
258
+ private
259
+
260
# Implementation behind {HTML5.get}: perform an HTTP(S) GET for +uri+ and parse
# a successful response body as an HTML5 document.
#
# +options+ is a Hash (a bare number is still accepted as a deprecated
# shorthand for +:follow_limit+) whose entries are consumed as follows:
#  * +:follow_limit+ - redirects left to follow (default 10)
#  * +:max_errors+, +:max_tree_depth+, +:max_attributes+ - passed to the parser
#  * any key for which Net::HTTP has a "key=" writer - applied to the connection
#  * +:basic_auth+ - [username, password]; defaults to credentials in the URI
#  * everything else - sent as a request header
#
# Returns the parsed document, with the Net::HTTP response exposed through a
# +response+ reader. Redirects are followed recursively; once the limit is
# used up, or for any other non-success response, +response.value+ is called
# so that Net::HTTP raises its error for that response.
def self.get_impl(uri, options = {})
  # Normalize the deprecated numeric form first so that every later use of
  # +options+ (iteration, merging on redirect) sees a Hash.
  options = { :follow_limit => options } if Numeric === options # deprecated
  headers = options.dup
  limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10

  # Parser options belong to the HTML5 parser; pull them out so they are not
  # sent to the server as HTTP headers.
  parser_options = {}
  [:max_errors, :max_tree_depth, :max_attributes].each do |key|
    parser_options[key] = headers.delete(key) if headers.key?(key)
  end

  require 'net/http'
  uri = URI(uri) unless URI === uri

  http = Net::HTTP.new(uri.host, uri.port)

  # TLS / SSL support
  http.use_ssl = true if uri.scheme == 'https'

  # Pass through Net::HTTP override values, which currently include:
  #   :ca_file, :ca_path, :cert, :cert_store, :ciphers,
  #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
  #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
  #   :verify_callback, :verify_depth, :verify_mode
  options.each_key do |key|
    http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
  end

  request = Net::HTTP::Get.new(uri.request_uri)

  # basic authentication
  auth = headers.delete(:basic_auth)
  auth ||= [uri.user, uri.password] if uri.user && uri.password
  request.basic_auth(auth.first, auth.last) if auth

  # remaining options are treated as headers
  headers.each { |key, value| request[key.to_s] = value.to_s }

  response = http.request(request)

  case response
  when Net::HTTPSuccess
    # Only the parser options are forwarded, and as keywords: handing over the
    # whole options hash positionally would bind it to the +url+ parameter.
    doc = parse(reencode(response.body, response['content-type']), **parser_options)
    doc.instance_variable_set('@response', response)
    doc.class.send(:attr_reader, :response)
    doc
  when Net::HTTPRedirection
    response.value if limit <= 1
    location = URI.join(uri, response['location'])
    get_impl(location, options.merge(:follow_limit => limit - 1))
  else
    response.value
  end
end
308
+
309
# Turn +string+ (a String-like object or anything responding to +read+) into a
# UTF-8 encoded String.
#
# string   - the input; objects responding to +read+ are read in full, anything
#            else is converted with +to_s+.
# encoding - optional encoding (name or Encoding) that the input bytes are in;
#            when given, the data is labelled with it before conversion.
#
# Returns the input as UTF-8. Data that is not already UTF-8 is converted by
# +reencode+.
def self.read_and_encode(string, encoding)
  # IO#read and StringIO#read take (length, outbuf) and do not understand an
  # +encoding:+ keyword, so read the raw data first and label it afterwards.
  string = string.respond_to?(:read) ? string.read.to_s : string.to_s

  # dup so that a caller-owned (or frozen) string is never relabelled in place
  string = string.dup.force_encoding(encoding) if encoding

  # convert to UTF-8
  string = reencode(string) if string.encoding != Encoding::UTF_8
  string
end
332
+
333
# Charset sniffing is a complex and controversial topic that understandably isn't done _by
# default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
# consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
# the Gumbo parser *only* supports utf-8.
#
# Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
# this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
# the HTML5 standard.
#
# http://bugs.ruby-lang.org/issues/2567
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
#
# body         - the document as a String. Only binary (ASCII-8BIT) strings are
#                sniffed; any other string is trusted to carry its real encoding.
# content_type - optional Content-Type header value, consulted for a charset.
#
# For binary input the encoding is taken from the first of: a byte order mark,
# a charset Ruby recognises in +content_type+, a charset Ruby recognises in a
# <meta> tag within the first 1024 bytes, and finally ISO-8859-1. A charset
# label Ruby does not know is skipped so the next source still gets a chance.
#
# Returns a UTF-8 encoded copy of +body+.
def self.reencode(body, content_type = nil)
  if body.encoding == Encoding::ASCII_8BIT
    # Map a charset label to an Encoding; nil when absent or unknown to Ruby.
    resolve = lambda do |label|
      begin
        label && Encoding.find(label)
      rescue ArgumentError
        nil
      end
    end

    encoding = nil

    # look for a Byte Order Mark (BOM)
    initial_bytes = body[0..2].bytes
    if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
      encoding = Encoding::UTF_8
    elsif initial_bytes[0..1] == [0xFE, 0xFF]
      encoding = Encoding::UTF_16BE
    elsif initial_bytes[0..1] == [0xFF, 0xFE]
      encoding = Encoding::UTF_16LE
    end

    # look for a charset in the Content-Type header
    if encoding.nil? && content_type
      encoding = resolve.call(content_type[/charset=["']?(.*?)($|["';\s])/i, 1])
    end

    # look for a charset in a meta tag in the first 1024 bytes
    if encoding.nil?
      data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
      data.scan(/<meta.*?>/m).each do |meta|
        encoding ||= resolve.call(meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1])
      end
    end

    # if all else fails, default to the official default encoding for HTML
    encoding ||= Encoding::ISO_8859_1

    # label a copy with the detected or inferred encoding; the caller's string
    # is left untouched
    body = body.dup.force_encoding(encoding)
  end

  body.encode(Encoding::UTF_8)
end
386
+
387
# Serialize +current_node+ and, recursively, its descendants as HTML, appending
# the output to +io+.
#
# current_node - the node to serialize; dispatched on its +type+.
# io           - the sink; only +<<+ is called on it.
# encoding     - target encoding, passed on to +escape_text+.
# options      - Hash; only +:preserve_newline+ is read here.
#
# Raises RuntimeError for a node type that is not handled.
def self.serialize_node_internal(current_node, io, encoding, options)
  case current_node.type
  when XML::Node::ELEMENT_NODE
    ns = current_node.namespace
    ns_uri = ns.nil? ? nil : ns.href
    # XXX(sfc): attach namespaces to all nodes, even html?
    # Elements without a namespace, or in the HTML, MathML or SVG namespace,
    # are written with their bare name; anything else keeps its prefix.
    if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
      tagname = current_node.name
    else
      tagname = "#{ns.prefix}:#{current_node.name}"
    end
    io << '<' << tagname
    current_node.attribute_nodes.each do |attr|
      attr_ns = attr.namespace
      if attr_ns.nil?
        attr_name = attr.name
      else
        # Attributes in the xml, xmlns and xlink namespaces get a fixed
        # prefix; any existing prefix in the attribute name is dropped first.
        ns_uri = attr_ns.href
        if ns_uri == XML_NAMESPACE
          attr_name = 'xml:' + attr.name.sub(/^[^:]*:/, '')
        elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, '') == 'xmlns'
          attr_name = 'xmlns'
        elsif ns_uri == XMLNS_NAMESPACE
          attr_name = 'xmlns:' + attr.name.sub(/^[^:]*:/, '')
        elsif ns_uri == XLINK_NAMESPACE
          attr_name = 'xlink:' + attr.name.sub(/^[^:]*:/, '')
        else
          attr_name = "#{attr_ns.prefix}:#{attr.name}"
        end
      end
      io << ' ' << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
    end
    io << '>'
    # The elements listed here get neither children nor an end tag.
    if !%w[area base basefont bgsound br col embed frame hr img input keygen
           link meta param source track wbr].include?(current_node.name)
      io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
      current_node.children.each do |child|
        # XXX(sfc): Templates handled specially?
        serialize_node_internal(child, io, encoding, options)
      end
      io << '</' << tagname << '>'
    end
  when XML::Node::TEXT_NODE
    # A detached text node has no parent; treat it as ordinary text, which is
    # escaped. Text inside the elements listed below is written verbatim.
    parent = current_node.parent
    if parent&.element? && %w[style script xmp iframe noembed noframes plaintext noscript].include?(parent.name)
      io << current_node.content
    else
      io << escape_text(current_node.content, encoding, false)
    end
  when XML::Node::CDATA_SECTION_NODE
    io << '<![CDATA[' << current_node.content << ']]>'
  when XML::Node::COMMENT_NODE
    io << '<!--' << current_node.content << '-->'
  when XML::Node::PI_NODE
    io << '<?' << current_node.content << '>'
  when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
    io << '<!DOCTYPE ' << current_node.name << '>'
  when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
    current_node.children.each do |child|
      serialize_node_internal(child, io, encoding, options)
    end
  else
    raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
  end
end
452
+
453
# Escape +text+ for inclusion in serialized HTML and transcode it to +encoding+.
#
# text           - the String to escape.
# encoding       - target encoding for the returned String.
# attribute_mode - true when escaping an attribute value (escapes &, U+00A0
#                  and "); false for text content (escapes &, U+00A0, < and >).
#
# Returns the escaped String in +encoding+.
def self.escape_text(text, encoding, attribute_mode)
  escaped =
    if attribute_mode
      text.gsub(/[&\u00a0"]/,
                '&' => '&amp;', "\u00a0" => '&nbsp;', '"' => '&quot;')
    else
      text.gsub(/[&\u00a0<>]/,
                '&' => '&amp;', "\u00a0" => '&nbsp;', '<' => '&lt;', '>' => '&gt;')
    end

  # Not part of the standard: characters the target encoding cannot represent
  # are emitted as hexadecimal numeric character references.
  numeric_reference = ->(char) { "&\#x#{char.ord.to_s(16)};" }
  escaped.encode(encoding, fallback: numeric_reference)
end
464
+
465
# Decide whether an extra newline must be written after the start tag of
# +node+ when serializing.
#
# That is the case only for +pre+, +textarea+ and +listing+ elements whose
# first child is a text node starting with "\n".
def self.prepend_newline?(node)
  newline_sensitive = %w[pre textarea listing].include?(node.name)
  return false if !newline_sensitive || node.children.empty?

  leading = node.children[0]
  leading.text? && leading.content.start_with?("\n")
end
470
+ end
471
+ end
472
+
473
+ require_relative 'gumbo'