nokogiri 1.15.3 → 1.18.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +12 -17
  3. data/LICENSE-DEPENDENCIES.md +6 -6
  4. data/README.md +11 -5
  5. data/dependencies.yml +9 -8
  6. data/ext/nokogiri/extconf.rb +191 -154
  7. data/ext/nokogiri/gumbo.c +69 -53
  8. data/ext/nokogiri/html4_document.c +10 -4
  9. data/ext/nokogiri/html4_element_description.c +18 -18
  10. data/ext/nokogiri/html4_sax_parser.c +40 -0
  11. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  12. data/ext/nokogiri/html4_sax_push_parser.c +26 -25
  13. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  14. data/ext/nokogiri/nokogiri.c +9 -2
  15. data/ext/nokogiri/nokogiri.h +25 -33
  16. data/ext/nokogiri/test_global_handlers.c +1 -1
  17. data/ext/nokogiri/xml_attr.c +1 -1
  18. data/ext/nokogiri/xml_cdata.c +3 -12
  19. data/ext/nokogiri/xml_comment.c +3 -8
  20. data/ext/nokogiri/xml_document.c +173 -158
  21. data/ext/nokogiri/xml_document_fragment.c +10 -25
  22. data/ext/nokogiri/xml_dtd.c +1 -1
  23. data/ext/nokogiri/xml_element_content.c +9 -9
  24. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  25. data/ext/nokogiri/xml_namespace.c +6 -10
  26. data/ext/nokogiri/xml_node.c +142 -108
  27. data/ext/nokogiri/xml_node_set.c +46 -44
  28. data/ext/nokogiri/xml_reader.c +74 -100
  29. data/ext/nokogiri/xml_relax_ng.c +35 -56
  30. data/ext/nokogiri/xml_sax_parser.c +156 -88
  31. data/ext/nokogiri/xml_sax_parser_context.c +220 -128
  32. data/ext/nokogiri/xml_sax_push_parser.c +69 -50
  33. data/ext/nokogiri/xml_schema.c +51 -87
  34. data/ext/nokogiri/xml_syntax_error.c +19 -11
  35. data/ext/nokogiri/xml_text.c +3 -6
  36. data/ext/nokogiri/xml_xpath_context.c +104 -104
  37. data/ext/nokogiri/xslt_stylesheet.c +16 -11
  38. data/gumbo-parser/Makefile +18 -0
  39. data/gumbo-parser/src/ascii.c +2 -2
  40. data/gumbo-parser/src/error.c +76 -48
  41. data/gumbo-parser/src/error.h +5 -1
  42. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  43. data/gumbo-parser/src/parser.c +66 -25
  44. data/gumbo-parser/src/tokenizer.c +7 -6
  45. data/lib/nokogiri/class_resolver.rb +1 -1
  46. data/lib/nokogiri/css/node.rb +6 -2
  47. data/lib/nokogiri/css/parser.rb +6 -4
  48. data/lib/nokogiri/css/parser.y +2 -2
  49. data/lib/nokogiri/css/parser_extras.rb +6 -66
  50. data/lib/nokogiri/css/selector_cache.rb +38 -0
  51. data/lib/nokogiri/css/tokenizer.rb +4 -4
  52. data/lib/nokogiri/css/tokenizer.rex +9 -8
  53. data/lib/nokogiri/css/xpath_visitor.rb +44 -27
  54. data/lib/nokogiri/css.rb +86 -20
  55. data/lib/nokogiri/decorators/slop.rb +3 -5
  56. data/lib/nokogiri/encoding_handler.rb +2 -2
  57. data/lib/nokogiri/html4/document.rb +45 -24
  58. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  59. data/lib/nokogiri/html4/encoding_reader.rb +2 -2
  60. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  61. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  62. data/lib/nokogiri/html4.rb +9 -14
  63. data/lib/nokogiri/html5/builder.rb +40 -0
  64. data/lib/nokogiri/html5/document.rb +61 -30
  65. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  66. data/lib/nokogiri/html5/node.rb +4 -4
  67. data/lib/nokogiri/html5.rb +114 -138
  68. data/lib/nokogiri/version/constant.rb +1 -1
  69. data/lib/nokogiri/version/info.rb +6 -5
  70. data/lib/nokogiri/xml/attr.rb +2 -2
  71. data/lib/nokogiri/xml/builder.rb +8 -1
  72. data/lib/nokogiri/xml/document.rb +74 -31
  73. data/lib/nokogiri/xml/document_fragment.rb +86 -15
  74. data/lib/nokogiri/xml/namespace.rb +1 -2
  75. data/lib/nokogiri/xml/node.rb +113 -35
  76. data/lib/nokogiri/xml/node_set.rb +12 -10
  77. data/lib/nokogiri/xml/parse_options.rb +1 -1
  78. data/lib/nokogiri/xml/pp/node.rb +6 -1
  79. data/lib/nokogiri/xml/reader.rb +51 -17
  80. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  81. data/lib/nokogiri/xml/sax/document.rb +174 -83
  82. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  83. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  84. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  85. data/lib/nokogiri/xml/sax.rb +48 -0
  86. data/lib/nokogiri/xml/schema.rb +112 -45
  87. data/lib/nokogiri/xml/searchable.rb +39 -43
  88. data/lib/nokogiri/xml/syntax_error.rb +23 -1
  89. data/lib/nokogiri/xml/xpath_context.rb +14 -3
  90. data/lib/nokogiri/xml.rb +14 -25
  91. data/lib/nokogiri/xslt/stylesheet.rb +29 -7
  92. data/lib/nokogiri/xslt.rb +4 -10
  93. data/lib/nokogiri.rb +1 -1
  94. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  95. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  96. data/ports/archives/libxml2-2.13.7.tar.xz +0 -0
  97. data/ports/archives/libxslt-1.1.43.tar.xz +0 -0
  98. metadata +13 -14
  99. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  100. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  101. data/ports/archives/libxml2-2.11.4.tar.xz +0 -0
  102. data/ports/archives/libxslt-1.1.38.tar.xz +0 -0
@@ -3,32 +3,103 @@
3
3
 
4
4
  module Nokogiri
5
5
  module XML
6
+ # DocumentFragment represents a fragment of an \XML document. It provides the same functionality
7
+ # exposed by XML::Node and can be used to contain one or more \XML subtrees.
6
8
  class DocumentFragment < Nokogiri::XML::Node
7
- ####
8
- # Create a Nokogiri::XML::DocumentFragment from +tags+
9
- def self.parse(tags, options = ParseOptions::DEFAULT_XML, &block)
10
- new(XML::Document.new, tags, nil, options, &block)
9
+ # The options used to parse the document fragment. Returns the value of any options that were
10
+ # passed into the constructor as a parameter or set in a config block, else the default
11
+ # options for the specific subclass.
12
+ attr_reader :parse_options
13
+
14
+ class << self
15
+ # :call-seq:
16
+ # parse(input) { |options| ... } → XML::DocumentFragment
17
+ # parse(input, options:) → XML::DocumentFragment
18
+ #
19
+ # Parse \XML fragment input from a String, and return a new XML::DocumentFragment. This
20
+ # method creates a new, empty XML::Document to contain the fragment.
21
+ #
22
+ # [Required Parameters]
23
+ # - +input+ (String) The content to be parsed.
24
+ #
25
+ # [Optional Keyword Arguments]
26
+ # - +options+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
27
+ # behaviors during parsing. See ParseOptions for more information. The default value is
28
+ # +ParseOptions::DEFAULT_XML+.
29
+ #
30
+ # [Yields]
31
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
32
+ # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
33
+ #
34
+ # [Returns] Nokogiri::XML::DocumentFragment
35
+ def parse(tags, options_ = ParseOptions::DEFAULT_XML, options: options_, &block)
36
+ new(XML::Document.new, tags, options: options, &block)
37
+ end
38
+
39
+ # Wrapper method to separate the concerns of:
40
+ # - the native object allocator's parameter (it only requires `document`)
41
+ # - the initializer's parameters
42
+ def new(document, ...) # :nodoc:
43
+ instance = native_new(document)
44
+ instance.send(:initialize, document, ...)
45
+ instance
46
+ end
11
47
  end
12
48
 
13
- ##
14
- # Create a new DocumentFragment from +tags+.
49
+ # :call-seq:
50
+ # new(document, input=nil) { |options| ... } → DocumentFragment
51
+ # new(document, input=nil, context:, options:) → DocumentFragment
52
+ #
53
+ # Parse \XML fragment input from a String, and return a new DocumentFragment that is
54
+ # associated with the given +document+.
55
+ #
56
+ # 💡 It's recommended to use either XML::DocumentFragment.parse or Node#parse rather than call
57
+ # this method directly.
58
+ #
59
+ # [Required Parameters]
60
+ # - +document+ (XML::Document) The parent document to associate the returned fragment with.
61
+ #
62
+ # [Optional Parameters]
63
+ # - +input+ (String) The content to be parsed.
15
64
  #
16
- # If +ctx+ is present, it is used as a context node for the
17
- # subtree created, e.g., namespaces will be resolved relative
18
- # to +ctx+.
19
- def initialize(document, tags = nil, ctx = nil, options = ParseOptions::DEFAULT_XML) # rubocop:disable Lint/MissingSuper
65
+ # [Optional Keyword Arguments]
66
+ # - +context:+ (Nokogiri::XML::Node) The <b>context node</b> for the subtree created. See
67
+ # below for more information.
68
+ #
69
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
70
+ # behaviors during parsing. See ParseOptions for more information. The default value is
71
+ # +ParseOptions::DEFAULT_XML+.
72
+ #
73
+ # [Yields]
74
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
75
+ # can be configured before parsing. See ParseOptions for more information.
76
+ #
77
+ # [Returns] XML::DocumentFragment
78
+ #
79
+ # === Context \Node
80
+ #
81
+ # If a context node is specified using +context:+, then the fragment will be created by
82
+ # calling Node#parse on that node, so the parser will behave as if that Node is the parent of
83
+ # the fragment subtree, and will resolve namespaces relative to that node.
84
+ #
85
+ def initialize(
86
+ document, tags = nil,
87
+ context_ = nil, options_ = ParseOptions::DEFAULT_XML,
88
+ context: context_, options: options_
89
+ ) # rubocop:disable Lint/MissingSuper
20
90
  return self unless tags
21
91
 
22
92
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
93
+ @parse_options = options
23
94
  yield options if block_given?
24
95
 
25
- children = if ctx
96
+ children = if context
26
97
  # Fix for issue#490
27
98
  if Nokogiri.jruby?
28
99
  # fix for issue #770
29
- ctx.parse("<root #{namespace_declarations(ctx)}>#{tags}</root>", options).children
100
+ context.parse("<root #{namespace_declarations(context)}>#{tags}</root>", options).children
30
101
  else
31
- ctx.parse(tags, options)
102
+ context.parse(tags, options)
32
103
  end
33
104
  else
34
105
  wrapper_doc = XML::Document.parse("<root>#{tags}</root>", nil, nil, options)
@@ -154,8 +225,6 @@ module Nokogiri
154
225
  # root elements, you should deconstruct the array returned by
155
226
  # <tt>DocumentFragment#elements</tt>.
156
227
  #
157
- # ⚡ This is an experimental feature, available since v1.14.0
158
- #
159
228
  # *Example*
160
229
  #
161
230
  # frag = Nokogiri::HTML5.fragment(<<~HTML)
@@ -187,6 +256,8 @@ module Nokogiri
187
256
  # # }),
188
257
  # # #(Element:0x398 { name = "div", children = [ #(Text "End")] })]
189
258
  #
259
+ # Since v1.14.0
260
+ #
190
261
  def deconstruct
191
262
  children.to_a
192
263
  end
@@ -16,8 +16,6 @@ module Nokogiri
16
16
  # - +prefix+ → (String, nil) The namespace's prefix, or +nil+ if there is no prefix (e.g., default namespace).
17
17
  # - +href+ → (String) The namespace's URI
18
18
  #
19
- # ⚡ This is an experimental feature, available since v1.14.0
20
- #
21
19
  # *Example*
22
20
  #
23
21
  # doc = Nokogiri::XML.parse(<<~XML)
@@ -43,6 +41,7 @@ module Nokogiri
43
41
  # doc.root.elements.last.namespace.deconstruct_keys([:prefix, :href])
44
42
  # # => {:prefix=>"noko", :href=>"http://nokogiri.org/ns/noko"}
45
43
  #
44
+ # Since v1.14.0
46
45
  #
47
46
  def deconstruct_keys(keys)
48
47
  { prefix: prefix, href: href }
@@ -127,6 +127,42 @@ module Nokogiri
127
127
  # This is intentionally empty, and sets the method signature for subclasses.
128
128
  end
129
129
 
130
+ #
131
+ # :call-seq:
132
+ # dup → Nokogiri::XML::Node
133
+ # dup(level) → Nokogiri::XML::Node
134
+ # dup(level, new_parent_doc) → Nokogiri::XML::Node
135
+ #
136
+ # Duplicate this node.
137
+ #
138
+ # [Parameters]
139
+ # - +level+ (optional Integer). 0 is a shallow copy, 1 (the default) is a deep copy.
140
+ # - +new_parent_doc+ (optional Nokogiri::XML::Document)
141
+ # The new node's parent Document. Defaults to the Document of the current node.
142
+ # [Returns] The new Nokogiri::XML::Node
143
+ #
144
+ def dup(level = 1, new_parent_doc = document)
145
+ super().initialize_copy_with_args(self, level, new_parent_doc)
146
+ end
147
+
148
+ #
149
+ # :call-seq:
150
+ # clone → Nokogiri::XML::Node
151
+ # clone(level) → Nokogiri::XML::Node
152
+ # clone(level, new_parent_doc) → Nokogiri::XML::Node
153
+ #
154
+ # Clone this node.
155
+ #
156
+ # [Parameters]
157
+ # - +level+ (optional Integer). 0 is a shallow copy, 1 (the default) is a deep copy.
158
+ # - +new_parent_doc+ (optional Nokogiri::XML::Document)
159
+ # The new node's parent Document. Defaults to the Document of the current node.
160
+ # [Returns] The new Nokogiri::XML::Node
161
+ #
162
+ def clone(level = 1, new_parent_doc = document)
163
+ super().initialize_copy_with_args(self, level, new_parent_doc)
164
+ end
165
+
130
166
  ###
131
167
  # Decorate this node with the decorators set up in this node's Document
132
168
  def decorate!
@@ -228,7 +264,7 @@ module Nokogiri
228
264
  if new_parent.nil?
229
265
  raise "Failed to parse '#{node_or_tags}' in the context of a '#{context_node.name}' element"
230
266
  end
231
- when XML::Node
267
+ when Node
232
268
  new_parent = node_or_tags.dup
233
269
  else
234
270
  raise ArgumentError, "Requires a String or Node argument, and cannot accept a #{node_or_tags.class}"
@@ -406,8 +442,48 @@ module Nokogiri
406
442
  end
407
443
 
408
444
  ####
409
- # Set the Node's content to a Text node containing +string+. The string gets XML escaped, not
410
- # interpreted as markup.
445
+ # call-seq:
446
+ # content=(input)
447
+ #
448
+ # Set the content of this node to +input+.
449
+ #
450
+ # [Parameters]
451
+ # - +input+ (String) The new content for this node. Input is considered to be raw content, and
452
+ # so will be entity-escaped in the final DOM string.
453
+ #
454
+ # [Example]
455
+ # Note how entities are handled:
456
+ #
457
+ # doc = Nokogiri::HTML::Document.parse(<<~HTML)
458
+ # <html>
459
+ # <body>
460
+ # <div id="first">asdf</div>
461
+ # <div id="second">asdf</div>
462
+ # HTML
463
+ #
464
+ # text_node = doc.at_css("div#first").children.first
465
+ # div_node = doc.at_css("div#second")
466
+ #
467
+ # value = "You &amp; Me"
468
+ #
469
+ # text_node.content = value
470
+ # div_node.content = value
471
+ #
472
+ # doc.css("div").to_html
473
+ # # => "<div id=\"first\">You &amp;amp; Me</div>
474
+ # # <div id=\"second\">You &amp;amp; Me</div>"
475
+ #
476
+ # For content that is already entity-escaped, use CGI::unescapeHTML to decode it:
477
+ #
478
+ # text_node.content = CGI::unescapeHTML(value)
479
+ # div_node.content = CGI::unescapeHTML(value)
480
+ #
481
+ # doc.css("div").to_html
482
+ # # => "<div id=\"first\">You &amp; Me</div>
483
+ # # <div id=\"second\">You &amp; Me</div>"
484
+ #
485
+ # See also: #native_content=
486
+ #
411
487
  def content=(string)
412
488
  self.native_content = encode_special_chars(string.to_s)
413
489
  end
@@ -474,7 +550,6 @@ module Nokogiri
474
550
  alias_method :to_str, :content
475
551
  alias_method :name, :node_name
476
552
  alias_method :type, :node_type
477
- alias_method :clone, :dup
478
553
  alias_method :elements, :element_children
479
554
 
480
555
  # :section: Working With Node Attributes
@@ -1049,31 +1124,40 @@ module Nokogiri
1049
1124
 
1050
1125
  return Nokogiri::XML::NodeSet.new(document) if contents.empty?
1051
1126
 
1052
- # libxml2 does not obey the +recover+ option after encountering errors during +in_context+
1053
- # parsing, and so this horrible hack is here to try to emulate recovery behavior.
1054
- #
1055
- # Unfortunately, this means we're no longer parsing "in context" and so namespaces that
1056
- # would have been inherited from the context node won't be handled correctly. This hack was
1057
- # written in 2010, and I regret it, because it's silently degrading functionality in a way
1058
- # that's not easily prevented (or even detected).
1059
- #
1060
- # I think preferable behavior would be to either:
1061
- #
1062
- # a. add an error noting that we "fell back" and pointing the user to turning off the +recover+ option
1063
- # b. don't recover, but raise a sensible exception
1064
- #
1065
- # For context and background: https://github.com/sparklemotion/nokogiri/issues/313
1066
- # FIXME bug report: https://github.com/sparklemotion/nokogiri/issues/2092
1067
1127
  error_count = document.errors.length
1068
1128
  node_set = in_context(contents, options.to_i)
1069
- if node_set.empty? && (document.errors.length > error_count)
1070
- if options.recover?
1129
+
1130
+ if document.errors.length > error_count
1131
+ raise document.errors[error_count] unless options.recover?
1132
+
1133
+ # TODO: remove this block when libxml2 < 2.13 is no longer supported
1134
+ if node_set.empty?
1135
+ # libxml2 < 2.13 does not obey the +recover+ option after encountering errors during
1136
+ # +in_context+ parsing, and so this horrible hack is here to try to emulate recovery
1137
+ # behavior.
1138
+ #
1139
+ # (Note that HTML4 fragment parsing seems to have been fixed in abd74186, and XML
1140
+ # fragment parsing is fixed in 1c106edf. Both are in 2.13.)
1141
+ #
1142
+ # Unfortunately, this means we're no longer parsing "in context" and so namespaces that
1143
+ # would have been inherited from the context node won't be handled correctly. This hack
1144
+ # was written in 2010, and I regret it, because it's silently degrading functionality in
1145
+ # a way that's not easily prevented (or even detected).
1146
+ #
1147
+ # I think preferable behavior would be to either:
1148
+ #
1149
+ # a. add an error noting that we "fell back" and pointing the user to turning off the
1150
+ # +recover+ option
1151
+ # b. don't recover, but raise a sensible exception
1152
+ #
1153
+ # For context and background:
1154
+ # - https://github.com/sparklemotion/nokogiri/issues/313
1155
+ # - https://github.com/sparklemotion/nokogiri/issues/2092
1071
1156
  fragment = document.related_class("DocumentFragment").parse(contents)
1072
1157
  node_set = fragment.children
1073
- else
1074
- raise document.errors[error_count]
1075
1158
  end
1076
1159
  end
1160
+
1077
1161
  node_set
1078
1162
  end
1079
1163
 
@@ -1165,7 +1249,7 @@ module Nokogiri
1165
1249
  # Fetch the Nokogiri::HTML4::ElementDescription for this node. Returns
1166
1250
  # nil on XML documents and on unknown tags.
1167
1251
  def description
1168
- return nil if document.xml?
1252
+ return if document.xml?
1169
1253
 
1170
1254
  Nokogiri::HTML4::ElementDescription[name]
1171
1255
  end
@@ -1254,8 +1338,8 @@ module Nokogiri
1254
1338
  # Compare two Node objects with respect to their Document. Nodes from
1255
1339
  # different documents cannot be compared.
1256
1340
  def <=>(other)
1257
- return nil unless other.is_a?(Nokogiri::XML::Node)
1258
- return nil unless document == other.document
1341
+ return unless other.is_a?(Nokogiri::XML::Node)
1342
+ return unless document == other.document
1259
1343
 
1260
1344
  compare(other)
1261
1345
  end
@@ -1278,6 +1362,7 @@ module Nokogiri
1278
1362
  # end
1279
1363
  #
1280
1364
  def serialize(*args, &block)
1365
+ # TODO: deprecate non-hash options, see 46c68ed 2009-06-20 for context
1281
1366
  options = if args.first.is_a?(Hash)
1282
1367
  args.shift
1283
1368
  else
@@ -1429,8 +1514,6 @@ module Nokogiri
1429
1514
  # - +content+ → (String) The contents of all the text nodes in this node's subtree. See #content.
1430
1515
  # - +inner_html+ → (String) The inner markup for the children of this node. See #inner_html.
1431
1516
  #
1432
- # ⚡ This is an experimental feature, available since v1.14.0
1433
- #
1434
1517
  # *Example*
1435
1518
  #
1436
1519
  # doc = Nokogiri::XML.parse(<<~XML)
@@ -1465,6 +1548,8 @@ module Nokogiri
1465
1548
  # # value = "def"
1466
1549
  # # })]}
1467
1550
  #
1551
+ # Since v1.14.0
1552
+ #
1468
1553
  def deconstruct_keys(keys)
1469
1554
  requested_keys = DECONSTRUCT_KEYS & keys
1470
1555
  {}.tap do |values|
@@ -1535,19 +1620,12 @@ module Nokogiri
1535
1620
  node_or_tags
1536
1621
  end
1537
1622
 
1538
- USING_LIBXML_WITH_BROKEN_SERIALIZATION = Nokogiri.uses_libxml?("~> 2.6.0").freeze
1539
- private_constant :USING_LIBXML_WITH_BROKEN_SERIALIZATION
1540
-
1541
1623
  def to_format(save_option, options)
1542
- return dump_html if USING_LIBXML_WITH_BROKEN_SERIALIZATION
1543
-
1544
1624
  options[:save_with] = save_option unless options[:save_with]
1545
1625
  serialize(options)
1546
1626
  end
1547
1627
 
1548
1628
  def write_format_to(save_option, io, options)
1549
- return (io << dump_html) if USING_LIBXML_WITH_BROKEN_SERIALIZATION
1550
-
1551
1629
  options[:save_with] ||= save_option
1552
1630
  write_to(io, options)
1553
1631
  end
@@ -4,9 +4,13 @@
4
4
  module Nokogiri
5
5
  module XML
6
6
  ####
7
- # A NodeSet contains a list of Nokogiri::XML::Node objects. Typically
8
- # a NodeSet is return as a result of searching a Document via
9
- # Nokogiri::XML::Searchable#css or Nokogiri::XML::Searchable#xpath
7
+ # A NodeSet is an Enumerable that contains a list of Nokogiri::XML::Node objects.
8
+ #
9
+ # Typically a NodeSet is returned as a result of searching a Document via
10
+ # Nokogiri::XML::Searchable#css or Nokogiri::XML::Searchable#xpath.
11
+ #
12
+ # Note that the `#dup` and `#clone` methods perform shallow copies; these methods do not copy
13
+ # the Nodes contained in the NodeSet (similar to how Array and other Enumerable classes work).
10
14
  class NodeSet
11
15
  include Nokogiri::XML::Searchable
12
16
  include Enumerable
@@ -14,8 +18,6 @@ module Nokogiri
14
18
  # The Document this NodeSet is associated with
15
19
  attr_accessor :document
16
20
 
17
- alias_method :clone, :dup
18
-
19
21
  # Create a NodeSet with +document+ defaulting to +list+
20
22
  def initialize(document, list = [])
21
23
  @document = document
@@ -121,7 +123,7 @@ module Nokogiri
121
123
  return self[args.first]
122
124
  end
123
125
 
124
- super(*args)
126
+ super
125
127
  end
126
128
  alias_method :%, :at
127
129
 
@@ -372,7 +374,7 @@ module Nokogiri
372
374
  # Removes the last element from set and returns it, or +nil+ if
373
375
  # the set is empty
374
376
  def pop
375
- return nil if length == 0
377
+ return if length == 0
376
378
 
377
379
  delete(last)
378
380
  end
@@ -381,7 +383,7 @@ module Nokogiri
381
383
  # Returns the first element of the NodeSet and removes it. Returns
382
384
  # +nil+ if the set is empty.
383
385
  def shift
384
- return nil if length == 0
386
+ return if length == 0
385
387
 
386
388
  delete(first)
387
389
  end
@@ -423,7 +425,7 @@ module Nokogiri
423
425
  end
424
426
 
425
427
  ###
426
- # Return a nicely formated string representation
428
+ # Return a nicely formatted string representation
427
429
  def inspect
428
430
  "[#{map(&:inspect).join(", ")}]"
429
431
  end
@@ -435,7 +437,7 @@ module Nokogiri
435
437
  #
436
438
  # Returns the members of this NodeSet as an array, to use in pattern matching.
437
439
  #
438
- # This is an experimental feature, available since v1.14.0
440
+ # Since v1.14.0
439
441
  #
440
442
  def deconstruct
441
443
  to_a
@@ -140,7 +140,7 @@ module Nokogiri
140
140
 
141
141
  # Relax any hardcoded limit from the parser. Off by default.
142
142
  #
143
- # ⚠ There may be a performance penalty when this option is set.
143
+ # ⚠ <b>It is UNSAFE to set this option</b> when parsing untrusted documents.
144
144
  HUGE = 1 << 19
145
145
 
146
146
  # Support line numbers up to <code>long int</code> (default is a <code>short int</code>). On
@@ -8,6 +8,11 @@ module Nokogiri
8
8
  COLLECTIONS = [:attribute_nodes, :children]
9
9
 
10
10
  def inspect
11
+ # handle the case where an exception is thrown during object construction
12
+ if respond_to?(:data_ptr?) && !data_ptr?
13
+ return "#<#{self.class}:#{format("0x%x", object_id)} (no data)>"
14
+ end
15
+
11
16
  attributes = inspect_attributes.reject do |x|
12
17
  attribute = send(x)
13
18
  !attribute || (attribute.respond_to?(:empty?) && attribute.empty?)
@@ -21,7 +26,7 @@ module Nokogiri
21
26
  "#{attribute}=#{send(attribute).inspect}"
22
27
  end.join(" ")
23
28
  end
24
- "#<#{self.class.name}:#{format("0x%x", object_id)} #{attributes}>"
29
+ "#<#{self.class}:#{format("0x%x", object_id)} #{attributes}>"
25
30
  end
26
31
 
27
32
  def pretty_print(pp)
@@ -3,32 +3,34 @@
3
3
  module Nokogiri
4
4
  module XML
5
5
  ###
6
- # Nokogiri::XML::Reader parses an XML document similar to the way a cursor
7
- # would move. The Reader is given an XML document, and yields nodes
8
- # to an each block.
6
+ # The Reader parser allows you to effectively pull-parse an \XML document. Once instantiated,
7
+ # call Nokogiri::XML::Reader#each to iterate over each node.
8
+ #
9
+ # Nokogiri::XML::Reader parses an \XML document similar to the way a cursor would move. The
10
+ # Reader is given an \XML document, and yields nodes to an each block.
11
+ #
12
+ # The Reader parser might be good for when you need the speed and low memory usage of a \SAX
13
+ # parser, but do not want to write a SAX::Document handler.
9
14
  #
10
15
  # Here is an example of usage:
11
16
  #
12
- # reader = Nokogiri::XML::Reader(<<-eoxml)
17
+ # reader = Nokogiri::XML::Reader.new <<~XML
13
18
  # <x xmlns:tenderlove='http://tenderlovemaking.com/'>
14
19
  # <tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
15
20
  # </x>
16
- # eoxml
21
+ # XML
17
22
  #
18
23
  # reader.each do |node|
19
- #
20
24
  # # node is an instance of Nokogiri::XML::Reader
21
25
  # puts node.name
22
- #
23
26
  # end
24
27
  #
25
- # Note that Nokogiri::XML::Reader#each can only be called once!! Once
26
- # the cursor moves through the entire document, you must parse the
27
- # document again. So make sure that you capture any information you
28
- # need during the first iteration.
28
+ # Nokogiri::XML::Reader#each can only be called once! Once the cursor moves through the entire
29
+ # document, you must parse the document again. It may be better to capture all information you
30
+ # need during a single iteration.
29
31
  #
30
- # The Reader parser is good for when you need the speed of a SAX parser,
31
- # but do not want to write a Document handler.
32
+ # libxml2 does not support error recovery in the Reader parser. The +RECOVER+ ParseOption is
33
+ # ignored. If a syntax error is encountered during parsing, an exception will be raised.
32
34
  class Reader
33
35
  include Enumerable
34
36
 
@@ -65,23 +67,55 @@ module Nokogiri
65
67
  TYPE_END_ELEMENT = 15
66
68
  # Entity end node type
67
69
  TYPE_END_ENTITY = 16
68
- # XML Declaration node type
70
+ # \XML Declaration node type
69
71
  TYPE_XML_DECLARATION = 17
70
72
 
71
73
  # A list of errors encountered while parsing
72
74
  attr_accessor :errors
73
75
 
74
- # The XML source
76
+ # The \XML source
75
77
  attr_reader :source
76
78
 
77
79
  alias_method :self_closing?, :empty_element?
78
80
 
79
- def initialize(source, url = nil, encoding = nil) # :nodoc:
81
+ # :call-seq:
82
+ # Reader.new(input) { |options| ... } → Reader
83
+ # Reader.new(input, url:, encoding:, options:) { |options| ... } → Reader
84
+ #
85
+ # Create a new Reader to parse an \XML document.
86
+ #
87
+ # [Required Parameters]
88
+ # - +input+ (String | IO): The \XML document to parse.
89
+ #
90
+ # [Optional Parameters]
91
+ # - +url:+ (String) The base URL of the document.
92
+ # - +encoding:+ (String) The name of the encoding of the document.
93
+ # - +options:+ (Integer | ParseOptions) Options to control the parser behavior.
94
+ # Defaults to +ParseOptions::STRICT+.
95
+ #
96
+ # [Yields]
97
+ # If present, the block will be passed a Nokogiri::XML::ParseOptions object to modify before
98
+ # the document is parsed. See Nokogiri::XML::ParseOptions for more information.
99
+ def self.new(
100
+ string_or_io,
101
+ url_ = nil, encoding_ = nil, options_ = ParseOptions::STRICT,
102
+ url: url_, encoding: encoding_, options: options_
103
+ )
104
+ options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
105
+ yield options if block_given?
106
+
107
+ if string_or_io.respond_to?(:read)
108
+ return Reader.from_io(string_or_io, url, encoding, options.to_i)
109
+ end
110
+
111
+ Reader.from_memory(string_or_io, url, encoding, options.to_i)
112
+ end
113
+
114
+ private def initialize(source, url = nil, encoding = nil) # :nodoc:
80
115
  @source = source
81
116
  @errors = []
82
117
  @encoding = encoding
83
118
  end
84
- private :initialize
85
119
 
86
120
  # Get the attributes and namespaces of the current node as a Hash.
87
121
  #