moxml 0.1.20 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/opal.yml +37 -0
  3. data/.rspec-opal +5 -0
  4. data/Gemfile +6 -0
  5. data/Rakefile +67 -0
  6. data/lib/compat/opal/rexml/namespace.rb +56 -0
  7. data/lib/compat/opal/rexml/parsers/baseparser.rb +952 -0
  8. data/lib/compat/opal/rexml/source.rb +213 -0
  9. data/lib/compat/opal/rexml/text.rb +418 -0
  10. data/lib/compat/opal/rexml/xmltokens.rb +45 -0
  11. data/lib/compat/opal/rexml_compat.rb +76 -0
  12. data/lib/moxml/adapter/base.rb +5 -0
  13. data/lib/moxml/adapter/customized_libxml/node.rb +3 -0
  14. data/lib/moxml/adapter/customized_libxml/text.rb +6 -1
  15. data/lib/moxml/adapter/customized_rexml/formatter.rb +11 -10
  16. data/lib/moxml/adapter/headed_ox.rb +2 -6
  17. data/lib/moxml/adapter/libxml/entity_ref_registry.rb +105 -0
  18. data/lib/moxml/adapter/libxml/entity_restorer.rb +92 -0
  19. data/lib/moxml/adapter/libxml.rb +386 -382
  20. data/lib/moxml/adapter/nokogiri.rb +7 -18
  21. data/lib/moxml/adapter/oga.rb +4 -22
  22. data/lib/moxml/adapter/ox.rb +8 -23
  23. data/lib/moxml/adapter/rexml.rb +29 -33
  24. data/lib/moxml/adapter.rb +38 -8
  25. data/lib/moxml/config.rb +1 -1
  26. data/lib/moxml/entity_registry.rb +36 -31
  27. data/lib/moxml/entity_registry_opal_data.rb +2137 -0
  28. data/lib/moxml/node.rb +19 -26
  29. data/lib/moxml/sax/namespace_splitter.rb +54 -0
  30. data/lib/moxml/version.rb +1 -1
  31. data/lib/moxml/xml_utils.rb +9 -1
  32. data/spec/consistency/adapter_parity_spec.rb +1 -1
  33. data/spec/integration/all_adapters_spec.rb +1 -1
  34. data/spec/integration/w3c_namespace_spec.rb +1 -1
  35. data/spec/moxml/adapter/libxml_internals_spec.rb +167 -0
  36. data/spec/moxml/adapter/ox_spec.rb +8 -0
  37. data/spec/moxml/adapter/platform_spec.rb +69 -0
  38. data/spec/moxml/adapter/shared_examples/adapter_contract.rb +0 -6
  39. data/spec/moxml/entity_registry_spec.rb +10 -0
  40. data/spec/moxml/native_attachment/opal_spec.rb +39 -2
  41. data/spec/moxml/node_type_map_spec.rb +43 -0
  42. data/spec/moxml/opal_rexml_adapter_spec.rb +14 -0
  43. data/spec/moxml/opal_smoke_spec.rb +61 -0
  44. data/spec/moxml/sax/namespace_splitter_spec.rb +67 -0
  45. data/spec/moxml/text_spec.rb +1 -1
  46. data/spec/performance/benchmark_spec.rb +1 -1
  47. data/spec/spec_helper.rb +32 -13
  48. data/spec/support/opal.rb +16 -0
  49. metadata +21 -2
@@ -1,8 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ return if RUBY_ENGINE == "opal"
4
+
3
5
  require_relative "base"
4
6
  require "libxml"
5
7
  require_relative "customized_libxml"
8
+ require_relative "../sax/namespace_splitter"
6
9
 
7
10
  module Moxml
8
11
  module Adapter
@@ -37,6 +40,32 @@ module Moxml
37
40
  end
38
41
  end
39
42
 
43
+ # Mapping from libxml's integer node_type to our symbol — built once
44
+ # at load so `node_type` can do a single hash lookup on the hot path
45
+ # instead of a large case/when on every node.
46
+ NATIVE_NODE_TYPE_MAP = {
47
+ ::LibXML::XML::Node::ELEMENT_NODE => :element,
48
+ ::LibXML::XML::Node::TEXT_NODE => :text,
49
+ ::LibXML::XML::Node::CDATA_SECTION_NODE => :cdata,
50
+ ::LibXML::XML::Node::COMMENT_NODE => :comment,
51
+ ::LibXML::XML::Node::PI_NODE => :processing_instruction,
52
+ ::LibXML::XML::Node::ATTRIBUTE_NODE => :attribute,
53
+ ::LibXML::XML::Node::DTD_NODE => :doctype,
54
+ ::LibXML::XML::Node::DOCUMENT_NODE => :document,
55
+ }.freeze
56
+ private_constant :NATIVE_NODE_TYPE_MAP
57
+
58
+ WRAPPER_NODE_TYPE_MAP = {
59
+ DoctypeWrapper => :doctype,
60
+ CustomizedLibxml::Element => :element,
61
+ CustomizedLibxml::Text => :text,
62
+ CustomizedLibxml::Cdata => :cdata,
63
+ CustomizedLibxml::Comment => :comment,
64
+ CustomizedLibxml::ProcessingInstruction => :processing_instruction,
65
+ CustomizedLibxml::EntityReference => :entity_reference,
66
+ }.freeze
67
+ private_constant :WRAPPER_NODE_TYPE_MAP
68
+
40
69
  class << self
41
70
  def attachments
42
71
  @attachments ||= Moxml::NativeAttachment.new
@@ -98,7 +127,22 @@ module Moxml
98
127
  end
99
128
 
100
129
  ctx = _context || Context.new(:libxml)
101
- DocumentBuilder.new(ctx).build(native_doc)
130
+ # Single parse path: wrap libxml's already-complete C-parsed tree
131
+ # directly (same pattern as nokogiri/ox). The previous
132
+ # DocumentBuilder.build path walked the entire parsed tree and
133
+ # re-added every node to a fresh moxml-managed document, which
134
+ # made parse O(N) Ruby work on top of an already-complete parse.
135
+ # Doctype/declaration/PI attachments set above remain on
136
+ # native_doc, so the serialize path still sees them.
137
+ #
138
+ # restore_entities is handled as a post-processing case after
139
+ # the wrap, NOT by branching into a different builder. This way
140
+ # any new parse-time logic only has to be added to this one
141
+ # path; the restoration walk is just one of potentially several
142
+ # post-processing steps and doesn't fork the construction.
143
+ doc = Document.new(native_doc, ctx)
144
+ EntityRestorer.new(doc).run if ctx.config.restore_entities
145
+ doc
102
146
  end
103
147
 
104
148
  # SAX parsing implementation for LibXML
@@ -181,40 +225,23 @@ module Moxml
181
225
  def node_type(node)
182
226
  return :unknown unless node
183
227
 
184
- # Handle wrapper classes
185
- return :element if node.is_a?(CustomizedLibxml::Element)
186
- return :text if node.is_a?(CustomizedLibxml::Text)
187
- return :cdata if node.is_a?(CustomizedLibxml::Cdata)
188
- return :comment if node.is_a?(CustomizedLibxml::Comment)
189
- if node.is_a?(CustomizedLibxml::ProcessingInstruction)
190
- return :processing_instruction
228
+ # Fast path: native libxml nodes are the vast majority during
229
+ # parse traversal (DocumentBuilder visits raw libxml children).
230
+ # Skip the wrapper checks below for them.
231
+ if node.is_a?(::LibXML::XML::Node)
232
+ return NATIVE_NODE_TYPE_MAP[node.node_type] || :unknown
191
233
  end
192
- return :entity_reference if node.is_a?(CustomizedLibxml::EntityReference)
193
- return :doctype if node.is_a?(DoctypeWrapper)
234
+ return :document if node.is_a?(::LibXML::XML::Document)
194
235
 
195
- # Unwrap if needed
196
- native_node = unpatch_node(node)
236
+ wrapper_type = WRAPPER_NODE_TYPE_MAP[node.class]
237
+ return wrapper_type if wrapper_type
197
238
 
198
- case native_node.node_type
199
- when ::LibXML::XML::Node::DOCUMENT_NODE
200
- :document
201
- when ::LibXML::XML::Node::ELEMENT_NODE
202
- :element
203
- when ::LibXML::XML::Node::TEXT_NODE
204
- :text
205
- when ::LibXML::XML::Node::CDATA_SECTION_NODE
206
- :cdata
207
- when ::LibXML::XML::Node::COMMENT_NODE
208
- :comment
209
- when ::LibXML::XML::Node::ATTRIBUTE_NODE
210
- :attribute
211
- when ::LibXML::XML::Node::PI_NODE
212
- :processing_instruction
213
- when ::LibXML::XML::Node::DTD_NODE
214
- :doctype
215
- else
216
- :unknown
217
- end
239
+ # Duck-typed fallback for libxml types that aren't ::Node
240
+ # subclasses but still expose node_type (e.g. ::Attr).
241
+ native = unpatch_node(node)
242
+ return :unknown unless native.respond_to?(:node_type)
243
+
244
+ NATIVE_NODE_TYPE_MAP[native.node_type] || :unknown
218
245
  end
219
246
 
220
247
  def node_name(node)
@@ -300,7 +327,7 @@ module Moxml
300
327
 
301
328
  # Include any EntityReference wrappers stored on the document
302
329
  doc = native_node.doc
303
- entity_refs = doc ? lookup_entity_refs(doc, native_node) : nil
330
+ entity_refs = entity_ref_registry(doc).refs_for(native_node)
304
331
  result.concat(entity_refs) if entity_refs
305
332
 
306
333
  result
@@ -317,7 +344,7 @@ module Moxml
317
344
  current = native_node&.next
318
345
  while current
319
346
  # Skip whitespace-only text nodes
320
- break unless current.text? && current.content.to_s.strip.empty?
347
+ break unless blank_text_node?(current)
321
348
 
322
349
  current = current.next
323
350
  end
@@ -329,7 +356,7 @@ module Moxml
329
356
  current = native_node&.prev
330
357
  while current
331
358
  # Skip whitespace-only text nodes
332
- break unless current.text? && current.content.to_s.strip.empty?
359
+ break unless blank_text_node?(current)
333
360
 
334
361
  current = current.prev
335
362
  end
@@ -504,21 +531,22 @@ module Moxml
504
531
  native_child = unpatch_node(child)
505
532
 
506
533
  # EntityReference wrappers can't go in LibXML's native tree.
507
- # Store on the document (stable identity) keyed by element.
508
- # LibXML creates new Ruby wrappers on each access, so element
509
- # object_id is unstable — we look up via == comparison.
534
+ # Store them on the document for interleaved serialization.
510
535
  if child.is_a?(CustomizedLibxml::EntityReference)
511
536
  doc = native_elem.is_a?(::LibXML::XML::Document) ? native_elem : native_elem.doc
512
- store_entity_ref_on_doc(doc, native_elem, child)
513
- append_child_sequence_on_doc(doc, native_elem, :eref)
537
+ entity_ref_registry(doc).register(native_elem, child)
514
538
  return
515
539
  end
516
540
 
517
541
  # For LibXML: if parent has a DEFAULT namespace (nil/empty prefix) and child is an element without a namespace,
518
542
  # explicitly set the child's namespace to match the parent's for XPath compatibility
519
- # NOTE: Prefixed namespaces are NOT inherited, only default namespaces
520
- if native_elem.is_a?(::LibXML::XML::Node) && native_elem.namespaces&.namespace &&
521
- native_child.is_a?(::LibXML::XML::Node) && native_child.element? &&
543
+ # NOTE: Prefixed namespaces are NOT inherited, only default namespaces.
544
+ #
545
+ # Reorder cheap-first: skip the expensive `.namespaces` fetches
546
+ # entirely for non-element children (text, comment, cdata, PI),
547
+ # which is roughly 30-50% of adds in a typical doc.
548
+ if native_child.is_a?(::LibXML::XML::Node) && native_child.element? &&
549
+ native_elem.is_a?(::LibXML::XML::Node) && native_elem.namespaces&.namespace &&
522
550
  (!native_child.namespaces.namespace || native_child.namespaces.namespace.href.to_s.empty?)
523
551
 
524
552
  parent_ns = native_elem.namespaces.namespace
@@ -574,53 +602,10 @@ module Moxml
574
602
  else
575
603
  import_and_add(native_elem.doc, native_elem, native_child)
576
604
  doc = native_elem.doc || native_elem
577
- append_child_sequence_on_doc(doc, native_elem, :native)
605
+ entity_ref_registry(doc).append_native(native_elem)
578
606
  end
579
607
  end
580
608
 
581
- # Store entity ref on the document (stable identity).
582
- # LibXML element wrappers are ephemeral, so we use == to find matching elements.
583
- def store_entity_ref_on_doc(doc, element, ref)
584
- pairs = attachments.get(doc, :_entity_ref_pairs) || []
585
- pair = pairs.find { |elem, _| elem == element }
586
- if pair
587
- pair[1] << ref
588
- else
589
- pairs << [element, [ref]]
590
- end
591
- attachments.set(doc, :_entity_ref_pairs, pairs)
592
- end
593
-
594
- # Look up entity refs for an element from the document
595
- def lookup_entity_refs(doc, element)
596
- pairs = attachments.get(doc, :_entity_ref_pairs)
597
- return nil unless pairs
598
-
599
- pair = pairs.find { |elem, _| elem == element }
600
- pair&.last
601
- end
602
-
603
- # Track child order on the document (stable identity)
604
- def append_child_sequence_on_doc(doc, element, type)
605
- pairs = attachments.get(doc, :_child_seq_pairs) || []
606
- pair = pairs.find { |elem, _| elem == element }
607
- if pair
608
- pair[1] << type
609
- else
610
- pairs << [element, [type]]
611
- end
612
- attachments.set(doc, :_child_seq_pairs, pairs)
613
- end
614
-
615
- # Look up child sequence for an element from the document
616
- def lookup_child_sequence(doc, element)
617
- pairs = attachments.get(doc, :_child_seq_pairs)
618
- return nil unless pairs
619
-
620
- pair = pairs.find { |elem, _| elem == element }
621
- pair&.last
622
- end
623
-
624
609
  def append_child_sequence(element, type)
625
610
  seq = attachments.get(element, :child_sequence) || []
626
611
  seq << type
@@ -976,21 +961,23 @@ module Moxml
976
961
  end
977
962
 
978
963
  if native_node.root
979
- # Use our custom serializer to control namespace output
964
+ indent_size = options[:indent].is_a?(Integer) && options[:indent].positive? ? options[:indent] : 0
965
+ # Custom serializer emits newlines AND indentation directly —
966
+ # no separate add_newlines_to_xml / indent_xml passes.
967
+ # `eref_active` is computed once here and threaded through the
968
+ # recursion so that the per-element `attachments.key?` Monitor
969
+ # sync only fires for docs that actually have entity refs.
970
+ eref_active = entity_ref_registry(native_node).active?
980
971
  root_output = serialize_element_with_namespaces(
981
972
  native_node.root,
982
973
  true,
974
+ indent_size,
975
+ 0,
976
+ eref_active: eref_active,
983
977
  )
984
978
 
985
- # Apply indentation if requested
986
- if options[:indent]&.positive?
987
- # First add newlines between elements
988
- formatted = add_newlines_to_xml(root_output)
989
- output << "\n" << indent_xml(formatted, options[:indent])
990
- else
991
- output << "\n" << root_output unless output.empty?
992
- output << root_output if output.empty?
993
- end
979
+ output << "\n" << root_output unless output.empty?
980
+ output << root_output if output.empty?
994
981
  end
995
982
 
996
983
  output
@@ -999,104 +986,22 @@ module Moxml
999
986
  end
1000
987
  end
1001
988
 
1002
- def add_newlines_to_xml(xml_string)
1003
- # Add newlines between XML elements for proper indentation
1004
- # But don't add newlines between opening and immediate closing tags (e.g., <tag></tag>)
1005
- # And most importantly, don't add newlines inside CDATA sections
1006
-
1007
- # First, protect CDATA sections by replacing them with placeholders
1008
- # Manual scanning guarantees O(n) complexity with no backtracking (ReDoS-safe)
1009
- cdata_sections = []
1010
- result = +""
1011
- pos = 0
1012
-
1013
- loop do
1014
- # Find next CDATA start
1015
- cdata_start = xml_string.index("<![CDATA[", pos)
1016
-
1017
- if cdata_start
1018
- # Copy everything before CDATA
1019
- result << xml_string[pos...cdata_start]
1020
-
1021
- # Find CDATA end
1022
- cdata_content_start = cdata_start + 9 # Length of "<![CDATA["
1023
- cdata_end = xml_string.index("]]>", cdata_content_start)
1024
-
1025
- if cdata_end
1026
- # Extract full CDATA including markers
1027
- full_cdata_end = cdata_end + 3 # Include "]]>"
1028
- cdata_section = xml_string[cdata_start...full_cdata_end]
1029
-
1030
- # Store and add placeholder
1031
- cdata_sections << cdata_section
1032
- result << "__CDATA_PLACEHOLDER_#{cdata_sections.length - 1}__"
1033
-
1034
- # Continue after this CDATA
1035
- pos = full_cdata_end
1036
- else
1037
- # Malformed CDATA (no closing "]]>") - copy as-is
1038
- result << xml_string[cdata_start..]
1039
- break
1040
- end
1041
- else
1042
- # No more CDATA sections - copy rest
1043
- result << xml_string[pos..]
1044
- break
1045
- end
1046
- end
1047
-
1048
- protected = result
1049
-
1050
- # Add newlines between elements (but not in CDATA - already protected)
1051
- with_newlines = protected.gsub(%r{(<[^>]+)>(?=<(?!/))}, "\\1>\n")
1052
-
1053
- # Restore CDATA sections
1054
- cdata_sections.each_with_index do |cdata, index|
1055
- with_newlines.sub!("__CDATA_PLACEHOLDER_#{index}__", cdata)
1056
- end
1057
-
1058
- with_newlines
1059
- end
1060
-
1061
- def indent_xml(xml_string, indent_size)
1062
- # Simple line-by-line indentation
1063
- lines = []
1064
- level = 0
1065
-
1066
- xml_string.each_line do |line|
1067
- line = line.strip
1068
- next if line.empty?
1069
-
1070
- # Decrease level for closing tags
1071
- level -= 1 if line.start_with?("</")
1072
- level = [level, 0].max
1073
-
1074
- # Add indented line
1075
- lines << ((" " * (indent_size * level)) + line)
1076
-
1077
- # Increase level for opening tags (but not self-closing or special tags)
1078
- next unless line.start_with?("<") && !line.start_with?("</") &&
1079
- !line.end_with?("/>") && !line.start_with?("<?") &&
1080
- !line.start_with?("<!") && !line.include?("</")
1081
-
1082
- level += 1
1083
- end
1084
-
1085
- lines.join("\n")
1086
- end
1087
-
989
+ # Shallow duplication: copies the node itself (name, attrs, namespaces)
990
+ # but NOT its descendants. This is what DocumentBuilder needs — it
991
+ # walks the source tree and re-adds children one at a time via
992
+ # add_child, so a deep copy here would be done only to be stripped
993
+ # by replace_children, then rebuilt — O(N²) waste on parse.
994
+ #
995
+ # For callers that need a true deep copy (e.g. the import_and_add
996
+ # fallback when LibXML can't move the subtree directly), use
997
+ # deep_duplicate_node.
1088
998
  def duplicate_node(node)
1089
999
  return nil unless node
1090
1000
 
1091
- # Unwrap if wrapped
1092
1001
  native_node = unpatch_node(node)
1093
1002
 
1094
- # LibXML is strict about document ownership
1095
- # Create brand new NATIVE nodes that are document-independent
1096
- # Wrappers are only used via patch_node when reading children
1097
1003
  case node_type(node)
1098
1004
  when :doctype
1099
- # DoctypeWrapper - create a new one with same properties
1100
1005
  if node.is_a?(DoctypeWrapper)
1101
1006
  DoctypeWrapper.new(
1102
1007
  create_document,
@@ -1105,64 +1010,10 @@ module Moxml
1105
1010
  node.system_id,
1106
1011
  )
1107
1012
  else
1108
- # Should not happen, but handle gracefully
1109
1013
  node
1110
1014
  end
1111
1015
  when :element
1112
- new_node = ::LibXML::XML::Node.new(native_node.name)
1113
- # new_node.line = node.line
1114
-
1115
- # Copy and set namespace definitions FIRST
1116
- if native_node.is_a?(::LibXML::XML::Node)
1117
- # First, copy all namespace definitions
1118
- native_node.namespaces.each do |ns|
1119
- ::LibXML::XML::Namespace.new(
1120
- new_node,
1121
- ns.prefix,
1122
- ns.href,
1123
- )
1124
- end
1125
-
1126
- # Then, set this element's own namespace if it has one
1127
- if native_node.namespaces.namespace
1128
- orig_ns = native_node.namespaces.namespace
1129
- # Find the matching namespace we just created
1130
- new_node.namespaces.each do |ns|
1131
- if ns.prefix == orig_ns.prefix && ns.href == orig_ns.href
1132
- new_node.namespaces.namespace = ns
1133
- break
1134
- end
1135
- end
1136
- end
1137
- end
1138
-
1139
- # Copy attributes AFTER namespaces are set up
1140
- # LibXML handles namespaced attributes through their full names
1141
- if native_node.attributes?
1142
- native_node.each_attr do |attr|
1143
- # Get the full attribute name (may include namespace prefix)
1144
- attr_name = if attr.ns&.prefix
1145
- "#{attr.ns.prefix}:#{attr.name}"
1146
- else
1147
- attr.name
1148
- end
1149
- new_node[attr_name] = attr.value
1150
- end
1151
- end
1152
-
1153
- # Recursively copy children
1154
- if native_node.children?
1155
- native_node.each_child do |child|
1156
- # Skip whitespace-only text nodes
1157
- next if child.text? && child.content.to_s.strip.empty?
1158
-
1159
- # Recursively duplicate the child
1160
- child_copy = duplicate_node(child)
1161
- new_node << child_copy
1162
- end
1163
- end
1164
-
1165
- new_node
1016
+ shallow_duplicate_element(native_node)
1166
1017
  when :text
1167
1018
  ::LibXML::XML::Node.new_text(native_node.content)
1168
1019
  when :cdata
@@ -1172,7 +1023,6 @@ module Moxml
1172
1023
  when :processing_instruction
1173
1024
  ::LibXML::XML::Node.new_pi(native_node.name, native_node.content)
1174
1025
  else
1175
- # For other types, try dup as fallback
1176
1026
  native_node.dup
1177
1027
  end
1178
1028
  end
@@ -1281,7 +1131,7 @@ module Moxml
1281
1131
  if elem.children?
1282
1132
  elem.each_child do |child|
1283
1133
  # Skip whitespace-only text nodes
1284
- next if child.text? && child.content.to_s.strip.empty?
1134
+ next if blank_text_node?(child)
1285
1135
 
1286
1136
  output << serialize_node(child)
1287
1137
  end
@@ -1289,7 +1139,7 @@ module Moxml
1289
1139
 
1290
1140
  # Append any EntityReference wrappers stored on the document
1291
1141
  doc = elem.doc
1292
- entity_refs = doc ? lookup_entity_refs(doc, elem) : nil
1142
+ entity_refs = entity_ref_registry(doc).refs_for(elem)
1293
1143
  entity_refs&.each { |ref| output << ref.to_xml }
1294
1144
 
1295
1145
  output << "</#{elem.name}>"
@@ -1331,21 +1181,18 @@ module Moxml
1331
1181
  .gsub(">", "&gt;")
1332
1182
  end
1333
1183
 
1184
+ ESCAPE_XML_RE = /[&<>"]/
1185
+ ESCAPE_XML_MAP = { "&" => "&amp;", "<" => "&lt;", ">" => "&gt;", '"' => "&quot;" }.freeze
1186
+ private_constant :ESCAPE_XML_RE, :ESCAPE_XML_MAP
1187
+
1334
1188
  def escape_xml(text)
1335
- text.to_s
1336
- .gsub("&", "&amp;")
1337
- .gsub("<", "&lt;")
1338
- .gsub(">", "&gt;")
1339
- .gsub("\"", "&quot;")
1340
- end
1189
+ # One gsub pass with a Hash replacement allocates a single new
1190
+ # string. The previous chained gsubs allocated three throwaway
1191
+ # strings on every call (very hot for attribute-heavy XML).
1192
+ str = text.is_a?(String) ? text : text.to_s
1193
+ return str unless str.match?(ESCAPE_XML_RE)
1341
1194
 
1342
- def escape_attribute_value(value)
1343
- escaped = value.to_s
1344
- .gsub("&", "&amp;")
1345
- .gsub("<", "&lt;")
1346
- .gsub(">", "&gt;")
1347
- .gsub("\"", "&quot;")
1348
- escaped.to_s
1195
+ str.gsub(ESCAPE_XML_RE, ESCAPE_XML_MAP)
1349
1196
  end
1350
1197
 
1351
1198
  def import_and_add(doc, element, child)
@@ -1368,7 +1215,7 @@ module Moxml
1368
1215
  else
1369
1216
  # No target document - create a deep copy of the node instead
1370
1217
  # This handles the case where the element isn't attached to a document yet
1371
- copied = duplicate_node(child)
1218
+ copied = deep_duplicate_node(child)
1372
1219
  element << copied
1373
1220
  end
1374
1221
 
@@ -1413,127 +1260,242 @@ module Moxml
1413
1260
  end
1414
1261
  end
1415
1262
 
1416
- def serialize_element_with_namespaces(elem, include_ns = true)
1417
- output = "<#{elem.name}"
1263
+ def serialize_element_with_namespaces(elem, include_ns = true,
1264
+ indent_size = 0, depth = 0,
1265
+ eref_active: nil)
1266
+ # Cache elem.name — it's a libxml C call we'd otherwise make
1267
+ # twice (open tag + close tag). Concat with `<<` instead of
1268
+ # `"<#{name}"` to avoid the interpolated intermediate string.
1269
+ name = elem.name
1270
+ output = +"<"
1271
+ output << name
1272
+ emit_namespace_definitions(output, elem, include_ns)
1273
+ emit_attributes(output, elem)
1274
+
1275
+ # `eref_active` is precomputed at the top-level `serialize` call
1276
+ # and threaded down — when nil (top-level non-recursive call into
1277
+ # this method), look it up; when false, skip the per-element doc
1278
+ # attachment query that otherwise fires for every element under
1279
+ # Monitor#synchronize.
1280
+ eref_active = doc_eref_active?(elem.doc) if eref_active.nil?
1281
+ entity_refs, child_sequence = eref_active ? lookup_entity_ref_serialization(elem) : [nil, nil]
1418
1282
 
1419
- # Include namespace definitions:
1420
- # - On root element (include_ns = true), output ALL namespace definitions
1421
- # - On child elements, output namespace definitions that override parent namespaces
1422
- if elem.is_a?(::LibXML::XML::Node) && elem.namespaces.respond_to?(:definitions)
1423
- # Get parent's namespace definitions to detect overrides
1424
- parent_ns_defs = if !include_ns && elem.parent && !elem.parent.is_a?(::LibXML::XML::Document)
1425
- parent_namespaces = {}
1426
- if elem.parent.is_a?(::LibXML::XML::Node)
1427
- elem.parent.namespaces.each do |ns|
1428
- parent_namespaces[ns.prefix] = ns.href
1429
- end
1430
- end
1431
- parent_namespaces
1432
- else
1433
- {}
1434
- end
1283
+ # Always use verbose format <tag></tag> for consistency with other adapters
1284
+ output << ">"
1435
1285
 
1436
- seen_ns = {}
1437
- elem.namespaces.definitions.each do |ns|
1438
- prefix = ns.prefix
1439
- uri = ns.href
1440
- next if seen_ns.key?(prefix)
1286
+ if entity_refs && child_sequence
1287
+ emit_eref_interleaved_children(output, elem, entity_refs, child_sequence,
1288
+ indent_size, depth, eref_active: eref_active)
1289
+ elsif elem.children?
1290
+ emit_children_with_layout(output, elem, indent_size, depth,
1291
+ eref_active: eref_active)
1292
+ end
1441
1293
 
1442
- # Output namespace if:
1443
- # 1. This is root element (include_ns = true), OR
1444
- # 2. This namespace overrides a parent namespace (different URI for same prefix)
1445
- should_output = include_ns ||
1446
- (parent_ns_defs.key?(prefix) && parent_ns_defs[prefix] != uri)
1294
+ output << "</" << name << ">"
1295
+ output
1296
+ end
1447
1297
 
1448
- next unless should_output
1298
+ def doc_eref_active?(doc)
1299
+ entity_ref_registry(doc).active?
1300
+ end
1449
1301
 
1450
- seen_ns[prefix] = true
1451
- output << if prefix.nil? || prefix.empty?
1452
- " xmlns=\"#{escape_xml(uri)}\""
1453
- else
1454
- " xmlns:#{prefix}=\"#{escape_xml(uri)}\""
1455
- end
1456
- end
1302
+ def entity_ref_registry(doc)
1303
+ EntityRefRegistry.new(attachments, doc)
1304
+ end
1305
+
1306
+ # Emit `xmlns`/`xmlns:foo` declarations onto `output`. On the root
1307
+ # (`include_ns: true`) we emit ALL definitions; on children we
1308
+ # emit only definitions that OVERRIDE a parent's same-prefix URI.
1309
+ # Skips the whole block when the element has no local definitions,
1310
+ # which is the common case for child elements in unnamespaced docs.
1311
+ def emit_namespace_definitions(output, elem, include_ns)
1312
+ return unless elem.is_a?(::LibXML::XML::Node)
1313
+
1314
+ ns_list = elem.namespaces
1315
+ return unless ns_list.respond_to?(:definitions)
1316
+
1317
+ definitions = ns_list.definitions
1318
+ return if definitions.empty?
1319
+
1320
+ parent_ns_defs = include_ns ? nil : parent_namespace_defs(elem)
1321
+ seen_ns = nil
1322
+
1323
+ definitions.each do |ns|
1324
+ prefix = ns.prefix
1325
+ uri = ns.href
1326
+ next unless include_ns ||
1327
+ (parent_ns_defs&.key?(prefix) && parent_ns_defs[prefix] != uri)
1328
+
1329
+ seen_ns ||= {}
1330
+ next if seen_ns.key?(prefix)
1331
+
1332
+ seen_ns[prefix] = true
1333
+ output << format_ns_declaration(prefix, uri)
1457
1334
  end
1335
+ end
1458
1336
 
1459
- # Add attributes
1460
- if elem.attributes?
1461
- elem.each_attr do |attr|
1462
- next if attr.name.start_with?("xmlns")
1337
+ def parent_namespace_defs(elem)
1338
+ parent = elem.parent
1339
+ return nil unless parent.is_a?(::LibXML::XML::Node)
1463
1340
 
1464
- # Include namespace prefix if attribute has one
1465
- attr_name = if attr.ns&.prefix
1466
- "#{attr.ns.prefix}:#{attr.name}"
1467
- else
1468
- attr.name
1469
- end
1470
- output << " #{attr_name}=\"#{escape_xml(attr.value)}\""
1471
- end
1341
+ defs = {}
1342
+ parent.namespaces.each { |ns| defs[ns.prefix] = ns.href }
1343
+ defs
1344
+ end
1345
+
1346
+ def format_ns_declaration(prefix, uri)
1347
+ if prefix.nil? || prefix.empty?
1348
+ " xmlns=\"#{escape_xml(uri)}\""
1349
+ else
1350
+ " xmlns:#{prefix}=\"#{escape_xml(uri)}\""
1472
1351
  end
1352
+ end
1473
1353
 
1474
- # Check for entity refs stored on the document
1475
- # LibXML element wrappers are ephemeral, so look up via == comparison
1476
- doc = elem.doc
1477
- entity_refs = doc ? lookup_entity_refs(doc, elem) : nil
1478
- child_sequence = doc ? lookup_child_sequence(doc, elem) : nil
1354
+ def emit_attributes(output, elem)
1355
+ return unless elem.attributes?
1479
1356
 
1480
- # Always use verbose format <tag></tag> for consistency with other adapters
1481
- output << ">"
1357
+ elem.each_attr do |attr|
1358
+ next if attr.name.start_with?("xmlns")
1482
1359
 
1483
- if entity_refs && !entity_refs.empty? && child_sequence
1484
- # Interleave native children with entity refs using tracked sequence
1485
- native_children = []
1486
- if elem.children?
1487
- elem.each_child do |c|
1488
- native_children << c unless c.text? && c.content.to_s.strip.empty?
1489
- end
1490
- end
1360
+ attr_name = attr.ns&.prefix ? "#{attr.ns.prefix}:#{attr.name}" : attr.name
1361
+ output << " #{attr_name}=\"#{escape_xml(attr.value)}\""
1362
+ end
1363
+ end
1491
1364
 
1492
- eref_idx = 0
1493
- native_idx = 0
1494
- child_sequence.each do |type|
1495
- case type
1496
- when :native
1497
- if native_idx < native_children.size
1498
- child = native_children[native_idx]
1499
- native_idx += 1
1500
- wrapped_child = patch_node(child)
1501
- output << if wrapped_child.is_a?(CustomizedLibxml::Node) && !wrapped_child.is_a?(CustomizedLibxml::Element)
1502
- wrapped_child.to_xml
1503
- elsif child.element?
1504
- serialize_element_with_namespaces(child, false)
1505
- else
1506
- serialize_node(child)
1507
- end
1508
- end
1509
- when :eref
1510
- if eref_idx < entity_refs.size
1511
- output << entity_refs[eref_idx].to_xml
1512
- eref_idx += 1
1365
+ # Returns [entity_refs, child_sequence] when the element has
1366
+ # interleaved entity references that the serializer needs to
1367
+ # weave back into the native child stream — otherwise [nil, nil].
1368
+ #
1369
+ # The caller is responsible for gating this with `eref_active`
1370
+ # (precomputed once per `serialize` call). When `eref_active` is
1371
+ # false this method is never entered, so the per-element doc
1372
+ # attachment query never fires.
1373
+ def lookup_entity_ref_serialization(elem)
1374
+ doc = elem.doc
1375
+ return [nil, nil] unless doc
1376
+
1377
+ entity_ref_registry(doc).serialization_for(elem)
1378
+ end
1379
+
1380
+ def emit_eref_interleaved_children(output, elem, entity_refs, child_sequence,
1381
+ indent_size, depth, eref_active:)
1382
+ native_children = collect_non_blank_children(elem)
1383
+ child_pad = indent_size.positive? ? " " * (indent_size * (depth + 1)) : nil
1384
+ eref_idx = 0
1385
+ native_idx = 0
1386
+ prev_block = true
1387
+
1388
+ child_sequence.each do |type|
1389
+ case type
1390
+ when :native
1391
+ if native_idx < native_children.size
1392
+ child = native_children[native_idx]
1393
+ is_text_like = child.text? || child.cdata?
1394
+ if prev_block && !is_text_like
1395
+ output << "\n"
1396
+ output << child_pad if child_pad
1513
1397
  end
1398
+ prev_block = !is_text_like
1399
+
1400
+ output << serialize_child_to_xml(
1401
+ child, indent_size: indent_size, depth: depth,
1402
+ eref_active: eref_active
1403
+ )
1404
+ native_idx += 1
1405
+ end
1406
+ when :eref
1407
+ if eref_idx < entity_refs.size
1408
+ output << entity_refs[eref_idx].to_xml
1409
+ eref_idx += 1
1410
+ prev_block = false
1514
1411
  end
1515
1412
  end
1516
- elsif elem.children?
1517
- elem.each_child do |child|
1518
- # Skip whitespace-only text nodes
1519
- next if child.text? && child.content.to_s.strip.empty?
1520
-
1521
- # Wrap the child and serialize
1522
- wrapped_child = patch_node(child)
1523
- output << if wrapped_child.is_a?(CustomizedLibxml::Node) && !wrapped_child.is_a?(CustomizedLibxml::Element)
1524
- # Use wrapper's to_xml for proper serialization
1525
- wrapped_child.to_xml
1526
- elsif child.element?
1527
- # Recursively serialize child elements
1528
- serialize_element_with_namespaces(child, false)
1529
- else
1530
- serialize_node(child)
1531
- end
1413
+ end
1414
+ end
1415
+
1416
+ # Regex used in place of `content.to_s.strip.empty?` for whitespace-only
1417
+ # text detection — `match?` allocates nothing while `.strip` makes a
1418
+ # throwaway copy of every text node's content on each visit.
1419
+ NON_WHITESPACE_RE = /\S/
1420
+ private_constant :NON_WHITESPACE_RE
1421
+
1422
+ def blank_text_node?(child)
1423
+ child.text? && blank_content?(child.content)
1424
+ end
1425
+
1426
+ def blank_content?(content)
1427
+ content.nil? || !content.match?(NON_WHITESPACE_RE)
1428
+ end
1429
+
1430
+ def collect_non_blank_children(elem)
1431
+ children = []
1432
+ return children unless elem.children?
1433
+
1434
+ elem.each_child do |c|
1435
+ children << c unless blank_text_node?(c)
1436
+ end
1437
+ children
1438
+ end
1439
+
1440
+ # Walk native children once and emit them with the same newline +
1441
+ # indentation layout the old `add_newlines_to_xml` + `indent_xml`
1442
+ # post-passes produced — but in a single recursion with no string
1443
+ # rescanning.
1444
+ #
1445
+ # Newline rule (matching `>(?=<(?!/))` with CDATA-placeholder
1446
+ # protection): emit `\n` + per-level padding before a child iff
1447
+ # the previous emitted sibling was block-level (ended with `>`)
1448
+ # AND the current sibling is block-level. Text and CDATA count
1449
+ # as text-like and suppress the newline on both sides (the
1450
+ # original CDATA placeholder broke the `>...<` adjacency
1451
+ # symmetrically).
1452
+ def emit_children_with_layout(output, elem, indent_size, depth,
1453
+ eref_active:)
1454
+ child_pad = indent_size.positive? ? " " * (indent_size * (depth + 1)) : nil
1455
+ prev_block = true
1456
+
1457
+ elem.each_child do |child|
1458
+ # Cache text? — used twice per child (whitespace skip + is_text_like).
1459
+ # For element children (the common case) both calls return false, so
1460
+ # caching saves a libxml C call.
1461
+ is_text = child.text?
1462
+ next if is_text && blank_content?(child.content)
1463
+
1464
+ is_text_like = is_text || child.cdata?
1465
+ if prev_block && !is_text_like
1466
+ output << "\n"
1467
+ output << child_pad if child_pad
1532
1468
  end
1469
+ prev_block = !is_text_like
1470
+
1471
+ output << serialize_child_to_xml(child, indent_size: indent_size, depth: depth,
1472
+ eref_active: eref_active)
1533
1473
  end
1534
- output << "</#{elem.name}>"
1474
+ end
1535
1475
 
1536
- output
1476
+ # Serialize one child node. Elements recurse into the layout-aware
1477
+ # path; non-element wrappers route through their own `to_xml`;
1478
+ # everything else falls through to the per-type serializer.
1479
+ # `indent_size:` and `depth:` are required to force callers to
1480
+ # decide whether the child should inherit the parent's indent
1481
+ # state — both current callers (plain and entity-ref interleave) pass the parent's values through.
1482
+ #
1483
+ # Element fast-path checked first to avoid allocating a wrapper
1484
+ # we'd immediately throw away (elements always recurse on the
1485
+ # raw native node, not the wrapper). For a typical document this
1486
+ # skips wrapper allocation for the majority of children.
1487
+ def serialize_child_to_xml(child, indent_size:, depth:, eref_active:)
1488
+ if child.element?
1489
+ return serialize_element_with_namespaces(child, false, indent_size, depth + 1,
1490
+ eref_active: eref_active)
1491
+ end
1492
+
1493
+ wrapped_child = patch_node(child)
1494
+ if wrapped_child.is_a?(CustomizedLibxml::Node)
1495
+ wrapped_child.to_xml
1496
+ else
1497
+ serialize_node(child)
1498
+ end
1537
1499
  end
1538
1500
 
1539
1501
  def remove_indentation(xml_string)
@@ -1631,6 +1593,63 @@ module Moxml
1631
1593
  end
1632
1594
  nil
1633
1595
  end
1596
+
1597
+ # Deep duplication for the rare `import_and_add` fallback (when
1598
+ # libxml refuses to move a subtree across documents AND no target
1599
+ # document is available). Walks the source subtree and rebuilds
1600
+ # it as document-independent nodes. The DocumentBuilder hot path
1601
+ # goes through the shallow `duplicate_node` instead.
1602
def deep_duplicate_node(node)
  return nil unless node

  source = unpatch_node(node)

  # Anything that is not an element is handed to the shallow duplicator.
  return duplicate_node(node) if node_type(node) != :element

  copy = shallow_duplicate_element(source)
  has_subtree = source.is_a?(::LibXML::XML::Node) && source.children?
  if has_subtree
    source.each_child do |child|
      # Blank text nodes are dropped rather than copied.
      copy << deep_duplicate_node(child) unless blank_text_node?(child)
    end
  end
  copy
end
1619
+
1620
+ # Copies a single element: its name, its OWN namespace definitions,
1621
+ # the active default namespace, and its attributes. Children are NOT
1622
+ # duplicated — callers that need the subtree use deep_duplicate_node.
1623
def shallow_duplicate_element(native_node)
  # Build the bare element first, then decorate it in place; `tap`
  # returns the freshly created node.
  ::LibXML::XML::Node.new(native_node.name).tap do |copy|
    copy_element_namespaces(native_node, copy) if native_node.is_a?(::LibXML::XML::Node)
    copy_element_attributes(native_node, copy) if native_node.attributes?
  end
end
1629
+
1630
def copy_element_namespaces(src, dst)
  source_namespaces = src.namespaces

  # Re-create every namespace yielded by the source on the destination.
  source_namespaces.each { |decl| ::LibXML::XML::Namespace.new(dst, decl.prefix, decl.href) }

  active = source_namespaces.namespace
  return unless active

  # Point the destination at its own copy of the source's namespace
  # (matched by prefix + href); stop at the first match.
  dst.namespaces.each do |candidate|
    if candidate.prefix == active.prefix && candidate.href == active.href
      dst.namespaces.namespace = candidate
      break
    end
  end
end
1646
+
1647
def copy_element_attributes(src, dst)
  src.each_attr do |attribute|
    # Namespaced attributes are written under their qualified
    # "prefix:name" key; plain ones keep their bare name.
    prefix = attribute.ns&.prefix
    qualified_name = prefix ? "#{prefix}:#{attribute.name}" : attribute.name
    dst[qualified_name] = attribute.value
  end
end
1634
1653
  end
1635
1654
 
1636
1655
  # Bridge between LibXML SAX and Moxml SAX
@@ -1640,6 +1659,7 @@ module Moxml
1640
1659
  # @private
1641
1660
  class LibXMLSAXBridge
1642
1661
  include ::LibXML::XML::SaxParser::Callbacks
1662
+ include Moxml::SAX::NamespaceSplitter
1643
1663
 
1644
1664
  def initialize(handler)
1645
1665
  @handler = handler
@@ -1656,26 +1676,7 @@ module Moxml
1656
1676
  end
1657
1677
 
1658
1678
  def on_start_element(name, attributes)
1659
- # Convert LibXML attributes hash to separate attrs and namespaces
1660
- attr_hash = {}
1661
- ns_hash = {}
1662
-
1663
- attributes&.each do |attr_name, attr_value|
1664
- if attr_name.to_s.start_with?("xmlns")
1665
- # Namespace declaration
1666
- prefix = if attr_name.to_s == "xmlns"
1667
- nil
1668
- else
1669
- attr_name.to_s.sub(
1670
- "xmlns:", ""
1671
- )
1672
- end
1673
- ns_hash[prefix] = attr_value
1674
- else
1675
- attr_hash[attr_name.to_s] = attr_value
1676
- end
1677
- end
1678
-
1679
+ attr_hash, ns_hash = split_attributes_and_namespaces(attributes)
1679
1680
  @handler.on_start_element(name.to_s, attr_hash, ns_hash)
1680
1681
  end
1681
1682
 
@@ -1706,3 +1707,6 @@ module Moxml
1706
1707
  end
1707
1708
  end
1708
1709
  end
1710
+
1711
+ require_relative "libxml/entity_ref_registry"
1712
+ require_relative "libxml/entity_restorer"