moxml 0.1.19 → 0.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,6 +37,32 @@ module Moxml
37
37
  end
38
38
  end
39
39
 
40
+ # Mapping from libxml's integer node_type to our symbol — built once
41
+ # at load so `node_type` can do a single hash lookup on the hot path
42
+ # instead of a large case/when on every node.
43
+ NATIVE_NODE_TYPE_MAP = {
44
+ ::LibXML::XML::Node::ELEMENT_NODE => :element,
45
+ ::LibXML::XML::Node::TEXT_NODE => :text,
46
+ ::LibXML::XML::Node::CDATA_SECTION_NODE => :cdata,
47
+ ::LibXML::XML::Node::COMMENT_NODE => :comment,
48
+ ::LibXML::XML::Node::PI_NODE => :processing_instruction,
49
+ ::LibXML::XML::Node::ATTRIBUTE_NODE => :attribute,
50
+ ::LibXML::XML::Node::DTD_NODE => :doctype,
51
+ ::LibXML::XML::Node::DOCUMENT_NODE => :document,
52
+ }.freeze
53
+ private_constant :NATIVE_NODE_TYPE_MAP
54
+
55
+ WRAPPER_NODE_TYPE_MAP = {
56
+ DoctypeWrapper => :doctype,
57
+ CustomizedLibxml::Element => :element,
58
+ CustomizedLibxml::Text => :text,
59
+ CustomizedLibxml::Cdata => :cdata,
60
+ CustomizedLibxml::Comment => :comment,
61
+ CustomizedLibxml::ProcessingInstruction => :processing_instruction,
62
+ CustomizedLibxml::EntityReference => :entity_reference,
63
+ }.freeze
64
+ private_constant :WRAPPER_NODE_TYPE_MAP
65
+
40
66
  class << self
41
67
  def attachments
42
68
  @attachments ||= Moxml::NativeAttachment.new
@@ -98,7 +124,22 @@ module Moxml
98
124
  end
99
125
 
100
126
  ctx = _context || Context.new(:libxml)
101
- DocumentBuilder.new(ctx).build(native_doc)
127
+ # Single parse path: wrap libxml's already-complete C-parsed tree
128
+ # directly (same pattern as nokogiri/ox). The previous
129
+ # DocumentBuilder.build path walked the entire parsed tree and
130
+ # re-added every node to a fresh moxml-managed document, which
131
+ # made parse O(N) Ruby work on top of an already-complete parse.
132
+ # Doctype/declaration/PI attachments set above remain on
133
+ # native_doc, so the serialize path still sees them.
134
+ #
135
+ # restore_entities is handled as a post-processing case after
136
+ # the wrap, NOT by branching into a different builder. This way
137
+ # any new parse-time logic only has to be added to this one
138
+ # path; the restoration walk is just one of potentially several
139
+ # post-processing steps and doesn't fork the construction.
140
+ doc = Document.new(native_doc, ctx)
141
+ EntityRestorer.new(doc).run if ctx.config.restore_entities
142
+ doc
102
143
  end
103
144
 
104
145
  # SAX parsing implementation for LibXML
@@ -181,40 +222,23 @@ module Moxml
181
222
  def node_type(node)
182
223
  return :unknown unless node
183
224
 
184
- # Handle wrapper classes
185
- return :element if node.is_a?(CustomizedLibxml::Element)
186
- return :text if node.is_a?(CustomizedLibxml::Text)
187
- return :cdata if node.is_a?(CustomizedLibxml::Cdata)
188
- return :comment if node.is_a?(CustomizedLibxml::Comment)
189
- if node.is_a?(CustomizedLibxml::ProcessingInstruction)
190
- return :processing_instruction
225
+ # Fast path: native libxml nodes are the vast majority during
226
+ # parse traversal (DocumentBuilder visits raw libxml children).
227
+ # Skip the wrapper checks below for them.
228
+ if node.is_a?(::LibXML::XML::Node)
229
+ return NATIVE_NODE_TYPE_MAP[node.node_type] || :unknown
191
230
  end
192
- return :entity_reference if node.is_a?(CustomizedLibxml::EntityReference)
193
- return :doctype if node.is_a?(DoctypeWrapper)
231
+ return :document if node.is_a?(::LibXML::XML::Document)
194
232
 
195
- # Unwrap if needed
196
- native_node = unpatch_node(node)
233
+ wrapper_type = WRAPPER_NODE_TYPE_MAP[node.class]
234
+ return wrapper_type if wrapper_type
197
235
 
198
- case native_node.node_type
199
- when ::LibXML::XML::Node::DOCUMENT_NODE
200
- :document
201
- when ::LibXML::XML::Node::ELEMENT_NODE
202
- :element
203
- when ::LibXML::XML::Node::TEXT_NODE
204
- :text
205
- when ::LibXML::XML::Node::CDATA_SECTION_NODE
206
- :cdata
207
- when ::LibXML::XML::Node::COMMENT_NODE
208
- :comment
209
- when ::LibXML::XML::Node::ATTRIBUTE_NODE
210
- :attribute
211
- when ::LibXML::XML::Node::PI_NODE
212
- :processing_instruction
213
- when ::LibXML::XML::Node::DTD_NODE
214
- :doctype
215
- else
216
- :unknown
217
- end
236
+ # Duck-typed fallback for libxml types that aren't ::Node
237
+ # subclasses but still expose node_type (e.g. ::Attr).
238
+ native = unpatch_node(node)
239
+ return :unknown unless native.respond_to?(:node_type)
240
+
241
+ NATIVE_NODE_TYPE_MAP[native.node_type] || :unknown
218
242
  end
219
243
 
220
244
  def node_name(node)
@@ -300,7 +324,7 @@ module Moxml
300
324
 
301
325
  # Include any EntityReference wrappers stored on the document
302
326
  doc = native_node.doc
303
- entity_refs = doc ? lookup_entity_refs(doc, native_node) : nil
327
+ entity_refs = entity_ref_registry(doc).refs_for(native_node)
304
328
  result.concat(entity_refs) if entity_refs
305
329
 
306
330
  result
@@ -317,7 +341,7 @@ module Moxml
317
341
  current = native_node&.next
318
342
  while current
319
343
  # Skip whitespace-only text nodes
320
- break unless current.text? && current.content.to_s.strip.empty?
344
+ break unless blank_text_node?(current)
321
345
 
322
346
  current = current.next
323
347
  end
@@ -329,7 +353,7 @@ module Moxml
329
353
  current = native_node&.prev
330
354
  while current
331
355
  # Skip whitespace-only text nodes
332
- break unless current.text? && current.content.to_s.strip.empty?
356
+ break unless blank_text_node?(current)
333
357
 
334
358
  current = current.prev
335
359
  end
@@ -504,21 +528,22 @@ module Moxml
504
528
  native_child = unpatch_node(child)
505
529
 
506
530
  # EntityReference wrappers can't go in LibXML's native tree.
507
- # Store on the document (stable identity) keyed by element.
508
- # LibXML creates new Ruby wrappers on each access, so element
509
- # object_id is unstable — we look up via == comparison.
531
+ # Store them on the document for interleaved serialization.
510
532
  if child.is_a?(CustomizedLibxml::EntityReference)
511
533
  doc = native_elem.is_a?(::LibXML::XML::Document) ? native_elem : native_elem.doc
512
- store_entity_ref_on_doc(doc, native_elem, child)
513
- append_child_sequence_on_doc(doc, native_elem, :eref)
534
+ entity_ref_registry(doc).register(native_elem, child)
514
535
  return
515
536
  end
516
537
 
517
538
  # For LibXML: if parent has a DEFAULT namespace (nil/empty prefix) and child is an element without a namespace,
518
539
  # explicitly set the child's namespace to match the parent's for XPath compatibility
519
- # NOTE: Prefixed namespaces are NOT inherited, only default namespaces
520
- if native_elem.is_a?(::LibXML::XML::Node) && native_elem.namespaces&.namespace &&
521
- native_child.is_a?(::LibXML::XML::Node) && native_child.element? &&
540
+ # NOTE: Prefixed namespaces are NOT inherited, only default namespaces.
541
+ #
542
+ # Reorder cheap-first: skip the expensive `.namespaces` fetches
543
+ # entirely for non-element children (text, comment, cdata, PI),
544
+ # which is roughly 30-50% of adds in a typical doc.
545
+ if native_child.is_a?(::LibXML::XML::Node) && native_child.element? &&
546
+ native_elem.is_a?(::LibXML::XML::Node) && native_elem.namespaces&.namespace &&
522
547
  (!native_child.namespaces.namespace || native_child.namespaces.namespace.href.to_s.empty?)
523
548
 
524
549
  parent_ns = native_elem.namespaces.namespace
@@ -565,7 +590,8 @@ module Moxml
565
590
  # Set as root element
566
591
  native_elem.root = native_child
567
592
  # Flag for actual_native to refresh the wrapper's native reference
568
- attachments.set(native_elem, :_pending_root_refresh, native_child.object_id)
593
+ attachments.set(native_elem, :_pending_root_refresh,
594
+ native_child.object_id)
569
595
  elsif native_elem.root
570
596
  # Document has root, add to it instead
571
597
  import_and_add(native_elem.doc, native_elem.root, native_child)
@@ -573,49 +599,8 @@ module Moxml
573
599
  else
574
600
  import_and_add(native_elem.doc, native_elem, native_child)
575
601
  doc = native_elem.doc || native_elem
576
- append_child_sequence_on_doc(doc, native_elem, :native)
577
- end
578
- end
579
-
580
- # Store entity ref on the document (stable identity).
581
- # LibXML element wrappers are ephemeral, so we use == to find matching elements.
582
- def store_entity_ref_on_doc(doc, element, ref)
583
- pairs = attachments.get(doc, :_entity_ref_pairs) || []
584
- pair = pairs.find { |elem, _| elem == element }
585
- if pair
586
- pair[1] << ref
587
- else
588
- pairs << [element, [ref]]
589
- end
590
- attachments.set(doc, :_entity_ref_pairs, pairs)
591
- end
592
-
593
- # Look up entity refs for an element from the document
594
- def lookup_entity_refs(doc, element)
595
- pairs = attachments.get(doc, :_entity_ref_pairs)
596
- return nil unless pairs
597
- pair = pairs.find { |elem, _| elem == element }
598
- pair&.last
599
- end
600
-
601
- # Track child order on the document (stable identity)
602
- def append_child_sequence_on_doc(doc, element, type)
603
- pairs = attachments.get(doc, :_child_seq_pairs) || []
604
- pair = pairs.find { |elem, _| elem == element }
605
- if pair
606
- pair[1] << type
607
- else
608
- pairs << [element, [type]]
602
+ entity_ref_registry(doc).append_native(native_elem)
609
603
  end
610
- attachments.set(doc, :_child_seq_pairs, pairs)
611
- end
612
-
613
- # Look up child sequence for an element from the document
614
- def lookup_child_sequence(doc, element)
615
- pairs = attachments.get(doc, :_child_seq_pairs)
616
- return nil unless pairs
617
- pair = pairs.find { |elem, _| elem == element }
618
- pair&.last
619
604
  end
620
605
 
621
606
  def append_child_sequence(element, type)
@@ -973,21 +958,23 @@ module Moxml
973
958
  end
974
959
 
975
960
  if native_node.root
976
- # Use our custom serializer to control namespace output
961
+ indent_size = options[:indent].is_a?(Integer) && options[:indent].positive? ? options[:indent] : 0
962
+ # Custom serializer emits newlines AND indentation directly —
963
+ # no separate add_newlines_to_xml / indent_xml passes.
964
+ # `eref_active` is computed once here and threaded through the
965
+ # recursion so that the per-element `attachments.key?` Monitor
966
+ # sync only fires for docs that actually have entity refs.
967
+ eref_active = entity_ref_registry(native_node).active?
977
968
  root_output = serialize_element_with_namespaces(
978
969
  native_node.root,
979
970
  true,
971
+ indent_size,
972
+ 0,
973
+ eref_active: eref_active,
980
974
  )
981
975
 
982
- # Apply indentation if requested
983
- if options[:indent]&.positive?
984
- # First add newlines between elements
985
- formatted = add_newlines_to_xml(root_output)
986
- output << "\n" << indent_xml(formatted, options[:indent])
987
- else
988
- output << "\n" << root_output unless output.empty?
989
- output << root_output if output.empty?
990
- end
976
+ output << "\n" << root_output unless output.empty?
977
+ output << root_output if output.empty?
991
978
  end
992
979
 
993
980
  output
@@ -996,104 +983,22 @@ module Moxml
996
983
  end
997
984
  end
998
985
 
999
- def add_newlines_to_xml(xml_string)
1000
- # Add newlines between XML elements for proper indentation
1001
- # But don't add newlines between opening and immediate closing tags (e.g., <tag></tag>)
1002
- # And most importantly, don't add newlines inside CDATA sections
1003
-
1004
- # First, protect CDATA sections by replacing them with placeholders
1005
- # Manual scanning guarantees O(n) complexity with no backtracking (ReDoS-safe)
1006
- cdata_sections = []
1007
- result = +""
1008
- pos = 0
1009
-
1010
- loop do
1011
- # Find next CDATA start
1012
- cdata_start = xml_string.index("<![CDATA[", pos)
1013
-
1014
- if cdata_start
1015
- # Copy everything before CDATA
1016
- result << xml_string[pos...cdata_start]
1017
-
1018
- # Find CDATA end
1019
- cdata_content_start = cdata_start + 9 # Length of "<![CDATA["
1020
- cdata_end = xml_string.index("]]>", cdata_content_start)
1021
-
1022
- if cdata_end
1023
- # Extract full CDATA including markers
1024
- full_cdata_end = cdata_end + 3 # Include "]]>"
1025
- cdata_section = xml_string[cdata_start...full_cdata_end]
1026
-
1027
- # Store and add placeholder
1028
- cdata_sections << cdata_section
1029
- result << "__CDATA_PLACEHOLDER_#{cdata_sections.length - 1}__"
1030
-
1031
- # Continue after this CDATA
1032
- pos = full_cdata_end
1033
- else
1034
- # Malformed CDATA (no closing "]]>") - copy as-is
1035
- result << xml_string[cdata_start..]
1036
- break
1037
- end
1038
- else
1039
- # No more CDATA sections - copy rest
1040
- result << xml_string[pos..]
1041
- break
1042
- end
1043
- end
1044
-
1045
- protected = result
1046
-
1047
- # Add newlines between elements (but not in CDATA - already protected)
1048
- with_newlines = protected.gsub(%r{(<[^>]+)>(?=<(?!/))}, "\\1>\n")
1049
-
1050
- # Restore CDATA sections
1051
- cdata_sections.each_with_index do |cdata, index|
1052
- with_newlines.sub!("__CDATA_PLACEHOLDER_#{index}__", cdata)
1053
- end
1054
-
1055
- with_newlines
1056
- end
1057
-
1058
- def indent_xml(xml_string, indent_size)
1059
- # Simple line-by-line indentation
1060
- lines = []
1061
- level = 0
1062
-
1063
- xml_string.each_line do |line|
1064
- line = line.strip
1065
- next if line.empty?
1066
-
1067
- # Decrease level for closing tags
1068
- level -= 1 if line.start_with?("</")
1069
- level = [level, 0].max
1070
-
1071
- # Add indented line
1072
- lines << ((" " * (indent_size * level)) + line)
1073
-
1074
- # Increase level for opening tags (but not self-closing or special tags)
1075
- next unless line.start_with?("<") && !line.start_with?("</") &&
1076
- !line.end_with?("/>") && !line.start_with?("<?") &&
1077
- !line.start_with?("<!") && !line.include?("</")
1078
-
1079
- level += 1
1080
- end
1081
-
1082
- lines.join("\n")
1083
- end
1084
-
986
+ # Shallow duplication: copies the node itself (name, attrs, namespaces)
987
+ # but NOT its descendants. This is what DocumentBuilder needs — it
988
+ # walks the source tree and re-adds children one at a time via
989
+ # add_child, so a deep copy here would be done only to be stripped
990
+ # by replace_children, then rebuilt — O(N²) waste on parse.
991
+ #
992
+ # For callers that need a true deep copy (e.g. the import_and_add
993
+ # fallback when LibXML can't move the subtree directly), use
994
+ # deep_duplicate_node.
1085
995
  def duplicate_node(node)
1086
996
  return nil unless node
1087
997
 
1088
- # Unwrap if wrapped
1089
998
  native_node = unpatch_node(node)
1090
999
 
1091
- # LibXML is strict about document ownership
1092
- # Create brand new NATIVE nodes that are document-independent
1093
- # Wrappers are only used via patch_node when reading children
1094
1000
  case node_type(node)
1095
1001
  when :doctype
1096
- # DoctypeWrapper - create a new one with same properties
1097
1002
  if node.is_a?(DoctypeWrapper)
1098
1003
  DoctypeWrapper.new(
1099
1004
  create_document,
@@ -1102,64 +1007,10 @@ module Moxml
1102
1007
  node.system_id,
1103
1008
  )
1104
1009
  else
1105
- # Should not happen, but handle gracefully
1106
1010
  node
1107
1011
  end
1108
1012
  when :element
1109
- new_node = ::LibXML::XML::Node.new(native_node.name)
1110
- # new_node.line = node.line
1111
-
1112
- # Copy and set namespace definitions FIRST
1113
- if native_node.is_a?(::LibXML::XML::Node)
1114
- # First, copy all namespace definitions
1115
- native_node.namespaces.each do |ns|
1116
- ::LibXML::XML::Namespace.new(
1117
- new_node,
1118
- ns.prefix,
1119
- ns.href,
1120
- )
1121
- end
1122
-
1123
- # Then, set this element's own namespace if it has one
1124
- if native_node.namespaces.namespace
1125
- orig_ns = native_node.namespaces.namespace
1126
- # Find the matching namespace we just created
1127
- new_node.namespaces.each do |ns|
1128
- if ns.prefix == orig_ns.prefix && ns.href == orig_ns.href
1129
- new_node.namespaces.namespace = ns
1130
- break
1131
- end
1132
- end
1133
- end
1134
- end
1135
-
1136
- # Copy attributes AFTER namespaces are set up
1137
- # LibXML handles namespaced attributes through their full names
1138
- if native_node.attributes?
1139
- native_node.each_attr do |attr|
1140
- # Get the full attribute name (may include namespace prefix)
1141
- attr_name = if attr.ns&.prefix
1142
- "#{attr.ns.prefix}:#{attr.name}"
1143
- else
1144
- attr.name
1145
- end
1146
- new_node[attr_name] = attr.value
1147
- end
1148
- end
1149
-
1150
- # Recursively copy children
1151
- if native_node.children?
1152
- native_node.each_child do |child|
1153
- # Skip whitespace-only text nodes
1154
- next if child.text? && child.content.to_s.strip.empty?
1155
-
1156
- # Recursively duplicate the child
1157
- child_copy = duplicate_node(child)
1158
- new_node << child_copy
1159
- end
1160
- end
1161
-
1162
- new_node
1013
+ shallow_duplicate_element(native_node)
1163
1014
  when :text
1164
1015
  ::LibXML::XML::Node.new_text(native_node.content)
1165
1016
  when :cdata
@@ -1169,7 +1020,6 @@ module Moxml
1169
1020
  when :processing_instruction
1170
1021
  ::LibXML::XML::Node.new_pi(native_node.name, native_node.content)
1171
1022
  else
1172
- # For other types, try dup as fallback
1173
1023
  native_node.dup
1174
1024
  end
1175
1025
  end
@@ -1278,7 +1128,7 @@ module Moxml
1278
1128
  if elem.children?
1279
1129
  elem.each_child do |child|
1280
1130
  # Skip whitespace-only text nodes
1281
- next if child.text? && child.content.to_s.strip.empty?
1131
+ next if blank_text_node?(child)
1282
1132
 
1283
1133
  output << serialize_node(child)
1284
1134
  end
@@ -1286,7 +1136,7 @@ module Moxml
1286
1136
 
1287
1137
  # Append any EntityReference wrappers stored on the document
1288
1138
  doc = elem.doc
1289
- entity_refs = doc ? lookup_entity_refs(doc, elem) : nil
1139
+ entity_refs = entity_ref_registry(doc).refs_for(elem)
1290
1140
  entity_refs&.each { |ref| output << ref.to_xml }
1291
1141
 
1292
1142
  output << "</#{elem.name}>"
@@ -1328,21 +1178,18 @@ module Moxml
1328
1178
  .gsub(">", "&gt;")
1329
1179
  end
1330
1180
 
1181
+ ESCAPE_XML_RE = /[&<>"]/
1182
+ ESCAPE_XML_MAP = { "&" => "&amp;", "<" => "&lt;", ">" => "&gt;", '"' => "&quot;" }.freeze
1183
+ private_constant :ESCAPE_XML_RE, :ESCAPE_XML_MAP
1184
+
1331
1185
  def escape_xml(text)
1332
- text.to_s
1333
- .gsub("&", "&amp;")
1334
- .gsub("<", "&lt;")
1335
- .gsub(">", "&gt;")
1336
- .gsub("\"", "&quot;")
1337
- end
1186
+ # One gsub pass with a Hash replacement allocates a single new
1187
+ # string. The previous chained gsubs allocated three throwaway
1188
+ # strings on every call (very hot for attribute-heavy XML).
1189
+ str = text.is_a?(String) ? text : text.to_s
1190
+ return str unless str.match?(ESCAPE_XML_RE)
1338
1191
 
1339
- def escape_attribute_value(value)
1340
- escaped = value.to_s
1341
- .gsub("&", "&amp;")
1342
- .gsub("<", "&lt;")
1343
- .gsub(">", "&gt;")
1344
- .gsub("\"", "&quot;")
1345
- escaped.to_s
1192
+ str.gsub(ESCAPE_XML_RE, ESCAPE_XML_MAP)
1346
1193
  end
1347
1194
 
1348
1195
  def import_and_add(doc, element, child)
@@ -1365,7 +1212,7 @@ module Moxml
1365
1212
  else
1366
1213
  # No target document - create a deep copy of the node instead
1367
1214
  # This handles the case where the element isn't attached to a document yet
1368
- copied = duplicate_node(child)
1215
+ copied = deep_duplicate_node(child)
1369
1216
  element << copied
1370
1217
  end
1371
1218
 
@@ -1410,125 +1257,242 @@ module Moxml
1410
1257
  end
1411
1258
  end
1412
1259
 
1413
- def serialize_element_with_namespaces(elem, include_ns = true)
1414
- output = "<#{elem.name}"
1260
+ def serialize_element_with_namespaces(elem, include_ns = true,
1261
+ indent_size = 0, depth = 0,
1262
+ eref_active: nil)
1263
+ # Cache elem.name — it's a libxml C call we'd otherwise make
1264
+ # twice (open tag + close tag). Concat with `<<` instead of
1265
+ # `"<#{name}"` to avoid the interpolated intermediate string.
1266
+ name = elem.name
1267
+ output = +"<"
1268
+ output << name
1269
+ emit_namespace_definitions(output, elem, include_ns)
1270
+ emit_attributes(output, elem)
1271
+
1272
+ # `eref_active` is precomputed at the top-level `serialize` call
1273
+ # and threaded down — when nil (top-level non-recursive call into
1274
+ # this method), look it up; when false, skip the per-element doc
1275
+ # attachment query that otherwise fires for every element under
1276
+ # Monitor#synchronize.
1277
+ eref_active = doc_eref_active?(elem.doc) if eref_active.nil?
1278
+ entity_refs, child_sequence = eref_active ? lookup_entity_ref_serialization(elem) : [nil, nil]
1415
1279
 
1416
- # Include namespace definitions:
1417
- # - On root element (include_ns = true), output ALL namespace definitions
1418
- # - On child elements, output namespace definitions that override parent namespaces
1419
- if elem.is_a?(::LibXML::XML::Node) && elem.namespaces.respond_to?(:definitions)
1420
- # Get parent's namespace definitions to detect overrides
1421
- parent_ns_defs = if !include_ns && elem.parent && !elem.parent.is_a?(::LibXML::XML::Document)
1422
- parent_namespaces = {}
1423
- if elem.parent.is_a?(::LibXML::XML::Node)
1424
- elem.parent.namespaces.each do |ns|
1425
- parent_namespaces[ns.prefix] = ns.href
1426
- end
1427
- end
1428
- parent_namespaces
1429
- else
1430
- {}
1431
- end
1280
+ # Always use verbose format <tag></tag> for consistency with other adapters
1281
+ output << ">"
1432
1282
 
1433
- seen_ns = {}
1434
- elem.namespaces.definitions.each do |ns|
1435
- prefix = ns.prefix
1436
- uri = ns.href
1437
- next if seen_ns.key?(prefix)
1283
+ if entity_refs && child_sequence
1284
+ emit_eref_interleaved_children(output, elem, entity_refs, child_sequence,
1285
+ indent_size, depth, eref_active: eref_active)
1286
+ elsif elem.children?
1287
+ emit_children_with_layout(output, elem, indent_size, depth,
1288
+ eref_active: eref_active)
1289
+ end
1438
1290
 
1439
- # Output namespace if:
1440
- # 1. This is root element (include_ns = true), OR
1441
- # 2. This namespace overrides a parent namespace (different URI for same prefix)
1442
- should_output = include_ns ||
1443
- (parent_ns_defs.key?(prefix) && parent_ns_defs[prefix] != uri)
1291
+ output << "</" << name << ">"
1292
+ output
1293
+ end
1444
1294
 
1445
- next unless should_output
1295
+ def doc_eref_active?(doc)
1296
+ entity_ref_registry(doc).active?
1297
+ end
1446
1298
 
1447
- seen_ns[prefix] = true
1448
- output << if prefix.nil? || prefix.empty?
1449
- " xmlns=\"#{escape_xml(uri)}\""
1450
- else
1451
- " xmlns:#{prefix}=\"#{escape_xml(uri)}\""
1452
- end
1453
- end
1299
+ def entity_ref_registry(doc)
1300
+ EntityRefRegistry.new(attachments, doc)
1301
+ end
1302
+
1303
+ # Emit `xmlns`/`xmlns:foo` declarations onto `output`. On the root
1304
+ # (`include_ns: true`) we emit ALL definitions; on children we
1305
+ # emit only definitions that OVERRIDE a parent's same-prefix URI.
1306
+ # Skips the whole block when the element has no local definitions,
1307
+ # which is the common case for child elements in unnamespaced docs.
1308
+ def emit_namespace_definitions(output, elem, include_ns)
1309
+ return unless elem.is_a?(::LibXML::XML::Node)
1310
+
1311
+ ns_list = elem.namespaces
1312
+ return unless ns_list.respond_to?(:definitions)
1313
+
1314
+ definitions = ns_list.definitions
1315
+ return if definitions.empty?
1316
+
1317
+ parent_ns_defs = include_ns ? nil : parent_namespace_defs(elem)
1318
+ seen_ns = nil
1319
+
1320
+ definitions.each do |ns|
1321
+ prefix = ns.prefix
1322
+ uri = ns.href
1323
+ next unless include_ns ||
1324
+ (parent_ns_defs&.key?(prefix) && parent_ns_defs[prefix] != uri)
1325
+
1326
+ seen_ns ||= {}
1327
+ next if seen_ns.key?(prefix)
1328
+
1329
+ seen_ns[prefix] = true
1330
+ output << format_ns_declaration(prefix, uri)
1454
1331
  end
1332
+ end
1455
1333
 
1456
- # Add attributes
1457
- if elem.attributes?
1458
- elem.each_attr do |attr|
1459
- next if attr.name.start_with?("xmlns")
1334
+ def parent_namespace_defs(elem)
1335
+ parent = elem.parent
1336
+ return nil unless parent.is_a?(::LibXML::XML::Node)
1460
1337
 
1461
- # Include namespace prefix if attribute has one
1462
- attr_name = if attr.ns&.prefix
1463
- "#{attr.ns.prefix}:#{attr.name}"
1464
- else
1465
- attr.name
1466
- end
1467
- output << " #{attr_name}=\"#{escape_xml(attr.value)}\""
1468
- end
1338
+ defs = {}
1339
+ parent.namespaces.each { |ns| defs[ns.prefix] = ns.href }
1340
+ defs
1341
+ end
1342
+
1343
+ def format_ns_declaration(prefix, uri)
1344
+ if prefix.nil? || prefix.empty?
1345
+ " xmlns=\"#{escape_xml(uri)}\""
1346
+ else
1347
+ " xmlns:#{prefix}=\"#{escape_xml(uri)}\""
1469
1348
  end
1349
+ end
1470
1350
 
1471
- # Check for entity refs stored on the document
1472
- # LibXML element wrappers are ephemeral, so look up via == comparison
1473
- doc = elem.doc
1474
- entity_refs = doc ? lookup_entity_refs(doc, elem) : nil
1475
- child_sequence = doc ? lookup_child_sequence(doc, elem) : nil
1351
+ def emit_attributes(output, elem)
1352
+ return unless elem.attributes?
1476
1353
 
1477
- # Always use verbose format <tag></tag> for consistency with other adapters
1478
- output << ">"
1354
+ elem.each_attr do |attr|
1355
+ next if attr.name.start_with?("xmlns")
1479
1356
 
1480
- if entity_refs && !entity_refs.empty? && child_sequence
1481
- # Interleave native children with entity refs using tracked sequence
1482
- native_children = []
1483
- if elem.children?
1484
- elem.each_child { |c| native_children << c unless c.text? && c.content.to_s.strip.empty? }
1485
- end
1357
+ attr_name = attr.ns&.prefix ? "#{attr.ns.prefix}:#{attr.name}" : attr.name
1358
+ output << " #{attr_name}=\"#{escape_xml(attr.value)}\""
1359
+ end
1360
+ end
1486
1361
 
1487
- eref_idx = 0
1488
- native_idx = 0
1489
- child_sequence.each do |type|
1490
- case type
1491
- when :native
1492
- if native_idx < native_children.size
1493
- child = native_children[native_idx]
1494
- native_idx += 1
1495
- wrapped_child = patch_node(child)
1496
- output << if wrapped_child.is_a?(CustomizedLibxml::Node) && !wrapped_child.is_a?(CustomizedLibxml::Element)
1497
- wrapped_child.to_xml
1498
- elsif child.element?
1499
- serialize_element_with_namespaces(child, false)
1500
- else
1501
- serialize_node(child)
1502
- end
1503
- end
1504
- when :eref
1505
- if eref_idx < entity_refs.size
1506
- output << entity_refs[eref_idx].to_xml
1507
- eref_idx += 1
1362
+ # Returns [entity_refs, child_sequence] when the element has
1363
+ # interleaved entity references that the serializer needs to
1364
+ # weave back into the native child stream — otherwise [nil, nil].
1365
+ #
1366
+ # The caller is responsible for gating this with `eref_active`
1367
+ # (precomputed once per `serialize` call). When `eref_active` is
1368
+ # false this method is never entered, so the per-element doc
1369
+ # attachment query never fires.
1370
+ def lookup_entity_ref_serialization(elem)
1371
+ doc = elem.doc
1372
+ return [nil, nil] unless doc
1373
+
1374
+ entity_ref_registry(doc).serialization_for(elem)
1375
+ end
1376
+
1377
+ def emit_eref_interleaved_children(output, elem, entity_refs, child_sequence,
1378
+ indent_size, depth, eref_active:)
1379
+ native_children = collect_non_blank_children(elem)
1380
+ child_pad = indent_size.positive? ? " " * (indent_size * (depth + 1)) : nil
1381
+ eref_idx = 0
1382
+ native_idx = 0
1383
+ prev_block = true
1384
+
1385
+ child_sequence.each do |type|
1386
+ case type
1387
+ when :native
1388
+ if native_idx < native_children.size
1389
+ child = native_children[native_idx]
1390
+ is_text_like = child.text? || child.cdata?
1391
+ if prev_block && !is_text_like
1392
+ output << "\n"
1393
+ output << child_pad if child_pad
1508
1394
  end
1395
+ prev_block = !is_text_like
1396
+
1397
+ output << serialize_child_to_xml(
1398
+ child, indent_size: indent_size, depth: depth,
1399
+ eref_active: eref_active
1400
+ )
1401
+ native_idx += 1
1402
+ end
1403
+ when :eref
1404
+ if eref_idx < entity_refs.size
1405
+ output << entity_refs[eref_idx].to_xml
1406
+ eref_idx += 1
1407
+ prev_block = false
1509
1408
  end
1510
1409
  end
1511
- elsif elem.children?
1512
- elem.each_child do |child|
1513
- # Skip whitespace-only text nodes
1514
- next if child.text? && child.content.to_s.strip.empty?
1515
-
1516
- # Wrap the child and serialize
1517
- wrapped_child = patch_node(child)
1518
- output << if wrapped_child.is_a?(CustomizedLibxml::Node) && !wrapped_child.is_a?(CustomizedLibxml::Element)
1519
- # Use wrapper's to_xml for proper serialization
1520
- wrapped_child.to_xml
1521
- elsif child.element?
1522
- # Recursively serialize child elements
1523
- serialize_element_with_namespaces(child, false)
1524
- else
1525
- serialize_node(child)
1526
- end
1410
+ end
1411
+ end
1412
+
1413
+ # Regex used in place of `content.to_s.strip.empty?` for whitespace-only
1414
+ # text detection — `match?` allocates nothing while `.strip` makes a
1415
+ # throwaway copy of every text node's content on each visit.
1416
+ NON_WHITESPACE_RE = /\S/
1417
+ private_constant :NON_WHITESPACE_RE
1418
+
1419
+ def blank_text_node?(child)
1420
+ child.text? && blank_content?(child.content)
1421
+ end
1422
+
1423
+ def blank_content?(content)
1424
+ content.nil? || !content.match?(NON_WHITESPACE_RE)
1425
+ end
1426
+
1427
+ def collect_non_blank_children(elem)
1428
+ children = []
1429
+ return children unless elem.children?
1430
+
1431
+ elem.each_child do |c|
1432
+ children << c unless blank_text_node?(c)
1433
+ end
1434
+ children
1435
+ end
1436
+
1437
+ # Walk native children once and emit them with the same newline +
1438
+ # indentation layout the old `add_newlines_to_xml` + `indent_xml`
1439
+ # post-passes produced — but in a single recursion with no string
1440
+ # rescanning.
1441
+ #
1442
+ # Newline rule (matching `>(?=<(?!/))` with CDATA-placeholder
1443
+ # protection): emit `\n` + per-level padding before a child iff
1444
+ # the previous emitted sibling was block-level (ended with `>`)
1445
+ # AND the current sibling is block-level. Text and CDATA count
1446
+ # as text-like and suppress the newline on both sides (the
1447
+ # original CDATA placeholder broke the `>...<` adjacency
1448
+ # symmetrically).
1449
+ def emit_children_with_layout(output, elem, indent_size, depth,
1450
+ eref_active:)
1451
+ child_pad = indent_size.positive? ? " " * (indent_size * (depth + 1)) : nil
1452
+ prev_block = true
1453
+
1454
+ elem.each_child do |child|
1455
+ # Cache text? — used twice per child (whitespace skip + is_text_like).
1456
+ # For element children (the common case) both calls return false, so
1457
+ # caching saves a libxml C call.
1458
+ is_text = child.text?
1459
+ next if is_text && blank_content?(child.content)
1460
+
1461
+ is_text_like = is_text || child.cdata?
1462
+ if prev_block && !is_text_like
1463
+ output << "\n"
1464
+ output << child_pad if child_pad
1527
1465
  end
1466
+ prev_block = !is_text_like
1467
+
1468
+ output << serialize_child_to_xml(child, indent_size: indent_size, depth: depth,
1469
+ eref_active: eref_active)
1528
1470
  end
1529
- output << "</#{elem.name}>"
1471
+ end
1530
1472
 
1531
- output
1473
+ # Serialize one child node. Elements recurse into the layout-aware
1474
+ # path; non-element wrappers route through their own `to_xml`;
1475
+ # everything else falls through to the per-type serializer.
1476
+ # `indent_size:` and `depth:` are required to force callers to
1477
+ # decide whether the child should inherit the parent's indent
1478
+ # state — both child-emitting paths pass the parent's values through.
1479
+ #
1480
+ # Element fast-path checked first to avoid allocating a wrapper
1481
+ # we'd immediately throw away (elements always recurse on the
1482
+ # raw native node, not the wrapper). For a typical document this
1483
+ # skips wrapper allocation for the majority of children.
1484
+ def serialize_child_to_xml(child, indent_size:, depth:, eref_active:)
1485
+ if child.element?
1486
+ return serialize_element_with_namespaces(child, false, indent_size, depth + 1,
1487
+ eref_active: eref_active)
1488
+ end
1489
+
1490
+ wrapped_child = patch_node(child)
1491
+ if wrapped_child.is_a?(CustomizedLibxml::Node)
1492
+ wrapped_child.to_xml
1493
+ else
1494
+ serialize_node(child)
1495
+ end
1532
1496
  end
1533
1497
 
1534
1498
  def remove_indentation(xml_string)
@@ -1626,6 +1590,63 @@ module Moxml
1626
1590
  end
1627
1591
  nil
1628
1592
  end
1593
+
1594
+ # Deep duplication for the rare `import_and_add` fallback (when
1595
+ # libxml refuses to move a subtree across documents AND no target
1596
+ # document is available). Walks the source subtree and rebuilds
1597
+ # it as document-independent nodes. The DocumentBuilder hot path
1598
+ # goes through the shallow `duplicate_node` instead.
1599
+ def deep_duplicate_node(node)
1600
+ return nil unless node
1601
+
1602
+ native_node = unpatch_node(node)
1603
+
1604
+ return duplicate_node(node) unless node_type(node) == :element
1605
+
1606
+ new_node = shallow_duplicate_element(native_node)
1607
+ return new_node unless native_node.is_a?(::LibXML::XML::Node) && native_node.children?
1608
+
1609
+ native_node.each_child do |child|
1610
+ next if blank_text_node?(child)
1611
+
1612
+ new_node << deep_duplicate_node(child)
1613
+ end
1614
+ new_node
1615
+ end
1616
+
1617
+ # Copies a single element: its name, the namespaces in scope for it,
1618
+ # the element's own active namespace, and its attributes. Children are NOT
1619
+ # duplicated — callers that need the subtree use deep_duplicate_node.
1620
+ def shallow_duplicate_element(native_node)
1621
+ new_node = ::LibXML::XML::Node.new(native_node.name)
1622
+ copy_element_namespaces(native_node, new_node) if native_node.is_a?(::LibXML::XML::Node)
1623
+ copy_element_attributes(native_node, new_node) if native_node.attributes?
1624
+ new_node
1625
+ end
1626
+
1627
+ def copy_element_namespaces(src, dst)
1628
+ ns_list = src.namespaces
1629
+ ns_list.each do |ns|
1630
+ ::LibXML::XML::Namespace.new(dst, ns.prefix, ns.href)
1631
+ end
1632
+
1633
+ own_ns = ns_list.namespace
1634
+ return unless own_ns
1635
+
1636
+ dst.namespaces.each do |ns|
1637
+ next unless ns.prefix == own_ns.prefix && ns.href == own_ns.href
1638
+
1639
+ dst.namespaces.namespace = ns
1640
+ break
1641
+ end
1642
+ end
1643
+
1644
+ def copy_element_attributes(src, dst)
1645
+ src.each_attr do |attr|
1646
+ attr_name = attr.ns&.prefix ? "#{attr.ns.prefix}:#{attr.name}" : attr.name
1647
+ dst[attr_name] = attr.value
1648
+ end
1649
+ end
1629
1650
  end
1630
1651
 
1631
1652
  # Bridge between LibXML SAX and Moxml SAX
@@ -1701,3 +1722,6 @@ module Moxml
1701
1722
  end
1702
1723
  end
1703
1724
  end
1725
+
1726
+ require_relative "libxml/entity_ref_registry"
1727
+ require_relative "libxml/entity_restorer"