moxml 0.1.20 → 0.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,6 +37,32 @@ module Moxml
37
37
  end
38
38
  end
39
39
 
40
+ # Mapping from libxml's integer node_type to our symbol — built once
41
+ # at load so `node_type` can do a single hash lookup on the hot path
42
+ # instead of a large case/when on every node.
43
+ NATIVE_NODE_TYPE_MAP = {
44
+ ::LibXML::XML::Node::ELEMENT_NODE => :element,
45
+ ::LibXML::XML::Node::TEXT_NODE => :text,
46
+ ::LibXML::XML::Node::CDATA_SECTION_NODE => :cdata,
47
+ ::LibXML::XML::Node::COMMENT_NODE => :comment,
48
+ ::LibXML::XML::Node::PI_NODE => :processing_instruction,
49
+ ::LibXML::XML::Node::ATTRIBUTE_NODE => :attribute,
50
+ ::LibXML::XML::Node::DTD_NODE => :doctype,
51
+ ::LibXML::XML::Node::DOCUMENT_NODE => :document,
52
+ }.freeze
53
+ private_constant :NATIVE_NODE_TYPE_MAP
54
+
55
+ WRAPPER_NODE_TYPE_MAP = {
56
+ DoctypeWrapper => :doctype,
57
+ CustomizedLibxml::Element => :element,
58
+ CustomizedLibxml::Text => :text,
59
+ CustomizedLibxml::Cdata => :cdata,
60
+ CustomizedLibxml::Comment => :comment,
61
+ CustomizedLibxml::ProcessingInstruction => :processing_instruction,
62
+ CustomizedLibxml::EntityReference => :entity_reference,
63
+ }.freeze
64
+ private_constant :WRAPPER_NODE_TYPE_MAP
65
+
40
66
  class << self
41
67
  def attachments
42
68
  @attachments ||= Moxml::NativeAttachment.new
@@ -98,7 +124,22 @@ module Moxml
98
124
  end
99
125
 
100
126
  ctx = _context || Context.new(:libxml)
101
- DocumentBuilder.new(ctx).build(native_doc)
127
+ # Single parse path: wrap libxml's already-complete C-parsed tree
128
+ # directly (same pattern as nokogiri/ox). The previous
129
+ # DocumentBuilder.build path walked the entire parsed tree and
130
+ # re-added every node to a fresh moxml-managed document, which
131
+ # made parse O(N) Ruby work on top of an already-complete parse.
132
+ # Doctype/declaration/PI attachments set above remain on
133
+ # native_doc, so the serialize path still sees them.
134
+ #
135
+ # restore_entities is handled as a post-processing case after
136
+ # the wrap, NOT by branching into a different builder. This way
137
+ # any new parse-time logic only has to be added to this one
138
+ # path; the restoration walk is just one of potentially several
139
+ # post-processing steps and doesn't fork the construction.
140
+ doc = Document.new(native_doc, ctx)
141
+ EntityRestorer.new(doc).run if ctx.config.restore_entities
142
+ doc
102
143
  end
103
144
 
104
145
  # SAX parsing implementation for LibXML
@@ -181,40 +222,23 @@ module Moxml
181
222
  def node_type(node)
182
223
  return :unknown unless node
183
224
 
184
- # Handle wrapper classes
185
- return :element if node.is_a?(CustomizedLibxml::Element)
186
- return :text if node.is_a?(CustomizedLibxml::Text)
187
- return :cdata if node.is_a?(CustomizedLibxml::Cdata)
188
- return :comment if node.is_a?(CustomizedLibxml::Comment)
189
- if node.is_a?(CustomizedLibxml::ProcessingInstruction)
190
- return :processing_instruction
225
+ # Fast path: native libxml nodes are the vast majority during
226
+ # parse traversal (DocumentBuilder visits raw libxml children).
227
+ # Skip the wrapper checks below for them.
228
+ if node.is_a?(::LibXML::XML::Node)
229
+ return NATIVE_NODE_TYPE_MAP[node.node_type] || :unknown
191
230
  end
192
- return :entity_reference if node.is_a?(CustomizedLibxml::EntityReference)
193
- return :doctype if node.is_a?(DoctypeWrapper)
231
+ return :document if node.is_a?(::LibXML::XML::Document)
194
232
 
195
- # Unwrap if needed
196
- native_node = unpatch_node(node)
233
+ wrapper_type = WRAPPER_NODE_TYPE_MAP[node.class]
234
+ return wrapper_type if wrapper_type
197
235
 
198
- case native_node.node_type
199
- when ::LibXML::XML::Node::DOCUMENT_NODE
200
- :document
201
- when ::LibXML::XML::Node::ELEMENT_NODE
202
- :element
203
- when ::LibXML::XML::Node::TEXT_NODE
204
- :text
205
- when ::LibXML::XML::Node::CDATA_SECTION_NODE
206
- :cdata
207
- when ::LibXML::XML::Node::COMMENT_NODE
208
- :comment
209
- when ::LibXML::XML::Node::ATTRIBUTE_NODE
210
- :attribute
211
- when ::LibXML::XML::Node::PI_NODE
212
- :processing_instruction
213
- when ::LibXML::XML::Node::DTD_NODE
214
- :doctype
215
- else
216
- :unknown
217
- end
236
+ # Duck-typed fallback for libxml types that aren't ::Node
237
+ # subclasses but still expose node_type (e.g. ::Attr).
238
+ native = unpatch_node(node)
239
+ return :unknown unless native.respond_to?(:node_type)
240
+
241
+ NATIVE_NODE_TYPE_MAP[native.node_type] || :unknown
218
242
  end
219
243
 
220
244
  def node_name(node)
@@ -300,7 +324,7 @@ module Moxml
300
324
 
301
325
  # Include any EntityReference wrappers stored on the document
302
326
  doc = native_node.doc
303
- entity_refs = doc ? lookup_entity_refs(doc, native_node) : nil
327
+ entity_refs = entity_ref_registry(doc).refs_for(native_node)
304
328
  result.concat(entity_refs) if entity_refs
305
329
 
306
330
  result
@@ -317,7 +341,7 @@ module Moxml
317
341
  current = native_node&.next
318
342
  while current
319
343
  # Skip whitespace-only text nodes
320
- break unless current.text? && current.content.to_s.strip.empty?
344
+ break unless blank_text_node?(current)
321
345
 
322
346
  current = current.next
323
347
  end
@@ -329,7 +353,7 @@ module Moxml
329
353
  current = native_node&.prev
330
354
  while current
331
355
  # Skip whitespace-only text nodes
332
- break unless current.text? && current.content.to_s.strip.empty?
356
+ break unless blank_text_node?(current)
333
357
 
334
358
  current = current.prev
335
359
  end
@@ -504,21 +528,22 @@ module Moxml
504
528
  native_child = unpatch_node(child)
505
529
 
506
530
  # EntityReference wrappers can't go in LibXML's native tree.
507
- # Store on the document (stable identity) keyed by element.
508
- # LibXML creates new Ruby wrappers on each access, so element
509
- # object_id is unstable — we look up via == comparison.
531
+ # Store them on the document for interleaved serialization.
510
532
  if child.is_a?(CustomizedLibxml::EntityReference)
511
533
  doc = native_elem.is_a?(::LibXML::XML::Document) ? native_elem : native_elem.doc
512
- store_entity_ref_on_doc(doc, native_elem, child)
513
- append_child_sequence_on_doc(doc, native_elem, :eref)
534
+ entity_ref_registry(doc).register(native_elem, child)
514
535
  return
515
536
  end
516
537
 
517
538
  # For LibXML: if parent has a DEFAULT namespace (nil/empty prefix) and child is an element without a namespace,
518
539
  # explicitly set the child's namespace to match the parent's for XPath compatibility
519
- # NOTE: Prefixed namespaces are NOT inherited, only default namespaces
520
- if native_elem.is_a?(::LibXML::XML::Node) && native_elem.namespaces&.namespace &&
521
- native_child.is_a?(::LibXML::XML::Node) && native_child.element? &&
540
+ # NOTE: Prefixed namespaces are NOT inherited, only default namespaces.
541
+ #
542
+ # Reorder cheap-first: skip the expensive `.namespaces` fetches
543
+ # entirely for non-element children (text, comment, cdata, PI),
544
+ # which is roughly 30-50% of adds in a typical doc.
545
+ if native_child.is_a?(::LibXML::XML::Node) && native_child.element? &&
546
+ native_elem.is_a?(::LibXML::XML::Node) && native_elem.namespaces&.namespace &&
522
547
  (!native_child.namespaces.namespace || native_child.namespaces.namespace.href.to_s.empty?)
523
548
 
524
549
  parent_ns = native_elem.namespaces.namespace
@@ -574,51 +599,8 @@ module Moxml
574
599
  else
575
600
  import_and_add(native_elem.doc, native_elem, native_child)
576
601
  doc = native_elem.doc || native_elem
577
- append_child_sequence_on_doc(doc, native_elem, :native)
578
- end
579
- end
580
-
581
- # Store entity ref on the document (stable identity).
582
- # LibXML element wrappers are ephemeral, so we use == to find matching elements.
583
- def store_entity_ref_on_doc(doc, element, ref)
584
- pairs = attachments.get(doc, :_entity_ref_pairs) || []
585
- pair = pairs.find { |elem, _| elem == element }
586
- if pair
587
- pair[1] << ref
588
- else
589
- pairs << [element, [ref]]
590
- end
591
- attachments.set(doc, :_entity_ref_pairs, pairs)
592
- end
593
-
594
- # Look up entity refs for an element from the document
595
- def lookup_entity_refs(doc, element)
596
- pairs = attachments.get(doc, :_entity_ref_pairs)
597
- return nil unless pairs
598
-
599
- pair = pairs.find { |elem, _| elem == element }
600
- pair&.last
601
- end
602
-
603
- # Track child order on the document (stable identity)
604
- def append_child_sequence_on_doc(doc, element, type)
605
- pairs = attachments.get(doc, :_child_seq_pairs) || []
606
- pair = pairs.find { |elem, _| elem == element }
607
- if pair
608
- pair[1] << type
609
- else
610
- pairs << [element, [type]]
602
+ entity_ref_registry(doc).append_native(native_elem)
611
603
  end
612
- attachments.set(doc, :_child_seq_pairs, pairs)
613
- end
614
-
615
- # Look up child sequence for an element from the document
616
- def lookup_child_sequence(doc, element)
617
- pairs = attachments.get(doc, :_child_seq_pairs)
618
- return nil unless pairs
619
-
620
- pair = pairs.find { |elem, _| elem == element }
621
- pair&.last
622
604
  end
623
605
 
624
606
  def append_child_sequence(element, type)
@@ -976,21 +958,23 @@ module Moxml
976
958
  end
977
959
 
978
960
  if native_node.root
979
- # Use our custom serializer to control namespace output
961
+ indent_size = options[:indent].is_a?(Integer) && options[:indent].positive? ? options[:indent] : 0
962
+ # Custom serializer emits newlines AND indentation directly —
963
+ # no separate add_newlines_to_xml / indent_xml passes.
964
+ # `eref_active` is computed once here and threaded through the
965
+ # recursion so that the per-element `attachments.key?` Monitor
966
+ # sync only fires for docs that actually have entity refs.
967
+ eref_active = entity_ref_registry(native_node).active?
980
968
  root_output = serialize_element_with_namespaces(
981
969
  native_node.root,
982
970
  true,
971
+ indent_size,
972
+ 0,
973
+ eref_active: eref_active,
983
974
  )
984
975
 
985
- # Apply indentation if requested
986
- if options[:indent]&.positive?
987
- # First add newlines between elements
988
- formatted = add_newlines_to_xml(root_output)
989
- output << "\n" << indent_xml(formatted, options[:indent])
990
- else
991
- output << "\n" << root_output unless output.empty?
992
- output << root_output if output.empty?
993
- end
976
+ output << "\n" << root_output unless output.empty?
977
+ output << root_output if output.empty?
994
978
  end
995
979
 
996
980
  output
@@ -999,104 +983,22 @@ module Moxml
999
983
  end
1000
984
  end
1001
985
 
1002
- def add_newlines_to_xml(xml_string)
1003
- # Add newlines between XML elements for proper indentation
1004
- # But don't add newlines between opening and immediate closing tags (e.g., <tag></tag>)
1005
- # And most importantly, don't add newlines inside CDATA sections
1006
-
1007
- # First, protect CDATA sections by replacing them with placeholders
1008
- # Manual scanning guarantees O(n) complexity with no backtracking (ReDoS-safe)
1009
- cdata_sections = []
1010
- result = +""
1011
- pos = 0
1012
-
1013
- loop do
1014
- # Find next CDATA start
1015
- cdata_start = xml_string.index("<![CDATA[", pos)
1016
-
1017
- if cdata_start
1018
- # Copy everything before CDATA
1019
- result << xml_string[pos...cdata_start]
1020
-
1021
- # Find CDATA end
1022
- cdata_content_start = cdata_start + 9 # Length of "<![CDATA["
1023
- cdata_end = xml_string.index("]]>", cdata_content_start)
1024
-
1025
- if cdata_end
1026
- # Extract full CDATA including markers
1027
- full_cdata_end = cdata_end + 3 # Include "]]>"
1028
- cdata_section = xml_string[cdata_start...full_cdata_end]
1029
-
1030
- # Store and add placeholder
1031
- cdata_sections << cdata_section
1032
- result << "__CDATA_PLACEHOLDER_#{cdata_sections.length - 1}__"
1033
-
1034
- # Continue after this CDATA
1035
- pos = full_cdata_end
1036
- else
1037
- # Malformed CDATA (no closing "]]>") - copy as-is
1038
- result << xml_string[cdata_start..]
1039
- break
1040
- end
1041
- else
1042
- # No more CDATA sections - copy rest
1043
- result << xml_string[pos..]
1044
- break
1045
- end
1046
- end
1047
-
1048
- protected = result
1049
-
1050
- # Add newlines between elements (but not in CDATA - already protected)
1051
- with_newlines = protected.gsub(%r{(<[^>]+)>(?=<(?!/))}, "\\1>\n")
1052
-
1053
- # Restore CDATA sections
1054
- cdata_sections.each_with_index do |cdata, index|
1055
- with_newlines.sub!("__CDATA_PLACEHOLDER_#{index}__", cdata)
1056
- end
1057
-
1058
- with_newlines
1059
- end
1060
-
1061
- def indent_xml(xml_string, indent_size)
1062
- # Simple line-by-line indentation
1063
- lines = []
1064
- level = 0
1065
-
1066
- xml_string.each_line do |line|
1067
- line = line.strip
1068
- next if line.empty?
1069
-
1070
- # Decrease level for closing tags
1071
- level -= 1 if line.start_with?("</")
1072
- level = [level, 0].max
1073
-
1074
- # Add indented line
1075
- lines << ((" " * (indent_size * level)) + line)
1076
-
1077
- # Increase level for opening tags (but not self-closing or special tags)
1078
- next unless line.start_with?("<") && !line.start_with?("</") &&
1079
- !line.end_with?("/>") && !line.start_with?("<?") &&
1080
- !line.start_with?("<!") && !line.include?("</")
1081
-
1082
- level += 1
1083
- end
1084
-
1085
- lines.join("\n")
1086
- end
1087
-
986
+ # Shallow duplication: copies the node itself (name, attrs, namespaces)
987
+ # but NOT its descendants. This is what DocumentBuilder needs — it
988
+ # walks the source tree and re-adds children one at a time via
989
+ # add_child, so a deep copy here would be done only to be stripped
990
+ # by replace_children, then rebuilt — O(N²) waste on parse.
991
+ #
992
+ # For callers that need a true deep copy (e.g. the import_and_add
993
+ # fallback when LibXML can't move the subtree directly), use
994
+ # deep_duplicate_node.
1088
995
  def duplicate_node(node)
1089
996
  return nil unless node
1090
997
 
1091
- # Unwrap if wrapped
1092
998
  native_node = unpatch_node(node)
1093
999
 
1094
- # LibXML is strict about document ownership
1095
- # Create brand new NATIVE nodes that are document-independent
1096
- # Wrappers are only used via patch_node when reading children
1097
1000
  case node_type(node)
1098
1001
  when :doctype
1099
- # DoctypeWrapper - create a new one with same properties
1100
1002
  if node.is_a?(DoctypeWrapper)
1101
1003
  DoctypeWrapper.new(
1102
1004
  create_document,
@@ -1105,64 +1007,10 @@ module Moxml
1105
1007
  node.system_id,
1106
1008
  )
1107
1009
  else
1108
- # Should not happen, but handle gracefully
1109
1010
  node
1110
1011
  end
1111
1012
  when :element
1112
- new_node = ::LibXML::XML::Node.new(native_node.name)
1113
- # new_node.line = node.line
1114
-
1115
- # Copy and set namespace definitions FIRST
1116
- if native_node.is_a?(::LibXML::XML::Node)
1117
- # First, copy all namespace definitions
1118
- native_node.namespaces.each do |ns|
1119
- ::LibXML::XML::Namespace.new(
1120
- new_node,
1121
- ns.prefix,
1122
- ns.href,
1123
- )
1124
- end
1125
-
1126
- # Then, set this element's own namespace if it has one
1127
- if native_node.namespaces.namespace
1128
- orig_ns = native_node.namespaces.namespace
1129
- # Find the matching namespace we just created
1130
- new_node.namespaces.each do |ns|
1131
- if ns.prefix == orig_ns.prefix && ns.href == orig_ns.href
1132
- new_node.namespaces.namespace = ns
1133
- break
1134
- end
1135
- end
1136
- end
1137
- end
1138
-
1139
- # Copy attributes AFTER namespaces are set up
1140
- # LibXML handles namespaced attributes through their full names
1141
- if native_node.attributes?
1142
- native_node.each_attr do |attr|
1143
- # Get the full attribute name (may include namespace prefix)
1144
- attr_name = if attr.ns&.prefix
1145
- "#{attr.ns.prefix}:#{attr.name}"
1146
- else
1147
- attr.name
1148
- end
1149
- new_node[attr_name] = attr.value
1150
- end
1151
- end
1152
-
1153
- # Recursively copy children
1154
- if native_node.children?
1155
- native_node.each_child do |child|
1156
- # Skip whitespace-only text nodes
1157
- next if child.text? && child.content.to_s.strip.empty?
1158
-
1159
- # Recursively duplicate the child
1160
- child_copy = duplicate_node(child)
1161
- new_node << child_copy
1162
- end
1163
- end
1164
-
1165
- new_node
1013
+ shallow_duplicate_element(native_node)
1166
1014
  when :text
1167
1015
  ::LibXML::XML::Node.new_text(native_node.content)
1168
1016
  when :cdata
@@ -1172,7 +1020,6 @@ module Moxml
1172
1020
  when :processing_instruction
1173
1021
  ::LibXML::XML::Node.new_pi(native_node.name, native_node.content)
1174
1022
  else
1175
- # For other types, try dup as fallback
1176
1023
  native_node.dup
1177
1024
  end
1178
1025
  end
@@ -1281,7 +1128,7 @@ module Moxml
1281
1128
  if elem.children?
1282
1129
  elem.each_child do |child|
1283
1130
  # Skip whitespace-only text nodes
1284
- next if child.text? && child.content.to_s.strip.empty?
1131
+ next if blank_text_node?(child)
1285
1132
 
1286
1133
  output << serialize_node(child)
1287
1134
  end
@@ -1289,7 +1136,7 @@ module Moxml
1289
1136
 
1290
1137
  # Append any EntityReference wrappers stored on the document
1291
1138
  doc = elem.doc
1292
- entity_refs = doc ? lookup_entity_refs(doc, elem) : nil
1139
+ entity_refs = entity_ref_registry(doc).refs_for(elem)
1293
1140
  entity_refs&.each { |ref| output << ref.to_xml }
1294
1141
 
1295
1142
  output << "</#{elem.name}>"
@@ -1331,21 +1178,18 @@ module Moxml
1331
1178
  .gsub(">", "&gt;")
1332
1179
  end
1333
1180
 
1181
+ ESCAPE_XML_RE = /[&<>"]/
1182
+ ESCAPE_XML_MAP = { "&" => "&amp;", "<" => "&lt;", ">" => "&gt;", '"' => "&quot;" }.freeze
1183
+ private_constant :ESCAPE_XML_RE, :ESCAPE_XML_MAP
1184
+
1334
1185
  def escape_xml(text)
1335
- text.to_s
1336
- .gsub("&", "&amp;")
1337
- .gsub("<", "&lt;")
1338
- .gsub(">", "&gt;")
1339
- .gsub("\"", "&quot;")
1340
- end
1186
+ # One gsub pass with a Hash replacement allocates a single new
1187
+ # string. The previous chained gsubs allocated three throwaway
1188
+ # strings on every call (very hot for attribute-heavy XML).
1189
+ str = text.is_a?(String) ? text : text.to_s
1190
+ return str unless str.match?(ESCAPE_XML_RE)
1341
1191
 
1342
- def escape_attribute_value(value)
1343
- escaped = value.to_s
1344
- .gsub("&", "&amp;")
1345
- .gsub("<", "&lt;")
1346
- .gsub(">", "&gt;")
1347
- .gsub("\"", "&quot;")
1348
- escaped.to_s
1192
+ str.gsub(ESCAPE_XML_RE, ESCAPE_XML_MAP)
1349
1193
  end
1350
1194
 
1351
1195
  def import_and_add(doc, element, child)
@@ -1368,7 +1212,7 @@ module Moxml
1368
1212
  else
1369
1213
  # No target document - create a deep copy of the node instead
1370
1214
  # This handles the case where the element isn't attached to a document yet
1371
- copied = duplicate_node(child)
1215
+ copied = deep_duplicate_node(child)
1372
1216
  element << copied
1373
1217
  end
1374
1218
 
@@ -1413,127 +1257,242 @@ module Moxml
1413
1257
  end
1414
1258
  end
1415
1259
 
1416
- def serialize_element_with_namespaces(elem, include_ns = true)
1417
- output = "<#{elem.name}"
1260
+ def serialize_element_with_namespaces(elem, include_ns = true,
1261
+ indent_size = 0, depth = 0,
1262
+ eref_active: nil)
1263
+ # Cache elem.name — it's a libxml C call we'd otherwise make
1264
+ # twice (open tag + close tag). Concat with `<<` instead of
1265
+ # `"<#{name}"` to avoid the interpolated intermediate string.
1266
+ name = elem.name
1267
+ output = +"<"
1268
+ output << name
1269
+ emit_namespace_definitions(output, elem, include_ns)
1270
+ emit_attributes(output, elem)
1271
+
1272
+ # `eref_active` is precomputed at the top-level `serialize` call
1273
+ # and threaded down — when nil (top-level non-recursive call into
1274
+ # this method), look it up; when false, skip the per-element doc
1275
+ # attachment query that otherwise fires for every element under
1276
+ # Monitor#synchronize.
1277
+ eref_active = doc_eref_active?(elem.doc) if eref_active.nil?
1278
+ entity_refs, child_sequence = eref_active ? lookup_entity_ref_serialization(elem) : [nil, nil]
1418
1279
 
1419
- # Include namespace definitions:
1420
- # - On root element (include_ns = true), output ALL namespace definitions
1421
- # - On child elements, output namespace definitions that override parent namespaces
1422
- if elem.is_a?(::LibXML::XML::Node) && elem.namespaces.respond_to?(:definitions)
1423
- # Get parent's namespace definitions to detect overrides
1424
- parent_ns_defs = if !include_ns && elem.parent && !elem.parent.is_a?(::LibXML::XML::Document)
1425
- parent_namespaces = {}
1426
- if elem.parent.is_a?(::LibXML::XML::Node)
1427
- elem.parent.namespaces.each do |ns|
1428
- parent_namespaces[ns.prefix] = ns.href
1429
- end
1430
- end
1431
- parent_namespaces
1432
- else
1433
- {}
1434
- end
1280
+ # Always use verbose format <tag></tag> for consistency with other adapters
1281
+ output << ">"
1435
1282
 
1436
- seen_ns = {}
1437
- elem.namespaces.definitions.each do |ns|
1438
- prefix = ns.prefix
1439
- uri = ns.href
1440
- next if seen_ns.key?(prefix)
1283
+ if entity_refs && child_sequence
1284
+ emit_eref_interleaved_children(output, elem, entity_refs, child_sequence,
1285
+ indent_size, depth, eref_active: eref_active)
1286
+ elsif elem.children?
1287
+ emit_children_with_layout(output, elem, indent_size, depth,
1288
+ eref_active: eref_active)
1289
+ end
1441
1290
 
1442
- # Output namespace if:
1443
- # 1. This is root element (include_ns = true), OR
1444
- # 2. This namespace overrides a parent namespace (different URI for same prefix)
1445
- should_output = include_ns ||
1446
- (parent_ns_defs.key?(prefix) && parent_ns_defs[prefix] != uri)
1291
+ output << "</" << name << ">"
1292
+ output
1293
+ end
1447
1294
 
1448
- next unless should_output
1295
+ def doc_eref_active?(doc)
1296
+ entity_ref_registry(doc).active?
1297
+ end
1449
1298
 
1450
- seen_ns[prefix] = true
1451
- output << if prefix.nil? || prefix.empty?
1452
- " xmlns=\"#{escape_xml(uri)}\""
1453
- else
1454
- " xmlns:#{prefix}=\"#{escape_xml(uri)}\""
1455
- end
1456
- end
1299
+ def entity_ref_registry(doc)
1300
+ EntityRefRegistry.new(attachments, doc)
1301
+ end
1302
+
1303
+ # Emit `xmlns`/`xmlns:foo` declarations onto `output`. On the root
1304
+ # (`include_ns: true`) we emit ALL definitions; on children we
1305
+ # emit only definitions that OVERRIDE a parent's same-prefix URI.
1306
+ # Skips the whole block when the element has no local definitions,
1307
+ # which is the common case for child elements in unnamespaced docs.
1308
+ def emit_namespace_definitions(output, elem, include_ns)
1309
+ return unless elem.is_a?(::LibXML::XML::Node)
1310
+
1311
+ ns_list = elem.namespaces
1312
+ return unless ns_list.respond_to?(:definitions)
1313
+
1314
+ definitions = ns_list.definitions
1315
+ return if definitions.empty?
1316
+
1317
+ parent_ns_defs = include_ns ? nil : parent_namespace_defs(elem)
1318
+ seen_ns = nil
1319
+
1320
+ definitions.each do |ns|
1321
+ prefix = ns.prefix
1322
+ uri = ns.href
1323
+ next unless include_ns ||
1324
+ (parent_ns_defs&.key?(prefix) && parent_ns_defs[prefix] != uri)
1325
+
1326
+ seen_ns ||= {}
1327
+ next if seen_ns.key?(prefix)
1328
+
1329
+ seen_ns[prefix] = true
1330
+ output << format_ns_declaration(prefix, uri)
1457
1331
  end
1332
+ end
1458
1333
 
1459
- # Add attributes
1460
- if elem.attributes?
1461
- elem.each_attr do |attr|
1462
- next if attr.name.start_with?("xmlns")
1334
+ def parent_namespace_defs(elem)
1335
+ parent = elem.parent
1336
+ return nil unless parent.is_a?(::LibXML::XML::Node)
1463
1337
 
1464
- # Include namespace prefix if attribute has one
1465
- attr_name = if attr.ns&.prefix
1466
- "#{attr.ns.prefix}:#{attr.name}"
1467
- else
1468
- attr.name
1469
- end
1470
- output << " #{attr_name}=\"#{escape_xml(attr.value)}\""
1471
- end
1338
+ defs = {}
1339
+ parent.namespaces.each { |ns| defs[ns.prefix] = ns.href }
1340
+ defs
1341
+ end
1342
+
1343
+ def format_ns_declaration(prefix, uri)
1344
+ if prefix.nil? || prefix.empty?
1345
+ " xmlns=\"#{escape_xml(uri)}\""
1346
+ else
1347
+ " xmlns:#{prefix}=\"#{escape_xml(uri)}\""
1472
1348
  end
1349
+ end
1473
1350
 
1474
- # Check for entity refs stored on the document
1475
- # LibXML element wrappers are ephemeral, so look up via == comparison
1476
- doc = elem.doc
1477
- entity_refs = doc ? lookup_entity_refs(doc, elem) : nil
1478
- child_sequence = doc ? lookup_child_sequence(doc, elem) : nil
1351
+ def emit_attributes(output, elem)
1352
+ return unless elem.attributes?
1479
1353
 
1480
- # Always use verbose format <tag></tag> for consistency with other adapters
1481
- output << ">"
1354
+ elem.each_attr do |attr|
1355
+ next if attr.name.start_with?("xmlns")
1482
1356
 
1483
- if entity_refs && !entity_refs.empty? && child_sequence
1484
- # Interleave native children with entity refs using tracked sequence
1485
- native_children = []
1486
- if elem.children?
1487
- elem.each_child do |c|
1488
- native_children << c unless c.text? && c.content.to_s.strip.empty?
1489
- end
1490
- end
1357
+ attr_name = attr.ns&.prefix ? "#{attr.ns.prefix}:#{attr.name}" : attr.name
1358
+ output << " #{attr_name}=\"#{escape_xml(attr.value)}\""
1359
+ end
1360
+ end
1491
1361
 
1492
- eref_idx = 0
1493
- native_idx = 0
1494
- child_sequence.each do |type|
1495
- case type
1496
- when :native
1497
- if native_idx < native_children.size
1498
- child = native_children[native_idx]
1499
- native_idx += 1
1500
- wrapped_child = patch_node(child)
1501
- output << if wrapped_child.is_a?(CustomizedLibxml::Node) && !wrapped_child.is_a?(CustomizedLibxml::Element)
1502
- wrapped_child.to_xml
1503
- elsif child.element?
1504
- serialize_element_with_namespaces(child, false)
1505
- else
1506
- serialize_node(child)
1507
- end
1508
- end
1509
- when :eref
1510
- if eref_idx < entity_refs.size
1511
- output << entity_refs[eref_idx].to_xml
1512
- eref_idx += 1
1362
+ # Returns [entity_refs, child_sequence] when the element has
1363
+ # interleaved entity references that the serializer needs to
1364
+ # weave back into the native child stream — otherwise [nil, nil].
1365
+ #
1366
+ # The caller is responsible for gating this with `eref_active`
1367
+ # (precomputed once per `serialize` call). When `eref_active` is
1368
+ # false this method is never entered, so the per-element doc
1369
+ # attachment query never fires.
1370
+ def lookup_entity_ref_serialization(elem)
1371
+ doc = elem.doc
1372
+ return [nil, nil] unless doc
1373
+
1374
+ entity_ref_registry(doc).serialization_for(elem)
1375
+ end
1376
+
1377
+ def emit_eref_interleaved_children(output, elem, entity_refs, child_sequence,
1378
+ indent_size, depth, eref_active:)
1379
+ native_children = collect_non_blank_children(elem)
1380
+ child_pad = indent_size.positive? ? " " * (indent_size * (depth + 1)) : nil
1381
+ eref_idx = 0
1382
+ native_idx = 0
1383
+ prev_block = true
1384
+
1385
+ child_sequence.each do |type|
1386
+ case type
1387
+ when :native
1388
+ if native_idx < native_children.size
1389
+ child = native_children[native_idx]
1390
+ is_text_like = child.text? || child.cdata?
1391
+ if prev_block && !is_text_like
1392
+ output << "\n"
1393
+ output << child_pad if child_pad
1513
1394
  end
1395
+ prev_block = !is_text_like
1396
+
1397
+ output << serialize_child_to_xml(
1398
+ child, indent_size: indent_size, depth: depth,
1399
+ eref_active: eref_active
1400
+ )
1401
+ native_idx += 1
1402
+ end
1403
+ when :eref
1404
+ if eref_idx < entity_refs.size
1405
+ output << entity_refs[eref_idx].to_xml
1406
+ eref_idx += 1
1407
+ prev_block = false
1514
1408
  end
1515
1409
  end
1516
- elsif elem.children?
1517
- elem.each_child do |child|
1518
- # Skip whitespace-only text nodes
1519
- next if child.text? && child.content.to_s.strip.empty?
1520
-
1521
- # Wrap the child and serialize
1522
- wrapped_child = patch_node(child)
1523
- output << if wrapped_child.is_a?(CustomizedLibxml::Node) && !wrapped_child.is_a?(CustomizedLibxml::Element)
1524
- # Use wrapper's to_xml for proper serialization
1525
- wrapped_child.to_xml
1526
- elsif child.element?
1527
- # Recursively serialize child elements
1528
- serialize_element_with_namespaces(child, false)
1529
- else
1530
- serialize_node(child)
1531
- end
1410
+ end
1411
+ end
1412
+
1413
+ # Regex used in place of `content.to_s.strip.empty?` for whitespace-only
1414
+ # text detection — `match?` allocates nothing while `.strip` makes a
1415
+ # throwaway copy of every text node's content on each visit.
1416
+ NON_WHITESPACE_RE = /\S/
1417
+ private_constant :NON_WHITESPACE_RE
1418
+
1419
+ def blank_text_node?(child)
1420
+ child.text? && blank_content?(child.content)
1421
+ end
1422
+
1423
+ def blank_content?(content)
1424
+ content.nil? || !content.match?(NON_WHITESPACE_RE)
1425
+ end
1426
+
1427
+ def collect_non_blank_children(elem)
1428
+ children = []
1429
+ return children unless elem.children?
1430
+
1431
+ elem.each_child do |c|
1432
+ children << c unless blank_text_node?(c)
1433
+ end
1434
+ children
1435
+ end
1436
+
1437
+ # Walk native children once and emit them with the same newline +
1438
+ # indentation layout the old `add_newlines_to_xml` + `indent_xml`
1439
+ # post-passes produced — but in a single recursion with no string
1440
+ # rescanning.
1441
+ #
1442
+ # Newline rule (matching `>(?=<(?!/))` with CDATA-placeholder
1443
+ # protection): emit `\n` + per-level padding before a child iff
1444
+ # the previous emitted sibling was block-level (ended with `>`)
1445
+ # AND the current sibling is block-level. Text and CDATA count
1446
+ # as text-like and suppress the newline on both sides (the
1447
+ # original CDATA placeholder broke the `>...<` adjacency
1448
+ # symmetrically).
1449
+ def emit_children_with_layout(output, elem, indent_size, depth,
1450
+ eref_active:)
1451
+ child_pad = indent_size.positive? ? " " * (indent_size * (depth + 1)) : nil
1452
+ prev_block = true
1453
+
1454
+ elem.each_child do |child|
1455
+ # Cache text? — used twice per child (whitespace skip + is_text_like).
1456
+ # For element children (the common case) both calls return false, so
1457
+ # caching saves a libxml C call.
1458
+ is_text = child.text?
1459
+ next if is_text && blank_content?(child.content)
1460
+
1461
+ is_text_like = is_text || child.cdata?
1462
+ if prev_block && !is_text_like
1463
+ output << "\n"
1464
+ output << child_pad if child_pad
1532
1465
  end
1466
+ prev_block = !is_text_like
1467
+
1468
+ output << serialize_child_to_xml(child, indent_size: indent_size, depth: depth,
1469
+ eref_active: eref_active)
1533
1470
  end
1534
- output << "</#{elem.name}>"
1471
+ end
1535
1472
 
1536
- output
1473
+ # Serialize one child node. Elements recurse into the layout-aware
1474
+ # path; non-element wrappers route through their own `to_xml`;
1475
+ # everything else falls through to the per-type serializer.
1476
+ # `indent_size:` and `depth:` are required to force callers to
1477
+ # decide whether the child should inherit the parent's indent
1478
+ # state — both child-emitting paths here pass the parent's values through.
1479
+ #
1480
+ # Element fast-path checked first to avoid allocating a wrapper
1481
+ # we'd immediately throw away (elements always recurse on the
1482
+ # raw native node, not the wrapper). For a typical document this
1483
+ # skips wrapper allocation for the majority of children.
1484
+ def serialize_child_to_xml(child, indent_size:, depth:, eref_active:)
1485
+ if child.element?
1486
+ return serialize_element_with_namespaces(child, false, indent_size, depth + 1,
1487
+ eref_active: eref_active)
1488
+ end
1489
+
1490
+ wrapped_child = patch_node(child)
1491
+ if wrapped_child.is_a?(CustomizedLibxml::Node)
1492
+ wrapped_child.to_xml
1493
+ else
1494
+ serialize_node(child)
1495
+ end
1537
1496
  end
1538
1497
 
1539
1498
  def remove_indentation(xml_string)
@@ -1631,6 +1590,63 @@ module Moxml
1631
1590
  end
1632
1591
  nil
1633
1592
  end
1593
+
1594
+ # Deep duplication for the rare `import_and_add` fallback (when
1595
+ # libxml refuses to move a subtree across documents AND no target
1596
+ # document is available). Walks the source subtree and rebuilds
1597
+ # it as document-independent nodes. The DocumentBuilder hot path
1598
+ # goes through the shallow `duplicate_node` instead.
1599
+ def deep_duplicate_node(node)
1600
+ return nil unless node
1601
+
1602
+ native_node = unpatch_node(node)
1603
+
1604
+ return duplicate_node(node) unless node_type(node) == :element
1605
+
1606
+ new_node = shallow_duplicate_element(native_node)
1607
+ return new_node unless native_node.is_a?(::LibXML::XML::Node) && native_node.children?
1608
+
1609
+ native_node.each_child do |child|
1610
+ next if blank_text_node?(child)
1611
+
1612
+ new_node << deep_duplicate_node(child)
1613
+ end
1614
+ new_node
1615
+ end
1616
+
1617
+ # Copies a single element: its name, every namespace yielded by
1618
+ # `namespaces.each`, the element's own namespace, and its attributes. Children are NOT
1619
+ # duplicated — callers that need the subtree use deep_duplicate_node.
1620
+ def shallow_duplicate_element(native_node)
1621
+ new_node = ::LibXML::XML::Node.new(native_node.name)
1622
+ copy_element_namespaces(native_node, new_node) if native_node.is_a?(::LibXML::XML::Node)
1623
+ copy_element_attributes(native_node, new_node) if native_node.attributes?
1624
+ new_node
1625
+ end
1626
+
1627
+ def copy_element_namespaces(src, dst)
1628
+ ns_list = src.namespaces
1629
+ ns_list.each do |ns|
1630
+ ::LibXML::XML::Namespace.new(dst, ns.prefix, ns.href)
1631
+ end
1632
+
1633
+ own_ns = ns_list.namespace
1634
+ return unless own_ns
1635
+
1636
+ dst.namespaces.each do |ns|
1637
+ next unless ns.prefix == own_ns.prefix && ns.href == own_ns.href
1638
+
1639
+ dst.namespaces.namespace = ns
1640
+ break
1641
+ end
1642
+ end
1643
+
1644
+ def copy_element_attributes(src, dst)
1645
+ src.each_attr do |attr|
1646
+ attr_name = attr.ns&.prefix ? "#{attr.ns.prefix}:#{attr.name}" : attr.name
1647
+ dst[attr_name] = attr.value
1648
+ end
1649
+ end
1634
1650
  end
1635
1651
 
1636
1652
  # Bridge between LibXML SAX and Moxml SAX
@@ -1706,3 +1722,6 @@ module Moxml
1706
1722
  end
1707
1723
  end
1708
1724
  end
1725
+
1726
+ require_relative "libxml/entity_ref_registry"
1727
+ require_relative "libxml/entity_restorer"