moxml 0.1.20 → 0.1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/moxml/adapter/base.rb +5 -0
- data/lib/moxml/adapter/customized_libxml/node.rb +3 -0
- data/lib/moxml/adapter/customized_libxml/text.rb +6 -1
- data/lib/moxml/adapter/libxml/entity_ref_registry.rb +105 -0
- data/lib/moxml/adapter/libxml/entity_restorer.rb +92 -0
- data/lib/moxml/adapter/libxml.rb +381 -362
- data/lib/moxml/version.rb +1 -1
- data/spec/moxml/adapter/libxml_internals_spec.rb +167 -0
- data/spec/performance/benchmark_spec.rb +1 -1
- metadata +5 -2
data/lib/moxml/adapter/libxml.rb
CHANGED
|
@@ -37,6 +37,32 @@ module Moxml
|
|
|
37
37
|
end
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
+
# Mapping from libxml's integer node_type to our symbol — built once
|
|
41
|
+
# at load so `node_type` can do a single hash lookup on the hot path
|
|
42
|
+
# instead of a large case/when on every node.
|
|
43
|
+
NATIVE_NODE_TYPE_MAP = {
|
|
44
|
+
::LibXML::XML::Node::ELEMENT_NODE => :element,
|
|
45
|
+
::LibXML::XML::Node::TEXT_NODE => :text,
|
|
46
|
+
::LibXML::XML::Node::CDATA_SECTION_NODE => :cdata,
|
|
47
|
+
::LibXML::XML::Node::COMMENT_NODE => :comment,
|
|
48
|
+
::LibXML::XML::Node::PI_NODE => :processing_instruction,
|
|
49
|
+
::LibXML::XML::Node::ATTRIBUTE_NODE => :attribute,
|
|
50
|
+
::LibXML::XML::Node::DTD_NODE => :doctype,
|
|
51
|
+
::LibXML::XML::Node::DOCUMENT_NODE => :document,
|
|
52
|
+
}.freeze
|
|
53
|
+
private_constant :NATIVE_NODE_TYPE_MAP
|
|
54
|
+
|
|
55
|
+
WRAPPER_NODE_TYPE_MAP = {
|
|
56
|
+
DoctypeWrapper => :doctype,
|
|
57
|
+
CustomizedLibxml::Element => :element,
|
|
58
|
+
CustomizedLibxml::Text => :text,
|
|
59
|
+
CustomizedLibxml::Cdata => :cdata,
|
|
60
|
+
CustomizedLibxml::Comment => :comment,
|
|
61
|
+
CustomizedLibxml::ProcessingInstruction => :processing_instruction,
|
|
62
|
+
CustomizedLibxml::EntityReference => :entity_reference,
|
|
63
|
+
}.freeze
|
|
64
|
+
private_constant :WRAPPER_NODE_TYPE_MAP
|
|
65
|
+
|
|
40
66
|
class << self
|
|
41
67
|
def attachments
|
|
42
68
|
@attachments ||= Moxml::NativeAttachment.new
|
|
@@ -98,7 +124,22 @@ module Moxml
|
|
|
98
124
|
end
|
|
99
125
|
|
|
100
126
|
ctx = _context || Context.new(:libxml)
|
|
101
|
-
|
|
127
|
+
# Single parse path: wrap libxml's already-complete C-parsed tree
|
|
128
|
+
# directly (same pattern as nokogiri/ox). The previous
|
|
129
|
+
# DocumentBuilder.build path walked the entire parsed tree and
|
|
130
|
+
# re-added every node to a fresh moxml-managed document, which
|
|
131
|
+
# made parse O(N) Ruby work on top of an already-complete parse.
|
|
132
|
+
# Doctype/declaration/PI attachments set above remain on
|
|
133
|
+
# native_doc, so the serialize path still sees them.
|
|
134
|
+
#
|
|
135
|
+
# restore_entities is handled as a post-processing case after
|
|
136
|
+
# the wrap, NOT by branching into a different builder. This way
|
|
137
|
+
# any new parse-time logic only has to be added to this one
|
|
138
|
+
# path; the restoration walk is just one of potentially several
|
|
139
|
+
# post-processing steps and doesn't fork the construction.
|
|
140
|
+
doc = Document.new(native_doc, ctx)
|
|
141
|
+
EntityRestorer.new(doc).run if ctx.config.restore_entities
|
|
142
|
+
doc
|
|
102
143
|
end
|
|
103
144
|
|
|
104
145
|
# SAX parsing implementation for LibXML
|
|
@@ -181,40 +222,23 @@ module Moxml
|
|
|
181
222
|
def node_type(node)
|
|
182
223
|
return :unknown unless node
|
|
183
224
|
|
|
184
|
-
#
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
if node.is_a?(CustomizedLibxml::ProcessingInstruction)
|
|
190
|
-
return :processing_instruction
|
|
225
|
+
# Fast path: native libxml nodes are the vast majority during
|
|
226
|
+
# parse traversal (DocumentBuilder visits raw libxml children).
|
|
227
|
+
# Skip the wrapper checks below for them.
|
|
228
|
+
if node.is_a?(::LibXML::XML::Node)
|
|
229
|
+
return NATIVE_NODE_TYPE_MAP[node.node_type] || :unknown
|
|
191
230
|
end
|
|
192
|
-
return :
|
|
193
|
-
return :doctype if node.is_a?(DoctypeWrapper)
|
|
231
|
+
return :document if node.is_a?(::LibXML::XML::Document)
|
|
194
232
|
|
|
195
|
-
|
|
196
|
-
|
|
233
|
+
wrapper_type = WRAPPER_NODE_TYPE_MAP[node.class]
|
|
234
|
+
return wrapper_type if wrapper_type
|
|
197
235
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
:text
|
|
205
|
-
when ::LibXML::XML::Node::CDATA_SECTION_NODE
|
|
206
|
-
:cdata
|
|
207
|
-
when ::LibXML::XML::Node::COMMENT_NODE
|
|
208
|
-
:comment
|
|
209
|
-
when ::LibXML::XML::Node::ATTRIBUTE_NODE
|
|
210
|
-
:attribute
|
|
211
|
-
when ::LibXML::XML::Node::PI_NODE
|
|
212
|
-
:processing_instruction
|
|
213
|
-
when ::LibXML::XML::Node::DTD_NODE
|
|
214
|
-
:doctype
|
|
215
|
-
else
|
|
216
|
-
:unknown
|
|
217
|
-
end
|
|
236
|
+
# Duck-typed fallback for libxml types that aren't ::Node
|
|
237
|
+
# subclasses but still expose node_type (e.g. ::Attr).
|
|
238
|
+
native = unpatch_node(node)
|
|
239
|
+
return :unknown unless native.respond_to?(:node_type)
|
|
240
|
+
|
|
241
|
+
NATIVE_NODE_TYPE_MAP[native.node_type] || :unknown
|
|
218
242
|
end
|
|
219
243
|
|
|
220
244
|
def node_name(node)
|
|
@@ -300,7 +324,7 @@ module Moxml
|
|
|
300
324
|
|
|
301
325
|
# Include any EntityReference wrappers stored on the document
|
|
302
326
|
doc = native_node.doc
|
|
303
|
-
entity_refs = doc
|
|
327
|
+
entity_refs = entity_ref_registry(doc).refs_for(native_node)
|
|
304
328
|
result.concat(entity_refs) if entity_refs
|
|
305
329
|
|
|
306
330
|
result
|
|
@@ -317,7 +341,7 @@ module Moxml
|
|
|
317
341
|
current = native_node&.next
|
|
318
342
|
while current
|
|
319
343
|
# Skip whitespace-only text nodes
|
|
320
|
-
break unless
|
|
344
|
+
break unless blank_text_node?(current)
|
|
321
345
|
|
|
322
346
|
current = current.next
|
|
323
347
|
end
|
|
@@ -329,7 +353,7 @@ module Moxml
|
|
|
329
353
|
current = native_node&.prev
|
|
330
354
|
while current
|
|
331
355
|
# Skip whitespace-only text nodes
|
|
332
|
-
break unless
|
|
356
|
+
break unless blank_text_node?(current)
|
|
333
357
|
|
|
334
358
|
current = current.prev
|
|
335
359
|
end
|
|
@@ -504,21 +528,22 @@ module Moxml
|
|
|
504
528
|
native_child = unpatch_node(child)
|
|
505
529
|
|
|
506
530
|
# EntityReference wrappers can't go in LibXML's native tree.
|
|
507
|
-
# Store on the document
|
|
508
|
-
# LibXML creates new Ruby wrappers on each access, so element
|
|
509
|
-
# object_id is unstable — we look up via == comparison.
|
|
531
|
+
# Store them on the document for interleaved serialization.
|
|
510
532
|
if child.is_a?(CustomizedLibxml::EntityReference)
|
|
511
533
|
doc = native_elem.is_a?(::LibXML::XML::Document) ? native_elem : native_elem.doc
|
|
512
|
-
|
|
513
|
-
append_child_sequence_on_doc(doc, native_elem, :eref)
|
|
534
|
+
entity_ref_registry(doc).register(native_elem, child)
|
|
514
535
|
return
|
|
515
536
|
end
|
|
516
537
|
|
|
517
538
|
# For LibXML: if parent has a DEFAULT namespace (nil/empty prefix) and child is an element without a namespace,
|
|
518
539
|
# explicitly set the child's namespace to match the parent's for XPath compatibility
|
|
519
|
-
# NOTE: Prefixed namespaces are NOT inherited, only default namespaces
|
|
520
|
-
|
|
521
|
-
|
|
540
|
+
# NOTE: Prefixed namespaces are NOT inherited, only default namespaces.
|
|
541
|
+
#
|
|
542
|
+
# Reorder cheap-first: skip the expensive `.namespaces` fetches
|
|
543
|
+
# entirely for non-element children (text, comment, cdata, PI),
|
|
544
|
+
# which is roughly 30-50% of adds in a typical doc.
|
|
545
|
+
if native_child.is_a?(::LibXML::XML::Node) && native_child.element? &&
|
|
546
|
+
native_elem.is_a?(::LibXML::XML::Node) && native_elem.namespaces&.namespace &&
|
|
522
547
|
(!native_child.namespaces.namespace || native_child.namespaces.namespace.href.to_s.empty?)
|
|
523
548
|
|
|
524
549
|
parent_ns = native_elem.namespaces.namespace
|
|
@@ -574,51 +599,8 @@ module Moxml
|
|
|
574
599
|
else
|
|
575
600
|
import_and_add(native_elem.doc, native_elem, native_child)
|
|
576
601
|
doc = native_elem.doc || native_elem
|
|
577
|
-
|
|
578
|
-
end
|
|
579
|
-
end
|
|
580
|
-
|
|
581
|
-
# Store entity ref on the document (stable identity).
|
|
582
|
-
# LibXML element wrappers are ephemeral, so we use == to find matching elements.
|
|
583
|
-
def store_entity_ref_on_doc(doc, element, ref)
|
|
584
|
-
pairs = attachments.get(doc, :_entity_ref_pairs) || []
|
|
585
|
-
pair = pairs.find { |elem, _| elem == element }
|
|
586
|
-
if pair
|
|
587
|
-
pair[1] << ref
|
|
588
|
-
else
|
|
589
|
-
pairs << [element, [ref]]
|
|
590
|
-
end
|
|
591
|
-
attachments.set(doc, :_entity_ref_pairs, pairs)
|
|
592
|
-
end
|
|
593
|
-
|
|
594
|
-
# Look up entity refs for an element from the document
|
|
595
|
-
def lookup_entity_refs(doc, element)
|
|
596
|
-
pairs = attachments.get(doc, :_entity_ref_pairs)
|
|
597
|
-
return nil unless pairs
|
|
598
|
-
|
|
599
|
-
pair = pairs.find { |elem, _| elem == element }
|
|
600
|
-
pair&.last
|
|
601
|
-
end
|
|
602
|
-
|
|
603
|
-
# Track child order on the document (stable identity)
|
|
604
|
-
def append_child_sequence_on_doc(doc, element, type)
|
|
605
|
-
pairs = attachments.get(doc, :_child_seq_pairs) || []
|
|
606
|
-
pair = pairs.find { |elem, _| elem == element }
|
|
607
|
-
if pair
|
|
608
|
-
pair[1] << type
|
|
609
|
-
else
|
|
610
|
-
pairs << [element, [type]]
|
|
602
|
+
entity_ref_registry(doc).append_native(native_elem)
|
|
611
603
|
end
|
|
612
|
-
attachments.set(doc, :_child_seq_pairs, pairs)
|
|
613
|
-
end
|
|
614
|
-
|
|
615
|
-
# Look up child sequence for an element from the document
|
|
616
|
-
def lookup_child_sequence(doc, element)
|
|
617
|
-
pairs = attachments.get(doc, :_child_seq_pairs)
|
|
618
|
-
return nil unless pairs
|
|
619
|
-
|
|
620
|
-
pair = pairs.find { |elem, _| elem == element }
|
|
621
|
-
pair&.last
|
|
622
604
|
end
|
|
623
605
|
|
|
624
606
|
def append_child_sequence(element, type)
|
|
@@ -976,21 +958,23 @@ module Moxml
|
|
|
976
958
|
end
|
|
977
959
|
|
|
978
960
|
if native_node.root
|
|
979
|
-
|
|
961
|
+
indent_size = options[:indent].is_a?(Integer) && options[:indent].positive? ? options[:indent] : 0
|
|
962
|
+
# Custom serializer emits newlines AND indentation directly —
|
|
963
|
+
# no separate add_newlines_to_xml / indent_xml passes.
|
|
964
|
+
# `eref_active` is computed once here and threaded through the
|
|
965
|
+
# recursion so that the per-element `attachments.key?` Monitor
|
|
966
|
+
# sync only fires for docs that actually have entity refs.
|
|
967
|
+
eref_active = entity_ref_registry(native_node).active?
|
|
980
968
|
root_output = serialize_element_with_namespaces(
|
|
981
969
|
native_node.root,
|
|
982
970
|
true,
|
|
971
|
+
indent_size,
|
|
972
|
+
0,
|
|
973
|
+
eref_active: eref_active,
|
|
983
974
|
)
|
|
984
975
|
|
|
985
|
-
|
|
986
|
-
if
|
|
987
|
-
# First add newlines between elements
|
|
988
|
-
formatted = add_newlines_to_xml(root_output)
|
|
989
|
-
output << "\n" << indent_xml(formatted, options[:indent])
|
|
990
|
-
else
|
|
991
|
-
output << "\n" << root_output unless output.empty?
|
|
992
|
-
output << root_output if output.empty?
|
|
993
|
-
end
|
|
976
|
+
output << "\n" << root_output unless output.empty?
|
|
977
|
+
output << root_output if output.empty?
|
|
994
978
|
end
|
|
995
979
|
|
|
996
980
|
output
|
|
@@ -999,104 +983,22 @@ module Moxml
|
|
|
999
983
|
end
|
|
1000
984
|
end
|
|
1001
985
|
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
pos = 0
|
|
1012
|
-
|
|
1013
|
-
loop do
|
|
1014
|
-
# Find next CDATA start
|
|
1015
|
-
cdata_start = xml_string.index("<![CDATA[", pos)
|
|
1016
|
-
|
|
1017
|
-
if cdata_start
|
|
1018
|
-
# Copy everything before CDATA
|
|
1019
|
-
result << xml_string[pos...cdata_start]
|
|
1020
|
-
|
|
1021
|
-
# Find CDATA end
|
|
1022
|
-
cdata_content_start = cdata_start + 9 # Length of "<![CDATA["
|
|
1023
|
-
cdata_end = xml_string.index("]]>", cdata_content_start)
|
|
1024
|
-
|
|
1025
|
-
if cdata_end
|
|
1026
|
-
# Extract full CDATA including markers
|
|
1027
|
-
full_cdata_end = cdata_end + 3 # Include "]]>"
|
|
1028
|
-
cdata_section = xml_string[cdata_start...full_cdata_end]
|
|
1029
|
-
|
|
1030
|
-
# Store and add placeholder
|
|
1031
|
-
cdata_sections << cdata_section
|
|
1032
|
-
result << "__CDATA_PLACEHOLDER_#{cdata_sections.length - 1}__"
|
|
1033
|
-
|
|
1034
|
-
# Continue after this CDATA
|
|
1035
|
-
pos = full_cdata_end
|
|
1036
|
-
else
|
|
1037
|
-
# Malformed CDATA (no closing "]]>") - copy as-is
|
|
1038
|
-
result << xml_string[cdata_start..]
|
|
1039
|
-
break
|
|
1040
|
-
end
|
|
1041
|
-
else
|
|
1042
|
-
# No more CDATA sections - copy rest
|
|
1043
|
-
result << xml_string[pos..]
|
|
1044
|
-
break
|
|
1045
|
-
end
|
|
1046
|
-
end
|
|
1047
|
-
|
|
1048
|
-
protected = result
|
|
1049
|
-
|
|
1050
|
-
# Add newlines between elements (but not in CDATA - already protected)
|
|
1051
|
-
with_newlines = protected.gsub(%r{(<[^>]+)>(?=<(?!/))}, "\\1>\n")
|
|
1052
|
-
|
|
1053
|
-
# Restore CDATA sections
|
|
1054
|
-
cdata_sections.each_with_index do |cdata, index|
|
|
1055
|
-
with_newlines.sub!("__CDATA_PLACEHOLDER_#{index}__", cdata)
|
|
1056
|
-
end
|
|
1057
|
-
|
|
1058
|
-
with_newlines
|
|
1059
|
-
end
|
|
1060
|
-
|
|
1061
|
-
def indent_xml(xml_string, indent_size)
|
|
1062
|
-
# Simple line-by-line indentation
|
|
1063
|
-
lines = []
|
|
1064
|
-
level = 0
|
|
1065
|
-
|
|
1066
|
-
xml_string.each_line do |line|
|
|
1067
|
-
line = line.strip
|
|
1068
|
-
next if line.empty?
|
|
1069
|
-
|
|
1070
|
-
# Decrease level for closing tags
|
|
1071
|
-
level -= 1 if line.start_with?("</")
|
|
1072
|
-
level = [level, 0].max
|
|
1073
|
-
|
|
1074
|
-
# Add indented line
|
|
1075
|
-
lines << ((" " * (indent_size * level)) + line)
|
|
1076
|
-
|
|
1077
|
-
# Increase level for opening tags (but not self-closing or special tags)
|
|
1078
|
-
next unless line.start_with?("<") && !line.start_with?("</") &&
|
|
1079
|
-
!line.end_with?("/>") && !line.start_with?("<?") &&
|
|
1080
|
-
!line.start_with?("<!") && !line.include?("</")
|
|
1081
|
-
|
|
1082
|
-
level += 1
|
|
1083
|
-
end
|
|
1084
|
-
|
|
1085
|
-
lines.join("\n")
|
|
1086
|
-
end
|
|
1087
|
-
|
|
986
|
+
# Shallow duplication: copies the node itself (name, attrs, namespaces)
|
|
987
|
+
# but NOT its descendants. This is what DocumentBuilder needs — it
|
|
988
|
+
# walks the source tree and re-adds children one at a time via
|
|
989
|
+
# add_child, so a deep copy here would be done only to be stripped
|
|
990
|
+
# by replace_children, then rebuilt — O(N²) waste on parse.
|
|
991
|
+
#
|
|
992
|
+
# For callers that need a true deep copy (e.g. the import_and_add
|
|
993
|
+
# fallback when LibXML can't move the subtree directly), use
|
|
994
|
+
# deep_duplicate_node.
|
|
1088
995
|
def duplicate_node(node)
|
|
1089
996
|
return nil unless node
|
|
1090
997
|
|
|
1091
|
-
# Unwrap if wrapped
|
|
1092
998
|
native_node = unpatch_node(node)
|
|
1093
999
|
|
|
1094
|
-
# LibXML is strict about document ownership
|
|
1095
|
-
# Create brand new NATIVE nodes that are document-independent
|
|
1096
|
-
# Wrappers are only used via patch_node when reading children
|
|
1097
1000
|
case node_type(node)
|
|
1098
1001
|
when :doctype
|
|
1099
|
-
# DoctypeWrapper - create a new one with same properties
|
|
1100
1002
|
if node.is_a?(DoctypeWrapper)
|
|
1101
1003
|
DoctypeWrapper.new(
|
|
1102
1004
|
create_document,
|
|
@@ -1105,64 +1007,10 @@ module Moxml
|
|
|
1105
1007
|
node.system_id,
|
|
1106
1008
|
)
|
|
1107
1009
|
else
|
|
1108
|
-
# Should not happen, but handle gracefully
|
|
1109
1010
|
node
|
|
1110
1011
|
end
|
|
1111
1012
|
when :element
|
|
1112
|
-
|
|
1113
|
-
# new_node.line = node.line
|
|
1114
|
-
|
|
1115
|
-
# Copy and set namespace definitions FIRST
|
|
1116
|
-
if native_node.is_a?(::LibXML::XML::Node)
|
|
1117
|
-
# First, copy all namespace definitions
|
|
1118
|
-
native_node.namespaces.each do |ns|
|
|
1119
|
-
::LibXML::XML::Namespace.new(
|
|
1120
|
-
new_node,
|
|
1121
|
-
ns.prefix,
|
|
1122
|
-
ns.href,
|
|
1123
|
-
)
|
|
1124
|
-
end
|
|
1125
|
-
|
|
1126
|
-
# Then, set this element's own namespace if it has one
|
|
1127
|
-
if native_node.namespaces.namespace
|
|
1128
|
-
orig_ns = native_node.namespaces.namespace
|
|
1129
|
-
# Find the matching namespace we just created
|
|
1130
|
-
new_node.namespaces.each do |ns|
|
|
1131
|
-
if ns.prefix == orig_ns.prefix && ns.href == orig_ns.href
|
|
1132
|
-
new_node.namespaces.namespace = ns
|
|
1133
|
-
break
|
|
1134
|
-
end
|
|
1135
|
-
end
|
|
1136
|
-
end
|
|
1137
|
-
end
|
|
1138
|
-
|
|
1139
|
-
# Copy attributes AFTER namespaces are set up
|
|
1140
|
-
# LibXML handles namespaced attributes through their full names
|
|
1141
|
-
if native_node.attributes?
|
|
1142
|
-
native_node.each_attr do |attr|
|
|
1143
|
-
# Get the full attribute name (may include namespace prefix)
|
|
1144
|
-
attr_name = if attr.ns&.prefix
|
|
1145
|
-
"#{attr.ns.prefix}:#{attr.name}"
|
|
1146
|
-
else
|
|
1147
|
-
attr.name
|
|
1148
|
-
end
|
|
1149
|
-
new_node[attr_name] = attr.value
|
|
1150
|
-
end
|
|
1151
|
-
end
|
|
1152
|
-
|
|
1153
|
-
# Recursively copy children
|
|
1154
|
-
if native_node.children?
|
|
1155
|
-
native_node.each_child do |child|
|
|
1156
|
-
# Skip whitespace-only text nodes
|
|
1157
|
-
next if child.text? && child.content.to_s.strip.empty?
|
|
1158
|
-
|
|
1159
|
-
# Recursively duplicate the child
|
|
1160
|
-
child_copy = duplicate_node(child)
|
|
1161
|
-
new_node << child_copy
|
|
1162
|
-
end
|
|
1163
|
-
end
|
|
1164
|
-
|
|
1165
|
-
new_node
|
|
1013
|
+
shallow_duplicate_element(native_node)
|
|
1166
1014
|
when :text
|
|
1167
1015
|
::LibXML::XML::Node.new_text(native_node.content)
|
|
1168
1016
|
when :cdata
|
|
@@ -1172,7 +1020,6 @@ module Moxml
|
|
|
1172
1020
|
when :processing_instruction
|
|
1173
1021
|
::LibXML::XML::Node.new_pi(native_node.name, native_node.content)
|
|
1174
1022
|
else
|
|
1175
|
-
# For other types, try dup as fallback
|
|
1176
1023
|
native_node.dup
|
|
1177
1024
|
end
|
|
1178
1025
|
end
|
|
@@ -1281,7 +1128,7 @@ module Moxml
|
|
|
1281
1128
|
if elem.children?
|
|
1282
1129
|
elem.each_child do |child|
|
|
1283
1130
|
# Skip whitespace-only text nodes
|
|
1284
|
-
next if
|
|
1131
|
+
next if blank_text_node?(child)
|
|
1285
1132
|
|
|
1286
1133
|
output << serialize_node(child)
|
|
1287
1134
|
end
|
|
@@ -1289,7 +1136,7 @@ module Moxml
|
|
|
1289
1136
|
|
|
1290
1137
|
# Append any EntityReference wrappers stored on the document
|
|
1291
1138
|
doc = elem.doc
|
|
1292
|
-
entity_refs = doc
|
|
1139
|
+
entity_refs = entity_ref_registry(doc).refs_for(elem)
|
|
1293
1140
|
entity_refs&.each { |ref| output << ref.to_xml }
|
|
1294
1141
|
|
|
1295
1142
|
output << "</#{elem.name}>"
|
|
@@ -1331,21 +1178,18 @@ module Moxml
|
|
|
1331
1178
|
.gsub(">", "&gt;")
|
|
1332
1179
|
end
|
|
1333
1180
|
|
|
1181
|
+
ESCAPE_XML_RE = /[&<>"]/
|
|
1182
|
+
ESCAPE_XML_MAP = { "&" => "&amp;", "<" => "&lt;", ">" => "&gt;", '"' => "&quot;" }.freeze
|
|
1183
|
+
private_constant :ESCAPE_XML_RE, :ESCAPE_XML_MAP
|
|
1184
|
+
|
|
1334
1185
|
def escape_xml(text)
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
end
|
|
1186
|
+
# One gsub pass with a Hash replacement allocates a single new
|
|
1187
|
+
# string. The previous chained gsubs allocated three throwaway
|
|
1188
|
+
# strings on every call (very hot for attribute-heavy XML).
|
|
1189
|
+
str = text.is_a?(String) ? text : text.to_s
|
|
1190
|
+
return str unless str.match?(ESCAPE_XML_RE)
|
|
1341
1191
|
|
|
1342
|
-
|
|
1343
|
-
escaped = value.to_s
|
|
1344
|
-
.gsub("&", "&amp;")
|
|
1345
|
-
.gsub("<", "&lt;")
|
|
1346
|
-
.gsub(">", "&gt;")
|
|
1347
|
-
.gsub("\"", "&quot;")
|
|
1348
|
-
escaped.to_s
|
|
1192
|
+
str.gsub(ESCAPE_XML_RE, ESCAPE_XML_MAP)
|
|
1349
1193
|
end
|
|
1350
1194
|
|
|
1351
1195
|
def import_and_add(doc, element, child)
|
|
@@ -1368,7 +1212,7 @@ module Moxml
|
|
|
1368
1212
|
else
|
|
1369
1213
|
# No target document - create a deep copy of the node instead
|
|
1370
1214
|
# This handles the case where the element isn't attached to a document yet
|
|
1371
|
-
copied =
|
|
1215
|
+
copied = deep_duplicate_node(child)
|
|
1372
1216
|
element << copied
|
|
1373
1217
|
end
|
|
1374
1218
|
|
|
@@ -1413,127 +1257,242 @@ module Moxml
|
|
|
1413
1257
|
end
|
|
1414
1258
|
end
|
|
1415
1259
|
|
|
1416
|
-
def serialize_element_with_namespaces(elem, include_ns = true
|
|
1417
|
-
|
|
1260
|
+
def serialize_element_with_namespaces(elem, include_ns = true,
|
|
1261
|
+
indent_size = 0, depth = 0,
|
|
1262
|
+
eref_active: nil)
|
|
1263
|
+
# Cache elem.name — it's a libxml C call we'd otherwise make
|
|
1264
|
+
# twice (open tag + close tag). Concat with `<<` instead of
|
|
1265
|
+
# `"<#{name}"` to avoid the interpolated intermediate string.
|
|
1266
|
+
name = elem.name
|
|
1267
|
+
output = +"<"
|
|
1268
|
+
output << name
|
|
1269
|
+
emit_namespace_definitions(output, elem, include_ns)
|
|
1270
|
+
emit_attributes(output, elem)
|
|
1271
|
+
|
|
1272
|
+
# `eref_active` is precomputed at the top-level `serialize` call
|
|
1273
|
+
# and threaded down — when nil (top-level non-recursive call into
|
|
1274
|
+
# this method), look it up; when false, skip the per-element doc
|
|
1275
|
+
# attachment query that otherwise fires for every element under
|
|
1276
|
+
# Monitor#synchronize.
|
|
1277
|
+
eref_active = doc_eref_active?(elem.doc) if eref_active.nil?
|
|
1278
|
+
entity_refs, child_sequence = eref_active ? lookup_entity_ref_serialization(elem) : [nil, nil]
|
|
1418
1279
|
|
|
1419
|
-
#
|
|
1420
|
-
|
|
1421
|
-
# - On child elements, output namespace definitions that override parent namespaces
|
|
1422
|
-
if elem.is_a?(::LibXML::XML::Node) && elem.namespaces.respond_to?(:definitions)
|
|
1423
|
-
# Get parent's namespace definitions to detect overrides
|
|
1424
|
-
parent_ns_defs = if !include_ns && elem.parent && !elem.parent.is_a?(::LibXML::XML::Document)
|
|
1425
|
-
parent_namespaces = {}
|
|
1426
|
-
if elem.parent.is_a?(::LibXML::XML::Node)
|
|
1427
|
-
elem.parent.namespaces.each do |ns|
|
|
1428
|
-
parent_namespaces[ns.prefix] = ns.href
|
|
1429
|
-
end
|
|
1430
|
-
end
|
|
1431
|
-
parent_namespaces
|
|
1432
|
-
else
|
|
1433
|
-
{}
|
|
1434
|
-
end
|
|
1280
|
+
# Always use verbose format <tag></tag> for consistency with other adapters
|
|
1281
|
+
output << ">"
|
|
1435
1282
|
|
|
1436
|
-
|
|
1437
|
-
elem
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1283
|
+
if entity_refs && child_sequence
|
|
1284
|
+
emit_eref_interleaved_children(output, elem, entity_refs, child_sequence,
|
|
1285
|
+
indent_size, depth, eref_active: eref_active)
|
|
1286
|
+
elsif elem.children?
|
|
1287
|
+
emit_children_with_layout(output, elem, indent_size, depth,
|
|
1288
|
+
eref_active: eref_active)
|
|
1289
|
+
end
|
|
1441
1290
|
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
should_output = include_ns ||
|
|
1446
|
-
(parent_ns_defs.key?(prefix) && parent_ns_defs[prefix] != uri)
|
|
1291
|
+
output << "</" << name << ">"
|
|
1292
|
+
output
|
|
1293
|
+
end
|
|
1447
1294
|
|
|
1448
|
-
|
|
1295
|
+
def doc_eref_active?(doc)
|
|
1296
|
+
entity_ref_registry(doc).active?
|
|
1297
|
+
end
|
|
1449
1298
|
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1299
|
+
def entity_ref_registry(doc)
|
|
1300
|
+
EntityRefRegistry.new(attachments, doc)
|
|
1301
|
+
end
|
|
1302
|
+
|
|
1303
|
+
# Emit `xmlns`/`xmlns:foo` declarations onto `output`. On the root
|
|
1304
|
+
# (`include_ns: true`) we emit ALL definitions; on children we
|
|
1305
|
+
# emit only definitions that OVERRIDE a parent's same-prefix URI.
|
|
1306
|
+
# Skips the whole block when the element has no local definitions,
|
|
1307
|
+
# which is the common case for child elements in unnamespaced docs.
|
|
1308
|
+
def emit_namespace_definitions(output, elem, include_ns)
|
|
1309
|
+
return unless elem.is_a?(::LibXML::XML::Node)
|
|
1310
|
+
|
|
1311
|
+
ns_list = elem.namespaces
|
|
1312
|
+
return unless ns_list.respond_to?(:definitions)
|
|
1313
|
+
|
|
1314
|
+
definitions = ns_list.definitions
|
|
1315
|
+
return if definitions.empty?
|
|
1316
|
+
|
|
1317
|
+
parent_ns_defs = include_ns ? nil : parent_namespace_defs(elem)
|
|
1318
|
+
seen_ns = nil
|
|
1319
|
+
|
|
1320
|
+
definitions.each do |ns|
|
|
1321
|
+
prefix = ns.prefix
|
|
1322
|
+
uri = ns.href
|
|
1323
|
+
next unless include_ns ||
|
|
1324
|
+
(parent_ns_defs&.key?(prefix) && parent_ns_defs[prefix] != uri)
|
|
1325
|
+
|
|
1326
|
+
seen_ns ||= {}
|
|
1327
|
+
next if seen_ns.key?(prefix)
|
|
1328
|
+
|
|
1329
|
+
seen_ns[prefix] = true
|
|
1330
|
+
output << format_ns_declaration(prefix, uri)
|
|
1457
1331
|
end
|
|
1332
|
+
end
|
|
1458
1333
|
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
next if attr.name.start_with?("xmlns")
|
|
1334
|
+
def parent_namespace_defs(elem)
|
|
1335
|
+
parent = elem.parent
|
|
1336
|
+
return nil unless parent.is_a?(::LibXML::XML::Node)
|
|
1463
1337
|
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1338
|
+
defs = {}
|
|
1339
|
+
parent.namespaces.each { |ns| defs[ns.prefix] = ns.href }
|
|
1340
|
+
defs
|
|
1341
|
+
end
|
|
1342
|
+
|
|
1343
|
+
def format_ns_declaration(prefix, uri)
|
|
1344
|
+
if prefix.nil? || prefix.empty?
|
|
1345
|
+
" xmlns=\"#{escape_xml(uri)}\""
|
|
1346
|
+
else
|
|
1347
|
+
" xmlns:#{prefix}=\"#{escape_xml(uri)}\""
|
|
1472
1348
|
end
|
|
1349
|
+
end
|
|
1473
1350
|
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
doc = elem.doc
|
|
1477
|
-
entity_refs = doc ? lookup_entity_refs(doc, elem) : nil
|
|
1478
|
-
child_sequence = doc ? lookup_child_sequence(doc, elem) : nil
|
|
1351
|
+
def emit_attributes(output, elem)
|
|
1352
|
+
return unless elem.attributes?
|
|
1479
1353
|
|
|
1480
|
-
|
|
1481
|
-
|
|
1354
|
+
elem.each_attr do |attr|
|
|
1355
|
+
next if attr.name.start_with?("xmlns")
|
|
1482
1356
|
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
elem.each_child do |c|
|
|
1488
|
-
native_children << c unless c.text? && c.content.to_s.strip.empty?
|
|
1489
|
-
end
|
|
1490
|
-
end
|
|
1357
|
+
attr_name = attr.ns&.prefix ? "#{attr.ns.prefix}:#{attr.name}" : attr.name
|
|
1358
|
+
output << " #{attr_name}=\"#{escape_xml(attr.value)}\""
|
|
1359
|
+
end
|
|
1360
|
+
end
|
|
1491
1361
|
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
|
|
1512
|
-
|
|
1362
|
+
# Returns [entity_refs, child_sequence] when the element has
|
|
1363
|
+
# interleaved entity references that the serializer needs to
|
|
1364
|
+
# weave back into the native child stream — otherwise [nil, nil].
|
|
1365
|
+
#
|
|
1366
|
+
# The caller is responsible for gating this with `eref_active`
|
|
1367
|
+
# (precomputed once per `serialize` call). When `eref_active` is
|
|
1368
|
+
# false this method is never entered, so the per-element doc
|
|
1369
|
+
# attachment query never fires.
|
|
1370
|
+
def lookup_entity_ref_serialization(elem)
|
|
1371
|
+
doc = elem.doc
|
|
1372
|
+
return [nil, nil] unless doc
|
|
1373
|
+
|
|
1374
|
+
entity_ref_registry(doc).serialization_for(elem)
|
|
1375
|
+
end
|
|
1376
|
+
|
|
1377
|
+
def emit_eref_interleaved_children(output, elem, entity_refs, child_sequence,
|
|
1378
|
+
indent_size, depth, eref_active:)
|
|
1379
|
+
native_children = collect_non_blank_children(elem)
|
|
1380
|
+
child_pad = indent_size.positive? ? " " * (indent_size * (depth + 1)) : nil
|
|
1381
|
+
eref_idx = 0
|
|
1382
|
+
native_idx = 0
|
|
1383
|
+
prev_block = true
|
|
1384
|
+
|
|
1385
|
+
child_sequence.each do |type|
|
|
1386
|
+
case type
|
|
1387
|
+
when :native
|
|
1388
|
+
if native_idx < native_children.size
|
|
1389
|
+
child = native_children[native_idx]
|
|
1390
|
+
is_text_like = child.text? || child.cdata?
|
|
1391
|
+
if prev_block && !is_text_like
|
|
1392
|
+
output << "\n"
|
|
1393
|
+
output << child_pad if child_pad
|
|
1513
1394
|
end
|
|
1395
|
+
prev_block = !is_text_like
|
|
1396
|
+
|
|
1397
|
+
output << serialize_child_to_xml(
|
|
1398
|
+
child, indent_size: indent_size, depth: depth,
|
|
1399
|
+
eref_active: eref_active
|
|
1400
|
+
)
|
|
1401
|
+
native_idx += 1
|
|
1402
|
+
end
|
|
1403
|
+
when :eref
|
|
1404
|
+
if eref_idx < entity_refs.size
|
|
1405
|
+
output << entity_refs[eref_idx].to_xml
|
|
1406
|
+
eref_idx += 1
|
|
1407
|
+
prev_block = false
|
|
1514
1408
|
end
|
|
1515
1409
|
end
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
|
|
1410
|
+
end
|
|
1411
|
+
end
|
|
1412
|
+
|
|
1413
|
+
# Regex used in place of `content.to_s.strip.empty?` for whitespace-only
|
|
1414
|
+
# text detection — `match?` allocates nothing while `.strip` makes a
|
|
1415
|
+
# throwaway copy of every text node's content on each visit.
|
|
1416
|
+
NON_WHITESPACE_RE = /\S/
|
|
1417
|
+
private_constant :NON_WHITESPACE_RE
|
|
1418
|
+
|
|
1419
|
+
def blank_text_node?(child)
|
|
1420
|
+
child.text? && blank_content?(child.content)
|
|
1421
|
+
end
|
|
1422
|
+
|
|
1423
|
+
def blank_content?(content)
|
|
1424
|
+
content.nil? || !content.match?(NON_WHITESPACE_RE)
|
|
1425
|
+
end
|
|
1426
|
+
|
|
1427
|
+
def collect_non_blank_children(elem)
|
|
1428
|
+
children = []
|
|
1429
|
+
return children unless elem.children?
|
|
1430
|
+
|
|
1431
|
+
elem.each_child do |c|
|
|
1432
|
+
children << c unless blank_text_node?(c)
|
|
1433
|
+
end
|
|
1434
|
+
children
|
|
1435
|
+
end
|
|
1436
|
+
|
|
1437
|
+
# Walk native children once and emit them with the same newline +
|
|
1438
|
+
# indentation layout the old `add_newlines_to_xml` + `indent_xml`
|
|
1439
|
+
# post-passes produced — but in a single recursion with no string
|
|
1440
|
+
# rescanning.
|
|
1441
|
+
#
|
|
1442
|
+
# Newline rule (matching `>(?=<(?!/))` with CDATA-placeholder
|
|
1443
|
+
# protection): emit `\n` + per-level padding before a child iff
|
|
1444
|
+
# the previous emitted sibling was block-level (ended with `>`)
|
|
1445
|
+
# AND the current sibling is block-level. Text and CDATA count
|
|
1446
|
+
# as text-like and suppress the newline on both sides (the
|
|
1447
|
+
# original CDATA placeholder broke the `>...<` adjacency
|
|
1448
|
+
# symmetrically).
|
|
1449
|
+
def emit_children_with_layout(output, elem, indent_size, depth,
|
|
1450
|
+
eref_active:)
|
|
1451
|
+
child_pad = indent_size.positive? ? " " * (indent_size * (depth + 1)) : nil
|
|
1452
|
+
prev_block = true
|
|
1453
|
+
|
|
1454
|
+
elem.each_child do |child|
|
|
1455
|
+
# Cache text? — used twice per child (whitespace skip + is_text_like).
|
|
1456
|
+
# For element children (the common case) both calls return false, so
|
|
1457
|
+
# caching saves a libxml C call.
|
|
1458
|
+
is_text = child.text?
|
|
1459
|
+
next if is_text && blank_content?(child.content)
|
|
1460
|
+
|
|
1461
|
+
is_text_like = is_text || child.cdata?
|
|
1462
|
+
if prev_block && !is_text_like
|
|
1463
|
+
output << "\n"
|
|
1464
|
+
output << child_pad if child_pad
|
|
1532
1465
|
end
|
|
1466
|
+
prev_block = !is_text_like
|
|
1467
|
+
|
|
1468
|
+
output << serialize_child_to_xml(child, indent_size: indent_size, depth: depth,
|
|
1469
|
+
eref_active: eref_active)
|
|
1533
1470
|
end
|
|
1534
|
-
|
|
1471
|
+
end
|
|
1535
1472
|
|
|
1536
|
-
|
|
1473
|
+
# Serialize one child node. Elements recurse into the layout-aware
|
|
1474
|
+
# path; non-element wrappers route through their own `to_xml`;
|
|
1475
|
+
# everything else falls through to the per-type serializer.
|
|
1476
|
+
# `indent_size:` and `depth:` are required to force callers to
|
|
1477
|
+
# decide whether the child should inherit the parent's indent
|
|
1478
|
+
# state — the entity-ref interleave path deliberately passes 0/0.
|
|
1479
|
+
#
|
|
1480
|
+
# Element fast-path checked first to avoid allocating a wrapper
|
|
1481
|
+
# we'd immediately throw away (elements always recurse on the
|
|
1482
|
+
# raw native node, not the wrapper). For a typical document this
|
|
1483
|
+
# skips wrapper allocation for the majority of children.
|
|
1484
|
+
# @param child [Object] native child node to serialize
# @param indent_size [Integer] indent width passed on to element serialization
# @param depth [Integer] depth of the parent; elements are serialized at depth + 1
# @param eref_active [Object] forwarded unchanged to element serialization
# @return [Object] result of the serializer chosen for +child+
def serialize_child_to_xml(child, indent_size:, depth:, eref_active:)
  unless child.element?
    # Only non-element children are wrapped; the wrapper decides whether
    # its own `to_xml` is used or the per-type serializer takes over.
    wrapper = patch_node(child)
    return wrapper.to_xml if wrapper.is_a?(CustomizedLibxml::Node)

    return serialize_node(child)
  end

  # Elements recurse on the raw native node, one level deeper.
  serialize_element_with_namespaces(child, false, indent_size, depth + 1,
                                    eref_active: eref_active)
end
|
|
1538
1497
|
|
|
1539
1498
|
def remove_indentation(xml_string)
|
|
@@ -1631,6 +1590,63 @@ module Moxml
|
|
|
1631
1590
|
end
|
|
1632
1591
|
nil
|
|
1633
1592
|
end
|
|
1593
|
+
|
|
1594
|
+
# Deep duplication for the rare `import_and_add` fallback (when
|
|
1595
|
+
# libxml refuses to move a subtree across documents AND no target
|
|
1596
|
+
# document is available). Walks the source subtree and rebuilds
|
|
1597
|
+
# it as document-independent nodes. The DocumentBuilder hot path
|
|
1598
|
+
# goes through the shallow `duplicate_node` instead.
|
|
1599
|
+
# Recursively copies +node+. Non-element nodes are delegated to
# `duplicate_node`; elements are rebuilt via `shallow_duplicate_element`
# and then receive a deep copy of every child that is not a blank text
# node.
#
# @param node [Object, nil] wrapped or native node to copy
# @return [Object, nil] the copy, or nil when +node+ is nil/false
def deep_duplicate_node(node)
  return unless node

  source = unpatch_node(node)
  return duplicate_node(node) if node_type(node) != :element

  copy = shallow_duplicate_element(source)

  # Children are only walked for genuine libxml nodes that report having any.
  if source.is_a?(::LibXML::XML::Node) && source.children?
    source.each_child do |kid|
      copy << deep_duplicate_node(kid) unless blank_text_node?(kid)
    end
  end

  copy
end
|
|
1616
|
+
|
|
1617
|
+
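# Copies a single element: its name, the namespace declarations yielded
# by `namespaces.each` on the source (libxml-ruby documents these as the
# namespaces in context for the node, not only those declared on it), the
# element's own namespace binding, and its attributes. Children are NOT
# duplicated — callers that need the subtree use deep_duplicate_node.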
|
|
1620
|
+
# @param native_node [Object] source element; must respond to `name` and `attributes?`
# @return [::LibXML::XML::Node] new node carrying the copied namespaces and attributes
def shallow_duplicate_element(native_node)
  ::LibXML::XML::Node.new(native_node.name).tap do |copy|
    # Namespaces are copied before attributes, and only for real libxml nodes.
    copy_element_namespaces(native_node, copy) if native_node.is_a?(::LibXML::XML::Node)
    copy_element_attributes(native_node, copy) if native_node.attributes?
  end
end
|
|
1626
|
+
|
|
1627
|
+
# Re-declares on +dst+ every namespace yielded by +src.namespaces+, then
# points +dst+ at the freshly created declaration whose prefix and href
# equal those of +src+'s own namespace (if it has one).
#
# NOTE(review): libxml-ruby documents `Namespaces#each` as iterating the
# namespaces in context for the node, so inherited declarations are
# presumably copied as well — confirm that is intended.
#
# @param src [Object] source node; must respond to `namespaces`
# @param dst [Object] destination node receiving the declarations
def copy_element_namespaces(src, dst)
  source_namespaces = src.namespaces
  source_namespaces.each { |decl| ::LibXML::XML::Namespace.new(dst, decl.prefix, decl.href) }

  bound = source_namespaces.namespace
  return unless bound

  # The binding must reference a namespace object created for dst above,
  # so look it up by prefix + href and stop at the first match.
  dst.namespaces.each do |candidate|
    if candidate.prefix == bound.prefix && candidate.href == bound.href
      dst.namespaces.namespace = candidate
      break
    end
  end
end
|
|
1643
|
+
|
|
1644
|
+
# Copies every attribute yielded by +src.each_attr+ onto +dst+ via
# `dst[name] = value`. An attribute whose namespace reports a prefix is
# written under its qualified name ("prefix:name"); all others are
# written under their plain name.
#
# @param src [#each_attr] source node yielding attribute objects that
#   respond to `ns`, `name` and `value`
# @param dst [#[]=] destination node receiving the attributes
# @return [Object] whatever +src.each_attr+ returns
def copy_element_attributes(src, dst)
  src.each_attr do |attribute|
    # Resolve the namespace prefix once per attribute; the previous form
    # called `ns` (and `prefix`) a second time for every prefixed attribute.
    prefix = attribute.ns&.prefix
    qualified_name = prefix ? "#{prefix}:#{attribute.name}" : attribute.name
    dst[qualified_name] = attribute.value
  end
end
|
|
1634
1650
|
end
|
|
1635
1651
|
|
|
1636
1652
|
# Bridge between LibXML SAX and Moxml SAX
|
|
@@ -1706,3 +1722,6 @@ module Moxml
|
|
|
1706
1722
|
end
|
|
1707
1723
|
end
|
|
1708
1724
|
end
|
|
1725
|
+
|
|
1726
|
+
require_relative "libxml/entity_ref_registry"
|
|
1727
|
+
require_relative "libxml/entity_restorer"
|