moxml 0.1.15 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -0
  3. data/.rubocop_todo.yml +49 -133
  4. data/README.adoc +18 -0
  5. data/Rakefile +31 -0
  6. data/benchmarks/generate_report.rb +1 -1
  7. data/lib/moxml/adapter/base.rb +79 -8
  8. data/lib/moxml/adapter/customized_libxml/declaration.rb +1 -1
  9. data/lib/moxml/adapter/customized_rexml/formatter.rb +42 -20
  10. data/lib/moxml/adapter/headed_ox.rb +30 -12
  11. data/lib/moxml/adapter/libxml.rb +181 -68
  12. data/lib/moxml/adapter/nokogiri.rb +33 -11
  13. data/lib/moxml/adapter/oga.rb +51 -96
  14. data/lib/moxml/adapter/ox.rb +79 -21
  15. data/lib/moxml/adapter/rexml.rb +64 -11
  16. data/lib/moxml/attribute.rb +7 -1
  17. data/lib/moxml/builder.rb +77 -24
  18. data/lib/moxml/config.rb +18 -1
  19. data/lib/moxml/declaration.rb +4 -2
  20. data/lib/moxml/document.rb +5 -2
  21. data/lib/moxml/document_builder.rb +9 -8
  22. data/lib/moxml/element.rb +22 -13
  23. data/lib/moxml/entity_registry.rb +16 -2
  24. data/lib/moxml/native_attachment.rb +65 -0
  25. data/lib/moxml/node.rb +21 -50
  26. data/lib/moxml/node_set.rb +1 -1
  27. data/lib/moxml/text.rb +6 -0
  28. data/lib/moxml/version.rb +1 -1
  29. data/lib/moxml/xpath/compiler.rb +44 -22
  30. data/lib/moxml/xpath/parser.rb +12 -7
  31. data/lib/moxml.rb +1 -0
  32. data/scripts/format_xml.rb +16 -0
  33. data/scripts/pretty_format_xml.rb +14 -0
  34. data/spec/consistency/round_trip_spec.rb +3 -30
  35. data/spec/integration/all_adapters_spec.rb +2 -0
  36. data/spec/integration/headed_ox_integration_spec.rb +0 -2
  37. data/spec/integration/shared_examples/edge_cases.rb +3 -9
  38. data/spec/integration/shared_examples/entity_reference_whitespace.rb +122 -0
  39. data/spec/integration/shared_examples/integration_workflows.rb +3 -3
  40. data/spec/integration/shared_examples/node_wrappers/cdata_behavior.rb +0 -7
  41. data/spec/integration/shared_examples/node_wrappers/entity_reference_behavior.rb +224 -0
  42. data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +135 -0
  43. data/spec/integration/shared_examples/node_wrappers/node_behavior.rb +0 -3
  44. data/spec/moxml/adapter/entity_restoration_spec.rb +97 -0
  45. data/spec/moxml/adapter/headed_ox_spec.rb +8 -8
  46. data/spec/moxml/builder_spec.rb +249 -0
  47. data/spec/moxml/entity_preservation_spec.rb +130 -0
  48. data/spec/moxml/entity_reference_spec.rb +114 -0
  49. data/spec/moxml/entity_registry_spec.rb +68 -0
  50. data/spec/moxml/xpath/axes_spec.rb +0 -1
  51. data/spec/moxml/xpath/compiler_spec.rb +0 -2
  52. data/spec/moxml/xpath/functions/position_functions_spec.rb +5 -5
  53. data/spec/moxml/xpath/functions/special_functions_spec.rb +1 -1
  54. data/spec/performance/memory_usage_spec.rb +0 -4
  55. metadata +10 -2
@@ -30,8 +30,9 @@ module Moxml
30
30
  # ~176K allocations per 100-element parse). Lazy parse defers wrapper
31
31
  # creation until nodes are accessed, matching Ox adapter behavior.
32
32
  def parse(xml, options = {}, _context = nil)
33
+ processed_xml = preprocess_entities(xml)
33
34
  native_doc = begin
34
- result = ::Ox.parse(xml)
35
+ result = ::Ox.parse(processed_xml)
35
36
 
36
37
  # result can be either Document or Element
37
38
  if result.is_a?(::Ox::Document)
@@ -60,10 +61,10 @@ module Moxml
60
61
  #
61
62
  # This overrides the Ox adapter's xpath method which uses locate().
62
63
  #
63
- # @param [Moxml::Node] node Starting node (wrapped Moxml node)
64
+ # @param node Starting node (native or wrapped)
64
65
  # @param [String] expression XPath expression
65
66
  # @param [Hash] namespaces Namespace prefix mappings
66
- # @return [Moxml::NodeSet, Object] Query results
67
+ # @return [Array, Object] Native node array or scalar value
67
68
  def xpath(node, expression, namespaces = {})
68
69
  # If we receive a native node, wrap it first
69
70
  # Document#xpath passes @native, but our compiled XPath needs Moxml nodes
@@ -85,16 +86,33 @@ module Moxml
85
86
  # Execute on the node (now guaranteed to be wrapped Moxml node)
86
87
  result = proc.call(node)
87
88
 
88
- # Wrap Array results in NodeSet, return other types directly
89
+ # Return native arrays for Node#xpath to wrap, scalars directly.
90
+ # The adapter contract: xpath() returns Array<native> | scalar.
89
91
  case result
90
92
  when Array
91
- # Deduplicate by native object identity to handle descendant-or-self
92
- # which may yield the same native node multiple times
93
- nodeset = NodeSet.new(result, node.context)
94
- nodeset.uniq_by_native
93
+ # XPath engine returns wrapped Moxml::Node objects.
94
+ # Extract native nodes and deduplicate by object identity.
95
+ native_nodes = result.map { |n| n.is_a?(Moxml::Node) ? n.native : n }
96
+ seen = {}
97
+ native_nodes.select do |native|
98
+ id = native.object_id
99
+ if seen[id]
100
+ false
101
+ else
102
+ seen[id] = true
103
+ end
104
+ end
95
105
  when NodeSet
96
- # Deduplicate NodeSet results as well
97
- result.uniq_by_native
106
+ # NodeSet from intermediate evaluation - extract natives and deduplicate
107
+ seen = {}
108
+ result.to_a.map(&:native).select do |native|
109
+ id = native.object_id
110
+ if seen[id]
111
+ false
112
+ else
113
+ seen[id] = true
114
+ end
115
+ end
98
116
  else
99
117
  # Scalar values (string, number, boolean) - return as-is
100
118
  result
@@ -113,10 +131,10 @@ module Moxml
113
131
  # @param [Moxml::Node] node Starting node
114
132
  # @param [String] expression XPath expression
115
133
  # @param [Hash] namespaces Namespace prefix mappings
116
- # @return [Moxml::Node, Object, nil] First result or nil
134
+ # @return [Object, nil] First native node or scalar value
117
135
  def at_xpath(node, expression, namespaces = {})
118
136
  result = xpath(node, expression, namespaces)
119
- result.is_a?(NodeSet) ? result.first : result
137
+ result.is_a?(Array) ? result.first : result
120
138
  end
121
139
 
122
140
  # Check if XPath is supported
@@ -38,6 +38,10 @@ module Moxml
38
38
  end
39
39
 
40
40
  class << self
41
+ def attachments
42
+ @attachments ||= Moxml::NativeAttachment.new
43
+ end
44
+
41
45
  def set_root(doc, element)
42
46
  doc.root = element
43
47
  end
@@ -52,6 +56,11 @@ module Moxml
52
56
  xml.to_s
53
57
  end
54
58
 
59
+ # Preprocess entities before parsing.
60
+ # This converts the string to UTF-8; LibXML will use the encoding
61
+ # parameter or XML declaration for byte interpretation.
62
+ xml_string = preprocess_entities(xml_string)
63
+
55
64
  # Extract DOCTYPE before parsing
56
65
  doctype_match = xml_string.match(/<!DOCTYPE\s+(\S+)(?:\s+PUBLIC\s+"([^"]+)"\s+"([^"]+)"| \s+SYSTEM\s+"([^"]+)")?\s*>/i)
57
66
 
@@ -85,7 +94,7 @@ module Moxml
85
94
  external_id,
86
95
  system_id,
87
96
  )
88
- native_doc.instance_variable_set(:@moxml_doctype, doctype_wrapper)
97
+ attachments.set(native_doc, :doctype, doctype_wrapper)
89
98
  end
90
99
 
91
100
  ctx = _context || Context.new(:libxml)
@@ -273,10 +282,8 @@ module Moxml
273
282
  result = []
274
283
 
275
284
  # Include DOCTYPE if present
276
- if native_node.instance_variable_defined?(:@moxml_doctype)
277
- doctype_wrapper = native_node.instance_variable_get(:@moxml_doctype)
278
- result << doctype_wrapper if doctype_wrapper
279
- end
285
+ doctype_wrapper = attachments.get(native_node, :doctype)
286
+ result << doctype_wrapper if doctype_wrapper
280
287
 
281
288
  return result unless native_node.root
282
289
 
@@ -284,18 +291,19 @@ module Moxml
284
291
  return result
285
292
  end
286
293
 
287
- return [] unless native_node.children?
288
-
289
294
  result = []
290
- native_node.each_child do |child|
291
- # Skip whitespace-only text nodes
292
- next if child.text? && child.content.to_s.strip.empty?
295
+ if native_node.children?
296
+ native_node.each_child do |child|
297
+ # Skip whitespace-only text nodes
298
+ next if child.text? && child.content.to_s.strip.empty?
293
299
 
294
- result << patch_node(child)
300
+ result << patch_node(child)
301
+ end
295
302
  end
296
303
 
297
- # Include any EntityReference wrappers stored alongside native children
298
- entity_refs = native_node.instance_variable_get(:@moxml_entity_refs)
304
+ # Include any EntityReference wrappers stored on the document
305
+ doc = native_node.doc
306
+ entity_refs = doc ? lookup_entity_refs(doc, native_node) : nil
299
307
  result.concat(entity_refs) if entity_refs
300
308
 
301
309
  result
@@ -499,11 +507,13 @@ module Moxml
499
507
  native_child = unpatch_node(child)
500
508
 
501
509
  # EntityReference wrappers can't go in LibXML's native tree.
502
- # Store alongside native children via instance variable.
510
+ # Store on the document (stable identity) keyed by element.
511
+ # LibXML creates new Ruby wrappers on each access, so element
512
+ # object_id is unstable — we look up via == comparison.
503
513
  if child.is_a?(CustomizedLibxml::EntityReference)
504
- refs = native_elem.instance_variable_get(:@moxml_entity_refs) || []
505
- refs << child
506
- native_elem.instance_variable_set(:@moxml_entity_refs, refs)
514
+ doc = native_elem.is_a?(::LibXML::XML::Document) ? native_elem : native_elem.doc
515
+ store_entity_ref_on_doc(doc, native_elem, child)
516
+ append_child_sequence_on_doc(doc, native_elem, :eref)
507
517
  return
508
518
  end
509
519
 
@@ -524,32 +534,32 @@ module Moxml
524
534
  if native_elem.is_a?(::LibXML::XML::Document)
525
535
  # For Declaration wrappers, store them for serialization
526
536
  if child.is_a?(CustomizedLibxml::Declaration)
527
- native_elem.instance_variable_set(:@moxml_declaration, child)
537
+ attachments.set(native_elem, :declaration, child)
528
538
  # Also store reference to parent document in the declaration
529
- child.instance_variable_set(:@parent_doc, native_elem)
539
+ child.parent_doc = native_elem
530
540
  return
531
541
  end
532
542
 
533
543
  # For DOCTYPE wrappers, store them for serialization
534
544
  if child.is_a?(DoctypeWrapper)
535
- native_elem.instance_variable_set(:@moxml_doctype, child)
545
+ attachments.set(native_elem, :doctype, child)
536
546
  return
537
547
  end
538
548
 
539
549
  # For document-level PIs, store them for serialization
540
550
  if child.is_a?(CustomizedLibxml::ProcessingInstruction)
541
- pis = native_elem.instance_variable_get(:@moxml_pis) || []
551
+ pis = attachments.get(native_elem, :pis) || []
542
552
  pis << child
543
- native_elem.instance_variable_set(:@moxml_pis, pis)
553
+ attachments.set(native_elem, :pis, pis)
544
554
  return
545
555
  end
546
556
 
547
557
  # For text nodes added to document, store them for serialization
548
558
  # Documents can't have text children in LibXML
549
559
  if child.is_a?(CustomizedLibxml::Text)
550
- texts = native_elem.instance_variable_get(:@moxml_texts) || []
560
+ texts = attachments.get(native_elem, :texts) || []
551
561
  texts << child
552
- native_elem.instance_variable_set(:@moxml_texts, texts)
562
+ attachments.set(native_elem, :texts, texts)
553
563
  return
554
564
  end
555
565
 
@@ -557,13 +567,64 @@ module Moxml
557
567
  if native_elem.root.nil? && node_type(native_child) == :element
558
568
  # Set as root element
559
569
  native_elem.root = native_child
570
+ # Flag for actual_native to refresh the wrapper's native reference
571
+ attachments.set(native_elem, :_pending_root_refresh, native_child.object_id)
560
572
  elsif native_elem.root
561
573
  # Document has root, add to it instead
562
574
  import_and_add(native_elem.doc, native_elem.root, native_child)
563
575
  end
564
576
  else
565
577
  import_and_add(native_elem.doc, native_elem, native_child)
578
+ doc = native_elem.doc || native_elem
579
+ append_child_sequence_on_doc(doc, native_elem, :native)
580
+ end
581
+ end
582
+
583
+ # Store entity ref on the document (stable identity).
584
+ # LibXML element wrappers are ephemeral, so we use == to find matching elements.
585
+ def store_entity_ref_on_doc(doc, element, ref)
586
+ pairs = attachments.get(doc, :_entity_ref_pairs) || []
587
+ pair = pairs.find { |elem, _| elem == element }
588
+ if pair
589
+ pair[1] << ref
590
+ else
591
+ pairs << [element, [ref]]
566
592
  end
593
+ attachments.set(doc, :_entity_ref_pairs, pairs)
594
+ end
595
+
596
+ # Look up entity refs for an element from the document
597
+ def lookup_entity_refs(doc, element)
598
+ pairs = attachments.get(doc, :_entity_ref_pairs)
599
+ return nil unless pairs
600
+ pair = pairs.find { |elem, _| elem == element }
601
+ pair&.last
602
+ end
603
+
604
+ # Track child order on the document (stable identity)
605
+ def append_child_sequence_on_doc(doc, element, type)
606
+ pairs = attachments.get(doc, :_child_seq_pairs) || []
607
+ pair = pairs.find { |elem, _| elem == element }
608
+ if pair
609
+ pair[1] << type
610
+ else
611
+ pairs << [element, [type]]
612
+ end
613
+ attachments.set(doc, :_child_seq_pairs, pairs)
614
+ end
615
+
616
+ # Look up child sequence for an element from the document
617
+ def lookup_child_sequence(doc, element)
618
+ pairs = attachments.get(doc, :_child_seq_pairs)
619
+ return nil unless pairs
620
+ pair = pairs.find { |elem, _| elem == element }
621
+ pair&.last
622
+ end
623
+
624
+ def append_child_sequence(element, type)
625
+ seq = attachments.get(element, :child_sequence) || []
626
+ seq << type
627
+ attachments.set(element, :child_sequence, seq)
567
628
  end
568
629
 
569
630
  def add_previous_sibling(node, sibling)
@@ -577,9 +638,9 @@ module Moxml
577
638
  if sibling.is_a?(CustomizedLibxml::ProcessingInstruction) &&
578
639
  native_node.is_a?(::LibXML::XML::Node) && native_node.doc
579
640
  doc = native_node.doc
580
- pis = doc.instance_variable_get(:@moxml_pis) || []
641
+ pis = attachments.get(doc, :pis) || []
581
642
  pis << sibling
582
- doc.instance_variable_set(:@moxml_pis, pis)
643
+ attachments.set(doc, :pis, pis)
583
644
  return
584
645
  end
585
646
 
@@ -597,16 +658,7 @@ module Moxml
597
658
  def remove(node)
598
659
  # Handle Declaration wrapper - mark as removed on document
599
660
  if node.is_a?(CustomizedLibxml::Declaration)
600
- # The Declaration wrapper is stored on the actual document
601
- # We need to find which document it's stored on and mark it as removed
602
- # This is a bit tricky since the Declaration's native is its own internal doc
603
- # We rely on the fact that when a declaration is added to a document,
604
- # the document stores a reference to it in @moxml_declaration
605
- # So we need to clear that reference and mark it as removed
606
-
607
- # Since we can't easily find the parent document from the Declaration,
608
- # we'll set a flag on the Declaration itself
609
- node.instance_variable_set(:@removed, true)
661
+ node.removed = true
610
662
  return
611
663
  end
612
664
 
@@ -795,9 +847,17 @@ module Moxml
795
847
  return [] unless native_node
796
848
  return [] unless native_node.is_a?(::LibXML::XML::Node)
797
849
 
798
- native_node.namespaces.map do |ns|
799
- ns
800
- end
850
+ namespaces = native_node.namespaces
851
+ return [] unless namespaces
852
+
853
+ namespace_list =
854
+ if namespaces.respond_to?(:definitions)
855
+ namespaces.definitions
856
+ else
857
+ namespaces
858
+ end
859
+
860
+ namespace_list.to_a
801
861
  end
802
862
 
803
863
  # Doctype accessor methods
@@ -867,12 +927,10 @@ module Moxml
867
927
 
868
928
  if should_include_decl
869
929
  # Check if declaration was explicitly managed
870
- if native_node.instance_variable_defined?(:@moxml_declaration)
871
- decl = native_node.instance_variable_get(:@moxml_declaration)
930
+ decl = attachments.get(native_node, :declaration)
931
+ if decl
872
932
  # Only output declaration if it exists and wasn't removed
873
- if decl && !decl.instance_variable_get(:@removed)
874
- output << decl.to_xml
875
- end
933
+ output << decl.to_xml unless decl.removed
876
934
  else
877
935
  # No declaration stored - create default
878
936
  version = native_node.version || "1.0"
@@ -887,39 +945,33 @@ module Moxml
887
945
  encoding_val,
888
946
  nil, # No standalone by default
889
947
  )
890
- native_node.instance_variable_set(:@moxml_declaration, decl)
948
+ attachments.set(native_node, :declaration, decl)
891
949
  output << decl.to_xml
892
950
  end
893
951
  end
894
952
 
895
953
  # Add DOCTYPE if stored on document
896
- if native_node.instance_variable_defined?(:@moxml_doctype)
897
- doctype_wrapper = native_node.instance_variable_get(:@moxml_doctype)
898
- if doctype_wrapper
899
- output << "\n" unless output.empty?
900
- output << doctype_wrapper.to_xml
901
- end
954
+ doctype_wrapper = attachments.get(native_node, :doctype)
955
+ if doctype_wrapper
956
+ output << "\n" unless output.empty?
957
+ output << doctype_wrapper.to_xml
902
958
  end
903
959
 
904
960
  # Add document-level processing instructions if stored
905
- if native_node.instance_variable_defined?(:@moxml_pis)
906
- pis = native_node.instance_variable_get(:@moxml_pis)
907
- if pis && !pis.empty?
908
- pis.each do |pi|
909
- output << "\n" unless output.empty?
910
- output << pi.to_xml
911
- end
961
+ pis = attachments.get(native_node, :pis)
962
+ if pis && !pis.empty?
963
+ pis.each do |pi|
964
+ output << "\n" unless output.empty?
965
+ output << pi.to_xml
912
966
  end
913
967
  end
914
968
 
915
969
  # Add text nodes if stored (for documents without root)
916
- if native_node.instance_variable_defined?(:@moxml_texts)
917
- texts = native_node.instance_variable_get(:@moxml_texts)
918
- if texts && !texts.empty?
919
- texts.each do |text|
920
- output << "\n" unless output.empty?
921
- output << text.to_xml
922
- end
970
+ texts = attachments.get(native_node, :texts)
971
+ if texts && !texts.empty?
972
+ texts.each do |text|
973
+ output << "\n" unless output.empty?
974
+ output << text.to_xml
923
975
  end
924
976
  end
925
977
 
@@ -1165,6 +1217,28 @@ module Moxml
1165
1217
  duplicate_node(node)
1166
1218
  end
1167
1219
 
1220
+ def has_declaration?(native_doc, wrapper)
1221
+ decl = attachments.get(native_doc, :declaration)
1222
+ if decl
1223
+ !decl.removed
1224
+ else
1225
+ wrapper.has_xml_declaration
1226
+ end
1227
+ end
1228
+
1229
+ # LibXML's doc.root= creates a new Ruby wrapper with different object_id.
1230
+ # Return the actual root node so attachments are stored on the correct object.
1231
+ def actual_native(child_native, parent_native)
1232
+ if parent_native.is_a?(::LibXML::XML::Document)
1233
+ pending = attachments.get(parent_native, :_pending_root_refresh)
1234
+ if pending && pending == child_native.object_id
1235
+ attachments.delete(parent_native, :_pending_root_refresh)
1236
+ return parent_native.root
1237
+ end
1238
+ end
1239
+ child_native
1240
+ end
1241
+
1168
1242
  private
1169
1243
 
1170
1244
  def serialize_element(elem)
@@ -1213,8 +1287,9 @@ module Moxml
1213
1287
  end
1214
1288
  end
1215
1289
 
1216
- # Append any EntityReference wrappers stored on this element
1217
- entity_refs = elem.instance_variable_get(:@moxml_entity_refs)
1290
+ # Append any EntityReference wrappers stored on the document
1291
+ doc = elem.doc
1292
+ entity_refs = doc ? lookup_entity_refs(doc, elem) : nil
1218
1293
  entity_refs&.each { |ref| output << ref.to_xml }
1219
1294
 
1220
1295
  output << "</#{elem.name}>"
@@ -1396,9 +1471,47 @@ module Moxml
1396
1471
  end
1397
1472
  end
1398
1473
 
1474
+ # Check for entity refs stored on the document
1475
+ # LibXML element wrappers are ephemeral, so look up via == comparison
1476
+ doc = elem.doc
1477
+ entity_refs = doc ? lookup_entity_refs(doc, elem) : nil
1478
+ child_sequence = doc ? lookup_child_sequence(doc, elem) : nil
1479
+
1399
1480
  # Always use verbose format <tag></tag> for consistency with other adapters
1400
1481
  output << ">"
1401
- if elem.children?
1482
+
1483
+ if entity_refs && !entity_refs.empty? && child_sequence
1484
+ # Interleave native children with entity refs using tracked sequence
1485
+ native_children = []
1486
+ if elem.children?
1487
+ elem.each_child { |c| native_children << c unless c.text? && c.content.to_s.strip.empty? }
1488
+ end
1489
+
1490
+ eref_idx = 0
1491
+ native_idx = 0
1492
+ child_sequence.each do |type|
1493
+ case type
1494
+ when :native
1495
+ if native_idx < native_children.size
1496
+ child = native_children[native_idx]
1497
+ native_idx += 1
1498
+ wrapped_child = patch_node(child)
1499
+ output << if wrapped_child.is_a?(CustomizedLibxml::Node) && !wrapped_child.is_a?(CustomizedLibxml::Element)
1500
+ wrapped_child.to_xml
1501
+ elsif child.element?
1502
+ serialize_element_with_namespaces(child, false)
1503
+ else
1504
+ serialize_node(child)
1505
+ end
1506
+ end
1507
+ when :eref
1508
+ if eref_idx < entity_refs.size
1509
+ output << entity_refs[eref_idx].to_xml
1510
+ eref_idx += 1
1511
+ end
1512
+ end
1513
+ end
1514
+ elsif elem.children?
1402
1515
  elem.each_child do |child|
1403
1516
  # Skip whitespace-only text nodes
1404
1517
  next if child.text? && child.content.to_s.strip.empty?
@@ -7,19 +7,27 @@ module Moxml
7
7
  module Adapter
8
8
  class Nokogiri < Base
9
9
  class << self
10
+ def attachments
11
+ @attachments ||= Moxml::NativeAttachment.new
12
+ end
13
+
10
14
  def set_root(doc, element)
11
15
  doc.root = element
12
16
  end
13
17
 
14
18
  def parse(xml, options = {}, _context = nil)
19
+ processed_xml = preprocess_entities(xml)
20
+
21
+ # preprocess_entities always returns UTF-8, so tell Nokogiri to
22
+ # parse as UTF-8 regardless of any original encoding option.
15
23
  native_doc = begin
16
24
  if options[:fragment]
17
- ::Nokogiri::XML::DocumentFragment.parse(xml) do |config|
25
+ ::Nokogiri::XML::DocumentFragment.parse(processed_xml) do |config|
18
26
  config.strict.nonet
19
27
  config.recover unless options[:strict]
20
28
  end
21
29
  else
22
- ::Nokogiri::XML(xml, nil, options[:encoding]) do |config|
30
+ ::Nokogiri::XML(processed_xml, nil, "UTF-8") do |config|
23
31
  config.strict.nonet
24
32
  config.recover unless options[:strict]
25
33
  end
@@ -176,10 +184,16 @@ module Moxml
176
184
  def children(node)
177
185
  node.children.reject do |child|
178
186
  child.text? && child.content.strip.empty? &&
179
- !(child.previous_sibling.nil? && child.next_sibling.nil?)
187
+ !(child.previous_sibling.nil? && child.next_sibling.nil?) &&
188
+ !adjacent_to_entity_reference?(child)
180
189
  end
181
190
  end
182
191
 
192
+ def adjacent_to_entity_reference?(node)
193
+ node.previous_sibling.is_a?(::Nokogiri::XML::EntityReference) ||
194
+ node.next_sibling.is_a?(::Nokogiri::XML::EntityReference)
195
+ end
196
+
183
197
  def replace_children(node, new_children)
184
198
  node.children.unlink
185
199
  new_children.each { |child| add_child(node, child) }
@@ -241,8 +255,8 @@ module Moxml
241
255
  encoding = declaration_attribute(child, "encoding")
242
256
  standalone = declaration_attribute(child, "standalone")
243
257
 
244
- # Nokogiri's xml_decl can only be set via instance variable
245
- element.instance_variable_set(:@xml_decl, {
258
+ # Store declaration state in attachment map
259
+ attachments.set(element, :xml_decl, {
246
260
  version: version,
247
261
  encoding: encoding,
248
262
  standalone: standalone,
@@ -273,7 +287,7 @@ module Moxml
273
287
  node.name == "xml" &&
274
288
  node.parent.is_a?(::Nokogiri::XML::Document)
275
289
  # Clear document's xml_decl when removing declaration
276
- node.parent.instance_variable_set(:@xml_decl, nil)
290
+ attachments.set(node.parent, :xml_decl, nil)
277
291
  end
278
292
 
279
293
  node.remove
@@ -284,7 +298,7 @@ module Moxml
284
298
  end
285
299
 
286
300
  def text_content(node)
287
- node.text
301
+ node.text.to_s
288
302
  end
289
303
 
290
304
  def inner_text(node)
@@ -387,12 +401,12 @@ module Moxml
387
401
  # Handle declaration option
388
402
  # Priority:
389
403
  # 1. Explicit no_declaration option
390
- # 2. Check Nokogiri's internal @xml_decl (when remove is called, this becomes nil)
404
+ # 2. Check attachment-stored xml_decl (when remove is called, this becomes nil)
391
405
  if options.key?(:no_declaration)
392
406
  save_options |= ::Nokogiri::XML::Node::SaveOptions::NO_DECLARATION if options[:no_declaration]
393
- elsif node.instance_variable_defined?(:@xml_decl)
394
- # Nokogiri's internal state - if nil, declaration was removed
395
- xml_decl = node.instance_variable_get(:@xml_decl)
407
+ elsif attachments.key?(node, :xml_decl)
408
+ # State stored in attachment - if nil, declaration was removed
409
+ xml_decl = attachments.get(node, :xml_decl)
396
410
  save_options |= ::Nokogiri::XML::Node::SaveOptions::NO_DECLARATION if xml_decl.nil?
397
411
  end
398
412
 
@@ -403,6 +417,14 @@ module Moxml
403
417
  )
404
418
  end
405
419
 
420
+ def has_declaration?(native_doc, wrapper)
421
+ if attachments.key?(native_doc, :xml_decl)
422
+ !attachments.get(native_doc, :xml_decl).nil?
423
+ else
424
+ wrapper.has_xml_declaration
425
+ end
426
+ end
427
+
406
428
  private
407
429
 
408
430
  def build_declaration_attrs(version, encoding, standalone)