moxml 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/docs.yml +1 -1
  3. data/.github/workflows/rake.yml +16 -13
  4. data/.github/workflows/release.yml +1 -0
  5. data/.github/workflows/round-trip.yml +74 -0
  6. data/.gitignore +1 -0
  7. data/.rubocop.yml +1 -0
  8. data/.rubocop_todo.yml +160 -38
  9. data/Gemfile +2 -1
  10. data/README.adoc +287 -20
  11. data/Rakefile +11 -0
  12. data/data/w3c_entities.json +2131 -0
  13. data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
  14. data/docs/_guides/index.adoc +14 -12
  15. data/docs/_guides/node-api-consistency.adoc +572 -0
  16. data/docs/_guides/xml-declaration.adoc +5 -5
  17. data/docs/_pages/adapters/ox.adoc +30 -0
  18. data/docs/_pages/adapters/rexml.adoc +1 -1
  19. data/docs/_pages/configuration.adoc +43 -0
  20. data/docs/_pages/node-api-reference.adoc +128 -3
  21. data/docs/_tutorials/namespace-handling.adoc +21 -0
  22. data/examples/rss_parser/rss_parser.rb +1 -3
  23. data/lib/moxml/adapter/base.rb +26 -2
  24. data/lib/moxml/adapter/headed_ox.rb +5 -4
  25. data/lib/moxml/adapter/libxml.rb +18 -3
  26. data/lib/moxml/adapter/nokogiri.rb +26 -2
  27. data/lib/moxml/adapter/oga.rb +137 -20
  28. data/lib/moxml/adapter/ox.rb +29 -3
  29. data/lib/moxml/adapter/rexml.rb +54 -7
  30. data/lib/moxml/attribute.rb +6 -0
  31. data/lib/moxml/builder.rb +6 -0
  32. data/lib/moxml/config.rb +52 -1
  33. data/lib/moxml/context.rb +21 -2
  34. data/lib/moxml/doctype.rb +33 -0
  35. data/lib/moxml/document.rb +6 -1
  36. data/lib/moxml/document_builder.rb +45 -1
  37. data/lib/moxml/element.rb +10 -3
  38. data/lib/moxml/entity_reference.rb +29 -0
  39. data/lib/moxml/entity_registry.rb +278 -0
  40. data/lib/moxml/error.rb +5 -5
  41. data/lib/moxml/node.rb +22 -8
  42. data/lib/moxml/node_set.rb +10 -6
  43. data/lib/moxml/processing_instruction.rb +6 -0
  44. data/lib/moxml/version.rb +1 -1
  45. data/lib/moxml/xml_utils.rb +25 -2
  46. data/lib/moxml/xpath/errors.rb +1 -1
  47. data/lib/moxml.rb +1 -0
  48. data/spec/consistency/README.md +3 -1
  49. data/spec/consistency/round_trip_spec.rb +479 -0
  50. data/spec/examples/readme_examples_spec.rb +1 -1
  51. data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
  52. data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
  53. data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
  54. data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
  55. data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
  56. data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
  57. data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
  58. data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
  59. data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
  60. data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
  61. data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
  62. data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
  63. data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
  64. data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
  65. data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
  66. data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
  67. data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
  68. data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
  69. data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
  70. data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
  71. data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
  72. data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
  73. data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
  74. data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
  75. data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
  76. data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
  77. data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
  78. data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
  79. data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
  80. data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
  81. data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
  82. data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
  83. data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
  84. data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
  85. data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
  86. data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
  87. data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
  88. data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
  89. data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
  90. data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
  91. data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
  92. data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
  93. data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
  94. data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
  95. data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
  96. data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
  97. data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
  98. data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
  99. data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
  100. data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
  101. data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
  102. data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
  103. data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
  104. data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
  105. data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
  106. data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
  107. data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
  108. data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
  109. data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
  110. data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
  111. data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
  112. data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
  113. data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
  114. data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
  115. data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
  116. data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
  117. data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
  118. data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
  119. data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
  120. data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
  121. data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
  122. data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
  123. data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
  124. data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
  125. data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
  126. data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
  127. data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
  128. data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
  129. data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
  130. data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
  131. data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
  132. data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
  133. data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
  134. data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
  135. data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
  136. data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
  137. data/spec/integration/w3c_namespace_spec.rb +69 -0
  138. data/spec/moxml/adapter/libxml_spec.rb +7 -1
  139. data/spec/moxml/adapter/oga_spec.rb +92 -0
  140. data/spec/moxml/config_spec.rb +75 -0
  141. data/spec/moxml/doctype_spec.rb +19 -3
  142. data/spec/moxml/entity_registry_spec.rb +184 -0
  143. data/spec/moxml/error_spec.rb +2 -2
  144. data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
  145. data/spec/moxml/xpath/axes_spec.rb +3 -4
  146. data/spec/performance/xpath_benchmark_spec.rb +6 -54
  147. data/spec/support/w3c_namespace_helpers.rb +41 -0
  148. data/spec/unit/rexml_isolated_test.rb +271 -0
  149. metadata +99 -3
  150. data/.ruby-version +0 -1
data/README.adoc CHANGED
@@ -26,6 +26,8 @@ Key features:
26
26
 
27
27
  == Supported XML libraries
28
28
 
29
+ === General
30
+
29
31
  Moxml supports the following XML libraries:
30
32
 
31
33
  REXML:: https://github.com/ruby/rexml[REXML], a pure Ruby XML parser
@@ -110,7 +112,9 @@ section.
110
112
  NOTE: HeadedOx provides full XPath 1.0 support via a pure Ruby XPath engine
111
113
  layered on top of Ox's C parser. See HeadedOx documentation for details.
112
114
 
113
- NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element, characters, errors). No separate CDATA, comment, or processing instruction events.
115
+ NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element,
116
+ characters, errors). No separate CDATA, comment, or processing instruction
117
+ events.
114
118
 
115
119
  == Adapter comparison
116
120
 
@@ -356,7 +360,7 @@ NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element, c
356
360
  | ✅ Full
357
361
  | ✅ Full
358
362
  | ✅ Full
359
- | ⚠️ Limited^4^
363
+ | ✅ Full
360
364
  | ✅ Full
361
365
  | ✅ Full
362
366
 
@@ -400,13 +404,12 @@ NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element, c
400
404
  | ✅ Yes
401
405
  | ✅ Yes
402
406
  |===
403
-
407
+ +
404
408
  ^1^ Ox/HeadedOx: Text node replacement may fail in some cases due to internal node structure +
405
409
  ^2^ Ox: `//book[@id]` works (returns all book elements), but doesn't filter by attribute existence +
406
410
  ^3^ HeadedOx: Full XPath 1.0 with all 27 functions and 6 axes. Pure Ruby XPath engine on Ox's C parser. 99.20% pass rate. See link:docs/headed-ox.adoc[] +
407
411
  ^4^ Ox: Use `.find { |el| el["id"] == "123" }` instead of XPath attribute value predicates +
408
- ^5^ LibXML: DOCTYPE parsing works, serialization is limited (no round-trip preservation) +
409
- ^6^ HeadedOx limitations: Namespace introspection and 7 axes not implemented. See link:docs/HEADED_OX_LIMITATIONS.md[]
412
+ ^5^ HeadedOx limitations: Namespace introspection and 7 axes not implemented. See link:docs/HEADED_OX_LIMITATIONS.md[]
410
413
 
411
414
  === Adapter selection guide
412
415
 
@@ -453,10 +456,13 @@ NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element, c
453
456
  * Need more XPath capabilities than standard Ox provides
454
457
  * Memory efficiency is important but XPath features are required
455
458
 
456
- CAUTION: Ox's custom XPath engine supports common patterns but may not handle
459
+ CAUTION: Ox's custom XPath engine supports common patterns but cannot handle
457
460
  complex XPath expressions. Test thoroughly if your use case requires advanced
458
461
  XPath.
459
462
 
463
+ TODO: We should throw errors when unsupported XPath features are used with Ox
464
+ or HeadedOx to prevent silent failures.
465
+
460
466
 
461
467
  == Getting started
462
468
 
@@ -586,7 +592,81 @@ book.add_child(doc.create_comment('Book details'))
586
592
  title = doc.create_element('title')
587
593
  title.text = 'Ruby Programming'
588
594
  book.add_child(title)
595
+
596
+ # Add entity reference (for declared entities)
597
+ book.add_child(doc.create_entity_reference('mdash'))
598
+ ----
599
+
600
+ === Entity References
601
+
602
+ Moxml supports `EntityReference` nodes for preserving entity syntax in XML documents. This enables round-trip preservation of entity references like `&nbsp;`, `&copy;`, and custom entities defined in the DOCTYPE.
603
+
604
+ [source,ruby]
605
+ ----
606
+ # Create entity reference programmatically
607
+ ref = doc.create_entity_reference('nbsp')
608
+ element.add_child(ref)
609
+
610
+ # Or using the builder pattern
611
+ doc = Moxml::Builder.new(Moxml.new).build do
612
+ element 'text' do
613
+ entity_reference 'ndash'
614
+ entity_reference 'copy'
615
+ end
616
+ end
617
+ ----
618
+
619
+ **Parsing and Round-Trip:**
620
+
621
+ When parsing XML with declared entities, Moxml preserves entity references:
622
+
623
+ [source,ruby]
624
+ ----
625
+ # Parse document with custom entity
626
+ xml = <<-XML
627
+ <!DOCTYPE root [<!ENTITY nbsp "&#160;"> ]>
628
+ <root>hello&nbsp;world</root>
629
+ XML
630
+
631
+ doc = Moxml.new(:nokogiri).parse(xml)
632
+ doc.to_xml # => preserves &nbsp; entity reference
633
+ ----
634
+
635
+ **Adapter Notes:**
636
+
637
+ - *Nokogiri*: Preserves custom declared entities as `EntityReference` nodes
638
+ - *Ox, Oga*: These adapters resolve entities during parsing and do not expose entity reference nodes. Use Nokogiri or LibXML for entity preservation.
639
+
640
+ **Entity Loading Configuration:**
641
+
642
+ Moxml provides configurable entity loading with four modes to balance between functionality, performance, and security:
643
+
644
+ [source,ruby]
589
645
  ----
646
+ # Default (:required): Load all W3C entities (HTML + MathML + ISO entity sets)
647
+ # Raises error if entity data is unavailable
648
+ context = Moxml.new
649
+
650
+ # Optional: Load entities if available, silently skip if not
651
+ context = Moxml.new do |config|
652
+ config.entity_load_mode = :optional
653
+ end
654
+
655
+ # Disabled: No entity loading (fastest, for controlled XML sources)
656
+ context = Moxml.new do |config|
657
+ config.entity_load_mode = :disabled
658
+ end
659
+
660
+ # Custom: Load entities from your own source
661
+ context = Moxml.new do |config|
662
+ config.entity_load_mode = :custom
663
+ config.entity_provider = -> { MyEntitySource.all_entities }
664
+ end
665
+ ----
666
+
667
+ The entity data comes from the W3C XML Core WG Character Entities specification (HTMLMathML set), bundled locally in `data/w3c_entities.json` for offline capability. Set the `MOXML_ENTITY_DEFINITIONS_PATH` environment variable to use a custom entity data source.
668
+
669
+ For backward compatibility, `config.load_external_entities = false` maps to `:disabled` mode, and `config.load_external_entities = true` maps to `:required` mode.
590
670
 
591
671
  === Fluent interface API
592
672
 
@@ -643,6 +723,23 @@ For complete SAX documentation including all handler types, event methods, adapt
643
723
 
644
724
  For complete node API reference including traversal methods, manipulation, queries, type checking, and node information, see link:docs/_pages/node-api-reference.adoc[Node API Reference].
645
725
 
726
+ === Node identity
727
+
728
+ Moxml provides a consistent `#identifier` method across all node types to safely identify nodes:
729
+
730
+ [source,ruby]
731
+ ----
732
+ element = doc.at_xpath("//book")
733
+ puts element.identifier # => "book"
734
+
735
+ attr = element.attribute("id")
736
+ puts attr.identifier # => "id"
737
+ ----
738
+
739
+ The `#identifier` method returns the primary identifier for each node type (tag name for elements, attribute name for attributes, target for processing instructions, or `nil` for content nodes).
740
+
741
+ IMPORTANT: Always use type-safe patterns when working with mixed node types. See the link:docs/_guides/node-api-consistency.adoc[Node API Consistency Guide] for complete documentation on safe coding patterns, API surface by node type, and migration guidelines.
742
+
646
743
 
647
744
  == Advanced features
648
745
 
@@ -694,7 +791,8 @@ rescue Moxml::Error => e
694
791
  end
695
792
  ----
696
793
 
697
- For complete error class hierarchy, error types, best practices, and debugging techniques, see link:docs/_pages/error-handling.adoc[Error Handling Guide].
794
+ For complete error class hierarchy, error types, best practices, and debugging
795
+ techniques, see link:docs/_pages/error-handling.adoc[Error Handling Guide].
698
796
 
699
797
 
700
798
  == Configuration
@@ -717,22 +815,53 @@ context = Moxml.new do |config|
717
815
  end
718
816
  ----
719
817
 
720
- For all configuration options, adapter selection, serialization options, and environment-based configuration, see link:docs/_pages/configuration.adoc[Configuration Guide].
818
+ === Namespace URI validation
819
+
820
+ Moxml validates namespace URIs against
821
+ https://www.rfc-editor.org/rfc/rfc3986[RFC 3986] by default, as required by the
822
+ https://www.w3.org/TR/xml-names/[W3C Namespaces in XML] specification.
823
+
824
+ For documents that use non-standard namespace identifiers, a lenient mode is
825
+ available:
826
+
827
+ [source,ruby]
828
+ ----
829
+ # Strict mode (default) — rejects invalid URIs per RFC 3986
830
+ context = Moxml.new do |config|
831
+ config.namespace_uri_mode = :strict
832
+ end
833
+
834
+ # Lenient mode — accepts any string as a namespace URI
835
+ context = Moxml.new do |config|
836
+ config.namespace_uri_mode = :lenient
837
+ end
838
+ ----
839
+
840
+ For all configuration options, adapter selection, serialization options, and
841
+ environment-based configuration, see
842
+ link:docs/_pages/configuration.adoc[Configuration Guide].
721
843
 
722
844
 
723
845
 
724
846
  == Thread safety
725
847
 
726
- For complete information on thread-safe patterns, context management, and concurrent processing, see the link:docs/_pages/thread-safety.adoc[Thread Safety Guide].
848
+ For complete information on thread-safe patterns, context management, and
849
+ concurrent processing, see the link:docs/_pages/thread-safety.adoc[Thread Safety
850
+ Guide].
727
851
 
728
852
 
729
853
  == Performance considerations
730
854
 
731
- For detailed performance optimization strategies, memory management best practices, and efficient querying patterns, see the link:docs/_pages/performance.adoc[Performance Considerations Guide].
855
+ For detailed performance optimization strategies, memory management best
856
+ practices, and efficient querying patterns, see the
857
+ link:docs/_pages/performance.adoc[Performance Considerations Guide].
732
858
 
733
859
  == Best practices
734
860
 
735
- For comprehensive best practices covering XPath queries, adapter selection, error handling, namespace handling, memory management, thread safety, performance optimization, and testing strategies, see link:docs/_pages/best-practices.adoc[Best Practices Guide].
861
+ For comprehensive best practices covering XPath queries, adapter selection,
862
+ error handling, namespace handling, memory management, thread safety,
863
+ performance optimization, and testing strategies, see
864
+ link:docs/_pages/best-practices.adoc[Best Practices Guide].
736
865
 
737
866
 
738
867
  == Specific adapter limitations
@@ -756,11 +885,13 @@ The Ox adapter provides maximum parsing speed but has XPath limitations.
756
885
  doc.xpath("//book").find { |book| book["id"] == "123" }
757
886
  ----
758
887
 
759
- For complete Ox adapter documentation including all limitations and workarounds, see link:docs/_pages/adapters/ox.adoc[Ox Adapter Guide].
888
+ For complete Ox adapter documentation including all limitations and workarounds,
889
+ see link:docs/_pages/adapters/ox.adoc[Ox Adapter Guide].
760
890
 
761
891
  === HeadedOx adapter
762
892
 
763
- The HeadedOx adapter combines Ox's fast C-based XML parsing with Moxml's comprehensive pure Ruby XPath 1.0 engine.
893
+ The HeadedOx adapter combines Ox's fast C-based XML parsing with Moxml's
894
+ comprehensive pure Ruby XPath 1.0 engine.
764
895
 
765
896
  **Status:** Production-ready v1.2 (99.20% pass rate, 1,992/2,008 tests)
766
897
 
@@ -796,12 +927,6 @@ For complete HeadedOx documentation including architecture, XPath capabilities,
796
927
 
797
928
  ==== LibXML adapter
798
929
 
799
- *DOCTYPE Limitations:*
800
-
801
- * DOCTYPE parsing works
802
- * DOCTYPE round-trip preservation is limited
803
- * DOCTYPE cannot be reliably re-serialized after parsing
804
-
805
930
  *Performance:*
806
931
 
807
932
  * Serialization speed: ~120 ips (slower than target)
@@ -817,9 +942,151 @@ limitations. Use these adapters when you need full XPath and namespace support.
817
942
 
818
943
 
819
944
 
945
+ == Round-trip XML Testing
946
+
947
+ Moxml includes comprehensive round-trip testing to verify that XML documents remain
948
+ semantically equivalent when parsed and serialized across different adapters.
949
+
950
+ === Purpose
951
+
952
+ Round-trip testing ensures:
953
+
954
+ * **Cross-adapter compatibility** - XML parsed with one adapter (e.g., Nokogiri) can be
955
+ serialized and re-parsed with another adapter (e.g., Oga) while preserving content
956
+ * **Structural fidelity** - Element names, attributes, and document structure are maintained
957
+ * **Content preservation** - Text content and entity references survive multiple parse/serialize cycles
958
+ * **Double round-trip verification** - Source → Target → Source sequences produce semantically
959
+ equivalent output
960
+
961
+ === Test Fixtures
962
+
963
+ Round-trip tests use real-world XML documents organized into collections:
964
+
965
+ **rfcxml** - IETF RFC documents in XML format. These provide complex, standards-compliant
966
+ XML with mixed content, namespaces, and attributes. The collection includes:
967
+
968
+ * Large documents (500KB-2.4MB) for stress testing
969
+ * Rich metadata and cross-references
970
+ * Various XML schema patterns
971
+
972
+ **metanorma** - Metanorma document processing XML. These test:
973
+
974
+ * Document structure preservation
975
+ * Nested elements and complex hierarchies
976
+ * Standard XML vocabularies
977
+
978
+ **niso-jats** - NISO Journal Article Tag Suite XML. These provide:
979
+
980
+ * Scholarly publishing XML schemas
981
+ * Rich bibliographic metadata
982
+ * Mixed content models
983
+
984
+ === Running Round-trip Tests
985
+
986
+ [source,bash]
987
+ ----
988
+ # Run all round-trip tests
989
+ bundle exec rake spec:consistency
990
+
991
+ # Exclude REXML for larger fixtures (faster, REXML is pure Ruby)
992
+ MOXML_ROUNDTRIP_REXML_MAX_SIZE=0 bundle exec rake spec:consistency
993
+
994
+ # Adjust the per-example timeout (default: 120 seconds)
995
+ MOXML_ROUNDTRIP_TIMEOUT=300 bundle exec rake spec:consistency
996
+ ----
997
+
998
+ REXML is a pure Ruby XML parser and becomes very slow on large documents (500KB+).
999
+ By default, REXML adapter pairs are skipped for fixtures exceeding 500KB. All other
1000
+ adapters (Nokogiri, Oga, Ox) are tested against every fixture.
1001
+
1002
+ === Test Mechanics
1003
+
1004
+ For each fixture, tests run across all adapter pairs (4 adapters = 12 combinations):
1005
+
1006
+ 1. Parse with source adapter
1007
+ 2. Serialize to XML string
1008
+ 3. Parse serialized output with target adapter
1009
+ 4. Compare semantic equivalence (element names, attributes, text content)
1010
+
1011
+ A "double round-trip" test additionally verifies: Source → Target → Source → Target
1012
+ produces consistent results.
1013
+
1014
+ NOTE: REXML is excluded from adapter pairs for fixtures larger than 500KB (configurable
1015
+ via `MOXML_ROUNDTRIP_REXML_MAX_SIZE`). This is because REXML is pure Ruby and cannot
1016
+ parse large XML documents in a practical timeframe. A per-example timeout
1017
+ (`MOXML_ROUNDTRIP_TIMEOUT`, default 120s) prevents tests from hanging indefinitely.
1018
+
1019
+ === Ox Adapter Element Ordering Caveat
1020
+
1021
+ The Ox adapter produces elements in a different order than other adapters for certain
1022
+ fixtures with complex nested structures (e.g., `element_citation.xml`,
1023
+ `collection1nested.xml`, `pnas_sample.xml`). This causes the `elements_with_attributes`
1024
+ comparison to fail with "Array length mismatch" even though the semantic equivalence
1025
+ check (double round-trip) passes.
1026
+
1027
+ Round-trip tests automatically skip the `elements_with_attributes` comparison for these
1028
+ known Ox ordering issues. The `ruby-versions` CI job tests only Nokogiri and Oga adapters;
1029
+ the `nokogiri-ox` and `nokogiri-rexml` CI jobs test Ox and REXML respectively but are
1030
+ marked as **experimental** since these adapters lack full XML feature support:
1031
+
1032
+ * **Ox**: Lacks proper namespace support, XPath with predicates, and uses a custom
1033
+ `locate()` method instead of standard XPath
1034
+ * **REXML**: Pure Ruby, exponential time complexity with document size, impractical for
1035
+ documents over ~500KB
1036
+
1037
+ For production use, prefer Nokogiri or Oga which provide complete XML conformance.
1038
+
1039
+ To run tests with a specific adapter set locally:
1040
+
1041
+ [source,bash]
1042
+ ----
1043
+ # Nokogiri + Oga only (fast, full test suite)
1044
+ MOXML_ROUNDTRIP_ADAPTERS=nokogiri,oga bundle exec rspec spec/consistency/ --tag round_trip
1045
+
1046
+ # Nokogiri × Ox only (experimental)
1047
+ MOXML_ROUNDTRIP_ADAPTERS=nokogiri,ox MOXML_ROUNDTRIP_TIMEOUT=300 bundle exec rspec spec/consistency/ --tag round_trip
1048
+
1049
+ # Nokogiri × REXML only (experimental, small fixtures due to exponential complexity)
1050
+ MOXML_ROUNDTRIP_ADAPTERS=nokogiri,rexml MOXML_ROUNDTRIP_TIMEOUT=300 MOXML_ROUNDTRIP_REXML_MAX_SIZE=50000 bundle exec rspec spec/consistency/ --tag round_trip
1051
+ ----
1052
+
1053
+ === Why Semantic Equivalence?
1054
+
1055
+ While a pure round-trip test with raw XML comparison would be ideal, different XML adapters
1056
+ have fundamentally different philosophies for handling:
1057
+
1058
+ * **Element ordering** - Some preserve document order, others sort alphabetically
1059
+ * **Whitespace handling** - Some normalize spaces, others preserve exactly
1060
+ * **Attribute representation** - Different data structures for the same attributes
1061
+ * **Text extraction** - Varying approaches to concatenating text content
1062
+
1063
+ Instead of raw comparison, Moxml implements semantic equivalence testing that focuses on
1064
+ meaningful XML structure and content:
1065
+
1066
+ [source,ruby]
1067
+ ----
1068
+ # Element name must match
1069
+ expect(target_element.name).to eq(source_element.name)
1070
+
1071
+ # Attributes must be semantically equivalent
1072
+ expect(target_attributes).to eq(source_attributes)
1073
+
1074
+ # Text content must be preserved (whitespace-normalized)
1075
+ expect(normalized_text(target)).to eq(normalized_text(source))
1076
+
1077
+ # Document structure (element count) must match
1078
+ expect(doc.xpath("//*").size).to eq(original.xpath("//*").size)
1079
+ ----
1080
+
1081
+ This approach tolerates adapter-specific serialization differences while ensuring
1082
+ the actual XML content remains intact.
1083
+
1084
+
820
1085
  == Development and testing
821
1086
 
822
- For complete information on development setup, testing strategies, benchmarking, and coverage reporting, see the link:docs/_guides/development-testing.adoc[Development and Testing Guide].
1087
+ For complete information on development setup, testing strategies, benchmarking,
1088
+ and coverage reporting, see the
1089
+ link:docs/_guides/development-testing.adoc[Development and Testing Guide].
823
1090
 
824
1091
  == Contributing
825
1092
 
data/Rakefile CHANGED
@@ -30,6 +30,17 @@ namespace :spec do
30
30
  t.pattern = "spec/consistency/**/*_spec.rb"
31
31
  end
32
32
 
33
+ namespace :consistency do
34
+ desc "Run round-trip tests for a specific fixture category (CATEGORIES=metanorma,rfcxml,niso-jats)"
35
+ task :by_category do
36
+ categories = ENV.fetch("CATEGORIES", "").split(",").map(&:strip)
37
+ abort "Usage: CATEGORIES=metanorma,rfcxml rake spec:consistency:by_category" if categories.empty?
38
+
39
+ include_filters = categories.map { |c| "--tag fixture_category:#{c}" }.join(" ")
40
+ sh "bundle exec rspec spec/consistency/ --tag round_trip #{include_filters}"
41
+ end
42
+ end
43
+
33
44
  desc "Run example tests"
34
45
  RSpec::Core::RakeTask.new(:examples) do |t|
35
46
  t.pattern = "spec/examples/**/*_spec.rb"