moxml 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/docs.yml +1 -1
  3. data/.github/workflows/rake.yml +16 -13
  4. data/.github/workflows/release.yml +1 -0
  5. data/.github/workflows/round-trip.yml +74 -0
  6. data/.gitignore +1 -0
  7. data/.rubocop.yml +1 -0
  8. data/.rubocop_todo.yml +160 -38
  9. data/Gemfile +2 -1
  10. data/README.adoc +236 -0
  11. data/Rakefile +11 -0
  12. data/data/w3c_entities.json +2131 -0
  13. data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
  14. data/docs/_pages/adapters/ox.adoc +30 -0
  15. data/docs/_pages/configuration.adoc +43 -0
  16. data/docs/_pages/node-api-reference.adoc +35 -0
  17. data/docs/_tutorials/namespace-handling.adoc +21 -0
  18. data/examples/rss_parser/rss_parser.rb +1 -3
  19. data/lib/moxml/adapter/base.rb +26 -2
  20. data/lib/moxml/adapter/headed_ox.rb +5 -4
  21. data/lib/moxml/adapter/libxml.rb +3 -2
  22. data/lib/moxml/adapter/nokogiri.rb +13 -2
  23. data/lib/moxml/adapter/oga.rb +124 -20
  24. data/lib/moxml/adapter/ox.rb +4 -3
  25. data/lib/moxml/adapter/rexml.rb +41 -7
  26. data/lib/moxml/builder.rb +6 -0
  27. data/lib/moxml/config.rb +52 -1
  28. data/lib/moxml/context.rb +21 -2
  29. data/lib/moxml/document.rb +6 -1
  30. data/lib/moxml/document_builder.rb +45 -1
  31. data/lib/moxml/element.rb +4 -3
  32. data/lib/moxml/entity_reference.rb +29 -0
  33. data/lib/moxml/entity_registry.rb +278 -0
  34. data/lib/moxml/node.rb +10 -8
  35. data/lib/moxml/node_set.rb +10 -6
  36. data/lib/moxml/version.rb +1 -1
  37. data/lib/moxml/xml_utils.rb +25 -2
  38. data/lib/moxml.rb +1 -0
  39. data/spec/consistency/README.md +3 -1
  40. data/spec/consistency/round_trip_spec.rb +479 -0
  41. data/spec/examples/readme_examples_spec.rb +1 -1
  42. data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
  43. data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
  44. data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
  45. data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
  46. data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
  47. data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
  48. data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
  49. data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
  50. data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
  51. data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
  52. data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
  53. data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
  54. data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
  55. data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
  56. data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
  57. data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
  58. data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
  59. data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
  60. data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
  61. data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
  62. data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
  63. data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
  64. data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
  65. data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
  66. data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
  67. data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
  68. data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
  69. data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
  70. data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
  71. data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
  72. data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
  73. data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
  74. data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
  75. data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
  76. data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
  77. data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
  78. data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
  79. data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
  80. data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
  81. data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
  82. data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
  83. data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
  84. data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
  85. data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
  86. data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
  87. data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
  88. data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
  89. data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
  90. data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
  91. data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
  92. data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
  93. data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
  94. data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
  95. data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
  96. data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
  97. data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
  98. data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
  99. data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
  100. data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
  101. data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
  102. data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
  103. data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
  104. data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
  105. data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
  106. data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
  107. data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
  108. data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
  109. data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
  110. data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
  111. data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
  112. data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
  113. data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
  114. data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
  115. data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
  116. data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
  117. data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
  118. data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
  119. data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
  120. data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
  121. data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
  122. data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
  123. data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
  124. data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
  125. data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
  126. data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
  127. data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
  128. data/spec/integration/w3c_namespace_spec.rb +69 -0
  129. data/spec/moxml/adapter/libxml_spec.rb +7 -1
  130. data/spec/moxml/adapter/oga_spec.rb +92 -0
  131. data/spec/moxml/config_spec.rb +75 -0
  132. data/spec/moxml/entity_registry_spec.rb +184 -0
  133. data/spec/moxml/error_spec.rb +2 -2
  134. data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
  135. data/spec/moxml/xpath/axes_spec.rb +3 -4
  136. data/spec/performance/xpath_benchmark_spec.rb +6 -54
  137. data/spec/support/w3c_namespace_helpers.rb +41 -0
  138. data/spec/unit/rexml_isolated_test.rb +271 -0
  139. metadata +98 -2
data/README.adoc CHANGED
@@ -592,8 +592,82 @@ book.add_child(doc.create_comment('Book details'))
592
592
  title = doc.create_element('title')
593
593
  title.text = 'Ruby Programming'
594
594
  book.add_child(title)
595
+
596
+ # Add entity reference (for declared entities)
597
+ book.add_child(doc.create_entity_reference('mdash'))
595
598
  ----
596
599
 
600
+ === Entity References
601
+
602
+ Moxml supports `EntityReference` nodes for preserving entity syntax in XML documents. This enables round-trip preservation of entity references like `&nbsp;`, `&copy;`, and custom entities defined in the DOCTYPE.
603
+
604
+ [source,ruby]
605
+ ----
606
+ # Create entity reference programmatically
607
+ ref = doc.create_entity_reference('nbsp')
608
+ element.add_child(ref)
609
+
610
+ # Or using the builder pattern
611
+ doc = Moxml::Builder.new(Moxml.new).build do
612
+ element 'text' do
613
+ entity_reference 'ndash'
614
+ entity_reference 'copy'
615
+ end
616
+ end
617
+ ----
618
+
619
+ **Parsing and Round-Trip:**
620
+
621
+ When parsing XML with declared entities, Moxml preserves entity references:
622
+
623
+ [source,ruby]
624
+ ----
625
+ # Parse document with custom entity
626
+ xml = <<-XML
627
+ <!DOCTYPE root [<!ENTITY nbsp "&#160;"> ]>
628
+ <root>hello&nbsp;world</root>
629
+ XML
630
+
631
+ doc = Moxml.new(:nokogiri).parse(xml)
632
+ doc.to_xml # => preserves &nbsp; entity reference
633
+ ----
634
+
635
+ **Adapter Notes:**
636
+
637
+ - *Nokogiri*: Preserves custom declared entities as `EntityReference` nodes
638
+ - *Ox, Oga*: These adapters resolve entities during parsing and do not expose entity reference nodes. Use Nokogiri or LibXML for entity preservation.
639
+
640
+ **Entity Loading Configuration:**
641
+
642
+ Moxml provides configurable entity loading with four modes to balance between functionality, performance, and security:
643
+
644
+ [source,ruby]
645
+ ----
646
+ # Default (:required): Load all W3C entities (HTML + MathML + ISO entity sets)
647
+ # Raises error if entity data is unavailable
648
+ context = Moxml.new
649
+
650
+ # Optional: Load entities if available, silently skip if not
651
+ context = Moxml.new do |config|
652
+ config.entity_load_mode = :optional
653
+ end
654
+
655
+ # Disabled: No entity loading (fastest, for controlled XML sources)
656
+ context = Moxml.new do |config|
657
+ config.entity_load_mode = :disabled
658
+ end
659
+
660
+ # Custom: Load entities from your own source
661
+ context = Moxml.new do |config|
662
+ config.entity_load_mode = :custom
663
+ config.entity_provider = -> { MyEntitySource.all_entities }
664
+ end
665
+ ----
666
+
667
+ The entity data comes from the W3C XML Core WG Character Entities specification (HTMLMathML set), bundled locally in `data/w3c_entities.json` for offline capability. Set the `MOXML_ENTITY_DEFINITIONS_PATH` environment variable to use a custom entity data source.
668
+
669
+ For backward compatibility, `config.load_external_entities = false` maps to `:disabled` mode, and `config.load_external_entities = true` maps to `:required` mode.
670
+
597
671
  === Fluent interface API
598
672
 
599
673
  Moxml provides a fluent, chainable API for improved developer experience:
@@ -741,6 +815,28 @@ context = Moxml.new do |config|
741
815
  end
742
816
  ----
743
817
 
818
+ === Namespace URI validation
819
+
820
+ Moxml validates namespace URIs against
821
+ https://www.rfc-editor.org/rfc/rfc3986[RFC 3986] by default, as required by the
822
+ https://www.w3.org/TR/xml-names/[W3C Namespaces in XML] specification.
823
+
824
+ For documents that use non-standard namespace identifiers, a lenient mode is
825
+ available:
826
+
827
+ [source,ruby]
828
+ ----
829
+ # Strict mode (default) — rejects invalid URIs per RFC 3986
830
+ context = Moxml.new do |config|
831
+ config.namespace_uri_mode = :strict
832
+ end
833
+
834
+ # Lenient mode — accepts any string as a namespace URI
835
+ context = Moxml.new do |config|
836
+ config.namespace_uri_mode = :lenient
837
+ end
838
+ ----
839
+
744
840
  For all configuration options, adapter selection, serialization options, and
745
841
  environment-based configuration, see
746
842
  link:docs/_pages/configuration.adoc[Configuration Guide].
@@ -846,6 +942,146 @@ limitations. Use these adapters when you need full XPath and namespace support.
846
942
 
847
943
 
848
944
 
945
+ == Round-trip XML Testing
946
+
947
+ Moxml includes comprehensive round-trip testing to verify that XML documents remain
948
+ semantically equivalent when parsed and serialized across different adapters.
949
+
950
+ === Purpose
951
+
952
+ Round-trip testing ensures:
953
+
954
+ * **Cross-adapter compatibility** - XML parsed with one adapter (e.g., Nokogiri) can be
955
+ serialized and re-parsed with another adapter (e.g., Oga) while preserving content
956
+ * **Structural fidelity** - Element names, attributes, and document structure are maintained
957
+ * **Content preservation** - Text content and entity references survive multiple parse/serialize cycles
958
+ * **Double round-trip verification** - Source → Target → Source sequences produce semantically
959
+ equivalent output
960
+
961
+ === Test Fixtures
962
+
963
+ Round-trip tests use real-world XML documents organized into collections:
964
+
965
+ **rfcxml** - IETF RFC documents in XML format. These provide complex, standards-compliant
966
+ XML with mixed content, namespaces, and attributes. The collection includes:
967
+
968
+ * Large documents (500KB-2.4MB) for stress testing
969
+ * Rich metadata and cross-references
970
+ * Various XML schema patterns
971
+
972
+ **metanorma** - Metanorma document processing XML. These test:
973
+
974
+ * Document structure preservation
975
+ * Nested elements and complex hierarchies
976
+ * Standard XML vocabularies
977
+
978
+ **niso-jats** - NISO Journal Article Tag Suite XML. These provide:
979
+
980
+ * Scholarly publishing XML schemas
981
+ * Rich bibliographic metadata
982
+ * Mixed content models
983
+
984
+ === Running Round-trip Tests
985
+
986
+ [source,bash]
987
+ ----
988
+ # Run all round-trip tests
989
+ bundle exec rake spec:consistency
990
+
991
+ # Exclude REXML for larger fixtures (faster, REXML is pure Ruby)
992
+ MOXML_ROUNDTRIP_REXML_MAX_SIZE=0 bundle exec rake spec:consistency
993
+
994
+ # Adjust the per-example timeout (default: 120 seconds)
995
+ MOXML_ROUNDTRIP_TIMEOUT=300 bundle exec rake spec:consistency
996
+ ----
997
+
998
+ REXML is a pure Ruby XML parser and becomes very slow on large documents (500KB+).
999
+ By default, REXML adapter pairs are skipped for fixtures exceeding 500KB. All other
1000
+ adapters (Nokogiri, Oga, Ox) are tested against every fixture.
1001
+
1002
+ === Test Mechanics
1003
+
1004
+ For each fixture, tests run across all ordered source → target adapter pairs (4 adapters = 12 combinations):
1005
+
1006
+ 1. Parse with source adapter
1007
+ 2. Serialize to XML string
1008
+ 3. Parse serialized output with target adapter
1009
+ 4. Compare semantic equivalence (element names, attributes, text content)
1010
+
1011
+ A "double round-trip" test additionally verifies: Source → Target → Source → Target
1012
+ produces consistent results.
1013
+
1014
+ NOTE: REXML is excluded from adapter pairs for fixtures larger than 500KB (configurable
1015
+ via `MOXML_ROUNDTRIP_REXML_MAX_SIZE`). This is because REXML is pure Ruby and cannot
1016
+ parse large XML documents in a practical timeframe. A per-example timeout
1017
+ (`MOXML_ROUNDTRIP_TIMEOUT`, default 120s) prevents tests from hanging indefinitely.
1018
+
1019
+ === Ox Adapter Element Ordering Caveat
1020
+
1021
+ The Ox adapter produces elements in a different order than other adapters for certain
1022
+ fixtures with complex nested structures (e.g., `element_citation.xml`,
1023
+ `collection1nested.xml`, `pnas_sample.xml`). This causes the `elements_with_attributes`
1024
+ comparison to fail with "Array length mismatch" even though the semantic equivalence
1025
+ check (double round-trip) passes.
1026
+
1027
+ Round-trip tests automatically skip the `elements_with_attributes` comparison for these
1028
+ known Ox ordering issues. The `ruby-versions` CI job tests only Nokogiri and Oga adapters;
1029
+ the `nokogiri-ox` and `nokogiri-rexml` CI jobs test Ox and REXML respectively but are
1030
+ marked as **experimental** since these adapters lack full XML feature support:
1031
+
1032
+ * **Ox**: Lacks proper namespace support, XPath with predicates, and uses a custom
1033
+ `locate()` method instead of standard XPath
1034
+ * **REXML**: Pure Ruby, exponential time complexity with document size, impractical for
1035
+ documents over ~500KB
1036
+
1037
+ For production use, prefer Nokogiri or Oga which provide complete XML conformance.
1038
+
1039
+ To run tests with a specific adapter set locally:
1040
+
1041
+ [source,bash]
1042
+ ----
1043
+ # Nokogiri + Oga only (fast, full test suite)
1044
+ MOXML_ROUNDTRIP_ADAPTERS=nokogiri,oga bundle exec rspec spec/consistency/ --tag round_trip
1045
+
1046
+ # Nokogiri × Ox only (experimental)
1047
+ MOXML_ROUNDTRIP_ADAPTERS=nokogiri,ox MOXML_ROUNDTRIP_TIMEOUT=300 bundle exec rspec spec/consistency/ --tag round_trip
1048
+
1049
+ # Nokogiri × REXML only (experimental, small fixtures due to exponential complexity)
1050
+ MOXML_ROUNDTRIP_ADAPTERS=nokogiri,rexml MOXML_ROUNDTRIP_TIMEOUT=300 MOXML_ROUNDTRIP_REXML_MAX_SIZE=50000 bundle exec rspec spec/consistency/ --tag round_trip
1051
+ ----
1052
+
1053
+ === Why Semantic Equivalence?
1054
+
1055
+ While a pure round-trip test with raw XML comparison would be ideal, different XML adapters
1056
+ have fundamentally different philosophies for handling:
1057
+
1058
+ * **Element ordering** - Some preserve document order, others sort alphabetically
1059
+ * **Whitespace handling** - Some normalize spaces, others preserve exactly
1060
+ * **Attribute representation** - Different data structures for the same attributes
1061
+ * **Text extraction** - Varying approaches to concatenating text content
1062
+
1063
+ Instead of raw comparison, Moxml implements semantic equivalence testing that focuses on
1064
+ meaningful XML structure and content:
1065
+
1066
+ [source,ruby]
1067
+ ----
1068
+ # Element name must match
1069
+ expect(target_element.name).to eq(source_element.name)
1070
+
1071
+ # Attributes must be semantically equivalent
1072
+ expect(target_attributes).to eq(source_attributes)
1073
+
1074
+ # Text content must be preserved (whitespace-normalized)
1075
+ expect(normalized_text(target)).to eq(normalized_text(source))
1076
+
1077
+ # Document structure (element count) must match
1078
+ expect(doc.xpath("//*").size).to eq(original.xpath("//*").size)
1079
+ ----
1080
+
1081
+ This approach tolerates adapter-specific serialization differences while ensuring
1082
+ the actual XML content remains intact.
1083
+
1084
+
849
1085
  == Development and testing
850
1086
 
851
1087
  For complete information on development setup, testing strategies, benchmarking,
data/Rakefile CHANGED
@@ -30,6 +30,17 @@ namespace :spec do
30
30
  t.pattern = "spec/consistency/**/*_spec.rb"
31
31
  end
32
32
 
33
+ namespace :consistency do
34
+ desc "Run round-trip tests for a specific fixture category (CATEGORIES=metanorma,rfcxml,niso-jats)"
35
+ task :by_category do
36
+ categories = ENV.fetch("CATEGORIES", "").split(",").map(&:strip)
37
+ abort "Usage: CATEGORIES=metanorma,rfcxml rake spec:consistency:by_category" if categories.empty?
38
+
39
+ include_filters = categories.map { |c| "--tag fixture_category:#{c}" }.join(" ")
40
+ sh "bundle exec rspec spec/consistency/ --tag round_trip #{include_filters}"
41
+ end
42
+ end
43
+
33
44
  desc "Run example tests"
34
45
  RSpec::Core::RakeTask.new(:examples) do |t|
35
46
  t.pattern = "spec/examples/**/*_spec.rb"