moxml 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/docs.yml +1 -1
- data/.github/workflows/rake.yml +16 -13
- data/.github/workflows/release.yml +1 -0
- data/.github/workflows/round-trip.yml +74 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +160 -38
- data/Gemfile +2 -1
- data/README.adoc +287 -20
- data/Rakefile +11 -0
- data/data/w3c_entities.json +2131 -0
- data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
- data/docs/_guides/index.adoc +14 -12
- data/docs/_guides/node-api-consistency.adoc +572 -0
- data/docs/_guides/xml-declaration.adoc +5 -5
- data/docs/_pages/adapters/ox.adoc +30 -0
- data/docs/_pages/adapters/rexml.adoc +1 -1
- data/docs/_pages/configuration.adoc +43 -0
- data/docs/_pages/node-api-reference.adoc +128 -3
- data/docs/_tutorials/namespace-handling.adoc +21 -0
- data/examples/rss_parser/rss_parser.rb +1 -3
- data/lib/moxml/adapter/base.rb +26 -2
- data/lib/moxml/adapter/headed_ox.rb +5 -4
- data/lib/moxml/adapter/libxml.rb +18 -3
- data/lib/moxml/adapter/nokogiri.rb +26 -2
- data/lib/moxml/adapter/oga.rb +137 -20
- data/lib/moxml/adapter/ox.rb +29 -3
- data/lib/moxml/adapter/rexml.rb +54 -7
- data/lib/moxml/attribute.rb +6 -0
- data/lib/moxml/builder.rb +6 -0
- data/lib/moxml/config.rb +52 -1
- data/lib/moxml/context.rb +21 -2
- data/lib/moxml/doctype.rb +33 -0
- data/lib/moxml/document.rb +6 -1
- data/lib/moxml/document_builder.rb +45 -1
- data/lib/moxml/element.rb +10 -3
- data/lib/moxml/entity_reference.rb +29 -0
- data/lib/moxml/entity_registry.rb +278 -0
- data/lib/moxml/error.rb +5 -5
- data/lib/moxml/node.rb +22 -8
- data/lib/moxml/node_set.rb +10 -6
- data/lib/moxml/processing_instruction.rb +6 -0
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils.rb +25 -2
- data/lib/moxml/xpath/errors.rb +1 -1
- data/lib/moxml.rb +1 -0
- data/spec/consistency/README.md +3 -1
- data/spec/consistency/round_trip_spec.rb +479 -0
- data/spec/examples/readme_examples_spec.rb +1 -1
- data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
- data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
- data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
- data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
- data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
- data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
- data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
- data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
- data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
- data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
- data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
- data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
- data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
- data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
- data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
- data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
- data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
- data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
- data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
- data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
- data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
- data/spec/integration/w3c_namespace_spec.rb +69 -0
- data/spec/moxml/adapter/libxml_spec.rb +7 -1
- data/spec/moxml/adapter/oga_spec.rb +92 -0
- data/spec/moxml/config_spec.rb +75 -0
- data/spec/moxml/doctype_spec.rb +19 -3
- data/spec/moxml/entity_registry_spec.rb +184 -0
- data/spec/moxml/error_spec.rb +2 -2
- data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
- data/spec/moxml/xpath/axes_spec.rb +3 -4
- data/spec/performance/xpath_benchmark_spec.rb +6 -54
- data/spec/support/w3c_namespace_helpers.rb +41 -0
- data/spec/unit/rexml_isolated_test.rb +271 -0
- metadata +99 -3
- data/.ruby-version +0 -1
data/README.adoc
CHANGED
|
@@ -26,6 +26,8 @@ Key features:
|
|
|
26
26
|
|
|
27
27
|
== Supported XML libraries
|
|
28
28
|
|
|
29
|
+
=== General
|
|
30
|
+
|
|
29
31
|
Moxml supports the following XML libraries:
|
|
30
32
|
|
|
31
33
|
REXML:: https://github.com/ruby/rexml[REXML], a pure Ruby XML parser
|
|
@@ -110,7 +112,9 @@ section.
|
|
|
110
112
|
NOTE: HeadedOx provides full XPath 1.0 support via a pure Ruby XPath engine
|
|
111
113
|
layered on top of Ox's C parser. See HeadedOx documentation for details.
|
|
112
114
|
|
|
113
|
-
NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element,
|
|
115
|
+
NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element,
|
|
116
|
+
characters, errors). No separate CDATA, comment, or processing instruction
|
|
117
|
+
events.
|
|
114
118
|
|
|
115
119
|
== Adapter comparison
|
|
116
120
|
|
|
@@ -356,7 +360,7 @@ NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element, c
|
|
|
356
360
|
| ✅ Full
|
|
357
361
|
| ✅ Full
|
|
358
362
|
| ✅ Full
|
|
359
|
-
|
|
|
363
|
+
| ✅ Full
|
|
360
364
|
| ✅ Full
|
|
361
365
|
| ✅ Full
|
|
362
366
|
|
|
@@ -400,13 +404,12 @@ NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element, c
|
|
|
400
404
|
| ✅ Yes
|
|
401
405
|
| ✅ Yes
|
|
402
406
|
|===
|
|
403
|
-
|
|
407
|
+
+
|
|
404
408
|
^1^ Ox/HeadedOx: Text node replacement may fail in some cases due to internal node structure +
|
|
405
409
|
^2^ Ox: `//book[@id]` works (returns all book elements), but doesn't filter by attribute existence +
|
|
406
410
|
^3^ HeadedOx: Full XPath 1.0 with all 27 functions and 6 axes. Pure Ruby XPath engine on Ox's C parser. 99.20% pass rate. See link:docs/headed-ox.adoc[] +
|
|
407
411
|
^4^ Ox: Use `.find { |el| el["id"] == "123" }` instead of XPath attribute value predicates +
|
|
408
|
-
^5^
|
|
409
|
-
^6^ HeadedOx limitations: Namespace introspection and 7 axes not implemented. See link:docs/HEADED_OX_LIMITATIONS.md[]
|
|
412
|
+
^5^ HeadedOx limitations: Namespace introspection and 7 axes not implemented. See link:docs/HEADED_OX_LIMITATIONS.md[]
|
|
410
413
|
|
|
411
414
|
=== Adapter selection guide
|
|
412
415
|
|
|
@@ -453,10 +456,13 @@ NOTE: Ox/HeadedOx SAX: Only core events supported (start_element, end_element, c
|
|
|
453
456
|
* Need more XPath capabilities than standard Ox provides
|
|
454
457
|
* Memory efficiency is important but XPath features are required
|
|
455
458
|
|
|
456
|
-
CAUTION: Ox's custom XPath engine supports common patterns but
|
|
459
|
+
CAUTION: Ox's custom XPath engine supports common patterns but cannot handle
|
|
457
460
|
complex XPath expressions. Test thoroughly if your use case requires advanced
|
|
458
461
|
XPath.
|
|
459
462
|
|
|
463
|
+
TODO: We should raise errors when unsupported XPath features are used with Ox
|
|
464
|
+
or HeadedOx to prevent silent failures.
|
|
465
|
+
|
|
460
466
|
|
|
461
467
|
== Getting started
|
|
462
468
|
|
|
@@ -586,7 +592,81 @@ book.add_child(doc.create_comment('Book details'))
|
|
|
586
592
|
title = doc.create_element('title')
|
|
587
593
|
title.text = 'Ruby Programming'
|
|
588
594
|
book.add_child(title)
|
|
595
|
+
|
|
596
|
+
# Add entity reference (for declared entities)
|
|
597
|
+
book.add_child(doc.create_entity_reference('mdash'))
|
|
598
|
+
----
|
|
599
|
+
|
|
600
|
+
=== Entity References
|
|
601
|
+
|
|
602
|
+
Moxml supports `EntityReference` nodes for preserving entity syntax in XML documents. This enables round-trip preservation of entity references like `&nbsp;`, `&copy;`, and custom entities defined in the DOCTYPE.
|
|
603
|
+
|
|
604
|
+
[source,ruby]
|
|
605
|
+
----
|
|
606
|
+
# Create entity reference programmatically
|
|
607
|
+
ref = doc.create_entity_reference('nbsp')
|
|
608
|
+
element.add_child(ref)
|
|
609
|
+
|
|
610
|
+
# Or using the builder pattern
|
|
611
|
+
doc = Moxml::Builder.new(Moxml.new).build do
|
|
612
|
+
element 'text' do
|
|
613
|
+
entity_reference 'ndash'
|
|
614
|
+
entity_reference 'copy'
|
|
615
|
+
end
|
|
616
|
+
end
|
|
617
|
+
----
|
|
618
|
+
|
|
619
|
+
**Parsing and Round-Trip:**
|
|
620
|
+
|
|
621
|
+
When parsing XML with declared entities, Moxml preserves entity references:
|
|
622
|
+
|
|
623
|
+
[source,ruby]
|
|
624
|
+
----
|
|
625
|
+
# Parse document with custom entity
|
|
626
|
+
xml = <<-XML
|
|
627
|
+
<!DOCTYPE root [<!ENTITY nbsp "&#160;"> ]>
|
|
628
|
+
<root>hello&nbsp;world</root>
|
|
629
|
+
XML
|
|
630
|
+
|
|
631
|
+
doc = Moxml.new(:nokogiri).parse(xml)
|
|
632
|
+
doc.to_xml # => preserves entity reference
|
|
633
|
+
----
|
|
634
|
+
|
|
635
|
+
**Adapter Notes:**
|
|
636
|
+
|
|
637
|
+
- *Nokogiri*: Preserves custom declared entities as `EntityReference` nodes
|
|
638
|
+
- *Ox, Oga*: These adapters resolve entities during parsing and do not expose entity reference nodes. Use Nokogiri or LibXML for entity preservation.
|
|
639
|
+
|
|
640
|
+
**Entity Loading Configuration:**
|
|
641
|
+
|
|
642
|
+
Moxml provides configurable entity loading with four modes (`:required`, `:optional`, `:disabled`, and `:custom`) to balance functionality, performance, and security:
|
|
643
|
+
|
|
644
|
+
[source,ruby]
|
|
589
645
|
----
|
|
646
|
+
# Default: Load all W3C entities (HTML + MathML + ISO entity sets)
|
|
647
|
+
# Raises error if entity data is unavailable
|
|
648
|
+
context = Moxml.new
|
|
649
|
+
|
|
650
|
+
# Optional: Load entities if available, silently skip if not
|
|
651
|
+
context = Moxml.new do |config|
|
|
652
|
+
config.entity_load_mode = :optional
|
|
653
|
+
end
|
|
654
|
+
|
|
655
|
+
# Disabled: No entity loading (fastest, for controlled XML sources)
|
|
656
|
+
context = Moxml.new do |config|
|
|
657
|
+
config.entity_load_mode = :disabled
|
|
658
|
+
end
|
|
659
|
+
|
|
660
|
+
# Custom: Load entities from your own source
|
|
661
|
+
context = Moxml.new do |config|
|
|
662
|
+
config.entity_load_mode = :custom
|
|
663
|
+
config.entity_provider = -> { MyEntitySource.all_entities }
|
|
664
|
+
end
|
|
665
|
+
----
|
|
666
|
+
|
|
667
|
+
The entity data comes from the W3C XML Core WG Character Entities specification (HTMLMathML set), bundled locally in `data/w3c_entities.json` for offline capability. Set the `MOXML_ENTITY_DEFINITIONS_PATH` environment variable to use a custom entity data source.
|
|
668
|
+
|
|
669
|
+
For backward compatibility, `config.load_external_entities = false` maps to `:disabled` mode, and `config.load_external_entities = true` maps to `:required` mode.
|
|
590
670
|
|
|
591
671
|
=== Fluent interface API
|
|
592
672
|
|
|
@@ -643,6 +723,23 @@ For complete SAX documentation including all handler types, event methods, adapt
|
|
|
643
723
|
|
|
644
724
|
For complete node API reference including traversal methods, manipulation, queries, type checking, and node information, see link:docs/_pages/node-api-reference.adoc[Node API Reference].
|
|
645
725
|
|
|
726
|
+
=== Node identity
|
|
727
|
+
|
|
728
|
+
Moxml provides a consistent `#identifier` method across all node types to safely identify nodes:
|
|
729
|
+
|
|
730
|
+
[source,ruby]
|
|
731
|
+
----
|
|
732
|
+
element = doc.at_xpath("//book")
|
|
733
|
+
puts element.identifier # => "book"
|
|
734
|
+
|
|
735
|
+
attr = element.attribute("id")
|
|
736
|
+
puts attr.identifier # => "id"
|
|
737
|
+
----
|
|
738
|
+
|
|
739
|
+
The `#identifier` method returns the primary identifier for each node type (tag name for elements, attribute name for attributes, target for processing instructions, or `nil` for content nodes).
|
|
740
|
+
|
|
741
|
+
IMPORTANT: Always use type-safe patterns when working with mixed node types. See the link:docs/_guides/node-api-consistency.adoc[Node API Consistency Guide] for complete documentation on safe coding patterns, API surface by node type, and migration guidelines.
|
|
742
|
+
|
|
646
743
|
|
|
647
744
|
== Advanced features
|
|
648
745
|
|
|
@@ -694,7 +791,8 @@ rescue Moxml::Error => e
|
|
|
694
791
|
end
|
|
695
792
|
----
|
|
696
793
|
|
|
697
|
-
For complete error class hierarchy, error types, best practices, and debugging
|
|
794
|
+
For complete error class hierarchy, error types, best practices, and debugging
|
|
795
|
+
techniques, see link:docs/_pages/error-handling.adoc[Error Handling Guide].
|
|
698
796
|
|
|
699
797
|
|
|
700
798
|
== Configuration
|
|
@@ -717,22 +815,53 @@ context = Moxml.new do |config|
|
|
|
717
815
|
end
|
|
718
816
|
----
|
|
719
817
|
|
|
720
|
-
|
|
818
|
+
=== Namespace URI validation
|
|
819
|
+
|
|
820
|
+
Moxml validates namespace URIs against
|
|
821
|
+
https://www.rfc-editor.org/rfc/rfc3986[RFC 3986] by default, as required by the
|
|
822
|
+
https://www.w3.org/TR/xml-names/[W3C Namespaces in XML] specification.
|
|
823
|
+
|
|
824
|
+
For documents that use non-standard namespace identifiers, a lenient mode is
|
|
825
|
+
available:
|
|
826
|
+
|
|
827
|
+
[source,ruby]
|
|
828
|
+
----
|
|
829
|
+
# Strict mode (default) — rejects invalid URIs per RFC 3986
|
|
830
|
+
context = Moxml.new do |config|
|
|
831
|
+
config.namespace_uri_mode = :strict
|
|
832
|
+
end
|
|
833
|
+
|
|
834
|
+
# Lenient mode — accepts any string as a namespace URI
|
|
835
|
+
context = Moxml.new do |config|
|
|
836
|
+
config.namespace_uri_mode = :lenient
|
|
837
|
+
end
|
|
838
|
+
----
|
|
839
|
+
|
|
840
|
+
For all configuration options, adapter selection, serialization options, and
|
|
841
|
+
environment-based configuration, see
|
|
842
|
+
link:docs/_pages/configuration.adoc[Configuration Guide].
|
|
721
843
|
|
|
722
844
|
|
|
723
845
|
|
|
724
846
|
== Thread safety
|
|
725
847
|
|
|
726
|
-
For complete information on thread-safe patterns, context management, and
|
|
848
|
+
For complete information on thread-safe patterns, context management, and
|
|
849
|
+
concurrent processing, see the link:docs/_pages/thread-safety.adoc[Thread Safety
|
|
850
|
+
Guide].
|
|
727
851
|
|
|
728
852
|
|
|
729
853
|
== Performance considerations
|
|
730
854
|
|
|
731
|
-
For detailed performance optimization strategies, memory management best
|
|
855
|
+
For detailed performance optimization strategies, memory management best
|
|
856
|
+
practices, and efficient querying patterns, see the
|
|
857
|
+
link:docs/_pages/performance.adoc[Performance Considerations Guide].
|
|
732
858
|
|
|
733
859
|
== Best practices
|
|
734
860
|
|
|
735
|
-
For comprehensive best practices covering XPath queries, adapter selection,
|
|
861
|
+
For comprehensive best practices covering XPath queries, adapter selection,
|
|
862
|
+
error handling, namespace handling, memory management, thread safety,
|
|
863
|
+
performance optimization, and testing strategies, see
|
|
864
|
+
link:docs/_pages/best-practices.adoc[Best Practices Guide].
|
|
736
865
|
|
|
737
866
|
|
|
738
867
|
== Specific adapter limitations
|
|
@@ -756,11 +885,13 @@ The Ox adapter provides maximum parsing speed but has XPath limitations.
|
|
|
756
885
|
doc.xpath("//book").find { |book| book["id"] == "123" }
|
|
757
886
|
----
|
|
758
887
|
|
|
759
|
-
For complete Ox adapter documentation including all limitations and workarounds,
|
|
888
|
+
For complete Ox adapter documentation including all limitations and workarounds,
|
|
889
|
+
see link:docs/_pages/adapters/ox.adoc[Ox Adapter Guide].
|
|
760
890
|
|
|
761
891
|
=== HeadedOx adapter
|
|
762
892
|
|
|
763
|
-
The HeadedOx adapter combines Ox's fast C-based XML parsing with Moxml's
|
|
893
|
+
The HeadedOx adapter combines Ox's fast C-based XML parsing with Moxml's
|
|
894
|
+
comprehensive pure Ruby XPath 1.0 engine.
|
|
764
895
|
|
|
765
896
|
**Status:** Production-ready v1.2 (99.20% pass rate, 1,992/2,008 tests)
|
|
766
897
|
|
|
@@ -796,12 +927,6 @@ For complete HeadedOx documentation including architecture, XPath capabilities,
|
|
|
796
927
|
|
|
797
928
|
==== LibXML adapter
|
|
798
929
|
|
|
799
|
-
*DOCTYPE Limitations:*
|
|
800
|
-
|
|
801
|
-
* DOCTYPE parsing works
|
|
802
|
-
* DOCTYPE round-trip preservation is limited
|
|
803
|
-
* DOCTYPE cannot be reliably re-serialized after parsing
|
|
804
|
-
|
|
805
930
|
*Performance:*
|
|
806
931
|
|
|
807
932
|
* Serialization speed: ~120 ips (slower than target)
|
|
@@ -817,9 +942,151 @@ limitations. Use these adapters when you need full XPath and namespace support.
|
|
|
817
942
|
|
|
818
943
|
|
|
819
944
|
|
|
945
|
+
== Round-trip XML Testing
|
|
946
|
+
|
|
947
|
+
Moxml includes comprehensive round-trip testing to verify that XML documents remain
|
|
948
|
+
semantically equivalent when parsed and serialized across different adapters.
|
|
949
|
+
|
|
950
|
+
=== Purpose
|
|
951
|
+
|
|
952
|
+
Round-trip testing ensures:
|
|
953
|
+
|
|
954
|
+
* **Cross-adapter compatibility** - XML parsed with one adapter (e.g., Nokogiri) can be
|
|
955
|
+
serialized and re-parsed with another adapter (e.g., Oga) while preserving content
|
|
956
|
+
* **Structural fidelity** - Element names, attributes, and document structure are maintained
|
|
957
|
+
* **Content preservation** - Text content and entity references survive multiple parse/serialize cycles
|
|
958
|
+
* **Double round-trip verification** - Source → Target → Source sequences produce semantically
|
|
959
|
+
equivalent output
|
|
960
|
+
|
|
961
|
+
=== Test Fixtures
|
|
962
|
+
|
|
963
|
+
Round-trip tests use real-world XML documents organized into collections:
|
|
964
|
+
|
|
965
|
+
**rfcxml** - IETF RFC documents in XML format. These provide complex, standards-compliant
|
|
966
|
+
XML with mixed content, namespaces, and attributes. The collection includes:
|
|
967
|
+
|
|
968
|
+
* Large documents (500KB-2.4MB) for stress testing
|
|
969
|
+
* Rich metadata and cross-references
|
|
970
|
+
* Various XML schema patterns
|
|
971
|
+
|
|
972
|
+
**metanorma** - Metanorma document processing XML. These test:
|
|
973
|
+
|
|
974
|
+
* Document structure preservation
|
|
975
|
+
* Nested elements and complex hierarchies
|
|
976
|
+
* Standard XML vocabularies
|
|
977
|
+
|
|
978
|
+
**niso-jats** - NISO Journal Article Tag Suite XML. These provide:
|
|
979
|
+
|
|
980
|
+
* Scholarly publishing XML schemas
|
|
981
|
+
* Rich bibliographic metadata
|
|
982
|
+
* Mixed content models
|
|
983
|
+
|
|
984
|
+
=== Running Round-trip Tests
|
|
985
|
+
|
|
986
|
+
[source,bash]
|
|
987
|
+
----
|
|
988
|
+
# Run all round-trip tests
|
|
989
|
+
bundle exec rake spec:consistency
|
|
990
|
+
|
|
991
|
+
# Exclude REXML for larger fixtures (faster, REXML is pure Ruby)
|
|
992
|
+
MOXML_ROUNDTRIP_REXML_MAX_SIZE=0 bundle exec rake spec:consistency
|
|
993
|
+
|
|
994
|
+
# Adjust the per-example timeout (default: 120 seconds)
|
|
995
|
+
MOXML_ROUNDTRIP_TIMEOUT=300 bundle exec rake spec:consistency
|
|
996
|
+
----
|
|
997
|
+
|
|
998
|
+
REXML is a pure Ruby XML parser and becomes very slow on large documents (500KB+).
|
|
999
|
+
By default, REXML adapter pairs are skipped for fixtures exceeding 500KB. All other
|
|
1000
|
+
adapters (Nokogiri, Oga, Ox) are tested against every fixture.
|
|
1001
|
+
|
|
1002
|
+
=== Test Mechanics
|
|
1003
|
+
|
|
1004
|
+
For each fixture, tests run across all adapter pairs (4 adapters = 12 combinations):
|
|
1005
|
+
|
|
1006
|
+
1. Parse with source adapter
|
|
1007
|
+
2. Serialize to XML string
|
|
1008
|
+
3. Parse serialized output with target adapter
|
|
1009
|
+
4. Compare semantic equivalence (element names, attributes, text content)
|
|
1010
|
+
|
|
1011
|
+
A "double round-trip" test additionally verifies: Source → Target → Source → Target
|
|
1012
|
+
produces consistent results.
|
|
1013
|
+
|
|
1014
|
+
NOTE: REXML is excluded from adapter pairs for fixtures larger than 500KB (configurable
|
|
1015
|
+
via `MOXML_ROUNDTRIP_REXML_MAX_SIZE`). This is because REXML is pure Ruby and cannot
|
|
1016
|
+
parse large XML documents in a practical timeframe. A per-example timeout
|
|
1017
|
+
(`MOXML_ROUNDTRIP_TIMEOUT`, default 120s) prevents tests from hanging indefinitely.
|
|
1018
|
+
|
|
1019
|
+
=== Ox Adapter Element Ordering Caveat
|
|
1020
|
+
|
|
1021
|
+
The Ox adapter produces elements in a different order than other adapters for certain
|
|
1022
|
+
fixtures with complex nested structures (e.g., `element_citation.xml`,
|
|
1023
|
+
`collection1nested.xml`, `pnas_sample.xml`). This causes the `elements_with_attributes`
|
|
1024
|
+
comparison to fail with "Array length mismatch" even though the semantic equivalence
|
|
1025
|
+
check (double round-trip) passes.
|
|
1026
|
+
|
|
1027
|
+
Round-trip tests automatically skip the `elements_with_attributes` comparison for these
|
|
1028
|
+
known Ox ordering issues. The `ruby-versions` CI job tests only Nokogiri and Oga adapters;
|
|
1029
|
+
the `nokogiri-ox` and `nokogiri-rexml` CI jobs test Ox and REXML respectively but are
|
|
1030
|
+
marked as **experimental** since these adapters lack full XML feature support:
|
|
1031
|
+
|
|
1032
|
+
* **Ox**: Lacks proper namespace support, XPath with predicates, and uses a custom
|
|
1033
|
+
`locate()` method instead of standard XPath
|
|
1034
|
+
* **REXML**: Pure Ruby, exponential time complexity with document size, impractical for
|
|
1035
|
+
documents over ~500KB
|
|
1036
|
+
|
|
1037
|
+
For production use, prefer Nokogiri or Oga which provide complete XML conformance.
|
|
1038
|
+
|
|
1039
|
+
To run tests with a specific adapter set locally:
|
|
1040
|
+
|
|
1041
|
+
[source,bash]
|
|
1042
|
+
----
|
|
1043
|
+
# Nokogiri + Oga only (fast, full test suite)
|
|
1044
|
+
MOXML_ROUNDTRIP_ADAPTERS=nokogiri,oga bundle exec rspec spec/consistency/ --tag round_trip
|
|
1045
|
+
|
|
1046
|
+
# Nokogiri × Ox only (experimental)
|
|
1047
|
+
MOXML_ROUNDTRIP_ADAPTERS=nokogiri,ox MOXML_ROUNDTRIP_TIMEOUT=300 bundle exec rspec spec/consistency/ --tag round_trip
|
|
1048
|
+
|
|
1049
|
+
# Nokogiri × REXML only (experimental, small fixtures due to exponential complexity)
|
|
1050
|
+
MOXML_ROUNDTRIP_ADAPTERS=nokogiri,rexml MOXML_ROUNDTRIP_TIMEOUT=300 MOXML_ROUNDTRIP_REXML_MAX_SIZE=50000 bundle exec rspec spec/consistency/ --tag round_trip
|
|
1051
|
+
----
|
|
1052
|
+
|
|
1053
|
+
=== Why Semantic Equivalence?
|
|
1054
|
+
|
|
1055
|
+
While a pure round-trip test with raw XML comparison would be ideal, different XML adapters
|
|
1056
|
+
have fundamentally different philosophies for handling:
|
|
1057
|
+
|
|
1058
|
+
* **Element ordering** - Some preserve document order, others sort alphabetically
|
|
1059
|
+
* **Whitespace handling** - Some normalize spaces, others preserve exactly
|
|
1060
|
+
* **Attribute representation** - Different data structures for the same attributes
|
|
1061
|
+
* **Text extraction** - Varying approaches to concatenating text content
|
|
1062
|
+
|
|
1063
|
+
Instead of raw comparison, Moxml implements semantic equivalence testing that focuses on
|
|
1064
|
+
meaningful XML structure and content:
|
|
1065
|
+
|
|
1066
|
+
[source,ruby]
|
|
1067
|
+
----
|
|
1068
|
+
# Element name must match
|
|
1069
|
+
expect(target_element.name).to eq(source_element.name)
|
|
1070
|
+
|
|
1071
|
+
# Attributes must be semantically equivalent
|
|
1072
|
+
expect(target_attributes).to eq(source_attributes)
|
|
1073
|
+
|
|
1074
|
+
# Text content must be preserved (whitespace-normalized)
|
|
1075
|
+
expect(normalized_text(target)).to eq(normalized_text(source))
|
|
1076
|
+
|
|
1077
|
+
# Document structure (element count) must match
|
|
1078
|
+
expect(doc.xpath("//*").size).to eq(original.xpath("//*").size)
|
|
1079
|
+
----
|
|
1080
|
+
|
|
1081
|
+
This approach tolerates adapter-specific serialization differences while ensuring
|
|
1082
|
+
the actual XML content remains intact.
|
|
1083
|
+
|
|
1084
|
+
|
|
820
1085
|
== Development and testing
|
|
821
1086
|
|
|
822
|
-
For complete information on development setup, testing strategies, benchmarking,
|
|
1087
|
+
For complete information on development setup, testing strategies, benchmarking,
|
|
1088
|
+
and coverage reporting, see the
|
|
1089
|
+
link:docs/_guides/development-testing.adoc[Development and Testing Guide].
|
|
823
1090
|
|
|
824
1091
|
== Contributing
|
|
825
1092
|
|
data/Rakefile
CHANGED
|
@@ -30,6 +30,17 @@ namespace :spec do
|
|
|
30
30
|
t.pattern = "spec/consistency/**/*_spec.rb"
|
|
31
31
|
end
|
|
32
32
|
|
|
33
|
+
namespace :consistency do
|
|
34
|
+
desc "Run round-trip tests for a specific fixture category (CATEGORIES=metanorma,rfcxml,niso-jats)"
|
|
35
|
+
task :by_category do
|
|
36
|
+
categories = ENV.fetch("CATEGORIES", "").split(",").map(&:strip)
|
|
37
|
+
abort "Usage: CATEGORIES=metanorma,rfcxml rake spec:consistency:by_category" if categories.empty?
|
|
38
|
+
|
|
39
|
+
include_filters = categories.map { |c| "--tag fixture_category:#{c}" }.join(" ")
|
|
40
|
+
sh "bundle exec rspec spec/consistency/ --tag round_trip #{include_filters}"
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
33
44
|
desc "Run example tests"
|
|
34
45
|
RSpec::Core::RakeTask.new(:examples) do |t|
|
|
35
46
|
t.pattern = "spec/examples/**/*_spec.rb"
|