moxml 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/docs.yml +1 -1
  3. data/.github/workflows/rake.yml +16 -13
  4. data/.github/workflows/release.yml +1 -0
  5. data/.github/workflows/round-trip.yml +74 -0
  6. data/.gitignore +1 -0
  7. data/.rubocop.yml +1 -0
  8. data/.rubocop_todo.yml +160 -38
  9. data/Gemfile +2 -1
  10. data/README.adoc +236 -0
  11. data/Rakefile +11 -0
  12. data/data/w3c_entities.json +2131 -0
  13. data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
  14. data/docs/_pages/adapters/ox.adoc +30 -0
  15. data/docs/_pages/configuration.adoc +43 -0
  16. data/docs/_pages/node-api-reference.adoc +35 -0
  17. data/docs/_tutorials/namespace-handling.adoc +21 -0
  18. data/examples/rss_parser/rss_parser.rb +1 -3
  19. data/lib/moxml/adapter/base.rb +26 -2
  20. data/lib/moxml/adapter/headed_ox.rb +5 -4
  21. data/lib/moxml/adapter/libxml.rb +3 -2
  22. data/lib/moxml/adapter/nokogiri.rb +16 -3
  23. data/lib/moxml/adapter/oga.rb +124 -20
  24. data/lib/moxml/adapter/ox.rb +4 -3
  25. data/lib/moxml/adapter/rexml.rb +41 -7
  26. data/lib/moxml/builder.rb +6 -0
  27. data/lib/moxml/config.rb +52 -1
  28. data/lib/moxml/context.rb +21 -2
  29. data/lib/moxml/document.rb +6 -1
  30. data/lib/moxml/document_builder.rb +45 -1
  31. data/lib/moxml/element.rb +4 -3
  32. data/lib/moxml/entity_reference.rb +29 -0
  33. data/lib/moxml/entity_registry.rb +278 -0
  34. data/lib/moxml/node.rb +10 -8
  35. data/lib/moxml/node_set.rb +10 -6
  36. data/lib/moxml/version.rb +1 -1
  37. data/lib/moxml/xml_utils.rb +25 -2
  38. data/lib/moxml.rb +1 -0
  39. data/spec/consistency/README.md +3 -1
  40. data/spec/consistency/round_trip_spec.rb +479 -0
  41. data/spec/examples/readme_examples_spec.rb +1 -1
  42. data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
  43. data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
  44. data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
  45. data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
  46. data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
  47. data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
  48. data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
  49. data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
  50. data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
  51. data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
  52. data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
  53. data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
  54. data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
  55. data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
  56. data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
  57. data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
  58. data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
  59. data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
  60. data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
  61. data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
  62. data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
  63. data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
  64. data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
  65. data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
  66. data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
  67. data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
  68. data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
  69. data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
  70. data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
  71. data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
  72. data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
  73. data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
  74. data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
  75. data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
  76. data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
  77. data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
  78. data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
  79. data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
  80. data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
  81. data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
  82. data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
  83. data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
  84. data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
  85. data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
  86. data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
  87. data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
  88. data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
  89. data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
  90. data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
  91. data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
  92. data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
  93. data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
  94. data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
  95. data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
  96. data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
  97. data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
  98. data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
  99. data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
  100. data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
  101. data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
  102. data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
  103. data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
  104. data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
  105. data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
  106. data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
  107. data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
  108. data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
  109. data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
  110. data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
  111. data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
  112. data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
  113. data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
  114. data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
  115. data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
  116. data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
  117. data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
  118. data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
  119. data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
  120. data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
  121. data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
  122. data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
  123. data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
  124. data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
  125. data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
  126. data/spec/integration/shared_examples/node_wrappers/element_behavior.rb +14 -0
  127. data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
  128. data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
  129. data/spec/integration/w3c_namespace_spec.rb +69 -0
  130. data/spec/moxml/adapter/libxml_spec.rb +7 -1
  131. data/spec/moxml/adapter/oga_spec.rb +92 -0
  132. data/spec/moxml/config_spec.rb +75 -0
  133. data/spec/moxml/entity_registry_spec.rb +184 -0
  134. data/spec/moxml/error_spec.rb +2 -2
  135. data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
  136. data/spec/moxml/xpath/axes_spec.rb +3 -4
  137. data/spec/performance/xpath_benchmark_spec.rb +6 -54
  138. data/spec/support/w3c_namespace_helpers.rb +41 -0
  139. data/spec/unit/rexml_isolated_test.rb +271 -0
  140. metadata +98 -2
@@ -0,0 +1,102 @@
1
+ # Entity Support for lutaml-model Team
2
+
3
+ ## Overview
4
+
5
+ Moxml now supports entity restoration during parsing. This feature ensures that XML entity references (like `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) are preserved as `EntityReference` nodes rather than being resolved to their character values during parsing.
6
+
7
+ ## Key Concept: Entity Restoration
8
+
9
+ By default, XML parsers resolve entities during parsing:
10
+ - Input: `<root>foo&amp;bar</root>`
11
+ - Default behavior: Text node contains `foo&bar` (resolved `&`)
12
+ - With entity restoration: the element contains Text node `foo` + EntityReference `&amp;` + Text node `bar`
13
+
14
+ ## Enabling Entity Restoration
15
+
16
+ ### Option 1: Per-Context Configuration
17
+
18
+ ```ruby
19
+ context = Moxml.new(:nokogiri, restore_entities: true)
20
+ doc = context.parse('<root>foo&amp;bar</root>')
21
+ # doc.to_xml will preserve &amp; as EntityReference
22
+ ```
23
+
24
+ ### Option 2: Global Configuration
25
+
26
+ ```ruby
27
+ Moxml.configure do |config|
28
+ config.restore_entities = true
29
+ end
30
+ ```
31
+
32
+ ## Preloading Entity Sets
33
+
34
+ You can preload standard entity sets (HTML5, MathML, ISO) for faster entity resolution:
35
+
36
+ ```ruby
37
+ context = Moxml.new(:nokogiri,
38
+ restore_entities: true,
39
+ preload_entity_sets: [:html5, :mathml]
40
+ )
41
+ ```
42
+
43
+ ## W3C XML Core WG Compliance
44
+
45
+ Per W3C XML Core WG guidance:
46
+ - Standard XML entities (`amp`, `lt`, `gt`, `quot`, `apos`) are implicitly declared per XML spec
47
+ - The `EntityRegistry` class tracks all known entities and their Unicode codepoints
48
+ - Entity names are preserved through round-trip serialization
49
+
50
+ ## What lutaml-model Needs to Know
51
+
52
+ ### 1. Document Structure with Entities
53
+
54
+ When entity restoration is enabled, documents containing entities will have mixed node types:
55
+
56
+ ```
57
+ Document
58
+ └── Element: root
59
+ ├── Text: "foo"
60
+ ├── EntityReference: "amp" # Represents &
61
+ └── Text: "bar"
62
+ ```
63
+
64
+ ### 2. Serialization
65
+
66
+ `doc.to_xml` will serialize EntityReference nodes as proper XML entity syntax:
67
+ - `EntityReference("amp")` → `&amp;`
68
+ - `EntityReference("lt")` → `&lt;`
69
+ - etc.
70
+
71
+ ### 3. XPath Queries
72
+
73
+ EntityReference nodes participate in XPath queries like any other node. You can query for them specifically if needed.
74
+
75
+ ### 4. Configuration Inheritance
76
+
77
+ When using `Moxml::Context`, the entity restoration setting is preserved through document operations. However, when creating new contexts, you need to set the option explicitly.
78
+
79
+ ## Example Usage in lutaml-model
80
+
81
+ ```ruby
82
+ # Parse XML with entities preserved
83
+ context = Moxml.new(:nokogiri, restore_entities: true)
84
+ doc = context.parse(your_xml_string)
85
+
86
+ # Serialize back - entities are preserved
87
+ output = doc.to_xml
88
+ ```
89
+
90
+ ## Testing Considerations
91
+
92
+ When writing tests for models that handle XML with entities:
93
+ 1. Enable `restore_entities: true` in your test context
94
+ 2. Verify that EntityReference nodes are created for entities in text
95
+ 3. Test round-trip: parse → serialize → parse should preserve entities
96
+
97
+ ## Files of Interest
98
+
99
+ - `lib/moxml/entity_registry.rb` - Entity definitions and lookup
100
+ - `lib/moxml/config.rb` - Configuration options
101
+ - `lib/moxml/document_builder.rb` - Entity restoration logic
102
+ - `lib/moxml/entity_reference.rb` - EntityReference node class
@@ -48,7 +48,37 @@ doc.xpath("//book").find { |book| book["id"] == "123" }
48
48
  IMPORTANT: For complete XPath 1.0 specification with zero limitations today, use
49
49
  Nokogiri or Oga adapters.
50
50
 
51
+ ==== Element Ordering in Round-Trip Tests
51
52
 
53
+ [IMPORTANT]
54
+ .Round-trip tests for Ox are experimental
55
+ --
56
+ Ox round-trip tests (`nokogiri-ox` CI job) are **experimental** because Ox lacks full XML
57
+ conformance:
58
+
59
+ * **Namespace support**: Ox does not properly handle namespaced elements in XPath queries
60
+ * **XPath limitations**: Uses `locate()` instead of standard XPath; no attribute value
61
+ predicates, no logical operators, no position predicates
62
+ * **Element ordering**: Ox produces elements in a different order than Nokogiri/Oga for
63
+ certain complex fixtures
64
+
65
+ For production use with complex XML, prefer Nokogiri or Oga adapters.
66
+ --
67
+
68
+ The Ox adapter produces elements in a different order than other adapters for certain
69
+ fixtures with complex nested structures. In round-trip tests, this causes the
70
+ `elements_with_attributes` array length comparison to fail, even though the semantic
71
+ equivalence check (double round-trip) passes.
72
+
73
+ Known affected fixtures:
74
+
75
+ * `niso-jats/element_citation.xml`
76
+ * `niso-jats/pnas_sample.xml`
77
+ * `metanorma/collection1nested.xml`
78
+
79
+ This is a known limitation tracked in the round-trip test suite via
80
+ `KNOWN_ELEMENT_ORDERING_ISSUES`. The `elements_with_attributes` comparison is
81
+ automatically skipped for these Ox adapter pairs.
52
82
 
53
83
  See also:
54
84
 
@@ -97,6 +97,49 @@ context.config.default_encoding = 'UTF-16'
97
97
 
98
98
  **Default:** `"UTF-8"`
99
99
 
100
+ ==== Namespace URI validation mode
101
+
102
+ Control how strictly namespace URIs are validated:
103
+
104
+ [source,ruby]
105
+ ----
106
+ # Strict mode (default) — validates namespace URIs against RFC 3986
107
+ context.config.namespace_uri_mode = :strict
108
+ doc = context.parse(xml) # Raises ValidationError for invalid URIs
109
+
110
+ # Lenient mode — accepts any string as a namespace URI
111
+ context.config.namespace_uri_mode = :lenient
112
+ doc = context.parse(xml) # Accepts non-standard namespace URIs
113
+ ----
114
+
115
+ **Default:** `:strict`
116
+
117
+ **Modes:**
118
+
119
+ `:strict`:: Validates namespace URIs against the
120
+ https://www.rfc-editor.org/rfc/rfc3986[RFC 3986] URI-reference specification, as
121
+ required by https://www.w3.org/TR/xml-names/[Namespaces in XML]. Invalid URIs
122
+ raise a `Moxml::ValidationError`. This is the recommended mode for
123
+ standards-compliant XML processing.
124
+
125
+ `:lenient`:: Accepts any string as a namespace URI, only rejecting strings
126
+ containing XML-invalid control characters (`0x00`-`0x08`, `0x0B`, `0x0C`,
127
+ `0x0E`-`0x1F`). Use this mode when processing XML documents that use
128
+ non-standard namespace identifiers that are not valid URI references (for example, free-form strings containing spaces).
129
+
130
+ **Example:**
131
+
132
+ [source,ruby]
133
+ ----
134
+ # Process documents with non-standard namespace URIs
135
+ context = Moxml.new do |config|
136
+ config.namespace_uri_mode = :lenient
137
+ end
138
+
139
+ xml = '<root xmlns:ex="not a valid URI but accepted in lenient mode"/>'
140
+ doc = context.parse(xml) # Parses successfully
141
+ ----
142
+
100
143
  === Context switching
101
144
 
102
145
  Use different configurations for different tasks:
@@ -32,6 +32,10 @@ All node types in Moxml support the `#identifier` method, which returns the prim
32
32
  | `nil` (no identifier)
33
33
  | `nil`
34
34
 
35
+ | EntityReference
36
+ | The entity name
37
+ | `"nbsp"`, `"copy"`
38
+
35
39
  | Declaration
36
40
  | `nil` (no identifier)
37
41
  | `nil`
@@ -137,3 +141,34 @@ doctype.identifier # => "html"
137
141
  * `identifier` - Returns the primary identifier (same as `name`)
138
142
 
139
143
  All Doctype accessor methods are fully implemented across all 6 adapters.
144
+
145
+ === EntityReference nodes
146
+
147
+ EntityReference nodes represent XML entity references like `&nbsp;`, `&copy;`, or custom entities declared in the DOCTYPE.
148
+
149
+ [source,ruby]
150
+ ----
151
+ # Create programmatically
152
+ ref = doc.create_entity_reference('nbsp')
153
+ element.add_child(ref)
154
+
155
+ # Or via builder
156
+ doc = Moxml::Builder.new(Moxml.new).build do
157
+ element 'text' do
158
+ entity_reference 'ndash'
159
+ end
160
+ end
161
+ ----
162
+
163
+ *Available methods:*
164
+
165
+ * `name` - Returns the entity name (e.g., `"nbsp"`, `"copy"`)
166
+ * `identifier` - Returns the primary identifier (same as `name`)
167
+ * `text` - Returns an empty string (`""`) since an entity reference has no text content of its own
168
+ * `content` - Returns an empty string (the entity is identified by its name, not by content)
169
+ * `to_xml` - Returns the entity syntax (e.g., `"&nbsp;"`)
170
+
171
+ *Adapter notes:*
172
+
173
+ * *Nokogiri*: Preserves custom declared entities as `EntityReference` nodes
174
+ * *Ox, Oga*: These adapters resolve entities during parsing and do not expose entity reference nodes. Use Nokogiri or LibXML for entity preservation.
@@ -276,6 +276,27 @@ puts all_children.length # => 2
276
276
  * link:../pages/adapters/rexml[REXML] - ⚠️ No namespace XPath
277
277
  * link:../pages/adapters/ox[Ox] - ⚠️ Basic only, no XPath
278
278
 
279
+ === Namespace URI validation
280
+
281
+ By default, Moxml validates namespace URIs against
282
+ https://www.rfc-editor.org/rfc/rfc3986[RFC 3986] (strict mode). To accept
283
+ non-standard namespace identifiers, use lenient mode:
284
+
285
+ [source,ruby]
286
+ ----
287
+ # Strict mode (default) — validates URIs per RFC 3986
288
+ context = Moxml.new do |config|
289
+ config.namespace_uri_mode = :strict
290
+ end
291
+
292
+ # Lenient mode — accepts any string as namespace URI
293
+ context = Moxml.new do |config|
294
+ config.namespace_uri_mode = :lenient
295
+ end
296
+ ----
297
+
298
+ See link:../pages/configuration[Configuration] for details.
299
+
279
300
  === Troubleshooting
280
301
 
281
302
  **Namespace XPath not working:**
@@ -179,9 +179,7 @@ if __FILE__ == $0
179
179
 
180
180
  # Count categories
181
181
  all_categories = articles.flat_map(&:categories)
182
- category_counts = all_categories.each_with_object(Hash.new(0)) do |cat, counts|
183
- counts[cat] += 1
184
- end
182
+ category_counts = all_categories.tally
185
183
  puts "Categories: #{category_counts.map do |cat, count|
186
184
  "#{cat} (#{count})"
187
185
  end.join(', ')}"
@@ -98,12 +98,24 @@ module Moxml
98
98
  create_native_declaration(version, encoding, standalone)
99
99
  end
100
100
 
101
- def create_namespace(element, prefix, uri)
101
+ def create_namespace(element, prefix, uri, namespace_uri_mode: :strict)
102
+ if prefix && uri.to_s.empty?
103
+ raise NamespaceError.new(
104
+ "Prefixed namespace declaration cannot have an empty URI",
105
+ prefix: prefix,
106
+ uri: uri,
107
+ )
108
+ end
102
109
  validate_prefix(prefix) if prefix
103
- validate_uri(uri)
110
+ validate_uri(uri, mode: namespace_uri_mode)
104
111
  create_native_namespace(element, prefix, uri)
105
112
  end
106
113
 
114
+ def create_entity_reference(name)
115
+ validate_entity_reference_name(name)
116
+ create_native_entity_reference(name)
117
+ end
118
+
107
119
  def set_attribute_name(attribute, name)
108
120
  attribute.name = name
109
121
  end
@@ -112,6 +124,10 @@ module Moxml
112
124
  attribute.value = value
113
125
  end
114
126
 
127
+ def entity_reference_name(node)
128
+ node.name
129
+ end
130
+
115
131
  def duplicate_node(node)
116
132
  node.dup
117
133
  end
@@ -193,6 +209,14 @@ module Moxml
193
209
  adapter: name,
194
210
  )
195
211
  end
212
+
213
+ def create_native_entity_reference(_name)
214
+ raise Moxml::NotImplementedError.new(
215
+ "create_native_entity_reference not implemented",
216
+ feature: "create_native_entity_reference",
217
+ adapter: name,
218
+ )
219
+ end
196
220
  end
197
221
  end
198
222
  end
@@ -26,7 +26,7 @@ module Moxml
26
26
  class HeadedOx < Ox
27
27
  class << self
28
28
  # Override parse to use HeadedOx context instead of Ox context
29
- def parse(xml, _options = {})
29
+ def parse(xml, _options = {}, _context = nil)
30
30
  native_doc = begin
31
31
  result = ::Ox.parse(xml)
32
32
 
@@ -45,8 +45,9 @@ module Moxml
45
45
  )
46
46
  end
47
47
 
48
- # Use :headed_ox context instead of :ox
49
- DocumentBuilder.new(Context.new(:headed_ox)).build(native_doc)
48
+ # Use provided context if available, otherwise create new one
49
+ ctx = _context || Context.new(:headed_ox)
50
+ DocumentBuilder.new(ctx).build(native_doc)
50
51
  end
51
52
 
52
53
  # Execute XPath query using Moxml's XPath engine
@@ -66,7 +67,7 @@ module Moxml
66
67
  ctx = Context.new(:headed_ox)
67
68
 
68
69
  # Wrap the native node - don't rebuild the whole document
69
- node = Node.wrap(node, ctx)
70
+ node = Moxml::Node.wrap(node, ctx)
70
71
  end
71
72
 
72
73
  # Parse XPath expression to AST
@@ -48,7 +48,7 @@ module Moxml
48
48
  doc.root = element
49
49
  end
50
50
 
51
- def parse(xml, options = {})
51
+ def parse(xml, options = {}, _context = nil)
52
52
  # LibXML doesn't preserve DOCTYPE during parsing, so we need to extract it manually
53
53
  xml_string = if xml.is_a?(String)
54
54
  xml
@@ -94,7 +94,8 @@ module Moxml
94
94
  native_doc.instance_variable_set(:@moxml_doctype, doctype_wrapper)
95
95
  end
96
96
 
97
- DocumentBuilder.new(Context.new(:libxml)).build(native_doc)
97
+ ctx = _context || Context.new(:libxml)
98
+ DocumentBuilder.new(ctx).build(native_doc)
98
99
  end
99
100
 
100
101
  # SAX parsing implementation for LibXML
@@ -11,7 +11,7 @@ module Moxml
11
11
  doc.root = element
12
12
  end
13
13
 
14
- def parse(xml, options = {})
14
+ def parse(xml, options = {}, _context = nil)
15
15
  native_doc = begin
16
16
  if options[:fragment]
17
17
  ::Nokogiri::XML::DocumentFragment.parse(xml) do |config|
@@ -29,7 +29,9 @@ module Moxml
29
29
  column: e.column)
30
30
  end
31
31
 
32
- DocumentBuilder.new(Context.new(:nokogiri)).build(native_doc)
32
+ # Use provided context if available, otherwise create new one
33
+ ctx = _context || Context.new(:nokogiri)
34
+ DocumentBuilder.new(ctx).build(native_doc)
33
35
  end
34
36
 
35
37
  # SAX parsing implementation for Nokogiri
@@ -104,6 +106,14 @@ module Moxml
104
106
  )
105
107
  end
106
108
 
109
+ def create_native_entity_reference(name)
110
+ ::Nokogiri::XML::EntityReference.new(create_document, name)
111
+ end
112
+
113
+ def entity_reference_name(node)
114
+ node.name
115
+ end
116
+
107
117
  def declaration_attribute(declaration, attr_name)
108
118
  return nil unless declaration.content
109
119
 
@@ -150,6 +160,7 @@ module Moxml
150
160
  when ::Nokogiri::XML::ProcessingInstruction then :processing_instruction
151
161
  when ::Nokogiri::XML::Document, ::Nokogiri::XML::DocumentFragment then :document
152
162
  when ::Nokogiri::XML::DTD then :doctype
163
+ when ::Nokogiri::XML::EntityReference then :entity_reference
153
164
  else :unknown
154
165
  end
155
166
  end
@@ -277,7 +288,9 @@ module Moxml
277
288
  end
278
289
 
279
290
  def inner_text(node)
280
- text_children = node.children - node.element_children
291
+ text_children = node.children.reject do |c|
292
+ c.element? || c.comment?
293
+ end
281
294
  text_children.map(&:content).join
282
295
  end
283
296
 
@@ -9,17 +9,24 @@ module Moxml
9
9
  module Adapter
10
10
  class Oga < Base
11
11
  class << self
12
+ # Standard XML entities handled natively by parsers
13
+ STANDARD_XML_ENTITIES = %w[amp lt gt quot apos].freeze
14
+
12
15
  def set_root(doc, element)
13
16
  # Clear existing root element if any - Oga's NodeSet needs special handling
14
17
  # We need to manually remove elements since NodeSet doesn't support clear or delete_if
15
- elements_to_remove = doc.children.select { |child| child.is_a?(::Oga::XML::Element) }
18
+ elements_to_remove = doc.children.grep(::Oga::XML::Element)
16
19
  elements_to_remove.each { |elem| doc.children.delete(elem) }
17
20
  doc.children << element
18
21
  end
19
22
 
20
- def parse(xml, options = {})
23
+ def parse(xml, options = {}, _context = nil)
24
+ # Pre-process XML to convert named entities to marker form (\x01name;).
25
+ # Oga drops named entity references like &nbsp; during parsing.
26
+ processed_xml = preprocess_named_entities(xml)
27
+
21
28
  native_doc = begin
22
- ::Oga.parse_xml(xml, strict: options[:strict])
29
+ ::Oga.parse_xml(processed_xml, strict: options[:strict])
23
30
  rescue LL::ParserError => e
24
31
  raise Moxml::ParseError.new(
25
32
  e.message,
@@ -27,7 +34,8 @@ module Moxml
27
34
  )
28
35
  end
29
36
 
30
- DocumentBuilder.new(Context.new(:oga)).build(native_doc)
37
+ ctx = _context || Context.new(:oga)
38
+ DocumentBuilder.new(ctx).build(native_doc)
31
39
  end
32
40
 
33
41
  # SAX parsing implementation for Oga
@@ -61,7 +69,7 @@ module Moxml
61
69
  end
62
70
 
63
71
  def create_native_text(content)
64
- ::Oga::XML::Text.new(text: content)
72
+ ::Oga::XML::Text.new(text: encode_entity_markers(content))
65
73
  end
66
74
 
67
75
  def create_native_cdata(content)
@@ -74,7 +82,8 @@ module Moxml
74
82
 
75
83
  def create_native_doctype(name, external_id, system_id)
76
84
  ::Oga::XML::Doctype.new(
77
- name: name, public_id: external_id, system_id: system_id, type: "PUBLIC",
85
+ name: name, public_id: external_id, system_id: system_id,
86
+ type: external_id ? "PUBLIC" : "SYSTEM"
78
87
  )
79
88
  end
80
89
 
@@ -224,7 +233,7 @@ module Moxml
224
233
  attr = ::Oga::XML::Attribute.new(
225
234
  name: name.to_s,
226
235
  namespace_name: namespace_name,
227
- value: value.to_s,
236
+ value: encode_entity_markers(value.to_s),
228
237
  )
229
238
  element.add_attribute(attr)
230
239
  end
@@ -234,7 +243,7 @@ module Moxml
234
243
  end
235
244
 
236
245
  def get_attribute_value(element, name)
237
- element[name.to_s]
246
+ restore_entity_markers(element[name.to_s])
238
247
  end
239
248
 
240
249
  def remove_attribute(element, name)
@@ -303,24 +312,25 @@ module Moxml
303
312
  end
304
313
 
305
314
  def text_content(node)
306
- node.text
315
+ restore_entity_markers(node.text)
307
316
  end
308
317
 
309
318
  def inner_text(node)
310
- if node.respond_to?(:inner_text)
311
- node.inner_text
312
- else
313
- # Oga::XML::Text node for example
314
- node.text
315
- end
319
+ text = if node.respond_to?(:inner_text)
320
+ node.inner_text
321
+ else
322
+ # Oga::XML::Text node for example
323
+ node.text
324
+ end
325
+ restore_entity_markers(text)
316
326
  end
317
327
 
318
328
  def set_text_content(node, content)
329
+ encoded = encode_entity_markers(content)
319
330
  if node.respond_to?(:inner_text=)
320
- node.inner_text = content
331
+ node.inner_text = encoded
321
332
  else
322
- # Oga::XML::Text node for example
323
- node.text = content
333
+ node.text = encoded
324
334
  end
325
335
  end
326
336
 
@@ -402,6 +412,53 @@ module Moxml
402
412
  end
403
413
 
404
414
  def serialize(node, options = {})
415
+ output = serialize_without_entity_processing(node, options)
416
+ # Post-process: convert entity markers back to entity references
417
+ output.gsub(ENTITY_MARKER_REGEX, '&\1;')
418
+ end
419
+
420
+ # Shared entity name pattern (W3C: 2-31 chars, starts with alpha)
421
+ ENTITY_PATTERN = "([a-zA-Z][a-zA-Z0-9]{1,30})"
422
+
423
+ # Marker character for entity preservation through Oga's parser.
424
+ # U+0001 is preserved literally by Oga through parse/serialize cycle.
425
+ ENTITY_MARKER = "\x01"
426
+
427
+ # Regular expression for entity marker post-processing
428
+ ENTITY_MARKER_REGEX = /#{ENTITY_MARKER}#{ENTITY_PATTERN};/
429
+
430
+ # Simple entity-only regex with no nested quantifiers
431
+ ENTITY_REF_REGEX = /&#{ENTITY_PATTERN};/
432
+
433
+ private
434
+
435
+ # Convert &entity; back to \x01entity; for Oga text storage.
436
+ # Used when setting text content programmatically (not from parsing).
437
+ def encode_entity_markers(text)
438
+ return text unless text&.include?("&")
439
+
440
+ text.gsub(ENTITY_REF_REGEX) do
441
+ name = ::Regexp.last_match(1)
442
+
443
+ next ::Regexp.last_match(0) if STANDARD_XML_ENTITIES.include?(name)
444
+
445
+ codepoint = Moxml::EntityRegistry.default.codepoint_for_name(name)
446
+ if codepoint
447
+ "#{ENTITY_MARKER}#{name};"
448
+ else
449
+ ::Regexp.last_match(0)
450
+ end
451
+ end
452
+ end
453
+
454
+ # Convert \x01entity; back to &entity; for text accessors.
455
+ def restore_entity_markers(text)
456
+ return text unless text
457
+
458
+ text.gsub(ENTITY_MARKER_REGEX, '&\1;')
459
+ end
460
+
461
+ def serialize_without_entity_processing(node, options = {})
405
462
  # Oga's XmlGenerator doesn't support options directly
406
463
  # We need to handle declaration options ourselves for Document nodes
407
464
  if node.is_a?(::Oga::XML::Document)
@@ -416,7 +473,11 @@ module Moxml
416
473
  node.xml_declaration ? true : false
417
474
  end
418
475
 
419
- if should_include_decl && !node.xml_declaration
476
+ # Fix: Check if declaration already exists in children
477
+ # This prevents duplicate declarations when document already has one
478
+ has_existing_declaration = node.children.any?(::Oga::XML::XmlDeclaration)
479
+
480
+ if should_include_decl && !node.xml_declaration && !has_existing_declaration
420
481
  # Need to add declaration - create default one
421
482
  output = +""
422
483
  output << '<?xml version="1.0" encoding="UTF-8"?>'
@@ -450,7 +511,50 @@ module Moxml
450
511
  end
451
512
 
452
513
  # Default: use XmlGenerator
453
- ::Moxml::Adapter::CustomizedOga::XmlGenerator.new(node).to_xml
514
+ # But first check if we need to handle declaration specially
515
+ if node.is_a?(::Oga::XML::Document) && node.xml_declaration
516
+ # Document has declaration - use custom handling to avoid duplicates
517
+ output = +""
518
+
519
+ # Serialize children, but skip XmlDeclaration if it would cause duplication
520
+ node.children.each do |child|
521
+ # Check if this would cause duplication by seeing if we already have one in output
522
+ if child.is_a?(::Oga::XML::XmlDeclaration) && output.include?("<?xml")
523
+ next # Skip duplicate declaration
524
+ end
525
+
526
+ output << ::Moxml::Adapter::CustomizedOga::XmlGenerator.new(child).to_xml
527
+ end
528
+
529
+ output
530
+ else
531
+ # Normal case - use XmlGenerator directly
532
+ ::Moxml::Adapter::CustomizedOga::XmlGenerator.new(node).to_xml
533
+ end
534
+ end
535
+
536
+ # Pre-process XML to convert named entities to marker format.
537
+ # Oga drops named entity references like &nbsp; but preserves control chars.
538
+ # By converting known named entities to marker form (\x01name;), we can
539
+ # reconstruct them during serialization.
540
+ #
541
+ # @param xml [String, #to_s] The XML string to process
542
+ # @return [String] The XML with known named entities converted to marker form
543
+ def preprocess_named_entities(xml)
544
+ return xml unless xml.is_a?(String)
545
+
546
+ xml.gsub(ENTITY_REF_REGEX) do
547
+ name = Regexp.last_match(1)
548
+
549
+ next Regexp.last_match(0) if STANDARD_XML_ENTITIES.include?(name)
550
+
551
+ codepoint = Moxml::EntityRegistry.default.codepoint_for_name(name)
552
+ if codepoint
553
+ "#{ENTITY_MARKER}#{name};"
554
+ else
555
+ Regexp.last_match(0)
556
+ end
557
+ end
454
558
  end
455
559
  end
456
560
  end
@@ -17,7 +17,7 @@ module Moxml
17
17
  replace_children(doc, [element])
18
18
  end
19
19
 
20
- def parse(xml, _options = {})
20
+ def parse(xml, _options = {}, _context = nil)
21
21
  native_doc = begin
22
22
  result = ::Ox.parse(xml)
23
23
 
@@ -36,7 +36,8 @@ module Moxml
36
36
  )
37
37
  end
38
38
 
39
- DocumentBuilder.new(Context.new(:ox)).build(native_doc)
39
+ ctx = _context || Context.new(:ox)
40
+ DocumentBuilder.new(ctx).build(native_doc)
40
41
  end
41
42
 
42
43
  # SAX parsing implementation for Ox
@@ -452,7 +453,7 @@ module Moxml
452
453
  def inner_text(node)
453
454
  return "" unless node.respond_to?(:nodes)
454
455
 
455
- node.nodes.select { _1.is_a?(String) }.join
456
+ node.nodes.grep(String).join
456
457
  end
457
458
 
458
459
  def set_text_content(node, content)