moxml 0.1.16 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -0
- data/.rubocop_todo.yml +49 -133
- data/README.adoc +18 -0
- data/lib/moxml/adapter/base.rb +65 -8
- data/lib/moxml/adapter/headed_ox.rb +2 -1
- data/lib/moxml/adapter/libxml.rb +16 -3
- data/lib/moxml/adapter/nokogiri.rb +14 -4
- data/lib/moxml/adapter/oga.rb +26 -87
- data/lib/moxml/adapter/ox.rb +69 -19
- data/lib/moxml/adapter/rexml.rb +24 -3
- data/lib/moxml/attribute.rb +6 -0
- data/lib/moxml/element.rb +12 -8
- data/lib/moxml/node.rb +4 -1
- data/lib/moxml/text.rb +6 -0
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xpath/compiler.rb +40 -21
- data/lib/moxml/xpath/parser.rb +12 -7
- data/spec/integration/all_adapters_spec.rb +1 -0
- data/spec/integration/shared_examples/edge_cases.rb +0 -6
- data/spec/integration/shared_examples/entity_reference_whitespace.rb +122 -0
- data/spec/integration/shared_examples/node_wrappers/cdata_behavior.rb +0 -7
- data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +135 -0
- data/spec/integration/shared_examples/node_wrappers/node_behavior.rb +0 -3
- data/spec/moxml/adapter/entity_restoration_spec.rb +97 -0
- data/spec/moxml/builder_spec.rb +16 -1
- data/spec/moxml/entity_preservation_spec.rb +130 -0
- data/spec/moxml/entity_reference_spec.rb +114 -0
- data/spec/moxml/entity_registry_spec.rb +68 -0
- data/spec/moxml/xpath/axes_spec.rb +0 -1
- data/spec/moxml/xpath/compiler_spec.rb +0 -2
- metadata +6 -12
- data/TODO.remaining/1-entity-reference-adapter-support.md +0 -157
- data/TODO.remaining/2-entity-restoration-model-driven.md +0 -169
- data/TODO.remaining/3-entity-reference-test-coverage.md +0 -170
- data/TODO.remaining/4-lenient-entities-mode.md +0 -106
- data/TODO.remaining/5-fixture-integrity.md +0 -65
- data/TODO.remaining/6-ox-element-ordering-bug.md +0 -36
- data/TODO.remaining/7-headed-ox-limitations.md +0 -95
- data/TODO.remaining/8-xpath-predicate-gaps.md +0 -68
- data/TODO.remaining/9-cleanup-hygiene.md +0 -42
- data/TODO.remaining/README.md +0 -54
|
@@ -181,4 +181,72 @@ RSpec.describe Moxml::EntityRegistry do
|
|
|
181
181
|
expect(registry.load_all).to be(registry)
|
|
182
182
|
end
|
|
183
183
|
end
|
|
184
|
+
|
|
185
|
+
describe "#standard_entity?" do
|
|
186
|
+
it "returns true for the 5 standard XML entities" do
|
|
187
|
+
registry = described_class.new
|
|
188
|
+
expect(registry.standard_entity?(0x26)).to be true # amp
|
|
189
|
+
expect(registry.standard_entity?(0x3C)).to be true # lt
|
|
190
|
+
expect(registry.standard_entity?(0x3E)).to be true # gt
|
|
191
|
+
expect(registry.standard_entity?(0x22)).to be true # quot
|
|
192
|
+
expect(registry.standard_entity?(0x27)).to be true # apos
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
it "returns false for non-standard codepoints" do
|
|
196
|
+
registry = described_class.new
|
|
197
|
+
expect(registry.standard_entity?(0xA0)).to be false # nbsp
|
|
198
|
+
expect(registry.standard_entity?(0xA9)).to be false # copy
|
|
199
|
+
expect(registry.standard_entity?(0x30)).to be false # '0'
|
|
200
|
+
end
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
describe "#should_restore?" do
|
|
204
|
+
it "always restores the 5 standard XML entities regardless of config" do
|
|
205
|
+
registry = described_class.new
|
|
206
|
+
config = Moxml::Config.new(:nokogiri)
|
|
207
|
+
config.restore_entities = false
|
|
208
|
+
expect(registry.should_restore?(0x26, config: config)).to be true # amp
|
|
209
|
+
expect(registry.should_restore?(0x3C, config: config)).to be true # lt
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
it "restores non-standard entities when restore_entities is true and mode is lenient" do
|
|
213
|
+
registry = described_class.new
|
|
214
|
+
config = Moxml::Config.new(:nokogiri)
|
|
215
|
+
config.restore_entities = true
|
|
216
|
+
config.entity_restoration_mode = :lenient
|
|
217
|
+
expect(registry.should_restore?(0xA0, config: config)).to be true # nbsp
|
|
218
|
+
expect(registry.should_restore?(0xA9, config: config)).to be true # copy
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
it "does not restore non-standard entities when restore_entities is false" do
|
|
222
|
+
registry = described_class.new
|
|
223
|
+
config = Moxml::Config.new(:nokogiri)
|
|
224
|
+
config.restore_entities = false
|
|
225
|
+
expect(registry.should_restore?(0xA0, config: config)).to be false
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
it "returns false for codepoints not in the registry" do
|
|
229
|
+
registry = described_class.new(mode: :disabled)
|
|
230
|
+
config = Moxml::Config.new(:nokogiri)
|
|
231
|
+
config.restore_entities = true
|
|
232
|
+
expect(registry.should_restore?(0x30, config: config)).to be false # '0'
|
|
233
|
+
end
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
describe "#restorable_codepoints" do
|
|
237
|
+
it "returns the set of codepoints that could be restored" do
|
|
238
|
+
registry = described_class.new
|
|
239
|
+
codepoints = registry.restorable_codepoints
|
|
240
|
+
expect(codepoints).to be_a(Set)
|
|
241
|
+
expect(codepoints).to include(0x26) # amp
|
|
242
|
+
expect(codepoints).to include(0xA0) # nbsp
|
|
243
|
+
expect(codepoints.size).to be > 100
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
it "returns only standard codepoints for empty registry" do
|
|
247
|
+
registry = described_class.new(mode: :disabled)
|
|
248
|
+
codepoints = registry.restorable_codepoints
|
|
249
|
+
expect(codepoints).to eq(described_class::STANDARD_CODEPOINTS)
|
|
250
|
+
end
|
|
251
|
+
end
|
|
184
252
|
end
|
|
@@ -222,7 +222,6 @@ RSpec.describe "XPath Axes" do
|
|
|
222
222
|
end
|
|
223
223
|
|
|
224
224
|
it "combines attribute axis with wildcards" do
|
|
225
|
-
skip "HeadedOx limitation: Attribute wildcard (@*) not supported by XPath parser. See docs/_pages/headed-ox-limitations.adoc"
|
|
226
225
|
ast = Moxml::XPath::Parser.parse("//book/@*")
|
|
227
226
|
proc = Moxml::XPath::Compiler.compile_with_cache(ast)
|
|
228
227
|
result = proc.call(book_doc)
|
|
@@ -153,7 +153,6 @@ RSpec.describe Moxml::XPath::Compiler do
|
|
|
153
153
|
end
|
|
154
154
|
|
|
155
155
|
it "works with wildcards" do
|
|
156
|
-
skip "HeadedOx limitation: Wildcard count differs due to Ox's DOM structure. See docs/_pages/headed-ox-limitations.adoc"
|
|
157
156
|
ast = Moxml::XPath::Parser.parse("//*")
|
|
158
157
|
proc = described_class.compile_with_cache(ast)
|
|
159
158
|
result = proc.call(nested_doc)
|
|
@@ -189,7 +188,6 @@ RSpec.describe Moxml::XPath::Compiler do
|
|
|
189
188
|
end
|
|
190
189
|
|
|
191
190
|
it "works with wildcards" do
|
|
192
|
-
skip "HeadedOx limitation: Attribute wildcard (@*) not supported by XPath parser. See docs/_pages/headed-ox-limitations.adoc"
|
|
193
191
|
ast = Moxml::XPath::Parser.parse("/root/book/@*")
|
|
194
192
|
proc = described_class.compile_with_cache(ast)
|
|
195
193
|
result = proc.call(attr_doc)
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: moxml
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.16
|
|
4
|
+
version: 0.1.17
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-24 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: |
|
|
14
14
|
Moxml is a unified XML manipulation library that provides a common API
|
|
@@ -34,16 +34,6 @@ files:
|
|
|
34
34
|
- LICENSE.md
|
|
35
35
|
- README.adoc
|
|
36
36
|
- Rakefile
|
|
37
|
-
- TODO.remaining/1-entity-reference-adapter-support.md
|
|
38
|
-
- TODO.remaining/2-entity-restoration-model-driven.md
|
|
39
|
-
- TODO.remaining/3-entity-reference-test-coverage.md
|
|
40
|
-
- TODO.remaining/4-lenient-entities-mode.md
|
|
41
|
-
- TODO.remaining/5-fixture-integrity.md
|
|
42
|
-
- TODO.remaining/6-ox-element-ordering-bug.md
|
|
43
|
-
- TODO.remaining/7-headed-ox-limitations.md
|
|
44
|
-
- TODO.remaining/8-xpath-predicate-gaps.md
|
|
45
|
-
- TODO.remaining/9-cleanup-hygiene.md
|
|
46
|
-
- TODO.remaining/README.md
|
|
47
37
|
- benchmarks/.gitignore
|
|
48
38
|
- benchmarks/generate_report.rb
|
|
49
39
|
- bin/console
|
|
@@ -277,6 +267,7 @@ files:
|
|
|
277
267
|
- spec/integration/all_adapters_spec.rb
|
|
278
268
|
- spec/integration/headed_ox_integration_spec.rb
|
|
279
269
|
- spec/integration/shared_examples/edge_cases.rb
|
|
270
|
+
- spec/integration/shared_examples/entity_reference_whitespace.rb
|
|
280
271
|
- spec/integration/shared_examples/high_level/.gitkeep
|
|
281
272
|
- spec/integration/shared_examples/high_level/builder_behavior.rb
|
|
282
273
|
- spec/integration/shared_examples/high_level/context_behavior.rb
|
|
@@ -302,6 +293,7 @@ files:
|
|
|
302
293
|
- spec/moxml/adapter/.gitkeep
|
|
303
294
|
- spec/moxml/adapter/README.md
|
|
304
295
|
- spec/moxml/adapter/base_spec.rb
|
|
296
|
+
- spec/moxml/adapter/entity_restoration_spec.rb
|
|
305
297
|
- spec/moxml/adapter/headed_ox_spec.rb
|
|
306
298
|
- spec/moxml/adapter/libxml_spec.rb
|
|
307
299
|
- spec/moxml/adapter/nokogiri_spec.rb
|
|
@@ -325,6 +317,8 @@ files:
|
|
|
325
317
|
- spec/moxml/document_builder_spec.rb
|
|
326
318
|
- spec/moxml/document_spec.rb
|
|
327
319
|
- spec/moxml/element_spec.rb
|
|
320
|
+
- spec/moxml/entity_preservation_spec.rb
|
|
321
|
+
- spec/moxml/entity_reference_spec.rb
|
|
328
322
|
- spec/moxml/entity_registry_spec.rb
|
|
329
323
|
- spec/moxml/error_spec.rb
|
|
330
324
|
- spec/moxml/lazy_parse_spec.rb
|
|
@@ -1,157 +0,0 @@
|
|
|
1
|
-
# TODO 1: EntityReference Adapter Support for Ox, Oga, REXML, LibXML, HeadedOx
|
|
2
|
-
|
|
3
|
-
## Problem
|
|
4
|
-
|
|
5
|
-
Only the Nokogiri adapter implements `create_native_entity_reference` and maps
|
|
6
|
-
its native type to `:entity_reference` in `node_type`. The other 5 adapters
|
|
7
|
-
will raise `NotImplementedError` if `restore_entities` is enabled or if any
|
|
8
|
-
code path calls `create_entity_reference`. This makes the entire
|
|
9
|
-
EntityReference feature **non-functional** outside Nokogiri.
|
|
10
|
-
|
|
11
|
-
## Current State (verified)
|
|
12
|
-
|
|
13
|
-
| Adapter | `create_native_entity_reference` | `node_type` mapping | Serialization | Status |
|
|
14
|
-
|-----------|----------------------------------|---------------------|---------------|--------|
|
|
15
|
-
| Nokogiri | Done (`Nokogiri::XML::EntityReference.new`) | Done | Native | Working |
|
|
16
|
-
| Ox | Missing | Missing | Uses `Ox.dump` (C-level, won't handle custom types) | Broken |
|
|
17
|
-
| HeadedOx | Missing (inherits Ox) | Missing | Same as Ox | Broken |
|
|
18
|
-
| Oga | Missing | Missing | Uses `CustomizedOga::XmlGenerator` | Broken |
|
|
19
|
-
| REXML | Missing | Missing | Uses REXML's `write` | Broken |
|
|
20
|
-
| LibXML | Missing | Missing | Uses custom serializer with wrapper detection | Broken |
|
|
21
|
-
|
|
22
|
-
## Architecture
|
|
23
|
-
|
|
24
|
-
EntityReference follows the same pattern as other non-native node types in Moxml:
|
|
25
|
-
a **wrapper class** that represents what the underlying library cannot express natively.
|
|
26
|
-
|
|
27
|
-
Each adapter needs three things:
|
|
28
|
-
1. **Wrapper class** (`CustomizedXxx::EntityReference`) — holds the entity name
|
|
29
|
-
2. **`node_type` mapping** — so `Node.wrap` can create the correct Moxml type
|
|
30
|
-
3. **Serialization** — so `to_xml` outputs `&name;`
|
|
31
|
-
|
|
32
|
-
The existing pattern: `CustomizedOx::Text` extends `::Ox::Node`,
|
|
33
|
-
`CustomizedOx::Attribute` extends `::Ox::Node`. EntityReference should follow suit.
|
|
34
|
-
|
|
35
|
-
### Serialization Challenge for Ox
|
|
36
|
-
|
|
37
|
-
Ox's `serialize` calls `::Ox.dump(node)` which is C-level — it only handles
|
|
38
|
-
Ox native types. For EntityReference wrappers to survive serialization, we need
|
|
39
|
-
one of:
|
|
40
|
-
|
|
41
|
-
- **Option A**: Custom serialization in the adapter that walks the tree manually,
|
|
42
|
-
detecting EntityReference wrappers and emitting `&name;` directly.
|
|
43
|
-
- **Option B**: Convert EntityReferences to their text equivalent before calling
|
|
44
|
-
`Ox.dump`, restoring them in a post-processing step. This is fragile.
|
|
45
|
-
- **Option C**: Override `serialize` for Element nodes to handle children
|
|
46
|
-
individually, using `Ox.dump` for native children but handling wrappers
|
|
47
|
-
directly.
|
|
48
|
-
|
|
49
|
-
**Recommended: Option A** — it's how `CustomizedOga::XmlGenerator` already works
|
|
50
|
-
for Oga. A similar tree-walking serializer for Ox gives full control.
|
|
51
|
-
|
|
52
|
-
For LibXML, the existing serializer already checks `node.respond_to?(:to_xml)`
|
|
53
|
-
for wrapper classes, so adding an EntityReference wrapper with `to_xml` returning
|
|
54
|
-
`"&#{name};"` should integrate cleanly.
|
|
55
|
-
|
|
56
|
-
## Implementation Steps
|
|
57
|
-
|
|
58
|
-
### Ox Adapter
|
|
59
|
-
|
|
60
|
-
1. Create `lib/moxml/adapter/customized_ox/entity_reference.rb`:
|
|
61
|
-
```ruby
|
|
62
|
-
module Moxml::Adapter::CustomizedOx
|
|
63
|
-
class EntityReference < ::Ox::Node
|
|
64
|
-
attr_reader :name
|
|
65
|
-
|
|
66
|
-
def initialize(name)
|
|
67
|
-
@name = name
|
|
68
|
-
super() # Ox::Node requires no args or a value
|
|
69
|
-
end
|
|
70
|
-
|
|
71
|
-
def to_xml
|
|
72
|
-
"&#{@name};"
|
|
73
|
-
end
|
|
74
|
-
alias to_s to_xml
|
|
75
|
-
end
|
|
76
|
-
end
|
|
77
|
-
```
|
|
78
|
-
|
|
79
|
-
2. Add to `lib/moxml/adapter/ox.rb`:
|
|
80
|
-
- `create_native_entity_reference(name)` → `CustomizedOx::EntityReference.new(name)`
|
|
81
|
-
- `node_type`: add `when CustomizedOx::EntityReference then :entity_reference`
|
|
82
|
-
- `patch_node`: handle EntityReference wrapper in child list
|
|
83
|
-
- `entity_reference_name(node)`: return `node.name`
|
|
84
|
-
- Serialization: handle EntityReference children when walking the tree
|
|
85
|
-
|
|
86
|
-
3. Add to `lib/moxml/adapter/ox.rb` `unpatch_node`: return wrapper as-is
|
|
87
|
-
(it extends Ox::Node so it can stay in the tree)
|
|
88
|
-
|
|
89
|
-
### HeadedOx Adapter
|
|
90
|
-
|
|
91
|
-
HeadedOx inherits from Ox, so it gets Ox's EntityReference support
|
|
92
|
-
automatically once Ox is done. Verify that the XPath engine doesn't
|
|
93
|
-
break when encountering EntityReference nodes in the tree.
|
|
94
|
-
|
|
95
|
-
### Oga Adapter
|
|
96
|
-
|
|
97
|
-
1. Create `lib/moxml/adapter/customized_oga/entity_reference.rb`:
|
|
98
|
-
```ruby
|
|
99
|
-
module Moxml::Adapter::CustomizedOga
|
|
100
|
-
class EntityReference
|
|
101
|
-
attr_reader :name
|
|
102
|
-
|
|
103
|
-
def initialize(name)
|
|
104
|
-
@name = name
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
def to_xml
|
|
108
|
-
"&#{@name};"
|
|
109
|
-
end
|
|
110
|
-
end
|
|
111
|
-
end
|
|
112
|
-
```
|
|
113
|
-
|
|
114
|
-
2. Add to `lib/moxml/adapter/oga.rb`:
|
|
115
|
-
- `create_native_entity_reference(name)` → `CustomizedOga::EntityReference.new(name)`
|
|
116
|
-
- `node_type`: add `when CustomizedOga::EntityReference then :entity_reference`
|
|
117
|
-
- Update `CustomizedOga::XmlGenerator` to handle EntityReference children
|
|
118
|
-
- `entity_reference_name(node)`: return `node.name`
|
|
119
|
-
|
|
120
|
-
### REXML Adapter
|
|
121
|
-
|
|
122
|
-
1. Investigate: REXML has `REXML::Entity` and `REXML::EntityRef` classes.
|
|
123
|
-
Check if they can be used as native entity reference nodes, or if a
|
|
124
|
-
wrapper is needed.
|
|
125
|
-
|
|
126
|
-
2. Add to `lib/moxml/adapter/rexml.rb`:
|
|
127
|
-
- `create_native_entity_reference(name)` — native or wrapper
|
|
128
|
-
- `node_type`: add mapping
|
|
129
|
-
- `entity_reference_name(node)`
|
|
130
|
-
|
|
131
|
-
### LibXML Adapter
|
|
132
|
-
|
|
133
|
-
1. Investigate: LibXML Ruby has `LibXML::XML::Node::ENTITY_REF_NODE` constant
|
|
134
|
-
(value 5). Check if native entity reference nodes can be created.
|
|
135
|
-
|
|
136
|
-
2. Create `lib/moxml/adapter/customized_libxml/entity_reference.rb` if needed.
|
|
137
|
-
|
|
138
|
-
3. Add to `lib/moxml/adapter/libxml.rb`:
|
|
139
|
-
- `create_native_entity_reference(name)`
|
|
140
|
-
- `node_type`: add `ENTITY_REF_NODE` mapping or wrapper mapping
|
|
141
|
-
- `entity_reference_name(node)`
|
|
142
|
-
- The existing serializer already handles wrappers with `to_xml` —
|
|
143
|
-
verify EntityReference works in this path.
|
|
144
|
-
|
|
145
|
-
## Files to Create/Modify
|
|
146
|
-
|
|
147
|
-
### New Files
|
|
148
|
-
- `lib/moxml/adapter/customized_ox/entity_reference.rb`
|
|
149
|
-
- `lib/moxml/adapter/customized_oga/entity_reference.rb`
|
|
150
|
-
- Possibly: `lib/moxml/adapter/customized_libxml/entity_reference.rb`
|
|
151
|
-
|
|
152
|
-
### Modified Files
|
|
153
|
-
- `lib/moxml/adapter/ox.rb` — create_native_entity_reference, node_type, serialization
|
|
154
|
-
- `lib/moxml/adapter/oga.rb` — create_native_entity_reference, node_type, XmlGenerator
|
|
155
|
-
- `lib/moxml/adapter/rexml.rb` — create_native_entity_reference, node_type
|
|
156
|
-
- `lib/moxml/adapter/libxml.rb` — create_native_entity_reference, node_type
|
|
157
|
-
- `lib/moxml/adapter/headed_ox.rb` — verify inheritance works (likely no changes)
|
|
@@ -1,169 +0,0 @@
|
|
|
1
|
-
# TODO 2: Model-Driven Entity Restoration
|
|
2
|
-
|
|
3
|
-
## Problem
|
|
4
|
-
|
|
5
|
-
The `restore_entities` feature in `DocumentBuilder` is hardcoded to only handle
|
|
6
|
-
the 5 standard XML entities (amp, lt, gt, quot, apos). It ignores the
|
|
7
|
-
EntityRegistry entirely — despite EntityRegistry knowing 2125+ entities from
|
|
8
|
-
the W3C HTML/MathML set. This means non-standard entities like `&nbsp;`,
|
|
9
|
-
`&copy;`, `&mdash;` are never restored, which is the core round-trip problem
|
|
10
|
-
that motivated the entire entity feature.
|
|
11
|
-
|
|
12
|
-
Additionally, the restoration logic lives in DocumentBuilder with hardcoded
|
|
13
|
-
knowledge that belongs in the model layer.
|
|
14
|
-
|
|
15
|
-
## Current State (verified)
|
|
16
|
-
|
|
17
|
-
`lib/moxml/document_builder.rb:80-110` — `restore_entities_in_text`:
|
|
18
|
-
```ruby
|
|
19
|
-
entity_chars = {
|
|
20
|
-
"<" => "lt", ">" => "gt", "&" => "amp",
|
|
21
|
-
'"' => "quot", "'" => "apos",
|
|
22
|
-
}
|
|
23
|
-
```
|
|
24
|
-
|
|
25
|
-
This is a hardcoded lookup that duplicates knowledge already in EntityRegistry.
|
|
26
|
-
It only triggers for characters `<`, `>`, `&`, `"`, `'` — the regex guard
|
|
27
|
-
`/[<>&"']/` on line 73 prevents it from ever seeing characters like U+00A0
|
|
28
|
-
(non-breaking space, `&nbsp;`).
|
|
29
|
-
|
|
30
|
-
**Critical**: Because only Nokogiri has `create_native_entity_reference`
|
|
31
|
-
(see TODO 1), `restore_entities` raises `NotImplementedError` on all other
|
|
32
|
-
adapters even for the 5 standard entities.
|
|
33
|
-
|
|
34
|
-
## XML Entity Model
|
|
35
|
-
|
|
36
|
-
XML has a clear entity model:
|
|
37
|
-
|
|
38
|
-
1. **5 predefined entities** (amp, lt, gt, quot, apos) — always available per
|
|
39
|
-
XML spec. These characters MUST be entity-encoded in certain contexts
|
|
40
|
-
(e.g., `<` and `&` in text content).
|
|
41
|
-
|
|
42
|
-
2. **DTD-declared entities** — declared via `<!ENTITY name "value">` in the
|
|
43
|
-
document's DOCTYPE internal subset or external subset.
|
|
44
|
-
|
|
45
|
-
3. **API-supplied entities** — registered by the user via
|
|
46
|
-
`EntityRegistry.register` or `entity_provider` callback.
|
|
47
|
-
|
|
48
|
-
4. **Bundled detection set** — the W3C HTML/MathML entities bundled in
|
|
49
|
-
`data/w3c_entities.json`. These are not "declared" in any DTD but are
|
|
50
|
-
recognized by Moxml for restoration purposes.
|
|
51
|
-
|
|
52
|
-
The EntityRegistry already knows about categories 1, 3, and 4. Category 2
|
|
53
|
-
(DTD parsing) is future work.
|
|
54
|
-
|
|
55
|
-
## Design: Model-Driven Restoration
|
|
56
|
-
|
|
57
|
-
EntityRegistry should be THE source of truth for "should this character become
|
|
58
|
-
an entity reference?" The restoration policy should be:
|
|
59
|
-
|
|
60
|
-
```ruby
|
|
61
|
-
# In EntityRegistry (or a cooperating policy object)
|
|
62
|
-
STANDARD_CODEPOINTS = [0x26, 0x3C, 0x3E, 0x22, 0x27].freeze # amp, lt, gt, quot, apos
|
|
63
|
-
|
|
64
|
-
def should_restore?(codepoint, config:)
|
|
65
|
-
name = primary_name_for_codepoint(codepoint)
|
|
66
|
-
return false unless name
|
|
67
|
-
|
|
68
|
-
# 1. The 5 standard XML entities are ALWAYS restored.
|
|
69
|
-
# These are syntactically required — the XML wouldn't be well-formed
|
|
70
|
-
# without encoding them.
|
|
71
|
-
return true if STANDARD_CODEPOINTS.include?(codepoint)
|
|
72
|
-
|
|
73
|
-
# 2. Non-standard entities: only if restore_entities is enabled.
|
|
74
|
-
return false unless config.restore_entities
|
|
75
|
-
|
|
76
|
-
# 3. In the future, strict vs lenient mode will gate this further.
|
|
77
|
-
# Strict: only if declared in DTD (not yet implemented).
|
|
78
|
-
# Lenient: any known entity name.
|
|
79
|
-
true
|
|
80
|
-
end
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
### Changes to DocumentBuilder
|
|
84
|
-
|
|
85
|
-
Replace the hardcoded hash with delegation to the registry:
|
|
86
|
-
|
|
87
|
-
```ruby
|
|
88
|
-
def visit_text(node)
|
|
89
|
-
prepared = adapter.prepare_for_new_document(node, @current_doc.native)
|
|
90
|
-
content = adapter.text_content(node)
|
|
91
|
-
|
|
92
|
-
if should_restore_entities?(content)
|
|
93
|
-
restore_entities_in_text(content)
|
|
94
|
-
else
|
|
95
|
-
@node_stack.last&.add_child(Text.new(prepared, context))
|
|
96
|
-
end
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
private
|
|
100
|
-
|
|
101
|
-
def should_restore_entities?(content)
|
|
102
|
-
return false unless context.config.restore_entities
|
|
103
|
-
# Scan for any character that the registry knows about
|
|
104
|
-
content.to_s.chars.any? { |c| context.entity_registry.should_restore?(c.ord, config: context.config) }
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
def restore_entities_in_text(content)
|
|
108
|
-
parent = @node_stack.last
|
|
109
|
-
return unless parent
|
|
110
|
-
|
|
111
|
-
content.to_s.chars.each do |char|
|
|
112
|
-
codepoint = char.ord
|
|
113
|
-
name = context.entity_registry.primary_name_for_codepoint(codepoint)
|
|
114
|
-
|
|
115
|
-
if context.entity_registry.should_restore?(codepoint, config: context.config)
|
|
116
|
-
entity_node = adapter.create_entity_reference(name)
|
|
117
|
-
parent.add_child(EntityReference.new(entity_node, context))
|
|
118
|
-
else
|
|
119
|
-
text_node = adapter.create_text(char)
|
|
120
|
-
parent.add_child(Text.new(text_node, context))
|
|
121
|
-
end
|
|
122
|
-
end
|
|
123
|
-
end
|
|
124
|
-
```
|
|
125
|
-
|
|
126
|
-
**Note**: This splits each text node into per-character nodes. For documents
|
|
127
|
-
with few entity references, this creates unnecessary overhead. A future
|
|
128
|
-
optimization should buffer consecutive non-entity characters into a single
|
|
129
|
-
text node.
|
|
130
|
-
|
|
131
|
-
### Performance Optimization (deferred)
|
|
132
|
-
|
|
133
|
-
Instead of character-by-character processing:
|
|
134
|
-
1. Scan the text for characters that have entity names in the registry
|
|
135
|
-
2. Split only at those positions, keeping runs of plain characters together
|
|
136
|
-
3. This reduces node count dramatically for typical documents
|
|
137
|
-
|
|
138
|
-
```ruby
|
|
139
|
-
def restore_entities_in_text(content)
|
|
140
|
-
parent = @node_stack.last
|
|
141
|
-
return unless parent
|
|
142
|
-
|
|
143
|
-
buffer = +""
|
|
144
|
-
content.to_s.chars.each do |char|
|
|
145
|
-
codepoint = char.ord
|
|
146
|
-
name = context.entity_registry.primary_name_for_codepoint(codepoint)
|
|
147
|
-
|
|
148
|
-
if name && context.entity_registry.should_restore?(codepoint, config: context.config)
|
|
149
|
-
# Flush buffer before entity
|
|
150
|
-
if !buffer.empty?
|
|
151
|
-
parent.add_child(Text.new(adapter.create_text(buffer), context))
|
|
152
|
-
buffer.clear
|
|
153
|
-
end
|
|
154
|
-
parent.add_child(EntityReference.new(adapter.create_entity_reference(name), context))
|
|
155
|
-
else
|
|
156
|
-
buffer << char
|
|
157
|
-
end
|
|
158
|
-
end
|
|
159
|
-
# Flush remaining buffer
|
|
160
|
-
if !buffer.empty?
|
|
161
|
-
parent.add_child(Text.new(adapter.create_text(buffer), context))
|
|
162
|
-
end
|
|
163
|
-
end
|
|
164
|
-
```
|
|
165
|
-
|
|
166
|
-
## Files to Modify
|
|
167
|
-
|
|
168
|
-
- `lib/moxml/entity_registry.rb` — add `should_restore?` method
|
|
169
|
-
- `lib/moxml/document_builder.rb` — replace hardcoded entity_chars with registry-driven logic
|
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
# TODO 3: EntityReference Test Coverage
|
|
2
|
-
|
|
3
|
-
## Problem
|
|
4
|
-
|
|
5
|
-
There are zero tests for EntityReference node behavior, zero tests for
|
|
6
|
-
entity round-trip preservation, and zero adapter-level tests for entity
|
|
7
|
-
reference creation or serialization. Only `EntityRegistry` has tests
|
|
8
|
-
(`spec/moxml/entity_registry_spec.rb`).
|
|
9
|
-
|
|
10
|
-
This means the entire EntityReference feature is untested — including the
|
|
11
|
-
`restore_entities` config, `create_entity_reference` factory, `visit_entity_reference`
|
|
12
|
-
in DocumentBuilder, and the `entity_reference` Builder DSL method.
|
|
13
|
-
|
|
14
|
-
## Required Test Coverage
|
|
15
|
-
|
|
16
|
-
### 1. EntityReference Node Tests
|
|
17
|
-
|
|
18
|
-
**File**: `spec/moxml/entity_reference_spec.rb`
|
|
19
|
-
|
|
20
|
-
```ruby
|
|
21
|
-
RSpec.describe Moxml::EntityReference do
|
|
22
|
-
# Test per adapter (use shared examples)
|
|
23
|
-
%i[nokogiri].each do |adapter| # expand as adapters gain support
|
|
24
|
-
context "with #{adapter} adapter" do
|
|
25
|
-
let(:ctx) { Moxml.new(adapter) }
|
|
26
|
-
|
|
27
|
-
it "creates an entity reference node" do
|
|
28
|
-
doc = ctx.create_document
|
|
29
|
-
ref = doc.create_entity_reference("nbsp")
|
|
30
|
-
expect(ref).to be_a(Moxml::EntityReference)
|
|
31
|
-
expect(ref.name).to eq("nbsp")
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
it "has empty text content" do
|
|
35
|
-
doc = ctx.create_document
|
|
36
|
-
ref = doc.create_entity_reference("amp")
|
|
37
|
-
expect(ref.text).to eq("")
|
|
38
|
-
expect(ref.content).to eq("")
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
it "serializes to entity syntax" do
|
|
42
|
-
doc = ctx.create_document
|
|
43
|
-
ref = doc.create_entity_reference("mdash")
|
|
44
|
-
expect(ref.to_xml).to eq("&mdash;")
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
it "is recognized as entity_reference type" do
|
|
48
|
-
doc = ctx.create_document
|
|
49
|
-
ref = doc.create_entity_reference("copy")
|
|
50
|
-
expect(ref.entity_reference?).to be true
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
it "survives add_child and retrieval" do
|
|
54
|
-
doc = ctx.create_document
|
|
55
|
-
root = doc.create_element("p")
|
|
56
|
-
doc.root = root
|
|
57
|
-
ref = doc.create_entity_reference("nbsp")
|
|
58
|
-
root.add_child(ref)
|
|
59
|
-
expect(root.children.first).to be_a(Moxml::EntityReference)
|
|
60
|
-
expect(root.children.first.name).to eq("nbsp")
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
it "validates entity reference name" do
|
|
64
|
-
doc = ctx.create_document
|
|
65
|
-
expect {
|
|
66
|
-
doc.create_entity_reference("123invalid")
|
|
67
|
-
}.to raise_error(Moxml::ValidationError)
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
|
-
end
|
|
71
|
-
end
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
### 2. Builder DSL Tests
|
|
75
|
-
|
|
76
|
-
**File**: `spec/moxml/builder_spec.rb` (add to existing or create new section)
|
|
77
|
-
|
|
78
|
-
```ruby
|
|
79
|
-
it "creates entity references via DSL" do
|
|
80
|
-
doc = Moxml::Builder.new(ctx).build do
|
|
81
|
-
element("p") { entity_reference("nbsp") }
|
|
82
|
-
end
|
|
83
|
-
expect(doc.root.children.first).to be_a(Moxml::EntityReference)
|
|
84
|
-
expect(doc.to_xml).to include("&nbsp;")
|
|
85
|
-
end
|
|
86
|
-
```
|
|
87
|
-
|
|
88
|
-
### 3. Restore Entities Integration Tests
|
|
89
|
-
|
|
90
|
-
**File**: `spec/moxml/adapter/entity_restoration_spec.rb` (shared examples)
|
|
91
|
-
|
|
92
|
-
```ruby
|
|
93
|
-
RSpec.shared_examples "entity restoration" do |adapter_name|
|
|
94
|
-
context "with #{adapter_name}" do
|
|
95
|
-
let(:ctx) { Moxml.new(adapter_name, restore_entities: true) }
|
|
96
|
-
|
|
97
|
-
it "restores standard XML entities" do
|
|
98
|
-
doc = ctx.parse("<p>a &amp; b</p>")
|
|
99
|
-
output = doc.to_xml
|
|
100
|
-
expect(output).to include("&amp;")
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
it "restores non-standard entities from registry" do
|
|
104
|
-
# nbsp (U+00A0) is in the bundled W3C entity set
|
|
105
|
-
doc = ctx.parse("<p>\u00A0</p>")
|
|
106
|
-
output = doc.to_xml
|
|
107
|
-
expect(output).to include("&nbsp;")
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
it "preserves entity syntax through round-trip" do
|
|
111
|
-
doc = ctx.parse("<p>&nbsp;&copy;&mdash;</p>")
|
|
112
|
-
output = doc.to_xml
|
|
113
|
-
reparsed = ctx.parse(output)
|
|
114
|
-
# Text content should be identical after round-trip
|
|
115
|
-
expect(reparsed.root.text).to eq(doc.root.text)
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
it "does not restore entities when restore_entities is false" do
|
|
119
|
-
ctx_no_restore = Moxml.new(adapter_name, restore_entities: false)
|
|
120
|
-
doc = ctx_no_restore.parse("<p>a &amp; b</p>")
|
|
121
|
-
output = doc.to_xml
|
|
122
|
-
# Standard entities may still appear as &amp; due to XML escaping,
|
|
123
|
-
# but no EntityReference nodes should be created
|
|
124
|
-
expect(doc.root.children).not_to include(a_kind_of(Moxml::EntityReference))
|
|
125
|
-
end
|
|
126
|
-
end
|
|
127
|
-
end
|
|
128
|
-
```
|
|
129
|
-
|
|
130
|
-
### 4. Cross-Adapter Consistency Tests
|
|
131
|
-
|
|
132
|
-
**File**: `spec/consistency/entity_reference_consistency_spec.rb`
|
|
133
|
-
|
|
134
|
-
Verify that EntityReference behavior is consistent across all adapters that
|
|
135
|
-
support it:
|
|
136
|
-
- Same entity name produces same serialization
|
|
137
|
-
- Same text content after round-trip
|
|
138
|
-
- Children enumeration includes EntityReference nodes
|
|
139
|
-
|
|
140
|
-
### 5. EntityRegistry.should_restore? Tests
|
|
141
|
-
|
|
142
|
-
**File**: Add to `spec/moxml/entity_registry_spec.rb`
|
|
143
|
-
|
|
144
|
-
```ruby
|
|
145
|
-
describe "#should_restore?" do
|
|
146
|
-
it "always restores the 5 standard XML entities" do
|
|
147
|
-
registry = described_class.new
|
|
148
|
-
config = Moxml::Config.new(:nokogiri)
|
|
149
|
-
expect(registry.should_restore?(0x26, config: config)).to be true # amp
|
|
150
|
-
expect(registry.should_restore?(0x3C, config: config)).to be true # lt
|
|
151
|
-
end
|
|
152
|
-
|
|
153
|
-
it "restores non-standard entities only when restore_entities is true" do
|
|
154
|
-
registry = described_class.new
|
|
155
|
-
config_on = Moxml::Config.new(:nokogiri)
|
|
156
|
-
config_on.restore_entities = true
|
|
157
|
-
config_off = Moxml::Config.new(:nokogiri)
|
|
158
|
-
config_off.restore_entities = false
|
|
159
|
-
|
|
160
|
-
expect(registry.should_restore?(0xA0, config: config_on)).to be true # nbsp
|
|
161
|
-
expect(registry.should_restore?(0xA0, config: config_off)).to be false
|
|
162
|
-
end
|
|
163
|
-
end
|
|
164
|
-
```
|
|
165
|
-
|
|
166
|
-
## Dependencies
|
|
167
|
-
|
|
168
|
-
- TODO 1 must be partially complete (at least one adapter working) before
|
|
169
|
-
adapter-level tests can pass
|
|
170
|
-
- TODO 2 must be complete before non-standard entity restoration tests can pass
|