moxml 0.1.15 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +31 -0
- data/TODO.remaining/1-entity-reference-adapter-support.md +157 -0
- data/TODO.remaining/2-entity-restoration-model-driven.md +169 -0
- data/TODO.remaining/3-entity-reference-test-coverage.md +170 -0
- data/TODO.remaining/4-lenient-entities-mode.md +106 -0
- data/TODO.remaining/5-fixture-integrity.md +65 -0
- data/TODO.remaining/6-ox-element-ordering-bug.md +36 -0
- data/TODO.remaining/7-headed-ox-limitations.md +95 -0
- data/TODO.remaining/8-xpath-predicate-gaps.md +68 -0
- data/TODO.remaining/9-cleanup-hygiene.md +42 -0
- data/TODO.remaining/README.md +54 -0
- data/benchmarks/generate_report.rb +1 -1
- data/lib/moxml/adapter/base.rb +14 -0
- data/lib/moxml/adapter/customized_libxml/declaration.rb +1 -1
- data/lib/moxml/adapter/customized_rexml/formatter.rb +42 -20
- data/lib/moxml/adapter/headed_ox.rb +28 -11
- data/lib/moxml/adapter/libxml.rb +165 -65
- data/lib/moxml/adapter/nokogiri.rb +19 -7
- data/lib/moxml/adapter/oga.rb +28 -12
- data/lib/moxml/adapter/ox.rb +11 -3
- data/lib/moxml/adapter/rexml.rb +40 -8
- data/lib/moxml/attribute.rb +1 -1
- data/lib/moxml/builder.rb +77 -24
- data/lib/moxml/config.rb +18 -1
- data/lib/moxml/declaration.rb +4 -2
- data/lib/moxml/document.rb +5 -2
- data/lib/moxml/document_builder.rb +9 -8
- data/lib/moxml/element.rb +10 -5
- data/lib/moxml/entity_registry.rb +16 -2
- data/lib/moxml/native_attachment.rb +65 -0
- data/lib/moxml/node.rb +17 -49
- data/lib/moxml/node_set.rb +1 -1
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xpath/compiler.rb +4 -1
- data/lib/moxml.rb +1 -0
- data/scripts/format_xml.rb +16 -0
- data/scripts/pretty_format_xml.rb +14 -0
- data/spec/consistency/round_trip_spec.rb +3 -30
- data/spec/integration/all_adapters_spec.rb +1 -0
- data/spec/integration/headed_ox_integration_spec.rb +0 -2
- data/spec/integration/shared_examples/edge_cases.rb +4 -4
- data/spec/integration/shared_examples/integration_workflows.rb +3 -3
- data/spec/integration/shared_examples/node_wrappers/cdata_behavior.rb +1 -1
- data/spec/integration/shared_examples/node_wrappers/entity_reference_behavior.rb +224 -0
- data/spec/integration/shared_examples/node_wrappers/node_behavior.rb +1 -1
- data/spec/moxml/adapter/headed_ox_spec.rb +8 -8
- data/spec/moxml/builder_spec.rb +234 -0
- data/spec/moxml/xpath/axes_spec.rb +1 -1
- data/spec/moxml/xpath/compiler_spec.rb +2 -2
- data/spec/moxml/xpath/functions/position_functions_spec.rb +5 -5
- data/spec/moxml/xpath/functions/special_functions_spec.rb +1 -1
- data/spec/performance/memory_usage_spec.rb +0 -4
- metadata +15 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 378f1400934e3a65fb230779fc4b1783aab059efb449912a6dc2d97c8d82903e
|
|
4
|
+
data.tar.gz: 7cd2739dd2dc41c2edb69c129cc4ec175a7a6b8e455d4d63cfd56bd2a93e808f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 311f4905dcf14fb3ec209491d9a5eae9b8fe460152f29c7f7b428db37b1c2adac09e538ce9c0a8a4eeff2b0af83a2e8b4a787adca59cb04d1c7f1b14b7fbf37d
|
|
7
|
+
data.tar.gz: 36cc3ce0e2328547137f1716d7b7ef3de4e07cbca160b08d8fbe74ef126edd6e61fe4dc0ed1d8767ed19f573792fc8fdc52c41e332802698218584db559576e0
|
data/Rakefile
CHANGED
|
@@ -10,6 +10,37 @@ require "rubocop/rake_task"
|
|
|
10
10
|
RuboCop::RakeTask.new
|
|
11
11
|
|
|
12
12
|
namespace :spec do
|
|
13
|
+
desc "Validate XML fixtures are well-formed (requires xmllint)"
|
|
14
|
+
task :validate_fixtures do
|
|
15
|
+
fixtures = Dir.glob("spec/fixtures/**/*.xml")
|
|
16
|
+
if fixtures.empty?
|
|
17
|
+
abort "No XML fixtures found in spec/fixtures/"
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
unless system("which xmllint > /dev/null 2>&1")
|
|
21
|
+
abort "xmllint not found. Install with: brew install libxml2 (macOS) or apt install libxml2-utils (Linux)"
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
# Intentionally malformed fixtures (W3C test cases for error handling)
|
|
25
|
+
exemptions = %w[
|
|
26
|
+
spec/fixtures/w3c/namespaces/1.0/035.xml
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
errors = []
|
|
30
|
+
fixtures.each do |path|
|
|
31
|
+
next if exemptions.include?(path)
|
|
32
|
+
|
|
33
|
+
output = `xmllint --noout "#{path}" 2>&1`
|
|
34
|
+
errors << "#{path}: #{output.strip}" unless $?.success?
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
if errors.empty?
|
|
38
|
+
puts "#{fixtures.size} XML fixtures validated OK"
|
|
39
|
+
else
|
|
40
|
+
abort "Invalid fixtures:\n#{errors.join("\n")}"
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
13
44
|
desc "Run unit tests only"
|
|
14
45
|
RSpec::Core::RakeTask.new(:unit) do |t|
|
|
15
46
|
t.pattern = "spec/unit/**/*_spec.rb"
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# TODO 1: EntityReference Adapter Support for Ox, Oga, REXML, LibXML, HeadedOx
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
|
|
5
|
+
Only the Nokogiri adapter implements `create_native_entity_reference` and maps
|
|
6
|
+
its native type to `:entity_reference` in `node_type`. The other 5 adapters
|
|
7
|
+
will raise `NotImplementedError` if `restore_entities` is enabled or if any
|
|
8
|
+
code path calls `create_entity_reference`. This makes the entire
|
|
9
|
+
EntityReference feature **non-functional** outside Nokogiri.
|
|
10
|
+
|
|
11
|
+
## Current State (verified)
|
|
12
|
+
|
|
13
|
+
| Adapter | `create_native_entity_reference` | `node_type` mapping | Serialization | Status |
|
|
14
|
+
|-----------|----------------------------------|---------------------|---------------|--------|
|
|
15
|
+
| Nokogiri | Done (`Nokogiri::XML::EntityReference.new`) | Done | Native | Working |
|
|
16
|
+
| Ox | Missing | Missing | Uses `Ox.dump` (C-level, won't handle custom types) | Broken |
|
|
17
|
+
| HeadedOx | Missing (inherits Ox) | Missing | Same as Ox | Broken |
|
|
18
|
+
| Oga | Missing | Missing | Uses `CustomizedOga::XmlGenerator` | Broken |
|
|
19
|
+
| REXML | Missing | Missing | Uses REXML's `write` | Broken |
|
|
20
|
+
| LibXML | Missing | Missing | Uses custom serializer with wrapper detection | Broken |
|
|
21
|
+
|
|
22
|
+
## Architecture
|
|
23
|
+
|
|
24
|
+
EntityReference follows the same pattern as other non-native node types in Moxml:
|
|
25
|
+
a **wrapper class** that represents what the underlying library cannot express natively.
|
|
26
|
+
|
|
27
|
+
Each adapter needs three things:
|
|
28
|
+
1. **Wrapper class** (`CustomizedXxx::EntityReference`) — holds the entity name
|
|
29
|
+
2. **`node_type` mapping** — so `Node.wrap` can create the correct Moxml type
|
|
30
|
+
3. **Serialization** — so `to_xml` outputs `&name;`
|
|
31
|
+
|
|
32
|
+
The existing pattern: `CustomizedOx::Text` extends `::Ox::Node`,
|
|
33
|
+
`CustomizedOx::Attribute` extends `::Ox::Node`. EntityReference should follow suit.
|
|
34
|
+
|
|
35
|
+
### Serialization Challenge for Ox
|
|
36
|
+
|
|
37
|
+
Ox's `serialize` calls `::Ox.dump(node)` which is C-level — it only handles
|
|
38
|
+
Ox native types. For EntityReference wrappers to survive serialization, we need
|
|
39
|
+
one of:
|
|
40
|
+
|
|
41
|
+
- **Option A**: Custom serialization in the adapter that walks the tree manually,
|
|
42
|
+
detecting EntityReference wrappers and emitting `&name;` directly.
|
|
43
|
+
- **Option B**: Convert EntityReferences to their text equivalent before calling
|
|
44
|
+
`Ox.dump`, restoring them in a post-processing step. This is fragile.
|
|
45
|
+
- **Option C**: Override `serialize` for Element nodes to handle children
|
|
46
|
+
individually, using `Ox.dump` for native children but handling wrappers
|
|
47
|
+
directly.
|
|
48
|
+
|
|
49
|
+
**Recommended: Option A** — it's how `CustomizedOga::XmlGenerator` already works
|
|
50
|
+
for Oga. A similar tree-walking serializer for Ox gives full control.
|
|
51
|
+
|
|
52
|
+
For LibXML, the existing serializer already checks `node.respond_to?(:to_xml)`
|
|
53
|
+
for wrapper classes, so adding an EntityReference wrapper with `to_xml` returning
|
|
54
|
+
`"&#{name};"` should integrate cleanly.
|
|
55
|
+
|
|
56
|
+
## Implementation Steps
|
|
57
|
+
|
|
58
|
+
### Ox Adapter
|
|
59
|
+
|
|
60
|
+
1. Create `lib/moxml/adapter/customized_ox/entity_reference.rb`:
|
|
61
|
+
```ruby
|
|
62
|
+
module Moxml::Adapter::CustomizedOx
|
|
63
|
+
class EntityReference < ::Ox::Node
|
|
64
|
+
attr_reader :name
|
|
65
|
+
|
|
66
|
+
def initialize(name)
|
|
67
|
+
@name = name
|
|
68
|
+
super() # Ox::Node requires no args or a value
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def to_xml
|
|
72
|
+
"&#{@name};"
|
|
73
|
+
end
|
|
74
|
+
alias to_s to_xml
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
2. Add to `lib/moxml/adapter/ox.rb`:
|
|
80
|
+
- `create_native_entity_reference(name)` → `CustomizedOx::EntityReference.new(name)`
|
|
81
|
+
- `node_type`: add `when CustomizedOx::EntityReference then :entity_reference`
|
|
82
|
+
- `patch_node`: handle EntityReference wrapper in child list
|
|
83
|
+
- `entity_reference_name(node)`: return `node.name`
|
|
84
|
+
- Serialization: handle EntityReference children when walking the tree
|
|
85
|
+
|
|
86
|
+
3. Add to `lib/moxml/adapter/ox.rb` `unpatch_node`: return wrapper as-is
|
|
87
|
+
(it extends Ox::Node so it can stay in the tree)
|
|
88
|
+
|
|
89
|
+
### HeadedOx Adapter
|
|
90
|
+
|
|
91
|
+
HeadedOx inherits from Ox, so it gets Ox's EntityReference support
|
|
92
|
+
automatically once Ox is done. Verify that the XPath engine doesn't
|
|
93
|
+
break when encountering EntityReference nodes in the tree.
|
|
94
|
+
|
|
95
|
+
### Oga Adapter
|
|
96
|
+
|
|
97
|
+
1. Create `lib/moxml/adapter/customized_oga/entity_reference.rb`:
|
|
98
|
+
```ruby
|
|
99
|
+
module Moxml::Adapter::CustomizedOga
|
|
100
|
+
class EntityReference
|
|
101
|
+
attr_reader :name
|
|
102
|
+
|
|
103
|
+
def initialize(name)
|
|
104
|
+
@name = name
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def to_xml
|
|
108
|
+
"&#{@name};"
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
2. Add to `lib/moxml/adapter/oga.rb`:
|
|
115
|
+
- `create_native_entity_reference(name)` → `CustomizedOga::EntityReference.new(name)`
|
|
116
|
+
- `node_type`: add `when CustomizedOga::EntityReference then :entity_reference`
|
|
117
|
+
- Update `CustomizedOga::XmlGenerator` to handle EntityReference children
|
|
118
|
+
- `entity_reference_name(node)`: return `node.name`
|
|
119
|
+
|
|
120
|
+
### REXML Adapter
|
|
121
|
+
|
|
122
|
+
1. Investigate: REXML has `REXML::Entity` and `REXML::EntityRef` classes.
|
|
123
|
+
Check if they can be used as native entity reference nodes, or if a
|
|
124
|
+
wrapper is needed.
|
|
125
|
+
|
|
126
|
+
2. Add to `lib/moxml/adapter/rexml.rb`:
|
|
127
|
+
- `create_native_entity_reference(name)` — native or wrapper
|
|
128
|
+
- `node_type`: add mapping
|
|
129
|
+
- `entity_reference_name(node)`
|
|
130
|
+
|
|
131
|
+
### LibXML Adapter
|
|
132
|
+
|
|
133
|
+
1. Investigate: LibXML Ruby has `LibXML::XML::Node::ENTITY_REF_NODE` constant
|
|
134
|
+
(value 5). Check if native entity reference nodes can be created.
|
|
135
|
+
|
|
136
|
+
2. Create `lib/moxml/adapter/customized_libxml/entity_reference.rb` if needed.
|
|
137
|
+
|
|
138
|
+
3. Add to `lib/moxml/adapter/libxml.rb`:
|
|
139
|
+
- `create_native_entity_reference(name)`
|
|
140
|
+
- `node_type`: add `ENTITY_REF_NODE` mapping or wrapper mapping
|
|
141
|
+
- `entity_reference_name(node)`
|
|
142
|
+
- The existing serializer already handles wrappers with `to_xml` —
|
|
143
|
+
verify EntityReference works in this path.
|
|
144
|
+
|
|
145
|
+
## Files to Create/Modify
|
|
146
|
+
|
|
147
|
+
### New Files
|
|
148
|
+
- `lib/moxml/adapter/customized_ox/entity_reference.rb`
|
|
149
|
+
- `lib/moxml/adapter/customized_oga/entity_reference.rb`
|
|
150
|
+
- Possibly: `lib/moxml/adapter/customized_libxml/entity_reference.rb`
|
|
151
|
+
|
|
152
|
+
### Modified Files
|
|
153
|
+
- `lib/moxml/adapter/ox.rb` — create_native_entity_reference, node_type, serialization
|
|
154
|
+
- `lib/moxml/adapter/oga.rb` — create_native_entity_reference, node_type, XmlGenerator
|
|
155
|
+
- `lib/moxml/adapter/rexml.rb` — create_native_entity_reference, node_type
|
|
156
|
+
- `lib/moxml/adapter/libxml.rb` — create_native_entity_reference, node_type
|
|
157
|
+
- `lib/moxml/adapter/headed_ox.rb` — verify inheritance works (likely no changes)
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# TODO 2: Model-Driven Entity Restoration
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
|
|
5
|
+
The `restore_entities` feature in `DocumentBuilder` is hardcoded to only handle
|
|
6
|
+
the 5 standard XML entities (amp, lt, gt, quot, apos). It ignores the
|
|
7
|
+
EntityRegistry entirely — despite EntityRegistry knowing 2125+ entities from
|
|
8
|
+
the W3C HTML/MathML set. This means non-standard entities like `&nbsp;`,
|
|
9
|
+
`&copy;`, `&mdash;` are never restored, which is the core round-trip problem
|
|
10
|
+
that motivated the entire entity feature.
|
|
11
|
+
|
|
12
|
+
Additionally, the restoration logic lives in DocumentBuilder with hardcoded
|
|
13
|
+
knowledge that belongs in the model layer.
|
|
14
|
+
|
|
15
|
+
## Current State (verified)
|
|
16
|
+
|
|
17
|
+
`lib/moxml/document_builder.rb:80-110` — `restore_entities_in_text`:
|
|
18
|
+
```ruby
|
|
19
|
+
entity_chars = {
|
|
20
|
+
"<" => "lt", ">" => "gt", "&" => "amp",
|
|
21
|
+
'"' => "quot", "'" => "apos",
|
|
22
|
+
}
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
This is a hardcoded lookup that duplicates knowledge already in EntityRegistry.
|
|
26
|
+
It only triggers for characters `<`, `>`, `&`, `"`, `'` — the regex guard
|
|
27
|
+
`/[<>&"']/` on line 73 prevents it from ever seeing characters like U+00A0
|
|
28
|
+
(non-breaking space, `&nbsp;`).
|
|
29
|
+
|
|
30
|
+
**Critical**: Because only Nokogiri has `create_native_entity_reference`
|
|
31
|
+
(see TODO 1), `restore_entities` raises `NotImplementedError` on all other
|
|
32
|
+
adapters even for the 5 standard entities.
|
|
33
|
+
|
|
34
|
+
## XML Entity Model
|
|
35
|
+
|
|
36
|
+
XML has a clear entity model:
|
|
37
|
+
|
|
38
|
+
1. **5 predefined entities** (amp, lt, gt, quot, apos) — always available per
|
|
39
|
+
XML spec. These characters MUST be entity-encoded in certain contexts
|
|
40
|
+
(e.g., `<` and `&` in text content).
|
|
41
|
+
|
|
42
|
+
2. **DTD-declared entities** — declared via `<!ENTITY name "value">` in the
|
|
43
|
+
document's DOCTYPE internal subset or external subset.
|
|
44
|
+
|
|
45
|
+
3. **API-supplied entities** — registered by the user via
|
|
46
|
+
`EntityRegistry.register` or `entity_provider` callback.
|
|
47
|
+
|
|
48
|
+
4. **Bundled detection set** — the W3C HTML/MathML entities bundled in
|
|
49
|
+
`data/w3c_entities.json`. These are not "declared" in any DTD but are
|
|
50
|
+
recognized by Moxml for restoration purposes.
|
|
51
|
+
|
|
52
|
+
The EntityRegistry already knows about categories 1, 3, and 4. Category 2
|
|
53
|
+
(DTD parsing) is future work.
|
|
54
|
+
|
|
55
|
+
## Design: Model-Driven Restoration
|
|
56
|
+
|
|
57
|
+
EntityRegistry should be THE source of truth for "should this character become
|
|
58
|
+
an entity reference?" The restoration policy should be:
|
|
59
|
+
|
|
60
|
+
```ruby
|
|
61
|
+
# In EntityRegistry (or a cooperating policy object)
|
|
62
|
+
STANDARD_CODEPOINTS = [0x26, 0x3C, 0x3E, 0x22, 0x27].freeze # amp, lt, gt, quot, apos
|
|
63
|
+
|
|
64
|
+
def should_restore?(codepoint, config:)
|
|
65
|
+
name = primary_name_for_codepoint(codepoint)
|
|
66
|
+
return false unless name
|
|
67
|
+
|
|
68
|
+
# 1. The 5 standard XML entities are ALWAYS restored.
|
|
69
|
+
# These are syntactically required — the XML wouldn't be well-formed
|
|
70
|
+
# without encoding them.
|
|
71
|
+
return true if STANDARD_CODEPOINTS.include?(codepoint)
|
|
72
|
+
|
|
73
|
+
# 2. Non-standard entities: only if restore_entities is enabled.
|
|
74
|
+
return false unless config.restore_entities
|
|
75
|
+
|
|
76
|
+
# 3. In the future, strict vs lenient mode will gate this further.
|
|
77
|
+
# Strict: only if declared in DTD (not yet implemented).
|
|
78
|
+
# Lenient: any known entity name.
|
|
79
|
+
true
|
|
80
|
+
end
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Changes to DocumentBuilder
|
|
84
|
+
|
|
85
|
+
Replace the hardcoded hash with delegation to the registry:
|
|
86
|
+
|
|
87
|
+
```ruby
|
|
88
|
+
def visit_text(node)
|
|
89
|
+
prepared = adapter.prepare_for_new_document(node, @current_doc.native)
|
|
90
|
+
content = adapter.text_content(node)
|
|
91
|
+
|
|
92
|
+
if should_restore_entities?(content)
|
|
93
|
+
restore_entities_in_text(content)
|
|
94
|
+
else
|
|
95
|
+
@node_stack.last&.add_child(Text.new(prepared, context))
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
private
|
|
100
|
+
|
|
101
|
+
def should_restore_entities?(content)
|
|
102
|
+
return false unless context.config.restore_entities
|
|
103
|
+
# Scan for any character that the registry knows about
|
|
104
|
+
content.to_s.chars.any? { |c| context.entity_registry.should_restore?(c.ord, config: context.config) }
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def restore_entities_in_text(content)
|
|
108
|
+
parent = @node_stack.last
|
|
109
|
+
return unless parent
|
|
110
|
+
|
|
111
|
+
content.to_s.chars.each do |char|
|
|
112
|
+
codepoint = char.ord
|
|
113
|
+
name = context.entity_registry.primary_name_for_codepoint(codepoint)
|
|
114
|
+
|
|
115
|
+
if context.entity_registry.should_restore?(codepoint, config: context.config)
|
|
116
|
+
entity_node = adapter.create_entity_reference(name)
|
|
117
|
+
parent.add_child(EntityReference.new(entity_node, context))
|
|
118
|
+
else
|
|
119
|
+
text_node = adapter.create_text(char)
|
|
120
|
+
parent.add_child(Text.new(text_node, context))
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
**Note**: This splits each text node into per-character nodes. For documents
|
|
127
|
+
with few entity references, this creates unnecessary overhead. A future
|
|
128
|
+
optimization should buffer consecutive non-entity characters into a single
|
|
129
|
+
text node.
|
|
130
|
+
|
|
131
|
+
### Performance Optimization (deferred)
|
|
132
|
+
|
|
133
|
+
Instead of character-by-character processing:
|
|
134
|
+
1. Scan the text for characters that have entity names in the registry
|
|
135
|
+
2. Split only at those positions, keeping runs of plain characters together
|
|
136
|
+
3. This reduces node count dramatically for typical documents
|
|
137
|
+
|
|
138
|
+
```ruby
|
|
139
|
+
def restore_entities_in_text(content)
|
|
140
|
+
parent = @node_stack.last
|
|
141
|
+
return unless parent
|
|
142
|
+
|
|
143
|
+
buffer = +""
|
|
144
|
+
content.to_s.chars.each do |char|
|
|
145
|
+
codepoint = char.ord
|
|
146
|
+
name = context.entity_registry.primary_name_for_codepoint(codepoint)
|
|
147
|
+
|
|
148
|
+
if name && context.entity_registry.should_restore?(codepoint, config: context.config)
|
|
149
|
+
# Flush buffer before entity
|
|
150
|
+
if !buffer.empty?
|
|
151
|
+
parent.add_child(Text.new(adapter.create_text(buffer), context))
|
|
152
|
+
buffer.clear
|
|
153
|
+
end
|
|
154
|
+
parent.add_child(EntityReference.new(adapter.create_entity_reference(name), context))
|
|
155
|
+
else
|
|
156
|
+
buffer << char
|
|
157
|
+
end
|
|
158
|
+
end
|
|
159
|
+
# Flush remaining buffer
|
|
160
|
+
if !buffer.empty?
|
|
161
|
+
parent.add_child(Text.new(adapter.create_text(buffer), context))
|
|
162
|
+
end
|
|
163
|
+
end
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Files to Modify
|
|
167
|
+
|
|
168
|
+
- `lib/moxml/entity_registry.rb` — add `should_restore?` method
|
|
169
|
+
- `lib/moxml/document_builder.rb` — replace hardcoded entity_chars with registry-driven logic
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# TODO 3: EntityReference Test Coverage
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
|
|
5
|
+
There are zero tests for EntityReference node behavior, zero tests for
|
|
6
|
+
entity round-trip preservation, and zero adapter-level tests for entity
|
|
7
|
+
reference creation or serialization. Only `EntityRegistry` has tests
|
|
8
|
+
(`spec/moxml/entity_registry_spec.rb`).
|
|
9
|
+
|
|
10
|
+
This means the entire EntityReference feature is untested — including the
|
|
11
|
+
`restore_entities` config, `create_entity_reference` factory, `visit_entity_reference`
|
|
12
|
+
in DocumentBuilder, and the `entity_reference` Builder DSL method.
|
|
13
|
+
|
|
14
|
+
## Required Test Coverage
|
|
15
|
+
|
|
16
|
+
### 1. EntityReference Node Tests
|
|
17
|
+
|
|
18
|
+
**File**: `spec/moxml/entity_reference_spec.rb`
|
|
19
|
+
|
|
20
|
+
```ruby
|
|
21
|
+
RSpec.describe Moxml::EntityReference do
|
|
22
|
+
# Test per adapter (use shared examples)
|
|
23
|
+
%i[nokogiri].each do |adapter| # expand as adapters gain support
|
|
24
|
+
context "with #{adapter} adapter" do
|
|
25
|
+
let(:ctx) { Moxml.new(adapter) }
|
|
26
|
+
|
|
27
|
+
it "creates an entity reference node" do
|
|
28
|
+
doc = ctx.create_document
|
|
29
|
+
ref = doc.create_entity_reference("nbsp")
|
|
30
|
+
expect(ref).to be_a(Moxml::EntityReference)
|
|
31
|
+
expect(ref.name).to eq("nbsp")
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
it "has empty text content" do
|
|
35
|
+
doc = ctx.create_document
|
|
36
|
+
ref = doc.create_entity_reference("amp")
|
|
37
|
+
expect(ref.text).to eq("")
|
|
38
|
+
expect(ref.content).to eq("")
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
it "serializes to entity syntax" do
|
|
42
|
+
doc = ctx.create_document
|
|
43
|
+
ref = doc.create_entity_reference("mdash")
|
|
44
|
+
expect(ref.to_xml).to eq("&mdash;")
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
it "is recognized as entity_reference type" do
|
|
48
|
+
doc = ctx.create_document
|
|
49
|
+
ref = doc.create_entity_reference("copy")
|
|
50
|
+
expect(ref.entity_reference?).to be true
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
it "survives add_child and retrieval" do
|
|
54
|
+
doc = ctx.create_document
|
|
55
|
+
root = doc.create_element("p")
|
|
56
|
+
doc.root = root
|
|
57
|
+
ref = doc.create_entity_reference("nbsp")
|
|
58
|
+
root.add_child(ref)
|
|
59
|
+
expect(root.children.first).to be_a(Moxml::EntityReference)
|
|
60
|
+
expect(root.children.first.name).to eq("nbsp")
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it "validates entity reference name" do
|
|
64
|
+
doc = ctx.create_document
|
|
65
|
+
expect {
|
|
66
|
+
doc.create_entity_reference("123invalid")
|
|
67
|
+
}.to raise_error(Moxml::ValidationError)
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### 2. Builder DSL Tests
|
|
75
|
+
|
|
76
|
+
**File**: `spec/moxml/builder_spec.rb` (add to existing or create new section)
|
|
77
|
+
|
|
78
|
+
```ruby
|
|
79
|
+
it "creates entity references via DSL" do
|
|
80
|
+
doc = Moxml::Builder.new(ctx).build do
|
|
81
|
+
element("p") { entity_reference("nbsp") }
|
|
82
|
+
end
|
|
83
|
+
expect(doc.root.children.first).to be_a(Moxml::EntityReference)
|
|
84
|
+
expect(doc.to_xml).to include("&nbsp;")
|
|
85
|
+
end
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### 3. Restore Entities Integration Tests
|
|
89
|
+
|
|
90
|
+
**File**: `spec/moxml/adapter/entity_restoration_spec.rb` (shared examples)
|
|
91
|
+
|
|
92
|
+
```ruby
|
|
93
|
+
RSpec.shared_examples "entity restoration" do |adapter_name|
|
|
94
|
+
context "with #{adapter_name}" do
|
|
95
|
+
let(:ctx) { Moxml.new(adapter_name, restore_entities: true) }
|
|
96
|
+
|
|
97
|
+
it "restores standard XML entities" do
|
|
98
|
+
doc = ctx.parse("<p>a &amp; b</p>")
|
|
99
|
+
output = doc.to_xml
|
|
100
|
+
expect(output).to include("&amp;")
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
it "restores non-standard entities from registry" do
|
|
104
|
+
# nbsp (U+00A0) is in the bundled W3C entity set
|
|
105
|
+
doc = ctx.parse("<p>\u00A0</p>")
|
|
106
|
+
output = doc.to_xml
|
|
107
|
+
expect(output).to include("&nbsp;")
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
it "preserves entity syntax through round-trip" do
|
|
111
|
+
doc = ctx.parse("<p>&nbsp;&copy;&mdash;</p>")
|
|
112
|
+
output = doc.to_xml
|
|
113
|
+
reparsed = ctx.parse(output)
|
|
114
|
+
# Text content should be identical after round-trip
|
|
115
|
+
expect(reparsed.root.text).to eq(doc.root.text)
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
it "does not restore entities when restore_entities is false" do
|
|
119
|
+
ctx_no_restore = Moxml.new(adapter_name, restore_entities: false)
|
|
120
|
+
doc = ctx_no_restore.parse("<p>a &amp; b</p>")
|
|
121
|
+
output = doc.to_xml
|
|
122
|
+
# Standard entities may still appear as &amp; due to XML escaping,
|
|
123
|
+
# but no EntityReference nodes should be created
|
|
124
|
+
expect(doc.root.children).not_to include(a_kind_of(Moxml::EntityReference))
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### 4. Cross-Adapter Consistency Tests
|
|
131
|
+
|
|
132
|
+
**File**: `spec/consistency/entity_reference_consistency_spec.rb`
|
|
133
|
+
|
|
134
|
+
Verify that EntityReference behavior is consistent across all adapters that
|
|
135
|
+
support it:
|
|
136
|
+
- Same entity name produces same serialization
|
|
137
|
+
- Same text content after round-trip
|
|
138
|
+
- Children enumeration includes EntityReference nodes
|
|
139
|
+
|
|
140
|
+
### 5. EntityRegistry.should_restore? Tests
|
|
141
|
+
|
|
142
|
+
**File**: Add to `spec/moxml/entity_registry_spec.rb`
|
|
143
|
+
|
|
144
|
+
```ruby
|
|
145
|
+
describe "#should_restore?" do
|
|
146
|
+
it "always restores the 5 standard XML entities" do
|
|
147
|
+
registry = described_class.new
|
|
148
|
+
config = Moxml::Config.new(:nokogiri)
|
|
149
|
+
expect(registry.should_restore?(0x26, config: config)).to be true # amp
|
|
150
|
+
expect(registry.should_restore?(0x3C, config: config)).to be true # lt
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
it "restores non-standard entities only when restore_entities is true" do
|
|
154
|
+
registry = described_class.new
|
|
155
|
+
config_on = Moxml::Config.new(:nokogiri)
|
|
156
|
+
config_on.restore_entities = true
|
|
157
|
+
config_off = Moxml::Config.new(:nokogiri)
|
|
158
|
+
config_off.restore_entities = false
|
|
159
|
+
|
|
160
|
+
expect(registry.should_restore?(0xA0, config: config_on)).to be true # nbsp
|
|
161
|
+
expect(registry.should_restore?(0xA0, config: config_off)).to be false
|
|
162
|
+
end
|
|
163
|
+
end
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Dependencies
|
|
167
|
+
|
|
168
|
+
- TODO 1 must be partially complete (at least one adapter working) before
|
|
169
|
+
adapter-level tests can pass
|
|
170
|
+
- TODO 2 must be complete before non-standard entity restoration tests can pass
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# TODO 4: Lenient Entities Mode
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
|
|
5
|
+
XML only defines 5 predefined entities (amp, lt, gt, quot, apos). Any other
|
|
6
|
+
entity must be declared in a DTD. However, real-world XML documents frequently
|
|
7
|
+
use HTML entities (`&nbsp;`, `&copy;`) without DTD declarations — particularly
|
|
8
|
+
office documents (OOXML/ODF) and legacy systems.
|
|
9
|
+
|
|
10
|
+
Currently Moxml has no way to configure whether undeclared entities should be
|
|
11
|
+
preserved. The `restore_entities` flag is a boolean that enables restoration
|
|
12
|
+
for all known entities from the registry. There is no distinction between
|
|
13
|
+
"only DTD-declared" (strict) and "any recognized" (lenient).
|
|
14
|
+
|
|
15
|
+
## Design
|
|
16
|
+
|
|
17
|
+
### Config Option
|
|
18
|
+
|
|
19
|
+
Add `entity_restoration_mode` to Config with two values:
|
|
20
|
+
|
|
21
|
+
- `:strict` (default) — Only restore entities that are declared in the DTD
|
|
22
|
+
internal subset. The 5 standard XML entities are always restored regardless
|
|
23
|
+
(they are implicitly declared per the XML spec). DTD parsing is a prerequisite.
|
|
24
|
+
|
|
25
|
+
- `:lenient` — Restore any character that has a known entity name in the
|
|
26
|
+
EntityRegistry. This covers the bundled W3C HTML/MathML set (2125 entities)
|
|
27
|
+
plus any user-registered entities. No DTD required.
|
|
28
|
+
|
|
29
|
+
This replaces the boolean `restore_entities` which becomes a derived property:
|
|
30
|
+
- `restore_entities = true` + `entity_restoration_mode = :lenient` → restore all known
|
|
31
|
+
- `restore_entities = true` + `entity_restoration_mode = :strict` → restore only declared
|
|
32
|
+
- `restore_entities = false` → don't restore any
|
|
33
|
+
|
|
34
|
+
### EntityRegistry Enhancement
|
|
35
|
+
|
|
36
|
+
```ruby
|
|
37
|
+
class EntityRegistry
|
|
38
|
+
def should_restore?(codepoint, config:)
|
|
39
|
+
name = primary_name_for_codepoint(codepoint)
|
|
40
|
+
return false unless name
|
|
41
|
+
|
|
42
|
+
# Standard XML entities always restored (XML well-formedness requirement)
|
|
43
|
+
return true if standard_entity?(codepoint)
|
|
44
|
+
|
|
45
|
+
# Must have restoration enabled
|
|
46
|
+
return false unless config.restore_entities
|
|
47
|
+
|
|
48
|
+
case config.entity_restoration_mode
|
|
49
|
+
when :lenient
|
|
50
|
+
# Any known entity
|
|
51
|
+
true
|
|
52
|
+
when :strict
|
|
53
|
+
# Only if declared in DTD (future: check DTD declarations)
|
|
54
|
+
# For now, fall back to lenient behavior until DTD parsing is implemented
|
|
55
|
+
true
|
|
56
|
+
else
|
|
57
|
+
false
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def standard_entity?(codepoint)
|
|
62
|
+
STANDARD_ENTITIES.value?(codepoint)
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### User-Supplied Entities
|
|
68
|
+
|
|
69
|
+
Users can supply entities through three mechanisms:
|
|
70
|
+
|
|
71
|
+
1. **EntityRegistry.register** — programmatic registration:
|
|
72
|
+
```ruby
|
|
73
|
+
context = Moxml.new(:nokogiri)
|
|
74
|
+
context.entity_registry.register({ "myentity" => 0xABCD })
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
2. **entity_provider callback** — for custom/external entity sources:
|
|
78
|
+
```ruby
|
|
79
|
+
Moxml.new(:nokogiri) do |c|
|
|
80
|
+
c.entity_load_mode = :custom
|
|
81
|
+
c.entity_provider = -> { { "myentity" => 0xABCD } }
|
|
82
|
+
end
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
3. **Bundled W3C set** — loaded by default in `:required` mode (2125 entities
|
|
86
|
+
from HTML/MathML/ISO sets). Controlled by `entity_load_mode` config.
|
|
87
|
+
|
|
88
|
+
None of these require a DTD. They are model-level knowledge in the EntityRegistry.
|
|
89
|
+
|
|
90
|
+
### DTD-Declared Entities (Future)
|
|
91
|
+
|
|
92
|
+
Strict mode's full value requires parsing DTD entity declarations from
|
|
93
|
+
`<!DOCTYPE ... [ <!ENTITY name "value"> ]>`. This is a separate feature
|
|
94
|
+
(external to this TODO). Until then, strict mode behaves like lenient mode.
|
|
95
|
+
|
|
96
|
+
## Files to Modify
|
|
97
|
+
|
|
98
|
+
- `lib/moxml/config.rb` — add `entity_restoration_mode` attribute
|
|
99
|
+
- `lib/moxml/entity_registry.rb` — add `should_restore?`, `standard_entity?`
|
|
100
|
+
- `lib/moxml/document_builder.rb` — use `should_restore?` from registry (ties into TODO 2)
|
|
101
|
+
|
|
102
|
+
## Dependencies
|
|
103
|
+
|
|
104
|
+
- TODO 2 (model-driven restoration) should be done first so the policy is
|
|
105
|
+
centralized in EntityRegistry
|
|
106
|
+
- TODO 1 (adapter support) should be done first so entities can actually be created
|