moxml 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/docs.yml +1 -1
- data/.github/workflows/rake.yml +16 -13
- data/.github/workflows/release.yml +1 -0
- data/.github/workflows/round-trip.yml +74 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +160 -38
- data/Gemfile +2 -1
- data/README.adoc +236 -0
- data/Rakefile +11 -0
- data/data/w3c_entities.json +2131 -0
- data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
- data/docs/_pages/adapters/ox.adoc +30 -0
- data/docs/_pages/configuration.adoc +43 -0
- data/docs/_pages/node-api-reference.adoc +35 -0
- data/docs/_tutorials/namespace-handling.adoc +21 -0
- data/examples/rss_parser/rss_parser.rb +1 -3
- data/lib/moxml/adapter/base.rb +26 -2
- data/lib/moxml/adapter/headed_ox.rb +5 -4
- data/lib/moxml/adapter/libxml.rb +3 -2
- data/lib/moxml/adapter/nokogiri.rb +16 -3
- data/lib/moxml/adapter/oga.rb +124 -20
- data/lib/moxml/adapter/ox.rb +4 -3
- data/lib/moxml/adapter/rexml.rb +41 -7
- data/lib/moxml/builder.rb +6 -0
- data/lib/moxml/config.rb +52 -1
- data/lib/moxml/context.rb +21 -2
- data/lib/moxml/document.rb +6 -1
- data/lib/moxml/document_builder.rb +45 -1
- data/lib/moxml/element.rb +4 -3
- data/lib/moxml/entity_reference.rb +29 -0
- data/lib/moxml/entity_registry.rb +278 -0
- data/lib/moxml/node.rb +10 -8
- data/lib/moxml/node_set.rb +10 -6
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils.rb +25 -2
- data/lib/moxml.rb +1 -0
- data/spec/consistency/README.md +3 -1
- data/spec/consistency/round_trip_spec.rb +479 -0
- data/spec/examples/readme_examples_spec.rb +1 -1
- data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
- data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
- data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
- data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
- data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
- data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
- data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
- data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
- data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
- data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
- data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
- data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
- data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
- data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
- data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
- data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
- data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
- data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
- data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
- data/spec/integration/shared_examples/node_wrappers/element_behavior.rb +14 -0
- data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
- data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
- data/spec/integration/w3c_namespace_spec.rb +69 -0
- data/spec/moxml/adapter/libxml_spec.rb +7 -1
- data/spec/moxml/adapter/oga_spec.rb +92 -0
- data/spec/moxml/config_spec.rb +75 -0
- data/spec/moxml/entity_registry_spec.rb +184 -0
- data/spec/moxml/error_spec.rb +2 -2
- data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
- data/spec/moxml/xpath/axes_spec.rb +3 -4
- data/spec/performance/xpath_benchmark_spec.rb +6 -54
- data/spec/support/w3c_namespace_helpers.rb +41 -0
- data/spec/unit/rexml_isolated_test.rb +271 -0
- metadata +98 -2
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Entity Support for lutaml-model Team
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Moxml now supports entity restoration during parsing. This feature ensures that XML entities (like `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) are preserved as `EntityReference` nodes rather than being resolved to their character values during parsing.
|
|
6
|
+
|
|
7
|
+
## Key Concept: Entity Restoration
|
|
8
|
+
|
|
9
|
+
By default, XML parsers resolve entities during parsing:
|
|
10
|
+
- Input: `<root>foo&amp;bar</root>`
|
|
11
|
+
- Default behavior: Text node contains `foo&bar` (resolved `&amp;`)
|
|
12
|
+
- With entity restoration: Text node `foo` + EntityReference `&amp;` + Text node `bar`
|
|
13
|
+
|
|
14
|
+
## Enabling Entity Restoration
|
|
15
|
+
|
|
16
|
+
### Option 1: Per-Context Configuration
|
|
17
|
+
|
|
18
|
+
```ruby
|
|
19
|
+
context = Moxml.new(:nokogiri, restore_entities: true)
|
|
20
|
+
doc = context.parse('<root>foo&amp;bar</root>')
|
|
21
|
+
# doc.to_xml will preserve &amp; as an EntityReference
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Option 2: Global Configuration
|
|
25
|
+
|
|
26
|
+
```ruby
|
|
27
|
+
Moxml.configure do |config|
|
|
28
|
+
config.restore_entities = true
|
|
29
|
+
end
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Preloading Entity Sets
|
|
33
|
+
|
|
34
|
+
You can preload standard entity sets (HTML5, MathML, ISO) for faster entity resolution:
|
|
35
|
+
|
|
36
|
+
```ruby
|
|
37
|
+
context = Moxml.new(:nokogiri,
|
|
38
|
+
restore_entities: true,
|
|
39
|
+
preload_entity_sets: [:html5, :mathml]
|
|
40
|
+
)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## W3C XML Core WG Compliance
|
|
44
|
+
|
|
45
|
+
Per W3C XML Core WG guidance:
|
|
46
|
+
- Standard XML entities (`amp`, `lt`, `gt`, `quot`, `apos`) are implicitly declared per XML spec
|
|
47
|
+
- The `EntityRegistry` class tracks all known entities and their Unicode codepoints
|
|
48
|
+
- Entity names are preserved through round-trip serialization
|
|
49
|
+
|
|
50
|
+
## What lutaml-model Needs to Know
|
|
51
|
+
|
|
52
|
+
### 1. Document Structure with Entities
|
|
53
|
+
|
|
54
|
+
When entity restoration is enabled, documents containing entities will have mixed node types:
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
Document
|
|
58
|
+
└── Element: root
|
|
59
|
+
├── Text: "foo"
|
|
60
|
+
├── EntityReference: "amp" # Represents &amp;
|
|
61
|
+
└── Text: "bar"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### 2. Serialization
|
|
65
|
+
|
|
66
|
+
`doc.to_xml` will serialize EntityReference nodes as proper XML entity syntax:
|
|
67
|
+
- `EntityReference("amp")` → `&amp;`
|
|
68
|
+
- `EntityReference("lt")` → `&lt;`
|
|
69
|
+
- etc.
|
|
70
|
+
|
|
71
|
+
### 3. XPath Queries
|
|
72
|
+
|
|
73
|
+
EntityReference nodes participate in XPath queries like any other node. You can query for them specifically if needed.
|
|
74
|
+
|
|
75
|
+
### 4. Configuration Inheritance
|
|
76
|
+
|
|
77
|
+
When using `Moxml::Context`, the entity restoration setting is preserved through document operations. However, when creating new contexts, you need to set the option explicitly.
|
|
78
|
+
|
|
79
|
+
## Example Usage in lutaml-model
|
|
80
|
+
|
|
81
|
+
```ruby
|
|
82
|
+
# Parse XML with entities preserved
|
|
83
|
+
context = Moxml.new(:nokogiri, restore_entities: true)
|
|
84
|
+
doc = context.parse(your_xml_string)
|
|
85
|
+
|
|
86
|
+
# Serialize back - entities are preserved
|
|
87
|
+
output = doc.to_xml
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Testing Considerations
|
|
91
|
+
|
|
92
|
+
When writing tests for models that handle XML with entities:
|
|
93
|
+
1. Enable `restore_entities: true` in your test context
|
|
94
|
+
2. Verify that EntityReference nodes are created for entities in text
|
|
95
|
+
3. Test round-trip: parse → serialize → parse should preserve entities
|
|
96
|
+
|
|
97
|
+
## Files of Interest
|
|
98
|
+
|
|
99
|
+
- `lib/moxml/entity_registry.rb` - Entity definitions and lookup
|
|
100
|
+
- `lib/moxml/config.rb` - Configuration options
|
|
101
|
+
- `lib/moxml/document_builder.rb` - Entity restoration logic
|
|
102
|
+
- `lib/moxml/entity_reference.rb` - EntityReference node class
|
|
@@ -48,7 +48,37 @@ doc.xpath("//book").find { |book| book["id"] == "123" }
|
|
|
48
48
|
IMPORTANT: For complete XPath 1.0 specification with zero limitations today, use
|
|
49
49
|
Nokogiri or Oga adapters.
|
|
50
50
|
|
|
51
|
+
==== Element Ordering in Round-Trip Tests
|
|
51
52
|
|
|
53
|
+
[IMPORTANT]
|
|
54
|
+
.Round-trip tests for Ox are experimental
|
|
55
|
+
--
|
|
56
|
+
Ox round-trip tests (`nokogiri-ox` CI job) are **experimental** because Ox lacks full XML
|
|
57
|
+
conformance:
|
|
58
|
+
|
|
59
|
+
* **Namespace support**: Ox does not properly handle namespaced elements in XPath queries
|
|
60
|
+
* **XPath limitations**: Uses `locate()` instead of standard XPath; no attribute value
|
|
61
|
+
predicates, no logical operators, no position predicates
|
|
62
|
+
* **Element ordering**: Ox produces elements in a different order than Nokogiri/Oga for
|
|
63
|
+
certain complex fixtures
|
|
64
|
+
|
|
65
|
+
For production use with complex XML, prefer Nokogiri or Oga adapters.
|
|
66
|
+
--
|
|
67
|
+
|
|
68
|
+
The Ox adapter produces elements in a different order than other adapters for certain
|
|
69
|
+
fixtures with complex nested structures. In round-trip tests, this causes the
|
|
70
|
+
`elements_with_attributes` array length comparison to fail, even though the semantic
|
|
71
|
+
equivalence check (double round-trip) passes.
|
|
72
|
+
|
|
73
|
+
Known affected fixtures:
|
|
74
|
+
|
|
75
|
+
* `niso-jats/element_citation.xml`
|
|
76
|
+
* `niso-jats/pnas_sample.xml`
|
|
77
|
+
* `metanorma/collection1nested.xml`
|
|
78
|
+
|
|
79
|
+
This is a known limitation tracked in the round-trip test suite via
|
|
80
|
+
`KNOWN_ELEMENT_ORDERING_ISSUES`. The `elements_with_attributes` comparison is
|
|
81
|
+
automatically skipped for these Ox adapter pairs.
|
|
52
82
|
|
|
53
83
|
See also:
|
|
54
84
|
|
|
@@ -97,6 +97,49 @@ context.config.default_encoding = 'UTF-16'
|
|
|
97
97
|
|
|
98
98
|
**Default:** `"UTF-8"`
|
|
99
99
|
|
|
100
|
+
==== Namespace URI validation mode
|
|
101
|
+
|
|
102
|
+
Control how strictly namespace URIs are validated:
|
|
103
|
+
|
|
104
|
+
[source,ruby]
|
|
105
|
+
----
|
|
106
|
+
# Strict mode (default) — validates namespace URIs against RFC 3986
|
|
107
|
+
context.config.namespace_uri_mode = :strict
|
|
108
|
+
doc = context.parse(xml) # Raises ValidationError for invalid URIs
|
|
109
|
+
|
|
110
|
+
# Lenient mode — accepts any string as a namespace URI
|
|
111
|
+
context.config.namespace_uri_mode = :lenient
|
|
112
|
+
doc = context.parse(xml) # Accepts non-standard namespace URIs
|
|
113
|
+
----
|
|
114
|
+
|
|
115
|
+
**Default:** `:strict`
|
|
116
|
+
|
|
117
|
+
**Modes:**
|
|
118
|
+
|
|
119
|
+
`:strict`:: Validates namespace URIs against the
|
|
120
|
+
https://www.rfc-editor.org/rfc/rfc3986[RFC 3986] URI-reference specification, as
|
|
121
|
+
required by https://www.w3.org/TR/xml-names/[Namespaces in XML]. Invalid URIs
|
|
122
|
+
raise a `Moxml::ValidationError`. This is the recommended mode for
|
|
123
|
+
standards-compliant XML processing.
|
|
124
|
+
|
|
125
|
+
`:lenient`:: Accepts any string as a namespace URI, only rejecting strings
|
|
126
|
+
containing XML-invalid control characters (`0x00`-`0x08`, `0x0B`, `0x0C`,
|
|
127
|
+
`0x0E`-`0x1F`). Use this mode when processing XML documents that use
|
|
128
|
+
non-standard namespace identifiers such as URNs or other non-URI strings.
|
|
129
|
+
|
|
130
|
+
**Example:**
|
|
131
|
+
|
|
132
|
+
[source,ruby]
|
|
133
|
+
----
|
|
134
|
+
# Process documents with non-standard namespace URIs
|
|
135
|
+
context = Moxml.new do |config|
|
|
136
|
+
config.namespace_uri_mode = :lenient
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
xml = '<root xmlns:ex="not a valid URI but accepted in lenient mode"/>'
|
|
140
|
+
doc = context.parse(xml) # Parses successfully
|
|
141
|
+
----
|
|
142
|
+
|
|
100
143
|
=== Context switching
|
|
101
144
|
|
|
102
145
|
Use different configurations for different tasks:
|
|
@@ -32,6 +32,10 @@ All node types in Moxml support the `#identifier` method, which returns the prim
|
|
|
32
32
|
| `nil` (no identifier)
|
|
33
33
|
| `nil`
|
|
34
34
|
|
|
35
|
+
| EntityReference
|
|
36
|
+
| The entity name
|
|
37
|
+
| `"nbsp"`, `"copy"`
|
|
38
|
+
|
|
35
39
|
| Declaration
|
|
36
40
|
| `nil` (no identifier)
|
|
37
41
|
| `nil`
|
|
@@ -137,3 +141,34 @@ doctype.identifier # => "html"
|
|
|
137
141
|
* `identifier` - Returns the primary identifier (same as `name`)
|
|
138
142
|
|
|
139
143
|
All Doctype accessor methods are fully implemented across all 6 adapters.
|
|
144
|
+
|
|
145
|
+
=== EntityReference nodes
|
|
146
|
+
|
|
147
|
+
EntityReference nodes represent XML entity references like `&nbsp;`, `&copy;`, or custom entities declared in the DOCTYPE.
|
|
148
|
+
|
|
149
|
+
[source,ruby]
|
|
150
|
+
----
|
|
151
|
+
# Create programmatically
|
|
152
|
+
ref = doc.create_entity_reference('nbsp')
|
|
153
|
+
element.add_child(ref)
|
|
154
|
+
|
|
155
|
+
# Or via builder
|
|
156
|
+
doc = Moxml::Builder.new(Moxml.new).build do
|
|
157
|
+
element 'text' do
|
|
158
|
+
entity_reference 'ndash'
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
----
|
|
162
|
+
|
|
163
|
+
*Available methods:*
|
|
164
|
+
|
|
165
|
+
* `name` - Returns the entity name (e.g., `"nbsp"`, `"copy"`)
|
|
166
|
+
* `identifier` - Returns the primary identifier (same as `name`)
|
|
167
|
+
* `text` - Returns empty string (`""`) since entity has no text content
|
|
168
|
+
* `content` - Returns empty string (entity content is in the name)
|
|
169
|
+
* `to_xml` - Returns the entity syntax (e.g., `"&nbsp;"`)
|
|
170
|
+
|
|
171
|
+
*Adapter notes:*
|
|
172
|
+
|
|
173
|
+
* *Nokogiri*: Preserves custom declared entities as `EntityReference` nodes
|
|
174
|
+
* *Ox, Oga*: These adapters resolve entities during parsing and do not expose entity reference nodes. Use Nokogiri or LibXML for entity preservation.
|
|
@@ -276,6 +276,27 @@ puts all_children.length # => 2
|
|
|
276
276
|
* link:../pages/adapters/rexml[REXML] - ⚠️ No namespace XPath
|
|
277
277
|
* link:../pages/adapters/ox[Ox] - ⚠️ Basic only, no XPath
|
|
278
278
|
|
|
279
|
+
=== Namespace URI validation
|
|
280
|
+
|
|
281
|
+
By default, Moxml validates namespace URIs against
|
|
282
|
+
https://www.rfc-editor.org/rfc/rfc3986[RFC 3986] (strict mode). To accept
|
|
283
|
+
non-standard namespace identifiers, use lenient mode:
|
|
284
|
+
|
|
285
|
+
[source,ruby]
|
|
286
|
+
----
|
|
287
|
+
# Strict mode (default) — validates URIs per RFC 3986
|
|
288
|
+
context = Moxml.new do |config|
|
|
289
|
+
config.namespace_uri_mode = :strict
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
# Lenient mode — accepts any string as namespace URI
|
|
293
|
+
context = Moxml.new do |config|
|
|
294
|
+
config.namespace_uri_mode = :lenient
|
|
295
|
+
end
|
|
296
|
+
----
|
|
297
|
+
|
|
298
|
+
See link:../pages/configuration[Configuration] for details.
|
|
299
|
+
|
|
279
300
|
=== Troubleshooting
|
|
280
301
|
|
|
281
302
|
**Namespace XPath not working:**
|
|
@@ -179,9 +179,7 @@ if __FILE__ == $0
|
|
|
179
179
|
|
|
180
180
|
# Count categories
|
|
181
181
|
all_categories = articles.flat_map(&:categories)
|
|
182
|
-
category_counts = all_categories.
|
|
183
|
-
counts[cat] += 1
|
|
184
|
-
end
|
|
182
|
+
category_counts = all_categories.tally
|
|
185
183
|
puts "Categories: #{category_counts.map do |cat, count|
|
|
186
184
|
"#{cat} (#{count})"
|
|
187
185
|
end.join(', ')}"
|
data/lib/moxml/adapter/base.rb
CHANGED
|
@@ -98,12 +98,24 @@ module Moxml
|
|
|
98
98
|
create_native_declaration(version, encoding, standalone)
|
|
99
99
|
end
|
|
100
100
|
|
|
101
|
-
def create_namespace(element, prefix, uri)
|
|
101
|
+
def create_namespace(element, prefix, uri, namespace_uri_mode: :strict)
|
|
102
|
+
if prefix && uri.to_s.empty?
|
|
103
|
+
raise NamespaceError.new(
|
|
104
|
+
"Prefixed namespace declaration cannot have an empty URI",
|
|
105
|
+
prefix: prefix,
|
|
106
|
+
uri: uri,
|
|
107
|
+
)
|
|
108
|
+
end
|
|
102
109
|
validate_prefix(prefix) if prefix
|
|
103
|
-
validate_uri(uri)
|
|
110
|
+
validate_uri(uri, mode: namespace_uri_mode)
|
|
104
111
|
create_native_namespace(element, prefix, uri)
|
|
105
112
|
end
|
|
106
113
|
|
|
114
|
+
def create_entity_reference(name)
|
|
115
|
+
validate_entity_reference_name(name)
|
|
116
|
+
create_native_entity_reference(name)
|
|
117
|
+
end
|
|
118
|
+
|
|
107
119
|
def set_attribute_name(attribute, name)
|
|
108
120
|
attribute.name = name
|
|
109
121
|
end
|
|
@@ -112,6 +124,10 @@ module Moxml
|
|
|
112
124
|
attribute.value = value
|
|
113
125
|
end
|
|
114
126
|
|
|
127
|
+
def entity_reference_name(node)
|
|
128
|
+
node.name
|
|
129
|
+
end
|
|
130
|
+
|
|
115
131
|
def duplicate_node(node)
|
|
116
132
|
node.dup
|
|
117
133
|
end
|
|
@@ -193,6 +209,14 @@ module Moxml
|
|
|
193
209
|
adapter: name,
|
|
194
210
|
)
|
|
195
211
|
end
|
|
212
|
+
|
|
213
|
+
def create_native_entity_reference(_name)
|
|
214
|
+
raise Moxml::NotImplementedError.new(
|
|
215
|
+
"create_native_entity_reference not implemented",
|
|
216
|
+
feature: "create_native_entity_reference",
|
|
217
|
+
adapter: name,
|
|
218
|
+
)
|
|
219
|
+
end
|
|
196
220
|
end
|
|
197
221
|
end
|
|
198
222
|
end
|
|
@@ -26,7 +26,7 @@ module Moxml
|
|
|
26
26
|
class HeadedOx < Ox
|
|
27
27
|
class << self
|
|
28
28
|
# Override parse to use HeadedOx context instead of Ox context
|
|
29
|
-
def parse(xml, _options = {})
|
|
29
|
+
def parse(xml, _options = {}, _context = nil)
|
|
30
30
|
native_doc = begin
|
|
31
31
|
result = ::Ox.parse(xml)
|
|
32
32
|
|
|
@@ -45,8 +45,9 @@ module Moxml
|
|
|
45
45
|
)
|
|
46
46
|
end
|
|
47
47
|
|
|
48
|
-
# Use
|
|
49
|
-
|
|
48
|
+
# Use provided context if available, otherwise create new one
|
|
49
|
+
ctx = _context || Context.new(:headed_ox)
|
|
50
|
+
DocumentBuilder.new(ctx).build(native_doc)
|
|
50
51
|
end
|
|
51
52
|
|
|
52
53
|
# Execute XPath query using Moxml's XPath engine
|
|
@@ -66,7 +67,7 @@ module Moxml
|
|
|
66
67
|
ctx = Context.new(:headed_ox)
|
|
67
68
|
|
|
68
69
|
# Wrap the native node - don't rebuild the whole document
|
|
69
|
-
node = Node.wrap(node, ctx)
|
|
70
|
+
node = Moxml::Node.wrap(node, ctx)
|
|
70
71
|
end
|
|
71
72
|
|
|
72
73
|
# Parse XPath expression to AST
|
data/lib/moxml/adapter/libxml.rb
CHANGED
|
@@ -48,7 +48,7 @@ module Moxml
|
|
|
48
48
|
doc.root = element
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
-
def parse(xml, options = {})
|
|
51
|
+
def parse(xml, options = {}, _context = nil)
|
|
52
52
|
# LibXML doesn't preserve DOCTYPE during parsing, so we need to extract it manually
|
|
53
53
|
xml_string = if xml.is_a?(String)
|
|
54
54
|
xml
|
|
@@ -94,7 +94,8 @@ module Moxml
|
|
|
94
94
|
native_doc.instance_variable_set(:@moxml_doctype, doctype_wrapper)
|
|
95
95
|
end
|
|
96
96
|
|
|
97
|
-
|
|
97
|
+
ctx = _context || Context.new(:libxml)
|
|
98
|
+
DocumentBuilder.new(ctx).build(native_doc)
|
|
98
99
|
end
|
|
99
100
|
|
|
100
101
|
# SAX parsing implementation for LibXML
|
|
@@ -11,7 +11,7 @@ module Moxml
|
|
|
11
11
|
doc.root = element
|
|
12
12
|
end
|
|
13
13
|
|
|
14
|
-
def parse(xml, options = {})
|
|
14
|
+
def parse(xml, options = {}, _context = nil)
|
|
15
15
|
native_doc = begin
|
|
16
16
|
if options[:fragment]
|
|
17
17
|
::Nokogiri::XML::DocumentFragment.parse(xml) do |config|
|
|
@@ -29,7 +29,9 @@ module Moxml
|
|
|
29
29
|
column: e.column)
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
-
|
|
32
|
+
# Use provided context if available, otherwise create new one
|
|
33
|
+
ctx = _context || Context.new(:nokogiri)
|
|
34
|
+
DocumentBuilder.new(ctx).build(native_doc)
|
|
33
35
|
end
|
|
34
36
|
|
|
35
37
|
# SAX parsing implementation for Nokogiri
|
|
@@ -104,6 +106,14 @@ module Moxml
|
|
|
104
106
|
)
|
|
105
107
|
end
|
|
106
108
|
|
|
109
|
+
def create_native_entity_reference(name)
|
|
110
|
+
::Nokogiri::XML::EntityReference.new(create_document, name)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def entity_reference_name(node)
|
|
114
|
+
node.name
|
|
115
|
+
end
|
|
116
|
+
|
|
107
117
|
def declaration_attribute(declaration, attr_name)
|
|
108
118
|
return nil unless declaration.content
|
|
109
119
|
|
|
@@ -150,6 +160,7 @@ module Moxml
|
|
|
150
160
|
when ::Nokogiri::XML::ProcessingInstruction then :processing_instruction
|
|
151
161
|
when ::Nokogiri::XML::Document, ::Nokogiri::XML::DocumentFragment then :document
|
|
152
162
|
when ::Nokogiri::XML::DTD then :doctype
|
|
163
|
+
when ::Nokogiri::XML::EntityReference then :entity_reference
|
|
153
164
|
else :unknown
|
|
154
165
|
end
|
|
155
166
|
end
|
|
@@ -277,7 +288,9 @@ module Moxml
|
|
|
277
288
|
end
|
|
278
289
|
|
|
279
290
|
def inner_text(node)
|
|
280
|
-
text_children = node.children
|
|
291
|
+
text_children = node.children.reject do |c|
|
|
292
|
+
c.element? || c.comment?
|
|
293
|
+
end
|
|
281
294
|
text_children.map(&:content).join
|
|
282
295
|
end
|
|
283
296
|
|
data/lib/moxml/adapter/oga.rb
CHANGED
|
@@ -9,17 +9,24 @@ module Moxml
|
|
|
9
9
|
module Adapter
|
|
10
10
|
class Oga < Base
|
|
11
11
|
class << self
|
|
12
|
+
# Standard XML entities handled natively by parsers
|
|
13
|
+
STANDARD_XML_ENTITIES = %w[amp lt gt quot apos].freeze
|
|
14
|
+
|
|
12
15
|
def set_root(doc, element)
|
|
13
16
|
# Clear existing root element if any - Oga's NodeSet needs special handling
|
|
14
17
|
# We need to manually remove elements since NodeSet doesn't support clear or delete_if
|
|
15
|
-
elements_to_remove = doc.children.
|
|
18
|
+
elements_to_remove = doc.children.grep(::Oga::XML::Element)
|
|
16
19
|
elements_to_remove.each { |elem| doc.children.delete(elem) }
|
|
17
20
|
doc.children << element
|
|
18
21
|
end
|
|
19
22
|
|
|
20
|
-
def parse(xml, options = {})
|
|
23
|
+
def parse(xml, options = {}, _context = nil)
|
|
24
|
+
# Pre-process XML to convert named entities to marker form (\x01name;).
|
|
25
|
+
# Oga drops named entity references like &nbsp; during parsing.
|
|
26
|
+
processed_xml = preprocess_named_entities(xml)
|
|
27
|
+
|
|
21
28
|
native_doc = begin
|
|
22
|
-
::Oga.parse_xml(
|
|
29
|
+
::Oga.parse_xml(processed_xml, strict: options[:strict])
|
|
23
30
|
rescue LL::ParserError => e
|
|
24
31
|
raise Moxml::ParseError.new(
|
|
25
32
|
e.message,
|
|
@@ -27,7 +34,8 @@ module Moxml
|
|
|
27
34
|
)
|
|
28
35
|
end
|
|
29
36
|
|
|
30
|
-
|
|
37
|
+
ctx = _context || Context.new(:oga)
|
|
38
|
+
DocumentBuilder.new(ctx).build(native_doc)
|
|
31
39
|
end
|
|
32
40
|
|
|
33
41
|
# SAX parsing implementation for Oga
|
|
@@ -61,7 +69,7 @@ module Moxml
|
|
|
61
69
|
end
|
|
62
70
|
|
|
63
71
|
def create_native_text(content)
|
|
64
|
-
::Oga::XML::Text.new(text: content)
|
|
72
|
+
::Oga::XML::Text.new(text: encode_entity_markers(content))
|
|
65
73
|
end
|
|
66
74
|
|
|
67
75
|
def create_native_cdata(content)
|
|
@@ -74,7 +82,8 @@ module Moxml
|
|
|
74
82
|
|
|
75
83
|
def create_native_doctype(name, external_id, system_id)
|
|
76
84
|
::Oga::XML::Doctype.new(
|
|
77
|
-
name: name, public_id: external_id, system_id: system_id,
|
|
85
|
+
name: name, public_id: external_id, system_id: system_id,
|
|
86
|
+
type: external_id ? "PUBLIC" : "SYSTEM"
|
|
78
87
|
)
|
|
79
88
|
end
|
|
80
89
|
|
|
@@ -224,7 +233,7 @@ module Moxml
|
|
|
224
233
|
attr = ::Oga::XML::Attribute.new(
|
|
225
234
|
name: name.to_s,
|
|
226
235
|
namespace_name: namespace_name,
|
|
227
|
-
value: value.to_s,
|
|
236
|
+
value: encode_entity_markers(value.to_s),
|
|
228
237
|
)
|
|
229
238
|
element.add_attribute(attr)
|
|
230
239
|
end
|
|
@@ -234,7 +243,7 @@ module Moxml
|
|
|
234
243
|
end
|
|
235
244
|
|
|
236
245
|
def get_attribute_value(element, name)
|
|
237
|
-
element[name.to_s]
|
|
246
|
+
restore_entity_markers(element[name.to_s])
|
|
238
247
|
end
|
|
239
248
|
|
|
240
249
|
def remove_attribute(element, name)
|
|
@@ -303,24 +312,25 @@ module Moxml
|
|
|
303
312
|
end
|
|
304
313
|
|
|
305
314
|
def text_content(node)
|
|
306
|
-
node.text
|
|
315
|
+
restore_entity_markers(node.text)
|
|
307
316
|
end
|
|
308
317
|
|
|
309
318
|
def inner_text(node)
|
|
310
|
-
if node.respond_to?(:inner_text)
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
319
|
+
text = if node.respond_to?(:inner_text)
|
|
320
|
+
node.inner_text
|
|
321
|
+
else
|
|
322
|
+
# Oga::XML::Text node for example
|
|
323
|
+
node.text
|
|
324
|
+
end
|
|
325
|
+
restore_entity_markers(text)
|
|
316
326
|
end
|
|
317
327
|
|
|
318
328
|
def set_text_content(node, content)
|
|
329
|
+
encoded = encode_entity_markers(content)
|
|
319
330
|
if node.respond_to?(:inner_text=)
|
|
320
|
-
node.inner_text =
|
|
331
|
+
node.inner_text = encoded
|
|
321
332
|
else
|
|
322
|
-
|
|
323
|
-
node.text = content
|
|
333
|
+
node.text = encoded
|
|
324
334
|
end
|
|
325
335
|
end
|
|
326
336
|
|
|
@@ -402,6 +412,53 @@ module Moxml
|
|
|
402
412
|
end
|
|
403
413
|
|
|
404
414
|
def serialize(node, options = {})
|
|
415
|
+
output = serialize_without_entity_processing(node, options)
|
|
416
|
+
# Post-process: convert entity markers back to entity references
|
|
417
|
+
output.gsub(ENTITY_MARKER_REGEX, '&\1;')
|
|
418
|
+
end
|
|
419
|
+
|
|
420
|
+
# Shared entity name pattern (W3C: 2-31 chars, starts with alpha)
|
|
421
|
+
ENTITY_PATTERN = "([a-zA-Z][a-zA-Z0-9]{1,30})"
|
|
422
|
+
|
|
423
|
+
# Marker character for entity preservation through Oga's parser.
|
|
424
|
+
# U+0001 is preserved literally by Oga through parse/serialize cycle.
|
|
425
|
+
ENTITY_MARKER = "\x01"
|
|
426
|
+
|
|
427
|
+
# Regular expression for entity marker post-processing
|
|
428
|
+
ENTITY_MARKER_REGEX = /#{ENTITY_MARKER}#{ENTITY_PATTERN};/
|
|
429
|
+
|
|
430
|
+
# Simple entity-only regex with no nested quantifiers
|
|
431
|
+
ENTITY_REF_REGEX = /&#{ENTITY_PATTERN};/
|
|
432
|
+
|
|
433
|
+
private
|
|
434
|
+
|
|
435
|
+
# Convert &entity; back to \x01entity; for Oga text storage.
|
|
436
|
+
# Used when setting text content programmatically (not from parsing).
|
|
437
|
+
def encode_entity_markers(text)
|
|
438
|
+
return text unless text&.include?("&")
|
|
439
|
+
|
|
440
|
+
text.gsub(ENTITY_REF_REGEX) do
|
|
441
|
+
name = ::Regexp.last_match(1)
|
|
442
|
+
|
|
443
|
+
next ::Regexp.last_match(0) if STANDARD_XML_ENTITIES.include?(name)
|
|
444
|
+
|
|
445
|
+
codepoint = Moxml::EntityRegistry.default.codepoint_for_name(name)
|
|
446
|
+
if codepoint
|
|
447
|
+
"#{ENTITY_MARKER}#{name};"
|
|
448
|
+
else
|
|
449
|
+
::Regexp.last_match(0)
|
|
450
|
+
end
|
|
451
|
+
end
|
|
452
|
+
end
|
|
453
|
+
|
|
454
|
+
# Convert \x01entity; back to &entity; for text accessors.
|
|
455
|
+
def restore_entity_markers(text)
|
|
456
|
+
return text unless text
|
|
457
|
+
|
|
458
|
+
text.gsub(ENTITY_MARKER_REGEX, '&\1;')
|
|
459
|
+
end
|
|
460
|
+
|
|
461
|
+
def serialize_without_entity_processing(node, options = {})
|
|
405
462
|
# Oga's XmlGenerator doesn't support options directly
|
|
406
463
|
# We need to handle declaration options ourselves for Document nodes
|
|
407
464
|
if node.is_a?(::Oga::XML::Document)
|
|
@@ -416,7 +473,11 @@ module Moxml
|
|
|
416
473
|
node.xml_declaration ? true : false
|
|
417
474
|
end
|
|
418
475
|
|
|
419
|
-
if
|
|
476
|
+
# Fix: Check if declaration already exists in children
|
|
477
|
+
# This prevents duplicate declarations when document already has one
|
|
478
|
+
has_existing_declaration = node.children.any?(::Oga::XML::XmlDeclaration)
|
|
479
|
+
|
|
480
|
+
if should_include_decl && !node.xml_declaration && !has_existing_declaration
|
|
420
481
|
# Need to add declaration - create default one
|
|
421
482
|
output = +""
|
|
422
483
|
output << '<?xml version="1.0" encoding="UTF-8"?>'
|
|
@@ -450,7 +511,50 @@ module Moxml
|
|
|
450
511
|
end
|
|
451
512
|
|
|
452
513
|
# Default: use XmlGenerator
|
|
453
|
-
|
|
514
|
+
# But first check if we need to handle declaration specially
|
|
515
|
+
if node.is_a?(::Oga::XML::Document) && node.xml_declaration
|
|
516
|
+
# Document has declaration - use custom handling to avoid duplicates
|
|
517
|
+
output = +""
|
|
518
|
+
|
|
519
|
+
# Serialize children, but skip XmlDeclaration if it would cause duplication
|
|
520
|
+
node.children.each do |child|
|
|
521
|
+
# Check if this would cause duplication by seeing if we already have one in output
|
|
522
|
+
if child.is_a?(::Oga::XML::XmlDeclaration) && output.include?("<?xml")
|
|
523
|
+
next # Skip duplicate declaration
|
|
524
|
+
end
|
|
525
|
+
|
|
526
|
+
output << ::Moxml::Adapter::CustomizedOga::XmlGenerator.new(child).to_xml
|
|
527
|
+
end
|
|
528
|
+
|
|
529
|
+
output
|
|
530
|
+
else
|
|
531
|
+
# Normal case - use XmlGenerator directly
|
|
532
|
+
::Moxml::Adapter::CustomizedOga::XmlGenerator.new(node).to_xml
|
|
533
|
+
end
|
|
534
|
+
end
|
|
535
|
+
|
|
536
|
+
# Pre-process XML to convert named entities to marker format.
|
|
537
|
+
# Oga drops named entity references like &nbsp; but preserves control chars.
|
|
538
|
+
# By converting known named entities to marker form (\x01name;), we can
|
|
539
|
+
# reconstruct them during serialization.
|
|
540
|
+
#
|
|
541
|
+
# @param xml [String, #to_s] The XML string to process
|
|
542
|
+
# @return [String] The XML with known named entities converted to marker form
|
|
543
|
+
def preprocess_named_entities(xml)
|
|
544
|
+
return xml unless xml.is_a?(String)
|
|
545
|
+
|
|
546
|
+
xml.gsub(ENTITY_REF_REGEX) do
|
|
547
|
+
name = Regexp.last_match(1)
|
|
548
|
+
|
|
549
|
+
next Regexp.last_match(0) if STANDARD_XML_ENTITIES.include?(name)
|
|
550
|
+
|
|
551
|
+
codepoint = Moxml::EntityRegistry.default.codepoint_for_name(name)
|
|
552
|
+
if codepoint
|
|
553
|
+
"#{ENTITY_MARKER}#{name};"
|
|
554
|
+
else
|
|
555
|
+
Regexp.last_match(0)
|
|
556
|
+
end
|
|
557
|
+
end
|
|
454
558
|
end
|
|
455
559
|
end
|
|
456
560
|
end
|
data/lib/moxml/adapter/ox.rb
CHANGED
|
@@ -17,7 +17,7 @@ module Moxml
|
|
|
17
17
|
replace_children(doc, [element])
|
|
18
18
|
end
|
|
19
19
|
|
|
20
|
-
def parse(xml, _options = {})
|
|
20
|
+
def parse(xml, _options = {}, _context = nil)
|
|
21
21
|
native_doc = begin
|
|
22
22
|
result = ::Ox.parse(xml)
|
|
23
23
|
|
|
@@ -36,7 +36,8 @@ module Moxml
|
|
|
36
36
|
)
|
|
37
37
|
end
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
ctx = _context || Context.new(:ox)
|
|
40
|
+
DocumentBuilder.new(ctx).build(native_doc)
|
|
40
41
|
end
|
|
41
42
|
|
|
42
43
|
# SAX parsing implementation for Ox
|
|
@@ -452,7 +453,7 @@ module Moxml
|
|
|
452
453
|
def inner_text(node)
|
|
453
454
|
return "" unless node.respond_to?(:nodes)
|
|
454
455
|
|
|
455
|
-
node.nodes.
|
|
456
|
+
node.nodes.grep(String).join
|
|
456
457
|
end
|
|
457
458
|
|
|
458
459
|
def set_text_content(node, content)
|