moxml 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/docs.yml +1 -1
  3. data/.github/workflows/rake.yml +16 -13
  4. data/.github/workflows/release.yml +1 -0
  5. data/.github/workflows/round-trip.yml +74 -0
  6. data/.gitignore +1 -0
  7. data/.rubocop.yml +1 -0
  8. data/.rubocop_todo.yml +160 -38
  9. data/Gemfile +2 -1
  10. data/README.adoc +287 -20
  11. data/Rakefile +11 -0
  12. data/data/w3c_entities.json +2131 -0
  13. data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
  14. data/docs/_guides/index.adoc +14 -12
  15. data/docs/_guides/node-api-consistency.adoc +572 -0
  16. data/docs/_guides/xml-declaration.adoc +5 -5
  17. data/docs/_pages/adapters/ox.adoc +30 -0
  18. data/docs/_pages/adapters/rexml.adoc +1 -1
  19. data/docs/_pages/configuration.adoc +43 -0
  20. data/docs/_pages/node-api-reference.adoc +128 -3
  21. data/docs/_tutorials/namespace-handling.adoc +21 -0
  22. data/examples/rss_parser/rss_parser.rb +1 -3
  23. data/lib/moxml/adapter/base.rb +26 -2
  24. data/lib/moxml/adapter/headed_ox.rb +5 -4
  25. data/lib/moxml/adapter/libxml.rb +18 -3
  26. data/lib/moxml/adapter/nokogiri.rb +26 -2
  27. data/lib/moxml/adapter/oga.rb +137 -20
  28. data/lib/moxml/adapter/ox.rb +29 -3
  29. data/lib/moxml/adapter/rexml.rb +54 -7
  30. data/lib/moxml/attribute.rb +6 -0
  31. data/lib/moxml/builder.rb +6 -0
  32. data/lib/moxml/config.rb +52 -1
  33. data/lib/moxml/context.rb +21 -2
  34. data/lib/moxml/doctype.rb +33 -0
  35. data/lib/moxml/document.rb +6 -1
  36. data/lib/moxml/document_builder.rb +45 -1
  37. data/lib/moxml/element.rb +10 -3
  38. data/lib/moxml/entity_reference.rb +29 -0
  39. data/lib/moxml/entity_registry.rb +278 -0
  40. data/lib/moxml/error.rb +5 -5
  41. data/lib/moxml/node.rb +22 -8
  42. data/lib/moxml/node_set.rb +10 -6
  43. data/lib/moxml/processing_instruction.rb +6 -0
  44. data/lib/moxml/version.rb +1 -1
  45. data/lib/moxml/xml_utils.rb +25 -2
  46. data/lib/moxml/xpath/errors.rb +1 -1
  47. data/lib/moxml.rb +1 -0
  48. data/spec/consistency/README.md +3 -1
  49. data/spec/consistency/round_trip_spec.rb +479 -0
  50. data/spec/examples/readme_examples_spec.rb +1 -1
  51. data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
  52. data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
  53. data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
  54. data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
  55. data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
  56. data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
  57. data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
  58. data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
  59. data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
  60. data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
  61. data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
  62. data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
  63. data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
  64. data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
  65. data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
  66. data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
  67. data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
  68. data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
  69. data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
  70. data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
  71. data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
  72. data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
  73. data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
  74. data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
  75. data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
  76. data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
  77. data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
  78. data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
  79. data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
  80. data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
  81. data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
  82. data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
  83. data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
  84. data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
  85. data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
  86. data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
  87. data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
  88. data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
  89. data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
  90. data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
  91. data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
  92. data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
  93. data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
  94. data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
  95. data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
  96. data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
  97. data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
  98. data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
  99. data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
  100. data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
  101. data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
  102. data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
  103. data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
  104. data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
  105. data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
  106. data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
  107. data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
  108. data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
  109. data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
  110. data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
  111. data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
  112. data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
  113. data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
  114. data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
  115. data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
  116. data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
  117. data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
  118. data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
  119. data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
  120. data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
  121. data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
  122. data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
  123. data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
  124. data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
  125. data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
  126. data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
  127. data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
  128. data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
  129. data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
  130. data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
  131. data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
  132. data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
  133. data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
  134. data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
  135. data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
  136. data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
  137. data/spec/integration/w3c_namespace_spec.rb +69 -0
  138. data/spec/moxml/adapter/libxml_spec.rb +7 -1
  139. data/spec/moxml/adapter/oga_spec.rb +92 -0
  140. data/spec/moxml/config_spec.rb +75 -0
  141. data/spec/moxml/doctype_spec.rb +19 -3
  142. data/spec/moxml/entity_registry_spec.rb +184 -0
  143. data/spec/moxml/error_spec.rb +2 -2
  144. data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
  145. data/spec/moxml/xpath/axes_spec.rb +3 -4
  146. data/spec/performance/xpath_benchmark_spec.rb +6 -54
  147. data/spec/support/w3c_namespace_helpers.rb +41 -0
  148. data/spec/unit/rexml_isolated_test.rb +271 -0
  149. metadata +99 -3
  150. data/.ruby-version +0 -1
@@ -67,7 +67,46 @@ module Moxml
67
67
  def visit_text(node)
68
68
  # Prepare node for new document before wrapping
69
69
  prepared = adapter.prepare_for_new_document(node, @current_doc.native)
70
- @node_stack.last&.add_child(Text.new(prepared, context))
70
+ content = adapter.text_content(node)
71
+
72
+ # Check if we should restore entity references for this text
73
+ if context.config.restore_entities && content.to_s =~ /[<>&"']/
74
+ restore_entities_in_text(content)
75
+ else
76
+ @node_stack.last&.add_child(Text.new(prepared, context))
77
+ end
78
+ end
79
+
# Appends +content+ to the node on top of the node stack, turning each of
# the five characters that XML predefines an entity for (< > & " ') into an
# entity reference node and everything else into text nodes.
#
# Runs of ordinary characters are added as a single text node instead of
# one node per character. The character-to-entity mapping is looked up
# directly, so the outcome does not depend on which name the entity
# registry happens to list first for a codepoint, nor on whether the
# registry was loaded at all.
#
# @param content [#to_s] text whose special characters should be restored
# @return [void] does nothing when the node stack is empty
def restore_entities_in_text(content)
  parent = @node_stack.last
  return unless parent

  # Characters with a predefined entity, mapped to that entity's name
  entity_names = {
    "<" => "lt",
    ">" => "gt",
    "&" => "amp",
    '"' => "quot",
    "'" => "apos",
  }

  # The capture group makes split keep the special characters as their own
  # segments, so segments alternate between plain text and one special char.
  content.to_s.split(/([<>&"'])/).each do |segment|
    next if segment.empty?

    entity_name = entity_names[segment]
    if entity_name
      entity_node = adapter.create_entity_reference(entity_name)
      parent.add_child(EntityReference.new(entity_node, context))
    else
      text_node = adapter.create_text(segment)
      parent.add_child(Text.new(text_node, context))
    end
  end

  nil
end
72
111
 
73
112
  def visit_cdata(node)
@@ -90,6 +129,11 @@ module Moxml
90
129
  @node_stack.last&.add_child(Doctype.new(prepared, context))
91
130
  end
92
131
 
132
+ def visit_entity_reference(node)
133
+ prepared = adapter.prepare_for_new_document(node, @current_doc.native)
134
+ @node_stack.last&.add_child(EntityReference.new(prepared, context))
135
+ end
136
+
93
137
  def visit_children(node)
94
138
  node_children = children(node).dup
95
139
  node_children.each do |child|
data/lib/moxml/element.rb CHANGED
@@ -13,6 +13,12 @@ module Moxml
13
13
  adapter.set_node_name(@native, value)
14
14
  end
15
15
 
16
+ # Returns the primary identifier for this element (its tag name)
17
+ # @return [String] the element name
18
+ def identifier
19
+ name
20
+ end
21
+
16
22
  # Returns the expanded name including namespace prefix
17
23
  def expanded_name
18
24
  if namespace_prefix && !namespace_prefix.empty?
@@ -69,8 +75,8 @@ module Moxml
69
75
  end
70
76
 
71
77
  def add_namespace(prefix, uri)
72
- validate_uri(uri)
73
- adapter.create_native_namespace(@native, prefix, uri)
78
+ adapter.create_namespace(@native, prefix, uri,
79
+ namespace_uri_mode: context.config.namespace_uri_mode)
74
80
  self
75
81
  rescue ValidationError => e
76
82
  # Re-raise as NamespaceError, provide attributes for error context
@@ -96,7 +102,8 @@ module Moxml
96
102
  if ns_or_hash.is_a?(Hash)
97
103
  adapter.set_namespace(
98
104
  @native,
99
- adapter.create_namespace(@native, *ns_or_hash.to_a.first),
105
+ adapter.create_namespace(@native, *ns_or_hash.to_a.first,
106
+ namespace_uri_mode: context.config.namespace_uri_mode),
100
107
  )
101
108
  else
102
109
  adapter.set_namespace(@native, ns_or_hash&.native)
@@ -0,0 +1,29 @@
# frozen_string_literal: true

module Moxml
  # Node wrapper for a named entity reference such as +&nbsp;+.
  #
  # The node carries only the entity's name: it reports empty content and
  # text, and serializes back to the +&name;+ form.
  class EntityReference < Node
    # @return [String] the entity name as reported by the adapter
    def name
      adapter.entity_reference_name(@native)
    end

    # The entity name serves as this node's identifier.
    # @return [String]
    def identifier
      name
    end

    # @return [String] always the empty string
    def content
      ""
    end

    # @return [String] always the empty string
    def text
      ""
    end

    # Serializes the reference in its +&name;+ form. Any arguments are
    # accepted and ignored.
    # @return [String]
    def to_xml(*)
      format("&%s;", name)
    end

    # Two references are equal when they are of the same class and wrap
    # equal native nodes.
    # @param other [Object]
    # @return [Boolean]
    def ==(other)
      return false unless self.class == other.class

      @native == other.native
    end
  end
end
@@ -0,0 +1,278 @@
# frozen_string_literal: true

require "json"

module Moxml
  # EntityRegistry maintains a knowledge base of XML entity definitions.
  #
  # Data source: W3C XML Core WG Character Entities (bundled)
  # https://www.w3.org/2003/entities/2007/htmlmathml
  #
  # The W3C entity data is bundled in data/w3c_entities.json and loaded
  # from the gem's data directory. For development, MOXML_ENTITY_DEFINITIONS_PATH
  # can be set to an external copy.
  #
  # Per W3C XML Core WG guidance:
  # - Character entities are XML internal general entities providing a name for a single Unicode character
  # - Standard XML entities (amp, lt, gt, quot, apos) are implicitly declared per XML specification
  # - External entity sets (like HTML, MathML) can be referenced via DTD parameter entities
  #
  # @example Basic usage
  #   registry = EntityRegistry.new
  #   registry.declared?("amp") # => true
  #   registry.codepoint_for_name("amp") # => 38
  #
  class EntityRegistry
    # W3C entity data file name
    ENTITY_DATA_FILE = "w3c_entities.json"

    # Error raised when entity data cannot be loaded
    class EntityDataError < StandardError; end

    class << self
      # Get the raw entity data from the bundled JSON source.
      # The parsed data is memoized on the class until {reset} is called.
      # @return [Hash{String => String}] entity name to character mapping
      # @raise [EntityDataError] if no data file can be read or parsed
      def entity_data
        @entity_data ||= load_entity_data
      end

      # Get the default registry instance (lazy loaded)
      # @return [EntityRegistry]
      def default
        @default ||= new
      end

      # Reset the default registry and the cached entity data
      # (mainly for testing)
      # @return [void]
      def reset
        @default = nil
        @entity_data = nil
      end

      private

      # Load entity data from bundled gem data or local file.
      # Returns the value stored under the "characters" key of the first
      # readable candidate file.
      # @return [Hash{String => String}]
      # @raise [EntityDataError] if nothing could be read or parsing failed
      def load_entity_data
        # Try multiple paths in order of priority
        paths_to_try = []

        # 1. Environment variable override (for development/custom setups)
        if ENV["MOXML_ENTITY_DEFINITIONS_PATH"]
          paths_to_try << ENV["MOXML_ENTITY_DEFINITIONS_PATH"]
        end

        # 2. Relative to this file's directory (for development/installation).
        #    __dir__ is lib/moxml, so ../../data/ resolves to the data/
        #    directory two levels up, next to lib/.
        paths_to_try << File.expand_path(
          "../../data/#{ENTITY_DATA_FILE}",
          __dir__,
        )

        # 3. External xml-entities sibling directory (common development setup)
        paths_to_try << File.expand_path(
          "../../external/xml-entities/docs/2007/htmlmathml.json",
          __dir__,
        )

        data = nil
        paths_to_try.uniq.each do |path|
          next unless path && File.exist?(path)

          begin
            data = File.read(path)
            break
          rescue StandardError
            # Best effort: an unreadable candidate just means we try the next
          end
        end

        unless data
          raise EntityDataError,
                "Entity data not found. Set MOXML_ENTITY_DEFINITIONS_PATH or ensure data/#{ENTITY_DATA_FILE} exists."
        end

        JSON.parse(data)["characters"]
      rescue StandardError => e
        raise EntityDataError, "Failed to load entity definitions: #{e.message}"
      end
    end

    # @return [Hash{String => Integer}] entity name to codepoint mapping
    attr_reader :by_name

    # @return [Hash{Integer => Array<String>}] codepoint to entity names mapping
    attr_reader :by_codepoint

    # Builds a registry and populates it according to +mode+. Any mode other
    # than the four listed leaves the registry empty.
    #
    # @param mode [Symbol] Loading mode: :required, :optional, :disabled, :custom
    # @param entity_provider [Proc, nil] Custom entity provider proc/lambda,
    #   called with no arguments in :custom mode; expected to return a
    #   name => codepoint mapping (or nil)
    # @raise [EntityDataError] in :required mode when the data cannot be loaded
    def initialize(mode: :required, entity_provider: nil)
      @by_name = {}
      @by_codepoint = Hash.new { |h, k| h[k] = [] }
      @mode = mode
      @entity_provider = entity_provider

      case mode
      when :required
        load_from_entity_data
      when :optional
        load_from_entity_data_optional
      when :custom
        load_custom_entities
      when :disabled
        # Don't load anything - empty registry
      end
    end

    # Check if an entity name is declared
    # @param name [String] entity name (e.g., "amp", "nbsp")
    # @return [Boolean]
    def declared?(name)
      @by_name.key?(name)
    end

    # Get the Unicode codepoint for an entity name
    # @param name [String] entity name
    # @return [Integer, nil] codepoint or nil if not found
    def codepoint_for_name(name)
      @by_name[name]
    end

    # Get all entity names for a codepoint.
    # Uses fetch so that asking about an unknown codepoint does not trigger
    # the hash's default block and leave an empty entry behind.
    # @param codepoint [Integer] Unicode codepoint
    # @return [Array<String>] entity names mapping to this codepoint
    #   (empty when there are none)
    def names_for_codepoint(codepoint)
      @by_codepoint.fetch(codepoint) { [] }
    end

    # Get the primary (first registered) entity name for a codepoint.
    # Like {#names_for_codepoint}, this never adds entries to the registry.
    # @param codepoint [Integer] Unicode codepoint
    # @return [String, nil] primary entity name or nil
    def primary_name_for_codepoint(codepoint)
      @by_codepoint.fetch(codepoint, nil)&.first
    end

    # Register additional entities
    # @param entities [Hash{String => Integer}] name => codepoint mapping
    # @return [self]
    def register(entities)
      entities.each { |name, codepoint| add_entity(name, codepoint) }
      self
    end

    # Load all entities from the W3C HTMLMathML entity set.
    # No-op: the data is loaded by initialize.
    # @return [self]
    def load_html5
      self
    end

    # Load MathML entity set (included in HTMLMathML).
    # No-op: the data is loaded by initialize.
    # @return [self]
    def load_mathml
      self
    end

    # Load ISO entity sets (included in HTMLMathML).
    # No-op: the data is loaded by initialize.
    # @param _set_name [Symbol] (ignored, all loaded together)
    # @return [self]
    def load_iso(_set_name = :iso8879)
      self
    end

    # Load all standard entity sets.
    # No-op: the data is loaded by initialize.
    # @return [self]
    def load_all
      self
    end

    # Clear all entities (reset to empty)
    # @return [self]
    def clear!
      @by_name = {}
      @by_codepoint = Hash.new { |h, k| h[k] = [] }
      self
    end

    private

    # Record one name => codepoint pair in both lookup tables.
    # If the name was previously bound to a different codepoint, it is
    # removed from that codepoint's name list so the two tables stay in sync.
    # @param name [String]
    # @param codepoint [Integer]
    # @return [void]
    def add_entity(name, codepoint)
      previous = @by_name[name]
      unless previous.nil? || previous == codepoint
        stale = @by_codepoint.fetch(previous, nil)
        stale&.delete(name)
        @by_codepoint.delete(previous) if stale&.empty?
      end

      @by_name[name] = codepoint
      names = (@by_codepoint[codepoint] ||= [])
      names << name unless names.include?(name)
    end

    # Register every entry of a name => character mapping, skipping entries
    # whose character cannot be turned into a codepoint.
    # @param data [Hash{String => String}]
    # @return [void]
    def load_entities_from(data)
      data.each do |name, char|
        codepoint = parse_codepoint(char)
        next unless codepoint

        add_entity(name, codepoint)
      end
    end

    # Load entities from the centralized JSON data source
    # @raise [EntityDataError] if entity data is required but cannot be loaded
    # @return [void]
    def load_from_entity_data
      data = self.class.entity_data

      if data.nil?
        raise EntityDataError,
              "Entity data is not available. Set entity_load_mode to :optional or :disabled to skip entity loading."
      end

      load_entities_from(data)
    end

    # Load entities from the centralized JSON data source (optional mode)
    # Silently continues if entity data cannot be loaded
    # @return [void]
    def load_from_entity_data_optional
      data = self.class.entity_data
      return unless data

      load_entities_from(data)
    rescue EntityDataError
      # Silently ignore - optional mode
    end

    # Load custom entities from the provided entity provider
    # @return [void]
    def load_custom_entities
      return unless @entity_provider

      entities = @entity_provider.call
      return unless entities

      entities.each { |name, codepoint| add_entity(name, codepoint) }
    end

    # Parse a Unicode character escape to codepoint.
    # A value starting with a literal backslash-u is read as hexadecimal
    # digits; anything else yields the codepoint of its first character.
    # @param char [String] character or escape sequence
    # @return [Integer, nil] nil when the value cannot be interpreted
    def parse_codepoint(char)
      if char.start_with?("\\u")
        # Handle \uXXXX format
        char.unicode_normalize(:nfc)[2..].to_i(16)
      else
        # Single character - get its ord
        char.ord
      end
    rescue StandardError
      nil
    end
  end
end
data/lib/moxml/error.rb CHANGED
@@ -40,7 +40,7 @@ module Moxml
40
40
  msg = super
41
41
  msg += "\n Expression: #{@expression}" if @expression
42
42
  msg += "\n Adapter: #{@adapter}" if @adapter
43
- msg += "\n Node: <#{@node.name}>" if @node.respond_to?(:name)
43
+ msg += "\n Node: <#{@node.name}>" if @node.is_a?(Element) || @node.is_a?(Attribute)
44
44
  msg += "\n Hint: Verify XPath syntax and ensure the adapter supports the expression"
45
45
  msg
46
46
  end
@@ -60,9 +60,9 @@ module Moxml
60
60
  def to_s
61
61
  msg = super
62
62
  # Only add extra details if any were provided
63
- has_details = @node.respond_to?(:name) || @constraint || @value
63
+ has_details = (@node.is_a?(Element) || @node.is_a?(Attribute)) || @constraint || @value
64
64
  if has_details
65
- msg += "\n Node: <#{@node.name}>" if @node.respond_to?(:name)
65
+ msg += "\n Node: <#{@node.name}>" if @node.is_a?(Element) || @node.is_a?(Attribute)
66
66
  msg += "\n Constraint: #{@constraint}" if @constraint
67
67
  msg += "\n Value: #{@value.inspect}" if @value
68
68
  msg += "\n Hint: Ensure the value meets XML specification requirements"
@@ -119,7 +119,7 @@ module Moxml
119
119
 
120
120
  def to_s
121
121
  msg = super
122
- msg += "\n Node: <#{@node.name}>" if @node.respond_to?(:name)
122
+ msg += "\n Node: <#{@node.name}>" if @node.is_a?(Element) || @node.is_a?(Attribute)
123
123
  msg += "\n Adapter: #{@adapter}" if @adapter
124
124
  msg += "\n Format: #{@format}" if @format
125
125
  msg += "\n Hint: Check that the node structure is valid for serialization"
@@ -160,7 +160,7 @@ module Moxml
160
160
  def to_s
161
161
  msg = super
162
162
  msg += "\n Attribute: #{@attribute_name}" if @attribute_name
163
- msg += "\n Element: <#{@element.name}>" if @element.respond_to?(:name)
163
+ msg += "\n Element: <#{@element.name}>" if @element.is_a?(Element)
164
164
  msg += "\n Value: #{@value.inspect}" if @value
165
165
  msg += "\n Hint: Verify attribute name follows XML naming rules"
166
166
  msg
data/lib/moxml/node.rb CHANGED
@@ -9,7 +9,7 @@ module Moxml
9
9
 
10
10
  TYPES = %i[
11
11
  element text cdata comment processing_instruction document
12
- declaration doctype namespace attribute unknown
12
+ declaration doctype namespace attribute unknown entity_reference
13
13
  ].freeze
14
14
 
15
15
  attr_reader :native, :context
@@ -25,7 +25,7 @@ module Moxml
25
25
  end
26
26
 
27
27
  def parent
28
- Node.wrap(adapter.parent(@native), context)
28
+ Moxml::Node.wrap(adapter.parent(@native), context)
29
29
  end
30
30
 
31
31
  def children
@@ -36,11 +36,11 @@ module Moxml
36
36
  end
37
37
 
38
38
  def next_sibling
39
- Node.wrap(adapter.next_sibling(@native), context)
39
+ Moxml::Node.wrap(adapter.next_sibling(@native), context)
40
40
  end
41
41
 
42
42
  def previous_sibling
43
- Node.wrap(adapter.previous_sibling(@native), context)
43
+ Moxml::Node.wrap(adapter.previous_sibling(@native), context)
44
44
  end
45
45
 
46
46
  def add_child(node)
@@ -87,7 +87,8 @@ module Moxml
87
87
  end
88
88
 
89
89
  def at_xpath(expression, namespaces = {})
90
- Node.wrap(adapter.at_xpath(@native, expression, namespaces), context)
90
+ Moxml::Node.wrap(adapter.at_xpath(@native, expression, namespaces),
91
+ context)
91
92
  end
92
93
 
93
94
  # Convenience find methods (aliases for xpath methods)
@@ -120,7 +121,7 @@ module Moxml
120
121
  if respond_to?(:content)
121
122
  content
122
123
  elsif respond_to?(:children)
123
- children.select { |c| c.is_a?(Text) }.map(&:content).join
124
+ children.grep(Text).map(&:content).join
124
125
  else
125
126
  ""
126
127
  end
@@ -170,9 +171,9 @@ module Moxml
170
171
  end
171
172
  end
172
173
 
173
- # Clone the node (deep copy)
174
+ # Clone node (deep copy)
174
175
  def clone
175
- Node.wrap(adapter.dup(@native), context)
176
+ Moxml::Node.wrap(adapter.dup(@native), context)
176
177
  end
177
178
  alias dup clone
178
179
 
@@ -186,6 +187,18 @@ module Moxml
186
187
  end
187
188
  end
188
189
 
190
+ # Returns the primary identifier for this node type
191
+ # For Element: the tag name
192
+ # For Attribute: the attribute name
193
+ # For ProcessingInstruction: the target
194
+ # For content nodes (Text, Comment, Cdata, Declaration): nil (no identifier)
195
+ # For Doctype: nil (not fully implemented across adapters)
196
+ #
197
+ # @return [String, nil] the node's primary identifier or nil
198
+ def identifier
199
+ nil
200
+ end
201
+
189
202
  def self.wrap(node, context)
190
203
  return nil if node.nil?
191
204
 
@@ -199,6 +212,7 @@ module Moxml
199
212
  when :declaration then Declaration
200
213
  when :doctype then Doctype
201
214
  when :attribute then Attribute
215
+ when :entity_reference then EntityReference
202
216
  else self
203
217
  end
204
218
 
@@ -14,25 +14,29 @@ module Moxml
14
14
  def each
15
15
  return to_enum(:each) unless block_given?
16
16
 
17
- nodes.each { |node| yield Node.wrap(node, context) }
17
+ nodes.each { |node| yield Moxml::Node.wrap(node, context) }
18
18
  self
19
19
  end
20
20
 
21
21
  def [](index)
22
22
  case index
23
23
  when Integer
24
- Node.wrap(nodes[index], context)
24
+ Moxml::Node.wrap(nodes[index], context)
25
25
  when Range
26
26
  NodeSet.new(nodes[index], context)
27
27
  end
28
28
  end
29
29
 
30
- def first
31
- Node.wrap(nodes.first, context)
30
+ def first(n = nil)
31
+ if n.nil?
32
+ Moxml::Node.wrap(nodes.first, context)
33
+ else
34
+ nodes.first(n).map { |node| Moxml::Node.wrap(node, context) }
35
+ end
32
36
  end
33
37
 
34
38
  def last
35
- Node.wrap(nodes.last, context)
39
+ Moxml::Node.wrap(nodes.last, context)
36
40
  end
37
41
 
38
42
  def empty?
@@ -81,7 +85,7 @@ module Moxml
81
85
  self.class == other.class &&
82
86
  length == other.length &&
83
87
  nodes.each_with_index.all? do |node, index|
84
- Node.wrap(node, context) == other[index]
88
+ Moxml::Node.wrap(node, context) == other[index]
85
89
  end
86
90
  end
87
91
 
@@ -10,6 +10,12 @@ module Moxml
10
10
  adapter.set_node_name(@native, new_target.to_s)
11
11
  end
12
12
 
13
+ # Returns the primary identifier for this processing instruction (its target)
14
+ # @return [String] the PI target
15
+ def identifier
16
+ target
17
+ end
18
+
13
19
  def content
14
20
  adapter.processing_instruction_content(@native)
15
21
  end
data/lib/moxml/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Moxml
4
- VERSION = "0.1.9"
4
+ VERSION = "0.1.11"
5
5
  end
@@ -57,11 +57,26 @@ module Moxml
57
57
  "Invalid XML processing instruction target: #{target}"
58
58
  end
59
59
 
60
- def validate_uri(uri)
61
- if uri.empty? || uri.match?(/\A#{::URI::DEFAULT_PARSER.make_regexp}\z/)
60
+ def validate_uri(uri, mode: :strict)
61
+ # Empty strings are allowed for default namespace undeclaration (xmlns="").
62
+ return if uri.empty?
63
+
64
+ # In lenient mode, accept any string as a namespace URI.
65
+ # Only reject strings containing XML-invalid characters (control characters).
66
+ if mode == :lenient
67
+ if uri.match?(/[\x00-\x08\x0B\x0C\x0E-\x1F]/)
68
+ raise ValidationError, "Invalid URI: #{uri}"
69
+ end
70
+
62
71
  return
63
72
  end
64
73
 
74
+ # Namespace names must be valid URI-references per RFC 3986
75
+ # (W3C Namespaces in XML, https://www.w3.org/TR/xml-names/).
76
+ # Use split instead of parse to avoid scheme-specific validation
77
+ # that rejects valid opaque URIs like "mailto:bar".
78
+ URI::RFC3986_PARSER.split(uri)
79
+ rescue URI::InvalidURIError
65
80
  raise ValidationError, "Invalid URI: #{uri}"
66
81
  end
67
82
 
@@ -79,5 +94,13 @@ module Moxml
79
94
  else value.to_s
80
95
  end
81
96
  end
97
+
# Checks that +name+ is acceptable as an entity reference name.
#
# A valid name is a String that starts with an ASCII letter or underscore
# and continues with word characters (\w), hyphens, periods or colons.
# The pattern is anchored with \A and \z so the whole string must match;
# the line anchors ^ and $ would let a value such as "amp\n<evil/>" through
# because only one of its lines has to match.
#
# @param name [Object] candidate entity name
# @return [nil] when the name is valid
# @raise [ValidationError] when the name is not a String or does not match
def validate_entity_reference_name(name)
  return if name.is_a?(String) && name.match?(/\A[a-zA-Z_][\w\-.:]*\z/)

  raise ValidationError, "Invalid entity reference name: #{name}"
end
82
105
  end
83
106
  end
@@ -36,7 +36,7 @@ module Moxml
36
36
 
37
37
  def to_s
38
38
  msg = super
39
- msg += "\n Context node: <#{@context_node.name}>" if @context_node.respond_to?(:name)
39
+ msg += "\n Context node: <#{@context_node.name}>" if @context_node.is_a?(Moxml::Element) || @context_node.is_a?(Moxml::Attribute)
40
40
  msg += "\n Step: #{@step}" if @step
41
41
  msg
42
42
  end
data/lib/moxml.rb CHANGED
@@ -42,6 +42,7 @@ require_relative "moxml/error"
42
42
  require_relative "moxml/builder"
43
43
  require_relative "moxml/config"
44
44
  require_relative "moxml/context"
45
+ require_relative "moxml/entity_registry"
45
46
  require_relative "moxml/adapter"
46
47
  require_relative "moxml/xpath"
47
48
  require_relative "moxml/sax"
@@ -27,13 +27,15 @@ bundle exec rake spec:consistency
27
27
 
28
28
  # Run specific consistency test
29
29
  bundle exec rspec spec/consistency/adapter_parity_spec.rb
30
+ bundle exec rspec spec/consistency/round_trip_spec.rb
30
31
  ```
31
32
 
32
33
  ## Directory Structure
33
34
 
34
35
  ```
35
36
  consistency/
36
- └── adapter_parity_spec.rb # Ensures all adapters produce equivalent results
37
+ ├── adapter_parity_spec.rb # Ensures all adapters produce equivalent results
38
+ └── round_trip_spec.rb # Cross-adapter round-trip XML testing
37
39
  ```
38
40
 
39
41
  ## Writing Consistency Tests