moxml 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/docs.yml +1 -1
- data/.github/workflows/rake.yml +16 -13
- data/.github/workflows/release.yml +1 -0
- data/.github/workflows/round-trip.yml +74 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +160 -38
- data/Gemfile +2 -1
- data/README.adoc +287 -20
- data/Rakefile +11 -0
- data/data/w3c_entities.json +2131 -0
- data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
- data/docs/_guides/index.adoc +14 -12
- data/docs/_guides/node-api-consistency.adoc +572 -0
- data/docs/_guides/xml-declaration.adoc +5 -5
- data/docs/_pages/adapters/ox.adoc +30 -0
- data/docs/_pages/adapters/rexml.adoc +1 -1
- data/docs/_pages/configuration.adoc +43 -0
- data/docs/_pages/node-api-reference.adoc +128 -3
- data/docs/_tutorials/namespace-handling.adoc +21 -0
- data/examples/rss_parser/rss_parser.rb +1 -3
- data/lib/moxml/adapter/base.rb +26 -2
- data/lib/moxml/adapter/headed_ox.rb +5 -4
- data/lib/moxml/adapter/libxml.rb +18 -3
- data/lib/moxml/adapter/nokogiri.rb +26 -2
- data/lib/moxml/adapter/oga.rb +137 -20
- data/lib/moxml/adapter/ox.rb +29 -3
- data/lib/moxml/adapter/rexml.rb +54 -7
- data/lib/moxml/attribute.rb +6 -0
- data/lib/moxml/builder.rb +6 -0
- data/lib/moxml/config.rb +52 -1
- data/lib/moxml/context.rb +21 -2
- data/lib/moxml/doctype.rb +33 -0
- data/lib/moxml/document.rb +6 -1
- data/lib/moxml/document_builder.rb +45 -1
- data/lib/moxml/element.rb +10 -3
- data/lib/moxml/entity_reference.rb +29 -0
- data/lib/moxml/entity_registry.rb +278 -0
- data/lib/moxml/error.rb +5 -5
- data/lib/moxml/node.rb +22 -8
- data/lib/moxml/node_set.rb +10 -6
- data/lib/moxml/processing_instruction.rb +6 -0
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils.rb +25 -2
- data/lib/moxml/xpath/errors.rb +1 -1
- data/lib/moxml.rb +1 -0
- data/spec/consistency/README.md +3 -1
- data/spec/consistency/round_trip_spec.rb +479 -0
- data/spec/examples/readme_examples_spec.rb +1 -1
- data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
- data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
- data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
- data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
- data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
- data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
- data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
- data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
- data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
- data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
- data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
- data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
- data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
- data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
- data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
- data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
- data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
- data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
- data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
- data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
- data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
- data/spec/integration/w3c_namespace_spec.rb +69 -0
- data/spec/moxml/adapter/libxml_spec.rb +7 -1
- data/spec/moxml/adapter/oga_spec.rb +92 -0
- data/spec/moxml/config_spec.rb +75 -0
- data/spec/moxml/doctype_spec.rb +19 -3
- data/spec/moxml/entity_registry_spec.rb +184 -0
- data/spec/moxml/error_spec.rb +2 -2
- data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
- data/spec/moxml/xpath/axes_spec.rb +3 -4
- data/spec/performance/xpath_benchmark_spec.rb +6 -54
- data/spec/support/w3c_namespace_helpers.rb +41 -0
- data/spec/unit/rexml_isolated_test.rb +271 -0
- metadata +99 -3
- data/.ruby-version +0 -1
|
@@ -67,7 +67,46 @@ module Moxml
|
|
|
67
67
|
# Copies a native text node into the document being built.
#
# When the `restore_entities` config flag is on and the text contains one of
# the characters < > & " ', the text is passed to `restore_entities_in_text`
# instead of being added as a single Text child.
#
# @param node the adapter-native text node being visited
def visit_text(node)
  # The node is prepared for the target document up front, before the
  # content is inspected.
  prepared = adapter.prepare_for_new_document(node, @current_doc.native)
  content = adapter.text_content(node)

  # Common case: nothing to restore, so wrap the prepared node as-is.
  unless context.config.restore_entities && content.to_s.match?(/[<>&"']/)
    return @node_stack.last&.add_child(Text.new(prepared, context))
  end

  restore_entities_in_text(content)
end
|
|
79
|
+
|
|
80
|
+
# Rebuilds text content under the current parent as a sequence of Text and
# EntityReference children.
#
# A character becomes an EntityReference when the context's entity registry
# reports, as the primary name for its codepoint, one of the five entities
# predefined by the XML specification (lt, gt, amp, quot, apos). All other
# characters are emitted as text.
#
# Consecutive ordinary characters are buffered and emitted as ONE Text node,
# rather than one Text node per character, so a long string with a single
# special character produces three children instead of hundreds.
#
# @param content [#to_s] the text to split
# @return [void]
def restore_entities_in_text(content)
  parent = @node_stack.last
  return unless parent

  # Entities implicitly declared by the XML specification
  restorable = %w[lt gt amp quot apos]
  registry = context.entity_registry
  pending = +""

  # Emits the buffered run of ordinary characters, if any, as a Text child.
  flush = lambda do
    next if pending.empty?

    parent.add_child(Text.new(adapter.create_text(pending), context))
    pending = +"" # fresh buffer; the emitted string is never mutated afterwards
  end

  content.to_s.each_char do |char|
    entity_name = registry.primary_name_for_codepoint(char.ord)

    if entity_name && restorable.include?(entity_name)
      flush.call
      entity_node = adapter.create_entity_reference(entity_name)
      parent.add_child(EntityReference.new(entity_node, context))
    else
      pending << char
    end
  end

  flush.call
  nil
end
|
|
72
111
|
|
|
73
112
|
def visit_cdata(node)
|
|
@@ -90,6 +129,11 @@ module Moxml
|
|
|
90
129
|
@node_stack.last&.add_child(Doctype.new(prepared, context))
|
|
91
130
|
end
|
|
92
131
|
|
|
132
|
+
# Copies an entity reference node into the document being built and appends
# it to the node currently on top of the build stack (if there is one).
#
# @param node the adapter-native entity reference node being visited
def visit_entity_reference(node)
  prepared = adapter.prepare_for_new_document(node, @current_doc.native)
  parent = @node_stack.last
  return if parent.nil?

  parent.add_child(EntityReference.new(prepared, context))
end
|
|
136
|
+
|
|
93
137
|
def visit_children(node)
|
|
94
138
|
node_children = children(node).dup
|
|
95
139
|
node_children.each do |child|
|
data/lib/moxml/element.rb
CHANGED
|
@@ -13,6 +13,12 @@ module Moxml
|
|
|
13
13
|
adapter.set_node_name(@native, value)
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
+
# Returns the primary identifier for this element (its tag name)
|
|
17
|
+
# @return [String] the element name
|
|
18
|
+
def identifier
|
|
19
|
+
name
|
|
20
|
+
end
|
|
21
|
+
|
|
16
22
|
# Returns the expanded name including namespace prefix
|
|
17
23
|
def expanded_name
|
|
18
24
|
if namespace_prefix && !namespace_prefix.empty?
|
|
@@ -69,8 +75,8 @@ module Moxml
|
|
|
69
75
|
end
|
|
70
76
|
|
|
71
77
|
def add_namespace(prefix, uri)
|
|
72
|
-
|
|
73
|
-
|
|
78
|
+
adapter.create_namespace(@native, prefix, uri,
|
|
79
|
+
namespace_uri_mode: context.config.namespace_uri_mode)
|
|
74
80
|
self
|
|
75
81
|
rescue ValidationError => e
|
|
76
82
|
# Re-raise as NamespaceError, provide attributes for error context
|
|
@@ -96,7 +102,8 @@ module Moxml
|
|
|
96
102
|
if ns_or_hash.is_a?(Hash)
|
|
97
103
|
adapter.set_namespace(
|
|
98
104
|
@native,
|
|
99
|
-
adapter.create_namespace(@native, *ns_or_hash.to_a.first
|
|
105
|
+
adapter.create_namespace(@native, *ns_or_hash.to_a.first,
|
|
106
|
+
namespace_uri_mode: context.config.namespace_uri_mode),
|
|
100
107
|
)
|
|
101
108
|
else
|
|
102
109
|
adapter.set_namespace(@native, ns_or_hash&.native)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Moxml
  # Wrapper around an adapter-native entity reference node (e.g. `&nbsp;`).
  #
  # The node exposes only its entity name; it reports no character data of
  # its own, so both #content and #text are always the empty string.
  class EntityReference < Node
    # @return [String] always the empty string
    def content
      ""
    end

    # @return [String] always the empty string
    def text
      ""
    end

    # @return the entity name as reported by the adapter
    def name
      adapter.entity_reference_name(@native)
    end

    # Serializes the reference in `&name;` form. Any arguments are accepted
    # and ignored.
    def to_xml(*)
      "&#{name};"
    end

    # Two references are equal when they are of the same wrapper class and
    # their native nodes compare equal.
    def ==(other)
      return false unless other.instance_of?(self.class)

      @native == other.native
    end

    # Returns the primary identifier for this node (its entity name)
    def identifier
      name
    end
  end
end
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Moxml
  # EntityRegistry maintains a knowledge base of XML entity definitions.
  #
  # Data source: W3C XML Core WG Character Entities (bundled)
  # https://www.w3.org/2003/entities/2007/htmlmathml
  #
  # The W3C entity data is bundled in data/w3c_entities.json and loaded
  # from the gem's data directory. For development, MOXML_ENTITY_DEFINITIONS_PATH
  # can be set to an external copy.
  #
  # Per W3C XML Core WG guidance:
  # - Character entities are XML internal general entities providing a name for a single Unicode character
  # - Standard XML entities (amp, lt, gt, quot, apos) are implicitly declared per XML specification
  # - External entity sets (like HTML, MathML) can be referenced via DTD parameter entities
  #
  # Lookups never modify the registry: asking for an unknown codepoint
  # returns an empty list (or nil) without recording anything.
  #
  # @example Basic usage
  #   registry = EntityRegistry.new
  #   registry.declared?("amp") # => true
  #   registry.codepoint_for_name("amp") # => 38
  #
  class EntityRegistry
    # W3C entity data file name
    ENTITY_DATA_FILE = "w3c_entities.json"

    # Error raised when entity data cannot be loaded
    class EntityDataError < StandardError; end

    class << self
      # Get the raw entity data from the bundled JSON source
      # @raise [EntityDataError] if no data file can be found or parsed
      # @return [Hash{String => String}] entity name to character mapping
      def entity_data
        @entity_data ||= load_entity_data
      end

      # Get the default registry instance (lazy loaded)
      # @return [EntityRegistry]
      def default
        @default ||= new
      end

      # Reset the default registry (mainly for testing)
      # @return [void]
      def reset
        @default = nil
        @entity_data = nil
      end

      private

      # Load entity data from bundled gem data or local file
      # @raise [EntityDataError] if no candidate file is readable or the
      #   content cannot be parsed
      # @return [Hash{String => String}]
      def load_entity_data
        raw = read_first_available(candidate_paths)

        unless raw
          raise EntityDataError,
                "Entity data not found. Set MOXML_ENTITY_DEFINITIONS_PATH or ensure data/#{ENTITY_DATA_FILE} exists."
        end

        JSON.parse(raw)["characters"]
      rescue EntityDataError
        # Already descriptive; do not wrap it in a second EntityDataError.
        raise
      rescue StandardError => e
        raise EntityDataError, "Failed to load entity definitions: #{e.message}"
      end

      # Candidate data file locations, in order of priority
      # @return [Array<String>]
      def candidate_paths
        paths = []

        # 1. Environment variable override (for development/custom setups)
        override = ENV["MOXML_ENTITY_DEFINITIONS_PATH"]
        paths << override if override

        # 2. Relative to moxml lib directory (for development/installation)
        # __dir__ is the directory lib/moxml,
        # so ../../data/ goes to project_root/data/
        paths << File.expand_path("../../data/#{ENTITY_DATA_FILE}", __dir__)

        # 3. External xml-entities sibling directory (common development setup)
        paths << File.expand_path(
          "../../external/xml-entities/docs/2007/htmlmathml.json",
          __dir__,
        )

        paths.uniq
      end

      # Content of the first candidate file that exists and can be read
      # @param paths [Array<String>]
      # @return [String, nil] file content, or nil when none is readable
      def read_first_available(paths)
        paths.each do |path|
          next unless path && File.exist?(path)

          begin
            return File.read(path)
          rescue StandardError
            # Unreadable - try next path
            next
          end
        end

        nil
      end
    end

    # @return [Hash{String => Integer}] entity name to codepoint mapping
    attr_reader :by_name

    # @return [Hash{Integer => Array<String>}] codepoint to entity names mapping
    attr_reader :by_codepoint

    # @param mode [Symbol] Loading mode: :required, :optional, :disabled, :custom
    # @param entity_provider [Proc, nil] Custom entity provider proc/lambda
    # @raise [EntityDataError] in :required mode when entity data is unavailable
    def initialize(mode: :required, entity_provider: nil)
      @mode = mode
      @entity_provider = entity_provider
      clear!

      case mode
      when :required
        load_from_entity_data
      when :optional
        load_from_entity_data_optional
      when :custom
        load_custom_entities
      when :disabled
        # Don't load anything - empty registry
      end
    end

    # Check if an entity name is declared
    # @param name [String] entity name (e.g., "amp", "nbsp")
    # @return [Boolean]
    def declared?(name)
      @by_name.key?(name)
    end

    # Get the Unicode codepoint for an entity name
    # @param name [String] entity name
    # @return [Integer, nil] codepoint or nil if not found
    def codepoint_for_name(name)
      @by_name[name]
    end

    # Get all entity names for a codepoint
    # @param codepoint [Integer] Unicode codepoint
    # @return [Array<String>] entity names mapping to this codepoint
    #   (empty when none is registered)
    def names_for_codepoint(codepoint)
      @by_codepoint[codepoint]
    end

    # Get the primary (preferred) entity name for a codepoint
    # @param codepoint [Integer] Unicode codepoint
    # @return [String, nil] primary entity name or nil
    def primary_name_for_codepoint(codepoint)
      names_for_codepoint(codepoint).first
    end

    # Register additional entities
    # @param entities [Hash{String => Integer}] name => codepoint mapping
    # @return [self]
    def register(entities)
      entities.each { |name, codepoint| add_entity(name, codepoint) }
      self
    end

    # Load all entities from the W3C HTMLMathML entity set
    # This is called automatically by initialize
    # @return [self]
    def load_html5
      # All entities are loaded by default from initialize
      self
    end

    # Load MathML entity set (included in HTMLMathML)
    # @return [self]
    def load_mathml
      # All entities are loaded by default from initialize
      self
    end

    # Load ISO entity sets (included in HTMLMathML)
    # @param _set_name [Symbol] (ignored, all loaded together)
    # @return [self]
    def load_iso(_set_name = :iso8879)
      # All entities are loaded by default from initialize
      self
    end

    # Load all standard entity sets
    # @return [self]
    def load_all
      # All entities are loaded by default from initialize
      self
    end

    # Clear all entities (reset to empty)
    # @return [self]
    def clear!
      @by_name = {}
      # The default block hands out a fresh empty list WITHOUT storing it,
      # so reading an unknown codepoint does not grow the table.
      @by_codepoint = Hash.new { |_table, _codepoint| [] }
      self
    end

    private

    # Load entities from the centralized JSON data source
    # @raise [EntityDataError] if entity data is required but cannot be loaded
    # @return [void]
    def load_from_entity_data
      data = self.class.entity_data

      if data.nil?
        raise EntityDataError,
              "Entity data is not available. Set entity_load_mode to :optional or :disabled to skip entity loading."
      end

      index_entity_data(data)
    end

    # Load entities from the centralized JSON data source (optional mode)
    # Silently continues if entity data cannot be loaded
    # @return [void]
    def load_from_entity_data_optional
      data = self.class.entity_data
      index_entity_data(data) if data
    rescue EntityDataError
      # Silently ignore - optional mode
      nil
    end

    # Load custom entities from the provided entity provider
    # @return [void]
    def load_custom_entities
      return unless @entity_provider

      entities = @entity_provider.call
      register(entities) if entities
    end

    # Index a name => character table as loaded from the JSON data.
    # Entries whose value cannot be turned into a codepoint are skipped.
    # @param data [Hash{String => String}]
    # @return [void]
    def index_entity_data(data)
      data.each do |name, char|
        codepoint = parse_codepoint(char)
        next unless codepoint

        # A multi-character replacement (e.g. "fj") is still a declared
        # name, but must not become a name for its first character.
        add_entity(name, codepoint, reverse: single_character?(char))
      end
    end

    # Record one entity in the name table and, optionally, the reverse table.
    # @param name [String]
    # @param codepoint [Integer]
    # @param reverse [Boolean] whether to list the name under the codepoint
    # @return [void]
    def add_entity(name, codepoint, reverse: true)
      @by_name[name] = codepoint
      return unless reverse

      # key? is required here: plain [] would return the unsaved default list.
      @by_codepoint[codepoint] = [] unless @by_codepoint.key?(codepoint)
      names = @by_codepoint[codepoint]
      names << name unless names.include?(name)
    end

    # Whether the value denotes exactly one character, either literally or
    # as a single \uXXXX escape.
    # @param char [String]
    # @return [Boolean]
    def single_character?(char)
      return char.match?(/\A\\u\h+\z/) if char.start_with?("\\u")

      char.length == 1
    end

    # Parse a Unicode character escape to codepoint
    # @param char [String] character or escape sequence
    # @return [Integer, nil] codepoint of the (first) character, or nil when
    #   the value is empty, not a string, or a malformed escape
    def parse_codepoint(char)
      if char.start_with?("\\u")
        # Handle \uXXXX format; nil (not 0) when no hex digits follow
        char[2..][/\A\h+/]&.to_i(16)
      else
        # Single character - get its ord
        char.ord
      end
    rescue StandardError
      nil
    end
  end
end
|
data/lib/moxml/error.rb
CHANGED
|
@@ -40,7 +40,7 @@ module Moxml
|
|
|
40
40
|
msg = super
|
|
41
41
|
msg += "\n Expression: #{@expression}" if @expression
|
|
42
42
|
msg += "\n Adapter: #{@adapter}" if @adapter
|
|
43
|
-
msg += "\n Node: <#{@node.name}>" if @node.
|
|
43
|
+
msg += "\n Node: <#{@node.name}>" if @node.is_a?(Element) || @node.is_a?(Attribute)
|
|
44
44
|
msg += "\n Hint: Verify XPath syntax and ensure the adapter supports the expression"
|
|
45
45
|
msg
|
|
46
46
|
end
|
|
@@ -60,9 +60,9 @@ module Moxml
|
|
|
60
60
|
def to_s
|
|
61
61
|
msg = super
|
|
62
62
|
# Only add extra details if any were provided
|
|
63
|
-
has_details = @node.
|
|
63
|
+
has_details = (@node.is_a?(Element) || @node.is_a?(Attribute)) || @constraint || @value
|
|
64
64
|
if has_details
|
|
65
|
-
msg += "\n Node: <#{@node.name}>" if @node.
|
|
65
|
+
msg += "\n Node: <#{@node.name}>" if @node.is_a?(Element) || @node.is_a?(Attribute)
|
|
66
66
|
msg += "\n Constraint: #{@constraint}" if @constraint
|
|
67
67
|
msg += "\n Value: #{@value.inspect}" if @value
|
|
68
68
|
msg += "\n Hint: Ensure the value meets XML specification requirements"
|
|
@@ -119,7 +119,7 @@ module Moxml
|
|
|
119
119
|
|
|
120
120
|
def to_s
|
|
121
121
|
msg = super
|
|
122
|
-
msg += "\n Node: <#{@node.name}>" if @node.
|
|
122
|
+
msg += "\n Node: <#{@node.name}>" if @node.is_a?(Element) || @node.is_a?(Attribute)
|
|
123
123
|
msg += "\n Adapter: #{@adapter}" if @adapter
|
|
124
124
|
msg += "\n Format: #{@format}" if @format
|
|
125
125
|
msg += "\n Hint: Check that the node structure is valid for serialization"
|
|
@@ -160,7 +160,7 @@ module Moxml
|
|
|
160
160
|
def to_s
|
|
161
161
|
msg = super
|
|
162
162
|
msg += "\n Attribute: #{@attribute_name}" if @attribute_name
|
|
163
|
-
msg += "\n Element: <#{@element.name}>" if @element.
|
|
163
|
+
msg += "\n Element: <#{@element.name}>" if @element.is_a?(Element)
|
|
164
164
|
msg += "\n Value: #{@value.inspect}" if @value
|
|
165
165
|
msg += "\n Hint: Verify attribute name follows XML naming rules"
|
|
166
166
|
msg
|
data/lib/moxml/node.rb
CHANGED
|
@@ -9,7 +9,7 @@ module Moxml
|
|
|
9
9
|
|
|
10
10
|
TYPES = %i[
|
|
11
11
|
element text cdata comment processing_instruction document
|
|
12
|
-
declaration doctype namespace attribute unknown
|
|
12
|
+
declaration doctype namespace attribute unknown entity_reference
|
|
13
13
|
].freeze
|
|
14
14
|
|
|
15
15
|
attr_reader :native, :context
|
|
@@ -25,7 +25,7 @@ module Moxml
|
|
|
25
25
|
end
|
|
26
26
|
|
|
27
27
|
def parent
|
|
28
|
-
Node.wrap(adapter.parent(@native), context)
|
|
28
|
+
Moxml::Node.wrap(adapter.parent(@native), context)
|
|
29
29
|
end
|
|
30
30
|
|
|
31
31
|
def children
|
|
@@ -36,11 +36,11 @@ module Moxml
|
|
|
36
36
|
end
|
|
37
37
|
|
|
38
38
|
def next_sibling
|
|
39
|
-
Node.wrap(adapter.next_sibling(@native), context)
|
|
39
|
+
Moxml::Node.wrap(adapter.next_sibling(@native), context)
|
|
40
40
|
end
|
|
41
41
|
|
|
42
42
|
def previous_sibling
|
|
43
|
-
Node.wrap(adapter.previous_sibling(@native), context)
|
|
43
|
+
Moxml::Node.wrap(adapter.previous_sibling(@native), context)
|
|
44
44
|
end
|
|
45
45
|
|
|
46
46
|
def add_child(node)
|
|
@@ -87,7 +87,8 @@ module Moxml
|
|
|
87
87
|
end
|
|
88
88
|
|
|
89
89
|
def at_xpath(expression, namespaces = {})
|
|
90
|
-
Node.wrap(adapter.at_xpath(@native, expression, namespaces),
|
|
90
|
+
Moxml::Node.wrap(adapter.at_xpath(@native, expression, namespaces),
|
|
91
|
+
context)
|
|
91
92
|
end
|
|
92
93
|
|
|
93
94
|
# Convenience find methods (aliases for xpath methods)
|
|
@@ -120,7 +121,7 @@ module Moxml
|
|
|
120
121
|
if respond_to?(:content)
|
|
121
122
|
content
|
|
122
123
|
elsif respond_to?(:children)
|
|
123
|
-
children.
|
|
124
|
+
children.grep(Text).map(&:content).join
|
|
124
125
|
else
|
|
125
126
|
""
|
|
126
127
|
end
|
|
@@ -170,9 +171,9 @@ module Moxml
|
|
|
170
171
|
end
|
|
171
172
|
end
|
|
172
173
|
|
|
173
|
-
# Clone
|
|
174
|
+
# Clone node (deep copy)
|
|
174
175
|
def clone
|
|
175
|
-
Node.wrap(adapter.dup(@native), context)
|
|
176
|
+
Moxml::Node.wrap(adapter.dup(@native), context)
|
|
176
177
|
end
|
|
177
178
|
alias dup clone
|
|
178
179
|
|
|
@@ -186,6 +187,18 @@ module Moxml
|
|
|
186
187
|
end
|
|
187
188
|
end
|
|
188
189
|
|
|
190
|
+
# Returns the primary identifier for this node type
|
|
191
|
+
# For Element: the tag name
|
|
192
|
+
# For Attribute: the attribute name
|
|
193
|
+
# For ProcessingInstruction: the target
|
|
194
|
+
# For content nodes (Text, Comment, Cdata, Declaration): nil (no identifier)
|
|
195
|
+
# For Doctype: nil (not fully implemented across adapters)
|
|
196
|
+
#
|
|
197
|
+
# @return [String, nil] the node's primary identifier or nil
|
|
198
|
+
def identifier
|
|
199
|
+
nil
|
|
200
|
+
end
|
|
201
|
+
|
|
189
202
|
def self.wrap(node, context)
|
|
190
203
|
return nil if node.nil?
|
|
191
204
|
|
|
@@ -199,6 +212,7 @@ module Moxml
|
|
|
199
212
|
when :declaration then Declaration
|
|
200
213
|
when :doctype then Doctype
|
|
201
214
|
when :attribute then Attribute
|
|
215
|
+
when :entity_reference then EntityReference
|
|
202
216
|
else self
|
|
203
217
|
end
|
|
204
218
|
|
data/lib/moxml/node_set.rb
CHANGED
|
@@ -14,25 +14,29 @@ module Moxml
|
|
|
14
14
|
def each
|
|
15
15
|
return to_enum(:each) unless block_given?
|
|
16
16
|
|
|
17
|
-
nodes.each { |node| yield Node.wrap(node, context) }
|
|
17
|
+
nodes.each { |node| yield Moxml::Node.wrap(node, context) }
|
|
18
18
|
self
|
|
19
19
|
end
|
|
20
20
|
|
|
21
21
|
def [](index)
|
|
22
22
|
case index
|
|
23
23
|
when Integer
|
|
24
|
-
Node.wrap(nodes[index], context)
|
|
24
|
+
Moxml::Node.wrap(nodes[index], context)
|
|
25
25
|
when Range
|
|
26
26
|
NodeSet.new(nodes[index], context)
|
|
27
27
|
end
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
def first
|
|
31
|
-
|
|
30
|
+
def first(n = nil)
|
|
31
|
+
if n.nil?
|
|
32
|
+
Moxml::Node.wrap(nodes.first, context)
|
|
33
|
+
else
|
|
34
|
+
nodes.first(n).map { |node| Moxml::Node.wrap(node, context) }
|
|
35
|
+
end
|
|
32
36
|
end
|
|
33
37
|
|
|
34
38
|
def last
|
|
35
|
-
Node.wrap(nodes.last, context)
|
|
39
|
+
Moxml::Node.wrap(nodes.last, context)
|
|
36
40
|
end
|
|
37
41
|
|
|
38
42
|
def empty?
|
|
@@ -81,7 +85,7 @@ module Moxml
|
|
|
81
85
|
self.class == other.class &&
|
|
82
86
|
length == other.length &&
|
|
83
87
|
nodes.each_with_index.all? do |node, index|
|
|
84
|
-
Node.wrap(node, context) == other[index]
|
|
88
|
+
Moxml::Node.wrap(node, context) == other[index]
|
|
85
89
|
end
|
|
86
90
|
end
|
|
87
91
|
|
|
@@ -10,6 +10,12 @@ module Moxml
|
|
|
10
10
|
adapter.set_node_name(@native, new_target.to_s)
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
+
# Returns the primary identifier for this processing instruction (its target)
|
|
14
|
+
# @return [String] the PI target
|
|
15
|
+
def identifier
|
|
16
|
+
target
|
|
17
|
+
end
|
|
18
|
+
|
|
13
19
|
def content
|
|
14
20
|
adapter.processing_instruction_content(@native)
|
|
15
21
|
end
|
data/lib/moxml/version.rb
CHANGED
data/lib/moxml/xml_utils.rb
CHANGED
|
@@ -57,11 +57,26 @@ module Moxml
|
|
|
57
57
|
"Invalid XML processing instruction target: #{target}"
|
|
58
58
|
end
|
|
59
59
|
|
|
60
|
-
# Validates a namespace URI.
#
# @param uri [String] the namespace URI to check
# @param mode [Symbol] :strict (default) requires an RFC 3986
#   URI-reference; :lenient accepts any string without control characters
# @raise [ValidationError] if the URI is nil or not acceptable in the mode
def validate_uri(uri, mode: :strict)
  # nil has no #empty?; report it as a validation failure so callers that
  # rescue ValidationError see it instead of a NoMethodError.
  raise ValidationError, "Invalid URI: #{uri.inspect}" if uri.nil?

  # Empty strings are allowed for default namespace undeclaration (xmlns="").
  return if uri.empty?

  # In lenient mode, accept any string as a namespace URI.
  # Only reject strings containing XML-invalid characters (control characters).
  if mode == :lenient
    raise ValidationError, "Invalid URI: #{uri}" if uri.match?(/[\x00-\x08\x0B\x0C\x0E-\x1F]/)

    return
  end

  # Namespace names must be valid URI-references per RFC 3986
  # (W3C Namespaces in XML, https://www.w3.org/TR/xml-names/).
  # Use split instead of parse to avoid scheme-specific validation
  # that rejects valid opaque URIs like "mailto:bar".
  URI::RFC3986_PARSER.split(uri)
rescue URI::InvalidURIError
  raise ValidationError, "Invalid URI: #{uri}"
end
|
|
67
82
|
|
|
@@ -79,5 +94,13 @@ module Moxml
|
|
|
79
94
|
else value.to_s
|
|
80
95
|
end
|
|
81
96
|
end
|
|
97
|
+
|
|
98
|
+
# Validates the name used in an entity reference (`&name;`).
#
# Entity names follow the same pattern as element names: they must start
# with a letter or underscore, followed by letters, digits, hyphens,
# underscores, periods, or colons.
#
# The pattern is anchored with \A and \z so the WHOLE string must match.
# With ^ and $ a multi-line value such as "amp\n<x/>" would pass because
# those anchors match at every line boundary.
#
# @param name [String] candidate entity name
# @raise [ValidationError] if name is not a String or is not a valid name
def validate_entity_reference_name(name)
  return if name.is_a?(String) && name.match?(/\A[a-zA-Z_][\w\-.:]*\z/)

  raise ValidationError, "Invalid entity reference name: #{name}"
end
|
|
82
105
|
end
|
|
83
106
|
end
|
data/lib/moxml/xpath/errors.rb
CHANGED
|
@@ -36,7 +36,7 @@ module Moxml
|
|
|
36
36
|
|
|
37
37
|
def to_s
|
|
38
38
|
msg = super
|
|
39
|
-
msg += "\n Context node: <#{@context_node.name}>" if @context_node.
|
|
39
|
+
msg += "\n Context node: <#{@context_node.name}>" if @context_node.is_a?(Moxml::Element) || @context_node.is_a?(Moxml::Attribute)
|
|
40
40
|
msg += "\n Step: #{@step}" if @step
|
|
41
41
|
msg
|
|
42
42
|
end
|
data/lib/moxml.rb
CHANGED
|
@@ -42,6 +42,7 @@ require_relative "moxml/error"
|
|
|
42
42
|
require_relative "moxml/builder"
|
|
43
43
|
require_relative "moxml/config"
|
|
44
44
|
require_relative "moxml/context"
|
|
45
|
+
require_relative "moxml/entity_registry"
|
|
45
46
|
require_relative "moxml/adapter"
|
|
46
47
|
require_relative "moxml/xpath"
|
|
47
48
|
require_relative "moxml/sax"
|
data/spec/consistency/README.md
CHANGED
|
@@ -27,13 +27,15 @@ bundle exec rake spec:consistency
|
|
|
27
27
|
|
|
28
28
|
# Run specific consistency test
|
|
29
29
|
bundle exec rspec spec/consistency/adapter_parity_spec.rb
|
|
30
|
+
bundle exec rspec spec/consistency/round_trip_spec.rb
|
|
30
31
|
```
|
|
31
32
|
|
|
32
33
|
## Directory Structure
|
|
33
34
|
|
|
34
35
|
```
|
|
35
36
|
consistency/
|
|
36
|
-
|
|
37
|
+
├── adapter_parity_spec.rb # Ensures all adapters produce equivalent results
|
|
38
|
+
└── round_trip_spec.rb # Cross-adapter round-trip XML testing
|
|
37
39
|
```
|
|
38
40
|
|
|
39
41
|
## Writing Consistency Tests
|