moxml 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/docs.yml +1 -1
- data/.github/workflows/rake.yml +16 -13
- data/.github/workflows/release.yml +1 -0
- data/.github/workflows/round-trip.yml +74 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +160 -38
- data/Gemfile +2 -1
- data/README.adoc +236 -0
- data/Rakefile +11 -0
- data/data/w3c_entities.json +2131 -0
- data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
- data/docs/_pages/adapters/ox.adoc +30 -0
- data/docs/_pages/configuration.adoc +43 -0
- data/docs/_pages/node-api-reference.adoc +35 -0
- data/docs/_tutorials/namespace-handling.adoc +21 -0
- data/examples/rss_parser/rss_parser.rb +1 -3
- data/lib/moxml/adapter/base.rb +26 -2
- data/lib/moxml/adapter/headed_ox.rb +5 -4
- data/lib/moxml/adapter/libxml.rb +3 -2
- data/lib/moxml/adapter/nokogiri.rb +16 -3
- data/lib/moxml/adapter/oga.rb +124 -20
- data/lib/moxml/adapter/ox.rb +4 -3
- data/lib/moxml/adapter/rexml.rb +41 -7
- data/lib/moxml/builder.rb +6 -0
- data/lib/moxml/config.rb +52 -1
- data/lib/moxml/context.rb +21 -2
- data/lib/moxml/document.rb +6 -1
- data/lib/moxml/document_builder.rb +45 -1
- data/lib/moxml/element.rb +4 -3
- data/lib/moxml/entity_reference.rb +29 -0
- data/lib/moxml/entity_registry.rb +278 -0
- data/lib/moxml/node.rb +10 -8
- data/lib/moxml/node_set.rb +10 -6
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils.rb +25 -2
- data/lib/moxml.rb +1 -0
- data/spec/consistency/README.md +3 -1
- data/spec/consistency/round_trip_spec.rb +479 -0
- data/spec/examples/readme_examples_spec.rb +1 -1
- data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
- data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
- data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
- data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
- data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
- data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
- data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
- data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
- data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
- data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
- data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
- data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
- data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
- data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
- data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
- data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
- data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
- data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
- data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
- data/spec/integration/shared_examples/node_wrappers/element_behavior.rb +14 -0
- data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
- data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
- data/spec/integration/w3c_namespace_spec.rb +69 -0
- data/spec/moxml/adapter/libxml_spec.rb +7 -1
- data/spec/moxml/adapter/oga_spec.rb +92 -0
- data/spec/moxml/config_spec.rb +75 -0
- data/spec/moxml/entity_registry_spec.rb +184 -0
- data/spec/moxml/error_spec.rb +2 -2
- data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
- data/spec/moxml/xpath/axes_spec.rb +3 -4
- data/spec/performance/xpath_benchmark_spec.rb +6 -54
- data/spec/support/w3c_namespace_helpers.rb +41 -0
- data/spec/unit/rexml_isolated_test.rb +271 -0
- metadata +98 -2
data/lib/moxml/adapter/rexml.rb
CHANGED
|
@@ -10,9 +10,16 @@ module Moxml
|
|
|
10
10
|
module Adapter
|
|
11
11
|
class Rexml < Base
|
|
12
12
|
class << self
|
|
13
|
-
def parse(xml, options = {})
|
|
13
|
+
def parse(xml, options = {}, _context = nil)
|
|
14
|
+
# Handle frozen strings by creating a mutable copy
|
|
15
|
+
processed_xml = if xml.frozen?
|
|
16
|
+
xml.dup.force_encoding("UTF-8").encode("UTF-8")
|
|
17
|
+
else
|
|
18
|
+
xml.force_encoding("UTF-8").encode("UTF-8")
|
|
19
|
+
end
|
|
20
|
+
|
|
14
21
|
native_doc = begin
|
|
15
|
-
::REXML::Document.new(
|
|
22
|
+
::REXML::Document.new(processed_xml)
|
|
16
23
|
rescue ::REXML::ParseException => e
|
|
17
24
|
if options[:strict]
|
|
18
25
|
raise Moxml::ParseError.new(
|
|
@@ -24,7 +31,15 @@ module Moxml
|
|
|
24
31
|
create_document
|
|
25
32
|
end
|
|
26
33
|
|
|
27
|
-
|
|
34
|
+
ctx = _context || Context.new(:rexml)
|
|
35
|
+
DocumentBuilder.new(ctx).build(native_doc)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# Return the encoding named in the XML declaration of +xml+.
#
# The declaration is located with plain index scans and only that slice is
# searched for the encoding pseudo-attribute, which keeps the work linear
# in the size of the input. Do not fold the two steps into one regex with
# an atomic `(?>[^>]*)` prefix: the atomic group swallows the attribute and
# can never give it back, so such a pattern cannot match.
#
# @param xml [String] raw XML text
# @return [String] the declared encoding, or "UTF-8" when there is no
#   declaration or the declaration does not name an encoding
def extract_encoding_from_xml(xml)
  # Require whitespace after the target so "<?xml-stylesheet" is not
  # mistaken for the XML declaration.
  decl_start = xml.index(/<\?xml(?=\s)/i)
  return "UTF-8" unless decl_start

  # Only look inside the declaration itself (up to its first ">").
  decl_end = xml.index(">", decl_start)
  declaration = decl_end ? xml[decl_start..decl_end] : xml[decl_start..]

  declaration[/\bencoding\s*=\s*["']([^"']+)["']/i, 1] || "UTF-8"
end
|
|
29
44
|
|
|
30
45
|
# SAX parsing implementation for REXML
|
|
@@ -359,16 +374,35 @@ module Moxml
|
|
|
359
374
|
when ::REXML::Text, ::REXML::CData
|
|
360
375
|
node.value.to_s
|
|
361
376
|
when ::REXML::Element
|
|
362
|
-
#
|
|
363
|
-
|
|
364
|
-
|
|
377
|
+
# Extract text recursively from all children to match other adapters
|
|
378
|
+
extract_text_recursively(node)
|
|
379
|
+
end
|
|
380
|
+
end
|
|
381
|
+
|
|
382
|
+
# Collect the text of +element+ and all of its descendant elements, in
# document order.
#
# Text values are appended exactly as REXML reports them: nothing is
# stripped and no separator is inserted between sibling or nested pieces.
# Children that are neither text nor elements contribute nothing.
#
# @param element [::REXML::Element, nil] element to read; nil yields ""
# @return [String] concatenated text content
def extract_text_recursively(element)
  return "" unless element

  # One mutable buffer (+"" because string literals are frozen in this
  # file) instead of `text += ...`, which copies the whole accumulated
  # string for every child.
  element.children.each_with_object(+"") do |child, buffer|
    case child
    when ::REXML::Text
      buffer << child.value
    when ::REXML::Element
      buffer << extract_text_recursively(child)
    end
  end
end
|
|
367
401
|
|
|
368
402
|
def inner_text(node)
|
|
369
403
|
# Get direct text children only, filter duplicates
|
|
370
404
|
text_children = node.children
|
|
371
|
-
.
|
|
405
|
+
.grep(::REXML::Text)
|
|
372
406
|
.uniq(&:object_id)
|
|
373
407
|
text_children.map(&:value).join
|
|
374
408
|
end
|
data/lib/moxml/builder.rb
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
module Moxml
|
|
4
4
|
class Builder
|
|
5
|
+
attr_reader :document
|
|
6
|
+
|
|
5
7
|
def initialize(context)
|
|
6
8
|
@context = context
|
|
7
9
|
@current = @document = context.create_document
|
|
@@ -60,6 +62,10 @@ module Moxml
|
|
|
60
62
|
@current.add_child(@document.create_comment(content))
|
|
61
63
|
end
|
|
62
64
|
|
|
65
|
+
# Create an entity reference for +name+ through the document being built
# and append it to the node that is currently open.
#
# @param name [String] entity name
# @return the result of adding the new node to the current node
def entity_reference(name)
  reference = @document.create_entity_reference(name)
  @current.add_child(reference)
end
|
|
68
|
+
|
|
63
69
|
def processing_instruction(target, content)
|
|
64
70
|
@current.add_child(
|
|
65
71
|
@document.create_processing_instruction(target, content),
|
data/lib/moxml/config.rb
CHANGED
|
@@ -5,6 +5,13 @@ module Moxml
|
|
|
5
5
|
VALID_ADAPTERS = %i[nokogiri oga rexml ox headed_ox libxml].freeze
|
|
6
6
|
DEFAULT_ADAPTER = VALID_ADAPTERS.first
|
|
7
7
|
|
|
8
|
+
# Entity loading modes:
|
|
9
|
+
# - :required - Must load entities, raise error if unavailable (default)
|
|
10
|
+
# - :optional - Try to load, continue silently if unavailable
|
|
11
|
+
# - :disabled - Don't load entities, use empty registry
|
|
12
|
+
# - :custom - Use custom entity provider via entity_provider callback
|
|
13
|
+
ENTITY_LOAD_MODES = %i[required optional disabled custom].freeze
|
|
14
|
+
|
|
8
15
|
class << self
|
|
9
16
|
attr_writer :default_adapter
|
|
10
17
|
|
|
@@ -17,11 +24,18 @@ module Moxml
|
|
|
17
24
|
end
|
|
18
25
|
end
|
|
19
26
|
|
|
27
|
+
NAMESPACE_URI_MODES = %i[strict lenient].freeze
|
|
28
|
+
|
|
20
29
|
attr_reader :adapter_name
|
|
21
30
|
attr_accessor :strict_parsing,
|
|
22
31
|
:default_encoding,
|
|
23
32
|
:entity_encoding,
|
|
24
|
-
:default_indent
|
|
33
|
+
:default_indent,
|
|
34
|
+
:restore_entities,
|
|
35
|
+
:preload_entity_sets,
|
|
36
|
+
:entity_load_mode,
|
|
37
|
+
:entity_provider,
|
|
38
|
+
:namespace_uri_mode
|
|
25
39
|
|
|
26
40
|
def initialize(adapter_name = nil, strict_parsing = nil,
|
|
27
41
|
default_encoding = nil)
|
|
@@ -31,6 +45,11 @@ module Moxml
|
|
|
31
45
|
# reserved for future use
|
|
32
46
|
@default_indent = 2
|
|
33
47
|
@entity_encoding = :basic
|
|
48
|
+
@restore_entities = false
|
|
49
|
+
@preload_entity_sets = []
|
|
50
|
+
@entity_load_mode = :required
|
|
51
|
+
@entity_provider = nil
|
|
52
|
+
@namespace_uri_mode = :strict
|
|
34
53
|
end
|
|
35
54
|
|
|
36
55
|
def adapter=(name)
|
|
@@ -57,5 +76,37 @@ module Moxml
|
|
|
57
76
|
# Adapter selected by @adapter_name.
#
# The adapter is resolved through Adapter.load on first use and the result
# is kept in @adapter for later calls.
#
# @return the loaded adapter
def adapter
  return @adapter if @adapter

  @adapter = Adapter.load(@adapter_name)
end
|
|
79
|
+
|
|
80
|
+
# Select how entity definitions are loaded.
#
# Values that respond to #to_sym (e.g. "optional") are converted to
# symbols first, the same coercion namespace_uri_mode= applies, so modes
# read from string-based configuration are accepted.
#
# @param mode [Symbol, String] one of ENTITY_LOAD_MODES
# @raise [ArgumentError] if +mode+ is not one of ENTITY_LOAD_MODES
def entity_load_mode=(mode)
  mode = mode.to_sym if mode.respond_to?(:to_sym)
  unless ENTITY_LOAD_MODES.include?(mode)
    raise ArgumentError,
          "Invalid entity_load_mode: #{mode}. Must be one of: #{ENTITY_LOAD_MODES.join(', ')}"
  end

  @entity_load_mode = mode
end
|
|
88
|
+
|
|
89
|
+
# Select how namespace URIs are validated.
#
# @param mode [Symbol, String] one of NAMESPACE_URI_MODES; values that
#   respond to #to_sym are converted to symbols before validation
# @raise [ArgumentError] if +mode+ is not one of NAMESPACE_URI_MODES,
#   including values such as nil that cannot be converted to a symbol
def namespace_uri_mode=(mode)
  # Guard the conversion so nil/Integer end up in the ArgumentError below
  # instead of failing with NoMethodError on #to_sym.
  mode = mode.to_sym if mode.respond_to?(:to_sym)
  unless NAMESPACE_URI_MODES.include?(mode)
    raise ArgumentError,
          "Invalid namespace_uri_mode: #{mode}. Must be one of: #{NAMESPACE_URI_MODES.join(', ')}"
  end

  @namespace_uri_mode = mode
end
|
|
98
|
+
|
|
99
|
+
# Backward compatibility: convert old boolean to new symbol
|
|
100
|
+
# Backward-compatible setter for the old boolean option.
#
# true maps to :required and false to :disabled; any other value is
# treated as an entity load mode. The result is assigned through
# entity_load_mode= so it is validated like a direct assignment instead of
# being written to @entity_load_mode unchecked.
#
# @param value [Boolean, Symbol] legacy flag or an entity load mode
# @raise [ArgumentError] if the resulting mode is rejected by
#   entity_load_mode=
def load_external_entities=(value)
  self.entity_load_mode =
    case value
    when true then :required
    when false then :disabled
    else value
    end
end
|
|
107
|
+
|
|
108
|
+
# Backward-compatible reader for the old boolean option.
#
# @return [Boolean] true only while the entity load mode is :required
def load_external_entities
  :required == @entity_load_mode
end
|
|
60
111
|
end
|
|
61
112
|
end
|
data/lib/moxml/context.rb
CHANGED
|
@@ -8,6 +8,10 @@ module Moxml
|
|
|
8
8
|
@config = Config.new(adapter)
|
|
9
9
|
end
|
|
10
10
|
|
|
11
|
+
# Entity registry for this context.
#
# Built by build_entity_registry on first access and kept in
# @entity_registry for later calls.
#
# @return the registry returned by build_entity_registry
def entity_registry
  return @entity_registry if @entity_registry

  @entity_registry = build_entity_registry
end
|
|
14
|
+
|
|
11
15
|
# Wrap a native document in a Document bound to this context.
#
# @param native_doc [Object, nil] value handed to the adapter's
#   create_document; nil by default
# @return [Document] wrapper around the adapter's document
def create_document(native_doc = nil)
  native = config.adapter.create_document(native_doc)
  Document.new(native, self)
end
|
|
@@ -23,9 +27,9 @@ module Moxml
|
|
|
23
27
|
end
|
|
24
28
|
has_declaration = xml_string.strip.start_with?("<?xml")
|
|
25
29
|
|
|
26
|
-
# Parse with adapter (
|
|
30
|
+
# Parse with adapter, passing self (context) so adapter can use our config
|
|
27
31
|
parsed_options = default_options.merge(options)
|
|
28
|
-
doc = config.adapter.parse(xml_string, parsed_options)
|
|
32
|
+
doc = config.adapter.parse(xml_string, parsed_options, self)
|
|
29
33
|
|
|
30
34
|
# Set declaration flag on Document wrapper (proper OOP)
|
|
31
35
|
doc.has_xml_declaration = has_declaration if doc.is_a?(Document)
|
|
@@ -73,6 +77,21 @@ module Moxml
|
|
|
73
77
|
|
|
74
78
|
private
|
|
75
79
|
|
|
80
|
+
# Build the entity registry described by the configuration.
#
# The registry is created with the configured load mode and entity
# provider; afterwards the loader for each recognised name in
# config.preload_entity_sets (:html5, :mathml, :iso) is invoked in order.
# Unrecognised names are skipped.
#
# @return [EntityRegistry]
def build_entity_registry
  loaders = { html5: :load_html5, mathml: :load_mathml, iso: :load_iso }

  EntityRegistry.new(
    mode: config.entity_load_mode,
    entity_provider: config.entity_provider,
  ).tap do |registry|
    config.preload_entity_sets.each do |set_name|
      loader = loaders[set_name]
      registry.public_send(loader) if loader
    end
  end
end
|
|
94
|
+
|
|
76
95
|
def default_options
|
|
77
96
|
{
|
|
78
97
|
encoding: config.default_encoding,
|
data/lib/moxml/document.rb
CHANGED
|
@@ -9,6 +9,7 @@ require_relative "processing_instruction"
|
|
|
9
9
|
require_relative "declaration"
|
|
10
10
|
require_relative "namespace"
|
|
11
11
|
require_relative "doctype"
|
|
12
|
+
require_relative "entity_reference"
|
|
12
13
|
|
|
13
14
|
module Moxml
|
|
14
15
|
class Document < Node
|
|
@@ -68,6 +69,10 @@ module Moxml
|
|
|
68
69
|
Declaration.new(decl, context)
|
|
69
70
|
end
|
|
70
71
|
|
|
72
|
+
# Create an entity reference node for this document.
#
# @param name [String] entity name passed to the adapter
# @return [EntityReference] wrapper around the adapter's native node
def create_entity_reference(name)
  native_reference = adapter.create_entity_reference(name)
  EntityReference.new(native_reference, context)
end
|
|
75
|
+
|
|
71
76
|
def add_child(node)
|
|
72
77
|
node = prepare_node(node)
|
|
73
78
|
|
|
@@ -109,7 +114,7 @@ module Moxml
|
|
|
109
114
|
|
|
110
115
|
def at_xpath(expression, namespaces = nil)
|
|
111
116
|
if (native_node = adapter.at_xpath(@native, expression, namespaces))
|
|
112
|
-
Node.wrap(native_node, context)
|
|
117
|
+
Moxml::Node.wrap(native_node, context)
|
|
113
118
|
end
|
|
114
119
|
end
|
|
115
120
|
|
|
@@ -67,7 +67,46 @@ module Moxml
|
|
|
67
67
|
def visit_text(node)
|
|
68
68
|
# Prepare node for new document before wrapping
|
|
69
69
|
prepared = adapter.prepare_for_new_document(node, @current_doc.native)
|
|
70
|
-
|
|
70
|
+
content = adapter.text_content(node)
|
|
71
|
+
|
|
72
|
+
# Check if we should restore entity references for this text
|
|
73
|
+
if context.config.restore_entities && content.to_s =~ /[<>&"']/
|
|
74
|
+
restore_entities_in_text(content)
|
|
75
|
+
else
|
|
76
|
+
@node_stack.last&.add_child(Text.new(prepared, context))
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# The five entities predefined by the XML specification, keyed by the
# character each one stands for.
PREDEFINED_ENTITY_NAMES = {
  "<" => "lt",
  ">" => "gt",
  "&" => "amp",
  '"' => "quot",
  "'" => "apos",
}.freeze

# Rebuild +content+ under the node on top of @node_stack, turning each of
# the five XML-predefined characters into an entity reference node.
#
# A character is restored only when the context's entity registry maps its
# predefined name to that character's codepoint, so an empty registry
# restores nothing. The name is looked up directly rather than through the
# "primary" name of the codepoint, because the primary name depends on the
# order in which aliases were registered.
#
# Consecutive ordinary characters are emitted as a single text node rather
# than one node per character.
#
# @param content [#to_s] decoded text of the node being visited
# @return [void]
def restore_entities_in_text(content)
  parent = @node_stack.last
  return unless parent

  registry = context.entity_registry
  pending = +""

  content.to_s.each_char do |char|
    entity_name = PREDEFINED_ENTITY_NAMES[char]

    if entity_name && registry.codepoint_for_name(entity_name) == char.ord
      # Flush the text collected so far, then add the reference itself.
      append_text_run(parent, pending)
      pending = +""
      entity_node = adapter.create_entity_reference(entity_name)
      parent.add_child(EntityReference.new(entity_node, context))
    else
      pending << char
    end
  end

  append_text_run(parent, pending)
  nil
end

# Append +text+ to +parent+ as one text node; empty runs are skipped.
def append_text_run(parent, text)
  return if text.empty?

  parent.add_child(Text.new(adapter.create_text(text), context))
end
|
|
72
111
|
|
|
73
112
|
def visit_cdata(node)
|
|
@@ -90,6 +129,11 @@ module Moxml
|
|
|
90
129
|
@node_stack.last&.add_child(Doctype.new(prepared, context))
|
|
91
130
|
end
|
|
92
131
|
|
|
132
|
+
# Visit a native entity reference node: prepare it for the document being
# built and, when a parent is open on @node_stack, append it there wrapped
# in an EntityReference.
#
# @param node [Object] native entity reference node
# @return the result of add_child, or nil when no parent is open
def visit_entity_reference(node)
  prepared = adapter.prepare_for_new_document(node, @current_doc.native)
  parent = @node_stack.last
  return unless parent

  parent.add_child(EntityReference.new(prepared, context))
end
|
|
136
|
+
|
|
93
137
|
def visit_children(node)
|
|
94
138
|
node_children = children(node).dup
|
|
95
139
|
node_children.each do |child|
|
data/lib/moxml/element.rb
CHANGED
|
@@ -75,8 +75,8 @@ module Moxml
|
|
|
75
75
|
end
|
|
76
76
|
|
|
77
77
|
def add_namespace(prefix, uri)
|
|
78
|
-
|
|
79
|
-
|
|
78
|
+
adapter.create_namespace(@native, prefix, uri,
|
|
79
|
+
namespace_uri_mode: context.config.namespace_uri_mode)
|
|
80
80
|
self
|
|
81
81
|
rescue ValidationError => e
|
|
82
82
|
# Re-raise as NamespaceError, provide attributes for error context
|
|
@@ -102,7 +102,8 @@ module Moxml
|
|
|
102
102
|
if ns_or_hash.is_a?(Hash)
|
|
103
103
|
adapter.set_namespace(
|
|
104
104
|
@native,
|
|
105
|
-
adapter.create_namespace(@native, *ns_or_hash.to_a.first
|
|
105
|
+
adapter.create_namespace(@native, *ns_or_hash.to_a.first,
|
|
106
|
+
namespace_uri_mode: context.config.namespace_uri_mode),
|
|
106
107
|
)
|
|
107
108
|
else
|
|
108
109
|
adapter.set_namespace(@native, ns_or_hash&.native)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Moxml
  # Node wrapper for an entity reference such as "&name;".
  #
  # The reference is kept by name: it reports no textual content of its own
  # and serializes back to the "&name;" form.
  class EntityReference < Node
    # @return [String] always the empty string
    def content
      ""
    end

    # @return [String] always the empty string
    def text
      ""
    end

    # @return the entity name as reported by the adapter for the native node
    def name
      adapter.entity_reference_name(@native)
    end

    # Serialize the reference; any arguments are accepted and ignored.
    #
    # @return [String] the name wrapped in "&" and ";"
    def to_xml(*)
      format("&%s;", name)
    end

    # Two references are equal when they are of the same class and wrap
    # equal native nodes.
    #
    # @param other [Object]
    # @return [Boolean]
    def ==(other)
      return false unless self.class == other.class

      @native == other.native
    end

    # @return the entity name (same value as #name)
    def identifier
      name
    end
  end
end
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Moxml
  # EntityRegistry maintains a knowledge base of XML entity definitions:
  # a mapping from entity name to Unicode codepoint and the reverse mapping
  # from codepoint to the names registered for it.
  #
  # Data source: W3C XML Core WG Character Entities (bundled)
  # https://www.w3.org/2003/entities/2007/htmlmathml
  #
  # The W3C entity data is bundled in data/w3c_entities.json and loaded
  # from the gem's data directory. For development, MOXML_ENTITY_DEFINITIONS_PATH
  # can be set to an external copy.
  #
  # Per W3C XML Core WG guidance:
  # - Character entities are XML internal general entities providing a name for a single Unicode character
  # - Standard XML entities (amp, lt, gt, quot, apos) are implicitly declared per XML specification
  # - External entity sets (like HTML, MathML) can be referenced via DTD parameter entities
  #
  # @example Basic usage
  #   registry = EntityRegistry.new
  #   registry.declared?("amp")           # => true
  #   registry.codepoint_for_name("amp")  # => 38
  #
  class EntityRegistry
    # W3C entity data file name
    ENTITY_DATA_FILE = "w3c_entities.json"

    # A literal backslash-u escape ("\uXXXX" or "\u{XXXX}") spelled out as
    # text, i.e. one that was not already decoded by the JSON parser.
    UNICODE_ESCAPE = /\A\\u\{?(\h{1,6})\}?\z/
    private_constant :UNICODE_ESCAPE

    # Error raised when entity data cannot be loaded
    class EntityDataError < StandardError; end

    class << self
      # Get the raw entity data from the bundled JSON source.
      # The parsed data is cached on the class until {reset} is called.
      # @return [Hash{String => String}] entity name to character mapping
      # @raise [EntityDataError] if no data file can be read or parsed
      def entity_data
        @entity_data ||= load_entity_data
      end

      # Get the default registry instance (lazy loaded)
      # @return [EntityRegistry]
      def default
        @default ||= new
      end

      # Reset the default registry and the cached entity data (mainly for testing)
      # @return [void]
      def reset
        @default = nil
        @entity_data = nil
      end

      private

      # Load entity data from the first readable candidate file.
      # @return [Hash{String => String}, nil] the "characters" member of the
      #   JSON document (nil when the document has no such member)
      # @raise [EntityDataError] if no file is readable or parsing fails
      def load_entity_data
        data = read_first_available(entity_data_paths)

        unless data
          raise EntityDataError,
                "Entity data not found. Set MOXML_ENTITY_DEFINITIONS_PATH or ensure data/#{ENTITY_DATA_FILE} exists."
        end

        JSON.parse(data)["characters"]
      rescue StandardError => e
        raise EntityDataError, "Failed to load entity definitions: #{e.message}"
      end

      # Candidate data files, in order of priority.
      # @return [Array<String>]
      def entity_data_paths
        [
          # 1. Environment variable override (for development/custom setups)
          ENV["MOXML_ENTITY_DEFINITIONS_PATH"],
          # 2. Bundled copy. __dir__ is the lib/moxml directory, so
          #    ../../data/ resolves to project_root/data/
          File.expand_path("../../data/#{ENTITY_DATA_FILE}", __dir__),
          # 3. External xml-entities sibling directory (common development setup)
          File.expand_path("../../external/xml-entities/docs/2007/htmlmathml.json", __dir__),
        ].compact.uniq
      end

      # Read the first path that exists and can be read; an unreadable
      # file is skipped in favour of the next candidate.
      # @param paths [Array<String>]
      # @return [String, nil] file contents, or nil when nothing was readable
      def read_first_available(paths)
        paths.each do |path|
          next unless File.exist?(path)

          begin
            return File.read(path)
          rescue StandardError
            next
          end
        end
        nil
      end
    end

    # @return [Hash{String => Integer}] entity name to codepoint mapping
    attr_reader :by_name

    # @return [Hash{Integer => Array<String>}] codepoint to entity names mapping;
    #   reading an unregistered codepoint yields an empty array
    attr_reader :by_codepoint

    # @param mode [Symbol] Loading mode: :required, :optional, :disabled, :custom
    # @param entity_provider [Proc, nil] Custom entity provider proc/lambda;
    #   called without arguments in :custom mode and expected to return a
    #   name => codepoint mapping (or nil)
    # @raise [EntityDataError] in :required mode when the data cannot be loaded
    def initialize(mode: :required, entity_provider: nil)
      @by_name = {}
      @by_codepoint = new_reverse_index
      @mode = mode
      @entity_provider = entity_provider

      case mode
      when :required
        load_from_entity_data
      when :optional
        load_from_entity_data_optional
      when :custom
        load_custom_entities
      when :disabled
        # Don't load anything - empty registry
      end
    end

    # Check if an entity name is declared
    # @param name [String] entity name (e.g., "amp", "nbsp")
    # @return [Boolean]
    def declared?(name)
      @by_name.key?(name)
    end

    # Get the Unicode codepoint for an entity name
    # @param name [String] entity name
    # @return [Integer, nil] codepoint or nil if not found
    def codepoint_for_name(name)
      @by_name[name]
    end

    # Get all entity names for a codepoint, in registration order.
    # Looking up an unregistered codepoint does not modify the registry.
    # @param codepoint [Integer] Unicode codepoint
    # @return [Array<String>] entity names mapping to this codepoint
    def names_for_codepoint(codepoint)
      @by_codepoint.fetch(codepoint) { [] }
    end

    # Get the primary entity name for a codepoint, i.e. the name that was
    # registered first for it.
    # @param codepoint [Integer] Unicode codepoint
    # @return [String, nil] primary entity name or nil
    def primary_name_for_codepoint(codepoint)
      @by_codepoint.fetch(codepoint, nil)&.first
    end

    # Register additional entities
    # @param entities [Hash{String => Integer}] name => codepoint mapping
    # @return [self]
    def register(entities)
      entities.each { |name, codepoint| add_entity(name, codepoint) }
      self
    end

    # Kept for API compatibility: this method loads nothing itself.
    # The bundled data set is read once, in the constructor, for the
    # :required and :optional modes.
    # @return [self]
    def load_html5
      self
    end

    # Kept for API compatibility: this method loads nothing itself
    # (see #load_html5).
    # @return [self]
    def load_mathml
      self
    end

    # Kept for API compatibility: this method loads nothing itself
    # (see #load_html5).
    # @param _set_name [Symbol] (ignored)
    # @return [self]
    def load_iso(_set_name = :iso8879)
      self
    end

    # Kept for API compatibility: this method loads nothing itself
    # (see #load_html5).
    # @return [self]
    def load_all
      self
    end

    # Clear all entities (reset to empty)
    # @return [self]
    def clear!
      @by_name = {}
      @by_codepoint = new_reverse_index
      self
    end

    private

    # Reverse index whose default is a fresh empty array that is NOT stored,
    # so reading an unknown codepoint never grows the hash.
    # @return [Hash{Integer => Array<String>}]
    def new_reverse_index
      Hash.new { |_hash, _codepoint| [] }
    end

    # Record one name => codepoint pair in both directions. When +name+ was
    # previously registered for a different codepoint, it is removed from
    # that codepoint's list so the two indexes stay consistent.
    # @param name [String]
    # @param codepoint [Integer]
    # @return [void]
    def add_entity(name, codepoint)
      if @by_name.key?(name) && @by_name[name] != codepoint
        previous = @by_name[name]
        if @by_codepoint.key?(previous)
          stale = @by_codepoint.fetch(previous)
          stale.delete(name)
          @by_codepoint.delete(previous) if stale.empty?
        end
      end

      @by_name[name] = codepoint
      names = @by_codepoint.fetch(codepoint) { @by_codepoint[codepoint] = [] }
      names << name unless names.include?(name)
    end

    # Register every entry of a name => character mapping, skipping values
    # that cannot be turned into a codepoint.
    # @param data [Hash{String => String}]
    # @return [void]
    def register_characters(data)
      data.each do |name, char|
        codepoint = parse_codepoint(char)
        add_entity(name, codepoint) if codepoint
      end
    end

    # Load entities from the centralized JSON data source
    # @raise [EntityDataError] if entity data is required but cannot be loaded
    # @return [void]
    def load_from_entity_data
      data = self.class.entity_data

      if data.nil?
        raise EntityDataError,
              "Entity data is not available. Set entity_load_mode to :optional or :disabled to skip entity loading."
      end

      register_characters(data)
    end

    # Load entities from the centralized JSON data source (optional mode)
    # Silently continues if entity data cannot be loaded
    # @return [void]
    def load_from_entity_data_optional
      data = self.class.entity_data
      register_characters(data) if data
    rescue EntityDataError
      # Silently ignore - optional mode
    end

    # Load custom entities from the provided entity provider
    # @return [void]
    def load_custom_entities
      return unless @entity_provider

      entities = @entity_provider.call
      register(entities) if entities
    end

    # Convert a data value to a codepoint.
    #
    # A literal "\uXXXX" escape is parsed as hexadecimal; a malformed
    # escape yields nil instead of being registered as codepoint 0. Any
    # other string yields the codepoint of its first character.
    # NOTE(review): values made of several characters are therefore
    # registered under their first codepoint only - confirm this is intended.
    #
    # @param char [String] character or escape sequence
    # @return [Integer, nil]
    def parse_codepoint(char)
      return nil unless char.is_a?(String) && !char.empty?

      if char.start_with?("\\u")
        char[UNICODE_ESCAPE, 1]&.to_i(16)
      else
        char.ord
      end
    rescue ArgumentError
      # String#ord raises on an invalid byte sequence.
      nil
    end
  end
end
|
+
# @param mode [Symbol] Loading mode: :required, :optional, :disabled, :custom
|
|
109
|
+
# @param entity_provider [Proc, nil] Custom entity provider proc/lambda
|
|
110
|
+
def initialize(mode: :required, entity_provider: nil)
|
|
111
|
+
@by_name = {}
|
|
112
|
+
@by_codepoint = Hash.new { |h, k| h[k] = [] }
|
|
113
|
+
@mode = mode
|
|
114
|
+
@entity_provider = entity_provider
|
|
115
|
+
|
|
116
|
+
case mode
|
|
117
|
+
when :required
|
|
118
|
+
load_from_entity_data
|
|
119
|
+
when :optional
|
|
120
|
+
load_from_entity_data_optional
|
|
121
|
+
when :custom
|
|
122
|
+
load_custom_entities
|
|
123
|
+
when :disabled
|
|
124
|
+
# Don't load anything - empty registry
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
# Check if an entity name is declared
|
|
129
|
+
# @param name [String] entity name (e.g., "amp", "nbsp")
|
|
130
|
+
# @return [Boolean]
|
|
131
|
+
def declared?(name)
|
|
132
|
+
@by_name.key?(name)
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
# Get the Unicode codepoint for an entity name
|
|
136
|
+
# @param name [String] entity name
|
|
137
|
+
# @return [Integer, nil] codepoint or nil if not found
|
|
138
|
+
def codepoint_for_name(name)
|
|
139
|
+
@by_name[name]
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
# Get all entity names for a codepoint
|
|
143
|
+
# @param codepoint [Integer] Unicode codepoint
|
|
144
|
+
# @return [Array<String>] entity names mapping to this codepoint
|
|
145
|
+
def names_for_codepoint(codepoint)
|
|
146
|
+
@by_codepoint[codepoint]
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
# Get the primary (preferred) entity name for a codepoint
|
|
150
|
+
# @param codepoint [Integer] Unicode codepoint
|
|
151
|
+
# @return [String, nil] primary entity name or nil
|
|
152
|
+
def primary_name_for_codepoint(codepoint)
|
|
153
|
+
@by_codepoint[codepoint]&.first
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
# Register additional entities
|
|
157
|
+
# @param entities [Hash{String => Integer}] name => codepoint mapping
|
|
158
|
+
# @return [self]
|
|
159
|
+
def register(entities)
|
|
160
|
+
entities.each do |name, codepoint|
|
|
161
|
+
@by_name[name] = codepoint
|
|
162
|
+
@by_codepoint[codepoint] ||= []
|
|
163
|
+
@by_codepoint[codepoint] << name unless @by_codepoint[codepoint].include?(name)
|
|
164
|
+
end
|
|
165
|
+
self
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
# Load all entities from the W3C HTMLMathML entity set
|
|
169
|
+
# This is called automatically by initialize
|
|
170
|
+
# @return [self]
|
|
171
|
+
def load_html5
|
|
172
|
+
# All entities are loaded by default from initialize
|
|
173
|
+
self
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
# Load MathML entity set (included in HTMLMathML)
|
|
177
|
+
# @return [self]
|
|
178
|
+
def load_mathml
|
|
179
|
+
# All entities are loaded by default from initialize
|
|
180
|
+
self
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
# Load ISO entity sets (included in HTMLMathML)
|
|
184
|
+
# @param _set_name [Symbol] (ignored, all loaded together)
|
|
185
|
+
# @return [self]
|
|
186
|
+
def load_iso(_set_name = :iso8879)
|
|
187
|
+
# All entities are loaded by default from initialize
|
|
188
|
+
self
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
# Load all standard entity sets
|
|
192
|
+
# @return [self]
|
|
193
|
+
def load_all
|
|
194
|
+
# All entities are loaded by default from initialize
|
|
195
|
+
self
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
# Clear all entities (reset to empty)
|
|
199
|
+
# @return [self]
|
|
200
|
+
def clear!
|
|
201
|
+
@by_name = {}
|
|
202
|
+
@by_codepoint = Hash.new { |h, k| h[k] = [] }
|
|
203
|
+
self
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
private
|
|
207
|
+
|
|
208
|
+
# Load entities from the centralized JSON data source
|
|
209
|
+
# @raise [EntityDataError] if entity data is required but cannot be loaded
|
|
210
|
+
# @return [void]
|
|
211
|
+
def load_from_entity_data
|
|
212
|
+
data = self.class.entity_data
|
|
213
|
+
|
|
214
|
+
if data.nil?
|
|
215
|
+
raise EntityDataError,
|
|
216
|
+
"Entity data is not available. Set entity_load_mode to :optional or :disabled to skip entity loading."
|
|
217
|
+
end
|
|
218
|
+
|
|
219
|
+
data.each do |name, char|
|
|
220
|
+
codepoint = parse_codepoint(char)
|
|
221
|
+
next unless codepoint
|
|
222
|
+
|
|
223
|
+
@by_name[name] = codepoint
|
|
224
|
+
@by_codepoint[codepoint] ||= []
|
|
225
|
+
@by_codepoint[codepoint] << name unless @by_codepoint[codepoint].include?(name)
|
|
226
|
+
end
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
# Load entities from the centralized JSON data source (optional mode)
|
|
230
|
+
# Silently continues if entity data cannot be loaded
|
|
231
|
+
# @return [void]
|
|
232
|
+
def load_from_entity_data_optional
|
|
233
|
+
data = self.class.entity_data
|
|
234
|
+
return unless data
|
|
235
|
+
|
|
236
|
+
data.each do |name, char|
|
|
237
|
+
codepoint = parse_codepoint(char)
|
|
238
|
+
next unless codepoint
|
|
239
|
+
|
|
240
|
+
@by_name[name] = codepoint
|
|
241
|
+
@by_codepoint[codepoint] ||= []
|
|
242
|
+
@by_codepoint[codepoint] << name unless @by_codepoint[codepoint].include?(name)
|
|
243
|
+
end
|
|
244
|
+
rescue EntityDataError
|
|
245
|
+
# Silently ignore - optional mode
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
# Load custom entities from the provided entity provider
|
|
249
|
+
# @return [void]
|
|
250
|
+
def load_custom_entities
|
|
251
|
+
return unless @entity_provider
|
|
252
|
+
|
|
253
|
+
entities = @entity_provider.call
|
|
254
|
+
return unless entities
|
|
255
|
+
|
|
256
|
+
entities.each do |name, codepoint|
|
|
257
|
+
@by_name[name] = codepoint
|
|
258
|
+
@by_codepoint[codepoint] ||= []
|
|
259
|
+
@by_codepoint[codepoint] << name unless @by_codepoint[codepoint].include?(name)
|
|
260
|
+
end
|
|
261
|
+
end
|
|
262
|
+
|
|
263
|
+
# Parse a Unicode character escape to codepoint
|
|
264
|
+
# @param char [String] character or escape sequence
|
|
265
|
+
# @return [Integer, nil]
|
|
266
|
+
def parse_codepoint(char)
|
|
267
|
+
if char.start_with?("\\u")
|
|
268
|
+
# Handle \uXXXX format
|
|
269
|
+
char.unicode_normalize(:nfc)[2..].to_i(16)
|
|
270
|
+
else
|
|
271
|
+
# Single character - get its ord
|
|
272
|
+
char.ord
|
|
273
|
+
end
|
|
274
|
+
rescue StandardError
|
|
275
|
+
nil
|
|
276
|
+
end
|
|
277
|
+
end
|
|
278
|
+
end
|