moxml 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/docs.yml +1 -1
  3. data/.github/workflows/rake.yml +16 -13
  4. data/.github/workflows/release.yml +1 -0
  5. data/.github/workflows/round-trip.yml +74 -0
  6. data/.gitignore +1 -0
  7. data/.rubocop.yml +1 -0
  8. data/.rubocop_todo.yml +160 -38
  9. data/Gemfile +2 -1
  10. data/README.adoc +236 -0
  11. data/Rakefile +11 -0
  12. data/data/w3c_entities.json +2131 -0
  13. data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
  14. data/docs/_pages/adapters/ox.adoc +30 -0
  15. data/docs/_pages/configuration.adoc +43 -0
  16. data/docs/_pages/node-api-reference.adoc +35 -0
  17. data/docs/_tutorials/namespace-handling.adoc +21 -0
  18. data/examples/rss_parser/rss_parser.rb +1 -3
  19. data/lib/moxml/adapter/base.rb +26 -2
  20. data/lib/moxml/adapter/headed_ox.rb +5 -4
  21. data/lib/moxml/adapter/libxml.rb +3 -2
  22. data/lib/moxml/adapter/nokogiri.rb +16 -3
  23. data/lib/moxml/adapter/oga.rb +124 -20
  24. data/lib/moxml/adapter/ox.rb +4 -3
  25. data/lib/moxml/adapter/rexml.rb +41 -7
  26. data/lib/moxml/builder.rb +6 -0
  27. data/lib/moxml/config.rb +52 -1
  28. data/lib/moxml/context.rb +21 -2
  29. data/lib/moxml/document.rb +6 -1
  30. data/lib/moxml/document_builder.rb +45 -1
  31. data/lib/moxml/element.rb +4 -3
  32. data/lib/moxml/entity_reference.rb +29 -0
  33. data/lib/moxml/entity_registry.rb +278 -0
  34. data/lib/moxml/node.rb +10 -8
  35. data/lib/moxml/node_set.rb +10 -6
  36. data/lib/moxml/version.rb +1 -1
  37. data/lib/moxml/xml_utils.rb +25 -2
  38. data/lib/moxml.rb +1 -0
  39. data/spec/consistency/README.md +3 -1
  40. data/spec/consistency/round_trip_spec.rb +479 -0
  41. data/spec/examples/readme_examples_spec.rb +1 -1
  42. data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
  43. data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
  44. data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
  45. data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
  46. data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
  47. data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
  48. data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
  49. data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
  50. data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
  51. data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
  52. data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
  53. data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
  54. data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
  55. data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
  56. data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
  57. data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
  58. data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
  59. data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
  60. data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
  61. data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
  62. data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
  63. data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
  64. data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
  65. data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
  66. data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
  67. data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
  68. data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
  69. data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
  70. data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
  71. data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
  72. data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
  73. data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
  74. data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
  75. data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
  76. data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
  77. data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
  78. data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
  79. data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
  80. data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
  81. data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
  82. data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
  83. data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
  84. data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
  85. data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
  86. data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
  87. data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
  88. data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
  89. data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
  90. data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
  91. data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
  92. data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
  93. data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
  94. data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
  95. data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
  96. data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
  97. data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
  98. data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
  99. data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
  100. data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
  101. data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
  102. data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
  103. data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
  104. data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
  105. data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
  106. data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
  107. data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
  108. data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
  109. data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
  110. data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
  111. data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
  112. data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
  113. data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
  114. data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
  115. data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
  116. data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
  117. data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
  118. data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
  119. data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
  120. data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
  121. data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
  122. data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
  123. data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
  124. data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
  125. data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
  126. data/spec/integration/shared_examples/node_wrappers/element_behavior.rb +14 -0
  127. data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
  128. data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
  129. data/spec/integration/w3c_namespace_spec.rb +69 -0
  130. data/spec/moxml/adapter/libxml_spec.rb +7 -1
  131. data/spec/moxml/adapter/oga_spec.rb +92 -0
  132. data/spec/moxml/config_spec.rb +75 -0
  133. data/spec/moxml/entity_registry_spec.rb +184 -0
  134. data/spec/moxml/error_spec.rb +2 -2
  135. data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
  136. data/spec/moxml/xpath/axes_spec.rb +3 -4
  137. data/spec/performance/xpath_benchmark_spec.rb +6 -54
  138. data/spec/support/w3c_namespace_helpers.rb +41 -0
  139. data/spec/unit/rexml_isolated_test.rb +271 -0
  140. metadata +98 -2
@@ -10,9 +10,16 @@ module Moxml
10
10
  module Adapter
11
11
  class Rexml < Base
12
12
  class << self
13
- def parse(xml, options = {})
13
+ def parse(xml, options = {}, _context = nil)
14
+ # Handle frozen strings by creating a mutable copy
15
+ processed_xml = if xml.frozen?
16
+ xml.dup.force_encoding("UTF-8").encode("UTF-8")
17
+ else
18
+ xml.force_encoding("UTF-8").encode("UTF-8")
19
+ end
20
+
14
21
  native_doc = begin
15
- ::REXML::Document.new(xml)
22
+ ::REXML::Document.new(processed_xml)
16
23
  rescue ::REXML::ParseException => e
17
24
  if options[:strict]
18
25
  raise Moxml::ParseError.new(
@@ -24,7 +31,15 @@ module Moxml
24
31
  create_document
25
32
  end
26
33
 
27
- DocumentBuilder.new(Context.new(:rexml)).build(native_doc)
34
+ ctx = _context || Context.new(:rexml)
35
+ DocumentBuilder.new(ctx).build(native_doc)
36
+ end
37
+
38
# Return the encoding named in the XML declaration of +xml+.
#
# The lookup is done in two linear steps: first isolate the declaration
# (everything from "<?xml" up to, but not including, the next ">"), then
# search that short slice for the encoding pseudo-attribute. A single
# pattern with an atomic group around "[^>]*" cannot work here: the group
# swallows the "encoding=..." text and is not allowed to give it back, so
# the pattern never matches a normal declaration.
#
# @param xml [String] raw XML source
# @return [String] the declared encoding name, or "UTF-8" when the document
#   has no XML declaration or the declaration names no encoding
def extract_encoding_from_xml(xml)
  declaration = xml[/<\?xml\s[^>]*/i]
  return "UTF-8" unless declaration

  declaration[/\bencoding\s*=\s*["']([^"']+)["']/i, 1] || "UTF-8"
end
29
44
 
30
45
  # SAX parsing implementation for REXML
@@ -359,16 +374,35 @@ module Moxml
359
374
  when ::REXML::Text, ::REXML::CData
360
375
  node.value.to_s
361
376
  when ::REXML::Element
362
- # Get all text nodes, filter out duplicates, and join
363
- text_nodes = node.texts.uniq(&:object_id)
364
- text_nodes.map(&:value).join
377
+ # Extract text recursively from all children to match other adapters
378
+ extract_text_recursively(node)
379
+ end
380
+ end
381
+
382
# Collect the text of +element+ and all of its descendant elements, in
# document order.
#
# Text values are concatenated exactly as stored: nothing is stripped and no
# separator is inserted between siblings. Children that are neither text nor
# element nodes contribute nothing.
#
# @param element [::REXML::Element, nil] element to read
# @return [String] concatenated text; "" when +element+ is nil
def extract_text_recursively(element)
  return "" unless element

  # Append into one mutable buffer instead of `text += ...`, which copies the
  # whole accumulated string again for every child.
  element.children.each_with_object(+"") do |child, buffer|
    case child
    when ::REXML::Text
      buffer << child.value
    when ::REXML::Element
      buffer << extract_text_recursively(child)
    end
  end
end
367
401
 
368
402
  def inner_text(node)
369
403
  # Get direct text children only, filter duplicates
370
404
  text_children = node.children
371
- .select { _1.is_a?(::REXML::Text) }
405
+ .grep(::REXML::Text)
372
406
  .uniq(&:object_id)
373
407
  text_children.map(&:value).join
374
408
  end
data/lib/moxml/builder.rb CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  module Moxml
4
4
  class Builder
5
+ attr_reader :document
6
+
5
7
  def initialize(context)
6
8
  @context = context
7
9
  @current = @document = context.create_document
@@ -60,6 +62,10 @@ module Moxml
60
62
  @current.add_child(@document.create_comment(content))
61
63
  end
62
64
 
65
# Create an entity reference node for +name+ in the document being built and
# append it to the current node.
#
# @param name [String] entity name
# @return whatever the current node's add_child returns
def entity_reference(name)
  reference = @document.create_entity_reference(name)
  @current.add_child(reference)
end
68
+
63
69
  def processing_instruction(target, content)
64
70
  @current.add_child(
65
71
  @document.create_processing_instruction(target, content),
data/lib/moxml/config.rb CHANGED
@@ -5,6 +5,13 @@ module Moxml
5
5
  VALID_ADAPTERS = %i[nokogiri oga rexml ox headed_ox libxml].freeze
6
6
  DEFAULT_ADAPTER = VALID_ADAPTERS.first
7
7
 
8
+ # Entity loading modes:
9
+ # - :required - Must load entities, raise error if unavailable (default)
10
+ # - :optional - Try to load, continue silently if unavailable
11
+ # - :disabled - Don't load entities, use empty registry
12
+ # - :custom - Use custom entity provider via entity_provider callback
13
+ ENTITY_LOAD_MODES = %i[required optional disabled custom].freeze
14
+
8
15
  class << self
9
16
  attr_writer :default_adapter
10
17
 
@@ -17,11 +24,18 @@ module Moxml
17
24
  end
18
25
  end
19
26
 
27
+ NAMESPACE_URI_MODES = %i[strict lenient].freeze
28
+
20
29
  attr_reader :adapter_name
21
30
  attr_accessor :strict_parsing,
22
31
  :default_encoding,
23
32
  :entity_encoding,
24
- :default_indent
33
+ :default_indent,
34
+ :restore_entities,
35
+ :preload_entity_sets,
36
+ :entity_load_mode,
37
+ :entity_provider,
38
+ :namespace_uri_mode
25
39
 
26
40
  def initialize(adapter_name = nil, strict_parsing = nil,
27
41
  default_encoding = nil)
@@ -31,6 +45,11 @@ module Moxml
31
45
  # reserved for future use
32
46
  @default_indent = 2
33
47
  @entity_encoding = :basic
48
+ @restore_entities = false
49
+ @preload_entity_sets = []
50
+ @entity_load_mode = :required
51
+ @entity_provider = nil
52
+ @namespace_uri_mode = :strict
34
53
  end
35
54
 
36
55
  def adapter=(name)
@@ -57,5 +76,37 @@ module Moxml
57
76
  def adapter
58
77
  @adapter ||= Adapter.load(@adapter_name)
59
78
  end
79
+
80
# Set how entity definitions are loaded.
#
# Strings are accepted as well as symbols, matching namespace_uri_mode=.
# The value is stored as given after symbolizing, so "optional" and
# :optional are equivalent.
#
# @param mode [Symbol, String] one of ENTITY_LOAD_MODES
# @raise [ArgumentError] when +mode+ is not one of ENTITY_LOAD_MODES
def entity_load_mode=(mode)
  normalized = mode.respond_to?(:to_sym) ? mode.to_sym : mode
  unless ENTITY_LOAD_MODES.include?(normalized)
    raise ArgumentError,
          "Invalid entity_load_mode: #{mode}. Must be one of: #{ENTITY_LOAD_MODES.join(', ')}"
  end

  @entity_load_mode = normalized
end
88
+
89
# Set how namespace URIs are validated.
#
# Strings and symbols are both accepted. Values that cannot be symbolized
# (nil, numbers, ...) are rejected with the same ArgumentError as any other
# invalid mode instead of failing inside +to_sym+.
#
# @param mode [Symbol, String] one of NAMESPACE_URI_MODES
# @raise [ArgumentError] when +mode+ is not one of NAMESPACE_URI_MODES
def namespace_uri_mode=(mode)
  normalized = mode.respond_to?(:to_sym) ? mode.to_sym : mode
  unless NAMESPACE_URI_MODES.include?(normalized)
    raise ArgumentError,
          "Invalid namespace_uri_mode: #{normalized}. Must be one of: #{NAMESPACE_URI_MODES.join(', ')}"
  end

  @namespace_uri_mode = normalized
end
98
+
99
+ # Backward compatibility: convert old boolean to new symbol
100
# Legacy boolean setter, kept for backward compatibility.
#
# true maps to :required; false and nil map to :disabled. Any other value is
# treated as an entity load mode. Everything goes through entity_load_mode=
# so an invalid value is rejected instead of being stored unchecked.
#
# @param value [Boolean, nil, Symbol] legacy flag or an entity load mode
# @raise [ArgumentError] when the resulting mode is not a valid load mode
def load_external_entities=(value)
  self.entity_load_mode = case value
                          when true then :required
                          when false, nil then :disabled
                          else value
                          end
end
107
+
108
# Legacy boolean reader: true only while the entity load mode is :required.
#
# @return [Boolean]
def load_external_entities
  required = @entity_load_mode == :required
  required
end
60
111
  end
61
112
  end
data/lib/moxml/context.rb CHANGED
@@ -8,6 +8,10 @@ module Moxml
8
8
  @config = Config.new(adapter)
9
9
  end
10
10
 
11
# Entity registry for this context, built on first access and reused
# afterwards.
#
# @return the registry produced by build_entity_registry
def entity_registry
  return @entity_registry if @entity_registry

  @entity_registry = build_entity_registry
end
14
+
11
15
  def create_document(native_doc = nil)
12
16
  Document.new(config.adapter.create_document(native_doc), self)
13
17
  end
@@ -23,9 +27,9 @@ module Moxml
23
27
  end
24
28
  has_declaration = xml_string.strip.start_with?("<?xml")
25
29
 
26
- # Parse with adapter (without declaration info - adapters don't need it)
30
+ # Parse with adapter, passing self (context) so adapter can use our config
27
31
  parsed_options = default_options.merge(options)
28
- doc = config.adapter.parse(xml_string, parsed_options)
32
+ doc = config.adapter.parse(xml_string, parsed_options, self)
29
33
 
30
34
  # Set declaration flag on Document wrapper (proper OOP)
31
35
  doc.has_xml_declaration = has_declaration if doc.is_a?(Document)
@@ -73,6 +77,21 @@ module Moxml
73
77
 
74
78
  private
75
79
 
80
# Build the entity registry described by the config: the load mode and
# provider come from the config, then each configured preload set is
# requested from the registry. Set names other than :html5, :mathml and
# :iso are ignored.
#
# @return [EntityRegistry]
def build_entity_registry
  registry = EntityRegistry.new(
    mode: config.entity_load_mode,
    entity_provider: config.entity_provider,
  )

  loaders = { html5: :load_html5, mathml: :load_mathml, iso: :load_iso }
  config.preload_entity_sets.each do |set_name|
    loader = loaders[set_name]
    registry.public_send(loader) if loader
  end

  registry
end
94
+
76
95
  def default_options
77
96
  {
78
97
  encoding: config.default_encoding,
@@ -9,6 +9,7 @@ require_relative "processing_instruction"
9
9
  require_relative "declaration"
10
10
  require_relative "namespace"
11
11
  require_relative "doctype"
12
+ require_relative "entity_reference"
12
13
 
13
14
  module Moxml
14
15
  class Document < Node
@@ -68,6 +69,10 @@ module Moxml
68
69
  Declaration.new(decl, context)
69
70
  end
70
71
 
72
# Create an entity reference node for +name+, wrapped for this document's
# context. The node is not attached to the tree.
#
# @param name [String] entity name
# @return [EntityReference]
def create_entity_reference(name)
  native_reference = adapter.create_entity_reference(name)
  EntityReference.new(native_reference, context)
end
75
+
71
76
  def add_child(node)
72
77
  node = prepare_node(node)
73
78
 
@@ -109,7 +114,7 @@ module Moxml
109
114
 
110
115
  def at_xpath(expression, namespaces = nil)
111
116
  if (native_node = adapter.at_xpath(@native, expression, namespaces))
112
- Node.wrap(native_node, context)
117
+ Moxml::Node.wrap(native_node, context)
113
118
  end
114
119
  end
115
120
 
@@ -67,7 +67,46 @@ module Moxml
67
67
  def visit_text(node)
68
68
  # Prepare node for new document before wrapping
69
69
  prepared = adapter.prepare_for_new_document(node, @current_doc.native)
70
- @node_stack.last&.add_child(Text.new(prepared, context))
70
+ content = adapter.text_content(node)
71
+
72
+ # Check if we should restore entity references for this text
73
+ if context.config.restore_entities && content.to_s =~ /[<>&"']/
74
+ restore_entities_in_text(content)
75
+ else
76
+ @node_stack.last&.add_child(Text.new(prepared, context))
77
+ end
78
+ end
79
+
80
# Add +content+ to the current parent, turning the five characters with
# predefined XML entities (< > & " ') back into entity reference nodes.
#
# Each special character is looked up by its own predefined name and is
# restored only when the context's entity registry declares that name.
# Looking up the registry's "primary" name for the codepoint instead makes
# restoration depend on registration order: any alias registered earlier
# for the same character (the HTML set defines AMP, LT, GT and QUOT)
# silently turns it off.
#
# Runs of ordinary characters are added as one text node per run rather
# than one node per character.
#
# @param content [#to_s] text to add
# @return [Array<String>, nil] the processed chunks, or nil when there is
#   no parent on the node stack
def restore_entities_in_text(content)
  parent = @node_stack.last
  return unless parent

  entity_names = {
    "<" => "lt",
    ">" => "gt",
    "&" => "amp",
    '"' => "quot",
    "'" => "apos",
  }
  registry = context.entity_registry

  # The capture group makes split keep the special characters as their own
  # chunks, alternating with the plain-text runs between them.
  content.to_s.split(/([<>&"'])/).each do |chunk|
    next if chunk.empty?

    entity_name = entity_names[chunk]
    if entity_name && registry.declared?(entity_name)
      entity_node = adapter.create_entity_reference(entity_name)
      parent.add_child(EntityReference.new(entity_node, context))
    else
      text_node = adapter.create_text(chunk)
      parent.add_child(Text.new(text_node, context))
    end
  end
end
72
111
 
73
112
  def visit_cdata(node)
@@ -90,6 +129,11 @@ module Moxml
90
129
  @node_stack.last&.add_child(Doctype.new(prepared, context))
91
130
  end
92
131
 
132
# Add an entity reference node to the element currently on top of the node
# stack, after preparing the native node for the new document.
def visit_entity_reference(node)
  prepared = adapter.prepare_for_new_document(node, @current_doc.native)
  reference = EntityReference.new(prepared, context)
  @node_stack.last&.add_child(reference)
end
136
+
93
137
  def visit_children(node)
94
138
  node_children = children(node).dup
95
139
  node_children.each do |child|
data/lib/moxml/element.rb CHANGED
@@ -75,8 +75,8 @@ module Moxml
75
75
  end
76
76
 
77
77
  def add_namespace(prefix, uri)
78
- validate_uri(uri)
79
- adapter.create_native_namespace(@native, prefix, uri)
78
+ adapter.create_namespace(@native, prefix, uri,
79
+ namespace_uri_mode: context.config.namespace_uri_mode)
80
80
  self
81
81
  rescue ValidationError => e
82
82
  # Re-raise as NamespaceError, provide attributes for error context
@@ -102,7 +102,8 @@ module Moxml
102
102
  if ns_or_hash.is_a?(Hash)
103
103
  adapter.set_namespace(
104
104
  @native,
105
- adapter.create_namespace(@native, *ns_or_hash.to_a.first),
105
+ adapter.create_namespace(@native, *ns_or_hash.to_a.first,
106
+ namespace_uri_mode: context.config.namespace_uri_mode),
106
107
  )
107
108
  else
108
109
  adapter.set_namespace(@native, ns_or_hash&.native)
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
module Moxml
  # Node wrapper for an entity reference. It serializes as "&name;" and
  # reports empty content and text.
  class EntityReference < Node
    # @return [String] the entity name as reported by the adapter
    def name
      adapter.entity_reference_name(@native)
    end

    # @return [String] the same value as #name
    def identifier
      name
    end

    # @return [String] always the empty string
    def content
      ""
    end

    # @return [String] always the empty string
    def text
      ""
    end

    # Serialize the reference; any arguments are accepted and ignored.
    #
    # @return [String] "&" + name + ";"
    def to_xml(*)
      "&#{name};"
    end

    # Two references are equal when they have the same class and their
    # native nodes compare equal.
    def ==(other)
      return false unless self.class == other.class

      @native == other.native
    end
  end
end
@@ -0,0 +1,278 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module Moxml
  # EntityRegistry maintains a knowledge base of XML entity definitions.
  #
  # Data source: W3C XML Core WG Character Entities (bundled)
  # https://www.w3.org/2003/entities/2007/htmlmathml
  #
  # The W3C entity data is bundled in data/w3c_entities.json and loaded
  # from the gem's data directory. For development, MOXML_ENTITY_DEFINITIONS_PATH
  # can be set to an external copy.
  #
  # Per W3C XML Core WG guidance:
  # - Character entities are XML internal general entities providing a name for a single Unicode character
  # - Standard XML entities (amp, lt, gt, quot, apos) are implicitly declared per XML specification
  # - External entity sets (like HTML, MathML) can be referenced via DTD parameter entities
  #
  # @example Basic usage
  #   registry = EntityRegistry.new
  #   registry.declared?("amp") # => true
  #   registry.codepoint_for_name("amp") # => 38
  #
  class EntityRegistry
    # W3C entity data file name
    ENTITY_DATA_FILE = "w3c_entities.json"

    # Error raised when entity data cannot be loaded
    class EntityDataError < StandardError; end

    class << self
      # Get the raw entity data from the bundled JSON source.
      # The parsed data is cached on the class after the first successful load.
      #
      # @raise [EntityDataError] when no data file can be read or parsed
      # @return [Hash{String => String}] entity name to character mapping
      def entity_data
        @entity_data ||= load_entity_data
      end

      # Get the default registry instance (lazy loaded)
      # @return [EntityRegistry]
      def default
        @default ||= new
      end

      # Reset the default registry and the cached entity data
      # (mainly for testing)
      # @return [void]
      def reset
        @default = nil
        @entity_data = nil
      end

      private

      # Load entity data from the first readable candidate file and return
      # the value stored under its "characters" key.
      #
      # @raise [EntityDataError] when no file is readable or parsing fails
      # @return [Hash{String => String}]
      def load_entity_data
        raw = read_first_available(candidate_paths)

        unless raw
          raise EntityDataError,
                "Entity data not found. Set MOXML_ENTITY_DEFINITIONS_PATH or ensure data/#{ENTITY_DATA_FILE} exists."
        end

        JSON.parse(raw)["characters"]
      rescue StandardError => e
        raise EntityDataError, "Failed to load entity definitions: #{e.message}"
      end

      # Candidate data files, in order of priority.
      #
      # @return [Array<String>]
      def candidate_paths
        [
          # 1. Environment variable override (for development/custom setups)
          ENV["MOXML_ENTITY_DEFINITIONS_PATH"],
          # 2. Bundled copy. __dir__ is lib/moxml, so ../../data/ resolves to
          #    project_root/data/
          File.expand_path("../../data/#{ENTITY_DATA_FILE}", __dir__),
          # 3. External xml-entities checkout (common development setup)
          File.expand_path(
            "../../external/xml-entities/docs/2007/htmlmathml.json",
            __dir__,
          ),
        ].compact.uniq
      end

      # Read the first path that exists and can be read.
      #
      # @param paths [Array<String>]
      # @return [String, nil] file contents, or nil when nothing was readable
      def read_first_available(paths)
        paths.each do |path|
          next unless File.exist?(path)

          begin
            return File.read(path)
          rescue StandardError
            # Unreadable file: fall through to the next candidate
            next
          end
        end
        nil
      end
    end

    # @return [Hash{String => Integer}] entity name to codepoint mapping
    attr_reader :by_name

    # @return [Hash{Integer => Array<String>}] codepoint to entity names mapping
    attr_reader :by_codepoint

    # @param mode [Symbol] Loading mode: :required, :optional, :disabled, :custom.
    #   Any other value leaves the registry empty, like :disabled.
    # @param entity_provider [Proc, nil] Custom entity provider proc/lambda,
    #   called without arguments in :custom mode; expected to return a
    #   name => codepoint mapping (or nil)
    # @raise [EntityDataError] in :required mode when entity data is unavailable
    def initialize(mode: :required, entity_provider: nil)
      @mode = mode
      @entity_provider = entity_provider
      clear!

      case mode
      when :required then load_from_entity_data
      when :optional then load_from_entity_data_optional
      when :custom then load_custom_entities
      end
    end

    # Check if an entity name is declared
    # @param name [String] entity name (e.g., "amp", "nbsp")
    # @return [Boolean]
    def declared?(name)
      @by_name.key?(name)
    end

    # Get the Unicode codepoint for an entity name.
    # For an entity whose value is more than one character, this is the
    # codepoint of the first character.
    #
    # @param name [String] entity name
    # @return [Integer, nil] codepoint or nil if not found
    def codepoint_for_name(name)
      @by_name[name]
    end

    # Get all entity names for a codepoint. Looking up an unknown codepoint
    # does not add anything to the index.
    #
    # @param codepoint [Integer] Unicode codepoint
    # @return [Array<String>] entity names mapping to this codepoint
    #   (empty when there are none)
    def names_for_codepoint(codepoint)
      # key? first: @by_codepoint has a default block that would otherwise
      # store a new empty array for every codepoint that is merely queried.
      @by_codepoint.key?(codepoint) ? @by_codepoint[codepoint] : []
    end

    # Get the primary entity name for a codepoint: the first name that was
    # registered for it.
    #
    # @param codepoint [Integer] Unicode codepoint
    # @return [String, nil] primary entity name or nil
    def primary_name_for_codepoint(codepoint)
      names_for_codepoint(codepoint).first
    end

    # Register additional entities
    # @param entities [Hash{String => Integer}] name => codepoint mapping
    # @return [self]
    def register(entities)
      entities.each { |name, codepoint| index_entity(name, codepoint) }
      self
    end

    # Compatibility hook for the HTML5 entity set. It loads nothing itself:
    # the bundled data is loaded as a whole by the constructor, depending on
    # the mode.
    # @return [self]
    def load_html5
      self
    end

    # Compatibility hook for the MathML entity set; loads nothing itself
    # (see #load_html5).
    # @return [self]
    def load_mathml
      self
    end

    # Compatibility hook for the ISO entity sets; loads nothing itself
    # (see #load_html5).
    # @param _set_name [Symbol] ignored
    # @return [self]
    def load_iso(_set_name = :iso8879)
      self
    end

    # Compatibility hook for loading every standard set; loads nothing
    # itself (see #load_html5).
    # @return [self]
    def load_all
      self
    end

    # Clear all entities (reset to empty)
    # @return [self]
    def clear!
      @by_name = {}
      @by_codepoint = Hash.new { |h, k| h[k] = [] }
      self
    end

    private

    # Load entities from the centralized JSON data source
    # @raise [EntityDataError] if entity data is required but cannot be loaded
    # @return [void]
    def load_from_entity_data
      data = self.class.entity_data

      if data.nil?
        raise EntityDataError,
              "Entity data is not available. Set entity_load_mode to :optional or :disabled to skip entity loading."
      end

      import_entity_data(data)
    end

    # Load entities from the centralized JSON data source (optional mode)
    # Silently continues if entity data cannot be loaded
    # @return [void]
    def load_from_entity_data_optional
      data = self.class.entity_data
      import_entity_data(data) if data
    rescue EntityDataError
      # Silently ignore - optional mode
    end

    # Load custom entities from the provided entity provider
    # @return [void]
    def load_custom_entities
      return unless @entity_provider

      entities = @entity_provider.call
      register(entities) if entities
    end

    # Index every entry of the name => character data.
    #
    # An entity whose value is more than one character stays declared (its
    # name maps to the first codepoint) but is left out of the reverse index:
    # it names a character sequence, so it must not be offered as a name for
    # the sequence's first character.
    #
    # @param data [Hash{String => String}]
    # @return [void]
    def import_entity_data(data)
      data.each do |name, value|
        codepoint = parse_codepoint(value)
        next unless codepoint

        index_entity(name, codepoint, reverse: single_character?(value))
      end
    end

    # Record one entity in the name index and, unless +reverse+ is false, in
    # the codepoint index.
    #
    # @param name [String]
    # @param codepoint [Integer]
    # @param reverse [Boolean] whether to add the name under the codepoint
    # @return [void]
    def index_entity(name, codepoint, reverse: true)
      @by_name[name] = codepoint
      return unless reverse

      names = @by_codepoint[codepoint]
      names << name unless names.include?(name)
    end

    # @param value [String] a value already accepted by #parse_codepoint
    # @return [Boolean] whether the value stands for exactly one character
    def single_character?(value)
      value.start_with?("\\u") || value.length == 1
    end

    # Parse a character, or a literal backslash-u escape, to a codepoint
    # @param char [String] character or escape sequence
    # @return [Integer, nil] nil when the value cannot be interpreted
    def parse_codepoint(char)
      if char.start_with?("\\u")
        # Handle \uXXXX format; a malformed escape yields nil instead of 0
        Integer(char[2..], 16, exception: false)
      else
        # Literal character(s) - ord of the first one
        char.ord
      end
    rescue StandardError
      nil
    end
  end
end