moxml 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/docs.yml +1 -1
  3. data/.github/workflows/rake.yml +16 -13
  4. data/.github/workflows/release.yml +1 -0
  5. data/.github/workflows/round-trip.yml +74 -0
  6. data/.gitignore +1 -0
  7. data/.rubocop.yml +1 -0
  8. data/.rubocop_todo.yml +160 -38
  9. data/Gemfile +2 -1
  10. data/README.adoc +287 -20
  11. data/Rakefile +11 -0
  12. data/data/w3c_entities.json +2131 -0
  13. data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
  14. data/docs/_guides/index.adoc +14 -12
  15. data/docs/_guides/node-api-consistency.adoc +572 -0
  16. data/docs/_guides/xml-declaration.adoc +5 -5
  17. data/docs/_pages/adapters/ox.adoc +30 -0
  18. data/docs/_pages/adapters/rexml.adoc +1 -1
  19. data/docs/_pages/configuration.adoc +43 -0
  20. data/docs/_pages/node-api-reference.adoc +128 -3
  21. data/docs/_tutorials/namespace-handling.adoc +21 -0
  22. data/examples/rss_parser/rss_parser.rb +1 -3
  23. data/lib/moxml/adapter/base.rb +26 -2
  24. data/lib/moxml/adapter/headed_ox.rb +5 -4
  25. data/lib/moxml/adapter/libxml.rb +18 -3
  26. data/lib/moxml/adapter/nokogiri.rb +26 -2
  27. data/lib/moxml/adapter/oga.rb +137 -20
  28. data/lib/moxml/adapter/ox.rb +29 -3
  29. data/lib/moxml/adapter/rexml.rb +54 -7
  30. data/lib/moxml/attribute.rb +6 -0
  31. data/lib/moxml/builder.rb +6 -0
  32. data/lib/moxml/config.rb +52 -1
  33. data/lib/moxml/context.rb +21 -2
  34. data/lib/moxml/doctype.rb +33 -0
  35. data/lib/moxml/document.rb +6 -1
  36. data/lib/moxml/document_builder.rb +45 -1
  37. data/lib/moxml/element.rb +10 -3
  38. data/lib/moxml/entity_reference.rb +29 -0
  39. data/lib/moxml/entity_registry.rb +278 -0
  40. data/lib/moxml/error.rb +5 -5
  41. data/lib/moxml/node.rb +22 -8
  42. data/lib/moxml/node_set.rb +10 -6
  43. data/lib/moxml/processing_instruction.rb +6 -0
  44. data/lib/moxml/version.rb +1 -1
  45. data/lib/moxml/xml_utils.rb +25 -2
  46. data/lib/moxml/xpath/errors.rb +1 -1
  47. data/lib/moxml.rb +1 -0
  48. data/spec/consistency/README.md +3 -1
  49. data/spec/consistency/round_trip_spec.rb +479 -0
  50. data/spec/examples/readme_examples_spec.rb +1 -1
  51. data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
  52. data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
  53. data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
  54. data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
  55. data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
  56. data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
  57. data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
  58. data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
  59. data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
  60. data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
  61. data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
  62. data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
  63. data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
  64. data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
  65. data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
  66. data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
  67. data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
  68. data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
  69. data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
  70. data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
  71. data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
  72. data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
  73. data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
  74. data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
  75. data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
  76. data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
  77. data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
  78. data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
  79. data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
  80. data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
  81. data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
  82. data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
  83. data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
  84. data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
  85. data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
  86. data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
  87. data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
  88. data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
  89. data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
  90. data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
  91. data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
  92. data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
  93. data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
  94. data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
  95. data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
  96. data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
  97. data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
  98. data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
  99. data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
  100. data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
  101. data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
  102. data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
  103. data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
  104. data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
  105. data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
  106. data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
  107. data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
  108. data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
  109. data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
  110. data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
  111. data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
  112. data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
  113. data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
  114. data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
  115. data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
  116. data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
  117. data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
  118. data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
  119. data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
  120. data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
  121. data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
  122. data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
  123. data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
  124. data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
  125. data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
  126. data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
  127. data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
  128. data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
  129. data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
  130. data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
  131. data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
  132. data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
  133. data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
  134. data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
  135. data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
  136. data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
  137. data/spec/integration/w3c_namespace_spec.rb +69 -0
  138. data/spec/moxml/adapter/libxml_spec.rb +7 -1
  139. data/spec/moxml/adapter/oga_spec.rb +92 -0
  140. data/spec/moxml/config_spec.rb +75 -0
  141. data/spec/moxml/doctype_spec.rb +19 -3
  142. data/spec/moxml/entity_registry_spec.rb +184 -0
  143. data/spec/moxml/error_spec.rb +2 -2
  144. data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
  145. data/spec/moxml/xpath/axes_spec.rb +3 -4
  146. data/spec/performance/xpath_benchmark_spec.rb +6 -54
  147. data/spec/support/w3c_namespace_helpers.rb +41 -0
  148. data/spec/unit/rexml_isolated_test.rb +271 -0
  149. metadata +99 -3
  150. data/.ruby-version +0 -1
@@ -9,17 +9,24 @@ module Moxml
9
9
  module Adapter
10
10
  class Oga < Base
11
11
  class << self
12
+ # Standard XML entities handled natively by parsers
13
+ STANDARD_XML_ENTITIES = %w[amp lt gt quot apos].freeze
14
+
12
15
  def set_root(doc, element)
13
16
  # Clear existing root element if any - Oga's NodeSet needs special handling
14
17
  # We need to manually remove elements since NodeSet doesn't support clear or delete_if
15
- elements_to_remove = doc.children.select { |child| child.is_a?(::Oga::XML::Element) }
18
+ elements_to_remove = doc.children.grep(::Oga::XML::Element)
16
19
  elements_to_remove.each { |elem| doc.children.delete(elem) }
17
20
  doc.children << element
18
21
  end
19
22
 
20
- def parse(xml, options = {})
23
+ def parse(xml, options = {}, _context = nil)
24
+ # Pre-process XML to convert named entities to marker form (\x01name;).
25
+ # Oga drops named entity references like &nbsp; during parsing.
26
+ processed_xml = preprocess_named_entities(xml)
27
+
21
28
  native_doc = begin
22
- ::Oga.parse_xml(xml, strict: options[:strict])
29
+ ::Oga.parse_xml(processed_xml, strict: options[:strict])
23
30
  rescue LL::ParserError => e
24
31
  raise Moxml::ParseError.new(
25
32
  e.message,
@@ -27,7 +34,8 @@ module Moxml
27
34
  )
28
35
  end
29
36
 
30
- DocumentBuilder.new(Context.new(:oga)).build(native_doc)
37
+ ctx = _context || Context.new(:oga)
38
+ DocumentBuilder.new(ctx).build(native_doc)
31
39
  end
32
40
 
33
41
  # SAX parsing implementation for Oga
@@ -61,7 +69,7 @@ module Moxml
61
69
  end
62
70
 
63
71
  def create_native_text(content)
64
- ::Oga::XML::Text.new(text: content)
72
+ ::Oga::XML::Text.new(text: encode_entity_markers(content))
65
73
  end
66
74
 
67
75
  def create_native_cdata(content)
@@ -74,7 +82,8 @@ module Moxml
74
82
 
75
83
  def create_native_doctype(name, external_id, system_id)
76
84
  ::Oga::XML::Doctype.new(
77
- name: name, public_id: external_id, system_id: system_id, type: "PUBLIC",
85
+ name: name, public_id: external_id, system_id: system_id,
86
+ type: external_id ? "PUBLIC" : "SYSTEM"
78
87
  )
79
88
  end
80
89
 
@@ -224,7 +233,7 @@ module Moxml
224
233
  attr = ::Oga::XML::Attribute.new(
225
234
  name: name.to_s,
226
235
  namespace_name: namespace_name,
227
- value: value.to_s,
236
+ value: encode_entity_markers(value.to_s),
228
237
  )
229
238
  element.add_attribute(attr)
230
239
  end
@@ -234,7 +243,7 @@ module Moxml
234
243
  end
235
244
 
236
245
  def get_attribute_value(element, name)
237
- element[name.to_s]
246
+ restore_entity_markers(element[name.to_s])
238
247
  end
239
248
 
240
249
  def remove_attribute(element, name)
@@ -303,24 +312,25 @@ module Moxml
303
312
  end
304
313
 
305
314
  def text_content(node)
306
- node.text
315
+ restore_entity_markers(node.text)
307
316
  end
308
317
 
309
318
  def inner_text(node)
310
- if node.respond_to?(:inner_text)
311
- node.inner_text
312
- else
313
- # Oga::XML::Text node for example
314
- node.text
315
- end
319
+ text = if node.respond_to?(:inner_text)
320
+ node.inner_text
321
+ else
322
+ # Oga::XML::Text node for example
323
+ node.text
324
+ end
325
+ restore_entity_markers(text)
316
326
  end
317
327
 
318
328
  def set_text_content(node, content)
329
+ encoded = encode_entity_markers(content)
319
330
  if node.respond_to?(:inner_text=)
320
- node.inner_text = content
331
+ node.inner_text = encoded
321
332
  else
322
- # Oga::XML::Text node for example
323
- node.text = content
333
+ node.text = encoded
324
334
  end
325
335
  end
326
336
 
@@ -365,6 +375,19 @@ module Moxml
365
375
  node.namespaces.values
366
376
  end
367
377
 
378
+ # Doctype accessor methods
379
+ def doctype_name(native)
380
+ native.name
381
+ end
382
+
383
+ def doctype_external_id(native)
384
+ native.public_id
385
+ end
386
+
387
+ def doctype_system_id(native)
388
+ native.system_id
389
+ end
390
+
368
391
  def xpath(node, expression, namespaces = nil)
369
392
  node.xpath(expression, {},
370
393
  namespaces: namespaces&.transform_keys(&:to_s)).to_a
@@ -389,6 +412,53 @@ module Moxml
389
412
  end
390
413
 
391
414
  def serialize(node, options = {})
415
+ output = serialize_without_entity_processing(node, options)
416
+ # Post-process: convert entity markers back to entity references
417
+ output.gsub(ENTITY_MARKER_REGEX, '&\1;')
418
+ end
419
+
420
+ # Shared entity name pattern (W3C: 2-31 chars, starts with alpha)
421
+ ENTITY_PATTERN = "([a-zA-Z][a-zA-Z0-9]{1,30})"
422
+
423
+ # Marker character for entity preservation through Oga's parser.
424
+ # U+0001 is preserved literally by Oga through parse/serialize cycle.
425
+ ENTITY_MARKER = "\x01"
426
+
427
+ # Regular expression for entity marker post-processing
428
+ ENTITY_MARKER_REGEX = /#{ENTITY_MARKER}#{ENTITY_PATTERN};/
429
+
430
+ # Simple entity-only regex with no nested quantifiers
431
+ ENTITY_REF_REGEX = /&#{ENTITY_PATTERN};/
432
+
433
+ private
434
+
435
+ # Convert known named &entity; references to marker form (\x01entity;) for Oga storage.
436
+ # Used when setting text or attribute values programmatically (not from parsing).
437
+ def encode_entity_markers(text)
438
+ return text unless text&.include?("&")
439
+
440
+ text.gsub(ENTITY_REF_REGEX) do
441
+ name = ::Regexp.last_match(1)
442
+
443
+ next ::Regexp.last_match(0) if STANDARD_XML_ENTITIES.include?(name)
444
+
445
+ codepoint = Moxml::EntityRegistry.default.codepoint_for_name(name)
446
+ if codepoint
447
+ "#{ENTITY_MARKER}#{name};"
448
+ else
449
+ ::Regexp.last_match(0)
450
+ end
451
+ end
452
+ end
453
+
454
+ # Convert \x01entity; markers back to &entity; for text and attribute accessors.
455
+ def restore_entity_markers(text)
456
+ return text unless text
457
+
458
+ text.gsub(ENTITY_MARKER_REGEX, '&\1;')
459
+ end
460
+
461
+ def serialize_without_entity_processing(node, options = {})
392
462
  # Oga's XmlGenerator doesn't support options directly
393
463
  # We need to handle declaration options ourselves for Document nodes
394
464
  if node.is_a?(::Oga::XML::Document)
@@ -403,7 +473,11 @@ module Moxml
403
473
  node.xml_declaration ? true : false
404
474
  end
405
475
 
406
- if should_include_decl && !node.xml_declaration
476
+ # Fix: Check if declaration already exists in children
477
+ # This prevents duplicate declarations when document already has one
478
+ has_existing_declaration = node.children.any?(::Oga::XML::XmlDeclaration)
479
+
480
+ if should_include_decl && !node.xml_declaration && !has_existing_declaration
407
481
  # Need to add declaration - create default one
408
482
  output = +""
409
483
  output << '<?xml version="1.0" encoding="UTF-8"?>'
@@ -437,7 +511,50 @@ module Moxml
437
511
  end
438
512
 
439
513
  # Default: use XmlGenerator
440
- ::Moxml::Adapter::CustomizedOga::XmlGenerator.new(node).to_xml
514
+ # But first check if we need to handle declaration specially
515
+ if node.is_a?(::Oga::XML::Document) && node.xml_declaration
516
+ # Document has declaration - use custom handling to avoid duplicates
517
+ output = +""
518
+
519
+ # Serialize children, but skip XmlDeclaration if it would cause duplication
520
+ node.children.each do |child|
521
+ # Check if this would cause duplication by seeing if we already have one in output
522
+ if child.is_a?(::Oga::XML::XmlDeclaration) && output.include?("<?xml")
523
+ next # Skip duplicate declaration
524
+ end
525
+
526
+ output << ::Moxml::Adapter::CustomizedOga::XmlGenerator.new(child).to_xml
527
+ end
528
+
529
+ output
530
+ else
531
+ # Normal case - use XmlGenerator directly
532
+ ::Moxml::Adapter::CustomizedOga::XmlGenerator.new(node).to_xml
533
+ end
534
+ end
535
+
536
+ # Pre-process XML to convert named entities to marker format.
537
+ # Oga drops named entity references like &nbsp; but preserves control chars.
538
+ # By converting known named entities to marker form (\x01name;), we can
539
+ # reconstruct them during serialization.
540
+ #
541
+ # @param xml [String, #to_s] The XML string to process
542
+ # @return [String] The XML with known named entities converted to marker form
543
+ def preprocess_named_entities(xml)
544
+ return xml unless xml.is_a?(String)
545
+
546
+ xml.gsub(ENTITY_REF_REGEX) do
547
+ name = Regexp.last_match(1)
548
+
549
+ next Regexp.last_match(0) if STANDARD_XML_ENTITIES.include?(name)
550
+
551
+ codepoint = Moxml::EntityRegistry.default.codepoint_for_name(name)
552
+ if codepoint
553
+ "#{ENTITY_MARKER}#{name};"
554
+ else
555
+ Regexp.last_match(0)
556
+ end
557
+ end
441
558
  end
442
559
  end
443
560
  end
@@ -17,7 +17,7 @@ module Moxml
17
17
  replace_children(doc, [element])
18
18
  end
19
19
 
20
- def parse(xml, _options = {})
20
+ def parse(xml, _options = {}, _context = nil)
21
21
  native_doc = begin
22
22
  result = ::Ox.parse(xml)
23
23
 
@@ -36,7 +36,8 @@ module Moxml
36
36
  )
37
37
  end
38
38
 
39
- DocumentBuilder.new(Context.new(:ox)).build(native_doc)
39
+ ctx = _context || Context.new(:ox)
40
+ DocumentBuilder.new(ctx).build(native_doc)
40
41
  end
41
42
 
42
43
  # SAX parsing implementation for Ox
@@ -452,7 +453,7 @@ module Moxml
452
453
  def inner_text(node)
453
454
  return "" unless node.respond_to?(:nodes)
454
455
 
455
- node.nodes.select { _1.is_a?(String) }.join
456
+ node.nodes.grep(String).join
456
457
  end
457
458
 
458
459
  def set_text_content(node, content)
@@ -510,6 +511,31 @@ module Moxml
510
511
  end.values
511
512
  end
512
513
 
514
+ # Doctype accessor methods
515
+ # Ox stores DOCTYPE as a string, so we parse it
516
+ def doctype_name(native)
517
+ # Parse: "name PUBLIC \"external_id\" \"system_id\"" or "name SYSTEM \"system_id\""
518
+ value = native.value.to_s.strip
519
+ # Extract the first word (the name)
520
+ value.split(/\s+/).first
521
+ end
522
+
523
+ def doctype_external_id(native)
524
+ value = native.value.to_s
525
+ # Match PUBLIC "external_id"
526
+ match = value.match(/PUBLIC\s+"([^"]*)"/)
527
+ match ? match[1] : nil
528
+ end
529
+
530
+ def doctype_system_id(native)
531
+ value = native.value.to_s
532
+ # Match the last quoted string (system_id)
533
+ # For PUBLIC: "name PUBLIC \"external_id\" \"system_id\""
534
+ # For SYSTEM: "name SYSTEM \"system_id\""
535
+ matches = value.scan(/"([^"]*)"/)
536
+ matches.last&.first
537
+ end
538
+
513
539
  def xpath(node, expression, namespaces = {})
514
540
  # Translate common XPath patterns to Ox locate() syntax
515
541
  locate_expr = translate_xpath_to_locate(expression, namespaces)
@@ -10,9 +10,16 @@ module Moxml
10
10
  module Adapter
11
11
  class Rexml < Base
12
12
  class << self
13
- def parse(xml, options = {})
13
+ def parse(xml, options = {}, _context = nil)
14
+ # Handle frozen strings by creating a mutable copy
15
+ processed_xml = if xml.frozen?
16
+ xml.dup.force_encoding("UTF-8").encode("UTF-8")
17
+ else
18
+ xml.force_encoding("UTF-8").encode("UTF-8")
19
+ end
20
+
14
21
  native_doc = begin
15
- ::REXML::Document.new(xml)
22
+ ::REXML::Document.new(processed_xml)
16
23
  rescue ::REXML::ParseException => e
17
24
  if options[:strict]
18
25
  raise Moxml::ParseError.new(
@@ -24,7 +31,15 @@ module Moxml
24
31
  create_document
25
32
  end
26
33
 
27
- DocumentBuilder.new(Context.new(:rexml)).build(native_doc)
34
+ ctx = _context || Context.new(:rexml)
35
+ DocumentBuilder.new(ctx).build(native_doc)
36
+ end
37
+
38
+ def extract_encoding_from_xml(xml)
39
+ # Match XML declaration pattern: <?xml version="..." encoding="..."?>
40
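+ # Use atomic group (?>) to prevent polynomial backtracking ReDoS. NOTE(review): the atomic group greedily consumes everything up to '>' (including the encoding attribute) and cannot backtrack, so \bencoding appears unable to ever match and this method would always return "UTF-8" - confirm and fix.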
41
+ match = xml.match(/<\?xml(?>[^>]*)\bencoding\s*=\s*["']([^"']+)["']/i)
42
+ match ? match[1] : "UTF-8"
28
43
  end
29
44
 
30
45
  # SAX parsing implementation for REXML
@@ -359,16 +374,35 @@ module Moxml
359
374
  when ::REXML::Text, ::REXML::CData
360
375
  node.value.to_s
361
376
  when ::REXML::Element
362
- # Get all text nodes, filter out duplicates, and join
363
- text_nodes = node.texts.uniq(&:object_id)
364
- text_nodes.map(&:value).join
377
+ # Extract text recursively from all children to match other adapters
378
+ extract_text_recursively(node)
379
+ end
380
+ end
381
+
382
+ def extract_text_recursively(element)
383
+ return "" unless element
384
+
385
+ text = ""
386
+ element.children.each do |child|
387
+ case child
388
+ when ::REXML::Text
389
+ # Preserve original spacing from text nodes exactly including newlines and all whitespace
390
+ text += child.value
391
+ when ::REXML::Element
392
+ # Extract text recursively from child element
393
+ child_text = extract_text_recursively(child)
394
+ # Concatenate directly like other adapters - NO SPACE INSERTION
395
+ text += child_text
396
+ end
365
397
  end
398
+ # Don't strip - preserve original spacing including newlines
399
+ text
366
400
  end
367
401
 
368
402
  def inner_text(node)
369
403
  # Get direct text children only, filter duplicates
370
404
  text_children = node.children
371
- .select { _1.is_a?(::REXML::Text) }
405
+ .grep(::REXML::Text)
372
406
  .uniq(&:object_id)
373
407
  text_children.map(&:value).join
374
408
  end
@@ -426,6 +460,19 @@ module Moxml
426
460
  end
427
461
  end
428
462
 
463
+ # Doctype accessor methods
464
+ def doctype_name(native)
465
+ native.name
466
+ end
467
+
468
+ def doctype_external_id(native)
469
+ native.public
470
+ end
471
+
472
+ def doctype_system_id(native)
473
+ native.system
474
+ end
475
+
429
476
  # not used at the moment
430
477
  # but may be useful when the xpath is upgraded to work with namespaces
431
478
  def prepare_xpath_namespaces(node)
@@ -10,6 +10,12 @@ module Moxml
10
10
  adapter.set_attribute_name(@native, new_name)
11
11
  end
12
12
 
13
+ # Returns the primary identifier for this attribute (its name)
14
+ # @return [String] the attribute name
15
+ def identifier
16
+ name
17
+ end
18
+
13
19
  def value
14
20
  @native.value
15
21
  end
data/lib/moxml/builder.rb CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  module Moxml
4
4
  class Builder
5
+ attr_reader :document
6
+
5
7
  def initialize(context)
6
8
  @context = context
7
9
  @current = @document = context.create_document
@@ -60,6 +62,10 @@ module Moxml
60
62
  @current.add_child(@document.create_comment(content))
61
63
  end
62
64
 
65
+ def entity_reference(name)
66
+ @current.add_child(@document.create_entity_reference(name))
67
+ end
68
+
63
69
  def processing_instruction(target, content)
64
70
  @current.add_child(
65
71
  @document.create_processing_instruction(target, content),
data/lib/moxml/config.rb CHANGED
@@ -5,6 +5,13 @@ module Moxml
5
5
  VALID_ADAPTERS = %i[nokogiri oga rexml ox headed_ox libxml].freeze
6
6
  DEFAULT_ADAPTER = VALID_ADAPTERS.first
7
7
 
8
+ # Entity loading modes:
9
+ # - :required - Must load entities, raise error if unavailable (default)
10
+ # - :optional - Try to load, continue silently if unavailable
11
+ # - :disabled - Don't load entities, use empty registry
12
+ # - :custom - Use custom entity provider via entity_provider callback
13
+ ENTITY_LOAD_MODES = %i[required optional disabled custom].freeze
14
+
8
15
  class << self
9
16
  attr_writer :default_adapter
10
17
 
@@ -17,11 +24,18 @@ module Moxml
17
24
  end
18
25
  end
19
26
 
27
+ NAMESPACE_URI_MODES = %i[strict lenient].freeze
28
+
20
29
  attr_reader :adapter_name
21
30
  attr_accessor :strict_parsing,
22
31
  :default_encoding,
23
32
  :entity_encoding,
24
- :default_indent
33
+ :default_indent,
34
+ :restore_entities,
35
+ :preload_entity_sets,
36
+ :entity_load_mode,
37
+ :entity_provider,
38
+ :namespace_uri_mode
25
39
 
26
40
  def initialize(adapter_name = nil, strict_parsing = nil,
27
41
  default_encoding = nil)
@@ -31,6 +45,11 @@ module Moxml
31
45
  # reserved for future use
32
46
  @default_indent = 2
33
47
  @entity_encoding = :basic
48
+ @restore_entities = false
49
+ @preload_entity_sets = []
50
+ @entity_load_mode = :required
51
+ @entity_provider = nil
52
+ @namespace_uri_mode = :strict
34
53
  end
35
54
 
36
55
  def adapter=(name)
@@ -57,5 +76,37 @@ module Moxml
57
76
  def adapter
58
77
  @adapter ||= Adapter.load(@adapter_name)
59
78
  end
79
+
80
+ def entity_load_mode=(mode)
81
+ unless ENTITY_LOAD_MODES.include?(mode)
82
+ raise ArgumentError,
83
+ "Invalid entity_load_mode: #{mode}. Must be one of: #{ENTITY_LOAD_MODES.join(', ')}"
84
+ end
85
+
86
+ @entity_load_mode = mode
87
+ end
88
+
89
+ def namespace_uri_mode=(mode)
90
+ mode = mode.to_sym
91
+ unless NAMESPACE_URI_MODES.include?(mode)
92
+ raise ArgumentError,
93
+ "Invalid namespace_uri_mode: #{mode}. Must be one of: #{NAMESPACE_URI_MODES.join(', ')}"
94
+ end
95
+
96
+ @namespace_uri_mode = mode
97
+ end
98
+
99
+ # Backward compatibility: convert old boolean to new symbol
100
+ def load_external_entities=(value)
101
+ @entity_load_mode = case value
102
+ when true then :required
103
+ when false then :disabled
104
+ else value
105
+ end
106
+ end
107
+
108
+ def load_external_entities
109
+ @entity_load_mode == :required
110
+ end
60
111
  end
61
112
  end
data/lib/moxml/context.rb CHANGED
@@ -8,6 +8,10 @@ module Moxml
8
8
  @config = Config.new(adapter)
9
9
  end
10
10
 
11
+ def entity_registry
12
+ @entity_registry ||= build_entity_registry
13
+ end
14
+
11
15
  def create_document(native_doc = nil)
12
16
  Document.new(config.adapter.create_document(native_doc), self)
13
17
  end
@@ -23,9 +27,9 @@ module Moxml
23
27
  end
24
28
  has_declaration = xml_string.strip.start_with?("<?xml")
25
29
 
26
- # Parse with adapter (without declaration info - adapters don't need it)
30
+ # Parse with adapter, passing self (context) so adapter can use our config
27
31
  parsed_options = default_options.merge(options)
28
- doc = config.adapter.parse(xml_string, parsed_options)
32
+ doc = config.adapter.parse(xml_string, parsed_options, self)
29
33
 
30
34
  # Set declaration flag on Document wrapper (proper OOP)
31
35
  doc.has_xml_declaration = has_declaration if doc.is_a?(Document)
@@ -73,6 +77,21 @@ module Moxml
73
77
 
74
78
  private
75
79
 
80
+ def build_entity_registry
81
+ registry = EntityRegistry.new(
82
+ mode: config.entity_load_mode,
83
+ entity_provider: config.entity_provider,
84
+ )
85
+ config.preload_entity_sets.each do |set_name|
86
+ case set_name
87
+ when :html5 then registry.load_html5
88
+ when :mathml then registry.load_mathml
89
+ when :iso then registry.load_iso
90
+ end
91
+ end
92
+ registry
93
+ end
94
+
76
95
  def default_options
77
96
  {
78
97
  encoding: config.default_encoding,
data/lib/moxml/doctype.rb CHANGED
@@ -1,17 +1,50 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Moxml
4
+ # Represents an XML DOCTYPE declaration
5
+ #
6
+ # @note Doctype accessor methods are not fully implemented across all adapters.
7
+ # The availability of #name, #external_id, and #system_id depends on whether
8
+ # the specific adapter implements the corresponding adapter methods:
9
+ # - adapter.doctype_name(native)
10
+ # - adapter.doctype_external_id(native)
11
+ # - adapter.doctype_system_id(native)
12
+ #
13
+ # Most adapters do not currently implement these methods. If you need DOCTYPE
14
+ # information, consider using adapter-specific methods or parsing the serialized
15
+ # XML manually.
4
16
  class Doctype < Node
17
+ # Returns the DOCTYPE name (root element name)
18
+ #
19
+ # @return [String, nil] the DOCTYPE name
20
+ # @raise [NotImplementedError] if the adapter doesn't implement doctype_name
5
21
  def name
6
22
  adapter.doctype_name(@native)
7
23
  end
8
24
 
25
+ # Returns the DOCTYPE external ID
26
+ #
27
+ # @return [String, nil] the external ID
28
+ # @raise [NotImplementedError] if the adapter doesn't implement doctype_external_id
9
29
  def external_id
10
30
  adapter.doctype_external_id(@native)
11
31
  end
12
32
 
33
+ # Returns the DOCTYPE system ID
34
+ #
35
+ # @return [String, nil] the system ID
36
+ # @raise [NotImplementedError] if the adapter doesn't implement doctype_system_id
13
37
  def system_id
14
38
  adapter.doctype_system_id(@native)
15
39
  end
40
+
41
+ # Returns the primary identifier for this doctype.
42
+ # This is the DOCTYPE name as reported by the adapter's doctype_name,
43
+ # so its availability depends on the adapter implementing that method.
44
+ #
45
+ # @return [String, nil] the DOCTYPE name
46
+ def identifier
47
+ name
48
+ end
16
49
  end
17
50
  end
@@ -9,6 +9,7 @@ require_relative "processing_instruction"
9
9
  require_relative "declaration"
10
10
  require_relative "namespace"
11
11
  require_relative "doctype"
12
+ require_relative "entity_reference"
12
13
 
13
14
  module Moxml
14
15
  class Document < Node
@@ -68,6 +69,10 @@ module Moxml
68
69
  Declaration.new(decl, context)
69
70
  end
70
71
 
72
+ def create_entity_reference(name)
73
+ EntityReference.new(adapter.create_entity_reference(name), context)
74
+ end
75
+
71
76
  def add_child(node)
72
77
  node = prepare_node(node)
73
78
 
@@ -109,7 +114,7 @@ module Moxml
109
114
 
110
115
  def at_xpath(expression, namespaces = nil)
111
116
  if (native_node = adapter.at_xpath(@native, expression, namespaces))
112
- Node.wrap(native_node, context)
117
+ Moxml::Node.wrap(native_node, context)
113
118
  end
114
119
  end
115
120