moxml 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/docs.yml +1 -1
- data/.github/workflows/rake.yml +16 -13
- data/.github/workflows/release.yml +1 -0
- data/.github/workflows/round-trip.yml +74 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +160 -38
- data/Gemfile +2 -1
- data/README.adoc +287 -20
- data/Rakefile +11 -0
- data/data/w3c_entities.json +2131 -0
- data/docs/ENTITY_SUPPORT_FOR_LUTAML_MODEL.md +102 -0
- data/docs/_guides/index.adoc +14 -12
- data/docs/_guides/node-api-consistency.adoc +572 -0
- data/docs/_guides/xml-declaration.adoc +5 -5
- data/docs/_pages/adapters/ox.adoc +30 -0
- data/docs/_pages/adapters/rexml.adoc +1 -1
- data/docs/_pages/configuration.adoc +43 -0
- data/docs/_pages/node-api-reference.adoc +128 -3
- data/docs/_tutorials/namespace-handling.adoc +21 -0
- data/examples/rss_parser/rss_parser.rb +1 -3
- data/lib/moxml/adapter/base.rb +26 -2
- data/lib/moxml/adapter/headed_ox.rb +5 -4
- data/lib/moxml/adapter/libxml.rb +18 -3
- data/lib/moxml/adapter/nokogiri.rb +26 -2
- data/lib/moxml/adapter/oga.rb +137 -20
- data/lib/moxml/adapter/ox.rb +29 -3
- data/lib/moxml/adapter/rexml.rb +54 -7
- data/lib/moxml/attribute.rb +6 -0
- data/lib/moxml/builder.rb +6 -0
- data/lib/moxml/config.rb +52 -1
- data/lib/moxml/context.rb +21 -2
- data/lib/moxml/doctype.rb +33 -0
- data/lib/moxml/document.rb +6 -1
- data/lib/moxml/document_builder.rb +45 -1
- data/lib/moxml/element.rb +10 -3
- data/lib/moxml/entity_reference.rb +29 -0
- data/lib/moxml/entity_registry.rb +278 -0
- data/lib/moxml/error.rb +5 -5
- data/lib/moxml/node.rb +22 -8
- data/lib/moxml/node_set.rb +10 -6
- data/lib/moxml/processing_instruction.rb +6 -0
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils.rb +25 -2
- data/lib/moxml/xpath/errors.rb +1 -1
- data/lib/moxml.rb +1 -0
- data/spec/consistency/README.md +3 -1
- data/spec/consistency/round_trip_spec.rb +479 -0
- data/spec/examples/readme_examples_spec.rb +1 -1
- data/spec/fixtures/round-trips/metanorma/a.xml +66 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-en.xml +7682 -0
- data/spec/fixtures/round-trips/metanorma/bilingual-fr.xml +7520 -0
- data/spec/fixtures/round-trips/metanorma/bilingual.presentation.xml +21211 -0
- data/spec/fixtures/round-trips/metanorma/collection1.xml +313 -0
- data/spec/fixtures/round-trips/metanorma/collection1nested.xml +291 -0
- data/spec/fixtures/round-trips/metanorma/collection_docinline.xml +544 -0
- data/spec/fixtures/round-trips/metanorma/collection_full.xml +1776 -0
- data/spec/fixtures/round-trips/metanorma/dummy.1.xml +295 -0
- data/spec/fixtures/round-trips/metanorma/dummy.xml +349 -0
- data/spec/fixtures/round-trips/metanorma/footnotes.xml +70 -0
- data/spec/fixtures/round-trips/metanorma/iho.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final.xml +186 -0
- data/spec/fixtures/round-trips/metanorma/rice-amd.final_1.xml +180 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.norepo.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final.xml +149 -0
- data/spec/fixtures/round-trips/metanorma/rice-en.final_1.xml +144 -0
- data/spec/fixtures/round-trips/metanorma/rice1-en.final.xml +120 -0
- data/spec/fixtures/round-trips/metanorma/rice2-en.final.xml +116 -0
- data/spec/fixtures/round-trips/metanorma/test_sectionsplit.xml +119 -0
- data/spec/fixtures/round-trips/niso-jats/bmj_sample.xml +1068 -0
- data/spec/fixtures/round-trips/niso-jats/element_citation.xml +7 -0
- data/spec/fixtures/round-trips/niso-jats/pnas_sample.xml +3768 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8881.xml +45848 -0
- data/spec/fixtures/round-trips/rfcxml/rfc8994.xml +6607 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9000.xml +9064 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9043.xml +5527 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9051.xml +14286 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9110.xml +18156 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9260.xml +9136 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9293.xml +8300 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9380.xml +8916 -0
- data/spec/fixtures/round-trips/rfcxml/rfc9420.xml +8927 -0
- data/spec/fixtures/w3c/namespaces/1.0/001.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/002.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/003.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/004.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/005.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/006.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/007.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/008.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/009.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/010.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/011.xml +20 -0
- data/spec/fixtures/w3c/namespaces/1.0/012.xml +19 -0
- data/spec/fixtures/w3c/namespaces/1.0/013.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/014.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/015.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/016.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/017.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/018.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/019.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/020.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/021.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/022.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/023.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/024.xml +6 -0
- data/spec/fixtures/w3c/namespaces/1.0/025.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/026.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/027.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/028.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/029.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/030.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/031.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/032.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/033.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/034.xml +3 -0
- data/spec/fixtures/w3c/namespaces/1.0/035.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/036.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/037.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/038.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/039.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/040.xml +9 -0
- data/spec/fixtures/w3c/namespaces/1.0/041.xml +8 -0
- data/spec/fixtures/w3c/namespaces/1.0/042.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/043.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/044.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/045.xml +7 -0
- data/spec/fixtures/w3c/namespaces/1.0/046.xml +10 -0
- data/spec/fixtures/w3c/namespaces/1.0/047.xml +4 -0
- data/spec/fixtures/w3c/namespaces/1.0/048.xml +5 -0
- data/spec/fixtures/w3c/namespaces/1.0/LICENSE.md +32 -0
- data/spec/fixtures/w3c/namespaces/1.0/README.adoc +42 -0
- data/spec/fixtures/w3c/namespaces/1.0/rmt-ns10.xml +156 -0
- data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +14 -2
- data/spec/integration/shared_examples/w3c_namespace_examples.rb +10 -0
- data/spec/integration/w3c_namespace_spec.rb +69 -0
- data/spec/moxml/adapter/libxml_spec.rb +7 -1
- data/spec/moxml/adapter/oga_spec.rb +92 -0
- data/spec/moxml/config_spec.rb +75 -0
- data/spec/moxml/doctype_spec.rb +19 -3
- data/spec/moxml/entity_registry_spec.rb +184 -0
- data/spec/moxml/error_spec.rb +2 -2
- data/spec/moxml/namespace_uri_validation_spec.rb +140 -0
- data/spec/moxml/xpath/axes_spec.rb +3 -4
- data/spec/performance/xpath_benchmark_spec.rb +6 -54
- data/spec/support/w3c_namespace_helpers.rb +41 -0
- data/spec/unit/rexml_isolated_test.rb +271 -0
- metadata +99 -3
- data/.ruby-version +0 -1
data/lib/moxml/adapter/oga.rb
CHANGED
|
@@ -9,17 +9,24 @@ module Moxml
|
|
|
9
9
|
module Adapter
|
|
10
10
|
class Oga < Base
|
|
11
11
|
class << self
|
|
12
|
+
# Standard XML entities handled natively by parsers
|
|
13
|
+
STANDARD_XML_ENTITIES = %w[amp lt gt quot apos].freeze
|
|
14
|
+
|
|
12
15
|
def set_root(doc, element)
|
|
13
16
|
# Clear existing root element if any - Oga's NodeSet needs special handling
|
|
14
17
|
# We need to manually remove elements since NodeSet doesn't support clear or delete_if
|
|
15
|
-
elements_to_remove = doc.children.
|
|
18
|
+
elements_to_remove = doc.children.grep(::Oga::XML::Element)
|
|
16
19
|
elements_to_remove.each { |elem| doc.children.delete(elem) }
|
|
17
20
|
doc.children << element
|
|
18
21
|
end
|
|
19
22
|
|
|
20
|
-
def parse(xml, options = {})
|
|
23
|
+
def parse(xml, options = {}, _context = nil)
|
|
24
|
+
# Pre-process XML to convert named entities to marker form (\x01name;).
|
|
25
|
+
# Oga drops named entity references like &nbsp; during parsing.
|
|
26
|
+
processed_xml = preprocess_named_entities(xml)
|
|
27
|
+
|
|
21
28
|
native_doc = begin
|
|
22
|
-
::Oga.parse_xml(
|
|
29
|
+
::Oga.parse_xml(processed_xml, strict: options[:strict])
|
|
23
30
|
rescue LL::ParserError => e
|
|
24
31
|
raise Moxml::ParseError.new(
|
|
25
32
|
e.message,
|
|
@@ -27,7 +34,8 @@ module Moxml
|
|
|
27
34
|
)
|
|
28
35
|
end
|
|
29
36
|
|
|
30
|
-
|
|
37
|
+
ctx = _context || Context.new(:oga)
|
|
38
|
+
DocumentBuilder.new(ctx).build(native_doc)
|
|
31
39
|
end
|
|
32
40
|
|
|
33
41
|
# SAX parsing implementation for Oga
|
|
@@ -61,7 +69,7 @@ module Moxml
|
|
|
61
69
|
end
|
|
62
70
|
|
|
63
71
|
def create_native_text(content)
|
|
64
|
-
::Oga::XML::Text.new(text: content)
|
|
72
|
+
::Oga::XML::Text.new(text: encode_entity_markers(content))
|
|
65
73
|
end
|
|
66
74
|
|
|
67
75
|
def create_native_cdata(content)
|
|
@@ -74,7 +82,8 @@ module Moxml
|
|
|
74
82
|
|
|
75
83
|
def create_native_doctype(name, external_id, system_id)
|
|
76
84
|
::Oga::XML::Doctype.new(
|
|
77
|
-
name: name, public_id: external_id, system_id: system_id,
|
|
85
|
+
name: name, public_id: external_id, system_id: system_id,
|
|
86
|
+
type: external_id ? "PUBLIC" : "SYSTEM"
|
|
78
87
|
)
|
|
79
88
|
end
|
|
80
89
|
|
|
@@ -224,7 +233,7 @@ module Moxml
|
|
|
224
233
|
attr = ::Oga::XML::Attribute.new(
|
|
225
234
|
name: name.to_s,
|
|
226
235
|
namespace_name: namespace_name,
|
|
227
|
-
value: value.to_s,
|
|
236
|
+
value: encode_entity_markers(value.to_s),
|
|
228
237
|
)
|
|
229
238
|
element.add_attribute(attr)
|
|
230
239
|
end
|
|
@@ -234,7 +243,7 @@ module Moxml
|
|
|
234
243
|
end
|
|
235
244
|
|
|
236
245
|
def get_attribute_value(element, name)
|
|
237
|
-
element[name.to_s]
|
|
246
|
+
restore_entity_markers(element[name.to_s])
|
|
238
247
|
end
|
|
239
248
|
|
|
240
249
|
def remove_attribute(element, name)
|
|
@@ -303,24 +312,25 @@ module Moxml
|
|
|
303
312
|
end
|
|
304
313
|
|
|
305
314
|
def text_content(node)
|
|
306
|
-
node.text
|
|
315
|
+
restore_entity_markers(node.text)
|
|
307
316
|
end
|
|
308
317
|
|
|
309
318
|
def inner_text(node)
|
|
310
|
-
if node.respond_to?(:inner_text)
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
319
|
+
text = if node.respond_to?(:inner_text)
|
|
320
|
+
node.inner_text
|
|
321
|
+
else
|
|
322
|
+
# Oga::XML::Text node for example
|
|
323
|
+
node.text
|
|
324
|
+
end
|
|
325
|
+
restore_entity_markers(text)
|
|
316
326
|
end
|
|
317
327
|
|
|
318
328
|
def set_text_content(node, content)
|
|
329
|
+
encoded = encode_entity_markers(content)
|
|
319
330
|
if node.respond_to?(:inner_text=)
|
|
320
|
-
node.inner_text =
|
|
331
|
+
node.inner_text = encoded
|
|
321
332
|
else
|
|
322
|
-
|
|
323
|
-
node.text = content
|
|
333
|
+
node.text = encoded
|
|
324
334
|
end
|
|
325
335
|
end
|
|
326
336
|
|
|
@@ -365,6 +375,19 @@ module Moxml
|
|
|
365
375
|
node.namespaces.values
|
|
366
376
|
end
|
|
367
377
|
|
|
378
|
+
# Doctype accessor methods
|
|
379
|
+
def doctype_name(native)
|
|
380
|
+
native.name
|
|
381
|
+
end
|
|
382
|
+
|
|
383
|
+
def doctype_external_id(native)
|
|
384
|
+
native.public_id
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
def doctype_system_id(native)
|
|
388
|
+
native.system_id
|
|
389
|
+
end
|
|
390
|
+
|
|
368
391
|
def xpath(node, expression, namespaces = nil)
|
|
369
392
|
node.xpath(expression, {},
|
|
370
393
|
namespaces: namespaces&.transform_keys(&:to_s)).to_a
|
|
@@ -389,6 +412,53 @@ module Moxml
|
|
|
389
412
|
end
|
|
390
413
|
|
|
391
414
|
def serialize(node, options = {})
|
|
415
|
+
output = serialize_without_entity_processing(node, options)
|
|
416
|
+
# Post-process: convert entity markers back to entity references
|
|
417
|
+
output.gsub(ENTITY_MARKER_REGEX, '&\1;')
|
|
418
|
+
end
|
|
419
|
+
|
|
420
|
+
# Shared entity name pattern (W3C: 2-31 chars, starts with alpha)
|
|
421
|
+
ENTITY_PATTERN = "([a-zA-Z][a-zA-Z0-9]{1,30})"
|
|
422
|
+
|
|
423
|
+
# Marker character for entity preservation through Oga's parser.
|
|
424
|
+
# U+0001 is preserved literally by Oga through parse/serialize cycle.
|
|
425
|
+
ENTITY_MARKER = "\x01"
|
|
426
|
+
|
|
427
|
+
# Regular expression for entity marker post-processing
|
|
428
|
+
ENTITY_MARKER_REGEX = /#{ENTITY_MARKER}#{ENTITY_PATTERN};/
|
|
429
|
+
|
|
430
|
+
# Simple entity-only regex with no nested quantifiers
|
|
431
|
+
ENTITY_REF_REGEX = /&#{ENTITY_PATTERN};/
|
|
432
|
+
|
|
433
|
+
private
|
|
434
|
+
|
|
435
|
+
# Convert known, non-standard &entity; references to \x01entity; marker form for Oga text storage.
|
|
436
|
+
# Used when setting text content programmatically (not from parsing).
|
|
437
|
+
def encode_entity_markers(text)
|
|
438
|
+
return text unless text&.include?("&")
|
|
439
|
+
|
|
440
|
+
text.gsub(ENTITY_REF_REGEX) do
|
|
441
|
+
name = ::Regexp.last_match(1)
|
|
442
|
+
|
|
443
|
+
next ::Regexp.last_match(0) if STANDARD_XML_ENTITIES.include?(name)
|
|
444
|
+
|
|
445
|
+
codepoint = Moxml::EntityRegistry.default.codepoint_for_name(name)
|
|
446
|
+
if codepoint
|
|
447
|
+
"#{ENTITY_MARKER}#{name};"
|
|
448
|
+
else
|
|
449
|
+
::Regexp.last_match(0)
|
|
450
|
+
end
|
|
451
|
+
end
|
|
452
|
+
end
|
|
453
|
+
|
|
454
|
+
# Convert \x01entity; back to &entity; for text accessors.
|
|
455
|
+
def restore_entity_markers(text)
|
|
456
|
+
return text unless text
|
|
457
|
+
|
|
458
|
+
text.gsub(ENTITY_MARKER_REGEX, '&\1;')
|
|
459
|
+
end
|
|
460
|
+
|
|
461
|
+
def serialize_without_entity_processing(node, options = {})
|
|
392
462
|
# Oga's XmlGenerator doesn't support options directly
|
|
393
463
|
# We need to handle declaration options ourselves for Document nodes
|
|
394
464
|
if node.is_a?(::Oga::XML::Document)
|
|
@@ -403,7 +473,11 @@ module Moxml
|
|
|
403
473
|
node.xml_declaration ? true : false
|
|
404
474
|
end
|
|
405
475
|
|
|
406
|
-
if
|
|
476
|
+
# Fix: Check if declaration already exists in children
|
|
477
|
+
# This prevents duplicate declarations when document already has one
|
|
478
|
+
has_existing_declaration = node.children.any?(::Oga::XML::XmlDeclaration)
|
|
479
|
+
|
|
480
|
+
if should_include_decl && !node.xml_declaration && !has_existing_declaration
|
|
407
481
|
# Need to add declaration - create default one
|
|
408
482
|
output = +""
|
|
409
483
|
output << '<?xml version="1.0" encoding="UTF-8"?>'
|
|
@@ -437,7 +511,50 @@ module Moxml
|
|
|
437
511
|
end
|
|
438
512
|
|
|
439
513
|
# Default: use XmlGenerator
|
|
440
|
-
|
|
514
|
+
# But first check if we need to handle declaration specially
|
|
515
|
+
if node.is_a?(::Oga::XML::Document) && node.xml_declaration
|
|
516
|
+
# Document has declaration - use custom handling to avoid duplicates
|
|
517
|
+
output = +""
|
|
518
|
+
|
|
519
|
+
# Serialize children, but skip XmlDeclaration if it would cause duplication
|
|
520
|
+
node.children.each do |child|
|
|
521
|
+
# Check if this would cause duplication by seeing if we already have one in output
|
|
522
|
+
if child.is_a?(::Oga::XML::XmlDeclaration) && output.include?("<?xml")
|
|
523
|
+
next # Skip duplicate declaration
|
|
524
|
+
end
|
|
525
|
+
|
|
526
|
+
output << ::Moxml::Adapter::CustomizedOga::XmlGenerator.new(child).to_xml
|
|
527
|
+
end
|
|
528
|
+
|
|
529
|
+
output
|
|
530
|
+
else
|
|
531
|
+
# Normal case - use XmlGenerator directly
|
|
532
|
+
::Moxml::Adapter::CustomizedOga::XmlGenerator.new(node).to_xml
|
|
533
|
+
end
|
|
534
|
+
end
|
|
535
|
+
|
|
536
|
+
# Pre-process XML to convert named entities to marker format.
|
|
537
|
+
# Oga drops named entity references like &nbsp; but preserves control chars.
|
|
538
|
+
# By converting known named entities to marker form (\x01name;), we can
|
|
539
|
+
# reconstruct them during serialization.
|
|
540
|
+
#
|
|
541
|
+
# @param xml [String, #to_s] The XML string to process
|
|
542
|
+
# @return [String] The XML with known named entities converted to marker form
|
|
543
|
+
def preprocess_named_entities(xml)
|
|
544
|
+
return xml unless xml.is_a?(String)
|
|
545
|
+
|
|
546
|
+
xml.gsub(ENTITY_REF_REGEX) do
|
|
547
|
+
name = Regexp.last_match(1)
|
|
548
|
+
|
|
549
|
+
next Regexp.last_match(0) if STANDARD_XML_ENTITIES.include?(name)
|
|
550
|
+
|
|
551
|
+
codepoint = Moxml::EntityRegistry.default.codepoint_for_name(name)
|
|
552
|
+
if codepoint
|
|
553
|
+
"#{ENTITY_MARKER}#{name};"
|
|
554
|
+
else
|
|
555
|
+
Regexp.last_match(0)
|
|
556
|
+
end
|
|
557
|
+
end
|
|
441
558
|
end
|
|
442
559
|
end
|
|
443
560
|
end
|
data/lib/moxml/adapter/ox.rb
CHANGED
|
@@ -17,7 +17,7 @@ module Moxml
|
|
|
17
17
|
replace_children(doc, [element])
|
|
18
18
|
end
|
|
19
19
|
|
|
20
|
-
def parse(xml, _options = {})
|
|
20
|
+
def parse(xml, _options = {}, _context = nil)
|
|
21
21
|
native_doc = begin
|
|
22
22
|
result = ::Ox.parse(xml)
|
|
23
23
|
|
|
@@ -36,7 +36,8 @@ module Moxml
|
|
|
36
36
|
)
|
|
37
37
|
end
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
ctx = _context || Context.new(:ox)
|
|
40
|
+
DocumentBuilder.new(ctx).build(native_doc)
|
|
40
41
|
end
|
|
41
42
|
|
|
42
43
|
# SAX parsing implementation for Ox
|
|
@@ -452,7 +453,7 @@ module Moxml
|
|
|
452
453
|
def inner_text(node)
|
|
453
454
|
return "" unless node.respond_to?(:nodes)
|
|
454
455
|
|
|
455
|
-
node.nodes.
|
|
456
|
+
node.nodes.grep(String).join
|
|
456
457
|
end
|
|
457
458
|
|
|
458
459
|
def set_text_content(node, content)
|
|
@@ -510,6 +511,31 @@ module Moxml
|
|
|
510
511
|
end.values
|
|
511
512
|
end
|
|
512
513
|
|
|
514
|
+
# Doctype accessor methods
|
|
515
|
+
# Ox stores DOCTYPE as a string, so we parse it
|
|
516
|
+
def doctype_name(native)
|
|
517
|
+
# Parse: "name PUBLIC \"external_id\" \"system_id\"" or "name SYSTEM \"system_id\""
|
|
518
|
+
value = native.value.to_s.strip
|
|
519
|
+
# Extract the first word (the name)
|
|
520
|
+
value.split(/\s+/).first
|
|
521
|
+
end
|
|
522
|
+
|
|
523
|
+
def doctype_external_id(native)
|
|
524
|
+
value = native.value.to_s
|
|
525
|
+
# Match PUBLIC "external_id"
|
|
526
|
+
match = value.match(/PUBLIC\s+"([^"]*)"/)
|
|
527
|
+
match ? match[1] : nil
|
|
528
|
+
end
|
|
529
|
+
|
|
530
|
+
def doctype_system_id(native)
|
|
531
|
+
value = native.value.to_s
|
|
532
|
+
# Match the last quoted string (system_id)
|
|
533
|
+
# For PUBLIC: "name PUBLIC \"external_id\" \"system_id\""
|
|
534
|
+
# For SYSTEM: "name SYSTEM \"system_id\""
|
|
535
|
+
matches = value.scan(/"([^"]*)"/)
|
|
536
|
+
matches.last&.first
|
|
537
|
+
end
|
|
538
|
+
|
|
513
539
|
def xpath(node, expression, namespaces = {})
|
|
514
540
|
# Translate common XPath patterns to Ox locate() syntax
|
|
515
541
|
locate_expr = translate_xpath_to_locate(expression, namespaces)
|
data/lib/moxml/adapter/rexml.rb
CHANGED
|
@@ -10,9 +10,16 @@ module Moxml
|
|
|
10
10
|
module Adapter
|
|
11
11
|
class Rexml < Base
|
|
12
12
|
class << self
|
|
13
|
-
def parse(xml, options = {})
|
|
13
|
+
def parse(xml, options = {}, _context = nil)
|
|
14
|
+
# Handle frozen strings by creating a mutable copy
|
|
15
|
+
processed_xml = if xml.frozen?
|
|
16
|
+
xml.dup.force_encoding("UTF-8").encode("UTF-8")
|
|
17
|
+
else
|
|
18
|
+
xml.force_encoding("UTF-8").encode("UTF-8")
|
|
19
|
+
end
|
|
20
|
+
|
|
14
21
|
native_doc = begin
|
|
15
|
-
::REXML::Document.new(
|
|
22
|
+
::REXML::Document.new(processed_xml)
|
|
16
23
|
rescue ::REXML::ParseException => e
|
|
17
24
|
if options[:strict]
|
|
18
25
|
raise Moxml::ParseError.new(
|
|
@@ -24,7 +31,15 @@ module Moxml
|
|
|
24
31
|
create_document
|
|
25
32
|
end
|
|
26
33
|
|
|
27
|
-
|
|
34
|
+
ctx = _context || Context.new(:rexml)
|
|
35
|
+
DocumentBuilder.new(ctx).build(native_doc)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def extract_encoding_from_xml(xml)
|
|
39
|
+
# Match XML declaration pattern: <?xml version="..." encoding="..."?>
|
|
40
|
+
# Use atomic group (?>) to prevent polynomial backtracking ReDoS
|
|
41
|
+
match = xml.match(/<\?xml(?>[^>]*)\bencoding\s*=\s*["']([^"']+)["']/i)
|
|
42
|
+
match ? match[1] : "UTF-8"
|
|
28
43
|
end
|
|
29
44
|
|
|
30
45
|
# SAX parsing implementation for REXML
|
|
@@ -359,16 +374,35 @@ module Moxml
|
|
|
359
374
|
when ::REXML::Text, ::REXML::CData
|
|
360
375
|
node.value.to_s
|
|
361
376
|
when ::REXML::Element
|
|
362
|
-
#
|
|
363
|
-
|
|
364
|
-
|
|
377
|
+
# Extract text recursively from all children to match other adapters
|
|
378
|
+
extract_text_recursively(node)
|
|
379
|
+
end
|
|
380
|
+
end
|
|
381
|
+
|
|
382
|
+
def extract_text_recursively(element)
|
|
383
|
+
return "" unless element
|
|
384
|
+
|
|
385
|
+
text = ""
|
|
386
|
+
element.children.each do |child|
|
|
387
|
+
case child
|
|
388
|
+
when ::REXML::Text
|
|
389
|
+
# Preserve original spacing from text nodes exactly including newlines and all whitespace
|
|
390
|
+
text += child.value
|
|
391
|
+
when ::REXML::Element
|
|
392
|
+
# Extract text recursively from child element
|
|
393
|
+
child_text = extract_text_recursively(child)
|
|
394
|
+
# Concatenate directly like other adapters - NO SPACE INSERTION
|
|
395
|
+
text += child_text
|
|
396
|
+
end
|
|
365
397
|
end
|
|
398
|
+
# Don't strip - preserve original spacing including newlines
|
|
399
|
+
text
|
|
366
400
|
end
|
|
367
401
|
|
|
368
402
|
def inner_text(node)
|
|
369
403
|
# Get direct text children only, filter duplicates
|
|
370
404
|
text_children = node.children
|
|
371
|
-
.
|
|
405
|
+
.grep(::REXML::Text)
|
|
372
406
|
.uniq(&:object_id)
|
|
373
407
|
text_children.map(&:value).join
|
|
374
408
|
end
|
|
@@ -426,6 +460,19 @@ module Moxml
|
|
|
426
460
|
end
|
|
427
461
|
end
|
|
428
462
|
|
|
463
|
+
# Doctype accessor methods
|
|
464
|
+
def doctype_name(native)
|
|
465
|
+
native.name
|
|
466
|
+
end
|
|
467
|
+
|
|
468
|
+
def doctype_external_id(native)
|
|
469
|
+
native.public
|
|
470
|
+
end
|
|
471
|
+
|
|
472
|
+
def doctype_system_id(native)
|
|
473
|
+
native.system
|
|
474
|
+
end
|
|
475
|
+
|
|
429
476
|
# not used at the moment
|
|
430
477
|
# but may be useful when the xpath is upgraded to work with namespaces
|
|
431
478
|
def prepare_xpath_namespaces(node)
|
data/lib/moxml/attribute.rb
CHANGED
|
@@ -10,6 +10,12 @@ module Moxml
|
|
|
10
10
|
adapter.set_attribute_name(@native, new_name)
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
+
# Returns the primary identifier for this attribute (its name)
|
|
14
|
+
# @return [String] the attribute name
|
|
15
|
+
def identifier
|
|
16
|
+
name
|
|
17
|
+
end
|
|
18
|
+
|
|
13
19
|
def value
|
|
14
20
|
@native.value
|
|
15
21
|
end
|
data/lib/moxml/builder.rb
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
module Moxml
|
|
4
4
|
class Builder
|
|
5
|
+
attr_reader :document
|
|
6
|
+
|
|
5
7
|
def initialize(context)
|
|
6
8
|
@context = context
|
|
7
9
|
@current = @document = context.create_document
|
|
@@ -60,6 +62,10 @@ module Moxml
|
|
|
60
62
|
@current.add_child(@document.create_comment(content))
|
|
61
63
|
end
|
|
62
64
|
|
|
65
|
+
def entity_reference(name)
|
|
66
|
+
@current.add_child(@document.create_entity_reference(name))
|
|
67
|
+
end
|
|
68
|
+
|
|
63
69
|
def processing_instruction(target, content)
|
|
64
70
|
@current.add_child(
|
|
65
71
|
@document.create_processing_instruction(target, content),
|
data/lib/moxml/config.rb
CHANGED
|
@@ -5,6 +5,13 @@ module Moxml
|
|
|
5
5
|
VALID_ADAPTERS = %i[nokogiri oga rexml ox headed_ox libxml].freeze
|
|
6
6
|
DEFAULT_ADAPTER = VALID_ADAPTERS.first
|
|
7
7
|
|
|
8
|
+
# Entity loading modes:
|
|
9
|
+
# - :required - Must load entities, raise error if unavailable (default)
|
|
10
|
+
# - :optional - Try to load, continue silently if unavailable
|
|
11
|
+
# - :disabled - Don't load entities, use empty registry
|
|
12
|
+
# - :custom - Use custom entity provider via entity_provider callback
|
|
13
|
+
ENTITY_LOAD_MODES = %i[required optional disabled custom].freeze
|
|
14
|
+
|
|
8
15
|
class << self
|
|
9
16
|
attr_writer :default_adapter
|
|
10
17
|
|
|
@@ -17,11 +24,18 @@ module Moxml
|
|
|
17
24
|
end
|
|
18
25
|
end
|
|
19
26
|
|
|
27
|
+
NAMESPACE_URI_MODES = %i[strict lenient].freeze
|
|
28
|
+
|
|
20
29
|
attr_reader :adapter_name
|
|
21
30
|
attr_accessor :strict_parsing,
|
|
22
31
|
:default_encoding,
|
|
23
32
|
:entity_encoding,
|
|
24
|
-
:default_indent
|
|
33
|
+
:default_indent,
|
|
34
|
+
:restore_entities,
|
|
35
|
+
:preload_entity_sets,
|
|
36
|
+
:entity_load_mode,
|
|
37
|
+
:entity_provider,
|
|
38
|
+
:namespace_uri_mode
|
|
25
39
|
|
|
26
40
|
def initialize(adapter_name = nil, strict_parsing = nil,
|
|
27
41
|
default_encoding = nil)
|
|
@@ -31,6 +45,11 @@ module Moxml
|
|
|
31
45
|
# reserved for future use
|
|
32
46
|
@default_indent = 2
|
|
33
47
|
@entity_encoding = :basic
|
|
48
|
+
@restore_entities = false
|
|
49
|
+
@preload_entity_sets = []
|
|
50
|
+
@entity_load_mode = :required
|
|
51
|
+
@entity_provider = nil
|
|
52
|
+
@namespace_uri_mode = :strict
|
|
34
53
|
end
|
|
35
54
|
|
|
36
55
|
def adapter=(name)
|
|
@@ -57,5 +76,37 @@ module Moxml
|
|
|
57
76
|
def adapter
|
|
58
77
|
@adapter ||= Adapter.load(@adapter_name)
|
|
59
78
|
end
|
|
79
|
+
|
|
80
|
+
def entity_load_mode=(mode)
|
|
81
|
+
unless ENTITY_LOAD_MODES.include?(mode)
|
|
82
|
+
raise ArgumentError,
|
|
83
|
+
"Invalid entity_load_mode: #{mode}. Must be one of: #{ENTITY_LOAD_MODES.join(', ')}"
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
@entity_load_mode = mode
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
def namespace_uri_mode=(mode)
|
|
90
|
+
mode = mode.to_sym
|
|
91
|
+
unless NAMESPACE_URI_MODES.include?(mode)
|
|
92
|
+
raise ArgumentError,
|
|
93
|
+
"Invalid namespace_uri_mode: #{mode}. Must be one of: #{NAMESPACE_URI_MODES.join(', ')}"
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
@namespace_uri_mode = mode
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# Backward compatibility: convert old boolean to new symbol
|
|
100
|
+
def load_external_entities=(value)
|
|
101
|
+
@entity_load_mode = case value
|
|
102
|
+
when true then :required
|
|
103
|
+
when false then :disabled
|
|
104
|
+
else value
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def load_external_entities
|
|
109
|
+
@entity_load_mode == :required
|
|
110
|
+
end
|
|
60
111
|
end
|
|
61
112
|
end
|
data/lib/moxml/context.rb
CHANGED
|
@@ -8,6 +8,10 @@ module Moxml
|
|
|
8
8
|
@config = Config.new(adapter)
|
|
9
9
|
end
|
|
10
10
|
|
|
11
|
+
def entity_registry
|
|
12
|
+
@entity_registry ||= build_entity_registry
|
|
13
|
+
end
|
|
14
|
+
|
|
11
15
|
def create_document(native_doc = nil)
|
|
12
16
|
Document.new(config.adapter.create_document(native_doc), self)
|
|
13
17
|
end
|
|
@@ -23,9 +27,9 @@ module Moxml
|
|
|
23
27
|
end
|
|
24
28
|
has_declaration = xml_string.strip.start_with?("<?xml")
|
|
25
29
|
|
|
26
|
-
# Parse with adapter (
|
|
30
|
+
# Parse with adapter, passing self (context) so adapter can use our config
|
|
27
31
|
parsed_options = default_options.merge(options)
|
|
28
|
-
doc = config.adapter.parse(xml_string, parsed_options)
|
|
32
|
+
doc = config.adapter.parse(xml_string, parsed_options, self)
|
|
29
33
|
|
|
30
34
|
# Set declaration flag on Document wrapper (proper OOP)
|
|
31
35
|
doc.has_xml_declaration = has_declaration if doc.is_a?(Document)
|
|
@@ -73,6 +77,21 @@ module Moxml
|
|
|
73
77
|
|
|
74
78
|
private
|
|
75
79
|
|
|
80
|
+
def build_entity_registry
|
|
81
|
+
registry = EntityRegistry.new(
|
|
82
|
+
mode: config.entity_load_mode,
|
|
83
|
+
entity_provider: config.entity_provider,
|
|
84
|
+
)
|
|
85
|
+
config.preload_entity_sets.each do |set_name|
|
|
86
|
+
case set_name
|
|
87
|
+
when :html5 then registry.load_html5
|
|
88
|
+
when :mathml then registry.load_mathml
|
|
89
|
+
when :iso then registry.load_iso
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
registry
|
|
93
|
+
end
|
|
94
|
+
|
|
76
95
|
def default_options
|
|
77
96
|
{
|
|
78
97
|
encoding: config.default_encoding,
|
data/lib/moxml/doctype.rb
CHANGED
|
@@ -1,17 +1,50 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Moxml
|
|
4
|
+
# Represents an XML DOCTYPE declaration
|
|
5
|
+
#
|
|
6
|
+
# @note Doctype accessor methods are not fully implemented across all adapters.
|
|
7
|
+
# The availability of #name, #external_id, and #system_id depends on whether
|
|
8
|
+
# the specific adapter implements the corresponding adapter methods:
|
|
9
|
+
# - adapter.doctype_name(native)
|
|
10
|
+
# - adapter.doctype_external_id(native)
|
|
11
|
+
# - adapter.doctype_system_id(native)
|
|
12
|
+
#
|
|
13
|
+
# Most adapters do not currently implement these methods. If you need DOCTYPE
|
|
14
|
+
# information, consider using adapter-specific methods or parsing the serialized
|
|
15
|
+
# XML manually.
|
|
4
16
|
class Doctype < Node
|
|
17
|
+
# Returns the DOCTYPE name (root element name)
|
|
18
|
+
#
|
|
19
|
+
# @return [String, nil] the DOCTYPE name
|
|
20
|
+
# @raise [NotImplementedError] if the adapter doesn't implement doctype_name
|
|
5
21
|
def name
|
|
6
22
|
adapter.doctype_name(@native)
|
|
7
23
|
end
|
|
8
24
|
|
|
25
|
+
# Returns the DOCTYPE external ID
|
|
26
|
+
#
|
|
27
|
+
# @return [String, nil] the external ID
|
|
28
|
+
# @raise [NotImplementedError] if the adapter doesn't implement doctype_external_id
|
|
9
29
|
def external_id
|
|
10
30
|
adapter.doctype_external_id(@native)
|
|
11
31
|
end
|
|
12
32
|
|
|
33
|
+
# Returns the DOCTYPE system ID
|
|
34
|
+
#
|
|
35
|
+
# @return [String, nil] the system ID
|
|
36
|
+
# @raise [NotImplementedError] if the adapter doesn't implement doctype_system_id
|
|
13
37
|
def system_id
|
|
14
38
|
adapter.doctype_system_id(@native)
|
|
15
39
|
end
|
|
40
|
+
|
|
41
|
+
# Returns the primary identifier for this doctype
|
|
42
|
+
# This delegates to #name, so the result depends on the adapter's
|
|
43
|
+
# doctype_name support.
|
|
44
|
+
#
|
|
45
|
+
# @return [String, nil] the DOCTYPE name
|
|
46
|
+
def identifier
|
|
47
|
+
name
|
|
48
|
+
end
|
|
16
49
|
end
|
|
17
50
|
end
|
data/lib/moxml/document.rb
CHANGED
|
@@ -9,6 +9,7 @@ require_relative "processing_instruction"
|
|
|
9
9
|
require_relative "declaration"
|
|
10
10
|
require_relative "namespace"
|
|
11
11
|
require_relative "doctype"
|
|
12
|
+
require_relative "entity_reference"
|
|
12
13
|
|
|
13
14
|
module Moxml
|
|
14
15
|
class Document < Node
|
|
@@ -68,6 +69,10 @@ module Moxml
|
|
|
68
69
|
Declaration.new(decl, context)
|
|
69
70
|
end
|
|
70
71
|
|
|
72
|
+
def create_entity_reference(name)
|
|
73
|
+
EntityReference.new(adapter.create_entity_reference(name), context)
|
|
74
|
+
end
|
|
75
|
+
|
|
71
76
|
def add_child(node)
|
|
72
77
|
node = prepare_node(node)
|
|
73
78
|
|
|
@@ -109,7 +114,7 @@ module Moxml
|
|
|
109
114
|
|
|
110
115
|
def at_xpath(expression, namespaces = nil)
|
|
111
116
|
if (native_node = adapter.at_xpath(@native, expression, namespaces))
|
|
112
|
-
Node.wrap(native_node, context)
|
|
117
|
+
Moxml::Node.wrap(native_node, context)
|
|
113
118
|
end
|
|
114
119
|
end
|
|
115
120
|
|