moxml 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80):
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +117 -66
  3. data/Gemfile +1 -0
  4. data/README.adoc +11 -9
  5. data/Rakefile +34 -1
  6. data/TODO.remaining/1-entity-reference-adapter-support.md +157 -0
  7. data/TODO.remaining/2-entity-restoration-model-driven.md +169 -0
  8. data/TODO.remaining/3-entity-reference-test-coverage.md +170 -0
  9. data/TODO.remaining/4-lenient-entities-mode.md +106 -0
  10. data/TODO.remaining/5-fixture-integrity.md +65 -0
  11. data/TODO.remaining/6-ox-element-ordering-bug.md +36 -0
  12. data/TODO.remaining/7-headed-ox-limitations.md +95 -0
  13. data/TODO.remaining/8-xpath-predicate-gaps.md +68 -0
  14. data/TODO.remaining/9-cleanup-hygiene.md +42 -0
  15. data/TODO.remaining/README.md +54 -0
  16. data/benchmarks/generate_report.rb +1 -1
  17. data/docs/_pages/configuration.adoc +22 -19
  18. data/docs/_tutorials/namespace-handling.adoc +5 -5
  19. data/lib/moxml/adapter/base.rb +22 -3
  20. data/lib/moxml/adapter/customized_libxml/declaration.rb +1 -1
  21. data/lib/moxml/adapter/customized_libxml/entity_reference.rb +23 -0
  22. data/lib/moxml/adapter/customized_libxml.rb +18 -0
  23. data/lib/moxml/adapter/customized_oga.rb +10 -0
  24. data/lib/moxml/adapter/customized_ox/entity_reference.rb +25 -0
  25. data/lib/moxml/adapter/customized_ox.rb +12 -0
  26. data/lib/moxml/adapter/customized_rexml/entity_reference.rb +19 -0
  27. data/lib/moxml/adapter/customized_rexml/formatter.rb +44 -20
  28. data/lib/moxml/adapter/customized_rexml.rb +11 -0
  29. data/lib/moxml/adapter/headed_ox.rb +37 -14
  30. data/lib/moxml/adapter/libxml.rb +233 -119
  31. data/lib/moxml/adapter/nokogiri.rb +22 -11
  32. data/lib/moxml/adapter/oga.rb +64 -25
  33. data/lib/moxml/adapter/ox.rb +198 -42
  34. data/lib/moxml/adapter/rexml.rb +64 -13
  35. data/lib/moxml/attribute.rb +3 -0
  36. data/lib/moxml/builder.rb +78 -24
  37. data/lib/moxml/config.rb +24 -7
  38. data/lib/moxml/declaration.rb +4 -2
  39. data/lib/moxml/document.rb +8 -1
  40. data/lib/moxml/document_builder.rb +44 -37
  41. data/lib/moxml/element.rb +18 -5
  42. data/lib/moxml/entity_registry.rb +51 -1
  43. data/lib/moxml/native_attachment.rb +65 -0
  44. data/lib/moxml/node.rb +39 -50
  45. data/lib/moxml/node_set.rb +43 -15
  46. data/lib/moxml/version.rb +1 -1
  47. data/lib/moxml/xml_utils.rb +1 -1
  48. data/lib/moxml/xpath/compiler.rb +4 -1
  49. data/lib/moxml.rb +1 -0
  50. data/scripts/format_xml.rb +16 -0
  51. data/scripts/pretty_format_xml.rb +14 -0
  52. data/spec/consistency/round_trip_spec.rb +3 -30
  53. data/spec/integration/all_adapters_spec.rb +1 -0
  54. data/spec/integration/headed_ox_integration_spec.rb +0 -2
  55. data/spec/integration/shared_examples/edge_cases.rb +7 -4
  56. data/spec/integration/shared_examples/integration_workflows.rb +3 -3
  57. data/spec/integration/shared_examples/node_wrappers/cdata_behavior.rb +1 -1
  58. data/spec/integration/shared_examples/node_wrappers/entity_reference_behavior.rb +224 -0
  59. data/spec/integration/shared_examples/node_wrappers/node_behavior.rb +1 -1
  60. data/spec/moxml/adapter/headed_ox_spec.rb +8 -8
  61. data/spec/moxml/adapter/oga_spec.rb +46 -0
  62. data/spec/moxml/adapter/shared_examples/adapter_contract.rb +1 -12
  63. data/spec/moxml/allocation_benchmark_spec.rb +96 -0
  64. data/spec/moxml/allocation_guard_spec.rb +282 -0
  65. data/spec/moxml/builder_spec.rb +256 -0
  66. data/spec/moxml/config_spec.rb +11 -11
  67. data/spec/moxml/doctype_spec.rb +41 -0
  68. data/spec/moxml/lazy_parse_spec.rb +115 -0
  69. data/spec/moxml/namespace_uri_validation_spec.rb +11 -3
  70. data/spec/moxml/node_cache_spec.rb +110 -0
  71. data/spec/moxml/node_set_cache_spec.rb +90 -0
  72. data/spec/moxml/xml_utils_spec.rb +32 -0
  73. data/spec/moxml/xpath/axes_spec.rb +1 -1
  74. data/spec/moxml/xpath/compiler_spec.rb +2 -2
  75. data/spec/moxml/xpath/functions/position_functions_spec.rb +5 -5
  76. data/spec/moxml/xpath/functions/special_functions_spec.rb +1 -1
  77. data/spec/performance/memory_usage_spec.rb +0 -4
  78. data/spec/support/allocation_helper.rb +165 -0
  79. data/spec/support/w3c_namespace_helpers.rb +2 -1
  80. metadata +29 -2
@@ -7,6 +7,10 @@ module Moxml
7
7
  module Adapter
8
8
  class Nokogiri < Base
9
9
  class << self
10
+ def attachments
11
+ @attachments ||= Moxml::NativeAttachment.new
12
+ end
13
+
10
14
  def set_root(doc, element)
11
15
  doc.root = element
12
16
  end
@@ -31,7 +35,7 @@ module Moxml
31
35
 
32
36
  # Use provided context if available, otherwise create new one
33
37
  ctx = _context || Context.new(:nokogiri)
34
- DocumentBuilder.new(ctx).build(native_doc)
38
+ Document.new(native_doc, ctx)
35
39
  end
36
40
 
37
41
  # SAX parsing implementation for Nokogiri
@@ -47,7 +51,7 @@ module Moxml
47
51
  parser = ::Nokogiri::XML::SAX::Parser.new(bridge)
48
52
 
49
53
  # Parse
50
- if xml.respond_to?(:read)
54
+ if xml.is_a?(IO) || xml.is_a?(StringIO)
51
55
  parser.parse(xml)
52
56
  else
53
57
  parser.parse(xml.to_s)
@@ -202,7 +206,7 @@ module Moxml
202
206
  end
203
207
 
204
208
  def root(document)
205
- document.respond_to?(:root) ? document.root : document.children.first
209
+ document.is_a?(::Nokogiri::XML::Document) ? document.root : document.children.first
206
210
  end
207
211
 
208
212
  def attribute_element(attr)
@@ -241,8 +245,8 @@ module Moxml
241
245
  encoding = declaration_attribute(child, "encoding")
242
246
  standalone = declaration_attribute(child, "standalone")
243
247
 
244
- # Nokogiri's xml_decl can only be set via instance variable
245
- element.instance_variable_set(:@xml_decl, {
248
+ # Store declaration state in attachment map
249
+ attachments.set(element, :xml_decl, {
246
250
  version: version,
247
251
  encoding: encoding,
248
252
  standalone: standalone,
@@ -273,7 +277,7 @@ module Moxml
273
277
  node.name == "xml" &&
274
278
  node.parent.is_a?(::Nokogiri::XML::Document)
275
279
  # Clear document's xml_decl when removing declaration
276
- node.parent.instance_variable_set(:@xml_decl, nil)
280
+ attachments.set(node.parent, :xml_decl, nil)
277
281
  end
278
282
 
279
283
  node.remove
@@ -387,13 +391,12 @@ module Moxml
387
391
  # Handle declaration option
388
392
  # Priority:
389
393
  # 1. Explicit no_declaration option
390
- # 2. Check Nokogiri's internal @xml_decl (when remove is called, this becomes nil)
394
+ # 2. Check attachment-stored xml_decl (when remove is called, this becomes nil)
391
395
  if options.key?(:no_declaration)
392
396
  save_options |= ::Nokogiri::XML::Node::SaveOptions::NO_DECLARATION if options[:no_declaration]
393
- elsif node.respond_to?(:instance_variable_get) &&
394
- node.instance_variable_defined?(:@xml_decl)
395
- # Nokogiri's internal state - if nil, declaration was removed
396
- xml_decl = node.instance_variable_get(:@xml_decl)
397
+ elsif attachments.key?(node, :xml_decl)
398
+ # State stored in attachment - if nil, declaration was removed
399
+ xml_decl = attachments.get(node, :xml_decl)
397
400
  save_options |= ::Nokogiri::XML::Node::SaveOptions::NO_DECLARATION if xml_decl.nil?
398
401
  end
399
402
 
@@ -404,6 +407,14 @@ module Moxml
404
407
  )
405
408
  end
406
409
 
410
+ def has_declaration?(native_doc, wrapper)
411
+ if attachments.key?(native_doc, :xml_decl)
412
+ !attachments.get(native_doc, :xml_decl).nil?
413
+ else
414
+ wrapper.has_xml_declaration
415
+ end
416
+ end
417
+
407
418
  private
408
419
 
409
420
  def build_declaration_attrs(version, encoding, standalone)
@@ -1,8 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "base"
4
- require_relative "customized_oga/xml_generator"
5
- require_relative "customized_oga/xml_declaration"
4
+ require_relative "customized_oga"
6
5
  require "oga"
7
6
 
8
7
  module Moxml
@@ -12,6 +11,10 @@ module Moxml
12
11
  # Standard XML entities handled natively by parsers
13
12
  STANDARD_XML_ENTITIES = %w[amp lt gt quot apos].freeze
14
13
 
14
+ def attachments
15
+ @attachments ||= Moxml::NativeAttachment.new
16
+ end
17
+
15
18
  def set_root(doc, element)
16
19
  # Clear existing root element if any - Oga's NodeSet needs special handling
17
20
  # We need to manually remove elements since NodeSet doesn't support clear or delete_if
@@ -46,7 +49,7 @@ module Moxml
46
49
  def sax_parse(xml, handler)
47
50
  bridge = OgaSAXBridge.new(handler)
48
51
 
49
- xml_string = xml.respond_to?(:read) ? xml.read : xml.to_s
52
+ xml_string = xml.is_a?(IO) || xml.is_a?(StringIO) ? xml.read : xml.to_s
50
53
 
51
54
  # Manually call start_document (Oga doesn't)
52
55
  handler.on_start_document
@@ -72,6 +75,17 @@ module Moxml
72
75
  ::Oga::XML::Text.new(text: encode_entity_markers(content))
73
76
  end
74
77
 
78
+ def create_native_entity_reference(name)
79
+ text = ::Oga::XML::Text.new
80
+ text.text = "#{ENTITY_MARKER}#{name};"
81
+ attachments.set(text, :entity_name, name)
82
+ text
83
+ end
84
+
85
+ def entity_reference_name(node)
86
+ attachments.get(node, :entity_name)
87
+ end
88
+
75
89
  def create_native_cdata(content, _owner_doc = nil)
76
90
  ::Oga::XML::Cdata.new(text: content)
77
91
  end
@@ -132,10 +146,9 @@ module Moxml
132
146
  end
133
147
 
134
148
  def namespace(element)
135
- if element.respond_to?(:namespace)
149
+ case element
150
+ when ::Oga::XML::Element, ::Oga::XML::Attribute
136
151
  element.namespace
137
- elsif element.respond_to?(:namespaces)
138
- element.namespaces.values.last
139
152
  end
140
153
  rescue NoMethodError
141
154
  # Oga attributes fail with NoMethodError:
@@ -150,7 +163,12 @@ module Moxml
150
163
  def node_type(node)
151
164
  case node
152
165
  when ::Oga::XML::Element then :element
153
- when ::Oga::XML::Text then :text
166
+ when ::Oga::XML::Text
167
+ if attachments.key?(node, :entity_name)
168
+ :entity_reference
169
+ else
170
+ :text
171
+ end
154
172
  when ::Oga::XML::Cdata then :cdata
155
173
  when ::Oga::XML::Comment then :comment
156
174
  when ::Oga::XML::Attribute then :attribute
@@ -178,7 +196,7 @@ module Moxml
178
196
  node.doctype].compact
179
197
  end
180
198
 
181
- return all_children unless node.respond_to?(:children)
199
+ return all_children unless node.is_a?(::Oga::XML::Node) || node.is_a?(::Oga::XML::Document)
182
200
 
183
201
  all_children + node.children.reject do |child|
184
202
  child.is_a?(::Oga::XML::Text) &&
@@ -188,7 +206,7 @@ module Moxml
188
206
  end
189
207
 
190
208
  def parent(node)
191
- node.parent if node.respond_to?(:parent)
209
+ node.parent if node.is_a?(::Oga::XML::Node)
192
210
  end
193
211
 
194
212
  def next_sibling(node)
@@ -215,7 +233,7 @@ module Moxml
215
233
  end
216
234
 
217
235
  def attributes(element)
218
- return [] unless element.respond_to?(:attributes)
236
+ return [] unless element.is_a?(::Oga::XML::Element)
219
237
 
220
238
  # remove attributes-namespaces
221
239
  element.attributes.reject do |attr|
@@ -262,8 +280,8 @@ module Moxml
262
280
  # Special handling for declarations on Oga documents
263
281
  if element.is_a?(::Oga::XML::Document) &&
264
282
  child.is_a?(::Oga::XML::XmlDeclaration)
265
- # Set as document's xml_declaration
266
- element.instance_variable_set(:@xml_declaration, child)
283
+ # Track declaration state in attachment map
284
+ attachments.set(element, :xml_declaration, child)
267
285
  end
268
286
 
269
287
  element.children << child
@@ -295,8 +313,8 @@ module Moxml
295
313
  # Special handling for declarations on Oga documents
296
314
  if node.is_a?(::Oga::XML::XmlDeclaration) &&
297
315
  node.parent.is_a?(::Oga::XML::Document)
298
- # Clear document's xml_declaration when removing declaration
299
- node.parent.instance_variable_set(:@xml_declaration, nil)
316
+ # Clear declaration state in attachment map
317
+ attachments.set(node.parent, :xml_declaration, nil)
300
318
  end
301
319
 
302
320
  node.remove
@@ -316,10 +334,9 @@ module Moxml
316
334
  end
317
335
 
318
336
  def inner_text(node)
319
- text = if node.respond_to?(:inner_text)
337
+ text = if node.is_a?(::Oga::XML::Element)
320
338
  node.inner_text
321
339
  else
322
- # Oga::XML::Text node for example
323
340
  node.text
324
341
  end
325
342
  restore_entity_markers(text)
@@ -327,7 +344,7 @@ module Moxml
327
344
 
328
345
  def set_text_content(node, content)
329
346
  encoded = encode_entity_markers(content)
330
- if node.respond_to?(:inner_text=)
347
+ if node.is_a?(::Oga::XML::Element)
331
348
  node.inner_text = encoded
332
349
  else
333
350
  node.text = encoded
@@ -370,22 +387,32 @@ module Moxml
370
387
  end
371
388
 
372
389
  def namespace_definitions(node)
373
- return [] unless node.respond_to?(:namespaces)
390
+ return [] unless node.is_a?(::Oga::XML::Element)
374
391
 
375
392
  node.namespaces.values
376
393
  end
377
394
 
378
395
  # Doctype accessor methods
396
+ # Note: Oga stores SYSTEM identifier in public_id for SYSTEM doctypes.
397
+ # See: Oga::XML::Doctype puts SYSTEM dtd in public_id, system_id is nil.
379
398
  def doctype_name(native)
380
399
  native.name
381
400
  end
382
401
 
383
402
  def doctype_external_id(native)
384
- native.public_id
403
+ if native.type == "SYSTEM"
404
+ nil
405
+ else
406
+ native.public_id
407
+ end
385
408
  end
386
409
 
387
410
  def doctype_system_id(native)
388
- native.system_id
411
+ if native.type == "SYSTEM"
412
+ native.public_id
413
+ else
414
+ native.system_id
415
+ end
389
416
  end
390
417
 
391
418
  def xpath(node, expression, namespaces = nil)
@@ -430,6 +457,16 @@ module Moxml
430
457
  # Simple entity-only regex with no nested quantifiers
431
458
  ENTITY_REF_REGEX = /&#{ENTITY_PATTERN};/
432
459
 
460
+ def has_declaration?(native_doc, _wrapper)
461
+ decl = attachments.get(native_doc, :xml_declaration)
462
+ if decl.nil? && !attachments.key?(native_doc, :xml_declaration)
463
+ # No attachment entry - check native doc (for parsed documents)
464
+ native_doc.respond_to?(:xml_declaration) && !native_doc.xml_declaration.nil?
465
+ else
466
+ !decl.nil?
467
+ end
468
+ end
469
+
433
470
  private
434
471
 
435
472
  # Convert &entity; back to \x01entity; for Oga text storage.
@@ -463,21 +500,22 @@ module Moxml
463
500
  # We need to handle declaration options ourselves for Document nodes
464
501
  if node.is_a?(::Oga::XML::Document)
465
502
  # Check if we should include declaration
466
- # Priority: explicit option > existence of xml_declaration node
503
+ # Priority: explicit option > existence of xml_declaration (native or attachment)
504
+ effective_xml_declaration = node.xml_declaration || attachments.get(node, :xml_declaration)
467
505
  should_include_decl = if options.key?(:no_declaration)
468
506
  !options[:no_declaration]
469
507
  elsif options.key?(:declaration)
470
508
  options[:declaration]
471
509
  else
472
- # Default: include if document has xml_declaration node
473
- node.xml_declaration ? true : false
510
+ # Default: include if document has xml_declaration
511
+ effective_xml_declaration ? true : false
474
512
  end
475
513
 
476
514
  # Fix: Check if declaration already exists in children
477
515
  # This prevents duplicate declarations when document already has one
478
516
  has_existing_declaration = node.children.any?(::Oga::XML::XmlDeclaration)
479
517
 
480
- if should_include_decl && !node.xml_declaration && !has_existing_declaration
518
+ if should_include_decl && !effective_xml_declaration && !has_existing_declaration
481
519
  # Need to add declaration - create default one
482
520
  output = []
483
521
  output << '<?xml version="1.0" encoding="UTF-8"?>'
@@ -512,7 +550,8 @@ module Moxml
512
550
 
513
551
  # Default: use XmlGenerator
514
552
  # But first check if we need to handle declaration specially
515
- if node.is_a?(::Oga::XML::Document) && node.xml_declaration
553
+ effective_xml_declaration = node.is_a?(::Oga::XML::Document) && (node.xml_declaration || attachments.get(node, :xml_declaration))
554
+ if node.is_a?(::Oga::XML::Document) && effective_xml_declaration
516
555
  # Document has declaration - use custom handling to avoid duplicates
517
556
  output = []
518
557
  xml_declaration_serialized = false