moxml 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.adoc ADDED
@@ -0,0 +1,770 @@
1
+ = Moxml: Modular XML processing for Ruby
2
+
3
+ Moxml provides a unified API for XML processing in Ruby, supporting multiple XML parsing backends (Nokogiri, Ox, and Oga).
4
+
5
+ Moxml ("mox-em-el") stands for "Modular XML" and aims to provide a consistent
6
+ interface for working with XML documents, regardless of the underlying XML
7
+ library.
8
+
9
+ == Installation
10
+
11
+ [source,ruby]
12
+ ----
13
+ gem 'moxml'
14
+ ----
15
+
16
+ == Basic usage
17
+
18
+ === Configuration
19
+
20
+ Configure Moxml to use your preferred XML backend:
21
+
22
+ [source,ruby]
23
+ ----
24
+ require 'moxml'
25
+
26
+ Moxml.configure do |config|
27
+ config.backend = :nokogiri # or :ox, :oga
28
+ end
29
+ ----
30
+
31
+ === Creating and parsing documents
32
+
33
+ [source,ruby]
34
+ ----
35
+ # Create new empty document
36
+ doc = Moxml::Document.new
37
+
38
+ # Parse from string
39
+ doc = Moxml::Document.parse("<root><child>content</child></root>")
40
+
41
+ # Parse with encoding
42
+ doc = Moxml::Document.parse(xml_string, encoding: 'UTF-8')
43
+ ----
44
+
45
+ === Document creation patterns
46
+
47
+ [source,ruby]
48
+ ----
49
+ # Method 1: Create and build
50
+ doc = Moxml::Document.new
51
+ root = doc.create_element('root')
52
+ doc.add_child(root)
53
+
54
+ # Method 2: Parse from string
55
+ doc = Moxml::Document.parse("<root/>")
56
+
57
+ # Method 3: Parse with encoding
58
+ doc = Moxml::Document.parse(xml_string, encoding: 'UTF-8')
59
+
60
+ # Method 4: Parse with options
61
+ doc = Moxml::Document.parse(xml_string, {
62
+ encoding: 'UTF-8',
63
+ strict: true
64
+ })
65
+ ----
66
+
67
+ === Common XML patterns
68
+
69
+ [source,ruby]
70
+ ----
71
+ # Working with namespaces
72
+ doc = Moxml::Document.new
73
+ root = doc.create_element('root')
74
+ root['xmlns:custom'] = 'http://example.com/ns'
75
+ child = doc.create_element('custom:element')
76
+ root.add_child(child)
77
+
78
+ # Creating structured data
79
+ person = doc.create_element('person')
80
+ person['id'] = '123'
81
+ name = doc.create_element('name')
82
+ name.add_child(doc.create_text('John Doe'))
83
+ person.add_child(name)
84
+
85
+ # Working with attributes
86
+ element = doc.create_element('div')
87
+ element['class'] = 'container'
88
+ element['data-id'] = '123'
89
+ element['style'] = 'color: blue'
90
+
91
+ # Handling special characters
92
+ text = doc.create_text('Special chars: < > & " \'')
93
+ cdata = doc.create_cdata('<script>alert("Hello!");</script>')
94
+
95
+ # Processing instructions
96
+ pi = doc.create_processing_instruction('xml-stylesheet',
97
+ 'type="text/xsl" href="style.xsl"')
98
+ doc.add_child(pi)
99
+ ----
100
+
101
+ === Working with elements
102
+
103
+ [source,ruby]
104
+ ----
105
+ # Create new element
106
+ element = Moxml::Element.new('tagname')
107
+
108
+ # Add attributes
109
+ element['class'] = 'content'
110
+
111
+ # Access attributes
112
+ class_attr = element['class']
113
+
114
+ # Add child elements
115
+ child = element.create_element('child')
116
+ element.add_child(child)
117
+
118
+ # Access text content
119
+ text_content = element.text
120
+
121
+ # Add text content
122
+ text = element.create_text('content')
123
+ element.add_child(text)
124
+
125
+ # Chaining operations
126
+ element
127
+ .add_child(doc.create_element('child'))
128
+ .add_child(doc.create_text('content'))
129
+ element['class'] = 'new-class'
130
+
131
+ # Complex element creation
132
+ div = doc.create_element('div')
133
+ div['class'] = 'container'
134
+ div.add_child(doc.create_element('span'))
135
+ .add_child(doc.create_text('Hello'))
136
+ div.add_child(doc.create_element('br'))
137
+ div.add_child(doc.create_text('World'))
138
+ ----
139
+
140
+ === Working with different node types
141
+
142
+ [source,ruby]
143
+ ----
144
+ # Text nodes with various content
145
+ plain_text = Moxml::Text.new("Simple text")
146
+ multiline_text = Moxml::Text.new("Line 1\nLine 2")
147
+ special_chars = Moxml::Text.new("Special: & < > \" '")
148
+
149
+ # CDATA sections for different content types
150
+ script_cdata = Moxml::Cdata.new("function() { alert('Hello!'); }")
151
+ xml_cdata = Moxml::Cdata.new("<data><item>value</item></data>")
152
+ mixed_cdata = Moxml::Cdata.new("Text with ]]> characters")
153
+
154
+ # Comments for documentation
155
+ todo_comment = Moxml::Comment.new("TODO: Add validation")
156
+ section_comment = Moxml::Comment.new("----- Section Break -----")
157
+ debug_comment = Moxml::Comment.new("DEBUG: Remove in production")
158
+
159
+ # Processing instructions for various uses
160
+ style_pi = Moxml::ProcessingInstruction.new(
161
+ "xml-stylesheet",
162
+ 'type="text/css" href="style.css"'
163
+ )
164
+ php_pi = Moxml::ProcessingInstruction.new(
165
+ "php",
166
+ 'echo "<?php echo $var; ?>>";'
167
+ )
168
+ custom_pi = Moxml::ProcessingInstruction.new(
169
+ "custom-processor",
170
+ 'param1="value1" param2="value2"'
171
+ )
172
+ ----
173
+
174
+ === Element manipulation examples
175
+
176
+ [source,ruby]
177
+ ----
178
+ # Building complex structures
179
+ doc = Moxml::Document.new
180
+ root = doc.create_element('html')
181
+ doc.add_child(root)
182
+
183
+ # Create head section
184
+ head = doc.create_element('head')
185
+ root.add_child(head)
186
+
187
+ title = doc.create_element('title')
188
+ title.add_child(doc.create_text('Example Page'))
189
+ head.add_child(title)
190
+
191
+ meta = doc.create_element('meta')
192
+ meta['charset'] = 'UTF-8'
193
+ head.add_child(meta)
194
+
195
+ # Create body section
196
+ body = doc.create_element('body')
197
+ root.add_child(body)
198
+
199
+ div = doc.create_element('div')
200
+ div['class'] = 'container'
201
+ body.add_child(div)
202
+
203
+ # Add multiple paragraphs
204
+ 3.times do |i|
205
+ p = doc.create_element('p')
206
+ p.add_child(doc.create_text("Paragraph #{i + 1}"))
207
+ div.add_child(p)
208
+ end
209
+
210
+ # Working with lists
211
+ ul = doc.create_element('ul')
212
+ div.add_child(ul)
213
+
214
+ ['Item 1', 'Item 2', 'Item 3'].each do |text|
215
+ li = doc.create_element('li')
216
+ li.add_child(doc.create_text(text))
217
+ ul.add_child(li)
218
+ end
219
+
220
+ # Adding link element
221
+ a = doc.create_element('a')
222
+ a['href'] = 'https://example.com'
223
+ a.add_child(doc.create_text('Visit Example'))
224
+ div.add_child(a)
225
+ ----
226
+
227
+ === Advanced node manipulation
228
+
229
+ [source,ruby]
230
+ ----
231
+ # Cloning nodes
232
+ original = doc.create_element('div')
233
+ original['id'] = 'original'
234
+ clone = original.clone
235
+
236
+ # Moving nodes
237
+ target = doc.create_element('target')
238
+ source = doc.create_element('source')
239
+ source.add_child(doc.create_text('Content'))
240
+ target.add_child(source)
241
+
242
+ # Replacing nodes
243
+ old_node = doc.at_xpath('//old')
244
+ new_node = doc.create_element('new')
245
+ old_node.replace(new_node)
246
+
247
+ # Inserting before/after
248
+ reference = doc.create_element('reference')
249
+ before = doc.create_element('before')
250
+ after = doc.create_element('after')
251
+ reference.add_previous_sibling(before)
252
+ reference.add_next_sibling(after)
253
+
254
+ # Conditional manipulation
255
+ element = doc.at_xpath('//conditional')
256
+ if element['flag'] == 'true'
257
+ element.add_child(doc.create_text('Flag is true'))
258
+ else
259
+ element.remove
260
+ end
261
+ ----
262
+
263
+ === Working with namespaces
264
+
265
+ [source,ruby]
266
+ ----
267
+ # Creating namespaced document
268
+ doc = Moxml::Document.new
269
+ root = doc.create_element('root')
270
+ root['xmlns'] = 'http://example.com/default'
271
+ root['xmlns:custom'] = 'http://example.com/custom'
272
+ doc.add_child(root)
273
+
274
+ # Adding namespaced elements
275
+ default_elem = doc.create_element('default-elem')
276
+ custom_elem = doc.create_element('custom:elem')
277
+
278
+ root.add_child(default_elem)
279
+ root.add_child(custom_elem)
280
+
281
+ # Working with attributes in namespaces
282
+ custom_elem['custom:attr'] = 'value'
283
+
284
+ # Accessing namespaced content
285
+ ns_elem = doc.at_xpath('//custom:elem')
286
+ ns_attr = ns_elem['custom:attr']
287
+ ----
288
+
289
+ === Document serialization examples
290
+
291
+ [source,ruby]
292
+ ----
293
+ # Basic serialization
294
+ xml_string = doc.to_xml
295
+
296
+ # Pretty printing with indentation
297
+ formatted_xml = doc.to_xml(
298
+ indent: 2,
299
+ pretty: true
300
+ )
301
+
302
+ # Controlling XML declaration
303
+ with_declaration = doc.to_xml(
304
+ xml_declaration: true,
305
+ encoding: 'UTF-8',
306
+ standalone: 'yes'
307
+ )
308
+
309
+ # Compact output
310
+ minimal_xml = doc.to_xml(
311
+ indent: 0,
312
+ pretty: false,
313
+ xml_declaration: false
314
+ )
315
+
316
+ # Custom formatting
317
+ custom_format = doc.to_xml(
318
+ indent: 4,
319
+ encoding: 'ISO-8859-1',
320
+ xml_declaration: true
321
+ )
322
+ ----
323
+
324
+ == Implementation details
325
+
326
+ === Memory management
327
+
328
+ [source,ruby]
329
+ ----
330
+ # Efficient document handling
331
+ doc = Moxml::Document.parse(large_xml)
332
+ begin
333
+ # Process document
334
+ result = process_document(doc)
335
+ ensure
336
+ # Clear references
337
+ doc = nil
338
+ GC.start
339
+ end
340
+
341
+ # Streaming large node sets
342
+ doc.xpath('//large-set/*').each do |node|
343
+ # Process node
344
+ process_node(node)
345
+ # Clear reference
346
+ node = nil
347
+ end
348
+
349
+ # Handling large collections
350
+ def process_large_nodeset(nodeset)
351
+ nodeset.each do |node|
352
+ yield node if block_given?
353
+ end
354
+ ensure
355
+ # Clear references
356
+ nodeset = nil
357
+ GC.start
358
+ end
359
+ ----
360
+
361
+ === Backend-specific optimizations
362
+
363
+ [source,ruby]
364
+ ----
365
+ # Nokogiri-specific optimizations
366
+ if Moxml.config.backend == :nokogiri
367
+ # Use native CSS selectors
368
+ nodes = doc.native.css('complex > selector')
369
+ nodes.each do |native_node|
370
+ node = Moxml::Node.wrap(native_node)
371
+ # Process node
372
+ end
373
+
374
+ # Use native XPath
375
+ results = doc.native.xpath('//complex/xpath/expression')
376
+ end
377
+
378
+ # Ox-specific optimizations
379
+ if Moxml.config.backend == :ox
380
+ # Use native parsing options
381
+ doc = Moxml::Document.parse(xml, {
382
+ mode: :generic,
383
+ effort: :tolerant,
384
+ smart: true
385
+ })
386
+
387
+ # Direct element creation
388
+ element = Ox::Element.new('name')
389
+ wrapped = Moxml::Element.new(element)
390
+ end
391
+
392
+ # Oga-specific optimizations
393
+ if Moxml.config.backend == :oga
394
+ # Use native parsing features
395
+ doc = Moxml::Document.parse(xml, {
396
+ encoding: 'UTF-8',
397
+ strict: true
398
+ })
399
+
400
+ # Direct access to native methods
401
+ nodes = doc.native.xpath('//element')
402
+ end
403
+ ----
404
+
405
+ === Threading patterns
406
+
407
+ [source,ruby]
408
+ ----
409
+ # Thread-safe document creation
410
+ require 'thread'
411
+
412
+ class ThreadSafeXmlProcessor
413
+ def initialize
414
+ @mutex = Mutex.new
415
+ end
416
+
417
+ def process_document(xml_string)
418
+ @mutex.synchronize do
419
+ doc = Moxml::Document.parse(xml_string)
420
+ # Process document
421
+ result = doc.to_xml
422
+ doc = nil
423
+ result
424
+ end
425
+ end
426
+ end
427
+
428
+ # Parallel document processing
429
+ def process_documents(xml_strings)
430
+ threads = xml_strings.map do |xml|
431
+ Thread.new do
432
+ doc = Moxml::Document.parse(xml)
433
+ # Process document
434
+ doc = nil
435
+ end
436
+ end
437
+ threads.each(&:join)
438
+ end
439
+
440
+ # Thread-local document storage
441
+ Thread.new do
442
+ Thread.current[:document] = Moxml::Document.new
443
+ # Process document
444
+ ensure
445
+ Thread.current[:document] = nil
446
+ end
447
+ ----
448
+
449
+ == Troubleshooting
450
+
451
+ === Common issues and solutions
452
+
453
+ ==== Parsing errors
454
+
455
+ [source,ruby]
456
+ ----
457
+ # Handle malformed XML
458
+ begin
459
+ doc = Moxml::Document.parse(xml_string)
460
+ rescue Moxml::ParseError => e
461
+ puts "Parse error at line #{e.line}, column #{e.column}: #{e.message}"
462
+ # Attempt recovery
463
+ xml_string = cleanup_xml(xml_string)
464
+ retry
465
+ end
466
+
467
+ # Handle encoding issues
468
+ begin
469
+ doc = Moxml::Document.parse(xml_string, encoding: 'UTF-8')
470
+ rescue Moxml::ParseError => e
471
+ if e.message =~ /encoding/
472
+ # Try detecting encoding
473
+ detected_encoding = detect_encoding(xml_string)
474
+ retry if detected_encoding
475
+ end
476
+ raise
477
+ end
478
+ ----
479
+
480
+ ==== Memory issues
481
+
482
+ [source,ruby]
483
+ ----
484
+ # Handle large documents
485
+ def process_large_document(path)
486
+ # Parse the file, then process it one <chunk> element at a time
487
+ File.open(path) do |file|
488
+ doc = Moxml::Document.parse(file)
489
+ doc.xpath('//chunk').each do |chunk|
490
+ process_chunk(chunk)
491
+ chunk = nil
492
+ end
493
+ doc = nil
494
+ end
495
+ GC.start
496
+ end
497
+
498
+ # Monitor memory usage
499
+ require 'get_process_mem'
500
+
501
+ def memory_safe_processing(xml)
502
+ memory = GetProcessMem.new
503
+ initial_memory = memory.mb
504
+
505
+ doc = Moxml::Document.parse(xml)
506
+ result = process_document(doc)
507
+ doc = nil
508
+ GC.start
509
+
510
+ final_memory = memory.mb
511
+ puts "Memory usage: #{final_memory - initial_memory}MB"
512
+
513
+ result
514
+ end
515
+ ----
516
+
517
+ ==== Backend-specific issues
518
+
519
+ [source,ruby]
520
+ ----
521
+ # Handle backend limitations
522
+ def safe_xpath(doc, xpath)
523
+ case Moxml.config.backend
524
+ when :nokogiri
525
+ doc.xpath(xpath)
526
+ when :ox
527
+ # Ox has limited XPath support
528
+ fallback_xpath_search(doc, xpath)
529
+ when :oga
530
+ # Handle Oga-specific XPath syntax
531
+ modified_xpath = adjust_xpath_for_oga(xpath)
532
+ doc.xpath(modified_xpath)
533
+ end
534
+ end
535
+
536
+ # Handle backend switching
537
+ def with_backend(backend)
538
+ original_backend = Moxml.config.backend
539
+ Moxml.config.backend = backend
540
+ yield
541
+ ensure
542
+ Moxml.config.backend = original_backend
543
+ end
544
+ ----
545
+
546
+ === Performance optimization
547
+
548
+ ==== Document creation
549
+
550
+ [source,ruby]
551
+ ----
552
+ # Efficient document building
553
+ def build_large_document
554
+ doc = Moxml::Document.new
555
+ root = doc.create_element('root')
556
+ doc.add_child(root)
557
+
558
+ # Pre-allocate elements
559
+ elements = Array.new(1000) do |i|
560
+ elem = doc.create_element('item')
561
+ elem['id'] = i.to_s
562
+ elem
563
+ end
564
+
565
+ # Batch add elements
566
+ elements.each do |elem|
567
+ root.add_child(elem)
568
+ end
569
+
570
+ doc
571
+ end
572
+
573
+ # Memory-efficient processing
574
+ def process_large_xml(xml_string)
575
+ result = []
576
+ doc = Moxml::Document.parse(xml_string)
577
+
578
+ doc.xpath('//item').each do |item|
579
+ # Process and immediately discard
580
+ result << process_item(item)
581
+ item = nil
582
+ end
583
+
584
+ doc = nil
585
+ GC.start
586
+
587
+ result
588
+ end
589
+ ----
590
+
591
+ ==== Query optimization
592
+
593
+ [source,ruby]
594
+ ----
595
+ # Optimize node selection
596
+ def efficient_node_selection(doc)
597
+ # Cache frequently used nodes
598
+ @header_nodes ||= doc.xpath('//header').to_a
599
+
600
+ # Use specific selectors
601
+ doc.xpath('//specific/path') # Better than '//*[name()="specific"]'
602
+
603
+ # Combine queries when possible
604
+ doc.xpath('//a | //b') # Better than two separate queries
605
+ end
606
+
607
+ # Optimize attribute access
608
+ def efficient_attribute_handling(element)
609
+ # Cache attribute values
610
+ @cached_attrs ||= element.attributes
611
+
612
+ # Direct attribute access
613
+ value = element['attr'] # Better than element.attributes['attr']
614
+
615
+ # Batch attribute updates
616
+ attrs = {'id' => '1', 'class' => 'new', 'data' => 'value'}
617
+ attrs.each { |k,v| element[k] = v }
618
+ end
619
+ ----
620
+
621
+ ==== Serialization optimization
622
+
623
+ [source,ruby]
624
+ ----
625
+ # Efficient output generation
626
+ def optimized_serialization(doc)
627
+ # Minimal output
628
+ compact = doc.to_xml(
629
+ indent: 0,
630
+ pretty: false,
631
+ xml_declaration: false
632
+ )
633
+
634
+ # Balanced formatting
635
+ readable = doc.to_xml(
636
+ indent: 2,
637
+ pretty: true,
638
+ xml_declaration: true
639
+ )
640
+
641
+ # Stream large documents
642
+ File.open('large.xml', 'w') do |file|
643
+ doc.write_to(file, indent: 2)
644
+ end
645
+ end
646
+ ----
647
+
648
+ === Debugging tips
649
+
650
+ ==== Inspection helpers
651
+
652
+ [source,ruby]
653
+ ----
654
+ # Debug node structure
655
+ def inspect_node(node, level = 0)
656
+ indent = " " * level
657
+ puts "#{indent}#{node.class.name}: #{node.name}"
658
+
659
+ if node.respond_to?(:attributes)
660
+ node.attributes.each do |name, attr|
661
+ puts "#{indent} @#{name}=#{attr.value.inspect}"
662
+ end
663
+ end
664
+
665
+ if node.respond_to?(:children)
666
+ node.children.each { |child| inspect_node(child, level + 1) }
667
+ end
668
+ end
669
+
670
+ # Track node operations
671
+ def debug_node_operations
672
+ nodes_created = 0
673
+ nodes_removed = 0
674
+
675
+ yield
676
+ ensure
677
+ puts "Nodes created: #{nodes_created}"
678
+ puts "Nodes removed: #{nodes_removed}"
679
+ end
680
+ ----
681
+
682
+ ==== Backend validation
683
+
684
+ [source,ruby]
685
+ ----
686
+ # Verify backend behavior
687
+ def verify_backend_compatibility
688
+ doc = Moxml::Document.new
689
+
690
+ # Test basic operations
691
+ element = doc.create_element('test')
692
+ doc.add_child(element)
693
+
694
+ # Verify node handling
695
+ raise "Node creation failed" unless doc.root
696
+ raise "Node type wrong" unless doc.root.is_a?(Moxml::Element)
697
+
698
+ # Verify serialization
699
+ xml = doc.to_xml
700
+ raise "Serialization failed" unless xml.include?('<test/>')
701
+
702
+ puts "Backend verification successful"
703
+ rescue => e
704
+ puts "Backend verification failed: #{e.message}"
705
+ end
706
+ ----
707
+
708
+ == Error handling
709
+
710
+ Moxml provides unified error handling through the following exception classes:
711
+
712
+ * `Moxml::Error` - Base error class
713
+ * `Moxml::ParseError` - XML parsing errors
714
+ * `Moxml::ArgumentError` - Invalid argument errors
715
+
716
+ === Error handling patterns
717
+
718
+ [source,ruby]
719
+ ----
720
+ # Handle parsing errors
721
+ begin
722
+ doc = Moxml::Document.parse(xml_string)
723
+ rescue Moxml::ParseError => e
724
+ logger.error "Parse error: #{e.message}"
725
+ logger.error "At line #{e.line}, column #{e.column}"
726
+ raise
727
+ end
728
+
729
+ # Handle invalid operations
730
+ begin
731
+ element['invalid/name'] = 'value'
732
+ rescue Moxml::ArgumentError => e
733
+ logger.warn "Invalid operation: #{e.message}"
734
+ # Use alternative approach
735
+ end
736
+
737
+ # Custom error handling
738
+ class XmlProcessor
739
+ def process(xml)
740
+ doc = Moxml::Document.parse(xml)
741
+ yield doc
742
+ rescue Moxml::Error => e
743
+ handle_moxml_error(e)
744
+ rescue StandardError => e
745
+ handle_standard_error(e)
746
+ ensure
747
+ doc = nil
748
+ end
749
+ end
750
+ ----
751
+
752
+ == Contributing
753
+
754
+ Bug reports and pull requests are welcome on GitHub at
755
+ https://github.com/lutaml/moxml.
756
+
757
+ === Development guidelines
758
+
759
+ * Follow the Ruby style guide
760
+ * Add tests for new features
761
+ * Update documentation
762
+ * Ensure backwards compatibility
763
+ * Consider performance implications
764
+ * Test with all supported backends
765
+
766
+ == Copyright and license
767
+
768
+ Copyright Ribose.
769
+
770
+ The gem is available as open source under the terms of the BSD-2-Clause License.