moxml 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/dependent-repos.json +5 -0
- data/.github/workflows/dependent-tests.yml +20 -0
- data/.github/workflows/docs.yml +59 -0
- data/.github/workflows/rake.yml +10 -10
- data/.github/workflows/release.yml +5 -3
- data/.gitignore +37 -0
- data/.rubocop.yml +15 -7
- data/.rubocop_todo.yml +224 -43
- data/Gemfile +14 -9
- data/LICENSE.md +6 -2
- data/README.adoc +535 -373
- data/Rakefile +53 -0
- data/benchmarks/.gitignore +6 -0
- data/benchmarks/generate_report.rb +550 -0
- data/docs/Gemfile +13 -0
- data/docs/_config.yml +138 -0
- data/docs/_guides/advanced-features.adoc +87 -0
- data/docs/_guides/development-testing.adoc +165 -0
- data/docs/_guides/index.adoc +51 -0
- data/docs/_guides/modifying-xml.adoc +292 -0
- data/docs/_guides/parsing-xml.adoc +230 -0
- data/docs/_guides/sax-parsing.adoc +603 -0
- data/docs/_guides/working-with-documents.adoc +118 -0
- data/docs/_guides/xml-declaration.adoc +450 -0
- data/docs/_pages/adapter-compatibility.adoc +369 -0
- data/docs/_pages/adapters/headed-ox.adoc +237 -0
- data/docs/_pages/adapters/index.adoc +97 -0
- data/docs/_pages/adapters/libxml.adoc +285 -0
- data/docs/_pages/adapters/nokogiri.adoc +251 -0
- data/docs/_pages/adapters/oga.adoc +291 -0
- data/docs/_pages/adapters/ox.adoc +56 -0
- data/docs/_pages/adapters/rexml.adoc +292 -0
- data/docs/_pages/best-practices.adoc +429 -0
- data/docs/_pages/compatibility.adoc +467 -0
- data/docs/_pages/configuration.adoc +250 -0
- data/docs/_pages/error-handling.adoc +349 -0
- data/docs/_pages/headed-ox-limitations.adoc +574 -0
- data/docs/_pages/headed-ox.adoc +1025 -0
- data/docs/_pages/index.adoc +35 -0
- data/docs/_pages/installation.adoc +140 -0
- data/docs/_pages/node-api-reference.adoc +49 -0
- data/docs/_pages/performance.adoc +35 -0
- data/docs/_pages/quick-start.adoc +243 -0
- data/docs/_pages/thread-safety.adoc +28 -0
- data/docs/_references/document-api.adoc +407 -0
- data/docs/_references/index.adoc +48 -0
- data/docs/_tutorials/basic-usage.adoc +267 -0
- data/docs/_tutorials/builder-pattern.adoc +342 -0
- data/docs/_tutorials/index.adoc +33 -0
- data/docs/_tutorials/namespace-handling.adoc +324 -0
- data/docs/_tutorials/xpath-queries.adoc +358 -0
- data/docs/index.adoc +122 -0
- data/examples/README.md +124 -0
- data/examples/api_client/README.md +424 -0
- data/examples/api_client/api_client.rb +394 -0
- data/examples/api_client/example_response.xml +48 -0
- data/examples/headed_ox_example/README.md +90 -0
- data/examples/headed_ox_example/headed_ox_demo.rb +71 -0
- data/examples/rss_parser/README.md +194 -0
- data/examples/rss_parser/example_feed.xml +93 -0
- data/examples/rss_parser/rss_parser.rb +189 -0
- data/examples/sax_parsing/README.md +50 -0
- data/examples/sax_parsing/data_extractor.rb +75 -0
- data/examples/sax_parsing/example.xml +21 -0
- data/examples/sax_parsing/large_file.rb +78 -0
- data/examples/sax_parsing/simple_parser.rb +55 -0
- data/examples/web_scraper/README.md +352 -0
- data/examples/web_scraper/example_page.html +201 -0
- data/examples/web_scraper/web_scraper.rb +312 -0
- data/lib/moxml/adapter/base.rb +107 -28
- data/lib/moxml/adapter/customized_libxml/cdata.rb +28 -0
- data/lib/moxml/adapter/customized_libxml/comment.rb +24 -0
- data/lib/moxml/adapter/customized_libxml/declaration.rb +85 -0
- data/lib/moxml/adapter/customized_libxml/element.rb +39 -0
- data/lib/moxml/adapter/customized_libxml/node.rb +44 -0
- data/lib/moxml/adapter/customized_libxml/processing_instruction.rb +31 -0
- data/lib/moxml/adapter/customized_libxml/text.rb +27 -0
- data/lib/moxml/adapter/customized_oga/xml_generator.rb +1 -1
- data/lib/moxml/adapter/customized_ox/attribute.rb +28 -1
- data/lib/moxml/adapter/customized_rexml/formatter.rb +13 -8
- data/lib/moxml/adapter/headed_ox.rb +161 -0
- data/lib/moxml/adapter/libxml.rb +1564 -0
- data/lib/moxml/adapter/nokogiri.rb +156 -9
- data/lib/moxml/adapter/oga.rb +190 -15
- data/lib/moxml/adapter/ox.rb +322 -28
- data/lib/moxml/adapter/rexml.rb +157 -28
- data/lib/moxml/adapter.rb +21 -4
- data/lib/moxml/attribute.rb +6 -0
- data/lib/moxml/builder.rb +40 -4
- data/lib/moxml/config.rb +8 -3
- data/lib/moxml/context.rb +57 -2
- data/lib/moxml/declaration.rb +9 -0
- data/lib/moxml/doctype.rb +13 -1
- data/lib/moxml/document.rb +53 -6
- data/lib/moxml/document_builder.rb +34 -5
- data/lib/moxml/element.rb +71 -2
- data/lib/moxml/error.rb +175 -6
- data/lib/moxml/node.rb +155 -4
- data/lib/moxml/node_set.rb +34 -0
- data/lib/moxml/sax/block_handler.rb +194 -0
- data/lib/moxml/sax/element_handler.rb +124 -0
- data/lib/moxml/sax/handler.rb +113 -0
- data/lib/moxml/sax.rb +31 -0
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils/encoder.rb +4 -4
- data/lib/moxml/xml_utils.rb +7 -4
- data/lib/moxml/xpath/ast/node.rb +159 -0
- data/lib/moxml/xpath/cache.rb +91 -0
- data/lib/moxml/xpath/compiler.rb +1770 -0
- data/lib/moxml/xpath/context.rb +26 -0
- data/lib/moxml/xpath/conversion.rb +124 -0
- data/lib/moxml/xpath/engine.rb +52 -0
- data/lib/moxml/xpath/errors.rb +101 -0
- data/lib/moxml/xpath/lexer.rb +304 -0
- data/lib/moxml/xpath/parser.rb +485 -0
- data/lib/moxml/xpath/ruby/generator.rb +269 -0
- data/lib/moxml/xpath/ruby/node.rb +193 -0
- data/lib/moxml/xpath.rb +37 -0
- data/lib/moxml.rb +5 -2
- data/moxml.gemspec +3 -1
- data/old-specs/moxml/adapter/customized_libxml/.gitkeep +6 -0
- data/spec/consistency/README.md +77 -0
- data/spec/{moxml/examples/adapter_spec.rb → consistency/adapter_parity_spec.rb} +4 -4
- data/spec/examples/README.md +75 -0
- data/spec/{support/shared_examples/examples/attribute.rb → examples/attribute_examples_spec.rb} +1 -1
- data/spec/{support/shared_examples/examples/basic_usage.rb → examples/basic_usage_spec.rb} +2 -2
- data/spec/{support/shared_examples/examples/namespace.rb → examples/namespace_examples_spec.rb} +3 -3
- data/spec/{support/shared_examples/examples/readme_examples.rb → examples/readme_examples_spec.rb} +6 -4
- data/spec/{support/shared_examples/examples/xpath.rb → examples/xpath_examples_spec.rb} +10 -6
- data/spec/integration/README.md +71 -0
- data/spec/{moxml/all_with_adapters_spec.rb → integration/all_adapters_spec.rb} +3 -2
- data/spec/integration/headed_ox_integration_spec.rb +326 -0
- data/spec/{support → integration}/shared_examples/edge_cases.rb +37 -10
- data/spec/integration/shared_examples/high_level/.gitkeep +0 -0
- data/spec/{support/shared_examples/context.rb → integration/shared_examples/high_level/context_behavior.rb} +2 -1
- data/spec/{support/shared_examples/integration.rb → integration/shared_examples/integration_workflows.rb} +23 -6
- data/spec/integration/shared_examples/node_wrappers/.gitkeep +0 -0
- data/spec/{support/shared_examples/cdata.rb → integration/shared_examples/node_wrappers/cdata_behavior.rb} +6 -1
- data/spec/{support/shared_examples/comment.rb → integration/shared_examples/node_wrappers/comment_behavior.rb} +2 -1
- data/spec/{support/shared_examples/declaration.rb → integration/shared_examples/node_wrappers/declaration_behavior.rb} +5 -5
- data/spec/{support/shared_examples/doctype.rb → integration/shared_examples/node_wrappers/doctype_behavior.rb} +2 -2
- data/spec/{support/shared_examples/document.rb → integration/shared_examples/node_wrappers/document_behavior.rb} +1 -1
- data/spec/{support/shared_examples/node.rb → integration/shared_examples/node_wrappers/node_behavior.rb} +9 -2
- data/spec/{support/shared_examples/node_set.rb → integration/shared_examples/node_wrappers/node_set_behavior.rb} +1 -18
- data/spec/{support/shared_examples/processing_instruction.rb → integration/shared_examples/node_wrappers/processing_instruction_behavior.rb} +6 -2
- data/spec/moxml/README.md +41 -0
- data/spec/moxml/adapter/.gitkeep +0 -0
- data/spec/moxml/adapter/README.md +61 -0
- data/spec/moxml/adapter/base_spec.rb +27 -0
- data/spec/moxml/adapter/headed_ox_spec.rb +311 -0
- data/spec/moxml/adapter/libxml_spec.rb +14 -0
- data/spec/moxml/adapter/ox_spec.rb +9 -8
- data/spec/moxml/adapter/shared_examples/.gitkeep +0 -0
- data/spec/{support/shared_examples/xml_adapter.rb → moxml/adapter/shared_examples/adapter_contract.rb} +39 -12
- data/spec/moxml/adapter_spec.rb +16 -0
- data/spec/moxml/attribute_spec.rb +30 -0
- data/spec/moxml/builder_spec.rb +33 -0
- data/spec/moxml/cdata_spec.rb +31 -0
- data/spec/moxml/comment_spec.rb +31 -0
- data/spec/moxml/config_spec.rb +3 -3
- data/spec/moxml/context_spec.rb +28 -0
- data/spec/moxml/declaration_preservation_spec.rb +217 -0
- data/spec/moxml/declaration_spec.rb +36 -0
- data/spec/moxml/doctype_spec.rb +33 -0
- data/spec/moxml/document_builder_spec.rb +30 -0
- data/spec/moxml/document_spec.rb +105 -0
- data/spec/moxml/element_spec.rb +143 -0
- data/spec/moxml/error_spec.rb +266 -22
- data/spec/{moxml_spec.rb → moxml/moxml_spec.rb} +9 -9
- data/spec/moxml/namespace_spec.rb +32 -0
- data/spec/moxml/node_set_spec.rb +39 -0
- data/spec/moxml/node_spec.rb +37 -0
- data/spec/moxml/processing_instruction_spec.rb +34 -0
- data/spec/moxml/sax_spec.rb +1067 -0
- data/spec/moxml/text_spec.rb +31 -0
- data/spec/moxml/version_spec.rb +14 -0
- data/spec/moxml/xml_utils/.gitkeep +0 -0
- data/spec/moxml/xml_utils/encoder_spec.rb +27 -0
- data/spec/moxml/xml_utils_spec.rb +49 -0
- data/spec/moxml/xpath/ast/node_spec.rb +83 -0
- data/spec/moxml/xpath/axes_spec.rb +296 -0
- data/spec/moxml/xpath/cache_spec.rb +358 -0
- data/spec/moxml/xpath/compiler_spec.rb +406 -0
- data/spec/moxml/xpath/context_spec.rb +210 -0
- data/spec/moxml/xpath/conversion_spec.rb +365 -0
- data/spec/moxml/xpath/fixtures/sample.xml +25 -0
- data/spec/moxml/xpath/functions/boolean_functions_spec.rb +114 -0
- data/spec/moxml/xpath/functions/node_functions_spec.rb +145 -0
- data/spec/moxml/xpath/functions/numeric_functions_spec.rb +164 -0
- data/spec/moxml/xpath/functions/position_functions_spec.rb +93 -0
- data/spec/moxml/xpath/functions/special_functions_spec.rb +89 -0
- data/spec/moxml/xpath/functions/string_functions_spec.rb +381 -0
- data/spec/moxml/xpath/lexer_spec.rb +488 -0
- data/spec/moxml/xpath/parser_integration_spec.rb +210 -0
- data/spec/moxml/xpath/parser_spec.rb +364 -0
- data/spec/moxml/xpath/ruby/generator_spec.rb +421 -0
- data/spec/moxml/xpath/ruby/node_spec.rb +291 -0
- data/spec/moxml/xpath_capabilities_spec.rb +199 -0
- data/spec/moxml/xpath_spec.rb +77 -0
- data/spec/performance/README.md +83 -0
- data/spec/performance/benchmark_spec.rb +64 -0
- data/spec/{support/shared_examples/examples/memory.rb → performance/memory_usage_spec.rb} +4 -1
- data/spec/{support/shared_examples/examples/thread_safety.rb → performance/thread_safety_spec.rb} +3 -1
- data/spec/performance/xpath_benchmark_spec.rb +259 -0
- data/spec/spec_helper.rb +58 -1
- data/spec/support/xml_matchers.rb +1 -1
- metadata +178 -34
- data/spec/support/shared_examples/examples/benchmark_spec.rb +0 -51
- /data/spec/{support/shared_examples/builder.rb → integration/shared_examples/high_level/builder_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/document_builder.rb → integration/shared_examples/high_level/document_builder_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/attribute.rb → integration/shared_examples/node_wrappers/attribute_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/element.rb → integration/shared_examples/node_wrappers/element_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/namespace.rb → integration/shared_examples/node_wrappers/namespace_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/text.rb → integration/shared_examples/node_wrappers/text_behavior.rb} +0 -0
|
@@ -0,0 +1,603 @@
|
|
|
1
|
+
= SAX Parsing Guide
|
|
2
|
+
:toc:
|
|
3
|
+
:toclevels: 3
|
|
4
|
+
|
|
5
|
+
== Introduction
|
|
6
|
+
|
|
7
|
+
SAX (Simple API for XML) provides event-driven XML parsing, allowing you to process XML documents efficiently without loading the entire structure into memory. This is particularly useful for large files or streaming scenarios.
|
|
8
|
+
|
|
9
|
+
Moxml provides a consistent SAX interface across all supported adapters, with three handler types to suit different use cases.
|
|
10
|
+
|
|
11
|
+
== When to use SAX vs DOM
|
|
12
|
+
|
|
13
|
+
=== Use SAX when
|
|
14
|
+
|
|
15
|
+
* Processing files >100MB in size
|
|
16
|
+
* Memory is constrained
|
|
17
|
+
* You only need specific data from the document
|
|
18
|
+
* Streaming data processing is required
|
|
19
|
+
* Linear, forward-only traversal is sufficient
|
|
20
|
+
|
|
21
|
+
=== Use DOM when
|
|
22
|
+
|
|
23
|
+
* Need random access to any part of the document
|
|
24
|
+
* Need to modify XML structure
|
|
25
|
+
* Working with small documents (<10MB)
|
|
26
|
+
* Need XPath queries for complex selections
|
|
27
|
+
* Need to navigate backwards or access parent nodes
|
|
28
|
+
|
|
29
|
+
== Handler types
|
|
30
|
+
|
|
31
|
+
Moxml provides three handler types, each suited for different scenarios.
|
|
32
|
+
|
|
33
|
+
=== Base handler
|
|
34
|
+
|
|
35
|
+
The base handler provides a minimal interface: override only the events you need.
|
|
36
|
+
|
|
37
|
+
[source,ruby]
|
|
38
|
+
----
|
|
39
|
+
class MyHandler < Moxml::SAX::Handler
|
|
40
|
+
def on_start_element(name, attributes = {}, namespaces = {})
|
|
41
|
+
puts "Element: #{name}"
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def on_characters(text)
|
|
45
|
+
puts "Text: #{text}"
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
context = Moxml.new(:nokogiri)
|
|
50
|
+
context.sax_parse(xml, MyHandler.new)
|
|
51
|
+
----
|
|
52
|
+
|
|
53
|
+
=== ElementHandler
|
|
54
|
+
|
|
55
|
+
Adds element stack tracking and path utilities for more sophisticated parsing.
|
|
56
|
+
|
|
57
|
+
[source,ruby]
|
|
58
|
+
----
|
|
59
|
+
class DataExtractor < Moxml::SAX::ElementHandler
|
|
60
|
+
def on_start_element(name, attributes = {}, namespaces = {})
|
|
61
|
+
super # Important: updates the stack
|
|
62
|
+
|
|
63
|
+
if path_matches?(/book\/title$/)
|
|
64
|
+
# We're inside book/title element
|
|
65
|
+
@capturing = true
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
----
|
|
70
|
+
|
|
71
|
+
**Utilities provided:**
|
|
72
|
+
|
|
73
|
+
* `element_stack` - Array of open elements
|
|
74
|
+
* `current_element()` - Current element name
|
|
75
|
+
* `parent_element()` - Parent element name
|
|
76
|
+
* `in_element?(name)` - Check if inside element
|
|
77
|
+
* `path_matches?(pattern)` - Match current path with regex
|
|
78
|
+
* `path_string(sep)` - Get path as string (default separator: "/")
|
|
79
|
+
* `depth()` - Current nesting level
|
|
80
|
+
|
|
81
|
+
=== BlockHandler
|
|
82
|
+
|
|
83
|
+
DSL for simple cases without defining a class.
|
|
84
|
+
|
|
85
|
+
[source,ruby]
|
|
86
|
+
----
|
|
87
|
+
context.sax_parse(xml) do
|
|
88
|
+
start_element { |name, attrs| puts name }
|
|
89
|
+
characters { |text| puts text unless text.strip.empty? }
|
|
90
|
+
end_element { |name| puts "End: #{name}" }
|
|
91
|
+
end
|
|
92
|
+
----
|
|
93
|
+
|
|
94
|
+
== Event reference
|
|
95
|
+
|
|
96
|
+
=== Document lifecycle
|
|
97
|
+
|
|
98
|
+
[source,ruby]
|
|
99
|
+
----
|
|
100
|
+
def on_start_document
|
|
101
|
+
# Called once at document start
|
|
102
|
+
# Initialize any document-level state here
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def on_end_document
|
|
106
|
+
# Called once at document end
|
|
107
|
+
# Clean up, finalize processing
|
|
108
|
+
end
|
|
109
|
+
----
|
|
110
|
+
|
|
111
|
+
=== Element events
|
|
112
|
+
|
|
113
|
+
[source,ruby]
|
|
114
|
+
----
|
|
115
|
+
def on_start_element(name, attributes = {}, namespaces = {})
|
|
116
|
+
# name: Element name (String)
|
|
117
|
+
# attributes: Hash<String, String> - regular attributes
|
|
118
|
+
# namespaces: Hash<String|nil, String> - prefix => uri
|
|
119
|
+
# nil prefix = default namespace (xmlns="...")
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
def on_end_element(name)
|
|
123
|
+
# name: Element name (String)
|
|
124
|
+
# Signals element is closing
|
|
125
|
+
end
|
|
126
|
+
----
|
|
127
|
+
|
|
128
|
+
=== Content events
|
|
129
|
+
|
|
130
|
+
[source,ruby]
|
|
131
|
+
----
|
|
132
|
+
def on_characters(text)
|
|
133
|
+
# Called for text content
|
|
134
|
+
# May be called multiple times for single text node
|
|
135
|
+
# Accumulate text if needed
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
def on_cdata(text)
|
|
139
|
+
# Called for <![CDATA[...]]> sections
|
|
140
|
+
# Not supported by Ox adapter
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
def on_comment(text)
|
|
144
|
+
# Called for <!-- ... --> comments
|
|
145
|
+
# Not supported by Ox adapter
|
|
146
|
+
end
|
|
147
|
+
----
|
|
148
|
+
|
|
149
|
+
=== Processing instructions
|
|
150
|
+
|
|
151
|
+
[source,ruby]
|
|
152
|
+
----
|
|
153
|
+
def on_processing_instruction(target, data)
|
|
154
|
+
# Called for <?target data?>
|
|
155
|
+
# Example: <?xml-stylesheet type="text/xsl"?>
|
|
156
|
+
# Not supported by Ox adapter
|
|
157
|
+
end
|
|
158
|
+
----
|
|
159
|
+
|
|
160
|
+
=== Error handling
|
|
161
|
+
|
|
162
|
+
[source,ruby]
|
|
163
|
+
----
|
|
164
|
+
def on_error(error)
|
|
165
|
+
# error is Moxml::ParseError
|
|
166
|
+
# Default: raises the error
|
|
167
|
+
# Override to handle differently
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
def on_warning(message)
|
|
171
|
+
# Non-fatal warnings
|
|
172
|
+
# Default: ignores
|
|
173
|
+
end
|
|
174
|
+
----
|
|
175
|
+
|
|
176
|
+
== Best practices
|
|
177
|
+
|
|
178
|
+
=== Memory management
|
|
179
|
+
|
|
180
|
+
[source,ruby]
|
|
181
|
+
----
|
|
182
|
+
class MemoryEfficientHandler < Moxml::SAX::Handler
|
|
183
|
+
def initialize(output_stream)
|
|
184
|
+
super()
|
|
185
|
+
@output = output_stream
|
|
186
|
+
@current_text = "".dup # Mutable string
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
def on_characters(text)
|
|
190
|
+
@current_text << text # Accumulate
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def on_end_element(name)
|
|
194
|
+
@output.puts @current_text.strip
|
|
195
|
+
@current_text = "".dup # Reset - don't accumulate memory
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
----
|
|
199
|
+
|
|
200
|
+
=== String accumulation
|
|
201
|
+
|
|
202
|
+
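String literals are frozen in files that use the `# frozen_string_literal: true` magic comment (available since Ruby 2.3). Appending to a frozen string raises `FrozenError`, so always create buffers with `.dup`: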
|
|
203
|
+
|
|
204
|
+
[source,ruby]
|
|
205
|
+
----
|
|
206
|
+
# WRONG:
|
|
207
|
+
@text = "" # Frozen literal!
|
|
208
|
+
|
|
209
|
+
# RIGHT:
|
|
210
|
+
@text = "".dup # Mutable string
|
|
211
|
+
----
|
|
212
|
+
|
|
213
|
+
=== Error recovery
|
|
214
|
+
|
|
215
|
+
[source,ruby]
|
|
216
|
+
----
|
|
217
|
+
class RobustHandler < Moxml::SAX::Handler
|
|
218
|
+
def initialize
|
|
219
|
+
super
|
|
220
|
+
@errors = []
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
def on_error(error)
|
|
224
|
+
# Log but don't crash
|
|
225
|
+
warn "Parse error: #{error.message}"
|
|
226
|
+
@errors << error
|
|
227
|
+
# Don't re-raise - allows parsing to continue if possible
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
def on_warning(message)
|
|
231
|
+
warn "Warning: #{message}"
|
|
232
|
+
end
|
|
233
|
+
end
|
|
234
|
+
----
|
|
235
|
+
|
|
236
|
+
== Adapter-specific notes
|
|
237
|
+
|
|
238
|
+
=== Nokogiri
|
|
239
|
+
|
|
240
|
+
* ✅ Full SAX support
|
|
241
|
+
* ✅ All 10 event types
|
|
242
|
+
* ✅ Line/column information in errors
|
|
243
|
+
* **Best choice for production use**
|
|
244
|
+
|
|
245
|
+
=== REXML
|
|
246
|
+
|
|
247
|
+
* ✅ Full SAX support
|
|
248
|
+
* ✅ Pure Ruby (no C dependencies)
|
|
249
|
+
* ✅ Always available (stdlib)
|
|
250
|
+
* ⚠️ Slower than C-based parsers
|
|
251
|
+
* **Best for portability**
|
|
252
|
+
|
|
253
|
+
=== Oga
|
|
254
|
+
|
|
255
|
+
* ✅ Full SAX support
|
|
256
|
+
* ✅ Pure Ruby
|
|
257
|
+
* ✅ Modern API
|
|
258
|
+
* ⚠️ May be lenient with malformed XML
|
|
259
|
+
* **Good for JRuby/TruffleRuby**
|
|
260
|
+
|
|
261
|
+
=== Ox
|
|
262
|
+
|
|
263
|
+
* ✅ Fast parsing
|
|
264
|
+
* ✅ Core events supported (start/end element, text)
|
|
265
|
+
* ❌ No separate CDATA events (delivered as text)
|
|
266
|
+
* ❌ No comment events
|
|
267
|
+
* ❌ No processing instruction events
|
|
268
|
+
* **Best for simple, fast parsing**
|
|
269
|
+
|
|
270
|
+
=== LibXML
|
|
271
|
+
|
|
272
|
+
* ✅ Full SAX support
|
|
273
|
+
* ✅ Fast (C-based)
|
|
274
|
+
* ✅ Alternative to Nokogiri
|
|
275
|
+
* **Good for performance**
|
|
276
|
+
|
|
277
|
+
=== HeadedOx
|
|
278
|
+
|
|
279
|
+
* Same as Ox (inherits implementation)
|
|
280
|
+
* ✅ Fast parsing
|
|
281
|
+
* ❌ Same limitations as Ox
|
|
282
|
+
|
|
283
|
+
== Common patterns
|
|
284
|
+
|
|
285
|
+
=== Extract specific data
|
|
286
|
+
|
|
287
|
+
[source,ruby]
|
|
288
|
+
----
|
|
289
|
+
class BookExtractor < Moxml::SAX::ElementHandler
|
|
290
|
+
attr_reader :books
|
|
291
|
+
|
|
292
|
+
def initialize
|
|
293
|
+
super
|
|
294
|
+
@books = []
|
|
295
|
+
@current_book = nil
|
|
296
|
+
@current_field = nil
|
|
297
|
+
@current_text = "".dup
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
def on_start_element(name, attributes = {}, namespaces = {})
|
|
301
|
+
super
|
|
302
|
+
case name
|
|
303
|
+
when "book"
|
|
304
|
+
@current_book = { id: attributes["id"] }
|
|
305
|
+
when "title", "author", "price"
|
|
306
|
+
@current_field = name
|
|
307
|
+
@current_text = "".dup
|
|
308
|
+
end
|
|
309
|
+
end
|
|
310
|
+
|
|
311
|
+
def on_characters(text)
|
|
312
|
+
@current_text << text if @current_field
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
def on_end_element(name)
|
|
316
|
+
case name
|
|
317
|
+
when "title", "author"
|
|
318
|
+
@current_book[@current_field.to_sym] = @current_text.strip if @current_book
|
|
319
|
+
@current_field = nil
|
|
320
|
+
when "price"
|
|
321
|
+
@current_book[:price] = @current_text.strip.to_f if @current_book
|
|
322
|
+
@current_field = nil
|
|
323
|
+
when "book"
|
|
324
|
+
@books << @current_book if @current_book
|
|
325
|
+
@current_book = nil
|
|
326
|
+
end
|
|
327
|
+
super
|
|
328
|
+
end
|
|
329
|
+
end
|
|
330
|
+
|
|
331
|
+
handler = BookExtractor.new
|
|
332
|
+
context.sax_parse(xml, handler)
|
|
333
|
+
puts handler.books.inspect
|
|
334
|
+
----
|
|
335
|
+
|
|
336
|
+
=== Stream processing
|
|
337
|
+
|
|
338
|
+
[source,ruby]
|
|
339
|
+
----
|
|
340
|
+
class StreamProcessor < Moxml::SAX::Handler
|
|
341
|
+
def initialize(output)
|
|
342
|
+
super()
|
|
343
|
+
@output = output
|
|
344
|
+
@current_record = nil
|
|
345
|
+
end
|
|
346
|
+
|
|
347
|
+
def on_start_element(name, attributes = {}, namespaces = {})
|
|
348
|
+
if name == "record"
|
|
349
|
+
@current_record = {}
|
|
350
|
+
end
|
|
351
|
+
end
|
|
352
|
+
|
|
353
|
+
def on_end_element(name)
|
|
354
|
+
if name == "record" && @current_record
|
|
355
|
+
process_record(@current_record)
|
|
356
|
+
@current_record = nil # Free memory immediately
|
|
357
|
+
end
|
|
358
|
+
end
|
|
359
|
+
|
|
360
|
+
private
|
|
361
|
+
|
|
362
|
+
def process_record(record)
|
|
363
|
+
# Process and write immediately - don't accumulate
|
|
364
|
+
@output.puts record.to_json
|
|
365
|
+
end
|
|
366
|
+
end
|
|
367
|
+
----
|
|
368
|
+
|
|
369
|
+
=== Path-based filtering
|
|
370
|
+
|
|
371
|
+
[source,ruby]
|
|
372
|
+
----
|
|
373
|
+
class PathMatcher < Moxml::SAX::ElementHandler
|
|
374
|
+
def on_start_element(name, attributes = {}, namespaces = {})
|
|
375
|
+
super
|
|
376
|
+
|
|
377
|
+
# Match exact path
|
|
378
|
+
if path_matches?(%r{^/catalog/book/title$})
|
|
379
|
+
puts "Found book title at depth #{depth}"
|
|
380
|
+
end
|
|
381
|
+
|
|
382
|
+
# Match pattern
|
|
383
|
+
if path_matches?(/\/book\//)
|
|
384
|
+
puts "Inside a book element somewhere"
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
# Check current element
|
|
388
|
+
if current_element == "price" && in_element?("book")
|
|
389
|
+
puts "Found price inside book"
|
|
390
|
+
end
|
|
391
|
+
end
|
|
392
|
+
end
|
|
393
|
+
----
|
|
394
|
+
|
|
395
|
+
=== Counting and statistics
|
|
396
|
+
|
|
397
|
+
[source,ruby]
|
|
398
|
+
----
|
|
399
|
+
class StatsCollector < Moxml::SAX::ElementHandler
|
|
400
|
+
attr_reader :stats
|
|
401
|
+
|
|
402
|
+
def initialize
|
|
403
|
+
super
|
|
404
|
+
@stats = Hash.new(0)
|
|
405
|
+
end
|
|
406
|
+
|
|
407
|
+
def on_start_element(name, attributes = {}, namespaces = {})
|
|
408
|
+
super
|
|
409
|
+
@stats[:elements] += 1
|
|
410
|
+
@stats[:by_name] = Hash.new(0) unless @stats.key?(:by_name)
|
|
411
|
+
@stats[:by_name][name] += 1
|
|
412
|
+
@stats[:max_depth] = [stats[:max_depth], depth].max
|
|
413
|
+
end
|
|
414
|
+
|
|
415
|
+
def on_characters(text)
|
|
416
|
+
@stats[:text_nodes] += 1 unless text.strip.empty?
|
|
417
|
+
end
|
|
418
|
+
end
|
|
419
|
+
----
|
|
420
|
+
|
|
421
|
+
=== Using block handler for quick scripts
|
|
422
|
+
|
|
423
|
+
[source,ruby]
|
|
424
|
+
----
|
|
425
|
+
# Quick data extraction
|
|
426
|
+
titles = []
|
|
427
|
+
context.sax_parse(xml) do
|
|
428
|
+
start_element do |name, attrs|
|
|
429
|
+
@in_title = (name == "title")
|
|
430
|
+
@text = "".dup if @in_title
|
|
431
|
+
end
|
|
432
|
+
|
|
433
|
+
characters do |text|
|
|
434
|
+
@text << text if @in_title
|
|
435
|
+
end
|
|
436
|
+
|
|
437
|
+
end_element do |name|
|
|
438
|
+
if name == "title"
|
|
439
|
+
titles << @text.strip
|
|
440
|
+
@in_title = false
|
|
441
|
+
end
|
|
442
|
+
end
|
|
443
|
+
end
|
|
444
|
+
|
|
445
|
+
puts titles
|
|
446
|
+
----
|
|
447
|
+
|
|
448
|
+
== Performance tips
|
|
449
|
+
|
|
450
|
+
=== Minimize object creation
|
|
451
|
+
|
|
452
|
+
[source,ruby]
|
|
453
|
+
----
|
|
454
|
+
# SLOW: Creates new string each time
|
|
455
|
+
def on_characters(text)
|
|
456
|
+
@text = @text + text
|
|
457
|
+
end
|
|
458
|
+
|
|
459
|
+
# FAST: Mutates existing string
|
|
460
|
+
def on_characters(text)
|
|
461
|
+
@text << text
|
|
462
|
+
end
|
|
463
|
+
----
|
|
464
|
+
|
|
465
|
+
=== Batch output operations
|
|
466
|
+
|
|
467
|
+
[source,ruby]
|
|
468
|
+
----
|
|
469
|
+
# SLOW: Write each record individually
|
|
470
|
+
def on_end_element(name)
|
|
471
|
+
@output.puts record if name == "record"
|
|
472
|
+
end
|
|
473
|
+
|
|
474
|
+
# FAST: Buffer and write in batches
|
|
475
|
+
def on_end_element(name)
|
|
476
|
+
if name == "record"
|
|
477
|
+
@buffer << record
|
|
478
|
+
flush if @buffer.size >= 1000
|
|
479
|
+
end
|
|
480
|
+
end
|
|
481
|
+
----
|
|
482
|
+
|
|
483
|
+
=== Reset state properly
|
|
484
|
+
|
|
485
|
+
[source,ruby]
|
|
486
|
+
----
|
|
487
|
+
def on_end_element(name)
|
|
488
|
+
if name == "book"
|
|
489
|
+
process(@current_book)
|
|
490
|
+
@current_book = nil # Free for GC
|
|
491
|
+
@current_text = "".dup # Fresh string, not ""
|
|
492
|
+
end
|
|
493
|
+
end
|
|
494
|
+
----
|
|
495
|
+
|
|
496
|
+
== Comparison with DOM parsing
|
|
497
|
+
|
|
498
|
+
[cols="1,1,1"]
|
|
499
|
+
|===
|
|
500
|
+
|Feature |SAX |DOM
|
|
501
|
+
|
|
502
|
+
|Memory usage
|
|
503
|
+
|O(1) - constant
|
|
504
|
+
|O(n) - full document
|
|
505
|
+
|
|
506
|
+
|Speed
|
|
507
|
+
|Fast - single pass
|
|
508
|
+
|Slower - builds tree
|
|
509
|
+
|
|
510
|
+
|Random access
|
|
511
|
+
|No
|
|
512
|
+
|Yes
|
|
513
|
+
|
|
514
|
+
|Modification
|
|
515
|
+
|No
|
|
516
|
+
|Yes
|
|
517
|
+
|
|
518
|
+
|XPath queries
|
|
519
|
+
|No
|
|
520
|
+
|Yes
|
|
521
|
+
|
|
522
|
+
|Best for
|
|
523
|
+
|Large files, streaming
|
|
524
|
+
|Small files, complex queries
|
|
525
|
+
|===
|
|
526
|
+
|
|
527
|
+
== Complete example
|
|
528
|
+
|
|
529
|
+
[source,ruby]
|
|
530
|
+
----
|
|
531
|
+
require 'moxml'
|
|
532
|
+
|
|
533
|
+
# Handler that extracts book data and counts elements
|
|
534
|
+
class BookProcessor < Moxml::SAX::ElementHandler
|
|
535
|
+
attr_reader :books, :element_count
|
|
536
|
+
|
|
537
|
+
def initialize
|
|
538
|
+
super
|
|
539
|
+
@books = []
|
|
540
|
+
@element_count = 0
|
|
541
|
+
@current_book = nil
|
|
542
|
+
@current_field = nil
|
|
543
|
+
@text_buffer = "".dup
|
|
544
|
+
end
|
|
545
|
+
|
|
546
|
+
def on_start_document
|
|
547
|
+
puts "Starting XML processing..."
|
|
548
|
+
end
|
|
549
|
+
|
|
550
|
+
def on_start_element(name, attributes = {}, namespaces = {})
|
|
551
|
+
super # Updates stack
|
|
552
|
+
@element_count += 1
|
|
553
|
+
|
|
554
|
+
case name
|
|
555
|
+
when "book"
|
|
556
|
+
@current_book = {
|
|
557
|
+
id: attributes["id"],
|
|
558
|
+
category: attributes["category"]
|
|
559
|
+
}
|
|
560
|
+
when "title", "author", "price", "isbn"
|
|
561
|
+
@current_field = name
|
|
562
|
+
@text_buffer = "".dup
|
|
563
|
+
end
|
|
564
|
+
end
|
|
565
|
+
|
|
566
|
+
def on_characters(text)
|
|
567
|
+
@text_buffer << text if @current_field
|
|
568
|
+
end
|
|
569
|
+
|
|
570
|
+
def on_end_element(name)
|
|
571
|
+
if @current_field == name && @current_book
|
|
572
|
+
value = @text_buffer.strip
|
|
573
|
+
value = value.to_f if name == "price"
|
|
574
|
+
@current_book[name.to_sym] = value
|
|
575
|
+
@current_field = nil
|
|
576
|
+
end
|
|
577
|
+
|
|
578
|
+
if name == "book" && @current_book
|
|
579
|
+
@books << @current_book
|
|
580
|
+
@current_book = nil
|
|
581
|
+
end
|
|
582
|
+
|
|
583
|
+
super # Updates stack
|
|
584
|
+
end
|
|
585
|
+
|
|
586
|
+
def on_end_document
|
|
587
|
+
puts "Processed #{@element_count} elements"
|
|
588
|
+
puts "Found #{@books.size} books"
|
|
589
|
+
end
|
|
590
|
+
end
|
|
591
|
+
|
|
592
|
+
# Usage
|
|
593
|
+
xml = File.read("library.xml")
|
|
594
|
+
context = Moxml.new(:nokogiri) # or :rexml, :oga, :ox
|
|
595
|
+
|
|
596
|
+
handler = BookProcessor.new
|
|
597
|
+
context.sax_parse(xml, handler)
|
|
598
|
+
|
|
599
|
+
# Access results
|
|
600
|
+
handler.books.each do |book|
|
|
601
|
+
puts "#{book[:title]} by #{book[:author]} - $#{book[:price]}"
|
|
602
|
+
end
|
|
603
|
+
----
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Working with documents
|
|
3
|
+
parent: Guides
|
|
4
|
+
nav_order: 3
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
== Working with documents
|
|
8
|
+
|
|
9
|
+
=== Builder pattern
|
|
10
|
+
|
|
11
|
+
The builder pattern provides a clean DSL for creating XML documents:
|
|
12
|
+
|
|
13
|
+
[source,ruby]
|
|
14
|
+
----
|
|
15
|
+
doc = Moxml::Builder.new(Moxml.new).build do
|
|
16
|
+
declaration version: "1.0", encoding: "UTF-8"
|
|
17
|
+
element 'library', xmlns: 'http://example.org/library' do
|
|
18
|
+
element 'book' do
|
|
19
|
+
element 'title' do
|
|
20
|
+
text 'Ruby Programming'
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
----
|
|
26
|
+
|
|
27
|
+
See link:parsing-xml.adoc[Parsing XML Guide] for more document creation patterns.
|
|
28
|
+
|
|
29
|
+
=== Fluent interface API
|
|
30
|
+
|
|
31
|
+
Moxml provides a fluent, chainable API for creating and manipulating XML documents with improved developer experience:
|
|
32
|
+
|
|
33
|
+
[source,ruby]
|
|
34
|
+
----
|
|
35
|
+
# Old way - verbose and less readable
|
|
36
|
+
element = doc.create_element('book')
|
|
37
|
+
element.add_namespace("dc", "http://purl.org/dc/elements/1.1/")
|
|
38
|
+
element["id"] = "123"
|
|
39
|
+
element["type"] = "article"
|
|
40
|
+
child = doc.create_element("title")
|
|
41
|
+
child.text = "Hello"
|
|
42
|
+
element.add_child(child)
|
|
43
|
+
|
|
44
|
+
# New way - fluent and chainable
|
|
45
|
+
element = doc.create_element('book')
|
|
46
|
+
.with_namespace("dc", "http://purl.org/dc/elements/1.1/")
|
|
47
|
+
.set_attributes(id: "123", type: "article")
|
|
48
|
+
.with_child(doc.create_element("title").tap { |t| t.text = "Hello" })
|
|
49
|
+
----
|
|
50
|
+
|
|
51
|
+
==== Chainable element methods
|
|
52
|
+
|
|
53
|
+
[source,ruby]
|
|
54
|
+
----
|
|
55
|
+
# with_namespace - add namespace and return self
|
|
56
|
+
element.with_namespace("dc", "http://purl.org/dc/elements/1.1/")
|
|
57
|
+
|
|
58
|
+
# set_attributes - set multiple attributes at once
|
|
59
|
+
element.set_attributes(id: "123", title: "Ruby", year: "2024")
|
|
60
|
+
|
|
61
|
+
# with_child - add child and return self
|
|
62
|
+
element.with_child(doc.create_element("author"))
|
|
63
|
+
|
|
64
|
+
# Chain multiple operations
|
|
65
|
+
element
|
|
66
|
+
.with_namespace("dc", "http://purl.org/dc/elements/1.1/")
|
|
67
|
+
.set_attributes(id: "123", type: "technical")
|
|
68
|
+
.with_child(doc.create_element("title"))
|
|
69
|
+
.with_child(doc.create_element("author"))
|
|
70
|
+
----
|
|
71
|
+
|
|
72
|
+
==== Convenience query methods
|
|
73
|
+
|
|
74
|
+
[source,ruby]
|
|
75
|
+
----
|
|
76
|
+
# find_element - alias for at_xpath
|
|
77
|
+
first_book = doc.root.find_element("//book")
|
|
78
|
+
|
|
79
|
+
# find_all - returns array of matching elements
|
|
80
|
+
all_books = doc.root.find_all("//book")
|
|
81
|
+
|
|
82
|
+
# Document-level find methods
|
|
83
|
+
first_title = doc.find("//title")
|
|
84
|
+
all_titles = doc.find_all("//title")
|
|
85
|
+
----
|
|
86
|
+
|
|
87
|
+
==== Quick element creation
|
|
88
|
+
|
|
89
|
+
[source,ruby]
|
|
90
|
+
----
|
|
91
|
+
# add_element - create, configure, and add element in one call
|
|
92
|
+
book = doc.add_element("book", id: "123", title: "Ruby") do |elem|
|
|
93
|
+
elem.text = "Ruby Programming Guide"
|
|
94
|
+
end
|
|
95
|
+
----
|
|
96
|
+
|
|
97
|
+
==== Practical fluent example
|
|
98
|
+
|
|
99
|
+
[source,ruby]
|
|
100
|
+
----
|
|
101
|
+
doc = Moxml.new.create_document
|
|
102
|
+
|
|
103
|
+
# Build a complete book entry with fluent API
|
|
104
|
+
doc.add_element("library") do |library|
|
|
105
|
+
library
|
|
106
|
+
.with_namespace("dc", "http://purl.org/dc/elements/1.1/")
|
|
107
|
+
.with_child(
|
|
108
|
+
doc.create_element("book")
|
|
109
|
+
.set_attributes(id: "b1", isbn: "978-0-123456-78-9")
|
|
110
|
+
.with_child(doc.create_element("dc:title").tap { |t| t.text = "Ruby Programming" })
|
|
111
|
+
.with_child(doc.create_element("dc:creator").tap { |c| c.text = "Jane Smith" })
|
|
112
|
+
.with_child(doc.create_element("dc:date").tap { |d| d.text = "2024" })
|
|
113
|
+
)
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
puts doc.to_xml(indent: 2)
|
|
117
|
+
----
|
|
118
|
+
|