moxml 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/dependent-repos.json +5 -0
  3. data/.github/workflows/dependent-tests.yml +20 -0
  4. data/.github/workflows/docs.yml +59 -0
  5. data/.github/workflows/rake.yml +12 -4
  6. data/.github/workflows/release.yml +5 -3
  7. data/.gitignore +37 -0
  8. data/.rubocop.yml +15 -7
  9. data/.rubocop_todo.yml +238 -40
  10. data/Gemfile +14 -9
  11. data/LICENSE.md +6 -2
  12. data/README.adoc +535 -373
  13. data/Rakefile +53 -0
  14. data/benchmarks/.gitignore +6 -0
  15. data/benchmarks/generate_report.rb +550 -0
  16. data/docs/Gemfile +13 -0
  17. data/docs/_config.yml +138 -0
  18. data/docs/_guides/advanced-features.adoc +87 -0
  19. data/docs/_guides/development-testing.adoc +165 -0
  20. data/docs/_guides/index.adoc +45 -0
  21. data/docs/_guides/modifying-xml.adoc +293 -0
  22. data/docs/_guides/parsing-xml.adoc +231 -0
  23. data/docs/_guides/sax-parsing.adoc +603 -0
  24. data/docs/_guides/working-with-documents.adoc +118 -0
  25. data/docs/_pages/adapter-compatibility.adoc +369 -0
  26. data/docs/_pages/adapters/headed-ox.adoc +237 -0
  27. data/docs/_pages/adapters/index.adoc +98 -0
  28. data/docs/_pages/adapters/libxml.adoc +286 -0
  29. data/docs/_pages/adapters/nokogiri.adoc +252 -0
  30. data/docs/_pages/adapters/oga.adoc +292 -0
  31. data/docs/_pages/adapters/ox.adoc +55 -0
  32. data/docs/_pages/adapters/rexml.adoc +293 -0
  33. data/docs/_pages/best-practices.adoc +430 -0
  34. data/docs/_pages/compatibility.adoc +468 -0
  35. data/docs/_pages/configuration.adoc +251 -0
  36. data/docs/_pages/error-handling.adoc +350 -0
  37. data/docs/_pages/headed-ox-limitations.adoc +558 -0
  38. data/docs/_pages/headed-ox.adoc +1025 -0
  39. data/docs/_pages/index.adoc +35 -0
  40. data/docs/_pages/installation.adoc +141 -0
  41. data/docs/_pages/node-api-reference.adoc +50 -0
  42. data/docs/_pages/performance.adoc +36 -0
  43. data/docs/_pages/quick-start.adoc +244 -0
  44. data/docs/_pages/thread-safety.adoc +29 -0
  45. data/docs/_references/document-api.adoc +408 -0
  46. data/docs/_references/index.adoc +48 -0
  47. data/docs/_tutorials/basic-usage.adoc +268 -0
  48. data/docs/_tutorials/builder-pattern.adoc +343 -0
  49. data/docs/_tutorials/index.adoc +33 -0
  50. data/docs/_tutorials/namespace-handling.adoc +325 -0
  51. data/docs/_tutorials/xpath-queries.adoc +359 -0
  52. data/docs/index.adoc +122 -0
  53. data/examples/README.md +124 -0
  54. data/examples/api_client/README.md +424 -0
  55. data/examples/api_client/api_client.rb +394 -0
  56. data/examples/api_client/example_response.xml +48 -0
  57. data/examples/headed_ox_example/README.md +90 -0
  58. data/examples/headed_ox_example/headed_ox_demo.rb +71 -0
  59. data/examples/rss_parser/README.md +194 -0
  60. data/examples/rss_parser/example_feed.xml +93 -0
  61. data/examples/rss_parser/rss_parser.rb +189 -0
  62. data/examples/sax_parsing/README.md +50 -0
  63. data/examples/sax_parsing/data_extractor.rb +75 -0
  64. data/examples/sax_parsing/example.xml +21 -0
  65. data/examples/sax_parsing/large_file.rb +78 -0
  66. data/examples/sax_parsing/simple_parser.rb +55 -0
  67. data/examples/web_scraper/README.md +352 -0
  68. data/examples/web_scraper/example_page.html +201 -0
  69. data/examples/web_scraper/web_scraper.rb +312 -0
  70. data/lib/moxml/adapter/base.rb +107 -28
  71. data/lib/moxml/adapter/customized_libxml/cdata.rb +28 -0
  72. data/lib/moxml/adapter/customized_libxml/comment.rb +24 -0
  73. data/lib/moxml/adapter/customized_libxml/declaration.rb +85 -0
  74. data/lib/moxml/adapter/customized_libxml/element.rb +39 -0
  75. data/lib/moxml/adapter/customized_libxml/node.rb +44 -0
  76. data/lib/moxml/adapter/customized_libxml/processing_instruction.rb +31 -0
  77. data/lib/moxml/adapter/customized_libxml/text.rb +27 -0
  78. data/lib/moxml/adapter/customized_oga/xml_generator.rb +1 -1
  79. data/lib/moxml/adapter/customized_ox/attribute.rb +28 -3
  80. data/lib/moxml/adapter/customized_ox/namespace.rb +0 -2
  81. data/lib/moxml/adapter/customized_ox/text.rb +0 -2
  82. data/lib/moxml/adapter/customized_rexml/formatter.rb +11 -6
  83. data/lib/moxml/adapter/headed_ox.rb +161 -0
  84. data/lib/moxml/adapter/libxml.rb +1548 -0
  85. data/lib/moxml/adapter/nokogiri.rb +121 -9
  86. data/lib/moxml/adapter/oga.rb +123 -12
  87. data/lib/moxml/adapter/ox.rb +283 -27
  88. data/lib/moxml/adapter/rexml.rb +127 -20
  89. data/lib/moxml/adapter.rb +21 -4
  90. data/lib/moxml/attribute.rb +6 -0
  91. data/lib/moxml/builder.rb +40 -4
  92. data/lib/moxml/config.rb +8 -3
  93. data/lib/moxml/context.rb +39 -1
  94. data/lib/moxml/doctype.rb +13 -1
  95. data/lib/moxml/document.rb +39 -6
  96. data/lib/moxml/document_builder.rb +27 -5
  97. data/lib/moxml/element.rb +71 -2
  98. data/lib/moxml/error.rb +175 -6
  99. data/lib/moxml/node.rb +94 -3
  100. data/lib/moxml/node_set.rb +34 -0
  101. data/lib/moxml/sax/block_handler.rb +194 -0
  102. data/lib/moxml/sax/element_handler.rb +124 -0
  103. data/lib/moxml/sax/handler.rb +113 -0
  104. data/lib/moxml/sax.rb +31 -0
  105. data/lib/moxml/version.rb +1 -1
  106. data/lib/moxml/xml_utils/encoder.rb +4 -4
  107. data/lib/moxml/xml_utils.rb +7 -4
  108. data/lib/moxml/xpath/ast/node.rb +159 -0
  109. data/lib/moxml/xpath/cache.rb +91 -0
  110. data/lib/moxml/xpath/compiler.rb +1768 -0
  111. data/lib/moxml/xpath/context.rb +26 -0
  112. data/lib/moxml/xpath/conversion.rb +124 -0
  113. data/lib/moxml/xpath/engine.rb +52 -0
  114. data/lib/moxml/xpath/errors.rb +101 -0
  115. data/lib/moxml/xpath/lexer.rb +304 -0
  116. data/lib/moxml/xpath/parser.rb +485 -0
  117. data/lib/moxml/xpath/ruby/generator.rb +269 -0
  118. data/lib/moxml/xpath/ruby/node.rb +193 -0
  119. data/lib/moxml/xpath.rb +37 -0
  120. data/lib/moxml.rb +5 -2
  121. data/moxml.gemspec +3 -1
  122. data/old-specs/moxml/adapter/customized_libxml/.gitkeep +6 -0
  123. data/spec/consistency/README.md +77 -0
  124. data/spec/{moxml/examples/adapter_spec.rb → consistency/adapter_parity_spec.rb} +4 -4
  125. data/spec/examples/README.md +75 -0
  126. data/spec/{support/shared_examples/examples/attribute.rb → examples/attribute_examples_spec.rb} +1 -1
  127. data/spec/{support/shared_examples/examples/basic_usage.rb → examples/basic_usage_spec.rb} +2 -2
  128. data/spec/{support/shared_examples/examples/namespace.rb → examples/namespace_examples_spec.rb} +3 -3
  129. data/spec/{support/shared_examples/examples/readme_examples.rb → examples/readme_examples_spec.rb} +6 -4
  130. data/spec/{support/shared_examples/examples/xpath.rb → examples/xpath_examples_spec.rb} +10 -6
  131. data/spec/integration/README.md +71 -0
  132. data/spec/{moxml/all_with_adapters_spec.rb → integration/all_adapters_spec.rb} +3 -2
  133. data/spec/integration/headed_ox_integration_spec.rb +326 -0
  134. data/spec/{support → integration}/shared_examples/edge_cases.rb +37 -10
  135. data/spec/integration/shared_examples/high_level/.gitkeep +0 -0
  136. data/spec/{support/shared_examples/context.rb → integration/shared_examples/high_level/context_behavior.rb} +2 -1
  137. data/spec/{support/shared_examples/integration.rb → integration/shared_examples/integration_workflows.rb} +23 -6
  138. data/spec/integration/shared_examples/node_wrappers/.gitkeep +0 -0
  139. data/spec/{support/shared_examples/cdata.rb → integration/shared_examples/node_wrappers/cdata_behavior.rb} +6 -1
  140. data/spec/{support/shared_examples/comment.rb → integration/shared_examples/node_wrappers/comment_behavior.rb} +2 -1
  141. data/spec/{support/shared_examples/declaration.rb → integration/shared_examples/node_wrappers/declaration_behavior.rb} +5 -2
  142. data/spec/{support/shared_examples/doctype.rb → integration/shared_examples/node_wrappers/doctype_behavior.rb} +2 -2
  143. data/spec/{support/shared_examples/document.rb → integration/shared_examples/node_wrappers/document_behavior.rb} +1 -1
  144. data/spec/{support/shared_examples/node.rb → integration/shared_examples/node_wrappers/node_behavior.rb} +9 -2
  145. data/spec/{support/shared_examples/node_set.rb → integration/shared_examples/node_wrappers/node_set_behavior.rb} +1 -18
  146. data/spec/{support/shared_examples/processing_instruction.rb → integration/shared_examples/node_wrappers/processing_instruction_behavior.rb} +6 -2
  147. data/spec/moxml/README.md +41 -0
  148. data/spec/moxml/adapter/.gitkeep +0 -0
  149. data/spec/moxml/adapter/README.md +61 -0
  150. data/spec/moxml/adapter/base_spec.rb +27 -0
  151. data/spec/moxml/adapter/headed_ox_spec.rb +311 -0
  152. data/spec/moxml/adapter/libxml_spec.rb +14 -0
  153. data/spec/moxml/adapter/ox_spec.rb +9 -8
  154. data/spec/moxml/adapter/shared_examples/.gitkeep +0 -0
  155. data/spec/{support/shared_examples/xml_adapter.rb → moxml/adapter/shared_examples/adapter_contract.rb} +39 -12
  156. data/spec/moxml/adapter_spec.rb +16 -0
  157. data/spec/moxml/attribute_spec.rb +30 -0
  158. data/spec/moxml/builder_spec.rb +33 -0
  159. data/spec/moxml/cdata_spec.rb +31 -0
  160. data/spec/moxml/comment_spec.rb +31 -0
  161. data/spec/moxml/config_spec.rb +3 -3
  162. data/spec/moxml/context_spec.rb +28 -0
  163. data/spec/moxml/declaration_spec.rb +36 -0
  164. data/spec/moxml/doctype_spec.rb +33 -0
  165. data/spec/moxml/document_builder_spec.rb +30 -0
  166. data/spec/moxml/document_spec.rb +105 -0
  167. data/spec/moxml/element_spec.rb +143 -0
  168. data/spec/moxml/error_spec.rb +266 -22
  169. data/spec/{moxml_spec.rb → moxml/moxml_spec.rb} +9 -9
  170. data/spec/moxml/namespace_spec.rb +32 -0
  171. data/spec/moxml/node_set_spec.rb +39 -0
  172. data/spec/moxml/node_spec.rb +37 -0
  173. data/spec/moxml/processing_instruction_spec.rb +34 -0
  174. data/spec/moxml/sax_spec.rb +1067 -0
  175. data/spec/moxml/text_spec.rb +31 -0
  176. data/spec/moxml/version_spec.rb +14 -0
  177. data/spec/moxml/xml_utils/.gitkeep +0 -0
  178. data/spec/moxml/xml_utils/encoder_spec.rb +27 -0
  179. data/spec/moxml/xml_utils_spec.rb +49 -0
  180. data/spec/moxml/xpath/ast/node_spec.rb +83 -0
  181. data/spec/moxml/xpath/axes_spec.rb +296 -0
  182. data/spec/moxml/xpath/cache_spec.rb +358 -0
  183. data/spec/moxml/xpath/compiler_spec.rb +406 -0
  184. data/spec/moxml/xpath/context_spec.rb +210 -0
  185. data/spec/moxml/xpath/conversion_spec.rb +365 -0
  186. data/spec/moxml/xpath/fixtures/sample.xml +25 -0
  187. data/spec/moxml/xpath/functions/boolean_functions_spec.rb +114 -0
  188. data/spec/moxml/xpath/functions/node_functions_spec.rb +145 -0
  189. data/spec/moxml/xpath/functions/numeric_functions_spec.rb +164 -0
  190. data/spec/moxml/xpath/functions/position_functions_spec.rb +93 -0
  191. data/spec/moxml/xpath/functions/special_functions_spec.rb +89 -0
  192. data/spec/moxml/xpath/functions/string_functions_spec.rb +381 -0
  193. data/spec/moxml/xpath/lexer_spec.rb +488 -0
  194. data/spec/moxml/xpath/parser_integration_spec.rb +210 -0
  195. data/spec/moxml/xpath/parser_spec.rb +364 -0
  196. data/spec/moxml/xpath/ruby/generator_spec.rb +421 -0
  197. data/spec/moxml/xpath/ruby/node_spec.rb +291 -0
  198. data/spec/moxml/xpath_capabilities_spec.rb +199 -0
  199. data/spec/moxml/xpath_spec.rb +77 -0
  200. data/spec/performance/README.md +83 -0
  201. data/spec/performance/benchmark_spec.rb +64 -0
  202. data/spec/{support/shared_examples/examples/memory.rb → performance/memory_usage_spec.rb} +3 -1
  203. data/spec/{support/shared_examples/examples/thread_safety.rb → performance/thread_safety_spec.rb} +3 -1
  204. data/spec/performance/xpath_benchmark_spec.rb +259 -0
  205. data/spec/spec_helper.rb +58 -1
  206. data/spec/support/xml_matchers.rb +1 -1
  207. metadata +176 -35
  208. data/lib/ox/node.rb +0 -9
  209. data/spec/support/shared_examples/examples/benchmark_spec.rb +0 -51
  210. /data/spec/{support/shared_examples/builder.rb → integration/shared_examples/high_level/builder_behavior.rb} +0 -0
  211. /data/spec/{support/shared_examples/document_builder.rb → integration/shared_examples/high_level/document_builder_behavior.rb} +0 -0
  212. /data/spec/{support/shared_examples/attribute.rb → integration/shared_examples/node_wrappers/attribute_behavior.rb} +0 -0
  213. /data/spec/{support/shared_examples/element.rb → integration/shared_examples/node_wrappers/element_behavior.rb} +0 -0
  214. /data/spec/{support/shared_examples/namespace.rb → integration/shared_examples/node_wrappers/namespace_behavior.rb} +0 -0
  215. /data/spec/{support/shared_examples/text.rb → integration/shared_examples/node_wrappers/text_behavior.rb} +0 -0
@@ -0,0 +1,603 @@
1
+ = SAX Parsing Guide
2
+ :toc:
3
+ :toclevels: 3
4
+
5
+ == Introduction
6
+
7
+ SAX (Simple API for XML) provides event-driven XML parsing, allowing you to process XML documents efficiently without loading the entire structure into memory. This is particularly useful for large files or streaming scenarios.
8
+
9
+ Moxml provides a consistent SAX interface across all supported adapters, with three handler types to suit different use cases.
10
+
11
+ == When to use SAX vs DOM
12
+
13
+ === Use SAX when
14
+
15
+ * Processing files >100MB in size
16
+ * Memory is constrained
17
+ * You only need specific data from the document
18
+ * Streaming data processing is required
19
+ * Linear, forward-only traversal is sufficient
20
+
21
+ === Use DOM when
22
+
23
+ * Need random access to any part of the document
24
+ * Need to modify XML structure
25
+ * Working with small documents (<10MB)
26
+ * Need XPath queries for complex selections
27
+ * Need to navigate backwards or access parent nodes
28
+
29
+ == Handler types
30
+
31
+ Moxml provides three handler types, each suited for different scenarios.
32
+
33
+ === Base handler
34
+
35
+ The base handler provides a minimal interface: override only the events you need.
36
+
37
+ [source,ruby]
38
+ ----
39
+ class MyHandler < Moxml::SAX::Handler
40
+ def on_start_element(name, attributes = {}, namespaces = {})
41
+ puts "Element: #{name}"
42
+ end
43
+
44
+ def on_characters(text)
45
+ puts "Text: #{text}"
46
+ end
47
+ end
48
+
49
+ context = Moxml.new(:nokogiri)
50
+ context.sax_parse(xml, MyHandler.new)
51
+ ----
52
+
53
+ === ElementHandler
54
+
55
+ `ElementHandler` adds element stack tracking and path utilities for more sophisticated parsing.
56
+
57
+ [source,ruby]
58
+ ----
59
+ class DataExtractor < Moxml::SAX::ElementHandler
60
+ def on_start_element(name, attributes = {}, namespaces = {})
61
+ super # Important: updates the stack
62
+
63
+ if path_matches?(/book\/title$/)
64
+ # We're inside book/title element
65
+ @capturing = true
66
+ end
67
+ end
68
+ end
69
+ ----
70
+
71
+ **Utilities provided:**
72
+
73
+ * `element_stack` - Array of open elements
74
+ * `current_element()` - Current element name
75
+ * `parent_element()` - Parent element name
76
+ * `in_element?(name)` - Check if inside element
77
+ * `path_matches?(pattern)` - Match current path with regex
78
+ * `path_string(sep)` - Get path as string (default separator: "/")
79
+ * `depth()` - Current nesting level
80
+
81
+ === BlockHandler
82
+
83
+ `BlockHandler` offers a block-based DSL for simple cases that do not warrant defining a handler class.
84
+
85
+ [source,ruby]
86
+ ----
87
+ context.sax_parse(xml) do
88
+ start_element { |name, attrs| puts name }
89
+ characters { |text| puts text unless text.strip.empty? }
90
+ end_element { |name| puts "End: #{name}" }
91
+ end
92
+ ----
93
+
94
+ == Event reference
95
+
96
+ === Document lifecycle
97
+
98
+ [source,ruby]
99
+ ----
100
+ def on_start_document
101
+ # Called once at document start
102
+ # Initialize any document-level state here
103
+ end
104
+
105
+ def on_end_document
106
+ # Called once at document end
107
+ # Clean up, finalize processing
108
+ end
109
+ ----
110
+
111
+ === Element events
112
+
113
+ [source,ruby]
114
+ ----
115
+ def on_start_element(name, attributes = {}, namespaces = {})
116
+ # name: Element name (String)
117
+ # attributes: Hash<String, String> - regular attributes
118
+ # namespaces: Hash<String|nil, String> - prefix => uri
119
+ # nil prefix = default namespace (xmlns="...")
120
+ end
121
+
122
+ def on_end_element(name)
123
+ # name: Element name (String)
124
+ # Signals element is closing
125
+ end
126
+ ----
127
+
128
+ === Content events
129
+
130
+ [source,ruby]
131
+ ----
132
+ def on_characters(text)
133
+ # Called for text content
134
+ # May be called multiple times for single text node
135
+ # Accumulate text if needed
136
+ end
137
+
138
+ def on_cdata(text)
139
+ # Called for <![CDATA[...]]> sections
140
+ # Not supported by Ox adapter
141
+ end
142
+
143
+ def on_comment(text)
144
+ # Called for <!-- ... --> comments
145
+ # Not supported by Ox adapter
146
+ end
147
+ ----
148
+
149
+ === Processing instructions
150
+
151
+ [source,ruby]
152
+ ----
153
+ def on_processing_instruction(target, data)
154
+ # Called for <?target data?>
155
+ # Example: <?xml-stylesheet type="text/xsl"?>
156
+ # Not supported by Ox adapter
157
+ end
158
+ ----
159
+
160
+ === Error handling
161
+
162
+ [source,ruby]
163
+ ----
164
+ def on_error(error)
165
+ # error is Moxml::ParseError
166
+ # Default: raises the error
167
+ # Override to handle differently
168
+ end
169
+
170
+ def on_warning(message)
171
+ # Non-fatal warnings
172
+ # Default: ignores
173
+ end
174
+ ----
175
+
176
+ == Best practices
177
+
178
+ === Memory management
179
+
180
+ [source,ruby]
181
+ ----
182
+ class MemoryEfficientHandler < Moxml::SAX::Handler
183
+ def initialize(output_stream)
184
+ super()
185
+ @output = output_stream
186
+ @current_text = "".dup # Mutable string
187
+ end
188
+
189
+ def on_characters(text)
190
+ @current_text << text # Accumulate
191
+ end
192
+
193
+ def on_end_element(name)
194
+ @output.puts @current_text.strip
195
+ @current_text = "".dup # Reset - don't accumulate memory
196
+ end
197
+ end
198
+ ----
199
+
200
+ === String accumulation
201
+
202
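+ String literals are frozen in any file that starts with the `# frozen_string_literal: true` magic comment (available since Ruby 2.3), and appending to a frozen string raises `FrozenError`. Always use `.dup` to get a mutable buffer: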
203
+
204
+ [source,ruby]
205
+ ----
206
+ # WRONG:
207
+ @text = "" # Frozen when frozen_string_literal is enabled!
208
+
209
+ # RIGHT:
210
+ @text = "".dup # Mutable string
211
+ ----
212
+
213
+ === Error recovery
214
+
215
+ [source,ruby]
216
+ ----
217
+ class RobustHandler < Moxml::SAX::Handler
218
+ def initialize
219
+ super
220
+ @errors = []
221
+ end
222
+
223
+ def on_error(error)
224
+ # Log but don't crash
225
+ warn "Parse error: #{error.message}"
226
+ @errors << error
227
+ # Don't re-raise - allows parsing to continue if possible
228
+ end
229
+
230
+ def on_warning(message)
231
+ warn "Warning: #{message}"
232
+ end
233
+ end
234
+ ----
235
+
236
+ == Adapter-specific notes
237
+
238
+ === Nokogiri
239
+
240
+ * ✅ Full SAX support
241
+ * ✅ All 10 event types
242
+ * ✅ Line/column information in errors
243
+ * **Best choice for production use**
244
+
245
+ === REXML
246
+
247
+ * ✅ Full SAX support
248
+ * ✅ Pure Ruby (no C dependencies)
249
+ * ✅ Ships with Ruby (default gem before Ruby 3.0, bundled gem since 3.0)
250
+ * ⚠️ Slower than C-based parsers
251
+ * **Best for portability**
252
+
253
+ === Oga
254
+
255
+ * ✅ Full SAX support
256
+ * ✅ Pure Ruby
257
+ * ✅ Modern API
258
+ * ⚠️ May be lenient with malformed XML
259
+ * **Good for JRuby/TruffleRuby**
260
+
261
+ === Ox
262
+
263
+ * ✅ Fast parsing
264
+ * ✅ Core events supported (start/end element, text)
265
+ * ❌ No separate CDATA events (delivered as text)
266
+ * ❌ No comment events
267
+ * ❌ No processing instruction events
268
+ * **Best for simple, fast parsing**
269
+
270
+ === LibXML
271
+
272
+ * ✅ Full SAX support
273
+ * ✅ Fast (C-based)
274
+ * ✅ Alternative to Nokogiri
275
+ * **Good for performance**
276
+
277
+ === HeadedOx
278
+
279
+ * Same as Ox (inherits implementation)
280
+ * ✅ Fast parsing
281
+ * ❌ Same limitations as Ox
282
+
283
+ == Common patterns
284
+
285
+ === Extract specific data
286
+
287
+ [source,ruby]
288
+ ----
289
+ class BookExtractor < Moxml::SAX::ElementHandler
290
+ attr_reader :books
291
+
292
+ def initialize
293
+ super
294
+ @books = []
295
+ @current_book = nil
296
+ @current_field = nil
297
+ @current_text = "".dup
298
+ end
299
+
300
+ def on_start_element(name, attributes = {}, namespaces = {})
301
+ super
302
+ case name
303
+ when "book"
304
+ @current_book = { id: attributes["id"] }
305
+ when "title", "author", "price"
306
+ @current_field = name
307
+ @current_text = "".dup
308
+ end
309
+ end
310
+
311
+ def on_characters(text)
312
+ @current_text << text if @current_field
313
+ end
314
+
315
+ def on_end_element(name)
316
+ case name
317
+ when "title", "author"
318
+ @current_book[@current_field.to_sym] = @current_text.strip if @current_book
319
+ @current_field = nil
320
+ when "price"
321
+ @current_book[:price] = @current_text.strip.to_f if @current_book
322
+ @current_field = nil
323
+ when "book"
324
+ @books << @current_book if @current_book
325
+ @current_book = nil
326
+ end
327
+ super
328
+ end
329
+ end
330
+
331
+ handler = BookExtractor.new
332
+ context.sax_parse(xml, handler)
333
+ puts handler.books.inspect
334
+ ----
335
+
336
+ === Stream processing
337
+
338
+ [source,ruby]
339
+ ----
340
+ class StreamProcessor < Moxml::SAX::Handler
341
+ def initialize(output)
342
+ super()
343
+ @output = output
344
+ @current_record = nil
345
+ end
346
+
347
+ def on_start_element(name, attributes = {}, namespaces = {})
348
+ if name == "record"
349
+ @current_record = {}
350
+ end
351
+ end
352
+
353
+ def on_end_element(name)
354
+ if name == "record" && @current_record
355
+ process_record(@current_record)
356
+ @current_record = nil # Free memory immediately
357
+ end
358
+ end
359
+
360
+ private
361
+
362
+ def process_record(record)
363
+ # Process and write immediately - don't accumulate
364
+ @output.puts record.to_json
365
+ end
366
+ end
367
+ ----
368
+
369
+ === Path-based filtering
370
+
371
+ [source,ruby]
372
+ ----
373
+ class PathMatcher < Moxml::SAX::ElementHandler
374
+ def on_start_element(name, attributes = {}, namespaces = {})
375
+ super
376
+
377
+ # Match exact path
378
+ if path_matches?(%r{^/catalog/book/title$})
379
+ puts "Found book title at depth #{depth}"
380
+ end
381
+
382
+ # Match pattern
383
+ if path_matches?(/\/book\//)
384
+ puts "Inside a book element somewhere"
385
+ end
386
+
387
+ # Check current element
388
+ if current_element == "price" && in_element?("book")
389
+ puts "Found price inside book"
390
+ end
391
+ end
392
+ end
393
+ ----
394
+
395
+ === Counting and statistics
396
+
397
+ [source,ruby]
398
+ ----
399
+ class StatsCollector < Moxml::SAX::ElementHandler
400
+ attr_reader :stats
401
+
402
+ def initialize
403
+ super
404
+ @stats = { elements: 0, text_nodes: 0, max_depth: 0, by_name: Hash.new(0) }
405
+ end
406
+
407
+ def on_start_element(name, attributes = {}, namespaces = {})
408
+ super
409
+ @stats[:elements] += 1
410
+ @stats[:by_name][name] ||= 0
411
+ @stats[:by_name][name] += 1
412
+ @stats[:max_depth] = [stats[:max_depth], depth].max
413
+ end
414
+
415
+ def on_characters(text)
416
+ @stats[:text_nodes] += 1 unless text.strip.empty?
417
+ end
418
+ end
419
+ ----
420
+
421
+ === Using block handler for quick scripts
422
+
423
+ [source,ruby]
424
+ ----
425
+ # Quick data extraction
426
+ titles = []
427
+ context.sax_parse(xml) do
428
+ start_element do |name, attrs|
429
+ @in_title = (name == "title")
430
+ @text = "".dup if @in_title
431
+ end
432
+
433
+ characters do |text|
434
+ @text << text if @in_title
435
+ end
436
+
437
+ end_element do |name|
438
+ if name == "title"
439
+ titles << @text.strip
440
+ @in_title = false
441
+ end
442
+ end
443
+ end
444
+
445
+ puts titles
446
+ ----
447
+
448
+ == Performance tips
449
+
450
+ === Minimize object creation
451
+
452
+ [source,ruby]
453
+ ----
454
+ # SLOW: Creates new string each time
455
+ def on_characters(text)
456
+ @text = @text + text
457
+ end
458
+
459
+ # FAST: Mutates existing string
460
+ def on_characters(text)
461
+ @text << text
462
+ end
463
+ ----
464
+
465
+ === Batch output operations
466
+
467
+ [source,ruby]
468
+ ----
469
+ # SLOW: Write each record individually
470
+ def on_end_element(name)
471
+ @output.puts record if name == "record"
472
+ end
473
+
474
+ # FAST: Buffer and write in batches
475
+ def on_end_element(name)
476
+ if name == "record"
477
+ @buffer << record
478
+ flush if @buffer.size >= 1000
479
+ end
480
+ end
481
+ ----
482
+
483
+ === Reset state properly
484
+
485
+ [source,ruby]
486
+ ----
487
+ def on_end_element(name)
488
+ if name == "book"
489
+ process(@current_book)
490
+ @current_book = nil # Free for GC
491
+ @current_text = "".dup # Fresh string, not ""
492
+ end
493
+ end
494
+ ----
495
+
496
+ == Comparison with DOM parsing
497
+
498
+ [cols="1,1,1"]
499
+ |===
500
+ |Feature |SAX |DOM
501
+
502
+ |Memory usage
503
+ |O(1) - constant
504
+ |O(n) - full document
505
+
506
+ |Speed
507
+ |Fast - single pass
508
+ |Slower - builds tree
509
+
510
+ |Random access
511
+ |No
512
+ |Yes
513
+
514
+ |Modification
515
+ |No
516
+ |Yes
517
+
518
+ |XPath queries
519
+ |No
520
+ |Yes
521
+
522
+ |Best for
523
+ |Large files, streaming
524
+ |Small files, complex queries
525
+ |===
526
+
527
+ == Complete example
528
+
529
+ [source,ruby]
530
+ ----
531
+ require 'moxml'
532
+
533
+ # Handler that extracts book data and counts elements
534
+ class BookProcessor < Moxml::SAX::ElementHandler
535
+ attr_reader :books, :element_count
536
+
537
+ def initialize
538
+ super
539
+ @books = []
540
+ @element_count = 0
541
+ @current_book = nil
542
+ @current_field = nil
543
+ @text_buffer = "".dup
544
+ end
545
+
546
+ def on_start_document
547
+ puts "Starting XML processing..."
548
+ end
549
+
550
+ def on_start_element(name, attributes = {}, namespaces = {})
551
+ super # Updates stack
552
+ @element_count += 1
553
+
554
+ case name
555
+ when "book"
556
+ @current_book = {
557
+ id: attributes["id"],
558
+ category: attributes["category"]
559
+ }
560
+ when "title", "author", "price", "isbn"
561
+ @current_field = name
562
+ @text_buffer = "".dup
563
+ end
564
+ end
565
+
566
+ def on_characters(text)
567
+ @text_buffer << text if @current_field
568
+ end
569
+
570
+ def on_end_element(name)
571
+ if @current_field == name && @current_book
572
+ value = @text_buffer.strip
573
+ value = value.to_f if name == "price"
574
+ @current_book[name.to_sym] = value
575
+ @current_field = nil
576
+ end
577
+
578
+ if name == "book" && @current_book
579
+ @books << @current_book
580
+ @current_book = nil
581
+ end
582
+
583
+ super # Updates stack
584
+ end
585
+
586
+ def on_end_document
587
+ puts "Processed #{@element_count} elements"
588
+ puts "Found #{@books.size} books"
589
+ end
590
+ end
591
+
592
+ # Usage
593
+ xml = File.read("library.xml")
594
+ context = Moxml.new(:nokogiri) # or :rexml, :oga, :ox
595
+
596
+ handler = BookProcessor.new
597
+ context.sax_parse(xml, handler)
598
+
599
+ # Access results
600
+ handler.books.each do |book|
601
+ puts "#{book[:title]} by #{book[:author]} - $#{book[:price]}"
602
+ end
603
+ ----
@@ -0,0 +1,118 @@
1
+ ---
2
+ title: Working with documents
3
+ parent: Guides
4
+ nav_order: 3
5
+ ---
6
+
7
+ == Working with documents
8
+
9
+ === Builder pattern
10
+
11
+ The builder pattern provides a clean DSL for creating XML documents:
12
+
13
+ [source,ruby]
14
+ ----
15
+ doc = Moxml::Builder.new(Moxml.new).build do
16
+ declaration version: "1.0", encoding: "UTF-8"
17
+ element 'library', xmlns: 'http://example.org/library' do
18
+ element 'book' do
19
+ element 'title' do
20
+ text 'Ruby Programming'
21
+ end
22
+ end
23
+ end
24
+ end
25
+ ----
26
+
27
+ See link:parsing-xml.adoc[Parsing XML Guide] for more document creation patterns.
28
+
29
+ === Fluent interface API
30
+
31
+ Moxml provides a fluent, chainable API for creating and manipulating XML documents with improved developer experience:
32
+
33
+ [source,ruby]
34
+ ----
35
+ # Old way - verbose and less readable
36
+ element = doc.create_element('book')
37
+ element.add_namespace("dc", "http://purl.org/dc/elements/1.1/")
38
+ element["id"] = "123"
39
+ element["type"] = "article"
40
+ child = doc.create_element("title")
41
+ child.text = "Hello"
42
+ element.add_child(child)
43
+
44
+ # New way - fluent and chainable
45
+ element = doc.create_element('book')
46
+ .with_namespace("dc", "http://purl.org/dc/elements/1.1/")
47
+ .set_attributes(id: "123", type: "article")
48
+ .with_child(doc.create_element("title").tap { |t| t.text = "Hello" })
49
+ ----
50
+
51
+ ==== Chainable element methods
52
+
53
+ [source,ruby]
54
+ ----
55
+ # with_namespace - add namespace and return self
56
+ element.with_namespace("dc", "http://purl.org/dc/elements/1.1/")
57
+
58
+ # set_attributes - set multiple attributes at once
59
+ element.set_attributes(id: "123", title: "Ruby", year: "2024")
60
+
61
+ # with_child - add child and return self
62
+ element.with_child(doc.create_element("author"))
63
+
64
+ # Chain multiple operations
65
+ element
66
+ .with_namespace("dc", "http://purl.org/dc/elements/1.1/")
67
+ .set_attributes(id: "123", type: "technical")
68
+ .with_child(doc.create_element("title"))
69
+ .with_child(doc.create_element("author"))
70
+ ----
71
+
72
+ ==== Convenience query methods
73
+
74
+ [source,ruby]
75
+ ----
76
+ # find_element - alias for at_xpath
77
+ first_book = doc.root.find_element("//book")
78
+
79
+ # find_all - returns array of matching elements
80
+ all_books = doc.root.find_all("//book")
81
+
82
+ # Document-level find methods
83
+ first_title = doc.find("//title")
84
+ all_titles = doc.find_all("//title")
85
+ ----
86
+
87
+ ==== Quick element creation
88
+
89
+ [source,ruby]
90
+ ----
91
+ # add_element - create, configure, and add element in one call
92
+ book = doc.add_element("book", id: "123", title: "Ruby") do |elem|
93
+ elem.text = "Ruby Programming Guide"
94
+ end
95
+ ----
96
+
97
+ ==== Practical fluent example
98
+
99
+ [source,ruby]
100
+ ----
101
+ doc = Moxml.new.create_document
102
+
103
+ # Build a complete book entry with fluent API
104
+ doc.add_element("library") do |library|
105
+ library
106
+ .with_namespace("dc", "http://purl.org/dc/elements/1.1/")
107
+ .with_child(
108
+ doc.create_element("book")
109
+ .set_attributes(id: "b1", isbn: "978-0-123456-78-9")
110
+ .with_child(doc.create_element("dc:title").tap { |t| t.text = "Ruby Programming" })
111
+ .with_child(doc.create_element("dc:creator").tap { |c| c.text = "Jane Smith" })
112
+ .with_child(doc.create_element("dc:date").tap { |d| d.text = "2024" })
113
+ )
114
+ end
115
+
116
+ puts doc.to_xml(indent: 2)
117
+ ----
118
+