moxml 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/dependent-repos.json +5 -0
  3. data/.github/workflows/dependent-tests.yml +20 -0
  4. data/.github/workflows/docs.yml +59 -0
  5. data/.github/workflows/rake.yml +10 -10
  6. data/.github/workflows/release.yml +5 -3
  7. data/.gitignore +37 -0
  8. data/.rubocop.yml +15 -7
  9. data/.rubocop_todo.yml +238 -40
  10. data/Gemfile +14 -9
  11. data/LICENSE.md +6 -2
  12. data/README.adoc +535 -373
  13. data/Rakefile +53 -0
  14. data/benchmarks/.gitignore +6 -0
  15. data/benchmarks/generate_report.rb +550 -0
  16. data/docs/Gemfile +13 -0
  17. data/docs/_config.yml +138 -0
  18. data/docs/_guides/advanced-features.adoc +87 -0
  19. data/docs/_guides/development-testing.adoc +165 -0
  20. data/docs/_guides/index.adoc +45 -0
  21. data/docs/_guides/modifying-xml.adoc +293 -0
  22. data/docs/_guides/parsing-xml.adoc +231 -0
  23. data/docs/_guides/sax-parsing.adoc +603 -0
  24. data/docs/_guides/working-with-documents.adoc +118 -0
  25. data/docs/_pages/adapter-compatibility.adoc +369 -0
  26. data/docs/_pages/adapters/headed-ox.adoc +237 -0
  27. data/docs/_pages/adapters/index.adoc +98 -0
  28. data/docs/_pages/adapters/libxml.adoc +286 -0
  29. data/docs/_pages/adapters/nokogiri.adoc +252 -0
  30. data/docs/_pages/adapters/oga.adoc +292 -0
  31. data/docs/_pages/adapters/ox.adoc +55 -0
  32. data/docs/_pages/adapters/rexml.adoc +293 -0
  33. data/docs/_pages/best-practices.adoc +430 -0
  34. data/docs/_pages/compatibility.adoc +468 -0
  35. data/docs/_pages/configuration.adoc +251 -0
  36. data/docs/_pages/error-handling.adoc +350 -0
  37. data/docs/_pages/headed-ox-limitations.adoc +558 -0
  38. data/docs/_pages/headed-ox.adoc +1025 -0
  39. data/docs/_pages/index.adoc +35 -0
  40. data/docs/_pages/installation.adoc +141 -0
  41. data/docs/_pages/node-api-reference.adoc +50 -0
  42. data/docs/_pages/performance.adoc +36 -0
  43. data/docs/_pages/quick-start.adoc +244 -0
  44. data/docs/_pages/thread-safety.adoc +29 -0
  45. data/docs/_references/document-api.adoc +408 -0
  46. data/docs/_references/index.adoc +48 -0
  47. data/docs/_tutorials/basic-usage.adoc +268 -0
  48. data/docs/_tutorials/builder-pattern.adoc +343 -0
  49. data/docs/_tutorials/index.adoc +33 -0
  50. data/docs/_tutorials/namespace-handling.adoc +325 -0
  51. data/docs/_tutorials/xpath-queries.adoc +359 -0
  52. data/docs/index.adoc +122 -0
  53. data/examples/README.md +124 -0
  54. data/examples/api_client/README.md +424 -0
  55. data/examples/api_client/api_client.rb +394 -0
  56. data/examples/api_client/example_response.xml +48 -0
  57. data/examples/headed_ox_example/README.md +90 -0
  58. data/examples/headed_ox_example/headed_ox_demo.rb +71 -0
  59. data/examples/rss_parser/README.md +194 -0
  60. data/examples/rss_parser/example_feed.xml +93 -0
  61. data/examples/rss_parser/rss_parser.rb +189 -0
  62. data/examples/sax_parsing/README.md +50 -0
  63. data/examples/sax_parsing/data_extractor.rb +75 -0
  64. data/examples/sax_parsing/example.xml +21 -0
  65. data/examples/sax_parsing/large_file.rb +78 -0
  66. data/examples/sax_parsing/simple_parser.rb +55 -0
  67. data/examples/web_scraper/README.md +352 -0
  68. data/examples/web_scraper/example_page.html +201 -0
  69. data/examples/web_scraper/web_scraper.rb +312 -0
  70. data/lib/moxml/adapter/base.rb +107 -28
  71. data/lib/moxml/adapter/customized_libxml/cdata.rb +28 -0
  72. data/lib/moxml/adapter/customized_libxml/comment.rb +24 -0
  73. data/lib/moxml/adapter/customized_libxml/declaration.rb +85 -0
  74. data/lib/moxml/adapter/customized_libxml/element.rb +39 -0
  75. data/lib/moxml/adapter/customized_libxml/node.rb +44 -0
  76. data/lib/moxml/adapter/customized_libxml/processing_instruction.rb +31 -0
  77. data/lib/moxml/adapter/customized_libxml/text.rb +27 -0
  78. data/lib/moxml/adapter/customized_oga/xml_generator.rb +1 -1
  79. data/lib/moxml/adapter/customized_ox/attribute.rb +28 -1
  80. data/lib/moxml/adapter/customized_rexml/formatter.rb +11 -6
  81. data/lib/moxml/adapter/headed_ox.rb +161 -0
  82. data/lib/moxml/adapter/libxml.rb +1548 -0
  83. data/lib/moxml/adapter/nokogiri.rb +121 -9
  84. data/lib/moxml/adapter/oga.rb +123 -12
  85. data/lib/moxml/adapter/ox.rb +282 -26
  86. data/lib/moxml/adapter/rexml.rb +127 -20
  87. data/lib/moxml/adapter.rb +21 -4
  88. data/lib/moxml/attribute.rb +6 -0
  89. data/lib/moxml/builder.rb +40 -4
  90. data/lib/moxml/config.rb +8 -3
  91. data/lib/moxml/context.rb +39 -1
  92. data/lib/moxml/doctype.rb +13 -1
  93. data/lib/moxml/document.rb +39 -6
  94. data/lib/moxml/document_builder.rb +27 -5
  95. data/lib/moxml/element.rb +71 -2
  96. data/lib/moxml/error.rb +175 -6
  97. data/lib/moxml/node.rb +94 -3
  98. data/lib/moxml/node_set.rb +34 -0
  99. data/lib/moxml/sax/block_handler.rb +194 -0
  100. data/lib/moxml/sax/element_handler.rb +124 -0
  101. data/lib/moxml/sax/handler.rb +113 -0
  102. data/lib/moxml/sax.rb +31 -0
  103. data/lib/moxml/version.rb +1 -1
  104. data/lib/moxml/xml_utils/encoder.rb +4 -4
  105. data/lib/moxml/xml_utils.rb +7 -4
  106. data/lib/moxml/xpath/ast/node.rb +159 -0
  107. data/lib/moxml/xpath/cache.rb +91 -0
  108. data/lib/moxml/xpath/compiler.rb +1768 -0
  109. data/lib/moxml/xpath/context.rb +26 -0
  110. data/lib/moxml/xpath/conversion.rb +124 -0
  111. data/lib/moxml/xpath/engine.rb +52 -0
  112. data/lib/moxml/xpath/errors.rb +101 -0
  113. data/lib/moxml/xpath/lexer.rb +304 -0
  114. data/lib/moxml/xpath/parser.rb +485 -0
  115. data/lib/moxml/xpath/ruby/generator.rb +269 -0
  116. data/lib/moxml/xpath/ruby/node.rb +193 -0
  117. data/lib/moxml/xpath.rb +37 -0
  118. data/lib/moxml.rb +5 -2
  119. data/moxml.gemspec +3 -1
  120. data/old-specs/moxml/adapter/customized_libxml/.gitkeep +6 -0
  121. data/spec/consistency/README.md +77 -0
  122. data/spec/{moxml/examples/adapter_spec.rb → consistency/adapter_parity_spec.rb} +4 -4
  123. data/spec/examples/README.md +75 -0
  124. data/spec/{support/shared_examples/examples/attribute.rb → examples/attribute_examples_spec.rb} +1 -1
  125. data/spec/{support/shared_examples/examples/basic_usage.rb → examples/basic_usage_spec.rb} +2 -2
  126. data/spec/{support/shared_examples/examples/namespace.rb → examples/namespace_examples_spec.rb} +3 -3
  127. data/spec/{support/shared_examples/examples/readme_examples.rb → examples/readme_examples_spec.rb} +6 -4
  128. data/spec/{support/shared_examples/examples/xpath.rb → examples/xpath_examples_spec.rb} +10 -6
  129. data/spec/integration/README.md +71 -0
  130. data/spec/{moxml/all_with_adapters_spec.rb → integration/all_adapters_spec.rb} +3 -2
  131. data/spec/integration/headed_ox_integration_spec.rb +326 -0
  132. data/spec/{support → integration}/shared_examples/edge_cases.rb +37 -10
  133. data/spec/integration/shared_examples/high_level/.gitkeep +0 -0
  134. data/spec/{support/shared_examples/context.rb → integration/shared_examples/high_level/context_behavior.rb} +2 -1
  135. data/spec/{support/shared_examples/integration.rb → integration/shared_examples/integration_workflows.rb} +23 -6
  136. data/spec/integration/shared_examples/node_wrappers/.gitkeep +0 -0
  137. data/spec/{support/shared_examples/cdata.rb → integration/shared_examples/node_wrappers/cdata_behavior.rb} +6 -1
  138. data/spec/{support/shared_examples/comment.rb → integration/shared_examples/node_wrappers/comment_behavior.rb} +2 -1
  139. data/spec/{support/shared_examples/declaration.rb → integration/shared_examples/node_wrappers/declaration_behavior.rb} +5 -2
  140. data/spec/{support/shared_examples/doctype.rb → integration/shared_examples/node_wrappers/doctype_behavior.rb} +2 -2
  141. data/spec/{support/shared_examples/document.rb → integration/shared_examples/node_wrappers/document_behavior.rb} +1 -1
  142. data/spec/{support/shared_examples/node.rb → integration/shared_examples/node_wrappers/node_behavior.rb} +9 -2
  143. data/spec/{support/shared_examples/node_set.rb → integration/shared_examples/node_wrappers/node_set_behavior.rb} +1 -18
  144. data/spec/{support/shared_examples/processing_instruction.rb → integration/shared_examples/node_wrappers/processing_instruction_behavior.rb} +6 -2
  145. data/spec/moxml/README.md +41 -0
  146. data/spec/moxml/adapter/.gitkeep +0 -0
  147. data/spec/moxml/adapter/README.md +61 -0
  148. data/spec/moxml/adapter/base_spec.rb +27 -0
  149. data/spec/moxml/adapter/headed_ox_spec.rb +311 -0
  150. data/spec/moxml/adapter/libxml_spec.rb +14 -0
  151. data/spec/moxml/adapter/ox_spec.rb +9 -8
  152. data/spec/moxml/adapter/shared_examples/.gitkeep +0 -0
  153. data/spec/{support/shared_examples/xml_adapter.rb → moxml/adapter/shared_examples/adapter_contract.rb} +39 -12
  154. data/spec/moxml/adapter_spec.rb +16 -0
  155. data/spec/moxml/attribute_spec.rb +30 -0
  156. data/spec/moxml/builder_spec.rb +33 -0
  157. data/spec/moxml/cdata_spec.rb +31 -0
  158. data/spec/moxml/comment_spec.rb +31 -0
  159. data/spec/moxml/config_spec.rb +3 -3
  160. data/spec/moxml/context_spec.rb +28 -0
  161. data/spec/moxml/declaration_spec.rb +36 -0
  162. data/spec/moxml/doctype_spec.rb +33 -0
  163. data/spec/moxml/document_builder_spec.rb +30 -0
  164. data/spec/moxml/document_spec.rb +105 -0
  165. data/spec/moxml/element_spec.rb +143 -0
  166. data/spec/moxml/error_spec.rb +266 -22
  167. data/spec/{moxml_spec.rb → moxml/moxml_spec.rb} +9 -9
  168. data/spec/moxml/namespace_spec.rb +32 -0
  169. data/spec/moxml/node_set_spec.rb +39 -0
  170. data/spec/moxml/node_spec.rb +37 -0
  171. data/spec/moxml/processing_instruction_spec.rb +34 -0
  172. data/spec/moxml/sax_spec.rb +1067 -0
  173. data/spec/moxml/text_spec.rb +31 -0
  174. data/spec/moxml/version_spec.rb +14 -0
  175. data/spec/moxml/xml_utils/.gitkeep +0 -0
  176. data/spec/moxml/xml_utils/encoder_spec.rb +27 -0
  177. data/spec/moxml/xml_utils_spec.rb +49 -0
  178. data/spec/moxml/xpath/ast/node_spec.rb +83 -0
  179. data/spec/moxml/xpath/axes_spec.rb +296 -0
  180. data/spec/moxml/xpath/cache_spec.rb +358 -0
  181. data/spec/moxml/xpath/compiler_spec.rb +406 -0
  182. data/spec/moxml/xpath/context_spec.rb +210 -0
  183. data/spec/moxml/xpath/conversion_spec.rb +365 -0
  184. data/spec/moxml/xpath/fixtures/sample.xml +25 -0
  185. data/spec/moxml/xpath/functions/boolean_functions_spec.rb +114 -0
  186. data/spec/moxml/xpath/functions/node_functions_spec.rb +145 -0
  187. data/spec/moxml/xpath/functions/numeric_functions_spec.rb +164 -0
  188. data/spec/moxml/xpath/functions/position_functions_spec.rb +93 -0
  189. data/spec/moxml/xpath/functions/special_functions_spec.rb +89 -0
  190. data/spec/moxml/xpath/functions/string_functions_spec.rb +381 -0
  191. data/spec/moxml/xpath/lexer_spec.rb +488 -0
  192. data/spec/moxml/xpath/parser_integration_spec.rb +210 -0
  193. data/spec/moxml/xpath/parser_spec.rb +364 -0
  194. data/spec/moxml/xpath/ruby/generator_spec.rb +421 -0
  195. data/spec/moxml/xpath/ruby/node_spec.rb +291 -0
  196. data/spec/moxml/xpath_capabilities_spec.rb +199 -0
  197. data/spec/moxml/xpath_spec.rb +77 -0
  198. data/spec/performance/README.md +83 -0
  199. data/spec/performance/benchmark_spec.rb +64 -0
  200. data/spec/{support/shared_examples/examples/memory.rb → performance/memory_usage_spec.rb} +3 -1
  201. data/spec/{support/shared_examples/examples/thread_safety.rb → performance/thread_safety_spec.rb} +3 -1
  202. data/spec/performance/xpath_benchmark_spec.rb +259 -0
  203. data/spec/spec_helper.rb +58 -1
  204. data/spec/support/xml_matchers.rb +1 -1
  205. metadata +176 -34
  206. data/spec/support/shared_examples/examples/benchmark_spec.rb +0 -51
  207. /data/spec/{support/shared_examples/builder.rb → integration/shared_examples/high_level/builder_behavior.rb} +0 -0
  208. /data/spec/{support/shared_examples/document_builder.rb → integration/shared_examples/high_level/document_builder_behavior.rb} +0 -0
  209. /data/spec/{support/shared_examples/attribute.rb → integration/shared_examples/node_wrappers/attribute_behavior.rb} +0 -0
  210. /data/spec/{support/shared_examples/element.rb → integration/shared_examples/node_wrappers/element_behavior.rb} +0 -0
  211. /data/spec/{support/shared_examples/namespace.rb → integration/shared_examples/node_wrappers/namespace_behavior.rb} +0 -0
  212. /data/spec/{support/shared_examples/text.rb → integration/shared_examples/node_wrappers/text_behavior.rb} +0 -0
@@ -0,0 +1,194 @@
1
+ # RSS Feed Parser Example
2
+
3
+ This example demonstrates how to parse RSS/Atom feeds using Moxml, showcasing XPath queries, namespace handling, and data extraction.
4
+
5
+ ## What This Example Demonstrates
6
+
7
+ - **XML Parsing**: Loading and parsing RSS feed XML
8
+ - **XPath Queries**: Using XPath to extract specific elements
9
+ - **Namespace Handling**: Working with Dublin Core (dc), Content, and Atom namespaces
10
+ - **Element Traversal**: Navigating the document structure
11
+ - **CDATA Sections**: Extracting content from CDATA blocks
12
+ - **Error Handling**: Proper error handling with Moxml exceptions
13
+
14
+ ## Files
15
+
16
+ - `rss_parser.rb` - Main parser implementation
17
+ - `example_feed.xml` - Sample RSS 2.0 feed with multiple articles
18
+ - `README.md` - This file
19
+
20
+ ## Running the Example
21
+
22
+ ### Using the Example Feed
23
+
24
+ ```bash
25
+ ruby examples/rss_parser/rss_parser.rb
26
+ ```
27
+
28
+ ### Using Your Own Feed
29
+
30
+ ```bash
31
+ ruby examples/rss_parser/rss_parser.rb path/to/your/feed.xml
32
+ ```
33
+
34
+ ## Expected Output
35
+
36
+ ```
37
+ Parsing RSS feed: examples/rss_parser/example_feed.xml
38
+ ================================================================================
39
+
40
+ Feed: Tech News Daily
41
+ URL: https://technews.example.com
42
+ Description: Your daily dose of technology news
43
+
44
+ Articles:
45
+
46
+ ================================================================================
47
+ Title: Ruby 3.4 Released with Performance Improvements
48
+ Link: https://technews.example.com/ruby-3-4-released
49
+ Author: Jane Smith
50
+ Published: Wed, 30 Oct 2024 09:00:00 GMT
51
+ Categories: Programming, Ruby
52
+ --------------------------------------------------------------------------------
53
+ Description: Ruby 3.4 brings significant performance improvements and new features
54
+
55
+ Full Content:
56
+ <p>The Ruby core team has announced the release of Ruby 3.4, featuring:</p>
57
+ <ul>
58
+ <li>30% faster execution for common patterns</li>
59
+ <li>Improved memory management</li>
60
+ <li>New standard library additions</li>
61
+ </ul>
62
+ ================================================================================
63
+
64
+ [Additional articles...]
65
+
66
+ Summary:
67
+ Total articles: 4
68
+ Authors: Jane Smith, John Doe, Alice Johnson, Bob Williams
69
+ Categories: Programming (2), Ruby (1), XML (2), API (1), Design (1), XPath (1)
70
+ ```
71
+
72
+ ## Key Concepts
73
+
74
+ ### XPath Queries
75
+
76
+ The example uses various XPath patterns:
77
+
78
+ ```ruby
79
+ # Simple path - get channel title
80
+ doc.xpath('//channel/title')
81
+
82
+ # Namespaced element - get Dublin Core creator
83
+ item.xpath('./dc:creator', 'dc' => 'http://purl.org/dc/elements/1.1/')
84
+
85
+ # Multiple results - get all categories
86
+ item.xpath('./category')
87
+ ```
88
+
89
+ ### Namespace Handling
90
+
91
+ RSS feeds often use multiple namespaces:
92
+
93
+ ```ruby
94
+ namespaces = {
95
+ 'dc' => 'http://purl.org/dc/elements/1.1/', # Dublin Core
96
+ 'content' => 'http://purl.org/rss/1.0/modules/content/', # Content
97
+ 'atom' => 'http://www.w3.org/2005/Atom' # Atom
98
+ }
99
+
100
+ # Query with namespace
101
+ author = item.at_xpath('./dc:creator', namespaces)
102
+ ```
103
+
104
+ ### CDATA Content
105
+
106
+ Extract HTML/XML content preserved in CDATA sections:
107
+
108
+ ```ruby
109
+ content_node = item.at_xpath('./content:encoded', namespaces)
110
+ content = content_node&.text&.strip
111
+ ```
112
+
113
+ ### Error Handling
114
+
115
+ Handle parse errors explicitly (the full script also rescues `Moxml::XPathError` around its `//item` query):
116
+
117
+ ```ruby
118
+ begin
119
+ doc = @moxml.parse(xml_content)
120
+ rescue Moxml::ParseError => e
121
+ puts "Failed to parse RSS feed: #{e.message}"
122
+ exit 1
123
+ end
124
+ ```
125
+
126
+ ## Code Structure
127
+
128
+ ### Article Class
129
+
130
+ Represents a single RSS article with:
131
+ - Title, link, description
132
+ - Full content (from CDATA)
133
+ - Author (from dc:creator)
134
+ - Publication date
135
+ - Categories
136
+
137
+ ### RSSParser Class
138
+
139
+ Main parser with methods:
140
+ - `parse` - Parse the feed and return Article objects
141
+ - `parse_item` - Extract data from a single RSS item
142
+ - `extract_text` - Helper for safe text extraction
143
+
144
+ ## Customization
145
+
146
+ ### Adding More Fields
147
+
148
+ To extract additional RSS fields, add to `parse_item`:
149
+
150
+ ```ruby
151
+ # Extract guid
152
+ guid = extract_text(item, './guid')
153
+
154
+ # Extract enclosure (podcast, etc.)
155
+ enclosure = item.at_xpath('./enclosure')
156
+ if enclosure
157
+ url = enclosure['url']
158
+ type = enclosure['type']
159
+ length = enclosure['length']
160
+ end
161
+ ```
162
+
163
+ ### Supporting Atom Feeds
164
+
165
+ Modify the parser to support Atom feed format:
166
+
167
+ ```ruby
168
+ # Atom uses different element names
169
+ if doc.xpath('//feed').any? # Atom feed
170
+ items = doc.xpath('//entry')
171
+ # Extract with Atom element names: entry, id, summary, etc.
172
+ end
173
+ ```
174
+
175
+ ## Learning Points
176
+
177
+ 1. **XPath is powerful**: One query can extract multiple elements
178
+ 2. **Namespaces are important**: Many RSS extensions use namespaces
179
+ 3. **CDATA preserves markup**: Use for HTML/XML content within RSS
180
+ 4. **Safe navigation**: Use the `&.` operator and nil checks to handle missing elements
181
+ 5. **Error handling matters**: Always handle parse and query errors
182
+
183
+ ## Next Steps
184
+
185
+ - Try parsing different RSS feeds from the web
186
+ - Add support for podcast feeds (enclosures)
187
+ - Implement feed validation
188
+ - Create an RSS feed aggregator
189
+ - Export articles to different formats (JSON, Markdown, etc.)
190
+
191
+ ## Related Examples
192
+
193
+ - [Web Scraper](../web_scraper/) - Similar DOM navigation techniques
194
+ - [API Client](../api_client/) - XML generation and parsing
@@ -0,0 +1,93 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <rss version="2.0"
3
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
4
+ xmlns:content="http://purl.org/rss/1.0/modules/content/"
5
+ xmlns:atom="http://www.w3.org/2005/Atom">
6
+ <channel>
7
+ <title>Tech News Daily</title>
8
+ <link>https://technews.example.com</link>
9
+ <description>Your daily dose of technology news</description>
10
+ <language>en-us</language>
11
+ <pubDate>Wed, 30 Oct 2024 10:00:00 GMT</pubDate>
12
+ <lastBuildDate>Wed, 30 Oct 2024 10:00:00 GMT</lastBuildDate>
13
+ <atom:link href="https://technews.example.com/rss" rel="self" type="application/rss+xml"/>
14
+
15
+ <item>
16
+ <title>Ruby 3.4 Released with Performance Improvements</title>
17
+ <link>https://technews.example.com/ruby-3-4-released</link>
18
+ <guid isPermaLink="true">https://technews.example.com/ruby-3-4-released</guid>
19
+ <description>Ruby 3.4 brings significant performance improvements and new features</description>
20
+ <content:encoded><![CDATA[
21
+ <p>The Ruby core team has announced the release of Ruby 3.4, featuring:</p>
22
+ <ul>
23
+ <li>30% faster execution for common patterns</li>
24
+ <li>Improved memory management</li>
25
+ <li>New standard library additions</li>
26
+ </ul>
27
+ ]]></content:encoded>
28
+ <dc:creator>Jane Smith</dc:creator>
29
+ <pubDate>Wed, 30 Oct 2024 09:00:00 GMT</pubDate>
30
+ <category>Programming</category>
31
+ <category>Ruby</category>
32
+ </item>
33
+
34
+ <item>
35
+ <title>XML Processing in Modern Applications</title>
36
+ <link>https://technews.example.com/xml-processing</link>
37
+ <guid isPermaLink="true">https://technews.example.com/xml-processing</guid>
38
+ <description>A comprehensive guide to XML processing in modern applications</description>
39
+ <content:encoded><![CDATA[
40
+ <p>Despite the rise of JSON, XML remains crucial for many applications:</p>
41
+ <ul>
42
+ <li>Document-oriented data structures</li>
43
+ <li>Complex namespace requirements</li>
44
+ <li>Legacy system integration</li>
45
+ </ul>
46
+ ]]></content:encoded>
47
+ <dc:creator>John Doe</dc:creator>
48
+ <pubDate>Wed, 30 Oct 2024 08:00:00 GMT</pubDate>
49
+ <category>Programming</category>
50
+ <category>XML</category>
51
+ </item>
52
+
53
+ <item>
54
+ <title>Best Practices for API Design</title>
55
+ <link>https://technews.example.com/api-design</link>
56
+ <guid isPermaLink="true">https://technews.example.com/api-design</guid>
57
+ <description>Learn the essential principles of modern API design</description>
58
+ <content:encoded><![CDATA[
59
+ <p>Key principles for designing robust APIs:</p>
60
+ <ol>
61
+ <li>Clear and consistent naming conventions</li>
62
+ <li>Proper error handling and messaging</li>
63
+ <li>Comprehensive documentation</li>
64
+ <li>Versioning strategy</li>
65
+ </ol>
66
+ ]]></content:encoded>
67
+ <dc:creator>Alice Johnson</dc:creator>
68
+ <pubDate>Tue, 29 Oct 2024 14:00:00 GMT</pubDate>
69
+ <category>API</category>
70
+ <category>Design</category>
71
+ </item>
72
+
73
+ <item>
74
+ <title>Introduction to XPath Queries</title>
75
+ <link>https://technews.example.com/xpath-intro</link>
76
+ <guid isPermaLink="true">https://technews.example.com/xpath-intro</guid>
77
+ <description>Master XPath for efficient XML data extraction</description>
78
+ <content:encoded><![CDATA[
79
+ <p>XPath is a powerful language for navigating XML documents:</p>
80
+ <ul>
81
+ <li>Select elements by path</li>
82
+ <li>Filter with predicates</li>
83
+ <li>Work with namespaces</li>
84
+ <li>Use built-in functions</li>
85
+ </ul>
86
+ ]]></content:encoded>
87
+ <dc:creator>Bob Williams</dc:creator>
88
+ <pubDate>Tue, 29 Oct 2024 10:00:00 GMT</pubDate>
89
+ <category>XML</category>
90
+ <category>XPath</category>
91
+ </item>
92
+ </channel>
93
+ </rss>
@@ -0,0 +1,189 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # RSS Feed Parser Example
5
+ # This example demonstrates how to use Moxml to parse RSS feeds with:
6
+ # - XPath queries for data extraction
7
+ # - Namespace handling (dc, content, atom)
8
+ # - Element traversal and attribute access
9
+ # - Error handling best practices
10
+
11
+ # Load moxml from the local source (use 'require "moxml"' in production)
12
+ require_relative "../../lib/moxml"
13
+
14
# Article class to represent a parsed RSS item.
#
# Required fields (title, link, description) are stored exactly as given.
# Optional fields (content, author, pub_date) are normalized on construction:
# nil, empty, and whitespace-only values are all stored as nil. This keeps the
# "only print when present" checks in #to_s meaningful even when a caller
# passes "" for an element that was missing from the feed.
class Article
  attr_reader :title, :link, :description, :content, :author, :pub_date,
              :categories

  # @param title [String] item title
  # @param link [String] item URL
  # @param description [String] short summary
  # @param content [String, nil] full content; blank values become nil
  # @param author [String, nil] author name; blank values become nil
  # @param pub_date [String, nil] publication date; blank values become nil
  # @param categories [Array<String>] category names
  def initialize(title:, link:, description:, content: nil, author: nil,
                 pub_date: nil, categories: [])
    @title = title
    @link = link
    @description = description
    @content = presence(content)
    @author = presence(author)
    @pub_date = presence(pub_date)
    @categories = categories
  end

  # Render the article as a multi-line, human-readable block.
  # Optional fields are omitted entirely when they are absent.
  #
  # @return [String]
  def to_s
    output = []
    output << ("=" * 80)
    output << "Title: #{@title}"
    output << "Link: #{@link}"
    output << "Author: #{@author}" if @author
    output << "Published: #{@pub_date}" if @pub_date
    output << "Categories: #{@categories.join(', ')}" unless @categories.empty?
    output << ("-" * 80)
    output << "Description: #{@description}"
    output << ""
    output << "Full Content:" if @content
    output << @content if @content
    output << ("=" * 80)
    output.join("\n")
  end

  private

  # Return +value+ unchanged unless it is nil or blank (empty or
  # whitespace-only once converted to a string), in which case return nil.
  def presence(value)
    return nil if value.nil?

    value.to_s.strip.empty? ? nil : value
  end
end
47
+
48
# RSSParser class encapsulates RSS feed parsing logic.
#
# Reads the feed file given at construction time, prints the channel metadata
# to stdout, and converts every <item> element into an Article.
class RSSParser
  # Initialize with the path to an RSS feed file
  def initialize(feed_path)
    @feed_path = feed_path
    @moxml = Moxml.new
  end

  # Parse the RSS feed and return an array of Article objects.
  #
  # Side effects: prints the feed title, URL and description. If parsing
  # raises Moxml::ParseError, or the item query raises Moxml::XPathError, a
  # message is printed and the process exits with status 1.
  def parse
    # Read and parse the XML file
    xml_content = File.read(@feed_path)

    # Parse with error handling
    doc = begin
      @moxml.parse(xml_content)
    rescue Moxml::ParseError => e
      puts "Failed to parse RSS feed: #{e.message}"
      puts "Hint: #{e.hint}" if e.respond_to?(:hint)
      exit 1
    end

    # Define namespace prefixes for XPath queries
    # RSS feeds often use Dublin Core (dc) and Content (content) namespaces
    namespaces = {
      "dc" => "http://purl.org/dc/elements/1.1/",
      "content" => "http://purl.org/rss/1.0/modules/content/",
      "atom" => "http://www.w3.org/2005/Atom",
    }

    # Extract feed metadata using XPath
    feed_title = extract_text(doc, "//channel/title")
    feed_link = extract_text(doc, "//channel/link")
    feed_description = extract_text(doc, "//channel/description")

    puts "Feed: #{feed_title}"
    puts "URL: #{feed_link}"
    puts "Description: #{feed_description}"
    puts "\nArticles:\n\n"

    # Find all item elements using XPath
    # The double slash (//) searches at any depth in the document
    items = begin
      doc.xpath("//item")
    rescue Moxml::XPathError => e
      puts "XPath query failed: #{e.message}"
      puts "Expression: #{e.expression}" if e.respond_to?(:expression)
      exit 1
    end

    # Parse each item into an Article object
    items.map { |item| parse_item(item, namespaces) }
  end

  private

  # Parse a single RSS item element into an Article.
  #
  # Required fields fall back to "" when the element is missing. Optional
  # fields (pub_date, author, content) are passed as nil when the element is
  # missing or empty, so Article can tell "absent" apart from "present".
  def parse_item(item, namespaces)
    # Extract basic RSS fields (first matching element, "" if not found)
    title = extract_text(item, "./title")
    link = extract_text(item, "./link")
    description = extract_text(item, "./description")
    pub_date = optional_text(item, "./pubDate")

    # Extract namespaced elements
    # The dc:creator element uses the Dublin Core namespace
    author = optional_text(item, "./dc:creator", namespaces)

    # Extract CDATA content from the content:encoded element
    # CDATA sections preserve HTML/XML markup without parsing it
    content = optional_text(item, "./content:encoded", namespaces)

    # Extract all category elements
    # xpath returns a collection we can map over
    categories = item.xpath("./category").map(&:text)

    # Create and return Article object
    Article.new(
      title: title,
      link: link,
      description: description,
      content: content,
      author: author,
      pub_date: pub_date,
      categories: categories,
    )
  end

  # Helper method to extract text content from an XPath query
  # Returns the stripped text of the first match, or an empty string if the
  # element is not found
  def extract_text(node, xpath, namespaces = {})
    element = node.at_xpath(xpath, namespaces)
    element&.text&.strip || ""
  end

  # Like extract_text, but returns nil instead of "" when the element is
  # missing or has no text, for fields that are genuinely optional.
  def optional_text(node, xpath, namespaces = {})
    text = extract_text(node, xpath, namespaces)
    text.empty? ? nil : text
  end
end
148
+
149
# Main execution: runs only when this file is invoked directly as a script.
if __FILE__ == $PROGRAM_NAME
  # Get the feed path (use example feed by default)
  feed_path = ARGV[0] || File.join(__dir__, "example_feed.xml")

  # Check if file exists
  unless File.exist?(feed_path)
    puts "Error: Feed file not found: #{feed_path}"
    puts "Usage: ruby rss_parser.rb [path/to/feed.xml]"
    exit 1
  end

  puts "Parsing RSS feed: #{feed_path}"
  puts "=" * 80
  puts

  # Parse the feed
  articles = RSSParser.new(feed_path).parse

  # Display each article
  articles.each_with_index do |article, index|
    puts "\n#{index + 1}. #{article}\n"
  end

  # Summary statistics
  puts "\n#{'=' * 80}"
  puts "Summary:"
  puts "Total articles: #{articles.length}"

  # Items without an author may report nil or an empty string; drop both so
  # the list does not contain blank entries.
  authors = articles.filter_map(&:author).reject(&:empty?).uniq
  puts "Authors: #{authors.join(', ')}"

  # Count categories (tally keeps first-seen order)
  category_summary = articles.flat_map(&:categories).tally.map do |cat, count|
    "#{cat} (#{count})"
  end
  puts "Categories: #{category_summary.join(', ')}"
  puts "=" * 80
end
@@ -0,0 +1,50 @@
1
+ # SAX Parsing Examples
2
+
3
+ This directory contains practical examples demonstrating Moxml's SAX (Simple API for XML) parsing capabilities.
4
+
5
+ ## Files
6
+
7
+ - `example.xml` - Sample XML file with book data
8
+ - `simple_parser.rb` - Basic SAX parsing with both class and block handlers
9
+ - `data_extractor.rb` - Extract specific data using ElementHandler
10
+ - `large_file.rb` - Memory-efficient streaming processor
11
+
12
+ ## Running Examples
13
+
14
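+ Make sure moxml and nokogiri are installed (`data_extractor.rb` and `large_file.rb` use the Nokogiri adapter and load gems through `bundler/setup`):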
15
+
16
+ ```bash
17
+ gem install moxml
18
+ ```
19
+
20
+ Then run any example:
21
+
22
+ ```bash
23
+ ruby simple_parser.rb
24
+ ruby data_extractor.rb
25
+ ruby large_file.rb
26
+ ```
27
+
28
+ ## What Each Example Demonstrates
29
+
30
+ ### simple_parser.rb
31
+ - Basic handler creation
32
+ - Using both class-based and block-based handlers
33
+ - Handling different event types
34
+ - Comparing the two approaches
35
+
36
+ ### data_extractor.rb
37
+ - Using ElementHandler for context-aware parsing
38
+ - Path matching with regex
39
+ - Extracting structured data
40
+ - Accumulating text across multiple character events
41
+
42
+ ### large_file.rb
43
+ - Memory-efficient streaming
44
+ - Processing records without loading entire document
45
+ - Immediate output to avoid memory accumulation
46
+ - Best practices for large file handling
47
+
48
+ ## Learn More
49
+
50
+ See the comprehensive SAX Parsing Guide in `docs/_guides/sax-parsing.adoc` for detailed documentation, patterns, and best practices.
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "moxml"
6
+
7
+ xml = File.read(File.join(__dir__, "example.xml"))
8
+
9
# SAX handler that builds one hash per <book> element.
#
# Each hash holds the book's "id" and "category" attributes plus the stripped
# text of its title, author, price and isbn children (price goes through
# to_f). Progress messages are printed to stdout as elements are seen.
class BookExtractor < Moxml::SAX::ElementHandler
  attr_reader :books

  def initialize
    super
    @books = []
    @current_book = nil
    @current_field = nil
    @current_text = "".dup
  end

  # Start tracking a new book, or start capturing text for a known field.
  def on_start_element(name, attributes = {}, namespaces = {})
    super # Important: updates element stack

    if name == "book"
      @current_book = {
        id: attributes["id"],
        category: attributes["category"],
      }
      puts "Found book with ID: #{attributes['id']}"
    elsif %w[title author price isbn].include?(name)
      @current_field = name
      @current_text = "".dup
    end
  end

  # Append character data to the buffer while a field is being captured.
  # The parser may deliver one element's text in several chunks.
  def on_characters(text)
    return unless @current_field

    @current_text << text
  end

  # Store a finished field on the current book and, when the book itself
  # closes, move it into the results list.
  def on_end_element(name)
    if @current_book && name == @current_field
      stripped = @current_text.strip
      value = name == "price" ? stripped.to_f : stripped
      @current_book[name.to_sym] = value
      puts " #{name.capitalize}: #{value}"
      @current_field = nil
    end

    if @current_book && name == "book"
      @books << @current_book
      puts " Complete book added\n\n"
      @current_book = nil
    end

    super # Important: updates element stack
  end
end
61
+
62
# Driver: run the extractor over the sample XML and print a summary.
puts "=== SAX Data Extraction Example ==="
puts

moxml = Moxml.new(:nokogiri)
extractor = BookExtractor.new
moxml.sax_parse(xml, extractor)

puts "=== Summary ==="
puts "Total books extracted: #{extractor.books.size}"
puts
puts "Programming books:"
extractor.books.each do |book|
  # Only list books in the "programming" category
  next unless book[:category] == "programming"

  puts " - #{book[:title]} by #{book[:author]} ($#{book[:price]})"
end
@@ -0,0 +1,21 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <library>
3
+ <book id="1" category="programming">
4
+ <title>Ruby Programming</title>
5
+ <author>Jane Smith</author>
6
+ <price>29.99</price>
7
+ <isbn>978-0-123456-78-9</isbn>
8
+ </book>
9
+ <book id="2" category="fiction">
10
+ <title>The Great Novel</title>
11
+ <author>John Doe</author>
12
+ <price>19.99</price>
13
+ <isbn>978-0-987654-32-1</isbn>
14
+ </book>
15
+ <book id="3" category="programming">
16
+ <title>Advanced Ruby</title>
17
+ <author>Jane Smith</author>
18
+ <price>39.99</price>
19
+ <isbn>978-0-111111-11-1</isbn>
20
+ </book>
21
+ </library>
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "moxml"
6
+
7
+ xml = File.read(File.join(__dir__, "example.xml"))
8
+
9
# Streaming SAX handler that writes each <book> record as soon as its closing
# tag is seen.
#
# Only the record currently being read is held; finished records are written
# to the output object and then dropped rather than collected.
class StreamProcessor < Moxml::SAX::Handler
  # @param output [#puts] destination for the formatted records
  def initialize(output = $stdout)
    super()
    @output = output
    @current_record = nil
    @current_field = nil
    @text_buffer = "".dup
    @record_count = 0
  end

  # Open a new record on <book>, or begin buffering text for a known field.
  def on_start_element(name, attributes = {}, _namespaces = {})
    if name == "book"
      @current_record = {
        id: attributes["id"],
        category: attributes["category"],
      }
    elsif %w[title author price isbn].include?(name)
      @current_field = name
      @text_buffer = "".dup
    end
  end

  # Buffer character data while a field is open.
  def on_characters(text)
    return unless @current_field

    @text_buffer << text
  end

  # Save a finished field on the record; when the record closes, emit it and
  # discard it.
  def on_end_element(name)
    # Capture field value
    if @current_record && name == @current_field
      stripped = @text_buffer.strip
      @current_record[name.to_sym] = name == "price" ? stripped.to_f : stripped
      @current_field = nil
    end

    # Process complete record immediately
    return unless name == "book" && @current_record

    process_record(@current_record)
    @current_record = nil # Drop the reference to the finished record
    @text_buffer = "".dup # Reset for next record
  end

  private

  # Write one finished record to the output and bump the running count.
  def process_record(book)
    @record_count += 1
    @output.puts "Record #{@record_count}: #{book[:title]} by #{book[:author]}"
    @output.puts " Category: #{book[:category]}, Price: $#{book[:price]}"
    @output.puts
  end
end
65
+
66
# Driver: stream the sample XML through StreamProcessor, then print the
# closing notes.
puts "=== Memory-Efficient Streaming Example ==="
puts "Processing records as they're encountered..."
puts

moxml = Moxml.new(:nokogiri)
stream_handler = StreamProcessor.new
moxml.sax_parse(xml, stream_handler)

puts "=== Benefits of This Approach ==="
[
  "✓ Constant memory usage - O(1) regardless of file size",
  "✓ Immediate processing/output - no waiting for full parse",
  "✓ Handles files of any size - tested with gigabyte+ files",
  "✓ Perfect for streaming data or ETL pipelines",
].each { |benefit| puts benefit }