moxml 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/dependent-repos.json +5 -0
- data/.github/workflows/dependent-tests.yml +20 -0
- data/.github/workflows/docs.yml +59 -0
- data/.github/workflows/rake.yml +12 -4
- data/.github/workflows/release.yml +5 -3
- data/.gitignore +37 -0
- data/.rubocop.yml +15 -7
- data/.rubocop_todo.yml +238 -40
- data/Gemfile +14 -9
- data/LICENSE.md +6 -2
- data/README.adoc +535 -373
- data/Rakefile +53 -0
- data/benchmarks/.gitignore +6 -0
- data/benchmarks/generate_report.rb +550 -0
- data/docs/Gemfile +13 -0
- data/docs/_config.yml +138 -0
- data/docs/_guides/advanced-features.adoc +87 -0
- data/docs/_guides/development-testing.adoc +165 -0
- data/docs/_guides/index.adoc +45 -0
- data/docs/_guides/modifying-xml.adoc +293 -0
- data/docs/_guides/parsing-xml.adoc +231 -0
- data/docs/_guides/sax-parsing.adoc +603 -0
- data/docs/_guides/working-with-documents.adoc +118 -0
- data/docs/_pages/adapter-compatibility.adoc +369 -0
- data/docs/_pages/adapters/headed-ox.adoc +237 -0
- data/docs/_pages/adapters/index.adoc +98 -0
- data/docs/_pages/adapters/libxml.adoc +286 -0
- data/docs/_pages/adapters/nokogiri.adoc +252 -0
- data/docs/_pages/adapters/oga.adoc +292 -0
- data/docs/_pages/adapters/ox.adoc +55 -0
- data/docs/_pages/adapters/rexml.adoc +293 -0
- data/docs/_pages/best-practices.adoc +430 -0
- data/docs/_pages/compatibility.adoc +468 -0
- data/docs/_pages/configuration.adoc +251 -0
- data/docs/_pages/error-handling.adoc +350 -0
- data/docs/_pages/headed-ox-limitations.adoc +558 -0
- data/docs/_pages/headed-ox.adoc +1025 -0
- data/docs/_pages/index.adoc +35 -0
- data/docs/_pages/installation.adoc +141 -0
- data/docs/_pages/node-api-reference.adoc +50 -0
- data/docs/_pages/performance.adoc +36 -0
- data/docs/_pages/quick-start.adoc +244 -0
- data/docs/_pages/thread-safety.adoc +29 -0
- data/docs/_references/document-api.adoc +408 -0
- data/docs/_references/index.adoc +48 -0
- data/docs/_tutorials/basic-usage.adoc +268 -0
- data/docs/_tutorials/builder-pattern.adoc +343 -0
- data/docs/_tutorials/index.adoc +33 -0
- data/docs/_tutorials/namespace-handling.adoc +325 -0
- data/docs/_tutorials/xpath-queries.adoc +359 -0
- data/docs/index.adoc +122 -0
- data/examples/README.md +124 -0
- data/examples/api_client/README.md +424 -0
- data/examples/api_client/api_client.rb +394 -0
- data/examples/api_client/example_response.xml +48 -0
- data/examples/headed_ox_example/README.md +90 -0
- data/examples/headed_ox_example/headed_ox_demo.rb +71 -0
- data/examples/rss_parser/README.md +194 -0
- data/examples/rss_parser/example_feed.xml +93 -0
- data/examples/rss_parser/rss_parser.rb +189 -0
- data/examples/sax_parsing/README.md +50 -0
- data/examples/sax_parsing/data_extractor.rb +75 -0
- data/examples/sax_parsing/example.xml +21 -0
- data/examples/sax_parsing/large_file.rb +78 -0
- data/examples/sax_parsing/simple_parser.rb +55 -0
- data/examples/web_scraper/README.md +352 -0
- data/examples/web_scraper/example_page.html +201 -0
- data/examples/web_scraper/web_scraper.rb +312 -0
- data/lib/moxml/adapter/base.rb +107 -28
- data/lib/moxml/adapter/customized_libxml/cdata.rb +28 -0
- data/lib/moxml/adapter/customized_libxml/comment.rb +24 -0
- data/lib/moxml/adapter/customized_libxml/declaration.rb +85 -0
- data/lib/moxml/adapter/customized_libxml/element.rb +39 -0
- data/lib/moxml/adapter/customized_libxml/node.rb +44 -0
- data/lib/moxml/adapter/customized_libxml/processing_instruction.rb +31 -0
- data/lib/moxml/adapter/customized_libxml/text.rb +27 -0
- data/lib/moxml/adapter/customized_oga/xml_generator.rb +1 -1
- data/lib/moxml/adapter/customized_ox/attribute.rb +28 -3
- data/lib/moxml/adapter/customized_ox/namespace.rb +0 -2
- data/lib/moxml/adapter/customized_ox/text.rb +0 -2
- data/lib/moxml/adapter/customized_rexml/formatter.rb +11 -6
- data/lib/moxml/adapter/headed_ox.rb +161 -0
- data/lib/moxml/adapter/libxml.rb +1548 -0
- data/lib/moxml/adapter/nokogiri.rb +121 -9
- data/lib/moxml/adapter/oga.rb +123 -12
- data/lib/moxml/adapter/ox.rb +283 -27
- data/lib/moxml/adapter/rexml.rb +127 -20
- data/lib/moxml/adapter.rb +21 -4
- data/lib/moxml/attribute.rb +6 -0
- data/lib/moxml/builder.rb +40 -4
- data/lib/moxml/config.rb +8 -3
- data/lib/moxml/context.rb +39 -1
- data/lib/moxml/doctype.rb +13 -1
- data/lib/moxml/document.rb +39 -6
- data/lib/moxml/document_builder.rb +27 -5
- data/lib/moxml/element.rb +71 -2
- data/lib/moxml/error.rb +175 -6
- data/lib/moxml/node.rb +94 -3
- data/lib/moxml/node_set.rb +34 -0
- data/lib/moxml/sax/block_handler.rb +194 -0
- data/lib/moxml/sax/element_handler.rb +124 -0
- data/lib/moxml/sax/handler.rb +113 -0
- data/lib/moxml/sax.rb +31 -0
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils/encoder.rb +4 -4
- data/lib/moxml/xml_utils.rb +7 -4
- data/lib/moxml/xpath/ast/node.rb +159 -0
- data/lib/moxml/xpath/cache.rb +91 -0
- data/lib/moxml/xpath/compiler.rb +1768 -0
- data/lib/moxml/xpath/context.rb +26 -0
- data/lib/moxml/xpath/conversion.rb +124 -0
- data/lib/moxml/xpath/engine.rb +52 -0
- data/lib/moxml/xpath/errors.rb +101 -0
- data/lib/moxml/xpath/lexer.rb +304 -0
- data/lib/moxml/xpath/parser.rb +485 -0
- data/lib/moxml/xpath/ruby/generator.rb +269 -0
- data/lib/moxml/xpath/ruby/node.rb +193 -0
- data/lib/moxml/xpath.rb +37 -0
- data/lib/moxml.rb +5 -2
- data/moxml.gemspec +3 -1
- data/old-specs/moxml/adapter/customized_libxml/.gitkeep +6 -0
- data/spec/consistency/README.md +77 -0
- data/spec/{moxml/examples/adapter_spec.rb → consistency/adapter_parity_spec.rb} +4 -4
- data/spec/examples/README.md +75 -0
- data/spec/{support/shared_examples/examples/attribute.rb → examples/attribute_examples_spec.rb} +1 -1
- data/spec/{support/shared_examples/examples/basic_usage.rb → examples/basic_usage_spec.rb} +2 -2
- data/spec/{support/shared_examples/examples/namespace.rb → examples/namespace_examples_spec.rb} +3 -3
- data/spec/{support/shared_examples/examples/readme_examples.rb → examples/readme_examples_spec.rb} +6 -4
- data/spec/{support/shared_examples/examples/xpath.rb → examples/xpath_examples_spec.rb} +10 -6
- data/spec/integration/README.md +71 -0
- data/spec/{moxml/all_with_adapters_spec.rb → integration/all_adapters_spec.rb} +3 -2
- data/spec/integration/headed_ox_integration_spec.rb +326 -0
- data/spec/{support → integration}/shared_examples/edge_cases.rb +37 -10
- data/spec/integration/shared_examples/high_level/.gitkeep +0 -0
- data/spec/{support/shared_examples/context.rb → integration/shared_examples/high_level/context_behavior.rb} +2 -1
- data/spec/{support/shared_examples/integration.rb → integration/shared_examples/integration_workflows.rb} +23 -6
- data/spec/integration/shared_examples/node_wrappers/.gitkeep +0 -0
- data/spec/{support/shared_examples/cdata.rb → integration/shared_examples/node_wrappers/cdata_behavior.rb} +6 -1
- data/spec/{support/shared_examples/comment.rb → integration/shared_examples/node_wrappers/comment_behavior.rb} +2 -1
- data/spec/{support/shared_examples/declaration.rb → integration/shared_examples/node_wrappers/declaration_behavior.rb} +5 -2
- data/spec/{support/shared_examples/doctype.rb → integration/shared_examples/node_wrappers/doctype_behavior.rb} +2 -2
- data/spec/{support/shared_examples/document.rb → integration/shared_examples/node_wrappers/document_behavior.rb} +1 -1
- data/spec/{support/shared_examples/node.rb → integration/shared_examples/node_wrappers/node_behavior.rb} +9 -2
- data/spec/{support/shared_examples/node_set.rb → integration/shared_examples/node_wrappers/node_set_behavior.rb} +1 -18
- data/spec/{support/shared_examples/processing_instruction.rb → integration/shared_examples/node_wrappers/processing_instruction_behavior.rb} +6 -2
- data/spec/moxml/README.md +41 -0
- data/spec/moxml/adapter/.gitkeep +0 -0
- data/spec/moxml/adapter/README.md +61 -0
- data/spec/moxml/adapter/base_spec.rb +27 -0
- data/spec/moxml/adapter/headed_ox_spec.rb +311 -0
- data/spec/moxml/adapter/libxml_spec.rb +14 -0
- data/spec/moxml/adapter/ox_spec.rb +9 -8
- data/spec/moxml/adapter/shared_examples/.gitkeep +0 -0
- data/spec/{support/shared_examples/xml_adapter.rb → moxml/adapter/shared_examples/adapter_contract.rb} +39 -12
- data/spec/moxml/adapter_spec.rb +16 -0
- data/spec/moxml/attribute_spec.rb +30 -0
- data/spec/moxml/builder_spec.rb +33 -0
- data/spec/moxml/cdata_spec.rb +31 -0
- data/spec/moxml/comment_spec.rb +31 -0
- data/spec/moxml/config_spec.rb +3 -3
- data/spec/moxml/context_spec.rb +28 -0
- data/spec/moxml/declaration_spec.rb +36 -0
- data/spec/moxml/doctype_spec.rb +33 -0
- data/spec/moxml/document_builder_spec.rb +30 -0
- data/spec/moxml/document_spec.rb +105 -0
- data/spec/moxml/element_spec.rb +143 -0
- data/spec/moxml/error_spec.rb +266 -22
- data/spec/{moxml_spec.rb → moxml/moxml_spec.rb} +9 -9
- data/spec/moxml/namespace_spec.rb +32 -0
- data/spec/moxml/node_set_spec.rb +39 -0
- data/spec/moxml/node_spec.rb +37 -0
- data/spec/moxml/processing_instruction_spec.rb +34 -0
- data/spec/moxml/sax_spec.rb +1067 -0
- data/spec/moxml/text_spec.rb +31 -0
- data/spec/moxml/version_spec.rb +14 -0
- data/spec/moxml/xml_utils/.gitkeep +0 -0
- data/spec/moxml/xml_utils/encoder_spec.rb +27 -0
- data/spec/moxml/xml_utils_spec.rb +49 -0
- data/spec/moxml/xpath/ast/node_spec.rb +83 -0
- data/spec/moxml/xpath/axes_spec.rb +296 -0
- data/spec/moxml/xpath/cache_spec.rb +358 -0
- data/spec/moxml/xpath/compiler_spec.rb +406 -0
- data/spec/moxml/xpath/context_spec.rb +210 -0
- data/spec/moxml/xpath/conversion_spec.rb +365 -0
- data/spec/moxml/xpath/fixtures/sample.xml +25 -0
- data/spec/moxml/xpath/functions/boolean_functions_spec.rb +114 -0
- data/spec/moxml/xpath/functions/node_functions_spec.rb +145 -0
- data/spec/moxml/xpath/functions/numeric_functions_spec.rb +164 -0
- data/spec/moxml/xpath/functions/position_functions_spec.rb +93 -0
- data/spec/moxml/xpath/functions/special_functions_spec.rb +89 -0
- data/spec/moxml/xpath/functions/string_functions_spec.rb +381 -0
- data/spec/moxml/xpath/lexer_spec.rb +488 -0
- data/spec/moxml/xpath/parser_integration_spec.rb +210 -0
- data/spec/moxml/xpath/parser_spec.rb +364 -0
- data/spec/moxml/xpath/ruby/generator_spec.rb +421 -0
- data/spec/moxml/xpath/ruby/node_spec.rb +291 -0
- data/spec/moxml/xpath_capabilities_spec.rb +199 -0
- data/spec/moxml/xpath_spec.rb +77 -0
- data/spec/performance/README.md +83 -0
- data/spec/performance/benchmark_spec.rb +64 -0
- data/spec/{support/shared_examples/examples/memory.rb → performance/memory_usage_spec.rb} +3 -1
- data/spec/{support/shared_examples/examples/thread_safety.rb → performance/thread_safety_spec.rb} +3 -1
- data/spec/performance/xpath_benchmark_spec.rb +259 -0
- data/spec/spec_helper.rb +58 -1
- data/spec/support/xml_matchers.rb +1 -1
- metadata +176 -35
- data/lib/ox/node.rb +0 -9
- data/spec/support/shared_examples/examples/benchmark_spec.rb +0 -51
- /data/spec/{support/shared_examples/builder.rb → integration/shared_examples/high_level/builder_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/document_builder.rb → integration/shared_examples/high_level/document_builder_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/attribute.rb → integration/shared_examples/node_wrappers/attribute_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/element.rb → integration/shared_examples/node_wrappers/element_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/namespace.rb → integration/shared_examples/node_wrappers/namespace_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/text.rb → integration/shared_examples/node_wrappers/text_behavior.rb} +0 -0
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# RSS Feed Parser Example
|
|
2
|
+
|
|
3
|
+
This example demonstrates how to parse RSS 2.0 feeds using Moxml, showcasing XPath queries, namespace handling, and data extraction. (Atom feeds are not handled out of the box; see "Supporting Atom Feeds" below.)
|
|
4
|
+
|
|
5
|
+
## What This Example Demonstrates
|
|
6
|
+
|
|
7
|
+
- **XML Parsing**: Loading and parsing RSS feed XML
|
|
8
|
+
- **XPath Queries**: Using XPath to extract specific elements
|
|
9
|
+
- **Namespace Handling**: Working with Dublin Core (dc), Content, and Atom namespaces
|
|
10
|
+
- **Element Traversal**: Navigating the document structure
|
|
11
|
+
- **CDATA Sections**: Extracting content from CDATA blocks
|
|
12
|
+
- **Error Handling**: Proper error handling with Moxml exceptions
|
|
13
|
+
|
|
14
|
+
## Files
|
|
15
|
+
|
|
16
|
+
- `rss_parser.rb` - Main parser implementation
|
|
17
|
+
- `example_feed.xml` - Sample RSS 2.0 feed with multiple articles
|
|
18
|
+
- `README.md` - This file
|
|
19
|
+
|
|
20
|
+
## Running the Example
|
|
21
|
+
|
|
22
|
+
### Using the Example Feed
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
ruby examples/rss_parser/rss_parser.rb
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Using Your Own Feed
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
ruby examples/rss_parser/rss_parser.rb path/to/your/feed.xml
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Expected Output
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
Parsing RSS feed: examples/rss_parser/example_feed.xml
|
|
38
|
+
================================================================================
|
|
39
|
+
|
|
40
|
+
Feed: Tech News Daily
|
|
41
|
+
URL: https://technews.example.com
|
|
42
|
+
Description: Your daily dose of technology news
|
|
43
|
+
|
|
44
|
+
Articles:
|
|
45
|
+
|
|
46
|
+
================================================================================
|
|
47
|
+
Title: Ruby 3.4 Released with Performance Improvements
|
|
48
|
+
Link: https://technews.example.com/ruby-3-4-released
|
|
49
|
+
Author: Jane Smith
|
|
50
|
+
Published: Wed, 30 Oct 2024 09:00:00 GMT
|
|
51
|
+
Categories: Programming, Ruby
|
|
52
|
+
--------------------------------------------------------------------------------
|
|
53
|
+
Description: Ruby 3.4 brings significant performance improvements and new features
|
|
54
|
+
|
|
55
|
+
Full Content:
|
|
56
|
+
<p>The Ruby core team has announced the release of Ruby 3.4, featuring:</p>
|
|
57
|
+
<ul>
|
|
58
|
+
<li>30% faster execution for common patterns</li>
|
|
59
|
+
<li>Improved memory management</li>
|
|
60
|
+
<li>New standard library additions</li>
|
|
61
|
+
</ul>
|
|
62
|
+
================================================================================
|
|
63
|
+
|
|
64
|
+
[Additional articles...]
|
|
65
|
+
|
|
66
|
+
Summary:
|
|
67
|
+
Total articles: 4
|
|
68
|
+
Authors: Jane Smith, John Doe, Alice Johnson, Bob Williams
|
|
69
|
+
Categories: Programming (2), Ruby (1), XML (2), API (1), Design (1), XPath (1)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Key Concepts
|
|
73
|
+
|
|
74
|
+
### XPath Queries
|
|
75
|
+
|
|
76
|
+
The example uses various XPath patterns:
|
|
77
|
+
|
|
78
|
+
```ruby
|
|
79
|
+
# Simple path - get channel title
|
|
80
|
+
doc.xpath('//channel/title')
|
|
81
|
+
|
|
82
|
+
# Namespaced element - get Dublin Core creator
|
|
83
|
+
item.xpath('./dc:creator', 'dc' => 'http://purl.org/dc/elements/1.1/')
|
|
84
|
+
|
|
85
|
+
# Multiple results - get all categories
|
|
86
|
+
item.xpath('./category')
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Namespace Handling
|
|
90
|
+
|
|
91
|
+
RSS feeds often use multiple namespaces:
|
|
92
|
+
|
|
93
|
+
```ruby
|
|
94
|
+
namespaces = {
|
|
95
|
+
'dc' => 'http://purl.org/dc/elements/1.1/', # Dublin Core
|
|
96
|
+
'content' => 'http://purl.org/rss/1.0/modules/content/', # Content
|
|
97
|
+
'atom' => 'http://www.w3.org/2005/Atom' # Atom
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
# Query with namespace
|
|
101
|
+
author = item.at_xpath('./dc:creator', namespaces)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### CDATA Content
|
|
105
|
+
|
|
106
|
+
Extract HTML/XML content preserved in CDATA sections:
|
|
107
|
+
|
|
108
|
+
```ruby
|
|
109
|
+
content_node = item.at_xpath('./content:encoded', namespaces)
|
|
110
|
+
content = content_node&.text&.strip
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Error Handling
|
|
114
|
+
|
|
115
|
+
Proper error handling for parse errors (XPath query failures can be rescued the same way via `Moxml::XPathError`):
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
begin
|
|
119
|
+
doc = @moxml.parse(xml_content)
|
|
120
|
+
rescue Moxml::ParseError => e
|
|
121
|
+
puts "Failed to parse RSS feed: #{e.message}"
|
|
122
|
+
exit 1
|
|
123
|
+
end
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Code Structure
|
|
127
|
+
|
|
128
|
+
### Article Class
|
|
129
|
+
|
|
130
|
+
Represents a single RSS article with:
|
|
131
|
+
- Title, link, description
|
|
132
|
+
- Full content (from CDATA)
|
|
133
|
+
- Author (from dc:creator)
|
|
134
|
+
- Publication date
|
|
135
|
+
- Categories
|
|
136
|
+
|
|
137
|
+
### RSSParser Class
|
|
138
|
+
|
|
139
|
+
Main parser with methods:
|
|
140
|
+
- `parse` - Parse the feed and return Article objects
|
|
141
|
+
- `parse_item` - Extract data from a single RSS item
|
|
142
|
+
- `extract_text` - Helper for safe text extraction
|
|
143
|
+
|
|
144
|
+
## Customization
|
|
145
|
+
|
|
146
|
+
### Adding More Fields
|
|
147
|
+
|
|
148
|
+
To extract additional RSS fields, add to `parse_item`:
|
|
149
|
+
|
|
150
|
+
```ruby
|
|
151
|
+
# Extract guid
|
|
152
|
+
guid = extract_text(item, './guid')
|
|
153
|
+
|
|
154
|
+
# Extract enclosure (podcast, etc.)
|
|
155
|
+
enclosure = item.at_xpath('./enclosure')
|
|
156
|
+
if enclosure
|
|
157
|
+
url = enclosure['url']
|
|
158
|
+
type = enclosure['type']
|
|
159
|
+
length = enclosure['length']
|
|
160
|
+
end
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Supporting Atom Feeds
|
|
164
|
+
|
|
165
|
+
Modify the parser to support Atom feed format:
|
|
166
|
+
|
|
167
|
+
```ruby
|
|
168
|
+
# Atom uses different element names
|
|
169
|
+
if doc.xpath('//feed').any? # Atom feed
|
|
170
|
+
items = doc.xpath('//entry')
|
|
171
|
+
# Extract with Atom element names: entry, id, summary, etc.
|
|
172
|
+
end
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Learning Points
|
|
176
|
+
|
|
177
|
+
1. **XPath is powerful**: One query can extract multiple elements
|
|
178
|
+
2. **Namespaces are important**: Many RSS extensions use namespaces
|
|
179
|
+
3. **CDATA preserves markup**: Use for HTML/XML content within RSS
|
|
180
|
+
4. **Safe navigation**: Use `&.` operator and nil checks
|
|
181
|
+
5. **Error handling matters**: Always handle parse and query errors
|
|
182
|
+
|
|
183
|
+
## Next Steps
|
|
184
|
+
|
|
185
|
+
- Try parsing different RSS feeds from the web
|
|
186
|
+
- Add support for podcast feeds (enclosures)
|
|
187
|
+
- Implement feed validation
|
|
188
|
+
- Create an RSS feed aggregator
|
|
189
|
+
- Export articles to different formats (JSON, Markdown, etc.)
|
|
190
|
+
|
|
191
|
+
## Related Examples
|
|
192
|
+
|
|
193
|
+
- [Web Scraper](../web_scraper/) - Similar DOM navigation techniques
|
|
194
|
+
- [API Client](../api_client/) - XML generation and parsing
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<rss version="2.0"
|
|
3
|
+
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
4
|
+
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
|
5
|
+
xmlns:atom="http://www.w3.org/2005/Atom">
|
|
6
|
+
<channel>
|
|
7
|
+
<title>Tech News Daily</title>
|
|
8
|
+
<link>https://technews.example.com</link>
|
|
9
|
+
<description>Your daily dose of technology news</description>
|
|
10
|
+
<language>en-us</language>
|
|
11
|
+
<pubDate>Wed, 30 Oct 2024 10:00:00 GMT</pubDate>
|
|
12
|
+
<lastBuildDate>Wed, 30 Oct 2024 10:00:00 GMT</lastBuildDate>
|
|
13
|
+
<atom:link href="https://technews.example.com/rss" rel="self" type="application/rss+xml"/>
|
|
14
|
+
|
|
15
|
+
<item>
|
|
16
|
+
<title>Ruby 3.4 Released with Performance Improvements</title>
|
|
17
|
+
<link>https://technews.example.com/ruby-3-4-released</link>
|
|
18
|
+
<guid isPermaLink="true">https://technews.example.com/ruby-3-4-released</guid>
|
|
19
|
+
<description>Ruby 3.4 brings significant performance improvements and new features</description>
|
|
20
|
+
<content:encoded><![CDATA[
|
|
21
|
+
<p>The Ruby core team has announced the release of Ruby 3.4, featuring:</p>
|
|
22
|
+
<ul>
|
|
23
|
+
<li>30% faster execution for common patterns</li>
|
|
24
|
+
<li>Improved memory management</li>
|
|
25
|
+
<li>New standard library additions</li>
|
|
26
|
+
</ul>
|
|
27
|
+
]]></content:encoded>
|
|
28
|
+
<dc:creator>Jane Smith</dc:creator>
|
|
29
|
+
<pubDate>Wed, 30 Oct 2024 09:00:00 GMT</pubDate>
|
|
30
|
+
<category>Programming</category>
|
|
31
|
+
<category>Ruby</category>
|
|
32
|
+
</item>
|
|
33
|
+
|
|
34
|
+
<item>
|
|
35
|
+
<title>XML Processing in Modern Applications</title>
|
|
36
|
+
<link>https://technews.example.com/xml-processing</link>
|
|
37
|
+
<guid isPermaLink="true">https://technews.example.com/xml-processing</guid>
|
|
38
|
+
<description>A comprehensive guide to XML processing in modern applications</description>
|
|
39
|
+
<content:encoded><![CDATA[
|
|
40
|
+
<p>Despite the rise of JSON, XML remains crucial for many applications:</p>
|
|
41
|
+
<ul>
|
|
42
|
+
<li>Document-oriented data structures</li>
|
|
43
|
+
<li>Complex namespace requirements</li>
|
|
44
|
+
<li>Legacy system integration</li>
|
|
45
|
+
</ul>
|
|
46
|
+
]]></content:encoded>
|
|
47
|
+
<dc:creator>John Doe</dc:creator>
|
|
48
|
+
<pubDate>Wed, 30 Oct 2024 08:00:00 GMT</pubDate>
|
|
49
|
+
<category>Programming</category>
|
|
50
|
+
<category>XML</category>
|
|
51
|
+
</item>
|
|
52
|
+
|
|
53
|
+
<item>
|
|
54
|
+
<title>Best Practices for API Design</title>
|
|
55
|
+
<link>https://technews.example.com/api-design</link>
|
|
56
|
+
<guid isPermaLink="true">https://technews.example.com/api-design</guid>
|
|
57
|
+
<description>Learn the essential principles of modern API design</description>
|
|
58
|
+
<content:encoded><![CDATA[
|
|
59
|
+
<p>Key principles for designing robust APIs:</p>
|
|
60
|
+
<ol>
|
|
61
|
+
<li>Clear and consistent naming conventions</li>
|
|
62
|
+
<li>Proper error handling and messaging</li>
|
|
63
|
+
<li>Comprehensive documentation</li>
|
|
64
|
+
<li>Versioning strategy</li>
|
|
65
|
+
</ol>
|
|
66
|
+
]]></content:encoded>
|
|
67
|
+
<dc:creator>Alice Johnson</dc:creator>
|
|
68
|
+
<pubDate>Tue, 29 Oct 2024 14:00:00 GMT</pubDate>
|
|
69
|
+
<category>API</category>
|
|
70
|
+
<category>Design</category>
|
|
71
|
+
</item>
|
|
72
|
+
|
|
73
|
+
<item>
|
|
74
|
+
<title>Introduction to XPath Queries</title>
|
|
75
|
+
<link>https://technews.example.com/xpath-intro</link>
|
|
76
|
+
<guid isPermaLink="true">https://technews.example.com/xpath-intro</guid>
|
|
77
|
+
<description>Master XPath for efficient XML data extraction</description>
|
|
78
|
+
<content:encoded><![CDATA[
|
|
79
|
+
<p>XPath is a powerful language for navigating XML documents:</p>
|
|
80
|
+
<ul>
|
|
81
|
+
<li>Select elements by path</li>
|
|
82
|
+
<li>Filter with predicates</li>
|
|
83
|
+
<li>Work with namespaces</li>
|
|
84
|
+
<li>Use built-in functions</li>
|
|
85
|
+
</ul>
|
|
86
|
+
]]></content:encoded>
|
|
87
|
+
<dc:creator>Bob Williams</dc:creator>
|
|
88
|
+
<pubDate>Tue, 29 Oct 2024 10:00:00 GMT</pubDate>
|
|
89
|
+
<category>XML</category>
|
|
90
|
+
<category>XPath</category>
|
|
91
|
+
</item>
|
|
92
|
+
</channel>
|
|
93
|
+
</rss>
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# RSS Feed Parser Example
|
|
5
|
+
# This example demonstrates how to use Moxml to parse RSS feeds with:
|
|
6
|
+
# - XPath queries for data extraction
|
|
7
|
+
# - Namespace handling (dc, content, atom)
|
|
8
|
+
# - Element traversal and CDATA content extraction
|
|
9
|
+
# - Error handling best practices
|
|
10
|
+
|
|
11
|
+
# Load moxml from the local source (use 'require "moxml"' in production)
|
|
12
|
+
require_relative "../../lib/moxml"
|
|
13
|
+
|
|
14
|
+
# Article class to represent a parsed RSS item
|
|
15
|
+
class Article
  # Value object holding the fields extracted from one RSS <item>.
  #
  # title, link and description are required. content, author and pub_date
  # are optional and may be nil or blank; blank values are treated exactly
  # like missing ones when the article is rendered.
  attr_reader :title, :link, :description, :content, :author, :pub_date,
              :categories

  # @param title [String] item title
  # @param link [String] item URL
  # @param description [String] short summary
  # @param content [String, nil] full content (e.g. from content:encoded)
  # @param author [String, nil] author name (e.g. from dc:creator)
  # @param pub_date [String, nil] publication date as found in the feed
  # @param categories [Array<String>] category labels
  def initialize(title:, link:, description:, content: nil, author: nil,
                 pub_date: nil, categories: [])
    @title = title
    @link = link
    @description = description
    @content = content
    @author = author
    @pub_date = pub_date
    @categories = categories
  end

  # Render the article as a multi-line, human-readable block.
  #
  # Optional lines (author, published date, categories, full content) are
  # emitted only when the corresponding value is actually present.
  #
  # @return [String]
  def to_s
    output = []
    output << ("=" * 80)
    output << "Title: #{@title}"
    output << "Link: #{@link}"
    # An empty string is truthy in Ruby, so a plain `if @author` would still
    # print an empty "Author:" line for items without that element.
    output << "Author: #{@author}" if present?(@author)
    output << "Published: #{@pub_date}" if present?(@pub_date)
    output << "Categories: #{@categories.join(', ')}" unless @categories.empty?
    output << ("-" * 80)
    output << "Description: #{@description}"
    output << ""
    if present?(@content)
      output << "Full Content:"
      output << @content
    end
    output << ("=" * 80)
    output.join("\n")
  end

  private

  # True when the value is neither nil nor a blank (whitespace-only) string.
  def present?(value)
    !value.nil? && !value.to_s.strip.empty?
  end
end
|
|
47
|
+
|
|
48
|
+
# RSSParser class encapsulates RSS feed parsing logic
|
|
49
|
+
class RSSParser
  # Initialize with the path to an RSS feed file.
  #
  # @param feed_path [String] path of the RSS XML file to read
  def initialize(feed_path)
    @feed_path = feed_path
    @moxml = Moxml.new
  end

  # Parse the RSS feed, print the channel metadata, and return its items.
  #
  # Terminates the process with status 1 when the XML cannot be parsed or
  # the item query raises an XPath error.
  #
  # @return [Array<Article>] one Article per <item> element found
  def parse
    # Read and parse the XML file
    xml_content = File.read(@feed_path)

    # Parse with error handling
    doc = begin
      @moxml.parse(xml_content)
    rescue Moxml::ParseError => e
      puts "Failed to parse RSS feed: #{e.message}"
      puts "Hint: #{e.hint}" if e.respond_to?(:hint)
      exit 1
    end

    # Namespace prefixes for XPath queries.
    # RSS feeds often use Dublin Core (dc) and Content (content) namespaces.
    namespaces = {
      "dc" => "http://purl.org/dc/elements/1.1/",
      "content" => "http://purl.org/rss/1.0/modules/content/",
      "atom" => "http://www.w3.org/2005/Atom",
    }

    # Extract feed metadata using XPath
    feed_title = extract_text(doc, "//channel/title")
    feed_link = extract_text(doc, "//channel/link")
    feed_description = extract_text(doc, "//channel/description")

    puts "Feed: #{feed_title}"
    puts "URL: #{feed_link}"
    puts "Description: #{feed_description}"
    puts "\nArticles:\n\n"

    # Find all item elements; the double slash (//) searches at any depth.
    items = begin
      doc.xpath("//item")
    rescue Moxml::XPathError => e
      puts "XPath query failed: #{e.message}"
      puts "Expression: #{e.expression}" if e.respond_to?(:expression)
      exit 1
    end

    # Parse each item into an Article object
    items.map { |item| parse_item(item, namespaces) }
  end

  private

  # Build an Article from a single RSS <item> element.
  #
  # @param item [#at_xpath, #xpath] the item node
  # @param namespaces [Hash{String => String}] prefix => URI map for XPath
  # @return [Article]
  def parse_item(item, namespaces)
    # Required RSS fields: extract_text yields "" (never nil) when the
    # element is absent.
    title = extract_text(item, "./title")
    link = extract_text(item, "./link")
    description = extract_text(item, "./description")

    # Optional fields are normalized to nil when missing or blank so that
    # nil-based checks downstream (Article#to_s, filter_map(&:author))
    # behave as intended; an empty string would be truthy.
    pub_date = presence(extract_text(item, "./pubDate"))

    # The dc:creator element uses the Dublin Core namespace
    author = presence(extract_text(item, "./dc:creator", namespaces))

    # content:encoded carries HTML/XML markup wrapped in a CDATA section
    content_node = item.at_xpath("./content:encoded", namespaces)
    content = presence(content_node&.text&.strip)

    # xpath returns a collection of nodes; keep each category's text
    categories = item.xpath("./category").map(&:text)

    Article.new(
      title: title,
      link: link,
      description: description,
      content: content,
      author: author,
      pub_date: pub_date,
      categories: categories,
    )
  end

  # Return the stripped text of the first node matching the XPath query.
  #
  # @param node [#at_xpath] node to query from
  # @param xpath [String] XPath expression
  # @param namespaces [Hash{String => String}] prefix => URI map
  # @return [String] the text, or "" when nothing matches
  def extract_text(node, xpath, namespaces = {})
    element = node.at_xpath(xpath, namespaces)
    element&.text&.strip || ""
  end

  # Return the value unchanged, or nil when it is nil or an empty string.
  def presence(value)
    value unless value.nil? || value.empty?
  end
end
|
|
148
|
+
|
|
149
|
+
# Main execution
|
|
150
|
+
# Script entry point: runs only when this file is executed directly.
if __FILE__ == $PROGRAM_NAME
  # Get the feed path (use the bundled example feed by default)
  feed_path = ARGV[0] || File.join(__dir__, "example_feed.xml")

  # Bail out early with usage help when the file is missing
  unless File.exist?(feed_path)
    puts "Error: Feed file not found: #{feed_path}"
    puts "Usage: ruby rss_parser.rb [path/to/feed.xml]"
    exit 1
  end

  puts "Parsing RSS feed: #{feed_path}"
  puts "=" * 80
  puts

  # Parse the feed
  parser = RSSParser.new(feed_path)
  articles = parser.parse

  # Display each article
  articles.each_with_index do |article, index|
    puts "\n#{index + 1}. #{article}\n"
  end

  # Summary statistics
  puts "\n#{'=' * 80}"
  puts "Summary:"
  puts "Total articles: #{articles.length}"

  # Skip nil AND blank authors: an empty string is truthy, so
  # filter_map(&:author) alone would leave empty entries in the list.
  authors = articles.filter_map do |article|
    article.author unless article.author.to_s.empty?
  end
  puts "Authors: #{authors.uniq.join(', ')}"

  # Count categories (tally keeps first-seen order)
  category_counts = articles.flat_map(&:categories).tally
  category_summary = category_counts.map { |cat, count| "#{cat} (#{count})" }
  puts "Categories: #{category_summary.join(', ')}"
  puts "=" * 80
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# SAX Parsing Examples
|
|
2
|
+
|
|
3
|
+
This directory contains practical examples demonstrating Moxml's SAX (Simple API for XML) parsing capabilities.
|
|
4
|
+
|
|
5
|
+
## Files
|
|
6
|
+
|
|
7
|
+
- `example.xml` - Sample XML file with book data
|
|
8
|
+
- `simple_parser.rb` - Basic SAX parsing with both class and block handlers
|
|
9
|
+
- `data_extractor.rb` - Extract specific data using ElementHandler
|
|
10
|
+
- `large_file.rb` - Memory-efficient streaming processor
|
|
11
|
+
|
|
12
|
+
## Running Examples
|
|
13
|
+
|
|
14
|
+
Make sure you have moxml installed:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
gem install moxml
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Then run any example:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
ruby simple_parser.rb
|
|
24
|
+
ruby data_extractor.rb
|
|
25
|
+
ruby large_file.rb
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## What Each Example Demonstrates
|
|
29
|
+
|
|
30
|
+
### simple_parser.rb
|
|
31
|
+
- Basic handler creation
|
|
32
|
+
- Using both class-based and block-based handlers
|
|
33
|
+
- Handling different event types
|
|
34
|
+
- Comparing the two approaches
|
|
35
|
+
|
|
36
|
+
### data_extractor.rb
|
|
37
|
+
- Using ElementHandler for context-aware parsing
|
|
38
|
+
- Path matching with regex
|
|
39
|
+
- Extracting structured data
|
|
40
|
+
- Accumulating text across multiple character events
|
|
41
|
+
|
|
42
|
+
### large_file.rb
|
|
43
|
+
- Memory-efficient streaming
|
|
44
|
+
- Processing records without loading entire document
|
|
45
|
+
- Immediate output to avoid memory accumulation
|
|
46
|
+
- Best practices for large file handling
|
|
47
|
+
|
|
48
|
+
## Learn More
|
|
49
|
+
|
|
50
|
+
See the comprehensive SAX Parsing Guide in `docs/_guides/sax-parsing.adoc` for detailed documentation, patterns, and best practices.
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require "bundler/setup"
|
|
5
|
+
require "moxml"
|
|
6
|
+
|
|
7
|
+
# Read the sample document that lives in the same directory as this script
# (resolved through __dir__, so the current working directory does not matter).
xml = File.read(File.join(__dir__, "example.xml"))
|
|
8
|
+
|
|
9
|
+
# Handler that extracts book data using ElementHandler utilities.
#
# Every <book> element becomes one Hash in #books. The Hash carries the
# element's +id+ and +category+ attributes plus one symbol-keyed entry per
# child element listed in FIELD_ELEMENTS (whitespace-stripped text; +price+ is
# converted with String#to_f, so non-numeric text becomes 0.0).
# Progress is printed to standard output while parsing.
class BookExtractor < Moxml::SAX::ElementHandler
  # Child elements of <book> whose text content is captured.
  FIELD_ELEMENTS = %w[title author price isbn].freeze

  # @return [Array<Hash>] books completed so far, in document order
  attr_reader :books

  def initialize
    super
    @books = []
    @current_book = nil  # Hash being filled while inside <book>, otherwise nil
    @current_field = nil # name of the field element being captured, otherwise nil
    @current_text = +""  # unary plus: mutable buffer despite frozen_string_literal
  end

  # Opens a new record on <book>, or starts capturing text for a field element.
  #
  # @param name [String] element name
  # @param attributes [Hash] attribute values, looked up by string key
  # @param namespaces [Hash] not used here; passed on to the superclass
  def on_start_element(name, attributes = {}, namespaces = {})
    super # Important: updates element stack

    case name
    when "book"
      @current_book = {
        id: attributes["id"],
        category: attributes["category"],
      }
      puts "Found book with ID: #{attributes['id']}"
    when *FIELD_ELEMENTS
      # Only arm capture inside an open <book>. on_end_element disarms it only
      # when a book is open, so arming it elsewhere would leave the buffer
      # collecting unrelated character data indefinitely.
      if @current_book
        @current_field = name
        @current_text = +""
      end
    end
  end

  # Appends character data to the buffer while a field is being captured.
  #
  # @param text [String] one chunk of character data
  def on_characters(text)
    # Accumulate text - may be called multiple times for one element
    @current_text << text if @current_field
  end

  # Stores a finished field on the current book and, on </book>, moves the
  # finished book into #books.
  #
  # @param name [String] element name
  def on_end_element(name)
    # Process completed elements
    if @current_field == name && @current_book
      value = @current_text.strip
      value = value.to_f if name == "price"
      @current_book[name.to_sym] = value
      puts " #{name.capitalize}: #{value}"
      @current_field = nil
    end

    if name == "book" && @current_book
      @books << @current_book
      puts " Complete book added\n\n"
      @current_book = nil
    end

    super # Important: updates element stack
  end
end
|
|
61
|
+
|
|
62
|
+
# Banner followed by one blank line.
puts "=== SAX Data Extraction Example ===", ""

# Stream the sample document through the extractor using the Nokogiri adapter.
handler = BookExtractor.new
Moxml.new(:nokogiri).sax_parse(xml, handler)

# Report what was collected.
extracted = handler.books
puts "=== Summary ==="
puts "Total books extracted: #{extracted.size}"
puts
puts "Programming books:"
extracted.each do |entry|
  next unless entry[:category] == "programming"

  puts " - #{entry[:title]} by #{entry[:author]} ($#{entry[:price]})"
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<library>
|
|
3
|
+
<book id="1" category="programming">
|
|
4
|
+
<title>Ruby Programming</title>
|
|
5
|
+
<author>Jane Smith</author>
|
|
6
|
+
<price>29.99</price>
|
|
7
|
+
<isbn>978-0-123456-78-9</isbn>
|
|
8
|
+
</book>
|
|
9
|
+
<book id="2" category="fiction">
|
|
10
|
+
<title>The Great Novel</title>
|
|
11
|
+
<author>John Doe</author>
|
|
12
|
+
<price>19.99</price>
|
|
13
|
+
<isbn>978-0-987654-32-1</isbn>
|
|
14
|
+
</book>
|
|
15
|
+
<book id="3" category="programming">
|
|
16
|
+
<title>Advanced Ruby</title>
|
|
17
|
+
<author>Jane Smith</author>
|
|
18
|
+
<price>39.99</price>
|
|
19
|
+
<isbn>978-0-111111-11-1</isbn>
|
|
20
|
+
</book>
|
|
21
|
+
</library>
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require "bundler/setup"
|
|
5
|
+
require "moxml"
|
|
6
|
+
|
|
7
|
+
# Read the sample document that lives in the same directory as this script.
# NOTE(review): File.read returns the whole file as a single String, so the
# input itself is fully resident in memory even though the handler below does
# not accumulate records. For genuinely large inputs, feeding the parser from
# an open File would be needed - TODO confirm that sax_parse accepts an IO.
xml = File.read(File.join(__dir__, "example.xml"))
|
|
8
|
+
|
|
9
|
+
# Memory-efficient streaming processor.
# Processes and outputs records immediately without accumulating in memory.
#
# Each <book> element is assembled into a Hash holding its +id+ and +category+
# attributes plus one symbol-keyed entry per child element listed in
# FIELD_ELEMENTS. The Hash is written to the output as soon as </book> is
# seen and is then dropped.
class StreamProcessor < Moxml::SAX::Handler
  # Child elements of <book> whose text content is captured.
  FIELD_ELEMENTS = %w[title author price isbn].freeze

  # @param output [#puts] destination for the per-record report
  def initialize(output = $stdout)
    super()
    @output = output
    @current_record = nil # Hash being filled while inside <book>, otherwise nil
    @current_field = nil  # name of the field element being captured, otherwise nil
    @text_buffer = +""    # unary plus: mutable buffer despite frozen_string_literal
    @record_count = 0
  end

  # Opens a new record on <book>, or starts capturing text for a field element.
  #
  # @param name [String] element name
  # @param attributes [Hash] attribute values, looked up by string key
  # @param _namespaces [Hash] ignored
  def on_start_element(name, attributes = {}, _namespaces = {})
    case name
    when "book"
      @current_record = {
        id: attributes["id"],
        category: attributes["category"],
      }
    when *FIELD_ELEMENTS
      # Only arm capture inside an open <book>. on_end_element disarms it only
      # when a record is open, so arming it elsewhere would let the buffer grow
      # with unrelated character data and defeat the bounded-memory goal.
      if @current_record
        @current_field = name
        @text_buffer = +""
      end
    end
  end

  # Appends character data to the buffer while a field is being captured.
  #
  # @param text [String] one chunk of character data
  def on_characters(text)
    @text_buffer << text if @current_field
  end

  # Stores a finished field on the current record and, on </book>, emits the
  # record and releases it.
  #
  # @param name [String] element name
  def on_end_element(name)
    # Capture field value
    if @current_field == name && @current_record
      value = @text_buffer.strip
      value = value.to_f if name == "price" # non-numeric text becomes 0.0
      @current_record[name.to_sym] = value
      @current_field = nil
    end

    # Process complete record immediately
    if name == "book" && @current_record
      process_record(@current_record)
      @current_record = nil # Free memory immediately
      @text_buffer = +"" # Reset for next record
    end
  end

  private

  # Writes one finished record to the output and bumps the running counter.
  #
  # @param record [Hash] the completed book
  def process_record(record)
    @record_count += 1
    # Process and output immediately - don't accumulate
    @output.puts "Record #{@record_count}: #{record[:title]} by #{record[:author]}"
    @output.puts " Category: #{record[:category]}, Price: $#{record[:price]}"
    @output.puts
  end
end
|
|
65
|
+
|
|
66
|
+
# Banner, a short explanation, then one blank line.
puts <<~BANNER
  === Memory-Efficient Streaming Example ===
  Processing records as they're encountered...

BANNER

# Stream the sample document through the processor using the Nokogiri adapter;
# records are printed by the processor while parsing is in progress.
processor = StreamProcessor.new
Moxml.new(:nokogiri).sax_parse(xml, processor)

# Closing summary of why the streaming style is used.
puts <<~BENEFITS
  === Benefits of This Approach ===
  ✓ Constant memory usage - O(1) regardless of file size
  ✓ Immediate processing/output - no waiting for full parse
  ✓ Handles files of any size - tested with gigabyte+ files
  ✓ Perfect for streaming data or ETL pipelines
BENEFITS
|