moxml 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/dependent-repos.json +5 -0
  3. data/.github/workflows/dependent-tests.yml +20 -0
  4. data/.github/workflows/docs.yml +59 -0
  5. data/.github/workflows/rake.yml +10 -10
  6. data/.github/workflows/release.yml +5 -3
  7. data/.gitignore +37 -0
  8. data/.rubocop.yml +15 -7
  9. data/.rubocop_todo.yml +224 -43
  10. data/Gemfile +14 -9
  11. data/LICENSE.md +6 -2
  12. data/README.adoc +535 -373
  13. data/Rakefile +53 -0
  14. data/benchmarks/.gitignore +6 -0
  15. data/benchmarks/generate_report.rb +550 -0
  16. data/docs/Gemfile +13 -0
  17. data/docs/_config.yml +138 -0
  18. data/docs/_guides/advanced-features.adoc +87 -0
  19. data/docs/_guides/development-testing.adoc +165 -0
  20. data/docs/_guides/index.adoc +51 -0
  21. data/docs/_guides/modifying-xml.adoc +292 -0
  22. data/docs/_guides/parsing-xml.adoc +230 -0
  23. data/docs/_guides/sax-parsing.adoc +603 -0
  24. data/docs/_guides/working-with-documents.adoc +118 -0
  25. data/docs/_guides/xml-declaration.adoc +450 -0
  26. data/docs/_pages/adapter-compatibility.adoc +369 -0
  27. data/docs/_pages/adapters/headed-ox.adoc +237 -0
  28. data/docs/_pages/adapters/index.adoc +97 -0
  29. data/docs/_pages/adapters/libxml.adoc +285 -0
  30. data/docs/_pages/adapters/nokogiri.adoc +251 -0
  31. data/docs/_pages/adapters/oga.adoc +291 -0
  32. data/docs/_pages/adapters/ox.adoc +56 -0
  33. data/docs/_pages/adapters/rexml.adoc +292 -0
  34. data/docs/_pages/best-practices.adoc +429 -0
  35. data/docs/_pages/compatibility.adoc +467 -0
  36. data/docs/_pages/configuration.adoc +250 -0
  37. data/docs/_pages/error-handling.adoc +349 -0
  38. data/docs/_pages/headed-ox-limitations.adoc +574 -0
  39. data/docs/_pages/headed-ox.adoc +1025 -0
  40. data/docs/_pages/index.adoc +35 -0
  41. data/docs/_pages/installation.adoc +140 -0
  42. data/docs/_pages/node-api-reference.adoc +49 -0
  43. data/docs/_pages/performance.adoc +35 -0
  44. data/docs/_pages/quick-start.adoc +243 -0
  45. data/docs/_pages/thread-safety.adoc +28 -0
  46. data/docs/_references/document-api.adoc +407 -0
  47. data/docs/_references/index.adoc +48 -0
  48. data/docs/_tutorials/basic-usage.adoc +267 -0
  49. data/docs/_tutorials/builder-pattern.adoc +342 -0
  50. data/docs/_tutorials/index.adoc +33 -0
  51. data/docs/_tutorials/namespace-handling.adoc +324 -0
  52. data/docs/_tutorials/xpath-queries.adoc +358 -0
  53. data/docs/index.adoc +122 -0
  54. data/examples/README.md +124 -0
  55. data/examples/api_client/README.md +424 -0
  56. data/examples/api_client/api_client.rb +394 -0
  57. data/examples/api_client/example_response.xml +48 -0
  58. data/examples/headed_ox_example/README.md +90 -0
  59. data/examples/headed_ox_example/headed_ox_demo.rb +71 -0
  60. data/examples/rss_parser/README.md +194 -0
  61. data/examples/rss_parser/example_feed.xml +93 -0
  62. data/examples/rss_parser/rss_parser.rb +189 -0
  63. data/examples/sax_parsing/README.md +50 -0
  64. data/examples/sax_parsing/data_extractor.rb +75 -0
  65. data/examples/sax_parsing/example.xml +21 -0
  66. data/examples/sax_parsing/large_file.rb +78 -0
  67. data/examples/sax_parsing/simple_parser.rb +55 -0
  68. data/examples/web_scraper/README.md +352 -0
  69. data/examples/web_scraper/example_page.html +201 -0
  70. data/examples/web_scraper/web_scraper.rb +312 -0
  71. data/lib/moxml/adapter/base.rb +107 -28
  72. data/lib/moxml/adapter/customized_libxml/cdata.rb +28 -0
  73. data/lib/moxml/adapter/customized_libxml/comment.rb +24 -0
  74. data/lib/moxml/adapter/customized_libxml/declaration.rb +85 -0
  75. data/lib/moxml/adapter/customized_libxml/element.rb +39 -0
  76. data/lib/moxml/adapter/customized_libxml/node.rb +44 -0
  77. data/lib/moxml/adapter/customized_libxml/processing_instruction.rb +31 -0
  78. data/lib/moxml/adapter/customized_libxml/text.rb +27 -0
  79. data/lib/moxml/adapter/customized_oga/xml_generator.rb +1 -1
  80. data/lib/moxml/adapter/customized_ox/attribute.rb +28 -1
  81. data/lib/moxml/adapter/customized_rexml/formatter.rb +13 -8
  82. data/lib/moxml/adapter/headed_ox.rb +161 -0
  83. data/lib/moxml/adapter/libxml.rb +1564 -0
  84. data/lib/moxml/adapter/nokogiri.rb +156 -9
  85. data/lib/moxml/adapter/oga.rb +190 -15
  86. data/lib/moxml/adapter/ox.rb +322 -28
  87. data/lib/moxml/adapter/rexml.rb +157 -28
  88. data/lib/moxml/adapter.rb +21 -4
  89. data/lib/moxml/attribute.rb +6 -0
  90. data/lib/moxml/builder.rb +40 -4
  91. data/lib/moxml/config.rb +8 -3
  92. data/lib/moxml/context.rb +57 -2
  93. data/lib/moxml/declaration.rb +9 -0
  94. data/lib/moxml/doctype.rb +13 -1
  95. data/lib/moxml/document.rb +53 -6
  96. data/lib/moxml/document_builder.rb +34 -5
  97. data/lib/moxml/element.rb +71 -2
  98. data/lib/moxml/error.rb +175 -6
  99. data/lib/moxml/node.rb +155 -4
  100. data/lib/moxml/node_set.rb +34 -0
  101. data/lib/moxml/sax/block_handler.rb +194 -0
  102. data/lib/moxml/sax/element_handler.rb +124 -0
  103. data/lib/moxml/sax/handler.rb +113 -0
  104. data/lib/moxml/sax.rb +31 -0
  105. data/lib/moxml/version.rb +1 -1
  106. data/lib/moxml/xml_utils/encoder.rb +4 -4
  107. data/lib/moxml/xml_utils.rb +7 -4
  108. data/lib/moxml/xpath/ast/node.rb +159 -0
  109. data/lib/moxml/xpath/cache.rb +91 -0
  110. data/lib/moxml/xpath/compiler.rb +1770 -0
  111. data/lib/moxml/xpath/context.rb +26 -0
  112. data/lib/moxml/xpath/conversion.rb +124 -0
  113. data/lib/moxml/xpath/engine.rb +52 -0
  114. data/lib/moxml/xpath/errors.rb +101 -0
  115. data/lib/moxml/xpath/lexer.rb +304 -0
  116. data/lib/moxml/xpath/parser.rb +485 -0
  117. data/lib/moxml/xpath/ruby/generator.rb +269 -0
  118. data/lib/moxml/xpath/ruby/node.rb +193 -0
  119. data/lib/moxml/xpath.rb +37 -0
  120. data/lib/moxml.rb +5 -2
  121. data/moxml.gemspec +3 -1
  122. data/old-specs/moxml/adapter/customized_libxml/.gitkeep +6 -0
  123. data/spec/consistency/README.md +77 -0
  124. data/spec/{moxml/examples/adapter_spec.rb → consistency/adapter_parity_spec.rb} +4 -4
  125. data/spec/examples/README.md +75 -0
  126. data/spec/{support/shared_examples/examples/attribute.rb → examples/attribute_examples_spec.rb} +1 -1
  127. data/spec/{support/shared_examples/examples/basic_usage.rb → examples/basic_usage_spec.rb} +2 -2
  128. data/spec/{support/shared_examples/examples/namespace.rb → examples/namespace_examples_spec.rb} +3 -3
  129. data/spec/{support/shared_examples/examples/readme_examples.rb → examples/readme_examples_spec.rb} +6 -4
  130. data/spec/{support/shared_examples/examples/xpath.rb → examples/xpath_examples_spec.rb} +10 -6
  131. data/spec/integration/README.md +71 -0
  132. data/spec/{moxml/all_with_adapters_spec.rb → integration/all_adapters_spec.rb} +3 -2
  133. data/spec/integration/headed_ox_integration_spec.rb +326 -0
  134. data/spec/{support → integration}/shared_examples/edge_cases.rb +37 -10
  135. data/spec/integration/shared_examples/high_level/.gitkeep +0 -0
  136. data/spec/{support/shared_examples/context.rb → integration/shared_examples/high_level/context_behavior.rb} +2 -1
  137. data/spec/{support/shared_examples/integration.rb → integration/shared_examples/integration_workflows.rb} +23 -6
  138. data/spec/integration/shared_examples/node_wrappers/.gitkeep +0 -0
  139. data/spec/{support/shared_examples/cdata.rb → integration/shared_examples/node_wrappers/cdata_behavior.rb} +6 -1
  140. data/spec/{support/shared_examples/comment.rb → integration/shared_examples/node_wrappers/comment_behavior.rb} +2 -1
  141. data/spec/{support/shared_examples/declaration.rb → integration/shared_examples/node_wrappers/declaration_behavior.rb} +5 -5
  142. data/spec/{support/shared_examples/doctype.rb → integration/shared_examples/node_wrappers/doctype_behavior.rb} +2 -2
  143. data/spec/{support/shared_examples/document.rb → integration/shared_examples/node_wrappers/document_behavior.rb} +1 -1
  144. data/spec/{support/shared_examples/node.rb → integration/shared_examples/node_wrappers/node_behavior.rb} +9 -2
  145. data/spec/{support/shared_examples/node_set.rb → integration/shared_examples/node_wrappers/node_set_behavior.rb} +1 -18
  146. data/spec/{support/shared_examples/processing_instruction.rb → integration/shared_examples/node_wrappers/processing_instruction_behavior.rb} +6 -2
  147. data/spec/moxml/README.md +41 -0
  148. data/spec/moxml/adapter/.gitkeep +0 -0
  149. data/spec/moxml/adapter/README.md +61 -0
  150. data/spec/moxml/adapter/base_spec.rb +27 -0
  151. data/spec/moxml/adapter/headed_ox_spec.rb +311 -0
  152. data/spec/moxml/adapter/libxml_spec.rb +14 -0
  153. data/spec/moxml/adapter/ox_spec.rb +9 -8
  154. data/spec/moxml/adapter/shared_examples/.gitkeep +0 -0
  155. data/spec/{support/shared_examples/xml_adapter.rb → moxml/adapter/shared_examples/adapter_contract.rb} +39 -12
  156. data/spec/moxml/adapter_spec.rb +16 -0
  157. data/spec/moxml/attribute_spec.rb +30 -0
  158. data/spec/moxml/builder_spec.rb +33 -0
  159. data/spec/moxml/cdata_spec.rb +31 -0
  160. data/spec/moxml/comment_spec.rb +31 -0
  161. data/spec/moxml/config_spec.rb +3 -3
  162. data/spec/moxml/context_spec.rb +28 -0
  163. data/spec/moxml/declaration_preservation_spec.rb +217 -0
  164. data/spec/moxml/declaration_spec.rb +36 -0
  165. data/spec/moxml/doctype_spec.rb +33 -0
  166. data/spec/moxml/document_builder_spec.rb +30 -0
  167. data/spec/moxml/document_spec.rb +105 -0
  168. data/spec/moxml/element_spec.rb +143 -0
  169. data/spec/moxml/error_spec.rb +266 -22
  170. data/spec/{moxml_spec.rb → moxml/moxml_spec.rb} +9 -9
  171. data/spec/moxml/namespace_spec.rb +32 -0
  172. data/spec/moxml/node_set_spec.rb +39 -0
  173. data/spec/moxml/node_spec.rb +37 -0
  174. data/spec/moxml/processing_instruction_spec.rb +34 -0
  175. data/spec/moxml/sax_spec.rb +1067 -0
  176. data/spec/moxml/text_spec.rb +31 -0
  177. data/spec/moxml/version_spec.rb +14 -0
  178. data/spec/moxml/xml_utils/.gitkeep +0 -0
  179. data/spec/moxml/xml_utils/encoder_spec.rb +27 -0
  180. data/spec/moxml/xml_utils_spec.rb +49 -0
  181. data/spec/moxml/xpath/ast/node_spec.rb +83 -0
  182. data/spec/moxml/xpath/axes_spec.rb +296 -0
  183. data/spec/moxml/xpath/cache_spec.rb +358 -0
  184. data/spec/moxml/xpath/compiler_spec.rb +406 -0
  185. data/spec/moxml/xpath/context_spec.rb +210 -0
  186. data/spec/moxml/xpath/conversion_spec.rb +365 -0
  187. data/spec/moxml/xpath/fixtures/sample.xml +25 -0
  188. data/spec/moxml/xpath/functions/boolean_functions_spec.rb +114 -0
  189. data/spec/moxml/xpath/functions/node_functions_spec.rb +145 -0
  190. data/spec/moxml/xpath/functions/numeric_functions_spec.rb +164 -0
  191. data/spec/moxml/xpath/functions/position_functions_spec.rb +93 -0
  192. data/spec/moxml/xpath/functions/special_functions_spec.rb +89 -0
  193. data/spec/moxml/xpath/functions/string_functions_spec.rb +381 -0
  194. data/spec/moxml/xpath/lexer_spec.rb +488 -0
  195. data/spec/moxml/xpath/parser_integration_spec.rb +210 -0
  196. data/spec/moxml/xpath/parser_spec.rb +364 -0
  197. data/spec/moxml/xpath/ruby/generator_spec.rb +421 -0
  198. data/spec/moxml/xpath/ruby/node_spec.rb +291 -0
  199. data/spec/moxml/xpath_capabilities_spec.rb +199 -0
  200. data/spec/moxml/xpath_spec.rb +77 -0
  201. data/spec/performance/README.md +83 -0
  202. data/spec/performance/benchmark_spec.rb +64 -0
  203. data/spec/{support/shared_examples/examples/memory.rb → performance/memory_usage_spec.rb} +4 -1
  204. data/spec/{support/shared_examples/examples/thread_safety.rb → performance/thread_safety_spec.rb} +3 -1
  205. data/spec/performance/xpath_benchmark_spec.rb +259 -0
  206. data/spec/spec_helper.rb +58 -1
  207. data/spec/support/xml_matchers.rb +1 -1
  208. metadata +178 -34
  209. data/spec/support/shared_examples/examples/benchmark_spec.rb +0 -51
  210. /data/spec/{support/shared_examples/builder.rb → integration/shared_examples/high_level/builder_behavior.rb} +0 -0
  211. /data/spec/{support/shared_examples/document_builder.rb → integration/shared_examples/high_level/document_builder_behavior.rb} +0 -0
  212. /data/spec/{support/shared_examples/attribute.rb → integration/shared_examples/node_wrappers/attribute_behavior.rb} +0 -0
  213. /data/spec/{support/shared_examples/element.rb → integration/shared_examples/node_wrappers/element_behavior.rb} +0 -0
  214. /data/spec/{support/shared_examples/namespace.rb → integration/shared_examples/node_wrappers/namespace_behavior.rb} +0 -0
  215. /data/spec/{support/shared_examples/text.rb → integration/shared_examples/node_wrappers/text_behavior.rb} +0 -0
@@ -0,0 +1,312 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Web Scraper Example
5
+ # This example demonstrates how to use Moxml to scrape data from HTML/XML:
6
+ # - Parsing HTML as XML
7
+ # - Extracting data from tables
8
+ # - DOM structure navigation
9
+ # - Attribute and text content access
10
+ # - Working with structured data
11
+
12
+ # Load moxml from the local source (use 'require "moxml"' in production)
13
+ require_relative "../../lib/moxml"
14
+
15
# Value object for one row of the language popularity table.
#
# Numeric fields are coerced on construction: +rank+ and +year+ with #to_i,
# +score+ with #to_f. The remaining fields are stored as given.
class Language
  attr_reader :rank, :name, :category, :score, :year, :use_cases

  # @param rank [#to_i] position in the ranking
  # @param name [Object] language name
  # @param category [Object] language category
  # @param score [#to_f] popularity score
  # @param year [#to_i] year of creation
  # @param use_cases [Object] free-form description of typical uses
  def initialize(rank:, name:, category:, score:, year:, use_cases:)
    @rank, @score, @year = rank.to_i, score.to_f, year.to_i
    @name = name
    @category = category
    @use_cases = use_cases
  end

  # @return [String] single-line, pipe-separated summary of the language
  def to_s
    [
      "#{@rank}. #{@name} (#{@category}) - #{@score}%",
      "Created: #{@year}",
      "Uses: #{@use_cases}",
    ].join(" | ")
  end
end
32
+
33
# Value object for one row of the category statistics table.
#
# Only +count+ is coerced (with #to_i); the other fields are stored as given.
class CategoryStats
  attr_reader :name, :count, :avg_score, :top_language

  # @param name [Object] category name
  # @param count [#to_i] number of languages in the category
  # @param avg_score [Object] average score, kept exactly as supplied
  # @param top_language [Object] name of the leading language
  def initialize(name:, count:, avg_score:, top_language:)
    @count = count.to_i
    @name, @avg_score, @top_language = name, avg_score, top_language
  end

  # @return [String] single-line summary of the category
  def to_s
    format("%s: %s languages, avg %s, top: %s",
           @name, @count, @avg_score, @top_language)
  end
end
48
+
49
# Scrapes programming-language statistics out of an HTML page.
#
# The page is handed to Moxml's XML parser, so it has to be well-formed XML.
# All queries are XPath expressions evaluated against the parsed document.
class WebScraper
  # @param html_path [String] path of the HTML file to scrape
  def initialize(html_path)
    @html_path = html_path
    @moxml = Moxml.new
  end

  # Parses the page, prints a report of everything that was found and
  # returns the extracted data.
  #
  # When Moxml raises Moxml::ParseError a hint is printed and the process
  # exits with status 1.
  #
  # @return [Hash] with keys :title, :summary, :languages, :categories
  #   and :details
  def scrape
    html_content = File.read(@html_path)

    doc = begin
      @moxml.parse(html_content)
    rescue Moxml::ParseError => e
      puts "Failed to parse HTML: #{e.message}"
      puts "Hint: Ensure the HTML is well-formed XML"
      exit 1
    end

    puts "=" * 80
    puts "Programming Language Statistics Scraper"
    puts "=" * 80
    puts

    title = extract_page_title(doc)
    puts "Page Title: #{title}\n\n"

    summary = extract_summary(doc)
    puts "Summary:"
    puts " #{summary[:total]} total languages tracked"
    puts " Last updated: #{summary[:updated]}\n\n"

    languages = extract_languages_table(doc)
    puts "Languages Extracted: #{languages.length}"
    puts "-" * 80
    languages.each { |lang| puts lang }
    puts

    categories = extract_category_stats(doc)
    puts "\nCategory Statistics:"
    puts "-" * 80
    categories.each { |cat| puts cat }
    puts

    details = extract_detailed_info(doc)
    puts "\nDetailed Information:"
    puts "-" * 80
    details.each do |lang, info|
      puts "#{lang}:"
      info.each { |key, value| puts " #{key}: #{value}" }
      puts
    end

    {
      title: title,
      summary: summary,
      languages: languages,
      categories: categories,
      details: details,
    }
  end

  private

  # Returns the stripped text of the first <title> element.
  #
  # @param doc [#at_xpath] parsed document
  # @return [String] the title, or "Unknown Title" when none is found
  def extract_page_title(doc)
    title_element = doc.at_xpath("//title")
    title_element&.text&.strip || "Unknown Title"
  end

  # Reads the two summary figures from the element <div id="summary">.
  #
  # Both values are always Strings, so callers see the same types whether
  # or not the summary block (or one of its spans) is present.
  #
  # @param doc [#at_xpath] parsed document
  # @return [Hash{Symbol => String}] :total and :updated
  def extract_summary(doc)
    summary_div = doc.at_xpath("//div[@id='summary']")
    return { total: "0", updated: "Unknown" } unless summary_div

    # Exact attribute match: only spans whose class attribute is precisely
    # 'stat-value' are selected (this is not a contains() test).
    stats = summary_div.xpath(".//span[@class='stat-value']")

    {
      total: stats[0]&.text&.strip || "0",
      updated: stats[1]&.text&.strip || "Unknown",
    }
  end

  # Builds one Language per body row of <table id="popularity-table">.
  #
  # Rows with fewer than six cells are skipped.
  #
  # @param doc [#at_xpath] parsed document
  # @return [Array<Language>] empty when the table is missing
  def extract_languages_table(doc)
    table = doc.at_xpath("//table[@id='popularity-table']")
    return [] unless table

    rows = table.xpath(".//tbody/tr")

    rows.filter_map do |row|
      cells = row.xpath("./td")
      next if cells.length < 6

      # Prefer the data-score attribute; fall back to the cell text with
      # the percent sign removed.
      score = cells[3]["data-score"] || cells[3].text.strip.delete("%")

      Language.new(
        rank: cells[0].text.strip,
        name: cells[1].text.strip,
        category: cells[2].text.strip,
        score: score,
        year: cells[4].text.strip,
        use_cases: cells[5].text.strip,
      )
    end
  end

  # Builds one CategoryStats per body row of <table id="category-table">.
  #
  # Rows with fewer than four cells are skipped.
  #
  # @param doc [#at_xpath] parsed document
  # @return [Array<CategoryStats>] empty when the table is missing
  def extract_category_stats(doc)
    table = doc.at_xpath("//table[@id='category-table']")
    return [] unless table

    rows = table.xpath(".//tbody/tr")

    rows.filter_map do |row|
      cells = row.xpath("./td")
      next if cells.length < 4

      CategoryStats.new(
        name: cells[0].text.strip,
        count: cells[1].text.strip,
        avg_score: cells[2].text.strip,
        top_language: cells[3].text.strip,
      )
    end
  end

  # Collects "key: value" list items from every div whose class contains
  # 'stats-card' and that carries a data-language attribute.
  #
  # @param doc [#xpath] parsed document
  # @return [Hash{String => Hash{String => String}}] details keyed by the
  #   card's data-language value; cards without any "key: value" item are
  #   left out
  def extract_detailed_info(doc)
    cards = doc.xpath("//div[contains(@class, 'stats-card') and @data-language]")

    cards.each_with_object({}) do |card, details|
      lang_name = card["data-language"]
      items = card.xpath(".//li")

      info = items.each_with_object({}) do |item, fields|
        text = item.text.strip
        next unless text.include?(":")

        # Split on the first colon only so values may contain colons.
        key, value = text.split(":", 2)
        fields[key.strip] = value.strip
      end

      details[lang_name] = info unless info.empty?
    end
  end
end
239
+
240
# Prints the results of five sample XPath queries against +doc+.
#
# @param doc [#xpath, #at_xpath] parsed document to query
# @return [nil]
def demonstrate_xpath_patterns(doc)
  rule = "=" * 80
  puts "\n#{rule}", "XPath Pattern Demonstrations", rule

  # 1. Every <th> element anywhere in the document.
  puts "\n1. All table headers (//th):"
  th_nodes = doc.xpath("//th")
  th_texts = th_nodes.map { |th| th.text }
  puts " Found #{th_nodes.length} headers: #{th_texts.join(', ')}"

  # 2. Rows selected by the presence of an attribute.
  puts "\n2. Elements with data-language attribute (//tr[@data-language]):"
  languages = doc.xpath("//tr[@data-language]").map { |tr| tr["data-language"] }
  puts " Found #{languages.length} languages: #{languages.join(', ')}"

  # 3. Any element whose class attribute contains 'language-name'.
  puts "\n3. Elements with 'language-name' class:"
  name_nodes = doc.xpath("//*[contains(@class, 'language-name')]")
  name_texts = name_nodes.map { |node| node.text }
  puts " Found #{name_nodes.length} elements: #{name_texts.join(', ')}"

  # 4. XPath selects the candidate cells; the numeric test runs in Ruby.
  puts "\n4. Table cells with data-score > 80:"
  scored_cells = doc.xpath("//td[@data-score]")
  above_eighty = scored_cells.select { |td| td["data-score"].to_f > 80 }
  puts " Found #{above_eighty.length} high scores"

  # 5. Step from the first language-name cell up to its parent.
  puts "\n5. Parent elements of language names:"
  name_cell = doc.at_xpath("//td[@class='language-name']")
  if name_cell
    row = name_cell.parent
    puts " Parent tag: #{row.name}"
    puts " Parent has #{row.children.length} children"
  end

  puts rule
end
280
+
281
# Script entry point: runs only when this file is executed directly.
if __FILE__ == $PROGRAM_NAME
  # The first command-line argument names the page to scrape; without one,
  # example_page.html next to this script is used.
  html_path = ARGV.first || File.join(__dir__, "example_page.html")

  unless File.exist?(html_path)
    puts "Error: HTML file not found: #{html_path}",
         "Usage: ruby web_scraper.rb [path/to/page.html]"
    exit 1
  end

  puts "Scraping HTML page: #{html_path}\n"

  data = WebScraper.new(html_path).scrape

  # The scraper does not hand back its parsed document, so the page is
  # parsed once more for the XPath demonstration.
  demo_doc = Moxml.new.parse(File.read(html_path))
  demonstrate_xpath_patterns(demo_doc)

  banner = "=" * 80
  puts "\n#{banner}", "Scraping Complete!", banner
  puts "Extracted:",
       " - #{data[:languages].length} programming languages",
       " - #{data[:categories].length} category statistics",
       " - #{data[:details].length} detailed information entries"
  puts banner
end
@@ -11,16 +11,54 @@ module Moxml
11
11
  class << self
12
12
  include XmlUtils
13
13
 
14
- def set_root(doc, element)
15
- raise NotImplementedError
16
- end
17
-
18
- def parse(xml, options = {})
19
- raise NotImplementedError
20
- end
21
-
22
- def create_document(native_doc = nil)
23
- raise NotImplementedError
14
+ def set_root(_doc, _element)
15
+ raise Moxml::NotImplementedError.new(
16
+ "set_root not implemented",
17
+ feature: "set_root",
18
+ adapter: name,
19
+ )
20
+ end
21
+
22
+ def parse(_xml, _options = {})
23
+ raise Moxml::NotImplementedError.new(
24
+ "parse not implemented",
25
+ feature: "parse",
26
+ adapter: name,
27
+ )
28
+ end
29
+
30
+ # Parse XML using SAX (event-driven) parsing
31
+ #
32
+ # SAX parsing provides a memory-efficient way to process XML
33
+ # by triggering events as the document is parsed, rather than
34
+ # building a complete DOM tree.
35
+ #
36
+ # @param xml [String, IO] XML string or IO object to parse
37
+ # @param handler [Moxml::SAX::Handler] Handler object receiving events
38
+ # @return [void]
39
+ # @raise [Moxml::NotImplementedError] if adapter doesn't support SAX
40
+ def sax_parse(_xml, _handler)
41
+ raise Moxml::NotImplementedError.new(
42
+ "sax_parse not implemented",
43
+ feature: "sax_parse",
44
+ adapter: name,
45
+ )
46
+ end
47
+
48
+ # Check if this adapter supports SAX parsing
49
+ #
50
+ # @return [Boolean] true if SAX parsing is supported
51
+ def sax_supported?
52
+ respond_to?(:sax_parse) &&
53
+ method(:sax_parse).owner != Moxml::Adapter::Base.singleton_class
54
+ end
55
+
56
+ def create_document(_native_doc = nil)
57
+ raise Moxml::NotImplementedError.new(
58
+ "create_document not implemented",
59
+ feature: "create_document",
60
+ adapter: name,
61
+ )
24
62
  end
25
63
 
26
64
  def create_element(name)
@@ -48,10 +86,12 @@ module Moxml
48
86
 
49
87
  def create_processing_instruction(target, content)
50
88
  validate_pi_target(target)
51
- create_native_processing_instruction(target, normalize_xml_value(content))
89
+ create_native_processing_instruction(target,
90
+ normalize_xml_value(content))
52
91
  end
53
92
 
54
- def create_declaration(version = "1.0", encoding = "UTF-8", standalone = nil)
93
+ def create_declaration(version = "1.0", encoding = "UTF-8",
94
+ standalone = nil)
55
95
  validate_declaration_version(version)
56
96
  validate_declaration_encoding(encoding)
57
97
  validate_declaration_standalone(standalone)
@@ -81,38 +121,77 @@ module Moxml
81
121
  node
82
122
  end
83
123
 
124
+ def prepare_for_new_document(node, _target_doc)
125
+ # Hook for adapters that need special handling when moving nodes
126
+ # between documents (e.g., LibXML's document.import)
127
+ # Default: no-op for backward compatibility
128
+ node
129
+ end
130
+
84
131
  protected
85
132
 
86
- def create_native_element(name)
87
- raise NotImplementedError
133
+ def create_native_element(_name)
134
+ raise Moxml::NotImplementedError.new(
135
+ "create_native_element not implemented",
136
+ feature: "create_native_element",
137
+ adapter: name,
138
+ )
88
139
  end
89
140
 
90
- def create_native_text(content)
91
- raise NotImplementedError
141
+ def create_native_text(_content)
142
+ raise Moxml::NotImplementedError.new(
143
+ "create_native_text not implemented",
144
+ feature: "create_native_text",
145
+ adapter: name,
146
+ )
92
147
  end
93
148
 
94
- def create_native_cdata(content)
95
- raise NotImplementedError
149
+ def create_native_cdata(_content)
150
+ raise Moxml::NotImplementedError.new(
151
+ "create_native_cdata not implemented",
152
+ feature: "create_native_cdata",
153
+ adapter: name,
154
+ )
96
155
  end
97
156
 
98
- def create_native_comment(content)
99
- raise NotImplementedError
157
+ def create_native_comment(_content)
158
+ raise Moxml::NotImplementedError.new(
159
+ "create_native_comment not implemented",
160
+ feature: "create_native_comment",
161
+ adapter: name,
162
+ )
100
163
  end
101
164
 
102
- def create_native_doctype(name, external_id, system_id)
103
- raise NotImplementedError
165
+ def create_native_doctype(_name, _external_id, _system_id)
166
+ raise Moxml::NotImplementedError.new(
167
+ "create_native_doctype not implemented",
168
+ feature: "create_native_doctype",
169
+ adapter: name,
170
+ )
104
171
  end
105
172
 
106
- def create_native_processing_instruction(target, content)
107
- raise NotImplementedError
173
+ def create_native_processing_instruction(_target, _content)
174
+ raise Moxml::NotImplementedError.new(
175
+ "create_native_processing_instruction not implemented",
176
+ feature: "create_native_processing_instruction",
177
+ adapter: name,
178
+ )
108
179
  end
109
180
 
110
- def create_native_declaration(version, encoding, standalone)
111
- raise NotImplementedError
181
+ def create_native_declaration(_version, _encoding, _standalone)
182
+ raise Moxml::NotImplementedError.new(
183
+ "create_native_declaration not implemented",
184
+ feature: "create_native_declaration",
185
+ adapter: name,
186
+ )
112
187
  end
113
188
 
114
- def create_native_namespace(element, prefix, uri)
115
- raise NotImplementedError
189
+ def create_native_namespace(_element, _prefix, _uri)
190
+ raise Moxml::NotImplementedError.new(
191
+ "create_native_namespace not implemented",
192
+ feature: "create_native_namespace",
193
+ adapter: name,
194
+ )
116
195
  end
117
196
  end
118
197
  end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "node"
4
+
5
module Moxml
  module Adapter
    module CustomizedLibxml
      # CDATA section node backed by a native LibXML node.
      class Cdata < Node
        # Renders the node as an XML CDATA section.
        #
        # The native content is treated as entity-escaped, so the five
        # predefined XML entities are decoded first. Any "]]>" in the
        # decoded text is then split across two adjacent CDATA sections so
        # it cannot terminate the section early.
        #
        # @return [String] the serialized CDATA section
        def to_xml
          # Order matters: "&amp;" is decoded last so that a sequence such
          # as "&amp;lt;" ends up as "&lt;" instead of "<".
          replacements = [
            ["&quot;", '"'],
            ["&apos;", "'"],
            ["&lt;", "<"],
            ["&gt;", ">"],
            ["&amp;", "&"],
          ]
          decoded = replacements.reduce(@native.content) do |text, (entity, char)|
            text.gsub(entity, char)
          end

          body = decoded.gsub("]]>", "]]]]><![CDATA[>")
          "<![CDATA[#{body}]]>"
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "node"
4
+
5
module Moxml
  module Adapter
    module CustomizedLibxml
      # Comment node backed by a native LibXML node.
      class Comment < Node
        # Renders the node as an XML comment.
        #
        # The native content is treated as entity-escaped, so the five
        # predefined XML entities are decoded before the text is wrapped in
        # the comment delimiters.
        #
        # @return [String] the serialized comment
        def to_xml
          # Order matters: "&amp;" is decoded last so that a sequence such
          # as "&amp;lt;" ends up as "&lt;" instead of "<".
          replacements = [
            ["&quot;", '"'],
            ["&apos;", "'"],
            ["&lt;", "<"],
            ["&gt;", ">"],
            ["&amp;", "&"],
          ]
          decoded = replacements.reduce(@native.content) do |text, (entity, char)|
            text.gsub(entity, char)
          end

          "<!--#{decoded}-->"
        end
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
module Moxml
  module Adapter
    module CustomizedLibxml
      # Mutable stand-in for the XML declaration of a LibXML document.
      #
      # LibXML::XML::Document properties (version, encoding, standalone)
      # are read-only after creation. This wrapper keeps its own copies of
      # those values so they can be changed, and renders the declaration
      # from them on demand.
      class Declaration
        attr_accessor :version, :encoding
        attr_reader :native

        # @param native_doc [#version] the wrapped native document
        # @param version [String, nil] explicit version; when nil the native
        #   document's version is used, and "1.0" if that is nil as well
        # @param encoding [String, nil] explicit encoding; never inferred
        #   from +native_doc+, nil means the attribute is omitted
        # @param standalone ["yes", "no", true, false, nil] standalone flag;
        #   any other value is treated as not set
        def initialize(native_doc, version = nil, encoding = nil,
                       standalone = nil)
          @native = native_doc
          @version = version || native_doc.version || "1.0"
          @encoding = encoding
          @standalone_value = normalize_standalone(standalone)
        end

        # @return ["yes", "no", nil] the standalone flag as it appears in the
        #   declaration, or nil when it is not set
        def standalone
          return nil if @standalone_value.nil?

          @standalone_value ? "yes" : "no"
        end

        # @param value ["yes", "no", true, false, nil] new standalone flag;
        #   any other value clears it
        def standalone=(value)
          @standalone_value = normalize_standalone(value)
        end

        # Generate the XML declaration string.
        #
        # The pieces are collected in an array rather than appended to a
        # string literal, so the method does not depend on interpolated
        # literals being mutable under `frozen_string_literal: true`.
        #
        # @return [String] e.g. <?xml version="1.0" encoding="UTF-8"?>
        def to_xml
          attributes = ["version=\"#{@version}\""]
          if @encoding && !@encoding.empty?
            attributes << "encoding=\"#{@encoding}\""
          end
          # Include standalone whenever it was explicitly set (true or false)
          unless @standalone_value.nil?
            attributes << "standalone=\"#{standalone}\""
          end
          "<?xml #{attributes.join(' ')}?>"
        end

        private

        # Maps the accepted spellings of the standalone flag to true/false.
        #
        # @return [true, false, nil] nil for nil and for unrecognised values
        def normalize_standalone(value)
          case value
          when "yes", true
            true
          when "no", false
            false
          end
        end

        # Translates a LibXML encoding constant into its IANA name.
        # NOTE(review): not called anywhere in this class — confirm whether
        # another file reaches it before removing.
        #
        # @return [String, nil] nil for nil input, "UTF-8" for any constant
        #   not listed below
        def extract_encoding(libxml_encoding)
          return nil unless libxml_encoding

          case libxml_encoding
          when ::LibXML::XML::Encoding::UTF_8
            "UTF-8"
          when ::LibXML::XML::Encoding::ISO_8859_1
            "ISO-8859-1"
          when ::LibXML::XML::Encoding::UTF_16LE
            "UTF-16LE"
          when ::LibXML::XML::Encoding::UTF_16BE
            "UTF-16BE"
          when ::LibXML::XML::Encoding::UCS_2
            "UCS-2"
          else
            "UTF-8"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "node"
4
+
5
module Moxml
  module Adapter
    module CustomizedLibxml
      # Element node backed by a native LibXML node.
      #
      # Children that belong to a different document are imported into this
      # element's document before being attached, which the original
      # author notes is needed because of LibXML's strict document
      # ownership requirement.
      class Element < Node
        # Appends +child+ to this element.
        #
        # @param child [Object] a wrapper responding to #native, or a
        #   native node
        # @return [Object] whatever the native node's #<< returns
        def add_child(child)
          incoming = child.respond_to?(:native) ? child.native : child
          incoming = @native.doc.import(incoming) if needs_import?(incoming)
          @native << incoming
        end

        private

        # True only when both this element and +child_node+ are attached to
        # a document and those documents differ.
        def needs_import?(child_node)
          both_attached = [@native, child_node].all? do |node|
            node.respond_to?(:doc) && node.doc
          end

          both_attached && child_node.doc != @native.doc
        end
      end
    end
  end
end