moxml 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/dependent-repos.json +5 -0
  3. data/.github/workflows/dependent-tests.yml +20 -0
  4. data/.github/workflows/docs.yml +59 -0
  5. data/.github/workflows/rake.yml +10 -10
  6. data/.github/workflows/release.yml +5 -3
  7. data/.gitignore +37 -0
  8. data/.rubocop.yml +15 -7
  9. data/.rubocop_todo.yml +224 -43
  10. data/Gemfile +14 -9
  11. data/LICENSE.md +6 -2
  12. data/README.adoc +535 -373
  13. data/Rakefile +53 -0
  14. data/benchmarks/.gitignore +6 -0
  15. data/benchmarks/generate_report.rb +550 -0
  16. data/docs/Gemfile +13 -0
  17. data/docs/_config.yml +138 -0
  18. data/docs/_guides/advanced-features.adoc +87 -0
  19. data/docs/_guides/development-testing.adoc +165 -0
  20. data/docs/_guides/index.adoc +51 -0
  21. data/docs/_guides/modifying-xml.adoc +292 -0
  22. data/docs/_guides/parsing-xml.adoc +230 -0
  23. data/docs/_guides/sax-parsing.adoc +603 -0
  24. data/docs/_guides/working-with-documents.adoc +118 -0
  25. data/docs/_guides/xml-declaration.adoc +450 -0
  26. data/docs/_pages/adapter-compatibility.adoc +369 -0
  27. data/docs/_pages/adapters/headed-ox.adoc +237 -0
  28. data/docs/_pages/adapters/index.adoc +97 -0
  29. data/docs/_pages/adapters/libxml.adoc +285 -0
  30. data/docs/_pages/adapters/nokogiri.adoc +251 -0
  31. data/docs/_pages/adapters/oga.adoc +291 -0
  32. data/docs/_pages/adapters/ox.adoc +56 -0
  33. data/docs/_pages/adapters/rexml.adoc +292 -0
  34. data/docs/_pages/best-practices.adoc +429 -0
  35. data/docs/_pages/compatibility.adoc +467 -0
  36. data/docs/_pages/configuration.adoc +250 -0
  37. data/docs/_pages/error-handling.adoc +349 -0
  38. data/docs/_pages/headed-ox-limitations.adoc +574 -0
  39. data/docs/_pages/headed-ox.adoc +1025 -0
  40. data/docs/_pages/index.adoc +35 -0
  41. data/docs/_pages/installation.adoc +140 -0
  42. data/docs/_pages/node-api-reference.adoc +49 -0
  43. data/docs/_pages/performance.adoc +35 -0
  44. data/docs/_pages/quick-start.adoc +243 -0
  45. data/docs/_pages/thread-safety.adoc +28 -0
  46. data/docs/_references/document-api.adoc +407 -0
  47. data/docs/_references/index.adoc +48 -0
  48. data/docs/_tutorials/basic-usage.adoc +267 -0
  49. data/docs/_tutorials/builder-pattern.adoc +342 -0
  50. data/docs/_tutorials/index.adoc +33 -0
  51. data/docs/_tutorials/namespace-handling.adoc +324 -0
  52. data/docs/_tutorials/xpath-queries.adoc +358 -0
  53. data/docs/index.adoc +122 -0
  54. data/examples/README.md +124 -0
  55. data/examples/api_client/README.md +424 -0
  56. data/examples/api_client/api_client.rb +394 -0
  57. data/examples/api_client/example_response.xml +48 -0
  58. data/examples/headed_ox_example/README.md +90 -0
  59. data/examples/headed_ox_example/headed_ox_demo.rb +71 -0
  60. data/examples/rss_parser/README.md +194 -0
  61. data/examples/rss_parser/example_feed.xml +93 -0
  62. data/examples/rss_parser/rss_parser.rb +189 -0
  63. data/examples/sax_parsing/README.md +50 -0
  64. data/examples/sax_parsing/data_extractor.rb +75 -0
  65. data/examples/sax_parsing/example.xml +21 -0
  66. data/examples/sax_parsing/large_file.rb +78 -0
  67. data/examples/sax_parsing/simple_parser.rb +55 -0
  68. data/examples/web_scraper/README.md +352 -0
  69. data/examples/web_scraper/example_page.html +201 -0
  70. data/examples/web_scraper/web_scraper.rb +312 -0
  71. data/lib/moxml/adapter/base.rb +107 -28
  72. data/lib/moxml/adapter/customized_libxml/cdata.rb +28 -0
  73. data/lib/moxml/adapter/customized_libxml/comment.rb +24 -0
  74. data/lib/moxml/adapter/customized_libxml/declaration.rb +85 -0
  75. data/lib/moxml/adapter/customized_libxml/element.rb +39 -0
  76. data/lib/moxml/adapter/customized_libxml/node.rb +44 -0
  77. data/lib/moxml/adapter/customized_libxml/processing_instruction.rb +31 -0
  78. data/lib/moxml/adapter/customized_libxml/text.rb +27 -0
  79. data/lib/moxml/adapter/customized_oga/xml_generator.rb +1 -1
  80. data/lib/moxml/adapter/customized_ox/attribute.rb +28 -1
  81. data/lib/moxml/adapter/customized_rexml/formatter.rb +13 -8
  82. data/lib/moxml/adapter/headed_ox.rb +161 -0
  83. data/lib/moxml/adapter/libxml.rb +1564 -0
  84. data/lib/moxml/adapter/nokogiri.rb +156 -9
  85. data/lib/moxml/adapter/oga.rb +190 -15
  86. data/lib/moxml/adapter/ox.rb +322 -28
  87. data/lib/moxml/adapter/rexml.rb +157 -28
  88. data/lib/moxml/adapter.rb +21 -4
  89. data/lib/moxml/attribute.rb +6 -0
  90. data/lib/moxml/builder.rb +40 -4
  91. data/lib/moxml/config.rb +8 -3
  92. data/lib/moxml/context.rb +57 -2
  93. data/lib/moxml/declaration.rb +9 -0
  94. data/lib/moxml/doctype.rb +13 -1
  95. data/lib/moxml/document.rb +53 -6
  96. data/lib/moxml/document_builder.rb +34 -5
  97. data/lib/moxml/element.rb +71 -2
  98. data/lib/moxml/error.rb +175 -6
  99. data/lib/moxml/node.rb +155 -4
  100. data/lib/moxml/node_set.rb +34 -0
  101. data/lib/moxml/sax/block_handler.rb +194 -0
  102. data/lib/moxml/sax/element_handler.rb +124 -0
  103. data/lib/moxml/sax/handler.rb +113 -0
  104. data/lib/moxml/sax.rb +31 -0
  105. data/lib/moxml/version.rb +1 -1
  106. data/lib/moxml/xml_utils/encoder.rb +4 -4
  107. data/lib/moxml/xml_utils.rb +7 -4
  108. data/lib/moxml/xpath/ast/node.rb +159 -0
  109. data/lib/moxml/xpath/cache.rb +91 -0
  110. data/lib/moxml/xpath/compiler.rb +1770 -0
  111. data/lib/moxml/xpath/context.rb +26 -0
  112. data/lib/moxml/xpath/conversion.rb +124 -0
  113. data/lib/moxml/xpath/engine.rb +52 -0
  114. data/lib/moxml/xpath/errors.rb +101 -0
  115. data/lib/moxml/xpath/lexer.rb +304 -0
  116. data/lib/moxml/xpath/parser.rb +485 -0
  117. data/lib/moxml/xpath/ruby/generator.rb +269 -0
  118. data/lib/moxml/xpath/ruby/node.rb +193 -0
  119. data/lib/moxml/xpath.rb +37 -0
  120. data/lib/moxml.rb +5 -2
  121. data/moxml.gemspec +3 -1
  122. data/old-specs/moxml/adapter/customized_libxml/.gitkeep +6 -0
  123. data/spec/consistency/README.md +77 -0
  124. data/spec/{moxml/examples/adapter_spec.rb → consistency/adapter_parity_spec.rb} +4 -4
  125. data/spec/examples/README.md +75 -0
  126. data/spec/{support/shared_examples/examples/attribute.rb → examples/attribute_examples_spec.rb} +1 -1
  127. data/spec/{support/shared_examples/examples/basic_usage.rb → examples/basic_usage_spec.rb} +2 -2
  128. data/spec/{support/shared_examples/examples/namespace.rb → examples/namespace_examples_spec.rb} +3 -3
  129. data/spec/{support/shared_examples/examples/readme_examples.rb → examples/readme_examples_spec.rb} +6 -4
  130. data/spec/{support/shared_examples/examples/xpath.rb → examples/xpath_examples_spec.rb} +10 -6
  131. data/spec/integration/README.md +71 -0
  132. data/spec/{moxml/all_with_adapters_spec.rb → integration/all_adapters_spec.rb} +3 -2
  133. data/spec/integration/headed_ox_integration_spec.rb +326 -0
  134. data/spec/{support → integration}/shared_examples/edge_cases.rb +37 -10
  135. data/spec/integration/shared_examples/high_level/.gitkeep +0 -0
  136. data/spec/{support/shared_examples/context.rb → integration/shared_examples/high_level/context_behavior.rb} +2 -1
  137. data/spec/{support/shared_examples/integration.rb → integration/shared_examples/integration_workflows.rb} +23 -6
  138. data/spec/integration/shared_examples/node_wrappers/.gitkeep +0 -0
  139. data/spec/{support/shared_examples/cdata.rb → integration/shared_examples/node_wrappers/cdata_behavior.rb} +6 -1
  140. data/spec/{support/shared_examples/comment.rb → integration/shared_examples/node_wrappers/comment_behavior.rb} +2 -1
  141. data/spec/{support/shared_examples/declaration.rb → integration/shared_examples/node_wrappers/declaration_behavior.rb} +5 -5
  142. data/spec/{support/shared_examples/doctype.rb → integration/shared_examples/node_wrappers/doctype_behavior.rb} +2 -2
  143. data/spec/{support/shared_examples/document.rb → integration/shared_examples/node_wrappers/document_behavior.rb} +1 -1
  144. data/spec/{support/shared_examples/node.rb → integration/shared_examples/node_wrappers/node_behavior.rb} +9 -2
  145. data/spec/{support/shared_examples/node_set.rb → integration/shared_examples/node_wrappers/node_set_behavior.rb} +1 -18
  146. data/spec/{support/shared_examples/processing_instruction.rb → integration/shared_examples/node_wrappers/processing_instruction_behavior.rb} +6 -2
  147. data/spec/moxml/README.md +41 -0
  148. data/spec/moxml/adapter/.gitkeep +0 -0
  149. data/spec/moxml/adapter/README.md +61 -0
  150. data/spec/moxml/adapter/base_spec.rb +27 -0
  151. data/spec/moxml/adapter/headed_ox_spec.rb +311 -0
  152. data/spec/moxml/adapter/libxml_spec.rb +14 -0
  153. data/spec/moxml/adapter/ox_spec.rb +9 -8
  154. data/spec/moxml/adapter/shared_examples/.gitkeep +0 -0
  155. data/spec/{support/shared_examples/xml_adapter.rb → moxml/adapter/shared_examples/adapter_contract.rb} +39 -12
  156. data/spec/moxml/adapter_spec.rb +16 -0
  157. data/spec/moxml/attribute_spec.rb +30 -0
  158. data/spec/moxml/builder_spec.rb +33 -0
  159. data/spec/moxml/cdata_spec.rb +31 -0
  160. data/spec/moxml/comment_spec.rb +31 -0
  161. data/spec/moxml/config_spec.rb +3 -3
  162. data/spec/moxml/context_spec.rb +28 -0
  163. data/spec/moxml/declaration_preservation_spec.rb +217 -0
  164. data/spec/moxml/declaration_spec.rb +36 -0
  165. data/spec/moxml/doctype_spec.rb +33 -0
  166. data/spec/moxml/document_builder_spec.rb +30 -0
  167. data/spec/moxml/document_spec.rb +105 -0
  168. data/spec/moxml/element_spec.rb +143 -0
  169. data/spec/moxml/error_spec.rb +266 -22
  170. data/spec/{moxml_spec.rb → moxml/moxml_spec.rb} +9 -9
  171. data/spec/moxml/namespace_spec.rb +32 -0
  172. data/spec/moxml/node_set_spec.rb +39 -0
  173. data/spec/moxml/node_spec.rb +37 -0
  174. data/spec/moxml/processing_instruction_spec.rb +34 -0
  175. data/spec/moxml/sax_spec.rb +1067 -0
  176. data/spec/moxml/text_spec.rb +31 -0
  177. data/spec/moxml/version_spec.rb +14 -0
  178. data/spec/moxml/xml_utils/.gitkeep +0 -0
  179. data/spec/moxml/xml_utils/encoder_spec.rb +27 -0
  180. data/spec/moxml/xml_utils_spec.rb +49 -0
  181. data/spec/moxml/xpath/ast/node_spec.rb +83 -0
  182. data/spec/moxml/xpath/axes_spec.rb +296 -0
  183. data/spec/moxml/xpath/cache_spec.rb +358 -0
  184. data/spec/moxml/xpath/compiler_spec.rb +406 -0
  185. data/spec/moxml/xpath/context_spec.rb +210 -0
  186. data/spec/moxml/xpath/conversion_spec.rb +365 -0
  187. data/spec/moxml/xpath/fixtures/sample.xml +25 -0
  188. data/spec/moxml/xpath/functions/boolean_functions_spec.rb +114 -0
  189. data/spec/moxml/xpath/functions/node_functions_spec.rb +145 -0
  190. data/spec/moxml/xpath/functions/numeric_functions_spec.rb +164 -0
  191. data/spec/moxml/xpath/functions/position_functions_spec.rb +93 -0
  192. data/spec/moxml/xpath/functions/special_functions_spec.rb +89 -0
  193. data/spec/moxml/xpath/functions/string_functions_spec.rb +381 -0
  194. data/spec/moxml/xpath/lexer_spec.rb +488 -0
  195. data/spec/moxml/xpath/parser_integration_spec.rb +210 -0
  196. data/spec/moxml/xpath/parser_spec.rb +364 -0
  197. data/spec/moxml/xpath/ruby/generator_spec.rb +421 -0
  198. data/spec/moxml/xpath/ruby/node_spec.rb +291 -0
  199. data/spec/moxml/xpath_capabilities_spec.rb +199 -0
  200. data/spec/moxml/xpath_spec.rb +77 -0
  201. data/spec/performance/README.md +83 -0
  202. data/spec/performance/benchmark_spec.rb +64 -0
  203. data/spec/{support/shared_examples/examples/memory.rb → performance/memory_usage_spec.rb} +4 -1
  204. data/spec/{support/shared_examples/examples/thread_safety.rb → performance/thread_safety_spec.rb} +3 -1
  205. data/spec/performance/xpath_benchmark_spec.rb +259 -0
  206. data/spec/spec_helper.rb +58 -1
  207. data/spec/support/xml_matchers.rb +1 -1
  208. metadata +178 -34
  209. data/spec/support/shared_examples/examples/benchmark_spec.rb +0 -51
  210. /data/spec/{support/shared_examples/builder.rb → integration/shared_examples/high_level/builder_behavior.rb} +0 -0
  211. /data/spec/{support/shared_examples/document_builder.rb → integration/shared_examples/high_level/document_builder_behavior.rb} +0 -0
  212. /data/spec/{support/shared_examples/attribute.rb → integration/shared_examples/node_wrappers/attribute_behavior.rb} +0 -0
  213. /data/spec/{support/shared_examples/element.rb → integration/shared_examples/node_wrappers/element_behavior.rb} +0 -0
  214. /data/spec/{support/shared_examples/namespace.rb → integration/shared_examples/node_wrappers/namespace_behavior.rb} +0 -0
  215. /data/spec/{support/shared_examples/text.rb → integration/shared_examples/node_wrappers/text_behavior.rb} +0 -0
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "moxml"
6
+
7
+ xml = File.read(File.join(__dir__, "example.xml"))
8
+
9
+ puts "=== Example 1: Class-Based Handler ==="
10
+ puts
11
+
12
+ # Class-based SAX handler: prints one line to stdout for each event it receives (whitespace-only text is skipped)
13
+ class SimpleHandler < Moxml::SAX::Handler
14
+ def on_start_document
15
+ puts "Document started"
16
+ end
17
+
18
+ def on_start_element(name, attributes = {}, _namespaces = {})
19
+ attrs_str = attributes.map { |k, v| "#{k}=#{v}" }.join(", ")
20
+ puts " Start element: #{name}" + (attrs_str.empty? ? "" : " [#{attrs_str}]")
21
+ end
22
+
23
+ def on_characters(text)
24
+ text = text.strip
25
+ puts " Text: #{text}" unless text.empty?
26
+ end
27
+
28
+ def on_end_element(name)
29
+ puts " End element: #{name}"
30
+ end
31
+
32
+ def on_end_document
33
+ puts "Document ended"
34
+ end
35
+ end
36
+
37
+ context = Moxml.new(:nokogiri)
38
+ handler = SimpleHandler.new
39
+ context.sax_parse(xml, handler)
40
+
41
+ puts
42
+ puts "=== Example 2: Block-Based Handler ==="
43
+ puts
44
+
45
+ element_count = 0
46
+ context.sax_parse(xml) do
47
+ start_document { puts "Document started" }
48
+
49
+ start_element do |name, _attrs|
50
+ element_count += 1
51
+ puts " Element #{element_count}: #{name}"
52
+ end
53
+
54
+ end_document { puts "Document ended - processed #{element_count} elements" }
55
+ end
@@ -0,0 +1,352 @@
1
+ # Web Scraper Example
2
+
3
+ This example demonstrates how to scrape data from HTML/XML documents using Moxml, showcasing table extraction, DOM navigation, and attribute access.
4
+
5
+ ## What This Example Demonstrates
6
+
7
+ - **HTML Parsing**: Parsing HTML as XML for data extraction
8
+ - **Table Scraping**: Extracting structured data from HTML tables
9
+ - **DOM Navigation**: Traversing the document structure
10
+ - **Attribute Access**: Reading element attributes and data attributes
11
+ - **XPath Patterns**: Various XPath selectors for element selection
12
+ - **Data Structuring**: Converting scraped data into Ruby objects
13
+
14
+ ## Files
15
+
16
+ - `web_scraper.rb` - Main scraper implementation
17
+ - `example_page.html` - Sample HTML page with programming language statistics
18
+ - `README.md` - This file
19
+
20
+ ## Running the Example
21
+
22
+ ### Using the Example Page
23
+
24
+ ```bash
25
+ ruby examples/web_scraper/web_scraper.rb
26
+ ```
27
+
28
+ ### Using Your Own HTML
29
+
30
+ ```bash
31
+ ruby examples/web_scraper/web_scraper.rb path/to/your/page.html
32
+ ```
33
+
34
+ ## Expected Output
35
+
36
+ ```
37
+ Scraping HTML page: examples/web_scraper/example_page.html
38
+ ================================================================================
39
+ Programming Language Statistics Scraper
40
+ ================================================================================
41
+
42
+ Page Title: Programming Language Statistics - 2024
43
+
44
+ Summary:
45
+ 10 total languages tracked
46
+ Last updated: October 30, 2024
47
+
48
+ Languages Extracted: 10
49
+ --------------------------------------------------------------------------------
50
+ 1. Python (Interpreted) - 95.5% | Created: 1991 | Uses: Data Science, Web, AI
51
+ 2. JavaScript (Interpreted) - 94.2% | Created: 1995 | Uses: Web Development
52
+ 3. Java (Compiled) - 89.7% | Created: 1995 | Uses: Enterprise, Android
53
+ [...]
54
+
55
+ Category Statistics:
56
+ --------------------------------------------------------------------------------
57
+ Interpreted: 3 languages, avg 85.0%, top: Python
58
+ Compiled: 7 languages, avg 70.1%, top: Java
59
+
60
+ Detailed Information:
61
+ --------------------------------------------------------------------------------
62
+ python:
63
+ Paradigm: Multi-paradigm: object-oriented, procedural, functional
64
+ Typing: Dynamic, strong
65
+ Community: Very large and active
66
+ Learning Curve: Beginner-friendly
67
+ [...]
68
+
69
+ XPath Pattern Demonstrations
70
+ ================================================================================
71
+ 1. All table headers (//th):
72
+ Found 10 headers: Rank, Language, Category, ...
73
+ [...]
74
+ ```
75
+
76
+ ## Key Concepts
77
+
78
+ ### Table Scraping
79
+
80
+ Extract data from HTML tables systematically:
81
+
82
+ ```ruby
83
+ # Find table by ID
84
+ table = doc.at_xpath("//table[@id='popularity-table']")
85
+
86
+ # Get all rows
87
+ rows = table.xpath('.//tbody/tr')
88
+
89
+ # Extract cells from each row
90
+ rows.each do |row|
91
+ cells = row.xpath('./td')
92
+ rank = cells[0].text.strip
93
+ name = cells[1].text.strip
94
+ # ...
95
+ end
96
+ ```
97
+
98
+ ### Attribute Access
99
+
100
+ Read element attributes using the `[]` operator:
101
+
102
+ ```ruby
103
+ # Get data attribute
104
+ score = cell['data-score']
105
+
106
+ # Get class attribute
107
+ class_name = element['class']
108
+
109
+ # Check if attribute exists
110
+ if row['data-language']
111
+ lang = row['data-language']
112
+ end
113
+ ```
114
+
115
+ ### XPath Patterns
116
+
117
+ The example demonstrates various XPath patterns:
118
+
119
+ ```ruby
120
+ # By ID
121
+ doc.at_xpath("//div[@id='summary']")
122
+
123
+ # By class (contains for multi-class support)
124
+ doc.xpath("//*[contains(@class, 'language-name')]")
125
+
126
+ # By attribute existence
127
+ doc.xpath("//tr[@data-language]")
128
+
129
+ # Combining conditions
130
+ doc.xpath("//div[contains(@class, 'stats-card') and @data-language]")
131
+
132
+ # Direct descendants only
133
+ element.xpath('./td') # Not './/td'
134
+ ```
135
+
136
+ ### DOM Navigation
137
+
138
+ Navigate the document tree:
139
+
140
+ ```ruby
141
+ # Get parent
142
+ parent = element.parent
143
+
144
+ # Get children
145
+ children = element.children
146
+
147
+ # Get siblings
148
+ next_elem = element.next_sibling
149
+ prev_elem = element.previous_sibling
150
+ ```
151
+
152
+ ### Error Handling
153
+
154
+ Handle parsing errors gracefully:
155
+
156
+ ```ruby
157
+ begin
158
+ doc = @moxml.parse(html_content)
159
+ rescue Moxml::ParseError => e
160
+ puts "Failed to parse HTML: #{e.message}"
161
+ exit 1
162
+ end
163
+ ```
164
+
165
+ ## Code Structure
166
+
167
+ ### Language Class
168
+
169
+ Represents a programming language with:
170
+ - Rank, name, category
171
+ - Popularity score
172
+ - Year created
173
+ - Primary use cases
174
+
175
+ ### CategoryStats Class
176
+
177
+ Represents category statistics:
178
+ - Category name
179
+ - Language count
180
+ - Average score
181
+ - Top language
182
+
183
+ ### WebScraper Class
184
+
185
+ Main scraper with methods:
186
+ - `scrape` - Main scraping entry point
187
+ - `extract_page_title` - Get page title
188
+ - `extract_summary` - Extract summary statistics
189
+ - `extract_languages_table` - Parse language table
190
+ - `extract_category_stats` - Parse category table
191
+ - `extract_detailed_info` - Parse detail cards
192
+
193
+ ## XPath Pattern Reference
194
+
195
+ ### Basic Selectors
196
+
197
+ ```ruby
198
+ # All elements of a type
199
+ doc.xpath('//div')
200
+
201
+ # Element by ID
202
+ doc.at_xpath("//div[@id='content']")
203
+
204
+ # Element by class (single class)
205
+ doc.xpath("//div[@class='card']")
206
+
207
+ # Element by class (multiple classes)
208
+ doc.xpath("//*[contains(@class, 'card')]")
209
+ ```
210
+
211
+ ### Attribute Selectors
212
+
213
+ ```ruby
214
+ # Has attribute
215
+ doc.xpath("//tr[@data-language]")
216
+
217
+ # Attribute equals value
218
+ doc.xpath("//input[@type='text']")
219
+
220
+ # Attribute contains value
221
+ doc.xpath("//div[contains(@class, 'active')]")
222
+ ```
223
+
224
+ ### Hierarchical Selectors
225
+
226
+ ```ruby
227
+ # Direct child
228
+ div.xpath('./p') # Only direct <p> children
229
+
230
+ # Any descendant
231
+ div.xpath('.//p') # All <p> descendants
232
+
233
+ # Parent
234
+ element.parent
235
+
236
+ # Sibling
237
+ element.next_sibling
238
+ ```
239
+
240
+ ### Combining Conditions
241
+
242
+ ```ruby
243
+ # AND condition
244
+ doc.xpath("//div[@class='card' and @id='main']")
245
+
246
+ # Multiple conditions
247
+ doc.xpath("//tr[contains(@class, 'row') and @data-id]")
248
+ ```
249
+
250
+ ## Customization
251
+
252
+ ### Scraping Different Tables
253
+
254
+ Modify XPath selectors for your table structure:
255
+
256
+ ```ruby
257
+ # Different table structure
258
+ table = doc.at_xpath("//table[@class='data-table']")
259
+ headers = table.xpath('.//thead/tr/th').map(&:text)
260
+ rows = table.xpath('.//tbody/tr')
261
+ ```
262
+
263
+ ### Handling Complex HTML
264
+
265
+ For nested structures:
266
+
267
+ ```ruby
268
+ # Extract nested data
269
+ card.xpath('.//div[@class="section"]').each do |section|
270
+ title = section.at_xpath('./h3').text
271
+ items = section.xpath('.//li').map(&:text)
272
+ end
273
+ ```
274
+
275
+ ### Data Cleaning
276
+
277
+ Clean extracted text:
278
+
279
+ ```ruby
280
+ # Strip whitespace
281
+ text = element.text.strip
282
+
283
+ # Remove special characters
284
+ text = text.gsub(/[^\w\s]/, '')
285
+
286
+ # Parse numbers
287
+ score = text.delete('%').to_f
288
+ ```
289
+
290
+ ## Learning Points
291
+
292
+ 1. **HTML as XML**: Well-formed HTML can be parsed as XML
293
+ 2. **XPath is powerful**: One query can find many elements
294
+ 3. **Attributes are key**: Use data attributes for reliable scraping
295
+ 4. **Structure matters**: Understand the DOM structure before scraping
296
+ 5. **Clean data**: Always clean and validate scraped data
297
+ 6. **Error handling**: Handle missing elements gracefully
298
+
299
+ ## Best Practices
300
+
301
+ 1. **Use specific selectors**: Prefer IDs over classes when available
302
+ 2. **Validate data**: Check for nil/empty values
303
+ 3. **Handle errors**: Wrap parsing in begin/rescue blocks
304
+ 4. **Clean text**: Strip whitespace and normalize data
305
+ 5. **Document structure**: Understand the HTML before writing XPath
306
+ 6. **Test thoroughly**: Test with different HTML structures
307
+
308
+ ## Common Issues
309
+
310
+ ### Issue: Element not found
311
+
312
+ ```ruby
313
+ # Bad - will raise error if not found
314
+ title = doc.xpath('//title').first.text
315
+
316
+ # Good - safe navigation
317
+ title = doc.at_xpath('//title')&.text || 'Unknown'
318
+ ```
319
+
320
+ ### Issue: Incorrect XPath
321
+
322
+ ```ruby
323
+ # Wrong - searches entire document
324
+ row.xpath('//td')
325
+
326
+ # Correct - searches within row only
327
+ row.xpath('./td')
328
+ ```
329
+
330
+ ### Issue: Class matching
331
+
332
+ ```ruby
333
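+ # Exact string match - fails if classes are reordered or extra classes are present
334
+ doc.xpath("//div[@class='card active']")
335
+
336
+ # Substring match - works with multiple classes (but also matches e.g. 'card-header')
337
+ doc.xpath("//div[contains(@class, 'card')]")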
338
+ ```
339
+
340
+ ## Next Steps
341
+
342
+ - Scrape real websites (check robots.txt and terms of service)
343
+ - Add data export (CSV, JSON)
344
+ - Implement pagination handling
345
+ - Add retry logic for failed requests
346
+ - Create scrapers for different domains
347
+ - Implement data validation
348
+
349
+ ## Related Examples
350
+
351
+ - [RSS Parser](../rss_parser/) - Similar XPath techniques for RSS
352
+ - [API Client](../api_client/) - XML generation and parsing
@@ -0,0 +1,201 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8"/>
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
+ <title>Programming Language Statistics - 2024</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ max-width: 1200px;
11
+ margin: 0 auto;
12
+ padding: 20px;
13
+ }
14
+ table {
15
+ width: 100%;
16
+ border-collapse: collapse;
17
+ margin: 20px 0;
18
+ }
19
+ th, td {
20
+ border: 1px solid #ddd;
21
+ padding: 12px;
22
+ text-align: left;
23
+ }
24
+ th {
25
+ background-color: #4CAF50;
26
+ color: white;
27
+ }
28
+ tr:nth-child(even) {
29
+ background-color: #f2f2f2;
30
+ }
31
+ .stats-card {
32
+ background-color: #f9f9f9;
33
+ border: 1px solid #ddd;
34
+ border-radius: 5px;
35
+ padding: 15px;
36
+ margin: 10px 0;
37
+ }
38
+ .language-name {
39
+ font-weight: bold;
40
+ color: #333;
41
+ }
42
+ </style>
43
+ </head>
44
+ <body>
45
+ <h1>Programming Language Statistics - 2024</h1>
46
+
47
+ <div class="stats-card" id="summary">
48
+ <h2>Summary</h2>
49
+ <p>Total languages tracked: <span class="stat-value">10</span></p>
50
+ <p>Data last updated: <span class="stat-value">October 30, 2024</span></p>
51
+ </div>
52
+
53
+ <h2>Most Popular Programming Languages</h2>
54
+ <table id="popularity-table">
55
+ <thead>
56
+ <tr>
57
+ <th>Rank</th>
58
+ <th>Language</th>
59
+ <th>Category</th>
60
+ <th>Popularity Score</th>
61
+ <th>Year Created</th>
62
+ <th>Primary Use</th>
63
+ </tr>
64
+ </thead>
65
+ <tbody>
66
+ <tr data-language="python">
67
+ <td>1</td>
68
+ <td class="language-name">Python</td>
69
+ <td>Interpreted</td>
70
+ <td data-score="95.5">95.5%</td>
71
+ <td>1991</td>
72
+ <td>Data Science, Web, AI</td>
73
+ </tr>
74
+ <tr data-language="javascript">
75
+ <td>2</td>
76
+ <td class="language-name">JavaScript</td>
77
+ <td>Interpreted</td>
78
+ <td data-score="94.2">94.2%</td>
79
+ <td>1995</td>
80
+ <td>Web Development</td>
81
+ </tr>
82
+ <tr data-language="java">
83
+ <td>3</td>
84
+ <td class="language-name">Java</td>
85
+ <td>Compiled</td>
86
+ <td data-score="89.7">89.7%</td>
87
+ <td>1995</td>
88
+ <td>Enterprise, Android</td>
89
+ </tr>
90
+ <tr data-language="csharp">
91
+ <td>4</td>
92
+ <td class="language-name">C#</td>
93
+ <td>Compiled</td>
94
+ <td data-score="82.3">82.3%</td>
95
+ <td>2000</td>
96
+ <td>.NET, Games, Enterprise</td>
97
+ </tr>
98
+ <tr data-language="cpp">
99
+ <td>5</td>
100
+ <td class="language-name">C++</td>
101
+ <td>Compiled</td>
102
+ <td data-score="78.9">78.9%</td>
103
+ <td>1985</td>
104
+ <td>Systems, Games, Performance</td>
105
+ </tr>
106
+ <tr data-language="ruby">
107
+ <td>6</td>
108
+ <td class="language-name">Ruby</td>
109
+ <td>Interpreted</td>
110
+ <td data-score="65.4">65.4%</td>
111
+ <td>1995</td>
112
+ <td>Web Development, Scripting</td>
113
+ </tr>
114
+ <tr data-language="go">
115
+ <td>7</td>
116
+ <td class="language-name">Go</td>
117
+ <td>Compiled</td>
118
+ <td data-score="63.8">63.8%</td>
119
+ <td>2009</td>
120
+ <td>Cloud, Microservices</td>
121
+ </tr>
122
+ <tr data-language="rust">
123
+ <td>8</td>
124
+ <td class="language-name">Rust</td>
125
+ <td>Compiled</td>
126
+ <td data-score="61.2">61.2%</td>
127
+ <td>2010</td>
128
+ <td>Systems, WebAssembly</td>
129
+ </tr>
130
+ <tr data-language="swift">
131
+ <td>9</td>
132
+ <td class="language-name">Swift</td>
133
+ <td>Compiled</td>
134
+ <td data-score="58.7">58.7%</td>
135
+ <td>2014</td>
136
+ <td>iOS, macOS Development</td>
137
+ </tr>
138
+ <tr data-language="kotlin">
139
+ <td>10</td>
140
+ <td class="language-name">Kotlin</td>
141
+ <td>Compiled</td>
142
+ <td data-score="56.3">56.3%</td>
143
+ <td>2011</td>
144
+ <td>Android, JVM</td>
145
+ </tr>
146
+ </tbody>
147
+ </table>
148
+
149
+ <h2>Language Statistics by Category</h2>
150
+ <table id="category-table">
151
+ <thead>
152
+ <tr>
153
+ <th>Category</th>
154
+ <th>Count</th>
155
+ <th>Average Score</th>
156
+ <th>Top Language</th>
157
+ </tr>
158
+ </thead>
159
+ <tbody>
160
+ <tr>
161
+ <td>Interpreted</td>
162
+ <td>3</td>
163
+ <td>85.0%</td>
164
+ <td>Python</td>
165
+ </tr>
166
+ <tr>
167
+ <td>Compiled</td>
168
+ <td>7</td>
169
+ <td>70.1%</td>
170
+ <td>Java</td>
171
+ </tr>
172
+ </tbody>
173
+ </table>
174
+
175
+ <h2>Detailed Language Information</h2>
176
+ <div class="stats-card" data-language="python">
177
+ <h3 class="language-name">Python</h3>
178
+ <ul>
179
+ <li><strong>Paradigm:</strong> Multi-paradigm: object-oriented, procedural, functional</li>
180
+ <li><strong>Typing:</strong> Dynamic, strong</li>
181
+ <li><strong>Community:</strong> Very large and active</li>
182
+ <li><strong>Learning Curve:</strong> Beginner-friendly</li>
183
+ </ul>
184
+ </div>
185
+
186
+ <div class="stats-card" data-language="ruby">
187
+ <h3 class="language-name">Ruby</h3>
188
+ <ul>
189
+ <li><strong>Paradigm:</strong> Multi-paradigm: object-oriented, functional</li>
190
+ <li><strong>Typing:</strong> Dynamic, strong</li>
191
+ <li><strong>Community:</strong> Active, focused on web development</li>
192
+ <li><strong>Learning Curve:</strong> Beginner to intermediate</li>
193
+ </ul>
194
+ </div>
195
+
196
+ <footer>
197
+ <p><em>Data compiled from various programming language popularity indices</em></p>
198
+ <p>Contact: <a href="mailto:info@example.com">info@example.com</a></p>
199
+ </footer>
200
+ </body>
201
+ </html>