moxml 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/dependent-repos.json +5 -0
- data/.github/workflows/dependent-tests.yml +20 -0
- data/.github/workflows/docs.yml +59 -0
- data/.github/workflows/rake.yml +10 -10
- data/.github/workflows/release.yml +5 -3
- data/.gitignore +37 -0
- data/.rubocop.yml +15 -7
- data/.rubocop_todo.yml +238 -40
- data/Gemfile +14 -9
- data/LICENSE.md +6 -2
- data/README.adoc +535 -373
- data/Rakefile +53 -0
- data/benchmarks/.gitignore +6 -0
- data/benchmarks/generate_report.rb +550 -0
- data/docs/Gemfile +13 -0
- data/docs/_config.yml +138 -0
- data/docs/_guides/advanced-features.adoc +87 -0
- data/docs/_guides/development-testing.adoc +165 -0
- data/docs/_guides/index.adoc +45 -0
- data/docs/_guides/modifying-xml.adoc +293 -0
- data/docs/_guides/parsing-xml.adoc +231 -0
- data/docs/_guides/sax-parsing.adoc +603 -0
- data/docs/_guides/working-with-documents.adoc +118 -0
- data/docs/_pages/adapter-compatibility.adoc +369 -0
- data/docs/_pages/adapters/headed-ox.adoc +237 -0
- data/docs/_pages/adapters/index.adoc +98 -0
- data/docs/_pages/adapters/libxml.adoc +286 -0
- data/docs/_pages/adapters/nokogiri.adoc +252 -0
- data/docs/_pages/adapters/oga.adoc +292 -0
- data/docs/_pages/adapters/ox.adoc +55 -0
- data/docs/_pages/adapters/rexml.adoc +293 -0
- data/docs/_pages/best-practices.adoc +430 -0
- data/docs/_pages/compatibility.adoc +468 -0
- data/docs/_pages/configuration.adoc +251 -0
- data/docs/_pages/error-handling.adoc +350 -0
- data/docs/_pages/headed-ox-limitations.adoc +558 -0
- data/docs/_pages/headed-ox.adoc +1025 -0
- data/docs/_pages/index.adoc +35 -0
- data/docs/_pages/installation.adoc +141 -0
- data/docs/_pages/node-api-reference.adoc +50 -0
- data/docs/_pages/performance.adoc +36 -0
- data/docs/_pages/quick-start.adoc +244 -0
- data/docs/_pages/thread-safety.adoc +29 -0
- data/docs/_references/document-api.adoc +408 -0
- data/docs/_references/index.adoc +48 -0
- data/docs/_tutorials/basic-usage.adoc +268 -0
- data/docs/_tutorials/builder-pattern.adoc +343 -0
- data/docs/_tutorials/index.adoc +33 -0
- data/docs/_tutorials/namespace-handling.adoc +325 -0
- data/docs/_tutorials/xpath-queries.adoc +359 -0
- data/docs/index.adoc +122 -0
- data/examples/README.md +124 -0
- data/examples/api_client/README.md +424 -0
- data/examples/api_client/api_client.rb +394 -0
- data/examples/api_client/example_response.xml +48 -0
- data/examples/headed_ox_example/README.md +90 -0
- data/examples/headed_ox_example/headed_ox_demo.rb +71 -0
- data/examples/rss_parser/README.md +194 -0
- data/examples/rss_parser/example_feed.xml +93 -0
- data/examples/rss_parser/rss_parser.rb +189 -0
- data/examples/sax_parsing/README.md +50 -0
- data/examples/sax_parsing/data_extractor.rb +75 -0
- data/examples/sax_parsing/example.xml +21 -0
- data/examples/sax_parsing/large_file.rb +78 -0
- data/examples/sax_parsing/simple_parser.rb +55 -0
- data/examples/web_scraper/README.md +352 -0
- data/examples/web_scraper/example_page.html +201 -0
- data/examples/web_scraper/web_scraper.rb +312 -0
- data/lib/moxml/adapter/base.rb +107 -28
- data/lib/moxml/adapter/customized_libxml/cdata.rb +28 -0
- data/lib/moxml/adapter/customized_libxml/comment.rb +24 -0
- data/lib/moxml/adapter/customized_libxml/declaration.rb +85 -0
- data/lib/moxml/adapter/customized_libxml/element.rb +39 -0
- data/lib/moxml/adapter/customized_libxml/node.rb +44 -0
- data/lib/moxml/adapter/customized_libxml/processing_instruction.rb +31 -0
- data/lib/moxml/adapter/customized_libxml/text.rb +27 -0
- data/lib/moxml/adapter/customized_oga/xml_generator.rb +1 -1
- data/lib/moxml/adapter/customized_ox/attribute.rb +28 -1
- data/lib/moxml/adapter/customized_rexml/formatter.rb +11 -6
- data/lib/moxml/adapter/headed_ox.rb +161 -0
- data/lib/moxml/adapter/libxml.rb +1548 -0
- data/lib/moxml/adapter/nokogiri.rb +121 -9
- data/lib/moxml/adapter/oga.rb +123 -12
- data/lib/moxml/adapter/ox.rb +282 -26
- data/lib/moxml/adapter/rexml.rb +127 -20
- data/lib/moxml/adapter.rb +21 -4
- data/lib/moxml/attribute.rb +6 -0
- data/lib/moxml/builder.rb +40 -4
- data/lib/moxml/config.rb +8 -3
- data/lib/moxml/context.rb +39 -1
- data/lib/moxml/doctype.rb +13 -1
- data/lib/moxml/document.rb +39 -6
- data/lib/moxml/document_builder.rb +27 -5
- data/lib/moxml/element.rb +71 -2
- data/lib/moxml/error.rb +175 -6
- data/lib/moxml/node.rb +94 -3
- data/lib/moxml/node_set.rb +34 -0
- data/lib/moxml/sax/block_handler.rb +194 -0
- data/lib/moxml/sax/element_handler.rb +124 -0
- data/lib/moxml/sax/handler.rb +113 -0
- data/lib/moxml/sax.rb +31 -0
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils/encoder.rb +4 -4
- data/lib/moxml/xml_utils.rb +7 -4
- data/lib/moxml/xpath/ast/node.rb +159 -0
- data/lib/moxml/xpath/cache.rb +91 -0
- data/lib/moxml/xpath/compiler.rb +1768 -0
- data/lib/moxml/xpath/context.rb +26 -0
- data/lib/moxml/xpath/conversion.rb +124 -0
- data/lib/moxml/xpath/engine.rb +52 -0
- data/lib/moxml/xpath/errors.rb +101 -0
- data/lib/moxml/xpath/lexer.rb +304 -0
- data/lib/moxml/xpath/parser.rb +485 -0
- data/lib/moxml/xpath/ruby/generator.rb +269 -0
- data/lib/moxml/xpath/ruby/node.rb +193 -0
- data/lib/moxml/xpath.rb +37 -0
- data/lib/moxml.rb +5 -2
- data/moxml.gemspec +3 -1
- data/old-specs/moxml/adapter/customized_libxml/.gitkeep +6 -0
- data/spec/consistency/README.md +77 -0
- data/spec/{moxml/examples/adapter_spec.rb → consistency/adapter_parity_spec.rb} +4 -4
- data/spec/examples/README.md +75 -0
- data/spec/{support/shared_examples/examples/attribute.rb → examples/attribute_examples_spec.rb} +1 -1
- data/spec/{support/shared_examples/examples/basic_usage.rb → examples/basic_usage_spec.rb} +2 -2
- data/spec/{support/shared_examples/examples/namespace.rb → examples/namespace_examples_spec.rb} +3 -3
- data/spec/{support/shared_examples/examples/readme_examples.rb → examples/readme_examples_spec.rb} +6 -4
- data/spec/{support/shared_examples/examples/xpath.rb → examples/xpath_examples_spec.rb} +10 -6
- data/spec/integration/README.md +71 -0
- data/spec/{moxml/all_with_adapters_spec.rb → integration/all_adapters_spec.rb} +3 -2
- data/spec/integration/headed_ox_integration_spec.rb +326 -0
- data/spec/{support → integration}/shared_examples/edge_cases.rb +37 -10
- data/spec/integration/shared_examples/high_level/.gitkeep +0 -0
- data/spec/{support/shared_examples/context.rb → integration/shared_examples/high_level/context_behavior.rb} +2 -1
- data/spec/{support/shared_examples/integration.rb → integration/shared_examples/integration_workflows.rb} +23 -6
- data/spec/integration/shared_examples/node_wrappers/.gitkeep +0 -0
- data/spec/{support/shared_examples/cdata.rb → integration/shared_examples/node_wrappers/cdata_behavior.rb} +6 -1
- data/spec/{support/shared_examples/comment.rb → integration/shared_examples/node_wrappers/comment_behavior.rb} +2 -1
- data/spec/{support/shared_examples/declaration.rb → integration/shared_examples/node_wrappers/declaration_behavior.rb} +5 -2
- data/spec/{support/shared_examples/doctype.rb → integration/shared_examples/node_wrappers/doctype_behavior.rb} +2 -2
- data/spec/{support/shared_examples/document.rb → integration/shared_examples/node_wrappers/document_behavior.rb} +1 -1
- data/spec/{support/shared_examples/node.rb → integration/shared_examples/node_wrappers/node_behavior.rb} +9 -2
- data/spec/{support/shared_examples/node_set.rb → integration/shared_examples/node_wrappers/node_set_behavior.rb} +1 -18
- data/spec/{support/shared_examples/processing_instruction.rb → integration/shared_examples/node_wrappers/processing_instruction_behavior.rb} +6 -2
- data/spec/moxml/README.md +41 -0
- data/spec/moxml/adapter/.gitkeep +0 -0
- data/spec/moxml/adapter/README.md +61 -0
- data/spec/moxml/adapter/base_spec.rb +27 -0
- data/spec/moxml/adapter/headed_ox_spec.rb +311 -0
- data/spec/moxml/adapter/libxml_spec.rb +14 -0
- data/spec/moxml/adapter/ox_spec.rb +9 -8
- data/spec/moxml/adapter/shared_examples/.gitkeep +0 -0
- data/spec/{support/shared_examples/xml_adapter.rb → moxml/adapter/shared_examples/adapter_contract.rb} +39 -12
- data/spec/moxml/adapter_spec.rb +16 -0
- data/spec/moxml/attribute_spec.rb +30 -0
- data/spec/moxml/builder_spec.rb +33 -0
- data/spec/moxml/cdata_spec.rb +31 -0
- data/spec/moxml/comment_spec.rb +31 -0
- data/spec/moxml/config_spec.rb +3 -3
- data/spec/moxml/context_spec.rb +28 -0
- data/spec/moxml/declaration_spec.rb +36 -0
- data/spec/moxml/doctype_spec.rb +33 -0
- data/spec/moxml/document_builder_spec.rb +30 -0
- data/spec/moxml/document_spec.rb +105 -0
- data/spec/moxml/element_spec.rb +143 -0
- data/spec/moxml/error_spec.rb +266 -22
- data/spec/{moxml_spec.rb → moxml/moxml_spec.rb} +9 -9
- data/spec/moxml/namespace_spec.rb +32 -0
- data/spec/moxml/node_set_spec.rb +39 -0
- data/spec/moxml/node_spec.rb +37 -0
- data/spec/moxml/processing_instruction_spec.rb +34 -0
- data/spec/moxml/sax_spec.rb +1067 -0
- data/spec/moxml/text_spec.rb +31 -0
- data/spec/moxml/version_spec.rb +14 -0
- data/spec/moxml/xml_utils/.gitkeep +0 -0
- data/spec/moxml/xml_utils/encoder_spec.rb +27 -0
- data/spec/moxml/xml_utils_spec.rb +49 -0
- data/spec/moxml/xpath/ast/node_spec.rb +83 -0
- data/spec/moxml/xpath/axes_spec.rb +296 -0
- data/spec/moxml/xpath/cache_spec.rb +358 -0
- data/spec/moxml/xpath/compiler_spec.rb +406 -0
- data/spec/moxml/xpath/context_spec.rb +210 -0
- data/spec/moxml/xpath/conversion_spec.rb +365 -0
- data/spec/moxml/xpath/fixtures/sample.xml +25 -0
- data/spec/moxml/xpath/functions/boolean_functions_spec.rb +114 -0
- data/spec/moxml/xpath/functions/node_functions_spec.rb +145 -0
- data/spec/moxml/xpath/functions/numeric_functions_spec.rb +164 -0
- data/spec/moxml/xpath/functions/position_functions_spec.rb +93 -0
- data/spec/moxml/xpath/functions/special_functions_spec.rb +89 -0
- data/spec/moxml/xpath/functions/string_functions_spec.rb +381 -0
- data/spec/moxml/xpath/lexer_spec.rb +488 -0
- data/spec/moxml/xpath/parser_integration_spec.rb +210 -0
- data/spec/moxml/xpath/parser_spec.rb +364 -0
- data/spec/moxml/xpath/ruby/generator_spec.rb +421 -0
- data/spec/moxml/xpath/ruby/node_spec.rb +291 -0
- data/spec/moxml/xpath_capabilities_spec.rb +199 -0
- data/spec/moxml/xpath_spec.rb +77 -0
- data/spec/performance/README.md +83 -0
- data/spec/performance/benchmark_spec.rb +64 -0
- data/spec/{support/shared_examples/examples/memory.rb → performance/memory_usage_spec.rb} +3 -1
- data/spec/{support/shared_examples/examples/thread_safety.rb → performance/thread_safety_spec.rb} +3 -1
- data/spec/performance/xpath_benchmark_spec.rb +259 -0
- data/spec/spec_helper.rb +58 -1
- data/spec/support/xml_matchers.rb +1 -1
- metadata +176 -34
- data/spec/support/shared_examples/examples/benchmark_spec.rb +0 -51
- /data/spec/{support/shared_examples/builder.rb → integration/shared_examples/high_level/builder_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/document_builder.rb → integration/shared_examples/high_level/document_builder_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/attribute.rb → integration/shared_examples/node_wrappers/attribute_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/element.rb → integration/shared_examples/node_wrappers/element_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/namespace.rb → integration/shared_examples/node_wrappers/namespace_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/text.rb → integration/shared_examples/node_wrappers/text_behavior.rb} +0 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require "bundler/setup"
|
|
5
|
+
require "moxml"
|
|
6
|
+
|
|
7
|
+
xml = File.read(File.join(__dir__, "example.xml"))
|
|
8
|
+
|
|
9
|
+
puts "=== Example 1: Class-Based Handler ==="
|
|
10
|
+
puts
|
|
11
|
+
|
|
12
|
+
# Define a simple handler class
|
|
13
|
+
class SimpleHandler < Moxml::SAX::Handler
|
|
14
|
+
def on_start_document
|
|
15
|
+
puts "Document started"
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def on_start_element(name, attributes = {}, _namespaces = {})
|
|
19
|
+
attrs_str = attributes.map { |k, v| "#{k}=#{v}" }.join(", ")
|
|
20
|
+
puts " Start element: #{name}" + (attrs_str.empty? ? "" : " [#{attrs_str}]")
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def on_characters(text)
|
|
24
|
+
text = text.strip
|
|
25
|
+
puts " Text: #{text}" unless text.empty?
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def on_end_element(name)
|
|
29
|
+
puts " End element: #{name}"
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def on_end_document
|
|
33
|
+
puts "Document ended"
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
context = Moxml.new(:nokogiri)
|
|
38
|
+
handler = SimpleHandler.new
|
|
39
|
+
context.sax_parse(xml, handler)
|
|
40
|
+
|
|
41
|
+
puts
|
|
42
|
+
puts "=== Example 2: Block-Based Handler ==="
|
|
43
|
+
puts
|
|
44
|
+
|
|
45
|
+
element_count = 0
|
|
46
|
+
context.sax_parse(xml) do
|
|
47
|
+
start_document { puts "Document started" }
|
|
48
|
+
|
|
49
|
+
start_element do |name, _attrs|
|
|
50
|
+
element_count += 1
|
|
51
|
+
puts " Element #{element_count}: #{name}"
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
end_document { puts "Document ended - processed #{element_count} elements" }
|
|
55
|
+
end
|
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
# Web Scraper Example
|
|
2
|
+
|
|
3
|
+
This example demonstrates how to scrape data from HTML/XML documents using Moxml, showcasing table extraction, DOM navigation, and attribute access.
|
|
4
|
+
|
|
5
|
+
## What This Example Demonstrates
|
|
6
|
+
|
|
7
|
+
- **HTML Parsing**: Parsing HTML as XML for data extraction
|
|
8
|
+
- **Table Scraping**: Extracting structured data from HTML tables
|
|
9
|
+
- **DOM Navigation**: Traversing the document structure
|
|
10
|
+
- **Attribute Access**: Reading element attributes and data attributes
|
|
11
|
+
- **XPath Patterns**: Various XPath selectors for element selection
|
|
12
|
+
- **Data Structuring**: Converting scraped data into Ruby objects
|
|
13
|
+
|
|
14
|
+
## Files
|
|
15
|
+
|
|
16
|
+
- `web_scraper.rb` - Main scraper implementation
|
|
17
|
+
- `example_page.html` - Sample HTML page with programming language statistics
|
|
18
|
+
- `README.md` - This file
|
|
19
|
+
|
|
20
|
+
## Running the Example
|
|
21
|
+
|
|
22
|
+
### Using the Example Page
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
ruby examples/web_scraper/web_scraper.rb
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Using Your Own HTML
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
ruby examples/web_scraper/web_scraper.rb path/to/your/page.html
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Expected Output
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
Scraping HTML page: examples/web_scraper/example_page.html
|
|
38
|
+
================================================================================
|
|
39
|
+
Programming Language Statistics Scraper
|
|
40
|
+
================================================================================
|
|
41
|
+
|
|
42
|
+
Page Title: Programming Language Statistics - 2024
|
|
43
|
+
|
|
44
|
+
Summary:
|
|
45
|
+
10 total languages tracked
|
|
46
|
+
Last updated: October 30, 2024
|
|
47
|
+
|
|
48
|
+
Languages Extracted: 10
|
|
49
|
+
--------------------------------------------------------------------------------
|
|
50
|
+
1. Python (Interpreted) - 95.5% | Created: 1991 | Uses: Data Science, Web, AI
|
|
51
|
+
2. JavaScript (Interpreted) - 94.2% | Created: 1995 | Uses: Web Development
|
|
52
|
+
3. Java (Compiled) - 89.7% | Created: 1995 | Uses: Enterprise, Android
|
|
53
|
+
[...]
|
|
54
|
+
|
|
55
|
+
Category Statistics:
|
|
56
|
+
--------------------------------------------------------------------------------
|
|
57
|
+
Interpreted: 3 languages, avg 85.0%, top: Python
|
|
58
|
+
Compiled: 7 languages, avg 70.1%, top: Java
|
|
59
|
+
|
|
60
|
+
Detailed Information:
|
|
61
|
+
--------------------------------------------------------------------------------
|
|
62
|
+
python:
|
|
63
|
+
Paradigm: Multi-paradigm: object-oriented, procedural, functional
|
|
64
|
+
Typing: Dynamic, strong
|
|
65
|
+
Community: Very large and active
|
|
66
|
+
Learning Curve: Beginner-friendly
|
|
67
|
+
[...]
|
|
68
|
+
|
|
69
|
+
XPath Pattern Demonstrations
|
|
70
|
+
================================================================================
|
|
71
|
+
1. All table headers (//th):
|
|
72
|
+
Found 10 headers: Rank, Language, Category, ...
|
|
73
|
+
[...]
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Key Concepts
|
|
77
|
+
|
|
78
|
+
### Table Scraping
|
|
79
|
+
|
|
80
|
+
Extract data from HTML tables systematically:
|
|
81
|
+
|
|
82
|
+
```ruby
|
|
83
|
+
# Find table by ID
|
|
84
|
+
table = doc.at_xpath("//table[@id='popularity-table']")
|
|
85
|
+
|
|
86
|
+
# Get all rows
|
|
87
|
+
rows = table.xpath('.//tbody/tr')
|
|
88
|
+
|
|
89
|
+
# Extract cells from each row
|
|
90
|
+
rows.each do |row|
|
|
91
|
+
cells = row.xpath('./td')
|
|
92
|
+
rank = cells[0].text.strip
|
|
93
|
+
name = cells[1].text.strip
|
|
94
|
+
# ...
|
|
95
|
+
end
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Attribute Access
|
|
99
|
+
|
|
100
|
+
Read element attributes using the `[]` operator:
|
|
101
|
+
|
|
102
|
+
```ruby
|
|
103
|
+
# Get data attribute
|
|
104
|
+
score = cell['data-score']
|
|
105
|
+
|
|
106
|
+
# Get class attribute
|
|
107
|
+
class_name = element['class']
|
|
108
|
+
|
|
109
|
+
# Check if attribute exists
|
|
110
|
+
if row['data-language']
|
|
111
|
+
lang = row['data-language']
|
|
112
|
+
end
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### XPath Patterns
|
|
116
|
+
|
|
117
|
+
The example demonstrates various XPath patterns:
|
|
118
|
+
|
|
119
|
+
```ruby
|
|
120
|
+
# By ID
|
|
121
|
+
doc.at_xpath("//div[@id='summary']")
|
|
122
|
+
|
|
123
|
+
# By class (contains for multi-class support)
|
|
124
|
+
doc.xpath("//*[contains(@class, 'language-name')]")
|
|
125
|
+
|
|
126
|
+
# By attribute existence
|
|
127
|
+
doc.xpath("//tr[@data-language]")
|
|
128
|
+
|
|
129
|
+
# Combining conditions
|
|
130
|
+
doc.xpath("//div[contains(@class, 'stats-card') and @data-language]")
|
|
131
|
+
|
|
132
|
+
# Direct children only
|
|
133
|
+
element.xpath('./td') # Not './/td'
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### DOM Navigation
|
|
137
|
+
|
|
138
|
+
Navigate the document tree:
|
|
139
|
+
|
|
140
|
+
```ruby
|
|
141
|
+
# Get parent
|
|
142
|
+
parent = element.parent
|
|
143
|
+
|
|
144
|
+
# Get children
|
|
145
|
+
children = element.children
|
|
146
|
+
|
|
147
|
+
# Get siblings
|
|
148
|
+
next_elem = element.next_sibling
|
|
149
|
+
prev_elem = element.previous_sibling
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### Error Handling
|
|
153
|
+
|
|
154
|
+
Handle parsing errors gracefully:
|
|
155
|
+
|
|
156
|
+
```ruby
|
|
157
|
+
begin
|
|
158
|
+
doc = @moxml.parse(html_content)
|
|
159
|
+
rescue Moxml::ParseError => e
|
|
160
|
+
puts "Failed to parse HTML: #{e.message}"
|
|
161
|
+
exit 1
|
|
162
|
+
end
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Code Structure
|
|
166
|
+
|
|
167
|
+
### Language Class
|
|
168
|
+
|
|
169
|
+
Represents a programming language with:
|
|
170
|
+
- Rank, name, category
|
|
171
|
+
- Popularity score
|
|
172
|
+
- Year created
|
|
173
|
+
- Primary use cases
|
|
174
|
+
|
|
175
|
+
### CategoryStats Class
|
|
176
|
+
|
|
177
|
+
Represents category statistics:
|
|
178
|
+
- Category name
|
|
179
|
+
- Language count
|
|
180
|
+
- Average score
|
|
181
|
+
- Top language
|
|
182
|
+
|
|
183
|
+
### WebScraper Class
|
|
184
|
+
|
|
185
|
+
Main scraper with methods:
|
|
186
|
+
- `scrape` - Main scraping entry point
|
|
187
|
+
- `extract_page_title` - Get page title
|
|
188
|
+
- `extract_summary` - Extract summary statistics
|
|
189
|
+
- `extract_languages_table` - Parse language table
|
|
190
|
+
- `extract_category_stats` - Parse category table
|
|
191
|
+
- `extract_detailed_info` - Parse detail cards
|
|
192
|
+
|
|
193
|
+
## XPath Pattern Reference
|
|
194
|
+
|
|
195
|
+
### Basic Selectors
|
|
196
|
+
|
|
197
|
+
```ruby
|
|
198
|
+
# All elements of a type
|
|
199
|
+
doc.xpath('//div')
|
|
200
|
+
|
|
201
|
+
# Element by ID
|
|
202
|
+
doc.at_xpath("//div[@id='content']")
|
|
203
|
+
|
|
204
|
+
# Element by class (single class)
|
|
205
|
+
doc.xpath("//div[@class='card']")
|
|
206
|
+
|
|
207
|
+
# Element by class (multiple classes)
|
|
208
|
+
doc.xpath("//*[contains(@class, 'card')]")
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### Attribute Selectors
|
|
212
|
+
|
|
213
|
+
```ruby
|
|
214
|
+
# Has attribute
|
|
215
|
+
doc.xpath("//tr[@data-language]")
|
|
216
|
+
|
|
217
|
+
# Attribute equals value
|
|
218
|
+
doc.xpath("//input[@type='text']")
|
|
219
|
+
|
|
220
|
+
# Attribute contains value
|
|
221
|
+
doc.xpath("//div[contains(@class, 'active')]")
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
### Hierarchical Selectors
|
|
225
|
+
|
|
226
|
+
```ruby
|
|
227
|
+
# Direct child
|
|
228
|
+
div.xpath('./p') # Only direct <p> children
|
|
229
|
+
|
|
230
|
+
# Any descendant
|
|
231
|
+
div.xpath('.//p') # All <p> descendants
|
|
232
|
+
|
|
233
|
+
# Parent
|
|
234
|
+
element.parent
|
|
235
|
+
|
|
236
|
+
# Sibling
|
|
237
|
+
element.next_sibling
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Combining Conditions
|
|
241
|
+
|
|
242
|
+
```ruby
|
|
243
|
+
# AND condition
|
|
244
|
+
doc.xpath("//div[@class='card' and @id='main']")
|
|
245
|
+
|
|
246
|
+
# Multiple conditions
|
|
247
|
+
doc.xpath("//tr[contains(@class, 'row') and @data-id]")
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Customization
|
|
251
|
+
|
|
252
|
+
### Scraping Different Tables
|
|
253
|
+
|
|
254
|
+
Modify XPath selectors for your table structure:
|
|
255
|
+
|
|
256
|
+
```ruby
|
|
257
|
+
# Different table structure
|
|
258
|
+
table = doc.at_xpath("//table[@class='data-table']")
|
|
259
|
+
headers = table.xpath('.//thead/tr/th').map(&:text)
|
|
260
|
+
rows = table.xpath('.//tbody/tr')
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
### Handling Complex HTML
|
|
264
|
+
|
|
265
|
+
For nested structures:
|
|
266
|
+
|
|
267
|
+
```ruby
|
|
268
|
+
# Extract nested data
|
|
269
|
+
card.xpath('.//div[@class="section"]').each do |section|
|
|
270
|
+
title = section.at_xpath('./h3').text
|
|
271
|
+
items = section.xpath('.//li').map(&:text)
|
|
272
|
+
end
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
### Data Cleaning
|
|
276
|
+
|
|
277
|
+
Clean extracted text:
|
|
278
|
+
|
|
279
|
+
```ruby
|
|
280
|
+
# Strip whitespace
|
|
281
|
+
text = element.text.strip
|
|
282
|
+
|
|
283
|
+
# Remove special characters
|
|
284
|
+
text = text.gsub(/[^\w\s]/, '')
|
|
285
|
+
|
|
286
|
+
# Parse numbers
|
|
287
|
+
score = text.delete('%').to_f
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## Learning Points
|
|
291
|
+
|
|
292
|
+
1. **HTML as XML**: Well-formed HTML can be parsed as XML
|
|
293
|
+
2. **XPath is powerful**: One query can find many elements
|
|
294
|
+
3. **Attributes are key**: Use data attributes for reliable scraping
|
|
295
|
+
4. **Structure matters**: Understand the DOM structure before scraping
|
|
296
|
+
5. **Clean data**: Always clean and validate scraped data
|
|
297
|
+
6. **Error handling**: Handle missing elements gracefully
|
|
298
|
+
|
|
299
|
+
## Best Practices
|
|
300
|
+
|
|
301
|
+
1. **Use specific selectors**: Prefer IDs over classes when available
|
|
302
|
+
2. **Validate data**: Check for nil/empty values
|
|
303
|
+
3. **Handle errors**: Wrap parsing in begin/rescue blocks
|
|
304
|
+
4. **Clean text**: Strip whitespace and normalize data
|
|
305
|
+
5. **Document structure**: Understand the HTML before writing XPath
|
|
306
|
+
6. **Test thoroughly**: Test with different HTML structures
|
|
307
|
+
|
|
308
|
+
## Common Issues
|
|
309
|
+
|
|
310
|
+
### Issue: Element not found
|
|
311
|
+
|
|
312
|
+
```ruby
|
|
313
|
+
# Bad - will raise error if not found
|
|
314
|
+
title = doc.xpath('//title').first.text
|
|
315
|
+
|
|
316
|
+
# Good - safe navigation
|
|
317
|
+
title = doc.at_xpath('//title')&.text || 'Unknown'
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
### Issue: Incorrect XPath
|
|
321
|
+
|
|
322
|
+
```ruby
|
|
323
|
+
# Wrong - searches entire document
|
|
324
|
+
row.xpath('//td')
|
|
325
|
+
|
|
326
|
+
# Correct - searches within row only
|
|
327
|
+
row.xpath('./td')
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
### Issue: Class matching
|
|
331
|
+
|
|
332
|
+
```ruby
|
|
333
|
+
# Fragile - exact string match; fails if the element has extra classes or a different class order
|
|
334
|
+
div.xpath("//div[@class='card active']")
|
|
335
|
+
|
|
336
|
+
# Works with multiple classes (note: substring match, so it also matches e.g. 'card-header')
|
|
337
|
+
div.xpath("//div[contains(@class, 'card')]")
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
## Next Steps
|
|
341
|
+
|
|
342
|
+
- Scrape real websites (check robots.txt and terms of service)
|
|
343
|
+
- Add data export (CSV, JSON)
|
|
344
|
+
- Implement pagination handling
|
|
345
|
+
- Add retry logic for failed requests
|
|
346
|
+
- Create scrapers for different domains
|
|
347
|
+
- Implement data validation
|
|
348
|
+
|
|
349
|
+
## Related Examples
|
|
350
|
+
|
|
351
|
+
- [RSS Parser](../rss_parser/) - Similar XPath techniques for RSS
|
|
352
|
+
- [API Client](../api_client/) - XML generation and parsing
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8"/>
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
|
|
6
|
+
<title>Programming Language Statistics - 2024</title>
|
|
7
|
+
<style>
|
|
8
|
+
body {
|
|
9
|
+
font-family: Arial, sans-serif;
|
|
10
|
+
max-width: 1200px;
|
|
11
|
+
margin: 0 auto;
|
|
12
|
+
padding: 20px;
|
|
13
|
+
}
|
|
14
|
+
table {
|
|
15
|
+
width: 100%;
|
|
16
|
+
border-collapse: collapse;
|
|
17
|
+
margin: 20px 0;
|
|
18
|
+
}
|
|
19
|
+
th, td {
|
|
20
|
+
border: 1px solid #ddd;
|
|
21
|
+
padding: 12px;
|
|
22
|
+
text-align: left;
|
|
23
|
+
}
|
|
24
|
+
th {
|
|
25
|
+
background-color: #4CAF50;
|
|
26
|
+
color: white;
|
|
27
|
+
}
|
|
28
|
+
tr:nth-child(even) {
|
|
29
|
+
background-color: #f2f2f2;
|
|
30
|
+
}
|
|
31
|
+
.stats-card {
|
|
32
|
+
background-color: #f9f9f9;
|
|
33
|
+
border: 1px solid #ddd;
|
|
34
|
+
border-radius: 5px;
|
|
35
|
+
padding: 15px;
|
|
36
|
+
margin: 10px 0;
|
|
37
|
+
}
|
|
38
|
+
.language-name {
|
|
39
|
+
font-weight: bold;
|
|
40
|
+
color: #333;
|
|
41
|
+
}
|
|
42
|
+
</style>
|
|
43
|
+
</head>
|
|
44
|
+
<body>
|
|
45
|
+
<h1>Programming Language Statistics - 2024</h1>
|
|
46
|
+
|
|
47
|
+
<div class="stats-card" id="summary">
|
|
48
|
+
<h2>Summary</h2>
|
|
49
|
+
<p>Total languages tracked: <span class="stat-value">10</span></p>
|
|
50
|
+
<p>Data last updated: <span class="stat-value">October 30, 2024</span></p>
|
|
51
|
+
</div>
|
|
52
|
+
|
|
53
|
+
<h2>Most Popular Programming Languages</h2>
|
|
54
|
+
<table id="popularity-table">
|
|
55
|
+
<thead>
|
|
56
|
+
<tr>
|
|
57
|
+
<th>Rank</th>
|
|
58
|
+
<th>Language</th>
|
|
59
|
+
<th>Category</th>
|
|
60
|
+
<th>Popularity Score</th>
|
|
61
|
+
<th>Year Created</th>
|
|
62
|
+
<th>Primary Use</th>
|
|
63
|
+
</tr>
|
|
64
|
+
</thead>
|
|
65
|
+
<tbody>
|
|
66
|
+
<tr data-language="python">
|
|
67
|
+
<td>1</td>
|
|
68
|
+
<td class="language-name">Python</td>
|
|
69
|
+
<td>Interpreted</td>
|
|
70
|
+
<td data-score="95.5">95.5%</td>
|
|
71
|
+
<td>1991</td>
|
|
72
|
+
<td>Data Science, Web, AI</td>
|
|
73
|
+
</tr>
|
|
74
|
+
<tr data-language="javascript">
|
|
75
|
+
<td>2</td>
|
|
76
|
+
<td class="language-name">JavaScript</td>
|
|
77
|
+
<td>Interpreted</td>
|
|
78
|
+
<td data-score="94.2">94.2%</td>
|
|
79
|
+
<td>1995</td>
|
|
80
|
+
<td>Web Development</td>
|
|
81
|
+
</tr>
|
|
82
|
+
<tr data-language="java">
|
|
83
|
+
<td>3</td>
|
|
84
|
+
<td class="language-name">Java</td>
|
|
85
|
+
<td>Compiled</td>
|
|
86
|
+
<td data-score="89.7">89.7%</td>
|
|
87
|
+
<td>1995</td>
|
|
88
|
+
<td>Enterprise, Android</td>
|
|
89
|
+
</tr>
|
|
90
|
+
<tr data-language="csharp">
|
|
91
|
+
<td>4</td>
|
|
92
|
+
<td class="language-name">C#</td>
|
|
93
|
+
<td>Compiled</td>
|
|
94
|
+
<td data-score="82.3">82.3%</td>
|
|
95
|
+
<td>2000</td>
|
|
96
|
+
<td>.NET, Games, Enterprise</td>
|
|
97
|
+
</tr>
|
|
98
|
+
<tr data-language="cpp">
|
|
99
|
+
<td>5</td>
|
|
100
|
+
<td class="language-name">C++</td>
|
|
101
|
+
<td>Compiled</td>
|
|
102
|
+
<td data-score="78.9">78.9%</td>
|
|
103
|
+
<td>1985</td>
|
|
104
|
+
<td>Systems, Games, Performance</td>
|
|
105
|
+
</tr>
|
|
106
|
+
<tr data-language="ruby">
|
|
107
|
+
<td>6</td>
|
|
108
|
+
<td class="language-name">Ruby</td>
|
|
109
|
+
<td>Interpreted</td>
|
|
110
|
+
<td data-score="65.4">65.4%</td>
|
|
111
|
+
<td>1995</td>
|
|
112
|
+
<td>Web Development, Scripting</td>
|
|
113
|
+
</tr>
|
|
114
|
+
<tr data-language="go">
|
|
115
|
+
<td>7</td>
|
|
116
|
+
<td class="language-name">Go</td>
|
|
117
|
+
<td>Compiled</td>
|
|
118
|
+
<td data-score="63.8">63.8%</td>
|
|
119
|
+
<td>2009</td>
|
|
120
|
+
<td>Cloud, Microservices</td>
|
|
121
|
+
</tr>
|
|
122
|
+
<tr data-language="rust">
|
|
123
|
+
<td>8</td>
|
|
124
|
+
<td class="language-name">Rust</td>
|
|
125
|
+
<td>Compiled</td>
|
|
126
|
+
<td data-score="61.2">61.2%</td>
|
|
127
|
+
<td>2010</td>
|
|
128
|
+
<td>Systems, WebAssembly</td>
|
|
129
|
+
</tr>
|
|
130
|
+
<tr data-language="swift">
|
|
131
|
+
<td>9</td>
|
|
132
|
+
<td class="language-name">Swift</td>
|
|
133
|
+
<td>Compiled</td>
|
|
134
|
+
<td data-score="58.7">58.7%</td>
|
|
135
|
+
<td>2014</td>
|
|
136
|
+
<td>iOS, macOS Development</td>
|
|
137
|
+
</tr>
|
|
138
|
+
<tr data-language="kotlin">
|
|
139
|
+
<td>10</td>
|
|
140
|
+
<td class="language-name">Kotlin</td>
|
|
141
|
+
<td>Compiled</td>
|
|
142
|
+
<td data-score="56.3">56.3%</td>
|
|
143
|
+
<td>2011</td>
|
|
144
|
+
<td>Android, JVM</td>
|
|
145
|
+
</tr>
|
|
146
|
+
</tbody>
|
|
147
|
+
</table>
|
|
148
|
+
|
|
149
|
+
<h2>Language Statistics by Category</h2>
|
|
150
|
+
<table id="category-table">
|
|
151
|
+
<thead>
|
|
152
|
+
<tr>
|
|
153
|
+
<th>Category</th>
|
|
154
|
+
<th>Count</th>
|
|
155
|
+
<th>Average Score</th>
|
|
156
|
+
<th>Top Language</th>
|
|
157
|
+
</tr>
|
|
158
|
+
</thead>
|
|
159
|
+
<tbody>
|
|
160
|
+
<tr>
|
|
161
|
+
<td>Interpreted</td>
|
|
162
|
+
<td>3</td>
|
|
163
|
+
<td>85.0%</td>
|
|
164
|
+
<td>Python</td>
|
|
165
|
+
</tr>
|
|
166
|
+
<tr>
|
|
167
|
+
<td>Compiled</td>
|
|
168
|
+
<td>7</td>
|
|
169
|
+
<td>70.1%</td>
|
|
170
|
+
<td>Java</td>
|
|
171
|
+
</tr>
|
|
172
|
+
</tbody>
|
|
173
|
+
</table>
|
|
174
|
+
|
|
175
|
+
<h2>Detailed Language Information</h2>
|
|
176
|
+
<div class="stats-card" data-language="python">
|
|
177
|
+
<h3 class="language-name">Python</h3>
|
|
178
|
+
<ul>
|
|
179
|
+
<li><strong>Paradigm:</strong> Multi-paradigm: object-oriented, procedural, functional</li>
|
|
180
|
+
<li><strong>Typing:</strong> Dynamic, strong</li>
|
|
181
|
+
<li><strong>Community:</strong> Very large and active</li>
|
|
182
|
+
<li><strong>Learning Curve:</strong> Beginner-friendly</li>
|
|
183
|
+
</ul>
|
|
184
|
+
</div>
|
|
185
|
+
|
|
186
|
+
<div class="stats-card" data-language="ruby">
|
|
187
|
+
<h3 class="language-name">Ruby</h3>
|
|
188
|
+
<ul>
|
|
189
|
+
<li><strong>Paradigm:</strong> Multi-paradigm: object-oriented, functional</li>
|
|
190
|
+
<li><strong>Typing:</strong> Dynamic, strong</li>
|
|
191
|
+
<li><strong>Community:</strong> Active, focused on web development</li>
|
|
192
|
+
<li><strong>Learning Curve:</strong> Beginner to intermediate</li>
|
|
193
|
+
</ul>
|
|
194
|
+
</div>
|
|
195
|
+
|
|
196
|
+
<footer>
|
|
197
|
+
<p><em>Data compiled from various programming language popularity indices</em></p>
|
|
198
|
+
<p>Contact: <a href="mailto:info@example.com">info@example.com</a></p>
|
|
199
|
+
</footer>
|
|
200
|
+
</body>
|
|
201
|
+
</html>
|