moxml 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/dependent-repos.json +5 -0
- data/.github/workflows/dependent-tests.yml +20 -0
- data/.github/workflows/docs.yml +59 -0
- data/.github/workflows/rake.yml +10 -10
- data/.github/workflows/release.yml +5 -3
- data/.gitignore +37 -0
- data/.rubocop.yml +15 -7
- data/.rubocop_todo.yml +224 -43
- data/Gemfile +14 -9
- data/LICENSE.md +6 -2
- data/README.adoc +535 -373
- data/Rakefile +53 -0
- data/benchmarks/.gitignore +6 -0
- data/benchmarks/generate_report.rb +550 -0
- data/docs/Gemfile +13 -0
- data/docs/_config.yml +138 -0
- data/docs/_guides/advanced-features.adoc +87 -0
- data/docs/_guides/development-testing.adoc +165 -0
- data/docs/_guides/index.adoc +51 -0
- data/docs/_guides/modifying-xml.adoc +292 -0
- data/docs/_guides/parsing-xml.adoc +230 -0
- data/docs/_guides/sax-parsing.adoc +603 -0
- data/docs/_guides/working-with-documents.adoc +118 -0
- data/docs/_guides/xml-declaration.adoc +450 -0
- data/docs/_pages/adapter-compatibility.adoc +369 -0
- data/docs/_pages/adapters/headed-ox.adoc +237 -0
- data/docs/_pages/adapters/index.adoc +97 -0
- data/docs/_pages/adapters/libxml.adoc +285 -0
- data/docs/_pages/adapters/nokogiri.adoc +251 -0
- data/docs/_pages/adapters/oga.adoc +291 -0
- data/docs/_pages/adapters/ox.adoc +56 -0
- data/docs/_pages/adapters/rexml.adoc +292 -0
- data/docs/_pages/best-practices.adoc +429 -0
- data/docs/_pages/compatibility.adoc +467 -0
- data/docs/_pages/configuration.adoc +250 -0
- data/docs/_pages/error-handling.adoc +349 -0
- data/docs/_pages/headed-ox-limitations.adoc +574 -0
- data/docs/_pages/headed-ox.adoc +1025 -0
- data/docs/_pages/index.adoc +35 -0
- data/docs/_pages/installation.adoc +140 -0
- data/docs/_pages/node-api-reference.adoc +49 -0
- data/docs/_pages/performance.adoc +35 -0
- data/docs/_pages/quick-start.adoc +243 -0
- data/docs/_pages/thread-safety.adoc +28 -0
- data/docs/_references/document-api.adoc +407 -0
- data/docs/_references/index.adoc +48 -0
- data/docs/_tutorials/basic-usage.adoc +267 -0
- data/docs/_tutorials/builder-pattern.adoc +342 -0
- data/docs/_tutorials/index.adoc +33 -0
- data/docs/_tutorials/namespace-handling.adoc +324 -0
- data/docs/_tutorials/xpath-queries.adoc +358 -0
- data/docs/index.adoc +122 -0
- data/examples/README.md +124 -0
- data/examples/api_client/README.md +424 -0
- data/examples/api_client/api_client.rb +394 -0
- data/examples/api_client/example_response.xml +48 -0
- data/examples/headed_ox_example/README.md +90 -0
- data/examples/headed_ox_example/headed_ox_demo.rb +71 -0
- data/examples/rss_parser/README.md +194 -0
- data/examples/rss_parser/example_feed.xml +93 -0
- data/examples/rss_parser/rss_parser.rb +189 -0
- data/examples/sax_parsing/README.md +50 -0
- data/examples/sax_parsing/data_extractor.rb +75 -0
- data/examples/sax_parsing/example.xml +21 -0
- data/examples/sax_parsing/large_file.rb +78 -0
- data/examples/sax_parsing/simple_parser.rb +55 -0
- data/examples/web_scraper/README.md +352 -0
- data/examples/web_scraper/example_page.html +201 -0
- data/examples/web_scraper/web_scraper.rb +312 -0
- data/lib/moxml/adapter/base.rb +107 -28
- data/lib/moxml/adapter/customized_libxml/cdata.rb +28 -0
- data/lib/moxml/adapter/customized_libxml/comment.rb +24 -0
- data/lib/moxml/adapter/customized_libxml/declaration.rb +85 -0
- data/lib/moxml/adapter/customized_libxml/element.rb +39 -0
- data/lib/moxml/adapter/customized_libxml/node.rb +44 -0
- data/lib/moxml/adapter/customized_libxml/processing_instruction.rb +31 -0
- data/lib/moxml/adapter/customized_libxml/text.rb +27 -0
- data/lib/moxml/adapter/customized_oga/xml_generator.rb +1 -1
- data/lib/moxml/adapter/customized_ox/attribute.rb +28 -1
- data/lib/moxml/adapter/customized_rexml/formatter.rb +13 -8
- data/lib/moxml/adapter/headed_ox.rb +161 -0
- data/lib/moxml/adapter/libxml.rb +1564 -0
- data/lib/moxml/adapter/nokogiri.rb +156 -9
- data/lib/moxml/adapter/oga.rb +190 -15
- data/lib/moxml/adapter/ox.rb +322 -28
- data/lib/moxml/adapter/rexml.rb +157 -28
- data/lib/moxml/adapter.rb +21 -4
- data/lib/moxml/attribute.rb +6 -0
- data/lib/moxml/builder.rb +40 -4
- data/lib/moxml/config.rb +8 -3
- data/lib/moxml/context.rb +57 -2
- data/lib/moxml/declaration.rb +9 -0
- data/lib/moxml/doctype.rb +13 -1
- data/lib/moxml/document.rb +53 -6
- data/lib/moxml/document_builder.rb +34 -5
- data/lib/moxml/element.rb +71 -2
- data/lib/moxml/error.rb +175 -6
- data/lib/moxml/node.rb +155 -4
- data/lib/moxml/node_set.rb +34 -0
- data/lib/moxml/sax/block_handler.rb +194 -0
- data/lib/moxml/sax/element_handler.rb +124 -0
- data/lib/moxml/sax/handler.rb +113 -0
- data/lib/moxml/sax.rb +31 -0
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils/encoder.rb +4 -4
- data/lib/moxml/xml_utils.rb +7 -4
- data/lib/moxml/xpath/ast/node.rb +159 -0
- data/lib/moxml/xpath/cache.rb +91 -0
- data/lib/moxml/xpath/compiler.rb +1770 -0
- data/lib/moxml/xpath/context.rb +26 -0
- data/lib/moxml/xpath/conversion.rb +124 -0
- data/lib/moxml/xpath/engine.rb +52 -0
- data/lib/moxml/xpath/errors.rb +101 -0
- data/lib/moxml/xpath/lexer.rb +304 -0
- data/lib/moxml/xpath/parser.rb +485 -0
- data/lib/moxml/xpath/ruby/generator.rb +269 -0
- data/lib/moxml/xpath/ruby/node.rb +193 -0
- data/lib/moxml/xpath.rb +37 -0
- data/lib/moxml.rb +5 -2
- data/moxml.gemspec +3 -1
- data/old-specs/moxml/adapter/customized_libxml/.gitkeep +6 -0
- data/spec/consistency/README.md +77 -0
- data/spec/{moxml/examples/adapter_spec.rb → consistency/adapter_parity_spec.rb} +4 -4
- data/spec/examples/README.md +75 -0
- data/spec/{support/shared_examples/examples/attribute.rb → examples/attribute_examples_spec.rb} +1 -1
- data/spec/{support/shared_examples/examples/basic_usage.rb → examples/basic_usage_spec.rb} +2 -2
- data/spec/{support/shared_examples/examples/namespace.rb → examples/namespace_examples_spec.rb} +3 -3
- data/spec/{support/shared_examples/examples/readme_examples.rb → examples/readme_examples_spec.rb} +6 -4
- data/spec/{support/shared_examples/examples/xpath.rb → examples/xpath_examples_spec.rb} +10 -6
- data/spec/integration/README.md +71 -0
- data/spec/{moxml/all_with_adapters_spec.rb → integration/all_adapters_spec.rb} +3 -2
- data/spec/integration/headed_ox_integration_spec.rb +326 -0
- data/spec/{support → integration}/shared_examples/edge_cases.rb +37 -10
- data/spec/integration/shared_examples/high_level/.gitkeep +0 -0
- data/spec/{support/shared_examples/context.rb → integration/shared_examples/high_level/context_behavior.rb} +2 -1
- data/spec/{support/shared_examples/integration.rb → integration/shared_examples/integration_workflows.rb} +23 -6
- data/spec/integration/shared_examples/node_wrappers/.gitkeep +0 -0
- data/spec/{support/shared_examples/cdata.rb → integration/shared_examples/node_wrappers/cdata_behavior.rb} +6 -1
- data/spec/{support/shared_examples/comment.rb → integration/shared_examples/node_wrappers/comment_behavior.rb} +2 -1
- data/spec/{support/shared_examples/declaration.rb → integration/shared_examples/node_wrappers/declaration_behavior.rb} +5 -5
- data/spec/{support/shared_examples/doctype.rb → integration/shared_examples/node_wrappers/doctype_behavior.rb} +2 -2
- data/spec/{support/shared_examples/document.rb → integration/shared_examples/node_wrappers/document_behavior.rb} +1 -1
- data/spec/{support/shared_examples/node.rb → integration/shared_examples/node_wrappers/node_behavior.rb} +9 -2
- data/spec/{support/shared_examples/node_set.rb → integration/shared_examples/node_wrappers/node_set_behavior.rb} +1 -18
- data/spec/{support/shared_examples/processing_instruction.rb → integration/shared_examples/node_wrappers/processing_instruction_behavior.rb} +6 -2
- data/spec/moxml/README.md +41 -0
- data/spec/moxml/adapter/.gitkeep +0 -0
- data/spec/moxml/adapter/README.md +61 -0
- data/spec/moxml/adapter/base_spec.rb +27 -0
- data/spec/moxml/adapter/headed_ox_spec.rb +311 -0
- data/spec/moxml/adapter/libxml_spec.rb +14 -0
- data/spec/moxml/adapter/ox_spec.rb +9 -8
- data/spec/moxml/adapter/shared_examples/.gitkeep +0 -0
- data/spec/{support/shared_examples/xml_adapter.rb → moxml/adapter/shared_examples/adapter_contract.rb} +39 -12
- data/spec/moxml/adapter_spec.rb +16 -0
- data/spec/moxml/attribute_spec.rb +30 -0
- data/spec/moxml/builder_spec.rb +33 -0
- data/spec/moxml/cdata_spec.rb +31 -0
- data/spec/moxml/comment_spec.rb +31 -0
- data/spec/moxml/config_spec.rb +3 -3
- data/spec/moxml/context_spec.rb +28 -0
- data/spec/moxml/declaration_preservation_spec.rb +217 -0
- data/spec/moxml/declaration_spec.rb +36 -0
- data/spec/moxml/doctype_spec.rb +33 -0
- data/spec/moxml/document_builder_spec.rb +30 -0
- data/spec/moxml/document_spec.rb +105 -0
- data/spec/moxml/element_spec.rb +143 -0
- data/spec/moxml/error_spec.rb +266 -22
- data/spec/{moxml_spec.rb → moxml/moxml_spec.rb} +9 -9
- data/spec/moxml/namespace_spec.rb +32 -0
- data/spec/moxml/node_set_spec.rb +39 -0
- data/spec/moxml/node_spec.rb +37 -0
- data/spec/moxml/processing_instruction_spec.rb +34 -0
- data/spec/moxml/sax_spec.rb +1067 -0
- data/spec/moxml/text_spec.rb +31 -0
- data/spec/moxml/version_spec.rb +14 -0
- data/spec/moxml/xml_utils/.gitkeep +0 -0
- data/spec/moxml/xml_utils/encoder_spec.rb +27 -0
- data/spec/moxml/xml_utils_spec.rb +49 -0
- data/spec/moxml/xpath/ast/node_spec.rb +83 -0
- data/spec/moxml/xpath/axes_spec.rb +296 -0
- data/spec/moxml/xpath/cache_spec.rb +358 -0
- data/spec/moxml/xpath/compiler_spec.rb +406 -0
- data/spec/moxml/xpath/context_spec.rb +210 -0
- data/spec/moxml/xpath/conversion_spec.rb +365 -0
- data/spec/moxml/xpath/fixtures/sample.xml +25 -0
- data/spec/moxml/xpath/functions/boolean_functions_spec.rb +114 -0
- data/spec/moxml/xpath/functions/node_functions_spec.rb +145 -0
- data/spec/moxml/xpath/functions/numeric_functions_spec.rb +164 -0
- data/spec/moxml/xpath/functions/position_functions_spec.rb +93 -0
- data/spec/moxml/xpath/functions/special_functions_spec.rb +89 -0
- data/spec/moxml/xpath/functions/string_functions_spec.rb +381 -0
- data/spec/moxml/xpath/lexer_spec.rb +488 -0
- data/spec/moxml/xpath/parser_integration_spec.rb +210 -0
- data/spec/moxml/xpath/parser_spec.rb +364 -0
- data/spec/moxml/xpath/ruby/generator_spec.rb +421 -0
- data/spec/moxml/xpath/ruby/node_spec.rb +291 -0
- data/spec/moxml/xpath_capabilities_spec.rb +199 -0
- data/spec/moxml/xpath_spec.rb +77 -0
- data/spec/performance/README.md +83 -0
- data/spec/performance/benchmark_spec.rb +64 -0
- data/spec/{support/shared_examples/examples/memory.rb → performance/memory_usage_spec.rb} +4 -1
- data/spec/{support/shared_examples/examples/thread_safety.rb → performance/thread_safety_spec.rb} +3 -1
- data/spec/performance/xpath_benchmark_spec.rb +259 -0
- data/spec/spec_helper.rb +58 -1
- data/spec/support/xml_matchers.rb +1 -1
- metadata +178 -34
- data/spec/support/shared_examples/examples/benchmark_spec.rb +0 -51
- /data/spec/{support/shared_examples/builder.rb → integration/shared_examples/high_level/builder_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/document_builder.rb → integration/shared_examples/high_level/document_builder_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/attribute.rb → integration/shared_examples/node_wrappers/attribute_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/element.rb → integration/shared_examples/node_wrappers/element_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/namespace.rb → integration/shared_examples/node_wrappers/namespace_behavior.rb} +0 -0
- /data/spec/{support/shared_examples/text.rb → integration/shared_examples/node_wrappers/text_behavior.rb} +0 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Web Scraper Example
|
|
5
|
+
# This example demonstrates how to use Moxml to scrape data from HTML/XML:
|
|
6
|
+
# - Parsing HTML as XML
|
|
7
|
+
# - Extracting data from tables
|
|
8
|
+
# - DOM structure navigation
|
|
9
|
+
# - Attribute and text content access
|
|
10
|
+
# - Working with structured data
|
|
11
|
+
|
|
12
|
+
# Load moxml from the local source (use 'require "moxml"' in production)
|
|
13
|
+
require_relative "../../lib/moxml"
|
|
14
|
+
|
|
15
|
+
# Language class to represent programming language data
|
|
16
|
+
class Language
  attr_reader :rank, :name, :category, :score, :year, :use_cases

  # Builds a language record from raw cell values.
  #
  # @param rank [#to_i] position in the ranking (stored as Integer)
  # @param name [Object] language name, stored as given
  # @param category [Object] language category, stored as given
  # @param score [#to_f] popularity score (stored as Float)
  # @param year [#to_i] year of creation (stored as Integer)
  # @param use_cases [Object] description of typical uses, stored as given
  def initialize(rank:, name:, category:, score:, year:, use_cases:)
    # Numeric fields are coerced leniently: non-numeric input becomes 0 / 0.0.
    @rank = rank.to_i
    @score = score.to_f
    @year = year.to_i

    # Descriptive fields are kept untouched.
    @name = name
    @category = category
    @use_cases = use_cases
  end

  # @return [String] one-line, human-readable summary of the record
  def to_s
    headline = "#{@rank}. #{@name} (#{@category}) - #{@score}%"
    [headline, "Created: #{@year}", "Uses: #{@use_cases}"].join(" | ")
  end
end
|
|
32
|
+
|
|
33
|
+
# CategoryStats class to represent category statistics
|
|
34
|
+
class CategoryStats
  attr_reader :name, :count, :avg_score, :top_language

  # Builds a per-category statistics record.
  #
  # @param name [Object] category name, stored as given
  # @param count [#to_i] number of languages in the category (stored as Integer)
  # @param avg_score [Object] average score, stored as given (not coerced)
  # @param top_language [Object] name of the leading language, stored as given
  def initialize(name:, count:, avg_score:, top_language:)
    @count = count.to_i
    @name = name
    @avg_score = avg_score
    @top_language = top_language
  end

  # @return [String] one-line, human-readable summary of the record
  def to_s
    format(
      "%s: %s languages, avg %s, top: %s",
      @name, @count, @avg_score, @top_language
    )
  end
end
|
|
48
|
+
|
|
49
|
+
# WebScraper class encapsulates web scraping logic
|
|
50
|
+
class WebScraper
  # @param html_path [String] path to the (well-formed) HTML file to scrape
  def initialize(html_path)
    @html_path = html_path
    @moxml = Moxml.new
  end

  # Reads and parses the HTML file, prints a report of everything extracted
  # and returns the extracted data.
  #
  # On Moxml::ParseError a hint is printed and the process exits with
  # status 1 (this is a command-line example, not a library API).
  #
  # @return [Hash] with keys :title, :summary, :languages, :categories, :details
  def scrape
    html_content = File.read(@html_path)

    # Parse HTML as XML (Moxml can handle well-formed HTML)
    doc = begin
      @moxml.parse(html_content)
    rescue Moxml::ParseError => e
      puts "Failed to parse HTML: #{e.message}"
      puts "Hint: Ensure the HTML is well-formed XML"
      exit 1
    end

    puts "=" * 80
    puts "Programming Language Statistics Scraper"
    puts "=" * 80
    puts

    # Extract page title
    title = extract_page_title(doc)
    puts "Page Title: #{title}\n\n"

    # Extract summary information
    summary = extract_summary(doc)
    puts "Summary:"
    puts " #{summary[:total]} total languages tracked"
    puts " Last updated: #{summary[:updated]}\n\n"

    # Extract language data from the main table
    languages = extract_languages_table(doc)
    puts "Languages Extracted: #{languages.length}"
    puts "-" * 80
    languages.each { |lang| puts lang }
    puts

    # Extract category statistics
    categories = extract_category_stats(doc)
    puts "\nCategory Statistics:"
    puts "-" * 80
    categories.each { |cat| puts cat }
    puts

    # Extract detailed information
    details = extract_detailed_info(doc)
    puts "\nDetailed Information:"
    puts "-" * 80
    details.each do |lang, info|
      puts "#{lang}:"
      info.each { |key, value| puts " #{key}: #{value}" }
      puts
    end

    # Return structured data
    {
      title: title,
      summary: summary,
      languages: languages,
      categories: categories,
      details: details,
    }
  end

  private

  # Extract the page title from the <title> element.
  #
  # @param doc [#at_xpath] parsed document
  # @return [String] stripped title text, or "Unknown Title" when absent
  def extract_page_title(doc)
    # "//title" matches a <title> element at any depth below the root
    title_element = doc.at_xpath("//title")
    title_element&.text&.strip || "Unknown Title"
  end

  # Extract summary statistics from the summary card.
  #
  # @param doc [#at_xpath] parsed document
  # @return [Hash{Symbol => String}] :total and :updated, both Strings;
  #   "0" / "Unknown" are used for anything that cannot be found
  def extract_summary(doc)
    # Find the summary div by id attribute
    # XPath attribute selector: [@id='value']
    summary_div = doc.at_xpath("//div[@id='summary']")

    # Same value types as the normal path below, so callers always get Strings
    return { total: "0", updated: "Unknown" } unless summary_div

    # Exact attribute match: only spans whose class attribute is precisely
    # 'stat-value' are selected (an element with several classes would need
    # contains(@class, 'stat-value') instead)
    stats = summary_div.xpath(".//span[@class='stat-value']")

    {
      total: stats[0]&.text&.strip || "0",
      updated: stats[1]&.text&.strip || "Unknown",
    }
  end

  # Extract language data from the popularity table.
  #
  # @param doc [#at_xpath] parsed document
  # @return [Array<Language>] one entry per body row with at least 6 cells;
  #   empty when the table is missing
  def extract_languages_table(doc)
    # Find the table by id
    table = doc.at_xpath("//table[@id='popularity-table']")
    return [] unless table

    # All <tr> elements that sit directly inside any <tbody> under the table
    rows = table.xpath(".//tbody/tr")

    # Parse each row into a Language object
    rows.filter_map do |row|
      # Get all td (cell) elements in this row
      cells = row.xpath("./td")

      # Skip if we don't have enough cells
      next nil if cells.length < 6

      # Extract data from each cell
      # Using array indexing for predictable table structure
      rank = cells[0].text.strip
      name = cells[1].text.strip
      category = cells[2].text.strip

      # Prefer the machine-readable data-score attribute; fall back to the
      # visible cell text with the percent sign removed
      score = cells[3]["data-score"] || cells[3].text.strip.delete("%")

      year = cells[4].text.strip
      use_cases = cells[5].text.strip

      Language.new(
        rank: rank,
        name: name,
        category: category,
        score: score,
        year: year,
        use_cases: use_cases,
      )
    end
  end

  # Extract category statistics from the category table.
  #
  # @param doc [#at_xpath] parsed document
  # @return [Array<CategoryStats>] one entry per body row with at least 4
  #   cells; empty when the table is missing
  def extract_category_stats(doc)
    # Find the category table
    table = doc.at_xpath("//table[@id='category-table']")
    return [] unless table

    # Get table rows
    rows = table.xpath(".//tbody/tr")

    rows.filter_map do |row|
      cells = row.xpath("./td")
      next nil if cells.length < 4

      CategoryStats.new(
        name: cells[0].text.strip,
        count: cells[1].text.strip,
        avg_score: cells[2].text.strip,
        top_language: cells[3].text.strip,
      )
    end
  end

  # Extract detailed language information from stats cards.
  #
  # @param doc [#xpath] parsed document
  # @return [Hash{String => Hash{String => String}}] language name mapped to
  #   its "Key: value" list items; cards without any such item are omitted
  def extract_detailed_info(doc)
    # Find all divs with class 'stats-card' that have a data-language attribute
    # This demonstrates combining class and attribute selectors
    cards = doc.xpath("//div[contains(@class, 'stats-card') and @data-language]")

    cards.each_with_object({}) do |card, hash|
      # Get the language name from the data-language attribute
      lang_name = card["data-language"]

      # Extract all list items within this card
      items = card.xpath(".//li")

      # Parse each list item to extract key-value pairs
      info = items.each_with_object({}) do |item, item_hash|
        text = item.text.strip
        # Split on the first colon only, so values may themselves contain ':'
        if text.include?(":")
          key, value = text.split(":", 2)
          item_hash[key.strip] = value.strip
        end
      end

      hash[lang_name] = info unless info.empty?
    end
  end
end
|
|
239
|
+
|
|
240
|
+
# Demonstration of various XPath patterns
|
|
241
|
+
def demonstrate_xpath_patterns(doc)
  # Prints a short tour of XPath query patterns run against +doc+.
  # +doc+ must respond to #xpath and #at_xpath.
  rule = "=" * 80

  puts "\n#{rule}"
  puts "XPath Pattern Demonstrations"
  puts rule

  # Pattern 1: descendant search -- "//th" matches <th> at any depth
  puts "\n1. All table headers (//th):"
  th_nodes = doc.xpath("//th")
  th_count = th_nodes.length
  th_texts = th_nodes.map(&:text).join(", ")
  puts " Found #{th_count} headers: #{th_texts}"

  # Pattern 2: attribute-presence predicate
  puts "\n2. Elements with data-language attribute (//tr[@data-language]):"
  language_ids = doc.xpath("//tr[@data-language]").map do |tr|
    tr["data-language"]
  end
  puts " Found #{language_ids.length} languages: #{language_ids.join(', ')}"

  # Pattern 3: substring match on the class attribute
  puts "\n3. Elements with 'language-name' class:"
  name_nodes = doc.xpath("//*[contains(@class, 'language-name')]")
  name_count = name_nodes.length
  name_texts = name_nodes.map(&:text).join(", ")
  puts " Found #{name_count} elements: #{name_texts}"

  # Pattern 4: XPath selects candidates, Ruby applies the numeric threshold
  puts "\n4. Table cells with data-score > 80:"
  scored_cells = doc.xpath("//td[@data-score]")
  above_eighty = scored_cells.select { |td| td["data-score"].to_f > 80 }
  puts " Found #{above_eighty.length} high scores"

  # Pattern 5: walking from a matched node up to its parent
  puts "\n5. Parent elements of language names:"
  name_cell = doc.at_xpath("//td[@class='language-name']")
  if name_cell
    row = name_cell.parent
    puts " Parent tag: #{row.name}"
    puts " Parent has #{row.children.length} children"
  end

  puts rule
end
|
|
280
|
+
|
|
281
|
+
# Main execution
|
|
282
|
+
if $PROGRAM_NAME == __FILE__
  # Use the path given on the command line, or the bundled example page.
  html_path = ARGV.first || File.join(__dir__, "example_page.html")

  # Bail out early with usage help when the input file is missing.
  unless File.exist?(html_path)
    puts "Error: HTML file not found: #{html_path}"
    puts "Usage: ruby web_scraper.rb [path/to/page.html]"
    exit 1
  end

  puts "Scraping HTML page: #{html_path}\n"

  # Run the scraper; it prints its own report and returns the extracted data.
  data = WebScraper.new(html_path).scrape

  # The XPath tour needs a document of its own, so the file is parsed again.
  doc = Moxml.new.parse(File.read(html_path))
  demonstrate_xpath_patterns(doc)

  # Closing summary of what was extracted.
  rule = "=" * 80
  puts "\n#{rule}"
  puts "Scraping Complete!"
  puts rule
  puts "Extracted:"
  puts " - #{data[:languages].length} programming languages"
  puts " - #{data[:categories].length} category statistics"
  puts " - #{data[:details].length} detailed information entries"
  puts rule
end
|
data/lib/moxml/adapter/base.rb
CHANGED
|
@@ -11,16 +11,54 @@ module Moxml
|
|
|
11
11
|
class << self
|
|
12
12
|
include XmlUtils
|
|
13
13
|
|
|
14
|
-
def set_root(
|
|
15
|
-
raise NotImplementedError
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
def
|
|
23
|
-
raise NotImplementedError
|
|
14
|
+
def set_root(_doc, _element)
|
|
15
|
+
raise Moxml::NotImplementedError.new(
|
|
16
|
+
"set_root not implemented",
|
|
17
|
+
feature: "set_root",
|
|
18
|
+
adapter: name,
|
|
19
|
+
)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def parse(_xml, _options = {})
|
|
23
|
+
raise Moxml::NotImplementedError.new(
|
|
24
|
+
"parse not implemented",
|
|
25
|
+
feature: "parse",
|
|
26
|
+
adapter: name,
|
|
27
|
+
)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Parse XML using SAX (event-driven) parsing
|
|
31
|
+
#
|
|
32
|
+
# SAX parsing provides a memory-efficient way to process XML
|
|
33
|
+
# by triggering events as the document is parsed, rather than
|
|
34
|
+
# building a complete DOM tree.
|
|
35
|
+
#
|
|
36
|
+
# @param xml [String, IO] XML string or IO object to parse
|
|
37
|
+
# @param handler [Moxml::SAX::Handler] Handler object receiving events
|
|
38
|
+
# @return [void]
|
|
39
|
+
# @raise [Moxml::NotImplementedError] if adapter doesn't support SAX
|
|
40
|
+
def sax_parse(_xml, _handler)
|
|
41
|
+
raise Moxml::NotImplementedError.new(
|
|
42
|
+
"sax_parse not implemented",
|
|
43
|
+
feature: "sax_parse",
|
|
44
|
+
adapter: name,
|
|
45
|
+
)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Check if this adapter supports SAX parsing
|
|
49
|
+
#
|
|
50
|
+
# @return [Boolean] true if SAX parsing is supported
|
|
51
|
+
def sax_supported?
|
|
52
|
+
respond_to?(:sax_parse) &&
|
|
53
|
+
method(:sax_parse).owner != Moxml::Adapter::Base.singleton_class
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def create_document(_native_doc = nil)
|
|
57
|
+
raise Moxml::NotImplementedError.new(
|
|
58
|
+
"create_document not implemented",
|
|
59
|
+
feature: "create_document",
|
|
60
|
+
adapter: name,
|
|
61
|
+
)
|
|
24
62
|
end
|
|
25
63
|
|
|
26
64
|
def create_element(name)
|
|
@@ -48,10 +86,12 @@ module Moxml
|
|
|
48
86
|
|
|
49
87
|
def create_processing_instruction(target, content)
|
|
50
88
|
validate_pi_target(target)
|
|
51
|
-
create_native_processing_instruction(target,
|
|
89
|
+
create_native_processing_instruction(target,
|
|
90
|
+
normalize_xml_value(content))
|
|
52
91
|
end
|
|
53
92
|
|
|
54
|
-
def create_declaration(version = "1.0", encoding = "UTF-8",
|
|
93
|
+
def create_declaration(version = "1.0", encoding = "UTF-8",
|
|
94
|
+
standalone = nil)
|
|
55
95
|
validate_declaration_version(version)
|
|
56
96
|
validate_declaration_encoding(encoding)
|
|
57
97
|
validate_declaration_standalone(standalone)
|
|
@@ -81,38 +121,77 @@ module Moxml
|
|
|
81
121
|
node
|
|
82
122
|
end
|
|
83
123
|
|
|
124
|
+
def prepare_for_new_document(node, _target_doc)
|
|
125
|
+
# Hook for adapters that need special handling when moving nodes
|
|
126
|
+
# between documents (e.g., LibXML's document.import)
|
|
127
|
+
# Default: no-op for backward compatibility
|
|
128
|
+
node
|
|
129
|
+
end
|
|
130
|
+
|
|
84
131
|
protected
|
|
85
132
|
|
|
86
|
-
def create_native_element(
|
|
87
|
-
raise NotImplementedError
|
|
133
|
+
def create_native_element(_name)
|
|
134
|
+
raise Moxml::NotImplementedError.new(
|
|
135
|
+
"create_native_element not implemented",
|
|
136
|
+
feature: "create_native_element",
|
|
137
|
+
adapter: name,
|
|
138
|
+
)
|
|
88
139
|
end
|
|
89
140
|
|
|
90
|
-
def create_native_text(
|
|
91
|
-
raise NotImplementedError
|
|
141
|
+
def create_native_text(_content)
|
|
142
|
+
raise Moxml::NotImplementedError.new(
|
|
143
|
+
"create_native_text not implemented",
|
|
144
|
+
feature: "create_native_text",
|
|
145
|
+
adapter: name,
|
|
146
|
+
)
|
|
92
147
|
end
|
|
93
148
|
|
|
94
|
-
def create_native_cdata(
|
|
95
|
-
raise NotImplementedError
|
|
149
|
+
def create_native_cdata(_content)
|
|
150
|
+
raise Moxml::NotImplementedError.new(
|
|
151
|
+
"create_native_cdata not implemented",
|
|
152
|
+
feature: "create_native_cdata",
|
|
153
|
+
adapter: name,
|
|
154
|
+
)
|
|
96
155
|
end
|
|
97
156
|
|
|
98
|
-
def create_native_comment(
|
|
99
|
-
raise NotImplementedError
|
|
157
|
+
def create_native_comment(_content)
|
|
158
|
+
raise Moxml::NotImplementedError.new(
|
|
159
|
+
"create_native_comment not implemented",
|
|
160
|
+
feature: "create_native_comment",
|
|
161
|
+
adapter: name,
|
|
162
|
+
)
|
|
100
163
|
end
|
|
101
164
|
|
|
102
|
-
def create_native_doctype(
|
|
103
|
-
raise NotImplementedError
|
|
165
|
+
def create_native_doctype(_name, _external_id, _system_id)
|
|
166
|
+
raise Moxml::NotImplementedError.new(
|
|
167
|
+
"create_native_doctype not implemented",
|
|
168
|
+
feature: "create_native_doctype",
|
|
169
|
+
adapter: name,
|
|
170
|
+
)
|
|
104
171
|
end
|
|
105
172
|
|
|
106
|
-
def create_native_processing_instruction(
|
|
107
|
-
raise NotImplementedError
|
|
173
|
+
def create_native_processing_instruction(_target, _content)
|
|
174
|
+
raise Moxml::NotImplementedError.new(
|
|
175
|
+
"create_native_processing_instruction not implemented",
|
|
176
|
+
feature: "create_native_processing_instruction",
|
|
177
|
+
adapter: name,
|
|
178
|
+
)
|
|
108
179
|
end
|
|
109
180
|
|
|
110
|
-
def create_native_declaration(
|
|
111
|
-
raise NotImplementedError
|
|
181
|
+
def create_native_declaration(_version, _encoding, _standalone)
|
|
182
|
+
raise Moxml::NotImplementedError.new(
|
|
183
|
+
"create_native_declaration not implemented",
|
|
184
|
+
feature: "create_native_declaration",
|
|
185
|
+
adapter: name,
|
|
186
|
+
)
|
|
112
187
|
end
|
|
113
188
|
|
|
114
|
-
def create_native_namespace(
|
|
115
|
-
raise NotImplementedError
|
|
189
|
+
def create_native_namespace(_element, _prefix, _uri)
|
|
190
|
+
raise Moxml::NotImplementedError.new(
|
|
191
|
+
"create_native_namespace not implemented",
|
|
192
|
+
feature: "create_native_namespace",
|
|
193
|
+
adapter: name,
|
|
194
|
+
)
|
|
116
195
|
end
|
|
117
196
|
end
|
|
118
197
|
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "node"
|
|
4
|
+
|
|
5
|
+
module Moxml
|
|
6
|
+
module Adapter
|
|
7
|
+
module CustomizedLibxml
|
|
8
|
+
# Wrapper for LibXML CDATA section nodes
|
|
9
|
+
class Cdata < Node
  # Serialize this node as an XML CDATA section.
  #
  # The native content is run through a reverse mapping of the five
  # predefined XML entities before being wrapped (the original author notes
  # that LibXML hands the content back escaped). "&amp;" is decoded last so
  # that text such as "&amp;lt;" becomes "&lt;" rather than "<".
  #
  # @return [String] "<![CDATA[...]]>" markup
  def to_xml
    content = @native.content
                     .gsub("&quot;", '"')
                     .gsub("&apos;", "'")
                     .gsub("&lt;", "<")
                     .gsub("&gt;", ">")
                     .gsub("&amp;", "&")

    # A literal "]]>" would terminate the section early, so each occurrence
    # is split across two adjacent CDATA sections: ]]> becomes ]]]]><![CDATA[>
    escaped_content = content.gsub("]]>", "]]]]><![CDATA[>")
    "<![CDATA[#{escaped_content}]]>"
  end
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "node"
|
|
4
|
+
|
|
5
|
+
module Moxml
|
|
6
|
+
module Adapter
|
|
7
|
+
module CustomizedLibxml
|
|
8
|
+
# Wrapper for LibXML comment nodes
|
|
9
|
+
class Comment < Node
  # Serialize this node as an XML comment.
  #
  # The native content is run through a reverse mapping of the five
  # predefined XML entities before being wrapped (the original author notes
  # that LibXML hands the content back escaped). "&amp;" is decoded last so
  # that text such as "&amp;lt;" becomes "&lt;" rather than "<".
  #
  # @return [String] "<!--...-->" markup
  def to_xml
    content = @native.content
                     .gsub("&quot;", '"')
                     .gsub("&apos;", "'")
                     .gsub("&lt;", "<")
                     .gsub("&gt;", ">")
                     .gsub("&amp;", "&")
    "<!--#{content}-->"
  end
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Moxml
|
|
4
|
+
module Adapter
|
|
5
|
+
module CustomizedLibxml
|
|
6
|
+
# Wrapper for LibXML document declarations
|
|
7
|
+
#
|
|
8
|
+
# LibXML::XML::Document properties (version, encoding, standalone)
|
|
9
|
+
# are read-only after creation. This wrapper allows mutation by
|
|
10
|
+
# storing values internally and regenerating XML when needed.
|
|
11
|
+
class Declaration
  attr_accessor :version, :encoding
  attr_reader :native

  # @param native_doc [#version] underlying document object
  # @param version [String, nil] explicit version; when nil the document's
  #   own version is used, and "1.0" if that is nil as well
  # @param encoding [String, nil] explicit encoding; never read from
  #   native_doc, so nil means "omit the encoding attribute"
  # @param standalone [String, Boolean, nil] "yes"/true, "no"/false, or nil
  #   to leave the attribute unset
  def initialize(native_doc, version = nil, encoding = nil,
                 standalone = nil)
    @native = native_doc
    @version = version || native_doc.version || "1.0"
    @encoding = encoding
    # Single source of truth for standalone parsing lives in the setter.
    self.standalone = standalone
  end

  # @return [String, nil] "yes", "no", or nil when standalone is unset
  def standalone
    return nil if @standalone_value.nil?

    @standalone_value ? "yes" : "no"
  end

  # Accepts "yes"/true and "no"/false; any other value (including nil)
  # clears the setting.
  def standalone=(value)
    @standalone_value = case value
                        when "yes", true then true
                        when "no", false then false
                        end
  end

  # Generate the XML declaration string.
  #
  # Built from a list of attribute fragments rather than by appending to an
  # interpolated literal, so nothing depends on that literal being mutable
  # under the file's frozen_string_literal setting.
  #
  # @return [String] e.g. <?xml version="1.0" encoding="UTF-8"?>
  def to_xml
    attributes = ["version=\"#{@version}\""]
    attributes << "encoding=\"#{@encoding}\"" if @encoding && !@encoding.empty?
    # Include standalone attribute if explicitly set (true or false)
    attributes << "standalone=\"#{standalone}\"" unless @standalone_value.nil?
    "<?xml #{attributes.join(' ')}?>"
  end

  private

  # Maps a LibXML encoding constant to its name; unrecognised constants map
  # to "UTF-8" and nil maps to nil.
  # NOTE(review): nothing inside this class calls this helper -- confirm
  # whether it is still needed.
  def extract_encoding(libxml_encoding)
    return nil unless libxml_encoding

    case libxml_encoding
    when ::LibXML::XML::Encoding::UTF_8
      "UTF-8"
    when ::LibXML::XML::Encoding::ISO_8859_1
      "ISO-8859-1"
    when ::LibXML::XML::Encoding::UTF_16LE
      "UTF-16LE"
    when ::LibXML::XML::Encoding::UTF_16BE
      "UTF-16BE"
    when ::LibXML::XML::Encoding::UCS_2
      "UCS-2"
    else
      "UTF-8"
    end
  end
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "node"
|
|
4
|
+
|
|
5
|
+
module Moxml
|
|
6
|
+
module Adapter
|
|
7
|
+
module CustomizedLibxml
|
|
8
|
+
# Wrapper for LibXML element nodes
|
|
9
|
+
#
|
|
10
|
+
# This wrapper provides automatic document import when adding children,
|
|
11
|
+
# solving LibXML's strict document ownership requirement.
|
|
12
|
+
class Element < Node
  # Append +child+ to this element.
  #
  # +child+ may be a wrapper (anything responding to #native) or a raw
  # node. When the child already belongs to a different document it is
  # first imported into this element's document.
  #
  # @return whatever the native << operation returns
  def add_child(child)
    node = child.respond_to?(:native) ? child.native : child
    node = @native.doc.import(node) if needs_import?(node)
    @native << node
  end

  private

  # True only when both this element and +child_node+ are attached to a
  # document and those documents differ.
  def needs_import?(child_node)
    return false unless @native.respond_to?(:doc) && @native.doc
    return false unless child_node.respond_to?(:doc) && child_node.doc

    child_node.doc != @native.doc
  end
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|