html-to-markdown 2.30.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -14
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +5 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +126 -52
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -0,0 +1,790 @@
|
|
|
1
|
+
//! Builds a [`DocumentStructure`] from a parsed `tl::VDom`.
|
|
2
|
+
//!
|
|
3
|
+
//! Walk the DOM once, mapping each HTML element to the appropriate [`NodeContent`] variant,
|
|
4
|
+
//! collecting inline [`TextAnnotation`]s, tracking parent/child relationships, and generating
|
|
5
|
+
//! heading-based [`Group`] hierarchy.
|
|
6
|
+
|
|
7
|
+
use std::collections::HashMap;
|
|
8
|
+
|
|
9
|
+
use super::document::{AnnotationKind, DocumentNode, DocumentStructure, NodeContent, TextAnnotation};
|
|
10
|
+
use super::tables::{GridCell, TableGrid};
|
|
11
|
+
|
|
12
|
+
// ── Text extraction ───────────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
/// Extract plain text from a tag's descendants, decoding HTML entities.
|
|
15
|
+
fn extract_text(tag: &tl::HTMLTag, parser: &tl::Parser) -> String {
|
|
16
|
+
let mut buf = String::new();
|
|
17
|
+
collect_text_from_tag(tag, parser, &mut buf);
|
|
18
|
+
buf
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
/// Recursively accumulate text content from a tag's children.
|
|
22
|
+
fn collect_text_from_tag(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String) {
|
|
23
|
+
let children = tag.children();
|
|
24
|
+
for handle in children.top().iter() {
|
|
25
|
+
let Some(node) = handle.get(parser) else {
|
|
26
|
+
continue;
|
|
27
|
+
};
|
|
28
|
+
match node {
|
|
29
|
+
tl::Node::Raw(bytes) => {
|
|
30
|
+
let raw = bytes.as_utf8_str();
|
|
31
|
+
let decoded = crate::text::decode_html_entities_cow(raw.as_ref());
|
|
32
|
+
buf.push_str(&decoded);
|
|
33
|
+
}
|
|
34
|
+
tl::Node::Tag(child_tag) => {
|
|
35
|
+
let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
|
|
36
|
+
// Skip invisible elements
|
|
37
|
+
if matches!(name.as_str(), "script" | "style" | "head") {
|
|
38
|
+
continue;
|
|
39
|
+
}
|
|
40
|
+
collect_text_from_tag(child_tag, parser, buf);
|
|
41
|
+
}
|
|
42
|
+
tl::Node::Comment(_) => {}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// ── Inline annotation extraction ─────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
/// Scan the children of `tag` and collect [`TextAnnotation`]s into `annotations`.
|
|
50
|
+
///
|
|
51
|
+
/// `text` is the pre-extracted full text of the enclosing block node; annotation
|
|
52
|
+
/// byte offsets are computed relative to that string.
|
|
53
|
+
fn collect_annotations(tag: &tl::HTMLTag, parser: &tl::Parser, text: &str, annotations: &mut Vec<TextAnnotation>) {
|
|
54
|
+
collect_annotations_from_tag(tag, parser, text, &mut 0usize, annotations);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
/// Recursive helper. `offset` tracks how many bytes of `full_text` have been consumed
|
|
58
|
+
/// so far; it is mutated in place as we walk the tree.
|
|
59
|
+
fn collect_annotations_from_tag(
|
|
60
|
+
tag: &tl::HTMLTag,
|
|
61
|
+
parser: &tl::Parser,
|
|
62
|
+
full_text: &str,
|
|
63
|
+
offset: &mut usize,
|
|
64
|
+
annotations: &mut Vec<TextAnnotation>,
|
|
65
|
+
) {
|
|
66
|
+
let children = tag.children();
|
|
67
|
+
for handle in children.top().iter() {
|
|
68
|
+
let Some(node) = handle.get(parser) else {
|
|
69
|
+
continue;
|
|
70
|
+
};
|
|
71
|
+
match node {
|
|
72
|
+
tl::Node::Raw(bytes) => {
|
|
73
|
+
let raw = bytes.as_utf8_str();
|
|
74
|
+
let decoded = crate::text::decode_html_entities_cow(raw.as_ref());
|
|
75
|
+
*offset += decoded.len();
|
|
76
|
+
}
|
|
77
|
+
tl::Node::Tag(child_tag) => {
|
|
78
|
+
let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
|
|
79
|
+
if matches!(name.as_str(), "script" | "style" | "head") {
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
let start = *offset;
|
|
84
|
+
// Recurse to advance offset over the child's text span.
|
|
85
|
+
collect_annotations_from_tag(child_tag, parser, full_text, offset, annotations);
|
|
86
|
+
let end = *offset;
|
|
87
|
+
|
|
88
|
+
// Emit annotation only for non-empty spans that fit within the text.
|
|
89
|
+
if start < end && end <= full_text.len() {
|
|
90
|
+
let kind = match name.as_str() {
|
|
91
|
+
"strong" | "b" => Some(AnnotationKind::Bold),
|
|
92
|
+
"em" | "i" => Some(AnnotationKind::Italic),
|
|
93
|
+
"u" | "ins" => Some(AnnotationKind::Underline),
|
|
94
|
+
"s" | "del" | "strike" => Some(AnnotationKind::Strikethrough),
|
|
95
|
+
"code" | "kbd" | "samp" => Some(AnnotationKind::Code),
|
|
96
|
+
"sub" => Some(AnnotationKind::Subscript),
|
|
97
|
+
"sup" => Some(AnnotationKind::Superscript),
|
|
98
|
+
"mark" => Some(AnnotationKind::Highlight),
|
|
99
|
+
"a" => {
|
|
100
|
+
let url = child_tag
|
|
101
|
+
.attributes()
|
|
102
|
+
.get("href")
|
|
103
|
+
.flatten()
|
|
104
|
+
.map(|v| v.as_utf8_str().to_string())
|
|
105
|
+
.unwrap_or_default();
|
|
106
|
+
let title = child_tag
|
|
107
|
+
.attributes()
|
|
108
|
+
.get("title")
|
|
109
|
+
.flatten()
|
|
110
|
+
.map(|v| v.as_utf8_str().to_string());
|
|
111
|
+
Some(AnnotationKind::Link { url, title })
|
|
112
|
+
}
|
|
113
|
+
_ => None,
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
if let Some(kind) = kind {
|
|
117
|
+
annotations.push(TextAnnotation {
|
|
118
|
+
start: start as u32,
|
|
119
|
+
end: end as u32,
|
|
120
|
+
kind,
|
|
121
|
+
});
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
tl::Node::Comment(_) => {}
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// ── Table extraction ──────────────────────────────────────────────────────────
|
|
131
|
+
|
|
132
|
+
/// Build a [`TableGrid`] from a `<table>` element.
|
|
133
|
+
fn extract_table_grid(table_tag: &tl::HTMLTag, parser: &tl::Parser) -> TableGrid {
|
|
134
|
+
// Gather all <tr> handles (recursing through thead/tbody/tfoot).
|
|
135
|
+
let mut row_handles: Vec<tl::NodeHandle> = Vec::new();
|
|
136
|
+
collect_tr_handles(table_tag, parser, &mut row_handles);
|
|
137
|
+
|
|
138
|
+
let mut cells: Vec<GridCell> = Vec::new();
|
|
139
|
+
let mut max_col: u32 = 0;
|
|
140
|
+
|
|
141
|
+
for (row_idx, row_handle) in row_handles.iter().enumerate() {
|
|
142
|
+
let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) else {
|
|
143
|
+
continue;
|
|
144
|
+
};
|
|
145
|
+
|
|
146
|
+
let mut col_idx: u32 = 0;
|
|
147
|
+
let row_children = row_tag.children();
|
|
148
|
+
|
|
149
|
+
for child_handle in row_children.top().iter() {
|
|
150
|
+
let Some(tl::Node::Tag(cell_tag)) = child_handle.get(parser) else {
|
|
151
|
+
continue;
|
|
152
|
+
};
|
|
153
|
+
let cell_name = cell_tag.name().as_utf8_str().to_ascii_lowercase();
|
|
154
|
+
let is_cell = cell_name == "td" || cell_name == "th";
|
|
155
|
+
if !is_cell {
|
|
156
|
+
continue;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
let is_header = cell_name == "th";
|
|
160
|
+
|
|
161
|
+
let row_span = cell_tag
|
|
162
|
+
.attributes()
|
|
163
|
+
.get("rowspan")
|
|
164
|
+
.flatten()
|
|
165
|
+
.and_then(|v| v.as_utf8_str().parse::<u32>().ok())
|
|
166
|
+
.unwrap_or(1)
|
|
167
|
+
.max(1);
|
|
168
|
+
|
|
169
|
+
let col_span = cell_tag
|
|
170
|
+
.attributes()
|
|
171
|
+
.get("colspan")
|
|
172
|
+
.flatten()
|
|
173
|
+
.and_then(|v| v.as_utf8_str().parse::<u32>().ok())
|
|
174
|
+
.unwrap_or(1)
|
|
175
|
+
.max(1);
|
|
176
|
+
|
|
177
|
+
let content = extract_text(cell_tag, parser).trim().to_string();
|
|
178
|
+
|
|
179
|
+
cells.push(GridCell {
|
|
180
|
+
content,
|
|
181
|
+
row: row_idx as u32,
|
|
182
|
+
col: col_idx,
|
|
183
|
+
row_span,
|
|
184
|
+
col_span,
|
|
185
|
+
is_header,
|
|
186
|
+
});
|
|
187
|
+
|
|
188
|
+
col_idx += col_span;
|
|
189
|
+
if col_idx > max_col {
|
|
190
|
+
max_col = col_idx;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
let rows = row_handles.len() as u32;
|
|
196
|
+
TableGrid {
|
|
197
|
+
rows,
|
|
198
|
+
cols: max_col,
|
|
199
|
+
cells,
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
/// Recursively collect all `<tr>` `NodeHandle`s from within a table element.
|
|
204
|
+
fn collect_tr_handles(tag: &tl::HTMLTag, parser: &tl::Parser, result: &mut Vec<tl::NodeHandle>) {
|
|
205
|
+
let children = tag.children();
|
|
206
|
+
for handle in children.top().iter() {
|
|
207
|
+
if let Some(tl::Node::Tag(child_tag)) = handle.get(parser) {
|
|
208
|
+
let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
|
|
209
|
+
if name == "tr" {
|
|
210
|
+
result.push(*handle);
|
|
211
|
+
} else {
|
|
212
|
+
collect_tr_handles(child_tag, parser, result);
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
// ── Node ID generation ────────────────────────────────────────────────────────
|
|
219
|
+
|
|
220
|
+
/// Generate a node ID from the node type, an excerpt of its text content,
/// and its position (index) in the flat node list.
///
/// The ID has the form `{node_type}-{digest}` where `digest` is a 16-digit lower-case
/// hexadecimal hash of the three inputs. Only the first 64 bytes of `text` (rounded down
/// to a UTF-8 character boundary) take part in the hash, which keeps the cost bounded.
///
/// The same inputs always produce the same ID within one build of the program.
/// NOTE(review): the standard library does not guarantee `DefaultHasher`'s algorithm
/// across Rust releases, so IDs should not be assumed stable across toolchain upgrades.
fn make_node_id(node_type: &str, text: &str, index: usize) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Maximum number of text bytes that contribute to the digest.
    const PREFIX_BYTES: usize = 64;

    // Slicing a `str` in the middle of a multi-byte character panics, so back up to the
    // nearest character boundary. Index 0 is always a boundary, so the loop terminates.
    let mut end = text.len().min(PREFIX_BYTES);
    while !text.is_char_boundary(end) {
        end -= 1;
    }

    let mut hasher = DefaultHasher::new();
    node_type.hash(&mut hasher);
    text[..end].hash(&mut hasher);
    index.hash(&mut hasher);
    let digest = hasher.finish();
    format!("{node_type}-{digest:016x}")
}
|
|
234
|
+
|
|
235
|
+
// ── Definition list helpers ───────────────────────────────────────────────────
|
|
236
|
+
|
|
237
|
+
/// Collect `<dt>`/`<dd>` pairs from a `<dl>` element.
|
|
238
|
+
///
|
|
239
|
+
/// Returns `(term_text, definition_text)` tuples. Consecutive `<dt>` elements share
|
|
240
|
+
/// the next `<dd>`; orphan `<dd>`s use an empty term.
|
|
241
|
+
fn collect_definition_items(dl_tag: &tl::HTMLTag, parser: &tl::Parser) -> Vec<(String, String)> {
|
|
242
|
+
let mut items: Vec<(String, String)> = Vec::new();
|
|
243
|
+
let mut pending_terms: Vec<String> = Vec::new();
|
|
244
|
+
|
|
245
|
+
let children = dl_tag.children();
|
|
246
|
+
for handle in children.top().iter() {
|
|
247
|
+
let Some(tl::Node::Tag(child_tag)) = handle.get(parser) else {
|
|
248
|
+
continue;
|
|
249
|
+
};
|
|
250
|
+
let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
|
|
251
|
+
match name.as_str() {
|
|
252
|
+
"dt" => {
|
|
253
|
+
pending_terms.push(extract_text(child_tag, parser).trim().to_string());
|
|
254
|
+
}
|
|
255
|
+
"dd" => {
|
|
256
|
+
let definition = extract_text(child_tag, parser).trim().to_string();
|
|
257
|
+
if pending_terms.is_empty() {
|
|
258
|
+
items.push((String::new(), definition));
|
|
259
|
+
} else {
|
|
260
|
+
let mut drained: Vec<String> = std::mem::take(&mut pending_terms);
|
|
261
|
+
let last_term = drained.pop();
|
|
262
|
+
for term in drained {
|
|
263
|
+
items.push((term, String::new()));
|
|
264
|
+
}
|
|
265
|
+
if let Some(term) = last_term {
|
|
266
|
+
items.push((term, definition));
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
_ => {}
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
// Flush trailing <dt>s without a corresponding <dd>.
|
|
275
|
+
for term in pending_terms {
|
|
276
|
+
items.push((term, String::new()));
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
items
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
// ── Head metadata extraction ──────────────────────────────────────────────────
|
|
283
|
+
|
|
284
|
+
/// Extract `<meta name=… content=…>` and `<title>` entries from a `<head>` element.
|
|
285
|
+
fn extract_head_metadata_entries(head_tag: &tl::HTMLTag, parser: &tl::Parser) -> Vec<(String, String)> {
|
|
286
|
+
let mut entries: Vec<(String, String)> = Vec::new();
|
|
287
|
+
|
|
288
|
+
let children = head_tag.children();
|
|
289
|
+
for handle in children.top().iter() {
|
|
290
|
+
let Some(tl::Node::Tag(child_tag)) = handle.get(parser) else {
|
|
291
|
+
continue;
|
|
292
|
+
};
|
|
293
|
+
let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
|
|
294
|
+
match name.as_str() {
|
|
295
|
+
"title" => {
|
|
296
|
+
let title = extract_text(child_tag, parser).trim().to_string();
|
|
297
|
+
if !title.is_empty() {
|
|
298
|
+
entries.push(("title".to_string(), title));
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
"meta" => {
|
|
302
|
+
// name + content
|
|
303
|
+
if let (Some(Some(meta_name)), Some(Some(meta_content))) = (
|
|
304
|
+
child_tag.attributes().get("name"),
|
|
305
|
+
child_tag.attributes().get("content"),
|
|
306
|
+
) {
|
|
307
|
+
entries.push((
|
|
308
|
+
meta_name.as_utf8_str().to_string(),
|
|
309
|
+
meta_content.as_utf8_str().to_string(),
|
|
310
|
+
));
|
|
311
|
+
}
|
|
312
|
+
// property + content (Open Graph etc.)
|
|
313
|
+
if let (Some(Some(property)), Some(Some(content))) = (
|
|
314
|
+
child_tag.attributes().get("property"),
|
|
315
|
+
child_tag.attributes().get("content"),
|
|
316
|
+
) {
|
|
317
|
+
entries.push((property.as_utf8_str().to_string(), content.as_utf8_str().to_string()));
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
_ => {}
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
entries
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
// ── Main builder ──────────────────────────────────────────────────────────────
|
|
328
|
+
|
|
329
|
+
/// State threaded through the recursive walk.
|
|
330
|
+
struct BuilderState {
|
|
331
|
+
/// Accumulated nodes (flat list in document order).
|
|
332
|
+
nodes: Vec<DocumentNode>,
|
|
333
|
+
/// Stack of open heading-group indices: `(heading_level, node_index)`.
|
|
334
|
+
group_stack: Vec<(u8, u32)>,
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
impl BuilderState {
|
|
338
|
+
fn new() -> Self {
|
|
339
|
+
Self {
|
|
340
|
+
nodes: Vec::new(),
|
|
341
|
+
group_stack: Vec::new(),
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
/// Append a node and return its index.
|
|
346
|
+
fn push(&mut self, node: DocumentNode) -> u32 {
|
|
347
|
+
let idx = self.nodes.len() as u32;
|
|
348
|
+
self.nodes.push(node);
|
|
349
|
+
idx
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
/// Index of the innermost open group, if any.
|
|
353
|
+
fn current_group(&self) -> Option<u32> {
|
|
354
|
+
self.group_stack.last().map(|(_, idx)| *idx)
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
/// Record `child_idx` as a child of `parent_idx`.
|
|
358
|
+
fn add_child(&mut self, parent_idx: u32, child_idx: u32) {
|
|
359
|
+
if let Some(parent) = self.nodes.get_mut(parent_idx as usize) {
|
|
360
|
+
parent.children.push(child_idx);
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
/// Build a [`DocumentStructure`] from an already-parsed `tl::VDom`.
|
|
366
|
+
///
|
|
367
|
+
/// Walks the DOM once, mapping HTML elements to semantic [`NodeContent`] variants,
|
|
368
|
+
/// tracking parent/child relationships, extracting inline [`TextAnnotation`]s, and
|
|
369
|
+
/// constructing heading-based [`Group`] nodes.
|
|
370
|
+
pub fn build_document_structure(dom: &tl::VDom<'_>) -> DocumentStructure {
|
|
371
|
+
let parser = dom.parser();
|
|
372
|
+
let mut state = BuilderState::new();
|
|
373
|
+
|
|
374
|
+
for handle in dom.children() {
|
|
375
|
+
walk(&mut state, handle, parser, None);
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
DocumentStructure {
|
|
379
|
+
nodes: state.nodes,
|
|
380
|
+
source_format: Some("html".to_string()),
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
/// Recursive DOM walker.
|
|
385
|
+
///
|
|
386
|
+
/// `parent_idx` is the flat-list index of the nearest structural parent, if any.
|
|
387
|
+
fn walk(state: &mut BuilderState, handle: &tl::NodeHandle, parser: &tl::Parser, parent_idx: Option<u32>) {
|
|
388
|
+
let Some(node) = handle.get(parser) else {
|
|
389
|
+
return;
|
|
390
|
+
};
|
|
391
|
+
|
|
392
|
+
match node {
|
|
393
|
+
tl::Node::Raw(_) | tl::Node::Comment(_) => {}
|
|
394
|
+
tl::Node::Tag(tag) => {
|
|
395
|
+
let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
|
|
396
|
+
process_tag(state, tag_name.as_str(), tag, parser, parent_idx);
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
/// Decide how to handle a given tag, creating nodes and recursing as needed.
|
|
402
|
+
#[allow(clippy::too_many_lines)]
|
|
403
|
+
fn process_tag(
|
|
404
|
+
state: &mut BuilderState,
|
|
405
|
+
tag_name: &str,
|
|
406
|
+
tag: &tl::HTMLTag,
|
|
407
|
+
parser: &tl::Parser,
|
|
408
|
+
parent_idx: Option<u32>,
|
|
409
|
+
) {
|
|
410
|
+
match tag_name {
|
|
411
|
+
// ── Headings ──────────────────────────────────────────────────────
|
|
412
|
+
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
|
|
413
|
+
let level = tag_name[1..].parse::<u8>().unwrap_or(1);
|
|
414
|
+
let text = extract_text(tag, parser).trim().to_string();
|
|
415
|
+
|
|
416
|
+
// Close any open groups at the same or deeper heading level.
|
|
417
|
+
while let Some(&(open_level, _)) = state.group_stack.last() {
|
|
418
|
+
if open_level >= level {
|
|
419
|
+
state.group_stack.pop();
|
|
420
|
+
} else {
|
|
421
|
+
break;
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
// Parent for the new group is the enclosing group or the explicit parent.
|
|
426
|
+
let group_parent = state.group_stack.last().map(|(_, idx)| *idx).or(parent_idx);
|
|
427
|
+
let group_id = make_node_id("group", &text, state.nodes.len());
|
|
428
|
+
let group_idx = state.push(DocumentNode {
|
|
429
|
+
id: group_id,
|
|
430
|
+
content: NodeContent::Group {
|
|
431
|
+
label: Some(text.clone()),
|
|
432
|
+
heading_level: Some(level),
|
|
433
|
+
heading_text: Some(text.clone()),
|
|
434
|
+
},
|
|
435
|
+
parent: group_parent,
|
|
436
|
+
children: Vec::new(),
|
|
437
|
+
annotations: Vec::new(),
|
|
438
|
+
attributes: None,
|
|
439
|
+
});
|
|
440
|
+
if let Some(gp) = group_parent {
|
|
441
|
+
state.add_child(gp, group_idx);
|
|
442
|
+
}
|
|
443
|
+
state.group_stack.push((level, group_idx));
|
|
444
|
+
|
|
445
|
+
// Emit the Heading node as a child of the new group.
|
|
446
|
+
let mut annotations = Vec::new();
|
|
447
|
+
collect_annotations(tag, parser, &text, &mut annotations);
|
|
448
|
+
let heading_id = make_node_id("heading", &text, state.nodes.len());
|
|
449
|
+
let heading_idx = state.push(DocumentNode {
|
|
450
|
+
id: heading_id,
|
|
451
|
+
content: NodeContent::Heading { level, text },
|
|
452
|
+
parent: Some(group_idx),
|
|
453
|
+
children: Vec::new(),
|
|
454
|
+
annotations,
|
|
455
|
+
attributes: None,
|
|
456
|
+
});
|
|
457
|
+
state.add_child(group_idx, heading_idx);
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
// ── Paragraph ────────────────────────────────────────────────────
|
|
461
|
+
"p" => {
|
|
462
|
+
let text = extract_text(tag, parser).trim().to_string();
|
|
463
|
+
if text.is_empty() {
|
|
464
|
+
return;
|
|
465
|
+
}
|
|
466
|
+
let effective_parent = state.current_group().or(parent_idx);
|
|
467
|
+
let mut annotations = Vec::new();
|
|
468
|
+
collect_annotations(tag, parser, &text, &mut annotations);
|
|
469
|
+
let id = make_node_id("paragraph", &text, state.nodes.len());
|
|
470
|
+
let idx = state.push(DocumentNode {
|
|
471
|
+
id,
|
|
472
|
+
content: NodeContent::Paragraph { text },
|
|
473
|
+
parent: effective_parent,
|
|
474
|
+
children: Vec::new(),
|
|
475
|
+
annotations,
|
|
476
|
+
attributes: None,
|
|
477
|
+
});
|
|
478
|
+
if let Some(ep) = effective_parent {
|
|
479
|
+
state.add_child(ep, idx);
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
// ── Lists ─────────────────────────────────────────────────────────
|
|
484
|
+
"ul" | "ol" => {
|
|
485
|
+
let ordered = tag_name == "ol";
|
|
486
|
+
let effective_parent = state.current_group().or(parent_idx);
|
|
487
|
+
let id = make_node_id("list", if ordered { "ordered" } else { "unordered" }, state.nodes.len());
|
|
488
|
+
let list_idx = state.push(DocumentNode {
|
|
489
|
+
id,
|
|
490
|
+
content: NodeContent::List { ordered },
|
|
491
|
+
parent: effective_parent,
|
|
492
|
+
children: Vec::new(),
|
|
493
|
+
annotations: Vec::new(),
|
|
494
|
+
attributes: None,
|
|
495
|
+
});
|
|
496
|
+
if let Some(ep) = effective_parent {
|
|
497
|
+
state.add_child(ep, list_idx);
|
|
498
|
+
}
|
|
499
|
+
// Recurse with the list node as the parent so <li>s attach to it.
|
|
500
|
+
let children = tag.children();
|
|
501
|
+
for child_handle in children.top().iter() {
|
|
502
|
+
walk(state, child_handle, parser, Some(list_idx));
|
|
503
|
+
}
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
// ── List item ─────────────────────────────────────────────────────
|
|
507
|
+
"li" => {
|
|
508
|
+
let text = extract_text(tag, parser).trim().to_string();
|
|
509
|
+
let effective_parent = parent_idx.or_else(|| state.current_group());
|
|
510
|
+
let mut annotations = Vec::new();
|
|
511
|
+
collect_annotations(tag, parser, &text, &mut annotations);
|
|
512
|
+
let id = make_node_id("list_item", &text, state.nodes.len());
|
|
513
|
+
let idx = state.push(DocumentNode {
|
|
514
|
+
id,
|
|
515
|
+
content: NodeContent::ListItem { text },
|
|
516
|
+
parent: effective_parent,
|
|
517
|
+
children: Vec::new(),
|
|
518
|
+
annotations,
|
|
519
|
+
attributes: None,
|
|
520
|
+
});
|
|
521
|
+
if let Some(ep) = effective_parent {
|
|
522
|
+
state.add_child(ep, idx);
|
|
523
|
+
}
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
// ── Table ─────────────────────────────────────────────────────────
|
|
527
|
+
"table" => {
|
|
528
|
+
let grid = extract_table_grid(tag, parser);
|
|
529
|
+
let effective_parent = state.current_group().or(parent_idx);
|
|
530
|
+
let id = make_node_id("table", &grid.rows.to_string(), state.nodes.len());
|
|
531
|
+
let idx = state.push(DocumentNode {
|
|
532
|
+
id,
|
|
533
|
+
content: NodeContent::Table { grid },
|
|
534
|
+
parent: effective_parent,
|
|
535
|
+
children: Vec::new(),
|
|
536
|
+
annotations: Vec::new(),
|
|
537
|
+
attributes: None,
|
|
538
|
+
});
|
|
539
|
+
if let Some(ep) = effective_parent {
|
|
540
|
+
state.add_child(ep, idx);
|
|
541
|
+
}
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
// ── Image ─────────────────────────────────────────────────────────
|
|
545
|
+
"img" => {
|
|
546
|
+
let src = tag
|
|
547
|
+
.attributes()
|
|
548
|
+
.get("src")
|
|
549
|
+
.flatten()
|
|
550
|
+
.map(|v| v.as_utf8_str().to_string());
|
|
551
|
+
let description = tag
|
|
552
|
+
.attributes()
|
|
553
|
+
.get("alt")
|
|
554
|
+
.flatten()
|
|
555
|
+
.map(|v| v.as_utf8_str().to_string())
|
|
556
|
+
.filter(|s| !s.is_empty());
|
|
557
|
+
let effective_parent = state.current_group().or(parent_idx);
|
|
558
|
+
let label = src.as_deref().unwrap_or("img");
|
|
559
|
+
let id = make_node_id("image", label, state.nodes.len());
|
|
560
|
+
let idx = state.push(DocumentNode {
|
|
561
|
+
id,
|
|
562
|
+
content: NodeContent::Image {
|
|
563
|
+
description,
|
|
564
|
+
src,
|
|
565
|
+
image_index: None,
|
|
566
|
+
},
|
|
567
|
+
parent: effective_parent,
|
|
568
|
+
children: Vec::new(),
|
|
569
|
+
annotations: Vec::new(),
|
|
570
|
+
attributes: None,
|
|
571
|
+
});
|
|
572
|
+
if let Some(ep) = effective_parent {
|
|
573
|
+
state.add_child(ep, idx);
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
// ── Code block (<pre><code …>) ────────────────────────────────────
|
|
578
|
+
"pre" => {
|
|
579
|
+
let mut language: Option<String> = None;
|
|
580
|
+
let mut code_text: Option<String> = None;
|
|
581
|
+
|
|
582
|
+
let children = tag.children();
|
|
583
|
+
for child_handle in children.top().iter() {
|
|
584
|
+
if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
|
|
585
|
+
let child_name = child_tag.name().as_utf8_str().to_ascii_lowercase();
|
|
586
|
+
if child_name == "code" {
|
|
587
|
+
// Extract language from class="language-*"
|
|
588
|
+
if let Some(Some(class_val)) = child_tag.attributes().get("class") {
|
|
589
|
+
let class_str = class_val.as_utf8_str();
|
|
590
|
+
for token in class_str.split_whitespace() {
|
|
591
|
+
if let Some(lang) = token.strip_prefix("language-") {
|
|
592
|
+
language = Some(lang.to_string());
|
|
593
|
+
break;
|
|
594
|
+
}
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
code_text = Some(extract_text(child_tag, parser));
|
|
598
|
+
break;
|
|
599
|
+
}
|
|
600
|
+
}
|
|
601
|
+
}
|
|
602
|
+
|
|
603
|
+
let text = code_text.unwrap_or_else(|| extract_text(tag, parser));
|
|
604
|
+
let effective_parent = state.current_group().or(parent_idx);
|
|
605
|
+
let id = make_node_id("code", &text, state.nodes.len());
|
|
606
|
+
let idx = state.push(DocumentNode {
|
|
607
|
+
id,
|
|
608
|
+
content: NodeContent::Code { text, language },
|
|
609
|
+
parent: effective_parent,
|
|
610
|
+
children: Vec::new(),
|
|
611
|
+
annotations: Vec::new(),
|
|
612
|
+
attributes: None,
|
|
613
|
+
});
|
|
614
|
+
if let Some(ep) = effective_parent {
|
|
615
|
+
state.add_child(ep, idx);
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
// ── Blockquote ────────────────────────────────────────────────────
|
|
620
|
+
"blockquote" => {
|
|
621
|
+
let effective_parent = state.current_group().or(parent_idx);
|
|
622
|
+
let id = make_node_id("quote", "blockquote", state.nodes.len());
|
|
623
|
+
let quote_idx = state.push(DocumentNode {
|
|
624
|
+
id,
|
|
625
|
+
content: NodeContent::Quote,
|
|
626
|
+
parent: effective_parent,
|
|
627
|
+
children: Vec::new(),
|
|
628
|
+
annotations: Vec::new(),
|
|
629
|
+
attributes: None,
|
|
630
|
+
});
|
|
631
|
+
if let Some(ep) = effective_parent {
|
|
632
|
+
state.add_child(ep, quote_idx);
|
|
633
|
+
}
|
|
634
|
+
// Recurse into blockquote children under the Quote node.
|
|
635
|
+
let children = tag.children();
|
|
636
|
+
for child_handle in children.top().iter() {
|
|
637
|
+
walk(state, child_handle, parser, Some(quote_idx));
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
// ── Definition list ───────────────────────────────────────────────
|
|
642
|
+
"dl" => {
|
|
643
|
+
let effective_parent = state.current_group().or(parent_idx);
|
|
644
|
+
let id = make_node_id("definition_list", "dl", state.nodes.len());
|
|
645
|
+
let dl_idx = state.push(DocumentNode {
|
|
646
|
+
id,
|
|
647
|
+
content: NodeContent::DefinitionList,
|
|
648
|
+
parent: effective_parent,
|
|
649
|
+
children: Vec::new(),
|
|
650
|
+
annotations: Vec::new(),
|
|
651
|
+
attributes: None,
|
|
652
|
+
});
|
|
653
|
+
if let Some(ep) = effective_parent {
|
|
654
|
+
state.add_child(ep, dl_idx);
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
for (term, definition) in collect_definition_items(tag, parser) {
|
|
658
|
+
let item_id = make_node_id("definition_item", &term, state.nodes.len());
|
|
659
|
+
let item_idx = state.push(DocumentNode {
|
|
660
|
+
id: item_id,
|
|
661
|
+
content: NodeContent::DefinitionItem { term, definition },
|
|
662
|
+
parent: Some(dl_idx),
|
|
663
|
+
children: Vec::new(),
|
|
664
|
+
annotations: Vec::new(),
|
|
665
|
+
attributes: None,
|
|
666
|
+
});
|
|
667
|
+
state.add_child(dl_idx, item_idx);
|
|
668
|
+
}
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
// ── Script / Style → RawBlock ─────────────────────────────────────
|
|
672
|
+
"script" | "style" => {
|
|
673
|
+
let format = if tag_name == "script" {
|
|
674
|
+
tag.attributes()
|
|
675
|
+
.get("type")
|
|
676
|
+
.flatten()
|
|
677
|
+
.map(|v| v.as_utf8_str().to_string())
|
|
678
|
+
.unwrap_or_else(|| "javascript".to_string())
|
|
679
|
+
} else {
|
|
680
|
+
"css".to_string()
|
|
681
|
+
};
|
|
682
|
+
let content = extract_text(tag, parser);
|
|
683
|
+
if content.trim().is_empty() {
|
|
684
|
+
return;
|
|
685
|
+
}
|
|
686
|
+
let effective_parent = state.current_group().or(parent_idx);
|
|
687
|
+
let id = make_node_id("raw_block", &format, state.nodes.len());
|
|
688
|
+
let idx = state.push(DocumentNode {
|
|
689
|
+
id,
|
|
690
|
+
content: NodeContent::RawBlock { format, content },
|
|
691
|
+
parent: effective_parent,
|
|
692
|
+
children: Vec::new(),
|
|
693
|
+
annotations: Vec::new(),
|
|
694
|
+
attributes: None,
|
|
695
|
+
});
|
|
696
|
+
if let Some(ep) = effective_parent {
|
|
697
|
+
state.add_child(ep, idx);
|
|
698
|
+
}
|
|
699
|
+
}
|
|
700
|
+
|
|
701
|
+
// ── Head → MetadataBlock ──────────────────────────────────────────
|
|
702
|
+
"head" => {
|
|
703
|
+
let entries = extract_head_metadata_entries(tag, parser);
|
|
704
|
+
if entries.is_empty() {
|
|
705
|
+
return;
|
|
706
|
+
}
|
|
707
|
+
let id = make_node_id("metadata_block", "head", state.nodes.len());
|
|
708
|
+
// Metadata blocks sit at the root level.
|
|
709
|
+
state.push(DocumentNode {
|
|
710
|
+
id,
|
|
711
|
+
content: NodeContent::MetadataBlock { entries },
|
|
712
|
+
parent: None,
|
|
713
|
+
children: Vec::new(),
|
|
714
|
+
annotations: Vec::new(),
|
|
715
|
+
attributes: None,
|
|
716
|
+
});
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
// ── Semantic containers → Group node ──────────────────────────────
|
|
720
|
+
"main" | "article" | "section" | "header" | "footer" | "nav" | "aside" => {
|
|
721
|
+
let label = tag
|
|
722
|
+
.attributes()
|
|
723
|
+
.get("aria-label")
|
|
724
|
+
.flatten()
|
|
725
|
+
.map(|v| v.as_utf8_str().to_string());
|
|
726
|
+
let effective_parent = state.current_group().or(parent_idx);
|
|
727
|
+
let id = make_node_id("group", tag_name, state.nodes.len());
|
|
728
|
+
let group_idx = state.push(DocumentNode {
|
|
729
|
+
id,
|
|
730
|
+
content: NodeContent::Group {
|
|
731
|
+
label,
|
|
732
|
+
heading_level: None,
|
|
733
|
+
heading_text: None,
|
|
734
|
+
},
|
|
735
|
+
parent: effective_parent,
|
|
736
|
+
children: Vec::new(),
|
|
737
|
+
annotations: Vec::new(),
|
|
738
|
+
attributes: collect_attributes(tag),
|
|
739
|
+
});
|
|
740
|
+
if let Some(ep) = effective_parent {
|
|
741
|
+
state.add_child(ep, group_idx);
|
|
742
|
+
}
|
|
743
|
+
let children = tag.children();
|
|
744
|
+
for child_handle in children.top().iter() {
|
|
745
|
+
walk(state, child_handle, parser, Some(group_idx));
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
// ── Transparent structural containers ─────────────────────────────
|
|
750
|
+
"html" | "body" | "div" | "figure" | "figcaption" | "details" | "summary" | "address" | "hgroup" | "search"
|
|
751
|
+
| "form" | "fieldset" => {
|
|
752
|
+
let children = tag.children();
|
|
753
|
+
for child_handle in children.top().iter() {
|
|
754
|
+
walk(state, child_handle, parser, parent_idx);
|
|
755
|
+
}
|
|
756
|
+
}
|
|
757
|
+
|
|
758
|
+
// ── Everything else: recurse transparently ────────────────────────
|
|
759
|
+
_ => {
|
|
760
|
+
let children = tag.children();
|
|
761
|
+
for child_handle in children.top().iter() {
|
|
762
|
+
walk(state, child_handle, parser, parent_idx);
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
|
|
768
|
+
/// Collect a safe subset of attributes into a `HashMap`.
|
|
769
|
+
///
|
|
770
|
+
/// Only `id`, `class`, `lang`, `dir`, and `data-*` attributes are kept.
|
|
771
|
+
/// Event handlers (`on*`) and other potentially unsafe attributes are dropped.
|
|
772
|
+
fn collect_attributes(tag: &tl::HTMLTag) -> Option<HashMap<String, String>> {
|
|
773
|
+
let raw = tag.attributes().clone();
|
|
774
|
+
let mut map: HashMap<String, String> = HashMap::new();
|
|
775
|
+
|
|
776
|
+
for (key_cow, val_opt) in raw.iter() {
|
|
777
|
+
let key = key_cow.to_ascii_lowercase();
|
|
778
|
+
// Drop event handlers.
|
|
779
|
+
if key.starts_with("on") {
|
|
780
|
+
continue;
|
|
781
|
+
}
|
|
782
|
+
if matches!(key.as_str(), "id" | "class" | "lang" | "dir") || key.starts_with("data-") {
|
|
783
|
+
if let Some(val) = val_opt {
|
|
784
|
+
map.insert(key, val.to_string());
|
|
785
|
+
}
|
|
786
|
+
}
|
|
787
|
+
}
|
|
788
|
+
|
|
789
|
+
if map.is_empty() { None } else { Some(map) }
|
|
790
|
+
}
|