html-to-markdown 2.30.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -14
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +5 -2
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -0,0 +1,790 @@
1
+ //! Builds a [`DocumentStructure`] from a parsed `tl::VDom`.
2
+ //!
3
+ //! Walk the DOM once, mapping each HTML element to the appropriate [`NodeContent`] variant,
4
+ //! collecting inline [`TextAnnotation`]s, tracking parent/child relationships, and generating
5
+ //! heading-based [`Group`] hierarchy.
6
+
7
+ use std::collections::HashMap;
8
+
9
+ use super::document::{AnnotationKind, DocumentNode, DocumentStructure, NodeContent, TextAnnotation};
10
+ use super::tables::{GridCell, TableGrid};
11
+
12
+ // ── Text extraction ───────────────────────────────────────────────────────────
13
+
14
+ /// Extract plain text from a tag's descendants, decoding HTML entities.
15
+ fn extract_text(tag: &tl::HTMLTag, parser: &tl::Parser) -> String {
16
+ let mut buf = String::new();
17
+ collect_text_from_tag(tag, parser, &mut buf);
18
+ buf
19
+ }
20
+
21
+ /// Recursively accumulate text content from a tag's children.
22
+ fn collect_text_from_tag(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String) {
23
+ let children = tag.children();
24
+ for handle in children.top().iter() {
25
+ let Some(node) = handle.get(parser) else {
26
+ continue;
27
+ };
28
+ match node {
29
+ tl::Node::Raw(bytes) => {
30
+ let raw = bytes.as_utf8_str();
31
+ let decoded = crate::text::decode_html_entities_cow(raw.as_ref());
32
+ buf.push_str(&decoded);
33
+ }
34
+ tl::Node::Tag(child_tag) => {
35
+ let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
36
+ // Skip invisible elements
37
+ if matches!(name.as_str(), "script" | "style" | "head") {
38
+ continue;
39
+ }
40
+ collect_text_from_tag(child_tag, parser, buf);
41
+ }
42
+ tl::Node::Comment(_) => {}
43
+ }
44
+ }
45
+ }
46
+
47
+ // ── Inline annotation extraction ─────────────────────────────────────────────
48
+
49
+ /// Scan the children of `tag` and collect [`TextAnnotation`]s into `annotations`.
50
+ ///
51
+ /// `text` is the pre-extracted full text of the enclosing block node; annotation
52
+ /// byte offsets are computed relative to that string.
53
+ fn collect_annotations(tag: &tl::HTMLTag, parser: &tl::Parser, text: &str, annotations: &mut Vec<TextAnnotation>) {
54
+ collect_annotations_from_tag(tag, parser, text, &mut 0usize, annotations);
55
+ }
56
+
57
+ /// Recursive helper. `offset` tracks how many bytes of `full_text` have been consumed
58
+ /// so far; it is mutated in place as we walk the tree.
59
+ fn collect_annotations_from_tag(
60
+ tag: &tl::HTMLTag,
61
+ parser: &tl::Parser,
62
+ full_text: &str,
63
+ offset: &mut usize,
64
+ annotations: &mut Vec<TextAnnotation>,
65
+ ) {
66
+ let children = tag.children();
67
+ for handle in children.top().iter() {
68
+ let Some(node) = handle.get(parser) else {
69
+ continue;
70
+ };
71
+ match node {
72
+ tl::Node::Raw(bytes) => {
73
+ let raw = bytes.as_utf8_str();
74
+ let decoded = crate::text::decode_html_entities_cow(raw.as_ref());
75
+ *offset += decoded.len();
76
+ }
77
+ tl::Node::Tag(child_tag) => {
78
+ let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
79
+ if matches!(name.as_str(), "script" | "style" | "head") {
80
+ continue;
81
+ }
82
+
83
+ let start = *offset;
84
+ // Recurse to advance offset over the child's text span.
85
+ collect_annotations_from_tag(child_tag, parser, full_text, offset, annotations);
86
+ let end = *offset;
87
+
88
+ // Emit annotation only for non-empty spans that fit within the text.
89
+ if start < end && end <= full_text.len() {
90
+ let kind = match name.as_str() {
91
+ "strong" | "b" => Some(AnnotationKind::Bold),
92
+ "em" | "i" => Some(AnnotationKind::Italic),
93
+ "u" | "ins" => Some(AnnotationKind::Underline),
94
+ "s" | "del" | "strike" => Some(AnnotationKind::Strikethrough),
95
+ "code" | "kbd" | "samp" => Some(AnnotationKind::Code),
96
+ "sub" => Some(AnnotationKind::Subscript),
97
+ "sup" => Some(AnnotationKind::Superscript),
98
+ "mark" => Some(AnnotationKind::Highlight),
99
+ "a" => {
100
+ let url = child_tag
101
+ .attributes()
102
+ .get("href")
103
+ .flatten()
104
+ .map(|v| v.as_utf8_str().to_string())
105
+ .unwrap_or_default();
106
+ let title = child_tag
107
+ .attributes()
108
+ .get("title")
109
+ .flatten()
110
+ .map(|v| v.as_utf8_str().to_string());
111
+ Some(AnnotationKind::Link { url, title })
112
+ }
113
+ _ => None,
114
+ };
115
+
116
+ if let Some(kind) = kind {
117
+ annotations.push(TextAnnotation {
118
+ start: start as u32,
119
+ end: end as u32,
120
+ kind,
121
+ });
122
+ }
123
+ }
124
+ }
125
+ tl::Node::Comment(_) => {}
126
+ }
127
+ }
128
+ }
129
+
130
+ // ── Table extraction ──────────────────────────────────────────────────────────
131
+
132
+ /// Build a [`TableGrid`] from a `<table>` element.
133
+ fn extract_table_grid(table_tag: &tl::HTMLTag, parser: &tl::Parser) -> TableGrid {
134
+ // Gather all <tr> handles (recursing through thead/tbody/tfoot).
135
+ let mut row_handles: Vec<tl::NodeHandle> = Vec::new();
136
+ collect_tr_handles(table_tag, parser, &mut row_handles);
137
+
138
+ let mut cells: Vec<GridCell> = Vec::new();
139
+ let mut max_col: u32 = 0;
140
+
141
+ for (row_idx, row_handle) in row_handles.iter().enumerate() {
142
+ let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) else {
143
+ continue;
144
+ };
145
+
146
+ let mut col_idx: u32 = 0;
147
+ let row_children = row_tag.children();
148
+
149
+ for child_handle in row_children.top().iter() {
150
+ let Some(tl::Node::Tag(cell_tag)) = child_handle.get(parser) else {
151
+ continue;
152
+ };
153
+ let cell_name = cell_tag.name().as_utf8_str().to_ascii_lowercase();
154
+ let is_cell = cell_name == "td" || cell_name == "th";
155
+ if !is_cell {
156
+ continue;
157
+ }
158
+
159
+ let is_header = cell_name == "th";
160
+
161
+ let row_span = cell_tag
162
+ .attributes()
163
+ .get("rowspan")
164
+ .flatten()
165
+ .and_then(|v| v.as_utf8_str().parse::<u32>().ok())
166
+ .unwrap_or(1)
167
+ .max(1);
168
+
169
+ let col_span = cell_tag
170
+ .attributes()
171
+ .get("colspan")
172
+ .flatten()
173
+ .and_then(|v| v.as_utf8_str().parse::<u32>().ok())
174
+ .unwrap_or(1)
175
+ .max(1);
176
+
177
+ let content = extract_text(cell_tag, parser).trim().to_string();
178
+
179
+ cells.push(GridCell {
180
+ content,
181
+ row: row_idx as u32,
182
+ col: col_idx,
183
+ row_span,
184
+ col_span,
185
+ is_header,
186
+ });
187
+
188
+ col_idx += col_span;
189
+ if col_idx > max_col {
190
+ max_col = col_idx;
191
+ }
192
+ }
193
+ }
194
+
195
+ let rows = row_handles.len() as u32;
196
+ TableGrid {
197
+ rows,
198
+ cols: max_col,
199
+ cells,
200
+ }
201
+ }
202
+
203
+ /// Recursively collect all `<tr>` `NodeHandle`s from within a table element.
204
+ fn collect_tr_handles(tag: &tl::HTMLTag, parser: &tl::Parser, result: &mut Vec<tl::NodeHandle>) {
205
+ let children = tag.children();
206
+ for handle in children.top().iter() {
207
+ if let Some(tl::Node::Tag(child_tag)) = handle.get(parser) {
208
+ let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
209
+ if name == "tr" {
210
+ result.push(*handle);
211
+ } else {
212
+ collect_tr_handles(child_tag, parser, result);
213
+ }
214
+ }
215
+ }
216
+ }
217
+
218
+ // ── Node ID generation ────────────────────────────────────────────────────────
219
+
220
/// Generate a node ID of the form `"{node_type}-{hash:016x}"`.
///
/// The hash covers the node type, a prefix of `text` (at most 64 bytes), and
/// `index`, the node's position in the flat node list.
///
/// The prefix is cut at the last UTF-8 character boundary at or before byte 64.
/// Slicing at a fixed byte offset would panic whenever byte 64 falls inside a
/// multi-byte character (e.g. accented or CJK text).
fn make_node_id(node_type: &str, text: &str, index: usize) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    /// Upper bound, in bytes, on how much of the text feeds the hash.
    const MAX_PREFIX_BYTES: usize = 64;

    // Back up to a character boundary; index 0 is always one, so this terminates.
    let mut prefix_end = text.len().min(MAX_PREFIX_BYTES);
    while !text.is_char_boundary(prefix_end) {
        prefix_end -= 1;
    }

    let mut hasher = DefaultHasher::new();
    node_type.hash(&mut hasher);
    // Only hash a prefix of the text to keep cost bounded.
    text[..prefix_end].hash(&mut hasher);
    index.hash(&mut hasher);
    let digest = hasher.finish();
    format!("{node_type}-{digest:016x}")
}
234
+
235
+ // ── Definition list helpers ───────────────────────────────────────────────────
236
+
237
+ /// Collect `<dt>`/`<dd>` pairs from a `<dl>` element.
238
+ ///
239
+ /// Returns `(term_text, definition_text)` tuples. Consecutive `<dt>` elements share
240
+ /// the next `<dd>`; orphan `<dd>`s use an empty term.
241
+ fn collect_definition_items(dl_tag: &tl::HTMLTag, parser: &tl::Parser) -> Vec<(String, String)> {
242
+ let mut items: Vec<(String, String)> = Vec::new();
243
+ let mut pending_terms: Vec<String> = Vec::new();
244
+
245
+ let children = dl_tag.children();
246
+ for handle in children.top().iter() {
247
+ let Some(tl::Node::Tag(child_tag)) = handle.get(parser) else {
248
+ continue;
249
+ };
250
+ let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
251
+ match name.as_str() {
252
+ "dt" => {
253
+ pending_terms.push(extract_text(child_tag, parser).trim().to_string());
254
+ }
255
+ "dd" => {
256
+ let definition = extract_text(child_tag, parser).trim().to_string();
257
+ if pending_terms.is_empty() {
258
+ items.push((String::new(), definition));
259
+ } else {
260
+ let mut drained: Vec<String> = std::mem::take(&mut pending_terms);
261
+ let last_term = drained.pop();
262
+ for term in drained {
263
+ items.push((term, String::new()));
264
+ }
265
+ if let Some(term) = last_term {
266
+ items.push((term, definition));
267
+ }
268
+ }
269
+ }
270
+ _ => {}
271
+ }
272
+ }
273
+
274
+ // Flush trailing <dt>s without a corresponding <dd>.
275
+ for term in pending_terms {
276
+ items.push((term, String::new()));
277
+ }
278
+
279
+ items
280
+ }
281
+
282
+ // ── Head metadata extraction ──────────────────────────────────────────────────
283
+
284
+ /// Extract `<meta name=… content=…>` and `<title>` entries from a `<head>` element.
285
+ fn extract_head_metadata_entries(head_tag: &tl::HTMLTag, parser: &tl::Parser) -> Vec<(String, String)> {
286
+ let mut entries: Vec<(String, String)> = Vec::new();
287
+
288
+ let children = head_tag.children();
289
+ for handle in children.top().iter() {
290
+ let Some(tl::Node::Tag(child_tag)) = handle.get(parser) else {
291
+ continue;
292
+ };
293
+ let name = child_tag.name().as_utf8_str().to_ascii_lowercase();
294
+ match name.as_str() {
295
+ "title" => {
296
+ let title = extract_text(child_tag, parser).trim().to_string();
297
+ if !title.is_empty() {
298
+ entries.push(("title".to_string(), title));
299
+ }
300
+ }
301
+ "meta" => {
302
+ // name + content
303
+ if let (Some(Some(meta_name)), Some(Some(meta_content))) = (
304
+ child_tag.attributes().get("name"),
305
+ child_tag.attributes().get("content"),
306
+ ) {
307
+ entries.push((
308
+ meta_name.as_utf8_str().to_string(),
309
+ meta_content.as_utf8_str().to_string(),
310
+ ));
311
+ }
312
+ // property + content (Open Graph etc.)
313
+ if let (Some(Some(property)), Some(Some(content))) = (
314
+ child_tag.attributes().get("property"),
315
+ child_tag.attributes().get("content"),
316
+ ) {
317
+ entries.push((property.as_utf8_str().to_string(), content.as_utf8_str().to_string()));
318
+ }
319
+ }
320
+ _ => {}
321
+ }
322
+ }
323
+
324
+ entries
325
+ }
326
+
327
+ // ── Main builder ──────────────────────────────────────────────────────────────
328
+
329
+ /// State threaded through the recursive walk.
330
+ struct BuilderState {
331
+ /// Accumulated nodes (flat list in document order).
332
+ nodes: Vec<DocumentNode>,
333
+ /// Stack of open heading-group indices: `(heading_level, node_index)`.
334
+ group_stack: Vec<(u8, u32)>,
335
+ }
336
+
337
+ impl BuilderState {
338
+ fn new() -> Self {
339
+ Self {
340
+ nodes: Vec::new(),
341
+ group_stack: Vec::new(),
342
+ }
343
+ }
344
+
345
+ /// Append a node and return its index.
346
+ fn push(&mut self, node: DocumentNode) -> u32 {
347
+ let idx = self.nodes.len() as u32;
348
+ self.nodes.push(node);
349
+ idx
350
+ }
351
+
352
+ /// Index of the innermost open group, if any.
353
+ fn current_group(&self) -> Option<u32> {
354
+ self.group_stack.last().map(|(_, idx)| *idx)
355
+ }
356
+
357
+ /// Record `child_idx` as a child of `parent_idx`.
358
+ fn add_child(&mut self, parent_idx: u32, child_idx: u32) {
359
+ if let Some(parent) = self.nodes.get_mut(parent_idx as usize) {
360
+ parent.children.push(child_idx);
361
+ }
362
+ }
363
+ }
364
+
365
+ /// Build a [`DocumentStructure`] from an already-parsed `tl::VDom`.
366
+ ///
367
+ /// Walks the DOM once, mapping HTML elements to semantic [`NodeContent`] variants,
368
+ /// tracking parent/child relationships, extracting inline [`TextAnnotation`]s, and
369
+ /// constructing heading-based [`Group`] nodes.
370
+ pub fn build_document_structure(dom: &tl::VDom<'_>) -> DocumentStructure {
371
+ let parser = dom.parser();
372
+ let mut state = BuilderState::new();
373
+
374
+ for handle in dom.children() {
375
+ walk(&mut state, handle, parser, None);
376
+ }
377
+
378
+ DocumentStructure {
379
+ nodes: state.nodes,
380
+ source_format: Some("html".to_string()),
381
+ }
382
+ }
383
+
384
+ /// Recursive DOM walker.
385
+ ///
386
+ /// `parent_idx` is the flat-list index of the nearest structural parent, if any.
387
+ fn walk(state: &mut BuilderState, handle: &tl::NodeHandle, parser: &tl::Parser, parent_idx: Option<u32>) {
388
+ let Some(node) = handle.get(parser) else {
389
+ return;
390
+ };
391
+
392
+ match node {
393
+ tl::Node::Raw(_) | tl::Node::Comment(_) => {}
394
+ tl::Node::Tag(tag) => {
395
+ let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
396
+ process_tag(state, tag_name.as_str(), tag, parser, parent_idx);
397
+ }
398
+ }
399
+ }
400
+
401
+ /// Decide how to handle a given tag, creating nodes and recursing as needed.
402
+ #[allow(clippy::too_many_lines)]
403
+ fn process_tag(
404
+ state: &mut BuilderState,
405
+ tag_name: &str,
406
+ tag: &tl::HTMLTag,
407
+ parser: &tl::Parser,
408
+ parent_idx: Option<u32>,
409
+ ) {
410
+ match tag_name {
411
+ // ── Headings ──────────────────────────────────────────────────────
412
+ "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
413
+ let level = tag_name[1..].parse::<u8>().unwrap_or(1);
414
+ let text = extract_text(tag, parser).trim().to_string();
415
+
416
+ // Close any open groups at the same or deeper heading level.
417
+ while let Some(&(open_level, _)) = state.group_stack.last() {
418
+ if open_level >= level {
419
+ state.group_stack.pop();
420
+ } else {
421
+ break;
422
+ }
423
+ }
424
+
425
+ // Parent for the new group is the enclosing group or the explicit parent.
426
+ let group_parent = state.group_stack.last().map(|(_, idx)| *idx).or(parent_idx);
427
+ let group_id = make_node_id("group", &text, state.nodes.len());
428
+ let group_idx = state.push(DocumentNode {
429
+ id: group_id,
430
+ content: NodeContent::Group {
431
+ label: Some(text.clone()),
432
+ heading_level: Some(level),
433
+ heading_text: Some(text.clone()),
434
+ },
435
+ parent: group_parent,
436
+ children: Vec::new(),
437
+ annotations: Vec::new(),
438
+ attributes: None,
439
+ });
440
+ if let Some(gp) = group_parent {
441
+ state.add_child(gp, group_idx);
442
+ }
443
+ state.group_stack.push((level, group_idx));
444
+
445
+ // Emit the Heading node as a child of the new group.
446
+ let mut annotations = Vec::new();
447
+ collect_annotations(tag, parser, &text, &mut annotations);
448
+ let heading_id = make_node_id("heading", &text, state.nodes.len());
449
+ let heading_idx = state.push(DocumentNode {
450
+ id: heading_id,
451
+ content: NodeContent::Heading { level, text },
452
+ parent: Some(group_idx),
453
+ children: Vec::new(),
454
+ annotations,
455
+ attributes: None,
456
+ });
457
+ state.add_child(group_idx, heading_idx);
458
+ }
459
+
460
+ // ── Paragraph ────────────────────────────────────────────────────
461
+ "p" => {
462
+ let text = extract_text(tag, parser).trim().to_string();
463
+ if text.is_empty() {
464
+ return;
465
+ }
466
+ let effective_parent = state.current_group().or(parent_idx);
467
+ let mut annotations = Vec::new();
468
+ collect_annotations(tag, parser, &text, &mut annotations);
469
+ let id = make_node_id("paragraph", &text, state.nodes.len());
470
+ let idx = state.push(DocumentNode {
471
+ id,
472
+ content: NodeContent::Paragraph { text },
473
+ parent: effective_parent,
474
+ children: Vec::new(),
475
+ annotations,
476
+ attributes: None,
477
+ });
478
+ if let Some(ep) = effective_parent {
479
+ state.add_child(ep, idx);
480
+ }
481
+ }
482
+
483
+ // ── Lists ─────────────────────────────────────────────────────────
484
+ "ul" | "ol" => {
485
+ let ordered = tag_name == "ol";
486
+ let effective_parent = state.current_group().or(parent_idx);
487
+ let id = make_node_id("list", if ordered { "ordered" } else { "unordered" }, state.nodes.len());
488
+ let list_idx = state.push(DocumentNode {
489
+ id,
490
+ content: NodeContent::List { ordered },
491
+ parent: effective_parent,
492
+ children: Vec::new(),
493
+ annotations: Vec::new(),
494
+ attributes: None,
495
+ });
496
+ if let Some(ep) = effective_parent {
497
+ state.add_child(ep, list_idx);
498
+ }
499
+ // Recurse with the list node as the parent so <li>s attach to it.
500
+ let children = tag.children();
501
+ for child_handle in children.top().iter() {
502
+ walk(state, child_handle, parser, Some(list_idx));
503
+ }
504
+ }
505
+
506
+ // ── List item ─────────────────────────────────────────────────────
507
+ "li" => {
508
+ let text = extract_text(tag, parser).trim().to_string();
509
+ let effective_parent = parent_idx.or_else(|| state.current_group());
510
+ let mut annotations = Vec::new();
511
+ collect_annotations(tag, parser, &text, &mut annotations);
512
+ let id = make_node_id("list_item", &text, state.nodes.len());
513
+ let idx = state.push(DocumentNode {
514
+ id,
515
+ content: NodeContent::ListItem { text },
516
+ parent: effective_parent,
517
+ children: Vec::new(),
518
+ annotations,
519
+ attributes: None,
520
+ });
521
+ if let Some(ep) = effective_parent {
522
+ state.add_child(ep, idx);
523
+ }
524
+ }
525
+
526
+ // ── Table ─────────────────────────────────────────────────────────
527
+ "table" => {
528
+ let grid = extract_table_grid(tag, parser);
529
+ let effective_parent = state.current_group().or(parent_idx);
530
+ let id = make_node_id("table", &grid.rows.to_string(), state.nodes.len());
531
+ let idx = state.push(DocumentNode {
532
+ id,
533
+ content: NodeContent::Table { grid },
534
+ parent: effective_parent,
535
+ children: Vec::new(),
536
+ annotations: Vec::new(),
537
+ attributes: None,
538
+ });
539
+ if let Some(ep) = effective_parent {
540
+ state.add_child(ep, idx);
541
+ }
542
+ }
543
+
544
+ // ── Image ─────────────────────────────────────────────────────────
545
+ "img" => {
546
+ let src = tag
547
+ .attributes()
548
+ .get("src")
549
+ .flatten()
550
+ .map(|v| v.as_utf8_str().to_string());
551
+ let description = tag
552
+ .attributes()
553
+ .get("alt")
554
+ .flatten()
555
+ .map(|v| v.as_utf8_str().to_string())
556
+ .filter(|s| !s.is_empty());
557
+ let effective_parent = state.current_group().or(parent_idx);
558
+ let label = src.as_deref().unwrap_or("img");
559
+ let id = make_node_id("image", label, state.nodes.len());
560
+ let idx = state.push(DocumentNode {
561
+ id,
562
+ content: NodeContent::Image {
563
+ description,
564
+ src,
565
+ image_index: None,
566
+ },
567
+ parent: effective_parent,
568
+ children: Vec::new(),
569
+ annotations: Vec::new(),
570
+ attributes: None,
571
+ });
572
+ if let Some(ep) = effective_parent {
573
+ state.add_child(ep, idx);
574
+ }
575
+ }
576
+
577
+ // ── Code block (<pre><code …>) ────────────────────────────────────
578
+ "pre" => {
579
+ let mut language: Option<String> = None;
580
+ let mut code_text: Option<String> = None;
581
+
582
+ let children = tag.children();
583
+ for child_handle in children.top().iter() {
584
+ if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
585
+ let child_name = child_tag.name().as_utf8_str().to_ascii_lowercase();
586
+ if child_name == "code" {
587
+ // Extract language from class="language-*"
588
+ if let Some(Some(class_val)) = child_tag.attributes().get("class") {
589
+ let class_str = class_val.as_utf8_str();
590
+ for token in class_str.split_whitespace() {
591
+ if let Some(lang) = token.strip_prefix("language-") {
592
+ language = Some(lang.to_string());
593
+ break;
594
+ }
595
+ }
596
+ }
597
+ code_text = Some(extract_text(child_tag, parser));
598
+ break;
599
+ }
600
+ }
601
+ }
602
+
603
+ let text = code_text.unwrap_or_else(|| extract_text(tag, parser));
604
+ let effective_parent = state.current_group().or(parent_idx);
605
+ let id = make_node_id("code", &text, state.nodes.len());
606
+ let idx = state.push(DocumentNode {
607
+ id,
608
+ content: NodeContent::Code { text, language },
609
+ parent: effective_parent,
610
+ children: Vec::new(),
611
+ annotations: Vec::new(),
612
+ attributes: None,
613
+ });
614
+ if let Some(ep) = effective_parent {
615
+ state.add_child(ep, idx);
616
+ }
617
+ }
618
+
619
+ // ── Blockquote ────────────────────────────────────────────────────
620
+ "blockquote" => {
621
+ let effective_parent = state.current_group().or(parent_idx);
622
+ let id = make_node_id("quote", "blockquote", state.nodes.len());
623
+ let quote_idx = state.push(DocumentNode {
624
+ id,
625
+ content: NodeContent::Quote,
626
+ parent: effective_parent,
627
+ children: Vec::new(),
628
+ annotations: Vec::new(),
629
+ attributes: None,
630
+ });
631
+ if let Some(ep) = effective_parent {
632
+ state.add_child(ep, quote_idx);
633
+ }
634
+ // Recurse into blockquote children under the Quote node.
635
+ let children = tag.children();
636
+ for child_handle in children.top().iter() {
637
+ walk(state, child_handle, parser, Some(quote_idx));
638
+ }
639
+ }
640
+
641
+ // ── Definition list ───────────────────────────────────────────────
642
+ "dl" => {
643
+ let effective_parent = state.current_group().or(parent_idx);
644
+ let id = make_node_id("definition_list", "dl", state.nodes.len());
645
+ let dl_idx = state.push(DocumentNode {
646
+ id,
647
+ content: NodeContent::DefinitionList,
648
+ parent: effective_parent,
649
+ children: Vec::new(),
650
+ annotations: Vec::new(),
651
+ attributes: None,
652
+ });
653
+ if let Some(ep) = effective_parent {
654
+ state.add_child(ep, dl_idx);
655
+ }
656
+
657
+ for (term, definition) in collect_definition_items(tag, parser) {
658
+ let item_id = make_node_id("definition_item", &term, state.nodes.len());
659
+ let item_idx = state.push(DocumentNode {
660
+ id: item_id,
661
+ content: NodeContent::DefinitionItem { term, definition },
662
+ parent: Some(dl_idx),
663
+ children: Vec::new(),
664
+ annotations: Vec::new(),
665
+ attributes: None,
666
+ });
667
+ state.add_child(dl_idx, item_idx);
668
+ }
669
+ }
670
+
671
+ // ── Script / Style → RawBlock ─────────────────────────────────────
672
+ "script" | "style" => {
673
+ let format = if tag_name == "script" {
674
+ tag.attributes()
675
+ .get("type")
676
+ .flatten()
677
+ .map(|v| v.as_utf8_str().to_string())
678
+ .unwrap_or_else(|| "javascript".to_string())
679
+ } else {
680
+ "css".to_string()
681
+ };
682
+ let content = extract_text(tag, parser);
683
+ if content.trim().is_empty() {
684
+ return;
685
+ }
686
+ let effective_parent = state.current_group().or(parent_idx);
687
+ let id = make_node_id("raw_block", &format, state.nodes.len());
688
+ let idx = state.push(DocumentNode {
689
+ id,
690
+ content: NodeContent::RawBlock { format, content },
691
+ parent: effective_parent,
692
+ children: Vec::new(),
693
+ annotations: Vec::new(),
694
+ attributes: None,
695
+ });
696
+ if let Some(ep) = effective_parent {
697
+ state.add_child(ep, idx);
698
+ }
699
+ }
700
+
701
+ // ── Head → MetadataBlock ──────────────────────────────────────────
702
+ "head" => {
703
+ let entries = extract_head_metadata_entries(tag, parser);
704
+ if entries.is_empty() {
705
+ return;
706
+ }
707
+ let id = make_node_id("metadata_block", "head", state.nodes.len());
708
+ // Metadata blocks sit at the root level.
709
+ state.push(DocumentNode {
710
+ id,
711
+ content: NodeContent::MetadataBlock { entries },
712
+ parent: None,
713
+ children: Vec::new(),
714
+ annotations: Vec::new(),
715
+ attributes: None,
716
+ });
717
+ }
718
+
719
+ // ── Semantic containers → Group node ──────────────────────────────
720
+ "main" | "article" | "section" | "header" | "footer" | "nav" | "aside" => {
721
+ let label = tag
722
+ .attributes()
723
+ .get("aria-label")
724
+ .flatten()
725
+ .map(|v| v.as_utf8_str().to_string());
726
+ let effective_parent = state.current_group().or(parent_idx);
727
+ let id = make_node_id("group", tag_name, state.nodes.len());
728
+ let group_idx = state.push(DocumentNode {
729
+ id,
730
+ content: NodeContent::Group {
731
+ label,
732
+ heading_level: None,
733
+ heading_text: None,
734
+ },
735
+ parent: effective_parent,
736
+ children: Vec::new(),
737
+ annotations: Vec::new(),
738
+ attributes: collect_attributes(tag),
739
+ });
740
+ if let Some(ep) = effective_parent {
741
+ state.add_child(ep, group_idx);
742
+ }
743
+ let children = tag.children();
744
+ for child_handle in children.top().iter() {
745
+ walk(state, child_handle, parser, Some(group_idx));
746
+ }
747
+ }
748
+
749
+ // ── Transparent structural containers ─────────────────────────────
750
+ "html" | "body" | "div" | "figure" | "figcaption" | "details" | "summary" | "address" | "hgroup" | "search"
751
+ | "form" | "fieldset" => {
752
+ let children = tag.children();
753
+ for child_handle in children.top().iter() {
754
+ walk(state, child_handle, parser, parent_idx);
755
+ }
756
+ }
757
+
758
+ // ── Everything else: recurse transparently ────────────────────────
759
+ _ => {
760
+ let children = tag.children();
761
+ for child_handle in children.top().iter() {
762
+ walk(state, child_handle, parser, parent_idx);
763
+ }
764
+ }
765
+ }
766
+ }
767
+
768
+ /// Collect a safe subset of attributes into a `HashMap`.
769
+ ///
770
+ /// Only `id`, `class`, `lang`, `dir`, and `data-*` attributes are kept.
771
+ /// Event handlers (`on*`) and other potentially unsafe attributes are dropped.
772
+ fn collect_attributes(tag: &tl::HTMLTag) -> Option<HashMap<String, String>> {
773
+ let raw = tag.attributes().clone();
774
+ let mut map: HashMap<String, String> = HashMap::new();
775
+
776
+ for (key_cow, val_opt) in raw.iter() {
777
+ let key = key_cow.to_ascii_lowercase();
778
+ // Drop event handlers.
779
+ if key.starts_with("on") {
780
+ continue;
781
+ }
782
+ if matches!(key.as_str(), "id" | "class" | "lang" | "dir") || key.starts_with("data-") {
783
+ if let Some(val) = val_opt {
784
+ map.insert(key, val.to_string());
785
+ }
786
+ }
787
+ }
788
+
789
+ if map.is_empty() { None } else { Some(map) }
790
+ }