html-to-markdown 2.29.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -41
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +7 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +127 -51
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
//! Collector that builds a [`DocumentStructure`] during the converter's HTML DOM walk.
|
|
2
|
+
//!
|
|
3
|
+
//! Follows the same single-pass collector pattern used by [`crate::metadata::MetadataCollector`]:
|
|
4
|
+
//! an `Rc<RefCell<StructureCollector>>` handle is threaded through the [`crate::converter::Context`]
|
|
5
|
+
//! and individual element handlers call `push_*` methods as they encounter content.
|
|
6
|
+
//!
|
|
7
|
+
//! # Design
|
|
8
|
+
//!
|
|
9
|
+
//! - **Flat node array** with index-based parent/child links (matches [`DocumentStructure`]).
|
|
10
|
+
//! - **section_stack** tracks the currently-open heading groups (`(level, group_node_index)`).
|
|
11
|
+
//! - **container_stack** tracks open blockquote containers.
|
|
12
|
+
//! - **list_stack** tracks open list containers so `push_list_item` attaches items to the right list.
|
|
13
|
+
//! - IDs are deterministic hashes of `(node_type, text_prefix, index)`.
|
|
14
|
+
|
|
15
|
+
use std::cell::RefCell;
|
|
16
|
+
use std::rc::Rc;
|
|
17
|
+
|
|
18
|
+
use super::document::{DocumentNode, DocumentStructure, NodeContent};
|
|
19
|
+
use super::tables::TableGrid;
|
|
20
|
+
|
|
21
|
+
/// Shared mutable handle used in [`crate::converter::Context`].
|
|
22
|
+
pub type StructureCollectorHandle = Rc<RefCell<StructureCollector>>;
|
|
23
|
+
|
|
24
|
+
/// Incremental builder for [`DocumentStructure`] during a single DOM walk.
|
|
25
|
+
pub struct StructureCollector {
|
|
26
|
+
/// Accumulated nodes in document order.
|
|
27
|
+
nodes: Vec<DocumentNode>,
|
|
28
|
+
/// Open heading-group stack: `(heading_level, node_index)`.
|
|
29
|
+
/// Mirrors the `group_stack` in `structure_builder`.
|
|
30
|
+
section_stack: Vec<(u8, u32)>,
|
|
31
|
+
/// Open blockquote container indices (innermost last).
|
|
32
|
+
container_stack: Vec<u32>,
|
|
33
|
+
/// Open list container indices (innermost last).
|
|
34
|
+
list_stack: Vec<u32>,
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
impl StructureCollector {
|
|
38
|
+
/// Create a new empty collector.
|
|
39
|
+
pub fn new() -> Self {
|
|
40
|
+
Self {
|
|
41
|
+
nodes: Vec::new(),
|
|
42
|
+
section_stack: Vec::new(),
|
|
43
|
+
container_stack: Vec::new(),
|
|
44
|
+
list_stack: Vec::new(),
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
// ── Public push methods ──────────────────────────────────────────────────
|
|
49
|
+
|
|
50
|
+
/// Record a heading element.
|
|
51
|
+
///
|
|
52
|
+
/// Creates a [`NodeContent::Group`] (which owns all subsequent sibling content until a
|
|
53
|
+
/// heading of equal or higher rank closes it) followed by a [`NodeContent::Heading`] child.
|
|
54
|
+
///
|
|
55
|
+
/// Returns the index of the **heading** node (the group node is one before it).
|
|
56
|
+
pub fn push_heading(&mut self, level: u8, text: &str, id: Option<&str>) -> u32 {
|
|
57
|
+
// Close any open groups at the same or deeper heading level.
|
|
58
|
+
while let Some(&(open_level, _)) = self.section_stack.last() {
|
|
59
|
+
if open_level >= level {
|
|
60
|
+
self.section_stack.pop();
|
|
61
|
+
} else {
|
|
62
|
+
break;
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// The group's parent is the surrounding open group or a container/list (if any).
|
|
67
|
+
let group_parent = self.current_structural_parent();
|
|
68
|
+
|
|
69
|
+
// Create the Group node.
|
|
70
|
+
let group_id = Self::generate_id("group", text, self.nodes.len() as u32);
|
|
71
|
+
let group_idx = self.raw_push(DocumentNode {
|
|
72
|
+
id: group_id,
|
|
73
|
+
content: NodeContent::Group {
|
|
74
|
+
label: Some(text.to_string()),
|
|
75
|
+
heading_level: Some(level),
|
|
76
|
+
heading_text: Some(text.to_string()),
|
|
77
|
+
},
|
|
78
|
+
parent: group_parent,
|
|
79
|
+
children: Vec::new(),
|
|
80
|
+
annotations: Vec::new(),
|
|
81
|
+
attributes: None,
|
|
82
|
+
});
|
|
83
|
+
if let Some(gp) = group_parent {
|
|
84
|
+
self.add_child(gp, group_idx);
|
|
85
|
+
}
|
|
86
|
+
self.section_stack.push((level, group_idx));
|
|
87
|
+
|
|
88
|
+
// Create the Heading node as a child of the new group.
|
|
89
|
+
let heading_id = Self::generate_id("heading", text, self.nodes.len() as u32);
|
|
90
|
+
let heading_idx = self.raw_push(DocumentNode {
|
|
91
|
+
id: heading_id,
|
|
92
|
+
content: NodeContent::Heading {
|
|
93
|
+
level,
|
|
94
|
+
text: text.to_string(),
|
|
95
|
+
},
|
|
96
|
+
parent: Some(group_idx),
|
|
97
|
+
children: Vec::new(),
|
|
98
|
+
annotations: Vec::new(),
|
|
99
|
+
attributes: id.map(|v| {
|
|
100
|
+
let mut m = std::collections::HashMap::new();
|
|
101
|
+
m.insert("id".to_string(), v.to_string());
|
|
102
|
+
m
|
|
103
|
+
}),
|
|
104
|
+
});
|
|
105
|
+
self.add_child(group_idx, heading_idx);
|
|
106
|
+
heading_idx
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/// Record a paragraph element.
|
|
110
|
+
///
|
|
111
|
+
/// Returns the node index.
|
|
112
|
+
pub fn push_paragraph(&mut self, text: &str) -> u32 {
|
|
113
|
+
if text.is_empty() {
|
|
114
|
+
return u32::MAX;
|
|
115
|
+
}
|
|
116
|
+
let parent = self.current_structural_parent();
|
|
117
|
+
let id = Self::generate_id("paragraph", text, self.nodes.len() as u32);
|
|
118
|
+
let idx = self.raw_push(DocumentNode {
|
|
119
|
+
id,
|
|
120
|
+
content: NodeContent::Paragraph { text: text.to_string() },
|
|
121
|
+
parent,
|
|
122
|
+
children: Vec::new(),
|
|
123
|
+
annotations: Vec::new(),
|
|
124
|
+
attributes: None,
|
|
125
|
+
});
|
|
126
|
+
if let Some(p) = parent {
|
|
127
|
+
self.add_child(p, idx);
|
|
128
|
+
}
|
|
129
|
+
idx
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/// Open a list container.
|
|
133
|
+
///
|
|
134
|
+
/// Returns the node index; call [`push_list_end`] to close it.
|
|
135
|
+
pub fn push_list_start(&mut self, ordered: bool) -> u32 {
|
|
136
|
+
let parent = self.current_structural_parent();
|
|
137
|
+
let label = if ordered { "ordered" } else { "unordered" };
|
|
138
|
+
let id = Self::generate_id("list", label, self.nodes.len() as u32);
|
|
139
|
+
let idx = self.raw_push(DocumentNode {
|
|
140
|
+
id,
|
|
141
|
+
content: NodeContent::List { ordered },
|
|
142
|
+
parent,
|
|
143
|
+
children: Vec::new(),
|
|
144
|
+
annotations: Vec::new(),
|
|
145
|
+
attributes: None,
|
|
146
|
+
});
|
|
147
|
+
if let Some(p) = parent {
|
|
148
|
+
self.add_child(p, idx);
|
|
149
|
+
}
|
|
150
|
+
self.list_stack.push(idx);
|
|
151
|
+
idx
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/// Close the innermost open list container.
|
|
155
|
+
pub fn push_list_end(&mut self) {
|
|
156
|
+
self.list_stack.pop();
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
/// Record a list item under the current open list.
|
|
160
|
+
///
|
|
161
|
+
/// If there is no open list, the item is parented under the current section/container.
|
|
162
|
+
/// Returns the node index.
|
|
163
|
+
pub fn push_list_item(&mut self, text: &str) -> u32 {
|
|
164
|
+
let parent = self
|
|
165
|
+
.list_stack
|
|
166
|
+
.last()
|
|
167
|
+
.copied()
|
|
168
|
+
.or_else(|| self.current_structural_parent());
|
|
169
|
+
let id = Self::generate_id("list_item", text, self.nodes.len() as u32);
|
|
170
|
+
let idx = self.raw_push(DocumentNode {
|
|
171
|
+
id,
|
|
172
|
+
content: NodeContent::ListItem { text: text.to_string() },
|
|
173
|
+
parent,
|
|
174
|
+
children: Vec::new(),
|
|
175
|
+
annotations: Vec::new(),
|
|
176
|
+
attributes: None,
|
|
177
|
+
});
|
|
178
|
+
if let Some(p) = parent {
|
|
179
|
+
self.add_child(p, idx);
|
|
180
|
+
}
|
|
181
|
+
idx
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/// Record a table.
|
|
185
|
+
///
|
|
186
|
+
/// Returns the node index.
|
|
187
|
+
pub fn push_table(&mut self, grid: TableGrid) -> u32 {
|
|
188
|
+
let parent = self.current_structural_parent();
|
|
189
|
+
let label = grid.rows.to_string();
|
|
190
|
+
let id = Self::generate_id("table", &label, self.nodes.len() as u32);
|
|
191
|
+
let idx = self.raw_push(DocumentNode {
|
|
192
|
+
id,
|
|
193
|
+
content: NodeContent::Table { grid },
|
|
194
|
+
parent,
|
|
195
|
+
children: Vec::new(),
|
|
196
|
+
annotations: Vec::new(),
|
|
197
|
+
attributes: None,
|
|
198
|
+
});
|
|
199
|
+
if let Some(p) = parent {
|
|
200
|
+
self.add_child(p, idx);
|
|
201
|
+
}
|
|
202
|
+
idx
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
/// Record an image element.
|
|
206
|
+
///
|
|
207
|
+
/// Returns the node index.
|
|
208
|
+
pub fn push_image(&mut self, src: Option<&str>, alt: Option<&str>) -> u32 {
|
|
209
|
+
let parent = self.current_structural_parent();
|
|
210
|
+
let label = src.unwrap_or("img");
|
|
211
|
+
let id = Self::generate_id("image", label, self.nodes.len() as u32);
|
|
212
|
+
let idx = self.raw_push(DocumentNode {
|
|
213
|
+
id,
|
|
214
|
+
content: NodeContent::Image {
|
|
215
|
+
description: alt.filter(|s| !s.is_empty()).map(str::to_string),
|
|
216
|
+
src: src.map(str::to_string),
|
|
217
|
+
image_index: None,
|
|
218
|
+
},
|
|
219
|
+
parent,
|
|
220
|
+
children: Vec::new(),
|
|
221
|
+
annotations: Vec::new(),
|
|
222
|
+
attributes: None,
|
|
223
|
+
});
|
|
224
|
+
if let Some(p) = parent {
|
|
225
|
+
self.add_child(p, idx);
|
|
226
|
+
}
|
|
227
|
+
idx
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
/// Record a code block.
|
|
231
|
+
///
|
|
232
|
+
/// Returns the node index.
|
|
233
|
+
pub fn push_code(&mut self, text: &str, language: Option<&str>) -> u32 {
|
|
234
|
+
let parent = self.current_structural_parent();
|
|
235
|
+
let id = Self::generate_id("code", text, self.nodes.len() as u32);
|
|
236
|
+
let idx = self.raw_push(DocumentNode {
|
|
237
|
+
id,
|
|
238
|
+
content: NodeContent::Code {
|
|
239
|
+
text: text.to_string(),
|
|
240
|
+
language: language.map(str::to_string),
|
|
241
|
+
},
|
|
242
|
+
parent,
|
|
243
|
+
children: Vec::new(),
|
|
244
|
+
annotations: Vec::new(),
|
|
245
|
+
attributes: None,
|
|
246
|
+
});
|
|
247
|
+
if let Some(p) = parent {
|
|
248
|
+
self.add_child(p, idx);
|
|
249
|
+
}
|
|
250
|
+
idx
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
/// Open a blockquote container.
|
|
254
|
+
///
|
|
255
|
+
/// Returns the node index; call [`push_quote_end`] to close it.
|
|
256
|
+
pub fn push_quote_start(&mut self) -> u32 {
|
|
257
|
+
let parent = self.current_structural_parent();
|
|
258
|
+
let id = Self::generate_id("quote", "blockquote", self.nodes.len() as u32);
|
|
259
|
+
let idx = self.raw_push(DocumentNode {
|
|
260
|
+
id,
|
|
261
|
+
content: NodeContent::Quote,
|
|
262
|
+
parent,
|
|
263
|
+
children: Vec::new(),
|
|
264
|
+
annotations: Vec::new(),
|
|
265
|
+
attributes: None,
|
|
266
|
+
});
|
|
267
|
+
if let Some(p) = parent {
|
|
268
|
+
self.add_child(p, idx);
|
|
269
|
+
}
|
|
270
|
+
self.container_stack.push(idx);
|
|
271
|
+
idx
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
/// Close the innermost open blockquote container.
|
|
275
|
+
pub fn push_quote_end(&mut self) {
|
|
276
|
+
self.container_stack.pop();
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
/// Record a raw block (e.g. preserved `<script>` or `<style>` content).
|
|
280
|
+
///
|
|
281
|
+
/// Returns the node index.
|
|
282
|
+
pub fn push_raw_block(&mut self, format: &str, content: &str) -> u32 {
|
|
283
|
+
let parent = self.current_structural_parent();
|
|
284
|
+
let id = Self::generate_id("raw_block", format, self.nodes.len() as u32);
|
|
285
|
+
let idx = self.raw_push(DocumentNode {
|
|
286
|
+
id,
|
|
287
|
+
content: NodeContent::RawBlock {
|
|
288
|
+
format: format.to_string(),
|
|
289
|
+
content: content.to_string(),
|
|
290
|
+
},
|
|
291
|
+
parent,
|
|
292
|
+
children: Vec::new(),
|
|
293
|
+
annotations: Vec::new(),
|
|
294
|
+
attributes: None,
|
|
295
|
+
});
|
|
296
|
+
if let Some(p) = parent {
|
|
297
|
+
self.add_child(p, idx);
|
|
298
|
+
}
|
|
299
|
+
idx
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
/// Consume the collector and return the completed [`DocumentStructure`].
|
|
303
|
+
pub fn finish(self) -> DocumentStructure {
|
|
304
|
+
DocumentStructure {
|
|
305
|
+
nodes: self.nodes,
|
|
306
|
+
source_format: Some("html".to_string()),
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
// ── Private helpers ──────────────────────────────────────────────────────
|
|
311
|
+
|
|
312
|
+
/// The effective structural parent for a new node:
|
|
313
|
+
/// list stack > container stack > section stack > None.
|
|
314
|
+
fn current_structural_parent(&self) -> Option<u32> {
|
|
315
|
+
// List items: already handled explicitly in push_list_item.
|
|
316
|
+
// For non-list-item content, prefer the innermost container (blockquote),
|
|
317
|
+
// then innermost section group.
|
|
318
|
+
if let Some(&q) = self.container_stack.last() {
|
|
319
|
+
return Some(q);
|
|
320
|
+
}
|
|
321
|
+
if let Some(&(_, g)) = self.section_stack.last() {
|
|
322
|
+
return Some(g);
|
|
323
|
+
}
|
|
324
|
+
None
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
/// Append a node to the flat list and return its index.
|
|
328
|
+
fn raw_push(&mut self, node: DocumentNode) -> u32 {
|
|
329
|
+
let idx = self.nodes.len() as u32;
|
|
330
|
+
self.nodes.push(node);
|
|
331
|
+
idx
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
/// Record `child_idx` as a child of `parent_idx`.
|
|
335
|
+
fn add_child(&mut self, parent_idx: u32, child_idx: u32) {
|
|
336
|
+
if let Some(parent) = self.nodes.get_mut(parent_idx as usize) {
|
|
337
|
+
parent.children.push(child_idx);
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
/// Generate a deterministic node ID: `"{node_type}-{hash:016x}"`.
///
/// Hashes `(node_type, text prefix, index)` with `DefaultHasher`. The prefix is at most
/// the first 64 bytes of `text`, shortened to the nearest preceding UTF-8 character
/// boundary so that multi-byte characters straddling byte 64 cannot cause a slicing panic.
///
/// NOTE(review): `DefaultHasher`'s algorithm is not guaranteed to stay the same across
/// Rust releases, so IDs are stable within one build but should not be persisted and
/// compared across toolchain versions.
fn generate_id(node_type: &str, text: &str, index: u32) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Back up from byte 64 until the cut lands on a character boundary; offset 0 is
    // always a boundary, so the loop terminates.
    let mut prefix_end = text.len().min(64);
    while !text.is_char_boundary(prefix_end) {
        prefix_end -= 1;
    }

    let mut hasher = DefaultHasher::new();
    node_type.hash(&mut hasher);
    text[..prefix_end].hash(&mut hasher);
    index.hash(&mut hasher);
    let digest = hasher.finish();
    format!("{node_type}-{digest:016x}")
}
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
impl Default for StructureCollector {
|
|
358
|
+
fn default() -> Self {
|
|
359
|
+
Self::new()
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// A heading produces a Group node followed by a Heading node that is its child.
    #[test]
    fn test_heading_creates_group_and_heading() {
        let mut c = StructureCollector::new();
        let heading_idx = c.push_heading(1, "Title", None);
        // Group is at index 0, Heading at index 1.
        assert_eq!(heading_idx, 1);
        assert_eq!(c.nodes.len(), 2);

        let group = &c.nodes[0];
        // `matches!` only yields a bool; it must be wrapped in `assert!` to check anything.
        assert!(matches!(
            &group.content,
            NodeContent::Group {
                heading_level: Some(1),
                ..
            }
        ));
        assert!(group.children.contains(&1));

        let heading = &c.nodes[1];
        assert!(matches!(&heading.content, NodeContent::Heading { level: 1, .. }));
        assert_eq!(heading.parent, Some(0));
    }

    /// A heading of equal or higher rank closes every deeper open group.
    #[test]
    fn test_heading_closes_deeper_groups() {
        let mut c = StructureCollector::new();
        c.push_heading(1, "H1", None);
        c.push_heading(2, "H2", None);
        // Now push another H1 — must close the H2 group.
        c.push_heading(1, "H1b", None);
        // After the second H1 there should be 2 open groups gone and 1 new one.
        assert_eq!(c.section_stack.len(), 1);
        let (level, _) = c.section_stack[0];
        assert_eq!(level, 1);
    }

    /// Content pushed after a heading is parented under that heading's group.
    #[test]
    fn test_paragraph_parents_under_section() {
        let mut c = StructureCollector::new();
        c.push_heading(1, "Title", None);
        let p_idx = c.push_paragraph("Some text");
        let para = &c.nodes[p_idx as usize];
        // Parent should be the group node (index 0).
        assert_eq!(para.parent, Some(0));
    }

    /// List items are linked to the open list in both directions.
    #[test]
    fn test_list_items_attach_to_list() {
        let mut c = StructureCollector::new();
        let list_idx = c.push_list_start(false);
        let item_idx = c.push_list_item("Item 1");
        c.push_list_end();
        assert_eq!(c.nodes[item_idx as usize].parent, Some(list_idx));
        let list = &c.nodes[list_idx as usize];
        assert!(list.children.contains(&item_idx));
    }

    /// Content pushed while a blockquote is open is parented under the quote.
    #[test]
    fn test_quote_container() {
        let mut c = StructureCollector::new();
        let q_idx = c.push_quote_start();
        let p_idx = c.push_paragraph("Quoted text");
        c.push_quote_end();
        assert_eq!(c.nodes[p_idx as usize].parent, Some(q_idx));
    }

    /// `finish` hands back all recorded nodes and tags the source format as HTML.
    #[test]
    fn test_finish_returns_document_structure() {
        let mut c = StructureCollector::new();
        c.push_heading(1, "Title", None);
        c.push_paragraph("Text");
        let doc = c.finish();
        assert_eq!(doc.source_format, Some("html".to_string()));
        assert_eq!(doc.nodes.len(), 3); // Group + Heading + Paragraph
    }
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
//! Structured table types aligned with kreuzberg's `TableGrid`.
|
|
2
|
+
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
|
|
5
|
+
/// A structured table grid with cell-level data including spans.
|
|
6
|
+
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
|
7
|
+
pub struct TableGrid {
|
|
8
|
+
/// Number of rows.
|
|
9
|
+
pub rows: u32,
|
|
10
|
+
/// Number of columns.
|
|
11
|
+
pub cols: u32,
|
|
12
|
+
/// All cells in the table (may be fewer than rows*cols due to spans).
|
|
13
|
+
pub cells: Vec<GridCell>,
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/// A single cell in a table grid.
|
|
17
|
+
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
|
18
|
+
pub struct GridCell {
|
|
19
|
+
/// The text content of the cell.
|
|
20
|
+
pub content: String,
|
|
21
|
+
/// 0-indexed row position.
|
|
22
|
+
pub row: u32,
|
|
23
|
+
/// 0-indexed column position.
|
|
24
|
+
pub col: u32,
|
|
25
|
+
/// Number of rows this cell spans (default 1).
|
|
26
|
+
#[serde(default = "default_span")]
|
|
27
|
+
pub row_span: u32,
|
|
28
|
+
/// Number of columns this cell spans (default 1).
|
|
29
|
+
#[serde(default = "default_span")]
|
|
30
|
+
pub col_span: u32,
|
|
31
|
+
/// Whether this is a header cell (`<th>`).
|
|
32
|
+
#[serde(default)]
|
|
33
|
+
pub is_header: bool,
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/// Serde default for `row_span` / `col_span`: a cell covers exactly one row or column.
fn default_span() -> u32 {
    const SINGLE_CELL: u32 = 1;
    SINGLE_CELL
}
|
|
39
|
+
|
|
40
|
+
/// A top-level extracted table with both structured data and markdown representation.
|
|
41
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
42
|
+
pub struct TableData {
|
|
43
|
+
/// The structured table grid.
|
|
44
|
+
pub grid: TableGrid,
|
|
45
|
+
/// The markdown rendering of this table.
|
|
46
|
+
pub markdown: String,
|
|
47
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
//! Processing warning types for non-fatal issues during conversion.
|
|
2
|
+
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
|
|
5
|
+
/// A non-fatal warning generated during HTML processing.
|
|
6
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
7
|
+
pub struct ProcessingWarning {
|
|
8
|
+
/// Human-readable warning message.
|
|
9
|
+
pub message: String,
|
|
10
|
+
/// The category of warning.
|
|
11
|
+
pub kind: WarningKind,
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
/// Categories of processing warnings.
|
|
15
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
16
|
+
#[serde(rename_all = "snake_case")]
|
|
17
|
+
pub enum WarningKind {
|
|
18
|
+
/// An image could not be extracted (e.g. invalid data URI, unsupported format).
|
|
19
|
+
ImageExtractionFailed,
|
|
20
|
+
/// The input encoding was not recognized; fell back to UTF-8.
|
|
21
|
+
EncodingFallback,
|
|
22
|
+
/// The input was truncated due to size limits.
|
|
23
|
+
TruncatedInput,
|
|
24
|
+
/// The HTML was malformed but processing continued with best effort.
|
|
25
|
+
MalformedHtml,
|
|
26
|
+
/// Sanitization was applied to remove potentially unsafe content.
|
|
27
|
+
SanitizationApplied,
|
|
28
|
+
}
|
|
@@ -35,13 +35,7 @@ mod default_impl;
|
|
|
35
35
|
mod traits;
|
|
36
36
|
mod types;
|
|
37
37
|
|
|
38
|
-
#[cfg(feature = "async-visitor")]
|
|
39
|
-
mod async_traits;
|
|
40
|
-
|
|
41
38
|
// Re-export all public items from submodules
|
|
42
39
|
pub use default_impl::VisitorHandle;
|
|
43
40
|
pub use traits::HtmlVisitor;
|
|
44
41
|
pub use types::{NodeContext, NodeType, VisitResult};
|
|
45
|
-
|
|
46
|
-
#[cfg(feature = "async-visitor")]
|
|
47
|
-
pub use async_traits::AsyncHtmlVisitor;
|
|
@@ -1,21 +1 @@
|
|
|
1
|
-
//! Callback management
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides helper macros for common visitor patterns and an async-to-sync
|
|
4
|
-
//! visitor bridge for integrating async visitors with synchronous converters.
|
|
5
|
-
|
|
6
|
-
mod bridge;
|
|
7
|
-
mod bridge_visitor;
|
|
8
|
-
mod macros;
|
|
9
|
-
|
|
10
|
-
#[cfg(feature = "async-visitor")]
|
|
11
|
-
pub use bridge::AsyncToSyncVisitorBridge;
|
|
12
|
-
#[cfg(feature = "async-visitor")]
|
|
13
|
-
pub use macros::dispatch_async_visitor;
|
|
14
|
-
|
|
15
|
-
/// Type alias for an async visitor handle (Arc-wrapped `Mutex` for interior mutability).
|
|
16
|
-
///
|
|
17
|
-
/// This allows async visitors to be passed around and shared while still being mutable.
|
|
18
|
-
/// Uses Arc<Mutex<>> instead of Rc<RefCell<>> to enable Send across thread boundaries.
|
|
19
|
-
/// The + Send + 'static bounds allow the visitor to be moved to other threads.
|
|
20
|
-
#[cfg(feature = "async-visitor")]
|
|
21
|
-
pub type AsyncVisitorHandle = std::sync::Arc<tokio::sync::Mutex<dyn crate::visitor::AsyncHtmlVisitor + Send + 'static>>;
|
|
1
|
+
//! Callback management for visitor pattern.
|
|
@@ -22,11 +22,6 @@ pub mod content;
|
|
|
22
22
|
pub mod state;
|
|
23
23
|
pub mod traversal;
|
|
24
24
|
|
|
25
|
-
#[cfg(feature = "async-visitor")]
|
|
26
|
-
pub use callbacks::AsyncVisitorHandle;
|
|
27
25
|
pub use content::VisitorDispatch;
|
|
28
26
|
pub use state::build_node_context;
|
|
29
27
|
pub use traversal::dispatch_visitor;
|
|
30
|
-
|
|
31
|
-
#[cfg(feature = "async-visitor")]
|
|
32
|
-
pub use callbacks::{AsyncToSyncVisitorBridge, dispatch_async_visitor};
|