html-to-markdown 2.30.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -19
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +6 -3
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +126 -52
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -1,335 +0,0 @@
|
|
|
1
|
-
#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
|
|
2
|
-
//! Layout analysis and table reconstruction
|
|
3
|
-
|
|
4
|
-
use crate::hocr::spatial::coords::HocrWord;
|
|
5
|
-
|
|
6
|
-
/// Detect column positions from word positions
|
|
7
|
-
///
|
|
8
|
-
/// Groups words by their x-position and returns the median x-position
|
|
9
|
-
/// for each detected column.
|
|
10
|
-
///
|
|
11
|
-
/// Optimized with O(n log n) complexity using sorted insertion.
|
|
12
|
-
#[must_use]
|
|
13
|
-
pub fn detect_columns(words: &[HocrWord], column_threshold: u32) -> Vec<u32> {
|
|
14
|
-
if words.is_empty() {
|
|
15
|
-
return Vec::new();
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
let mut x_positions: Vec<u32> = words.iter().map(|w| w.left).collect();
|
|
19
|
-
x_positions.sort_unstable();
|
|
20
|
-
|
|
21
|
-
let mut position_groups: Vec<Vec<u32>> = Vec::new();
|
|
22
|
-
let mut current_group = vec![x_positions[0]];
|
|
23
|
-
|
|
24
|
-
for &x_pos in &x_positions[1..] {
|
|
25
|
-
let matches_group = current_group.iter().any(|&pos| x_pos.abs_diff(pos) <= column_threshold);
|
|
26
|
-
|
|
27
|
-
if matches_group {
|
|
28
|
-
current_group.push(x_pos);
|
|
29
|
-
} else {
|
|
30
|
-
position_groups.push(std::mem::replace(&mut current_group, vec![x_pos]));
|
|
31
|
-
}
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
if !current_group.is_empty() {
|
|
35
|
-
position_groups.push(current_group);
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
let mut columns: Vec<u32> = position_groups
|
|
39
|
-
.iter()
|
|
40
|
-
.map(|group| {
|
|
41
|
-
let mid = group.len() / 2;
|
|
42
|
-
group[mid]
|
|
43
|
-
})
|
|
44
|
-
.collect();
|
|
45
|
-
|
|
46
|
-
columns.sort_unstable();
|
|
47
|
-
columns
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
/// Detect row positions from word positions
|
|
51
|
-
///
|
|
52
|
-
/// Groups words by their vertical center position and returns the median
|
|
53
|
-
/// y-position for each detected row.
|
|
54
|
-
///
|
|
55
|
-
/// Optimized with O(n log n) complexity using sorted insertion.
|
|
56
|
-
#[must_use]
|
|
57
|
-
#[allow(clippy::cast_possible_truncation)]
|
|
58
|
-
pub fn detect_rows(words: &[HocrWord], row_threshold_ratio: f64) -> Vec<u32> {
|
|
59
|
-
if words.is_empty() {
|
|
60
|
-
return Vec::new();
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
let mut heights: Vec<u32> = words.iter().map(|w| w.height).collect();
|
|
64
|
-
heights.sort_unstable();
|
|
65
|
-
let median_height = heights[heights.len() / 2];
|
|
66
|
-
let row_threshold = f64::from(median_height) * row_threshold_ratio;
|
|
67
|
-
|
|
68
|
-
let mut y_centers: Vec<f64> = words.iter().map(HocrWord::y_center).collect();
|
|
69
|
-
y_centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
|
70
|
-
|
|
71
|
-
let mut position_groups: Vec<Vec<f64>> = Vec::new();
|
|
72
|
-
let mut current_group = vec![y_centers[0]];
|
|
73
|
-
|
|
74
|
-
for &y_center in &y_centers[1..] {
|
|
75
|
-
let matches_group = current_group.iter().any(|&pos| (y_center - pos).abs() <= row_threshold);
|
|
76
|
-
|
|
77
|
-
if matches_group {
|
|
78
|
-
current_group.push(y_center);
|
|
79
|
-
} else {
|
|
80
|
-
position_groups.push(std::mem::replace(&mut current_group, vec![y_center]));
|
|
81
|
-
}
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
if !current_group.is_empty() {
|
|
85
|
-
position_groups.push(current_group);
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
let mut rows: Vec<u32> = position_groups
|
|
89
|
-
.iter()
|
|
90
|
-
.map(|group| {
|
|
91
|
-
let mid = group.len() / 2;
|
|
92
|
-
group[mid] as u32
|
|
93
|
-
})
|
|
94
|
-
.collect();
|
|
95
|
-
|
|
96
|
-
rows.sort_unstable();
|
|
97
|
-
rows
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
/// Remove rows and columns that contain only blank cells.
///
/// A cell is blank when it is empty after trimming whitespace. A column is
/// kept when at least one row has a non-blank cell at that index; a row is
/// kept when it has at least one non-blank cell. Cell contents and the
/// relative order of the surviving rows and columns are unchanged.
///
/// Rows may have different lengths: the column mask is sized from the widest
/// row, so a row longer than the first one cannot index past the mask.
fn remove_empty_rows_and_columns(table: Vec<Vec<String>>) -> Vec<Vec<String>> {
    let num_cols = table.iter().map(Vec::len).max().unwrap_or(0);

    // keep_col[i] becomes true once any row has visible content in column i.
    let mut keep_col = vec![false; num_cols];
    for row in &table {
        for (keep, cell) in keep_col.iter_mut().zip(row) {
            if !cell.trim().is_empty() {
                *keep = true;
            }
        }
    }

    table
        .into_iter()
        .filter(|row| row.iter().any(|cell| !cell.trim().is_empty()))
        .map(|row| {
            row.into_iter()
                .zip(&keep_col)
                .filter_map(|(cell, &keep)| keep.then_some(cell))
                .collect()
        })
        .collect()
}
|
|
129
|
-
|
|
130
|
-
/// Find which row a word belongs to based on its y-center
|
|
131
|
-
#[allow(clippy::cast_possible_truncation)]
|
|
132
|
-
fn find_row_index(row_positions: &[u32], word: &HocrWord) -> Option<usize> {
|
|
133
|
-
let y_center = word.y_center() as u32;
|
|
134
|
-
|
|
135
|
-
row_positions
|
|
136
|
-
.iter()
|
|
137
|
-
.enumerate()
|
|
138
|
-
.min_by_key(|&(_, row_y)| row_y.abs_diff(y_center))
|
|
139
|
-
.map(|(idx, _)| idx)
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
/// Find which column a word belongs to based on its x-position
|
|
143
|
-
fn find_column_index(col_positions: &[u32], word: &HocrWord) -> Option<usize> {
|
|
144
|
-
let x_pos = word.left;
|
|
145
|
-
|
|
146
|
-
col_positions
|
|
147
|
-
.iter()
|
|
148
|
-
.enumerate()
|
|
149
|
-
.min_by_key(|&(_, col_x)| col_x.abs_diff(x_pos))
|
|
150
|
-
.map(|(idx, _)| idx)
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
/// Reconstruct table structure from words
|
|
154
|
-
///
|
|
155
|
-
/// Takes detected words and reconstructs a 2D table by:
|
|
156
|
-
/// 1. Detecting column and row positions
|
|
157
|
-
/// 2. Assigning words to cells based on position
|
|
158
|
-
/// 3. Combining words within the same cell
|
|
159
|
-
#[must_use]
|
|
160
|
-
pub fn reconstruct_table(words: &[HocrWord], column_threshold: u32, row_threshold_ratio: f64) -> Vec<Vec<String>> {
|
|
161
|
-
if words.is_empty() {
|
|
162
|
-
return Vec::new();
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
let col_positions = detect_columns(words, column_threshold);
|
|
166
|
-
let row_positions = detect_rows(words, row_threshold_ratio);
|
|
167
|
-
|
|
168
|
-
if col_positions.is_empty() || row_positions.is_empty() {
|
|
169
|
-
return Vec::new();
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
let num_rows = row_positions.len();
|
|
173
|
-
let num_cols = col_positions.len();
|
|
174
|
-
let mut table: Vec<Vec<Vec<String>>> = vec![vec![vec![]; num_cols]; num_rows];
|
|
175
|
-
|
|
176
|
-
for word in words {
|
|
177
|
-
if let (Some(r), Some(c)) = (
|
|
178
|
-
find_row_index(&row_positions, word),
|
|
179
|
-
find_column_index(&col_positions, word),
|
|
180
|
-
) {
|
|
181
|
-
if r < num_rows && c < num_cols {
|
|
182
|
-
table[r][c].push(word.text.clone());
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
let result: Vec<Vec<String>> = table
|
|
188
|
-
.into_iter()
|
|
189
|
-
.map(|row| {
|
|
190
|
-
row.into_iter()
|
|
191
|
-
.map(|cell_words| {
|
|
192
|
-
if cell_words.is_empty() {
|
|
193
|
-
String::new()
|
|
194
|
-
} else {
|
|
195
|
-
cell_words.join(" ")
|
|
196
|
-
}
|
|
197
|
-
})
|
|
198
|
-
.collect()
|
|
199
|
-
})
|
|
200
|
-
.collect();
|
|
201
|
-
|
|
202
|
-
remove_empty_rows_and_columns(result)
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a word fixture; every fixture uses the same confidence value.
    fn word(text: &str, left: u32, top: u32, width: u32, height: u32) -> HocrWord {
        HocrWord {
            text: text.to_string(),
            left,
            top,
            width,
            height,
            confidence: 95.0,
        }
    }

    #[test]
    fn test_detect_columns() {
        // "A" and "C" start within the threshold of each other; "B" is far to the right.
        let words = vec![
            word("A", 100, 50, 20, 30),
            word("B", 200, 50, 20, 30),
            word("C", 105, 100, 20, 30),
        ];

        let columns = detect_columns(&words, 50);

        assert_eq!(columns.len(), 2);
        assert!(columns.contains(&100) || columns.contains(&105));
        assert!(columns.contains(&200));
    }

    #[test]
    fn test_reconstruct_simple_table() {
        // Two rows by two columns, one word per cell.
        let words = vec![
            word("Name", 100, 50, 50, 20),
            word("Age", 200, 50, 50, 20),
            word("Alice", 100, 100, 50, 20),
            word("30", 200, 100, 50, 20),
        ];

        let table = reconstruct_table(&words, 50, 0.5);

        assert_eq!(table.len(), 2);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "Name");
        assert_eq!(table[0][1], "Age");
        assert_eq!(table[1][0], "Alice");
        assert_eq!(table[1][1], "30");
    }

    #[test]
    fn test_reconstruct_table_with_multi_word_cells() {
        // A single row in which each cell is made of two adjacent words.
        let words = vec![
            word("First", 100, 50, 30, 20),
            word("Name", 135, 50, 30, 20),
            word("Last", 200, 50, 30, 20),
            word("Name", 235, 50, 30, 20),
        ];

        let table = reconstruct_table(&words, 50, 0.5);

        assert_eq!(table.len(), 1);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "First Name");
        assert_eq!(table[0][1], "Last Name");
    }
}
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
|
|
2
|
-
//! Spatial table reconstruction from hOCR bounding box coordinates
|
|
3
|
-
//!
|
|
4
|
-
//! This module provides functions to detect and reconstruct tabular data from OCR'd text
|
|
5
|
-
//! by analyzing the spatial positions of words using their bounding box (bbox) coordinates.
|
|
6
|
-
|
|
7
|
-
mod coords;
|
|
8
|
-
mod grouping;
|
|
9
|
-
mod layout;
|
|
10
|
-
mod output;
|
|
11
|
-
|
|
12
|
-
pub use coords::{HocrWord, parse_bbox, parse_confidence};
|
|
13
|
-
pub use grouping::extract_hocr_words;
|
|
14
|
-
pub use layout::reconstruct_table;
|
|
15
|
-
pub use output::table_to_markdown;
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
|
|
2
|
-
//! Markdown table output formatting
|
|
3
|
-
|
|
4
|
-
/// Render a table as a pipe-delimited Markdown table.
///
/// The first row is treated as the header and is followed by a `| --- |`
/// separator with one entry per header cell. Every row ends with a newline.
/// Returns an empty string when the table has no rows or the first row has no
/// cells.
///
/// Cell text is sanitised so that it cannot break the table structure:
/// `|` is escaped as `\|`, and a line break (`\n`, `\r`, or `\r\n`) is
/// replaced by a single space, because a raw line break would end the row.
#[must_use]
pub fn table_to_markdown(table: &[Vec<String>]) -> String {
    let Some(header) = table.first() else {
        return String::new();
    };
    if header.is_empty() {
        return String::new();
    }

    let mut markdown = String::new();

    for (row_idx, row) in table.iter().enumerate() {
        markdown.push('|');
        for cell in row {
            markdown.push(' ');
            let mut chars = cell.chars().peekable();
            while let Some(ch) = chars.next() {
                match ch {
                    '|' => markdown.push_str("\\|"),
                    // CRLF: skip the '\r'; the following '\n' emits the single space.
                    '\r' if chars.peek() == Some(&'\n') => {}
                    '\n' | '\r' => markdown.push(' '),
                    other => markdown.push(other),
                }
            }
            markdown.push_str(" |");
        }
        markdown.push('\n');

        // The separator goes directly under the header row.
        if row_idx == 0 {
            markdown.push('|');
            markdown.push_str(&" --- |".repeat(header.len()));
            markdown.push('\n');
        }
    }

    markdown
}
|
|
38
|
-
|
|
39
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// Build one table row from string literals.
    fn row(cells: &[&str]) -> Vec<String> {
        cells.iter().map(|cell| (*cell).to_string()).collect()
    }

    #[test]
    fn test_table_to_markdown() {
        let table = vec![row(&["Header1", "Header2"]), row(&["Cell1", "Cell2"])];

        let markdown = table_to_markdown(&table);

        // Header row, separator row and body row must all be present.
        for expected in ["| Header1 | Header2 |", "| --- | --- |", "| Cell1 | Cell2 |"] {
            assert!(markdown.contains(expected));
        }
    }

    #[test]
    fn test_table_to_markdown_escape_pipes() {
        let table = vec![row(&["A|B", "C"])];

        let markdown = table_to_markdown(&table);

        // A literal pipe inside a cell is backslash-escaped.
        assert!(markdown.contains("A\\|B"));
    }
}
|
|
@@ -1,269 +0,0 @@
|
|
|
1
|
-
#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
|
|
2
|
-
//! hOCR 1.2 type definitions
|
|
3
|
-
//!
|
|
4
|
-
//! Complete type system for hOCR 1.2 specification elements and properties.
|
|
5
|
-
|
|
6
|
-
use ahash::AHashMap as HashMap;
|
|
7
|
-
|
|
8
|
-
/// Every hOCR 1.2 element type, identified in markup by its class name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HocrElementType {
    /// Abstract or summary of the document
    OcrAbstract,
    /// Author attribution
    OcrAuthor,
    /// Block quotation
    OcrBlockquote,
    /// Caption of an image
    OcrCaption,
    /// Chapter division
    OcrChapter,
    /// Root element of the document
    OcrDocument,
    /// Paragraph of text
    OcrPar,
    /// Major part or section
    OcrPart,
    /// Section heading
    OcrSection,
    /// Subsection heading
    OcrSubsection,
    /// Subsubsection heading
    OcrSubsubsection,
    /// Title of the document
    OcrTitle,

    /// Column area of a multi-column layout
    OcrCarea,
    /// Column within a page
    OcrColumn,
    /// Line of text
    OcrLine,
    /// Linearization element
    OcrLinear,
    /// Page element
    OcrPage,
    /// Separator or divider
    OcrSeparator,

    /// Chemical formula
    OcrChem,
    /// Display equation or complex content
    OcrDisplay,
    /// Float element (typically with a caption)
    OcrFloat,
    /// Footer area of a page
    OcrFooter,
    /// Header area of a page
    OcrHeader,
    /// Image element
    OcrImage,
    /// Line drawing or diagram
    OcrLinedrawing,
    /// Mathematical formula
    OcrMath,
    /// Page number marker
    OcrPageno,
    /// Photograph element
    OcrPhoto,
    /// Table element
    OcrTable,
    /// Text float (floating text box)
    OcrTextfloat,
    /// Text image (text rendered as an image)
    OcrTextimage,

    /// Character information
    OcrCinfo,
    /// Decorative capital letter
    OcrDropcap,
    /// Single glyph
    OcrGlyph,
    /// Multiple glyphs
    OcrGlyphs,
    /// Noise or artifacts
    OcrNoise,
    /// XY-cut analysis segment
    OcrXycut,

    /// Block-level element
    OcrxBlock,
    /// OCR word line
    OcrxLine,
    /// Individual word element
    OcrxWord,
}

impl HocrElementType {
    /// Look up the element type for an hOCR class name.
    ///
    /// The match is exact and case-sensitive (for example `"ocr_page"` or
    /// `"ocrx_word"`). Returns `None` for any other string.
    #[must_use]
    pub fn from_class(class: &str) -> Option<Self> {
        let element_type = match class {
            "ocr_abstract" => Self::OcrAbstract,
            "ocr_author" => Self::OcrAuthor,
            "ocr_blockquote" => Self::OcrBlockquote,
            "ocr_caption" => Self::OcrCaption,
            "ocr_chapter" => Self::OcrChapter,
            "ocr_document" => Self::OcrDocument,
            "ocr_par" => Self::OcrPar,
            "ocr_part" => Self::OcrPart,
            "ocr_section" => Self::OcrSection,
            "ocr_subsection" => Self::OcrSubsection,
            "ocr_subsubsection" => Self::OcrSubsubsection,
            "ocr_title" => Self::OcrTitle,

            "ocr_carea" => Self::OcrCarea,
            "ocr_column" => Self::OcrColumn,
            "ocr_line" => Self::OcrLine,
            "ocr_linear" => Self::OcrLinear,
            "ocr_page" => Self::OcrPage,
            "ocr_separator" => Self::OcrSeparator,

            "ocr_chem" => Self::OcrChem,
            "ocr_display" => Self::OcrDisplay,
            "ocr_float" => Self::OcrFloat,
            "ocr_footer" => Self::OcrFooter,
            "ocr_header" => Self::OcrHeader,
            "ocr_image" => Self::OcrImage,
            "ocr_linedrawing" => Self::OcrLinedrawing,
            "ocr_math" => Self::OcrMath,
            "ocr_pageno" => Self::OcrPageno,
            "ocr_photo" => Self::OcrPhoto,
            "ocr_table" => Self::OcrTable,
            "ocr_textfloat" => Self::OcrTextfloat,
            "ocr_textimage" => Self::OcrTextimage,

            "ocr_cinfo" => Self::OcrCinfo,
            "ocr_dropcap" => Self::OcrDropcap,
            "ocr_glyph" => Self::OcrGlyph,
            "ocr_glyphs" => Self::OcrGlyphs,
            "ocr_noise" => Self::OcrNoise,
            "ocr_xycut" => Self::OcrXycut,

            "ocrx_block" => Self::OcrxBlock,
            "ocrx_line" => Self::OcrxLine,
            "ocrx_word" => Self::OcrxWord,

            // Anything else is not an hOCR class name known to this enum.
            _ => return None,
        };

        Some(element_type)
    }
}
|
|
151
|
-
|
|
152
|
-
/// Axis-aligned bounding box described by two corner points.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BBox {
    /// x-coordinate of the left edge (pixels)
    pub x1: u32,
    /// y-coordinate of the top edge (pixels)
    pub y1: u32,
    /// x-coordinate of the right edge (pixels)
    pub x2: u32,
    /// y-coordinate of the bottom edge (pixels)
    pub y2: u32,
}

impl BBox {
    /// Distance from `start` to `end`, or 0 when `end` lies before `start`.
    const fn span(start: u32, end: u32) -> u32 {
        end.saturating_sub(start)
    }

    /// Horizontal extent from the left edge to the right edge.
    ///
    /// Yields 0 when `x2` is smaller than `x1`.
    #[must_use]
    pub const fn width(&self) -> u32 {
        Self::span(self.x1, self.x2)
    }

    /// Vertical extent from the top edge to the bottom edge.
    ///
    /// Yields 0 when `y2` is smaller than `y1`.
    #[must_use]
    pub const fn height(&self) -> u32 {
        Self::span(self.y1, self.y2)
    }
}
|
|
178
|
-
|
|
179
|
-
/// Text baseline of an OCR element, used for text alignment.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Baseline {
    /// Slope of the baseline relative to the horizontal
    pub slope: f64,
    /// Vertical offset of the baseline, in pixels
    pub constant: i32,
}
|
|
187
|
-
|
|
188
|
-
/// All hOCR properties extracted from element attributes
|
|
189
|
-
#[derive(Debug, Clone, Default)]
|
|
190
|
-
pub struct HocrProperties {
|
|
191
|
-
/// Bounding box (left, top, right, bottom) coordinates
|
|
192
|
-
pub bbox: Option<BBox>,
|
|
193
|
-
/// Baseline properties (slope, constant offset)
|
|
194
|
-
pub baseline: Option<Baseline>,
|
|
195
|
-
/// Text rotation angle in degrees
|
|
196
|
-
pub textangle: Option<f64>,
|
|
197
|
-
/// Polygon coordinates for non-rectangular regions
|
|
198
|
-
pub poly: Option<Vec<(i32, i32)>>,
|
|
199
|
-
|
|
200
|
-
/// Word-level confidence score (0-100)
|
|
201
|
-
pub x_wconf: Option<f64>,
|
|
202
|
-
/// Per-character confidence scores
|
|
203
|
-
pub x_confs: Vec<f64>,
|
|
204
|
-
/// Natural language processing results
|
|
205
|
-
pub nlp: Vec<f64>,
|
|
206
|
-
|
|
207
|
-
/// Font name or family
|
|
208
|
-
pub x_font: Option<String>,
|
|
209
|
-
/// Font size in points
|
|
210
|
-
pub x_fsize: Option<u32>,
|
|
211
|
-
|
|
212
|
-
/// Reading order index for document structure
|
|
213
|
-
pub order: Option<u32>,
|
|
214
|
-
/// Column flow direction (ltr, rtl, etc.)
|
|
215
|
-
pub cflow: Option<String>,
|
|
216
|
-
/// Hard line break indicator
|
|
217
|
-
pub hardbreak: bool,
|
|
218
|
-
|
|
219
|
-
/// Cut lines for layout analysis
|
|
220
|
-
pub cuts: Vec<Vec<u32>>,
|
|
221
|
-
/// Alternative bounding boxes for multi-part elements
|
|
222
|
-
pub x_bboxes: Vec<BBox>,
|
|
223
|
-
|
|
224
|
-
/// Image path or data URI
|
|
225
|
-
pub image: Option<String>,
|
|
226
|
-
/// MD5 hash of image content
|
|
227
|
-
pub imagemd5: Option<String>,
|
|
228
|
-
/// Physical page number
|
|
229
|
-
pub ppageno: Option<u32>,
|
|
230
|
-
/// Logical page number/label
|
|
231
|
-
pub lpageno: Option<String>,
|
|
232
|
-
/// Scanner resolution (`dpi_x`, `dpi_y`)
|
|
233
|
-
pub scan_res: Option<(u32, u32)>,
|
|
234
|
-
/// Image source file paths
|
|
235
|
-
pub x_source: Vec<String>,
|
|
236
|
-
/// Scanner device identifier
|
|
237
|
-
pub x_scanner: Option<String>,
|
|
238
|
-
|
|
239
|
-
/// Additional custom properties
|
|
240
|
-
pub other: HashMap<String, String>,
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
/// A complete hOCR element with type, properties, text content, and child elements
|
|
244
|
-
#[derive(Debug, Clone)]
|
|
245
|
-
pub struct HocrElement {
|
|
246
|
-
/// The semantic type of this hOCR element
|
|
247
|
-
pub element_type: HocrElementType,
|
|
248
|
-
/// All extracted properties (bbox, confidence, etc.)
|
|
249
|
-
pub properties: HocrProperties,
|
|
250
|
-
/// Text content of this element
|
|
251
|
-
pub text: String,
|
|
252
|
-
/// Child elements in the document tree
|
|
253
|
-
pub children: Vec<Self>,
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
/// Document-level hOCR metadata taken from the document's properties.
#[derive(Debug, Clone, Default)]
pub struct HocrMetadata {
    /// OCR system that produced the document (name and version)
    pub ocr_system: Option<String>,
    /// Capabilities the OCR system supports (e.g., "`ocr_page`", "`ocr_carea`")
    pub ocr_capabilities: Vec<String>,
    /// Number of pages in the OCR'd document
    pub ocr_number_of_pages: Option<u32>,
    /// Languages found in the document (ISO 639 codes)
    pub ocr_langs: Vec<String>,
    /// Scripts used in the document (ISO 15924 codes)
    pub ocr_scripts: Vec<String>,
}
|