html-to-markdown 2.30.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +6 -19
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +6 -3
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,335 +0,0 @@
1
- #![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
- //! Layout analysis and table reconstruction
3
-
4
- use crate::hocr::spatial::coords::HocrWord;
5
-
6
- /// Detect column positions from word positions
7
- ///
8
- /// Groups words by their x-position and returns the median x-position
9
- /// for each detected column.
10
- ///
11
- /// Optimized with O(n log n) complexity using sorted insertion.
12
- #[must_use]
13
- pub fn detect_columns(words: &[HocrWord], column_threshold: u32) -> Vec<u32> {
14
- if words.is_empty() {
15
- return Vec::new();
16
- }
17
-
18
- let mut x_positions: Vec<u32> = words.iter().map(|w| w.left).collect();
19
- x_positions.sort_unstable();
20
-
21
- let mut position_groups: Vec<Vec<u32>> = Vec::new();
22
- let mut current_group = vec![x_positions[0]];
23
-
24
- for &x_pos in &x_positions[1..] {
25
- let matches_group = current_group.iter().any(|&pos| x_pos.abs_diff(pos) <= column_threshold);
26
-
27
- if matches_group {
28
- current_group.push(x_pos);
29
- } else {
30
- position_groups.push(std::mem::replace(&mut current_group, vec![x_pos]));
31
- }
32
- }
33
-
34
- if !current_group.is_empty() {
35
- position_groups.push(current_group);
36
- }
37
-
38
- let mut columns: Vec<u32> = position_groups
39
- .iter()
40
- .map(|group| {
41
- let mid = group.len() / 2;
42
- group[mid]
43
- })
44
- .collect();
45
-
46
- columns.sort_unstable();
47
- columns
48
- }
49
-
50
- /// Detect row positions from word positions
51
- ///
52
- /// Groups words by their vertical center position and returns the median
53
- /// y-position for each detected row.
54
- ///
55
- /// Optimized with O(n log n) complexity using sorted insertion.
56
- #[must_use]
57
- #[allow(clippy::cast_possible_truncation)]
58
- pub fn detect_rows(words: &[HocrWord], row_threshold_ratio: f64) -> Vec<u32> {
59
- if words.is_empty() {
60
- return Vec::new();
61
- }
62
-
63
- let mut heights: Vec<u32> = words.iter().map(|w| w.height).collect();
64
- heights.sort_unstable();
65
- let median_height = heights[heights.len() / 2];
66
- let row_threshold = f64::from(median_height) * row_threshold_ratio;
67
-
68
- let mut y_centers: Vec<f64> = words.iter().map(HocrWord::y_center).collect();
69
- y_centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
70
-
71
- let mut position_groups: Vec<Vec<f64>> = Vec::new();
72
- let mut current_group = vec![y_centers[0]];
73
-
74
- for &y_center in &y_centers[1..] {
75
- let matches_group = current_group.iter().any(|&pos| (y_center - pos).abs() <= row_threshold);
76
-
77
- if matches_group {
78
- current_group.push(y_center);
79
- } else {
80
- position_groups.push(std::mem::replace(&mut current_group, vec![y_center]));
81
- }
82
- }
83
-
84
- if !current_group.is_empty() {
85
- position_groups.push(current_group);
86
- }
87
-
88
- let mut rows: Vec<u32> = position_groups
89
- .iter()
90
- .map(|group| {
91
- let mid = group.len() / 2;
92
- group[mid] as u32
93
- })
94
- .collect();
95
-
96
- rows.sort_unstable();
97
- rows
98
- }
99
-
100
- /// Remove empty rows and columns from table
101
- fn remove_empty_rows_and_columns(table: Vec<Vec<String>>) -> Vec<Vec<String>> {
102
- if table.is_empty() {
103
- return table;
104
- }
105
-
106
- let num_cols = table[0].len();
107
- let mut non_empty_cols: Vec<bool> = vec![false; num_cols];
108
-
109
- for row in &table {
110
- for (col_idx, cell) in row.iter().enumerate() {
111
- if !cell.trim().is_empty() {
112
- non_empty_cols[col_idx] = true;
113
- }
114
- }
115
- }
116
-
117
- table
118
- .into_iter()
119
- .filter(|row| row.iter().any(|cell| !cell.trim().is_empty()))
120
- .map(|row| {
121
- row.into_iter()
122
- .enumerate()
123
- .filter(|(idx, _)| non_empty_cols[*idx])
124
- .map(|(_, cell)| cell)
125
- .collect()
126
- })
127
- .collect()
128
- }
129
-
130
- /// Find which row a word belongs to based on its y-center
131
- #[allow(clippy::cast_possible_truncation)]
132
- fn find_row_index(row_positions: &[u32], word: &HocrWord) -> Option<usize> {
133
- let y_center = word.y_center() as u32;
134
-
135
- row_positions
136
- .iter()
137
- .enumerate()
138
- .min_by_key(|&(_, row_y)| row_y.abs_diff(y_center))
139
- .map(|(idx, _)| idx)
140
- }
141
-
142
- /// Find which column a word belongs to based on its x-position
143
- fn find_column_index(col_positions: &[u32], word: &HocrWord) -> Option<usize> {
144
- let x_pos = word.left;
145
-
146
- col_positions
147
- .iter()
148
- .enumerate()
149
- .min_by_key(|&(_, col_x)| col_x.abs_diff(x_pos))
150
- .map(|(idx, _)| idx)
151
- }
152
-
153
- /// Reconstruct table structure from words
154
- ///
155
- /// Takes detected words and reconstructs a 2D table by:
156
- /// 1. Detecting column and row positions
157
- /// 2. Assigning words to cells based on position
158
- /// 3. Combining words within the same cell
159
- #[must_use]
160
- pub fn reconstruct_table(words: &[HocrWord], column_threshold: u32, row_threshold_ratio: f64) -> Vec<Vec<String>> {
161
- if words.is_empty() {
162
- return Vec::new();
163
- }
164
-
165
- let col_positions = detect_columns(words, column_threshold);
166
- let row_positions = detect_rows(words, row_threshold_ratio);
167
-
168
- if col_positions.is_empty() || row_positions.is_empty() {
169
- return Vec::new();
170
- }
171
-
172
- let num_rows = row_positions.len();
173
- let num_cols = col_positions.len();
174
- let mut table: Vec<Vec<Vec<String>>> = vec![vec![vec![]; num_cols]; num_rows];
175
-
176
- for word in words {
177
- if let (Some(r), Some(c)) = (
178
- find_row_index(&row_positions, word),
179
- find_column_index(&col_positions, word),
180
- ) {
181
- if r < num_rows && c < num_cols {
182
- table[r][c].push(word.text.clone());
183
- }
184
- }
185
- }
186
-
187
- let result: Vec<Vec<String>> = table
188
- .into_iter()
189
- .map(|row| {
190
- row.into_iter()
191
- .map(|cell_words| {
192
- if cell_words.is_empty() {
193
- String::new()
194
- } else {
195
- cell_words.join(" ")
196
- }
197
- })
198
- .collect()
199
- })
200
- .collect();
201
-
202
- remove_empty_rows_and_columns(result)
203
- }
204
-
205
- #[cfg(test)]
206
- mod tests {
207
- use super::*;
208
-
209
- #[test]
210
- fn test_detect_columns() {
211
- let words = vec![
212
- HocrWord {
213
- text: "A".to_string(),
214
- left: 100,
215
- top: 50,
216
- width: 20,
217
- height: 30,
218
- confidence: 95.0,
219
- },
220
- HocrWord {
221
- text: "B".to_string(),
222
- left: 200,
223
- top: 50,
224
- width: 20,
225
- height: 30,
226
- confidence: 95.0,
227
- },
228
- HocrWord {
229
- text: "C".to_string(),
230
- left: 105,
231
- top: 100,
232
- width: 20,
233
- height: 30,
234
- confidence: 95.0,
235
- },
236
- ];
237
-
238
- let columns = detect_columns(&words, 50);
239
- assert_eq!(columns.len(), 2);
240
- assert!(columns.contains(&100) || columns.contains(&105));
241
- assert!(columns.contains(&200));
242
- }
243
-
244
- #[test]
245
- fn test_reconstruct_simple_table() {
246
- let words = vec![
247
- HocrWord {
248
- text: "Name".to_string(),
249
- left: 100,
250
- top: 50,
251
- width: 50,
252
- height: 20,
253
- confidence: 95.0,
254
- },
255
- HocrWord {
256
- text: "Age".to_string(),
257
- left: 200,
258
- top: 50,
259
- width: 50,
260
- height: 20,
261
- confidence: 95.0,
262
- },
263
- HocrWord {
264
- text: "Alice".to_string(),
265
- left: 100,
266
- top: 100,
267
- width: 50,
268
- height: 20,
269
- confidence: 95.0,
270
- },
271
- HocrWord {
272
- text: "30".to_string(),
273
- left: 200,
274
- top: 100,
275
- width: 50,
276
- height: 20,
277
- confidence: 95.0,
278
- },
279
- ];
280
-
281
- let table = reconstruct_table(&words, 50, 0.5);
282
-
283
- assert_eq!(table.len(), 2);
284
- assert_eq!(table[0].len(), 2);
285
- assert_eq!(table[0][0], "Name");
286
- assert_eq!(table[0][1], "Age");
287
- assert_eq!(table[1][0], "Alice");
288
- assert_eq!(table[1][1], "30");
289
- }
290
-
291
- #[test]
292
- fn test_reconstruct_table_with_multi_word_cells() {
293
- let words = vec![
294
- HocrWord {
295
- text: "First".to_string(),
296
- left: 100,
297
- top: 50,
298
- width: 30,
299
- height: 20,
300
- confidence: 95.0,
301
- },
302
- HocrWord {
303
- text: "Name".to_string(),
304
- left: 135,
305
- top: 50,
306
- width: 30,
307
- height: 20,
308
- confidence: 95.0,
309
- },
310
- HocrWord {
311
- text: "Last".to_string(),
312
- left: 200,
313
- top: 50,
314
- width: 30,
315
- height: 20,
316
- confidence: 95.0,
317
- },
318
- HocrWord {
319
- text: "Name".to_string(),
320
- left: 235,
321
- top: 50,
322
- width: 30,
323
- height: 20,
324
- confidence: 95.0,
325
- },
326
- ];
327
-
328
- let table = reconstruct_table(&words, 50, 0.5);
329
-
330
- assert_eq!(table.len(), 1);
331
- assert_eq!(table[0].len(), 2);
332
- assert_eq!(table[0][0], "First Name");
333
- assert_eq!(table[0][1], "Last Name");
334
- }
335
- }
@@ -1,15 +0,0 @@
1
- #![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
- //! Spatial table reconstruction from hOCR bounding box coordinates
3
- //!
4
- //! This module provides functions to detect and reconstruct tabular data from OCR'd text
5
- //! by analyzing the spatial positions of words using their bounding box (bbox) coordinates.
6
-
7
- mod coords;
8
- mod grouping;
9
- mod layout;
10
- mod output;
11
-
12
- pub use coords::{HocrWord, parse_bbox, parse_confidence};
13
- pub use grouping::extract_hocr_words;
14
- pub use layout::reconstruct_table;
15
- pub use output::table_to_markdown;
@@ -1,63 +0,0 @@
1
- #![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
- //! Markdown table output formatting
3
-
4
- /// Convert table to markdown format
5
- #[must_use]
6
- pub fn table_to_markdown(table: &[Vec<String>]) -> String {
7
- if table.is_empty() {
8
- return String::new();
9
- }
10
-
11
- let num_cols = table[0].len();
12
- if num_cols == 0 {
13
- return String::new();
14
- }
15
-
16
- let mut markdown = String::new();
17
-
18
- for (row_idx, row) in table.iter().enumerate() {
19
- markdown.push('|');
20
- for cell in row {
21
- markdown.push(' ');
22
- markdown.push_str(&cell.replace('|', "\\|"));
23
- markdown.push_str(" |");
24
- }
25
- markdown.push('\n');
26
-
27
- if row_idx == 0 {
28
- markdown.push('|');
29
- for _ in 0..num_cols {
30
- markdown.push_str(" --- |");
31
- }
32
- markdown.push('\n');
33
- }
34
- }
35
-
36
- markdown
37
- }
38
-
39
- #[cfg(test)]
40
- mod tests {
41
- use super::*;
42
-
43
- #[test]
44
- fn test_table_to_markdown() {
45
- let table = vec![
46
- vec!["Header1".to_string(), "Header2".to_string()],
47
- vec!["Cell1".to_string(), "Cell2".to_string()],
48
- ];
49
-
50
- let markdown = table_to_markdown(&table);
51
- assert!(markdown.contains("| Header1 | Header2 |"));
52
- assert!(markdown.contains("| --- | --- |"));
53
- assert!(markdown.contains("| Cell1 | Cell2 |"));
54
- }
55
-
56
- #[test]
57
- fn test_table_to_markdown_escape_pipes() {
58
- let table = vec![vec!["A|B".to_string(), "C".to_string()]];
59
-
60
- let markdown = table_to_markdown(&table);
61
- assert!(markdown.contains("A\\|B"));
62
- }
63
- }
@@ -1,269 +0,0 @@
1
- #![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
- //! hOCR 1.2 type definitions
3
- //!
4
- //! Complete type system for hOCR 1.2 specification elements and properties.
5
-
6
- use ahash::AHashMap as HashMap;
7
-
8
- /// All hOCR 1.2 element types
9
- #[derive(Debug, Clone, Copy, PartialEq, Eq)]
10
- pub enum HocrElementType {
11
- /// Document abstract or summary
12
- OcrAbstract,
13
- /// Author attribution
14
- OcrAuthor,
15
- /// Block quotation
16
- OcrBlockquote,
17
- /// Image caption
18
- OcrCaption,
19
- /// Chapter division
20
- OcrChapter,
21
- /// Document root element
22
- OcrDocument,
23
- /// Paragraph text
24
- OcrPar,
25
- /// Major part or section
26
- OcrPart,
27
- /// Section heading
28
- OcrSection,
29
- /// Subsection heading
30
- OcrSubsection,
31
- /// Subsubsection heading
32
- OcrSubsubsection,
33
- /// Document title
34
- OcrTitle,
35
-
36
- /// Column area of multi-column layout
37
- OcrCarea,
38
- /// Column within a page
39
- OcrColumn,
40
- /// Text line
41
- OcrLine,
42
- /// Linearization element
43
- OcrLinear,
44
- /// Page element
45
- OcrPage,
46
- /// Separator or divider
47
- OcrSeparator,
48
-
49
- /// Chemical formula
50
- OcrChem,
51
- /// Display equation or complex content
52
- OcrDisplay,
53
- /// Float element (typically with caption)
54
- OcrFloat,
55
- /// Footer area of page
56
- OcrFooter,
57
- /// Header area of page
58
- OcrHeader,
59
- /// Image element
60
- OcrImage,
61
- /// Line drawing or diagram
62
- OcrLinedrawing,
63
- /// Mathematical formula
64
- OcrMath,
65
- /// Page number marker
66
- OcrPageno,
67
- /// Photograph element
68
- OcrPhoto,
69
- /// Table element
70
- OcrTable,
71
- /// Text float (floating text box)
72
- OcrTextfloat,
73
- /// Text image (text rendered as image)
74
- OcrTextimage,
75
-
76
- /// Character information
77
- OcrCinfo,
78
- /// Decorative capital letter
79
- OcrDropcap,
80
- /// Glyph element
81
- OcrGlyph,
82
- /// Multiple glyphs
83
- OcrGlyphs,
84
- /// Noise or artifacts
85
- OcrNoise,
86
- /// `XyZut` analysis segment
87
- OcrXycut,
88
-
89
- /// Block-level element
90
- OcrxBlock,
91
- /// OCR word line
92
- OcrxLine,
93
- /// Individual word element
94
- OcrxWord,
95
- }
96
-
97
- impl HocrElementType {
98
- /// Get element type from class name
99
- #[must_use]
100
- pub fn from_class(class: &str) -> Option<Self> {
101
- match class {
102
- "ocr_abstract" => Some(Self::OcrAbstract),
103
- "ocr_author" => Some(Self::OcrAuthor),
104
- "ocr_blockquote" => Some(Self::OcrBlockquote),
105
- "ocr_caption" => Some(Self::OcrCaption),
106
- "ocr_chapter" => Some(Self::OcrChapter),
107
- "ocr_document" => Some(Self::OcrDocument),
108
- "ocr_par" => Some(Self::OcrPar),
109
- "ocr_part" => Some(Self::OcrPart),
110
- "ocr_section" => Some(Self::OcrSection),
111
- "ocr_subsection" => Some(Self::OcrSubsection),
112
- "ocr_subsubsection" => Some(Self::OcrSubsubsection),
113
- "ocr_title" => Some(Self::OcrTitle),
114
-
115
- "ocr_carea" => Some(Self::OcrCarea),
116
- "ocr_column" => Some(Self::OcrColumn),
117
- "ocr_line" => Some(Self::OcrLine),
118
- "ocr_linear" => Some(Self::OcrLinear),
119
- "ocr_page" => Some(Self::OcrPage),
120
- "ocr_separator" => Some(Self::OcrSeparator),
121
-
122
- "ocr_chem" => Some(Self::OcrChem),
123
- "ocr_display" => Some(Self::OcrDisplay),
124
- "ocr_float" => Some(Self::OcrFloat),
125
- "ocr_footer" => Some(Self::OcrFooter),
126
- "ocr_header" => Some(Self::OcrHeader),
127
- "ocr_image" => Some(Self::OcrImage),
128
- "ocr_linedrawing" => Some(Self::OcrLinedrawing),
129
- "ocr_math" => Some(Self::OcrMath),
130
- "ocr_pageno" => Some(Self::OcrPageno),
131
- "ocr_photo" => Some(Self::OcrPhoto),
132
- "ocr_table" => Some(Self::OcrTable),
133
- "ocr_textfloat" => Some(Self::OcrTextfloat),
134
- "ocr_textimage" => Some(Self::OcrTextimage),
135
-
136
- "ocr_cinfo" => Some(Self::OcrCinfo),
137
- "ocr_dropcap" => Some(Self::OcrDropcap),
138
- "ocr_glyph" => Some(Self::OcrGlyph),
139
- "ocr_glyphs" => Some(Self::OcrGlyphs),
140
- "ocr_noise" => Some(Self::OcrNoise),
141
- "ocr_xycut" => Some(Self::OcrXycut),
142
-
143
- "ocrx_block" => Some(Self::OcrxBlock),
144
- "ocrx_line" => Some(Self::OcrxLine),
145
- "ocrx_word" => Some(Self::OcrxWord),
146
-
147
- _ => None,
148
- }
149
- }
150
- }
151
-
152
- /// Bounding box with corner coordinates
153
- #[derive(Debug, Clone, Copy, PartialEq, Eq)]
154
- pub struct BBox {
155
- /// Left edge x-coordinate (pixels)
156
- pub x1: u32,
157
- /// Top edge y-coordinate (pixels)
158
- pub y1: u32,
159
- /// Right edge x-coordinate (pixels)
160
- pub x2: u32,
161
- /// Bottom edge y-coordinate (pixels)
162
- pub y2: u32,
163
- }
164
-
165
- impl BBox {
166
- /// Calculate the width from left to right edge
167
- #[must_use]
168
- pub const fn width(&self) -> u32 {
169
- self.x2.saturating_sub(self.x1)
170
- }
171
-
172
- /// Calculate the height from top to bottom edge
173
- #[must_use]
174
- pub const fn height(&self) -> u32 {
175
- self.y2.saturating_sub(self.y1)
176
- }
177
- }
178
-
179
- /// Baseline property for text alignment in OCR elements
180
- #[derive(Debug, Clone, Copy, PartialEq)]
181
- pub struct Baseline {
182
- /// Baseline slope relative to horizontal
183
- pub slope: f64,
184
- /// Baseline vertical offset in pixels
185
- pub constant: i32,
186
- }
187
-
188
- /// All hOCR properties extracted from element attributes
189
- #[derive(Debug, Clone, Default)]
190
- pub struct HocrProperties {
191
- /// Bounding box (left, top, right, bottom) coordinates
192
- pub bbox: Option<BBox>,
193
- /// Baseline properties (slope, constant offset)
194
- pub baseline: Option<Baseline>,
195
- /// Text rotation angle in degrees
196
- pub textangle: Option<f64>,
197
- /// Polygon coordinates for non-rectangular regions
198
- pub poly: Option<Vec<(i32, i32)>>,
199
-
200
- /// Word-level confidence score (0-100)
201
- pub x_wconf: Option<f64>,
202
- /// Per-character confidence scores
203
- pub x_confs: Vec<f64>,
204
- /// Natural language processing results
205
- pub nlp: Vec<f64>,
206
-
207
- /// Font name or family
208
- pub x_font: Option<String>,
209
- /// Font size in points
210
- pub x_fsize: Option<u32>,
211
-
212
- /// Reading order index for document structure
213
- pub order: Option<u32>,
214
- /// Column flow direction (ltr, rtl, etc.)
215
- pub cflow: Option<String>,
216
- /// Hard line break indicator
217
- pub hardbreak: bool,
218
-
219
- /// Cut lines for layout analysis
220
- pub cuts: Vec<Vec<u32>>,
221
- /// Alternative bounding boxes for multi-part elements
222
- pub x_bboxes: Vec<BBox>,
223
-
224
- /// Image path or data URI
225
- pub image: Option<String>,
226
- /// MD5 hash of image content
227
- pub imagemd5: Option<String>,
228
- /// Physical page number
229
- pub ppageno: Option<u32>,
230
- /// Logical page number/label
231
- pub lpageno: Option<String>,
232
- /// Scanner resolution (`dpi_x`, `dpi_y`)
233
- pub scan_res: Option<(u32, u32)>,
234
- /// Image source file paths
235
- pub x_source: Vec<String>,
236
- /// Scanner device identifier
237
- pub x_scanner: Option<String>,
238
-
239
- /// Additional custom properties
240
- pub other: HashMap<String, String>,
241
- }
242
-
243
- /// A complete hOCR element with type, properties, text content, and child elements
244
- #[derive(Debug, Clone)]
245
- pub struct HocrElement {
246
- /// The semantic type of this hOCR element
247
- pub element_type: HocrElementType,
248
- /// All extracted properties (bbox, confidence, etc.)
249
- pub properties: HocrProperties,
250
- /// Text content of this element
251
- pub text: String,
252
- /// Child elements in the document tree
253
- pub children: Vec<Self>,
254
- }
255
-
256
- /// hOCR document metadata extracted from document properties
257
- #[derive(Debug, Clone, Default)]
258
- pub struct HocrMetadata {
259
- /// Name and version of the OCR system used
260
- pub ocr_system: Option<String>,
261
- /// OCR capabilities supported (e.g., "`ocr_page`", "`ocr_carea`")
262
- pub ocr_capabilities: Vec<String>,
263
- /// Total number of pages in the OCR'd document
264
- pub ocr_number_of_pages: Option<u32>,
265
- /// Languages detected in the document (ISO 639 codes)
266
- pub ocr_langs: Vec<String>,
267
- /// Scripts used in the document (ISO 15924 codes)
268
- pub ocr_scripts: Vec<String>,
269
- }