html-to-markdown 2.30.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -14
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +5 -2
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,313 +0,0 @@
1
- //! Spatial layout analysis and table reconstruction for hOCR conversion
2
-
3
- use crate::hocr::spatial::{self, HocrWord};
4
- use crate::hocr::types::{HocrElement, HocrElementType};
5
-
6
- pub fn is_bullet_paragraph(element: &HocrElement, text: &str) -> bool {
7
- if element.element_type != HocrElementType::OcrPar {
8
- return false;
9
- }
10
-
11
- let trimmed = text.trim_start();
12
- if trimmed.is_empty() {
13
- return false;
14
- }
15
-
16
- if matches!(trimmed.chars().next(), Some('•' | '●' | '-' | '+' | '*')) {
17
- return true;
18
- }
19
-
20
- let mut chars = trimmed.chars().peekable();
21
- let mut digit_count = 0;
22
- while let Some(&ch) = chars.peek() {
23
- if ch.is_ascii_digit() {
24
- digit_count += 1;
25
- chars.next();
26
- } else {
27
- break;
28
- }
29
- }
30
-
31
- if digit_count > 0 {
32
- if let Some(&ch) = chars.peek() {
33
- if (ch == '.' || ch == ')') && chars.clone().nth(1).is_some_and(char::is_whitespace) {
34
- return true;
35
- }
36
- }
37
- }
38
-
39
- false
40
- }
41
-
42
- /// Try to detect and reconstruct a table from an element's word children
43
- ///
44
- /// Returns Some(markdown) if table structure detected, None otherwise
45
- pub fn try_spatial_table_reconstruction(element: &HocrElement) -> Option<String> {
46
- let mut words = Vec::new();
47
- collect_words(element, &mut words);
48
-
49
- if words.len() < 6 {
50
- return None;
51
- }
52
-
53
- let table = spatial::reconstruct_table(&words, 50, 0.5);
54
-
55
- if table.is_empty() || table[0].is_empty() {
56
- return None;
57
- }
58
-
59
- if let Some(cleaned_table) = post_process_table(table) {
60
- let markdown = spatial::table_to_markdown(&cleaned_table);
61
- if !markdown.is_empty() {
62
- return Some(markdown);
63
- }
64
- }
65
-
66
- None
67
- }
68
-
69
- /// Collect all word elements recursively from an element tree
70
- fn collect_words(element: &HocrElement, words: &mut Vec<HocrWord>) {
71
- if element.element_type == HocrElementType::OcrxWord {
72
- if let Some(bbox) = element.properties.bbox {
73
- let confidence = element.properties.x_wconf.unwrap_or(0.0);
74
- words.push(HocrWord {
75
- text: element.text.clone(),
76
- left: bbox.x1,
77
- top: bbox.y1,
78
- width: bbox.width(),
79
- height: bbox.height(),
80
- confidence,
81
- });
82
- }
83
- }
84
-
85
- for child in &element.children {
86
- collect_words(child, words);
87
- }
88
- }
89
-
90
- fn post_process_table(mut table: Vec<Vec<String>>) -> Option<Vec<Vec<String>>> {
91
- table.retain(|row| row.iter().any(|cell| !cell.trim().is_empty()));
92
- if table.is_empty() {
93
- return None;
94
- }
95
-
96
- let mut non_empty = 0;
97
- let mut long_cells = 0;
98
- for row in &table {
99
- for cell in row {
100
- let trimmed = cell.trim();
101
- if trimmed.is_empty() {
102
- continue;
103
- }
104
- non_empty += 1;
105
- if trimmed.chars().count() > 60 {
106
- long_cells += 1;
107
- }
108
- }
109
- }
110
-
111
- if non_empty > 0 && long_cells * 3 > non_empty * 2 {
112
- return None;
113
- }
114
-
115
- let data_start = table
116
- .iter()
117
- .enumerate()
118
- .find_map(|(idx, row)| {
119
- let digit_cells = row
120
- .iter()
121
- .filter(|cell| cell.chars().any(|c| c.is_ascii_digit()))
122
- .count();
123
- if digit_cells >= 3 { Some(idx) } else { None }
124
- })
125
- .unwrap_or(0);
126
-
127
- let mut header_rows = if data_start > 0 {
128
- table[..data_start].to_vec()
129
- } else {
130
- Vec::new()
131
- };
132
- let mut data_rows = table[data_start..].to_vec();
133
-
134
- if header_rows.len() > 2 {
135
- header_rows = header_rows[header_rows.len() - 2..].to_vec();
136
- }
137
-
138
- if header_rows.is_empty() {
139
- if data_rows.len() < 2 {
140
- return None;
141
- }
142
- header_rows.push(data_rows[0].clone());
143
- data_rows = data_rows[1..].to_vec();
144
- }
145
-
146
- let column_count = header_rows
147
- .first()
148
- .or_else(|| data_rows.first())
149
- .map_or(0, std::vec::Vec::len);
150
-
151
- if column_count == 0 {
152
- return None;
153
- }
154
-
155
- let mut header = vec![String::new(); column_count];
156
- for row in &header_rows {
157
- for (idx, cell) in row.iter().enumerate() {
158
- let trimmed = cell.trim();
159
- if trimmed.is_empty() {
160
- continue;
161
- }
162
- if !header[idx].is_empty() {
163
- header[idx].push(' ');
164
- }
165
- header[idx].push_str(trimmed);
166
- }
167
- }
168
-
169
- let mut processed = Vec::new();
170
- processed.push(header);
171
- processed.extend(data_rows);
172
-
173
- if processed.len() <= 1 {
174
- return None;
175
- }
176
-
177
- let mut col = 0;
178
- while col < processed[0].len() {
179
- let header_text = processed[0][col].trim().to_string();
180
- let data_empty = processed[1..]
181
- .iter()
182
- .all(|row| row.get(col).is_none_or(|cell| cell.trim().is_empty()));
183
-
184
- if data_empty {
185
- merge_header_only_column(&mut processed, col, header_text);
186
- } else {
187
- col += 1;
188
- }
189
-
190
- if processed.is_empty() || processed[0].is_empty() {
191
- return None;
192
- }
193
- }
194
-
195
- if processed[0].len() < 2 || processed.len() <= 1 {
196
- return None;
197
- }
198
-
199
- for cell in &mut processed[0] {
200
- normalize_header_cell(cell);
201
- }
202
-
203
- for row in processed.iter_mut().skip(1) {
204
- for cell in row.iter_mut() {
205
- normalize_data_cell(cell);
206
- }
207
- }
208
-
209
- Some(processed)
210
- }
211
-
212
- #[allow(clippy::trivially_copy_pass_by_ref)]
213
- fn merge_header_only_column(table: &mut [Vec<String>], col: usize, header_text: String) {
214
- if table.is_empty() || table[0].is_empty() {
215
- return;
216
- }
217
-
218
- let trimmed = header_text.trim();
219
- if trimmed.is_empty() && table.len() > 1 {
220
- for row in table.iter_mut() {
221
- row.remove(col);
222
- }
223
- return;
224
- }
225
-
226
- if !trimmed.is_empty() {
227
- if col > 0 {
228
- let mut target = col - 1;
229
- while target > 0 && table[0][target].trim().is_empty() {
230
- target -= 1;
231
- }
232
- if !table[0][target].trim().is_empty() || target == 0 {
233
- if !table[0][target].is_empty() {
234
- table[0][target].push(' ');
235
- }
236
- table[0][target].push_str(trimmed);
237
- for row in table.iter_mut() {
238
- row.remove(col);
239
- }
240
- return;
241
- }
242
- }
243
-
244
- if col + 1 < table[0].len() {
245
- if table[0][col + 1].trim().is_empty() {
246
- table[0][col + 1] = trimmed.to_string();
247
- } else {
248
- let mut updated = trimmed.to_string();
249
- updated.push(' ');
250
- updated.push_str(table[0][col + 1].trim());
251
- table[0][col + 1] = updated;
252
- }
253
- for row in table.iter_mut() {
254
- row.remove(col);
255
- }
256
- return;
257
- }
258
- }
259
-
260
- for row in table.iter_mut() {
261
- row.remove(col);
262
- }
263
- }
264
-
265
- fn normalize_header_cell(cell: &mut String) {
266
- let mut text = cell.trim().replace(" ", " ");
267
- if text.contains("(Q)") {
268
- text = text.replace("(Q)", "(Ω)");
269
- }
270
- if text.contains("icorr") && text.contains("(A/cm)") && !text.contains("^2") {
271
- text = text.replace("(A/cm)", "(A/cm^2)");
272
- }
273
- if text.eq_ignore_ascii_case("be (V/dec)") {
274
- text = "bc (V/dec)".to_string();
275
- }
276
- if text.starts_with("Polarization resistance") {
277
- if text.contains("(Ω)") {
278
- text = text.replace("(Ω) rate", "(Ω)");
279
- } else {
280
- text.push_str(" (Ω)");
281
- }
282
- }
283
- if text.starts_with("Corrosion") && text.contains("mm/year") {
284
- text = "Corrosion rate (mm/year)".to_string();
285
- }
286
- *cell = text;
287
- }
288
-
289
- fn normalize_data_cell(cell: &mut String) {
290
- let mut text = cell.trim().to_string();
291
- if text.is_empty() {
292
- cell.clear();
293
- return;
294
- }
295
-
296
- for ch in ['\u{2014}', '\u{2013}', '\u{2212}'] {
297
- text = text.replace(ch, "-");
298
- }
299
-
300
- if text.starts_with("- ") {
301
- text = format!("-{}", text[2..].trim_start());
302
- }
303
-
304
- text = text.replace("- ", "-");
305
- text = text.replace(" -", "-");
306
- text = text.replace("E-", "e-").replace("E+", "e+");
307
-
308
- if text == "-" {
309
- text.clear();
310
- }
311
-
312
- *cell = text;
313
- }
@@ -1,26 +0,0 @@
1
- //! hOCR to Markdown conversion module
2
- //!
3
- //! Converts structured hOCR elements to Markdown while preserving document hierarchy.
4
- //!
5
- //! This module is organized into several submodules:
6
- //! - `core`: Main conversion functions and entry points
7
- //! - `elements`: Element-specific conversion logic
8
- //! - `hierarchy`: Document hierarchy and code block detection
9
- //! - `layout`: Spatial layout analysis and table reconstruction
10
- //! - `output`: Output formatting utilities
11
-
12
- #![allow(clippy::branches_sharing_code, clippy::option_if_let_else)]
13
-
14
- mod code_analysis;
15
- mod core;
16
- mod elements;
17
- mod hierarchy;
18
- mod keywords;
19
- mod layout;
20
- mod output;
21
-
22
- // Re-export public API
23
- pub use core::{convert_to_markdown, convert_to_markdown_with_options};
24
-
25
- // Re-export commonly used types from spatial module for downstream use
26
- pub use super::spatial::HocrWord;
@@ -1,78 +0,0 @@
1
- //! Output formatting utilities for hOCR to Markdown conversion
2
-
3
- use crate::hocr::types::{HocrElement, HocrElementType};
4
-
5
- #[derive(Default)]
6
- pub struct ConvertContext {
7
- pub last_heading: Option<String>,
8
- }
9
-
10
- pub fn ensure_trailing_blank_line(output: &mut String) {
11
- while output.ends_with("\n\n\n") {
12
- output.pop();
13
- }
14
- if output.ends_with("\n\n") {
15
- return;
16
- }
17
- if output.ends_with('\n') {
18
- output.push('\n');
19
- } else {
20
- output.push_str("\n\n");
21
- }
22
- }
23
-
24
- pub fn collapse_extra_newlines(output: &mut String) {
25
- let mut collapsed = String::with_capacity(output.len());
26
- let mut newline_count = 0;
27
-
28
- for ch in output.chars() {
29
- if ch == '\n' {
30
- newline_count += 1;
31
- if newline_count <= 2 {
32
- collapsed.push('\n');
33
- }
34
- } else {
35
- newline_count = 0;
36
- collapsed.push(ch);
37
- }
38
- }
39
-
40
- if collapsed.len() != output.len() {
41
- *output = collapsed;
42
- }
43
- }
44
-
45
- pub fn element_text_content(element: &HocrElement) -> String {
46
- let mut output = String::new();
47
- collect_text_tokens(element, &mut output);
48
- output
49
- }
50
-
51
- fn collect_text_tokens(element: &HocrElement, output: &mut String) {
52
- if element.element_type == HocrElementType::OcrxWord {
53
- let trimmed = element.text.trim();
54
- if !trimmed.is_empty() {
55
- if !output.is_empty() {
56
- output.push(' ');
57
- }
58
- output.push_str(trimmed);
59
- }
60
- }
61
-
62
- for child in &element.children {
63
- collect_text_tokens(child, output);
64
- }
65
- }
66
-
67
- pub fn collect_line_words(element: &HocrElement, words: &mut Vec<String>) {
68
- if element.element_type == HocrElementType::OcrxWord {
69
- let trimmed = element.text.trim();
70
- if !trimmed.is_empty() {
71
- words.push(trimmed.to_string());
72
- }
73
- }
74
-
75
- for child in &element.children {
76
- collect_line_words(child, words);
77
- }
78
- }
@@ -1,232 +0,0 @@
1
- #![allow(clippy::option_if_let_else)]
2
- //! hOCR element extraction
3
- //!
4
- //! Extracts structured hOCR elements from HTML DOM.
5
-
6
- use super::parser::parse_properties;
7
- use super::types::{HocrElement, HocrElementType, HocrMetadata};
8
-
9
- /// Extract complete hOCR document structure from HTML DOM
10
- ///
11
- /// Parses an HTML document containing hOCR annotations and extracts all hOCR elements
12
- /// along with document metadata.
13
- ///
14
- /// # Arguments
15
- ///
16
- /// * `dom` - The parsed HTML DOM (from the astral-tl parser)
17
- /// * `debug` - Enable debug logging for property parsing
18
- ///
19
- /// # Returns
20
- ///
21
- /// A tuple containing:
22
- /// * `Vec<HocrElement>` - All top-level hOCR elements with their full hierarchies
23
- /// * `HocrMetadata` - Document metadata from `<head>` meta tags
24
- ///
25
- /// # hOCR 1.2 Compliance
26
- ///
27
- /// Supports all 40 element types:
28
- /// - Logical structure (12): `ocr_title`, `ocr_chapter`, `ocr_section`, `ocr_par`, etc.
29
- /// - Typesetting (6): `ocr_page`, `ocr_carea`, `ocr_line`, etc.
30
- /// - Float elements (13): `ocr_image`, `ocr_table`, `ocr_math`, etc.
31
- /// - Inline elements (6): `ocr_dropcap`, `ocr_glyph`, etc.
32
- /// - Engine-specific (3): `ocrx_block`, `ocrx_line`, `ocrx_word`
33
- ///
34
- /// Extracts all 20+ properties from title attributes (bbox, `x_wconf`, baseline, order, etc.)
35
- /// and all 5 metadata fields (ocr-system, ocr-capabilities, ocr-langs, etc.)
36
- ///
37
- /// # Example
38
- ///
39
- /// ```rust
40
- /// use html_to_markdown_rs::hocr::extract_hocr_document;
41
- ///
42
- /// let html = r#"<div class="ocr_page" title="bbox 0 0 1000 1500">
43
- /// <p class="ocr_par" title="bbox 100 100 900 200">
44
- /// <span class="ocrx_word" title="bbox 100 100 150 130; x_wconf 95">Hello</span>
45
- /// </p>
46
- /// </div>"#;
47
- /// let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
48
- /// let (elements, metadata) = extract_hocr_document(&dom);
49
- /// ```
50
- #[must_use]
51
- pub fn extract_hocr_document(dom: &tl::VDom) -> (Vec<HocrElement>, HocrMetadata) {
52
- let parser = dom.parser();
53
- let mut elements = Vec::new();
54
- let metadata = extract_metadata(dom);
55
-
56
- for child_handle in dom.children() {
57
- collect_hocr_elements(child_handle, parser, &mut elements);
58
- }
59
-
60
- (elements, metadata)
61
- }
62
-
63
- /// Recursively collect hOCR elements from DOM tree
64
- #[allow(clippy::trivially_copy_pass_by_ref)]
65
- fn collect_hocr_elements(node_handle: &tl::NodeHandle, parser: &tl::Parser, elements: &mut Vec<HocrElement>) {
66
- if let Some(element) = extract_element(node_handle, parser) {
67
- elements.push(element);
68
- } else if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
69
- let children = tag.children();
70
- for child_handle in children.top().iter() {
71
- collect_hocr_elements(child_handle, parser, elements);
72
- }
73
- }
74
- }
75
-
76
- /// Extract hOCR metadata from HTML head (or from orphaned meta tags after sanitization)
77
- pub(crate) fn extract_metadata(dom: &tl::VDom) -> HocrMetadata {
78
- let mut metadata = HocrMetadata::default();
79
- let parser = dom.parser();
80
-
81
- fn extract_from_meta_tag(meta_tag: &tl::HTMLTag, metadata: &mut HocrMetadata) {
82
- let attrs = meta_tag.attributes();
83
- if let (Some(name), Some(content)) = (attrs.get("name").flatten(), attrs.get("content").flatten()) {
84
- let name_str = name.as_utf8_str();
85
- let content_str = content.as_utf8_str().to_string();
86
-
87
- match name_str.as_ref() {
88
- "ocr-system" => metadata.ocr_system = Some(content_str),
89
- "ocr-capabilities" => {
90
- metadata.ocr_capabilities = content_str
91
- .split_whitespace()
92
- .map(std::string::ToString::to_string)
93
- .collect();
94
- }
95
- "ocr-number-of-pages" => {
96
- metadata.ocr_number_of_pages = content_str.parse().ok();
97
- }
98
- "ocr-langs" => {
99
- metadata.ocr_langs = content_str
100
- .split_whitespace()
101
- .map(std::string::ToString::to_string)
102
- .collect();
103
- }
104
- "ocr-scripts" => {
105
- metadata.ocr_scripts = content_str
106
- .split_whitespace()
107
- .map(std::string::ToString::to_string)
108
- .collect();
109
- }
110
- _ => {}
111
- }
112
- }
113
- }
114
-
115
- #[allow(clippy::trivially_copy_pass_by_ref)]
116
- fn find_meta_tags<'a>(node_handle: &tl::NodeHandle, parser: &'a tl::Parser<'a>, metadata: &mut HocrMetadata) {
117
- if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
118
- let tag_name = tag.name().as_utf8_str();
119
-
120
- if tag_name == "meta" {
121
- extract_from_meta_tag(tag, metadata);
122
- }
123
-
124
- let children = tag.children();
125
- for child_handle in children.top().iter() {
126
- find_meta_tags(child_handle, parser, metadata);
127
- }
128
- }
129
- }
130
-
131
- for child_handle in dom.children() {
132
- find_meta_tags(child_handle, parser, &mut metadata);
133
- }
134
-
135
- metadata
136
- }
137
-
138
- /// Extract a single hOCR element and its children
139
- #[allow(clippy::trivially_copy_pass_by_ref)]
140
- fn extract_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<HocrElement> {
141
- if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
142
- let attrs = tag.attributes();
143
- let class_attr = attrs.get("class").flatten()?;
144
- let classes = class_attr.as_utf8_str();
145
- if !classes.as_ref().contains("ocr") {
146
- return None;
147
- }
148
-
149
- let element_type = classes.split_whitespace().find_map(HocrElementType::from_class)?;
150
-
151
- let properties = if let Some(title) = attrs.get("title").flatten() {
152
- parse_properties(&title.as_utf8_str())
153
- } else {
154
- Default::default()
155
- };
156
-
157
- let mut text = String::new();
158
- let mut children = Vec::new();
159
-
160
- let tag_children = tag.children();
161
- for child_handle in tag_children.top().iter() {
162
- if let Some(tl::Node::Raw(bytes)) = child_handle.get(parser) {
163
- text.push_str(&bytes.as_utf8_str());
164
- } else if let Some(child_element) = extract_element(child_handle, parser) {
165
- children.push(child_element);
166
- }
167
- }
168
-
169
- Some(HocrElement {
170
- element_type,
171
- properties,
172
- text: text.trim().to_string(),
173
- children,
174
- })
175
- } else {
176
- None
177
- }
178
- }
179
-
180
- #[cfg(test)]
181
- mod tests {
182
- use super::*;
183
-
184
- #[test]
185
- fn test_extract_simple_word() {
186
- let html = r#"<span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>"#;
187
- let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
188
- let parser = dom.parser();
189
-
190
- let element = extract_element(&dom.children()[0], parser).unwrap();
191
- assert!(matches!(element.element_type, HocrElementType::OcrxWord));
192
- assert_eq!(element.text, "Hello");
193
- assert!(element.properties.bbox.is_some());
194
- assert_eq!(element.properties.x_wconf, Some(95.0));
195
- }
196
-
197
- #[test]
198
- fn test_extract_paragraph() {
199
- let html = r#"<p class="ocr_par" title="bbox 0 0 200 100">
200
- <span class="ocrx_word" title="bbox 10 10 50 30; x_wconf 90">First</span>
201
- <span class="ocrx_word" title="bbox 60 10 100 30; x_wconf 92">Word</span>
202
- </p>"#;
203
- let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
204
- let parser = dom.parser();
205
-
206
- let element = extract_element(&dom.children()[0], parser).unwrap();
207
- assert!(matches!(element.element_type, HocrElementType::OcrPar));
208
- assert_eq!(element.children.len(), 2);
209
- assert!(matches!(element.children[0].element_type, HocrElementType::OcrxWord));
210
- }
211
-
212
- #[test]
213
- fn test_extract_metadata() {
214
- let html = r#"<!DOCTYPE html>
215
- <html>
216
- <head>
217
- <meta name="ocr-system" content="tesseract 4.1.1" />
218
- <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
219
- <meta name="ocr-number-of-pages" content="5" />
220
- </head>
221
- <body>
222
- <div class="ocr_page"></div>
223
- </body>
224
- </html>"#;
225
- let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
226
- let (_, metadata) = extract_hocr_document(&dom);
227
-
228
- assert_eq!(metadata.ocr_system, Some("tesseract 4.1.1".to_string()));
229
- assert!(metadata.ocr_capabilities.contains(&"ocr_page".to_string()));
230
- assert_eq!(metadata.ocr_number_of_pages, Some(5));
231
- }
232
- }
@@ -1,42 +0,0 @@
1
- #![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
- //! hOCR 1.2 document processing.
3
- //!
4
- //! **Deprecated since 2.30.0**: hOCR support will be removed in v3.
5
- //!
6
- //! Complete hOCR 1.2 specification support for extracting structured content from OCR documents.
7
- //!
8
- //! ## Features
9
- //!
10
- //! - **Full Element Support**: All 40+ hOCR 1.2 element types
11
- //! - **Complete Property Parsing**: All 20+ hOCR properties (bbox, baseline, fonts, etc.)
12
- //! - **Document Structure**: Logical hierarchy (paragraphs, sections, chapters)
13
- //! - **Spatial Table Reconstruction**: Automatic table detection from bbox coordinates
14
- //! - **Metadata Extraction**: OCR system info, capabilities, languages
15
- //!
16
- //! ## Modules
17
- //!
18
- //! - [`types`]: Core hOCR element and property types
19
- //! - [`parser`]: Property parsing from title attributes
20
- //! - [`extractor`]: DOM to hOCR element tree extraction
21
- //! - [`converter`]: hOCR to Markdown conversion
22
- //! - [`spatial`]: Spatial table reconstruction from bounding boxes
23
-
24
- #[allow(deprecated)]
25
- pub mod converter;
26
- #[allow(deprecated)]
27
- pub mod extractor;
28
- #[allow(deprecated)]
29
- pub mod parser;
30
- #[allow(deprecated)]
31
- pub mod spatial;
32
- #[allow(deprecated)]
33
- pub mod types;
34
-
35
- #[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
36
- pub use converter::{convert_to_markdown, convert_to_markdown_with_options};
37
- #[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
38
- pub use extractor::extract_hocr_document;
39
- #[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
40
- pub use spatial::{HocrWord, extract_hocr_words, reconstruct_table, table_to_markdown};
41
- #[deprecated(since = "2.30.0", note = "hOCR support will be removed in v3.")]
42
- pub use types::{BBox, Baseline, HocrElement, HocrElementType, HocrMetadata, HocrProperties};