html-to-markdown 2.30.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +6 -19
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +6 -3
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,333 +0,0 @@
1
- #![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
- //! hOCR property parser
3
- //!
4
- //! Parses hOCR title attributes into structured properties.
5
-
6
- use super::types::{BBox, Baseline, HocrProperties};
7
- use crate::text::decode_html_entities_cow;
8
-
9
- /// Parse all properties from hOCR title attribute
10
- #[must_use]
11
- pub fn parse_properties(title: &str) -> HocrProperties {
12
- let mut props = HocrProperties::default();
13
-
14
- let title = decode_html_entities_cow(title);
15
-
16
- for part in title.as_ref().split(';') {
17
- let part = part.trim();
18
- if part.is_empty() {
19
- continue;
20
- }
21
-
22
- let mut tokens = part.split_whitespace();
23
- if let Some(key) = tokens.next() {
24
- match key {
25
- "bbox" => {
26
- if let Some(bbox) = parse_bbox_coords(&mut tokens) {
27
- props.bbox = Some(bbox);
28
- }
29
- }
30
- "baseline" => {
31
- if let Some(baseline) = parse_baseline(&mut tokens) {
32
- props.baseline = Some(baseline);
33
- }
34
- }
35
- "textangle" => {
36
- if let Some(angle_str) = tokens.next() {
37
- if let Ok(angle) = angle_str.parse::<f64>() {
38
- props.textangle = Some(angle);
39
- }
40
- }
41
- }
42
- "poly" => {
43
- props.poly = parse_poly(&mut tokens);
44
- }
45
- "x_wconf" => {
46
- if let Some(conf_str) = tokens.next() {
47
- if let Ok(conf) = conf_str.parse::<f64>() {
48
- props.x_wconf = Some(conf);
49
- }
50
- }
51
- }
52
- "x_confs" => {
53
- props.x_confs = parse_float_list(&mut tokens);
54
- }
55
- "nlp" => {
56
- props.nlp = parse_float_list(&mut tokens);
57
- }
58
- "x_font" => {
59
- if let Some(font) = parse_quoted_string(part) {
60
- props.x_font = Some(font);
61
- }
62
- }
63
- "x_fsize" => {
64
- if let Some(size_str) = tokens.next() {
65
- if let Ok(size) = size_str.parse::<u32>() {
66
- props.x_fsize = Some(size);
67
- }
68
- }
69
- }
70
- "order" => {
71
- if let Some(order_str) = tokens.next() {
72
- if let Ok(order) = order_str.parse::<u32>() {
73
- props.order = Some(order);
74
- }
75
- }
76
- }
77
- "cflow" => {
78
- if let Some(flow) = parse_quoted_string(part) {
79
- props.cflow = Some(flow);
80
- }
81
- }
82
- "hardbreak" => {
83
- if let Some(val) = tokens.next() {
84
- props.hardbreak = val == "1";
85
- }
86
- }
87
- "cuts" => {
88
- props.cuts = parse_cuts(&mut tokens);
89
- }
90
- "x_bboxes" => {
91
- props.x_bboxes = parse_bboxes_list(&mut tokens);
92
- }
93
- "image" => {
94
- if let Some(img) = parse_quoted_string(part) {
95
- props.image = Some(img);
96
- }
97
- }
98
- "imagemd5" => {
99
- if let Some(md5) = parse_quoted_string(part) {
100
- props.imagemd5 = Some(md5);
101
- }
102
- }
103
- "ppageno" => {
104
- if let Some(page_str) = tokens.next() {
105
- if let Ok(page) = page_str.parse::<u32>() {
106
- props.ppageno = Some(page);
107
- }
108
- }
109
- }
110
- "lpageno" => {
111
- let rest: Vec<&str> = tokens.collect();
112
- if !rest.is_empty() {
113
- let lpageno_str = rest.join(" ");
114
- if let Some(quoted) = parse_quoted_string(part) {
115
- props.lpageno = Some(quoted);
116
- } else {
117
- props.lpageno = Some(lpageno_str);
118
- }
119
- }
120
- }
121
- "scan_res" => {
122
- let coords: Vec<&str> = tokens.collect();
123
- if coords.len() >= 2 {
124
- if let (Ok(x), Ok(y)) = (coords[0].parse::<u32>(), coords[1].parse::<u32>()) {
125
- props.scan_res = Some((x, y));
126
- }
127
- }
128
- }
129
- "x_source" => {
130
- let sources = parse_all_quoted_strings(part);
131
- if !sources.is_empty() {
132
- props.x_source = sources;
133
- }
134
- }
135
- "x_scanner" => {
136
- if let Some(scanner) = parse_quoted_string(part) {
137
- props.x_scanner = Some(scanner);
138
- }
139
- }
140
- "x_size" | "x_descenders" | "x_ascenders" => {
141
- let value: Vec<&str> = tokens.collect();
142
- if !value.is_empty() {
143
- props.other.insert(key.to_string(), value.join(" "));
144
- }
145
- }
146
- _ => {
147
- let value: Vec<&str> = tokens.collect();
148
- if !value.is_empty() {
149
- props.other.insert(key.to_string(), value.join(" "));
150
- }
151
- }
152
- }
153
- }
154
- }
155
-
156
- props
157
- }
158
-
159
- fn parse_bbox_coords<'a, I>(tokens: &mut I) -> Option<BBox>
160
- where
161
- I: Iterator<Item = &'a str>,
162
- {
163
- let x1 = tokens.next()?.parse::<u32>().ok()?;
164
- let y1 = tokens.next()?.parse::<u32>().ok()?;
165
- let x2 = tokens.next()?.parse::<u32>().ok()?;
166
- let y2 = tokens.next()?.parse::<u32>().ok()?;
167
- Some(BBox { x1, y1, x2, y2 })
168
- }
169
-
170
- fn parse_baseline<'a, I>(tokens: &mut I) -> Option<Baseline>
171
- where
172
- I: Iterator<Item = &'a str>,
173
- {
174
- let slope = tokens.next()?.parse::<f64>().ok()?;
175
- let constant = tokens.next()?.parse::<i32>().ok()?;
176
- Some(Baseline { slope, constant })
177
- }
178
-
179
/// Reads the remaining `tokens` as a flat list of `x y` integer pairs.
///
/// Returns `None` unless there are at least four tokens, the token count is
/// even, and every token parses as an `i32`. All remaining tokens are consumed
/// either way.
fn parse_poly<'a, I>(tokens: &mut I) -> Option<Vec<(i32, i32)>>
where
    I: Iterator<Item = &'a str>,
{
    let raw: Vec<&str> = tokens.collect();

    if raw.len() < 4 || raw.len() % 2 != 0 {
        return None;
    }

    // Collecting into `Option<Vec<_>>` stops at the first pair that fails.
    raw.chunks_exact(2)
        .map(|pair| {
            let x = pair[0].parse::<i32>().ok()?;
            let y = pair[1].parse::<i32>().ok()?;
            Some((x, y))
        })
        .collect()
}
197
-
198
/// Parses every remaining token as an `f64`, silently skipping tokens that do
/// not parse. The result keeps the order of the accepted tokens.
fn parse_float_list<'a, I>(tokens: &mut I) -> Vec<f64>
where
    I: Iterator<Item = &'a str>,
{
    let mut values = Vec::new();
    for token in tokens {
        if let Ok(value) = token.parse::<f64>() {
            values.push(value);
        }
    }
    values
}
204
-
205
/// Parses the remaining `tokens` as a list of cut entries.
///
/// A token containing a comma becomes one entry holding every comma-separated
/// piece that parses as a `u32`; pieces that do not parse are dropped, so the
/// entry may be empty. A token without a comma becomes a single-element entry
/// when it parses as a `u32` and is skipped otherwise.
fn parse_cuts<'a, I>(tokens: &mut I) -> Vec<Vec<u32>>
where
    I: Iterator<Item = &'a str>,
{
    tokens
        .filter_map(|token| {
            if token.contains(',') {
                let pieces = token
                    .split(',')
                    .filter_map(|piece| piece.parse::<u32>().ok())
                    .collect::<Vec<u32>>();
                Some(pieces)
            } else {
                token.parse::<u32>().ok().map(|single| vec![single])
            }
        })
        .collect()
}
220
-
221
- fn parse_bboxes_list<'a, I>(tokens: &mut I) -> Vec<BBox>
222
- where
223
- I: Iterator<Item = &'a str>,
224
- {
225
- let coords: Vec<u32> = tokens.filter_map(|s| s.parse::<u32>().ok()).collect();
226
-
227
- coords
228
- .chunks(4)
229
- .filter_map(|chunk| {
230
- if chunk.len() == 4 {
231
- Some(BBox {
232
- x1: chunk[0],
233
- y1: chunk[1],
234
- x2: chunk[2],
235
- y2: chunk[3],
236
- })
237
- } else {
238
- None
239
- }
240
- })
241
- .collect()
242
- }
243
-
244
/// Returns the text between the first two double quotes in `s`.
///
/// Yields `None` when `s` contains fewer than two `"` characters. No escape
/// sequences are interpreted.
fn parse_quoted_string(s: &str) -> Option<String> {
    let (_, after_open) = s.split_once('"')?;
    let (inside, _) = after_open.split_once('"')?;
    Some(inside.to_owned())
}
252
-
253
/// Collects, in order, every substring of `s` enclosed in a pair of double
/// quotes.
///
/// Scanning resumes right after each closing quote. An opening quote with no
/// matching closing quote ends the scan and contributes nothing.
fn parse_all_quoted_strings(s: &str) -> Vec<String> {
    let mut found = Vec::new();
    let mut rest = s;

    while let Some((_, after_open)) = rest.split_once('"') {
        let Some((inside, tail)) = after_open.split_once('"') else {
            break;
        };
        found.push(inside.to_owned());
        rest = tail;
    }

    found
}
268
-
269
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone `bbox` entry fills the bounding box.
    #[test]
    fn test_parse_bbox() {
        let expected = BBox {
            x1: 100,
            y1: 50,
            x2: 200,
            y2: 150,
        };
        let parsed = parse_properties("bbox 100 50 200 150");
        assert_eq!(parsed.bbox, Some(expected));
    }

    /// A `baseline` entry carries a float slope and a signed integer constant.
    #[test]
    fn test_parse_baseline() {
        let expected = Baseline {
            slope: 0.015,
            constant: -18,
        };
        let parsed = parse_properties("baseline 0.015 -18");
        assert_eq!(parsed.baseline, Some(expected));
    }

    /// Several `;`-separated entries are all picked up from one title.
    #[test]
    fn test_parse_multiple_properties() {
        let parsed = parse_properties("bbox 0 0 100 50; x_wconf 95.5; textangle 7.2");
        let expected_bbox = BBox {
            x1: 0,
            y1: 0,
            x2: 100,
            y2: 50,
        };
        assert_eq!(parsed.bbox, Some(expected_bbox));
        assert_eq!(parsed.x_wconf, Some(95.5));
        assert_eq!(parsed.textangle, Some(7.2));
    }

    /// Quoted values keep their inner spaces.
    #[test]
    fn test_parse_quoted_strings() {
        let parsed = parse_properties("x_font \"Comic Sans MS\"; x_fsize 12");
        assert_eq!(parsed.x_font, Some("Comic Sans MS".to_string()));
        assert_eq!(parsed.x_fsize, Some(12));
    }

    /// `poly` arguments are grouped into `(x, y)` points.
    #[test]
    fn test_parse_poly() {
        let parsed = parse_properties("poly 0 0 0 10 10 10 10 0");
        let expected = vec![(0, 0), (0, 10), (10, 10), (10, 0)];
        assert_eq!(parsed.poly, Some(expected));
    }

    /// `x_confs` collects every numeric argument as a float.
    #[test]
    fn test_parse_x_confs() {
        let parsed = parse_properties("x_confs 37.3 51.23 100");
        assert_eq!(parsed.x_confs, vec![37.3, 51.23, 100.0]);
    }
}
@@ -1,129 +0,0 @@
1
- #![allow(
2
- clippy::cast_precision_loss,
3
- clippy::cast_sign_loss,
4
- clippy::unused_self,
5
- clippy::float_cmp
6
- )]
7
- //! Coordinate types and parsing from hOCR bbox attributes
8
-
9
/// Represents a word extracted from hOCR with position and confidence information
#[derive(Debug, Clone)]
pub struct HocrWord {
    /// The text content of the word
    pub text: String,
    /// X-coordinate of the left edge (pixels)
    pub left: u32,
    /// Y-coordinate of the top edge (pixels)
    pub top: u32,
    /// Width of the word bounding box (pixels)
    pub width: u32,
    /// Height of the word bounding box (pixels)
    pub height: u32,
    /// OCR confidence score (0.0 to 100.0)
    pub confidence: f64,
}

impl HocrWord {
    /// Get the right edge position (`left + width`).
    ///
    /// The fields are public and unchecked, so the sum is clamped to
    /// `u32::MAX` instead of panicking (debug) or wrapping (release) on
    /// overflow.
    #[must_use]
    pub const fn right(&self) -> u32 {
        self.left.saturating_add(self.width)
    }

    /// Get the bottom edge position (`top + height`).
    ///
    /// Clamped to `u32::MAX` on overflow, for the same reason as
    /// [`HocrWord::right`].
    #[must_use]
    pub const fn bottom(&self) -> u32 {
        self.top.saturating_add(self.height)
    }

    /// Get the vertical center position (`top + height / 2`) as a float.
    #[must_use]
    pub fn y_center(&self) -> f64 {
        f64::from(self.top) + (f64::from(self.height) / 2.0)
    }

    /// Get the horizontal center position (`left + width / 2`) as a float.
    #[must_use]
    pub fn x_center(&self) -> f64 {
        f64::from(self.left) + (f64::from(self.width) / 2.0)
    }
}
51
-
52
/// Extracts the first well-formed `bbox` entry from an hOCR title attribute
/// and returns it as `(left, top, width, height)`.
///
/// The title is split on `;`. A segment qualifies when, after trimming, it
/// starts with `bbox ` and is followed by exactly four whitespace-separated
/// `u32` values `x1 y1 x2 y2`. Segments that do not qualify are skipped.
/// Width and height are `x2 - x1` and `y2 - y1`, clamped at zero.
///
/// Example: "bbox 100 50 180 80; `x_wconf` 95" -> (100, 50, 80, 30)
pub fn parse_bbox(title: &str) -> Option<(u32, u32, u32, u32)> {
    title.split(';').find_map(|segment| {
        let coords = segment.trim().strip_prefix("bbox ")?;
        let fields: Vec<&str> = coords.split_whitespace().collect();

        let [x1, y1, x2, y2] = fields.as_slice() else {
            return None;
        };

        let x1 = x1.parse::<u32>().ok()?;
        let y1 = y1.parse::<u32>().ok()?;
        let x2 = x2.parse::<u32>().ok()?;
        let y2 = y2.parse::<u32>().ok()?;

        Some((x1, y1, x2.saturating_sub(x1), y2.saturating_sub(y1)))
    })
}
77
-
78
/// Extracts the word confidence from an hOCR title attribute.
///
/// The title is split on `;`. The first segment that, after trimming, starts
/// with `x_wconf ` and whose remainder parses as an `f64` supplies the result.
/// Returns `0.0` when no segment qualifies.
///
/// Example: "bbox 100 50 180 80; `x_wconf` 95" -> 95.0
pub fn parse_confidence(title: &str) -> f64 {
    title
        .split(';')
        .find_map(|segment| {
            let value = segment.trim().strip_prefix("x_wconf ")?;
            value.trim().parse::<f64>().ok()
        })
        .unwrap_or(0.0)
}
92
-
93
#[cfg(test)]
mod tests {
    use super::*;

    /// `parse_bbox` converts corner coordinates to origin + size and rejects
    /// titles without a complete `bbox` entry.
    #[test]
    fn test_parse_bbox() {
        let cases: &[(&str, Option<(u32, u32, u32, u32)>)] = &[
            ("bbox 100 50 180 80", Some((100, 50, 80, 30))),
            ("bbox 0 0 100 200", Some((0, 0, 100, 200))),
            ("bbox 100 50 180 80; x_wconf 95", Some((100, 50, 80, 30))),
            ("invalid", None),
            ("bbox 100 50", None),
        ];

        for &(title, expected) in cases {
            assert_eq!(parse_bbox(title), expected);
        }
    }

    /// `parse_confidence` reads `x_wconf` and falls back to zero.
    #[test]
    fn test_parse_confidence() {
        let cases: &[(&str, f64)] = &[
            ("x_wconf 95.5", 95.5),
            ("bbox 100 50 180 80; x_wconf 92", 92.0),
            ("invalid", 0.0),
        ];

        for &(title, expected) in cases {
            assert_eq!(parse_confidence(title), expected);
        }
    }

    /// Edge and center accessors are derived from origin + size.
    #[test]
    fn test_hocr_word_methods() {
        let sample = HocrWord {
            text: "Hello".to_string(),
            left: 100,
            top: 50,
            width: 80,
            height: 30,
            confidence: 95.5,
        };

        assert_eq!(sample.right(), 180);
        assert_eq!(sample.bottom(), 80);
        assert_eq!(sample.y_center(), 65.0);
        assert_eq!(sample.x_center(), 140.0);
    }
}
@@ -1,165 +0,0 @@
1
- #![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
- //! Word extraction and DOM processing for hOCR documents
3
-
4
- use crate::hocr::spatial::coords::{HocrWord, parse_bbox, parse_confidence};
5
-
6
- /// Extract text content from a node
7
- #[allow(clippy::trivially_copy_pass_by_ref)]
8
- fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
9
- let mut text = String::new();
10
-
11
- if let Some(node) = node_handle.get(parser) {
12
- match node {
13
- tl::Node::Raw(bytes) => {
14
- text.push_str(&bytes.as_utf8_str());
15
- }
16
- tl::Node::Tag(tag) => {
17
- let children = tag.children();
18
- for child_handle in children.top().iter() {
19
- text.push_str(&get_text_content(child_handle, parser));
20
- }
21
- }
22
- tl::Node::Comment(_) => {}
23
- }
24
- }
25
-
26
- text
27
- }
28
-
29
- /// Extract hOCR words from a DOM tree
30
- ///
31
- /// Walks the DOM and extracts all elements with `ocrx_word` class,
32
- /// parsing their bbox and confidence information.
33
- #[must_use]
34
- #[allow(clippy::trivially_copy_pass_by_ref)]
35
- pub fn extract_hocr_words(node_handle: &tl::NodeHandle, parser: &tl::Parser, min_confidence: f64) -> Vec<HocrWord> {
36
- let mut words = Vec::new();
37
-
38
- if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
39
- let tag_name = tag.name().as_utf8_str();
40
- let attrs = tag.attributes();
41
-
42
- let class_attr = attrs.get("class").flatten().map(|v| v.as_utf8_str().to_string());
43
-
44
- // hOCR class validation removed for performance
45
-
46
- if tag_name == "span" {
47
- let is_word = class_attr.as_ref().is_some_and(|c| c.contains("ocrx_word"));
48
- let title = attrs.get("title").flatten().map(|v| v.as_utf8_str());
49
-
50
- if is_word {
51
- let title_str = title.as_deref().unwrap_or("");
52
- if let Some((left, top, width, height)) = parse_bbox(title_str) {
53
- let confidence = parse_confidence(title_str);
54
-
55
- if confidence >= min_confidence {
56
- let text = get_text_content(node_handle, parser).trim().to_string();
57
-
58
- if !text.is_empty() {
59
- words.push(HocrWord {
60
- text,
61
- left,
62
- top,
63
- width,
64
- height,
65
- confidence,
66
- });
67
- }
68
- }
69
- }
70
- }
71
- }
72
-
73
- let children = tag.children();
74
- for child_handle in children.top().iter() {
75
- words.extend(extract_hocr_words(child_handle, parser, min_confidence));
76
- }
77
- }
78
-
79
- words
80
- }
81
-
82
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `markup` and gathers the words found under every top-level node.
    fn words_from(markup: &str, min_confidence: f64) -> Vec<HocrWord> {
        let dom = tl::parse(markup, tl::ParserOptions::default()).unwrap();
        let parser = dom.parser();

        let mut collected = Vec::new();
        for top_level in dom.children().iter() {
            collected.extend(extract_hocr_words(top_level, parser, min_confidence));
        }
        collected
    }

    /// Returns the text of each word, in extraction order.
    fn texts(words: &[HocrWord]) -> Vec<&str> {
        words.iter().map(|word| word.text.as_str()).collect()
    }

    #[test]
    fn test_extract_hocr_words() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 92">World</span>
            </div>
        "#;

        let words = words_from(hocr, 0.0);

        assert_eq!(words.len(), 2);

        let (first, second) = (&words[0], &words[1]);
        assert_eq!(first.text, "Hello");
        assert_eq!(first.left, 100);
        assert!((first.confidence - 95.0).abs() < f64::EPSILON);

        assert_eq!(second.text, "World");
        assert_eq!(second.left, 160);
        assert!((second.confidence - 92.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_extract_hocr_words_confidence_filter() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">HighConf</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 50">LowConf</span>
                <span class="ocrx_word" title="bbox 220 50 270 80; x_wconf 98">VeryHigh</span>
            </div>
        "#;

        let words = words_from(hocr, 90.0);

        assert_eq!(words.len(), 2);
        assert_eq!(texts(&words), ["HighConf", "VeryHigh"]);
    }

    #[test]
    fn test_end_to_end_hocr_table_extraction() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 140 70; x_wconf 95">Product</span>
                <span class="ocrx_word" title="bbox 200 50 240 70; x_wconf 95">Price</span>
                <span class="ocrx_word" title="bbox 100 100 140 120; x_wconf 95">Apple</span>
                <span class="ocrx_word" title="bbox 200 100 240 120; x_wconf 95">$1.50</span>
                <span class="ocrx_word" title="bbox 100 150 140 170; x_wconf 95">Orange</span>
                <span class="ocrx_word" title="bbox 200 150 240 170; x_wconf 95">$2.00</span>
            </div>
        "#;

        let words = words_from(hocr, 0.0);

        assert_eq!(words.len(), 6);
        assert_eq!(
            texts(&words),
            ["Product", "Price", "Apple", "$1.50", "Orange", "$2.00"]
        );
    }
}