html-to-markdown 2.30.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -19
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +6 -3
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +126 -52
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -1,333 +0,0 @@
|
|
|
1
|
-
#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
|
|
2
|
-
//! hOCR property parser
|
|
3
|
-
//!
|
|
4
|
-
//! Parses hOCR title attributes into structured properties.
|
|
5
|
-
|
|
6
|
-
use super::types::{BBox, Baseline, HocrProperties};
|
|
7
|
-
use crate::text::decode_html_entities_cow;
|
|
8
|
-
|
|
9
|
-
/// Parse all properties from hOCR title attribute
|
|
10
|
-
#[must_use]
|
|
11
|
-
pub fn parse_properties(title: &str) -> HocrProperties {
|
|
12
|
-
let mut props = HocrProperties::default();
|
|
13
|
-
|
|
14
|
-
let title = decode_html_entities_cow(title);
|
|
15
|
-
|
|
16
|
-
for part in title.as_ref().split(';') {
|
|
17
|
-
let part = part.trim();
|
|
18
|
-
if part.is_empty() {
|
|
19
|
-
continue;
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
let mut tokens = part.split_whitespace();
|
|
23
|
-
if let Some(key) = tokens.next() {
|
|
24
|
-
match key {
|
|
25
|
-
"bbox" => {
|
|
26
|
-
if let Some(bbox) = parse_bbox_coords(&mut tokens) {
|
|
27
|
-
props.bbox = Some(bbox);
|
|
28
|
-
}
|
|
29
|
-
}
|
|
30
|
-
"baseline" => {
|
|
31
|
-
if let Some(baseline) = parse_baseline(&mut tokens) {
|
|
32
|
-
props.baseline = Some(baseline);
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
"textangle" => {
|
|
36
|
-
if let Some(angle_str) = tokens.next() {
|
|
37
|
-
if let Ok(angle) = angle_str.parse::<f64>() {
|
|
38
|
-
props.textangle = Some(angle);
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
}
|
|
42
|
-
"poly" => {
|
|
43
|
-
props.poly = parse_poly(&mut tokens);
|
|
44
|
-
}
|
|
45
|
-
"x_wconf" => {
|
|
46
|
-
if let Some(conf_str) = tokens.next() {
|
|
47
|
-
if let Ok(conf) = conf_str.parse::<f64>() {
|
|
48
|
-
props.x_wconf = Some(conf);
|
|
49
|
-
}
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
"x_confs" => {
|
|
53
|
-
props.x_confs = parse_float_list(&mut tokens);
|
|
54
|
-
}
|
|
55
|
-
"nlp" => {
|
|
56
|
-
props.nlp = parse_float_list(&mut tokens);
|
|
57
|
-
}
|
|
58
|
-
"x_font" => {
|
|
59
|
-
if let Some(font) = parse_quoted_string(part) {
|
|
60
|
-
props.x_font = Some(font);
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
"x_fsize" => {
|
|
64
|
-
if let Some(size_str) = tokens.next() {
|
|
65
|
-
if let Ok(size) = size_str.parse::<u32>() {
|
|
66
|
-
props.x_fsize = Some(size);
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
"order" => {
|
|
71
|
-
if let Some(order_str) = tokens.next() {
|
|
72
|
-
if let Ok(order) = order_str.parse::<u32>() {
|
|
73
|
-
props.order = Some(order);
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
"cflow" => {
|
|
78
|
-
if let Some(flow) = parse_quoted_string(part) {
|
|
79
|
-
props.cflow = Some(flow);
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
"hardbreak" => {
|
|
83
|
-
if let Some(val) = tokens.next() {
|
|
84
|
-
props.hardbreak = val == "1";
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
"cuts" => {
|
|
88
|
-
props.cuts = parse_cuts(&mut tokens);
|
|
89
|
-
}
|
|
90
|
-
"x_bboxes" => {
|
|
91
|
-
props.x_bboxes = parse_bboxes_list(&mut tokens);
|
|
92
|
-
}
|
|
93
|
-
"image" => {
|
|
94
|
-
if let Some(img) = parse_quoted_string(part) {
|
|
95
|
-
props.image = Some(img);
|
|
96
|
-
}
|
|
97
|
-
}
|
|
98
|
-
"imagemd5" => {
|
|
99
|
-
if let Some(md5) = parse_quoted_string(part) {
|
|
100
|
-
props.imagemd5 = Some(md5);
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
"ppageno" => {
|
|
104
|
-
if let Some(page_str) = tokens.next() {
|
|
105
|
-
if let Ok(page) = page_str.parse::<u32>() {
|
|
106
|
-
props.ppageno = Some(page);
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
}
|
|
110
|
-
"lpageno" => {
|
|
111
|
-
let rest: Vec<&str> = tokens.collect();
|
|
112
|
-
if !rest.is_empty() {
|
|
113
|
-
let lpageno_str = rest.join(" ");
|
|
114
|
-
if let Some(quoted) = parse_quoted_string(part) {
|
|
115
|
-
props.lpageno = Some(quoted);
|
|
116
|
-
} else {
|
|
117
|
-
props.lpageno = Some(lpageno_str);
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
}
|
|
121
|
-
"scan_res" => {
|
|
122
|
-
let coords: Vec<&str> = tokens.collect();
|
|
123
|
-
if coords.len() >= 2 {
|
|
124
|
-
if let (Ok(x), Ok(y)) = (coords[0].parse::<u32>(), coords[1].parse::<u32>()) {
|
|
125
|
-
props.scan_res = Some((x, y));
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
|
-
}
|
|
129
|
-
"x_source" => {
|
|
130
|
-
let sources = parse_all_quoted_strings(part);
|
|
131
|
-
if !sources.is_empty() {
|
|
132
|
-
props.x_source = sources;
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
"x_scanner" => {
|
|
136
|
-
if let Some(scanner) = parse_quoted_string(part) {
|
|
137
|
-
props.x_scanner = Some(scanner);
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
"x_size" | "x_descenders" | "x_ascenders" => {
|
|
141
|
-
let value: Vec<&str> = tokens.collect();
|
|
142
|
-
if !value.is_empty() {
|
|
143
|
-
props.other.insert(key.to_string(), value.join(" "));
|
|
144
|
-
}
|
|
145
|
-
}
|
|
146
|
-
_ => {
|
|
147
|
-
let value: Vec<&str> = tokens.collect();
|
|
148
|
-
if !value.is_empty() {
|
|
149
|
-
props.other.insert(key.to_string(), value.join(" "));
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
props
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
fn parse_bbox_coords<'a, I>(tokens: &mut I) -> Option<BBox>
|
|
160
|
-
where
|
|
161
|
-
I: Iterator<Item = &'a str>,
|
|
162
|
-
{
|
|
163
|
-
let x1 = tokens.next()?.parse::<u32>().ok()?;
|
|
164
|
-
let y1 = tokens.next()?.parse::<u32>().ok()?;
|
|
165
|
-
let x2 = tokens.next()?.parse::<u32>().ok()?;
|
|
166
|
-
let y2 = tokens.next()?.parse::<u32>().ok()?;
|
|
167
|
-
Some(BBox { x1, y1, x2, y2 })
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
fn parse_baseline<'a, I>(tokens: &mut I) -> Option<Baseline>
|
|
171
|
-
where
|
|
172
|
-
I: Iterator<Item = &'a str>,
|
|
173
|
-
{
|
|
174
|
-
let slope = tokens.next()?.parse::<f64>().ok()?;
|
|
175
|
-
let constant = tokens.next()?.parse::<i32>().ok()?;
|
|
176
|
-
Some(Baseline { slope, constant })
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
/// Read a `poly` property as a list of `(x, y)` points.
///
/// All remaining tokens are consumed. The result is `None` unless there are at
/// least four tokens, the token count is even, and every token parses as `i32`.
fn parse_poly<'a, I>(tokens: &mut I) -> Option<Vec<(i32, i32)>>
where
    I: Iterator<Item = &'a str>,
{
    let raw: Vec<&str> = tokens.collect();
    if raw.len() < 4 || raw.len() % 2 != 0 {
        return None;
    }

    // Collecting into `Option<Vec<_>>` stops at the first pair that fails to parse.
    raw.chunks_exact(2)
        .map(|pair| {
            let x = pair[0].parse::<i32>().ok()?;
            let y = pair[1].parse::<i32>().ok()?;
            Some((x, y))
        })
        .collect()
}
|
|
197
|
-
|
|
198
|
-
/// Collect every remaining token that parses as `f64`, in order.
///
/// Tokens that are not valid floats are skipped rather than treated as an error.
fn parse_float_list<'a, I>(tokens: &mut I) -> Vec<f64>
where
    I: Iterator<Item = &'a str>,
{
    let mut values = Vec::new();
    for token in tokens {
        if let Ok(value) = token.parse::<f64>() {
            values.push(value);
        }
    }
    values
}
|
|
204
|
-
|
|
205
|
-
/// Read a `cuts` property into one list of numbers per token.
///
/// A token containing `,` is split on commas and becomes the list of its pieces
/// that parse as `u32` (possibly empty). A token without a comma becomes a
/// single-element list when it parses as `u32`, and is dropped otherwise.
fn parse_cuts<'a, I>(tokens: &mut I) -> Vec<Vec<u32>>
where
    I: Iterator<Item = &'a str>,
{
    tokens
        .filter_map(|token| {
            if token.contains(',') {
                let pieces = token.split(',').filter_map(|piece| piece.parse::<u32>().ok());
                Some(pieces.collect())
            } else {
                token.parse::<u32>().ok().map(|single| vec![single])
            }
        })
        .collect()
}
|
|
220
|
-
|
|
221
|
-
fn parse_bboxes_list<'a, I>(tokens: &mut I) -> Vec<BBox>
|
|
222
|
-
where
|
|
223
|
-
I: Iterator<Item = &'a str>,
|
|
224
|
-
{
|
|
225
|
-
let coords: Vec<u32> = tokens.filter_map(|s| s.parse::<u32>().ok()).collect();
|
|
226
|
-
|
|
227
|
-
coords
|
|
228
|
-
.chunks(4)
|
|
229
|
-
.filter_map(|chunk| {
|
|
230
|
-
if chunk.len() == 4 {
|
|
231
|
-
Some(BBox {
|
|
232
|
-
x1: chunk[0],
|
|
233
|
-
y1: chunk[1],
|
|
234
|
-
x2: chunk[2],
|
|
235
|
-
y2: chunk[3],
|
|
236
|
-
})
|
|
237
|
-
} else {
|
|
238
|
-
None
|
|
239
|
-
}
|
|
240
|
-
})
|
|
241
|
-
.collect()
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
/// Return the text between the first two double quotes in `s`.
///
/// Yields `None` when `s` contains fewer than two `"` characters. An empty pair
/// (`""`) yields an empty string.
fn parse_quoted_string(s: &str) -> Option<String> {
    let (_, after_open) = s.split_once('"')?;
    let (inner, _) = after_open.split_once('"')?;
    Some(inner.to_string())
}
|
|
252
|
-
|
|
253
|
-
/// Return the contents of every complete `"..."` pair in `s`, left to right.
///
/// Scanning stops at the first opening quote that has no matching closing quote;
/// anything after it is ignored.
fn parse_all_quoted_strings(s: &str) -> Vec<String> {
    let mut found = Vec::new();
    let mut rest = s;

    while let Some((_, after_open)) = rest.split_once('"') {
        let Some((inner, after_close)) = after_open.split_once('"') else {
            break;
        };
        found.push(inner.to_string());
        rest = after_close;
    }

    found
}
|
|
268
|
-
|
|
269
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone `bbox` entry fills in all four corners.
    #[test]
    fn test_parse_bbox() {
        let expected = BBox {
            x1: 100,
            y1: 50,
            x2: 200,
            y2: 150,
        };
        assert_eq!(parse_properties("bbox 100 50 200 150").bbox, Some(expected));
    }

    /// `baseline` carries a float slope and a signed integer constant.
    #[test]
    fn test_parse_baseline() {
        let expected = Baseline {
            slope: 0.015,
            constant: -18,
        };
        assert_eq!(parse_properties("baseline 0.015 -18").baseline, Some(expected));
    }

    /// Several `;`-separated properties are all picked up from one title.
    #[test]
    fn test_parse_multiple_properties() {
        let props = parse_properties("bbox 0 0 100 50; x_wconf 95.5; textangle 7.2");
        let expected_bbox = BBox {
            x1: 0,
            y1: 0,
            x2: 100,
            y2: 50,
        };
        assert_eq!(props.bbox, Some(expected_bbox));
        assert_eq!(props.x_wconf, Some(95.5));
        assert_eq!(props.textangle, Some(7.2));
    }

    /// Quoted values keep their inner whitespace; unquoted numbers parse normally.
    #[test]
    fn test_parse_quoted_strings() {
        let props = parse_properties("x_font \"Comic Sans MS\"; x_fsize 12");
        assert_eq!(props.x_font, Some("Comic Sans MS".to_string()));
        assert_eq!(props.x_fsize, Some(12));
    }

    /// `poly` coordinates are paired up in order.
    #[test]
    fn test_parse_poly() {
        let expected = vec![(0, 0), (0, 10), (10, 10), (10, 0)];
        assert_eq!(parse_properties("poly 0 0 0 10 10 10 10 0").poly, Some(expected));
    }

    /// `x_confs` accepts both fractional and integral values.
    #[test]
    fn test_parse_x_confs() {
        let props = parse_properties("x_confs 37.3 51.23 100");
        assert_eq!(props.x_confs, vec![37.3, 51.23, 100.0]);
    }
}
|
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
#![allow(
|
|
2
|
-
clippy::cast_precision_loss,
|
|
3
|
-
clippy::cast_sign_loss,
|
|
4
|
-
clippy::unused_self,
|
|
5
|
-
clippy::float_cmp
|
|
6
|
-
)]
|
|
7
|
-
//! Coordinate types and parsing from hOCR bbox attributes
|
|
8
|
-
|
|
9
|
-
/// Represents a word extracted from hOCR with position and confidence information
#[derive(Debug, Clone)]
pub struct HocrWord {
    /// The text content of the word
    pub text: String,
    /// X-coordinate of the left edge (pixels)
    pub left: u32,
    /// Y-coordinate of the top edge (pixels)
    pub top: u32,
    /// Width of the word bounding box (pixels)
    pub width: u32,
    /// Height of the word bounding box (pixels)
    pub height: u32,
    /// OCR confidence score (0.0 to 100.0)
    pub confidence: f64,
}

impl HocrWord {
    /// Get the right edge position (`left + width`).
    ///
    /// All fields are public, so nothing guarantees the sum fits in a `u32`. The
    /// addition saturates at `u32::MAX` instead of panicking in debug builds or
    /// wrapping in release builds.
    #[must_use]
    pub const fn right(&self) -> u32 {
        self.left.saturating_add(self.width)
    }

    /// Get the bottom edge position (`top + height`).
    ///
    /// Saturates at `u32::MAX` for the same reason as [`HocrWord::right`].
    #[must_use]
    pub const fn bottom(&self) -> u32 {
        self.top.saturating_add(self.height)
    }

    /// Get the vertical center position (`top + height / 2`), computed in `f64`.
    #[must_use]
    pub fn y_center(&self) -> f64 {
        f64::from(self.top) + (f64::from(self.height) / 2.0)
    }

    /// Get the horizontal center position (`left + width / 2`), computed in `f64`.
    #[must_use]
    pub fn x_center(&self) -> f64 {
        f64::from(self.left) + (f64::from(self.width) / 2.0)
    }
}
|
|
51
|
-
|
|
52
|
-
/// Parse bbox attribute from hOCR title attribute
///
/// The title is split on `;` and the first segment that is exactly the key `bbox`
/// followed by four `u32` values (`x1 y1 x2 y2`) is used. The key and values may
/// be separated by any whitespace, not only a single space. Segments with a
/// different key, a wrong number of values, or a non-numeric value are skipped.
///
/// Returns `(left, top, width, height)`, where width and height are `x2 - x1` and
/// `y2 - y1`, clamped to zero if the second corner lies before the first. Returns
/// `None` when no segment qualifies.
///
/// Example: "bbox 100 50 180 80; `x_wconf` 95" -> (100, 50, 80, 30)
pub fn parse_bbox(title: &str) -> Option<(u32, u32, u32, u32)> {
    title.split(';').find_map(|part| {
        let mut tokens = part.split_whitespace();
        if tokens.next()? != "bbox" {
            return None;
        }

        // Every value must be numeric; a single bad token rejects the segment.
        let coords = tokens
            .map(|token| token.parse::<u32>().ok())
            .collect::<Option<Vec<u32>>>()?;

        match coords.as_slice() {
            &[x1, y1, x2, y2] => Some((x1, y1, x2.saturating_sub(x1), y2.saturating_sub(y1))),
            _ => None,
        }
    })
}
|
|
77
|
-
|
|
78
|
-
/// Parse confidence from hOCR title attribute
///
/// The title is split on `;` and the first segment that is exactly the key
/// `x_wconf` followed by a single `f64` value is used. The key and value may be
/// separated by any whitespace, not only a single space. Segments with a
/// different key, extra values, or a non-numeric value are skipped.
///
/// Returns `0.0` when no segment qualifies.
///
/// Example: "bbox 100 50 180 80; `x_wconf` 95" -> 95.0
pub fn parse_confidence(title: &str) -> f64 {
    title
        .split(';')
        .find_map(|part| {
            let mut tokens = part.split_whitespace();
            if tokens.next()? != "x_wconf" {
                return None;
            }

            let value = tokens.next()?;
            // Exactly one value is allowed after the key.
            if tokens.next().is_some() {
                return None;
            }
            value.parse::<f64>().ok()
        })
        .unwrap_or(0.0)
}
|
|
92
|
-
|
|
93
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// `parse_bbox` converts corner coordinates into left/top/width/height and
    /// rejects titles without a complete `bbox` entry.
    #[test]
    fn test_parse_bbox() {
        let accepted = [
            ("bbox 100 50 180 80", (100, 50, 80, 30)),
            ("bbox 0 0 100 200", (0, 0, 100, 200)),
            ("bbox 100 50 180 80; x_wconf 95", (100, 50, 80, 30)),
        ];
        for (title, expected) in accepted {
            assert_eq!(parse_bbox(title), Some(expected));
        }

        for title in ["invalid", "bbox 100 50"] {
            assert_eq!(parse_bbox(title), None);
        }
    }

    /// `parse_confidence` reads `x_wconf` wherever it appears and defaults to zero.
    #[test]
    fn test_parse_confidence() {
        let cases = [
            ("x_wconf 95.5", 95.5),
            ("bbox 100 50 180 80; x_wconf 92", 92.0),
            ("invalid", 0.0),
        ];
        for (title, expected) in cases {
            assert_eq!(parse_confidence(title), expected);
        }
    }

    /// Edge and centre accessors are derived from left/top/width/height.
    #[test]
    fn test_hocr_word_methods() {
        let word = HocrWord {
            text: "Hello".to_string(),
            left: 100,
            top: 50,
            width: 80,
            height: 30,
            confidence: 95.5,
        };

        assert_eq!(word.right(), 180);
        assert_eq!(word.bottom(), 80);
        assert_eq!(word.y_center(), 65.0);
        assert_eq!(word.x_center(), 140.0);
    }
}
|
|
@@ -1,165 +0,0 @@
|
|
|
1
|
-
#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
|
|
2
|
-
//! Word extraction and DOM processing for hOCR documents
|
|
3
|
-
|
|
4
|
-
use crate::hocr::spatial::coords::{HocrWord, parse_bbox, parse_confidence};
|
|
5
|
-
|
|
6
|
-
/// Extract text content from a node
|
|
7
|
-
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
8
|
-
fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
|
|
9
|
-
let mut text = String::new();
|
|
10
|
-
|
|
11
|
-
if let Some(node) = node_handle.get(parser) {
|
|
12
|
-
match node {
|
|
13
|
-
tl::Node::Raw(bytes) => {
|
|
14
|
-
text.push_str(&bytes.as_utf8_str());
|
|
15
|
-
}
|
|
16
|
-
tl::Node::Tag(tag) => {
|
|
17
|
-
let children = tag.children();
|
|
18
|
-
for child_handle in children.top().iter() {
|
|
19
|
-
text.push_str(&get_text_content(child_handle, parser));
|
|
20
|
-
}
|
|
21
|
-
}
|
|
22
|
-
tl::Node::Comment(_) => {}
|
|
23
|
-
}
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
text
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
/// Extract hOCR words from a DOM tree
|
|
30
|
-
///
|
|
31
|
-
/// Walks the DOM and extracts all elements with `ocrx_word` class,
|
|
32
|
-
/// parsing their bbox and confidence information.
|
|
33
|
-
#[must_use]
|
|
34
|
-
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
35
|
-
pub fn extract_hocr_words(node_handle: &tl::NodeHandle, parser: &tl::Parser, min_confidence: f64) -> Vec<HocrWord> {
|
|
36
|
-
let mut words = Vec::new();
|
|
37
|
-
|
|
38
|
-
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
|
|
39
|
-
let tag_name = tag.name().as_utf8_str();
|
|
40
|
-
let attrs = tag.attributes();
|
|
41
|
-
|
|
42
|
-
let class_attr = attrs.get("class").flatten().map(|v| v.as_utf8_str().to_string());
|
|
43
|
-
|
|
44
|
-
// hOCR class validation removed for performance
|
|
45
|
-
|
|
46
|
-
if tag_name == "span" {
|
|
47
|
-
let is_word = class_attr.as_ref().is_some_and(|c| c.contains("ocrx_word"));
|
|
48
|
-
let title = attrs.get("title").flatten().map(|v| v.as_utf8_str());
|
|
49
|
-
|
|
50
|
-
if is_word {
|
|
51
|
-
let title_str = title.as_deref().unwrap_or("");
|
|
52
|
-
if let Some((left, top, width, height)) = parse_bbox(title_str) {
|
|
53
|
-
let confidence = parse_confidence(title_str);
|
|
54
|
-
|
|
55
|
-
if confidence >= min_confidence {
|
|
56
|
-
let text = get_text_content(node_handle, parser).trim().to_string();
|
|
57
|
-
|
|
58
|
-
if !text.is_empty() {
|
|
59
|
-
words.push(HocrWord {
|
|
60
|
-
text,
|
|
61
|
-
left,
|
|
62
|
-
top,
|
|
63
|
-
width,
|
|
64
|
-
height,
|
|
65
|
-
confidence,
|
|
66
|
-
});
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
let children = tag.children();
|
|
74
|
-
for child_handle in children.top().iter() {
|
|
75
|
-
words.extend(extract_hocr_words(child_handle, parser, min_confidence));
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
words
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `hocr` and gather the words under every top-level node, in order.
    fn words_in(hocr: &str, min_confidence: f64) -> Vec<HocrWord> {
        let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
        let parser = dom.parser();

        let mut words = Vec::new();
        for child_handle in dom.children().iter() {
            words.extend(extract_hocr_words(child_handle, parser, min_confidence));
        }
        words
    }

    /// Text of each word, in extraction order.
    fn texts(words: &[HocrWord]) -> Vec<&str> {
        words.iter().map(|word| word.text.as_str()).collect()
    }

    /// Words are returned in document order with their position and confidence.
    #[test]
    fn test_extract_hocr_words() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 92">World</span>
            </div>
        "#;

        let words = words_in(hocr, 0.0);

        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[0].left, 100);
        assert!((words[0].confidence - 95.0).abs() < f64::EPSILON);

        assert_eq!(words[1].text, "World");
        assert_eq!(words[1].left, 160);
        assert!((words[1].confidence - 92.0).abs() < f64::EPSILON);
    }

    /// Words below the confidence threshold are left out.
    #[test]
    fn test_extract_hocr_words_confidence_filter() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">HighConf</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 50">LowConf</span>
                <span class="ocrx_word" title="bbox 220 50 270 80; x_wconf 98">VeryHigh</span>
            </div>
        "#;

        let words = words_in(hocr, 90.0);

        assert_eq!(words.len(), 2);
        assert_eq!(texts(&words), ["HighConf", "VeryHigh"]);
    }

    /// A table-like grid of words comes back row by row.
    #[test]
    fn test_end_to_end_hocr_table_extraction() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 140 70; x_wconf 95">Product</span>
                <span class="ocrx_word" title="bbox 200 50 240 70; x_wconf 95">Price</span>
                <span class="ocrx_word" title="bbox 100 100 140 120; x_wconf 95">Apple</span>
                <span class="ocrx_word" title="bbox 200 100 240 120; x_wconf 95">$1.50</span>
                <span class="ocrx_word" title="bbox 100 150 140 170; x_wconf 95">Orange</span>
                <span class="ocrx_word" title="bbox 200 150 240 170; x_wconf 95">$2.00</span>
            </div>
        "#;

        let words = words_in(hocr, 0.0);

        assert_eq!(words.len(), 6);
        assert_eq!(
            texts(&words),
            ["Product", "Price", "Apple", "$1.50", "Orange", "$2.00"]
        );
    }
}
|