kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -3
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +25 -11
- data/vendor/kreuzberg/README.md +13 -8
- data/vendor/kreuzberg/build.rs +17 -6
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
- data/vendor/kreuzberg/src/mcp/server.rs +14 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/bin/release.sh +9 -8
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +11 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -1,14 +1,19 @@
|
|
|
1
1
|
use super::error::{PdfError, Result};
|
|
2
|
+
use crate::core::config::PageConfig;
|
|
3
|
+
use crate::types::{PageBoundary, PageContent};
|
|
2
4
|
use pdfium_render::prelude::*;
|
|
3
5
|
|
|
6
|
+
/// Result type for PDF text extraction with optional page tracking.
|
|
7
|
+
#[allow(dead_code)]
|
|
8
|
+
type PdfTextExtractionResult = (String, Option<Vec<PageBoundary>>, Option<Vec<PageContent>>);
|
|
9
|
+
|
|
4
10
|
pub struct PdfTextExtractor {
|
|
5
11
|
pdfium: Pdfium,
|
|
6
12
|
}
|
|
7
13
|
|
|
8
14
|
impl PdfTextExtractor {
|
|
9
15
|
pub fn new() -> Result<Self> {
|
|
10
|
-
let binding = Pdfium::
|
|
11
|
-
.or_else(|_| Pdfium::bind_to_system_library())
|
|
16
|
+
let binding = Pdfium::bind_to_system_library()
|
|
12
17
|
.map_err(|e| PdfError::TextExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
13
18
|
|
|
14
19
|
let pdfium = Pdfium::new(binding);
|
|
@@ -31,7 +36,8 @@ impl PdfTextExtractor {
|
|
|
31
36
|
}
|
|
32
37
|
})?;
|
|
33
38
|
|
|
34
|
-
extract_text_from_pdf_document(&document)
|
|
39
|
+
let (content, _, _) = extract_text_from_pdf_document(&document, None)?;
|
|
40
|
+
Ok(content)
|
|
35
41
|
}
|
|
36
42
|
|
|
37
43
|
pub fn extract_text_with_passwords(&self, pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
|
|
@@ -89,28 +95,104 @@ pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]
|
|
|
89
95
|
extractor.extract_text_with_passwords(pdf_bytes, passwords)
|
|
90
96
|
}
|
|
91
97
|
|
|
92
|
-
|
|
98
|
+
/// Extract text from PDF document with optional page boundary tracking.
|
|
99
|
+
///
|
|
100
|
+
/// # Arguments
|
|
101
|
+
///
|
|
102
|
+
/// * `document` - The PDF document to extract text from
|
|
103
|
+
/// * `page_config` - Optional page configuration for boundary tracking and page markers
|
|
104
|
+
///
|
|
105
|
+
/// # Returns
|
|
106
|
+
///
|
|
107
|
+
/// A tuple containing:
|
|
108
|
+
/// - The extracted text content (String)
|
|
109
|
+
/// - Page boundaries (`Vec<PageBoundary>`): `Some` whenever `page_config` is provided, `None` otherwise
|
|
110
|
+
/// - Per-page content (`Vec<PageContent>`): `Some` only when `page_config.extract_pages` is enabled
|
|
111
|
+
///
|
|
112
|
+
/// # Implementation Details
|
|
113
|
+
///
|
|
114
|
+
/// When `page_config` is `None`, takes a fast path and returns `(content, None, None)`.
|
|
115
|
+
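/// When `page_config` is `Some`, records each page's byte offsets with `String::len()` (O(1)) immediately before and after appending that page's text, so every offset falls on a valid UTF-8 character boundary.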
|
|
116
|
+
pub fn extract_text_from_pdf_document(
|
|
117
|
+
document: &PdfDocument<'_>,
|
|
118
|
+
page_config: Option<&PageConfig>,
|
|
119
|
+
) -> Result<PdfTextExtractionResult> {
|
|
93
120
|
let page_count = document.pages().len() as usize;
|
|
94
121
|
|
|
122
|
+
if page_config.is_none() {
|
|
123
|
+
let estimated_size = page_count * 2048;
|
|
124
|
+
let mut content = String::with_capacity(estimated_size);
|
|
125
|
+
|
|
126
|
+
for page in document.pages().iter() {
|
|
127
|
+
let text = page
|
|
128
|
+
.text()
|
|
129
|
+
.map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
|
|
130
|
+
|
|
131
|
+
let page_text = text.all();
|
|
132
|
+
|
|
133
|
+
if !content.is_empty() {
|
|
134
|
+
content.push_str("\n\n");
|
|
135
|
+
}
|
|
136
|
+
content.push_str(&page_text);
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
content.shrink_to_fit();
|
|
140
|
+
return Ok((content, None, None));
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
let config = page_config.unwrap();
|
|
95
144
|
let estimated_size = page_count * 2048;
|
|
96
145
|
let mut content = String::with_capacity(estimated_size);
|
|
146
|
+
let mut boundaries = Vec::with_capacity(page_count);
|
|
147
|
+
let mut page_contents = if config.extract_pages {
|
|
148
|
+
Some(Vec::with_capacity(page_count))
|
|
149
|
+
} else {
|
|
150
|
+
None
|
|
151
|
+
};
|
|
152
|
+
|
|
153
|
+
for (page_idx, page) in document.pages().iter().enumerate() {
|
|
154
|
+
let page_number = page_idx + 1;
|
|
97
155
|
|
|
98
|
-
for page in document.pages().iter() {
|
|
99
156
|
let text = page
|
|
100
157
|
.text()
|
|
101
158
|
.map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
|
|
102
159
|
|
|
103
160
|
let page_text = text.all();
|
|
104
161
|
|
|
105
|
-
if
|
|
162
|
+
if page_number > 1 && config.insert_page_markers {
|
|
163
|
+
let marker = config.marker_format.replace("{page_num}", &page_number.to_string());
|
|
164
|
+
content.push_str(&marker);
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
if page_number > 1 && !config.insert_page_markers && !content.is_empty() {
|
|
106
168
|
content.push_str("\n\n");
|
|
107
169
|
}
|
|
170
|
+
|
|
171
|
+
let byte_start = content.len();
|
|
172
|
+
|
|
108
173
|
content.push_str(&page_text);
|
|
174
|
+
|
|
175
|
+
let byte_end = content.len();
|
|
176
|
+
|
|
177
|
+
boundaries.push(PageBoundary {
|
|
178
|
+
byte_start,
|
|
179
|
+
byte_end,
|
|
180
|
+
page_number,
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
if let Some(ref mut pages) = page_contents {
|
|
184
|
+
pages.push(PageContent {
|
|
185
|
+
page_number,
|
|
186
|
+
content: page_text,
|
|
187
|
+
tables: Vec::new(),
|
|
188
|
+
images: Vec::new(),
|
|
189
|
+
});
|
|
190
|
+
}
|
|
109
191
|
}
|
|
110
192
|
|
|
111
193
|
content.shrink_to_fit();
|
|
112
194
|
|
|
113
|
-
Ok(content)
|
|
195
|
+
Ok((content, Some(boundaries), page_contents))
|
|
114
196
|
}
|
|
115
197
|
|
|
116
198
|
#[cfg(test)]
|
|
@@ -10,6 +10,9 @@ use async_trait::async_trait;
|
|
|
10
10
|
use std::path::Path;
|
|
11
11
|
use std::sync::Arc;
|
|
12
12
|
|
|
13
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
14
|
+
use crate::KreuzbergError;
|
|
15
|
+
|
|
13
16
|
/// Trait for document extractor plugins.
|
|
14
17
|
///
|
|
15
18
|
/// Implement this trait to add support for new document formats or to override
|
|
@@ -61,6 +64,7 @@ use std::sync::Arc;
|
|
|
61
64
|
/// detected_languages: None,
|
|
62
65
|
/// chunks: None,
|
|
63
66
|
/// images: None,
|
|
67
|
+
/// pages: None,
|
|
64
68
|
/// })
|
|
65
69
|
/// }
|
|
66
70
|
///
|
|
@@ -139,6 +143,7 @@ pub trait DocumentExtractor: Plugin {
|
|
|
139
143
|
/// detected_languages: None,
|
|
140
144
|
/// chunks: None,
|
|
141
145
|
/// images: None,
|
|
146
|
+
/// pages: None,
|
|
142
147
|
/// })
|
|
143
148
|
/// }
|
|
144
149
|
/// # }
|
|
@@ -209,14 +214,27 @@ pub trait DocumentExtractor: Plugin {
|
|
|
209
214
|
/// detected_languages: None,
|
|
210
215
|
/// chunks: None,
|
|
211
216
|
/// images: None,
|
|
217
|
+
/// pages: None,
|
|
212
218
|
/// })
|
|
213
219
|
/// }
|
|
214
220
|
/// # }
|
|
215
221
|
/// ```
|
|
216
222
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
223
|
+
#[cfg(feature = "tokio-runtime")]
|
|
224
|
+
{
|
|
225
|
+
use crate::core::io;
|
|
226
|
+
let bytes = io::read_file_async(path).await?;
|
|
227
|
+
self.extract_bytes(&bytes, mime_type, config).await
|
|
228
|
+
}
|
|
229
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
230
|
+
{
|
|
231
|
+
let _ = (path, mime_type, config);
|
|
232
|
+
// For WASM and non-tokio environments, file extraction is not supported
|
|
233
|
+
// through the default implementation. Implementations must provide their own.
|
|
234
|
+
Err(KreuzbergError::Other(
|
|
235
|
+
"File-based extraction requires the tokio-runtime feature".to_string(),
|
|
236
|
+
))
|
|
237
|
+
}
|
|
220
238
|
}
|
|
221
239
|
|
|
222
240
|
/// Get the list of MIME types supported by this extractor.
|
|
@@ -359,6 +377,14 @@ pub trait DocumentExtractor: Plugin {
|
|
|
359
377
|
fn can_handle(&self, _path: &Path, _mime_type: &str) -> bool {
|
|
360
378
|
true
|
|
361
379
|
}
|
|
380
|
+
|
|
381
|
+
/// Attempt to get a reference to this extractor as a SyncExtractor.
|
|
382
|
+
///
|
|
383
|
+
/// Returns None if the extractor doesn't support synchronous extraction.
|
|
384
|
+
/// This is used for WASM and other sync-only environments.
|
|
385
|
+
fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
|
|
386
|
+
None
|
|
387
|
+
}
|
|
362
388
|
}
|
|
363
389
|
|
|
364
390
|
/// Register a document extractor with the global registry.
|
|
@@ -412,6 +438,7 @@ pub trait DocumentExtractor: Plugin {
|
|
|
412
438
|
/// detected_languages: None,
|
|
413
439
|
/// chunks: None,
|
|
414
440
|
/// images: None,
|
|
441
|
+
/// pages: None,
|
|
415
442
|
/// })
|
|
416
443
|
/// }
|
|
417
444
|
///
|
|
@@ -577,6 +604,7 @@ mod tests {
|
|
|
577
604
|
detected_languages: None,
|
|
578
605
|
chunks: None,
|
|
579
606
|
images: None,
|
|
607
|
+
pages: None,
|
|
580
608
|
})
|
|
581
609
|
}
|
|
582
610
|
|
|
@@ -749,6 +777,7 @@ mod tests {
|
|
|
749
777
|
detected_languages: None,
|
|
750
778
|
chunks: None,
|
|
751
779
|
images: None,
|
|
780
|
+
pages: None,
|
|
752
781
|
})
|
|
753
782
|
}
|
|
754
783
|
|
|
@@ -953,6 +982,7 @@ mod tests {
|
|
|
953
982
|
detected_languages: None,
|
|
954
983
|
chunks: None,
|
|
955
984
|
images: None,
|
|
985
|
+
pages: None,
|
|
956
986
|
})
|
|
957
987
|
}
|
|
958
988
|
|
|
@@ -998,6 +1028,7 @@ mod tests {
|
|
|
998
1028
|
detected_languages: None,
|
|
999
1029
|
chunks: None,
|
|
1000
1030
|
images: None,
|
|
1031
|
+
pages: None,
|
|
1001
1032
|
})
|
|
1002
1033
|
}
|
|
1003
1034
|
|
|
@@ -47,6 +47,7 @@
|
|
|
47
47
|
//! # detected_languages: None,
|
|
48
48
|
//! # chunks: None,
|
|
49
49
|
//! # images: None,
|
|
50
|
+
//! # pages: None,
|
|
50
51
|
//! # })
|
|
51
52
|
//! # }
|
|
52
53
|
//! # async fn extract_file(&self, _: &std::path::Path, _: &str, _: &kreuzberg::ExtractionConfig)
|
|
@@ -59,6 +60,7 @@
|
|
|
59
60
|
//! # detected_languages: None,
|
|
60
61
|
//! # chunks: None,
|
|
61
62
|
//! # images: None,
|
|
63
|
+
//! # pages: None,
|
|
62
64
|
//! # })
|
|
63
65
|
//! # }
|
|
64
66
|
//! # fn supported_mime_types(&self) -> &[&str] { &[] }
|
|
@@ -120,6 +122,7 @@
|
|
|
120
122
|
//! detected_languages: None,
|
|
121
123
|
//! chunks: None,
|
|
122
124
|
//! images: None,
|
|
125
|
+
//! pages: None,
|
|
123
126
|
//! })
|
|
124
127
|
//! }
|
|
125
128
|
//!
|
|
@@ -10,6 +10,9 @@ use async_trait::async_trait;
|
|
|
10
10
|
use std::path::Path;
|
|
11
11
|
use std::sync::Arc;
|
|
12
12
|
|
|
13
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
14
|
+
use crate::KreuzbergError;
|
|
15
|
+
|
|
13
16
|
/// OCR backend types.
|
|
14
17
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
15
18
|
pub enum OcrBackendType {
|
|
@@ -64,6 +67,7 @@ pub enum OcrBackendType {
|
|
|
64
67
|
/// detected_languages: None,
|
|
65
68
|
/// chunks: None,
|
|
66
69
|
/// images: None,
|
|
70
|
+
/// pages: None,
|
|
67
71
|
/// })
|
|
68
72
|
/// }
|
|
69
73
|
///
|
|
@@ -142,6 +146,7 @@ pub trait OcrBackend: Plugin {
|
|
|
142
146
|
/// detected_languages: None,
|
|
143
147
|
/// chunks: None,
|
|
144
148
|
/// images: None,
|
|
149
|
+
/// pages: None,
|
|
145
150
|
/// })
|
|
146
151
|
/// }
|
|
147
152
|
/// # }
|
|
@@ -162,9 +167,21 @@ pub trait OcrBackend: Plugin {
|
|
|
162
167
|
///
|
|
163
168
|
/// Same as `process_image`, plus file I/O errors.
|
|
164
169
|
async fn process_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
170
|
+
#[cfg(feature = "tokio-runtime")]
|
|
171
|
+
{
|
|
172
|
+
use crate::core::io;
|
|
173
|
+
let bytes = io::read_file_async(path).await?;
|
|
174
|
+
self.process_image(&bytes, config).await
|
|
175
|
+
}
|
|
176
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
177
|
+
{
|
|
178
|
+
let _ = (path, config);
|
|
179
|
+
// For WASM and non-tokio environments, file-based OCR is not supported
|
|
180
|
+
// through the default implementation. Implementations must provide their own.
|
|
181
|
+
Err(KreuzbergError::Other(
|
|
182
|
+
"File-based OCR processing requires the tokio-runtime feature".to_string(),
|
|
183
|
+
))
|
|
184
|
+
}
|
|
168
185
|
}
|
|
169
186
|
|
|
170
187
|
/// Check if this backend supports a given language code.
|
|
@@ -302,6 +319,7 @@ pub trait OcrBackend: Plugin {
|
|
|
302
319
|
/// detected_languages: None,
|
|
303
320
|
/// chunks: None,
|
|
304
321
|
/// images: None,
|
|
322
|
+
/// pages: None,
|
|
305
323
|
/// })
|
|
306
324
|
/// }
|
|
307
325
|
/// fn supports_language(&self, _: &str) -> bool { true }
|
|
@@ -462,6 +480,7 @@ mod tests {
|
|
|
462
480
|
detected_languages: None,
|
|
463
481
|
chunks: None,
|
|
464
482
|
images: None,
|
|
483
|
+
pages: None,
|
|
465
484
|
})
|
|
466
485
|
}
|
|
467
486
|
|
|
@@ -373,6 +373,7 @@ mod tests {
|
|
|
373
373
|
detected_languages: None,
|
|
374
374
|
chunks: None,
|
|
375
375
|
images: None,
|
|
376
|
+
pages: None,
|
|
376
377
|
};
|
|
377
378
|
|
|
378
379
|
let config = ExtractionConfig::default();
|
|
@@ -422,6 +423,7 @@ mod tests {
|
|
|
422
423
|
detected_languages: None,
|
|
423
424
|
chunks: None,
|
|
424
425
|
images: None,
|
|
426
|
+
pages: None,
|
|
425
427
|
};
|
|
426
428
|
|
|
427
429
|
let config = ExtractionConfig::default();
|
|
@@ -488,6 +490,7 @@ mod tests {
|
|
|
488
490
|
detected_languages: None,
|
|
489
491
|
chunks: None,
|
|
490
492
|
images: None,
|
|
493
|
+
pages: None,
|
|
491
494
|
};
|
|
492
495
|
|
|
493
496
|
let config = ExtractionConfig::default();
|
|
@@ -513,6 +516,7 @@ mod tests {
|
|
|
513
516
|
additional,
|
|
514
517
|
..Default::default()
|
|
515
518
|
},
|
|
519
|
+
pages: None,
|
|
516
520
|
tables: vec![],
|
|
517
521
|
detected_languages: None,
|
|
518
522
|
chunks: None,
|
|
@@ -543,6 +547,7 @@ mod tests {
|
|
|
543
547
|
detected_languages: None,
|
|
544
548
|
chunks: None,
|
|
545
549
|
images: None,
|
|
550
|
+
pages: None,
|
|
546
551
|
};
|
|
547
552
|
|
|
548
553
|
assert_eq!(processor.estimated_duration_ms(&result), 0);
|
|
@@ -593,6 +598,7 @@ mod tests {
|
|
|
593
598
|
detected_languages: None,
|
|
594
599
|
chunks: None,
|
|
595
600
|
images: None,
|
|
601
|
+
pages: None,
|
|
596
602
|
};
|
|
597
603
|
|
|
598
604
|
let txt_result = ExtractionResult {
|
|
@@ -603,6 +609,7 @@ mod tests {
|
|
|
603
609
|
detected_languages: None,
|
|
604
610
|
chunks: None,
|
|
605
611
|
images: None,
|
|
612
|
+
pages: None,
|
|
606
613
|
};
|
|
607
614
|
|
|
608
615
|
assert!(processor.should_process(&pdf_result, &config));
|
|
@@ -631,6 +638,7 @@ mod tests {
|
|
|
631
638
|
detected_languages: None,
|
|
632
639
|
chunks: None,
|
|
633
640
|
images: None,
|
|
641
|
+
pages: None,
|
|
634
642
|
};
|
|
635
643
|
|
|
636
644
|
let config = ExtractionConfig::default();
|
|
@@ -661,6 +661,7 @@ mod tests {
|
|
|
661
661
|
detected_languages: None,
|
|
662
662
|
chunks: None,
|
|
663
663
|
images: None,
|
|
664
|
+
pages: None,
|
|
664
665
|
})
|
|
665
666
|
}
|
|
666
667
|
|
|
@@ -705,6 +706,7 @@ mod tests {
|
|
|
705
706
|
detected_languages: None,
|
|
706
707
|
chunks: None,
|
|
707
708
|
images: None,
|
|
709
|
+
pages: None,
|
|
708
710
|
})
|
|
709
711
|
}
|
|
710
712
|
|
|
@@ -489,6 +489,7 @@ mod tests {
|
|
|
489
489
|
detected_languages: None,
|
|
490
490
|
chunks: None,
|
|
491
491
|
images: None,
|
|
492
|
+
pages: None,
|
|
492
493
|
};
|
|
493
494
|
|
|
494
495
|
let config = ExtractionConfig::default();
|
|
@@ -507,6 +508,7 @@ mod tests {
|
|
|
507
508
|
detected_languages: None,
|
|
508
509
|
chunks: None,
|
|
509
510
|
images: None,
|
|
511
|
+
pages: None,
|
|
510
512
|
};
|
|
511
513
|
|
|
512
514
|
let config = ExtractionConfig::default();
|
|
@@ -527,6 +529,7 @@ mod tests {
|
|
|
527
529
|
detected_languages: None,
|
|
528
530
|
chunks: None,
|
|
529
531
|
images: None,
|
|
532
|
+
pages: None,
|
|
530
533
|
};
|
|
531
534
|
|
|
532
535
|
let config = ExtractionConfig::default();
|
|
@@ -562,6 +565,7 @@ mod tests {
|
|
|
562
565
|
detected_languages: None,
|
|
563
566
|
chunks: None,
|
|
564
567
|
images: None,
|
|
568
|
+
pages: None,
|
|
565
569
|
};
|
|
566
570
|
|
|
567
571
|
let config = ExtractionConfig::default();
|
|
@@ -609,6 +613,7 @@ mod tests {
|
|
|
609
613
|
detected_languages: None,
|
|
610
614
|
chunks: None,
|
|
611
615
|
images: None,
|
|
616
|
+
pages: None,
|
|
612
617
|
};
|
|
613
618
|
|
|
614
619
|
let txt_result = ExtractionResult {
|
|
@@ -619,6 +624,7 @@ mod tests {
|
|
|
619
624
|
detected_languages: None,
|
|
620
625
|
chunks: None,
|
|
621
626
|
images: None,
|
|
627
|
+
pages: None,
|
|
622
628
|
};
|
|
623
629
|
|
|
624
630
|
assert!(validator.should_validate(&pdf_result, &config));
|
|
@@ -702,6 +708,7 @@ mod tests {
|
|
|
702
708
|
detected_languages: None,
|
|
703
709
|
chunks: None,
|
|
704
710
|
images: None,
|
|
711
|
+
pages: None,
|
|
705
712
|
};
|
|
706
713
|
|
|
707
714
|
let config = ExtractionConfig::default();
|
|
@@ -729,6 +736,7 @@ mod tests {
|
|
|
729
736
|
additional,
|
|
730
737
|
..Default::default()
|
|
731
738
|
},
|
|
739
|
+
pages: None,
|
|
732
740
|
tables: vec![],
|
|
733
741
|
detected_languages: None,
|
|
734
742
|
chunks: None,
|
|
@@ -759,6 +767,7 @@ mod tests {
|
|
|
759
767
|
detected_languages: None,
|
|
760
768
|
chunks: None,
|
|
761
769
|
images: None,
|
|
770
|
+
pages: None,
|
|
762
771
|
};
|
|
763
772
|
|
|
764
773
|
let config = ExtractionConfig::default();
|
|
@@ -787,6 +796,7 @@ mod tests {
|
|
|
787
796
|
detected_languages: None,
|
|
788
797
|
chunks: None,
|
|
789
798
|
images: None,
|
|
799
|
+
pages: None,
|
|
790
800
|
};
|
|
791
801
|
|
|
792
802
|
assert!(validator.validate(&result, &config).await.is_ok());
|
|
@@ -805,6 +815,7 @@ mod tests {
|
|
|
805
815
|
detected_languages: None,
|
|
806
816
|
chunks: None,
|
|
807
817
|
images: None,
|
|
818
|
+
pages: None,
|
|
808
819
|
};
|
|
809
820
|
|
|
810
821
|
let config = ExtractionConfig::default();
|
|
@@ -7,9 +7,15 @@ pub mod string_utils;
|
|
|
7
7
|
#[cfg(feature = "quality")]
|
|
8
8
|
pub mod token_reduction;
|
|
9
9
|
|
|
10
|
+
#[cfg(feature = "quality")]
|
|
11
|
+
pub mod quality_processor;
|
|
12
|
+
|
|
10
13
|
#[cfg(feature = "quality")]
|
|
11
14
|
pub use quality::{calculate_quality_score, clean_extracted_text, normalize_spaces};
|
|
12
15
|
|
|
16
|
+
#[cfg(feature = "quality")]
|
|
17
|
+
pub use quality_processor::QualityProcessor;
|
|
18
|
+
|
|
13
19
|
#[cfg(feature = "quality")]
|
|
14
20
|
pub use string_utils::{calculate_text_confidence, fix_mojibake, get_encoding_cache_key, safe_decode};
|
|
15
21
|
|