kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -3
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +25 -11
- data/vendor/kreuzberg/README.md +13 -8
- data/vendor/kreuzberg/build.rs +17 -6
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
- data/vendor/kreuzberg/src/mcp/server.rs +14 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/bin/release.sh +9 -8
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +11 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
data/vendor/kreuzberg/src/language_detection/mod.rs
CHANGED
|
@@ -4,8 +4,13 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::Result;
|
|
6
6
|
use crate::core::config::LanguageDetectionConfig;
|
|
7
|
+
use once_cell::sync::Lazy;
|
|
8
|
+
use std::sync::Arc;
|
|
7
9
|
use whatlang::{Lang, detect};
|
|
8
10
|
|
|
11
|
+
pub mod processor;
|
|
12
|
+
pub use processor::LanguageDetector;
|
|
13
|
+
|
|
9
14
|
/// Detect languages in text using whatlang.
|
|
10
15
|
///
|
|
11
16
|
/// Returns a list of detected language codes (ISO 639-3 format).
|
|
@@ -940,3 +945,41 @@ mod tests {
|
|
|
940
945
|
assert_eq!(langs[0], "eng");
|
|
941
946
|
}
|
|
942
947
|
}
|
|
948
|
+
|
|
949
|
+
/// Lazy-initialized flag that ensures language detection processor is registered exactly once.
|
|
950
|
+
///
|
|
951
|
+
/// This static is accessed on first use to automatically register the
|
|
952
|
+
/// language detection processor with the plugin registry.
|
|
953
|
+
static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_language_detection_processor);
|
|
954
|
+
|
|
955
|
+
/// Ensure the language detection processor is registered.
|
|
956
|
+
///
|
|
957
|
+
/// This function is called automatically when needed.
|
|
958
|
+
/// It's safe to call multiple times - registration only happens once.
|
|
959
|
+
pub fn ensure_initialized() -> Result<()> {
|
|
960
|
+
PROCESSOR_INITIALIZED
|
|
961
|
+
.as_ref()
|
|
962
|
+
.map(|_| ())
|
|
963
|
+
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
964
|
+
message: format!("Failed to register language detection processor: {}", e),
|
|
965
|
+
plugin_name: "language-detection".to_string(),
|
|
966
|
+
})
|
|
967
|
+
}
|
|
968
|
+
|
|
969
|
+
/// Register the language detection processor with the global registry.
|
|
970
|
+
///
|
|
971
|
+
/// This function should be called once at application startup to register
|
|
972
|
+
/// the language detection post-processor.
|
|
973
|
+
///
|
|
974
|
+
/// **Note:** This is called automatically on first use.
|
|
975
|
+
/// Explicit calling is optional.
|
|
976
|
+
pub fn register_language_detection_processor() -> Result<()> {
|
|
977
|
+
let registry = crate::plugins::registry::get_post_processor_registry();
|
|
978
|
+
let mut registry = registry
|
|
979
|
+
.write()
|
|
980
|
+
.map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
|
|
981
|
+
|
|
982
|
+
registry.register(Arc::new(LanguageDetector), 40)?;
|
|
983
|
+
|
|
984
|
+
Ok(())
|
|
985
|
+
}
|
|
data/vendor/kreuzberg/src/language_detection/processor.rs
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
//! Language detection post-processor.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a PostProcessor plugin that detects languages in
|
|
4
|
+
//! extraction results and stores them in the result.
|
|
5
|
+
|
|
6
|
+
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
7
|
+
use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
|
|
8
|
+
use async_trait::async_trait;
|
|
9
|
+
|
|
10
|
+
/// Post-processor that detects languages in document content.
|
|
11
|
+
///
|
|
12
|
+
/// This processor:
|
|
13
|
+
/// - Runs in the Early processing stage
|
|
14
|
+
/// - Only processes when `config.language_detection` is configured
|
|
15
|
+
/// - Stores detected languages in `result.detected_languages`
|
|
16
|
+
/// - Uses the whatlang library for detection
|
|
17
|
+
///
|
|
18
|
+
/// # Example
|
|
19
|
+
///
|
|
20
|
+
/// ```rust,no_run
|
|
21
|
+
/// use kreuzberg::plugins::{Plugin, PostProcessor};
|
|
22
|
+
/// use kreuzberg::language_detection::processor::LanguageDetector;
|
|
23
|
+
///
|
|
24
|
+
/// let processor = LanguageDetector;
|
|
25
|
+
/// assert_eq!(processor.name(), "language-detection");
|
|
26
|
+
/// ```
|
|
27
|
+
#[derive(Debug, Clone, Copy)]
|
|
28
|
+
pub struct LanguageDetector;
|
|
29
|
+
|
|
30
|
+
impl Plugin for LanguageDetector {
|
|
31
|
+
fn name(&self) -> &str {
|
|
32
|
+
"language-detection"
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
fn version(&self) -> String {
|
|
36
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
fn initialize(&self) -> Result<()> {
|
|
40
|
+
Ok(())
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
fn shutdown(&self) -> Result<()> {
|
|
44
|
+
Ok(())
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
49
|
+
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
50
|
+
impl PostProcessor for LanguageDetector {
|
|
51
|
+
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
52
|
+
let lang_config = match &config.language_detection {
|
|
53
|
+
Some(cfg) => cfg,
|
|
54
|
+
None => return Ok(()),
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
match super::detect_languages(&result.content, lang_config)
|
|
58
|
+
.map_err(|e| KreuzbergError::Other(format!("Language detection failed: {}", e)))?
|
|
59
|
+
{
|
|
60
|
+
Some(languages) => {
|
|
61
|
+
result.detected_languages = Some(languages);
|
|
62
|
+
}
|
|
63
|
+
None => {
|
|
64
|
+
result.detected_languages = None;
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
Ok(())
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
72
|
+
ProcessingStage::Early
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
76
|
+
config.language_detection.is_some()
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
|
|
80
|
+
let text_length = result.content.len();
|
|
81
|
+
// Language detection is relatively fast: ~1ms per 1KB
|
|
82
|
+
(text_length / 1024).max(1) as u64
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
#[cfg(test)]
|
|
87
|
+
mod tests {
|
|
88
|
+
use super::*;
|
|
89
|
+
use crate::core::config::LanguageDetectionConfig;
|
|
90
|
+
use crate::types::Metadata;
|
|
91
|
+
|
|
92
|
+
#[tokio::test]
|
|
93
|
+
async fn test_language_detector_processor() {
|
|
94
|
+
let processor = LanguageDetector;
|
|
95
|
+
let config = ExtractionConfig {
|
|
96
|
+
language_detection: Some(LanguageDetectionConfig {
|
|
97
|
+
enabled: true,
|
|
98
|
+
min_confidence: 0.8,
|
|
99
|
+
detect_multiple: false,
|
|
100
|
+
}),
|
|
101
|
+
..Default::default()
|
|
102
|
+
};
|
|
103
|
+
|
|
104
|
+
let mut result = ExtractionResult {
|
|
105
|
+
content: "Hello world! This is a test of the language detection system.".to_string(),
|
|
106
|
+
mime_type: "text/plain".to_string(),
|
|
107
|
+
metadata: Metadata::default(),
|
|
108
|
+
tables: vec![],
|
|
109
|
+
detected_languages: None,
|
|
110
|
+
chunks: None,
|
|
111
|
+
images: None,
|
|
112
|
+
pages: None,
|
|
113
|
+
};
|
|
114
|
+
|
|
115
|
+
processor.process(&mut result, &config).await.unwrap();
|
|
116
|
+
|
|
117
|
+
assert!(result.detected_languages.is_some());
|
|
118
|
+
let langs = result.detected_languages.unwrap();
|
|
119
|
+
assert!(!langs.is_empty());
|
|
120
|
+
assert_eq!(langs[0], "eng");
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
#[tokio::test]
|
|
124
|
+
async fn test_language_detector_no_config() {
|
|
125
|
+
let processor = LanguageDetector;
|
|
126
|
+
let config = ExtractionConfig::default();
|
|
127
|
+
|
|
128
|
+
let mut result = ExtractionResult {
|
|
129
|
+
content: "Hello world!".to_string(),
|
|
130
|
+
mime_type: "text/plain".to_string(),
|
|
131
|
+
metadata: Metadata::default(),
|
|
132
|
+
tables: vec![],
|
|
133
|
+
detected_languages: None,
|
|
134
|
+
chunks: None,
|
|
135
|
+
images: None,
|
|
136
|
+
pages: None,
|
|
137
|
+
};
|
|
138
|
+
|
|
139
|
+
processor.process(&mut result, &config).await.unwrap();
|
|
140
|
+
|
|
141
|
+
assert!(result.detected_languages.is_none());
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
#[test]
|
|
145
|
+
fn test_language_detector_plugin_interface() {
|
|
146
|
+
let processor = LanguageDetector;
|
|
147
|
+
assert_eq!(processor.name(), "language-detection");
|
|
148
|
+
assert!(!processor.version().is_empty());
|
|
149
|
+
assert!(processor.initialize().is_ok());
|
|
150
|
+
assert!(processor.shutdown().is_ok());
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
#[test]
|
|
154
|
+
fn test_language_detector_stage() {
|
|
155
|
+
let processor = LanguageDetector;
|
|
156
|
+
assert_eq!(processor.processing_stage(), ProcessingStage::Early);
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
#[test]
|
|
160
|
+
fn test_language_detector_should_process() {
|
|
161
|
+
let processor = LanguageDetector;
|
|
162
|
+
|
|
163
|
+
let result = ExtractionResult {
|
|
164
|
+
content: "Sample text".to_string(),
|
|
165
|
+
mime_type: "text/plain".to_string(),
|
|
166
|
+
metadata: Metadata::default(),
|
|
167
|
+
tables: vec![],
|
|
168
|
+
detected_languages: None,
|
|
169
|
+
chunks: None,
|
|
170
|
+
images: None,
|
|
171
|
+
pages: None,
|
|
172
|
+
};
|
|
173
|
+
|
|
174
|
+
let config_with_lang = ExtractionConfig {
|
|
175
|
+
language_detection: Some(LanguageDetectionConfig {
|
|
176
|
+
enabled: true,
|
|
177
|
+
min_confidence: 0.8,
|
|
178
|
+
detect_multiple: false,
|
|
179
|
+
}),
|
|
180
|
+
..Default::default()
|
|
181
|
+
};
|
|
182
|
+
assert!(processor.should_process(&result, &config_with_lang));
|
|
183
|
+
|
|
184
|
+
let config_without_lang = ExtractionConfig::default();
|
|
185
|
+
assert!(!processor.should_process(&result, &config_without_lang));
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
#[test]
|
|
189
|
+
fn test_language_detector_estimated_duration() {
|
|
190
|
+
let processor = LanguageDetector;
|
|
191
|
+
|
|
192
|
+
let short_result = ExtractionResult {
|
|
193
|
+
content: "Short".to_string(),
|
|
194
|
+
mime_type: "text/plain".to_string(),
|
|
195
|
+
metadata: Metadata::default(),
|
|
196
|
+
tables: vec![],
|
|
197
|
+
detected_languages: None,
|
|
198
|
+
chunks: None,
|
|
199
|
+
images: None,
|
|
200
|
+
pages: None,
|
|
201
|
+
};
|
|
202
|
+
|
|
203
|
+
let long_result = ExtractionResult {
|
|
204
|
+
content: "a".repeat(10000),
|
|
205
|
+
mime_type: "text/plain".to_string(),
|
|
206
|
+
metadata: Metadata::default(),
|
|
207
|
+
tables: vec![],
|
|
208
|
+
detected_languages: None,
|
|
209
|
+
chunks: None,
|
|
210
|
+
images: None,
|
|
211
|
+
pages: None,
|
|
212
|
+
};
|
|
213
|
+
|
|
214
|
+
let short_duration = processor.estimated_duration_ms(&short_result);
|
|
215
|
+
let long_duration = processor.estimated_duration_ms(&long_result);
|
|
216
|
+
|
|
217
|
+
assert!(long_duration > short_duration);
|
|
218
|
+
}
|
|
219
|
+
}
|
data/vendor/kreuzberg/src/lib.rs
CHANGED
|
@@ -84,13 +84,21 @@ pub use types::*;
|
|
|
84
84
|
pub use core::extractor::{batch_extract_bytes, batch_extract_file};
|
|
85
85
|
pub use core::extractor::{extract_bytes, extract_file};
|
|
86
86
|
|
|
87
|
-
|
|
87
|
+
// Available in WASM (bytes-based)
|
|
88
|
+
pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
|
|
89
|
+
|
|
90
|
+
// Only available with filesystem access
|
|
91
|
+
#[cfg(feature = "tokio-runtime")]
|
|
92
|
+
pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
|
|
88
93
|
|
|
89
94
|
pub use core::config::{
|
|
90
95
|
ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
|
|
91
|
-
LanguageDetectionConfig, OcrConfig,
|
|
96
|
+
LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
|
|
92
97
|
};
|
|
93
98
|
|
|
99
|
+
#[cfg(feature = "pdf")]
|
|
100
|
+
pub use core::config::PdfConfig;
|
|
101
|
+
|
|
94
102
|
pub use core::mime::{
|
|
95
103
|
DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,
|
|
96
104
|
PLAIN_TEXT_MIME_TYPE, POWER_POINT_MIME_TYPE, XML_MIME_TYPE, detect_mime_type, detect_mime_type_from_bytes,
|
|
data/vendor/kreuzberg/src/mcp/server.rs
CHANGED
|
@@ -750,6 +750,7 @@ mod tests {
|
|
|
750
750
|
detected_languages: None,
|
|
751
751
|
chunks: None,
|
|
752
752
|
images: None,
|
|
753
|
+
pages: None,
|
|
753
754
|
};
|
|
754
755
|
|
|
755
756
|
let formatted = format_extraction_result(&result);
|
|
@@ -786,6 +787,7 @@ mod tests {
|
|
|
786
787
|
detected_languages: None,
|
|
787
788
|
chunks: None,
|
|
788
789
|
images: None,
|
|
790
|
+
pages: None,
|
|
789
791
|
};
|
|
790
792
|
|
|
791
793
|
let formatted = format_extraction_result(&result);
|
|
@@ -807,6 +809,7 @@ mod tests {
|
|
|
807
809
|
detected_languages: None,
|
|
808
810
|
chunks: None,
|
|
809
811
|
images: None,
|
|
812
|
+
pages: None,
|
|
810
813
|
};
|
|
811
814
|
|
|
812
815
|
let formatted = format_extraction_result(&result);
|
|
@@ -825,6 +828,7 @@ mod tests {
|
|
|
825
828
|
detected_languages: None,
|
|
826
829
|
chunks: None,
|
|
827
830
|
images: None,
|
|
831
|
+
pages: None,
|
|
828
832
|
};
|
|
829
833
|
|
|
830
834
|
let formatted = format_extraction_result(&result);
|
|
@@ -1622,19 +1626,17 @@ mod tests {
|
|
|
1622
1626
|
|
|
1623
1627
|
let result = server.batch_extract_files(Parameters(params)).await;
|
|
1624
1628
|
|
|
1625
|
-
if
|
|
1626
|
-
let
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
assert!(text.text.contains("Document 2"));
|
|
1629
|
+
if let Ok(call_result) = result
|
|
1630
|
+
&& let Some(content) = call_result.content.first()
|
|
1631
|
+
&& let RawContent::Text(text) = &content.raw
|
|
1632
|
+
{
|
|
1633
|
+
assert!(text.text.contains("Document 1"));
|
|
1634
|
+
assert!(text.text.contains("Document 2"));
|
|
1632
1635
|
|
|
1633
|
-
|
|
1634
|
-
|
|
1635
|
-
|
|
1636
|
-
|
|
1637
|
-
}
|
|
1636
|
+
let doc1_pos = text.text.find("Document 1");
|
|
1637
|
+
let doc2_pos = text.text.find("Document 2");
|
|
1638
|
+
if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
|
|
1639
|
+
assert!(pos1 < pos2, "Documents should be in order");
|
|
1638
1640
|
}
|
|
1639
1641
|
}
|
|
1640
1642
|
}
|
|
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs
CHANGED
|
@@ -161,6 +161,7 @@ impl OcrBackend for TesseractBackend {
|
|
|
161
161
|
content: ocr_result.content,
|
|
162
162
|
mime_type: ocr_result.mime_type,
|
|
163
163
|
metadata,
|
|
164
|
+
pages: None,
|
|
164
165
|
tables: ocr_result
|
|
165
166
|
.tables
|
|
166
167
|
.into_iter()
|
|
@@ -214,6 +215,7 @@ impl OcrBackend for TesseractBackend {
|
|
|
214
215
|
content: ocr_result.content,
|
|
215
216
|
mime_type: ocr_result.mime_type,
|
|
216
217
|
metadata,
|
|
218
|
+
pages: None,
|
|
217
219
|
tables: ocr_result
|
|
218
220
|
.tables
|
|
219
221
|
.into_iter()
|
|
data/vendor/kreuzberg/src/pdf/error.rs
CHANGED
|
@@ -10,6 +10,7 @@ pub enum PdfError {
|
|
|
10
10
|
TextExtractionFailed(String),
|
|
11
11
|
RenderingFailed(String),
|
|
12
12
|
MetadataExtractionFailed(String),
|
|
13
|
+
ExtractionFailed(String),
|
|
13
14
|
IOError(String),
|
|
14
15
|
}
|
|
15
16
|
|
|
@@ -28,6 +29,7 @@ impl fmt::Display for PdfError {
|
|
|
28
29
|
PdfError::MetadataExtractionFailed(msg) => {
|
|
29
30
|
write!(f, "Metadata extraction failed: {}", msg)
|
|
30
31
|
}
|
|
32
|
+
PdfError::ExtractionFailed(msg) => write!(f, "Extraction failed: {}", msg),
|
|
31
33
|
PdfError::IOError(msg) => write!(f, "I/O error: {}", msg),
|
|
32
34
|
}
|
|
33
35
|
}
|
|
@@ -119,4 +121,10 @@ mod tests {
|
|
|
119
121
|
let err2 = err1.clone();
|
|
120
122
|
assert_eq!(err1.to_string(), err2.to_string());
|
|
121
123
|
}
|
|
124
|
+
|
|
125
|
+
#[test]
|
|
126
|
+
fn test_extraction_failed_error() {
|
|
127
|
+
let err = PdfError::ExtractionFailed("page data mismatch".to_string());
|
|
128
|
+
assert_eq!(err.to_string(), "Extraction failed: page data mismatch");
|
|
129
|
+
}
|
|
122
130
|
}
|