kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
|
@@ -8,7 +8,9 @@ use crate::core::config::OcrConfig;
|
|
|
8
8
|
use crate::ocr::processor::OcrProcessor;
|
|
9
9
|
use crate::plugins::{OcrBackend, OcrBackendType, Plugin};
|
|
10
10
|
use crate::types::ExtractionResult;
|
|
11
|
+
use ahash::AHashMap;
|
|
11
12
|
use async_trait::async_trait;
|
|
13
|
+
use std::borrow::Cow;
|
|
12
14
|
use std::path::Path;
|
|
13
15
|
use std::sync::{Arc, OnceLock};
|
|
14
16
|
|
|
@@ -196,9 +198,23 @@ impl OcrBackend for TesseractBackend {
|
|
|
196
198
|
source: Some(Box::new(e)),
|
|
197
199
|
})?;
|
|
198
200
|
|
|
201
|
+
// Use resolved language from OCR result metadata (handles "all"/"*" resolution)
|
|
202
|
+
let resolved_language = ocr_result
|
|
203
|
+
.metadata
|
|
204
|
+
.get("language")
|
|
205
|
+
.and_then(|v| v.as_str())
|
|
206
|
+
.unwrap_or(&tess_config.language)
|
|
207
|
+
.to_string();
|
|
208
|
+
|
|
209
|
+
// Convert HashMap<String, Value> to AHashMap<Cow<'static, str>, Value>
|
|
210
|
+
let mut additional = AHashMap::new();
|
|
211
|
+
for (key, value) in ocr_result.metadata {
|
|
212
|
+
additional.insert(Cow::Owned(key), value);
|
|
213
|
+
}
|
|
214
|
+
|
|
199
215
|
let metadata = crate::types::Metadata {
|
|
200
216
|
format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
|
|
201
|
-
language:
|
|
217
|
+
language: resolved_language,
|
|
202
218
|
psm: tess_config.psm as i32,
|
|
203
219
|
output_format: tess_config.output_format.clone(),
|
|
204
220
|
table_count: ocr_result.tables.len(),
|
|
@@ -208,13 +224,13 @@ impl OcrBackend for TesseractBackend {
|
|
|
208
224
|
.first()
|
|
209
225
|
.and_then(|t| t.cells.first().map(|row| row.len())),
|
|
210
226
|
})),
|
|
211
|
-
additional
|
|
227
|
+
additional,
|
|
212
228
|
..Default::default()
|
|
213
229
|
};
|
|
214
230
|
|
|
215
231
|
Ok(ExtractionResult {
|
|
216
232
|
content: ocr_result.content,
|
|
217
|
-
mime_type: ocr_result.mime_type,
|
|
233
|
+
mime_type: ocr_result.mime_type.into(),
|
|
218
234
|
metadata,
|
|
219
235
|
pages: None,
|
|
220
236
|
tables: ocr_result
|
|
@@ -256,9 +272,23 @@ impl OcrBackend for TesseractBackend {
|
|
|
256
272
|
source: Some(Box::new(e)),
|
|
257
273
|
})?;
|
|
258
274
|
|
|
275
|
+
// Use resolved language from OCR result metadata (handles "all"/"*" resolution)
|
|
276
|
+
let resolved_language = ocr_result
|
|
277
|
+
.metadata
|
|
278
|
+
.get("language")
|
|
279
|
+
.and_then(|v| v.as_str())
|
|
280
|
+
.unwrap_or(&tess_config.language)
|
|
281
|
+
.to_string();
|
|
282
|
+
|
|
283
|
+
// Convert HashMap<String, Value> to AHashMap<Cow<'static, str>, Value>
|
|
284
|
+
let mut additional = AHashMap::new();
|
|
285
|
+
for (key, value) in ocr_result.metadata {
|
|
286
|
+
additional.insert(Cow::Owned(key), value);
|
|
287
|
+
}
|
|
288
|
+
|
|
259
289
|
let metadata = crate::types::Metadata {
|
|
260
290
|
format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
|
|
261
|
-
language:
|
|
291
|
+
language: resolved_language,
|
|
262
292
|
psm: tess_config.psm as i32,
|
|
263
293
|
output_format: tess_config.output_format.clone(),
|
|
264
294
|
table_count: ocr_result.tables.len(),
|
|
@@ -268,13 +298,13 @@ impl OcrBackend for TesseractBackend {
|
|
|
268
298
|
.first()
|
|
269
299
|
.and_then(|t| t.cells.first().map(|row| row.len())),
|
|
270
300
|
})),
|
|
271
|
-
additional
|
|
301
|
+
additional,
|
|
272
302
|
..Default::default()
|
|
273
303
|
};
|
|
274
304
|
|
|
275
305
|
Ok(ExtractionResult {
|
|
276
306
|
content: ocr_result.content,
|
|
277
|
-
mime_type: ocr_result.mime_type,
|
|
307
|
+
mime_type: ocr_result.mime_type.into(),
|
|
278
308
|
metadata,
|
|
279
309
|
pages: None,
|
|
280
310
|
tables: ocr_result
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
use serde::{Deserialize, Serialize};
|
|
2
|
-
use std::collections::HashMap;
|
|
3
2
|
|
|
4
3
|
pub use crate::types::ImagePreprocessingConfig;
|
|
5
4
|
|
|
@@ -154,7 +153,7 @@ impl From<&crate::types::TesseractConfig> for TesseractConfig {
|
|
|
154
153
|
pub struct ExtractionResult {
|
|
155
154
|
pub content: String,
|
|
156
155
|
pub mime_type: String,
|
|
157
|
-
pub metadata: HashMap<String, serde_json::Value>,
|
|
156
|
+
pub metadata: std::collections::HashMap<String, serde_json::Value>,
|
|
158
157
|
pub tables: Vec<Table>,
|
|
159
158
|
}
|
|
160
159
|
|
|
@@ -260,7 +259,7 @@ mod tests {
|
|
|
260
259
|
|
|
261
260
|
#[test]
|
|
262
261
|
fn test_extraction_result_creation() {
|
|
263
|
-
let mut metadata = HashMap::new();
|
|
262
|
+
let mut metadata = std::collections::HashMap::new();
|
|
264
263
|
metadata.insert("key".to_string(), serde_json::json!("value"));
|
|
265
264
|
|
|
266
265
|
let table = Table {
|
|
@@ -308,7 +307,7 @@ mod tests {
|
|
|
308
307
|
let result = crate::types::OcrExtractionResult {
|
|
309
308
|
content: "content".to_string(),
|
|
310
309
|
mime_type: "text/plain".to_string(),
|
|
311
|
-
metadata: HashMap::new(),
|
|
310
|
+
metadata: std::collections::HashMap::new(),
|
|
312
311
|
tables: vec![],
|
|
313
312
|
};
|
|
314
313
|
|
|
@@ -131,6 +131,12 @@ lazy_static::lazy_static! {
|
|
|
131
131
|
}
|
|
132
132
|
|
|
133
133
|
pub fn validate_language_code(lang_code: &str) -> Result<(), OcrError> {
|
|
134
|
+
// Accept "all" and "*" as special values to auto-detect installed languages
|
|
135
|
+
let lower = lang_code.to_ascii_lowercase();
|
|
136
|
+
if lower == "all" || lower == "*" {
|
|
137
|
+
return Ok(());
|
|
138
|
+
}
|
|
139
|
+
|
|
134
140
|
for code in lang_code.split('+') {
|
|
135
141
|
if !TESSERACT_SUPPORTED_LANGUAGE_CODES.contains(code) {
|
|
136
142
|
return Err(OcrError::InvalidLanguageCode(format!(
|
|
@@ -156,6 +162,14 @@ pub fn validate_tesseract_version(version: u32) -> Result<(), OcrError> {
|
|
|
156
162
|
mod tests {
|
|
157
163
|
use super::*;
|
|
158
164
|
|
|
165
|
+
#[test]
|
|
166
|
+
fn test_validate_language_code_all_keyword() {
|
|
167
|
+
assert!(validate_language_code("all").is_ok());
|
|
168
|
+
assert!(validate_language_code("*").is_ok());
|
|
169
|
+
assert!(validate_language_code("ALL").is_ok());
|
|
170
|
+
assert!(validate_language_code("All").is_ok());
|
|
171
|
+
}
|
|
172
|
+
|
|
159
173
|
#[test]
|
|
160
174
|
fn test_validate_language_code_valid() {
|
|
161
175
|
assert!(validate_language_code("eng").is_ok());
|
|
@@ -10,6 +10,7 @@ use serde::{Deserialize, Serialize};
|
|
|
10
10
|
/// `Metadata` structure. Common fields like title, authors, keywords, and dates
|
|
11
11
|
/// are now at the `Metadata` level.
|
|
12
12
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
13
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
13
14
|
pub struct PdfMetadata {
|
|
14
15
|
/// PDF version (e.g., "1.7", "2.0")
|
|
15
16
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
@@ -19,6 +19,7 @@ mod tests {
|
|
|
19
19
|
use crate::plugins::Plugin;
|
|
20
20
|
use crate::types::ExtractionResult;
|
|
21
21
|
use async_trait::async_trait;
|
|
22
|
+
use std::borrow::Cow;
|
|
22
23
|
|
|
23
24
|
struct MockExtractor {
|
|
24
25
|
mime_types: Vec<&'static str>,
|
|
@@ -53,7 +54,7 @@ mod tests {
|
|
|
53
54
|
) -> Result<ExtractionResult> {
|
|
54
55
|
Ok(ExtractionResult {
|
|
55
56
|
content: String::from_utf8_lossy(content).to_string(),
|
|
56
|
-
mime_type: mime_type.to_string(),
|
|
57
|
+
mime_type: mime_type.to_string().into(),
|
|
57
58
|
metadata: crate::types::Metadata::default(),
|
|
58
59
|
tables: vec![],
|
|
59
60
|
detected_languages: None,
|
|
@@ -228,7 +229,7 @@ mod tests {
|
|
|
228
229
|
) -> Result<ExtractionResult> {
|
|
229
230
|
Ok(ExtractionResult {
|
|
230
231
|
content: String::new(),
|
|
231
|
-
mime_type:
|
|
232
|
+
mime_type: Cow::Borrowed(""),
|
|
232
233
|
metadata: crate::types::Metadata::default(),
|
|
233
234
|
tables: vec![],
|
|
234
235
|
detected_languages: None,
|
|
@@ -50,7 +50,7 @@ use std::sync::Arc;
|
|
|
50
50
|
/// -> Result<ExtractionResult> {
|
|
51
51
|
/// Ok(ExtractionResult {
|
|
52
52
|
/// content: String::from_utf8_lossy(content).to_string(),
|
|
53
|
-
/// mime_type: mime_type.to_string(),
|
|
53
|
+
/// mime_type: mime_type.to_string().into(),
|
|
54
54
|
/// metadata: Metadata::default(),
|
|
55
55
|
/// tables: vec![],
|
|
56
56
|
/// detected_languages: None,
|
|
@@ -189,6 +189,7 @@ mod tests {
|
|
|
189
189
|
use crate::types::ExtractionResult;
|
|
190
190
|
use async_trait::async_trait;
|
|
191
191
|
use serial_test::serial;
|
|
192
|
+
use std::borrow::Cow;
|
|
192
193
|
|
|
193
194
|
struct MockExtractor {
|
|
194
195
|
mime_types: Vec<&'static str>,
|
|
@@ -223,7 +224,7 @@ mod tests {
|
|
|
223
224
|
) -> Result<ExtractionResult> {
|
|
224
225
|
Ok(ExtractionResult {
|
|
225
226
|
content: String::from_utf8_lossy(content).to_string(),
|
|
226
|
-
mime_type: mime_type.to_string(),
|
|
227
|
+
mime_type: mime_type.to_string().into(),
|
|
227
228
|
metadata: crate::types::Metadata::default(),
|
|
228
229
|
tables: vec![],
|
|
229
230
|
detected_languages: None,
|
|
@@ -362,7 +363,7 @@ mod tests {
|
|
|
362
363
|
async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
363
364
|
Ok(ExtractionResult {
|
|
364
365
|
content: String::new(),
|
|
365
|
-
mime_type:
|
|
366
|
+
mime_type: Cow::Borrowed(""),
|
|
366
367
|
metadata: crate::types::Metadata::default(),
|
|
367
368
|
tables: vec![],
|
|
368
369
|
detected_languages: None,
|
|
@@ -410,7 +411,7 @@ mod tests {
|
|
|
410
411
|
async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
411
412
|
Ok(ExtractionResult {
|
|
412
413
|
content: String::new(),
|
|
413
|
-
mime_type:
|
|
414
|
+
mime_type: Cow::Borrowed(""),
|
|
414
415
|
metadata: crate::types::Metadata::default(),
|
|
415
416
|
tables: vec![],
|
|
416
417
|
detected_languages: None,
|
|
@@ -61,7 +61,7 @@ pub enum OcrBackendType {
|
|
|
61
61
|
/// // Implement OCR logic here
|
|
62
62
|
/// Ok(ExtractionResult {
|
|
63
63
|
/// content: "Extracted text".to_string(),
|
|
64
|
-
/// mime_type: "text/plain"
|
|
64
|
+
/// mime_type: Cow::Borrowed("text/plain"),
|
|
65
65
|
/// metadata: Metadata::default(),
|
|
66
66
|
/// tables: vec![],
|
|
67
67
|
/// detected_languages: None,
|
|
@@ -142,7 +142,7 @@ pub trait OcrBackend: Plugin {
|
|
|
142
142
|
///
|
|
143
143
|
/// Ok(ExtractionResult {
|
|
144
144
|
/// content: text,
|
|
145
|
-
/// mime_type: "text/plain"
|
|
145
|
+
/// mime_type: Cow::Borrowed("text/plain"),
|
|
146
146
|
/// metadata: Metadata::default(),
|
|
147
147
|
/// tables: vec![],
|
|
148
148
|
/// detected_languages: None,
|
|
@@ -315,7 +315,7 @@ pub trait OcrBackend: Plugin {
|
|
|
315
315
|
/// async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
|
|
316
316
|
/// Ok(ExtractionResult {
|
|
317
317
|
/// content: "text".to_string(),
|
|
318
|
-
/// mime_type: "text/plain"
|
|
318
|
+
/// mime_type: Cow::Borrowed("text/plain"),
|
|
319
319
|
/// metadata: Metadata::default(),
|
|
320
320
|
/// tables: vec![],
|
|
321
321
|
/// detected_languages: None,
|
|
@@ -450,6 +450,7 @@ pub fn clear_ocr_backends() -> crate::Result<()> {
|
|
|
450
450
|
#[cfg(test)]
|
|
451
451
|
mod tests {
|
|
452
452
|
use super::*;
|
|
453
|
+
use std::borrow::Cow;
|
|
453
454
|
|
|
454
455
|
struct MockOcrBackend {
|
|
455
456
|
languages: Vec<String>,
|
|
@@ -478,7 +479,7 @@ mod tests {
|
|
|
478
479
|
async fn process_image(&self, _image_bytes: &[u8], _config: &OcrConfig) -> Result<ExtractionResult> {
|
|
479
480
|
Ok(ExtractionResult {
|
|
480
481
|
content: "Mocked OCR text".to_string(),
|
|
481
|
-
mime_type: "text/plain"
|
|
482
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
482
483
|
metadata: crate::types::Metadata::default(),
|
|
483
484
|
tables: vec![],
|
|
484
485
|
detected_languages: None,
|
|
@@ -18,8 +18,9 @@ mod tests {
|
|
|
18
18
|
use crate::core::config::ExtractionConfig;
|
|
19
19
|
use crate::plugins::Plugin;
|
|
20
20
|
use crate::types::ExtractionResult;
|
|
21
|
+
use ahash::AHashMap;
|
|
21
22
|
use async_trait::async_trait;
|
|
22
|
-
use std::
|
|
23
|
+
use std::borrow::Cow;
|
|
23
24
|
|
|
24
25
|
struct MockPostProcessor {
|
|
25
26
|
stage: ProcessingStage,
|
|
@@ -49,7 +50,7 @@ mod tests {
|
|
|
49
50
|
result
|
|
50
51
|
.metadata
|
|
51
52
|
.additional
|
|
52
|
-
.insert("processed_by"
|
|
53
|
+
.insert(Cow::Borrowed("processed_by"), serde_json::json!(self.name()));
|
|
53
54
|
Ok(())
|
|
54
55
|
}
|
|
55
56
|
|
|
@@ -66,7 +67,7 @@ mod tests {
|
|
|
66
67
|
|
|
67
68
|
let mut result = ExtractionResult {
|
|
68
69
|
content: "test content".to_string(),
|
|
69
|
-
mime_type: "text/plain"
|
|
70
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
70
71
|
metadata: crate::types::Metadata::default(),
|
|
71
72
|
tables: vec![],
|
|
72
73
|
detected_languages: None,
|
|
@@ -118,7 +119,7 @@ mod tests {
|
|
|
118
119
|
|
|
119
120
|
let result = ExtractionResult {
|
|
120
121
|
content: "test".to_string(),
|
|
121
|
-
mime_type: "text/plain"
|
|
122
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
122
123
|
metadata: crate::types::Metadata::default(),
|
|
123
124
|
tables: vec![],
|
|
124
125
|
detected_languages: None,
|
|
@@ -187,7 +188,7 @@ mod tests {
|
|
|
187
188
|
|
|
188
189
|
let mut result = ExtractionResult {
|
|
189
190
|
content: String::new(),
|
|
190
|
-
mime_type: "text/plain"
|
|
191
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
191
192
|
metadata: crate::types::Metadata::default(),
|
|
192
193
|
tables: vec![],
|
|
193
194
|
detected_languages: None,
|
|
@@ -211,12 +212,12 @@ mod tests {
|
|
|
211
212
|
stage: ProcessingStage::Early,
|
|
212
213
|
};
|
|
213
214
|
|
|
214
|
-
let mut additional =
|
|
215
|
-
additional.insert("existing_key"
|
|
215
|
+
let mut additional = AHashMap::new();
|
|
216
|
+
additional.insert(Cow::Borrowed("existing_key"), serde_json::json!("existing_value"));
|
|
216
217
|
|
|
217
218
|
let mut result = ExtractionResult {
|
|
218
219
|
content: "test".to_string(),
|
|
219
|
-
mime_type: "text/plain"
|
|
220
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
220
221
|
metadata: crate::types::Metadata {
|
|
221
222
|
additional,
|
|
222
223
|
..Default::default()
|
|
@@ -248,7 +249,7 @@ mod tests {
|
|
|
248
249
|
|
|
249
250
|
let result = ExtractionResult {
|
|
250
251
|
content: "test".to_string(),
|
|
251
|
-
mime_type: "text/plain"
|
|
252
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
252
253
|
metadata: crate::types::Metadata::default(),
|
|
253
254
|
tables: vec![],
|
|
254
255
|
detected_languages: None,
|
|
@@ -301,7 +302,7 @@ mod tests {
|
|
|
301
302
|
|
|
302
303
|
let pdf_result = ExtractionResult {
|
|
303
304
|
content: "test".to_string(),
|
|
304
|
-
mime_type: "application/pdf"
|
|
305
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
305
306
|
metadata: crate::types::Metadata::default(),
|
|
306
307
|
tables: vec![],
|
|
307
308
|
detected_languages: None,
|
|
@@ -314,7 +315,7 @@ mod tests {
|
|
|
314
315
|
|
|
315
316
|
let txt_result = ExtractionResult {
|
|
316
317
|
content: "test".to_string(),
|
|
317
|
-
mime_type: "text/plain"
|
|
318
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
318
319
|
metadata: crate::types::Metadata::default(),
|
|
319
320
|
tables: vec![],
|
|
320
321
|
detected_languages: None,
|
|
@@ -345,7 +346,7 @@ mod tests {
|
|
|
345
346
|
|
|
346
347
|
let mut result = ExtractionResult {
|
|
347
348
|
content: "test".to_string(),
|
|
348
|
-
mime_type: "text/plain"
|
|
349
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
349
350
|
metadata: crate::types::Metadata::default(),
|
|
350
351
|
tables: vec![table],
|
|
351
352
|
detected_languages: None,
|
|
@@ -225,6 +225,7 @@ mod tests {
|
|
|
225
225
|
use crate::plugins::Plugin;
|
|
226
226
|
use crate::types::ExtractionResult;
|
|
227
227
|
use async_trait::async_trait;
|
|
228
|
+
use std::borrow::Cow;
|
|
228
229
|
|
|
229
230
|
struct MockExtractor {
|
|
230
231
|
name: String,
|
|
@@ -252,7 +253,7 @@ mod tests {
|
|
|
252
253
|
async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
253
254
|
Ok(ExtractionResult {
|
|
254
255
|
content: "test".to_string(),
|
|
255
|
-
mime_type: "text/plain"
|
|
256
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
256
257
|
metadata: crate::types::Metadata::default(),
|
|
257
258
|
tables: vec![],
|
|
258
259
|
detected_languages: None,
|
|
@@ -494,7 +495,7 @@ mod tests {
|
|
|
494
495
|
async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
495
496
|
Ok(ExtractionResult {
|
|
496
497
|
content: "test".to_string(),
|
|
497
|
-
mime_type: "text/plain"
|
|
498
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
498
499
|
metadata: crate::types::Metadata::default(),
|
|
499
500
|
tables: vec![],
|
|
500
501
|
detected_languages: None,
|
|
@@ -191,6 +191,7 @@ mod tests {
|
|
|
191
191
|
use crate::plugins::{OcrBackend, Plugin};
|
|
192
192
|
use crate::types::ExtractionResult;
|
|
193
193
|
use async_trait::async_trait;
|
|
194
|
+
use std::borrow::Cow;
|
|
194
195
|
|
|
195
196
|
struct MockOcrBackend {
|
|
196
197
|
name: String,
|
|
@@ -217,7 +218,7 @@ mod tests {
|
|
|
217
218
|
async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
|
|
218
219
|
Ok(ExtractionResult {
|
|
219
220
|
content: "test".to_string(),
|
|
220
|
-
mime_type: "text/plain"
|
|
221
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
221
222
|
metadata: crate::types::Metadata::default(),
|
|
222
223
|
tables: vec![],
|
|
223
224
|
detected_languages: None,
|
|
@@ -344,7 +345,7 @@ mod tests {
|
|
|
344
345
|
async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
|
|
345
346
|
Ok(ExtractionResult {
|
|
346
347
|
content: "test".to_string(),
|
|
347
|
-
mime_type: "text/plain"
|
|
348
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
348
349
|
metadata: crate::types::Metadata::default(),
|
|
349
350
|
tables: vec![],
|
|
350
351
|
detected_languages: None,
|
|
@@ -19,8 +19,9 @@ mod tests {
|
|
|
19
19
|
use crate::core::config::ExtractionConfig;
|
|
20
20
|
use crate::plugins::Plugin;
|
|
21
21
|
use crate::types::ExtractionResult;
|
|
22
|
+
use ahash::AHashMap;
|
|
22
23
|
use async_trait::async_trait;
|
|
23
|
-
use std::
|
|
24
|
+
use std::borrow::Cow;
|
|
24
25
|
|
|
25
26
|
struct MockValidator {
|
|
26
27
|
should_fail: bool,
|
|
@@ -61,7 +62,7 @@ mod tests {
|
|
|
61
62
|
|
|
62
63
|
let result = ExtractionResult {
|
|
63
64
|
content: "test content".to_string(),
|
|
64
|
-
mime_type: "text/plain"
|
|
65
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
65
66
|
metadata: crate::types::Metadata::default(),
|
|
66
67
|
tables: vec![],
|
|
67
68
|
detected_languages: None,
|
|
@@ -82,7 +83,7 @@ mod tests {
|
|
|
82
83
|
|
|
83
84
|
let result = ExtractionResult {
|
|
84
85
|
content: "test content".to_string(),
|
|
85
|
-
mime_type: "text/plain"
|
|
86
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
86
87
|
metadata: crate::types::Metadata::default(),
|
|
87
88
|
tables: vec![],
|
|
88
89
|
detected_languages: None,
|
|
@@ -105,7 +106,7 @@ mod tests {
|
|
|
105
106
|
|
|
106
107
|
let result = ExtractionResult {
|
|
107
108
|
content: "test".to_string(),
|
|
108
|
-
mime_type: "text/plain"
|
|
109
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
109
110
|
metadata: crate::types::Metadata::default(),
|
|
110
111
|
tables: vec![],
|
|
111
112
|
detected_languages: None,
|
|
@@ -143,7 +144,7 @@ mod tests {
|
|
|
143
144
|
|
|
144
145
|
let result = ExtractionResult {
|
|
145
146
|
content: String::new(),
|
|
146
|
-
mime_type: "text/plain"
|
|
147
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
147
148
|
metadata: crate::types::Metadata::default(),
|
|
148
149
|
tables: vec![],
|
|
149
150
|
detected_languages: None,
|
|
@@ -193,7 +194,7 @@ mod tests {
|
|
|
193
194
|
|
|
194
195
|
let pdf_result = ExtractionResult {
|
|
195
196
|
content: "test".to_string(),
|
|
196
|
-
mime_type: "application/pdf"
|
|
197
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
197
198
|
metadata: crate::types::Metadata::default(),
|
|
198
199
|
tables: vec![],
|
|
199
200
|
detected_languages: None,
|
|
@@ -206,7 +207,7 @@ mod tests {
|
|
|
206
207
|
|
|
207
208
|
let txt_result = ExtractionResult {
|
|
208
209
|
content: "test".to_string(),
|
|
209
|
-
mime_type: "text/plain"
|
|
210
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
210
211
|
metadata: crate::types::Metadata::default(),
|
|
211
212
|
tables: vec![],
|
|
212
213
|
detected_languages: None,
|
|
@@ -292,7 +293,7 @@ mod tests {
|
|
|
292
293
|
|
|
293
294
|
let result = ExtractionResult {
|
|
294
295
|
content: "test".to_string(),
|
|
295
|
-
mime_type: "text/plain"
|
|
296
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
296
297
|
metadata: crate::types::Metadata::default(),
|
|
297
298
|
tables: vec![],
|
|
298
299
|
detected_languages: None,
|
|
@@ -318,12 +319,12 @@ mod tests {
|
|
|
318
319
|
async fn test_validator_with_metadata() {
|
|
319
320
|
let validator = MockValidator { should_fail: false };
|
|
320
321
|
|
|
321
|
-
let mut additional =
|
|
322
|
-
additional.insert("quality_score"
|
|
322
|
+
let mut additional = AHashMap::new();
|
|
323
|
+
additional.insert(Cow::Borrowed("quality_score"), serde_json::json!(0.95));
|
|
323
324
|
|
|
324
325
|
let result = ExtractionResult {
|
|
325
326
|
content: "test".to_string(),
|
|
326
|
-
mime_type: "text/plain"
|
|
327
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
327
328
|
metadata: crate::types::Metadata {
|
|
328
329
|
additional,
|
|
329
330
|
..Default::default()
|
|
@@ -355,7 +356,7 @@ mod tests {
|
|
|
355
356
|
|
|
356
357
|
let result = ExtractionResult {
|
|
357
358
|
content: "test".to_string(),
|
|
358
|
-
mime_type: "text/plain"
|
|
359
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
359
360
|
metadata: crate::types::Metadata::default(),
|
|
360
361
|
tables: vec![table],
|
|
361
362
|
detected_languages: None,
|
|
@@ -386,7 +387,7 @@ mod tests {
|
|
|
386
387
|
for mime_type in mime_types {
|
|
387
388
|
let result = ExtractionResult {
|
|
388
389
|
content: "test".to_string(),
|
|
389
|
-
mime_type: mime_type
|
|
390
|
+
mime_type: Cow::Borrowed(mime_type),
|
|
390
391
|
metadata: crate::types::Metadata::default(),
|
|
391
392
|
tables: vec![],
|
|
392
393
|
detected_languages: None,
|
|
@@ -407,7 +408,7 @@ mod tests {
|
|
|
407
408
|
|
|
408
409
|
let result = ExtractionResult {
|
|
409
410
|
content: "test content ".repeat(10000),
|
|
410
|
-
mime_type: "text/plain"
|
|
411
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
411
412
|
metadata: crate::types::Metadata::default(),
|
|
412
413
|
tables: vec![],
|
|
413
414
|
detected_languages: None,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
+
use ahash::AHashMap;
|
|
1
2
|
use once_cell::sync::Lazy;
|
|
2
3
|
use regex::Regex;
|
|
3
4
|
use std::borrow::Cow;
|
|
4
|
-
use std::collections::HashMap;
|
|
5
5
|
|
|
6
6
|
use crate::utils::quality::{collapse_scattered_ascii, normalize_whitespace_ascii};
|
|
7
7
|
|
|
@@ -123,7 +123,7 @@ where
|
|
|
123
123
|
}
|
|
124
124
|
}
|
|
125
125
|
|
|
126
|
-
pub fn calculate_quality_score(text: &str, metadata: Option<&
|
|
126
|
+
pub fn calculate_quality_score(text: &str, metadata: Option<&AHashMap<Cow<'static, str>, serde_json::Value>>) -> f64 {
|
|
127
127
|
if text.is_empty() || text.trim().is_empty() {
|
|
128
128
|
return 0.0;
|
|
129
129
|
}
|
|
@@ -266,7 +266,7 @@ fn calculate_structure_bonus(text: &str) -> f64 {
|
|
|
266
266
|
}
|
|
267
267
|
|
|
268
268
|
#[inline]
|
|
269
|
-
fn calculate_metadata_bonus(metadata: &
|
|
269
|
+
fn calculate_metadata_bonus(metadata: &AHashMap<Cow<'static, str>, serde_json::Value>) -> f64 {
|
|
270
270
|
const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
|
|
271
271
|
|
|
272
272
|
let present_fields = IMPORTANT_FIELDS
|
|
@@ -491,9 +491,9 @@ mod tests {
|
|
|
491
491
|
#[test]
|
|
492
492
|
fn test_calculate_quality_score_with_metadata() {
|
|
493
493
|
let text = "This is a normal text with proper structure.";
|
|
494
|
-
let mut metadata =
|
|
495
|
-
metadata.insert("title"
|
|
496
|
-
metadata.insert("author"
|
|
494
|
+
let mut metadata: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
|
|
495
|
+
metadata.insert(Cow::Borrowed("title"), serde_json::json!("Test Title"));
|
|
496
|
+
metadata.insert(Cow::Borrowed("author"), serde_json::json!("Test Author"));
|
|
497
497
|
|
|
498
498
|
let score = calculate_quality_score(text, Some(&metadata));
|
|
499
499
|
assert!(score > 0.0);
|
|
@@ -558,19 +558,19 @@ mod tests {
|
|
|
558
558
|
|
|
559
559
|
#[test]
|
|
560
560
|
fn test_calculate_metadata_bonus_empty() {
|
|
561
|
-
let metadata =
|
|
561
|
+
let metadata: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
|
|
562
562
|
let bonus = calculate_metadata_bonus(&metadata);
|
|
563
563
|
assert_eq!(bonus, 0.0);
|
|
564
564
|
}
|
|
565
565
|
|
|
566
566
|
#[test]
|
|
567
567
|
fn test_calculate_metadata_bonus_full() {
|
|
568
|
-
let mut metadata =
|
|
569
|
-
metadata.insert("title"
|
|
570
|
-
metadata.insert("author"
|
|
571
|
-
metadata.insert("subject"
|
|
572
|
-
metadata.insert("description"
|
|
573
|
-
metadata.insert("keywords"
|
|
568
|
+
let mut metadata: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
|
|
569
|
+
metadata.insert(Cow::Borrowed("title"), serde_json::json!("Title"));
|
|
570
|
+
metadata.insert(Cow::Borrowed("author"), serde_json::json!("Author"));
|
|
571
|
+
metadata.insert(Cow::Borrowed("subject"), serde_json::json!("Subject"));
|
|
572
|
+
metadata.insert(Cow::Borrowed("description"), serde_json::json!("Description"));
|
|
573
|
+
metadata.insert(Cow::Borrowed("keywords"), serde_json::json!("Keywords"));
|
|
574
574
|
|
|
575
575
|
let bonus = calculate_metadata_bonus(&metadata);
|
|
576
576
|
assert_eq!(bonus, 1.0);
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
16
16
|
use crate::{ExtractionConfig, ExtractionResult, Result};
|
|
17
17
|
use async_trait::async_trait;
|
|
18
|
+
use std::borrow::Cow;
|
|
18
19
|
|
|
19
20
|
/// Post-processor that calculates quality score and cleans text.
|
|
20
21
|
///
|
|
@@ -65,7 +66,7 @@ impl PostProcessor for QualityProcessor {
|
|
|
65
66
|
};
|
|
66
67
|
|
|
67
68
|
result.metadata.additional.insert(
|
|
68
|
-
"quality_score"
|
|
69
|
+
Cow::Borrowed("quality_score"),
|
|
69
70
|
serde_json::Value::Number(
|
|
70
71
|
serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)),
|
|
71
72
|
),
|
|
@@ -116,7 +117,7 @@ mod tests {
|
|
|
116
117
|
|
|
117
118
|
let mut result = ExtractionResult {
|
|
118
119
|
content: "This is a well-written paragraph with proper structure. It contains multiple sentences. The quality should be good.".to_string(),
|
|
119
|
-
mime_type: "text/plain"
|
|
120
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
120
121
|
metadata: Metadata::default(),
|
|
121
122
|
tables: vec![],
|
|
122
123
|
detected_languages: None,
|
|
@@ -144,7 +145,7 @@ mod tests {
|
|
|
144
145
|
|
|
145
146
|
let mut result = ExtractionResult {
|
|
146
147
|
content: "Some text".to_string(),
|
|
147
|
-
mime_type: "text/plain"
|
|
148
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
148
149
|
metadata: Metadata::default(),
|
|
149
150
|
tables: vec![],
|
|
150
151
|
detected_languages: None,
|
|
@@ -179,7 +180,7 @@ mod tests {
|
|
|
179
180
|
|
|
180
181
|
let result = ExtractionResult {
|
|
181
182
|
content: "Sample text".to_string(),
|
|
182
|
-
mime_type: "text/plain"
|
|
183
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
183
184
|
metadata: Metadata::default(),
|
|
184
185
|
tables: vec![],
|
|
185
186
|
detected_languages: None,
|
|
@@ -209,7 +210,7 @@ mod tests {
|
|
|
209
210
|
|
|
210
211
|
let short_result = ExtractionResult {
|
|
211
212
|
content: "Short".to_string(),
|
|
212
|
-
mime_type: "text/plain"
|
|
213
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
213
214
|
metadata: Metadata::default(),
|
|
214
215
|
tables: vec![],
|
|
215
216
|
detected_languages: None,
|
|
@@ -222,7 +223,7 @@ mod tests {
|
|
|
222
223
|
|
|
223
224
|
let long_result = ExtractionResult {
|
|
224
225
|
content: "a".repeat(1000000),
|
|
225
|
-
mime_type: "text/plain"
|
|
226
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
226
227
|
metadata: Metadata::default(),
|
|
227
228
|
tables: vec![],
|
|
228
229
|
detected_languages: None,
|