kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
|
@@ -147,10 +147,9 @@ async fn test_chunking_max_chars_limits_chunk_size() {
|
|
|
147
147
|
|
|
148
148
|
let config = ExtractionConfig {
|
|
149
149
|
chunking: Some(ChunkingConfig {
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
preset: None,
|
|
150
|
+
max_characters: 100,
|
|
151
|
+
overlap: 20,
|
|
152
|
+
..Default::default()
|
|
154
153
|
}),
|
|
155
154
|
..Default::default()
|
|
156
155
|
};
|
|
@@ -184,10 +183,9 @@ async fn test_chunking_overlap_creates_overlap() {
|
|
|
184
183
|
|
|
185
184
|
let config = ExtractionConfig {
|
|
186
185
|
chunking: Some(ChunkingConfig {
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
preset: None,
|
|
186
|
+
max_characters: 50,
|
|
187
|
+
overlap: 15,
|
|
188
|
+
..Default::default()
|
|
191
189
|
}),
|
|
192
190
|
..Default::default()
|
|
193
191
|
};
|
|
@@ -351,10 +349,9 @@ async fn test_chunking_overlap_maximum() {
|
|
|
351
349
|
|
|
352
350
|
let config = ExtractionConfig {
|
|
353
351
|
chunking: Some(ChunkingConfig {
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
preset: None,
|
|
352
|
+
max_characters: 60,
|
|
353
|
+
overlap: 10,
|
|
354
|
+
..Default::default()
|
|
358
355
|
}),
|
|
359
356
|
..Default::default()
|
|
360
357
|
};
|
|
@@ -385,10 +382,9 @@ async fn test_large_document_with_combined_config() {
|
|
|
385
382
|
let config = ExtractionConfig {
|
|
386
383
|
output_format: OutputFormat::Plain,
|
|
387
384
|
chunking: Some(ChunkingConfig {
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
preset: None,
|
|
385
|
+
max_characters: 200,
|
|
386
|
+
overlap: 30,
|
|
387
|
+
..Default::default()
|
|
392
388
|
}),
|
|
393
389
|
use_cache: true,
|
|
394
390
|
enable_quality_processing: true,
|
|
@@ -19,10 +19,12 @@ mod helpers;
|
|
|
19
19
|
async fn test_chunking_enabled() {
|
|
20
20
|
let config = ExtractionConfig {
|
|
21
21
|
chunking: Some(ChunkingConfig {
|
|
22
|
-
|
|
23
|
-
|
|
22
|
+
max_characters: 50,
|
|
23
|
+
overlap: 10,
|
|
24
24
|
embedding: None,
|
|
25
25
|
preset: None,
|
|
26
|
+
trim: true,
|
|
27
|
+
chunker_type: kreuzberg::chunking::ChunkerType::Text,
|
|
26
28
|
}),
|
|
27
29
|
..Default::default()
|
|
28
30
|
};
|
|
@@ -62,10 +64,12 @@ async fn test_chunking_enabled() {
|
|
|
62
64
|
async fn test_chunking_with_overlap() {
|
|
63
65
|
let config = ExtractionConfig {
|
|
64
66
|
chunking: Some(ChunkingConfig {
|
|
65
|
-
|
|
66
|
-
|
|
67
|
+
max_characters: 100,
|
|
68
|
+
overlap: 20,
|
|
67
69
|
embedding: None,
|
|
68
70
|
preset: None,
|
|
71
|
+
trim: true,
|
|
72
|
+
chunker_type: kreuzberg::chunking::ChunkerType::Text,
|
|
69
73
|
}),
|
|
70
74
|
..Default::default()
|
|
71
75
|
};
|
|
@@ -102,10 +106,12 @@ async fn test_chunking_with_overlap() {
|
|
|
102
106
|
async fn test_chunking_custom_sizes() {
|
|
103
107
|
let config = ExtractionConfig {
|
|
104
108
|
chunking: Some(ChunkingConfig {
|
|
105
|
-
|
|
106
|
-
|
|
109
|
+
max_characters: 200,
|
|
110
|
+
overlap: 50,
|
|
107
111
|
embedding: None,
|
|
108
112
|
preset: None,
|
|
113
|
+
trim: true,
|
|
114
|
+
chunker_type: kreuzberg::chunking::ChunkerType::Text,
|
|
109
115
|
}),
|
|
110
116
|
..Default::default()
|
|
111
117
|
};
|
|
@@ -512,10 +518,12 @@ async fn test_chunking_with_embeddings() {
|
|
|
512
518
|
|
|
513
519
|
let config = ExtractionConfig {
|
|
514
520
|
chunking: Some(ChunkingConfig {
|
|
515
|
-
|
|
516
|
-
|
|
521
|
+
max_characters: 100,
|
|
522
|
+
overlap: 20,
|
|
517
523
|
embedding: Some(EmbeddingConfig::default()),
|
|
518
524
|
preset: None,
|
|
525
|
+
trim: true,
|
|
526
|
+
chunker_type: kreuzberg::chunking::ChunkerType::Text,
|
|
519
527
|
}),
|
|
520
528
|
..Default::default()
|
|
521
529
|
};
|
|
@@ -582,15 +590,15 @@ async fn test_chunking_with_fast_embeddings() {
|
|
|
582
590
|
|
|
583
591
|
let config = ExtractionConfig {
|
|
584
592
|
chunking: Some(ChunkingConfig {
|
|
585
|
-
|
|
586
|
-
|
|
593
|
+
max_characters: 100,
|
|
594
|
+
overlap: 20,
|
|
587
595
|
embedding: Some(EmbeddingConfig {
|
|
588
596
|
model: EmbeddingModelType::Preset {
|
|
589
597
|
name: "fast".to_string(),
|
|
590
598
|
},
|
|
591
599
|
..Default::default()
|
|
592
600
|
}),
|
|
593
|
-
|
|
601
|
+
..Default::default()
|
|
594
602
|
}),
|
|
595
603
|
..Default::default()
|
|
596
604
|
};
|
|
@@ -36,8 +36,8 @@ max_overlap = 100
|
|
|
36
36
|
assert!(config.chunking.is_some(), "Should have chunking config");
|
|
37
37
|
|
|
38
38
|
let chunking = config.chunking.expect("Operation failed");
|
|
39
|
-
assert_eq!(chunking.
|
|
40
|
-
assert_eq!(chunking.
|
|
39
|
+
assert_eq!(chunking.max_characters, 1000);
|
|
40
|
+
assert_eq!(chunking.overlap, 100);
|
|
41
41
|
}
|
|
42
42
|
|
|
43
43
|
/// Test loading config from YAML file.
|
|
@@ -51,8 +51,8 @@ ocr:
|
|
|
51
51
|
enabled: true
|
|
52
52
|
backend: tesseract
|
|
53
53
|
chunking:
|
|
54
|
-
|
|
55
|
-
|
|
54
|
+
max_characters: 1000
|
|
55
|
+
overlap: 100
|
|
56
56
|
"#;
|
|
57
57
|
|
|
58
58
|
fs::write(&config_path, yaml_content).expect("Operation failed");
|
|
@@ -65,8 +65,8 @@ chunking:
|
|
|
65
65
|
assert!(config.chunking.is_some(), "Should have chunking config");
|
|
66
66
|
|
|
67
67
|
let chunking = config.chunking.expect("Operation failed");
|
|
68
|
-
assert_eq!(chunking.
|
|
69
|
-
assert_eq!(chunking.
|
|
68
|
+
assert_eq!(chunking.max_characters, 1000);
|
|
69
|
+
assert_eq!(chunking.overlap, 100);
|
|
70
70
|
}
|
|
71
71
|
|
|
72
72
|
/// Test loading config from JSON file.
|
|
@@ -98,8 +98,8 @@ fn test_from_file_json_succeeds() {
|
|
|
98
98
|
assert!(config.chunking.is_some(), "Should have chunking config");
|
|
99
99
|
|
|
100
100
|
let chunking = config.chunking.expect("Operation failed");
|
|
101
|
-
assert_eq!(chunking.
|
|
102
|
-
assert_eq!(chunking.
|
|
101
|
+
assert_eq!(chunking.max_characters, 1000);
|
|
102
|
+
assert_eq!(chunking.overlap, 100);
|
|
103
103
|
}
|
|
104
104
|
|
|
105
105
|
/// Test loading config from .yml extension.
|
|
@@ -420,6 +420,6 @@ max_overlap = -100
|
|
|
420
420
|
if let Ok(config) = result
|
|
421
421
|
&& let Some(chunking) = config.chunking
|
|
422
422
|
{
|
|
423
|
-
assert!(chunking.
|
|
423
|
+
assert!(chunking.max_characters > 0, "max_characters should be positive");
|
|
424
424
|
}
|
|
425
425
|
}
|
|
@@ -96,8 +96,8 @@ fn test_mcp_chunking_config_nested_matches_rust_core() {
|
|
|
96
96
|
assert!(config.chunking.is_some(), "Chunking config should be present");
|
|
97
97
|
|
|
98
98
|
if let Some(chunking) = &config.chunking {
|
|
99
|
-
assert_eq!(chunking.
|
|
100
|
-
assert_eq!(chunking.
|
|
99
|
+
assert_eq!(chunking.max_characters, 500, "max_chars should be 500");
|
|
100
|
+
assert_eq!(chunking.overlap, 50, "max_overlap should be 50");
|
|
101
101
|
}
|
|
102
102
|
|
|
103
103
|
// Verify roundtrip
|
|
@@ -370,8 +370,8 @@ max_overlap = 300
|
|
|
370
370
|
assert_eq!(ocr_config.language, "deu");
|
|
371
371
|
|
|
372
372
|
let chunking_config = config.chunking.expect("Operation failed");
|
|
373
|
-
assert_eq!(chunking_config.
|
|
374
|
-
assert_eq!(chunking_config.
|
|
373
|
+
assert_eq!(chunking_config.max_characters, 2000);
|
|
374
|
+
assert_eq!(chunking_config.overlap, 300);
|
|
375
375
|
}
|
|
376
376
|
|
|
377
377
|
/// Test config discovery in parent directories.
|
|
@@ -481,10 +481,9 @@ async fn test_extraction_with_chunking_config() {
|
|
|
481
481
|
|
|
482
482
|
let config = ExtractionConfig {
|
|
483
483
|
chunking: Some(kreuzberg::ChunkingConfig {
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
preset: None,
|
|
484
|
+
max_characters: 100,
|
|
485
|
+
overlap: 20,
|
|
486
|
+
..Default::default()
|
|
488
487
|
}),
|
|
489
488
|
..Default::default()
|
|
490
489
|
};
|
|
@@ -576,7 +576,7 @@ async fn test_opml_extraction_statistics() {
|
|
|
576
576
|
println!(" Metadata fields: {}", result.metadata.additional.len());
|
|
577
577
|
|
|
578
578
|
if !result.metadata.additional.is_empty() {
|
|
579
|
-
let keys: Vec<String> = result.metadata.additional.keys().
|
|
579
|
+
let keys: Vec<String> = result.metadata.additional.keys().map(|k| k.to_string()).collect();
|
|
580
580
|
println!(" Keys: {}", keys.join(", "));
|
|
581
581
|
}
|
|
582
582
|
|
|
@@ -778,7 +778,7 @@ async fn test_orgmode_extraction_statistics() {
|
|
|
778
778
|
println!(" Metadata fields: {}", result.metadata.additional.len());
|
|
779
779
|
|
|
780
780
|
if !result.metadata.additional.is_empty() {
|
|
781
|
-
let keys: Vec<String> = result.metadata.additional.keys().
|
|
781
|
+
let keys: Vec<String> = result.metadata.additional.keys().map(|k| k.to_string()).collect();
|
|
782
782
|
println!(" Keys: {}", keys.join(", "));
|
|
783
783
|
}
|
|
784
784
|
|
|
@@ -13,6 +13,7 @@ use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
|
13
13
|
use kreuzberg::types::{ExtractionResult, Metadata};
|
|
14
14
|
use kreuzberg::{KreuzbergError, Result};
|
|
15
15
|
use serial_test::serial;
|
|
16
|
+
use std::borrow::Cow;
|
|
16
17
|
use std::sync::Arc;
|
|
17
18
|
|
|
18
19
|
struct OrderTrackingProcessor {
|
|
@@ -74,7 +75,7 @@ impl PostProcessor for MetadataAddingProcessor {
|
|
|
74
75
|
result
|
|
75
76
|
.metadata
|
|
76
77
|
.additional
|
|
77
|
-
.insert(self.key.clone(), serde_json::json!(self.value));
|
|
78
|
+
.insert(Cow::Owned(self.key.clone()), serde_json::json!(self.value));
|
|
78
79
|
Ok(())
|
|
79
80
|
}
|
|
80
81
|
|
|
@@ -134,7 +135,7 @@ async fn test_pipeline_empty_no_processors() {
|
|
|
134
135
|
|
|
135
136
|
let result = ExtractionResult {
|
|
136
137
|
content: "original content".to_string(),
|
|
137
|
-
mime_type: "text/plain"
|
|
138
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
138
139
|
metadata: Metadata::default(),
|
|
139
140
|
tables: vec![],
|
|
140
141
|
detected_languages: None,
|
|
@@ -181,7 +182,7 @@ async fn test_pipeline_single_processor_per_stage() {
|
|
|
181
182
|
|
|
182
183
|
let result = ExtractionResult {
|
|
183
184
|
content: "start".to_string(),
|
|
184
|
-
mime_type: "text/plain"
|
|
185
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
185
186
|
metadata: Metadata::default(),
|
|
186
187
|
tables: vec![],
|
|
187
188
|
detected_languages: None,
|
|
@@ -228,7 +229,7 @@ async fn test_pipeline_multiple_processors_per_stage() {
|
|
|
228
229
|
|
|
229
230
|
let result = ExtractionResult {
|
|
230
231
|
content: "start".to_string(),
|
|
231
|
-
mime_type: "text/plain"
|
|
232
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
232
233
|
metadata: Metadata::default(),
|
|
233
234
|
tables: vec![],
|
|
234
235
|
detected_languages: None,
|
|
@@ -266,7 +267,7 @@ async fn test_pipeline_all_stages_enabled() {
|
|
|
266
267
|
|
|
267
268
|
let result = ExtractionResult {
|
|
268
269
|
content: "start".to_string(),
|
|
269
|
-
mime_type: "text/plain"
|
|
270
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
270
271
|
metadata: Metadata::default(),
|
|
271
272
|
tables: vec![],
|
|
272
273
|
detected_languages: None,
|
|
@@ -302,7 +303,7 @@ async fn test_pipeline_postprocessing_disabled() {
|
|
|
302
303
|
|
|
303
304
|
let result = ExtractionResult {
|
|
304
305
|
content: "start".to_string(),
|
|
305
|
-
mime_type: "text/plain"
|
|
306
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
306
307
|
metadata: Metadata::default(),
|
|
307
308
|
tables: vec![],
|
|
308
309
|
detected_languages: None,
|
|
@@ -353,7 +354,7 @@ async fn test_pipeline_early_stage_runs_first() {
|
|
|
353
354
|
|
|
354
355
|
let result = ExtractionResult {
|
|
355
356
|
content: "start".to_string(),
|
|
356
|
-
mime_type: "text/plain"
|
|
357
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
357
358
|
metadata: Metadata::default(),
|
|
358
359
|
tables: vec![],
|
|
359
360
|
detected_languages: None,
|
|
@@ -395,7 +396,7 @@ async fn test_pipeline_middle_stage_runs_second() {
|
|
|
395
396
|
|
|
396
397
|
let result = ExtractionResult {
|
|
397
398
|
content: "start".to_string(),
|
|
398
|
-
mime_type: "text/plain"
|
|
399
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
399
400
|
metadata: Metadata::default(),
|
|
400
401
|
tables: vec![],
|
|
401
402
|
detected_languages: None,
|
|
@@ -433,7 +434,7 @@ async fn test_pipeline_late_stage_runs_last() {
|
|
|
433
434
|
|
|
434
435
|
let result = ExtractionResult {
|
|
435
436
|
content: "start".to_string(),
|
|
436
|
-
mime_type: "text/plain"
|
|
437
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
437
438
|
metadata: Metadata::default(),
|
|
438
439
|
tables: vec![],
|
|
439
440
|
detected_languages: None,
|
|
@@ -471,7 +472,7 @@ async fn test_pipeline_within_stage_priority_order() {
|
|
|
471
472
|
|
|
472
473
|
let result = ExtractionResult {
|
|
473
474
|
content: "start".to_string(),
|
|
474
|
-
mime_type: "text/plain"
|
|
475
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
475
476
|
metadata: Metadata::default(),
|
|
476
477
|
tables: vec![],
|
|
477
478
|
detected_languages: None,
|
|
@@ -522,7 +523,7 @@ async fn test_pipeline_cross_stage_data_flow() {
|
|
|
522
523
|
#[async_trait]
|
|
523
524
|
impl PostProcessor for MiddleProcessor {
|
|
524
525
|
async fn process(&self, result: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
|
|
525
|
-
if let Some(stage) = result.metadata.additional.get("stage") {
|
|
526
|
+
if let Some(stage) = result.metadata.additional.get(&Cow::Borrowed("stage")) {
|
|
526
527
|
result.content.push_str(&format!(
|
|
527
528
|
"[saw:{}]",
|
|
528
529
|
stage.as_str().expect("Failed to extract string from value")
|
|
@@ -541,7 +542,7 @@ async fn test_pipeline_cross_stage_data_flow() {
|
|
|
541
542
|
|
|
542
543
|
let result = ExtractionResult {
|
|
543
544
|
content: "start".to_string(),
|
|
544
|
-
mime_type: "text/plain"
|
|
545
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
545
546
|
metadata: Metadata::default(),
|
|
546
547
|
tables: vec![],
|
|
547
548
|
detected_languages: None,
|
|
@@ -601,7 +602,7 @@ async fn test_pipeline_early_stage_error_recorded() {
|
|
|
601
602
|
|
|
602
603
|
let result = ExtractionResult {
|
|
603
604
|
content: "content".to_string(),
|
|
604
|
-
mime_type: "text/plain"
|
|
605
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
605
606
|
metadata: Metadata::default(),
|
|
606
607
|
tables: vec![],
|
|
607
608
|
detected_languages: None,
|
|
@@ -645,7 +646,7 @@ async fn test_pipeline_middle_stage_error_propagation() {
|
|
|
645
646
|
|
|
646
647
|
let result = ExtractionResult {
|
|
647
648
|
content: "content".to_string(),
|
|
648
|
-
mime_type: "text/plain"
|
|
649
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
649
650
|
metadata: Metadata::default(),
|
|
650
651
|
tables: vec![],
|
|
651
652
|
detected_languages: None,
|
|
@@ -719,7 +720,7 @@ async fn test_pipeline_late_stage_error_doesnt_affect_earlier_stages() {
|
|
|
719
720
|
|
|
720
721
|
let result = ExtractionResult {
|
|
721
722
|
content: "start".to_string(),
|
|
722
|
-
mime_type: "text/plain"
|
|
723
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
723
724
|
metadata: Metadata::default(),
|
|
724
725
|
tables: vec![],
|
|
725
726
|
detected_languages: None,
|
|
@@ -809,7 +810,7 @@ async fn test_pipeline_processor_error_doesnt_stop_other_processors() {
|
|
|
809
810
|
|
|
810
811
|
let result = ExtractionResult {
|
|
811
812
|
content: "start".to_string(),
|
|
812
|
-
mime_type: "text/plain"
|
|
813
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
813
814
|
metadata: Metadata::default(),
|
|
814
815
|
tables: vec![],
|
|
815
816
|
detected_languages: None,
|
|
@@ -889,7 +890,7 @@ async fn test_pipeline_multiple_processor_errors() {
|
|
|
889
890
|
|
|
890
891
|
let result = ExtractionResult {
|
|
891
892
|
content: "start".to_string(),
|
|
892
|
-
mime_type: "text/plain"
|
|
893
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
893
894
|
metadata: Metadata::default(),
|
|
894
895
|
tables: vec![],
|
|
895
896
|
detected_languages: None,
|
|
@@ -933,7 +934,7 @@ async fn test_pipeline_error_context_preservation() {
|
|
|
933
934
|
|
|
934
935
|
let result = ExtractionResult {
|
|
935
936
|
content: "content".to_string(),
|
|
936
|
-
mime_type: "text/plain"
|
|
937
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
937
938
|
metadata: Metadata::default(),
|
|
938
939
|
tables: vec![],
|
|
939
940
|
detected_languages: None,
|
|
@@ -991,8 +992,11 @@ async fn test_pipeline_metadata_added_in_early_visible_in_middle() {
|
|
|
991
992
|
#[async_trait]
|
|
992
993
|
impl PostProcessor for MiddleReadingProcessor {
|
|
993
994
|
async fn process(&self, result: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
|
|
994
|
-
if let Some(val) = result.metadata.additional.get("early_key") {
|
|
995
|
-
result
|
|
995
|
+
if let Some(val) = result.metadata.additional.get(&Cow::Borrowed("early_key")) {
|
|
996
|
+
result
|
|
997
|
+
.metadata
|
|
998
|
+
.additional
|
|
999
|
+
.insert(Cow::Borrowed("middle_saw"), val.clone());
|
|
996
1000
|
}
|
|
997
1001
|
Ok(())
|
|
998
1002
|
}
|
|
@@ -1008,7 +1012,7 @@ async fn test_pipeline_metadata_added_in_early_visible_in_middle() {
|
|
|
1008
1012
|
|
|
1009
1013
|
let result = ExtractionResult {
|
|
1010
1014
|
content: "content".to_string(),
|
|
1011
|
-
mime_type: "text/plain"
|
|
1015
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
1012
1016
|
metadata: Metadata::default(),
|
|
1013
1017
|
tables: vec![],
|
|
1014
1018
|
detected_languages: None,
|
|
@@ -1082,7 +1086,7 @@ async fn test_pipeline_content_modified_in_middle_visible_in_late() {
|
|
|
1082
1086
|
|
|
1083
1087
|
let result = ExtractionResult {
|
|
1084
1088
|
content: "start".to_string(),
|
|
1085
|
-
mime_type: "text/plain"
|
|
1089
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
1086
1090
|
metadata: Metadata::default(),
|
|
1087
1091
|
tables: vec![],
|
|
1088
1092
|
detected_languages: None,
|
|
@@ -1135,7 +1139,7 @@ async fn test_pipeline_multiple_processors_modifying_same_metadata() {
|
|
|
1135
1139
|
result
|
|
1136
1140
|
.metadata
|
|
1137
1141
|
.additional
|
|
1138
|
-
.insert("shared_key"
|
|
1142
|
+
.insert(Cow::Borrowed("shared_key"), serde_json::json!(self.value));
|
|
1139
1143
|
Ok(())
|
|
1140
1144
|
}
|
|
1141
1145
|
fn processing_stage(&self) -> ProcessingStage {
|
|
@@ -1153,7 +1157,7 @@ async fn test_pipeline_multiple_processors_modifying_same_metadata() {
|
|
|
1153
1157
|
|
|
1154
1158
|
let result = ExtractionResult {
|
|
1155
1159
|
content: "content".to_string(),
|
|
1156
|
-
mime_type: "text/plain"
|
|
1160
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
1157
1161
|
metadata: Metadata::default(),
|
|
1158
1162
|
tables: vec![],
|
|
1159
1163
|
detected_languages: None,
|
|
@@ -1213,13 +1217,13 @@ async fn test_pipeline_processors_reading_previous_output() {
|
|
|
1213
1217
|
let current_count = result
|
|
1214
1218
|
.metadata
|
|
1215
1219
|
.additional
|
|
1216
|
-
.get("count")
|
|
1220
|
+
.get(&Cow::Borrowed("count"))
|
|
1217
1221
|
.and_then(|v| v.as_i64())
|
|
1218
1222
|
.unwrap_or(0);
|
|
1219
1223
|
result
|
|
1220
1224
|
.metadata
|
|
1221
1225
|
.additional
|
|
1222
|
-
.insert("count"
|
|
1226
|
+
.insert(Cow::Borrowed("count"), serde_json::json!(current_count + 1));
|
|
1223
1227
|
Ok(())
|
|
1224
1228
|
}
|
|
1225
1229
|
fn processing_stage(&self) -> ProcessingStage {
|
|
@@ -1243,7 +1247,7 @@ async fn test_pipeline_processors_reading_previous_output() {
|
|
|
1243
1247
|
|
|
1244
1248
|
let result = ExtractionResult {
|
|
1245
1249
|
content: "content".to_string(),
|
|
1246
|
-
mime_type: "text/plain"
|
|
1250
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
1247
1251
|
metadata: Metadata::default(),
|
|
1248
1252
|
tables: vec![],
|
|
1249
1253
|
detected_languages: None,
|
|
@@ -1310,7 +1314,7 @@ async fn test_pipeline_large_content_modification() {
|
|
|
1310
1314
|
|
|
1311
1315
|
let result = ExtractionResult {
|
|
1312
1316
|
content: "start".to_string(),
|
|
1313
|
-
mime_type: "text/plain"
|
|
1317
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
1314
1318
|
metadata: Metadata::default(),
|
|
1315
1319
|
tables: vec![],
|
|
1316
1320
|
detected_languages: None,
|
|
@@ -1348,7 +1352,7 @@ async fn test_pipeline_enabled_processors_whitelist() {
|
|
|
1348
1352
|
|
|
1349
1353
|
let result = ExtractionResult {
|
|
1350
1354
|
content: "start".to_string(),
|
|
1351
|
-
mime_type: "text/plain"
|
|
1355
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
1352
1356
|
metadata: Metadata::default(),
|
|
1353
1357
|
tables: vec![],
|
|
1354
1358
|
detected_languages: None,
|
|
@@ -1397,7 +1401,7 @@ async fn test_pipeline_disabled_processors_blacklist() {
|
|
|
1397
1401
|
|
|
1398
1402
|
let result = ExtractionResult {
|
|
1399
1403
|
content: "start".to_string(),
|
|
1400
|
-
mime_type: "text/plain"
|
|
1404
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
1401
1405
|
metadata: Metadata::default(),
|
|
1402
1406
|
tables: vec![],
|
|
1403
1407
|
detected_languages: None,
|
|
@@ -1446,7 +1450,7 @@ async fn test_pipeline_no_filtering_runs_all() {
|
|
|
1446
1450
|
|
|
1447
1451
|
let result = ExtractionResult {
|
|
1448
1452
|
content: "start".to_string(),
|
|
1449
|
-
mime_type: "text/plain"
|
|
1453
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
1450
1454
|
metadata: Metadata::default(),
|
|
1451
1455
|
tables: vec![],
|
|
1452
1456
|
detected_languages: None,
|
|
@@ -1486,7 +1490,7 @@ async fn test_pipeline_empty_whitelist_runs_none() {
|
|
|
1486
1490
|
|
|
1487
1491
|
let result = ExtractionResult {
|
|
1488
1492
|
content: "start".to_string(),
|
|
1489
|
-
mime_type: "text/plain"
|
|
1493
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
1490
1494
|
metadata: Metadata::default(),
|
|
1491
1495
|
tables: vec![],
|
|
1492
1496
|
detected_languages: None,
|
|
@@ -54,9 +54,10 @@ impl OcrBackend for MockOcrBackend {
|
|
|
54
54
|
return Err(KreuzbergError::validation("Empty image data".to_string()));
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
+
use std::borrow::Cow;
|
|
57
58
|
Ok(ExtractionResult {
|
|
58
59
|
content: format!("{} (lang: {})", self.return_text, config.language),
|
|
59
|
-
mime_type: "text/plain"
|
|
60
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
60
61
|
metadata: Metadata::default(),
|
|
61
62
|
tables: vec![],
|
|
62
63
|
detected_languages: None,
|
|
@@ -152,9 +153,10 @@ impl OcrBackend for ValidatingOcrBackend {
|
|
|
152
153
|
)));
|
|
153
154
|
}
|
|
154
155
|
|
|
156
|
+
use std::borrow::Cow;
|
|
155
157
|
Ok(ExtractionResult {
|
|
156
158
|
content: format!("Processed {} bytes", image_bytes.len()),
|
|
157
|
-
mime_type: "text/plain"
|
|
159
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
158
160
|
metadata: Metadata::default(),
|
|
159
161
|
tables: vec![],
|
|
160
162
|
detected_languages: None,
|
|
@@ -201,19 +203,23 @@ impl Plugin for MetadataOcrBackend {
|
|
|
201
203
|
impl OcrBackend for MetadataOcrBackend {
|
|
202
204
|
async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
|
|
203
205
|
let mut metadata = Metadata::default();
|
|
204
|
-
metadata
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
206
|
+
metadata.additional.insert(
|
|
207
|
+
std::borrow::Cow::Borrowed("ocr_backend"),
|
|
208
|
+
serde_json::json!(self.name()),
|
|
209
|
+
);
|
|
210
|
+
metadata.additional.insert(
|
|
211
|
+
std::borrow::Cow::Borrowed("image_size"),
|
|
212
|
+
serde_json::json!(image_bytes.len()),
|
|
213
|
+
);
|
|
214
|
+
metadata.additional.insert(
|
|
215
|
+
std::borrow::Cow::Borrowed("ocr_language"),
|
|
216
|
+
serde_json::json!(config.language),
|
|
217
|
+
);
|
|
218
|
+
|
|
219
|
+
use std::borrow::Cow;
|
|
214
220
|
Ok(ExtractionResult {
|
|
215
221
|
content: "OCR processed text".to_string(),
|
|
216
|
-
mime_type: "text/plain"
|
|
222
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
217
223
|
metadata,
|
|
218
224
|
tables: vec![],
|
|
219
225
|
detected_languages: None,
|
|
@@ -11,6 +11,7 @@ use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
|
11
11
|
use kreuzberg::types::ExtractionResult;
|
|
12
12
|
use kreuzberg::{KreuzbergError, Result, extract_file_sync};
|
|
13
13
|
use serial_test::serial;
|
|
14
|
+
use std::borrow::Cow;
|
|
14
15
|
use std::sync::Arc;
|
|
15
16
|
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
|
16
17
|
|
|
@@ -82,9 +83,9 @@ impl PostProcessor for MetadataAddingProcessor {
|
|
|
82
83
|
result
|
|
83
84
|
.metadata
|
|
84
85
|
.additional
|
|
85
|
-
.insert("processed_by"
|
|
86
|
+
.insert(Cow::Borrowed("processed_by"), serde_json::json!(self.name()));
|
|
86
87
|
result.metadata.additional.insert(
|
|
87
|
-
"word_count"
|
|
88
|
+
Cow::Borrowed("word_count"),
|
|
88
89
|
serde_json::json!(result.content.split_whitespace().count()),
|
|
89
90
|
);
|
|
90
91
|
Ok(())
|
|
@@ -11,6 +11,7 @@ use kreuzberg::plugins::registry::{
|
|
|
11
11
|
use kreuzberg::plugins::{DocumentExtractor, Plugin, PostProcessor, ProcessingStage, Validator};
|
|
12
12
|
use kreuzberg::types::{ExtractionResult, Metadata};
|
|
13
13
|
use kreuzberg::{KreuzbergError, Result};
|
|
14
|
+
use std::borrow::Cow;
|
|
14
15
|
use std::sync::Arc;
|
|
15
16
|
|
|
16
17
|
struct FailingExtractor {
|
|
@@ -52,7 +53,7 @@ impl DocumentExtractor for FailingExtractor {
|
|
|
52
53
|
} else {
|
|
53
54
|
Ok(ExtractionResult {
|
|
54
55
|
content: "success".to_string(),
|
|
55
|
-
mime_type: "text/plain"
|
|
56
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
56
57
|
metadata: Metadata::default(),
|
|
57
58
|
tables: vec![],
|
|
58
59
|
detected_languages: None,
|
|
@@ -299,7 +300,7 @@ fn test_extractor_priority_ordering_complex() {
|
|
|
299
300
|
async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
300
301
|
Ok(ExtractionResult {
|
|
301
302
|
content: "test".to_string(),
|
|
302
|
-
mime_type: "text/plain"
|
|
303
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
303
304
|
metadata: Metadata::default(),
|
|
304
305
|
tables: vec![],
|
|
305
306
|
detected_languages: None,
|
|
@@ -461,7 +462,7 @@ async fn test_processor_execution_order_within_stage() {
|
|
|
461
462
|
|
|
462
463
|
let mut result = ExtractionResult {
|
|
463
464
|
content: "start".to_string(),
|
|
464
|
-
mime_type: "text/plain"
|
|
465
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
465
466
|
metadata: Metadata::default(),
|
|
466
467
|
tables: vec![],
|
|
467
468
|
detected_languages: None,
|
|
@@ -498,7 +499,7 @@ async fn test_processor_error_propagation() {
|
|
|
498
499
|
|
|
499
500
|
let mut result = ExtractionResult {
|
|
500
501
|
content: "test".to_string(),
|
|
501
|
-
mime_type: "text/plain"
|
|
502
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
502
503
|
metadata: Metadata::default(),
|
|
503
504
|
tables: vec![],
|
|
504
505
|
detected_languages: None,
|
|
@@ -672,7 +673,7 @@ async fn test_validator_content_validation() {
|
|
|
672
673
|
|
|
673
674
|
let short_result = ExtractionResult {
|
|
674
675
|
content: "short".to_string(),
|
|
675
|
-
mime_type: "text/plain"
|
|
676
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
676
677
|
metadata: Metadata::default(),
|
|
677
678
|
tables: vec![],
|
|
678
679
|
detected_languages: None,
|
|
@@ -688,7 +689,7 @@ async fn test_validator_content_validation() {
|
|
|
688
689
|
|
|
689
690
|
let long_result = ExtractionResult {
|
|
690
691
|
content: "this is long enough content".to_string(),
|
|
691
|
-
mime_type: "text/plain"
|
|
692
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
692
693
|
metadata: Metadata::default(),
|
|
693
694
|
tables: vec![],
|
|
694
695
|
detected_languages: None,
|
|
@@ -157,7 +157,7 @@ impl Plugin for MetadataValidator {
|
|
|
157
157
|
#[async_trait]
|
|
158
158
|
impl Validator for MetadataValidator {
|
|
159
159
|
async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
160
|
-
if !result.metadata.additional.contains_key(
|
|
160
|
+
if !result.metadata.additional.contains_key(self.required_key.as_str()) {
|
|
161
161
|
Err(KreuzbergError::validation(format!(
|
|
162
162
|
"Required metadata key '{}' missing",
|
|
163
163
|
self.required_key
|