kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
|
@@ -4,6 +4,7 @@ use super::*;
|
|
|
4
4
|
use crate::core::config::OutputFormat;
|
|
5
5
|
use crate::types::Metadata;
|
|
6
6
|
use lazy_static::lazy_static;
|
|
7
|
+
use std::borrow::Cow;
|
|
7
8
|
|
|
8
9
|
const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
|
|
9
10
|
#[cfg(feature = "quality")]
|
|
@@ -19,7 +20,7 @@ lazy_static! {
|
|
|
19
20
|
async fn test_run_pipeline_basic() {
|
|
20
21
|
let mut result = ExtractionResult {
|
|
21
22
|
content: "test".to_string(),
|
|
22
|
-
mime_type: "text/plain"
|
|
23
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
23
24
|
metadata: Metadata::default(),
|
|
24
25
|
tables: vec![],
|
|
25
26
|
detected_languages: None,
|
|
@@ -30,7 +31,7 @@ async fn test_run_pipeline_basic() {
|
|
|
30
31
|
elements: None,
|
|
31
32
|
};
|
|
32
33
|
result.metadata.additional.insert(
|
|
33
|
-
VALIDATION_MARKER_KEY
|
|
34
|
+
Cow::Borrowed(VALIDATION_MARKER_KEY),
|
|
34
35
|
serde_json::json!(ORDER_VALIDATION_MARKER),
|
|
35
36
|
);
|
|
36
37
|
let config = ExtractionConfig::default();
|
|
@@ -44,7 +45,7 @@ async fn test_run_pipeline_basic() {
|
|
|
44
45
|
async fn test_pipeline_with_quality_processing() {
|
|
45
46
|
let result = ExtractionResult {
|
|
46
47
|
content: "This is a test document with some meaningful content.".to_string(),
|
|
47
|
-
mime_type: "text/plain"
|
|
48
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
48
49
|
metadata: Metadata::default(),
|
|
49
50
|
tables: vec![],
|
|
50
51
|
detected_languages: None,
|
|
@@ -67,7 +68,7 @@ async fn test_pipeline_with_quality_processing() {
|
|
|
67
68
|
async fn test_pipeline_without_quality_processing() {
|
|
68
69
|
let result = ExtractionResult {
|
|
69
70
|
content: "test".to_string(),
|
|
70
|
-
mime_type: "text/plain"
|
|
71
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
71
72
|
metadata: Metadata::default(),
|
|
72
73
|
tables: vec![],
|
|
73
74
|
detected_languages: None,
|
|
@@ -91,7 +92,7 @@ async fn test_pipeline_without_quality_processing() {
|
|
|
91
92
|
async fn test_pipeline_with_chunking() {
|
|
92
93
|
let result = ExtractionResult {
|
|
93
94
|
content: "This is a long text that should be chunked. ".repeat(100),
|
|
94
|
-
mime_type: "text/plain"
|
|
95
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
95
96
|
metadata: Metadata::default(),
|
|
96
97
|
tables: vec![],
|
|
97
98
|
detected_languages: None,
|
|
@@ -103,8 +104,10 @@ async fn test_pipeline_with_chunking() {
|
|
|
103
104
|
};
|
|
104
105
|
let config = ExtractionConfig {
|
|
105
106
|
chunking: Some(crate::ChunkingConfig {
|
|
106
|
-
|
|
107
|
-
|
|
107
|
+
max_characters: 500,
|
|
108
|
+
overlap: 50,
|
|
109
|
+
trim: true,
|
|
110
|
+
chunker_type: crate::ChunkerType::Text,
|
|
108
111
|
embedding: None,
|
|
109
112
|
preset: None,
|
|
110
113
|
}),
|
|
@@ -121,7 +124,7 @@ async fn test_pipeline_with_chunking() {
|
|
|
121
124
|
async fn test_pipeline_without_chunking() {
|
|
122
125
|
let result = ExtractionResult {
|
|
123
126
|
content: "test".to_string(),
|
|
124
|
-
mime_type: "text/plain"
|
|
127
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
125
128
|
metadata: Metadata::default(),
|
|
126
129
|
tables: vec![],
|
|
127
130
|
detected_languages: None,
|
|
@@ -142,14 +145,14 @@ async fn test_pipeline_without_chunking() {
|
|
|
142
145
|
|
|
143
146
|
#[tokio::test]
|
|
144
147
|
async fn test_pipeline_preserves_metadata() {
|
|
145
|
-
use
|
|
146
|
-
let mut additional =
|
|
147
|
-
additional.insert("source"
|
|
148
|
-
additional.insert("page"
|
|
148
|
+
use ahash::AHashMap;
|
|
149
|
+
let mut additional = AHashMap::new();
|
|
150
|
+
additional.insert(Cow::Borrowed("source"), serde_json::json!("test"));
|
|
151
|
+
additional.insert(Cow::Borrowed("page"), serde_json::json!(1));
|
|
149
152
|
|
|
150
153
|
let result = ExtractionResult {
|
|
151
154
|
content: "test".to_string(),
|
|
152
|
-
mime_type: "text/plain"
|
|
155
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
153
156
|
metadata: Metadata {
|
|
154
157
|
additional,
|
|
155
158
|
..Default::default()
|
|
@@ -187,7 +190,7 @@ async fn test_pipeline_preserves_tables() {
|
|
|
187
190
|
|
|
188
191
|
let result = ExtractionResult {
|
|
189
192
|
content: "test".to_string(),
|
|
190
|
-
mime_type: "text/plain"
|
|
193
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
191
194
|
metadata: Metadata::default(),
|
|
192
195
|
tables: vec![table],
|
|
193
196
|
detected_languages: None,
|
|
@@ -219,7 +222,7 @@ async fn test_pipeline_empty_content() {
|
|
|
219
222
|
|
|
220
223
|
let result = ExtractionResult {
|
|
221
224
|
content: String::new(),
|
|
222
|
-
mime_type: "text/plain"
|
|
225
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
223
226
|
metadata: Metadata::default(),
|
|
224
227
|
tables: vec![],
|
|
225
228
|
detected_languages: None,
|
|
@@ -242,7 +245,7 @@ async fn test_pipeline_empty_content() {
|
|
|
242
245
|
async fn test_pipeline_with_all_features() {
|
|
243
246
|
let result = ExtractionResult {
|
|
244
247
|
content: "This is a comprehensive test document. ".repeat(50),
|
|
245
|
-
mime_type: "text/plain"
|
|
248
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
246
249
|
metadata: Metadata::default(),
|
|
247
250
|
tables: vec![],
|
|
248
251
|
detected_languages: None,
|
|
@@ -255,8 +258,10 @@ async fn test_pipeline_with_all_features() {
|
|
|
255
258
|
let config = ExtractionConfig {
|
|
256
259
|
enable_quality_processing: true,
|
|
257
260
|
chunking: Some(crate::ChunkingConfig {
|
|
258
|
-
|
|
259
|
-
|
|
261
|
+
max_characters: 500,
|
|
262
|
+
overlap: 50,
|
|
263
|
+
trim: true,
|
|
264
|
+
chunker_type: crate::ChunkerType::Text,
|
|
260
265
|
embedding: None,
|
|
261
266
|
preset: None,
|
|
262
267
|
}),
|
|
@@ -295,7 +300,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
295
300
|
Natural language processing enables computers to understand human language.
|
|
296
301
|
"#
|
|
297
302
|
.to_string(),
|
|
298
|
-
mime_type: "text/plain"
|
|
303
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
299
304
|
metadata: Metadata::default(),
|
|
300
305
|
tables: vec![],
|
|
301
306
|
detected_languages: None,
|
|
@@ -342,7 +347,7 @@ async fn test_pipeline_without_keyword_config() {
|
|
|
342
347
|
}
|
|
343
348
|
let result = ExtractionResult {
|
|
344
349
|
content: "Machine learning and artificial intelligence.".to_string(),
|
|
345
|
-
mime_type: "text/plain"
|
|
350
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
346
351
|
metadata: Metadata::default(),
|
|
347
352
|
tables: vec![],
|
|
348
353
|
detected_languages: None,
|
|
@@ -380,7 +385,7 @@ async fn test_pipeline_keyword_extraction_short_content() {
|
|
|
380
385
|
|
|
381
386
|
let result = ExtractionResult {
|
|
382
387
|
content: "Short text".to_string(),
|
|
383
|
-
mime_type: "text/plain"
|
|
388
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
384
389
|
metadata: Metadata::default(),
|
|
385
390
|
tables: vec![],
|
|
386
391
|
detected_languages: None,
|
|
@@ -437,7 +442,7 @@ async fn test_postprocessor_runs_before_validator() {
|
|
|
437
442
|
result
|
|
438
443
|
.metadata
|
|
439
444
|
.additional
|
|
440
|
-
.insert("processed"
|
|
445
|
+
.insert(Cow::Borrowed("processed"), serde_json::json!(true));
|
|
441
446
|
Ok(())
|
|
442
447
|
}
|
|
443
448
|
|
|
@@ -517,7 +522,7 @@ async fn test_postprocessor_runs_before_validator() {
|
|
|
517
522
|
|
|
518
523
|
let mut result = ExtractionResult {
|
|
519
524
|
content: "test".to_string(),
|
|
520
|
-
mime_type: "text/plain"
|
|
525
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
521
526
|
metadata: Metadata::default(),
|
|
522
527
|
tables: vec![],
|
|
523
528
|
detected_languages: None,
|
|
@@ -528,7 +533,7 @@ async fn test_postprocessor_runs_before_validator() {
|
|
|
528
533
|
elements: None,
|
|
529
534
|
};
|
|
530
535
|
result.metadata.additional.insert(
|
|
531
|
-
VALIDATION_MARKER_KEY
|
|
536
|
+
Cow::Borrowed(VALIDATION_MARKER_KEY),
|
|
532
537
|
serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
|
|
533
538
|
);
|
|
534
539
|
|
|
@@ -614,7 +619,7 @@ async fn test_quality_processing_runs_before_validator() {
|
|
|
614
619
|
|
|
615
620
|
let mut result = ExtractionResult {
|
|
616
621
|
content: "This is meaningful test content for quality scoring.".to_string(),
|
|
617
|
-
mime_type: "text/plain"
|
|
622
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
618
623
|
metadata: Metadata::default(),
|
|
619
624
|
tables: vec![],
|
|
620
625
|
detected_languages: None,
|
|
@@ -625,7 +630,7 @@ async fn test_quality_processing_runs_before_validator() {
|
|
|
625
630
|
elements: None,
|
|
626
631
|
};
|
|
627
632
|
result.metadata.additional.insert(
|
|
628
|
-
VALIDATION_MARKER_KEY
|
|
633
|
+
Cow::Borrowed(VALIDATION_MARKER_KEY),
|
|
629
634
|
serde_json::json!(QUALITY_VALIDATION_MARKER),
|
|
630
635
|
);
|
|
631
636
|
|
|
@@ -682,7 +687,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
|
|
|
682
687
|
result
|
|
683
688
|
.metadata
|
|
684
689
|
.additional
|
|
685
|
-
.insert("execution_order"
|
|
690
|
+
.insert(Cow::Borrowed("execution_order"), serde_json::json!(order));
|
|
686
691
|
Ok(())
|
|
687
692
|
}
|
|
688
693
|
|
|
@@ -721,7 +726,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
|
|
|
721
726
|
result
|
|
722
727
|
.metadata
|
|
723
728
|
.additional
|
|
724
|
-
.insert("execution_order"
|
|
729
|
+
.insert(Cow::Borrowed("execution_order"), serde_json::json!(order));
|
|
725
730
|
Ok(())
|
|
726
731
|
}
|
|
727
732
|
|
|
@@ -812,7 +817,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
|
|
|
812
817
|
|
|
813
818
|
let result = ExtractionResult {
|
|
814
819
|
content: "test".to_string(),
|
|
815
|
-
mime_type: "text/plain"
|
|
820
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
816
821
|
metadata: Metadata::default(),
|
|
817
822
|
tables: vec![],
|
|
818
823
|
detected_languages: None,
|
|
@@ -839,7 +844,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
|
|
|
839
844
|
async fn test_run_pipeline_with_output_format_plain() {
|
|
840
845
|
let result = ExtractionResult {
|
|
841
846
|
content: "test content".to_string(),
|
|
842
|
-
mime_type: "text/plain"
|
|
847
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
843
848
|
metadata: Metadata::default(),
|
|
844
849
|
tables: vec![],
|
|
845
850
|
detected_languages: None,
|
|
@@ -865,7 +870,7 @@ async fn test_run_pipeline_with_output_format_djot() {
|
|
|
865
870
|
|
|
866
871
|
let result = ExtractionResult {
|
|
867
872
|
content: "test content".to_string(),
|
|
868
|
-
mime_type: "text/djot"
|
|
873
|
+
mime_type: Cow::Borrowed("text/djot"),
|
|
869
874
|
metadata: Metadata::default(),
|
|
870
875
|
tables: vec![],
|
|
871
876
|
detected_languages: None,
|
|
@@ -894,7 +899,7 @@ async fn test_run_pipeline_with_output_format_djot() {
|
|
|
894
899
|
images: vec![],
|
|
895
900
|
links: vec![],
|
|
896
901
|
footnotes: vec![],
|
|
897
|
-
attributes:
|
|
902
|
+
attributes: Vec::new(),
|
|
898
903
|
}),
|
|
899
904
|
};
|
|
900
905
|
|
|
@@ -912,7 +917,7 @@ async fn test_run_pipeline_with_output_format_djot() {
|
|
|
912
917
|
async fn test_run_pipeline_with_output_format_html() {
|
|
913
918
|
let result = ExtractionResult {
|
|
914
919
|
content: "test content".to_string(),
|
|
915
|
-
mime_type: "text/plain"
|
|
920
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
916
921
|
metadata: Metadata::default(),
|
|
917
922
|
tables: vec![],
|
|
918
923
|
detected_languages: None,
|
|
@@ -942,7 +947,7 @@ async fn test_run_pipeline_applies_output_format_last() {
|
|
|
942
947
|
|
|
943
948
|
let result = ExtractionResult {
|
|
944
949
|
content: "test".to_string(),
|
|
945
|
-
mime_type: "text/plain"
|
|
950
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
946
951
|
metadata: Metadata::default(),
|
|
947
952
|
tables: vec![],
|
|
948
953
|
detected_languages: None,
|
|
@@ -958,7 +963,7 @@ async fn test_run_pipeline_applies_output_format_last() {
|
|
|
958
963
|
images: vec![],
|
|
959
964
|
links: vec![],
|
|
960
965
|
footnotes: vec![],
|
|
961
|
-
attributes:
|
|
966
|
+
attributes: Vec::new(),
|
|
962
967
|
}),
|
|
963
968
|
};
|
|
964
969
|
|
|
@@ -24,6 +24,8 @@
|
|
|
24
24
|
//! # Ok(())
|
|
25
25
|
//! # }
|
|
26
26
|
//! ```
|
|
27
|
+
use bytes::Bytes;
|
|
28
|
+
|
|
27
29
|
use crate::error::{KreuzbergError, Result};
|
|
28
30
|
use crate::types::{EmailAttachment, EmailExtractionResult};
|
|
29
31
|
use mail_parser::MimeHeaders;
|
|
@@ -101,7 +103,7 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
|
|
|
101
103
|
|
|
102
104
|
let html_content = message.body_html(0).map(|s| s.to_string());
|
|
103
105
|
|
|
104
|
-
let cleaned_text = if let Some(plain) =
|
|
106
|
+
let cleaned_text = if let Some(ref plain) = plain_text {
|
|
105
107
|
plain.clone()
|
|
106
108
|
} else if let Some(html) = &html_content {
|
|
107
109
|
clean_html_content(html)
|
|
@@ -132,7 +134,7 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
|
|
|
132
134
|
mime_type: Some(mime_type),
|
|
133
135
|
size: Some(size),
|
|
134
136
|
is_image,
|
|
135
|
-
data: Some(data
|
|
137
|
+
data: Some(Bytes::copy_from_slice(data)),
|
|
136
138
|
});
|
|
137
139
|
}
|
|
138
140
|
|
|
@@ -174,39 +176,49 @@ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
|
|
|
174
176
|
let to_emails = outlook
|
|
175
177
|
.to
|
|
176
178
|
.iter()
|
|
177
|
-
.
|
|
178
|
-
|
|
179
|
+
.filter_map(|p| {
|
|
180
|
+
if p.email.is_empty() {
|
|
181
|
+
None
|
|
182
|
+
} else {
|
|
183
|
+
Some(p.email.clone())
|
|
184
|
+
}
|
|
185
|
+
})
|
|
179
186
|
.collect::<Vec<String>>();
|
|
180
187
|
|
|
181
188
|
let cc_emails = outlook
|
|
182
189
|
.cc
|
|
183
190
|
.iter()
|
|
184
|
-
.
|
|
185
|
-
|
|
191
|
+
.filter_map(|p| {
|
|
192
|
+
if p.email.is_empty() {
|
|
193
|
+
None
|
|
194
|
+
} else {
|
|
195
|
+
Some(p.email.clone())
|
|
196
|
+
}
|
|
197
|
+
})
|
|
186
198
|
.collect::<Vec<String>>();
|
|
187
199
|
|
|
188
|
-
let bcc_emails = if
|
|
189
|
-
vec![outlook.bcc.clone()]
|
|
190
|
-
} else {
|
|
200
|
+
let bcc_emails = if outlook.bcc.is_empty() {
|
|
191
201
|
vec![]
|
|
202
|
+
} else {
|
|
203
|
+
vec![outlook.bcc.clone()]
|
|
192
204
|
};
|
|
193
205
|
|
|
194
|
-
let date = if
|
|
195
|
-
Some(outlook.headers.date.clone())
|
|
196
|
-
} else {
|
|
206
|
+
let date = if outlook.headers.date.is_empty() {
|
|
197
207
|
None
|
|
208
|
+
} else {
|
|
209
|
+
Some(outlook.headers.date.clone())
|
|
198
210
|
};
|
|
199
211
|
|
|
200
|
-
let message_id = if
|
|
201
|
-
Some(outlook.headers.message_id.clone())
|
|
202
|
-
} else {
|
|
212
|
+
let message_id = if outlook.headers.message_id.is_empty() {
|
|
203
213
|
None
|
|
214
|
+
} else {
|
|
215
|
+
Some(outlook.headers.message_id.clone())
|
|
204
216
|
};
|
|
205
217
|
|
|
206
|
-
let plain_text = if
|
|
207
|
-
Some(outlook.body.clone())
|
|
208
|
-
} else {
|
|
218
|
+
let plain_text = if outlook.body.is_empty() {
|
|
209
219
|
None
|
|
220
|
+
} else {
|
|
221
|
+
Some(outlook.body.clone())
|
|
210
222
|
};
|
|
211
223
|
|
|
212
224
|
let html_content = None;
|
|
@@ -231,7 +243,7 @@ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
|
|
|
231
243
|
};
|
|
232
244
|
|
|
233
245
|
let data = if !att.payload.is_empty() {
|
|
234
|
-
hex::decode(&att.payload).ok()
|
|
246
|
+
hex::decode(&att.payload).ok().map(Bytes::from)
|
|
235
247
|
} else {
|
|
236
248
|
None
|
|
237
249
|
};
|
|
@@ -448,13 +448,13 @@ fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity:
|
|
|
448
448
|
markdown.push_str(" | ");
|
|
449
449
|
}
|
|
450
450
|
let cell_str = format_cell_to_string(cell);
|
|
451
|
-
header_cells.push(cell_str.clone());
|
|
452
451
|
|
|
453
452
|
if cell_str.contains('|') || cell_str.contains('\\') {
|
|
454
453
|
escape_markdown_into(&mut markdown, &cell_str);
|
|
455
454
|
} else {
|
|
456
455
|
markdown.push_str(&cell_str);
|
|
457
456
|
}
|
|
457
|
+
header_cells.push(cell_str);
|
|
458
458
|
}
|
|
459
459
|
markdown.push_str(" |\n");
|
|
460
460
|
cells.push(header_cells);
|
|
@@ -475,18 +475,19 @@ fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity:
|
|
|
475
475
|
if i > 0 {
|
|
476
476
|
markdown.push_str(" | ");
|
|
477
477
|
}
|
|
478
|
-
if let Some(cell) = row.get(i) {
|
|
478
|
+
let cell_str = if let Some(cell) = row.get(i) {
|
|
479
479
|
let cell_str = format_cell_to_string(cell);
|
|
480
|
-
row_cells.push(cell_str.clone());
|
|
481
480
|
|
|
482
481
|
if cell_str.contains('|') || cell_str.contains('\\') {
|
|
483
482
|
escape_markdown_into(&mut markdown, &cell_str);
|
|
484
483
|
} else {
|
|
485
484
|
markdown.push_str(&cell_str);
|
|
486
485
|
}
|
|
486
|
+
cell_str
|
|
487
487
|
} else {
|
|
488
|
-
|
|
489
|
-
}
|
|
488
|
+
String::new()
|
|
489
|
+
};
|
|
490
|
+
row_cells.push(cell_str);
|
|
490
491
|
}
|
|
491
492
|
markdown.push_str(" |\n");
|
|
492
493
|
cells.push(row_cells);
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
//! Image handling and conversion functionality for HTML extraction.
|
|
2
2
|
|
|
3
|
+
use bytes::Bytes;
|
|
4
|
+
|
|
3
5
|
use super::types::ExtractedInlineImage;
|
|
4
6
|
use html_to_markdown_rs::{InlineImage, InlineImageFormat};
|
|
5
7
|
|
|
@@ -49,13 +51,16 @@ pub fn inline_image_format_to_str(format: &InlineImageFormat) -> String {
|
|
|
49
51
|
}
|
|
50
52
|
}
|
|
51
53
|
|
|
54
|
+
// Note: This function returns String because ExtractedInlineImage.format is String (internal to HTML extraction).
|
|
55
|
+
// For external ExtractedImage, use detect_image_format from pptx which returns Cow<'static, str>.
|
|
56
|
+
|
|
52
57
|
/// Convert a library InlineImage to an ExtractedInlineImage.
|
|
53
58
|
///
|
|
54
59
|
/// Maps the library's image representation to the extraction API's format,
|
|
55
60
|
/// converting the format enum to a string representation.
|
|
56
61
|
pub fn inline_image_to_extracted(image: InlineImage) -> ExtractedInlineImage {
|
|
57
62
|
ExtractedInlineImage {
|
|
58
|
-
data: image.data,
|
|
63
|
+
data: Bytes::from(image.data),
|
|
59
64
|
format: inline_image_format_to_str(&image.format),
|
|
60
65
|
filename: image.filename,
|
|
61
66
|
description: image.description,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
//! Type definitions for HTML extraction.
|
|
2
2
|
|
|
3
|
+
use bytes::Bytes;
|
|
3
4
|
use serde::{Deserialize, Serialize};
|
|
4
|
-
use std::collections::HashMap;
|
|
5
5
|
|
|
6
6
|
pub use html_to_markdown_rs::{
|
|
7
7
|
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
|
|
@@ -19,10 +19,11 @@ pub struct HtmlExtractionResult {
|
|
|
19
19
|
/// Extracted inline image with metadata.
|
|
20
20
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
21
21
|
pub struct ExtractedInlineImage {
|
|
22
|
-
|
|
22
|
+
/// Uses `bytes::Bytes` for cheap cloning of large buffers.
|
|
23
|
+
pub data: Bytes,
|
|
23
24
|
pub format: String,
|
|
24
25
|
pub filename: Option<String>,
|
|
25
26
|
pub description: Option<String>,
|
|
26
27
|
pub dimensions: Option<(u32, u32)>,
|
|
27
|
-
pub attributes:
|
|
28
|
+
pub attributes: Vec<(String, String)>,
|
|
28
29
|
}
|
|
@@ -45,6 +45,7 @@
|
|
|
45
45
|
|
|
46
46
|
use crate::error::{KreuzbergError, Result};
|
|
47
47
|
use crate::types::LibreOfficeConversionResult;
|
|
48
|
+
use std::borrow::Cow;
|
|
48
49
|
use std::collections::HashSet;
|
|
49
50
|
use std::env;
|
|
50
51
|
use std::fs as std_fs;
|
|
@@ -326,9 +327,9 @@ pub async fn convert_doc_to_docx(doc_bytes: &[u8]) -> Result<LibreOfficeConversi
|
|
|
326
327
|
|
|
327
328
|
Ok(LibreOfficeConversionResult {
|
|
328
329
|
converted_bytes,
|
|
329
|
-
original_format: "doc"
|
|
330
|
-
target_format: "docx"
|
|
331
|
-
target_mime: crate::core::mime::DOCX_MIME_TYPE
|
|
330
|
+
original_format: Cow::Borrowed("doc"),
|
|
331
|
+
target_format: Cow::Borrowed("docx"),
|
|
332
|
+
target_mime: Cow::Borrowed(crate::core::mime::DOCX_MIME_TYPE),
|
|
332
333
|
})
|
|
333
334
|
}
|
|
334
335
|
|
|
@@ -350,9 +351,9 @@ pub async fn convert_ppt_to_pptx(ppt_bytes: &[u8]) -> Result<LibreOfficeConversi
|
|
|
350
351
|
|
|
351
352
|
Ok(LibreOfficeConversionResult {
|
|
352
353
|
converted_bytes,
|
|
353
|
-
original_format: "ppt"
|
|
354
|
-
target_format: "pptx"
|
|
355
|
-
target_mime: crate::core::mime::POWER_POINT_MIME_TYPE
|
|
354
|
+
original_format: Cow::Borrowed("ppt"),
|
|
355
|
+
target_format: Cow::Borrowed("pptx"),
|
|
356
|
+
target_mime: Cow::Borrowed(crate::core::mime::POWER_POINT_MIME_TYPE),
|
|
356
357
|
})
|
|
357
358
|
}
|
|
358
359
|
|
|
@@ -505,9 +506,9 @@ mod tests {
|
|
|
505
506
|
async fn test_conversion_result_structure() {
|
|
506
507
|
let result = LibreOfficeConversionResult {
|
|
507
508
|
converted_bytes: vec![1, 2, 3],
|
|
508
|
-
original_format: "doc"
|
|
509
|
-
target_format: "docx"
|
|
510
|
-
target_mime: crate::core::mime::DOCX_MIME_TYPE
|
|
509
|
+
original_format: Cow::Borrowed("doc"),
|
|
510
|
+
target_format: Cow::Borrowed("docx"),
|
|
511
|
+
target_mime: Cow::Borrowed(crate::core::mime::DOCX_MIME_TYPE),
|
|
511
512
|
};
|
|
512
513
|
|
|
513
514
|
assert_eq!(result.original_format, "doc");
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
//! This module handles image-related parsing from slide XML and
|
|
4
4
|
//! detection of image formats from file data.
|
|
5
5
|
|
|
6
|
+
use std::borrow::Cow;
|
|
7
|
+
|
|
6
8
|
pub(super) fn html_escape(text: &str) -> String {
|
|
7
9
|
text.replace('&', "&amp;")
|
|
8
10
|
.replace('<', "&lt;")
|
|
@@ -11,21 +13,21 @@ pub(super) fn html_escape(text: &str) -> String {
|
|
|
11
13
|
.replace('\'', "&#x27;")
|
|
12
14
|
}
|
|
13
15
|
|
|
14
|
-
pub(super) fn detect_image_format(data: &[u8]) ->
|
|
16
|
+
pub(super) fn detect_image_format(data: &[u8]) -> Cow<'static, str> {
|
|
15
17
|
if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
|
|
16
|
-
"jpeg"
|
|
18
|
+
Cow::Borrowed("jpeg")
|
|
17
19
|
} else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
|
|
18
|
-
"png"
|
|
20
|
+
Cow::Borrowed("png")
|
|
19
21
|
} else if data.starts_with(b"GIF") {
|
|
20
|
-
"gif"
|
|
22
|
+
Cow::Borrowed("gif")
|
|
21
23
|
} else if data.starts_with(b"BM") {
|
|
22
|
-
"bmp"
|
|
24
|
+
Cow::Borrowed("bmp")
|
|
23
25
|
} else if data.starts_with(b"<svg") || data.starts_with(b"<?xml") {
|
|
24
|
-
"svg"
|
|
26
|
+
Cow::Borrowed("svg")
|
|
25
27
|
} else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
|
|
26
|
-
"tiff"
|
|
28
|
+
Cow::Borrowed("tiff")
|
|
27
29
|
} else {
|
|
28
|
-
"unknown"
|
|
30
|
+
Cow::Borrowed("unknown")
|
|
29
31
|
}
|
|
30
32
|
}
|
|
31
33
|
|
|
@@ -45,6 +45,8 @@ mod image_handling;
|
|
|
45
45
|
mod metadata;
|
|
46
46
|
mod parser;
|
|
47
47
|
|
|
48
|
+
use bytes::Bytes;
|
|
49
|
+
|
|
48
50
|
use crate::error::Result;
|
|
49
51
|
use crate::types::{ExtractedImage, PptxExtractionResult};
|
|
50
52
|
|
|
@@ -117,8 +119,8 @@ pub fn extract_pptx_from_path(
|
|
|
117
119
|
let image_index = extracted_images.len();
|
|
118
120
|
|
|
119
121
|
extracted_images.push(ExtractedImage {
|
|
120
|
-
data,
|
|
121
|
-
format,
|
|
122
|
+
data: Bytes::from(data),
|
|
123
|
+
format, // Already a Cow<'static, str> from detect_image_format
|
|
122
124
|
image_index,
|
|
123
125
|
page_number: Some(slide.slide_number as usize),
|
|
124
126
|
width: None,
|
|
@@ -333,11 +335,13 @@ mod tests {
|
|
|
333
335
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
|
|
334
336
|
);
|
|
335
337
|
for (i, _) in slides.iter().enumerate() {
|
|
336
|
-
|
|
338
|
+
use std::fmt::Write;
|
|
339
|
+
let _ = write!(
|
|
340
|
+
rels_xml,
|
|
337
341
|
r#"<Relationship Id="rId{}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{}.xml"/>"#,
|
|
338
342
|
i + 1,
|
|
339
343
|
i + 1
|
|
340
|
-
)
|
|
344
|
+
);
|
|
341
345
|
}
|
|
342
346
|
rels_xml.push_str("</Relationships>");
|
|
343
347
|
zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
|
|
@@ -33,12 +33,13 @@
|
|
|
33
33
|
use crate::error::{KreuzbergError, Result};
|
|
34
34
|
use crate::text::utf8_validation;
|
|
35
35
|
use serde::{Deserialize, Serialize};
|
|
36
|
+
use std::borrow::Cow;
|
|
36
37
|
use std::collections::HashMap;
|
|
37
38
|
|
|
38
39
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
39
40
|
pub struct StructuredDataResult {
|
|
40
41
|
pub content: String,
|
|
41
|
-
pub format:
|
|
42
|
+
pub format: Cow<'static, str>,
|
|
42
43
|
pub metadata: HashMap<String, String>,
|
|
43
44
|
pub text_fields: Vec<String>,
|
|
44
45
|
}
|
|
@@ -97,7 +98,7 @@ pub fn parse_json(data: &[u8], config: Option<JsonExtractionConfig>) -> Result<S
|
|
|
97
98
|
|
|
98
99
|
Ok(StructuredDataResult {
|
|
99
100
|
content,
|
|
100
|
-
format: "json"
|
|
101
|
+
format: Cow::Borrowed("json"),
|
|
101
102
|
metadata,
|
|
102
103
|
text_fields,
|
|
103
104
|
})
|
|
@@ -254,7 +255,7 @@ pub fn parse_yaml(data: &[u8]) -> Result<StructuredDataResult> {
|
|
|
254
255
|
|
|
255
256
|
Ok(StructuredDataResult {
|
|
256
257
|
content,
|
|
257
|
-
format: "yaml"
|
|
258
|
+
format: Cow::Borrowed("yaml"),
|
|
258
259
|
metadata,
|
|
259
260
|
text_fields,
|
|
260
261
|
})
|
|
@@ -326,7 +327,7 @@ pub fn parse_toml(data: &[u8]) -> Result<StructuredDataResult> {
|
|
|
326
327
|
|
|
327
328
|
Ok(StructuredDataResult {
|
|
328
329
|
content,
|
|
329
|
-
format: "toml"
|
|
330
|
+
format: Cow::Borrowed("toml"),
|
|
330
331
|
metadata,
|
|
331
332
|
text_fields,
|
|
332
333
|
})
|
|
@@ -167,7 +167,7 @@ pub(super) fn process_images(
|
|
|
167
167
|
element_index: Some(elements.len()),
|
|
168
168
|
additional: {
|
|
169
169
|
let mut m = HashMap::new();
|
|
170
|
-
m.insert("format".to_string(), image.format.
|
|
170
|
+
m.insert("format".to_string(), image.format.to_string());
|
|
171
171
|
if let Some(width) = image.width {
|
|
172
172
|
m.insert("width".to_string(), width.to_string());
|
|
173
173
|
}
|