kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -3,8 +3,6 @@
|
|
|
3
3
|
//! Tests for .eml (RFC822) email extraction.
|
|
4
4
|
//! Validates metadata extraction, content extraction, HTML/plain text handling, and attachments.
|
|
5
5
|
|
|
6
|
-
#![cfg(feature = "email")]
|
|
7
|
-
|
|
8
6
|
use kreuzberg::core::config::ExtractionConfig;
|
|
9
7
|
use kreuzberg::core::extractor::extract_bytes;
|
|
10
8
|
|
|
@@ -52,7 +50,7 @@ This is the email body content.";
|
|
|
52
50
|
|
|
53
51
|
assert!(email_meta.attachments.is_empty(), "Should have no attachments");
|
|
54
52
|
|
|
55
|
-
assert!(result.metadata.
|
|
53
|
+
assert!(result.metadata.date.is_some());
|
|
56
54
|
|
|
57
55
|
assert!(result.content.contains("Subject: Test Email Subject"));
|
|
58
56
|
assert!(result.content.contains("From: sender@example.com"));
|
|
@@ -12,7 +12,6 @@ mod helpers;
|
|
|
12
12
|
|
|
13
13
|
/// Test truncated PDF - incomplete PDF file.
|
|
14
14
|
#[tokio::test]
|
|
15
|
-
#[cfg(feature = "pdf")]
|
|
16
15
|
async fn test_truncated_pdf() {
|
|
17
16
|
let config = ExtractionConfig::default();
|
|
18
17
|
|
|
@@ -32,7 +31,6 @@ async fn test_truncated_pdf() {
|
|
|
32
31
|
|
|
33
32
|
/// Test corrupted ZIP - malformed archive.
|
|
34
33
|
#[tokio::test]
|
|
35
|
-
#[cfg(feature = "archives")]
|
|
36
34
|
async fn test_corrupted_zip() {
|
|
37
35
|
let config = ExtractionConfig::default();
|
|
38
36
|
|
|
@@ -52,7 +50,6 @@ async fn test_corrupted_zip() {
|
|
|
52
50
|
|
|
53
51
|
/// Test invalid XML - bad XML syntax.
|
|
54
52
|
#[tokio::test]
|
|
55
|
-
#[cfg(feature = "xml")]
|
|
56
53
|
async fn test_invalid_xml() {
|
|
57
54
|
let config = ExtractionConfig::default();
|
|
58
55
|
|
|
@@ -83,7 +80,6 @@ async fn test_invalid_xml() {
|
|
|
83
80
|
|
|
84
81
|
/// Test corrupted image - invalid image data.
|
|
85
82
|
#[tokio::test]
|
|
86
|
-
#[cfg(feature = "ocr")]
|
|
87
83
|
async fn test_corrupted_image() {
|
|
88
84
|
let config = ExtractionConfig::default();
|
|
89
85
|
|
|
@@ -116,28 +112,27 @@ async fn test_empty_file() {
|
|
|
116
112
|
|
|
117
113
|
let empty_data = b"";
|
|
118
114
|
|
|
115
|
+
let result_pdf = extract_bytes(empty_data, "application/pdf", &config).await;
|
|
119
116
|
let result_text = extract_bytes(empty_data, "text/plain", &config).await;
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
);
|
|
140
|
-
}
|
|
117
|
+
let result_xml = extract_bytes(empty_data, "application/xml", &config).await;
|
|
118
|
+
|
|
119
|
+
match result_pdf {
|
|
120
|
+
Ok(extraction) => {
|
|
121
|
+
assert!(
|
|
122
|
+
extraction.content.is_empty(),
|
|
123
|
+
"Empty PDF should have empty content if it succeeds"
|
|
124
|
+
);
|
|
125
|
+
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
126
|
+
}
|
|
127
|
+
Err(error) => {
|
|
128
|
+
assert!(
|
|
129
|
+
matches!(
|
|
130
|
+
error,
|
|
131
|
+
kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. }
|
|
132
|
+
),
|
|
133
|
+
"Empty PDF should produce Parsing or Validation error, got: {:?}",
|
|
134
|
+
error
|
|
135
|
+
);
|
|
141
136
|
}
|
|
142
137
|
}
|
|
143
138
|
|
|
@@ -154,24 +149,20 @@ async fn test_empty_file() {
|
|
|
154
149
|
}
|
|
155
150
|
}
|
|
156
151
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
"Empty XML error should be Parsing type, got: {:?}",
|
|
172
|
-
error
|
|
173
|
-
);
|
|
174
|
-
}
|
|
152
|
+
match result_xml {
|
|
153
|
+
Ok(extraction) => {
|
|
154
|
+
assert!(
|
|
155
|
+
extraction.content.is_empty(),
|
|
156
|
+
"Empty XML should have empty content if it succeeds"
|
|
157
|
+
);
|
|
158
|
+
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
159
|
+
}
|
|
160
|
+
Err(error) => {
|
|
161
|
+
assert!(
|
|
162
|
+
matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
|
|
163
|
+
"Empty XML error should be Parsing type, got: {:?}",
|
|
164
|
+
error
|
|
165
|
+
);
|
|
175
166
|
}
|
|
176
167
|
}
|
|
177
168
|
}
|
|
@@ -5,14 +5,9 @@
|
|
|
5
5
|
//! asynchronous APIs or to graceful handling when optional system
|
|
6
6
|
//! dependencies are missing.
|
|
7
7
|
|
|
8
|
-
#![cfg(any(feature = "pdf", feature = "office", feature = "ocr"))]
|
|
9
|
-
|
|
10
8
|
mod helpers;
|
|
11
9
|
|
|
12
|
-
use helpers::{assert_mime_type, get_test_file_path, test_documents_available};
|
|
13
|
-
|
|
14
|
-
#[cfg(any(feature = "office", feature = "ocr"))]
|
|
15
|
-
use helpers::assert_non_empty_content;
|
|
10
|
+
use helpers::{assert_mime_type, assert_non_empty_content, get_test_file_path, test_documents_available};
|
|
16
11
|
use kreuzberg::core::config::ExtractionConfig;
|
|
17
12
|
use kreuzberg::core::extractor::extract_file;
|
|
18
13
|
|
|
@@ -49,7 +44,6 @@ async fn test_pdf_password_protected_async() {
|
|
|
49
44
|
|
|
50
45
|
#[cfg(feature = "office")]
|
|
51
46
|
#[tokio::test]
|
|
52
|
-
#[cfg_attr(target_os = "windows", ignore = "LibreOffice tests timeout on Windows CI")]
|
|
53
47
|
async fn test_legacy_doc_extraction_async() {
|
|
54
48
|
if !test_documents_available() {
|
|
55
49
|
return;
|
|
@@ -121,66 +121,6 @@ pub fn test_config_with_ocr() -> kreuzberg::core::config::ExtractionConfig {
|
|
|
121
121
|
}
|
|
122
122
|
}
|
|
123
123
|
|
|
124
|
-
// PDF-specific test helpers (only available with pdf feature)
|
|
125
|
-
#[cfg(feature = "pdf")]
|
|
126
|
-
pub mod pdf_helpers {
|
|
127
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
128
|
-
use kreuzberg::pdf::hierarchy::{BoundingBox, CharData};
|
|
129
|
-
|
|
130
|
-
/// Create a bounding box with simple coordinates.
|
|
131
|
-
///
|
|
132
|
-
/// # Arguments
|
|
133
|
-
///
|
|
134
|
-
/// * `left` - Left x-coordinate
|
|
135
|
-
/// * `top` - Top y-coordinate
|
|
136
|
-
/// * `right` - Right x-coordinate
|
|
137
|
-
/// * `bottom` - Bottom y-coordinate
|
|
138
|
-
///
|
|
139
|
-
/// # Returns
|
|
140
|
-
///
|
|
141
|
-
/// A new BoundingBox with the specified coordinates
|
|
142
|
-
pub fn create_bounding_box(left: f32, top: f32, right: f32, bottom: f32) -> BoundingBox {
|
|
143
|
-
BoundingBox {
|
|
144
|
-
left,
|
|
145
|
-
top,
|
|
146
|
-
right,
|
|
147
|
-
bottom,
|
|
148
|
-
}
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
/// Create a character data with minimal parameters.
|
|
152
|
-
///
|
|
153
|
-
/// # Arguments
|
|
154
|
-
///
|
|
155
|
-
/// * `text` - Character text content
|
|
156
|
-
/// * `x` - X position
|
|
157
|
-
/// * `y` - Y position
|
|
158
|
-
/// * `font_size` - Font size in points
|
|
159
|
-
///
|
|
160
|
-
/// # Returns
|
|
161
|
-
///
|
|
162
|
-
/// A new CharData with calculated width and height
|
|
163
|
-
pub fn create_char_data(text: &str, x: f32, y: f32, font_size: f32) -> CharData {
|
|
164
|
-
CharData {
|
|
165
|
-
text: text.to_string(),
|
|
166
|
-
x,
|
|
167
|
-
y,
|
|
168
|
-
font_size,
|
|
169
|
-
width: font_size * 0.6,
|
|
170
|
-
height: font_size,
|
|
171
|
-
}
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
/// Create a default extraction configuration for testing hierarchy extraction.
|
|
175
|
-
///
|
|
176
|
-
/// # Returns
|
|
177
|
-
///
|
|
178
|
-
/// A new ExtractionConfig with PDF hierarchy options enabled
|
|
179
|
-
pub fn create_hierarchy_extraction_config() -> ExtractionConfig {
|
|
180
|
-
ExtractionConfig::default()
|
|
181
|
-
}
|
|
182
|
-
}
|
|
183
|
-
|
|
184
124
|
#[cfg(test)]
|
|
185
125
|
mod tests {
|
|
186
126
|
use super::*;
|
|
@@ -276,23 +276,22 @@ async fn test_no_extension() {
|
|
|
276
276
|
|
|
277
277
|
let detected = detect_mime_type(&temp_path, true);
|
|
278
278
|
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
}
|
|
279
|
+
if detected.is_err() {
|
|
280
|
+
let error = detected.unwrap_err();
|
|
281
|
+
assert!(
|
|
282
|
+
matches!(
|
|
283
|
+
error,
|
|
284
|
+
kreuzberg::KreuzbergError::Validation { .. } | kreuzberg::KreuzbergError::UnsupportedFormat(_)
|
|
285
|
+
),
|
|
286
|
+
"Should return appropriate error for file without extension"
|
|
287
|
+
);
|
|
288
|
+
} else {
|
|
289
|
+
let mime = detected.unwrap();
|
|
290
|
+
assert!(
|
|
291
|
+
mime.contains('/'),
|
|
292
|
+
"Detected MIME type should be valid format: {}",
|
|
293
|
+
mime
|
|
294
|
+
);
|
|
296
295
|
}
|
|
297
296
|
|
|
298
297
|
let _ = std::fs::remove_file(&temp_path);
|
|
@@ -11,8 +11,6 @@
|
|
|
11
11
|
//! - Verify configuration changes actually affect output
|
|
12
12
|
//! - Test table detection with various settings
|
|
13
13
|
|
|
14
|
-
#![cfg(feature = "ocr")]
|
|
15
|
-
|
|
16
14
|
mod helpers;
|
|
17
15
|
|
|
18
16
|
use helpers::*;
|
|
@@ -206,7 +204,6 @@ fn test_ocr_psm_single_line() {
|
|
|
206
204
|
}
|
|
207
205
|
|
|
208
206
|
#[test]
|
|
209
|
-
#[cfg(feature = "pdf")]
|
|
210
207
|
fn test_force_ocr_on_text_pdf() {
|
|
211
208
|
if skip_if_missing("pdfs/fake_memo.pdf") {
|
|
212
209
|
return;
|
|
@@ -236,7 +233,6 @@ fn test_force_ocr_on_text_pdf() {
|
|
|
236
233
|
}
|
|
237
234
|
|
|
238
235
|
#[test]
|
|
239
|
-
#[cfg(feature = "pdf")]
|
|
240
236
|
fn test_force_ocr_disabled() {
|
|
241
237
|
if skip_if_missing("pdfs/fake_memo.pdf") {
|
|
242
238
|
return;
|
|
@@ -13,8 +13,6 @@
|
|
|
13
13
|
//! - Test recovery from transient failures
|
|
14
14
|
//! - Validate resource limits and constraints
|
|
15
15
|
|
|
16
|
-
#![cfg(feature = "ocr")]
|
|
17
|
-
|
|
18
16
|
mod helpers;
|
|
19
17
|
|
|
20
18
|
use helpers::*;
|
|
@@ -453,9 +451,6 @@ fn test_ocr_cache_disabled_then_enabled() {
|
|
|
453
451
|
};
|
|
454
452
|
|
|
455
453
|
let result1 = extract_file_sync(&file_path, None, &config_no_cache);
|
|
456
|
-
if matches!(result1, Err(KreuzbergError::MissingDependency(_))) {
|
|
457
|
-
return;
|
|
458
|
-
}
|
|
459
454
|
assert!(result1.is_ok(), "First extraction should succeed");
|
|
460
455
|
|
|
461
456
|
let config_with_cache = ExtractionConfig {
|
|
@@ -473,9 +468,6 @@ fn test_ocr_cache_disabled_then_enabled() {
|
|
|
473
468
|
};
|
|
474
469
|
|
|
475
470
|
let result2 = extract_file_sync(&file_path, None, &config_with_cache);
|
|
476
|
-
if matches!(result2, Err(KreuzbergError::MissingDependency(_))) {
|
|
477
|
-
return;
|
|
478
|
-
}
|
|
479
471
|
assert!(result2.is_ok(), "Second extraction should succeed");
|
|
480
472
|
|
|
481
473
|
assert_non_empty_content(&result1.unwrap());
|
|
@@ -503,13 +495,6 @@ fn test_ocr_concurrent_same_file() {
|
|
|
503
495
|
..Default::default()
|
|
504
496
|
});
|
|
505
497
|
|
|
506
|
-
if matches!(
|
|
507
|
-
extract_file_sync(&*file_path, None, &config),
|
|
508
|
-
Err(KreuzbergError::MissingDependency(_))
|
|
509
|
-
) {
|
|
510
|
-
return;
|
|
511
|
-
}
|
|
512
|
-
|
|
513
498
|
let mut handles = vec![];
|
|
514
499
|
for i in 0..5 {
|
|
515
500
|
let file_path_clone = Arc::clone(&file_path);
|
|
@@ -569,13 +554,6 @@ fn test_ocr_concurrent_different_files() {
|
|
|
569
554
|
..Default::default()
|
|
570
555
|
});
|
|
571
556
|
|
|
572
|
-
if matches!(
|
|
573
|
-
extract_file_sync(&files[0], None, &config),
|
|
574
|
-
Err(KreuzbergError::MissingDependency(_))
|
|
575
|
-
) {
|
|
576
|
-
return;
|
|
577
|
-
}
|
|
578
|
-
|
|
579
557
|
let mut handles = vec![];
|
|
580
558
|
for (i, file_path) in files.iter().enumerate() {
|
|
581
559
|
let file_path_clone = file_path.clone();
|