kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -19,18 +19,6 @@ use kreuzberg::core::extractor::extract_file_sync;
|
|
|
19
19
|
|
|
20
20
|
mod helpers;
|
|
21
21
|
|
|
22
|
-
fn trim_trailing_newlines(value: &str) -> &str {
|
|
23
|
-
value.trim_end_matches(['\n', '\r'])
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
27
|
-
assert_eq!(
|
|
28
|
-
trim_trailing_newlines(actual),
|
|
29
|
-
expected,
|
|
30
|
-
"Content mismatch after trimming trailing newlines"
|
|
31
|
-
);
|
|
32
|
-
}
|
|
33
|
-
|
|
34
22
|
/// Test that batch extraction processes documents in parallel.
|
|
35
23
|
///
|
|
36
24
|
/// Validates:
|
|
@@ -307,13 +295,9 @@ async fn test_batch_bytes_parallel_processing() {
|
|
|
307
295
|
.collect();
|
|
308
296
|
|
|
309
297
|
let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
|
|
310
|
-
let owned_contents: Vec<(Vec<u8>, String)> = contents_ref
|
|
311
|
-
.into_iter()
|
|
312
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
313
|
-
.collect();
|
|
314
298
|
|
|
315
299
|
let start = Instant::now();
|
|
316
|
-
let results = batch_extract_bytes(
|
|
300
|
+
let results = batch_extract_bytes(contents_ref, &config).await;
|
|
317
301
|
let duration = start.elapsed();
|
|
318
302
|
|
|
319
303
|
assert!(results.is_ok());
|
|
@@ -321,8 +305,7 @@ async fn test_batch_bytes_parallel_processing() {
|
|
|
321
305
|
assert_eq!(results.len(), 30);
|
|
322
306
|
|
|
323
307
|
for (i, result) in results.iter().enumerate() {
|
|
324
|
-
|
|
325
|
-
assert_text_content(&result.content, &expected);
|
|
308
|
+
assert_eq!(result.content, format!("Test content number {}", i));
|
|
326
309
|
}
|
|
327
310
|
|
|
328
311
|
println!("Batch processed 30 byte arrays in {:?}", duration);
|
|
@@ -341,20 +324,15 @@ async fn test_batch_bytes_mixed_valid_invalid() {
|
|
|
341
324
|
(b"valid content 3".as_slice(), "text/plain"),
|
|
342
325
|
];
|
|
343
326
|
|
|
344
|
-
let
|
|
345
|
-
.into_iter()
|
|
346
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
347
|
-
.collect();
|
|
348
|
-
|
|
349
|
-
let results = batch_extract_bytes(owned_contents, &config).await;
|
|
327
|
+
let results = batch_extract_bytes(contents, &config).await;
|
|
350
328
|
|
|
351
329
|
assert!(results.is_ok());
|
|
352
330
|
let results = results.unwrap();
|
|
353
331
|
assert_eq!(results.len(), 5);
|
|
354
332
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
333
|
+
assert_eq!(results[0].content, "valid content 1");
|
|
334
|
+
assert_eq!(results[2].content, "valid content 2");
|
|
335
|
+
assert_eq!(results[4].content, "valid content 3");
|
|
358
336
|
|
|
359
337
|
assert!(results[1].metadata.error.is_some());
|
|
360
338
|
assert!(results[3].metadata.error.is_some());
|
|
@@ -383,13 +361,9 @@ async fn test_batch_utilizes_multiple_cores() {
|
|
|
383
361
|
}
|
|
384
362
|
|
|
385
363
|
let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
|
|
386
|
-
let owned_contents: Vec<(Vec<u8>, String)> = contents_ref
|
|
387
|
-
.into_iter()
|
|
388
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
389
|
-
.collect();
|
|
390
364
|
|
|
391
365
|
let start = Instant::now();
|
|
392
|
-
let results = batch_extract_bytes(
|
|
366
|
+
let results = batch_extract_bytes(contents_ref, &config).await;
|
|
393
367
|
let duration = start.elapsed();
|
|
394
368
|
|
|
395
369
|
assert!(results.is_ok());
|
|
@@ -426,13 +400,9 @@ async fn test_batch_memory_pressure_handling() {
|
|
|
426
400
|
}
|
|
427
401
|
|
|
428
402
|
let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
|
|
429
|
-
let owned_contents: Vec<(Vec<u8>, String)> = contents_ref
|
|
430
|
-
.into_iter()
|
|
431
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
432
|
-
.collect();
|
|
433
403
|
|
|
434
404
|
let start = Instant::now();
|
|
435
|
-
let results = batch_extract_bytes(
|
|
405
|
+
let results = batch_extract_bytes(contents_ref, &config).await;
|
|
436
406
|
let duration = start.elapsed();
|
|
437
407
|
|
|
438
408
|
assert!(results.is_ok());
|
|
@@ -462,13 +432,8 @@ async fn test_batch_scales_with_cpu_count() {
|
|
|
462
432
|
|
|
463
433
|
let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
|
|
464
434
|
|
|
465
|
-
let owned_contents_1: Vec<(Vec<u8>, String)> = contents_ref
|
|
466
|
-
.iter()
|
|
467
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
468
|
-
.collect();
|
|
469
|
-
|
|
470
435
|
let start = Instant::now();
|
|
471
|
-
let _ = batch_extract_bytes(
|
|
436
|
+
let _ = batch_extract_bytes(contents_ref.clone(), &config_1).await.unwrap();
|
|
472
437
|
let duration_1 = start.elapsed();
|
|
473
438
|
|
|
474
439
|
let config_full = ExtractionConfig {
|
|
@@ -476,13 +441,8 @@ async fn test_batch_scales_with_cpu_count() {
|
|
|
476
441
|
..Default::default()
|
|
477
442
|
};
|
|
478
443
|
|
|
479
|
-
let owned_contents_full: Vec<(Vec<u8>, String)> = contents_ref
|
|
480
|
-
.into_iter()
|
|
481
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
482
|
-
.collect();
|
|
483
|
-
|
|
484
444
|
let start = Instant::now();
|
|
485
|
-
let _ = batch_extract_bytes(
|
|
445
|
+
let _ = batch_extract_bytes(contents_ref, &config_full).await.unwrap();
|
|
486
446
|
let duration_full = start.elapsed();
|
|
487
447
|
|
|
488
448
|
println!(
|
|
@@ -566,20 +526,15 @@ async fn test_batch_accuracy_under_load() {
|
|
|
566
526
|
}
|
|
567
527
|
|
|
568
528
|
let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
|
|
569
|
-
let owned_contents: Vec<(Vec<u8>, String)> = contents_ref
|
|
570
|
-
.into_iter()
|
|
571
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
572
|
-
.collect();
|
|
573
529
|
|
|
574
|
-
let results = batch_extract_bytes(
|
|
530
|
+
let results = batch_extract_bytes(contents_ref, &config).await.unwrap();
|
|
575
531
|
|
|
576
532
|
assert_eq!(results.len(), 100);
|
|
577
533
|
|
|
578
534
|
for (i, result) in results.iter().enumerate() {
|
|
579
535
|
let expected = format!("Document number {} with unique content", i);
|
|
580
536
|
assert_eq!(
|
|
581
|
-
|
|
582
|
-
expected,
|
|
537
|
+
result.content, expected,
|
|
583
538
|
"Document {} content mismatch - possible cross-contamination",
|
|
584
539
|
i
|
|
585
540
|
);
|
|
@@ -4,29 +4,16 @@
|
|
|
4
4
|
//! Validates concurrent processing, error handling, and performance.
|
|
5
5
|
|
|
6
6
|
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
7
|
+
use kreuzberg::core::extractor::{
|
|
8
|
+
batch_extract_bytes, batch_extract_bytes_sync, batch_extract_file, batch_extract_file_sync,
|
|
9
|
+
};
|
|
10
10
|
use std::path::PathBuf;
|
|
11
11
|
|
|
12
12
|
mod helpers;
|
|
13
13
|
use helpers::{get_test_documents_dir, get_test_file_path, skip_if_missing, test_documents_available};
|
|
14
14
|
|
|
15
|
-
fn trim_trailing_newlines(value: &str) -> &str {
|
|
16
|
-
value.trim_end_matches(['\n', '\r'])
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
20
|
-
assert_eq!(
|
|
21
|
-
trim_trailing_newlines(actual),
|
|
22
|
-
expected,
|
|
23
|
-
"Content mismatch after trimming trailing newlines"
|
|
24
|
-
);
|
|
25
|
-
}
|
|
26
|
-
|
|
27
15
|
/// Test batch extraction with multiple file formats (PDF, DOCX, TXT).
|
|
28
16
|
#[tokio::test]
|
|
29
|
-
#[cfg(all(feature = "pdf", feature = "office", feature = "tokio-runtime"))]
|
|
30
17
|
async fn test_batch_extract_file_multiple_formats() {
|
|
31
18
|
if !test_documents_available() {
|
|
32
19
|
println!("Skipping test: test_documents/ directory not found");
|
|
@@ -74,7 +61,6 @@ async fn test_batch_extract_file_multiple_formats() {
|
|
|
74
61
|
|
|
75
62
|
/// Test synchronous batch extraction variant.
|
|
76
63
|
#[test]
|
|
77
|
-
#[cfg(feature = "pdf")]
|
|
78
64
|
fn test_batch_extract_file_sync_variant() {
|
|
79
65
|
if !test_documents_available() {
|
|
80
66
|
println!("Skipping test: test_documents/ directory not found");
|
|
@@ -129,19 +115,14 @@ async fn test_batch_extract_bytes_multiple() {
|
|
|
129
115
|
(json_bytes.as_slice(), "application/json"),
|
|
130
116
|
];
|
|
131
117
|
|
|
132
|
-
let
|
|
133
|
-
.into_iter()
|
|
134
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
135
|
-
.collect();
|
|
136
|
-
|
|
137
|
-
let results = batch_extract_bytes(owned_contents, &config).await;
|
|
118
|
+
let results = batch_extract_bytes(contents, &config).await;
|
|
138
119
|
|
|
139
120
|
assert!(results.is_ok(), "Batch bytes extraction should succeed");
|
|
140
121
|
let results = results.unwrap();
|
|
141
122
|
|
|
142
123
|
assert_eq!(results.len(), 3);
|
|
143
124
|
|
|
144
|
-
|
|
125
|
+
assert_eq!(results[0].content, "This is plain text content");
|
|
145
126
|
assert_eq!(results[0].mime_type, "text/plain");
|
|
146
127
|
|
|
147
128
|
assert!(results[1].content.contains("Markdown Header"));
|
|
@@ -311,18 +292,13 @@ fn test_batch_extract_bytes_sync_variant() {
|
|
|
311
292
|
(b"# content 3".as_slice(), "text/markdown"),
|
|
312
293
|
];
|
|
313
294
|
|
|
314
|
-
let
|
|
315
|
-
.into_iter()
|
|
316
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
317
|
-
.collect();
|
|
318
|
-
|
|
319
|
-
let results = batch_extract_bytes_sync(owned_contents, &config);
|
|
295
|
+
let results = batch_extract_bytes_sync(contents, &config);
|
|
320
296
|
|
|
321
297
|
assert!(results.is_ok(), "Sync batch bytes extraction should succeed");
|
|
322
298
|
let results = results.unwrap();
|
|
323
299
|
|
|
324
300
|
assert_eq!(results.len(), 3);
|
|
325
|
-
|
|
326
|
-
|
|
301
|
+
assert_eq!(results[0].content, "content 1");
|
|
302
|
+
assert_eq!(results[1].content, "content 2");
|
|
327
303
|
assert!(results[2].content.contains("content 3"));
|
|
328
304
|
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#[cfg(feature = "chunking")]
|
|
2
|
+
#[test]
|
|
3
|
+
fn demonstrate_correct_offset_calculation() {
|
|
4
|
+
use kreuzberg::chunking::{ChunkerType, ChunkingConfig, chunk_text};
|
|
5
|
+
|
|
6
|
+
println!("\n=== Demonstrating Correct Chunking Offset Calculation ===\n");
|
|
7
|
+
|
|
8
|
+
let config_with_overlap = ChunkingConfig {
|
|
9
|
+
max_characters: 20,
|
|
10
|
+
overlap: 5,
|
|
11
|
+
trim: false,
|
|
12
|
+
chunker_type: ChunkerType::Text,
|
|
13
|
+
};
|
|
14
|
+
|
|
15
|
+
let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
|
|
16
|
+
println!("Text: \"{}\"", text);
|
|
17
|
+
println!(
|
|
18
|
+
"Max characters: {}, Overlap: {}\n",
|
|
19
|
+
config_with_overlap.max_characters, config_with_overlap.overlap
|
|
20
|
+
);
|
|
21
|
+
|
|
22
|
+
let result = chunk_text(text, &config_with_overlap).unwrap();
|
|
23
|
+
|
|
24
|
+
println!("WITH OVERLAP (5 chars):");
|
|
25
|
+
for (i, chunk) in result.chunks.iter().enumerate() {
|
|
26
|
+
println!(
|
|
27
|
+
" Chunk {}: [{:3} - {:3}] = \"{}\"",
|
|
28
|
+
i,
|
|
29
|
+
chunk.metadata.char_start,
|
|
30
|
+
chunk.metadata.char_end,
|
|
31
|
+
chunk.content.replace('\n', "\\n")
|
|
32
|
+
);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
println!("\nOverlap verification:");
|
|
36
|
+
for i in 0..result.chunks.len() - 1 {
|
|
37
|
+
let current = &result.chunks[i];
|
|
38
|
+
let next = &result.chunks[i + 1];
|
|
39
|
+
let overlap_size = current.metadata.char_end - next.metadata.char_start;
|
|
40
|
+
println!(
|
|
41
|
+
" Chunks {} and {}: overlap = {} chars (next starts at {} while current ends at {})",
|
|
42
|
+
i,
|
|
43
|
+
i + 1,
|
|
44
|
+
overlap_size,
|
|
45
|
+
next.metadata.char_start,
|
|
46
|
+
current.metadata.char_end
|
|
47
|
+
);
|
|
48
|
+
assert!(
|
|
49
|
+
overlap_size > 0 && overlap_size <= config_with_overlap.overlap + 10,
|
|
50
|
+
"Overlap should exist and be reasonable"
|
|
51
|
+
);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
println!("\n\n=== Without Overlap ===\n");
|
|
55
|
+
let config_no_overlap = ChunkingConfig {
|
|
56
|
+
max_characters: 20,
|
|
57
|
+
overlap: 0,
|
|
58
|
+
trim: false,
|
|
59
|
+
chunker_type: ChunkerType::Text,
|
|
60
|
+
};
|
|
61
|
+
|
|
62
|
+
let result_no_overlap = chunk_text(text, &config_no_overlap).unwrap();
|
|
63
|
+
|
|
64
|
+
println!("WITHOUT OVERLAP:");
|
|
65
|
+
for (i, chunk) in result_no_overlap.chunks.iter().enumerate() {
|
|
66
|
+
println!(
|
|
67
|
+
" Chunk {}: [{:3} - {:3}] = \"{}\"",
|
|
68
|
+
i,
|
|
69
|
+
chunk.metadata.char_start,
|
|
70
|
+
chunk.metadata.char_end,
|
|
71
|
+
chunk.content.replace('\n', "\\n")
|
|
72
|
+
);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
println!("\nAdjacency verification:");
|
|
76
|
+
for i in 0..result_no_overlap.chunks.len() - 1 {
|
|
77
|
+
let current = &result_no_overlap.chunks[i];
|
|
78
|
+
let next = &result_no_overlap.chunks[i + 1];
|
|
79
|
+
let gap = next.metadata.char_start as i32 - current.metadata.char_end as i32;
|
|
80
|
+
println!(
|
|
81
|
+
" Chunks {} and {}: gap = {} (next starts at {}, current ends at {})",
|
|
82
|
+
i,
|
|
83
|
+
i + 1,
|
|
84
|
+
gap,
|
|
85
|
+
next.metadata.char_start,
|
|
86
|
+
current.metadata.char_end
|
|
87
|
+
);
|
|
88
|
+
assert!(gap >= 0, "Should have no overlap (gap >= 0)");
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
println!("\n✓ All offset calculations are correct!");
|
|
92
|
+
}
|
|
@@ -18,6 +18,7 @@ use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_pro
|
|
|
18
18
|
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
19
19
|
use kreuzberg::types::{ExtractionResult, Metadata};
|
|
20
20
|
use std::sync::Arc;
|
|
21
|
+
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
21
22
|
|
|
22
23
|
#[cfg(feature = "ocr")]
|
|
23
24
|
use kreuzberg::core::config::OcrConfig;
|
|
@@ -29,18 +30,6 @@ use tokio::time::timeout;
|
|
|
29
30
|
|
|
30
31
|
mod helpers;
|
|
31
32
|
|
|
32
|
-
fn trim_trailing_newlines(value: &str) -> &str {
|
|
33
|
-
value.trim_end_matches(['\n', '\r'])
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
37
|
-
assert_eq!(
|
|
38
|
-
trim_trailing_newlines(actual),
|
|
39
|
-
expected,
|
|
40
|
-
"Content mismatch after trimming trailing newlines"
|
|
41
|
-
);
|
|
42
|
-
}
|
|
43
|
-
|
|
44
33
|
/// Test many concurrent extractions of different MIME types.
|
|
45
34
|
///
|
|
46
35
|
/// Validates that:
|
|
@@ -51,16 +40,13 @@ fn assert_text_content(actual: &str, expected: &str) {
|
|
|
51
40
|
async fn test_concurrent_extractions_mixed_formats() {
|
|
52
41
|
let config = ExtractionConfig::default();
|
|
53
42
|
|
|
54
|
-
|
|
55
|
-
let mut test_cases = vec![
|
|
43
|
+
let test_cases = vec![
|
|
56
44
|
(b"Plain text content" as &[u8], "text/plain"),
|
|
57
45
|
(b"{\"key\": \"value\"}", "application/json"),
|
|
46
|
+
(b"<root><item>XML content</item></root>", "application/xml"),
|
|
58
47
|
(b"# Markdown\n\nContent here", "text/markdown"),
|
|
59
48
|
];
|
|
60
49
|
|
|
61
|
-
#[cfg(feature = "xml")]
|
|
62
|
-
test_cases.push((b"<root><item>XML content</item></root>" as &[u8], "application/xml"));
|
|
63
|
-
|
|
64
50
|
let mut handles = vec![];
|
|
65
51
|
for _ in 0..10 {
|
|
66
52
|
for (data, mime_type) in &test_cases {
|
|
@@ -109,11 +95,7 @@ async fn test_concurrent_batch_extractions() {
|
|
|
109
95
|
|
|
110
96
|
handles.push(tokio::spawn(async move {
|
|
111
97
|
let data: Vec<(&[u8], &str)> = contents_clone.iter().map(|c| (c.as_slice(), "text/plain")).collect();
|
|
112
|
-
|
|
113
|
-
.into_iter()
|
|
114
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
115
|
-
.collect();
|
|
116
|
-
batch_extract_bytes(owned_data, &config).await
|
|
98
|
+
batch_extract_bytes(data, &config).await
|
|
117
99
|
}));
|
|
118
100
|
}
|
|
119
101
|
|
|
@@ -139,8 +121,6 @@ async fn test_concurrent_extractions_with_cache() {
|
|
|
139
121
|
enabled: false,
|
|
140
122
|
enabled_processors: None,
|
|
141
123
|
disabled_processors: None,
|
|
142
|
-
enabled_set: None,
|
|
143
|
-
disabled_set: None,
|
|
144
124
|
}),
|
|
145
125
|
..Default::default()
|
|
146
126
|
};
|
|
@@ -164,7 +144,7 @@ async fn test_concurrent_extractions_with_cache() {
|
|
|
164
144
|
let result = handle.await.expect("Task should not panic");
|
|
165
145
|
assert!(result.is_ok(), "Cache read should succeed");
|
|
166
146
|
let extraction = result.unwrap();
|
|
167
|
-
|
|
147
|
+
assert_eq!(extraction.content, expected_content);
|
|
168
148
|
}
|
|
169
149
|
}
|
|
170
150
|
|
|
@@ -179,10 +159,6 @@ async fn test_concurrent_extractions_with_cache() {
|
|
|
179
159
|
async fn test_concurrent_ocr_processing() {
|
|
180
160
|
use helpers::{get_test_file_path, skip_if_missing};
|
|
181
161
|
|
|
182
|
-
if cfg!(windows) {
|
|
183
|
-
return;
|
|
184
|
-
}
|
|
185
|
-
|
|
186
162
|
if skip_if_missing("images/ocr_image.jpg") {
|
|
187
163
|
tracing::debug!("Skipping concurrent OCR test: test file not available");
|
|
188
164
|
return;
|
|
@@ -250,7 +226,6 @@ async fn test_concurrent_ocr_processing() {
|
|
|
250
226
|
#[test]
|
|
251
227
|
fn test_concurrent_ocr_cache_stress() {
|
|
252
228
|
use helpers::{get_test_file_path, skip_if_missing};
|
|
253
|
-
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
254
229
|
|
|
255
230
|
if skip_if_missing("images/ocr_image.jpg") {
|
|
256
231
|
tracing::debug!("Skipping OCR cache stress test: test file not available");
|
|
@@ -358,8 +333,6 @@ async fn test_concurrent_pipeline_processing() {
|
|
|
358
333
|
enabled: true,
|
|
359
334
|
enabled_processors: Some(vec!["concurrent-test".to_string()]),
|
|
360
335
|
disabled_processors: None,
|
|
361
|
-
enabled_set: None,
|
|
362
|
-
disabled_set: None,
|
|
363
336
|
}),
|
|
364
337
|
..Default::default()
|
|
365
338
|
};
|
|
@@ -377,7 +350,6 @@ async fn test_concurrent_pipeline_processing() {
|
|
|
377
350
|
detected_languages: None,
|
|
378
351
|
chunks: None,
|
|
379
352
|
images: None,
|
|
380
|
-
pages: None,
|
|
381
353
|
};
|
|
382
354
|
|
|
383
355
|
run_pipeline(result, &config).await
|
|
@@ -498,16 +470,13 @@ async fn test_high_concurrency_stress() {
|
|
|
498
470
|
..Default::default()
|
|
499
471
|
};
|
|
500
472
|
|
|
501
|
-
|
|
502
|
-
let mut formats = vec![
|
|
473
|
+
let formats = vec![
|
|
503
474
|
(b"Text content" as &[u8], "text/plain"),
|
|
504
475
|
(b"{\"json\": true}", "application/json"),
|
|
476
|
+
(b"<xml><item>content</item></xml>", "application/xml"),
|
|
505
477
|
(b"# Markdown\n\nContent", "text/markdown"),
|
|
506
478
|
];
|
|
507
479
|
|
|
508
|
-
#[cfg(feature = "xml")]
|
|
509
|
-
formats.push((b"<xml><item>content</item></xml>" as &[u8], "application/xml"));
|
|
510
|
-
|
|
511
480
|
let mut handles = vec![];
|
|
512
481
|
for _ in 0..100 {
|
|
513
482
|
for (data, mime_type) in &formats {
|
|
@@ -531,10 +500,9 @@ async fn test_high_concurrency_stress() {
|
|
|
531
500
|
.await
|
|
532
501
|
.expect("High-load stress test should complete within 60s");
|
|
533
502
|
|
|
534
|
-
let expected_successes = 100 * formats.len();
|
|
535
503
|
let success_count = results.iter().filter(|r| r.is_ok()).count();
|
|
536
504
|
assert_eq!(
|
|
537
|
-
success_count,
|
|
505
|
+
success_count, 400,
|
|
538
506
|
"All extractions should succeed under stress, got {} successes",
|
|
539
507
|
success_count
|
|
540
508
|
);
|
|
@@ -3,19 +3,13 @@
|
|
|
3
3
|
//! Tests for chunking, language detection, caching, token reduction, and quality processing.
|
|
4
4
|
//! Validates that configuration options work correctly end-to-end.
|
|
5
5
|
|
|
6
|
-
|
|
7
|
-
use kreuzberg::core::config::ChunkingConfig;
|
|
8
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
9
|
-
#[cfg(feature = "language-detection")]
|
|
10
|
-
use kreuzberg::core::config::LanguageDetectionConfig;
|
|
11
|
-
use kreuzberg::core::config::TokenReductionConfig;
|
|
6
|
+
use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
|
|
12
7
|
use kreuzberg::core::extractor::extract_bytes;
|
|
13
8
|
|
|
14
9
|
mod helpers;
|
|
15
10
|
|
|
16
11
|
/// Test chunking enabled - text split into chunks.
|
|
17
12
|
#[tokio::test]
|
|
18
|
-
#[cfg(feature = "chunking")]
|
|
19
13
|
async fn test_chunking_enabled() {
|
|
20
14
|
let config = ExtractionConfig {
|
|
21
15
|
chunking: Some(ChunkingConfig {
|
|
@@ -58,7 +52,6 @@ async fn test_chunking_enabled() {
|
|
|
58
52
|
|
|
59
53
|
/// Test chunking with overlap - overlap preserved between chunks.
|
|
60
54
|
#[tokio::test]
|
|
61
|
-
#[cfg(feature = "chunking")]
|
|
62
55
|
async fn test_chunking_with_overlap() {
|
|
63
56
|
let config = ExtractionConfig {
|
|
64
57
|
chunking: Some(ChunkingConfig {
|
|
@@ -98,7 +91,6 @@ async fn test_chunking_with_overlap() {
|
|
|
98
91
|
|
|
99
92
|
/// Test chunking with custom sizes - custom chunk size and overlap.
|
|
100
93
|
#[tokio::test]
|
|
101
|
-
#[cfg(feature = "chunking")]
|
|
102
94
|
async fn test_chunking_custom_sizes() {
|
|
103
95
|
let config = ExtractionConfig {
|
|
104
96
|
chunking: Some(ChunkingConfig {
|
|
@@ -159,7 +151,6 @@ async fn test_chunking_disabled() {
|
|
|
159
151
|
|
|
160
152
|
/// Test language detection for single language document.
|
|
161
153
|
#[tokio::test]
|
|
162
|
-
#[cfg(feature = "language-detection")]
|
|
163
154
|
async fn test_language_detection_single() {
|
|
164
155
|
let config = ExtractionConfig {
|
|
165
156
|
language_detection: Some(LanguageDetectionConfig {
|
|
@@ -186,7 +177,6 @@ async fn test_language_detection_single() {
|
|
|
186
177
|
/// Test language detection for multi-language document.
|
|
187
178
|
#[cfg_attr(coverage, ignore = "coverage instrumentation affects multi-language heuristics")]
|
|
188
179
|
#[tokio::test]
|
|
189
|
-
#[cfg(feature = "language-detection")]
|
|
190
180
|
async fn test_language_detection_multiple() {
|
|
191
181
|
let config = ExtractionConfig {
|
|
192
182
|
language_detection: Some(LanguageDetectionConfig {
|
|
@@ -211,7 +201,6 @@ async fn test_language_detection_multiple() {
|
|
|
211
201
|
|
|
212
202
|
/// Test language detection with confidence threshold.
|
|
213
203
|
#[tokio::test]
|
|
214
|
-
#[cfg(feature = "language-detection")]
|
|
215
204
|
async fn test_language_detection_confidence() {
|
|
216
205
|
let config = ExtractionConfig {
|
|
217
206
|
language_detection: Some(LanguageDetectionConfig {
|
|
@@ -236,7 +225,6 @@ async fn test_language_detection_confidence() {
|
|
|
236
225
|
|
|
237
226
|
/// Test language detection disabled.
|
|
238
227
|
#[tokio::test]
|
|
239
|
-
#[cfg(feature = "language-detection")]
|
|
240
228
|
async fn test_language_detection_disabled() {
|
|
241
229
|
let config = ExtractionConfig {
|
|
242
230
|
language_detection: Some(LanguageDetectionConfig {
|
|
@@ -409,7 +397,6 @@ async fn test_token_reduction_disabled() {
|
|
|
409
397
|
|
|
410
398
|
/// Test quality processing enabled - quality scoring applied.
|
|
411
399
|
#[tokio::test]
|
|
412
|
-
#[cfg(feature = "quality")]
|
|
413
400
|
async fn test_quality_processing_enabled() {
|
|
414
401
|
let config = ExtractionConfig {
|
|
415
402
|
enable_quality_processing: true,
|
|
@@ -433,7 +420,6 @@ async fn test_quality_processing_enabled() {
|
|
|
433
420
|
|
|
434
421
|
/// Test quality processing calculates score for different text quality.
|
|
435
422
|
#[tokio::test]
|
|
436
|
-
#[cfg(feature = "quality")]
|
|
437
423
|
async fn test_quality_threshold_filtering() {
|
|
438
424
|
let config = ExtractionConfig {
|
|
439
425
|
enable_quality_processing: true,
|
|
@@ -498,15 +484,8 @@ async fn test_quality_processing_disabled() {
|
|
|
498
484
|
}
|
|
499
485
|
|
|
500
486
|
/// Test chunking with embeddings using balanced preset.
|
|
501
|
-
///
|
|
502
|
-
/// This test requires ONNX Runtime to be installed as a system dependency.
|
|
503
|
-
/// On macOS with Homebrew: `brew install onnxruntime`
|
|
504
|
-
/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
|
|
505
|
-
/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
|
|
506
487
|
#[tokio::test]
|
|
507
488
|
#[cfg(feature = "embeddings")]
|
|
508
|
-
#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
|
|
509
|
-
#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
|
|
510
489
|
async fn test_chunking_with_embeddings() {
|
|
511
490
|
use kreuzberg::core::config::EmbeddingConfig;
|
|
512
491
|
|
|
@@ -564,15 +543,8 @@ async fn test_chunking_with_embeddings() {
|
|
|
564
543
|
}
|
|
565
544
|
|
|
566
545
|
/// Test chunking with fast embedding preset.
|
|
567
|
-
///
|
|
568
|
-
/// This test requires ONNX Runtime to be installed as a system dependency.
|
|
569
|
-
/// On macOS with Homebrew: `brew install onnxruntime`
|
|
570
|
-
/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
|
|
571
|
-
/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
|
|
572
546
|
#[tokio::test]
|
|
573
547
|
#[cfg(feature = "embeddings")]
|
|
574
|
-
#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
|
|
575
|
-
#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
|
|
576
548
|
async fn test_chunking_with_fast_embeddings() {
|
|
577
549
|
use kreuzberg::core::config::{EmbeddingConfig, EmbeddingModelType};
|
|
578
550
|
|
|
@@ -601,10 +573,6 @@ async fn test_chunking_with_fast_embeddings() {
|
|
|
601
573
|
let chunks = result.chunks.expect("Should have chunks");
|
|
602
574
|
assert!(!chunks.is_empty(), "Should have at least one chunk");
|
|
603
575
|
|
|
604
|
-
if let Some(error) = result.metadata.additional.get("embedding_error") {
|
|
605
|
-
panic!("Embedding generation failed: {}", error);
|
|
606
|
-
}
|
|
607
|
-
|
|
608
576
|
for chunk in &chunks {
|
|
609
577
|
let embedding = chunk.embedding.as_ref().expect("Should have embedding");
|
|
610
578
|
assert_eq!(embedding.len(), 384, "Fast preset should produce 384-dim embeddings");
|