kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,289 +0,0 @@
|
|
|
1
|
-
#![cfg(feature = "api")]
|
|
2
|
-
//! Diagnostic tests for large PDF file extraction issues.
|
|
3
|
-
//!
|
|
4
|
-
//! These tests are designed to isolate and identify the root cause of
|
|
5
|
-
//! issues with large PDF file handling in the Kreuzberg API server.
|
|
6
|
-
//!
|
|
7
|
-
//! Current Status:
|
|
8
|
-
//! - 5MB PDF tests are returning HTTP 400 instead of HTTP 200
|
|
9
|
-
//! - This suggests either:
|
|
10
|
-
//! a) The mock PDF structure is invalid
|
|
11
|
-
//! b) The PDF extraction logic has issues with the generated content
|
|
12
|
-
//! c) The multipart parsing is failing on large payloads
|
|
13
|
-
//!
|
|
14
|
-
//! These diagnostic tests help narrow down which component is failing.
|
|
15
|
-
|
|
16
|
-
use axum::{
|
|
17
|
-
body::{Body, to_bytes},
|
|
18
|
-
http::{Request, StatusCode},
|
|
19
|
-
};
|
|
20
|
-
use kreuzberg::{
|
|
21
|
-
ExtractionConfig,
|
|
22
|
-
api::{ApiSizeLimits, create_router_with_limits},
|
|
23
|
-
};
|
|
24
|
-
use serde_json::Value;
|
|
25
|
-
use tower::ServiceExt;
|
|
26
|
-
|
|
27
|
-
/// Test extracting a minimal valid PDF (control test).
|
|
28
|
-
///
|
|
29
|
-
/// This serves as a baseline to verify the API can handle valid PDFs
|
|
30
|
-
/// before testing with large files.
|
|
31
|
-
#[tokio::test]
|
|
32
|
-
async fn test_extract_minimal_valid_pdf() {
|
|
33
|
-
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));
|
|
34
|
-
|
|
35
|
-
let pdf_content = b"%PDF-1.4
|
|
36
|
-
1 0 obj
|
|
37
|
-
<< /Type /Catalog /Pages 2 0 R >>
|
|
38
|
-
endobj
|
|
39
|
-
2 0 obj
|
|
40
|
-
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
|
41
|
-
endobj
|
|
42
|
-
3 0 obj
|
|
43
|
-
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
|
|
44
|
-
endobj
|
|
45
|
-
4 0 obj
|
|
46
|
-
<< >>
|
|
47
|
-
stream
|
|
48
|
-
BT /F1 12 Tf 50 750 Td (Hello) Tj ET
|
|
49
|
-
endstream
|
|
50
|
-
endobj
|
|
51
|
-
xref
|
|
52
|
-
0 5
|
|
53
|
-
0000000000 65535 f
|
|
54
|
-
0000000009 00000 n
|
|
55
|
-
0000000074 00000 n
|
|
56
|
-
0000000133 00000 n
|
|
57
|
-
0000000214 00000 n
|
|
58
|
-
trailer
|
|
59
|
-
<< /Size 5 /Root 1 0 R >>
|
|
60
|
-
startxref
|
|
61
|
-
340
|
|
62
|
-
%%EOF";
|
|
63
|
-
|
|
64
|
-
let boundary = "----minimal-pdf";
|
|
65
|
-
let mut body = Vec::new();
|
|
66
|
-
|
|
67
|
-
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
68
|
-
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"minimal.pdf\"\r\n");
|
|
69
|
-
body.extend_from_slice(b"Content-Type: application/pdf\r\n\r\n");
|
|
70
|
-
body.extend_from_slice(pdf_content);
|
|
71
|
-
body.extend_from_slice(b"\r\n");
|
|
72
|
-
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
|
73
|
-
|
|
74
|
-
let request = Request::builder()
|
|
75
|
-
.method("POST")
|
|
76
|
-
.uri("/extract")
|
|
77
|
-
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
78
|
-
.header("content-length", body.len())
|
|
79
|
-
.body(Body::from(body))
|
|
80
|
-
.expect("Failed to build request");
|
|
81
|
-
|
|
82
|
-
let response = router.oneshot(request).await.expect("Request failed");
|
|
83
|
-
|
|
84
|
-
assert_eq!(
|
|
85
|
-
response.status(),
|
|
86
|
-
StatusCode::OK,
|
|
87
|
-
"Minimal PDF should extract successfully. Status: {} indicates baseline is working",
|
|
88
|
-
response.status()
|
|
89
|
-
);
|
|
90
|
-
|
|
91
|
-
let body = to_bytes(response.into_body(), 1_000_000)
|
|
92
|
-
.await
|
|
93
|
-
.expect("Failed to read response body");
|
|
94
|
-
|
|
95
|
-
let parsed: Value = serde_json::from_slice(&body).expect("Failed to parse response");
|
|
96
|
-
eprintln!("Extraction result: {}", serde_json::to_string_pretty(&parsed).unwrap());
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
/// Test extracting a 1MB text file (control test without PDF).
|
|
100
|
-
///
|
|
101
|
-
/// This isolates whether the issue is specific to PDF handling or
|
|
102
|
-
/// a general problem with large multipart uploads.
|
|
103
|
-
#[tokio::test]
|
|
104
|
-
async fn test_extract_1mb_text_file() {
|
|
105
|
-
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));
|
|
106
|
-
|
|
107
|
-
let boundary = "----large-text";
|
|
108
|
-
let large_text = "This is test content. ".repeat(50000);
|
|
109
|
-
|
|
110
|
-
let mut body = Vec::new();
|
|
111
|
-
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
112
|
-
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"large.txt\"\r\n");
|
|
113
|
-
body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
|
|
114
|
-
body.extend_from_slice(large_text.as_bytes());
|
|
115
|
-
body.extend_from_slice(b"\r\n");
|
|
116
|
-
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
|
117
|
-
|
|
118
|
-
let request = Request::builder()
|
|
119
|
-
.method("POST")
|
|
120
|
-
.uri("/extract")
|
|
121
|
-
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
122
|
-
.header("content-length", body.len())
|
|
123
|
-
.body(Body::from(body))
|
|
124
|
-
.expect("Failed to build request");
|
|
125
|
-
|
|
126
|
-
let response = router.oneshot(request).await.expect("Request failed");
|
|
127
|
-
|
|
128
|
-
println!("1MB text file extraction status: {}", response.status());
|
|
129
|
-
|
|
130
|
-
assert_eq!(
|
|
131
|
-
response.status(),
|
|
132
|
-
StatusCode::OK,
|
|
133
|
-
"1MB text file should extract successfully. If this fails, multipart parsing may have issues."
|
|
134
|
-
);
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
/// Test extracting progressively larger text files to find breaking point.
|
|
138
|
-
///
|
|
139
|
-
/// This helps identify at what size the API starts failing.
|
|
140
|
-
#[tokio::test]
|
|
141
|
-
async fn test_find_size_breaking_point() {
|
|
142
|
-
let sizes = vec![
|
|
143
|
-
("100KB", 100 * 1024),
|
|
144
|
-
("500KB", 500 * 1024),
|
|
145
|
-
("1MB", 1024 * 1024),
|
|
146
|
-
("2MB", 2 * 1024 * 1024),
|
|
147
|
-
("5MB", 5 * 1024 * 1024),
|
|
148
|
-
];
|
|
149
|
-
|
|
150
|
-
for (label, size) in sizes {
|
|
151
|
-
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(20, 20));
|
|
152
|
-
|
|
153
|
-
let boundary = "----size-test";
|
|
154
|
-
let content = "A".repeat(size);
|
|
155
|
-
|
|
156
|
-
let mut body = Vec::new();
|
|
157
|
-
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
158
|
-
body.extend_from_slice(
|
|
159
|
-
format!(
|
|
160
|
-
"Content-Disposition: form-data; name=\"files\"; filename=\"test_{}.txt\"\r\n",
|
|
161
|
-
label
|
|
162
|
-
)
|
|
163
|
-
.as_bytes(),
|
|
164
|
-
);
|
|
165
|
-
body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
|
|
166
|
-
body.extend_from_slice(content.as_bytes());
|
|
167
|
-
body.extend_from_slice(b"\r\n");
|
|
168
|
-
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
|
169
|
-
|
|
170
|
-
let request = Request::builder()
|
|
171
|
-
.method("POST")
|
|
172
|
-
.uri("/extract")
|
|
173
|
-
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
174
|
-
.header("content-length", body.len())
|
|
175
|
-
.body(Body::from(body))
|
|
176
|
-
.expect("Failed to build request");
|
|
177
|
-
|
|
178
|
-
let response = router.oneshot(request).await.expect("Request failed");
|
|
179
|
-
|
|
180
|
-
println!("Size {} ({}B): HTTP {}", label, size, response.status().as_u16());
|
|
181
|
-
|
|
182
|
-
if response.status() != StatusCode::OK {
|
|
183
|
-
eprintln!("Extraction failed at size: {}", label);
|
|
184
|
-
|
|
185
|
-
let body = to_bytes(response.into_body(), 1_000_000)
|
|
186
|
-
.await
|
|
187
|
-
.expect("Failed to read response body");
|
|
188
|
-
|
|
189
|
-
if let Ok(parsed) = serde_json::from_slice::<Value>(&body) {
|
|
190
|
-
eprintln!("Error response: {}", serde_json::to_string_pretty(&parsed).unwrap());
|
|
191
|
-
} else {
|
|
192
|
-
eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body));
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
return;
|
|
196
|
-
}
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
/// Test that the default 100MB limit is being applied.
|
|
201
|
-
///
|
|
202
|
-
/// Verifies that the server is actually respecting the configured limits,
|
|
203
|
-
/// and documents what the default limit actually is.
|
|
204
|
-
#[tokio::test]
|
|
205
|
-
async fn test_default_size_limits() {
|
|
206
|
-
let default_limits = ApiSizeLimits::default();
|
|
207
|
-
assert_eq!(default_limits.max_request_body_bytes, 100 * 1024 * 1024);
|
|
208
|
-
assert_eq!(default_limits.max_multipart_field_bytes, 100 * 1024 * 1024);
|
|
209
|
-
|
|
210
|
-
println!(
|
|
211
|
-
"Default limits: {} bytes request, {} bytes per field",
|
|
212
|
-
default_limits.max_request_body_bytes, default_limits.max_multipart_field_bytes
|
|
213
|
-
);
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
/// Test that the router layer actually applies RequestBodyLimitLayer.
|
|
217
|
-
///
|
|
218
|
-
/// Creates a router and verifies that size limit enforcement is active.
|
|
219
|
-
#[tokio::test]
|
|
220
|
-
async fn test_request_body_limit_layer_applied() {
|
|
221
|
-
let small_limits = ApiSizeLimits::from_mb(1, 1);
|
|
222
|
-
let router = create_router_with_limits(ExtractionConfig::default(), small_limits);
|
|
223
|
-
|
|
224
|
-
let boundary = "----exceed-limits";
|
|
225
|
-
let large_content = "X".repeat(2 * 1024 * 1024);
|
|
226
|
-
|
|
227
|
-
let mut body = Vec::new();
|
|
228
|
-
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
229
|
-
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.txt\"\r\n");
|
|
230
|
-
body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
|
|
231
|
-
body.extend_from_slice(large_content.as_bytes());
|
|
232
|
-
body.extend_from_slice(b"\r\n");
|
|
233
|
-
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
|
234
|
-
|
|
235
|
-
let request = Request::builder()
|
|
236
|
-
.method("POST")
|
|
237
|
-
.uri("/extract")
|
|
238
|
-
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
239
|
-
.header("content-length", body.len())
|
|
240
|
-
.body(Body::from(body))
|
|
241
|
-
.expect("Failed to build request");
|
|
242
|
-
|
|
243
|
-
let response = router.oneshot(request).await.expect("Request failed");
|
|
244
|
-
|
|
245
|
-
assert_eq!(
|
|
246
|
-
response.status(),
|
|
247
|
-
StatusCode::PAYLOAD_TOO_LARGE,
|
|
248
|
-
"2MB file should be rejected when limit is 1MB"
|
|
249
|
-
);
|
|
250
|
-
}
|
|
251
|
-
|
|
252
|
-
/// Test multipart parsing with incremental content.
|
|
253
|
-
///
|
|
254
|
-
/// Some implementations have issues with streaming multipart parsing.
|
|
255
|
-
/// This test uses proper CRLF line endings to ensure correct parsing.
|
|
256
|
-
#[tokio::test]
|
|
257
|
-
async fn test_multipart_proper_crlf_formatting() {
|
|
258
|
-
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));
|
|
259
|
-
|
|
260
|
-
let content = "Test PDF content that is at least somewhat large for testing purposes.";
|
|
261
|
-
|
|
262
|
-
let mut body = Vec::new();
|
|
263
|
-
|
|
264
|
-
body.extend_from_slice(b"--BOUNDARY123456\r\n");
|
|
265
|
-
|
|
266
|
-
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.pdf\"\r\n");
|
|
267
|
-
body.extend_from_slice(b"Content-Type: application/pdf\r\n");
|
|
268
|
-
|
|
269
|
-
body.extend_from_slice(b"\r\n");
|
|
270
|
-
|
|
271
|
-
body.extend_from_slice(content.as_bytes());
|
|
272
|
-
|
|
273
|
-
body.extend_from_slice(b"\r\n");
|
|
274
|
-
|
|
275
|
-
body.extend_from_slice(b"--BOUNDARY123456--\r\n");
|
|
276
|
-
|
|
277
|
-
let request = Request::builder()
|
|
278
|
-
.method("POST")
|
|
279
|
-
.uri("/extract")
|
|
280
|
-
.header("content-type", "multipart/form-data; boundary=BOUNDARY123456")
|
|
281
|
-
.header("content-length", body.len())
|
|
282
|
-
.body(Body::from(body))
|
|
283
|
-
.expect("Failed to build request");
|
|
284
|
-
|
|
285
|
-
let response = router.oneshot(request).await.expect("Request failed");
|
|
286
|
-
|
|
287
|
-
println!("Multipart with proper CRLF: HTTP {}", response.status().as_u16());
|
|
288
|
-
assert!(response.status().is_success() || response.status().is_client_error());
|
|
289
|
-
}
|
|
@@ -1,154 +0,0 @@
|
|
|
1
|
-
//! Benchmark tests for object pooling in batch extraction.
|
|
2
|
-
//!
|
|
3
|
-
//! This test suite demonstrates the performance benefits of object pooling
|
|
4
|
-
//! during batch document extraction operations.
|
|
5
|
-
|
|
6
|
-
#[cfg(feature = "tokio-runtime")]
|
|
7
|
-
mod tests {
|
|
8
|
-
use kreuzberg::core::{BatchProcessor, ExtractionConfig};
|
|
9
|
-
use kreuzberg::utils::pool::create_string_buffer_pool;
|
|
10
|
-
|
|
11
|
-
#[test]
|
|
12
|
-
fn test_batch_processor_initialization() {
|
|
13
|
-
let processor = BatchProcessor::new();
|
|
14
|
-
assert_eq!(processor.string_pool_size(), 0);
|
|
15
|
-
assert_eq!(processor.byte_pool_size(), 0);
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
#[test]
|
|
19
|
-
fn test_string_pool_reuse_reduces_allocations() {
|
|
20
|
-
let pool = create_string_buffer_pool(5, 8192);
|
|
21
|
-
|
|
22
|
-
let mut buffers = vec![];
|
|
23
|
-
for _ in 0..3 {
|
|
24
|
-
let buf = pool.acquire().unwrap();
|
|
25
|
-
buffers.push(buf);
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
drop(buffers);
|
|
29
|
-
|
|
30
|
-
assert_eq!(pool.size(), 3, "pool should have 3 buffers after first batch");
|
|
31
|
-
|
|
32
|
-
let mut buffers = vec![];
|
|
33
|
-
for _ in 0..3 {
|
|
34
|
-
let buf = pool.acquire().unwrap();
|
|
35
|
-
buffers.push(buf);
|
|
36
|
-
}
|
|
37
|
-
drop(buffers);
|
|
38
|
-
|
|
39
|
-
assert!(pool.size() <= 5, "pool should not exceed max size");
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
#[test]
|
|
43
|
-
fn test_batch_processor_multiple_operations() {
|
|
44
|
-
let processor = BatchProcessor::new();
|
|
45
|
-
|
|
46
|
-
for _batch in 0..3 {
|
|
47
|
-
let mut results = vec![];
|
|
48
|
-
|
|
49
|
-
for _i in 0..5 {
|
|
50
|
-
let string_buf = processor.string_pool().acquire().unwrap();
|
|
51
|
-
let byte_buf = processor.byte_pool().acquire().unwrap();
|
|
52
|
-
|
|
53
|
-
results.push((string_buf, byte_buf));
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
drop(results);
|
|
57
|
-
|
|
58
|
-
assert!(processor.string_pool_size() <= 10);
|
|
59
|
-
assert!(processor.byte_pool_size() <= 10);
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
#[test]
|
|
64
|
-
fn test_pool_memory_efficiency() {
|
|
65
|
-
let pool = create_string_buffer_pool(5, 4096);
|
|
66
|
-
|
|
67
|
-
let capacity_initial = {
|
|
68
|
-
let buf = pool.acquire().unwrap();
|
|
69
|
-
buf.capacity()
|
|
70
|
-
};
|
|
71
|
-
|
|
72
|
-
for _ in 0..10 {
|
|
73
|
-
let mut buf = pool.acquire().unwrap();
|
|
74
|
-
buf.push_str("test data");
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
let capacity_final = {
|
|
78
|
-
let buf = pool.acquire().unwrap();
|
|
79
|
-
buf.capacity()
|
|
80
|
-
};
|
|
81
|
-
|
|
82
|
-
assert_eq!(
|
|
83
|
-
capacity_initial, capacity_final,
|
|
84
|
-
"buffer capacity should be maintained across reuses"
|
|
85
|
-
);
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
#[tokio::test]
|
|
89
|
-
async fn test_batch_processor_with_extraction_config() {
|
|
90
|
-
let processor = BatchProcessor::new();
|
|
91
|
-
let _config = ExtractionConfig::default();
|
|
92
|
-
|
|
93
|
-
assert!(processor.config().string_pool_size > 0);
|
|
94
|
-
assert!(processor.config().string_buffer_capacity > 0);
|
|
95
|
-
assert!(processor.config().byte_pool_size > 0);
|
|
96
|
-
assert!(processor.config().byte_buffer_capacity > 0);
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
#[test]
|
|
100
|
-
fn test_pool_clear_resets_size() {
|
|
101
|
-
let processor = BatchProcessor::new();
|
|
102
|
-
|
|
103
|
-
{
|
|
104
|
-
let _s1 = processor.string_pool().acquire().unwrap();
|
|
105
|
-
let _s2 = processor.string_pool().acquire().unwrap();
|
|
106
|
-
let _b1 = processor.byte_pool().acquire().unwrap();
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
assert!(processor.string_pool_size() > 0);
|
|
110
|
-
assert!(processor.byte_pool_size() > 0);
|
|
111
|
-
|
|
112
|
-
processor.clear_pools().unwrap();
|
|
113
|
-
|
|
114
|
-
assert_eq!(processor.string_pool_size(), 0);
|
|
115
|
-
assert_eq!(processor.byte_pool_size(), 0);
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
#[test]
|
|
119
|
-
fn test_concurrent_pool_access() {
|
|
120
|
-
use std::sync::Arc;
|
|
121
|
-
use std::thread;
|
|
122
|
-
|
|
123
|
-
let processor = Arc::new(BatchProcessor::new());
|
|
124
|
-
let mut handles = vec![];
|
|
125
|
-
|
|
126
|
-
for _thread_id in 0..4 {
|
|
127
|
-
let processor_clone = Arc::clone(&processor);
|
|
128
|
-
|
|
129
|
-
let handle = thread::spawn(move || {
|
|
130
|
-
for _ in 0..5 {
|
|
131
|
-
let _buf1 = processor_clone.string_pool().acquire();
|
|
132
|
-
let _buf2 = processor_clone.byte_pool().acquire();
|
|
133
|
-
}
|
|
134
|
-
});
|
|
135
|
-
|
|
136
|
-
handles.push(handle);
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
for handle in handles {
|
|
140
|
-
handle.join().unwrap();
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
assert!(processor.string_pool_size() <= 10);
|
|
144
|
-
assert!(processor.byte_pool_size() <= 10);
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
#[test]
|
|
148
|
-
fn test_pool_respects_capacity_hints() {
|
|
149
|
-
let pool = create_string_buffer_pool(3, 2048);
|
|
150
|
-
|
|
151
|
-
let buf = pool.acquire().unwrap();
|
|
152
|
-
assert!(buf.capacity() >= 2048, "buffer should respect capacity hint");
|
|
153
|
-
}
|
|
154
|
-
}
|