kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -11,30 +11,6 @@
|
|
|
11
11
|
//! - Batch processing for efficient embedding generation
|
|
12
12
|
//! - Optional GPU acceleration via ONNX Runtime execution providers
|
|
13
13
|
//!
|
|
14
|
-
//! # ONNX Runtime Requirement
|
|
15
|
-
//!
|
|
16
|
-
//! **CRITICAL**: This module requires ONNX Runtime to be installed on the system.
|
|
17
|
-
//! The `embeddings` feature uses dynamic loading (`ort-load-dynamic`), which detects
|
|
18
|
-
//! the ONNX Runtime library at runtime.
|
|
19
|
-
//!
|
|
20
|
-
//! ## Installation Instructions
|
|
21
|
-
//!
|
|
22
|
-
//! - **macOS**: `brew install onnxruntime`
|
|
23
|
-
//! - **Linux (Ubuntu/Debian)**: `apt install libonnxruntime libonnxruntime-dev`
|
|
24
|
-
//! - **Linux (Fedora)**: `dnf install onnxruntime onnxruntime-devel`
|
|
25
|
-
//! - **Linux (Arch)**: `pacman -S onnxruntime`
|
|
26
|
-
//! - **Windows (MSVC)**: Download from https://github.com/microsoft/onnxruntime/releases and add to PATH
|
|
27
|
-
//!
|
|
28
|
-
//! Alternatively, set the `ORT_DYLIB_PATH` environment variable to the ONNX Runtime library path.
|
|
29
|
-
//!
|
|
30
|
-
//! For Docker/containers, install via package manager in your base image.
|
|
31
|
-
//! Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux.
|
|
32
|
-
//!
|
|
33
|
-
//! ## Platform Limitations
|
|
34
|
-
//!
|
|
35
|
-
//! **Windows MinGW builds are not supported**. ONNX Runtime requires the MSVC toolchain on Windows.
|
|
36
|
-
//! Please use Windows MSVC builds or disable the embeddings feature.
|
|
37
|
-
//!
|
|
38
14
|
//! # Example
|
|
39
15
|
//!
|
|
40
16
|
//! ```rust,ignore
|
|
@@ -67,107 +43,11 @@ use std::sync::{Arc, Mutex, RwLock};
|
|
|
67
43
|
use std::collections::HashMap;
|
|
68
44
|
|
|
69
45
|
#[cfg(feature = "embeddings")]
|
|
70
|
-
use
|
|
46
|
+
use lazy_static::lazy_static;
|
|
71
47
|
|
|
72
48
|
#[cfg(feature = "embeddings")]
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
/// Wrapper for TextEmbedding that prevents cleanup during process shutdown.
|
|
76
|
-
///
|
|
77
|
-
/// # Problem
|
|
78
|
-
///
|
|
79
|
-
/// ONNX Runtime's C++ destructors fail during process shutdown when trying to
|
|
80
|
-
/// acquire mutexes that have already been torn down by the C++ runtime. This
|
|
81
|
-
/// causes crashes with "mutex lock failed: Invalid argument" errors.
|
|
82
|
-
///
|
|
83
|
-
/// This is a known issue in `ort` v2.0.0-rc.10 (pykeio/ort#441) that was fixed
|
|
84
|
-
/// in later versions, but we're constrained by fastembed's dependency tree.
|
|
85
|
-
///
|
|
86
|
-
/// # Solution
|
|
87
|
-
///
|
|
88
|
-
/// We prevent all cleanup of ONNX Runtime resources:
|
|
89
|
-
/// 1. Individual TextEmbedding objects are leaked via Box::leak
|
|
90
|
-
/// 2. The entire MODEL_CACHE is wrapped in ManuallyDrop
|
|
91
|
-
///
|
|
92
|
-
/// This prevents Drop implementations from running during shutdown, completely
|
|
93
|
-
/// avoiding the mutex errors. The OS reclaims all memory on process exit anyway.
|
|
94
|
-
///
|
|
95
|
-
/// Thread-safe wrapper for leaked TextEmbedding that allows interior mutability.
|
|
96
|
-
///
|
|
97
|
-
/// This wrapper holds a raw pointer to a leaked `TextEmbedding` and provides
|
|
98
|
-
/// safe access through the Mutex lock in MODEL_CACHE.
|
|
99
|
-
#[cfg(feature = "embeddings")]
|
|
100
|
-
pub(crate) struct LeakedModel {
|
|
101
|
-
ptr: *mut TextEmbedding,
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
#[cfg(feature = "embeddings")]
|
|
105
|
-
impl LeakedModel {
|
|
106
|
-
fn new(model: TextEmbedding) -> Self {
|
|
107
|
-
Self {
|
|
108
|
-
ptr: Box::into_raw(Box::new(model)),
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
/// Get a mutable reference to the model.
|
|
113
|
-
///
|
|
114
|
-
/// # Safety
|
|
115
|
-
///
|
|
116
|
-
/// This is safe to call only when:
|
|
117
|
-
/// 1. The caller has exclusive access (guaranteed by Mutex in MODEL_CACHE)
|
|
118
|
-
/// 2. The pointer is valid (guaranteed by Box::into_raw and never deallocating)
|
|
119
|
-
#[allow(unsafe_code, clippy::mut_from_ref)]
|
|
120
|
-
unsafe fn get_mut(&self) -> &mut TextEmbedding {
|
|
121
|
-
unsafe { &mut *self.ptr }
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
#[cfg(feature = "embeddings")]
|
|
126
|
-
#[allow(unsafe_code)]
|
|
127
|
-
unsafe impl Send for LeakedModel {}
|
|
128
|
-
#[cfg(feature = "embeddings")]
|
|
129
|
-
#[allow(unsafe_code)]
|
|
130
|
-
unsafe impl Sync for LeakedModel {}
|
|
131
|
-
|
|
132
|
-
#[cfg(feature = "embeddings")]
|
|
133
|
-
type CachedEmbedding = Arc<Mutex<LeakedModel>>;
|
|
134
|
-
|
|
135
|
-
/// Global model cache wrapped in ManuallyDrop to prevent cleanup during process exit.
|
|
136
|
-
///
|
|
137
|
-
/// We use Lazy + ManuallyDrop because ONNX Runtime's C++ destructors fail during static
|
|
138
|
-
/// destruction when mutexes are already torn down. By never dropping this cache,
|
|
139
|
-
/// we avoid the mutex errors at shutdown. The OS reclaims memory on process exit anyway.
|
|
140
|
-
#[cfg(feature = "embeddings")]
|
|
141
|
-
static MODEL_CACHE: Lazy<ManuallyDrop<RwLock<HashMap<String, CachedEmbedding>>>> =
|
|
142
|
-
Lazy::new(|| ManuallyDrop::new(RwLock::new(HashMap::new())));
|
|
143
|
-
|
|
144
|
-
/// Returns installation instructions for ONNX Runtime.
|
|
145
|
-
#[cfg(feature = "embeddings")]
|
|
146
|
-
fn onnx_runtime_install_message() -> String {
|
|
147
|
-
#[cfg(all(windows, target_env = "gnu"))]
|
|
148
|
-
{
|
|
149
|
-
return "ONNX Runtime embeddings are not supported on Windows MinGW builds. \
|
|
150
|
-
ONNX Runtime requires MSVC toolchain. \
|
|
151
|
-
Please use Windows MSVC builds or disable embeddings feature."
|
|
152
|
-
.to_string();
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
#[cfg(not(all(windows, target_env = "gnu")))]
|
|
156
|
-
{
|
|
157
|
-
"ONNX Runtime is required for embeddings functionality. \
|
|
158
|
-
Install: \
|
|
159
|
-
macOS: 'brew install onnxruntime', \
|
|
160
|
-
Linux (Ubuntu/Debian): 'apt install libonnxruntime libonnxruntime-dev', \
|
|
161
|
-
Linux (Fedora): 'dnf install onnxruntime onnxruntime-devel', \
|
|
162
|
-
Linux (Arch): 'pacman -S onnxruntime', \
|
|
163
|
-
Windows (MSVC): Download from https://github.com/microsoft/onnxruntime/releases and add to PATH. \
|
|
164
|
-
\
|
|
165
|
-
Alternatively, set ORT_DYLIB_PATH environment variable to the ONNX Runtime library path. \
|
|
166
|
-
\
|
|
167
|
-
For Docker/containers: Install via package manager in your base image. \
|
|
168
|
-
Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux."
|
|
169
|
-
.to_string()
|
|
170
|
-
}
|
|
49
|
+
lazy_static! {
|
|
50
|
+
static ref MODEL_CACHE: RwLock<HashMap<String, Arc<Mutex<TextEmbedding>>>> = RwLock::new(HashMap::new());
|
|
171
51
|
}
|
|
172
52
|
|
|
173
53
|
/// Get or initialize a text embedding model from cache.
|
|
@@ -175,11 +55,10 @@ fn onnx_runtime_install_message() -> String {
|
|
|
175
55
|
/// This function ensures models are initialized only once and reused across
|
|
176
56
|
/// the application, avoiding redundant downloads and initialization overhead.
|
|
177
57
|
#[cfg(feature = "embeddings")]
|
|
178
|
-
#[allow(private_interfaces)]
|
|
179
58
|
pub fn get_or_init_model(
|
|
180
59
|
model: EmbeddingModel,
|
|
181
60
|
cache_dir: Option<std::path::PathBuf>,
|
|
182
|
-
) -> crate::Result<
|
|
61
|
+
) -> crate::Result<Arc<Mutex<TextEmbedding>>> {
|
|
183
62
|
let cache_directory = cache_dir.unwrap_or_else(|| {
|
|
184
63
|
let mut path = std::env::current_dir().unwrap_or_else(|_| std::path::PathBuf::from("."));
|
|
185
64
|
path.push(".kreuzberg");
|
|
@@ -190,26 +69,21 @@ pub fn get_or_init_model(
|
|
|
190
69
|
let model_key = format!("{:?}_{}", model, cache_directory.display());
|
|
191
70
|
|
|
192
71
|
{
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
let cache = poison_error.get_ref();
|
|
201
|
-
if let Some(cached_model) = cache.get(&model_key) {
|
|
202
|
-
return Ok(Arc::clone(cached_model));
|
|
203
|
-
}
|
|
204
|
-
}
|
|
72
|
+
let cache = MODEL_CACHE.read().map_err(|e| crate::KreuzbergError::Plugin {
|
|
73
|
+
message: format!("Failed to acquire model cache read lock: {}", e),
|
|
74
|
+
plugin_name: "embeddings".to_string(),
|
|
75
|
+
})?;
|
|
76
|
+
|
|
77
|
+
if let Some(cached_model) = cache.get(&model_key) {
|
|
78
|
+
return Ok(Arc::clone(cached_model));
|
|
205
79
|
}
|
|
206
80
|
}
|
|
207
81
|
|
|
208
82
|
{
|
|
209
|
-
let mut cache =
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
}
|
|
83
|
+
let mut cache = MODEL_CACHE.write().map_err(|e| crate::KreuzbergError::Plugin {
|
|
84
|
+
message: format!("Failed to acquire model cache write lock: {}", e),
|
|
85
|
+
plugin_name: "embeddings".to_string(),
|
|
86
|
+
})?;
|
|
213
87
|
|
|
214
88
|
if let Some(cached_model) = cache.get(&model_key) {
|
|
215
89
|
return Ok(Arc::clone(cached_model));
|
|
@@ -218,27 +92,12 @@ pub fn get_or_init_model(
|
|
|
218
92
|
let mut init_options = InitOptions::new(model);
|
|
219
93
|
init_options = init_options.with_cache_dir(cache_directory);
|
|
220
94
|
|
|
221
|
-
let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| {
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
if error_msg.contains("onnxruntime")
|
|
225
|
-
|| error_msg.contains("ORT")
|
|
226
|
-
|| error_msg.contains("libonnxruntime")
|
|
227
|
-
|| error_msg.contains("onnxruntime.dll")
|
|
228
|
-
|| error_msg.contains("Unable to load")
|
|
229
|
-
|| error_msg.contains("library load failed")
|
|
230
|
-
{
|
|
231
|
-
crate::KreuzbergError::MissingDependency(format!("ONNX Runtime - {}", onnx_runtime_install_message()))
|
|
232
|
-
} else {
|
|
233
|
-
crate::KreuzbergError::Plugin {
|
|
234
|
-
message: format!("Failed to initialize embedding model: {}", e),
|
|
235
|
-
plugin_name: "embeddings".to_string(),
|
|
236
|
-
}
|
|
237
|
-
}
|
|
95
|
+
let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| crate::KreuzbergError::Plugin {
|
|
96
|
+
message: format!("Failed to initialize embedding model: {}", e),
|
|
97
|
+
plugin_name: "embeddings".to_string(),
|
|
238
98
|
})?;
|
|
239
99
|
|
|
240
|
-
let
|
|
241
|
-
let arc_model = Arc::new(Mutex::new(leaked_model));
|
|
100
|
+
let arc_model = Arc::new(Mutex::new(embedding_model));
|
|
242
101
|
cache.insert(model_key, Arc::clone(&arc_model));
|
|
243
102
|
|
|
244
103
|
Ok(arc_model)
|
|
@@ -389,15 +248,12 @@ pub fn generate_embeddings_for_chunks(
|
|
|
389
248
|
let texts: Vec<String> = chunks.iter().map(|chunk| chunk.content.clone()).collect();
|
|
390
249
|
|
|
391
250
|
let embeddings_result = {
|
|
392
|
-
let locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
|
|
251
|
+
let mut locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
|
|
393
252
|
message: format!("Failed to acquire model lock: {}", e),
|
|
394
253
|
plugin_name: "embeddings".to_string(),
|
|
395
254
|
})?;
|
|
396
255
|
|
|
397
|
-
|
|
398
|
-
let model_mut = unsafe { locked_model.get_mut() };
|
|
399
|
-
|
|
400
|
-
model_mut
|
|
256
|
+
locked_model
|
|
401
257
|
.embed(texts, Some(config.batch_size))
|
|
402
258
|
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
403
259
|
message: format!("Failed to generate embeddings: {}", e),
|
|
@@ -464,8 +320,4 @@ mod tests {
|
|
|
464
320
|
assert_eq!(quality.chunk_size, 2000);
|
|
465
321
|
assert_eq!(quality.overlap, 200);
|
|
466
322
|
}
|
|
467
|
-
|
|
468
|
-
#[cfg(feature = "embeddings")]
|
|
469
|
-
#[test]
|
|
470
|
-
fn test_lock_poisoning_recovery_semantics() {}
|
|
471
323
|
}
|
|
@@ -60,7 +60,7 @@ pub type Result<T> = std::result::Result<T, KreuzbergError>;
|
|
|
60
60
|
/// - `Cache` - Cache operation errors (non-fatal, can be ignored)
|
|
61
61
|
/// - `ImageProcessing` - Image manipulation errors
|
|
62
62
|
/// - `Serialization` - JSON/MessagePack serialization errors
|
|
63
|
-
/// - `MissingDependency` - Missing optional dependencies (tesseract, etc.)
|
|
63
|
+
/// - `MissingDependency` - Missing optional dependencies (tesseract, pandoc, etc.)
|
|
64
64
|
/// - `Plugin` - Plugin-specific errors
|
|
65
65
|
/// - `LockPoisoned` - Mutex/RwLock poisoning (should not happen in normal operation)
|
|
66
66
|
/// - `UnsupportedFormat` - Unsupported MIME type or file format
|
|
@@ -177,7 +177,7 @@ impl From<crate::pdf::error::PdfError> for KreuzbergError {
|
|
|
177
177
|
|
|
178
178
|
macro_rules! error_constructor {
|
|
179
179
|
($name:ident, $variant:ident) => {
|
|
180
|
-
|
|
180
|
+
paste::paste! {
|
|
181
181
|
#[doc = "Create a " $variant " error"]
|
|
182
182
|
pub fn $name<S: Into<String>>(message: S) -> Self {
|
|
183
183
|
Self::$variant {
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
//! This module provides functions for extracting file lists and contents from archives.
|
|
4
4
|
|
|
5
5
|
use crate::error::{KreuzbergError, Result};
|
|
6
|
-
use
|
|
6
|
+
use sevenz_rust::SevenZReader;
|
|
7
7
|
use std::collections::HashMap;
|
|
8
8
|
use std::io::{Cursor, Read};
|
|
9
9
|
use tar::Archive as TarArchive;
|
|
@@ -39,7 +39,7 @@ pub fn extract_zip_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
39
39
|
let mut archive =
|
|
40
40
|
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
|
|
41
41
|
|
|
42
|
-
let mut file_list = Vec::
|
|
42
|
+
let mut file_list = Vec::new();
|
|
43
43
|
let mut total_size = 0u64;
|
|
44
44
|
|
|
45
45
|
for i in 0..archive.len() {
|
|
@@ -71,8 +71,7 @@ pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
71
71
|
let cursor = Cursor::new(bytes);
|
|
72
72
|
let mut archive = TarArchive::new(cursor);
|
|
73
73
|
|
|
74
|
-
let
|
|
75
|
-
let mut file_list = Vec::with_capacity(estimated_entries);
|
|
74
|
+
let mut file_list = Vec::new();
|
|
76
75
|
let mut total_size = 0u64;
|
|
77
76
|
let mut file_count = 0;
|
|
78
77
|
|
|
@@ -116,8 +115,7 @@ pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
116
115
|
let mut archive =
|
|
117
116
|
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
|
|
118
117
|
|
|
119
|
-
let
|
|
120
|
-
let mut contents = HashMap::with_capacity(estimated_text_files);
|
|
118
|
+
let mut contents = HashMap::new();
|
|
121
119
|
let text_extensions = [
|
|
122
120
|
".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
|
|
123
121
|
];
|
|
@@ -130,8 +128,7 @@ pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
130
128
|
let path = file.name().to_string();
|
|
131
129
|
|
|
132
130
|
if !file.is_dir() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext)) {
|
|
133
|
-
let
|
|
134
|
-
let mut content = String::with_capacity(estimated_size);
|
|
131
|
+
let mut content = String::new();
|
|
135
132
|
if file.read_to_string(&mut content).is_ok() {
|
|
136
133
|
contents.insert(path, content);
|
|
137
134
|
}
|
|
@@ -148,8 +145,7 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
148
145
|
let cursor = Cursor::new(bytes);
|
|
149
146
|
let mut archive = TarArchive::new(cursor);
|
|
150
147
|
|
|
151
|
-
let
|
|
152
|
-
let mut contents = HashMap::with_capacity(estimated_text_files.max(2));
|
|
148
|
+
let mut contents = HashMap::new();
|
|
153
149
|
let text_extensions = [
|
|
154
150
|
".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
|
|
155
151
|
];
|
|
@@ -170,8 +166,7 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
170
166
|
|
|
171
167
|
if !entry.header().entry_type().is_dir() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext))
|
|
172
168
|
{
|
|
173
|
-
let
|
|
174
|
-
let mut content = String::with_capacity(estimated_size);
|
|
169
|
+
let mut content = String::new();
|
|
175
170
|
if entry.read_to_string(&mut content).is_ok() {
|
|
176
171
|
contents.insert(path, content);
|
|
177
172
|
}
|
|
@@ -184,7 +179,7 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
184
179
|
/// Extract metadata from a 7z archive.
|
|
185
180
|
pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
186
181
|
let cursor = Cursor::new(bytes);
|
|
187
|
-
let archive =
|
|
182
|
+
let archive = SevenZReader::new(cursor, bytes.len() as u64, "".into())
|
|
188
183
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
|
|
189
184
|
|
|
190
185
|
let mut file_list = Vec::new();
|
|
@@ -217,7 +212,7 @@ pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
217
212
|
/// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log
|
|
218
213
|
pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
219
214
|
let cursor = Cursor::new(bytes);
|
|
220
|
-
let mut archive =
|
|
215
|
+
let mut archive = SevenZReader::new(cursor, bytes.len() as u64, "".into())
|
|
221
216
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
|
|
222
217
|
|
|
223
218
|
let mut contents = HashMap::new();
|
|
@@ -464,26 +459,26 @@ mod tests {
|
|
|
464
459
|
|
|
465
460
|
#[test]
|
|
466
461
|
fn test_extract_7z_metadata_with_files() {
|
|
467
|
-
use
|
|
462
|
+
use sevenz_rust::SevenZWriter;
|
|
468
463
|
|
|
469
|
-
let cursor =
|
|
470
|
-
|
|
471
|
-
let mut sz =
|
|
464
|
+
let mut cursor = Cursor::new(Vec::new());
|
|
465
|
+
{
|
|
466
|
+
let mut sz = SevenZWriter::new(&mut cursor).unwrap();
|
|
472
467
|
|
|
473
468
|
sz.push_archive_entry(
|
|
474
|
-
|
|
469
|
+
sevenz_rust::SevenZArchiveEntry::from_path("test.txt", "test.txt".to_string()),
|
|
475
470
|
Some(Cursor::new(b"Hello 7z!".to_vec())),
|
|
476
471
|
)
|
|
477
472
|
.unwrap();
|
|
478
473
|
|
|
479
474
|
sz.push_archive_entry(
|
|
480
|
-
|
|
475
|
+
sevenz_rust::SevenZArchiveEntry::from_path("data.json", "data.json".to_string()),
|
|
481
476
|
Some(Cursor::new(b"{\"key\":\"value\"}".to_vec())),
|
|
482
477
|
)
|
|
483
478
|
.unwrap();
|
|
484
479
|
|
|
485
|
-
sz.finish().unwrap()
|
|
486
|
-
}
|
|
480
|
+
sz.finish().unwrap();
|
|
481
|
+
}
|
|
487
482
|
|
|
488
483
|
let bytes = cursor.into_inner();
|
|
489
484
|
let metadata = extract_7z_metadata(&bytes).unwrap();
|
|
@@ -839,26 +834,26 @@ mod tests {
|
|
|
839
834
|
|
|
840
835
|
#[test]
|
|
841
836
|
fn test_extract_7z_text_content() {
|
|
842
|
-
use
|
|
837
|
+
use sevenz_rust::SevenZWriter;
|
|
843
838
|
|
|
844
|
-
let cursor =
|
|
845
|
-
|
|
846
|
-
let mut sz =
|
|
839
|
+
let mut cursor = Cursor::new(Vec::new());
|
|
840
|
+
{
|
|
841
|
+
let mut sz = SevenZWriter::new(&mut cursor).unwrap();
|
|
847
842
|
|
|
848
843
|
sz.push_archive_entry(
|
|
849
|
-
|
|
844
|
+
sevenz_rust::SevenZArchiveEntry::from_path("test.txt", "test.txt".to_string()),
|
|
850
845
|
Some(Cursor::new(b"Hello 7z text!".to_vec())),
|
|
851
846
|
)
|
|
852
847
|
.unwrap();
|
|
853
848
|
|
|
854
849
|
sz.push_archive_entry(
|
|
855
|
-
|
|
850
|
+
sevenz_rust::SevenZArchiveEntry::from_path("readme.md", "readme.md".to_string()),
|
|
856
851
|
Some(Cursor::new(b"# 7z README".to_vec())),
|
|
857
852
|
)
|
|
858
853
|
.unwrap();
|
|
859
854
|
|
|
860
|
-
sz.finish().unwrap()
|
|
861
|
-
}
|
|
855
|
+
sz.finish().unwrap();
|
|
856
|
+
}
|
|
862
857
|
|
|
863
858
|
let bytes = cursor.into_inner();
|
|
864
859
|
let contents = extract_7z_text_content(&bytes).unwrap();
|
|
@@ -870,13 +865,13 @@ mod tests {
|
|
|
870
865
|
|
|
871
866
|
#[test]
|
|
872
867
|
fn test_extract_7z_empty_archive() {
|
|
873
|
-
use
|
|
868
|
+
use sevenz_rust::SevenZWriter;
|
|
874
869
|
|
|
875
|
-
let cursor =
|
|
876
|
-
|
|
877
|
-
let sz =
|
|
878
|
-
sz.finish().unwrap()
|
|
879
|
-
}
|
|
870
|
+
let mut cursor = Cursor::new(Vec::new());
|
|
871
|
+
{
|
|
872
|
+
let sz = SevenZWriter::new(&mut cursor).unwrap();
|
|
873
|
+
sz.finish().unwrap();
|
|
874
|
+
}
|
|
880
875
|
|
|
881
876
|
let bytes = cursor.into_inner();
|
|
882
877
|
let metadata = extract_7z_metadata(&bytes).unwrap();
|