kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -215,26 +215,149 @@ pub fn get_or_init_model(
|
|
|
215
215
|
return Ok(Arc::clone(cached_model));
|
|
216
216
|
}
|
|
217
217
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
let
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
218
|
+
// Check if ONNX Runtime library exists and set ORT_DYLIB_PATH if needed
|
|
219
|
+
// This prevents panics that cannot unwind through FFI boundaries
|
|
220
|
+
fn ensure_onnx_available() -> Result<(), String> {
|
|
221
|
+
// Check if ORT_DYLIB_PATH is already set and valid
|
|
222
|
+
if let Ok(path) = std::env::var("ORT_DYLIB_PATH") {
|
|
223
|
+
if std::path::Path::new(&path).exists() {
|
|
224
|
+
return Ok(());
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
// Check common installation paths and set ORT_DYLIB_PATH if found
|
|
229
|
+
#[cfg(target_os = "macos")]
|
|
230
|
+
{
|
|
231
|
+
let paths = vec![
|
|
232
|
+
"/opt/homebrew/lib/libonnxruntime.dylib",
|
|
233
|
+
"/usr/local/lib/libonnxruntime.dylib",
|
|
234
|
+
];
|
|
235
|
+
for path in paths {
|
|
236
|
+
if std::path::Path::new(path).exists() {
|
|
237
|
+
// Set ORT_DYLIB_PATH so the ort crate can find it
|
|
238
|
+
// SAFETY: We're setting an environment variable before any threads are spawned
|
|
239
|
+
// in this module, and we're the only ones setting this variable
|
|
240
|
+
#[allow(unsafe_code)]
|
|
241
|
+
unsafe {
|
|
242
|
+
std::env::set_var("ORT_DYLIB_PATH", path);
|
|
243
|
+
}
|
|
244
|
+
return Ok(());
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
#[cfg(target_os = "linux")]
|
|
250
|
+
{
|
|
251
|
+
let paths = vec![
|
|
252
|
+
"/usr/lib/libonnxruntime.so",
|
|
253
|
+
"/usr/local/lib/libonnxruntime.so",
|
|
254
|
+
"/usr/lib/x86_64-linux-gnu/libonnxruntime.so",
|
|
255
|
+
"/usr/lib/aarch64-linux-gnu/libonnxruntime.so",
|
|
256
|
+
];
|
|
257
|
+
for path in paths {
|
|
258
|
+
if std::path::Path::new(path).exists() {
|
|
259
|
+
// SAFETY: We're setting an environment variable before any threads are spawned
|
|
260
|
+
// in this module, and we're the only ones setting this variable
|
|
261
|
+
#[allow(unsafe_code)]
|
|
262
|
+
unsafe {
|
|
263
|
+
std::env::set_var("ORT_DYLIB_PATH", path);
|
|
264
|
+
}
|
|
265
|
+
return Ok(());
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
#[cfg(target_os = "windows")]
|
|
271
|
+
{
|
|
272
|
+
let paths = vec![
|
|
273
|
+
"C:\\Program Files\\onnxruntime\\bin\\onnxruntime.dll",
|
|
274
|
+
"C:\\Windows\\System32\\onnxruntime.dll",
|
|
275
|
+
];
|
|
276
|
+
for path in paths {
|
|
277
|
+
if std::path::Path::new(path).exists() {
|
|
278
|
+
// SAFETY: We're setting an environment variable before any threads are spawned
|
|
279
|
+
// in this module, and we're the only ones setting this variable
|
|
280
|
+
#[allow(unsafe_code)]
|
|
281
|
+
unsafe {
|
|
282
|
+
std::env::set_var("ORT_DYLIB_PATH", path);
|
|
283
|
+
}
|
|
284
|
+
return Ok(());
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
Err("ONNX Runtime library not found in common installation paths".to_string())
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
if let Err(e) = ensure_onnx_available() {
|
|
293
|
+
return Err(crate::KreuzbergError::MissingDependency(format!(
|
|
294
|
+
"{}. {}",
|
|
295
|
+
e,
|
|
296
|
+
onnx_runtime_install_message()
|
|
297
|
+
)));
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
// Wrap the entire embedding initialization with catch_unwind to handle panics from ONNX Runtime
|
|
301
|
+
// ONNX Runtime can panic when the library is not found, which causes issues in FFI contexts
|
|
302
|
+
// This includes both InitOptions::new and TextEmbedding::try_new as both can trigger ONNX Runtime loading
|
|
303
|
+
let embedding_model = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
|
304
|
+
let mut init_options = InitOptions::new(model);
|
|
305
|
+
init_options = init_options.with_cache_dir(cache_directory);
|
|
306
|
+
TextEmbedding::try_new(init_options)
|
|
307
|
+
}))
|
|
308
|
+
.map_err(|panic_payload| {
|
|
309
|
+
// Convert panic to a KreuzbergError
|
|
310
|
+
let panic_msg = if let Some(s) = panic_payload.downcast_ref::<&str>() {
|
|
311
|
+
s.to_string()
|
|
312
|
+
} else if let Some(s) = panic_payload.downcast_ref::<String>() {
|
|
313
|
+
s.clone()
|
|
314
|
+
} else {
|
|
315
|
+
"Unknown panic during ONNX Runtime initialization".to_string()
|
|
316
|
+
};
|
|
317
|
+
|
|
318
|
+
// Check if this looks like an ONNX Runtime missing dependency error
|
|
319
|
+
if panic_msg.contains("onnxruntime")
|
|
320
|
+
|| panic_msg.contains("ORT")
|
|
321
|
+
|| panic_msg.contains("libonnxruntime")
|
|
322
|
+
|| panic_msg.contains("onnxruntime.dll")
|
|
323
|
+
|| panic_msg.contains("Unable to load")
|
|
324
|
+
|| panic_msg.contains("library load failed")
|
|
325
|
+
|| panic_msg.contains("attempting to load")
|
|
326
|
+
|| panic_msg.contains("An error occurred while")
|
|
230
327
|
{
|
|
231
328
|
crate::KreuzbergError::MissingDependency(format!("ONNX Runtime - {}", onnx_runtime_install_message()))
|
|
232
329
|
} else {
|
|
233
330
|
crate::KreuzbergError::Plugin {
|
|
234
|
-
message: format!("
|
|
331
|
+
message: format!("ONNX Runtime initialization panicked: {}", panic_msg),
|
|
235
332
|
plugin_name: "embeddings".to_string(),
|
|
236
333
|
}
|
|
237
334
|
}
|
|
335
|
+
})
|
|
336
|
+
.and_then(|result| {
|
|
337
|
+
// Map fastembed errors to KreuzbergError
|
|
338
|
+
result.map_err(|e| {
|
|
339
|
+
let error_msg = e.to_string();
|
|
340
|
+
|
|
341
|
+
if error_msg.contains("onnxruntime")
|
|
342
|
+
|| error_msg.contains("ORT")
|
|
343
|
+
|| error_msg.contains("libonnxruntime")
|
|
344
|
+
|| error_msg.contains("onnxruntime.dll")
|
|
345
|
+
|| error_msg.contains("Unable to load")
|
|
346
|
+
|| error_msg.contains("library load failed")
|
|
347
|
+
|| error_msg.contains("attempting to load")
|
|
348
|
+
|| error_msg.contains("An error occurred while")
|
|
349
|
+
{
|
|
350
|
+
crate::KreuzbergError::MissingDependency(format!(
|
|
351
|
+
"ONNX Runtime - {}",
|
|
352
|
+
onnx_runtime_install_message()
|
|
353
|
+
))
|
|
354
|
+
} else {
|
|
355
|
+
crate::KreuzbergError::Plugin {
|
|
356
|
+
message: format!("Failed to initialize embedding model: {}", e),
|
|
357
|
+
plugin_name: "embeddings".to_string(),
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
})
|
|
238
361
|
})?;
|
|
239
362
|
|
|
240
363
|
let leaked_model = LeakedModel::new(embedding_model);
|
|
@@ -1,13 +1,21 @@
|
|
|
1
1
|
//! Archive extraction functionality.
|
|
2
2
|
//!
|
|
3
3
|
//! This module provides functions for extracting file lists and contents from archives.
|
|
4
|
+
//! Supported formats:
|
|
5
|
+
//! - ZIP archives
|
|
6
|
+
//! - TAR archives (including compressed TAR.GZ, TAR.BZ2)
|
|
7
|
+
//! - 7Z archives
|
|
8
|
+
//!
|
|
9
|
+
//! Each format has its own submodule with specialized extraction logic.
|
|
10
|
+
|
|
11
|
+
mod sevenz;
|
|
12
|
+
mod tar;
|
|
13
|
+
mod zip;
|
|
4
14
|
|
|
5
|
-
|
|
6
|
-
use
|
|
7
|
-
use
|
|
8
|
-
use
|
|
9
|
-
use tar::Archive as TarArchive;
|
|
10
|
-
use zip::ZipArchive;
|
|
15
|
+
// Re-export all public functions for backward compatibility
|
|
16
|
+
pub use sevenz::{extract_7z_metadata, extract_7z_text_content};
|
|
17
|
+
pub use tar::{extract_tar_metadata, extract_tar_text_content};
|
|
18
|
+
pub use zip::{extract_zip_metadata, extract_zip_text_content};
|
|
11
19
|
|
|
12
20
|
/// Archive metadata extracted from an archive file.
|
|
13
21
|
#[derive(Debug, Clone)]
|
|
@@ -33,223 +41,17 @@ pub struct ArchiveEntry {
|
|
|
33
41
|
pub is_dir: bool,
|
|
34
42
|
}
|
|
35
43
|
|
|
36
|
-
///
|
|
37
|
-
pub
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
|
|
41
|
-
|
|
42
|
-
let mut file_list = Vec::with_capacity(archive.len());
|
|
43
|
-
let mut total_size = 0u64;
|
|
44
|
-
|
|
45
|
-
for i in 0..archive.len() {
|
|
46
|
-
let file = archive
|
|
47
|
-
.by_index(i)
|
|
48
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP entry: {}", e)))?;
|
|
49
|
-
|
|
50
|
-
let path = file.name().to_string();
|
|
51
|
-
let size = file.size();
|
|
52
|
-
let is_dir = file.is_dir();
|
|
53
|
-
|
|
54
|
-
if !is_dir {
|
|
55
|
-
total_size += size;
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
file_list.push(ArchiveEntry { path, size, is_dir });
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
Ok(ArchiveMetadata {
|
|
62
|
-
format: "ZIP".to_string(),
|
|
63
|
-
file_list,
|
|
64
|
-
file_count: archive.len(),
|
|
65
|
-
total_size,
|
|
66
|
-
})
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
/// Extract metadata from a TAR archive.
|
|
70
|
-
pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
71
|
-
let cursor = Cursor::new(bytes);
|
|
72
|
-
let mut archive = TarArchive::new(cursor);
|
|
73
|
-
|
|
74
|
-
let estimated_entries = bytes.len().saturating_div(512).max(16);
|
|
75
|
-
let mut file_list = Vec::with_capacity(estimated_entries);
|
|
76
|
-
let mut total_size = 0u64;
|
|
77
|
-
let mut file_count = 0;
|
|
78
|
-
|
|
79
|
-
let entries = archive
|
|
80
|
-
.entries()
|
|
81
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR archive: {}", e)))?;
|
|
82
|
-
|
|
83
|
-
for entry_result in entries {
|
|
84
|
-
let entry = entry_result.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry: {}", e)))?;
|
|
85
|
-
|
|
86
|
-
let path = entry
|
|
87
|
-
.path()
|
|
88
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry path: {}", e)))?
|
|
89
|
-
.to_string_lossy()
|
|
90
|
-
.to_string();
|
|
91
|
-
|
|
92
|
-
let size = entry.size();
|
|
93
|
-
let is_dir = entry.header().entry_type().is_dir();
|
|
94
|
-
|
|
95
|
-
if !is_dir {
|
|
96
|
-
total_size += size;
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
file_count += 1;
|
|
100
|
-
file_list.push(ArchiveEntry { path, size, is_dir });
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
Ok(ArchiveMetadata {
|
|
104
|
-
format: "TAR".to_string(),
|
|
105
|
-
file_list,
|
|
106
|
-
file_count,
|
|
107
|
-
total_size,
|
|
108
|
-
})
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
/// Extract text content from files within a ZIP archive.
|
|
112
|
-
///
|
|
113
|
-
/// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log
|
|
114
|
-
pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
115
|
-
let cursor = Cursor::new(bytes);
|
|
116
|
-
let mut archive =
|
|
117
|
-
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
|
|
118
|
-
|
|
119
|
-
let estimated_text_files = archive.len().saturating_mul(3).saturating_div(10).max(2);
|
|
120
|
-
let mut contents = HashMap::with_capacity(estimated_text_files);
|
|
121
|
-
let text_extensions = [
|
|
122
|
-
".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
|
|
123
|
-
];
|
|
124
|
-
|
|
125
|
-
for i in 0..archive.len() {
|
|
126
|
-
let mut file = archive
|
|
127
|
-
.by_index(i)
|
|
128
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP entry: {}", e)))?;
|
|
129
|
-
|
|
130
|
-
let path = file.name().to_string();
|
|
131
|
-
|
|
132
|
-
if !file.is_dir() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext)) {
|
|
133
|
-
let estimated_size = (file.size() as usize).min(10 * 1024 * 1024);
|
|
134
|
-
let mut content = String::with_capacity(estimated_size);
|
|
135
|
-
if file.read_to_string(&mut content).is_ok() {
|
|
136
|
-
contents.insert(path, content);
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
Ok(contents)
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
/// Extract text content from files within a TAR archive.
|
|
145
|
-
///
|
|
146
|
-
/// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log
|
|
147
|
-
pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
148
|
-
let cursor = Cursor::new(bytes);
|
|
149
|
-
let mut archive = TarArchive::new(cursor);
|
|
150
|
-
|
|
151
|
-
let estimated_text_files = bytes.len().saturating_div(1024 * 10).min(100);
|
|
152
|
-
let mut contents = HashMap::with_capacity(estimated_text_files.max(2));
|
|
153
|
-
let text_extensions = [
|
|
154
|
-
".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
|
|
155
|
-
];
|
|
156
|
-
|
|
157
|
-
let entries = archive
|
|
158
|
-
.entries()
|
|
159
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR archive: {}", e)))?;
|
|
160
|
-
|
|
161
|
-
for entry_result in entries {
|
|
162
|
-
let mut entry =
|
|
163
|
-
entry_result.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry: {}", e)))?;
|
|
164
|
-
|
|
165
|
-
let path = entry
|
|
166
|
-
.path()
|
|
167
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry path: {}", e)))?
|
|
168
|
-
.to_string_lossy()
|
|
169
|
-
.to_string();
|
|
170
|
-
|
|
171
|
-
if !entry.header().entry_type().is_dir() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext))
|
|
172
|
-
{
|
|
173
|
-
let estimated_size = (entry.size().min(10 * 1024 * 1024)) as usize;
|
|
174
|
-
let mut content = String::with_capacity(estimated_size);
|
|
175
|
-
if entry.read_to_string(&mut content).is_ok() {
|
|
176
|
-
contents.insert(path, content);
|
|
177
|
-
}
|
|
178
|
-
}
|
|
179
|
-
}
|
|
180
|
-
|
|
181
|
-
Ok(contents)
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
/// Extract metadata from a 7z archive.
|
|
185
|
-
pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
186
|
-
let cursor = Cursor::new(bytes);
|
|
187
|
-
let archive = ArchiveReader::new(cursor, Password::empty())
|
|
188
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
|
|
189
|
-
|
|
190
|
-
let mut file_list = Vec::new();
|
|
191
|
-
let mut total_size = 0u64;
|
|
192
|
-
|
|
193
|
-
for entry in &archive.archive().files {
|
|
194
|
-
let path = entry.name().to_string();
|
|
195
|
-
let size = entry.size();
|
|
196
|
-
let is_dir = entry.is_directory();
|
|
197
|
-
|
|
198
|
-
if !is_dir {
|
|
199
|
-
total_size += size;
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
file_list.push(ArchiveEntry { path, size, is_dir });
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
let file_count = file_list.len();
|
|
206
|
-
|
|
207
|
-
Ok(ArchiveMetadata {
|
|
208
|
-
format: "7Z".to_string(),
|
|
209
|
-
file_list,
|
|
210
|
-
file_count,
|
|
211
|
-
total_size,
|
|
212
|
-
})
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
/// Extract text content from files within a 7z archive.
|
|
216
|
-
///
|
|
217
|
-
/// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log
|
|
218
|
-
pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
219
|
-
let cursor = Cursor::new(bytes);
|
|
220
|
-
let mut archive = ArchiveReader::new(cursor, Password::empty())
|
|
221
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
|
|
222
|
-
|
|
223
|
-
let mut contents = HashMap::new();
|
|
224
|
-
let text_extensions = [
|
|
225
|
-
".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
|
|
226
|
-
];
|
|
227
|
-
|
|
228
|
-
archive
|
|
229
|
-
.for_each_entries(|entry, reader| {
|
|
230
|
-
let path = entry.name().to_string();
|
|
231
|
-
|
|
232
|
-
if !entry.is_directory() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext)) {
|
|
233
|
-
let mut content = Vec::new();
|
|
234
|
-
if let Ok(_) = reader.read_to_end(&mut content)
|
|
235
|
-
&& let Ok(text) = String::from_utf8(content)
|
|
236
|
-
{
|
|
237
|
-
contents.insert(path, text);
|
|
238
|
-
}
|
|
239
|
-
}
|
|
240
|
-
Ok(true)
|
|
241
|
-
})
|
|
242
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z entries: {}", e)))?;
|
|
243
|
-
|
|
244
|
-
Ok(contents)
|
|
245
|
-
}
|
|
44
|
+
/// Common text file extensions that should be extracted from archives.
|
|
45
|
+
pub(crate) const TEXT_EXTENSIONS: &[&str] = &[
|
|
46
|
+
".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
|
|
47
|
+
];
|
|
246
48
|
|
|
247
49
|
#[cfg(test)]
|
|
248
50
|
mod tests {
|
|
249
51
|
use super::*;
|
|
250
|
-
use
|
|
251
|
-
use
|
|
252
|
-
use
|
|
52
|
+
use ::tar::Builder as TarBuilder;
|
|
53
|
+
use ::zip::write::{FileOptions, ZipWriter};
|
|
54
|
+
use std::io::{Cursor, Write};
|
|
253
55
|
|
|
254
56
|
#[test]
|
|
255
57
|
fn test_extract_zip_metadata() {
|
|
@@ -283,14 +85,14 @@ mod tests {
|
|
|
283
85
|
let mut tar = TarBuilder::new(&mut cursor);
|
|
284
86
|
|
|
285
87
|
let data1 = b"Hello, World!";
|
|
286
|
-
let mut header1 = tar::Header::new_gnu();
|
|
88
|
+
let mut header1 = ::tar::Header::new_gnu();
|
|
287
89
|
header1.set_path("test.txt").unwrap();
|
|
288
90
|
header1.set_size(data1.len() as u64);
|
|
289
91
|
header1.set_cksum();
|
|
290
92
|
tar.append(&header1, &data1[..]).unwrap();
|
|
291
93
|
|
|
292
94
|
let data2 = b"# Header";
|
|
293
|
-
let mut header2 = tar::Header::new_gnu();
|
|
95
|
+
let mut header2 = ::tar::Header::new_gnu();
|
|
294
96
|
header2.set_path("dir/file.md").unwrap();
|
|
295
97
|
header2.set_size(data2.len() as u64);
|
|
296
98
|
header2.set_cksum();
|
|
@@ -339,14 +141,14 @@ mod tests {
|
|
|
339
141
|
let mut tar = TarBuilder::new(&mut cursor);
|
|
340
142
|
|
|
341
143
|
let data1 = b"Hello, World!";
|
|
342
|
-
let mut header1 = tar::Header::new_gnu();
|
|
144
|
+
let mut header1 = ::tar::Header::new_gnu();
|
|
343
145
|
header1.set_path("test.txt").unwrap();
|
|
344
146
|
header1.set_size(data1.len() as u64);
|
|
345
147
|
header1.set_cksum();
|
|
346
148
|
tar.append(&header1, &data1[..]).unwrap();
|
|
347
149
|
|
|
348
150
|
let data2 = b"# README";
|
|
349
|
-
let mut header2 = tar::Header::new_gnu();
|
|
151
|
+
let mut header2 = ::tar::Header::new_gnu();
|
|
350
152
|
header2.set_path("readme.md").unwrap();
|
|
351
153
|
header2.set_size(data2.len() as u64);
|
|
352
154
|
header2.set_cksum();
|
|
@@ -413,15 +215,15 @@ mod tests {
|
|
|
413
215
|
{
|
|
414
216
|
let mut tar = TarBuilder::new(&mut cursor);
|
|
415
217
|
|
|
416
|
-
let mut header_dir = tar::Header::new_gnu();
|
|
218
|
+
let mut header_dir = ::tar::Header::new_gnu();
|
|
417
219
|
header_dir.set_path("dir1/").unwrap();
|
|
418
220
|
header_dir.set_size(0);
|
|
419
|
-
header_dir.set_entry_type(tar::EntryType::Directory);
|
|
221
|
+
header_dir.set_entry_type(::tar::EntryType::Directory);
|
|
420
222
|
header_dir.set_cksum();
|
|
421
223
|
tar.append(&header_dir, &[][..]).unwrap();
|
|
422
224
|
|
|
423
225
|
let data = b"content1";
|
|
424
|
-
let mut header1 = tar::Header::new_gnu();
|
|
226
|
+
let mut header1 = ::tar::Header::new_gnu();
|
|
425
227
|
header1.set_path("dir1/file1.txt").unwrap();
|
|
426
228
|
header1.set_size(data.len() as u64);
|
|
427
229
|
header1.set_cksum();
|
|
@@ -447,7 +249,7 @@ mod tests {
|
|
|
447
249
|
let mut tar = TarBuilder::new(&mut tar_data);
|
|
448
250
|
|
|
449
251
|
let data = b"Hello from gzip!";
|
|
450
|
-
let mut header = tar::Header::new_gnu();
|
|
252
|
+
let mut header = ::tar::Header::new_gnu();
|
|
451
253
|
header.set_path("test.txt").unwrap();
|
|
452
254
|
header.set_size(data.len() as u64);
|
|
453
255
|
header.set_cksum();
|
|
@@ -464,20 +266,20 @@ mod tests {
|
|
|
464
266
|
|
|
465
267
|
#[test]
|
|
466
268
|
fn test_extract_7z_metadata_with_files() {
|
|
467
|
-
use sevenz_rust2::{ArchiveEntry, ArchiveWriter};
|
|
269
|
+
use sevenz_rust2::{ArchiveEntry as SevenzEntry, ArchiveWriter};
|
|
468
270
|
|
|
469
271
|
let cursor = {
|
|
470
272
|
let cursor = Cursor::new(Vec::new());
|
|
471
273
|
let mut sz = ArchiveWriter::new(cursor).unwrap();
|
|
472
274
|
|
|
473
275
|
sz.push_archive_entry(
|
|
474
|
-
|
|
276
|
+
SevenzEntry::new_file("test.txt"),
|
|
475
277
|
Some(Cursor::new(b"Hello 7z!".to_vec())),
|
|
476
278
|
)
|
|
477
279
|
.unwrap();
|
|
478
280
|
|
|
479
281
|
sz.push_archive_entry(
|
|
480
|
-
|
|
282
|
+
SevenzEntry::new_file("data.json"),
|
|
481
283
|
Some(Cursor::new(b"{\"key\":\"value\"}".to_vec())),
|
|
482
284
|
)
|
|
483
285
|
.unwrap();
|
|
@@ -538,7 +340,7 @@ mod tests {
|
|
|
538
340
|
let mut inner_tar = TarBuilder::new(&mut inner_cursor);
|
|
539
341
|
|
|
540
342
|
let data = b"Nested content";
|
|
541
|
-
let mut header = tar::Header::new_gnu();
|
|
343
|
+
let mut header = ::tar::Header::new_gnu();
|
|
542
344
|
header.set_path("inner.txt").unwrap();
|
|
543
345
|
header.set_size(data.len() as u64);
|
|
544
346
|
header.set_cksum();
|
|
@@ -552,14 +354,14 @@ mod tests {
|
|
|
552
354
|
{
|
|
553
355
|
let mut outer_tar = TarBuilder::new(&mut outer_cursor);
|
|
554
356
|
|
|
555
|
-
let mut header1 = tar::Header::new_gnu();
|
|
357
|
+
let mut header1 = ::tar::Header::new_gnu();
|
|
556
358
|
header1.set_path("archive.tar").unwrap();
|
|
557
359
|
header1.set_size(inner_bytes.len() as u64);
|
|
558
360
|
header1.set_cksum();
|
|
559
361
|
outer_tar.append(&header1, &inner_bytes[..]).unwrap();
|
|
560
362
|
|
|
561
363
|
let data = b"Outer content";
|
|
562
|
-
let mut header2 = tar::Header::new_gnu();
|
|
364
|
+
let mut header2 = ::tar::Header::new_gnu();
|
|
563
365
|
header2.set_path("readme.txt").unwrap();
|
|
564
366
|
header2.set_size(data.len() as u64);
|
|
565
367
|
header2.set_cksum();
|
|
@@ -579,6 +381,8 @@ mod tests {
|
|
|
579
381
|
|
|
580
382
|
#[test]
|
|
581
383
|
fn test_extract_zip_corrupted_data() {
|
|
384
|
+
use crate::error::KreuzbergError;
|
|
385
|
+
|
|
582
386
|
let mut valid_cursor = Cursor::new(Vec::new());
|
|
583
387
|
{
|
|
584
388
|
let mut zip = ZipWriter::new(&mut valid_cursor);
|
|
@@ -608,7 +412,7 @@ mod tests {
|
|
|
608
412
|
let mut tar = TarBuilder::new(&mut valid_cursor);
|
|
609
413
|
|
|
610
414
|
let data = b"content";
|
|
611
|
-
let mut header = tar::Header::new_gnu();
|
|
415
|
+
let mut header = ::tar::Header::new_gnu();
|
|
612
416
|
header.set_path("test.txt").unwrap();
|
|
613
417
|
header.set_size(data.len() as u64);
|
|
614
418
|
header.set_cksum();
|
|
@@ -704,7 +508,7 @@ mod tests {
|
|
|
704
508
|
];
|
|
705
509
|
|
|
706
510
|
for (path, data) in files {
|
|
707
|
-
let mut header = tar::Header::new_gnu();
|
|
511
|
+
let mut header = ::tar::Header::new_gnu();
|
|
708
512
|
header.set_path(path).unwrap();
|
|
709
513
|
header.set_size(data.len() as u64);
|
|
710
514
|
header.set_cksum();
|
|
@@ -839,20 +643,20 @@ mod tests {
|
|
|
839
643
|
|
|
840
644
|
#[test]
|
|
841
645
|
fn test_extract_7z_text_content() {
|
|
842
|
-
use sevenz_rust2::{ArchiveEntry, ArchiveWriter};
|
|
646
|
+
use sevenz_rust2::{ArchiveEntry as SevenzEntry, ArchiveWriter};
|
|
843
647
|
|
|
844
648
|
let cursor = {
|
|
845
649
|
let cursor = Cursor::new(Vec::new());
|
|
846
650
|
let mut sz = ArchiveWriter::new(cursor).unwrap();
|
|
847
651
|
|
|
848
652
|
sz.push_archive_entry(
|
|
849
|
-
|
|
653
|
+
SevenzEntry::new_file("test.txt"),
|
|
850
654
|
Some(Cursor::new(b"Hello 7z text!".to_vec())),
|
|
851
655
|
)
|
|
852
656
|
.unwrap();
|
|
853
657
|
|
|
854
658
|
sz.push_archive_entry(
|
|
855
|
-
|
|
659
|
+
SevenzEntry::new_file("readme.md"),
|
|
856
660
|
Some(Cursor::new(b"# 7z README".to_vec())),
|
|
857
661
|
)
|
|
858
662
|
.unwrap();
|
|
@@ -894,7 +698,7 @@ mod tests {
|
|
|
894
698
|
|
|
895
699
|
let large_content = "y".repeat(50_000);
|
|
896
700
|
|
|
897
|
-
let mut header = tar::Header::new_gnu();
|
|
701
|
+
let mut header = ::tar::Header::new_gnu();
|
|
898
702
|
header.set_path("large.txt").unwrap();
|
|
899
703
|
header.set_size(large_content.len() as u64);
|
|
900
704
|
header.set_cksum();
|
|
@@ -947,6 +751,8 @@ mod tests {
|
|
|
947
751
|
|
|
948
752
|
#[test]
|
|
949
753
|
fn test_extract_7z_corrupted_data() {
|
|
754
|
+
use crate::error::KreuzbergError;
|
|
755
|
+
|
|
950
756
|
let invalid_7z_data = vec![0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C, 0x00];
|
|
951
757
|
|
|
952
758
|
let result = extract_7z_metadata(&invalid_7z_data);
|