kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
//! 7Z archive extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides functions for extracting metadata and text content from 7Z archives.
|
|
4
|
+
|
|
5
|
+
use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
|
|
6
|
+
use crate::error::{KreuzbergError, Result};
|
|
7
|
+
use sevenz_rust2::{ArchiveReader, Password};
|
|
8
|
+
use std::collections::HashMap;
|
|
9
|
+
use std::io::Cursor;
|
|
10
|
+
|
|
11
|
+
/// Extract metadata from a 7z archive.
|
|
12
|
+
///
|
|
13
|
+
/// # Arguments
|
|
14
|
+
///
|
|
15
|
+
/// * `bytes` - The 7z archive bytes
|
|
16
|
+
///
|
|
17
|
+
/// # Returns
|
|
18
|
+
///
|
|
19
|
+
/// Returns `ArchiveMetadata` containing:
|
|
20
|
+
/// - Format: "7Z"
|
|
21
|
+
/// - File list with paths, sizes, and directory flags
|
|
22
|
+
/// - Total file count
|
|
23
|
+
/// - Total uncompressed size
|
|
24
|
+
///
|
|
25
|
+
/// # Errors
|
|
26
|
+
///
|
|
27
|
+
/// Returns an error if the 7z archive cannot be read or parsed.
|
|
28
|
+
pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
29
|
+
let cursor = Cursor::new(bytes);
|
|
30
|
+
let archive = ArchiveReader::new(cursor, Password::empty())
|
|
31
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
|
|
32
|
+
|
|
33
|
+
let mut file_list = Vec::new();
|
|
34
|
+
let mut total_size = 0u64;
|
|
35
|
+
|
|
36
|
+
for entry in &archive.archive().files {
|
|
37
|
+
let path = entry.name().to_string();
|
|
38
|
+
let size = entry.size();
|
|
39
|
+
let is_dir = entry.is_directory();
|
|
40
|
+
|
|
41
|
+
if !is_dir {
|
|
42
|
+
total_size += size;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
file_list.push(ArchiveEntry { path, size, is_dir });
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
let file_count = file_list.len();
|
|
49
|
+
|
|
50
|
+
Ok(ArchiveMetadata {
|
|
51
|
+
format: "7Z".to_string(),
|
|
52
|
+
file_list,
|
|
53
|
+
file_count,
|
|
54
|
+
total_size,
|
|
55
|
+
})
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/// Extract text content from files within a 7z archive.
|
|
59
|
+
///
|
|
60
|
+
/// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log, .yaml, .toml
|
|
61
|
+
///
|
|
62
|
+
/// # Arguments
|
|
63
|
+
///
|
|
64
|
+
/// * `bytes` - The 7z archive bytes
|
|
65
|
+
///
|
|
66
|
+
/// # Returns
|
|
67
|
+
///
|
|
68
|
+
/// Returns a `HashMap` mapping file paths to their text content.
|
|
69
|
+
/// Binary files and files with non-text extensions are excluded.
|
|
70
|
+
///
|
|
71
|
+
/// # Errors
|
|
72
|
+
///
|
|
73
|
+
/// Returns an error if the 7z archive cannot be read or parsed.
|
|
74
|
+
pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
75
|
+
let cursor = Cursor::new(bytes);
|
|
76
|
+
let mut archive = ArchiveReader::new(cursor, Password::empty())
|
|
77
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
|
|
78
|
+
|
|
79
|
+
let mut contents = HashMap::new();
|
|
80
|
+
|
|
81
|
+
archive
|
|
82
|
+
.for_each_entries(|entry, reader| {
|
|
83
|
+
let path = entry.name().to_string();
|
|
84
|
+
|
|
85
|
+
if !entry.is_directory() && TEXT_EXTENSIONS.iter().any(|ext| path.to_lowercase().ends_with(ext)) {
|
|
86
|
+
let mut content = Vec::new();
|
|
87
|
+
if let Ok(_) = reader.read_to_end(&mut content)
|
|
88
|
+
&& let Ok(text) = String::from_utf8(content)
|
|
89
|
+
{
|
|
90
|
+
contents.insert(path, text);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
Ok(true)
|
|
94
|
+
})
|
|
95
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z entries: {}", e)))?;
|
|
96
|
+
|
|
97
|
+
Ok(contents)
|
|
98
|
+
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
//! TAR archive extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides functions for extracting metadata and text content from TAR archives.
|
|
4
|
+
//! Supports plain TAR as well as compressed variants (TAR.GZ, TAR.BZ2).
|
|
5
|
+
|
|
6
|
+
use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
|
|
7
|
+
use crate::error::{KreuzbergError, Result};
|
|
8
|
+
use std::collections::HashMap;
|
|
9
|
+
use std::io::{Cursor, Read};
|
|
10
|
+
use tar::Archive as TarArchive;
|
|
11
|
+
|
|
12
|
+
/// Extract metadata from a TAR archive.
|
|
13
|
+
///
|
|
14
|
+
/// # Arguments
|
|
15
|
+
///
|
|
16
|
+
/// * `bytes` - The TAR archive bytes (can be compressed with gzip or bzip2)
|
|
17
|
+
///
|
|
18
|
+
/// # Returns
|
|
19
|
+
///
|
|
20
|
+
/// Returns `ArchiveMetadata` containing:
|
|
21
|
+
/// - Format: "TAR"
|
|
22
|
+
/// - File list with paths, sizes, and directory flags
|
|
23
|
+
/// - Total file count
|
|
24
|
+
/// - Total uncompressed size
|
|
25
|
+
///
|
|
26
|
+
/// # Errors
|
|
27
|
+
///
|
|
28
|
+
/// Returns an error if the TAR archive cannot be read or parsed.
|
|
29
|
+
pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
30
|
+
let cursor = Cursor::new(bytes);
|
|
31
|
+
let mut archive = TarArchive::new(cursor);
|
|
32
|
+
|
|
33
|
+
let estimated_entries = bytes.len().saturating_div(512).max(16);
|
|
34
|
+
let mut file_list = Vec::with_capacity(estimated_entries);
|
|
35
|
+
let mut total_size = 0u64;
|
|
36
|
+
let mut file_count = 0;
|
|
37
|
+
|
|
38
|
+
let entries = archive
|
|
39
|
+
.entries()
|
|
40
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR archive: {}", e)))?;
|
|
41
|
+
|
|
42
|
+
for entry_result in entries {
|
|
43
|
+
let entry = entry_result.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry: {}", e)))?;
|
|
44
|
+
|
|
45
|
+
let path = entry
|
|
46
|
+
.path()
|
|
47
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry path: {}", e)))?
|
|
48
|
+
.to_string_lossy()
|
|
49
|
+
.to_string();
|
|
50
|
+
|
|
51
|
+
let size = entry.size();
|
|
52
|
+
let is_dir = entry.header().entry_type().is_dir();
|
|
53
|
+
|
|
54
|
+
if !is_dir {
|
|
55
|
+
total_size += size;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
file_count += 1;
|
|
59
|
+
file_list.push(ArchiveEntry { path, size, is_dir });
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
Ok(ArchiveMetadata {
|
|
63
|
+
format: "TAR".to_string(),
|
|
64
|
+
file_list,
|
|
65
|
+
file_count,
|
|
66
|
+
total_size,
|
|
67
|
+
})
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/// Extract text content from files within a TAR archive.
|
|
71
|
+
///
|
|
72
|
+
/// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log, .yaml, .toml
|
|
73
|
+
///
|
|
74
|
+
/// # Arguments
|
|
75
|
+
///
|
|
76
|
+
/// * `bytes` - The TAR archive bytes (can be compressed with gzip or bzip2)
|
|
77
|
+
///
|
|
78
|
+
/// # Returns
|
|
79
|
+
///
|
|
80
|
+
/// Returns a `HashMap` mapping file paths to their text content.
|
|
81
|
+
/// Binary files and files with non-text extensions are excluded.
|
|
82
|
+
///
|
|
83
|
+
/// # Errors
|
|
84
|
+
///
|
|
85
|
+
/// Returns an error if the TAR archive cannot be read or parsed.
|
|
86
|
+
pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
87
|
+
let cursor = Cursor::new(bytes);
|
|
88
|
+
let mut archive = TarArchive::new(cursor);
|
|
89
|
+
|
|
90
|
+
let estimated_text_files = bytes.len().saturating_div(1024 * 10).min(100);
|
|
91
|
+
let mut contents = HashMap::with_capacity(estimated_text_files.max(2));
|
|
92
|
+
|
|
93
|
+
let entries = archive
|
|
94
|
+
.entries()
|
|
95
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR archive: {}", e)))?;
|
|
96
|
+
|
|
97
|
+
for entry_result in entries {
|
|
98
|
+
let mut entry =
|
|
99
|
+
entry_result.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry: {}", e)))?;
|
|
100
|
+
|
|
101
|
+
let path = entry
|
|
102
|
+
.path()
|
|
103
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry path: {}", e)))?
|
|
104
|
+
.to_string_lossy()
|
|
105
|
+
.to_string();
|
|
106
|
+
|
|
107
|
+
if !entry.header().entry_type().is_dir() && TEXT_EXTENSIONS.iter().any(|ext| path.to_lowercase().ends_with(ext))
|
|
108
|
+
{
|
|
109
|
+
let estimated_size = (entry.size().min(10 * 1024 * 1024)) as usize;
|
|
110
|
+
let mut content = String::with_capacity(estimated_size);
|
|
111
|
+
if entry.read_to_string(&mut content).is_ok() {
|
|
112
|
+
contents.insert(path, content);
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
Ok(contents)
|
|
118
|
+
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
//! ZIP archive extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides functions for extracting metadata and text content from ZIP archives.
|
|
4
|
+
|
|
5
|
+
use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
|
|
6
|
+
use crate::error::{KreuzbergError, Result};
|
|
7
|
+
use std::collections::HashMap;
|
|
8
|
+
use std::io::{Cursor, Read};
|
|
9
|
+
use zip::ZipArchive;
|
|
10
|
+
|
|
11
|
+
/// Extract metadata from a ZIP archive.
|
|
12
|
+
///
|
|
13
|
+
/// # Arguments
|
|
14
|
+
///
|
|
15
|
+
/// * `bytes` - The ZIP archive bytes
|
|
16
|
+
///
|
|
17
|
+
/// # Returns
|
|
18
|
+
///
|
|
19
|
+
/// Returns `ArchiveMetadata` containing:
|
|
20
|
+
/// - Format: "ZIP"
|
|
21
|
+
/// - File list with paths, sizes, and directory flags
|
|
22
|
+
/// - Total file count
|
|
23
|
+
/// - Total uncompressed size
|
|
24
|
+
///
|
|
25
|
+
/// # Errors
|
|
26
|
+
///
|
|
27
|
+
/// Returns an error if the ZIP archive cannot be read or parsed.
|
|
28
|
+
pub fn extract_zip_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
29
|
+
let cursor = Cursor::new(bytes);
|
|
30
|
+
let mut archive =
|
|
31
|
+
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
|
|
32
|
+
|
|
33
|
+
let mut file_list = Vec::with_capacity(archive.len());
|
|
34
|
+
let mut total_size = 0u64;
|
|
35
|
+
|
|
36
|
+
for i in 0..archive.len() {
|
|
37
|
+
let file = archive
|
|
38
|
+
.by_index(i)
|
|
39
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP entry: {}", e)))?;
|
|
40
|
+
|
|
41
|
+
let path = file.name().to_string();
|
|
42
|
+
let size = file.size();
|
|
43
|
+
let is_dir = file.is_dir();
|
|
44
|
+
|
|
45
|
+
if !is_dir {
|
|
46
|
+
total_size += size;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
file_list.push(ArchiveEntry { path, size, is_dir });
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
Ok(ArchiveMetadata {
|
|
53
|
+
format: "ZIP".to_string(),
|
|
54
|
+
file_list,
|
|
55
|
+
file_count: archive.len(),
|
|
56
|
+
total_size,
|
|
57
|
+
})
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/// Extract text content from files within a ZIP archive.
|
|
61
|
+
///
|
|
62
|
+
/// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log, .yaml, .toml
|
|
63
|
+
///
|
|
64
|
+
/// # Arguments
|
|
65
|
+
///
|
|
66
|
+
/// * `bytes` - The ZIP archive bytes
|
|
67
|
+
///
|
|
68
|
+
/// # Returns
|
|
69
|
+
///
|
|
70
|
+
/// Returns a `HashMap` mapping file paths to their text content.
|
|
71
|
+
/// Binary files and files with non-text extensions are excluded.
|
|
72
|
+
///
|
|
73
|
+
/// # Errors
|
|
74
|
+
///
|
|
75
|
+
/// Returns an error if the ZIP archive cannot be read or parsed.
|
|
76
|
+
pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
77
|
+
let cursor = Cursor::new(bytes);
|
|
78
|
+
let mut archive =
|
|
79
|
+
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
|
|
80
|
+
|
|
81
|
+
let estimated_text_files = archive.len().saturating_mul(3).saturating_div(10).max(2);
|
|
82
|
+
let mut contents = HashMap::with_capacity(estimated_text_files);
|
|
83
|
+
|
|
84
|
+
for i in 0..archive.len() {
|
|
85
|
+
let mut file = archive
|
|
86
|
+
.by_index(i)
|
|
87
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP entry: {}", e)))?;
|
|
88
|
+
|
|
89
|
+
let path = file.name().to_string();
|
|
90
|
+
|
|
91
|
+
if !file.is_dir() && TEXT_EXTENSIONS.iter().any(|ext| path.to_lowercase().ends_with(ext)) {
|
|
92
|
+
let estimated_size = (file.size() as usize).min(10 * 1024 * 1024);
|
|
93
|
+
let mut content = String::with_capacity(estimated_size);
|
|
94
|
+
if file.read_to_string(&mut content).is_ok() {
|
|
95
|
+
contents.insert(path, content);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
Ok(contents)
|
|
101
|
+
}
|