kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
//! Chunk construction and building logic.
|
|
2
|
+
//!
|
|
3
|
+
//! This module handles the construction of individual chunks from text segments,
|
|
4
|
+
//! including overlap calculation, offset tracking, and metadata assembly.
|
|
5
|
+
|
|
6
|
+
use crate::error::{KreuzbergError, Result};
|
|
7
|
+
use crate::types::{Chunk, ChunkMetadata, PageBoundary};
|
|
8
|
+
use text_splitter::{Characters, ChunkCapacity, ChunkConfig};
|
|
9
|
+
|
|
10
|
+
use super::boundaries::calculate_page_range;
|
|
11
|
+
|
|
12
|
+
/// Build a ChunkConfig from chunking parameters.
|
|
13
|
+
///
|
|
14
|
+
/// # Arguments
|
|
15
|
+
///
|
|
16
|
+
/// * `max_characters` - Maximum characters per chunk
|
|
17
|
+
/// * `overlap` - Character overlap between consecutive chunks
|
|
18
|
+
/// * `trim` - Whether to trim whitespace from boundaries
|
|
19
|
+
///
|
|
20
|
+
/// # Returns
|
|
21
|
+
///
|
|
22
|
+
/// A configured ChunkConfig ready for use with text splitters.
|
|
23
|
+
///
|
|
24
|
+
/// # Errors
|
|
25
|
+
///
|
|
26
|
+
/// Returns `KreuzbergError::Validation` if configuration is invalid.
|
|
27
|
+
pub fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) -> Result<ChunkConfig<Characters>> {
|
|
28
|
+
ChunkConfig::new(ChunkCapacity::new(max_characters))
|
|
29
|
+
.with_overlap(overlap)
|
|
30
|
+
.map(|config| config.with_trim(trim))
|
|
31
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid chunking configuration: {}", e)))
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
/// Build chunks from text segments with optional page boundary tracking.
|
|
35
|
+
///
|
|
36
|
+
/// This function takes a collection of text segments (produced by a text splitter)
|
|
37
|
+
/// and constructs Chunk objects with proper metadata, including:
|
|
38
|
+
/// - Byte offsets accounting for overlap
|
|
39
|
+
/// - Chunk indices and total count
|
|
40
|
+
/// - Page boundary information (if provided)
|
|
41
|
+
///
|
|
42
|
+
/// # Arguments
|
|
43
|
+
///
|
|
44
|
+
/// * `text_chunks` - Iterator of text segments to convert into chunks
|
|
45
|
+
/// * `overlap` - Number of characters to overlap between chunks
|
|
46
|
+
/// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
|
|
47
|
+
///
|
|
48
|
+
/// # Returns
|
|
49
|
+
///
|
|
50
|
+
/// A vector of Chunk objects with complete metadata.
|
|
51
|
+
///
|
|
52
|
+
/// # Errors
|
|
53
|
+
///
|
|
54
|
+
/// Returns an error if page boundary calculation fails.
|
|
55
|
+
pub fn build_chunks<'a, I>(
|
|
56
|
+
text_chunks: I,
|
|
57
|
+
overlap: usize,
|
|
58
|
+
page_boundaries: Option<&[PageBoundary]>,
|
|
59
|
+
) -> Result<Vec<Chunk>>
|
|
60
|
+
where
|
|
61
|
+
I: IntoIterator<Item = &'a str>,
|
|
62
|
+
{
|
|
63
|
+
let chunks_vec: Vec<&str> = text_chunks.into_iter().collect();
|
|
64
|
+
let total_chunks = chunks_vec.len();
|
|
65
|
+
let mut byte_offset = 0;
|
|
66
|
+
let mut chunks = Vec::with_capacity(total_chunks);
|
|
67
|
+
|
|
68
|
+
for (index, chunk_text) in chunks_vec.into_iter().enumerate() {
|
|
69
|
+
let chunk = build_single_chunk(
|
|
70
|
+
chunk_text,
|
|
71
|
+
index,
|
|
72
|
+
total_chunks,
|
|
73
|
+
&mut byte_offset,
|
|
74
|
+
overlap,
|
|
75
|
+
page_boundaries,
|
|
76
|
+
)?;
|
|
77
|
+
chunks.push(chunk);
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
Ok(chunks)
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/// Build a single chunk with metadata.
|
|
84
|
+
///
|
|
85
|
+
/// # Arguments
|
|
86
|
+
///
|
|
87
|
+
/// * `chunk_text` - The text content for this chunk
|
|
88
|
+
/// * `index` - Zero-based index of this chunk
|
|
89
|
+
/// * `total_chunks` - Total number of chunks in the collection
|
|
90
|
+
/// * `byte_offset` - Mutable reference to current byte offset (will be updated)
|
|
91
|
+
/// * `overlap` - Number of characters to overlap between chunks
|
|
92
|
+
/// * `page_boundaries` - Optional page boundary markers
|
|
93
|
+
///
|
|
94
|
+
/// # Returns
|
|
95
|
+
///
|
|
96
|
+
/// A complete Chunk object with all metadata filled in.
|
|
97
|
+
///
|
|
98
|
+
/// # Errors
|
|
99
|
+
///
|
|
100
|
+
/// Returns an error if page boundary calculation fails.
|
|
101
|
+
fn build_single_chunk(
|
|
102
|
+
chunk_text: &str,
|
|
103
|
+
index: usize,
|
|
104
|
+
total_chunks: usize,
|
|
105
|
+
byte_offset: &mut usize,
|
|
106
|
+
overlap: usize,
|
|
107
|
+
page_boundaries: Option<&[PageBoundary]>,
|
|
108
|
+
) -> Result<Chunk> {
|
|
109
|
+
let byte_start = *byte_offset;
|
|
110
|
+
let chunk_length = chunk_text.len();
|
|
111
|
+
let byte_end = byte_start + chunk_length;
|
|
112
|
+
|
|
113
|
+
// Calculate overlap for next chunk (not applicable to last chunk)
|
|
114
|
+
let overlap_chars = if index < total_chunks - 1 {
|
|
115
|
+
overlap.min(chunk_length)
|
|
116
|
+
} else {
|
|
117
|
+
0
|
|
118
|
+
};
|
|
119
|
+
|
|
120
|
+
// Update offset for next chunk, accounting for overlap
|
|
121
|
+
*byte_offset = byte_end - overlap_chars;
|
|
122
|
+
|
|
123
|
+
// Calculate page range if boundaries are provided
|
|
124
|
+
let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
|
|
125
|
+
calculate_page_range(byte_start, byte_end, boundaries)?
|
|
126
|
+
} else {
|
|
127
|
+
(None, None)
|
|
128
|
+
};
|
|
129
|
+
|
|
130
|
+
Ok(Chunk {
|
|
131
|
+
content: chunk_text.to_string(),
|
|
132
|
+
embedding: None,
|
|
133
|
+
metadata: ChunkMetadata {
|
|
134
|
+
byte_start,
|
|
135
|
+
byte_end,
|
|
136
|
+
token_count: None,
|
|
137
|
+
chunk_index: index,
|
|
138
|
+
total_chunks,
|
|
139
|
+
first_page,
|
|
140
|
+
last_page,
|
|
141
|
+
},
|
|
142
|
+
})
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Collect `(byte_start, byte_end)` for every chunk, in order.
    fn byte_ranges(chunks: &[Chunk]) -> Vec<(usize, usize)> {
        chunks
            .iter()
            .map(|chunk| (chunk.metadata.byte_start, chunk.metadata.byte_end))
            .collect()
    }

    #[test]
    fn test_build_chunk_config_valid() {
        assert!(build_chunk_config(100, 10, true).is_ok());
    }

    #[test]
    fn test_build_chunk_config_invalid_overlap() {
        // An overlap larger than the capacity must surface as a validation error.
        let err = build_chunk_config(10, 20, true).err();
        assert!(matches!(err, Some(KreuzbergError::Validation { .. })));
    }

    #[test]
    fn test_build_chunks_empty() {
        let result = build_chunks(Vec::<&str>::new(), 5, None).unwrap();
        assert_eq!(result.len(), 0);
    }

    #[test]
    fn test_build_chunks_single() {
        let result = build_chunks(vec!["Single chunk"], 5, None).unwrap();
        assert_eq!(result.len(), 1);

        let only = &result[0];
        assert_eq!(only.content, "Single chunk");
        assert_eq!(only.metadata.chunk_index, 0);
        assert_eq!(only.metadata.total_chunks, 1);
        assert_eq!(byte_ranges(&result), vec![(0, 12)]);
    }

    #[test]
    fn test_build_chunks_multiple_with_overlap() {
        let segments = vec!["First chunk here", "Second chunk here", "Third chunk here"];
        let result = build_chunks(segments, 5, None).unwrap();
        assert_eq!(result.len(), 3);

        // First chunk
        assert_eq!(result[0].content, "First chunk here");
        let ranges = byte_ranges(&result);
        assert_eq!(ranges[0], (0, 16));

        // Each later chunk should start before its predecessor ends (overlap)
        for pair in ranges.windows(2) {
            assert!(pair[1].0 < pair[0].1);
        }
    }

    #[test]
    fn test_build_chunks_with_page_boundaries() {
        let page = |byte_start, byte_end, page_number| PageBoundary {
            byte_start,
            byte_end,
            page_number,
        };
        let boundaries = vec![page(0, 11, 1), page(11, 23, 2)];

        let result = build_chunks(vec!["First chunk", "Second chunk"], 0, Some(&boundaries[..])).unwrap();

        assert_eq!(result.len(), 2);
        assert_eq!(result[0].metadata.first_page, Some(1));
        assert_eq!(result[1].metadata.first_page, Some(2));
    }

    #[test]
    fn test_build_chunks_offset_tracking() {
        let result = build_chunks(vec!["AAAAA", "BBBBB", "CCCCC"], 2, None).unwrap();
        assert_eq!(result.len(), 3);

        // 0-5, then 3-8 and 6-11: each start is pulled back by the overlap of 2
        assert_eq!(byte_ranges(&result), vec![(0, 5), (3, 8), (6, 11)]);
    }

    #[test]
    fn test_build_single_chunk_metadata() {
        let mut offset = 0;
        let chunk = build_single_chunk("Test content", 0, 1, &mut offset, 5, None).unwrap();
        let meta = &chunk.metadata;

        assert_eq!(chunk.content, "Test content");
        assert_eq!(meta.byte_start, 0);
        assert_eq!(meta.byte_end, 12);
        assert_eq!(meta.chunk_index, 0);
        assert_eq!(meta.total_chunks, 1);
        assert_eq!(meta.first_page, None);
        assert_eq!(meta.last_page, None);
    }

    #[test]
    fn test_build_single_chunk_with_overlap() {
        let mut offset = 0;

        // First of two chunks: the offset is pulled back by the overlap.
        let first = build_single_chunk("0123456789", 0, 2, &mut offset, 3, None).unwrap();
        assert_eq!((first.metadata.byte_start, first.metadata.byte_end), (0, 10));
        assert_eq!(offset, 7); // 10 - 3 (overlap)

        // Last chunk: no overlap is subtracted.
        let second = build_single_chunk("ABCDEFGHIJ", 1, 2, &mut offset, 3, None).unwrap();
        assert_eq!((second.metadata.byte_start, second.metadata.byte_end), (7, 17));
        assert_eq!(offset, 17);
    }

    #[test]
    fn test_build_chunks_no_overlap() {
        let result = build_chunks(vec!["AAAAA", "BBBBB", "CCCCC"], 0, None).unwrap();
        assert_eq!(result.len(), 3);

        // Chunks should be contiguous with no overlap
        assert_eq!(byte_ranges(&result), vec![(0, 5), (5, 10), (10, 15)]);
    }
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
//! Configuration types for text chunking.
|
|
2
|
+
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
|
|
5
|
+
/// Configuration options for text chunking operations.
|
|
6
|
+
///
|
|
7
|
+
/// # Fields
|
|
8
|
+
///
|
|
9
|
+
/// * `max_characters` - Maximum number of characters per chunk (default: 2000)
|
|
10
|
+
/// * `overlap` - Number of characters to overlap between consecutive chunks (default: 100)
|
|
11
|
+
/// * `trim` - Whether to trim whitespace from chunk boundaries (default: true)
|
|
12
|
+
/// * `chunker_type` - Type of chunker to use (Text or Markdown) (default: Text)
|
|
13
|
+
pub struct ChunkingConfig {
|
|
14
|
+
pub max_characters: usize,
|
|
15
|
+
pub overlap: usize,
|
|
16
|
+
pub trim: bool,
|
|
17
|
+
pub chunker_type: ChunkerType,
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
impl Default for ChunkingConfig {
|
|
21
|
+
fn default() -> Self {
|
|
22
|
+
Self {
|
|
23
|
+
max_characters: 2000,
|
|
24
|
+
overlap: 100,
|
|
25
|
+
trim: true,
|
|
26
|
+
chunker_type: ChunkerType::Text,
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/// Type of text chunker to use.
|
|
32
|
+
///
|
|
33
|
+
/// # Variants
|
|
34
|
+
///
|
|
35
|
+
/// * `Text` - Generic text splitter, splits on whitespace and punctuation
|
|
36
|
+
/// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
|
|
37
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
38
|
+
pub enum ChunkerType {
|
|
39
|
+
Text,
|
|
40
|
+
Markdown,
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/// Result of a text chunking operation.
|
|
44
|
+
///
|
|
45
|
+
/// Contains the generated chunks and metadata about the chunking.
|
|
46
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
47
|
+
pub struct ChunkingResult {
|
|
48
|
+
/// List of text chunks
|
|
49
|
+
pub chunks: Vec<crate::types::Chunk>,
|
|
50
|
+
/// Total number of chunks generated
|
|
51
|
+
pub chunk_count: usize,
|
|
52
|
+
}
|