kreuzberg 4.0.7 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +24 -16
- data/README.md +4 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,1017 @@
|
|
|
1
|
+
//! Core text chunking logic and public API.
|
|
2
|
+
//!
|
|
3
|
+
//! This module implements the main chunking algorithms and provides the primary
|
|
4
|
+
//! public API functions for splitting text into chunks.
|
|
5
|
+
|
|
6
|
+
use crate::error::Result;
|
|
7
|
+
use crate::types::PageBoundary;
|
|
8
|
+
use text_splitter::{MarkdownSplitter, TextSplitter};
|
|
9
|
+
|
|
10
|
+
use super::builder::{build_chunk_config, build_chunks};
|
|
11
|
+
use super::config::{ChunkerType, ChunkingConfig, ChunkingResult};
|
|
12
|
+
use super::validation::validate_utf8_boundaries;
|
|
13
|
+
|
|
14
|
+
/// Split text into chunks with optional page boundary tracking.
|
|
15
|
+
///
|
|
16
|
+
/// This is the primary API function for chunking text. It supports both plain text
|
|
17
|
+
/// and Markdown with configurable chunk size, overlap, and page boundary mapping.
|
|
18
|
+
///
|
|
19
|
+
/// # Arguments
|
|
20
|
+
///
|
|
21
|
+
/// * `text` - The text to split into chunks
|
|
22
|
+
/// * `config` - Chunking configuration (max size, overlap, type)
|
|
23
|
+
/// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
|
|
24
|
+
///
|
|
25
|
+
/// # Returns
|
|
26
|
+
///
|
|
27
|
+
/// A ChunkingResult containing all chunks and their metadata.
|
|
28
|
+
///
|
|
29
|
+
/// # Examples
|
|
30
|
+
///
|
|
31
|
+
/// ```rust
|
|
32
|
+
/// use kreuzberg::chunking::{chunk_text, ChunkingConfig, ChunkerType};
|
|
33
|
+
///
|
|
34
|
+
/// # fn example() -> kreuzberg::Result<()> {
|
|
35
|
+
/// let config = ChunkingConfig {
|
|
36
|
+
/// max_characters: 500,
|
|
37
|
+
/// overlap: 50,
|
|
38
|
+
/// trim: true,
|
|
39
|
+
/// chunker_type: ChunkerType::Text,
|
|
40
|
+
/// };
|
|
41
|
+
/// let result = chunk_text("Long text...", &config, None)?;
|
|
42
|
+
/// assert!(!result.chunks.is_empty());
|
|
43
|
+
/// # Ok(())
|
|
44
|
+
/// # }
|
|
45
|
+
/// ```
|
|
46
|
+
pub fn chunk_text(
|
|
47
|
+
text: &str,
|
|
48
|
+
config: &ChunkingConfig,
|
|
49
|
+
page_boundaries: Option<&[PageBoundary]>,
|
|
50
|
+
) -> Result<ChunkingResult> {
|
|
51
|
+
if text.is_empty() {
|
|
52
|
+
return Ok(ChunkingResult {
|
|
53
|
+
chunks: vec![],
|
|
54
|
+
chunk_count: 0,
|
|
55
|
+
});
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
if let Some(boundaries) = page_boundaries {
|
|
59
|
+
validate_utf8_boundaries(text, boundaries)?;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
let chunk_config = build_chunk_config(config.max_characters, config.overlap, config.trim)?;
|
|
63
|
+
|
|
64
|
+
let text_chunks: Vec<&str> = match config.chunker_type {
|
|
65
|
+
ChunkerType::Text => {
|
|
66
|
+
let splitter = TextSplitter::new(chunk_config);
|
|
67
|
+
splitter.chunks(text).collect()
|
|
68
|
+
}
|
|
69
|
+
ChunkerType::Markdown => {
|
|
70
|
+
let splitter = MarkdownSplitter::new(chunk_config);
|
|
71
|
+
splitter.chunks(text).collect()
|
|
72
|
+
}
|
|
73
|
+
};
|
|
74
|
+
|
|
75
|
+
let chunks = build_chunks(text_chunks.into_iter(), config.overlap, page_boundaries)?;
|
|
76
|
+
let chunk_count = chunks.len();
|
|
77
|
+
|
|
78
|
+
Ok(ChunkingResult { chunks, chunk_count })
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// Chunk text with explicit type specification.
|
|
82
|
+
///
|
|
83
|
+
/// This is a convenience function that constructs a ChunkingConfig from individual
|
|
84
|
+
/// parameters and calls `chunk_text`.
|
|
85
|
+
///
|
|
86
|
+
/// # Arguments
|
|
87
|
+
///
|
|
88
|
+
/// * `text` - The text to split into chunks
|
|
89
|
+
/// * `max_characters` - Maximum characters per chunk
|
|
90
|
+
/// * `overlap` - Character overlap between consecutive chunks
|
|
91
|
+
/// * `trim` - Whether to trim whitespace from boundaries
|
|
92
|
+
/// * `chunker_type` - Type of chunker to use (Text or Markdown)
|
|
93
|
+
///
|
|
94
|
+
/// # Returns
|
|
95
|
+
///
|
|
96
|
+
/// A ChunkingResult containing all chunks and their metadata.
|
|
97
|
+
///
|
|
98
|
+
/// # Examples
|
|
99
|
+
///
|
|
100
|
+
/// ```rust
|
|
101
|
+
/// use kreuzberg::chunking::{chunk_text_with_type, ChunkerType};
|
|
102
|
+
///
|
|
103
|
+
/// # fn example() -> kreuzberg::Result<()> {
|
|
104
|
+
/// let result = chunk_text_with_type("Some text", 500, 50, true, ChunkerType::Text)?;
|
|
105
|
+
/// assert!(!result.chunks.is_empty());
|
|
106
|
+
/// # Ok(())
|
|
107
|
+
/// # }
|
|
108
|
+
/// ```
|
|
109
|
+
pub fn chunk_text_with_type(
|
|
110
|
+
text: &str,
|
|
111
|
+
max_characters: usize,
|
|
112
|
+
overlap: usize,
|
|
113
|
+
trim: bool,
|
|
114
|
+
chunker_type: ChunkerType,
|
|
115
|
+
) -> Result<ChunkingResult> {
|
|
116
|
+
let config = ChunkingConfig {
|
|
117
|
+
max_characters,
|
|
118
|
+
overlap,
|
|
119
|
+
trim,
|
|
120
|
+
chunker_type,
|
|
121
|
+
};
|
|
122
|
+
chunk_text(text, &config, None)
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
/// Batch process multiple texts with the same configuration.
|
|
126
|
+
///
|
|
127
|
+
/// This convenience function applies the same chunking configuration to multiple
|
|
128
|
+
/// texts in sequence.
|
|
129
|
+
///
|
|
130
|
+
/// # Arguments
|
|
131
|
+
///
|
|
132
|
+
/// * `texts` - Slice of text strings to chunk
|
|
133
|
+
/// * `config` - Chunking configuration to apply to all texts
|
|
134
|
+
///
|
|
135
|
+
/// # Returns
|
|
136
|
+
///
|
|
137
|
+
/// A vector of ChunkingResult objects, one per input text.
|
|
138
|
+
///
|
|
139
|
+
/// # Errors
|
|
140
|
+
///
|
|
141
|
+
/// Returns an error if chunking any individual text fails.
|
|
142
|
+
///
|
|
143
|
+
/// # Examples
|
|
144
|
+
///
|
|
145
|
+
/// ```rust
|
|
146
|
+
/// use kreuzberg::chunking::{chunk_texts_batch, ChunkingConfig};
|
|
147
|
+
///
|
|
148
|
+
/// # fn example() -> kreuzberg::Result<()> {
|
|
149
|
+
/// let config = ChunkingConfig::default();
|
|
150
|
+
/// let texts = vec!["First text", "Second text"];
|
|
151
|
+
/// let results = chunk_texts_batch(&texts, &config)?;
|
|
152
|
+
/// assert_eq!(results.len(), 2);
|
|
153
|
+
/// # Ok(())
|
|
154
|
+
/// # }
|
|
155
|
+
/// ```
|
|
156
|
+
pub fn chunk_texts_batch(texts: &[&str], config: &ChunkingConfig) -> Result<Vec<ChunkingResult>> {
|
|
157
|
+
texts.iter().map(|text| chunk_text(text, config, None)).collect()
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
#[cfg(test)]
|
|
161
|
+
mod tests {
|
|
162
|
+
use super::*;
|
|
163
|
+
use crate::KreuzbergError;
|
|
164
|
+
|
|
165
|
+
#[test]
|
|
166
|
+
fn test_chunk_empty_text() {
|
|
167
|
+
let config = ChunkingConfig::default();
|
|
168
|
+
let result = chunk_text("", &config, None).unwrap();
|
|
169
|
+
assert_eq!(result.chunks.len(), 0);
|
|
170
|
+
assert_eq!(result.chunk_count, 0);
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
#[test]
|
|
174
|
+
fn test_chunk_short_text_single_chunk() {
|
|
175
|
+
let config = ChunkingConfig {
|
|
176
|
+
max_characters: 100,
|
|
177
|
+
overlap: 10,
|
|
178
|
+
trim: true,
|
|
179
|
+
chunker_type: ChunkerType::Text,
|
|
180
|
+
};
|
|
181
|
+
let text = "This is a short text.";
|
|
182
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
183
|
+
assert_eq!(result.chunks.len(), 1);
|
|
184
|
+
assert_eq!(result.chunk_count, 1);
|
|
185
|
+
assert_eq!(result.chunks[0].content, text);
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
#[test]
|
|
189
|
+
fn test_chunk_long_text_multiple_chunks() {
|
|
190
|
+
let config = ChunkingConfig {
|
|
191
|
+
max_characters: 20,
|
|
192
|
+
overlap: 5,
|
|
193
|
+
trim: true,
|
|
194
|
+
chunker_type: ChunkerType::Text,
|
|
195
|
+
};
|
|
196
|
+
let text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
|
197
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
198
|
+
assert!(result.chunk_count >= 2);
|
|
199
|
+
assert_eq!(result.chunks.len(), result.chunk_count);
|
|
200
|
+
assert!(result.chunks.iter().all(|chunk| chunk.content.len() <= 20));
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
#[test]
|
|
204
|
+
fn test_chunk_text_with_overlap() {
|
|
205
|
+
let config = ChunkingConfig {
|
|
206
|
+
max_characters: 20,
|
|
207
|
+
overlap: 5,
|
|
208
|
+
trim: true,
|
|
209
|
+
chunker_type: ChunkerType::Text,
|
|
210
|
+
};
|
|
211
|
+
let text = "abcdefghijklmnopqrstuvwxyz0123456789";
|
|
212
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
213
|
+
assert!(result.chunk_count >= 2);
|
|
214
|
+
|
|
215
|
+
if result.chunks.len() >= 2 {
|
|
216
|
+
let first_chunk_end = &result.chunks[0].content[result.chunks[0].content.len().saturating_sub(5)..];
|
|
217
|
+
assert!(
|
|
218
|
+
result.chunks[1].content.starts_with(first_chunk_end),
|
|
219
|
+
"Expected overlap '{}' at start of second chunk '{}'",
|
|
220
|
+
first_chunk_end,
|
|
221
|
+
result.chunks[1].content
|
|
222
|
+
);
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
#[test]
|
|
227
|
+
fn test_chunk_markdown_preserves_structure() {
|
|
228
|
+
let config = ChunkingConfig {
|
|
229
|
+
max_characters: 50,
|
|
230
|
+
overlap: 10,
|
|
231
|
+
trim: true,
|
|
232
|
+
chunker_type: ChunkerType::Markdown,
|
|
233
|
+
};
|
|
234
|
+
let markdown = "# Title\n\nParagraph one.\n\n## Section\n\nParagraph two.";
|
|
235
|
+
let result = chunk_text(markdown, &config, None).unwrap();
|
|
236
|
+
assert!(result.chunk_count >= 1);
|
|
237
|
+
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("# Title")));
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
#[test]
|
|
241
|
+
fn test_chunk_markdown_with_code_blocks() {
|
|
242
|
+
let config = ChunkingConfig {
|
|
243
|
+
max_characters: 100,
|
|
244
|
+
overlap: 10,
|
|
245
|
+
trim: true,
|
|
246
|
+
chunker_type: ChunkerType::Markdown,
|
|
247
|
+
};
|
|
248
|
+
let markdown = "# Code Example\n\n```python\nprint('hello')\n```\n\nSome text after code.";
|
|
249
|
+
let result = chunk_text(markdown, &config, None).unwrap();
|
|
250
|
+
assert!(result.chunk_count >= 1);
|
|
251
|
+
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("```")));
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
#[test]
|
|
255
|
+
fn test_chunk_markdown_with_links() {
|
|
256
|
+
let config = ChunkingConfig {
|
|
257
|
+
max_characters: 80,
|
|
258
|
+
overlap: 10,
|
|
259
|
+
trim: true,
|
|
260
|
+
chunker_type: ChunkerType::Markdown,
|
|
261
|
+
};
|
|
262
|
+
let markdown = "Check out [this link](https://example.com) for more info.";
|
|
263
|
+
let result = chunk_text(markdown, &config, None).unwrap();
|
|
264
|
+
assert_eq!(result.chunk_count, 1);
|
|
265
|
+
assert!(result.chunks[0].content.contains("[this link]"));
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
#[test]
|
|
269
|
+
fn test_chunk_text_with_trim() {
|
|
270
|
+
let config = ChunkingConfig {
|
|
271
|
+
max_characters: 30,
|
|
272
|
+
overlap: 5,
|
|
273
|
+
trim: true,
|
|
274
|
+
chunker_type: ChunkerType::Text,
|
|
275
|
+
};
|
|
276
|
+
let text = " Leading and trailing spaces should be trimmed ";
|
|
277
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
278
|
+
assert!(result.chunk_count >= 1);
|
|
279
|
+
assert!(result.chunks.iter().all(|chunk| !chunk.content.starts_with(' ')));
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
#[test]
|
|
283
|
+
fn test_chunk_text_without_trim() {
|
|
284
|
+
let config = ChunkingConfig {
|
|
285
|
+
max_characters: 30,
|
|
286
|
+
overlap: 5,
|
|
287
|
+
trim: false,
|
|
288
|
+
chunker_type: ChunkerType::Text,
|
|
289
|
+
};
|
|
290
|
+
let text = " Text with spaces ";
|
|
291
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
292
|
+
assert_eq!(result.chunk_count, 1);
|
|
293
|
+
assert!(result.chunks[0].content.starts_with(' ') || result.chunks[0].content.len() < text.len());
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
#[test]
|
|
297
|
+
fn test_chunk_with_invalid_overlap() {
|
|
298
|
+
let config = ChunkingConfig {
|
|
299
|
+
max_characters: 10,
|
|
300
|
+
overlap: 20,
|
|
301
|
+
trim: true,
|
|
302
|
+
chunker_type: ChunkerType::Text,
|
|
303
|
+
};
|
|
304
|
+
let result = chunk_text("Some text", &config, None);
|
|
305
|
+
assert!(result.is_err());
|
|
306
|
+
let err = result.unwrap_err();
|
|
307
|
+
assert!(matches!(err, KreuzbergError::Validation { .. }));
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
#[test]
|
|
311
|
+
fn test_chunk_text_with_type_text() {
|
|
312
|
+
let result = chunk_text_with_type("Simple text", 50, 10, true, ChunkerType::Text).unwrap();
|
|
313
|
+
assert_eq!(result.chunk_count, 1);
|
|
314
|
+
assert_eq!(result.chunks[0].content, "Simple text");
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
#[test]
|
|
318
|
+
fn test_chunk_text_with_type_markdown() {
|
|
319
|
+
let markdown = "# Header\n\nContent here.";
|
|
320
|
+
let result = chunk_text_with_type(markdown, 50, 10, true, ChunkerType::Markdown).unwrap();
|
|
321
|
+
assert_eq!(result.chunk_count, 1);
|
|
322
|
+
assert!(result.chunks[0].content.contains("# Header"));
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
#[test]
|
|
326
|
+
fn test_chunk_texts_batch_empty() {
|
|
327
|
+
let config = ChunkingConfig::default();
|
|
328
|
+
let texts: Vec<&str> = vec![];
|
|
329
|
+
let results = chunk_texts_batch(&texts, &config).unwrap();
|
|
330
|
+
assert_eq!(results.len(), 0);
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
/// Batch chunking returns one result per input, each with at least one chunk.
#[test]
fn test_chunk_texts_batch_multiple() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let inputs = vec!["First text", "Second text", "Third text"];
    let outcomes = chunk_texts_batch(&inputs, &cfg).unwrap();
    assert_eq!(outcomes.len(), 3);
    for outcome in &outcomes {
        assert!(outcome.chunk_count >= 1);
    }
}
|
|
346
|
+
|
|
347
|
+
/// Batch chunking handles short, long, and empty inputs in one call:
/// short → 1 chunk, long → several chunks, empty → 0 chunks.
#[test]
fn test_chunk_texts_batch_mixed_lengths() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 20,
        overlap: 5,
    };
    let inputs = vec![
        "Short",
        "This is a longer text that should be split into multiple chunks",
        "",
    ];
    let outcomes = chunk_texts_batch(&inputs, &cfg).unwrap();
    assert_eq!(outcomes.len(), 3);
    assert_eq!(outcomes[0].chunk_count, 1);
    assert!(outcomes[1].chunk_count > 1);
    assert_eq!(outcomes[2].chunk_count, 0);
}
|
|
366
|
+
|
|
367
|
+
/// An invalid config (overlap > max_characters) fails the whole batch call.
#[test]
fn test_chunk_texts_batch_error_propagation() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 10,
        overlap: 20,
    };
    let inputs = vec!["Text one", "Text two"];
    let outcome = chunk_texts_batch(&inputs, &cfg);
    assert!(outcome.is_err());
}
|
|
379
|
+
|
|
380
|
+
/// Pins the documented defaults of `ChunkingConfig`.
#[test]
fn test_chunking_config_default() {
    let defaults = ChunkingConfig::default();
    assert_eq!(defaults.max_characters, 2000);
    assert_eq!(defaults.overlap, 100);
    assert!(defaults.trim);
    assert_eq!(defaults.chunker_type, ChunkerType::Text);
}
|
|
388
|
+
|
|
389
|
+
/// A 1000-char input with max 100 chars/chunk produces at least 10 chunks,
/// none exceeding the configured size.
#[test]
fn test_chunk_very_long_text() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 100,
        overlap: 20,
    };
    let sample = "a".repeat(1000);
    let outcome = chunk_text(&sample, &cfg, None).unwrap();
    assert!(outcome.chunk_count >= 10);
    for chunk in &outcome.chunks {
        assert!(chunk.content.len() <= 100);
    }
}
|
|
402
|
+
|
|
403
|
+
/// Newline-separated input chunks without error.
#[test]
fn test_chunk_text_with_newlines() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let sample = "Line one\nLine two\nLine three\nLine four\nLine five";
    let outcome = chunk_text(sample, &cfg, None).unwrap();
    assert!(outcome.chunk_count >= 1);
}
|
|
415
|
+
|
|
416
|
+
/// Markdown list items survive chunking with the Markdown chunker.
#[test]
fn test_chunk_markdown_with_lists() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Markdown,
        trim: true,
        max_characters: 100,
        overlap: 10,
    };
    let doc = "# List Example\n\n- Item 1\n- Item 2\n- Item 3\n\nMore text.";
    let outcome = chunk_text(doc, &cfg, None).unwrap();
    assert!(outcome.chunk_count >= 1);
    let has_list_item = outcome.chunks.iter().any(|c| c.content.contains("- Item"));
    assert!(has_list_item);
}
|
|
429
|
+
|
|
430
|
+
/// Markdown table syntax (pipe characters) survives chunking.
#[test]
fn test_chunk_markdown_with_tables() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Markdown,
        trim: true,
        max_characters: 150,
        overlap: 10,
    };
    let doc = "# Table\n\n| Col1 | Col2 |\n|------|------|\n| A | B |\n| C | D |";
    let outcome = chunk_text(doc, &cfg, None).unwrap();
    assert!(outcome.chunk_count >= 1);
    let has_pipe = outcome.chunks.iter().any(|c| c.content.contains("|"));
    assert!(has_pipe);
}
|
|
443
|
+
|
|
444
|
+
/// Punctuation-heavy input is chunked verbatim without mangling.
#[test]
fn test_chunk_special_characters() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 50,
        overlap: 5,
    };
    let sample = "Special chars: @#$%^&*()[]{}|\\<>?/~`";
    let outcome = chunk_text(sample, &cfg, None).unwrap();
    assert_eq!(outcome.chunk_count, 1);
    assert!(outcome.chunks[0].content.contains("@#$%"));
}
|
|
457
|
+
|
|
458
|
+
/// Multi-byte characters (CJK, emoji, accents) are preserved through chunking.
#[test]
fn test_chunk_unicode_characters() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 50,
        overlap: 5,
    };
    let sample = "Unicode: 你好世界 🌍 café résumé";
    let outcome = chunk_text(sample, &cfg, None).unwrap();
    assert_eq!(outcome.chunk_count, 1);
    let body = &outcome.chunks[0].content;
    assert!(body.contains("你好"));
    assert!(body.contains("🌍"));
}
|
|
472
|
+
|
|
473
|
+
/// Pure Japanese input chunks without error under a tight size limit.
#[test]
fn test_chunk_cjk_text() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let sample = "日本語のテキストです。これは長い文章で、複数のチャンクに分割されるべきです。";
    let outcome = chunk_text(sample, &cfg, None).unwrap();
    assert!(outcome.chunk_count >= 1);
}
|
|
485
|
+
|
|
486
|
+
/// Input mixing Latin, CJK, and accented scripts chunks without error.
#[test]
fn test_chunk_mixed_languages() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 40,
        overlap: 5,
    };
    let sample = "English text mixed with 中文文本 and some français";
    let outcome = chunk_text(sample, &cfg, None).unwrap();
    assert!(outcome.chunk_count >= 1);
}
|
|
498
|
+
|
|
499
|
+
/// With a positive overlap: each chunk's byte span matches its content length,
/// index metadata is consistent, and consecutive chunks' byte ranges intersect
/// by at most `overlap` (plus a small semantic-boundary tolerance).
#[test]
fn test_chunk_offset_calculation_with_overlap() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: false,
        max_characters: 20,
        overlap: 5,
    };
    let sample = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
    let outcome = chunk_text(sample, &cfg, None).unwrap();

    assert!(outcome.chunks.len() >= 2, "Expected at least 2 chunks");

    for (i, chunk) in outcome.chunks.iter().enumerate() {
        let meta = &chunk.metadata;

        // Byte span must exactly cover the chunk's content.
        assert_eq!(
            meta.byte_end - meta.byte_start,
            chunk.content.len(),
            "Chunk {} offset range doesn't match content length",
            i
        );

        assert_eq!(meta.chunk_index, i);
        assert_eq!(meta.total_chunks, outcome.chunks.len());
    }

    for (i, pair) in outcome.chunks.windows(2).enumerate() {
        let (current, next) = (&pair[0], &pair[1]);

        assert!(
            next.metadata.byte_start < current.metadata.byte_end,
            "Chunk {} and {} don't overlap: next starts at {} but current ends at {}",
            i,
            i + 1,
            next.metadata.byte_start,
            current.metadata.byte_end
        );

        let overlap_size = current.metadata.byte_end - next.metadata.byte_start;
        assert!(
            overlap_size <= cfg.overlap + 10,
            "Overlap between chunks {} and {} is too large: {}",
            i,
            i + 1,
            overlap_size
        );
    }
}
|
|
550
|
+
|
|
551
|
+
/// With `overlap: 0`, consecutive chunks must not share any bytes.
///
/// Fix: the previous loop bound `0..result.chunks.len() - 1` underflows
/// `usize` and panics with an unrelated message if the chunker ever returns
/// zero chunks; guard explicitly and iterate adjacent pairs via `windows(2)`.
#[test]
fn test_chunk_offset_calculation_without_overlap() {
    let config = ChunkingConfig {
        max_characters: 20,
        overlap: 0,
        trim: false,
        chunker_type: ChunkerType::Text,
    };
    let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
    let result = chunk_text(text, &config, None).unwrap();

    // Explicit guard so an empty result fails with a clear message instead of
    // a subtraction-overflow panic in the loop bound below.
    assert!(!result.chunks.is_empty(), "Expected at least one chunk");

    for (i, pair) in result.chunks.windows(2).enumerate() {
        let (current_chunk, next_chunk) = (&pair[0], &pair[1]);

        assert!(
            next_chunk.metadata.byte_start >= current_chunk.metadata.byte_end,
            "Chunk {} and {} overlap when they shouldn't: next starts at {} but current ends at {}",
            i,
            i + 1,
            next_chunk.metadata.byte_start,
            current_chunk.metadata.byte_end
        );
    }
}
|
|
576
|
+
|
|
577
|
+
/// The first chunk starts at byte 0 and consecutive chunks leave no gaps,
/// so the chunk spans collectively cover the whole input.
#[test]
fn test_chunk_offset_covers_full_text() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: false,
        max_characters: 15,
        overlap: 3,
    };
    let sample = "0123456789 ABCDEFGHIJ KLMNOPQRST UVWXYZ";
    let outcome = chunk_text(sample, &cfg, None).unwrap();

    assert!(outcome.chunks.len() >= 2, "Expected multiple chunks");

    assert_eq!(
        outcome.chunks[0].metadata.byte_start, 0,
        "First chunk should start at position 0"
    );

    for (i, pair) in outcome.chunks.windows(2).enumerate() {
        let (current, next) = (&pair[0], &pair[1]);
        assert!(
            next.metadata.byte_start <= current.metadata.byte_end,
            "Gap detected between chunk {} (ends at {}) and chunk {} (starts at {})",
            i,
            current.metadata.byte_end,
            i + 1,
            next.metadata.byte_start
        );
    }
}
|
|
609
|
+
|
|
610
|
+
/// Chunk byte offsets stay well-formed across several overlap settings:
/// every chunk's byte range is non-empty and its start offset lies inside
/// the source text.
///
/// Fix: the out-of-bounds assertion message previously said "char_start"
/// while the check is on `byte_start` — a misleading diagnostic on failure.
#[test]
fn test_chunk_offset_with_various_overlap_sizes() {
    for overlap in [0, 5, 10, 20] {
        let config = ChunkingConfig {
            max_characters: 30,
            overlap,
            trim: false,
            chunker_type: ChunkerType::Text,
        };
        let text = "Word ".repeat(30);
        let result = chunk_text(&text, &config, None).unwrap();

        for chunk in &result.chunks {
            assert!(
                chunk.metadata.byte_end > chunk.metadata.byte_start,
                "Invalid offset range for overlap {}: start={}, end={}",
                overlap,
                chunk.metadata.byte_start,
                chunk.metadata.byte_end
            );
        }

        for chunk in &result.chunks {
            assert!(
                chunk.metadata.byte_start < text.len(),
                "byte_start with overlap {} is out of bounds: {}",
                overlap,
                chunk.metadata.byte_start
            );
        }
    }
}
|
|
642
|
+
|
|
643
|
+
/// The final chunk overlaps its predecessor and reaches (or nearly reaches)
/// the end of the input text.
#[test]
fn test_chunk_last_chunk_offset() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: false,
        max_characters: 20,
        overlap: 5,
    };
    let sample = "AAAAA BBBBB CCCCC DDDDD EEEEE";
    let outcome = chunk_text(sample, &cfg, None).unwrap();

    assert!(outcome.chunks.len() >= 2, "Need multiple chunks for this test");

    let n = outcome.chunks.len();
    let penultimate = &outcome.chunks[n - 2];
    let last = &outcome.chunks[n - 1];

    assert!(
        last.metadata.byte_start < penultimate.metadata.byte_end,
        "Last chunk should overlap with previous chunk"
    );

    // Either the trimmed content matches the whole text, or the byte span
    // reaches within a few bytes of the end.
    let covers_end =
        last.content.trim_end() == sample.trim_end() || last.metadata.byte_end >= sample.len() - 5;
    assert!(covers_end, "Last chunk should cover the end of the text");
}
|
|
669
|
+
|
|
670
|
+
/// With two page boundaries supplied, the first chunk maps to page 1 and the
/// last chunk maps to page 2.
#[test]
fn test_chunk_with_page_boundaries() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let sample = "Page one content here. Page two starts here and continues.";

    let pages = vec![
        PageBoundary { byte_start: 0, byte_end: 21, page_number: 1 },
        PageBoundary { byte_start: 22, byte_end: 58, page_number: 2 },
    ];

    let outcome = chunk_text(sample, &cfg, Some(&pages)).unwrap();
    assert!(outcome.chunks.len() >= 2);

    assert_eq!(outcome.chunks[0].metadata.first_page, Some(1));

    let final_chunk = outcome.chunks.last().unwrap();
    assert_eq!(final_chunk.metadata.last_page, Some(2));
}
|
|
701
|
+
|
|
702
|
+
/// Without page boundaries, no chunk carries page metadata.
#[test]
fn test_chunk_without_page_boundaries() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let sample = "This is some test content that should be split into multiple chunks.";

    let outcome = chunk_text(sample, &cfg, None).unwrap();
    assert!(outcome.chunks.len() >= 2);

    for chunk in &outcome.chunks {
        assert_eq!(chunk.metadata.first_page, None);
        assert_eq!(chunk.metadata.last_page, None);
    }
}
|
|
720
|
+
|
|
721
|
+
/// An empty boundary list behaves like no boundaries: no page metadata is set.
#[test]
fn test_chunk_empty_boundaries() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let sample = "Some text content here.";
    let pages: Vec<PageBoundary> = Vec::new();

    let outcome = chunk_text(sample, &cfg, Some(&pages)).unwrap();
    assert_eq!(outcome.chunks.len(), 1);

    let only = &outcome.chunks[0];
    assert_eq!(only.metadata.first_page, None);
    assert_eq!(only.metadata.last_page, None);
}
|
|
738
|
+
|
|
739
|
+
/// With three contiguous pages, every chunk resolves to at least one page.
#[test]
fn test_chunk_spanning_multiple_pages() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: false,
        max_characters: 50,
        overlap: 5,
    };
    let sample = "0123456789 AAAAAAAAAA 1111111111 BBBBBBBBBB 2222222222";

    let pages = vec![
        PageBoundary { byte_start: 0, byte_end: 20, page_number: 1 },
        PageBoundary { byte_start: 20, byte_end: 40, page_number: 2 },
        PageBoundary { byte_start: 40, byte_end: 54, page_number: 3 },
    ];

    let outcome = chunk_text(sample, &cfg, Some(&pages)).unwrap();
    assert!(outcome.chunks.len() >= 2);

    for chunk in &outcome.chunks {
        let meta = &chunk.metadata;
        assert!(meta.first_page.is_some() || meta.last_page.is_some());
    }
}
|
|
774
|
+
|
|
775
|
+
/// A boundary whose start exceeds its end must be rejected with a descriptive error.
#[test]
fn test_chunk_text_with_invalid_boundary_range() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let sample = "Page one content here. Page two content.";

    // byte_start > byte_end: deliberately inverted range.
    let pages = vec![PageBoundary { byte_start: 10, byte_end: 5, page_number: 1 }];

    let outcome = chunk_text(sample, &cfg, Some(&pages));
    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("Invalid boundary range"));
    assert!(message.contains("byte_start"));
}
|
|
797
|
+
|
|
798
|
+
/// Boundaries supplied out of ascending order must be rejected.
#[test]
fn test_chunk_text_with_unsorted_boundaries() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let sample = "Page one content here. Page two content.";

    // Page 2 deliberately listed before page 1.
    let pages = vec![
        PageBoundary { byte_start: 22, byte_end: 40, page_number: 2 },
        PageBoundary { byte_start: 0, byte_end: 21, page_number: 1 },
    ];

    let outcome = chunk_text(sample, &cfg, Some(&pages));
    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("not sorted"));
    assert!(message.contains("boundaries"));
}
|
|
827
|
+
|
|
828
|
+
/// Boundaries whose byte ranges intersect must be rejected.
#[test]
fn test_chunk_text_with_overlapping_boundaries() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let sample = "Page one content here. Page two content.";

    // Page 1 ends at 25 while page 2 starts at 20: ranges intersect.
    let pages = vec![
        PageBoundary { byte_start: 0, byte_end: 25, page_number: 1 },
        PageBoundary { byte_start: 20, byte_end: 40, page_number: 2 },
    ];

    let outcome = chunk_text(sample, &cfg, Some(&pages));
    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("Overlapping"));
    assert!(message.contains("boundaries"));
}
|
|
857
|
+
|
|
858
|
+
/// Smoke test: chunking with three contiguous pages assigns page metadata
/// to the first produced chunk.
#[test]
fn test_chunk_with_pages_basic() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 25,
        overlap: 5,
    };
    let sample = "First page content here.Second page content here.Third page.";

    let pages = vec![
        PageBoundary { byte_start: 0, byte_end: 24, page_number: 1 },
        PageBoundary { byte_start: 24, byte_end: 50, page_number: 2 },
        PageBoundary { byte_start: 50, byte_end: 60, page_number: 3 },
    ];

    let outcome = chunk_text(sample, &cfg, Some(&pages)).unwrap();

    if let Some(first_chunk) = outcome.chunks.first() {
        assert!(first_chunk.metadata.first_page.is_some());
    }
}
|
|
892
|
+
|
|
893
|
+
/// When all content fits in one chunk on one page, both page fields point to it.
#[test]
fn test_chunk_with_pages_single_page_chunk() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 100,
        overlap: 10,
    };
    let sample = "All content on single page fits in one chunk.";

    let pages = vec![PageBoundary { byte_start: 0, byte_end: 45, page_number: 1 }];

    let outcome = chunk_text(sample, &cfg, Some(&pages)).unwrap();
    assert_eq!(outcome.chunks.len(), 1);
    let only = &outcome.chunks[0];
    assert_eq!(only.metadata.first_page, Some(1));
    assert_eq!(only.metadata.last_page, Some(1));
}
|
|
914
|
+
|
|
915
|
+
/// With zero overlap and two pages, page ranges on each chunk stay ordered
/// (first_page <= last_page whenever both are present).
#[test]
fn test_chunk_with_pages_no_overlap() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: false,
        max_characters: 20,
        overlap: 0,
    };
    let sample = "AAAAA BBBBB CCCCC DDDDD";

    let pages = vec![
        PageBoundary { byte_start: 0, byte_end: 11, page_number: 1 },
        PageBoundary { byte_start: 11, byte_end: 23, page_number: 2 },
    ];

    let outcome = chunk_text(sample, &cfg, Some(&pages)).unwrap();
    assert!(!outcome.chunks.is_empty());

    for chunk in &outcome.chunks {
        if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
            assert!(first <= last);
        }
    }
}
|
|
947
|
+
|
|
948
|
+
/// Even with page boundaries supplied, each chunk's byte span must still
/// exactly match its content length.
#[test]
fn test_chunk_metadata_page_range_accuracy() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        max_characters: 30,
        overlap: 5,
    };
    let sample = "Page One Content Here.Page Two.";

    let pages = vec![
        PageBoundary { byte_start: 0, byte_end: 21, page_number: 1 },
        PageBoundary { byte_start: 21, byte_end: 31, page_number: 2 },
    ];

    let outcome = chunk_text(sample, &cfg, Some(&pages)).unwrap();

    for chunk in &outcome.chunks {
        let span = chunk.metadata.byte_end - chunk.metadata.byte_start;
        assert_eq!(span, chunk.content.len());
    }
}
|
|
977
|
+
|
|
978
|
+
/// Chunks straddling the page-1/page-2 boundary at byte 10 must report both
/// pages; chunks entirely on one side report that single page.
#[test]
fn test_chunk_page_range_boundary_edge_cases() {
    let cfg = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: false,
        max_characters: 10,
        overlap: 2,
    };
    let sample = "0123456789ABCDEFGHIJ";

    let pages = vec![
        PageBoundary { byte_start: 0, byte_end: 10, page_number: 1 },
        PageBoundary { byte_start: 10, byte_end: 20, page_number: 2 },
    ];

    let outcome = chunk_text(sample, &cfg, Some(&pages)).unwrap();

    for chunk in &outcome.chunks {
        // Which pages does this chunk's byte span touch?
        let touches_page1 = chunk.metadata.byte_start < 10;
        let touches_page2 = chunk.metadata.byte_end > 10;

        match (touches_page1, touches_page2) {
            (true, true) => {
                assert_eq!(chunk.metadata.first_page, Some(1));
                assert_eq!(chunk.metadata.last_page, Some(2));
            }
            (true, false) => assert_eq!(chunk.metadata.first_page, Some(1)),
            (false, true) => assert_eq!(chunk.metadata.first_page, Some(2)),
            (false, false) => {}
        }
    }
}
|
|
1017
|
+
}
|