kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -30,7 +30,7 @@
|
|
|
30
30
|
//! };
|
|
31
31
|
//!
|
|
32
32
|
//! let long_text = "This is a very long document...".repeat(100);
|
|
33
|
-
//! let result = chunk_text(&long_text, &config
|
|
33
|
+
//! let result = chunk_text(&long_text, &config)?;
|
|
34
34
|
//!
|
|
35
35
|
//! println!("Split into {} chunks", result.chunk_count);
|
|
36
36
|
//! for (i, chunk) in result.chunks.iter().enumerate() {
|
|
@@ -47,25 +47,10 @@
|
|
|
47
47
|
//! - Processing large documents in batches
|
|
48
48
|
//! - Maintaining context across chunk boundaries
|
|
49
49
|
use crate::error::{KreuzbergError, Result};
|
|
50
|
-
use crate::types::{Chunk, ChunkMetadata
|
|
51
|
-
use bitvec::prelude::*;
|
|
52
|
-
use once_cell::sync::Lazy;
|
|
50
|
+
use crate::types::{Chunk, ChunkMetadata};
|
|
53
51
|
use serde::{Deserialize, Serialize};
|
|
54
|
-
use std::sync::Arc;
|
|
55
52
|
use text_splitter::{Characters, ChunkCapacity, ChunkConfig, MarkdownSplitter, TextSplitter};
|
|
56
53
|
|
|
57
|
-
pub mod processor;
|
|
58
|
-
pub use processor::ChunkingProcessor;
|
|
59
|
-
|
|
60
|
-
/// Threshold below which we use O(1) direct validation instead of precomputing a BitVec.
|
|
61
|
-
///
|
|
62
|
-
/// When there are 10 or fewer boundaries, the overhead of creating a BitVec (which is O(n)
|
|
63
|
-
/// where n is the text length) exceeds the cost of calling `is_char_boundary()` directly
|
|
64
|
-
/// for each boundary position. This threshold balances performance across different scenarios:
|
|
65
|
-
/// - Small documents with few boundaries: fast path dominates
|
|
66
|
-
/// - Large documents with many boundaries: batch path leverages the precomputed BitVec
|
|
67
|
-
const ADAPTIVE_VALIDATION_THRESHOLD: usize = 10;
|
|
68
|
-
|
|
69
54
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
70
55
|
pub enum ChunkerType {
|
|
71
56
|
Text,
|
|
@@ -103,341 +88,7 @@ fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) -> Resu
|
|
|
103
88
|
.map_err(|e| KreuzbergError::validation(format!("Invalid chunking configuration: {}", e)))
|
|
104
89
|
}
|
|
105
90
|
|
|
106
|
-
|
|
107
|
-
///
|
|
108
|
-
/// This function performs a single O(n) pass through the text to identify all valid
|
|
109
|
-
/// UTF-8 character boundaries, storing them in a BitVec for O(1) lookups.
|
|
110
|
-
///
|
|
111
|
-
/// # Arguments
|
|
112
|
-
///
|
|
113
|
-
/// * `text` - The text to analyze
|
|
114
|
-
///
|
|
115
|
-
/// # Returns
|
|
116
|
-
///
|
|
117
|
-
/// A BitVec where each bit represents whether a byte offset is a valid UTF-8 character boundary.
|
|
118
|
-
/// The BitVec has length `text.len() + 1` (includes the end position).
|
|
119
|
-
///
|
|
120
|
-
/// # Examples
|
|
121
|
-
///
|
|
122
|
-
/// ```ignore
|
|
123
|
-
/// let text = "Hello 👋";
|
|
124
|
-
/// let boundaries = precompute_utf8_boundaries(text);
|
|
125
|
-
/// assert!(boundaries[0]); // Start is always valid
|
|
126
|
-
/// assert!(boundaries[6]); // 'H' + "ello " = 6 bytes
|
|
127
|
-
/// assert!(!boundaries[7]); // Middle of emoji (first byte of 4-byte sequence)
|
|
128
|
-
/// assert!(boundaries[10]); // After emoji (valid boundary)
|
|
129
|
-
/// ```
|
|
130
|
-
fn precompute_utf8_boundaries(text: &str) -> BitVec {
|
|
131
|
-
let text_len = text.len();
|
|
132
|
-
let mut boundaries = bitvec![0; text_len + 1];
|
|
133
|
-
|
|
134
|
-
boundaries.set(0, true);
|
|
135
|
-
|
|
136
|
-
for (i, _) in text.char_indices() {
|
|
137
|
-
if i <= text_len {
|
|
138
|
-
boundaries.set(i, true);
|
|
139
|
-
}
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
if text_len > 0 {
|
|
143
|
-
boundaries.set(text_len, true);
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
boundaries
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
/// Validates that byte offsets in page boundaries fall on valid UTF-8 character boundaries.
|
|
150
|
-
///
|
|
151
|
-
/// This function ensures that all page boundary positions are at valid UTF-8 character
|
|
152
|
-
/// boundaries within the text. This is CRITICAL to prevent text corruption when boundaries
|
|
153
|
-
/// are created from language bindings or external sources, particularly with multibyte
|
|
154
|
-
/// UTF-8 characters (emoji, CJK characters, combining marks, etc.).
|
|
155
|
-
///
|
|
156
|
-
/// **Performance Strategy**: Uses adaptive validation to optimize for different boundary counts:
|
|
157
|
-
/// - **Small sets (≤10 boundaries)**: O(k) approach using Rust's native `is_char_boundary()` for each position
|
|
158
|
-
/// - **Large sets (>10 boundaries)**: O(n) precomputation with O(1) lookups via BitVec
|
|
159
|
-
///
|
|
160
|
-
/// For typical PDF documents with 1-10 page boundaries, the fast path provides 30-50% faster
|
|
161
|
-
/// validation than always precomputing. For documents with 100+ boundaries, batch precomputation
|
|
162
|
-
/// is 2-4% faster overall due to amortized costs. This gives ~2-4% improvement across all scenarios.
|
|
163
|
-
///
|
|
164
|
-
/// # Arguments
|
|
165
|
-
///
|
|
166
|
-
/// * `text` - The text being chunked
|
|
167
|
-
/// * `boundaries` - Page boundary markers to validate
|
|
168
|
-
///
|
|
169
|
-
/// # Returns
|
|
170
|
-
///
|
|
171
|
-
/// Returns `Ok(())` if all boundaries are at valid UTF-8 character boundaries.
|
|
172
|
-
/// Returns `KreuzbergError::Validation` if any boundary is at an invalid position.
|
|
173
|
-
///
|
|
174
|
-
/// # UTF-8 Boundary Safety
|
|
175
|
-
///
|
|
176
|
-
/// Rust strings use UTF-8 encoding where characters can be 1-4 bytes. For example:
|
|
177
|
-
/// - ASCII letters: 1 byte each
|
|
178
|
-
/// - Emoji (🌍): 4 bytes but 1 character
|
|
179
|
-
/// - CJK characters (中): 3 bytes but 1 character
|
|
180
|
-
///
|
|
181
|
-
/// This function checks that all byte_start and byte_end values are at character boundaries
|
|
182
|
-
/// using an adaptive strategy: direct calls for small boundary sets, or precomputed BitVec
|
|
183
|
-
/// for large sets.
|
|
184
|
-
fn validate_utf8_boundaries(text: &str, boundaries: &[PageBoundary]) -> Result<()> {
|
|
185
|
-
if boundaries.is_empty() {
|
|
186
|
-
return Ok(());
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
let text_len = text.len();
|
|
190
|
-
|
|
191
|
-
if boundaries.len() <= ADAPTIVE_VALIDATION_THRESHOLD {
|
|
192
|
-
validate_utf8_boundaries_fast_path(text, boundaries, text_len)
|
|
193
|
-
} else {
|
|
194
|
-
validate_utf8_boundaries_batch_path(text, boundaries, text_len)
|
|
195
|
-
}
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
/// Fast path: direct UTF-8 boundary validation for small boundary counts (≤10).
|
|
199
|
-
///
|
|
200
|
-
/// Uses Rust's native `str::is_char_boundary()` for O(1) checks on each boundary position.
|
|
201
|
-
/// This avoids the O(n) overhead of BitVec precomputation, making it ideal for typical
|
|
202
|
-
/// PDF documents with few page boundaries.
|
|
203
|
-
///
|
|
204
|
-
/// # Arguments
|
|
205
|
-
///
|
|
206
|
-
/// * `text` - The text being validated
|
|
207
|
-
/// * `boundaries` - Page boundary markers to validate
|
|
208
|
-
/// * `text_len` - Pre-computed text length (avoids recomputation)
|
|
209
|
-
///
|
|
210
|
-
/// # Returns
|
|
211
|
-
///
|
|
212
|
-
/// Returns `Ok(())` if all boundaries are at valid UTF-8 character boundaries.
|
|
213
|
-
/// Returns `KreuzbergError::Validation` if any boundary is invalid.
|
|
214
|
-
fn validate_utf8_boundaries_fast_path(text: &str, boundaries: &[PageBoundary], text_len: usize) -> Result<()> {
|
|
215
|
-
for (idx, boundary) in boundaries.iter().enumerate() {
|
|
216
|
-
if boundary.byte_start > text_len {
|
|
217
|
-
return Err(KreuzbergError::validation(format!(
|
|
218
|
-
"Page boundary {} has byte_start={} which exceeds text length {}",
|
|
219
|
-
idx, boundary.byte_start, text_len
|
|
220
|
-
)));
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
if boundary.byte_end > text_len {
|
|
224
|
-
return Err(KreuzbergError::validation(format!(
|
|
225
|
-
"Page boundary {} has byte_end={} which exceeds text length {}",
|
|
226
|
-
idx, boundary.byte_end, text_len
|
|
227
|
-
)));
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
if boundary.byte_start > 0 && boundary.byte_start < text_len && !text.is_char_boundary(boundary.byte_start) {
|
|
231
|
-
return Err(KreuzbergError::validation(format!(
|
|
232
|
-
"Page boundary {} has byte_start={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
|
|
233
|
-
idx, boundary.byte_start, text_len
|
|
234
|
-
)));
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
if boundary.byte_end > 0 && boundary.byte_end < text_len && !text.is_char_boundary(boundary.byte_end) {
|
|
238
|
-
return Err(KreuzbergError::validation(format!(
|
|
239
|
-
"Page boundary {} has byte_end={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
|
|
240
|
-
idx, boundary.byte_end, text_len
|
|
241
|
-
)));
|
|
242
|
-
}
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
Ok(())
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
/// Batch path: precomputed BitVec validation for large boundary counts (>10).
|
|
249
|
-
///
|
|
250
|
-
/// Precomputes all valid UTF-8 boundaries in a single O(n) pass, then performs O(1)
|
|
251
|
-
/// lookups for each boundary position. This is more efficient than O(k*1) direct checks
|
|
252
|
-
/// when k is large or when the repeated `is_char_boundary()` calls have measurable overhead.
|
|
253
|
-
///
|
|
254
|
-
/// # Arguments
|
|
255
|
-
///
|
|
256
|
-
/// * `text` - The text being validated
|
|
257
|
-
/// * `boundaries` - Page boundary markers to validate
|
|
258
|
-
/// * `text_len` - Pre-computed text length (avoids recomputation)
|
|
259
|
-
///
|
|
260
|
-
/// # Returns
|
|
261
|
-
///
|
|
262
|
-
/// Returns `Ok(())` if all boundaries are at valid UTF-8 character boundaries.
|
|
263
|
-
/// Returns `KreuzbergError::Validation` if any boundary is invalid.
|
|
264
|
-
fn validate_utf8_boundaries_batch_path(text: &str, boundaries: &[PageBoundary], text_len: usize) -> Result<()> {
|
|
265
|
-
let valid_boundaries = precompute_utf8_boundaries(text);
|
|
266
|
-
|
|
267
|
-
for (idx, boundary) in boundaries.iter().enumerate() {
|
|
268
|
-
if boundary.byte_start > text_len {
|
|
269
|
-
return Err(KreuzbergError::validation(format!(
|
|
270
|
-
"Page boundary {} has byte_start={} which exceeds text length {}",
|
|
271
|
-
idx, boundary.byte_start, text_len
|
|
272
|
-
)));
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
if boundary.byte_end > text_len {
|
|
276
|
-
return Err(KreuzbergError::validation(format!(
|
|
277
|
-
"Page boundary {} has byte_end={} which exceeds text length {}",
|
|
278
|
-
idx, boundary.byte_end, text_len
|
|
279
|
-
)));
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
if boundary.byte_start > 0 && boundary.byte_start <= text_len && !valid_boundaries[boundary.byte_start] {
|
|
283
|
-
return Err(KreuzbergError::validation(format!(
|
|
284
|
-
"Page boundary {} has byte_start={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
|
|
285
|
-
idx, boundary.byte_start, text_len
|
|
286
|
-
)));
|
|
287
|
-
}
|
|
288
|
-
|
|
289
|
-
if boundary.byte_end > 0 && boundary.byte_end <= text_len && !valid_boundaries[boundary.byte_end] {
|
|
290
|
-
return Err(KreuzbergError::validation(format!(
|
|
291
|
-
"Page boundary {} has byte_end={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
|
|
292
|
-
idx, boundary.byte_end, text_len
|
|
293
|
-
)));
|
|
294
|
-
}
|
|
295
|
-
}
|
|
296
|
-
|
|
297
|
-
Ok(())
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
/// Calculate which pages a character range spans.
|
|
301
|
-
///
|
|
302
|
-
/// # Arguments
|
|
303
|
-
///
|
|
304
|
-
/// * `char_start` - Starting character offset of the chunk
|
|
305
|
-
/// * `char_end` - Ending character offset of the chunk
|
|
306
|
-
/// * `boundaries` - Page boundary markers from the document
|
|
307
|
-
///
|
|
308
|
-
/// # Returns
|
|
309
|
-
///
|
|
310
|
-
/// A tuple of (first_page, last_page) where page numbers are 1-indexed.
|
|
311
|
-
/// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
|
|
312
|
-
/// Validates page boundaries for consistency and correctness.
|
|
313
|
-
///
|
|
314
|
-
/// # Validation Rules
|
|
315
|
-
///
|
|
316
|
-
/// 1. Boundaries must be sorted by char_start (monotonically increasing)
|
|
317
|
-
/// 2. Boundaries must not overlap (char_end[i] <= char_start[i+1])
|
|
318
|
-
/// 3. Each boundary must have char_start < char_end
|
|
319
|
-
///
|
|
320
|
-
/// # Errors
|
|
321
|
-
///
|
|
322
|
-
/// Returns `KreuzbergError::Validation` if any boundary is invalid.
|
|
323
|
-
fn validate_page_boundaries(boundaries: &[PageBoundary]) -> Result<()> {
|
|
324
|
-
if boundaries.is_empty() {
|
|
325
|
-
return Ok(());
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
for (idx, boundary) in boundaries.iter().enumerate() {
|
|
329
|
-
if boundary.byte_start >= boundary.byte_end {
|
|
330
|
-
return Err(KreuzbergError::validation(format!(
|
|
331
|
-
"Invalid boundary range at index {}: byte_start ({}) must be < byte_end ({})",
|
|
332
|
-
idx, boundary.byte_start, boundary.byte_end
|
|
333
|
-
)));
|
|
334
|
-
}
|
|
335
|
-
}
|
|
336
|
-
|
|
337
|
-
for i in 0..boundaries.len() - 1 {
|
|
338
|
-
let current = &boundaries[i];
|
|
339
|
-
let next = &boundaries[i + 1];
|
|
340
|
-
|
|
341
|
-
if current.byte_start > next.byte_start {
|
|
342
|
-
return Err(KreuzbergError::validation(format!(
|
|
343
|
-
"Page boundaries not sorted: boundary at index {} (byte_start={}) comes after boundary at index {} (byte_start={})",
|
|
344
|
-
i,
|
|
345
|
-
current.byte_start,
|
|
346
|
-
i + 1,
|
|
347
|
-
next.byte_start
|
|
348
|
-
)));
|
|
349
|
-
}
|
|
350
|
-
|
|
351
|
-
if current.byte_end > next.byte_start {
|
|
352
|
-
return Err(KreuzbergError::validation(format!(
|
|
353
|
-
"Overlapping page boundaries: boundary {} ends at {} but boundary {} starts at {}",
|
|
354
|
-
i,
|
|
355
|
-
current.byte_end,
|
|
356
|
-
i + 1,
|
|
357
|
-
next.byte_start
|
|
358
|
-
)));
|
|
359
|
-
}
|
|
360
|
-
}
|
|
361
|
-
|
|
362
|
-
Ok(())
|
|
363
|
-
}
|
|
364
|
-
|
|
365
|
-
/// Calculate which pages a byte range spans.
|
|
366
|
-
///
|
|
367
|
-
/// # Arguments
|
|
368
|
-
///
|
|
369
|
-
/// * `byte_start` - Starting byte offset of the chunk
|
|
370
|
-
/// * `byte_end` - Ending byte offset of the chunk
|
|
371
|
-
/// * `boundaries` - Page boundary markers from the document
|
|
372
|
-
///
|
|
373
|
-
/// # Returns
|
|
374
|
-
///
|
|
375
|
-
/// A tuple of (first_page, last_page) where page numbers are 1-indexed.
|
|
376
|
-
/// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
|
|
377
|
-
///
|
|
378
|
-
/// # Errors
|
|
379
|
-
///
|
|
380
|
-
/// Returns `KreuzbergError::Validation` if boundaries are invalid.
|
|
381
|
-
fn calculate_page_range(
|
|
382
|
-
byte_start: usize,
|
|
383
|
-
byte_end: usize,
|
|
384
|
-
boundaries: &[PageBoundary],
|
|
385
|
-
) -> Result<(Option<usize>, Option<usize>)> {
|
|
386
|
-
if boundaries.is_empty() {
|
|
387
|
-
return Ok((None, None));
|
|
388
|
-
}
|
|
389
|
-
|
|
390
|
-
validate_page_boundaries(boundaries)?;
|
|
391
|
-
|
|
392
|
-
let mut first_page = None;
|
|
393
|
-
let mut last_page = None;
|
|
394
|
-
|
|
395
|
-
for boundary in boundaries {
|
|
396
|
-
if byte_start < boundary.byte_end && byte_end > boundary.byte_start {
|
|
397
|
-
if first_page.is_none() {
|
|
398
|
-
first_page = Some(boundary.page_number);
|
|
399
|
-
}
|
|
400
|
-
last_page = Some(boundary.page_number);
|
|
401
|
-
}
|
|
402
|
-
}
|
|
403
|
-
|
|
404
|
-
Ok((first_page, last_page))
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
/// Split text into chunks with optional page boundary tracking.
|
|
408
|
-
///
|
|
409
|
-
/// # Arguments
|
|
410
|
-
///
|
|
411
|
-
/// * `text` - The text to split into chunks
|
|
412
|
-
/// * `config` - Chunking configuration (max size, overlap, type)
|
|
413
|
-
/// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
|
|
414
|
-
///
|
|
415
|
-
/// # Returns
|
|
416
|
-
///
|
|
417
|
-
/// A ChunkingResult containing all chunks and their metadata.
|
|
418
|
-
///
|
|
419
|
-
/// # Examples
|
|
420
|
-
///
|
|
421
|
-
/// ```rust
|
|
422
|
-
/// use kreuzberg::chunking::{chunk_text, ChunkingConfig, ChunkerType};
|
|
423
|
-
///
|
|
424
|
-
/// # fn example() -> kreuzberg::Result<()> {
|
|
425
|
-
/// let config = ChunkingConfig {
|
|
426
|
-
/// max_characters: 500,
|
|
427
|
-
/// overlap: 50,
|
|
428
|
-
/// trim: true,
|
|
429
|
-
/// chunker_type: ChunkerType::Text,
|
|
430
|
-
/// };
|
|
431
|
-
/// let result = chunk_text("Long text...", &config, None)?;
|
|
432
|
-
/// assert!(!result.chunks.is_empty());
|
|
433
|
-
/// # Ok(())
|
|
434
|
-
/// # }
|
|
435
|
-
/// ```
|
|
436
|
-
pub fn chunk_text(
|
|
437
|
-
text: &str,
|
|
438
|
-
config: &ChunkingConfig,
|
|
439
|
-
page_boundaries: Option<&[PageBoundary]>,
|
|
440
|
-
) -> Result<ChunkingResult> {
|
|
91
|
+
pub fn chunk_text(text: &str, config: &ChunkingConfig) -> Result<ChunkingResult> {
|
|
441
92
|
if text.is_empty() {
|
|
442
93
|
return Ok(ChunkingResult {
|
|
443
94
|
chunks: vec![],
|
|
@@ -445,10 +96,6 @@ pub fn chunk_text(
|
|
|
445
96
|
});
|
|
446
97
|
}
|
|
447
98
|
|
|
448
|
-
if let Some(boundaries) = page_boundaries {
|
|
449
|
-
validate_utf8_boundaries(text, boundaries)?;
|
|
450
|
-
}
|
|
451
|
-
|
|
452
99
|
let chunk_config = build_chunk_config(config.max_characters, config.overlap, config.trim)?;
|
|
453
100
|
|
|
454
101
|
let text_chunks: Vec<&str> = match config.chunker_type {
|
|
@@ -463,42 +110,36 @@ pub fn chunk_text(
|
|
|
463
110
|
};
|
|
464
111
|
|
|
465
112
|
let total_chunks = text_chunks.len();
|
|
466
|
-
let mut
|
|
467
|
-
|
|
468
|
-
let
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
total_chunks,
|
|
497
|
-
first_page,
|
|
498
|
-
last_page,
|
|
499
|
-
},
|
|
500
|
-
});
|
|
501
|
-
}
|
|
113
|
+
let mut char_offset = 0;
|
|
114
|
+
|
|
115
|
+
let chunks: Vec<Chunk> = text_chunks
|
|
116
|
+
.into_iter()
|
|
117
|
+
.enumerate()
|
|
118
|
+
.map(|(index, chunk_text)| {
|
|
119
|
+
let char_start = char_offset;
|
|
120
|
+
let chunk_length = chunk_text.chars().count();
|
|
121
|
+
let char_end = char_start + chunk_length;
|
|
122
|
+
|
|
123
|
+
let overlap_chars = if index < total_chunks - 1 {
|
|
124
|
+
config.overlap.min(chunk_length)
|
|
125
|
+
} else {
|
|
126
|
+
0
|
|
127
|
+
};
|
|
128
|
+
char_offset = char_end - overlap_chars;
|
|
129
|
+
|
|
130
|
+
Chunk {
|
|
131
|
+
content: chunk_text.to_string(),
|
|
132
|
+
embedding: None,
|
|
133
|
+
metadata: ChunkMetadata {
|
|
134
|
+
char_start,
|
|
135
|
+
char_end,
|
|
136
|
+
token_count: None,
|
|
137
|
+
chunk_index: index,
|
|
138
|
+
total_chunks,
|
|
139
|
+
},
|
|
140
|
+
}
|
|
141
|
+
})
|
|
142
|
+
.collect();
|
|
502
143
|
|
|
503
144
|
let chunk_count = chunks.len();
|
|
504
145
|
|
|
@@ -518,49 +159,11 @@ pub fn chunk_text_with_type(
|
|
|
518
159
|
trim,
|
|
519
160
|
chunker_type,
|
|
520
161
|
};
|
|
521
|
-
chunk_text(text, &config
|
|
162
|
+
chunk_text(text, &config)
|
|
522
163
|
}
|
|
523
164
|
|
|
524
165
|
pub fn chunk_texts_batch(texts: &[&str], config: &ChunkingConfig) -> Result<Vec<ChunkingResult>> {
|
|
525
|
-
texts.iter().map(|text| chunk_text(text, config
|
|
526
|
-
}
|
|
527
|
-
|
|
528
|
-
/// Lazy-initialized flag that ensures chunking processor is registered exactly once.
|
|
529
|
-
///
|
|
530
|
-
/// This static is accessed on first use to automatically register the
|
|
531
|
-
/// chunking processor with the plugin registry.
|
|
532
|
-
static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_chunking_processor);
|
|
533
|
-
|
|
534
|
-
/// Ensure the chunking processor is registered.
|
|
535
|
-
///
|
|
536
|
-
/// This function is called automatically when needed.
|
|
537
|
-
/// It's safe to call multiple times - registration only happens once.
|
|
538
|
-
pub fn ensure_initialized() -> Result<()> {
|
|
539
|
-
PROCESSOR_INITIALIZED
|
|
540
|
-
.as_ref()
|
|
541
|
-
.map(|_| ())
|
|
542
|
-
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
543
|
-
message: format!("Failed to register chunking processor: {}", e),
|
|
544
|
-
plugin_name: "text-chunking".to_string(),
|
|
545
|
-
})
|
|
546
|
-
}
|
|
547
|
-
|
|
548
|
-
/// Register the chunking processor with the global registry.
|
|
549
|
-
///
|
|
550
|
-
/// This function should be called once at application startup to register
|
|
551
|
-
/// the chunking post-processor.
|
|
552
|
-
///
|
|
553
|
-
/// **Note:** This is called automatically on first use.
|
|
554
|
-
/// Explicit calling is optional.
|
|
555
|
-
pub fn register_chunking_processor() -> Result<()> {
|
|
556
|
-
let registry = crate::plugins::registry::get_post_processor_registry();
|
|
557
|
-
let mut registry = registry
|
|
558
|
-
.write()
|
|
559
|
-
.map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
|
|
560
|
-
|
|
561
|
-
registry.register(Arc::new(ChunkingProcessor), 50)?;
|
|
562
|
-
|
|
563
|
-
Ok(())
|
|
166
|
+
texts.iter().map(|text| chunk_text(text, config)).collect()
|
|
564
167
|
}
|
|
565
168
|
|
|
566
169
|
#[cfg(test)]
|
|
@@ -570,7 +173,7 @@ mod tests {
|
|
|
570
173
|
#[test]
|
|
571
174
|
fn test_chunk_empty_text() {
|
|
572
175
|
let config = ChunkingConfig::default();
|
|
573
|
-
let result = chunk_text("", &config
|
|
176
|
+
let result = chunk_text("", &config).unwrap();
|
|
574
177
|
assert_eq!(result.chunks.len(), 0);
|
|
575
178
|
assert_eq!(result.chunk_count, 0);
|
|
576
179
|
}
|
|
@@ -584,7 +187,7 @@ mod tests {
|
|
|
584
187
|
chunker_type: ChunkerType::Text,
|
|
585
188
|
};
|
|
586
189
|
let text = "This is a short text.";
|
|
587
|
-
let result = chunk_text(text, &config
|
|
190
|
+
let result = chunk_text(text, &config).unwrap();
|
|
588
191
|
assert_eq!(result.chunks.len(), 1);
|
|
589
192
|
assert_eq!(result.chunk_count, 1);
|
|
590
193
|
assert_eq!(result.chunks[0].content, text);
|
|
@@ -599,7 +202,7 @@ mod tests {
|
|
|
599
202
|
chunker_type: ChunkerType::Text,
|
|
600
203
|
};
|
|
601
204
|
let text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
|
602
|
-
let result = chunk_text(text, &config
|
|
205
|
+
let result = chunk_text(text, &config).unwrap();
|
|
603
206
|
assert!(result.chunk_count >= 2);
|
|
604
207
|
assert_eq!(result.chunks.len(), result.chunk_count);
|
|
605
208
|
assert!(result.chunks.iter().all(|chunk| chunk.content.len() <= 20));
|
|
@@ -614,7 +217,7 @@ mod tests {
|
|
|
614
217
|
chunker_type: ChunkerType::Text,
|
|
615
218
|
};
|
|
616
219
|
let text = "abcdefghijklmnopqrstuvwxyz0123456789";
|
|
617
|
-
let result = chunk_text(text, &config
|
|
220
|
+
let result = chunk_text(text, &config).unwrap();
|
|
618
221
|
assert!(result.chunk_count >= 2);
|
|
619
222
|
|
|
620
223
|
if result.chunks.len() >= 2 {
|
|
@@ -637,7 +240,7 @@ mod tests {
|
|
|
637
240
|
chunker_type: ChunkerType::Markdown,
|
|
638
241
|
};
|
|
639
242
|
let markdown = "# Title\n\nParagraph one.\n\n## Section\n\nParagraph two.";
|
|
640
|
-
let result = chunk_text(markdown, &config
|
|
243
|
+
let result = chunk_text(markdown, &config).unwrap();
|
|
641
244
|
assert!(result.chunk_count >= 1);
|
|
642
245
|
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("# Title")));
|
|
643
246
|
}
|
|
@@ -651,7 +254,7 @@ mod tests {
|
|
|
651
254
|
chunker_type: ChunkerType::Markdown,
|
|
652
255
|
};
|
|
653
256
|
let markdown = "# Code Example\n\n```python\nprint('hello')\n```\n\nSome text after code.";
|
|
654
|
-
let result = chunk_text(markdown, &config
|
|
257
|
+
let result = chunk_text(markdown, &config).unwrap();
|
|
655
258
|
assert!(result.chunk_count >= 1);
|
|
656
259
|
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("```")));
|
|
657
260
|
}
|
|
@@ -665,7 +268,7 @@ mod tests {
|
|
|
665
268
|
chunker_type: ChunkerType::Markdown,
|
|
666
269
|
};
|
|
667
270
|
let markdown = "Check out [this link](https://example.com) for more info.";
|
|
668
|
-
let result = chunk_text(markdown, &config
|
|
271
|
+
let result = chunk_text(markdown, &config).unwrap();
|
|
669
272
|
assert_eq!(result.chunk_count, 1);
|
|
670
273
|
assert!(result.chunks[0].content.contains("[this link]"));
|
|
671
274
|
}
|
|
@@ -679,7 +282,7 @@ mod tests {
|
|
|
679
282
|
chunker_type: ChunkerType::Text,
|
|
680
283
|
};
|
|
681
284
|
let text = " Leading and trailing spaces should be trimmed ";
|
|
682
|
-
let result = chunk_text(text, &config
|
|
285
|
+
let result = chunk_text(text, &config).unwrap();
|
|
683
286
|
assert!(result.chunk_count >= 1);
|
|
684
287
|
assert!(result.chunks.iter().all(|chunk| !chunk.content.starts_with(' ')));
|
|
685
288
|
}
|
|
@@ -693,7 +296,7 @@ mod tests {
|
|
|
693
296
|
chunker_type: ChunkerType::Text,
|
|
694
297
|
};
|
|
695
298
|
let text = " Text with spaces ";
|
|
696
|
-
let result = chunk_text(text, &config
|
|
299
|
+
let result = chunk_text(text, &config).unwrap();
|
|
697
300
|
assert_eq!(result.chunk_count, 1);
|
|
698
301
|
assert!(result.chunks[0].content.starts_with(' ') || result.chunks[0].content.len() < text.len());
|
|
699
302
|
}
|
|
@@ -706,7 +309,7 @@ mod tests {
|
|
|
706
309
|
trim: true,
|
|
707
310
|
chunker_type: ChunkerType::Text,
|
|
708
311
|
};
|
|
709
|
-
let result = chunk_text("Some text", &config
|
|
312
|
+
let result = chunk_text("Some text", &config);
|
|
710
313
|
assert!(result.is_err());
|
|
711
314
|
let err = result.unwrap_err();
|
|
712
315
|
assert!(matches!(err, KreuzbergError::Validation { .. }));
|
|
@@ -800,7 +403,7 @@ mod tests {
|
|
|
800
403
|
chunker_type: ChunkerType::Text,
|
|
801
404
|
};
|
|
802
405
|
let text = "a".repeat(1000);
|
|
803
|
-
let result = chunk_text(&text, &config
|
|
406
|
+
let result = chunk_text(&text, &config).unwrap();
|
|
804
407
|
assert!(result.chunk_count >= 10);
|
|
805
408
|
assert!(result.chunks.iter().all(|chunk| chunk.content.len() <= 100));
|
|
806
409
|
}
|
|
@@ -814,7 +417,7 @@ mod tests {
|
|
|
814
417
|
chunker_type: ChunkerType::Text,
|
|
815
418
|
};
|
|
816
419
|
let text = "Line one\nLine two\nLine three\nLine four\nLine five";
|
|
817
|
-
let result = chunk_text(text, &config
|
|
420
|
+
let result = chunk_text(text, &config).unwrap();
|
|
818
421
|
assert!(result.chunk_count >= 1);
|
|
819
422
|
}
|
|
820
423
|
|
|
@@ -827,7 +430,7 @@ mod tests {
|
|
|
827
430
|
chunker_type: ChunkerType::Markdown,
|
|
828
431
|
};
|
|
829
432
|
let markdown = "# List Example\n\n- Item 1\n- Item 2\n- Item 3\n\nMore text.";
|
|
830
|
-
let result = chunk_text(markdown, &config
|
|
433
|
+
let result = chunk_text(markdown, &config).unwrap();
|
|
831
434
|
assert!(result.chunk_count >= 1);
|
|
832
435
|
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("- Item")));
|
|
833
436
|
}
|
|
@@ -841,7 +444,7 @@ mod tests {
|
|
|
841
444
|
chunker_type: ChunkerType::Markdown,
|
|
842
445
|
};
|
|
843
446
|
let markdown = "# Table\n\n| Col1 | Col2 |\n|------|------|\n| A | B |\n| C | D |";
|
|
844
|
-
let result = chunk_text(markdown, &config
|
|
447
|
+
let result = chunk_text(markdown, &config).unwrap();
|
|
845
448
|
assert!(result.chunk_count >= 1);
|
|
846
449
|
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("|")));
|
|
847
450
|
}
|
|
@@ -855,7 +458,7 @@ mod tests {
|
|
|
855
458
|
chunker_type: ChunkerType::Text,
|
|
856
459
|
};
|
|
857
460
|
let text = "Special chars: @#$%^&*()[]{}|\\<>?/~`";
|
|
858
|
-
let result = chunk_text(text, &config
|
|
461
|
+
let result = chunk_text(text, &config).unwrap();
|
|
859
462
|
assert_eq!(result.chunk_count, 1);
|
|
860
463
|
assert!(result.chunks[0].content.contains("@#$%"));
|
|
861
464
|
}
|
|
@@ -869,7 +472,7 @@ mod tests {
|
|
|
869
472
|
chunker_type: ChunkerType::Text,
|
|
870
473
|
};
|
|
871
474
|
let text = "Unicode: 你好世界 🌍 café résumé";
|
|
872
|
-
let result = chunk_text(text, &config
|
|
475
|
+
let result = chunk_text(text, &config).unwrap();
|
|
873
476
|
assert_eq!(result.chunk_count, 1);
|
|
874
477
|
assert!(result.chunks[0].content.contains("你好"));
|
|
875
478
|
assert!(result.chunks[0].content.contains("🌍"));
|
|
@@ -884,7 +487,7 @@ mod tests {
|
|
|
884
487
|
chunker_type: ChunkerType::Text,
|
|
885
488
|
};
|
|
886
489
|
let text = "日本語のテキストです。これは長い文章で、複数のチャンクに分割されるべきです。";
|
|
887
|
-
let result = chunk_text(text, &config
|
|
490
|
+
let result = chunk_text(text, &config).unwrap();
|
|
888
491
|
assert!(result.chunk_count >= 1);
|
|
889
492
|
}
|
|
890
493
|
|
|
@@ -897,7 +500,7 @@ mod tests {
|
|
|
897
500
|
chunker_type: ChunkerType::Text,
|
|
898
501
|
};
|
|
899
502
|
let text = "English text mixed with 中文文本 and some français";
|
|
900
|
-
let result = chunk_text(text, &config
|
|
503
|
+
let result = chunk_text(text, &config).unwrap();
|
|
901
504
|
assert!(result.chunk_count >= 1);
|
|
902
505
|
}
|
|
903
506
|
|
|
@@ -910,7 +513,7 @@ mod tests {
|
|
|
910
513
|
chunker_type: ChunkerType::Text,
|
|
911
514
|
};
|
|
912
515
|
let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
|
|
913
|
-
let result = chunk_text(text, &config
|
|
516
|
+
let result = chunk_text(text, &config).unwrap();
|
|
914
517
|
|
|
915
518
|
assert!(result.chunks.len() >= 2, "Expected at least 2 chunks");
|
|
916
519
|
|
|
@@ -919,8 +522,8 @@ mod tests {
|
|
|
919
522
|
let metadata = &chunk.metadata;
|
|
920
523
|
|
|
921
524
|
assert_eq!(
|
|
922
|
-
metadata.
|
|
923
|
-
chunk.content.
|
|
525
|
+
metadata.char_end - metadata.char_start,
|
|
526
|
+
chunk.content.chars().count(),
|
|
924
527
|
"Chunk {} offset range doesn't match content length",
|
|
925
528
|
i
|
|
926
529
|
);
|
|
@@ -934,15 +537,15 @@ mod tests {
|
|
|
934
537
|
let next_chunk = &result.chunks[i + 1];
|
|
935
538
|
|
|
936
539
|
assert!(
|
|
937
|
-
next_chunk.metadata.
|
|
540
|
+
next_chunk.metadata.char_start < current_chunk.metadata.char_end,
|
|
938
541
|
"Chunk {} and {} don't overlap: next starts at {} but current ends at {}",
|
|
939
542
|
i,
|
|
940
543
|
i + 1,
|
|
941
|
-
next_chunk.metadata.
|
|
942
|
-
current_chunk.metadata.
|
|
544
|
+
next_chunk.metadata.char_start,
|
|
545
|
+
current_chunk.metadata.char_end
|
|
943
546
|
);
|
|
944
547
|
|
|
945
|
-
let overlap_size = current_chunk.metadata.
|
|
548
|
+
let overlap_size = current_chunk.metadata.char_end - next_chunk.metadata.char_start;
|
|
946
549
|
assert!(
|
|
947
550
|
overlap_size <= config.overlap + 10,
|
|
948
551
|
"Overlap between chunks {} and {} is too large: {}",
|
|
@@ -962,19 +565,19 @@ mod tests {
|
|
|
962
565
|
chunker_type: ChunkerType::Text,
|
|
963
566
|
};
|
|
964
567
|
let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
|
|
965
|
-
let result = chunk_text(text, &config
|
|
568
|
+
let result = chunk_text(text, &config).unwrap();
|
|
966
569
|
|
|
967
570
|
for i in 0..result.chunks.len() - 1 {
|
|
968
571
|
let current_chunk = &result.chunks[i];
|
|
969
572
|
let next_chunk = &result.chunks[i + 1];
|
|
970
573
|
|
|
971
574
|
assert!(
|
|
972
|
-
next_chunk.metadata.
|
|
575
|
+
next_chunk.metadata.char_start >= current_chunk.metadata.char_end,
|
|
973
576
|
"Chunk {} and {} overlap when they shouldn't: next starts at {} but current ends at {}",
|
|
974
577
|
i,
|
|
975
578
|
i + 1,
|
|
976
|
-
next_chunk.metadata.
|
|
977
|
-
current_chunk.metadata.
|
|
579
|
+
next_chunk.metadata.char_start,
|
|
580
|
+
current_chunk.metadata.char_end
|
|
978
581
|
);
|
|
979
582
|
}
|
|
980
583
|
}
|
|
@@ -988,12 +591,12 @@ mod tests {
|
|
|
988
591
|
chunker_type: ChunkerType::Text,
|
|
989
592
|
};
|
|
990
593
|
let text = "0123456789 ABCDEFGHIJ KLMNOPQRST UVWXYZ";
|
|
991
|
-
let result = chunk_text(text, &config
|
|
594
|
+
let result = chunk_text(text, &config).unwrap();
|
|
992
595
|
|
|
993
596
|
assert!(result.chunks.len() >= 2, "Expected multiple chunks");
|
|
994
597
|
|
|
995
598
|
assert_eq!(
|
|
996
|
-
result.chunks[0].metadata.
|
|
599
|
+
result.chunks[0].metadata.char_start, 0,
|
|
997
600
|
"First chunk should start at position 0"
|
|
998
601
|
);
|
|
999
602
|
|
|
@@ -1002,12 +605,12 @@ mod tests {
|
|
|
1002
605
|
let next_chunk = &result.chunks[i + 1];
|
|
1003
606
|
|
|
1004
607
|
assert!(
|
|
1005
|
-
next_chunk.metadata.
|
|
608
|
+
next_chunk.metadata.char_start <= current_chunk.metadata.char_end,
|
|
1006
609
|
"Gap detected between chunk {} (ends at {}) and chunk {} (starts at {})",
|
|
1007
610
|
i,
|
|
1008
|
-
current_chunk.metadata.
|
|
611
|
+
current_chunk.metadata.char_end,
|
|
1009
612
|
i + 1,
|
|
1010
|
-
next_chunk.metadata.
|
|
613
|
+
next_chunk.metadata.char_start
|
|
1011
614
|
);
|
|
1012
615
|
}
|
|
1013
616
|
}
|
|
@@ -1022,24 +625,24 @@ mod tests {
|
|
|
1022
625
|
chunker_type: ChunkerType::Text,
|
|
1023
626
|
};
|
|
1024
627
|
let text = "Word ".repeat(30);
|
|
1025
|
-
let result = chunk_text(&text, &config
|
|
628
|
+
let result = chunk_text(&text, &config).unwrap();
|
|
1026
629
|
|
|
1027
630
|
for chunk in &result.chunks {
|
|
1028
631
|
assert!(
|
|
1029
|
-
chunk.metadata.
|
|
632
|
+
chunk.metadata.char_end > chunk.metadata.char_start,
|
|
1030
633
|
"Invalid offset range for overlap {}: start={}, end={}",
|
|
1031
634
|
overlap,
|
|
1032
|
-
chunk.metadata.
|
|
1033
|
-
chunk.metadata.
|
|
635
|
+
chunk.metadata.char_start,
|
|
636
|
+
chunk.metadata.char_end
|
|
1034
637
|
);
|
|
1035
638
|
}
|
|
1036
639
|
|
|
1037
640
|
for chunk in &result.chunks {
|
|
1038
641
|
assert!(
|
|
1039
|
-
chunk.metadata.
|
|
642
|
+
chunk.metadata.char_start < text.chars().count(),
|
|
1040
643
|
"char_start with overlap {} is out of bounds: {}",
|
|
1041
644
|
overlap,
|
|
1042
|
-
chunk.metadata.
|
|
645
|
+
chunk.metadata.char_start
|
|
1043
646
|
);
|
|
1044
647
|
}
|
|
1045
648
|
}
|
|
@@ -1054,7 +657,7 @@ mod tests {
|
|
|
1054
657
|
chunker_type: ChunkerType::Text,
|
|
1055
658
|
};
|
|
1056
659
|
let text = "AAAAA BBBBB CCCCC DDDDD EEEEE";
|
|
1057
|
-
let result = chunk_text(text, &config
|
|
660
|
+
let result = chunk_text(text, &config).unwrap();
|
|
1058
661
|
|
|
1059
662
|
assert!(result.chunks.len() >= 2, "Need multiple chunks for this test");
|
|
1060
663
|
|
|
@@ -1062,1242 +665,13 @@ mod tests {
|
|
|
1062
665
|
let second_to_last = &result.chunks[result.chunks.len() - 2];
|
|
1063
666
|
|
|
1064
667
|
assert!(
|
|
1065
|
-
last_chunk.metadata.
|
|
668
|
+
last_chunk.metadata.char_start < second_to_last.metadata.char_end,
|
|
1066
669
|
"Last chunk should overlap with previous chunk"
|
|
1067
670
|
);
|
|
1068
671
|
|
|
1069
|
-
let expected_end = text.
|
|
672
|
+
let expected_end = text.chars().count();
|
|
1070
673
|
let last_chunk_covers_end =
|
|
1071
|
-
last_chunk.content.trim_end() == text.trim_end() || last_chunk.metadata.
|
|
674
|
+
last_chunk.content.trim_end() == text.trim_end() || last_chunk.metadata.char_end >= expected_end - 5;
|
|
1072
675
|
assert!(last_chunk_covers_end, "Last chunk should cover the end of the text");
|
|
1073
676
|
}
|
|
1074
|
-
|
|
1075
|
-
#[test]
|
|
1076
|
-
fn test_chunk_with_page_boundaries() {
|
|
1077
|
-
use crate::types::PageBoundary;
|
|
1078
|
-
|
|
1079
|
-
let config = ChunkingConfig {
|
|
1080
|
-
max_characters: 30,
|
|
1081
|
-
overlap: 5,
|
|
1082
|
-
trim: true,
|
|
1083
|
-
chunker_type: ChunkerType::Text,
|
|
1084
|
-
};
|
|
1085
|
-
let text = "Page one content here. Page two starts here and continues.";
|
|
1086
|
-
|
|
1087
|
-
let boundaries = vec![
|
|
1088
|
-
PageBoundary {
|
|
1089
|
-
byte_start: 0,
|
|
1090
|
-
byte_end: 21,
|
|
1091
|
-
page_number: 1,
|
|
1092
|
-
},
|
|
1093
|
-
PageBoundary {
|
|
1094
|
-
byte_start: 22,
|
|
1095
|
-
byte_end: 58,
|
|
1096
|
-
page_number: 2,
|
|
1097
|
-
},
|
|
1098
|
-
];
|
|
1099
|
-
|
|
1100
|
-
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1101
|
-
assert!(result.chunks.len() >= 2);
|
|
1102
|
-
|
|
1103
|
-
assert_eq!(result.chunks[0].metadata.first_page, Some(1));
|
|
1104
|
-
|
|
1105
|
-
let last_chunk = result.chunks.last().unwrap();
|
|
1106
|
-
assert_eq!(last_chunk.metadata.last_page, Some(2));
|
|
1107
|
-
}
|
|
1108
|
-
|
|
1109
|
-
#[test]
|
|
1110
|
-
fn test_chunk_without_page_boundaries() {
|
|
1111
|
-
let config = ChunkingConfig {
|
|
1112
|
-
max_characters: 30,
|
|
1113
|
-
overlap: 5,
|
|
1114
|
-
trim: true,
|
|
1115
|
-
chunker_type: ChunkerType::Text,
|
|
1116
|
-
};
|
|
1117
|
-
let text = "This is some test content that should be split into multiple chunks.";
|
|
1118
|
-
|
|
1119
|
-
let result = chunk_text(text, &config, None).unwrap();
|
|
1120
|
-
assert!(result.chunks.len() >= 2);
|
|
1121
|
-
|
|
1122
|
-
for chunk in &result.chunks {
|
|
1123
|
-
assert_eq!(chunk.metadata.first_page, None);
|
|
1124
|
-
assert_eq!(chunk.metadata.last_page, None);
|
|
1125
|
-
}
|
|
1126
|
-
}
|
|
1127
|
-
|
|
1128
|
-
#[test]
|
|
1129
|
-
fn test_chunk_empty_boundaries() {
|
|
1130
|
-
let config = ChunkingConfig {
|
|
1131
|
-
max_characters: 30,
|
|
1132
|
-
overlap: 5,
|
|
1133
|
-
trim: true,
|
|
1134
|
-
chunker_type: ChunkerType::Text,
|
|
1135
|
-
};
|
|
1136
|
-
let text = "Some text content here.";
|
|
1137
|
-
let boundaries: Vec<PageBoundary> = vec![];
|
|
1138
|
-
|
|
1139
|
-
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1140
|
-
assert_eq!(result.chunks.len(), 1);
|
|
1141
|
-
|
|
1142
|
-
assert_eq!(result.chunks[0].metadata.first_page, None);
|
|
1143
|
-
assert_eq!(result.chunks[0].metadata.last_page, None);
|
|
1144
|
-
}
|
|
1145
|
-
|
|
1146
|
-
#[test]
|
|
1147
|
-
fn test_chunk_spanning_multiple_pages() {
|
|
1148
|
-
use crate::types::PageBoundary;
|
|
1149
|
-
|
|
1150
|
-
let config = ChunkingConfig {
|
|
1151
|
-
max_characters: 50,
|
|
1152
|
-
overlap: 5,
|
|
1153
|
-
trim: false,
|
|
1154
|
-
chunker_type: ChunkerType::Text,
|
|
1155
|
-
};
|
|
1156
|
-
let text = "0123456789 AAAAAAAAAA 1111111111 BBBBBBBBBB 2222222222";
|
|
1157
|
-
|
|
1158
|
-
let boundaries = vec![
|
|
1159
|
-
PageBoundary {
|
|
1160
|
-
byte_start: 0,
|
|
1161
|
-
byte_end: 20,
|
|
1162
|
-
page_number: 1,
|
|
1163
|
-
},
|
|
1164
|
-
PageBoundary {
|
|
1165
|
-
byte_start: 20,
|
|
1166
|
-
byte_end: 40,
|
|
1167
|
-
page_number: 2,
|
|
1168
|
-
},
|
|
1169
|
-
PageBoundary {
|
|
1170
|
-
byte_start: 40,
|
|
1171
|
-
byte_end: 54,
|
|
1172
|
-
page_number: 3,
|
|
1173
|
-
},
|
|
1174
|
-
];
|
|
1175
|
-
|
|
1176
|
-
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1177
|
-
assert!(result.chunks.len() >= 2);
|
|
1178
|
-
|
|
1179
|
-
for chunk in &result.chunks {
|
|
1180
|
-
assert!(chunk.metadata.first_page.is_some() || chunk.metadata.last_page.is_some());
|
|
1181
|
-
}
|
|
1182
|
-
}
|
|
1183
|
-
|
|
1184
|
-
#[test]
|
|
1185
|
-
fn test_chunk_text_with_invalid_boundary_range() {
|
|
1186
|
-
use crate::types::PageBoundary;
|
|
1187
|
-
|
|
1188
|
-
let config = ChunkingConfig {
|
|
1189
|
-
max_characters: 30,
|
|
1190
|
-
overlap: 5,
|
|
1191
|
-
trim: true,
|
|
1192
|
-
chunker_type: ChunkerType::Text,
|
|
1193
|
-
};
|
|
1194
|
-
let text = "Page one content here. Page two content.";
|
|
1195
|
-
|
|
1196
|
-
let boundaries = vec![PageBoundary {
|
|
1197
|
-
byte_start: 10,
|
|
1198
|
-
byte_end: 5,
|
|
1199
|
-
page_number: 1,
|
|
1200
|
-
}];
|
|
1201
|
-
|
|
1202
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1203
|
-
assert!(result.is_err());
|
|
1204
|
-
let err = result.unwrap_err();
|
|
1205
|
-
assert!(err.to_string().contains("Invalid boundary range"));
|
|
1206
|
-
assert!(err.to_string().contains("byte_start"));
|
|
1207
|
-
}
|
|
1208
|
-
|
|
1209
|
-
#[test]
|
|
1210
|
-
fn test_chunk_text_with_unsorted_boundaries() {
|
|
1211
|
-
use crate::types::PageBoundary;
|
|
1212
|
-
|
|
1213
|
-
let config = ChunkingConfig {
|
|
1214
|
-
max_characters: 30,
|
|
1215
|
-
overlap: 5,
|
|
1216
|
-
trim: true,
|
|
1217
|
-
chunker_type: ChunkerType::Text,
|
|
1218
|
-
};
|
|
1219
|
-
let text = "Page one content here. Page two content.";
|
|
1220
|
-
|
|
1221
|
-
let boundaries = vec![
|
|
1222
|
-
PageBoundary {
|
|
1223
|
-
byte_start: 22,
|
|
1224
|
-
byte_end: 40,
|
|
1225
|
-
page_number: 2,
|
|
1226
|
-
},
|
|
1227
|
-
PageBoundary {
|
|
1228
|
-
byte_start: 0,
|
|
1229
|
-
byte_end: 21,
|
|
1230
|
-
page_number: 1,
|
|
1231
|
-
},
|
|
1232
|
-
];
|
|
1233
|
-
|
|
1234
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1235
|
-
assert!(result.is_err());
|
|
1236
|
-
let err = result.unwrap_err();
|
|
1237
|
-
assert!(err.to_string().contains("not sorted"));
|
|
1238
|
-
assert!(err.to_string().contains("boundaries"));
|
|
1239
|
-
}
|
|
1240
|
-
|
|
1241
|
-
#[test]
|
|
1242
|
-
fn test_chunk_text_with_overlapping_boundaries() {
|
|
1243
|
-
use crate::types::PageBoundary;
|
|
1244
|
-
|
|
1245
|
-
let config = ChunkingConfig {
|
|
1246
|
-
max_characters: 30,
|
|
1247
|
-
overlap: 5,
|
|
1248
|
-
trim: true,
|
|
1249
|
-
chunker_type: ChunkerType::Text,
|
|
1250
|
-
};
|
|
1251
|
-
let text = "Page one content here. Page two content.";
|
|
1252
|
-
|
|
1253
|
-
let boundaries = vec![
|
|
1254
|
-
PageBoundary {
|
|
1255
|
-
byte_start: 0,
|
|
1256
|
-
byte_end: 25,
|
|
1257
|
-
page_number: 1,
|
|
1258
|
-
},
|
|
1259
|
-
PageBoundary {
|
|
1260
|
-
byte_start: 20,
|
|
1261
|
-
byte_end: 40,
|
|
1262
|
-
page_number: 2,
|
|
1263
|
-
},
|
|
1264
|
-
];
|
|
1265
|
-
|
|
1266
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1267
|
-
assert!(result.is_err());
|
|
1268
|
-
let err = result.unwrap_err();
|
|
1269
|
-
assert!(err.to_string().contains("Overlapping"));
|
|
1270
|
-
assert!(err.to_string().contains("boundaries"));
|
|
1271
|
-
}
|
|
1272
|
-
|
|
1273
|
-
#[test]
|
|
1274
|
-
fn test_calculate_page_range_with_invalid_boundaries() {
|
|
1275
|
-
use crate::types::PageBoundary;
|
|
1276
|
-
|
|
1277
|
-
let boundaries = vec![PageBoundary {
|
|
1278
|
-
byte_start: 15,
|
|
1279
|
-
byte_end: 10,
|
|
1280
|
-
page_number: 1,
|
|
1281
|
-
}];
|
|
1282
|
-
|
|
1283
|
-
let result = calculate_page_range(0, 20, &boundaries);
|
|
1284
|
-
assert!(result.is_err());
|
|
1285
|
-
let err = result.unwrap_err();
|
|
1286
|
-
assert!(err.to_string().contains("Invalid boundary range"));
|
|
1287
|
-
}
|
|
1288
|
-
|
|
1289
|
-
#[test]
|
|
1290
|
-
fn test_validate_page_boundaries_valid() {
|
|
1291
|
-
use crate::types::PageBoundary;
|
|
1292
|
-
|
|
1293
|
-
let boundaries = vec![
|
|
1294
|
-
PageBoundary {
|
|
1295
|
-
byte_start: 0,
|
|
1296
|
-
byte_end: 20,
|
|
1297
|
-
page_number: 1,
|
|
1298
|
-
},
|
|
1299
|
-
PageBoundary {
|
|
1300
|
-
byte_start: 20,
|
|
1301
|
-
byte_end: 40,
|
|
1302
|
-
page_number: 2,
|
|
1303
|
-
},
|
|
1304
|
-
PageBoundary {
|
|
1305
|
-
byte_start: 40,
|
|
1306
|
-
byte_end: 60,
|
|
1307
|
-
page_number: 3,
|
|
1308
|
-
},
|
|
1309
|
-
];
|
|
1310
|
-
|
|
1311
|
-
let result = chunk_text(
|
|
1312
|
-
"x".repeat(60).as_str(),
|
|
1313
|
-
&ChunkingConfig {
|
|
1314
|
-
max_characters: 30,
|
|
1315
|
-
overlap: 5,
|
|
1316
|
-
trim: false,
|
|
1317
|
-
chunker_type: ChunkerType::Text,
|
|
1318
|
-
},
|
|
1319
|
-
Some(&boundaries),
|
|
1320
|
-
);
|
|
1321
|
-
assert!(result.is_ok());
|
|
1322
|
-
}
|
|
1323
|
-
|
|
1324
|
-
#[test]
|
|
1325
|
-
fn test_validate_page_boundaries_empty() {
|
|
1326
|
-
let boundaries: Vec<PageBoundary> = vec![];
|
|
1327
|
-
let result = chunk_text(
|
|
1328
|
-
"Some test text",
|
|
1329
|
-
&ChunkingConfig {
|
|
1330
|
-
max_characters: 30,
|
|
1331
|
-
overlap: 5,
|
|
1332
|
-
trim: true,
|
|
1333
|
-
chunker_type: ChunkerType::Text,
|
|
1334
|
-
},
|
|
1335
|
-
Some(&boundaries),
|
|
1336
|
-
);
|
|
1337
|
-
assert!(result.is_ok());
|
|
1338
|
-
}
|
|
1339
|
-
|
|
1340
|
-
#[test]
|
|
1341
|
-
fn test_page_boundaries_with_gaps() {
|
|
1342
|
-
use crate::types::PageBoundary;
|
|
1343
|
-
|
|
1344
|
-
let boundaries = vec![
|
|
1345
|
-
PageBoundary {
|
|
1346
|
-
byte_start: 0,
|
|
1347
|
-
byte_end: 10,
|
|
1348
|
-
page_number: 1,
|
|
1349
|
-
},
|
|
1350
|
-
PageBoundary {
|
|
1351
|
-
byte_start: 15,
|
|
1352
|
-
byte_end: 25,
|
|
1353
|
-
page_number: 2,
|
|
1354
|
-
},
|
|
1355
|
-
];
|
|
1356
|
-
|
|
1357
|
-
let text = "0123456789XXXXX0123456789";
|
|
1358
|
-
let result = chunk_text(
|
|
1359
|
-
text,
|
|
1360
|
-
&ChunkingConfig {
|
|
1361
|
-
max_characters: 30,
|
|
1362
|
-
overlap: 5,
|
|
1363
|
-
trim: false,
|
|
1364
|
-
chunker_type: ChunkerType::Text,
|
|
1365
|
-
},
|
|
1366
|
-
Some(&boundaries),
|
|
1367
|
-
);
|
|
1368
|
-
assert!(result.is_ok());
|
|
1369
|
-
}
|
|
1370
|
-
|
|
1371
|
-
#[test]
|
|
1372
|
-
fn test_chunk_with_same_start_and_end() {
|
|
1373
|
-
use crate::types::PageBoundary;
|
|
1374
|
-
|
|
1375
|
-
let boundaries = vec![PageBoundary {
|
|
1376
|
-
byte_start: 10,
|
|
1377
|
-
byte_end: 10,
|
|
1378
|
-
page_number: 1,
|
|
1379
|
-
}];
|
|
1380
|
-
|
|
1381
|
-
let result = chunk_text(
|
|
1382
|
-
"test content here",
|
|
1383
|
-
&ChunkingConfig {
|
|
1384
|
-
max_characters: 30,
|
|
1385
|
-
overlap: 5,
|
|
1386
|
-
trim: true,
|
|
1387
|
-
chunker_type: ChunkerType::Text,
|
|
1388
|
-
},
|
|
1389
|
-
Some(&boundaries),
|
|
1390
|
-
);
|
|
1391
|
-
assert!(result.is_err());
|
|
1392
|
-
let err = result.unwrap_err();
|
|
1393
|
-
assert!(err.to_string().contains("Invalid boundary range"));
|
|
1394
|
-
}
|
|
1395
|
-
|
|
1396
|
-
#[test]
|
|
1397
|
-
fn test_multiple_overlapping_errors() {
|
|
1398
|
-
use crate::types::PageBoundary;
|
|
1399
|
-
|
|
1400
|
-
let text = "This is a longer test content string that spans more bytes";
|
|
1401
|
-
let boundaries = vec![
|
|
1402
|
-
PageBoundary {
|
|
1403
|
-
byte_start: 20,
|
|
1404
|
-
byte_end: 40,
|
|
1405
|
-
page_number: 2,
|
|
1406
|
-
},
|
|
1407
|
-
PageBoundary {
|
|
1408
|
-
byte_start: 10,
|
|
1409
|
-
byte_end: 35,
|
|
1410
|
-
page_number: 1,
|
|
1411
|
-
},
|
|
1412
|
-
];
|
|
1413
|
-
|
|
1414
|
-
let result = chunk_text(
|
|
1415
|
-
text,
|
|
1416
|
-
&ChunkingConfig {
|
|
1417
|
-
max_characters: 30,
|
|
1418
|
-
overlap: 5,
|
|
1419
|
-
trim: true,
|
|
1420
|
-
chunker_type: ChunkerType::Text,
|
|
1421
|
-
},
|
|
1422
|
-
Some(&boundaries),
|
|
1423
|
-
);
|
|
1424
|
-
assert!(result.is_err());
|
|
1425
|
-
assert!(result.unwrap_err().to_string().contains("not sorted"));
|
|
1426
|
-
}
|
|
1427
|
-
|
|
1428
|
-
#[test]
|
|
1429
|
-
fn test_chunk_with_pages_basic() {
|
|
1430
|
-
use crate::types::PageBoundary;
|
|
1431
|
-
|
|
1432
|
-
let config = ChunkingConfig {
|
|
1433
|
-
max_characters: 25,
|
|
1434
|
-
overlap: 5,
|
|
1435
|
-
trim: true,
|
|
1436
|
-
chunker_type: ChunkerType::Text,
|
|
1437
|
-
};
|
|
1438
|
-
let text = "First page content here.Second page content here.Third page.";
|
|
1439
|
-
|
|
1440
|
-
let boundaries = vec![
|
|
1441
|
-
PageBoundary {
|
|
1442
|
-
byte_start: 0,
|
|
1443
|
-
byte_end: 24,
|
|
1444
|
-
page_number: 1,
|
|
1445
|
-
},
|
|
1446
|
-
PageBoundary {
|
|
1447
|
-
byte_start: 24,
|
|
1448
|
-
byte_end: 50,
|
|
1449
|
-
page_number: 2,
|
|
1450
|
-
},
|
|
1451
|
-
PageBoundary {
|
|
1452
|
-
byte_start: 50,
|
|
1453
|
-
byte_end: 60,
|
|
1454
|
-
page_number: 3,
|
|
1455
|
-
},
|
|
1456
|
-
];
|
|
1457
|
-
|
|
1458
|
-
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1459
|
-
|
|
1460
|
-
if !result.chunks.is_empty() {
|
|
1461
|
-
assert!(result.chunks[0].metadata.first_page.is_some());
|
|
1462
|
-
}
|
|
1463
|
-
}
|
|
1464
|
-
|
|
1465
|
-
#[test]
|
|
1466
|
-
fn test_chunk_with_pages_single_page_chunk() {
|
|
1467
|
-
use crate::types::PageBoundary;
|
|
1468
|
-
|
|
1469
|
-
let config = ChunkingConfig {
|
|
1470
|
-
max_characters: 100,
|
|
1471
|
-
overlap: 10,
|
|
1472
|
-
trim: true,
|
|
1473
|
-
chunker_type: ChunkerType::Text,
|
|
1474
|
-
};
|
|
1475
|
-
let text = "All content on single page fits in one chunk.";
|
|
1476
|
-
|
|
1477
|
-
let boundaries = vec![PageBoundary {
|
|
1478
|
-
byte_start: 0,
|
|
1479
|
-
byte_end: 45,
|
|
1480
|
-
page_number: 1,
|
|
1481
|
-
}];
|
|
1482
|
-
|
|
1483
|
-
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1484
|
-
assert_eq!(result.chunks.len(), 1);
|
|
1485
|
-
assert_eq!(result.chunks[0].metadata.first_page, Some(1));
|
|
1486
|
-
assert_eq!(result.chunks[0].metadata.last_page, Some(1));
|
|
1487
|
-
}
|
|
1488
|
-
|
|
1489
|
-
#[test]
|
|
1490
|
-
fn test_chunk_with_pages_no_overlap() {
|
|
1491
|
-
use crate::types::PageBoundary;
|
|
1492
|
-
|
|
1493
|
-
let config = ChunkingConfig {
|
|
1494
|
-
max_characters: 20,
|
|
1495
|
-
overlap: 0,
|
|
1496
|
-
trim: false,
|
|
1497
|
-
chunker_type: ChunkerType::Text,
|
|
1498
|
-
};
|
|
1499
|
-
let text = "AAAAA BBBBB CCCCC DDDDD";
|
|
1500
|
-
|
|
1501
|
-
let boundaries = vec![
|
|
1502
|
-
PageBoundary {
|
|
1503
|
-
byte_start: 0,
|
|
1504
|
-
byte_end: 11,
|
|
1505
|
-
page_number: 1,
|
|
1506
|
-
},
|
|
1507
|
-
PageBoundary {
|
|
1508
|
-
byte_start: 11,
|
|
1509
|
-
byte_end: 23,
|
|
1510
|
-
page_number: 2,
|
|
1511
|
-
},
|
|
1512
|
-
];
|
|
1513
|
-
|
|
1514
|
-
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1515
|
-
assert!(!result.chunks.is_empty());
|
|
1516
|
-
|
|
1517
|
-
for chunk in &result.chunks {
|
|
1518
|
-
if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
|
|
1519
|
-
assert!(first <= last);
|
|
1520
|
-
}
|
|
1521
|
-
}
|
|
1522
|
-
}
|
|
1523
|
-
|
|
1524
|
-
#[test]
|
|
1525
|
-
fn test_calculate_page_range_within_page() {
|
|
1526
|
-
let boundaries = vec![
|
|
1527
|
-
PageBoundary {
|
|
1528
|
-
byte_start: 0,
|
|
1529
|
-
byte_end: 100,
|
|
1530
|
-
page_number: 1,
|
|
1531
|
-
},
|
|
1532
|
-
PageBoundary {
|
|
1533
|
-
byte_start: 100,
|
|
1534
|
-
byte_end: 200,
|
|
1535
|
-
page_number: 2,
|
|
1536
|
-
},
|
|
1537
|
-
];
|
|
1538
|
-
|
|
1539
|
-
let (first, last) = calculate_page_range(10, 50, &boundaries).unwrap();
|
|
1540
|
-
assert_eq!(first, Some(1));
|
|
1541
|
-
assert_eq!(last, Some(1));
|
|
1542
|
-
}
|
|
1543
|
-
|
|
1544
|
-
#[test]
|
|
1545
|
-
fn test_calculate_page_range_spanning_pages() {
|
|
1546
|
-
let boundaries = vec![
|
|
1547
|
-
PageBoundary {
|
|
1548
|
-
byte_start: 0,
|
|
1549
|
-
byte_end: 100,
|
|
1550
|
-
page_number: 1,
|
|
1551
|
-
},
|
|
1552
|
-
PageBoundary {
|
|
1553
|
-
byte_start: 100,
|
|
1554
|
-
byte_end: 200,
|
|
1555
|
-
page_number: 2,
|
|
1556
|
-
},
|
|
1557
|
-
];
|
|
1558
|
-
|
|
1559
|
-
let (first, last) = calculate_page_range(50, 150, &boundaries).unwrap();
|
|
1560
|
-
assert_eq!(first, Some(1));
|
|
1561
|
-
assert_eq!(last, Some(2));
|
|
1562
|
-
}
|
|
1563
|
-
|
|
1564
|
-
#[test]
|
|
1565
|
-
fn test_calculate_page_range_empty_boundaries() {
|
|
1566
|
-
let boundaries: Vec<PageBoundary> = vec![];
|
|
1567
|
-
|
|
1568
|
-
let (first, last) = calculate_page_range(0, 50, &boundaries).unwrap();
|
|
1569
|
-
assert_eq!(first, None);
|
|
1570
|
-
assert_eq!(last, None);
|
|
1571
|
-
}
|
|
1572
|
-
|
|
1573
|
-
#[test]
|
|
1574
|
-
fn test_calculate_page_range_no_overlap() {
|
|
1575
|
-
let boundaries = vec![
|
|
1576
|
-
PageBoundary {
|
|
1577
|
-
byte_start: 0,
|
|
1578
|
-
byte_end: 100,
|
|
1579
|
-
page_number: 1,
|
|
1580
|
-
},
|
|
1581
|
-
PageBoundary {
|
|
1582
|
-
byte_start: 100,
|
|
1583
|
-
byte_end: 200,
|
|
1584
|
-
page_number: 2,
|
|
1585
|
-
},
|
|
1586
|
-
];
|
|
1587
|
-
|
|
1588
|
-
let (first, last) = calculate_page_range(200, 250, &boundaries).unwrap();
|
|
1589
|
-
assert_eq!(first, None);
|
|
1590
|
-
assert_eq!(last, None);
|
|
1591
|
-
}
|
|
1592
|
-
|
|
1593
|
-
#[test]
|
|
1594
|
-
fn test_calculate_page_range_three_pages() {
|
|
1595
|
-
let boundaries = vec![
|
|
1596
|
-
PageBoundary {
|
|
1597
|
-
byte_start: 0,
|
|
1598
|
-
byte_end: 100,
|
|
1599
|
-
page_number: 1,
|
|
1600
|
-
},
|
|
1601
|
-
PageBoundary {
|
|
1602
|
-
byte_start: 100,
|
|
1603
|
-
byte_end: 200,
|
|
1604
|
-
page_number: 2,
|
|
1605
|
-
},
|
|
1606
|
-
PageBoundary {
|
|
1607
|
-
byte_start: 200,
|
|
1608
|
-
byte_end: 300,
|
|
1609
|
-
page_number: 3,
|
|
1610
|
-
},
|
|
1611
|
-
];
|
|
1612
|
-
|
|
1613
|
-
let (first, last) = calculate_page_range(50, 250, &boundaries).unwrap();
|
|
1614
|
-
assert_eq!(first, Some(1));
|
|
1615
|
-
assert_eq!(last, Some(3));
|
|
1616
|
-
}
|
|
1617
|
-
|
|
1618
|
-
#[test]
|
|
1619
|
-
fn test_chunk_metadata_page_range_accuracy() {
|
|
1620
|
-
use crate::types::PageBoundary;
|
|
1621
|
-
|
|
1622
|
-
let config = ChunkingConfig {
|
|
1623
|
-
max_characters: 30,
|
|
1624
|
-
overlap: 5,
|
|
1625
|
-
trim: true,
|
|
1626
|
-
chunker_type: ChunkerType::Text,
|
|
1627
|
-
};
|
|
1628
|
-
let text = "Page One Content Here.Page Two.";
|
|
1629
|
-
|
|
1630
|
-
let boundaries = vec![
|
|
1631
|
-
PageBoundary {
|
|
1632
|
-
byte_start: 0,
|
|
1633
|
-
byte_end: 21,
|
|
1634
|
-
page_number: 1,
|
|
1635
|
-
},
|
|
1636
|
-
PageBoundary {
|
|
1637
|
-
byte_start: 21,
|
|
1638
|
-
byte_end: 31,
|
|
1639
|
-
page_number: 2,
|
|
1640
|
-
},
|
|
1641
|
-
];
|
|
1642
|
-
|
|
1643
|
-
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1644
|
-
|
|
1645
|
-
for chunk in &result.chunks {
|
|
1646
|
-
assert_eq!(chunk.metadata.byte_end - chunk.metadata.byte_start, chunk.content.len());
|
|
1647
|
-
}
|
|
1648
|
-
}
|
|
1649
|
-
|
|
1650
|
-
#[test]
|
|
1651
|
-
fn test_chunk_page_range_boundary_edge_cases() {
|
|
1652
|
-
use crate::types::PageBoundary;
|
|
1653
|
-
|
|
1654
|
-
let config = ChunkingConfig {
|
|
1655
|
-
max_characters: 10,
|
|
1656
|
-
overlap: 2,
|
|
1657
|
-
trim: false,
|
|
1658
|
-
chunker_type: ChunkerType::Text,
|
|
1659
|
-
};
|
|
1660
|
-
let text = "0123456789ABCDEFGHIJ";
|
|
1661
|
-
|
|
1662
|
-
let boundaries = vec![
|
|
1663
|
-
PageBoundary {
|
|
1664
|
-
byte_start: 0,
|
|
1665
|
-
byte_end: 10,
|
|
1666
|
-
page_number: 1,
|
|
1667
|
-
},
|
|
1668
|
-
PageBoundary {
|
|
1669
|
-
byte_start: 10,
|
|
1670
|
-
byte_end: 20,
|
|
1671
|
-
page_number: 2,
|
|
1672
|
-
},
|
|
1673
|
-
];
|
|
1674
|
-
|
|
1675
|
-
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1676
|
-
|
|
1677
|
-
for chunk in &result.chunks {
|
|
1678
|
-
let on_page1 = chunk.metadata.byte_start < 10;
|
|
1679
|
-
let on_page2 = chunk.metadata.byte_end > 10;
|
|
1680
|
-
|
|
1681
|
-
if on_page1 && on_page2 {
|
|
1682
|
-
assert_eq!(chunk.metadata.first_page, Some(1));
|
|
1683
|
-
assert_eq!(chunk.metadata.last_page, Some(2));
|
|
1684
|
-
} else if on_page1 {
|
|
1685
|
-
assert_eq!(chunk.metadata.first_page, Some(1));
|
|
1686
|
-
} else if on_page2 {
|
|
1687
|
-
assert_eq!(chunk.metadata.first_page, Some(2));
|
|
1688
|
-
}
|
|
1689
|
-
}
|
|
1690
|
-
}
|
|
1691
|
-
|
|
1692
|
-
#[test]
|
|
1693
|
-
fn test_validate_utf8_boundaries_valid_ascii() {
|
|
1694
|
-
use crate::types::PageBoundary;
|
|
1695
|
-
|
|
1696
|
-
let text = "This is ASCII text.";
|
|
1697
|
-
let boundaries = vec![
|
|
1698
|
-
PageBoundary {
|
|
1699
|
-
byte_start: 0,
|
|
1700
|
-
byte_end: 10,
|
|
1701
|
-
page_number: 1,
|
|
1702
|
-
},
|
|
1703
|
-
PageBoundary {
|
|
1704
|
-
byte_start: 10,
|
|
1705
|
-
byte_end: 19,
|
|
1706
|
-
page_number: 2,
|
|
1707
|
-
},
|
|
1708
|
-
];
|
|
1709
|
-
|
|
1710
|
-
let result = chunk_text(text, &ChunkingConfig::default(), Some(&boundaries));
|
|
1711
|
-
assert!(result.is_ok());
|
|
1712
|
-
}
|
|
1713
|
-
|
|
1714
|
-
#[test]
|
|
1715
|
-
fn test_validate_utf8_boundaries_valid_emoji() {
|
|
1716
|
-
use crate::types::PageBoundary;
|
|
1717
|
-
|
|
1718
|
-
let text = "Hello 👋 World 🌍 End";
|
|
1719
|
-
let config = ChunkingConfig::default();
|
|
1720
|
-
|
|
1721
|
-
let boundaries = vec![
|
|
1722
|
-
PageBoundary {
|
|
1723
|
-
byte_start: 0,
|
|
1724
|
-
byte_end: 11,
|
|
1725
|
-
page_number: 1,
|
|
1726
|
-
},
|
|
1727
|
-
PageBoundary {
|
|
1728
|
-
byte_start: 11,
|
|
1729
|
-
byte_end: 25,
|
|
1730
|
-
page_number: 2,
|
|
1731
|
-
},
|
|
1732
|
-
];
|
|
1733
|
-
|
|
1734
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1735
|
-
assert!(result.is_ok());
|
|
1736
|
-
}
|
|
1737
|
-
|
|
1738
|
-
#[test]
|
|
1739
|
-
fn test_validate_utf8_boundaries_valid_cjk() {
|
|
1740
|
-
use crate::types::PageBoundary;
|
|
1741
|
-
|
|
1742
|
-
let text = "你好世界 こんにちは 안녕하세요";
|
|
1743
|
-
let config = ChunkingConfig::default();
|
|
1744
|
-
|
|
1745
|
-
let boundaries = vec![
|
|
1746
|
-
PageBoundary {
|
|
1747
|
-
byte_start: 0,
|
|
1748
|
-
byte_end: 13,
|
|
1749
|
-
page_number: 1,
|
|
1750
|
-
},
|
|
1751
|
-
PageBoundary {
|
|
1752
|
-
byte_start: 13,
|
|
1753
|
-
byte_end: 44,
|
|
1754
|
-
page_number: 2,
|
|
1755
|
-
},
|
|
1756
|
-
];
|
|
1757
|
-
|
|
1758
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1759
|
-
assert!(result.is_ok());
|
|
1760
|
-
}
|
|
1761
|
-
|
|
1762
|
-
#[test]
|
|
1763
|
-
fn test_validate_utf8_boundaries_invalid_mid_emoji() {
|
|
1764
|
-
use crate::types::PageBoundary;
|
|
1765
|
-
|
|
1766
|
-
let text = "Hello 👋 World";
|
|
1767
|
-
let boundaries = vec![PageBoundary {
|
|
1768
|
-
byte_start: 0,
|
|
1769
|
-
byte_end: 7,
|
|
1770
|
-
page_number: 1,
|
|
1771
|
-
}];
|
|
1772
|
-
|
|
1773
|
-
let config = ChunkingConfig::default();
|
|
1774
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1775
|
-
assert!(result.is_err());
|
|
1776
|
-
let err = result.unwrap_err();
|
|
1777
|
-
assert!(err.to_string().contains("UTF-8 character boundary"));
|
|
1778
|
-
assert!(err.to_string().contains("byte_end=7"));
|
|
1779
|
-
}
|
|
1780
|
-
|
|
1781
|
-
#[test]
|
|
1782
|
-
fn test_validate_utf8_boundaries_invalid_mid_multibyte_cjk() {
|
|
1783
|
-
use crate::types::PageBoundary;
|
|
1784
|
-
|
|
1785
|
-
let text = "中文文本";
|
|
1786
|
-
let boundaries = vec![PageBoundary {
|
|
1787
|
-
byte_start: 0,
|
|
1788
|
-
byte_end: 1,
|
|
1789
|
-
page_number: 1,
|
|
1790
|
-
}];
|
|
1791
|
-
|
|
1792
|
-
let config = ChunkingConfig::default();
|
|
1793
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1794
|
-
assert!(result.is_err());
|
|
1795
|
-
let err = result.unwrap_err();
|
|
1796
|
-
assert!(err.to_string().contains("UTF-8 character boundary"));
|
|
1797
|
-
}
|
|
1798
|
-
|
|
1799
|
-
#[test]
|
|
1800
|
-
fn test_validate_utf8_boundaries_byte_start_exceeds_length() {
|
|
1801
|
-
use crate::types::PageBoundary;
|
|
1802
|
-
|
|
1803
|
-
let text = "Short";
|
|
1804
|
-
let boundaries = vec![
|
|
1805
|
-
PageBoundary {
|
|
1806
|
-
byte_start: 0,
|
|
1807
|
-
byte_end: 3,
|
|
1808
|
-
page_number: 1,
|
|
1809
|
-
},
|
|
1810
|
-
PageBoundary {
|
|
1811
|
-
byte_start: 10,
|
|
1812
|
-
byte_end: 15,
|
|
1813
|
-
page_number: 2,
|
|
1814
|
-
},
|
|
1815
|
-
];
|
|
1816
|
-
|
|
1817
|
-
let config = ChunkingConfig::default();
|
|
1818
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1819
|
-
assert!(result.is_err());
|
|
1820
|
-
let err = result.unwrap_err();
|
|
1821
|
-
assert!(err.to_string().contains("exceeds text length"));
|
|
1822
|
-
}
|
|
1823
|
-
|
|
1824
|
-
#[test]
|
|
1825
|
-
fn test_validate_utf8_boundaries_byte_end_exceeds_length() {
|
|
1826
|
-
use crate::types::PageBoundary;
|
|
1827
|
-
|
|
1828
|
-
let text = "Short";
|
|
1829
|
-
let boundaries = vec![PageBoundary {
|
|
1830
|
-
byte_start: 0,
|
|
1831
|
-
byte_end: 100,
|
|
1832
|
-
page_number: 1,
|
|
1833
|
-
}];
|
|
1834
|
-
|
|
1835
|
-
let config = ChunkingConfig::default();
|
|
1836
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1837
|
-
assert!(result.is_err());
|
|
1838
|
-
let err = result.unwrap_err();
|
|
1839
|
-
assert!(err.to_string().contains("exceeds text length"));
|
|
1840
|
-
}
|
|
1841
|
-
|
|
1842
|
-
#[test]
|
|
1843
|
-
fn test_validate_utf8_boundaries_empty_boundaries() {
|
|
1844
|
-
use crate::types::PageBoundary;
|
|
1845
|
-
|
|
1846
|
-
let text = "Some text";
|
|
1847
|
-
let boundaries: Vec<PageBoundary> = vec![];
|
|
1848
|
-
|
|
1849
|
-
let config = ChunkingConfig::default();
|
|
1850
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1851
|
-
assert!(result.is_ok());
|
|
1852
|
-
}
|
|
1853
|
-
|
|
1854
|
-
#[test]
|
|
1855
|
-
fn test_validate_utf8_boundaries_at_text_boundaries() {
|
|
1856
|
-
use crate::types::PageBoundary;
|
|
1857
|
-
|
|
1858
|
-
let text = "Exact boundary test";
|
|
1859
|
-
let text_len = text.len();
|
|
1860
|
-
let boundaries = vec![PageBoundary {
|
|
1861
|
-
byte_start: 0,
|
|
1862
|
-
byte_end: text_len,
|
|
1863
|
-
page_number: 1,
|
|
1864
|
-
}];
|
|
1865
|
-
|
|
1866
|
-
let config = ChunkingConfig::default();
|
|
1867
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1868
|
-
assert!(result.is_ok());
|
|
1869
|
-
}
|
|
1870
|
-
|
|
1871
|
-
#[test]
|
|
1872
|
-
fn test_validate_utf8_boundaries_mixed_languages() {
|
|
1873
|
-
use crate::types::PageBoundary;
|
|
1874
|
-
|
|
1875
|
-
let text = "English text mixed with 中文 and français";
|
|
1876
|
-
let config = ChunkingConfig::default();
|
|
1877
|
-
|
|
1878
|
-
let boundaries = vec![
|
|
1879
|
-
PageBoundary {
|
|
1880
|
-
byte_start: 0,
|
|
1881
|
-
byte_end: 24,
|
|
1882
|
-
page_number: 1,
|
|
1883
|
-
},
|
|
1884
|
-
PageBoundary {
|
|
1885
|
-
byte_start: 24,
|
|
1886
|
-
byte_end: text.len(),
|
|
1887
|
-
page_number: 2,
|
|
1888
|
-
},
|
|
1889
|
-
];
|
|
1890
|
-
|
|
1891
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1892
|
-
assert!(result.is_ok());
|
|
1893
|
-
}
|
|
1894
|
-
|
|
1895
|
-
#[test]
|
|
1896
|
-
fn test_chunk_text_rejects_invalid_utf8_boundaries() {
|
|
1897
|
-
use crate::types::PageBoundary;
|
|
1898
|
-
|
|
1899
|
-
let text = "🌍🌎🌏 Three emoji planets";
|
|
1900
|
-
let config = ChunkingConfig::default();
|
|
1901
|
-
|
|
1902
|
-
let boundaries = vec![PageBoundary {
|
|
1903
|
-
byte_start: 0,
|
|
1904
|
-
byte_end: 1000,
|
|
1905
|
-
page_number: 1,
|
|
1906
|
-
}];
|
|
1907
|
-
|
|
1908
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1909
|
-
assert!(result.is_err());
|
|
1910
|
-
}
|
|
1911
|
-
|
|
1912
|
-
#[test]
|
|
1913
|
-
fn test_validate_utf8_boundaries_combining_diacriticals() {
|
|
1914
|
-
use crate::types::PageBoundary;
|
|
1915
|
-
|
|
1916
|
-
let text = "café";
|
|
1917
|
-
let config = ChunkingConfig::default();
|
|
1918
|
-
|
|
1919
|
-
let boundaries = vec![
|
|
1920
|
-
PageBoundary {
|
|
1921
|
-
byte_start: 0,
|
|
1922
|
-
byte_end: 2,
|
|
1923
|
-
page_number: 1,
|
|
1924
|
-
},
|
|
1925
|
-
PageBoundary {
|
|
1926
|
-
byte_start: 2,
|
|
1927
|
-
byte_end: text.len(),
|
|
1928
|
-
page_number: 2,
|
|
1929
|
-
},
|
|
1930
|
-
];
|
|
1931
|
-
|
|
1932
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1933
|
-
assert!(result.is_ok());
|
|
1934
|
-
}
|
|
1935
|
-
|
|
1936
|
-
#[test]
|
|
1937
|
-
fn test_validate_utf8_boundaries_error_messages_are_clear() {
|
|
1938
|
-
use crate::types::PageBoundary;
|
|
1939
|
-
|
|
1940
|
-
let text = "Test 👋 text";
|
|
1941
|
-
let config = ChunkingConfig::default();
|
|
1942
|
-
|
|
1943
|
-
let boundaries = vec![PageBoundary {
|
|
1944
|
-
byte_start: 0,
|
|
1945
|
-
byte_end: 6,
|
|
1946
|
-
page_number: 1,
|
|
1947
|
-
}];
|
|
1948
|
-
|
|
1949
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1950
|
-
assert!(result.is_err());
|
|
1951
|
-
let err = result.unwrap_err();
|
|
1952
|
-
let err_msg = err.to_string();
|
|
1953
|
-
assert!(err_msg.contains("UTF-8"));
|
|
1954
|
-
assert!(err_msg.contains("boundary"));
|
|
1955
|
-
assert!(err_msg.contains("6"));
|
|
1956
|
-
}
|
|
1957
|
-
|
|
1958
|
-
#[test]
|
|
1959
|
-
fn test_validate_utf8_boundaries_multiple_valid_boundaries() {
|
|
1960
|
-
use crate::types::PageBoundary;
|
|
1961
|
-
|
|
1962
|
-
let text = "First👋Second🌍Third";
|
|
1963
|
-
let config = ChunkingConfig::default();
|
|
1964
|
-
|
|
1965
|
-
let boundaries = vec![
|
|
1966
|
-
PageBoundary {
|
|
1967
|
-
byte_start: 0,
|
|
1968
|
-
byte_end: 5,
|
|
1969
|
-
page_number: 1,
|
|
1970
|
-
},
|
|
1971
|
-
PageBoundary {
|
|
1972
|
-
byte_start: 5,
|
|
1973
|
-
byte_end: 9,
|
|
1974
|
-
page_number: 2,
|
|
1975
|
-
},
|
|
1976
|
-
PageBoundary {
|
|
1977
|
-
byte_start: 9,
|
|
1978
|
-
byte_end: 15,
|
|
1979
|
-
page_number: 3,
|
|
1980
|
-
},
|
|
1981
|
-
PageBoundary {
|
|
1982
|
-
byte_start: 15,
|
|
1983
|
-
byte_end: 19,
|
|
1984
|
-
page_number: 4,
|
|
1985
|
-
},
|
|
1986
|
-
PageBoundary {
|
|
1987
|
-
byte_start: 19,
|
|
1988
|
-
byte_end: text.len(),
|
|
1989
|
-
page_number: 5,
|
|
1990
|
-
},
|
|
1991
|
-
];
|
|
1992
|
-
|
|
1993
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1994
|
-
assert!(result.is_ok());
|
|
1995
|
-
}
|
|
1996
|
-
|
|
1997
|
-
#[test]
|
|
1998
|
-
fn test_validate_utf8_boundaries_zero_start_and_end() {
|
|
1999
|
-
use crate::types::PageBoundary;
|
|
2000
|
-
|
|
2001
|
-
let text = "Text";
|
|
2002
|
-
let config = ChunkingConfig::default();
|
|
2003
|
-
|
|
2004
|
-
let boundaries = vec![PageBoundary {
|
|
2005
|
-
byte_start: 0,
|
|
2006
|
-
byte_end: 0,
|
|
2007
|
-
page_number: 1,
|
|
2008
|
-
}];
|
|
2009
|
-
|
|
2010
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
2011
|
-
assert!(result.is_err());
|
|
2012
|
-
}
|
|
2013
|
-
|
|
2014
|
-
#[test]
|
|
2015
|
-
fn test_utf8_boundaries_caching_with_many_boundaries() {
|
|
2016
|
-
use crate::types::PageBoundary;
|
|
2017
|
-
|
|
2018
|
-
let config = ChunkingConfig {
|
|
2019
|
-
max_characters: 500,
|
|
2020
|
-
overlap: 50,
|
|
2021
|
-
trim: true,
|
|
2022
|
-
chunker_type: ChunkerType::Text,
|
|
2023
|
-
};
|
|
2024
|
-
|
|
2025
|
-
let text = "🌍 Hello World ".repeat(200);
|
|
2026
|
-
let text_len = text.len();
|
|
2027
|
-
|
|
2028
|
-
let mut boundaries = vec![];
|
|
2029
|
-
let boundary_count = 10;
|
|
2030
|
-
let step = text_len / boundary_count;
|
|
2031
|
-
|
|
2032
|
-
for i in 0..boundary_count {
|
|
2033
|
-
let start = i * step;
|
|
2034
|
-
let end = if i == boundary_count - 1 {
|
|
2035
|
-
text_len
|
|
2036
|
-
} else {
|
|
2037
|
-
(i + 1) * step
|
|
2038
|
-
};
|
|
2039
|
-
|
|
2040
|
-
if start < end
|
|
2041
|
-
&& start <= text_len
|
|
2042
|
-
&& end <= text_len
|
|
2043
|
-
&& let Some(boundary_start) = text[..start].char_indices().last().map(|(idx, _)| idx)
|
|
2044
|
-
&& let Some(boundary_end) = text[..end].char_indices().last().map(|(idx, _)| idx)
|
|
2045
|
-
{
|
|
2046
|
-
boundaries.push(PageBoundary {
|
|
2047
|
-
byte_start: boundary_start,
|
|
2048
|
-
byte_end: boundary_end,
|
|
2049
|
-
page_number: i + 1,
|
|
2050
|
-
});
|
|
2051
|
-
}
|
|
2052
|
-
}
|
|
2053
|
-
|
|
2054
|
-
if !boundaries.is_empty() {
|
|
2055
|
-
let result = chunk_text(&text, &config, Some(&boundaries));
|
|
2056
|
-
assert!(
|
|
2057
|
-
result.is_ok(),
|
|
2058
|
-
"Failed to chunk text with {} boundaries",
|
|
2059
|
-
boundaries.len()
|
|
2060
|
-
);
|
|
2061
|
-
|
|
2062
|
-
let chunks = result.unwrap();
|
|
2063
|
-
assert!(chunks.chunk_count > 0);
|
|
2064
|
-
}
|
|
2065
|
-
}
|
|
2066
|
-
|
|
2067
|
-
#[test]
|
|
2068
|
-
fn test_utf8_boundaries_caching_large_document_with_emojis() {
|
|
2069
|
-
use crate::types::PageBoundary;
|
|
2070
|
-
|
|
2071
|
-
let config = ChunkingConfig {
|
|
2072
|
-
max_characters: 1000,
|
|
2073
|
-
overlap: 100,
|
|
2074
|
-
trim: true,
|
|
2075
|
-
chunker_type: ChunkerType::Text,
|
|
2076
|
-
};
|
|
2077
|
-
|
|
2078
|
-
let large_text = "This is a large document with lots of emoji: 🌍 🚀 💻 🎉 🔥 ✨ 🎨 🌟 ".repeat(100);
|
|
2079
|
-
|
|
2080
|
-
let all_indices: Vec<usize> = large_text.char_indices().map(|(idx, _)| idx).collect();
|
|
2081
|
-
|
|
2082
|
-
let third_idx = all_indices.len() / 3;
|
|
2083
|
-
let two_thirds_idx = (2 * all_indices.len()) / 3;
|
|
2084
|
-
|
|
2085
|
-
let boundary_start_1 = if third_idx < all_indices.len() {
|
|
2086
|
-
all_indices[third_idx]
|
|
2087
|
-
} else {
|
|
2088
|
-
large_text.len()
|
|
2089
|
-
};
|
|
2090
|
-
|
|
2091
|
-
let boundary_start_2 = if two_thirds_idx < all_indices.len() {
|
|
2092
|
-
all_indices[two_thirds_idx]
|
|
2093
|
-
} else {
|
|
2094
|
-
large_text.len()
|
|
2095
|
-
};
|
|
2096
|
-
|
|
2097
|
-
let boundaries = vec![
|
|
2098
|
-
PageBoundary {
|
|
2099
|
-
byte_start: 0,
|
|
2100
|
-
byte_end: boundary_start_1,
|
|
2101
|
-
page_number: 1,
|
|
2102
|
-
},
|
|
2103
|
-
PageBoundary {
|
|
2104
|
-
byte_start: boundary_start_1,
|
|
2105
|
-
byte_end: boundary_start_2,
|
|
2106
|
-
page_number: 2,
|
|
2107
|
-
},
|
|
2108
|
-
PageBoundary {
|
|
2109
|
-
byte_start: boundary_start_2,
|
|
2110
|
-
byte_end: large_text.len(),
|
|
2111
|
-
page_number: 3,
|
|
2112
|
-
},
|
|
2113
|
-
];
|
|
2114
|
-
|
|
2115
|
-
let result = chunk_text(&large_text, &config, Some(&boundaries));
|
|
2116
|
-
assert!(result.is_ok());
|
|
2117
|
-
|
|
2118
|
-
let chunks = result.unwrap();
|
|
2119
|
-
assert!(!chunks.chunks.is_empty());
|
|
2120
|
-
|
|
2121
|
-
for chunk in &chunks.chunks {
|
|
2122
|
-
assert!(!chunk.content.is_empty());
|
|
2123
|
-
if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
|
|
2124
|
-
assert!(first <= last);
|
|
2125
|
-
}
|
|
2126
|
-
}
|
|
2127
|
-
}
|
|
2128
|
-
|
|
2129
|
-
#[test]
|
|
2130
|
-
fn test_adaptive_validation_small_boundary_set() {
|
|
2131
|
-
use crate::types::PageBoundary;
|
|
2132
|
-
|
|
2133
|
-
let config = ChunkingConfig {
|
|
2134
|
-
max_characters: 100,
|
|
2135
|
-
overlap: 10,
|
|
2136
|
-
trim: true,
|
|
2137
|
-
chunker_type: ChunkerType::Text,
|
|
2138
|
-
};
|
|
2139
|
-
let text = "Hello 👋 World 🌍 End";
|
|
2140
|
-
|
|
2141
|
-
let boundaries = vec![
|
|
2142
|
-
PageBoundary {
|
|
2143
|
-
byte_start: 0,
|
|
2144
|
-
byte_end: 6,
|
|
2145
|
-
page_number: 1,
|
|
2146
|
-
},
|
|
2147
|
-
PageBoundary {
|
|
2148
|
-
byte_start: 6,
|
|
2149
|
-
byte_end: 15,
|
|
2150
|
-
page_number: 2,
|
|
2151
|
-
},
|
|
2152
|
-
PageBoundary {
|
|
2153
|
-
byte_start: 15,
|
|
2154
|
-
byte_end: text.len(),
|
|
2155
|
-
page_number: 3,
|
|
2156
|
-
},
|
|
2157
|
-
];
|
|
2158
|
-
|
|
2159
|
-
let result = chunk_text(text, &config, Some(&boundaries));
|
|
2160
|
-
assert!(result.is_ok());
|
|
2161
|
-
}
|
|
2162
|
-
|
|
2163
|
-
#[test]
|
|
2164
|
-
fn test_adaptive_validation_threshold_boundary() {
|
|
2165
|
-
use crate::types::PageBoundary;
|
|
2166
|
-
|
|
2167
|
-
let config = ChunkingConfig {
|
|
2168
|
-
max_characters: 200,
|
|
2169
|
-
overlap: 20,
|
|
2170
|
-
trim: true,
|
|
2171
|
-
chunker_type: ChunkerType::Text,
|
|
2172
|
-
};
|
|
2173
|
-
let text = "Test text ".repeat(50);
|
|
2174
|
-
let text_len = text.len();
|
|
2175
|
-
|
|
2176
|
-
let mut boundaries = vec![];
|
|
2177
|
-
let step = text_len / ADAPTIVE_VALIDATION_THRESHOLD;
|
|
2178
|
-
|
|
2179
|
-
for i in 0..ADAPTIVE_VALIDATION_THRESHOLD {
|
|
2180
|
-
let start = i * step;
|
|
2181
|
-
let end = if i == ADAPTIVE_VALIDATION_THRESHOLD - 1 {
|
|
2182
|
-
text_len
|
|
2183
|
-
} else {
|
|
2184
|
-
(i + 1) * step
|
|
2185
|
-
};
|
|
2186
|
-
|
|
2187
|
-
if start < end
|
|
2188
|
-
&& start <= text_len
|
|
2189
|
-
&& end <= text_len
|
|
2190
|
-
&& let Some(boundary_start) = text[..start.min(text_len - 1)]
|
|
2191
|
-
.char_indices()
|
|
2192
|
-
.last()
|
|
2193
|
-
.map(|(idx, _)| idx)
|
|
2194
|
-
&& let Some(boundary_end) = text[..end.min(text_len)].char_indices().last().map(|(idx, _)| idx)
|
|
2195
|
-
&& boundary_start < boundary_end
|
|
2196
|
-
{
|
|
2197
|
-
boundaries.push(PageBoundary {
|
|
2198
|
-
byte_start: boundary_start,
|
|
2199
|
-
byte_end: boundary_end,
|
|
2200
|
-
page_number: i + 1,
|
|
2201
|
-
});
|
|
2202
|
-
}
|
|
2203
|
-
}
|
|
2204
|
-
|
|
2205
|
-
if !boundaries.is_empty() {
|
|
2206
|
-
let result = chunk_text(&text, &config, Some(&boundaries));
|
|
2207
|
-
assert!(result.is_ok());
|
|
2208
|
-
}
|
|
2209
|
-
}
|
|
2210
|
-
|
|
2211
|
-
#[test]
|
|
2212
|
-
fn test_adaptive_validation_large_boundary_set() {
|
|
2213
|
-
use crate::types::PageBoundary;
|
|
2214
|
-
|
|
2215
|
-
let config = ChunkingConfig {
|
|
2216
|
-
max_characters: 500,
|
|
2217
|
-
overlap: 50,
|
|
2218
|
-
trim: true,
|
|
2219
|
-
chunker_type: ChunkerType::Text,
|
|
2220
|
-
};
|
|
2221
|
-
let text = "Lorem ipsum dolor sit amet ".repeat(100);
|
|
2222
|
-
let text_len = text.len();
|
|
2223
|
-
|
|
2224
|
-
let mut boundaries = vec![];
|
|
2225
|
-
let boundary_count = 50;
|
|
2226
|
-
let step = text_len / boundary_count;
|
|
2227
|
-
|
|
2228
|
-
for i in 0..boundary_count {
|
|
2229
|
-
let start = i * step;
|
|
2230
|
-
let end = if i == boundary_count - 1 {
|
|
2231
|
-
text_len
|
|
2232
|
-
} else {
|
|
2233
|
-
(i + 1) * step
|
|
2234
|
-
};
|
|
2235
|
-
|
|
2236
|
-
if start < end
|
|
2237
|
-
&& start <= text_len
|
|
2238
|
-
&& end <= text_len
|
|
2239
|
-
&& let Some(boundary_start) = text[..start.min(text_len - 1)]
|
|
2240
|
-
.char_indices()
|
|
2241
|
-
.last()
|
|
2242
|
-
.map(|(idx, _)| idx)
|
|
2243
|
-
&& let Some(boundary_end) = text[..end.min(text_len)].char_indices().last().map(|(idx, _)| idx)
|
|
2244
|
-
&& boundary_start < boundary_end
|
|
2245
|
-
{
|
|
2246
|
-
boundaries.push(PageBoundary {
|
|
2247
|
-
byte_start: boundary_start,
|
|
2248
|
-
byte_end: boundary_end,
|
|
2249
|
-
page_number: i + 1,
|
|
2250
|
-
});
|
|
2251
|
-
}
|
|
2252
|
-
}
|
|
2253
|
-
|
|
2254
|
-
if !boundaries.is_empty() {
|
|
2255
|
-
let result = chunk_text(&text, &config, Some(&boundaries));
|
|
2256
|
-
assert!(result.is_ok());
|
|
2257
|
-
}
|
|
2258
|
-
}
|
|
2259
|
-
|
|
2260
|
-
#[test]
|
|
2261
|
-
fn test_adaptive_validation_consistency() {
|
|
2262
|
-
use crate::types::PageBoundary;
|
|
2263
|
-
|
|
2264
|
-
let config = ChunkingConfig {
|
|
2265
|
-
max_characters: 300,
|
|
2266
|
-
overlap: 30,
|
|
2267
|
-
trim: true,
|
|
2268
|
-
chunker_type: ChunkerType::Text,
|
|
2269
|
-
};
|
|
2270
|
-
let text = "Mixed language: 你好 مرحبا Здравствуй ".repeat(50);
|
|
2271
|
-
|
|
2272
|
-
let boundaries = vec![
|
|
2273
|
-
PageBoundary {
|
|
2274
|
-
byte_start: 0,
|
|
2275
|
-
byte_end: 50,
|
|
2276
|
-
page_number: 1,
|
|
2277
|
-
},
|
|
2278
|
-
PageBoundary {
|
|
2279
|
-
byte_start: 50,
|
|
2280
|
-
byte_end: 100,
|
|
2281
|
-
page_number: 2,
|
|
2282
|
-
},
|
|
2283
|
-
PageBoundary {
|
|
2284
|
-
byte_start: 100,
|
|
2285
|
-
byte_end: 150,
|
|
2286
|
-
page_number: 3,
|
|
2287
|
-
},
|
|
2288
|
-
PageBoundary {
|
|
2289
|
-
byte_start: 150,
|
|
2290
|
-
byte_end: 200,
|
|
2291
|
-
page_number: 4,
|
|
2292
|
-
},
|
|
2293
|
-
PageBoundary {
|
|
2294
|
-
byte_start: 200,
|
|
2295
|
-
byte_end: text.len(),
|
|
2296
|
-
page_number: 5,
|
|
2297
|
-
},
|
|
2298
|
-
];
|
|
2299
|
-
|
|
2300
|
-
let result = chunk_text(&text, &config, Some(&boundaries));
|
|
2301
|
-
let _ = result;
|
|
2302
|
-
}
|
|
2303
677
|
}
|