kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -5
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +32 -11
- data/vendor/kreuzberg/README.md +54 -8
- data/vendor/kreuzberg/build.rs +549 -132
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
- data/vendor/kreuzberg/src/mcp/server.rs +120 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/lib.rs +1 -0
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +13 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -30,7 +30,7 @@
|
|
|
30
30
|
//! };
|
|
31
31
|
//!
|
|
32
32
|
//! let long_text = "This is a very long document...".repeat(100);
|
|
33
|
-
//! let result = chunk_text(&long_text, &config)?;
|
|
33
|
+
//! let result = chunk_text(&long_text, &config, None)?;
|
|
34
34
|
//!
|
|
35
35
|
//! println!("Split into {} chunks", result.chunk_count);
|
|
36
36
|
//! for (i, chunk) in result.chunks.iter().enumerate() {
|
|
@@ -47,10 +47,15 @@
|
|
|
47
47
|
//! - Processing large documents in batches
|
|
48
48
|
//! - Maintaining context across chunk boundaries
|
|
49
49
|
use crate::error::{KreuzbergError, Result};
|
|
50
|
-
use crate::types::{Chunk, ChunkMetadata};
|
|
50
|
+
use crate::types::{Chunk, ChunkMetadata, PageBoundary};
|
|
51
|
+
use once_cell::sync::Lazy;
|
|
51
52
|
use serde::{Deserialize, Serialize};
|
|
53
|
+
use std::sync::Arc;
|
|
52
54
|
use text_splitter::{Characters, ChunkCapacity, ChunkConfig, MarkdownSplitter, TextSplitter};
|
|
53
55
|
|
|
56
|
+
pub mod processor;
|
|
57
|
+
pub use processor::ChunkingProcessor;
|
|
58
|
+
|
|
54
59
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
55
60
|
pub enum ChunkerType {
|
|
56
61
|
Text,
|
|
@@ -88,7 +93,215 @@ fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) -> Resu
|
|
|
88
93
|
.map_err(|e| KreuzbergError::validation(format!("Invalid chunking configuration: {}", e)))
|
|
89
94
|
}
|
|
90
95
|
|
|
91
|
-
|
|
96
|
+
/// Validates that byte offsets in page boundaries fall on valid UTF-8 character boundaries.
|
|
97
|
+
///
|
|
98
|
+
/// This function ensures that all page boundary positions are at valid UTF-8 character
|
|
99
|
+
/// boundaries within the text. This is CRITICAL to prevent text corruption when boundaries
|
|
100
|
+
/// are created from language bindings or external sources, particularly with multibyte
|
|
101
|
+
/// UTF-8 characters (emoji, CJK characters, combining marks, etc.).
|
|
102
|
+
///
|
|
103
|
+
/// # Arguments
|
|
104
|
+
///
|
|
105
|
+
/// * `text` - The text being chunked
|
|
106
|
+
/// * `boundaries` - Page boundary markers to validate
|
|
107
|
+
///
|
|
108
|
+
/// # Returns
|
|
109
|
+
///
|
|
110
|
+
/// Returns `Ok(())` if all boundaries are at valid UTF-8 character boundaries.
|
|
111
|
+
/// Returns `KreuzbergError::Validation` if any boundary is at an invalid position.
|
|
112
|
+
///
|
|
113
|
+
/// # UTF-8 Boundary Safety
|
|
114
|
+
///
|
|
115
|
+
/// Rust strings use UTF-8 encoding where characters can be 1-4 bytes. For example:
|
|
116
|
+
/// - ASCII letters: 1 byte each
|
|
117
|
+
/// - Emoji (🌍): 4 bytes but 1 character
|
|
118
|
+
/// - CJK characters (中): 3 bytes but 1 character
|
|
119
|
+
///
|
|
120
|
+
/// This function checks that all byte_start and byte_end values are at character
|
|
121
|
+
/// boundaries using Rust's `is_char_boundary()` method.
|
|
122
|
+
fn validate_utf8_boundaries(text: &str, boundaries: &[PageBoundary]) -> Result<()> {
|
|
123
|
+
for (idx, boundary) in boundaries.iter().enumerate() {
|
|
124
|
+
if boundary.byte_start > 0 && boundary.byte_start <= text.len() {
|
|
125
|
+
if !text.is_char_boundary(boundary.byte_start) {
|
|
126
|
+
return Err(KreuzbergError::validation(format!(
|
|
127
|
+
"Page boundary {} has byte_start={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
|
|
128
|
+
idx,
|
|
129
|
+
boundary.byte_start,
|
|
130
|
+
text.len()
|
|
131
|
+
)));
|
|
132
|
+
}
|
|
133
|
+
} else if boundary.byte_start > text.len() {
|
|
134
|
+
return Err(KreuzbergError::validation(format!(
|
|
135
|
+
"Page boundary {} has byte_start={} which exceeds text length {}",
|
|
136
|
+
idx,
|
|
137
|
+
boundary.byte_start,
|
|
138
|
+
text.len()
|
|
139
|
+
)));
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
if boundary.byte_end > 0 && boundary.byte_end <= text.len() {
|
|
143
|
+
if !text.is_char_boundary(boundary.byte_end) {
|
|
144
|
+
return Err(KreuzbergError::validation(format!(
|
|
145
|
+
"Page boundary {} has byte_end={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
|
|
146
|
+
idx,
|
|
147
|
+
boundary.byte_end,
|
|
148
|
+
text.len()
|
|
149
|
+
)));
|
|
150
|
+
}
|
|
151
|
+
} else if boundary.byte_end > text.len() {
|
|
152
|
+
return Err(KreuzbergError::validation(format!(
|
|
153
|
+
"Page boundary {} has byte_end={} which exceeds text length {}",
|
|
154
|
+
idx,
|
|
155
|
+
boundary.byte_end,
|
|
156
|
+
text.len()
|
|
157
|
+
)));
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
Ok(())
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
/// Calculate which pages a character range spans.
|
|
165
|
+
///
|
|
166
|
+
/// # Arguments
|
|
167
|
+
///
|
|
168
|
+
/// * `char_start` - Starting character offset of the chunk
|
|
169
|
+
/// * `char_end` - Ending character offset of the chunk
|
|
170
|
+
/// * `boundaries` - Page boundary markers from the document
|
|
171
|
+
///
|
|
172
|
+
/// # Returns
|
|
173
|
+
///
|
|
174
|
+
/// A tuple of (first_page, last_page) where page numbers are 1-indexed.
|
|
175
|
+
/// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
|
|
176
|
+
/// Validates page boundaries for consistency and correctness.
|
|
177
|
+
///
|
|
178
|
+
/// # Validation Rules
|
|
179
|
+
///
|
|
180
|
+
/// 1. Boundaries must be sorted by char_start (monotonically increasing)
|
|
181
|
+
/// 2. Boundaries must not overlap (char_end[i] <= char_start[i+1])
|
|
182
|
+
/// 3. Each boundary must have char_start < char_end
|
|
183
|
+
///
|
|
184
|
+
/// # Errors
|
|
185
|
+
///
|
|
186
|
+
/// Returns `KreuzbergError::Validation` if any boundary is invalid.
|
|
187
|
+
fn validate_page_boundaries(boundaries: &[PageBoundary]) -> Result<()> {
|
|
188
|
+
if boundaries.is_empty() {
|
|
189
|
+
return Ok(());
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
for (idx, boundary) in boundaries.iter().enumerate() {
|
|
193
|
+
if boundary.byte_start >= boundary.byte_end {
|
|
194
|
+
return Err(KreuzbergError::validation(format!(
|
|
195
|
+
"Invalid boundary range at index {}: byte_start ({}) must be < byte_end ({})",
|
|
196
|
+
idx, boundary.byte_start, boundary.byte_end
|
|
197
|
+
)));
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
for i in 0..boundaries.len() - 1 {
|
|
202
|
+
let current = &boundaries[i];
|
|
203
|
+
let next = &boundaries[i + 1];
|
|
204
|
+
|
|
205
|
+
if current.byte_start > next.byte_start {
|
|
206
|
+
return Err(KreuzbergError::validation(format!(
|
|
207
|
+
"Page boundaries not sorted: boundary at index {} (byte_start={}) comes after boundary at index {} (byte_start={})",
|
|
208
|
+
i,
|
|
209
|
+
current.byte_start,
|
|
210
|
+
i + 1,
|
|
211
|
+
next.byte_start
|
|
212
|
+
)));
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
if current.byte_end > next.byte_start {
|
|
216
|
+
return Err(KreuzbergError::validation(format!(
|
|
217
|
+
"Overlapping page boundaries: boundary {} ends at {} but boundary {} starts at {}",
|
|
218
|
+
i,
|
|
219
|
+
current.byte_end,
|
|
220
|
+
i + 1,
|
|
221
|
+
next.byte_start
|
|
222
|
+
)));
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
Ok(())
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
/// Calculate which pages a byte range spans.
|
|
230
|
+
///
|
|
231
|
+
/// # Arguments
|
|
232
|
+
///
|
|
233
|
+
/// * `byte_start` - Starting byte offset of the chunk
|
|
234
|
+
/// * `byte_end` - Ending byte offset of the chunk
|
|
235
|
+
/// * `boundaries` - Page boundary markers from the document
|
|
236
|
+
///
|
|
237
|
+
/// # Returns
|
|
238
|
+
///
|
|
239
|
+
/// A tuple of (first_page, last_page) where page numbers are 1-indexed.
|
|
240
|
+
/// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
|
|
241
|
+
///
|
|
242
|
+
/// # Errors
|
|
243
|
+
///
|
|
244
|
+
/// Returns `KreuzbergError::Validation` if boundaries are invalid.
|
|
245
|
+
fn calculate_page_range(
|
|
246
|
+
byte_start: usize,
|
|
247
|
+
byte_end: usize,
|
|
248
|
+
boundaries: &[PageBoundary],
|
|
249
|
+
) -> Result<(Option<usize>, Option<usize>)> {
|
|
250
|
+
if boundaries.is_empty() {
|
|
251
|
+
return Ok((None, None));
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
validate_page_boundaries(boundaries)?;
|
|
255
|
+
|
|
256
|
+
let mut first_page = None;
|
|
257
|
+
let mut last_page = None;
|
|
258
|
+
|
|
259
|
+
for boundary in boundaries {
|
|
260
|
+
if byte_start < boundary.byte_end && byte_end > boundary.byte_start {
|
|
261
|
+
if first_page.is_none() {
|
|
262
|
+
first_page = Some(boundary.page_number);
|
|
263
|
+
}
|
|
264
|
+
last_page = Some(boundary.page_number);
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
Ok((first_page, last_page))
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
/// Split text into chunks with optional page boundary tracking.
|
|
272
|
+
///
|
|
273
|
+
/// # Arguments
|
|
274
|
+
///
|
|
275
|
+
/// * `text` - The text to split into chunks
|
|
276
|
+
/// * `config` - Chunking configuration (max size, overlap, type)
|
|
277
|
+
/// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
|
|
278
|
+
///
|
|
279
|
+
/// # Returns
|
|
280
|
+
///
|
|
281
|
+
/// A ChunkingResult containing all chunks and their metadata.
|
|
282
|
+
///
|
|
283
|
+
/// # Examples
|
|
284
|
+
///
|
|
285
|
+
/// ```rust
|
|
286
|
+
/// use kreuzberg::chunking::{chunk_text, ChunkingConfig, ChunkerType};
|
|
287
|
+
///
|
|
288
|
+
/// # fn example() -> kreuzberg::Result<()> {
|
|
289
|
+
/// let config = ChunkingConfig {
|
|
290
|
+
/// max_characters: 500,
|
|
291
|
+
/// overlap: 50,
|
|
292
|
+
/// trim: true,
|
|
293
|
+
/// chunker_type: ChunkerType::Text,
|
|
294
|
+
/// };
|
|
295
|
+
/// let result = chunk_text("Long text...", &config, None)?;
|
|
296
|
+
/// assert!(!result.chunks.is_empty());
|
|
297
|
+
/// # Ok(())
|
|
298
|
+
/// # }
|
|
299
|
+
/// ```
|
|
300
|
+
pub fn chunk_text(
|
|
301
|
+
text: &str,
|
|
302
|
+
config: &ChunkingConfig,
|
|
303
|
+
page_boundaries: Option<&[PageBoundary]>,
|
|
304
|
+
) -> Result<ChunkingResult> {
|
|
92
305
|
if text.is_empty() {
|
|
93
306
|
return Ok(ChunkingResult {
|
|
94
307
|
chunks: vec![],
|
|
@@ -96,6 +309,10 @@ pub fn chunk_text(text: &str, config: &ChunkingConfig) -> Result<ChunkingResult>
|
|
|
96
309
|
});
|
|
97
310
|
}
|
|
98
311
|
|
|
312
|
+
if let Some(boundaries) = page_boundaries {
|
|
313
|
+
validate_utf8_boundaries(text, boundaries)?;
|
|
314
|
+
}
|
|
315
|
+
|
|
99
316
|
let chunk_config = build_chunk_config(config.max_characters, config.overlap, config.trim)?;
|
|
100
317
|
|
|
101
318
|
let text_chunks: Vec<&str> = match config.chunker_type {
|
|
@@ -110,36 +327,42 @@ pub fn chunk_text(text: &str, config: &ChunkingConfig) -> Result<ChunkingResult>
|
|
|
110
327
|
};
|
|
111
328
|
|
|
112
329
|
let total_chunks = text_chunks.len();
|
|
113
|
-
let mut
|
|
114
|
-
|
|
115
|
-
let chunks: Vec<Chunk> =
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
330
|
+
let mut byte_offset = 0;
|
|
331
|
+
|
|
332
|
+
let mut chunks: Vec<Chunk> = Vec::new();
|
|
333
|
+
|
|
334
|
+
for (index, chunk_text) in text_chunks.into_iter().enumerate() {
|
|
335
|
+
let byte_start = byte_offset;
|
|
336
|
+
let chunk_length = chunk_text.len();
|
|
337
|
+
let byte_end = byte_start + chunk_length;
|
|
338
|
+
|
|
339
|
+
let overlap_chars = if index < total_chunks - 1 {
|
|
340
|
+
config.overlap.min(chunk_length)
|
|
341
|
+
} else {
|
|
342
|
+
0
|
|
343
|
+
};
|
|
344
|
+
byte_offset = byte_end - overlap_chars;
|
|
345
|
+
|
|
346
|
+
let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
|
|
347
|
+
calculate_page_range(byte_start, byte_end, boundaries)?
|
|
348
|
+
} else {
|
|
349
|
+
(None, None)
|
|
350
|
+
};
|
|
351
|
+
|
|
352
|
+
chunks.push(Chunk {
|
|
353
|
+
content: chunk_text.to_string(),
|
|
354
|
+
embedding: None,
|
|
355
|
+
metadata: ChunkMetadata {
|
|
356
|
+
byte_start,
|
|
357
|
+
byte_end,
|
|
358
|
+
token_count: None,
|
|
359
|
+
chunk_index: index,
|
|
360
|
+
total_chunks,
|
|
361
|
+
first_page,
|
|
362
|
+
last_page,
|
|
363
|
+
},
|
|
364
|
+
});
|
|
365
|
+
}
|
|
143
366
|
|
|
144
367
|
let chunk_count = chunks.len();
|
|
145
368
|
|
|
@@ -159,11 +382,11 @@ pub fn chunk_text_with_type(
|
|
|
159
382
|
trim,
|
|
160
383
|
chunker_type,
|
|
161
384
|
};
|
|
162
|
-
chunk_text(text, &config)
|
|
385
|
+
chunk_text(text, &config, None)
|
|
163
386
|
}
|
|
164
387
|
|
|
165
388
|
pub fn chunk_texts_batch(texts: &[&str], config: &ChunkingConfig) -> Result<Vec<ChunkingResult>> {
|
|
166
|
-
texts.iter().map(|text| chunk_text(text, config)).collect()
|
|
389
|
+
texts.iter().map(|text| chunk_text(text, config, None)).collect()
|
|
167
390
|
}
|
|
168
391
|
|
|
169
392
|
#[cfg(test)]
|
|
@@ -173,7 +396,7 @@ mod tests {
|
|
|
173
396
|
#[test]
fn test_chunk_empty_text() {
    // Empty input must yield no chunks and a zero chunk count.
    let result = chunk_text("", &ChunkingConfig::default(), None).unwrap();
    assert!(result.chunks.is_empty());
    assert_eq!(result.chunk_count, 0);
}
|
|
@@ -187,7 +410,7 @@ mod tests {
|
|
|
187
410
|
chunker_type: ChunkerType::Text,
|
|
188
411
|
};
|
|
189
412
|
let text = "This is a short text.";
|
|
190
|
-
let result = chunk_text(text, &config).unwrap();
|
|
413
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
191
414
|
assert_eq!(result.chunks.len(), 1);
|
|
192
415
|
assert_eq!(result.chunk_count, 1);
|
|
193
416
|
assert_eq!(result.chunks[0].content, text);
|
|
@@ -202,7 +425,7 @@ mod tests {
|
|
|
202
425
|
chunker_type: ChunkerType::Text,
|
|
203
426
|
};
|
|
204
427
|
let text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
|
205
|
-
let result = chunk_text(text, &config).unwrap();
|
|
428
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
206
429
|
assert!(result.chunk_count >= 2);
|
|
207
430
|
assert_eq!(result.chunks.len(), result.chunk_count);
|
|
208
431
|
assert!(result.chunks.iter().all(|chunk| chunk.content.len() <= 20));
|
|
@@ -217,7 +440,7 @@ mod tests {
|
|
|
217
440
|
chunker_type: ChunkerType::Text,
|
|
218
441
|
};
|
|
219
442
|
let text = "abcdefghijklmnopqrstuvwxyz0123456789";
|
|
220
|
-
let result = chunk_text(text, &config).unwrap();
|
|
443
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
221
444
|
assert!(result.chunk_count >= 2);
|
|
222
445
|
|
|
223
446
|
if result.chunks.len() >= 2 {
|
|
@@ -240,7 +463,7 @@ mod tests {
|
|
|
240
463
|
chunker_type: ChunkerType::Markdown,
|
|
241
464
|
};
|
|
242
465
|
let markdown = "# Title\n\nParagraph one.\n\n## Section\n\nParagraph two.";
|
|
243
|
-
let result = chunk_text(markdown, &config).unwrap();
|
|
466
|
+
let result = chunk_text(markdown, &config, None).unwrap();
|
|
244
467
|
assert!(result.chunk_count >= 1);
|
|
245
468
|
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("# Title")));
|
|
246
469
|
}
|
|
@@ -254,7 +477,7 @@ mod tests {
|
|
|
254
477
|
chunker_type: ChunkerType::Markdown,
|
|
255
478
|
};
|
|
256
479
|
let markdown = "# Code Example\n\n```python\nprint('hello')\n```\n\nSome text after code.";
|
|
257
|
-
let result = chunk_text(markdown, &config).unwrap();
|
|
480
|
+
let result = chunk_text(markdown, &config, None).unwrap();
|
|
258
481
|
assert!(result.chunk_count >= 1);
|
|
259
482
|
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("```")));
|
|
260
483
|
}
|
|
@@ -268,7 +491,7 @@ mod tests {
|
|
|
268
491
|
chunker_type: ChunkerType::Markdown,
|
|
269
492
|
};
|
|
270
493
|
let markdown = "Check out [this link](https://example.com) for more info.";
|
|
271
|
-
let result = chunk_text(markdown, &config).unwrap();
|
|
494
|
+
let result = chunk_text(markdown, &config, None).unwrap();
|
|
272
495
|
assert_eq!(result.chunk_count, 1);
|
|
273
496
|
assert!(result.chunks[0].content.contains("[this link]"));
|
|
274
497
|
}
|
|
@@ -282,7 +505,7 @@ mod tests {
|
|
|
282
505
|
chunker_type: ChunkerType::Text,
|
|
283
506
|
};
|
|
284
507
|
let text = " Leading and trailing spaces should be trimmed ";
|
|
285
|
-
let result = chunk_text(text, &config).unwrap();
|
|
508
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
286
509
|
assert!(result.chunk_count >= 1);
|
|
287
510
|
assert!(result.chunks.iter().all(|chunk| !chunk.content.starts_with(' ')));
|
|
288
511
|
}
|
|
@@ -296,7 +519,7 @@ mod tests {
|
|
|
296
519
|
chunker_type: ChunkerType::Text,
|
|
297
520
|
};
|
|
298
521
|
let text = " Text with spaces ";
|
|
299
|
-
let result = chunk_text(text, &config).unwrap();
|
|
522
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
300
523
|
assert_eq!(result.chunk_count, 1);
|
|
301
524
|
assert!(result.chunks[0].content.starts_with(' ') || result.chunks[0].content.len() < text.len());
|
|
302
525
|
}
|
|
@@ -309,7 +532,7 @@ mod tests {
|
|
|
309
532
|
trim: true,
|
|
310
533
|
chunker_type: ChunkerType::Text,
|
|
311
534
|
};
|
|
312
|
-
let result = chunk_text("Some text", &config);
|
|
535
|
+
let result = chunk_text("Some text", &config, None);
|
|
313
536
|
assert!(result.is_err());
|
|
314
537
|
let err = result.unwrap_err();
|
|
315
538
|
assert!(matches!(err, KreuzbergError::Validation { .. }));
|
|
@@ -403,7 +626,7 @@ mod tests {
|
|
|
403
626
|
chunker_type: ChunkerType::Text,
|
|
404
627
|
};
|
|
405
628
|
let text = "a".repeat(1000);
|
|
406
|
-
let result = chunk_text(&text, &config).unwrap();
|
|
629
|
+
let result = chunk_text(&text, &config, None).unwrap();
|
|
407
630
|
assert!(result.chunk_count >= 10);
|
|
408
631
|
assert!(result.chunks.iter().all(|chunk| chunk.content.len() <= 100));
|
|
409
632
|
}
|
|
@@ -417,7 +640,7 @@ mod tests {
|
|
|
417
640
|
chunker_type: ChunkerType::Text,
|
|
418
641
|
};
|
|
419
642
|
let text = "Line one\nLine two\nLine three\nLine four\nLine five";
|
|
420
|
-
let result = chunk_text(text, &config).unwrap();
|
|
643
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
421
644
|
assert!(result.chunk_count >= 1);
|
|
422
645
|
}
|
|
423
646
|
|
|
@@ -430,7 +653,7 @@ mod tests {
|
|
|
430
653
|
chunker_type: ChunkerType::Markdown,
|
|
431
654
|
};
|
|
432
655
|
let markdown = "# List Example\n\n- Item 1\n- Item 2\n- Item 3\n\nMore text.";
|
|
433
|
-
let result = chunk_text(markdown, &config).unwrap();
|
|
656
|
+
let result = chunk_text(markdown, &config, None).unwrap();
|
|
434
657
|
assert!(result.chunk_count >= 1);
|
|
435
658
|
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("- Item")));
|
|
436
659
|
}
|
|
@@ -444,7 +667,7 @@ mod tests {
|
|
|
444
667
|
chunker_type: ChunkerType::Markdown,
|
|
445
668
|
};
|
|
446
669
|
let markdown = "# Table\n\n| Col1 | Col2 |\n|------|------|\n| A | B |\n| C | D |";
|
|
447
|
-
let result = chunk_text(markdown, &config).unwrap();
|
|
670
|
+
let result = chunk_text(markdown, &config, None).unwrap();
|
|
448
671
|
assert!(result.chunk_count >= 1);
|
|
449
672
|
assert!(result.chunks.iter().any(|chunk| chunk.content.contains("|")));
|
|
450
673
|
}
|
|
@@ -458,7 +681,7 @@ mod tests {
|
|
|
458
681
|
chunker_type: ChunkerType::Text,
|
|
459
682
|
};
|
|
460
683
|
let text = "Special chars: @#$%^&*()[]{}|\\<>?/~`";
|
|
461
|
-
let result = chunk_text(text, &config).unwrap();
|
|
684
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
462
685
|
assert_eq!(result.chunk_count, 1);
|
|
463
686
|
assert!(result.chunks[0].content.contains("@#$%"));
|
|
464
687
|
}
|
|
@@ -472,7 +695,7 @@ mod tests {
|
|
|
472
695
|
chunker_type: ChunkerType::Text,
|
|
473
696
|
};
|
|
474
697
|
let text = "Unicode: 你好世界 🌍 café résumé";
|
|
475
|
-
let result = chunk_text(text, &config).unwrap();
|
|
698
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
476
699
|
assert_eq!(result.chunk_count, 1);
|
|
477
700
|
assert!(result.chunks[0].content.contains("你好"));
|
|
478
701
|
assert!(result.chunks[0].content.contains("🌍"));
|
|
@@ -487,7 +710,7 @@ mod tests {
|
|
|
487
710
|
chunker_type: ChunkerType::Text,
|
|
488
711
|
};
|
|
489
712
|
let text = "日本語のテキストです。これは長い文章で、複数のチャンクに分割されるべきです。";
|
|
490
|
-
let result = chunk_text(text, &config).unwrap();
|
|
713
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
491
714
|
assert!(result.chunk_count >= 1);
|
|
492
715
|
}
|
|
493
716
|
|
|
@@ -500,7 +723,7 @@ mod tests {
|
|
|
500
723
|
chunker_type: ChunkerType::Text,
|
|
501
724
|
};
|
|
502
725
|
let text = "English text mixed with 中文文本 and some français";
|
|
503
|
-
let result = chunk_text(text, &config).unwrap();
|
|
726
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
504
727
|
assert!(result.chunk_count >= 1);
|
|
505
728
|
}
|
|
506
729
|
|
|
@@ -513,7 +736,7 @@ mod tests {
|
|
|
513
736
|
chunker_type: ChunkerType::Text,
|
|
514
737
|
};
|
|
515
738
|
let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
|
|
516
|
-
let result = chunk_text(text, &config).unwrap();
|
|
739
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
517
740
|
|
|
518
741
|
assert!(result.chunks.len() >= 2, "Expected at least 2 chunks");
|
|
519
742
|
|
|
@@ -522,8 +745,8 @@ mod tests {
|
|
|
522
745
|
let metadata = &chunk.metadata;
|
|
523
746
|
|
|
524
747
|
assert_eq!(
|
|
525
|
-
metadata.
|
|
526
|
-
chunk.content.
|
|
748
|
+
metadata.byte_end - metadata.byte_start,
|
|
749
|
+
chunk.content.len(),
|
|
527
750
|
"Chunk {} offset range doesn't match content length",
|
|
528
751
|
i
|
|
529
752
|
);
|
|
@@ -537,15 +760,15 @@ mod tests {
|
|
|
537
760
|
let next_chunk = &result.chunks[i + 1];
|
|
538
761
|
|
|
539
762
|
assert!(
|
|
540
|
-
next_chunk.metadata.
|
|
763
|
+
next_chunk.metadata.byte_start < current_chunk.metadata.byte_end,
|
|
541
764
|
"Chunk {} and {} don't overlap: next starts at {} but current ends at {}",
|
|
542
765
|
i,
|
|
543
766
|
i + 1,
|
|
544
|
-
next_chunk.metadata.
|
|
545
|
-
current_chunk.metadata.
|
|
767
|
+
next_chunk.metadata.byte_start,
|
|
768
|
+
current_chunk.metadata.byte_end
|
|
546
769
|
);
|
|
547
770
|
|
|
548
|
-
let overlap_size = current_chunk.metadata.
|
|
771
|
+
let overlap_size = current_chunk.metadata.byte_end - next_chunk.metadata.byte_start;
|
|
549
772
|
assert!(
|
|
550
773
|
overlap_size <= config.overlap + 10,
|
|
551
774
|
"Overlap between chunks {} and {} is too large: {}",
|
|
@@ -565,19 +788,19 @@ mod tests {
|
|
|
565
788
|
chunker_type: ChunkerType::Text,
|
|
566
789
|
};
|
|
567
790
|
let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
|
|
568
|
-
let result = chunk_text(text, &config).unwrap();
|
|
791
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
569
792
|
|
|
570
793
|
for i in 0..result.chunks.len() - 1 {
|
|
571
794
|
let current_chunk = &result.chunks[i];
|
|
572
795
|
let next_chunk = &result.chunks[i + 1];
|
|
573
796
|
|
|
574
797
|
assert!(
|
|
575
|
-
next_chunk.metadata.
|
|
798
|
+
next_chunk.metadata.byte_start >= current_chunk.metadata.byte_end,
|
|
576
799
|
"Chunk {} and {} overlap when they shouldn't: next starts at {} but current ends at {}",
|
|
577
800
|
i,
|
|
578
801
|
i + 1,
|
|
579
|
-
next_chunk.metadata.
|
|
580
|
-
current_chunk.metadata.
|
|
802
|
+
next_chunk.metadata.byte_start,
|
|
803
|
+
current_chunk.metadata.byte_end
|
|
581
804
|
);
|
|
582
805
|
}
|
|
583
806
|
}
|
|
@@ -591,12 +814,12 @@ mod tests {
|
|
|
591
814
|
chunker_type: ChunkerType::Text,
|
|
592
815
|
};
|
|
593
816
|
let text = "0123456789 ABCDEFGHIJ KLMNOPQRST UVWXYZ";
|
|
594
|
-
let result = chunk_text(text, &config).unwrap();
|
|
817
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
595
818
|
|
|
596
819
|
assert!(result.chunks.len() >= 2, "Expected multiple chunks");
|
|
597
820
|
|
|
598
821
|
assert_eq!(
|
|
599
|
-
result.chunks[0].metadata.
|
|
822
|
+
result.chunks[0].metadata.byte_start, 0,
|
|
600
823
|
"First chunk should start at position 0"
|
|
601
824
|
);
|
|
602
825
|
|
|
@@ -605,12 +828,12 @@ mod tests {
|
|
|
605
828
|
let next_chunk = &result.chunks[i + 1];
|
|
606
829
|
|
|
607
830
|
assert!(
|
|
608
|
-
next_chunk.metadata.
|
|
831
|
+
next_chunk.metadata.byte_start <= current_chunk.metadata.byte_end,
|
|
609
832
|
"Gap detected between chunk {} (ends at {}) and chunk {} (starts at {})",
|
|
610
833
|
i,
|
|
611
|
-
current_chunk.metadata.
|
|
834
|
+
current_chunk.metadata.byte_end,
|
|
612
835
|
i + 1,
|
|
613
|
-
next_chunk.metadata.
|
|
836
|
+
next_chunk.metadata.byte_start
|
|
614
837
|
);
|
|
615
838
|
}
|
|
616
839
|
}
|
|
@@ -625,24 +848,24 @@ mod tests {
|
|
|
625
848
|
chunker_type: ChunkerType::Text,
|
|
626
849
|
};
|
|
627
850
|
let text = "Word ".repeat(30);
|
|
628
|
-
let result = chunk_text(&text, &config).unwrap();
|
|
851
|
+
let result = chunk_text(&text, &config, None).unwrap();
|
|
629
852
|
|
|
630
853
|
for chunk in &result.chunks {
|
|
631
854
|
assert!(
|
|
632
|
-
chunk.metadata.
|
|
855
|
+
chunk.metadata.byte_end > chunk.metadata.byte_start,
|
|
633
856
|
"Invalid offset range for overlap {}: start={}, end={}",
|
|
634
857
|
overlap,
|
|
635
|
-
chunk.metadata.
|
|
636
|
-
chunk.metadata.
|
|
858
|
+
chunk.metadata.byte_start,
|
|
859
|
+
chunk.metadata.byte_end
|
|
637
860
|
);
|
|
638
861
|
}
|
|
639
862
|
|
|
640
863
|
for chunk in &result.chunks {
|
|
641
864
|
assert!(
|
|
642
|
-
chunk.metadata.
|
|
865
|
+
chunk.metadata.byte_start < text.len(),
|
|
643
866
|
"char_start with overlap {} is out of bounds: {}",
|
|
644
867
|
overlap,
|
|
645
|
-
chunk.metadata.
|
|
868
|
+
chunk.metadata.byte_start
|
|
646
869
|
);
|
|
647
870
|
}
|
|
648
871
|
}
|
|
@@ -657,7 +880,7 @@ mod tests {
|
|
|
657
880
|
chunker_type: ChunkerType::Text,
|
|
658
881
|
};
|
|
659
882
|
let text = "AAAAA BBBBB CCCCC DDDDD EEEEE";
|
|
660
|
-
let result = chunk_text(text, &config).unwrap();
|
|
883
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
661
884
|
|
|
662
885
|
assert!(result.chunks.len() >= 2, "Need multiple chunks for this test");
|
|
663
886
|
|
|
@@ -665,13 +888,990 @@ mod tests {
|
|
|
665
888
|
let second_to_last = &result.chunks[result.chunks.len() - 2];
|
|
666
889
|
|
|
667
890
|
assert!(
|
|
668
|
-
last_chunk.metadata.
|
|
891
|
+
last_chunk.metadata.byte_start < second_to_last.metadata.byte_end,
|
|
669
892
|
"Last chunk should overlap with previous chunk"
|
|
670
893
|
);
|
|
671
894
|
|
|
672
|
-
let expected_end = text.
|
|
895
|
+
let expected_end = text.len();
|
|
673
896
|
let last_chunk_covers_end =
|
|
674
|
-
last_chunk.content.trim_end() == text.trim_end() || last_chunk.metadata.
|
|
897
|
+
last_chunk.content.trim_end() == text.trim_end() || last_chunk.metadata.byte_end >= expected_end - 5;
|
|
675
898
|
assert!(last_chunk_covers_end, "Last chunk should cover the end of the text");
|
|
676
899
|
}
|
|
900
|
+
|
|
901
|
+
#[test]
|
|
902
|
+
fn test_chunk_with_page_boundaries() {
|
|
903
|
+
use crate::types::PageBoundary;
|
|
904
|
+
|
|
905
|
+
let config = ChunkingConfig {
|
|
906
|
+
max_characters: 30,
|
|
907
|
+
overlap: 5,
|
|
908
|
+
trim: true,
|
|
909
|
+
chunker_type: ChunkerType::Text,
|
|
910
|
+
};
|
|
911
|
+
let text = "Page one content here. Page two starts here and continues.";
|
|
912
|
+
|
|
913
|
+
let boundaries = vec![
|
|
914
|
+
PageBoundary {
|
|
915
|
+
byte_start: 0,
|
|
916
|
+
byte_end: 21,
|
|
917
|
+
page_number: 1,
|
|
918
|
+
},
|
|
919
|
+
PageBoundary {
|
|
920
|
+
byte_start: 22,
|
|
921
|
+
byte_end: 58,
|
|
922
|
+
page_number: 2,
|
|
923
|
+
},
|
|
924
|
+
];
|
|
925
|
+
|
|
926
|
+
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
927
|
+
assert!(result.chunks.len() >= 2);
|
|
928
|
+
|
|
929
|
+
assert_eq!(result.chunks[0].metadata.first_page, Some(1));
|
|
930
|
+
|
|
931
|
+
let last_chunk = result.chunks.last().unwrap();
|
|
932
|
+
assert_eq!(last_chunk.metadata.last_page, Some(2));
|
|
933
|
+
}
|
|
934
|
+
|
|
935
|
+
#[test]
|
|
936
|
+
fn test_chunk_without_page_boundaries() {
|
|
937
|
+
let config = ChunkingConfig {
|
|
938
|
+
max_characters: 30,
|
|
939
|
+
overlap: 5,
|
|
940
|
+
trim: true,
|
|
941
|
+
chunker_type: ChunkerType::Text,
|
|
942
|
+
};
|
|
943
|
+
let text = "This is some test content that should be split into multiple chunks.";
|
|
944
|
+
|
|
945
|
+
let result = chunk_text(text, &config, None).unwrap();
|
|
946
|
+
assert!(result.chunks.len() >= 2);
|
|
947
|
+
|
|
948
|
+
for chunk in &result.chunks {
|
|
949
|
+
assert_eq!(chunk.metadata.first_page, None);
|
|
950
|
+
assert_eq!(chunk.metadata.last_page, None);
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
#[test]
|
|
955
|
+
fn test_chunk_empty_boundaries() {
|
|
956
|
+
let config = ChunkingConfig {
|
|
957
|
+
max_characters: 30,
|
|
958
|
+
overlap: 5,
|
|
959
|
+
trim: true,
|
|
960
|
+
chunker_type: ChunkerType::Text,
|
|
961
|
+
};
|
|
962
|
+
let text = "Some text content here.";
|
|
963
|
+
let boundaries: Vec<PageBoundary> = vec![];
|
|
964
|
+
|
|
965
|
+
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
966
|
+
assert_eq!(result.chunks.len(), 1);
|
|
967
|
+
|
|
968
|
+
assert_eq!(result.chunks[0].metadata.first_page, None);
|
|
969
|
+
assert_eq!(result.chunks[0].metadata.last_page, None);
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
#[test]
|
|
973
|
+
fn test_chunk_spanning_multiple_pages() {
|
|
974
|
+
use crate::types::PageBoundary;
|
|
975
|
+
|
|
976
|
+
let config = ChunkingConfig {
|
|
977
|
+
max_characters: 50,
|
|
978
|
+
overlap: 5,
|
|
979
|
+
trim: false,
|
|
980
|
+
chunker_type: ChunkerType::Text,
|
|
981
|
+
};
|
|
982
|
+
let text = "0123456789 AAAAAAAAAA 1111111111 BBBBBBBBBB 2222222222";
|
|
983
|
+
|
|
984
|
+
let boundaries = vec![
|
|
985
|
+
PageBoundary {
|
|
986
|
+
byte_start: 0,
|
|
987
|
+
byte_end: 20,
|
|
988
|
+
page_number: 1,
|
|
989
|
+
},
|
|
990
|
+
PageBoundary {
|
|
991
|
+
byte_start: 20,
|
|
992
|
+
byte_end: 40,
|
|
993
|
+
page_number: 2,
|
|
994
|
+
},
|
|
995
|
+
PageBoundary {
|
|
996
|
+
byte_start: 40,
|
|
997
|
+
byte_end: 54,
|
|
998
|
+
page_number: 3,
|
|
999
|
+
},
|
|
1000
|
+
];
|
|
1001
|
+
|
|
1002
|
+
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1003
|
+
assert!(result.chunks.len() >= 2);
|
|
1004
|
+
|
|
1005
|
+
for chunk in &result.chunks {
|
|
1006
|
+
assert!(chunk.metadata.first_page.is_some() || chunk.metadata.last_page.is_some());
|
|
1007
|
+
}
|
|
1008
|
+
}
|
|
1009
|
+
|
|
1010
|
+
#[test]
|
|
1011
|
+
fn test_chunk_text_with_invalid_boundary_range() {
|
|
1012
|
+
use crate::types::PageBoundary;
|
|
1013
|
+
|
|
1014
|
+
let config = ChunkingConfig {
|
|
1015
|
+
max_characters: 30,
|
|
1016
|
+
overlap: 5,
|
|
1017
|
+
trim: true,
|
|
1018
|
+
chunker_type: ChunkerType::Text,
|
|
1019
|
+
};
|
|
1020
|
+
let text = "Page one content here. Page two content.";
|
|
1021
|
+
|
|
1022
|
+
let boundaries = vec![PageBoundary {
|
|
1023
|
+
byte_start: 10,
|
|
1024
|
+
byte_end: 5,
|
|
1025
|
+
page_number: 1,
|
|
1026
|
+
}];
|
|
1027
|
+
|
|
1028
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1029
|
+
assert!(result.is_err());
|
|
1030
|
+
let err = result.unwrap_err();
|
|
1031
|
+
assert!(err.to_string().contains("Invalid boundary range"));
|
|
1032
|
+
assert!(err.to_string().contains("byte_start"));
|
|
1033
|
+
}
|
|
1034
|
+
|
|
1035
|
+
#[test]
|
|
1036
|
+
fn test_chunk_text_with_unsorted_boundaries() {
|
|
1037
|
+
use crate::types::PageBoundary;
|
|
1038
|
+
|
|
1039
|
+
let config = ChunkingConfig {
|
|
1040
|
+
max_characters: 30,
|
|
1041
|
+
overlap: 5,
|
|
1042
|
+
trim: true,
|
|
1043
|
+
chunker_type: ChunkerType::Text,
|
|
1044
|
+
};
|
|
1045
|
+
let text = "Page one content here. Page two content.";
|
|
1046
|
+
|
|
1047
|
+
let boundaries = vec![
|
|
1048
|
+
PageBoundary {
|
|
1049
|
+
byte_start: 22,
|
|
1050
|
+
byte_end: 40,
|
|
1051
|
+
page_number: 2,
|
|
1052
|
+
},
|
|
1053
|
+
PageBoundary {
|
|
1054
|
+
byte_start: 0,
|
|
1055
|
+
byte_end: 21,
|
|
1056
|
+
page_number: 1,
|
|
1057
|
+
},
|
|
1058
|
+
];
|
|
1059
|
+
|
|
1060
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1061
|
+
assert!(result.is_err());
|
|
1062
|
+
let err = result.unwrap_err();
|
|
1063
|
+
assert!(err.to_string().contains("not sorted"));
|
|
1064
|
+
assert!(err.to_string().contains("boundaries"));
|
|
1065
|
+
}
|
|
1066
|
+
|
|
1067
|
+
#[test]
|
|
1068
|
+
fn test_chunk_text_with_overlapping_boundaries() {
|
|
1069
|
+
use crate::types::PageBoundary;
|
|
1070
|
+
|
|
1071
|
+
let config = ChunkingConfig {
|
|
1072
|
+
max_characters: 30,
|
|
1073
|
+
overlap: 5,
|
|
1074
|
+
trim: true,
|
|
1075
|
+
chunker_type: ChunkerType::Text,
|
|
1076
|
+
};
|
|
1077
|
+
let text = "Page one content here. Page two content.";
|
|
1078
|
+
|
|
1079
|
+
let boundaries = vec![
|
|
1080
|
+
PageBoundary {
|
|
1081
|
+
byte_start: 0,
|
|
1082
|
+
byte_end: 25,
|
|
1083
|
+
page_number: 1,
|
|
1084
|
+
},
|
|
1085
|
+
PageBoundary {
|
|
1086
|
+
byte_start: 20,
|
|
1087
|
+
byte_end: 40,
|
|
1088
|
+
page_number: 2,
|
|
1089
|
+
},
|
|
1090
|
+
];
|
|
1091
|
+
|
|
1092
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1093
|
+
assert!(result.is_err());
|
|
1094
|
+
let err = result.unwrap_err();
|
|
1095
|
+
assert!(err.to_string().contains("Overlapping"));
|
|
1096
|
+
assert!(err.to_string().contains("boundaries"));
|
|
1097
|
+
}
|
|
1098
|
+
|
|
1099
|
+
#[test]
|
|
1100
|
+
fn test_calculate_page_range_with_invalid_boundaries() {
|
|
1101
|
+
use crate::types::PageBoundary;
|
|
1102
|
+
|
|
1103
|
+
let boundaries = vec![PageBoundary {
|
|
1104
|
+
byte_start: 15,
|
|
1105
|
+
byte_end: 10,
|
|
1106
|
+
page_number: 1,
|
|
1107
|
+
}];
|
|
1108
|
+
|
|
1109
|
+
let result = calculate_page_range(0, 20, &boundaries);
|
|
1110
|
+
assert!(result.is_err());
|
|
1111
|
+
let err = result.unwrap_err();
|
|
1112
|
+
assert!(err.to_string().contains("Invalid boundary range"));
|
|
1113
|
+
}
|
|
1114
|
+
|
|
1115
|
+
#[test]
|
|
1116
|
+
fn test_validate_page_boundaries_valid() {
|
|
1117
|
+
use crate::types::PageBoundary;
|
|
1118
|
+
|
|
1119
|
+
let boundaries = vec![
|
|
1120
|
+
PageBoundary {
|
|
1121
|
+
byte_start: 0,
|
|
1122
|
+
byte_end: 20,
|
|
1123
|
+
page_number: 1,
|
|
1124
|
+
},
|
|
1125
|
+
PageBoundary {
|
|
1126
|
+
byte_start: 20,
|
|
1127
|
+
byte_end: 40,
|
|
1128
|
+
page_number: 2,
|
|
1129
|
+
},
|
|
1130
|
+
PageBoundary {
|
|
1131
|
+
byte_start: 40,
|
|
1132
|
+
byte_end: 60,
|
|
1133
|
+
page_number: 3,
|
|
1134
|
+
},
|
|
1135
|
+
];
|
|
1136
|
+
|
|
1137
|
+
let result = chunk_text(
|
|
1138
|
+
"x".repeat(60).as_str(),
|
|
1139
|
+
&ChunkingConfig {
|
|
1140
|
+
max_characters: 30,
|
|
1141
|
+
overlap: 5,
|
|
1142
|
+
trim: false,
|
|
1143
|
+
chunker_type: ChunkerType::Text,
|
|
1144
|
+
},
|
|
1145
|
+
Some(&boundaries),
|
|
1146
|
+
);
|
|
1147
|
+
assert!(result.is_ok());
|
|
1148
|
+
}
|
|
1149
|
+
|
|
1150
|
+
#[test]
|
|
1151
|
+
fn test_validate_page_boundaries_empty() {
|
|
1152
|
+
let boundaries: Vec<PageBoundary> = vec![];
|
|
1153
|
+
let result = chunk_text(
|
|
1154
|
+
"Some test text",
|
|
1155
|
+
&ChunkingConfig {
|
|
1156
|
+
max_characters: 30,
|
|
1157
|
+
overlap: 5,
|
|
1158
|
+
trim: true,
|
|
1159
|
+
chunker_type: ChunkerType::Text,
|
|
1160
|
+
},
|
|
1161
|
+
Some(&boundaries),
|
|
1162
|
+
);
|
|
1163
|
+
assert!(result.is_ok());
|
|
1164
|
+
}
|
|
1165
|
+
|
|
1166
|
+
#[test]
|
|
1167
|
+
fn test_page_boundaries_with_gaps() {
|
|
1168
|
+
use crate::types::PageBoundary;
|
|
1169
|
+
|
|
1170
|
+
let boundaries = vec![
|
|
1171
|
+
PageBoundary {
|
|
1172
|
+
byte_start: 0,
|
|
1173
|
+
byte_end: 10,
|
|
1174
|
+
page_number: 1,
|
|
1175
|
+
},
|
|
1176
|
+
PageBoundary {
|
|
1177
|
+
byte_start: 15,
|
|
1178
|
+
byte_end: 25,
|
|
1179
|
+
page_number: 2,
|
|
1180
|
+
},
|
|
1181
|
+
];
|
|
1182
|
+
|
|
1183
|
+
let text = "0123456789XXXXX0123456789";
|
|
1184
|
+
let result = chunk_text(
|
|
1185
|
+
text,
|
|
1186
|
+
&ChunkingConfig {
|
|
1187
|
+
max_characters: 30,
|
|
1188
|
+
overlap: 5,
|
|
1189
|
+
trim: false,
|
|
1190
|
+
chunker_type: ChunkerType::Text,
|
|
1191
|
+
},
|
|
1192
|
+
Some(&boundaries),
|
|
1193
|
+
);
|
|
1194
|
+
assert!(result.is_ok());
|
|
1195
|
+
}
|
|
1196
|
+
|
|
1197
|
+
#[test]
|
|
1198
|
+
fn test_chunk_with_same_start_and_end() {
|
|
1199
|
+
use crate::types::PageBoundary;
|
|
1200
|
+
|
|
1201
|
+
let boundaries = vec![PageBoundary {
|
|
1202
|
+
byte_start: 10,
|
|
1203
|
+
byte_end: 10,
|
|
1204
|
+
page_number: 1,
|
|
1205
|
+
}];
|
|
1206
|
+
|
|
1207
|
+
let result = chunk_text(
|
|
1208
|
+
"test content here",
|
|
1209
|
+
&ChunkingConfig {
|
|
1210
|
+
max_characters: 30,
|
|
1211
|
+
overlap: 5,
|
|
1212
|
+
trim: true,
|
|
1213
|
+
chunker_type: ChunkerType::Text,
|
|
1214
|
+
},
|
|
1215
|
+
Some(&boundaries),
|
|
1216
|
+
);
|
|
1217
|
+
assert!(result.is_err());
|
|
1218
|
+
let err = result.unwrap_err();
|
|
1219
|
+
assert!(err.to_string().contains("Invalid boundary range"));
|
|
1220
|
+
}
|
|
1221
|
+
|
|
1222
|
+
#[test]
|
|
1223
|
+
fn test_multiple_overlapping_errors() {
|
|
1224
|
+
use crate::types::PageBoundary;
|
|
1225
|
+
|
|
1226
|
+
let text = "This is a longer test content string that spans more bytes";
|
|
1227
|
+
let boundaries = vec![
|
|
1228
|
+
PageBoundary {
|
|
1229
|
+
byte_start: 20,
|
|
1230
|
+
byte_end: 40,
|
|
1231
|
+
page_number: 2,
|
|
1232
|
+
},
|
|
1233
|
+
PageBoundary {
|
|
1234
|
+
byte_start: 10,
|
|
1235
|
+
byte_end: 35,
|
|
1236
|
+
page_number: 1,
|
|
1237
|
+
},
|
|
1238
|
+
];
|
|
1239
|
+
|
|
1240
|
+
let result = chunk_text(
|
|
1241
|
+
text,
|
|
1242
|
+
&ChunkingConfig {
|
|
1243
|
+
max_characters: 30,
|
|
1244
|
+
overlap: 5,
|
|
1245
|
+
trim: true,
|
|
1246
|
+
chunker_type: ChunkerType::Text,
|
|
1247
|
+
},
|
|
1248
|
+
Some(&boundaries),
|
|
1249
|
+
);
|
|
1250
|
+
assert!(result.is_err());
|
|
1251
|
+
assert!(result.unwrap_err().to_string().contains("not sorted"));
|
|
1252
|
+
}
|
|
1253
|
+
|
|
1254
|
+
#[test]
|
|
1255
|
+
fn test_chunk_with_pages_basic() {
|
|
1256
|
+
use crate::types::PageBoundary;
|
|
1257
|
+
|
|
1258
|
+
let config = ChunkingConfig {
|
|
1259
|
+
max_characters: 25,
|
|
1260
|
+
overlap: 5,
|
|
1261
|
+
trim: true,
|
|
1262
|
+
chunker_type: ChunkerType::Text,
|
|
1263
|
+
};
|
|
1264
|
+
let text = "First page content here.Second page content here.Third page.";
|
|
1265
|
+
|
|
1266
|
+
let boundaries = vec![
|
|
1267
|
+
PageBoundary {
|
|
1268
|
+
byte_start: 0,
|
|
1269
|
+
byte_end: 24,
|
|
1270
|
+
page_number: 1,
|
|
1271
|
+
},
|
|
1272
|
+
PageBoundary {
|
|
1273
|
+
byte_start: 24,
|
|
1274
|
+
byte_end: 50,
|
|
1275
|
+
page_number: 2,
|
|
1276
|
+
},
|
|
1277
|
+
PageBoundary {
|
|
1278
|
+
byte_start: 50,
|
|
1279
|
+
byte_end: 60,
|
|
1280
|
+
page_number: 3,
|
|
1281
|
+
},
|
|
1282
|
+
];
|
|
1283
|
+
|
|
1284
|
+
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1285
|
+
|
|
1286
|
+
if !result.chunks.is_empty() {
|
|
1287
|
+
assert!(result.chunks[0].metadata.first_page.is_some());
|
|
1288
|
+
}
|
|
1289
|
+
}
|
|
1290
|
+
|
|
1291
|
+
#[test]
|
|
1292
|
+
fn test_chunk_with_pages_single_page_chunk() {
|
|
1293
|
+
use crate::types::PageBoundary;
|
|
1294
|
+
|
|
1295
|
+
let config = ChunkingConfig {
|
|
1296
|
+
max_characters: 100,
|
|
1297
|
+
overlap: 10,
|
|
1298
|
+
trim: true,
|
|
1299
|
+
chunker_type: ChunkerType::Text,
|
|
1300
|
+
};
|
|
1301
|
+
let text = "All content on single page fits in one chunk.";
|
|
1302
|
+
|
|
1303
|
+
let boundaries = vec![PageBoundary {
|
|
1304
|
+
byte_start: 0,
|
|
1305
|
+
byte_end: 45,
|
|
1306
|
+
page_number: 1,
|
|
1307
|
+
}];
|
|
1308
|
+
|
|
1309
|
+
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1310
|
+
assert_eq!(result.chunks.len(), 1);
|
|
1311
|
+
assert_eq!(result.chunks[0].metadata.first_page, Some(1));
|
|
1312
|
+
assert_eq!(result.chunks[0].metadata.last_page, Some(1));
|
|
1313
|
+
}
|
|
1314
|
+
|
|
1315
|
+
#[test]
|
|
1316
|
+
fn test_chunk_with_pages_no_overlap() {
|
|
1317
|
+
use crate::types::PageBoundary;
|
|
1318
|
+
|
|
1319
|
+
let config = ChunkingConfig {
|
|
1320
|
+
max_characters: 20,
|
|
1321
|
+
overlap: 0,
|
|
1322
|
+
trim: false,
|
|
1323
|
+
chunker_type: ChunkerType::Text,
|
|
1324
|
+
};
|
|
1325
|
+
let text = "AAAAA BBBBB CCCCC DDDDD";
|
|
1326
|
+
|
|
1327
|
+
let boundaries = vec![
|
|
1328
|
+
PageBoundary {
|
|
1329
|
+
byte_start: 0,
|
|
1330
|
+
byte_end: 11,
|
|
1331
|
+
page_number: 1,
|
|
1332
|
+
},
|
|
1333
|
+
PageBoundary {
|
|
1334
|
+
byte_start: 11,
|
|
1335
|
+
byte_end: 23,
|
|
1336
|
+
page_number: 2,
|
|
1337
|
+
},
|
|
1338
|
+
];
|
|
1339
|
+
|
|
1340
|
+
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1341
|
+
assert!(!result.chunks.is_empty());
|
|
1342
|
+
|
|
1343
|
+
for chunk in &result.chunks {
|
|
1344
|
+
if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
|
|
1345
|
+
assert!(first <= last);
|
|
1346
|
+
}
|
|
1347
|
+
}
|
|
1348
|
+
}
|
|
1349
|
+
|
|
1350
|
+
#[test]
|
|
1351
|
+
fn test_calculate_page_range_within_page() {
|
|
1352
|
+
let boundaries = vec![
|
|
1353
|
+
PageBoundary {
|
|
1354
|
+
byte_start: 0,
|
|
1355
|
+
byte_end: 100,
|
|
1356
|
+
page_number: 1,
|
|
1357
|
+
},
|
|
1358
|
+
PageBoundary {
|
|
1359
|
+
byte_start: 100,
|
|
1360
|
+
byte_end: 200,
|
|
1361
|
+
page_number: 2,
|
|
1362
|
+
},
|
|
1363
|
+
];
|
|
1364
|
+
|
|
1365
|
+
let (first, last) = calculate_page_range(10, 50, &boundaries).unwrap();
|
|
1366
|
+
assert_eq!(first, Some(1));
|
|
1367
|
+
assert_eq!(last, Some(1));
|
|
1368
|
+
}
|
|
1369
|
+
|
|
1370
|
+
#[test]
|
|
1371
|
+
fn test_calculate_page_range_spanning_pages() {
|
|
1372
|
+
let boundaries = vec![
|
|
1373
|
+
PageBoundary {
|
|
1374
|
+
byte_start: 0,
|
|
1375
|
+
byte_end: 100,
|
|
1376
|
+
page_number: 1,
|
|
1377
|
+
},
|
|
1378
|
+
PageBoundary {
|
|
1379
|
+
byte_start: 100,
|
|
1380
|
+
byte_end: 200,
|
|
1381
|
+
page_number: 2,
|
|
1382
|
+
},
|
|
1383
|
+
];
|
|
1384
|
+
|
|
1385
|
+
let (first, last) = calculate_page_range(50, 150, &boundaries).unwrap();
|
|
1386
|
+
assert_eq!(first, Some(1));
|
|
1387
|
+
assert_eq!(last, Some(2));
|
|
1388
|
+
}
|
|
1389
|
+
|
|
1390
|
+
#[test]
|
|
1391
|
+
fn test_calculate_page_range_empty_boundaries() {
|
|
1392
|
+
let boundaries: Vec<PageBoundary> = vec![];
|
|
1393
|
+
|
|
1394
|
+
let (first, last) = calculate_page_range(0, 50, &boundaries).unwrap();
|
|
1395
|
+
assert_eq!(first, None);
|
|
1396
|
+
assert_eq!(last, None);
|
|
1397
|
+
}
|
|
1398
|
+
|
|
1399
|
+
#[test]
|
|
1400
|
+
fn test_calculate_page_range_no_overlap() {
|
|
1401
|
+
let boundaries = vec![
|
|
1402
|
+
PageBoundary {
|
|
1403
|
+
byte_start: 0,
|
|
1404
|
+
byte_end: 100,
|
|
1405
|
+
page_number: 1,
|
|
1406
|
+
},
|
|
1407
|
+
PageBoundary {
|
|
1408
|
+
byte_start: 100,
|
|
1409
|
+
byte_end: 200,
|
|
1410
|
+
page_number: 2,
|
|
1411
|
+
},
|
|
1412
|
+
];
|
|
1413
|
+
|
|
1414
|
+
let (first, last) = calculate_page_range(200, 250, &boundaries).unwrap();
|
|
1415
|
+
assert_eq!(first, None);
|
|
1416
|
+
assert_eq!(last, None);
|
|
1417
|
+
}
|
|
1418
|
+
|
|
1419
|
+
#[test]
|
|
1420
|
+
fn test_calculate_page_range_three_pages() {
|
|
1421
|
+
let boundaries = vec![
|
|
1422
|
+
PageBoundary {
|
|
1423
|
+
byte_start: 0,
|
|
1424
|
+
byte_end: 100,
|
|
1425
|
+
page_number: 1,
|
|
1426
|
+
},
|
|
1427
|
+
PageBoundary {
|
|
1428
|
+
byte_start: 100,
|
|
1429
|
+
byte_end: 200,
|
|
1430
|
+
page_number: 2,
|
|
1431
|
+
},
|
|
1432
|
+
PageBoundary {
|
|
1433
|
+
byte_start: 200,
|
|
1434
|
+
byte_end: 300,
|
|
1435
|
+
page_number: 3,
|
|
1436
|
+
},
|
|
1437
|
+
];
|
|
1438
|
+
|
|
1439
|
+
let (first, last) = calculate_page_range(50, 250, &boundaries).unwrap();
|
|
1440
|
+
assert_eq!(first, Some(1));
|
|
1441
|
+
assert_eq!(last, Some(3));
|
|
1442
|
+
}
|
|
1443
|
+
|
|
1444
|
+
#[test]
|
|
1445
|
+
fn test_chunk_metadata_page_range_accuracy() {
|
|
1446
|
+
use crate::types::PageBoundary;
|
|
1447
|
+
|
|
1448
|
+
let config = ChunkingConfig {
|
|
1449
|
+
max_characters: 30,
|
|
1450
|
+
overlap: 5,
|
|
1451
|
+
trim: true,
|
|
1452
|
+
chunker_type: ChunkerType::Text,
|
|
1453
|
+
};
|
|
1454
|
+
let text = "Page One Content Here.Page Two.";
|
|
1455
|
+
|
|
1456
|
+
let boundaries = vec![
|
|
1457
|
+
PageBoundary {
|
|
1458
|
+
byte_start: 0,
|
|
1459
|
+
byte_end: 21,
|
|
1460
|
+
page_number: 1,
|
|
1461
|
+
},
|
|
1462
|
+
PageBoundary {
|
|
1463
|
+
byte_start: 21,
|
|
1464
|
+
byte_end: 31,
|
|
1465
|
+
page_number: 2,
|
|
1466
|
+
},
|
|
1467
|
+
];
|
|
1468
|
+
|
|
1469
|
+
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1470
|
+
|
|
1471
|
+
for chunk in &result.chunks {
|
|
1472
|
+
assert_eq!(chunk.metadata.byte_end - chunk.metadata.byte_start, chunk.content.len());
|
|
1473
|
+
}
|
|
1474
|
+
}
|
|
1475
|
+
|
|
1476
|
+
#[test]
|
|
1477
|
+
fn test_chunk_page_range_boundary_edge_cases() {
|
|
1478
|
+
use crate::types::PageBoundary;
|
|
1479
|
+
|
|
1480
|
+
let config = ChunkingConfig {
|
|
1481
|
+
max_characters: 10,
|
|
1482
|
+
overlap: 2,
|
|
1483
|
+
trim: false,
|
|
1484
|
+
chunker_type: ChunkerType::Text,
|
|
1485
|
+
};
|
|
1486
|
+
let text = "0123456789ABCDEFGHIJ";
|
|
1487
|
+
|
|
1488
|
+
let boundaries = vec![
|
|
1489
|
+
PageBoundary {
|
|
1490
|
+
byte_start: 0,
|
|
1491
|
+
byte_end: 10,
|
|
1492
|
+
page_number: 1,
|
|
1493
|
+
},
|
|
1494
|
+
PageBoundary {
|
|
1495
|
+
byte_start: 10,
|
|
1496
|
+
byte_end: 20,
|
|
1497
|
+
page_number: 2,
|
|
1498
|
+
},
|
|
1499
|
+
];
|
|
1500
|
+
|
|
1501
|
+
let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
|
|
1502
|
+
|
|
1503
|
+
for chunk in &result.chunks {
|
|
1504
|
+
let on_page1 = chunk.metadata.byte_start < 10;
|
|
1505
|
+
let on_page2 = chunk.metadata.byte_end > 10;
|
|
1506
|
+
|
|
1507
|
+
if on_page1 && on_page2 {
|
|
1508
|
+
assert_eq!(chunk.metadata.first_page, Some(1));
|
|
1509
|
+
assert_eq!(chunk.metadata.last_page, Some(2));
|
|
1510
|
+
} else if on_page1 {
|
|
1511
|
+
assert_eq!(chunk.metadata.first_page, Some(1));
|
|
1512
|
+
} else if on_page2 {
|
|
1513
|
+
assert_eq!(chunk.metadata.first_page, Some(2));
|
|
1514
|
+
}
|
|
1515
|
+
}
|
|
1516
|
+
}
|
|
1517
|
+
|
|
1518
|
+
#[test]
|
|
1519
|
+
fn test_validate_utf8_boundaries_valid_ascii() {
|
|
1520
|
+
use crate::types::PageBoundary;
|
|
1521
|
+
|
|
1522
|
+
let text = "This is ASCII text.";
|
|
1523
|
+
let boundaries = vec![
|
|
1524
|
+
PageBoundary {
|
|
1525
|
+
byte_start: 0,
|
|
1526
|
+
byte_end: 10,
|
|
1527
|
+
page_number: 1,
|
|
1528
|
+
},
|
|
1529
|
+
PageBoundary {
|
|
1530
|
+
byte_start: 10,
|
|
1531
|
+
byte_end: 19,
|
|
1532
|
+
page_number: 2,
|
|
1533
|
+
},
|
|
1534
|
+
];
|
|
1535
|
+
|
|
1536
|
+
let result = chunk_text(text, &ChunkingConfig::default(), Some(&boundaries));
|
|
1537
|
+
assert!(result.is_ok());
|
|
1538
|
+
}
|
|
1539
|
+
|
|
1540
|
+
#[test]
|
|
1541
|
+
fn test_validate_utf8_boundaries_valid_emoji() {
|
|
1542
|
+
use crate::types::PageBoundary;
|
|
1543
|
+
|
|
1544
|
+
let text = "Hello 👋 World 🌍 End";
|
|
1545
|
+
let config = ChunkingConfig::default();
|
|
1546
|
+
|
|
1547
|
+
let boundaries = vec![
|
|
1548
|
+
PageBoundary {
|
|
1549
|
+
byte_start: 0,
|
|
1550
|
+
byte_end: 11,
|
|
1551
|
+
page_number: 1,
|
|
1552
|
+
},
|
|
1553
|
+
PageBoundary {
|
|
1554
|
+
byte_start: 11,
|
|
1555
|
+
byte_end: 25,
|
|
1556
|
+
page_number: 2,
|
|
1557
|
+
},
|
|
1558
|
+
];
|
|
1559
|
+
|
|
1560
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1561
|
+
assert!(result.is_ok());
|
|
1562
|
+
}
|
|
1563
|
+
|
|
1564
|
+
#[test]
|
|
1565
|
+
fn test_validate_utf8_boundaries_valid_cjk() {
|
|
1566
|
+
use crate::types::PageBoundary;
|
|
1567
|
+
|
|
1568
|
+
let text = "你好世界 こんにちは 안녕하세요";
|
|
1569
|
+
let config = ChunkingConfig::default();
|
|
1570
|
+
|
|
1571
|
+
let boundaries = vec![
|
|
1572
|
+
PageBoundary {
|
|
1573
|
+
byte_start: 0,
|
|
1574
|
+
byte_end: 13,
|
|
1575
|
+
page_number: 1,
|
|
1576
|
+
},
|
|
1577
|
+
PageBoundary {
|
|
1578
|
+
byte_start: 13,
|
|
1579
|
+
byte_end: 44,
|
|
1580
|
+
page_number: 2,
|
|
1581
|
+
},
|
|
1582
|
+
];
|
|
1583
|
+
|
|
1584
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1585
|
+
assert!(result.is_ok());
|
|
1586
|
+
}
|
|
1587
|
+
|
|
1588
|
+
#[test]
|
|
1589
|
+
fn test_validate_utf8_boundaries_invalid_mid_emoji() {
|
|
1590
|
+
use crate::types::PageBoundary;
|
|
1591
|
+
|
|
1592
|
+
let text = "Hello 👋 World";
|
|
1593
|
+
let boundaries = vec![PageBoundary {
|
|
1594
|
+
byte_start: 0,
|
|
1595
|
+
byte_end: 7,
|
|
1596
|
+
page_number: 1,
|
|
1597
|
+
}];
|
|
1598
|
+
|
|
1599
|
+
let config = ChunkingConfig::default();
|
|
1600
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1601
|
+
assert!(result.is_err());
|
|
1602
|
+
let err = result.unwrap_err();
|
|
1603
|
+
assert!(err.to_string().contains("UTF-8 character boundary"));
|
|
1604
|
+
assert!(err.to_string().contains("byte_end=7"));
|
|
1605
|
+
}
|
|
1606
|
+
|
|
1607
|
+
#[test]
|
|
1608
|
+
fn test_validate_utf8_boundaries_invalid_mid_multibyte_cjk() {
|
|
1609
|
+
use crate::types::PageBoundary;
|
|
1610
|
+
|
|
1611
|
+
let text = "中文文本";
|
|
1612
|
+
let boundaries = vec![PageBoundary {
|
|
1613
|
+
byte_start: 0,
|
|
1614
|
+
byte_end: 1,
|
|
1615
|
+
page_number: 1,
|
|
1616
|
+
}];
|
|
1617
|
+
|
|
1618
|
+
let config = ChunkingConfig::default();
|
|
1619
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1620
|
+
assert!(result.is_err());
|
|
1621
|
+
let err = result.unwrap_err();
|
|
1622
|
+
assert!(err.to_string().contains("UTF-8 character boundary"));
|
|
1623
|
+
}
|
|
1624
|
+
|
|
1625
|
+
#[test]
|
|
1626
|
+
fn test_validate_utf8_boundaries_byte_start_exceeds_length() {
|
|
1627
|
+
use crate::types::PageBoundary;
|
|
1628
|
+
|
|
1629
|
+
let text = "Short";
|
|
1630
|
+
let boundaries = vec![
|
|
1631
|
+
PageBoundary {
|
|
1632
|
+
byte_start: 0,
|
|
1633
|
+
byte_end: 3,
|
|
1634
|
+
page_number: 1,
|
|
1635
|
+
},
|
|
1636
|
+
PageBoundary {
|
|
1637
|
+
byte_start: 10,
|
|
1638
|
+
byte_end: 15,
|
|
1639
|
+
page_number: 2,
|
|
1640
|
+
},
|
|
1641
|
+
];
|
|
1642
|
+
|
|
1643
|
+
let config = ChunkingConfig::default();
|
|
1644
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1645
|
+
assert!(result.is_err());
|
|
1646
|
+
let err = result.unwrap_err();
|
|
1647
|
+
assert!(err.to_string().contains("exceeds text length"));
|
|
1648
|
+
}
|
|
1649
|
+
|
|
1650
|
+
#[test]
|
|
1651
|
+
fn test_validate_utf8_boundaries_byte_end_exceeds_length() {
|
|
1652
|
+
use crate::types::PageBoundary;
|
|
1653
|
+
|
|
1654
|
+
let text = "Short";
|
|
1655
|
+
let boundaries = vec![PageBoundary {
|
|
1656
|
+
byte_start: 0,
|
|
1657
|
+
byte_end: 100,
|
|
1658
|
+
page_number: 1,
|
|
1659
|
+
}];
|
|
1660
|
+
|
|
1661
|
+
let config = ChunkingConfig::default();
|
|
1662
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1663
|
+
assert!(result.is_err());
|
|
1664
|
+
let err = result.unwrap_err();
|
|
1665
|
+
assert!(err.to_string().contains("exceeds text length"));
|
|
1666
|
+
}
|
|
1667
|
+
|
|
1668
|
+
#[test]
|
|
1669
|
+
fn test_validate_utf8_boundaries_empty_boundaries() {
|
|
1670
|
+
use crate::types::PageBoundary;
|
|
1671
|
+
|
|
1672
|
+
let text = "Some text";
|
|
1673
|
+
let boundaries: Vec<PageBoundary> = vec![];
|
|
1674
|
+
|
|
1675
|
+
let config = ChunkingConfig::default();
|
|
1676
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1677
|
+
assert!(result.is_ok());
|
|
1678
|
+
}
|
|
1679
|
+
|
|
1680
|
+
#[test]
|
|
1681
|
+
fn test_validate_utf8_boundaries_at_text_boundaries() {
|
|
1682
|
+
use crate::types::PageBoundary;
|
|
1683
|
+
|
|
1684
|
+
let text = "Exact boundary test";
|
|
1685
|
+
let text_len = text.len();
|
|
1686
|
+
let boundaries = vec![PageBoundary {
|
|
1687
|
+
byte_start: 0,
|
|
1688
|
+
byte_end: text_len,
|
|
1689
|
+
page_number: 1,
|
|
1690
|
+
}];
|
|
1691
|
+
|
|
1692
|
+
let config = ChunkingConfig::default();
|
|
1693
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1694
|
+
assert!(result.is_ok());
|
|
1695
|
+
}
|
|
1696
|
+
|
|
1697
|
+
#[test]
|
|
1698
|
+
fn test_validate_utf8_boundaries_mixed_languages() {
|
|
1699
|
+
use crate::types::PageBoundary;
|
|
1700
|
+
|
|
1701
|
+
let text = "English text mixed with 中文 and français";
|
|
1702
|
+
let config = ChunkingConfig::default();
|
|
1703
|
+
|
|
1704
|
+
let boundaries = vec![
|
|
1705
|
+
PageBoundary {
|
|
1706
|
+
byte_start: 0,
|
|
1707
|
+
byte_end: 24,
|
|
1708
|
+
page_number: 1,
|
|
1709
|
+
},
|
|
1710
|
+
PageBoundary {
|
|
1711
|
+
byte_start: 24,
|
|
1712
|
+
byte_end: text.len(),
|
|
1713
|
+
page_number: 2,
|
|
1714
|
+
},
|
|
1715
|
+
];
|
|
1716
|
+
|
|
1717
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1718
|
+
assert!(result.is_ok());
|
|
1719
|
+
}
|
|
1720
|
+
|
|
1721
|
+
#[test]
|
|
1722
|
+
fn test_chunk_text_rejects_invalid_utf8_boundaries() {
|
|
1723
|
+
use crate::types::PageBoundary;
|
|
1724
|
+
|
|
1725
|
+
let text = "🌍🌎🌏 Three emoji planets";
|
|
1726
|
+
let config = ChunkingConfig::default();
|
|
1727
|
+
|
|
1728
|
+
let boundaries = vec![PageBoundary {
|
|
1729
|
+
byte_start: 0,
|
|
1730
|
+
byte_end: 1000,
|
|
1731
|
+
page_number: 1,
|
|
1732
|
+
}];
|
|
1733
|
+
|
|
1734
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1735
|
+
assert!(result.is_err());
|
|
1736
|
+
}
|
|
1737
|
+
|
|
1738
|
+
#[test]
|
|
1739
|
+
fn test_validate_utf8_boundaries_combining_diacriticals() {
|
|
1740
|
+
use crate::types::PageBoundary;
|
|
1741
|
+
|
|
1742
|
+
let text = "café";
|
|
1743
|
+
let config = ChunkingConfig::default();
|
|
1744
|
+
|
|
1745
|
+
let boundaries = vec![
|
|
1746
|
+
PageBoundary {
|
|
1747
|
+
byte_start: 0,
|
|
1748
|
+
byte_end: 2,
|
|
1749
|
+
page_number: 1,
|
|
1750
|
+
},
|
|
1751
|
+
PageBoundary {
|
|
1752
|
+
byte_start: 2,
|
|
1753
|
+
byte_end: text.len(),
|
|
1754
|
+
page_number: 2,
|
|
1755
|
+
},
|
|
1756
|
+
];
|
|
1757
|
+
|
|
1758
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1759
|
+
assert!(result.is_ok());
|
|
1760
|
+
}
|
|
1761
|
+
|
|
1762
|
+
#[test]
|
|
1763
|
+
fn test_validate_utf8_boundaries_error_messages_are_clear() {
|
|
1764
|
+
use crate::types::PageBoundary;
|
|
1765
|
+
|
|
1766
|
+
let text = "Test 👋 text";
|
|
1767
|
+
let config = ChunkingConfig::default();
|
|
1768
|
+
|
|
1769
|
+
let boundaries = vec![PageBoundary {
|
|
1770
|
+
byte_start: 0,
|
|
1771
|
+
byte_end: 6,
|
|
1772
|
+
page_number: 1,
|
|
1773
|
+
}];
|
|
1774
|
+
|
|
1775
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1776
|
+
assert!(result.is_err());
|
|
1777
|
+
let err = result.unwrap_err();
|
|
1778
|
+
let err_msg = err.to_string();
|
|
1779
|
+
assert!(err_msg.contains("UTF-8"));
|
|
1780
|
+
assert!(err_msg.contains("boundary"));
|
|
1781
|
+
assert!(err_msg.contains("6"));
|
|
1782
|
+
}
|
|
1783
|
+
|
|
1784
|
+
#[test]
|
|
1785
|
+
fn test_validate_utf8_boundaries_multiple_valid_boundaries() {
|
|
1786
|
+
use crate::types::PageBoundary;
|
|
1787
|
+
|
|
1788
|
+
let text = "First👋Second🌍Third";
|
|
1789
|
+
let config = ChunkingConfig::default();
|
|
1790
|
+
|
|
1791
|
+
let boundaries = vec![
|
|
1792
|
+
PageBoundary {
|
|
1793
|
+
byte_start: 0,
|
|
1794
|
+
byte_end: 5,
|
|
1795
|
+
page_number: 1,
|
|
1796
|
+
},
|
|
1797
|
+
PageBoundary {
|
|
1798
|
+
byte_start: 5,
|
|
1799
|
+
byte_end: 9,
|
|
1800
|
+
page_number: 2,
|
|
1801
|
+
},
|
|
1802
|
+
PageBoundary {
|
|
1803
|
+
byte_start: 9,
|
|
1804
|
+
byte_end: 15,
|
|
1805
|
+
page_number: 3,
|
|
1806
|
+
},
|
|
1807
|
+
PageBoundary {
|
|
1808
|
+
byte_start: 15,
|
|
1809
|
+
byte_end: 19,
|
|
1810
|
+
page_number: 4,
|
|
1811
|
+
},
|
|
1812
|
+
PageBoundary {
|
|
1813
|
+
byte_start: 19,
|
|
1814
|
+
byte_end: text.len(),
|
|
1815
|
+
page_number: 5,
|
|
1816
|
+
},
|
|
1817
|
+
];
|
|
1818
|
+
|
|
1819
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1820
|
+
assert!(result.is_ok());
|
|
1821
|
+
}
|
|
1822
|
+
|
|
1823
|
+
#[test]
|
|
1824
|
+
fn test_validate_utf8_boundaries_zero_start_and_end() {
|
|
1825
|
+
use crate::types::PageBoundary;
|
|
1826
|
+
|
|
1827
|
+
let text = "Text";
|
|
1828
|
+
let config = ChunkingConfig::default();
|
|
1829
|
+
|
|
1830
|
+
let boundaries = vec![PageBoundary {
|
|
1831
|
+
byte_start: 0,
|
|
1832
|
+
byte_end: 0,
|
|
1833
|
+
page_number: 1,
|
|
1834
|
+
}];
|
|
1835
|
+
|
|
1836
|
+
let result = chunk_text(text, &config, Some(&boundaries));
|
|
1837
|
+
assert!(result.is_err());
|
|
1838
|
+
}
|
|
1839
|
+
}
|
|
1840
|
+
|
|
1841
|
+
/// Lazy-initialized flag that ensures chunking processor is registered exactly once.
|
|
1842
|
+
///
|
|
1843
|
+
/// This static is accessed on first use to automatically register the
|
|
1844
|
+
/// chunking processor with the plugin registry.
|
|
1845
|
+
static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_chunking_processor);
|
|
1846
|
+
|
|
1847
|
+
/// Ensure the chunking processor is registered.
|
|
1848
|
+
///
|
|
1849
|
+
/// This function is called automatically when needed.
|
|
1850
|
+
/// It's safe to call multiple times - registration only happens once.
|
|
1851
|
+
pub fn ensure_initialized() -> Result<()> {
|
|
1852
|
+
PROCESSOR_INITIALIZED
|
|
1853
|
+
.as_ref()
|
|
1854
|
+
.map(|_| ())
|
|
1855
|
+
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
1856
|
+
message: format!("Failed to register chunking processor: {}", e),
|
|
1857
|
+
plugin_name: "text-chunking".to_string(),
|
|
1858
|
+
})
|
|
1859
|
+
}
|
|
1860
|
+
|
|
1861
|
+
/// Register the chunking processor with the global registry.
|
|
1862
|
+
///
|
|
1863
|
+
/// This function should be called once at application startup to register
|
|
1864
|
+
/// the chunking post-processor.
|
|
1865
|
+
///
|
|
1866
|
+
/// **Note:** This is called automatically on first use.
|
|
1867
|
+
/// Explicit calling is optional.
|
|
1868
|
+
pub fn register_chunking_processor() -> Result<()> {
|
|
1869
|
+
let registry = crate::plugins::registry::get_post_processor_registry();
|
|
1870
|
+
let mut registry = registry
|
|
1871
|
+
.write()
|
|
1872
|
+
.map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
|
|
1873
|
+
|
|
1874
|
+
registry.register(Arc::new(ChunkingProcessor), 50)?;
|
|
1875
|
+
|
|
1876
|
+
Ok(())
|
|
677
1877
|
}
|