kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -30,7 +30,7 @@
30
30
  //! };
31
31
  //!
32
32
  //! let long_text = "This is a very long document...".repeat(100);
33
- //! let result = chunk_text(&long_text, &config)?;
33
+ //! let result = chunk_text(&long_text, &config, None)?;
34
34
  //!
35
35
  //! println!("Split into {} chunks", result.chunk_count);
36
36
  //! for (i, chunk) in result.chunks.iter().enumerate() {
@@ -47,10 +47,15 @@
47
47
  //! - Processing large documents in batches
48
48
  //! - Maintaining context across chunk boundaries
49
49
  use crate::error::{KreuzbergError, Result};
50
- use crate::types::{Chunk, ChunkMetadata};
50
+ use crate::types::{Chunk, ChunkMetadata, PageBoundary};
51
+ use once_cell::sync::Lazy;
51
52
  use serde::{Deserialize, Serialize};
53
+ use std::sync::Arc;
52
54
  use text_splitter::{Characters, ChunkCapacity, ChunkConfig, MarkdownSplitter, TextSplitter};
53
55
 
56
+ pub mod processor;
57
+ pub use processor::ChunkingProcessor;
58
+
54
59
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
55
60
  pub enum ChunkerType {
56
61
  Text,
@@ -88,7 +93,215 @@ fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) -> Resu
88
93
  .map_err(|e| KreuzbergError::validation(format!("Invalid chunking configuration: {}", e)))
89
94
  }
90
95
 
91
- pub fn chunk_text(text: &str, config: &ChunkingConfig) -> Result<ChunkingResult> {
96
+ /// Validates that byte offsets in page boundaries fall on valid UTF-8 character boundaries.
97
+ ///
98
+ /// This function ensures that all page boundary positions are at valid UTF-8 character
99
+ /// boundaries within the text. This is CRITICAL to prevent text corruption when boundaries
100
+ /// are created from language bindings or external sources, particularly with multibyte
101
+ /// UTF-8 characters (emoji, CJK characters, combining marks, etc.).
102
+ ///
103
+ /// # Arguments
104
+ ///
105
+ /// * `text` - The text being chunked
106
+ /// * `boundaries` - Page boundary markers to validate
107
+ ///
108
+ /// # Returns
109
+ ///
110
+ /// Returns `Ok(())` if all boundaries are at valid UTF-8 character boundaries.
111
+ /// Returns `KreuzbergError::Validation` if any boundary is at an invalid position.
112
+ ///
113
+ /// # UTF-8 Boundary Safety
114
+ ///
115
+ /// Rust strings use UTF-8 encoding where characters can be 1-4 bytes. For example:
116
+ /// - ASCII letters: 1 byte each
117
+ /// - Emoji (🌍): 4 bytes but 1 character
118
+ /// - CJK characters (中): 3 bytes but 1 character
119
+ ///
120
+ /// This function checks that all byte_start and byte_end values are at character
121
+ /// boundaries using Rust's `is_char_boundary()` method.
122
+ fn validate_utf8_boundaries(text: &str, boundaries: &[PageBoundary]) -> Result<()> {
123
+ for (idx, boundary) in boundaries.iter().enumerate() {
124
+ if boundary.byte_start > 0 && boundary.byte_start <= text.len() {
125
+ if !text.is_char_boundary(boundary.byte_start) {
126
+ return Err(KreuzbergError::validation(format!(
127
+ "Page boundary {} has byte_start={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
128
+ idx,
129
+ boundary.byte_start,
130
+ text.len()
131
+ )));
132
+ }
133
+ } else if boundary.byte_start > text.len() {
134
+ return Err(KreuzbergError::validation(format!(
135
+ "Page boundary {} has byte_start={} which exceeds text length {}",
136
+ idx,
137
+ boundary.byte_start,
138
+ text.len()
139
+ )));
140
+ }
141
+
142
+ if boundary.byte_end > 0 && boundary.byte_end <= text.len() {
143
+ if !text.is_char_boundary(boundary.byte_end) {
144
+ return Err(KreuzbergError::validation(format!(
145
+ "Page boundary {} has byte_end={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
146
+ idx,
147
+ boundary.byte_end,
148
+ text.len()
149
+ )));
150
+ }
151
+ } else if boundary.byte_end > text.len() {
152
+ return Err(KreuzbergError::validation(format!(
153
+ "Page boundary {} has byte_end={} which exceeds text length {}",
154
+ idx,
155
+ boundary.byte_end,
156
+ text.len()
157
+ )));
158
+ }
159
+ }
160
+
161
+ Ok(())
162
+ }
163
+
164
+ /// Calculate which pages a character range spans.
165
+ ///
166
+ /// # Arguments
167
+ ///
168
+ /// * `char_start` - Starting character offset of the chunk
169
+ /// * `char_end` - Ending character offset of the chunk
170
+ /// * `boundaries` - Page boundary markers from the document
171
+ ///
172
+ /// # Returns
173
+ ///
174
+ /// A tuple of (first_page, last_page) where page numbers are 1-indexed.
175
+ /// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
176
+ /// Validates page boundaries for consistency and correctness.
177
+ ///
178
+ /// # Validation Rules
179
+ ///
180
+ /// 1. Boundaries must be sorted by char_start (monotonically increasing)
181
+ /// 2. Boundaries must not overlap (char_end[i] <= char_start[i+1])
182
+ /// 3. Each boundary must have char_start < char_end
183
+ ///
184
+ /// # Errors
185
+ ///
186
+ /// Returns `KreuzbergError::Validation` if any boundary is invalid.
187
+ fn validate_page_boundaries(boundaries: &[PageBoundary]) -> Result<()> {
188
+ if boundaries.is_empty() {
189
+ return Ok(());
190
+ }
191
+
192
+ for (idx, boundary) in boundaries.iter().enumerate() {
193
+ if boundary.byte_start >= boundary.byte_end {
194
+ return Err(KreuzbergError::validation(format!(
195
+ "Invalid boundary range at index {}: byte_start ({}) must be < byte_end ({})",
196
+ idx, boundary.byte_start, boundary.byte_end
197
+ )));
198
+ }
199
+ }
200
+
201
+ for i in 0..boundaries.len() - 1 {
202
+ let current = &boundaries[i];
203
+ let next = &boundaries[i + 1];
204
+
205
+ if current.byte_start > next.byte_start {
206
+ return Err(KreuzbergError::validation(format!(
207
+ "Page boundaries not sorted: boundary at index {} (byte_start={}) comes after boundary at index {} (byte_start={})",
208
+ i,
209
+ current.byte_start,
210
+ i + 1,
211
+ next.byte_start
212
+ )));
213
+ }
214
+
215
+ if current.byte_end > next.byte_start {
216
+ return Err(KreuzbergError::validation(format!(
217
+ "Overlapping page boundaries: boundary {} ends at {} but boundary {} starts at {}",
218
+ i,
219
+ current.byte_end,
220
+ i + 1,
221
+ next.byte_start
222
+ )));
223
+ }
224
+ }
225
+
226
+ Ok(())
227
+ }
228
+
229
+ /// Calculate which pages a byte range spans.
230
+ ///
231
+ /// # Arguments
232
+ ///
233
+ /// * `byte_start` - Starting byte offset of the chunk
234
+ /// * `byte_end` - Ending byte offset of the chunk
235
+ /// * `boundaries` - Page boundary markers from the document
236
+ ///
237
+ /// # Returns
238
+ ///
239
+ /// A tuple of (first_page, last_page) where page numbers are 1-indexed.
240
+ /// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
241
+ ///
242
+ /// # Errors
243
+ ///
244
+ /// Returns `KreuzbergError::Validation` if boundaries are invalid.
245
+ fn calculate_page_range(
246
+ byte_start: usize,
247
+ byte_end: usize,
248
+ boundaries: &[PageBoundary],
249
+ ) -> Result<(Option<usize>, Option<usize>)> {
250
+ if boundaries.is_empty() {
251
+ return Ok((None, None));
252
+ }
253
+
254
+ validate_page_boundaries(boundaries)?;
255
+
256
+ let mut first_page = None;
257
+ let mut last_page = None;
258
+
259
+ for boundary in boundaries {
260
+ if byte_start < boundary.byte_end && byte_end > boundary.byte_start {
261
+ if first_page.is_none() {
262
+ first_page = Some(boundary.page_number);
263
+ }
264
+ last_page = Some(boundary.page_number);
265
+ }
266
+ }
267
+
268
+ Ok((first_page, last_page))
269
+ }
270
+
271
+ /// Split text into chunks with optional page boundary tracking.
272
+ ///
273
+ /// # Arguments
274
+ ///
275
+ /// * `text` - The text to split into chunks
276
+ /// * `config` - Chunking configuration (max size, overlap, type)
277
+ /// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
278
+ ///
279
+ /// # Returns
280
+ ///
281
+ /// A ChunkingResult containing all chunks and their metadata.
282
+ ///
283
+ /// # Examples
284
+ ///
285
+ /// ```rust
286
+ /// use kreuzberg::chunking::{chunk_text, ChunkingConfig, ChunkerType};
287
+ ///
288
+ /// # fn example() -> kreuzberg::Result<()> {
289
+ /// let config = ChunkingConfig {
290
+ /// max_characters: 500,
291
+ /// overlap: 50,
292
+ /// trim: true,
293
+ /// chunker_type: ChunkerType::Text,
294
+ /// };
295
+ /// let result = chunk_text("Long text...", &config, None)?;
296
+ /// assert!(!result.chunks.is_empty());
297
+ /// # Ok(())
298
+ /// # }
299
+ /// ```
300
+ pub fn chunk_text(
301
+ text: &str,
302
+ config: &ChunkingConfig,
303
+ page_boundaries: Option<&[PageBoundary]>,
304
+ ) -> Result<ChunkingResult> {
92
305
  if text.is_empty() {
93
306
  return Ok(ChunkingResult {
94
307
  chunks: vec![],
@@ -96,6 +309,10 @@ pub fn chunk_text(text: &str, config: &ChunkingConfig) -> Result<ChunkingResult>
96
309
  });
97
310
  }
98
311
 
312
+ if let Some(boundaries) = page_boundaries {
313
+ validate_utf8_boundaries(text, boundaries)?;
314
+ }
315
+
99
316
  let chunk_config = build_chunk_config(config.max_characters, config.overlap, config.trim)?;
100
317
 
101
318
  let text_chunks: Vec<&str> = match config.chunker_type {
@@ -110,36 +327,42 @@ pub fn chunk_text(text: &str, config: &ChunkingConfig) -> Result<ChunkingResult>
110
327
  };
111
328
 
112
329
  let total_chunks = text_chunks.len();
113
- let mut char_offset = 0;
114
-
115
- let chunks: Vec<Chunk> = text_chunks
116
- .into_iter()
117
- .enumerate()
118
- .map(|(index, chunk_text)| {
119
- let char_start = char_offset;
120
- let chunk_length = chunk_text.chars().count();
121
- let char_end = char_start + chunk_length;
122
-
123
- let overlap_chars = if index < total_chunks - 1 {
124
- config.overlap.min(chunk_length)
125
- } else {
126
- 0
127
- };
128
- char_offset = char_end - overlap_chars;
129
-
130
- Chunk {
131
- content: chunk_text.to_string(),
132
- embedding: None,
133
- metadata: ChunkMetadata {
134
- char_start,
135
- char_end,
136
- token_count: None,
137
- chunk_index: index,
138
- total_chunks,
139
- },
140
- }
141
- })
142
- .collect();
330
+ let mut byte_offset = 0;
331
+
332
+ let mut chunks: Vec<Chunk> = Vec::new();
333
+
334
+ for (index, chunk_text) in text_chunks.into_iter().enumerate() {
335
+ let byte_start = byte_offset;
336
+ let chunk_length = chunk_text.len();
337
+ let byte_end = byte_start + chunk_length;
338
+
339
+ let overlap_chars = if index < total_chunks - 1 {
340
+ config.overlap.min(chunk_length)
341
+ } else {
342
+ 0
343
+ };
344
+ byte_offset = byte_end - overlap_chars;
345
+
346
+ let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
347
+ calculate_page_range(byte_start, byte_end, boundaries)?
348
+ } else {
349
+ (None, None)
350
+ };
351
+
352
+ chunks.push(Chunk {
353
+ content: chunk_text.to_string(),
354
+ embedding: None,
355
+ metadata: ChunkMetadata {
356
+ byte_start,
357
+ byte_end,
358
+ token_count: None,
359
+ chunk_index: index,
360
+ total_chunks,
361
+ first_page,
362
+ last_page,
363
+ },
364
+ });
365
+ }
143
366
 
144
367
  let chunk_count = chunks.len();
145
368
 
@@ -159,11 +382,11 @@ pub fn chunk_text_with_type(
159
382
  trim,
160
383
  chunker_type,
161
384
  };
162
- chunk_text(text, &config)
385
+ chunk_text(text, &config, None)
163
386
  }
164
387
 
165
388
  pub fn chunk_texts_batch(texts: &[&str], config: &ChunkingConfig) -> Result<Vec<ChunkingResult>> {
166
- texts.iter().map(|text| chunk_text(text, config)).collect()
389
+ texts.iter().map(|text| chunk_text(text, config, None)).collect()
167
390
  }
168
391
 
169
392
  #[cfg(test)]
@@ -173,7 +396,7 @@ mod tests {
173
396
  #[test]
174
397
  fn test_chunk_empty_text() {
175
398
  let config = ChunkingConfig::default();
176
- let result = chunk_text("", &config).unwrap();
399
+ let result = chunk_text("", &config, None).unwrap();
177
400
  assert_eq!(result.chunks.len(), 0);
178
401
  assert_eq!(result.chunk_count, 0);
179
402
  }
@@ -187,7 +410,7 @@ mod tests {
187
410
  chunker_type: ChunkerType::Text,
188
411
  };
189
412
  let text = "This is a short text.";
190
- let result = chunk_text(text, &config).unwrap();
413
+ let result = chunk_text(text, &config, None).unwrap();
191
414
  assert_eq!(result.chunks.len(), 1);
192
415
  assert_eq!(result.chunk_count, 1);
193
416
  assert_eq!(result.chunks[0].content, text);
@@ -202,7 +425,7 @@ mod tests {
202
425
  chunker_type: ChunkerType::Text,
203
426
  };
204
427
  let text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
205
- let result = chunk_text(text, &config).unwrap();
428
+ let result = chunk_text(text, &config, None).unwrap();
206
429
  assert!(result.chunk_count >= 2);
207
430
  assert_eq!(result.chunks.len(), result.chunk_count);
208
431
  assert!(result.chunks.iter().all(|chunk| chunk.content.len() <= 20));
@@ -217,7 +440,7 @@ mod tests {
217
440
  chunker_type: ChunkerType::Text,
218
441
  };
219
442
  let text = "abcdefghijklmnopqrstuvwxyz0123456789";
220
- let result = chunk_text(text, &config).unwrap();
443
+ let result = chunk_text(text, &config, None).unwrap();
221
444
  assert!(result.chunk_count >= 2);
222
445
 
223
446
  if result.chunks.len() >= 2 {
@@ -240,7 +463,7 @@ mod tests {
240
463
  chunker_type: ChunkerType::Markdown,
241
464
  };
242
465
  let markdown = "# Title\n\nParagraph one.\n\n## Section\n\nParagraph two.";
243
- let result = chunk_text(markdown, &config).unwrap();
466
+ let result = chunk_text(markdown, &config, None).unwrap();
244
467
  assert!(result.chunk_count >= 1);
245
468
  assert!(result.chunks.iter().any(|chunk| chunk.content.contains("# Title")));
246
469
  }
@@ -254,7 +477,7 @@ mod tests {
254
477
  chunker_type: ChunkerType::Markdown,
255
478
  };
256
479
  let markdown = "# Code Example\n\n```python\nprint('hello')\n```\n\nSome text after code.";
257
- let result = chunk_text(markdown, &config).unwrap();
480
+ let result = chunk_text(markdown, &config, None).unwrap();
258
481
  assert!(result.chunk_count >= 1);
259
482
  assert!(result.chunks.iter().any(|chunk| chunk.content.contains("```")));
260
483
  }
@@ -268,7 +491,7 @@ mod tests {
268
491
  chunker_type: ChunkerType::Markdown,
269
492
  };
270
493
  let markdown = "Check out [this link](https://example.com) for more info.";
271
- let result = chunk_text(markdown, &config).unwrap();
494
+ let result = chunk_text(markdown, &config, None).unwrap();
272
495
  assert_eq!(result.chunk_count, 1);
273
496
  assert!(result.chunks[0].content.contains("[this link]"));
274
497
  }
@@ -282,7 +505,7 @@ mod tests {
282
505
  chunker_type: ChunkerType::Text,
283
506
  };
284
507
  let text = " Leading and trailing spaces should be trimmed ";
285
- let result = chunk_text(text, &config).unwrap();
508
+ let result = chunk_text(text, &config, None).unwrap();
286
509
  assert!(result.chunk_count >= 1);
287
510
  assert!(result.chunks.iter().all(|chunk| !chunk.content.starts_with(' ')));
288
511
  }
@@ -296,7 +519,7 @@ mod tests {
296
519
  chunker_type: ChunkerType::Text,
297
520
  };
298
521
  let text = " Text with spaces ";
299
- let result = chunk_text(text, &config).unwrap();
522
+ let result = chunk_text(text, &config, None).unwrap();
300
523
  assert_eq!(result.chunk_count, 1);
301
524
  assert!(result.chunks[0].content.starts_with(' ') || result.chunks[0].content.len() < text.len());
302
525
  }
@@ -309,7 +532,7 @@ mod tests {
309
532
  trim: true,
310
533
  chunker_type: ChunkerType::Text,
311
534
  };
312
- let result = chunk_text("Some text", &config);
535
+ let result = chunk_text("Some text", &config, None);
313
536
  assert!(result.is_err());
314
537
  let err = result.unwrap_err();
315
538
  assert!(matches!(err, KreuzbergError::Validation { .. }));
@@ -403,7 +626,7 @@ mod tests {
403
626
  chunker_type: ChunkerType::Text,
404
627
  };
405
628
  let text = "a".repeat(1000);
406
- let result = chunk_text(&text, &config).unwrap();
629
+ let result = chunk_text(&text, &config, None).unwrap();
407
630
  assert!(result.chunk_count >= 10);
408
631
  assert!(result.chunks.iter().all(|chunk| chunk.content.len() <= 100));
409
632
  }
@@ -417,7 +640,7 @@ mod tests {
417
640
  chunker_type: ChunkerType::Text,
418
641
  };
419
642
  let text = "Line one\nLine two\nLine three\nLine four\nLine five";
420
- let result = chunk_text(text, &config).unwrap();
643
+ let result = chunk_text(text, &config, None).unwrap();
421
644
  assert!(result.chunk_count >= 1);
422
645
  }
423
646
 
@@ -430,7 +653,7 @@ mod tests {
430
653
  chunker_type: ChunkerType::Markdown,
431
654
  };
432
655
  let markdown = "# List Example\n\n- Item 1\n- Item 2\n- Item 3\n\nMore text.";
433
- let result = chunk_text(markdown, &config).unwrap();
656
+ let result = chunk_text(markdown, &config, None).unwrap();
434
657
  assert!(result.chunk_count >= 1);
435
658
  assert!(result.chunks.iter().any(|chunk| chunk.content.contains("- Item")));
436
659
  }
@@ -444,7 +667,7 @@ mod tests {
444
667
  chunker_type: ChunkerType::Markdown,
445
668
  };
446
669
  let markdown = "# Table\n\n| Col1 | Col2 |\n|------|------|\n| A | B |\n| C | D |";
447
- let result = chunk_text(markdown, &config).unwrap();
670
+ let result = chunk_text(markdown, &config, None).unwrap();
448
671
  assert!(result.chunk_count >= 1);
449
672
  assert!(result.chunks.iter().any(|chunk| chunk.content.contains("|")));
450
673
  }
@@ -458,7 +681,7 @@ mod tests {
458
681
  chunker_type: ChunkerType::Text,
459
682
  };
460
683
  let text = "Special chars: @#$%^&*()[]{}|\\<>?/~`";
461
- let result = chunk_text(text, &config).unwrap();
684
+ let result = chunk_text(text, &config, None).unwrap();
462
685
  assert_eq!(result.chunk_count, 1);
463
686
  assert!(result.chunks[0].content.contains("@#$%"));
464
687
  }
@@ -472,7 +695,7 @@ mod tests {
472
695
  chunker_type: ChunkerType::Text,
473
696
  };
474
697
  let text = "Unicode: 你好世界 🌍 café résumé";
475
- let result = chunk_text(text, &config).unwrap();
698
+ let result = chunk_text(text, &config, None).unwrap();
476
699
  assert_eq!(result.chunk_count, 1);
477
700
  assert!(result.chunks[0].content.contains("你好"));
478
701
  assert!(result.chunks[0].content.contains("🌍"));
@@ -487,7 +710,7 @@ mod tests {
487
710
  chunker_type: ChunkerType::Text,
488
711
  };
489
712
  let text = "日本語のテキストです。これは長い文章で、複数のチャンクに分割されるべきです。";
490
- let result = chunk_text(text, &config).unwrap();
713
+ let result = chunk_text(text, &config, None).unwrap();
491
714
  assert!(result.chunk_count >= 1);
492
715
  }
493
716
 
@@ -500,7 +723,7 @@ mod tests {
500
723
  chunker_type: ChunkerType::Text,
501
724
  };
502
725
  let text = "English text mixed with 中文文本 and some français";
503
- let result = chunk_text(text, &config).unwrap();
726
+ let result = chunk_text(text, &config, None).unwrap();
504
727
  assert!(result.chunk_count >= 1);
505
728
  }
506
729
 
@@ -513,7 +736,7 @@ mod tests {
513
736
  chunker_type: ChunkerType::Text,
514
737
  };
515
738
  let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
516
- let result = chunk_text(text, &config).unwrap();
739
+ let result = chunk_text(text, &config, None).unwrap();
517
740
 
518
741
  assert!(result.chunks.len() >= 2, "Expected at least 2 chunks");
519
742
 
@@ -522,8 +745,8 @@ mod tests {
522
745
  let metadata = &chunk.metadata;
523
746
 
524
747
  assert_eq!(
525
- metadata.char_end - metadata.char_start,
526
- chunk.content.chars().count(),
748
+ metadata.byte_end - metadata.byte_start,
749
+ chunk.content.len(),
527
750
  "Chunk {} offset range doesn't match content length",
528
751
  i
529
752
  );
@@ -537,15 +760,15 @@ mod tests {
537
760
  let next_chunk = &result.chunks[i + 1];
538
761
 
539
762
  assert!(
540
- next_chunk.metadata.char_start < current_chunk.metadata.char_end,
763
+ next_chunk.metadata.byte_start < current_chunk.metadata.byte_end,
541
764
  "Chunk {} and {} don't overlap: next starts at {} but current ends at {}",
542
765
  i,
543
766
  i + 1,
544
- next_chunk.metadata.char_start,
545
- current_chunk.metadata.char_end
767
+ next_chunk.metadata.byte_start,
768
+ current_chunk.metadata.byte_end
546
769
  );
547
770
 
548
- let overlap_size = current_chunk.metadata.char_end - next_chunk.metadata.char_start;
771
+ let overlap_size = current_chunk.metadata.byte_end - next_chunk.metadata.byte_start;
549
772
  assert!(
550
773
  overlap_size <= config.overlap + 10,
551
774
  "Overlap between chunks {} and {} is too large: {}",
@@ -565,19 +788,19 @@ mod tests {
565
788
  chunker_type: ChunkerType::Text,
566
789
  };
567
790
  let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
568
- let result = chunk_text(text, &config).unwrap();
791
+ let result = chunk_text(text, &config, None).unwrap();
569
792
 
570
793
  for i in 0..result.chunks.len() - 1 {
571
794
  let current_chunk = &result.chunks[i];
572
795
  let next_chunk = &result.chunks[i + 1];
573
796
 
574
797
  assert!(
575
- next_chunk.metadata.char_start >= current_chunk.metadata.char_end,
798
+ next_chunk.metadata.byte_start >= current_chunk.metadata.byte_end,
576
799
  "Chunk {} and {} overlap when they shouldn't: next starts at {} but current ends at {}",
577
800
  i,
578
801
  i + 1,
579
- next_chunk.metadata.char_start,
580
- current_chunk.metadata.char_end
802
+ next_chunk.metadata.byte_start,
803
+ current_chunk.metadata.byte_end
581
804
  );
582
805
  }
583
806
  }
@@ -591,12 +814,12 @@ mod tests {
591
814
  chunker_type: ChunkerType::Text,
592
815
  };
593
816
  let text = "0123456789 ABCDEFGHIJ KLMNOPQRST UVWXYZ";
594
- let result = chunk_text(text, &config).unwrap();
817
+ let result = chunk_text(text, &config, None).unwrap();
595
818
 
596
819
  assert!(result.chunks.len() >= 2, "Expected multiple chunks");
597
820
 
598
821
  assert_eq!(
599
- result.chunks[0].metadata.char_start, 0,
822
+ result.chunks[0].metadata.byte_start, 0,
600
823
  "First chunk should start at position 0"
601
824
  );
602
825
 
@@ -605,12 +828,12 @@ mod tests {
605
828
  let next_chunk = &result.chunks[i + 1];
606
829
 
607
830
  assert!(
608
- next_chunk.metadata.char_start <= current_chunk.metadata.char_end,
831
+ next_chunk.metadata.byte_start <= current_chunk.metadata.byte_end,
609
832
  "Gap detected between chunk {} (ends at {}) and chunk {} (starts at {})",
610
833
  i,
611
- current_chunk.metadata.char_end,
834
+ current_chunk.metadata.byte_end,
612
835
  i + 1,
613
- next_chunk.metadata.char_start
836
+ next_chunk.metadata.byte_start
614
837
  );
615
838
  }
616
839
  }
@@ -625,24 +848,24 @@ mod tests {
625
848
  chunker_type: ChunkerType::Text,
626
849
  };
627
850
  let text = "Word ".repeat(30);
628
- let result = chunk_text(&text, &config).unwrap();
851
+ let result = chunk_text(&text, &config, None).unwrap();
629
852
 
630
853
  for chunk in &result.chunks {
631
854
  assert!(
632
- chunk.metadata.char_end > chunk.metadata.char_start,
855
+ chunk.metadata.byte_end > chunk.metadata.byte_start,
633
856
  "Invalid offset range for overlap {}: start={}, end={}",
634
857
  overlap,
635
- chunk.metadata.char_start,
636
- chunk.metadata.char_end
858
+ chunk.metadata.byte_start,
859
+ chunk.metadata.byte_end
637
860
  );
638
861
  }
639
862
 
640
863
  for chunk in &result.chunks {
641
864
  assert!(
642
- chunk.metadata.char_start < text.chars().count(),
865
+ chunk.metadata.byte_start < text.len(),
643
866
  "char_start with overlap {} is out of bounds: {}",
644
867
  overlap,
645
- chunk.metadata.char_start
868
+ chunk.metadata.byte_start
646
869
  );
647
870
  }
648
871
  }
@@ -657,7 +880,7 @@ mod tests {
657
880
  chunker_type: ChunkerType::Text,
658
881
  };
659
882
  let text = "AAAAA BBBBB CCCCC DDDDD EEEEE";
660
- let result = chunk_text(text, &config).unwrap();
883
+ let result = chunk_text(text, &config, None).unwrap();
661
884
 
662
885
  assert!(result.chunks.len() >= 2, "Need multiple chunks for this test");
663
886
 
@@ -665,13 +888,990 @@ mod tests {
665
888
  let second_to_last = &result.chunks[result.chunks.len() - 2];
666
889
 
667
890
  assert!(
668
- last_chunk.metadata.char_start < second_to_last.metadata.char_end,
891
+ last_chunk.metadata.byte_start < second_to_last.metadata.byte_end,
669
892
  "Last chunk should overlap with previous chunk"
670
893
  );
671
894
 
672
- let expected_end = text.chars().count();
895
+ let expected_end = text.len();
673
896
  let last_chunk_covers_end =
674
- last_chunk.content.trim_end() == text.trim_end() || last_chunk.metadata.char_end >= expected_end - 5;
897
+ last_chunk.content.trim_end() == text.trim_end() || last_chunk.metadata.byte_end >= expected_end - 5;
675
898
  assert!(last_chunk_covers_end, "Last chunk should cover the end of the text");
676
899
  }
900
+
901
+ #[test]
902
+ fn test_chunk_with_page_boundaries() {
903
+ use crate::types::PageBoundary;
904
+
905
+ let config = ChunkingConfig {
906
+ max_characters: 30,
907
+ overlap: 5,
908
+ trim: true,
909
+ chunker_type: ChunkerType::Text,
910
+ };
911
+ let text = "Page one content here. Page two starts here and continues.";
912
+
913
+ let boundaries = vec![
914
+ PageBoundary {
915
+ byte_start: 0,
916
+ byte_end: 21,
917
+ page_number: 1,
918
+ },
919
+ PageBoundary {
920
+ byte_start: 22,
921
+ byte_end: 58,
922
+ page_number: 2,
923
+ },
924
+ ];
925
+
926
+ let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
927
+ assert!(result.chunks.len() >= 2);
928
+
929
+ assert_eq!(result.chunks[0].metadata.first_page, Some(1));
930
+
931
+ let last_chunk = result.chunks.last().unwrap();
932
+ assert_eq!(last_chunk.metadata.last_page, Some(2));
933
+ }
934
+
935
+ #[test]
936
+ fn test_chunk_without_page_boundaries() {
937
+ let config = ChunkingConfig {
938
+ max_characters: 30,
939
+ overlap: 5,
940
+ trim: true,
941
+ chunker_type: ChunkerType::Text,
942
+ };
943
+ let text = "This is some test content that should be split into multiple chunks.";
944
+
945
+ let result = chunk_text(text, &config, None).unwrap();
946
+ assert!(result.chunks.len() >= 2);
947
+
948
+ for chunk in &result.chunks {
949
+ assert_eq!(chunk.metadata.first_page, None);
950
+ assert_eq!(chunk.metadata.last_page, None);
951
+ }
952
+ }
953
+
954
+ #[test]
955
+ fn test_chunk_empty_boundaries() {
956
+ let config = ChunkingConfig {
957
+ max_characters: 30,
958
+ overlap: 5,
959
+ trim: true,
960
+ chunker_type: ChunkerType::Text,
961
+ };
962
+ let text = "Some text content here.";
963
+ let boundaries: Vec<PageBoundary> = vec![];
964
+
965
+ let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
966
+ assert_eq!(result.chunks.len(), 1);
967
+
968
+ assert_eq!(result.chunks[0].metadata.first_page, None);
969
+ assert_eq!(result.chunks[0].metadata.last_page, None);
970
+ }
971
+
972
+ #[test]
973
+ fn test_chunk_spanning_multiple_pages() {
974
+ use crate::types::PageBoundary;
975
+
976
+ let config = ChunkingConfig {
977
+ max_characters: 50,
978
+ overlap: 5,
979
+ trim: false,
980
+ chunker_type: ChunkerType::Text,
981
+ };
982
+ let text = "0123456789 AAAAAAAAAA 1111111111 BBBBBBBBBB 2222222222";
983
+
984
+ let boundaries = vec![
985
+ PageBoundary {
986
+ byte_start: 0,
987
+ byte_end: 20,
988
+ page_number: 1,
989
+ },
990
+ PageBoundary {
991
+ byte_start: 20,
992
+ byte_end: 40,
993
+ page_number: 2,
994
+ },
995
+ PageBoundary {
996
+ byte_start: 40,
997
+ byte_end: 54,
998
+ page_number: 3,
999
+ },
1000
+ ];
1001
+
1002
+ let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1003
+ assert!(result.chunks.len() >= 2);
1004
+
1005
+ for chunk in &result.chunks {
1006
+ assert!(chunk.metadata.first_page.is_some() || chunk.metadata.last_page.is_some());
1007
+ }
1008
+ }
1009
+
1010
+ #[test]
1011
+ fn test_chunk_text_with_invalid_boundary_range() {
1012
+ use crate::types::PageBoundary;
1013
+
1014
+ let config = ChunkingConfig {
1015
+ max_characters: 30,
1016
+ overlap: 5,
1017
+ trim: true,
1018
+ chunker_type: ChunkerType::Text,
1019
+ };
1020
+ let text = "Page one content here. Page two content.";
1021
+
1022
+ let boundaries = vec![PageBoundary {
1023
+ byte_start: 10,
1024
+ byte_end: 5,
1025
+ page_number: 1,
1026
+ }];
1027
+
1028
+ let result = chunk_text(text, &config, Some(&boundaries));
1029
+ assert!(result.is_err());
1030
+ let err = result.unwrap_err();
1031
+ assert!(err.to_string().contains("Invalid boundary range"));
1032
+ assert!(err.to_string().contains("byte_start"));
1033
+ }
1034
+
1035
+ #[test]
1036
+ fn test_chunk_text_with_unsorted_boundaries() {
1037
+ use crate::types::PageBoundary;
1038
+
1039
+ let config = ChunkingConfig {
1040
+ max_characters: 30,
1041
+ overlap: 5,
1042
+ trim: true,
1043
+ chunker_type: ChunkerType::Text,
1044
+ };
1045
+ let text = "Page one content here. Page two content.";
1046
+
1047
+ let boundaries = vec![
1048
+ PageBoundary {
1049
+ byte_start: 22,
1050
+ byte_end: 40,
1051
+ page_number: 2,
1052
+ },
1053
+ PageBoundary {
1054
+ byte_start: 0,
1055
+ byte_end: 21,
1056
+ page_number: 1,
1057
+ },
1058
+ ];
1059
+
1060
+ let result = chunk_text(text, &config, Some(&boundaries));
1061
+ assert!(result.is_err());
1062
+ let err = result.unwrap_err();
1063
+ assert!(err.to_string().contains("not sorted"));
1064
+ assert!(err.to_string().contains("boundaries"));
1065
+ }
1066
+
1067
+ #[test]
1068
+ fn test_chunk_text_with_overlapping_boundaries() {
1069
+ use crate::types::PageBoundary;
1070
+
1071
+ let config = ChunkingConfig {
1072
+ max_characters: 30,
1073
+ overlap: 5,
1074
+ trim: true,
1075
+ chunker_type: ChunkerType::Text,
1076
+ };
1077
+ let text = "Page one content here. Page two content.";
1078
+
1079
+ let boundaries = vec![
1080
+ PageBoundary {
1081
+ byte_start: 0,
1082
+ byte_end: 25,
1083
+ page_number: 1,
1084
+ },
1085
+ PageBoundary {
1086
+ byte_start: 20,
1087
+ byte_end: 40,
1088
+ page_number: 2,
1089
+ },
1090
+ ];
1091
+
1092
+ let result = chunk_text(text, &config, Some(&boundaries));
1093
+ assert!(result.is_err());
1094
+ let err = result.unwrap_err();
1095
+ assert!(err.to_string().contains("Overlapping"));
1096
+ assert!(err.to_string().contains("boundaries"));
1097
+ }
1098
+
1099
+ #[test]
1100
+ fn test_calculate_page_range_with_invalid_boundaries() {
1101
+ use crate::types::PageBoundary;
1102
+
1103
+ let boundaries = vec![PageBoundary {
1104
+ byte_start: 15,
1105
+ byte_end: 10,
1106
+ page_number: 1,
1107
+ }];
1108
+
1109
+ let result = calculate_page_range(0, 20, &boundaries);
1110
+ assert!(result.is_err());
1111
+ let err = result.unwrap_err();
1112
+ assert!(err.to_string().contains("Invalid boundary range"));
1113
+ }
1114
+
1115
+ #[test]
1116
+ fn test_validate_page_boundaries_valid() {
1117
+ use crate::types::PageBoundary;
1118
+
1119
+ let boundaries = vec![
1120
+ PageBoundary {
1121
+ byte_start: 0,
1122
+ byte_end: 20,
1123
+ page_number: 1,
1124
+ },
1125
+ PageBoundary {
1126
+ byte_start: 20,
1127
+ byte_end: 40,
1128
+ page_number: 2,
1129
+ },
1130
+ PageBoundary {
1131
+ byte_start: 40,
1132
+ byte_end: 60,
1133
+ page_number: 3,
1134
+ },
1135
+ ];
1136
+
1137
+ let result = chunk_text(
1138
+ "x".repeat(60).as_str(),
1139
+ &ChunkingConfig {
1140
+ max_characters: 30,
1141
+ overlap: 5,
1142
+ trim: false,
1143
+ chunker_type: ChunkerType::Text,
1144
+ },
1145
+ Some(&boundaries),
1146
+ );
1147
+ assert!(result.is_ok());
1148
+ }
1149
+
1150
+ #[test]
1151
+ fn test_validate_page_boundaries_empty() {
1152
+ let boundaries: Vec<PageBoundary> = vec![];
1153
+ let result = chunk_text(
1154
+ "Some test text",
1155
+ &ChunkingConfig {
1156
+ max_characters: 30,
1157
+ overlap: 5,
1158
+ trim: true,
1159
+ chunker_type: ChunkerType::Text,
1160
+ },
1161
+ Some(&boundaries),
1162
+ );
1163
+ assert!(result.is_ok());
1164
+ }
1165
+
1166
+ #[test]
1167
+ fn test_page_boundaries_with_gaps() {
1168
+ use crate::types::PageBoundary;
1169
+
1170
+ let boundaries = vec![
1171
+ PageBoundary {
1172
+ byte_start: 0,
1173
+ byte_end: 10,
1174
+ page_number: 1,
1175
+ },
1176
+ PageBoundary {
1177
+ byte_start: 15,
1178
+ byte_end: 25,
1179
+ page_number: 2,
1180
+ },
1181
+ ];
1182
+
1183
+ let text = "0123456789XXXXX0123456789";
1184
+ let result = chunk_text(
1185
+ text,
1186
+ &ChunkingConfig {
1187
+ max_characters: 30,
1188
+ overlap: 5,
1189
+ trim: false,
1190
+ chunker_type: ChunkerType::Text,
1191
+ },
1192
+ Some(&boundaries),
1193
+ );
1194
+ assert!(result.is_ok());
1195
+ }
1196
+
1197
+ #[test]
1198
+ fn test_chunk_with_same_start_and_end() {
1199
+ use crate::types::PageBoundary;
1200
+
1201
+ let boundaries = vec![PageBoundary {
1202
+ byte_start: 10,
1203
+ byte_end: 10,
1204
+ page_number: 1,
1205
+ }];
1206
+
1207
+ let result = chunk_text(
1208
+ "test content here",
1209
+ &ChunkingConfig {
1210
+ max_characters: 30,
1211
+ overlap: 5,
1212
+ trim: true,
1213
+ chunker_type: ChunkerType::Text,
1214
+ },
1215
+ Some(&boundaries),
1216
+ );
1217
+ assert!(result.is_err());
1218
+ let err = result.unwrap_err();
1219
+ assert!(err.to_string().contains("Invalid boundary range"));
1220
+ }
1221
+
1222
+ #[test]
1223
+ fn test_multiple_overlapping_errors() {
1224
+ use crate::types::PageBoundary;
1225
+
1226
+ let text = "This is a longer test content string that spans more bytes";
1227
+ let boundaries = vec![
1228
+ PageBoundary {
1229
+ byte_start: 20,
1230
+ byte_end: 40,
1231
+ page_number: 2,
1232
+ },
1233
+ PageBoundary {
1234
+ byte_start: 10,
1235
+ byte_end: 35,
1236
+ page_number: 1,
1237
+ },
1238
+ ];
1239
+
1240
+ let result = chunk_text(
1241
+ text,
1242
+ &ChunkingConfig {
1243
+ max_characters: 30,
1244
+ overlap: 5,
1245
+ trim: true,
1246
+ chunker_type: ChunkerType::Text,
1247
+ },
1248
+ Some(&boundaries),
1249
+ );
1250
+ assert!(result.is_err());
1251
+ assert!(result.unwrap_err().to_string().contains("not sorted"));
1252
+ }
1253
+
1254
+ #[test]
1255
+ fn test_chunk_with_pages_basic() {
1256
+ use crate::types::PageBoundary;
1257
+
1258
+ let config = ChunkingConfig {
1259
+ max_characters: 25,
1260
+ overlap: 5,
1261
+ trim: true,
1262
+ chunker_type: ChunkerType::Text,
1263
+ };
1264
+ let text = "First page content here.Second page content here.Third page.";
1265
+
1266
+ let boundaries = vec![
1267
+ PageBoundary {
1268
+ byte_start: 0,
1269
+ byte_end: 24,
1270
+ page_number: 1,
1271
+ },
1272
+ PageBoundary {
1273
+ byte_start: 24,
1274
+ byte_end: 50,
1275
+ page_number: 2,
1276
+ },
1277
+ PageBoundary {
1278
+ byte_start: 50,
1279
+ byte_end: 60,
1280
+ page_number: 3,
1281
+ },
1282
+ ];
1283
+
1284
+ let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1285
+
1286
+ if !result.chunks.is_empty() {
1287
+ assert!(result.chunks[0].metadata.first_page.is_some());
1288
+ }
1289
+ }
1290
+
1291
+ #[test]
1292
+ fn test_chunk_with_pages_single_page_chunk() {
1293
+ use crate::types::PageBoundary;
1294
+
1295
+ let config = ChunkingConfig {
1296
+ max_characters: 100,
1297
+ overlap: 10,
1298
+ trim: true,
1299
+ chunker_type: ChunkerType::Text,
1300
+ };
1301
+ let text = "All content on single page fits in one chunk.";
1302
+
1303
+ let boundaries = vec![PageBoundary {
1304
+ byte_start: 0,
1305
+ byte_end: 45,
1306
+ page_number: 1,
1307
+ }];
1308
+
1309
+ let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1310
+ assert_eq!(result.chunks.len(), 1);
1311
+ assert_eq!(result.chunks[0].metadata.first_page, Some(1));
1312
+ assert_eq!(result.chunks[0].metadata.last_page, Some(1));
1313
+ }
1314
+
1315
+ #[test]
1316
+ fn test_chunk_with_pages_no_overlap() {
1317
+ use crate::types::PageBoundary;
1318
+
1319
+ let config = ChunkingConfig {
1320
+ max_characters: 20,
1321
+ overlap: 0,
1322
+ trim: false,
1323
+ chunker_type: ChunkerType::Text,
1324
+ };
1325
+ let text = "AAAAA BBBBB CCCCC DDDDD";
1326
+
1327
+ let boundaries = vec![
1328
+ PageBoundary {
1329
+ byte_start: 0,
1330
+ byte_end: 11,
1331
+ page_number: 1,
1332
+ },
1333
+ PageBoundary {
1334
+ byte_start: 11,
1335
+ byte_end: 23,
1336
+ page_number: 2,
1337
+ },
1338
+ ];
1339
+
1340
+ let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1341
+ assert!(!result.chunks.is_empty());
1342
+
1343
+ for chunk in &result.chunks {
1344
+ if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
1345
+ assert!(first <= last);
1346
+ }
1347
+ }
1348
+ }
1349
+
1350
+ #[test]
1351
+ fn test_calculate_page_range_within_page() {
1352
+ let boundaries = vec![
1353
+ PageBoundary {
1354
+ byte_start: 0,
1355
+ byte_end: 100,
1356
+ page_number: 1,
1357
+ },
1358
+ PageBoundary {
1359
+ byte_start: 100,
1360
+ byte_end: 200,
1361
+ page_number: 2,
1362
+ },
1363
+ ];
1364
+
1365
+ let (first, last) = calculate_page_range(10, 50, &boundaries).unwrap();
1366
+ assert_eq!(first, Some(1));
1367
+ assert_eq!(last, Some(1));
1368
+ }
1369
+
1370
+ #[test]
1371
+ fn test_calculate_page_range_spanning_pages() {
1372
+ let boundaries = vec![
1373
+ PageBoundary {
1374
+ byte_start: 0,
1375
+ byte_end: 100,
1376
+ page_number: 1,
1377
+ },
1378
+ PageBoundary {
1379
+ byte_start: 100,
1380
+ byte_end: 200,
1381
+ page_number: 2,
1382
+ },
1383
+ ];
1384
+
1385
+ let (first, last) = calculate_page_range(50, 150, &boundaries).unwrap();
1386
+ assert_eq!(first, Some(1));
1387
+ assert_eq!(last, Some(2));
1388
+ }
1389
+
1390
+ #[test]
1391
+ fn test_calculate_page_range_empty_boundaries() {
1392
+ let boundaries: Vec<PageBoundary> = vec![];
1393
+
1394
+ let (first, last) = calculate_page_range(0, 50, &boundaries).unwrap();
1395
+ assert_eq!(first, None);
1396
+ assert_eq!(last, None);
1397
+ }
1398
+
1399
+ #[test]
1400
+ fn test_calculate_page_range_no_overlap() {
1401
+ let boundaries = vec![
1402
+ PageBoundary {
1403
+ byte_start: 0,
1404
+ byte_end: 100,
1405
+ page_number: 1,
1406
+ },
1407
+ PageBoundary {
1408
+ byte_start: 100,
1409
+ byte_end: 200,
1410
+ page_number: 2,
1411
+ },
1412
+ ];
1413
+
1414
+ let (first, last) = calculate_page_range(200, 250, &boundaries).unwrap();
1415
+ assert_eq!(first, None);
1416
+ assert_eq!(last, None);
1417
+ }
1418
+
1419
+ #[test]
1420
+ fn test_calculate_page_range_three_pages() {
1421
+ let boundaries = vec![
1422
+ PageBoundary {
1423
+ byte_start: 0,
1424
+ byte_end: 100,
1425
+ page_number: 1,
1426
+ },
1427
+ PageBoundary {
1428
+ byte_start: 100,
1429
+ byte_end: 200,
1430
+ page_number: 2,
1431
+ },
1432
+ PageBoundary {
1433
+ byte_start: 200,
1434
+ byte_end: 300,
1435
+ page_number: 3,
1436
+ },
1437
+ ];
1438
+
1439
+ let (first, last) = calculate_page_range(50, 250, &boundaries).unwrap();
1440
+ assert_eq!(first, Some(1));
1441
+ assert_eq!(last, Some(3));
1442
+ }
1443
+
1444
+ #[test]
1445
+ fn test_chunk_metadata_page_range_accuracy() {
1446
+ use crate::types::PageBoundary;
1447
+
1448
+ let config = ChunkingConfig {
1449
+ max_characters: 30,
1450
+ overlap: 5,
1451
+ trim: true,
1452
+ chunker_type: ChunkerType::Text,
1453
+ };
1454
+ let text = "Page One Content Here.Page Two.";
1455
+
1456
+ let boundaries = vec![
1457
+ PageBoundary {
1458
+ byte_start: 0,
1459
+ byte_end: 21,
1460
+ page_number: 1,
1461
+ },
1462
+ PageBoundary {
1463
+ byte_start: 21,
1464
+ byte_end: 31,
1465
+ page_number: 2,
1466
+ },
1467
+ ];
1468
+
1469
+ let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1470
+
1471
+ for chunk in &result.chunks {
1472
+ assert_eq!(chunk.metadata.byte_end - chunk.metadata.byte_start, chunk.content.len());
1473
+ }
1474
+ }
1475
+
1476
+ #[test]
1477
+ fn test_chunk_page_range_boundary_edge_cases() {
1478
+ use crate::types::PageBoundary;
1479
+
1480
+ let config = ChunkingConfig {
1481
+ max_characters: 10,
1482
+ overlap: 2,
1483
+ trim: false,
1484
+ chunker_type: ChunkerType::Text,
1485
+ };
1486
+ let text = "0123456789ABCDEFGHIJ";
1487
+
1488
+ let boundaries = vec![
1489
+ PageBoundary {
1490
+ byte_start: 0,
1491
+ byte_end: 10,
1492
+ page_number: 1,
1493
+ },
1494
+ PageBoundary {
1495
+ byte_start: 10,
1496
+ byte_end: 20,
1497
+ page_number: 2,
1498
+ },
1499
+ ];
1500
+
1501
+ let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1502
+
1503
+ for chunk in &result.chunks {
1504
+ let on_page1 = chunk.metadata.byte_start < 10;
1505
+ let on_page2 = chunk.metadata.byte_end > 10;
1506
+
1507
+ if on_page1 && on_page2 {
1508
+ assert_eq!(chunk.metadata.first_page, Some(1));
1509
+ assert_eq!(chunk.metadata.last_page, Some(2));
1510
+ } else if on_page1 {
1511
+ assert_eq!(chunk.metadata.first_page, Some(1));
1512
+ } else if on_page2 {
1513
+ assert_eq!(chunk.metadata.first_page, Some(2));
1514
+ }
1515
+ }
1516
+ }
1517
+
1518
+ #[test]
1519
+ fn test_validate_utf8_boundaries_valid_ascii() {
1520
+ use crate::types::PageBoundary;
1521
+
1522
+ let text = "This is ASCII text.";
1523
+ let boundaries = vec![
1524
+ PageBoundary {
1525
+ byte_start: 0,
1526
+ byte_end: 10,
1527
+ page_number: 1,
1528
+ },
1529
+ PageBoundary {
1530
+ byte_start: 10,
1531
+ byte_end: 19,
1532
+ page_number: 2,
1533
+ },
1534
+ ];
1535
+
1536
+ let result = chunk_text(text, &ChunkingConfig::default(), Some(&boundaries));
1537
+ assert!(result.is_ok());
1538
+ }
1539
+
1540
+ #[test]
1541
+ fn test_validate_utf8_boundaries_valid_emoji() {
1542
+ use crate::types::PageBoundary;
1543
+
1544
+ let text = "Hello 👋 World 🌍 End";
1545
+ let config = ChunkingConfig::default();
1546
+
1547
+ let boundaries = vec![
1548
+ PageBoundary {
1549
+ byte_start: 0,
1550
+ byte_end: 11,
1551
+ page_number: 1,
1552
+ },
1553
+ PageBoundary {
1554
+ byte_start: 11,
1555
+ byte_end: 25,
1556
+ page_number: 2,
1557
+ },
1558
+ ];
1559
+
1560
+ let result = chunk_text(text, &config, Some(&boundaries));
1561
+ assert!(result.is_ok());
1562
+ }
1563
+
1564
+ #[test]
1565
+ fn test_validate_utf8_boundaries_valid_cjk() {
1566
+ use crate::types::PageBoundary;
1567
+
1568
+ let text = "你好世界 こんにちは 안녕하세요";
1569
+ let config = ChunkingConfig::default();
1570
+
1571
+ let boundaries = vec![
1572
+ PageBoundary {
1573
+ byte_start: 0,
1574
+ byte_end: 13,
1575
+ page_number: 1,
1576
+ },
1577
+ PageBoundary {
1578
+ byte_start: 13,
1579
+ byte_end: 44,
1580
+ page_number: 2,
1581
+ },
1582
+ ];
1583
+
1584
+ let result = chunk_text(text, &config, Some(&boundaries));
1585
+ assert!(result.is_ok());
1586
+ }
1587
+
1588
+ #[test]
1589
+ fn test_validate_utf8_boundaries_invalid_mid_emoji() {
1590
+ use crate::types::PageBoundary;
1591
+
1592
+ let text = "Hello 👋 World";
1593
+ let boundaries = vec![PageBoundary {
1594
+ byte_start: 0,
1595
+ byte_end: 7,
1596
+ page_number: 1,
1597
+ }];
1598
+
1599
+ let config = ChunkingConfig::default();
1600
+ let result = chunk_text(text, &config, Some(&boundaries));
1601
+ assert!(result.is_err());
1602
+ let err = result.unwrap_err();
1603
+ assert!(err.to_string().contains("UTF-8 character boundary"));
1604
+ assert!(err.to_string().contains("byte_end=7"));
1605
+ }
1606
+
1607
+ #[test]
1608
+ fn test_validate_utf8_boundaries_invalid_mid_multibyte_cjk() {
1609
+ use crate::types::PageBoundary;
1610
+
1611
+ let text = "中文文本";
1612
+ let boundaries = vec![PageBoundary {
1613
+ byte_start: 0,
1614
+ byte_end: 1,
1615
+ page_number: 1,
1616
+ }];
1617
+
1618
+ let config = ChunkingConfig::default();
1619
+ let result = chunk_text(text, &config, Some(&boundaries));
1620
+ assert!(result.is_err());
1621
+ let err = result.unwrap_err();
1622
+ assert!(err.to_string().contains("UTF-8 character boundary"));
1623
+ }
1624
+
1625
+ #[test]
1626
+ fn test_validate_utf8_boundaries_byte_start_exceeds_length() {
1627
+ use crate::types::PageBoundary;
1628
+
1629
+ let text = "Short";
1630
+ let boundaries = vec![
1631
+ PageBoundary {
1632
+ byte_start: 0,
1633
+ byte_end: 3,
1634
+ page_number: 1,
1635
+ },
1636
+ PageBoundary {
1637
+ byte_start: 10,
1638
+ byte_end: 15,
1639
+ page_number: 2,
1640
+ },
1641
+ ];
1642
+
1643
+ let config = ChunkingConfig::default();
1644
+ let result = chunk_text(text, &config, Some(&boundaries));
1645
+ assert!(result.is_err());
1646
+ let err = result.unwrap_err();
1647
+ assert!(err.to_string().contains("exceeds text length"));
1648
+ }
1649
+
1650
+ #[test]
1651
+ fn test_validate_utf8_boundaries_byte_end_exceeds_length() {
1652
+ use crate::types::PageBoundary;
1653
+
1654
+ let text = "Short";
1655
+ let boundaries = vec![PageBoundary {
1656
+ byte_start: 0,
1657
+ byte_end: 100,
1658
+ page_number: 1,
1659
+ }];
1660
+
1661
+ let config = ChunkingConfig::default();
1662
+ let result = chunk_text(text, &config, Some(&boundaries));
1663
+ assert!(result.is_err());
1664
+ let err = result.unwrap_err();
1665
+ assert!(err.to_string().contains("exceeds text length"));
1666
+ }
1667
+
1668
+ #[test]
1669
+ fn test_validate_utf8_boundaries_empty_boundaries() {
1670
+ use crate::types::PageBoundary;
1671
+
1672
+ let text = "Some text";
1673
+ let boundaries: Vec<PageBoundary> = vec![];
1674
+
1675
+ let config = ChunkingConfig::default();
1676
+ let result = chunk_text(text, &config, Some(&boundaries));
1677
+ assert!(result.is_ok());
1678
+ }
1679
+
1680
+ #[test]
1681
+ fn test_validate_utf8_boundaries_at_text_boundaries() {
1682
+ use crate::types::PageBoundary;
1683
+
1684
+ let text = "Exact boundary test";
1685
+ let text_len = text.len();
1686
+ let boundaries = vec![PageBoundary {
1687
+ byte_start: 0,
1688
+ byte_end: text_len,
1689
+ page_number: 1,
1690
+ }];
1691
+
1692
+ let config = ChunkingConfig::default();
1693
+ let result = chunk_text(text, &config, Some(&boundaries));
1694
+ assert!(result.is_ok());
1695
+ }
1696
+
1697
+ #[test]
1698
+ fn test_validate_utf8_boundaries_mixed_languages() {
1699
+ use crate::types::PageBoundary;
1700
+
1701
+ let text = "English text mixed with 中文 and français";
1702
+ let config = ChunkingConfig::default();
1703
+
1704
+ let boundaries = vec![
1705
+ PageBoundary {
1706
+ byte_start: 0,
1707
+ byte_end: 24,
1708
+ page_number: 1,
1709
+ },
1710
+ PageBoundary {
1711
+ byte_start: 24,
1712
+ byte_end: text.len(),
1713
+ page_number: 2,
1714
+ },
1715
+ ];
1716
+
1717
+ let result = chunk_text(text, &config, Some(&boundaries));
1718
+ assert!(result.is_ok());
1719
+ }
1720
+
1721
+ #[test]
1722
+ fn test_chunk_text_rejects_invalid_utf8_boundaries() {
1723
+ use crate::types::PageBoundary;
1724
+
1725
+ let text = "🌍🌎🌏 Three emoji planets";
1726
+ let config = ChunkingConfig::default();
1727
+
1728
+ let boundaries = vec![PageBoundary {
1729
+ byte_start: 0,
1730
+ byte_end: 1000,
1731
+ page_number: 1,
1732
+ }];
1733
+
1734
+ let result = chunk_text(text, &config, Some(&boundaries));
1735
+ assert!(result.is_err());
1736
+ }
1737
+
1738
+ #[test]
1739
+ fn test_validate_utf8_boundaries_combining_diacriticals() {
1740
+ use crate::types::PageBoundary;
1741
+
1742
+ let text = "café";
1743
+ let config = ChunkingConfig::default();
1744
+
1745
+ let boundaries = vec![
1746
+ PageBoundary {
1747
+ byte_start: 0,
1748
+ byte_end: 2,
1749
+ page_number: 1,
1750
+ },
1751
+ PageBoundary {
1752
+ byte_start: 2,
1753
+ byte_end: text.len(),
1754
+ page_number: 2,
1755
+ },
1756
+ ];
1757
+
1758
+ let result = chunk_text(text, &config, Some(&boundaries));
1759
+ assert!(result.is_ok());
1760
+ }
1761
+
1762
+ #[test]
1763
+ fn test_validate_utf8_boundaries_error_messages_are_clear() {
1764
+ use crate::types::PageBoundary;
1765
+
1766
+ let text = "Test 👋 text";
1767
+ let config = ChunkingConfig::default();
1768
+
1769
+ let boundaries = vec![PageBoundary {
1770
+ byte_start: 0,
1771
+ byte_end: 6,
1772
+ page_number: 1,
1773
+ }];
1774
+
1775
+ let result = chunk_text(text, &config, Some(&boundaries));
1776
+ assert!(result.is_err());
1777
+ let err = result.unwrap_err();
1778
+ let err_msg = err.to_string();
1779
+ assert!(err_msg.contains("UTF-8"));
1780
+ assert!(err_msg.contains("boundary"));
1781
+ assert!(err_msg.contains("6"));
1782
+ }
1783
+
1784
+ #[test]
1785
+ fn test_validate_utf8_boundaries_multiple_valid_boundaries() {
1786
+ use crate::types::PageBoundary;
1787
+
1788
+ let text = "First👋Second🌍Third";
1789
+ let config = ChunkingConfig::default();
1790
+
1791
+ let boundaries = vec![
1792
+ PageBoundary {
1793
+ byte_start: 0,
1794
+ byte_end: 5,
1795
+ page_number: 1,
1796
+ },
1797
+ PageBoundary {
1798
+ byte_start: 5,
1799
+ byte_end: 9,
1800
+ page_number: 2,
1801
+ },
1802
+ PageBoundary {
1803
+ byte_start: 9,
1804
+ byte_end: 15,
1805
+ page_number: 3,
1806
+ },
1807
+ PageBoundary {
1808
+ byte_start: 15,
1809
+ byte_end: 19,
1810
+ page_number: 4,
1811
+ },
1812
+ PageBoundary {
1813
+ byte_start: 19,
1814
+ byte_end: text.len(),
1815
+ page_number: 5,
1816
+ },
1817
+ ];
1818
+
1819
+ let result = chunk_text(text, &config, Some(&boundaries));
1820
+ assert!(result.is_ok());
1821
+ }
1822
+
1823
+ #[test]
1824
+ fn test_validate_utf8_boundaries_zero_start_and_end() {
1825
+ use crate::types::PageBoundary;
1826
+
1827
+ let text = "Text";
1828
+ let config = ChunkingConfig::default();
1829
+
1830
+ let boundaries = vec![PageBoundary {
1831
+ byte_start: 0,
1832
+ byte_end: 0,
1833
+ page_number: 1,
1834
+ }];
1835
+
1836
+ let result = chunk_text(text, &config, Some(&boundaries));
1837
+ assert!(result.is_err());
1838
+ }
1839
+ }
1840
+
1841
+ /// Lazy-initialized flag that ensures chunking processor is registered exactly once.
1842
+ ///
1843
+ /// This static is accessed on first use to automatically register the
1844
+ /// chunking processor with the plugin registry.
1845
+ static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_chunking_processor);
1846
+
1847
+ /// Ensure the chunking processor is registered.
1848
+ ///
1849
+ /// This function is called automatically when needed.
1850
+ /// It's safe to call multiple times - registration only happens once.
1851
+ pub fn ensure_initialized() -> Result<()> {
1852
+ PROCESSOR_INITIALIZED
1853
+ .as_ref()
1854
+ .map(|_| ())
1855
+ .map_err(|e| crate::KreuzbergError::Plugin {
1856
+ message: format!("Failed to register chunking processor: {}", e),
1857
+ plugin_name: "text-chunking".to_string(),
1858
+ })
1859
+ }
1860
+
1861
+ /// Register the chunking processor with the global registry.
1862
+ ///
1863
+ /// This function should be called once at application startup to register
1864
+ /// the chunking post-processor.
1865
+ ///
1866
+ /// **Note:** This is called automatically on first use.
1867
+ /// Explicit calling is optional.
1868
+ pub fn register_chunking_processor() -> Result<()> {
1869
+ let registry = crate::plugins::registry::get_post_processor_registry();
1870
+ let mut registry = registry
1871
+ .write()
1872
+ .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
1873
+
1874
+ registry.register(Arc::new(ChunkingProcessor), 50)?;
1875
+
1876
+ Ok(())
677
1877
  }