kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -1,14 +1,11 @@
1
1
  //! API request handlers.
2
2
 
3
- use axum::{
4
- Json,
5
- extract::{Multipart, State},
6
- };
3
+ use axum::{Json, extract::State};
7
4
 
8
5
  use crate::{batch_extract_bytes, cache, extract_bytes};
9
6
 
10
7
  use super::{
11
- error::{ApiError, JsonApi},
8
+ error::{ApiError, JsonApi, MultipartApi},
12
9
  types::{
13
10
  ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest, EmbedResponse,
14
11
  ExtractResponse, HealthResponse, InfoResponse,
@@ -84,19 +81,18 @@ pub async fn info_handler() -> Json<InfoResponse> {
84
81
  ///
85
82
  /// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
86
83
  /// is used as the base, and any per-request config overrides those defaults.
87
- // TODO: Add utoipa::path annotation once ExtractionResult implements ToSchema
88
- // #[utoipa::path(
89
- // post,
90
- // path = "/extract",
91
- // tag = "extraction",
92
- // request_body(content_type = "multipart/form-data"),
93
- // responses(
94
- // (status = 200, description = "Extraction successful", body = ExtractResponse),
95
- // (status = 400, description = "Bad request", body = crate::api::types::ErrorResponse),
96
- // (status = 413, description = "Payload too large", body = crate::api::types::ErrorResponse),
97
- // (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
98
- // )
99
- // )]
84
+ #[utoipa::path(
85
+ post,
86
+ path = "/extract",
87
+ tag = "extraction",
88
+ request_body(content_type = "multipart/form-data"),
89
+ responses(
90
+ (status = 200, description = "Extraction successful", body = ExtractResponse),
91
+ (status = 400, description = "Bad request", body = crate::api::types::ErrorResponse),
92
+ (status = 413, description = "Payload too large", body = crate::api::types::ErrorResponse),
93
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
94
+ )
95
+ )]
100
96
  #[cfg_attr(
101
97
  feature = "otel",
102
98
  tracing::instrument(
@@ -107,10 +103,10 @@ pub async fn info_handler() -> Json<InfoResponse> {
107
103
  )]
108
104
  pub async fn extract_handler(
109
105
  State(state): State<ApiState>,
110
- mut multipart: Multipart,
106
+ MultipartApi(mut multipart): MultipartApi,
111
107
  ) -> Result<Json<ExtractResponse>, ApiError> {
112
108
  let mut files = Vec::new();
113
- let mut config = (*state.default_config).clone();
109
+ let mut config: Option<crate::core::config::ExtractionConfig> = None;
114
110
 
115
111
  while let Some(field) = multipart
116
112
  .next_field()
@@ -138,12 +134,12 @@ pub async fn extract_handler(
138
134
  .await
139
135
  .map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
140
136
 
141
- config = serde_json::from_str(&config_str).map_err(|e| {
137
+ config = Some(serde_json::from_str(&config_str).map_err(|e| {
142
138
  ApiError::validation(crate::error::KreuzbergError::validation(format!(
143
139
  "Invalid extraction configuration: {}",
144
140
  e
145
141
  )))
146
- })?;
142
+ })?);
147
143
  }
148
144
  "output_format" => {
149
145
  let format_str = field
@@ -151,7 +147,9 @@ pub async fn extract_handler(
151
147
  .await
152
148
  .map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
153
149
 
154
- config.output_format = match format_str.to_lowercase().as_str() {
150
+ // Ensure config exists before modifying output_format
151
+ let cfg = config.get_or_insert_with(|| (*state.default_config).clone());
152
+ cfg.output_format = match format_str.to_lowercase().as_str() {
155
153
  "plain" => crate::core::config::OutputFormat::Plain,
156
154
  "markdown" => crate::core::config::OutputFormat::Markdown,
157
155
  "djot" => crate::core::config::OutputFormat::Djot,
@@ -177,18 +175,21 @@ pub async fn extract_handler(
177
175
  #[cfg(feature = "otel")]
178
176
  tracing::Span::current().record("files_count", files.len());
179
177
 
178
+ // Use provided config or fall back to default from state
179
+ let final_config = config.as_ref().unwrap_or(&state.default_config);
180
+
180
181
  if files.len() == 1 {
181
182
  let (data, mime_type, _file_name) = files
182
183
  .into_iter()
183
184
  .next()
184
185
  .expect("files.len() == 1 guarantees one element exists");
185
- let result = extract_bytes(&data, mime_type.as_str(), &config).await?;
186
+ let result = extract_bytes(&data, mime_type.as_str(), final_config).await?;
186
187
  return Ok(Json(vec![result]));
187
188
  }
188
189
 
189
190
  let files_data: Vec<(Vec<u8>, String)> = files.into_iter().map(|(data, mime, _name)| (data, mime)).collect();
190
191
 
191
- let results = batch_extract_bytes(files_data, &config).await?;
192
+ let results = batch_extract_bytes(files_data, final_config).await?;
192
193
  Ok(Json(results))
193
194
  }
194
195
 
@@ -492,6 +493,8 @@ pub async fn chunk_handler(JsonApi(request): JsonApi<ChunkRequest>) -> Result<Js
492
493
  overlap,
493
494
  trim: cfg.trim.unwrap_or(true),
494
495
  chunker_type,
496
+ embedding: None,
497
+ preset: None,
495
498
  };
496
499
 
497
500
  // Perform chunking - convert any remaining errors to validation errors since they're likely config issues
@@ -32,7 +32,7 @@ use utoipa::OpenApi;
32
32
  paths(
33
33
  crate::api::handlers::health_handler,
34
34
  crate::api::handlers::info_handler,
35
- // Note: extract_handler omitted - requires ExtractionResult ToSchema impl
35
+ crate::api::handlers::extract_handler,
36
36
  crate::api::handlers::cache_stats_handler,
37
37
  crate::api::handlers::cache_clear_handler,
38
38
  crate::api::handlers::embed_handler,
@@ -53,6 +53,19 @@ use utoipa::OpenApi;
53
53
  crate::api::types::ChunkItem,
54
54
  crate::api::types::ChunkingConfigRequest,
55
55
  crate::api::types::ChunkingConfigResponse,
56
+ crate::types::extraction::ExtractionResult,
57
+ crate::types::extraction::Chunk,
58
+ crate::types::extraction::ChunkMetadata,
59
+ crate::types::extraction::ExtractedImage,
60
+ crate::types::extraction::Element,
61
+ crate::types::extraction::ElementMetadata,
62
+ crate::types::extraction::ElementId,
63
+ crate::types::extraction::ElementType,
64
+ crate::types::extraction::BoundingBox,
65
+ crate::types::metadata::Metadata,
66
+ crate::types::tables::Table,
67
+ crate::types::page::PageContent,
68
+ crate::types::djot::DjotContent,
56
69
  )
57
70
  ),
58
71
  tags(
@@ -2,43 +2,8 @@
2
2
 
3
3
  use serde::{Deserialize, Serialize};
4
4
 
5
- /// Configuration options for text chunking operations.
6
- ///
7
- /// # Fields
8
- ///
9
- /// * `max_characters` - Maximum number of characters per chunk (default: 2000)
10
- /// * `overlap` - Number of characters to overlap between consecutive chunks (default: 100)
11
- /// * `trim` - Whether to trim whitespace from chunk boundaries (default: true)
12
- /// * `chunker_type` - Type of chunker to use (Text or Markdown) (default: Text)
13
- pub struct ChunkingConfig {
14
- pub max_characters: usize,
15
- pub overlap: usize,
16
- pub trim: bool,
17
- pub chunker_type: ChunkerType,
18
- }
19
-
20
- impl Default for ChunkingConfig {
21
- fn default() -> Self {
22
- Self {
23
- max_characters: 2000,
24
- overlap: 100,
25
- trim: true,
26
- chunker_type: ChunkerType::Text,
27
- }
28
- }
29
- }
30
-
31
- /// Type of text chunker to use.
32
- ///
33
- /// # Variants
34
- ///
35
- /// * `Text` - Generic text splitter, splits on whitespace and punctuation
36
- /// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
37
- #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
38
- pub enum ChunkerType {
39
- Text,
40
- Markdown,
41
- }
5
+ // Re-export ChunkingConfig and ChunkerType from core config (canonical location)
6
+ pub use crate::core::config::processing::{ChunkerType, ChunkingConfig};
42
7
 
43
8
  /// Result of a text chunking operation.
44
9
  ///
@@ -118,6 +118,8 @@ pub fn chunk_text_with_type(
118
118
  overlap,
119
119
  trim,
120
120
  chunker_type,
121
+ embedding: None,
122
+ preset: None,
121
123
  };
122
124
  chunk_text(text, &config, None)
123
125
  }
@@ -177,6 +179,8 @@ mod tests {
177
179
  overlap: 10,
178
180
  trim: true,
179
181
  chunker_type: ChunkerType::Text,
182
+ embedding: None,
183
+ preset: None,
180
184
  };
181
185
  let text = "This is a short text.";
182
186
  let result = chunk_text(text, &config, None).unwrap();
@@ -192,6 +196,8 @@ mod tests {
192
196
  overlap: 5,
193
197
  trim: true,
194
198
  chunker_type: ChunkerType::Text,
199
+ embedding: None,
200
+ preset: None,
195
201
  };
196
202
  let text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
197
203
  let result = chunk_text(text, &config, None).unwrap();
@@ -207,6 +213,8 @@ mod tests {
207
213
  overlap: 5,
208
214
  trim: true,
209
215
  chunker_type: ChunkerType::Text,
216
+ embedding: None,
217
+ preset: None,
210
218
  };
211
219
  let text = "abcdefghijklmnopqrstuvwxyz0123456789";
212
220
  let result = chunk_text(text, &config, None).unwrap();
@@ -230,6 +238,8 @@ mod tests {
230
238
  overlap: 10,
231
239
  trim: true,
232
240
  chunker_type: ChunkerType::Markdown,
241
+ embedding: None,
242
+ preset: None,
233
243
  };
234
244
  let markdown = "# Title\n\nParagraph one.\n\n## Section\n\nParagraph two.";
235
245
  let result = chunk_text(markdown, &config, None).unwrap();
@@ -244,6 +254,8 @@ mod tests {
244
254
  overlap: 10,
245
255
  trim: true,
246
256
  chunker_type: ChunkerType::Markdown,
257
+ embedding: None,
258
+ preset: None,
247
259
  };
248
260
  let markdown = "# Code Example\n\n```python\nprint('hello')\n```\n\nSome text after code.";
249
261
  let result = chunk_text(markdown, &config, None).unwrap();
@@ -258,6 +270,8 @@ mod tests {
258
270
  overlap: 10,
259
271
  trim: true,
260
272
  chunker_type: ChunkerType::Markdown,
273
+ embedding: None,
274
+ preset: None,
261
275
  };
262
276
  let markdown = "Check out [this link](https://example.com) for more info.";
263
277
  let result = chunk_text(markdown, &config, None).unwrap();
@@ -272,6 +286,8 @@ mod tests {
272
286
  overlap: 5,
273
287
  trim: true,
274
288
  chunker_type: ChunkerType::Text,
289
+ embedding: None,
290
+ preset: None,
275
291
  };
276
292
  let text = " Leading and trailing spaces should be trimmed ";
277
293
  let result = chunk_text(text, &config, None).unwrap();
@@ -286,6 +302,8 @@ mod tests {
286
302
  overlap: 5,
287
303
  trim: false,
288
304
  chunker_type: ChunkerType::Text,
305
+ embedding: None,
306
+ preset: None,
289
307
  };
290
308
  let text = " Text with spaces ";
291
309
  let result = chunk_text(text, &config, None).unwrap();
@@ -300,6 +318,8 @@ mod tests {
300
318
  overlap: 20,
301
319
  trim: true,
302
320
  chunker_type: ChunkerType::Text,
321
+ embedding: None,
322
+ preset: None,
303
323
  };
304
324
  let result = chunk_text("Some text", &config, None);
305
325
  assert!(result.is_err());
@@ -337,6 +357,8 @@ mod tests {
337
357
  overlap: 5,
338
358
  trim: true,
339
359
  chunker_type: ChunkerType::Text,
360
+ embedding: None,
361
+ preset: None,
340
362
  };
341
363
  let texts = vec!["First text", "Second text", "Third text"];
342
364
  let results = chunk_texts_batch(&texts, &config).unwrap();
@@ -351,6 +373,8 @@ mod tests {
351
373
  overlap: 5,
352
374
  trim: true,
353
375
  chunker_type: ChunkerType::Text,
376
+ embedding: None,
377
+ preset: None,
354
378
  };
355
379
  let texts = vec![
356
380
  "Short",
@@ -371,6 +395,8 @@ mod tests {
371
395
  overlap: 20,
372
396
  trim: true,
373
397
  chunker_type: ChunkerType::Text,
398
+ embedding: None,
399
+ preset: None,
374
400
  };
375
401
  let texts = vec!["Text one", "Text two"];
376
402
  let result = chunk_texts_batch(&texts, &config);
@@ -380,8 +406,8 @@ mod tests {
380
406
  #[test]
381
407
  fn test_chunking_config_default() {
382
408
  let config = ChunkingConfig::default();
383
- assert_eq!(config.max_characters, 2000);
384
- assert_eq!(config.overlap, 100);
409
+ assert_eq!(config.max_characters, 1000);
410
+ assert_eq!(config.overlap, 200);
385
411
  assert!(config.trim);
386
412
  assert_eq!(config.chunker_type, ChunkerType::Text);
387
413
  }
@@ -393,6 +419,8 @@ mod tests {
393
419
  overlap: 20,
394
420
  trim: true,
395
421
  chunker_type: ChunkerType::Text,
422
+ embedding: None,
423
+ preset: None,
396
424
  };
397
425
  let text = "a".repeat(1000);
398
426
  let result = chunk_text(&text, &config, None).unwrap();
@@ -407,6 +435,8 @@ mod tests {
407
435
  overlap: 5,
408
436
  trim: true,
409
437
  chunker_type: ChunkerType::Text,
438
+ embedding: None,
439
+ preset: None,
410
440
  };
411
441
  let text = "Line one\nLine two\nLine three\nLine four\nLine five";
412
442
  let result = chunk_text(text, &config, None).unwrap();
@@ -420,6 +450,8 @@ mod tests {
420
450
  overlap: 10,
421
451
  trim: true,
422
452
  chunker_type: ChunkerType::Markdown,
453
+ embedding: None,
454
+ preset: None,
423
455
  };
424
456
  let markdown = "# List Example\n\n- Item 1\n- Item 2\n- Item 3\n\nMore text.";
425
457
  let result = chunk_text(markdown, &config, None).unwrap();
@@ -434,6 +466,8 @@ mod tests {
434
466
  overlap: 10,
435
467
  trim: true,
436
468
  chunker_type: ChunkerType::Markdown,
469
+ embedding: None,
470
+ preset: None,
437
471
  };
438
472
  let markdown = "# Table\n\n| Col1 | Col2 |\n|------|------|\n| A | B |\n| C | D |";
439
473
  let result = chunk_text(markdown, &config, None).unwrap();
@@ -448,6 +482,8 @@ mod tests {
448
482
  overlap: 5,
449
483
  trim: true,
450
484
  chunker_type: ChunkerType::Text,
485
+ embedding: None,
486
+ preset: None,
451
487
  };
452
488
  let text = "Special chars: @#$%^&*()[]{}|\\<>?/~`";
453
489
  let result = chunk_text(text, &config, None).unwrap();
@@ -462,6 +498,8 @@ mod tests {
462
498
  overlap: 5,
463
499
  trim: true,
464
500
  chunker_type: ChunkerType::Text,
501
+ embedding: None,
502
+ preset: None,
465
503
  };
466
504
  let text = "Unicode: 你好世界 🌍 café résumé";
467
505
  let result = chunk_text(text, &config, None).unwrap();
@@ -477,6 +515,8 @@ mod tests {
477
515
  overlap: 5,
478
516
  trim: true,
479
517
  chunker_type: ChunkerType::Text,
518
+ embedding: None,
519
+ preset: None,
480
520
  };
481
521
  let text = "日本語のテキストです。これは長い文章で、複数のチャンクに分割されるべきです。";
482
522
  let result = chunk_text(text, &config, None).unwrap();
@@ -490,6 +530,8 @@ mod tests {
490
530
  overlap: 5,
491
531
  trim: true,
492
532
  chunker_type: ChunkerType::Text,
533
+ embedding: None,
534
+ preset: None,
493
535
  };
494
536
  let text = "English text mixed with 中文文本 and some français";
495
537
  let result = chunk_text(text, &config, None).unwrap();
@@ -503,6 +545,8 @@ mod tests {
503
545
  overlap: 5,
504
546
  trim: false,
505
547
  chunker_type: ChunkerType::Text,
548
+ embedding: None,
549
+ preset: None,
506
550
  };
507
551
  let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
508
552
  let result = chunk_text(text, &config, None).unwrap();
@@ -555,6 +599,8 @@ mod tests {
555
599
  overlap: 0,
556
600
  trim: false,
557
601
  chunker_type: ChunkerType::Text,
602
+ embedding: None,
603
+ preset: None,
558
604
  };
559
605
  let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
560
606
  let result = chunk_text(text, &config, None).unwrap();
@@ -581,6 +627,8 @@ mod tests {
581
627
  overlap: 3,
582
628
  trim: false,
583
629
  chunker_type: ChunkerType::Text,
630
+ embedding: None,
631
+ preset: None,
584
632
  };
585
633
  let text = "0123456789 ABCDEFGHIJ KLMNOPQRST UVWXYZ";
586
634
  let result = chunk_text(text, &config, None).unwrap();
@@ -615,6 +663,8 @@ mod tests {
615
663
  overlap,
616
664
  trim: false,
617
665
  chunker_type: ChunkerType::Text,
666
+ embedding: None,
667
+ preset: None,
618
668
  };
619
669
  let text = "Word ".repeat(30);
620
670
  let result = chunk_text(&text, &config, None).unwrap();
@@ -647,6 +697,8 @@ mod tests {
647
697
  overlap: 5,
648
698
  trim: false,
649
699
  chunker_type: ChunkerType::Text,
700
+ embedding: None,
701
+ preset: None,
650
702
  };
651
703
  let text = "AAAAA BBBBB CCCCC DDDDD EEEEE";
652
704
  let result = chunk_text(text, &config, None).unwrap();
@@ -674,6 +726,8 @@ mod tests {
674
726
  overlap: 5,
675
727
  trim: true,
676
728
  chunker_type: ChunkerType::Text,
729
+ embedding: None,
730
+ preset: None,
677
731
  };
678
732
  let text = "Page one content here. Page two starts here and continues.";
679
733
 
@@ -706,6 +760,8 @@ mod tests {
706
760
  overlap: 5,
707
761
  trim: true,
708
762
  chunker_type: ChunkerType::Text,
763
+ embedding: None,
764
+ preset: None,
709
765
  };
710
766
  let text = "This is some test content that should be split into multiple chunks.";
711
767
 
@@ -725,6 +781,8 @@ mod tests {
725
781
  overlap: 5,
726
782
  trim: true,
727
783
  chunker_type: ChunkerType::Text,
784
+ embedding: None,
785
+ preset: None,
728
786
  };
729
787
  let text = "Some text content here.";
730
788
  let boundaries: Vec<PageBoundary> = vec![];
@@ -743,6 +801,8 @@ mod tests {
743
801
  overlap: 5,
744
802
  trim: false,
745
803
  chunker_type: ChunkerType::Text,
804
+ embedding: None,
805
+ preset: None,
746
806
  };
747
807
  let text = "0123456789 AAAAAAAAAA 1111111111 BBBBBBBBBB 2222222222";
748
808
 
@@ -779,6 +839,8 @@ mod tests {
779
839
  overlap: 5,
780
840
  trim: true,
781
841
  chunker_type: ChunkerType::Text,
842
+ embedding: None,
843
+ preset: None,
782
844
  };
783
845
  let text = "Page one content here. Page two content.";
784
846
 
@@ -802,6 +864,8 @@ mod tests {
802
864
  overlap: 5,
803
865
  trim: true,
804
866
  chunker_type: ChunkerType::Text,
867
+ embedding: None,
868
+ preset: None,
805
869
  };
806
870
  let text = "Page one content here. Page two content.";
807
871
 
@@ -832,6 +896,8 @@ mod tests {
832
896
  overlap: 5,
833
897
  trim: true,
834
898
  chunker_type: ChunkerType::Text,
899
+ embedding: None,
900
+ preset: None,
835
901
  };
836
902
  let text = "Page one content here. Page two content.";
837
903
 
@@ -862,6 +928,8 @@ mod tests {
862
928
  overlap: 5,
863
929
  trim: true,
864
930
  chunker_type: ChunkerType::Text,
931
+ embedding: None,
932
+ preset: None,
865
933
  };
866
934
  let text = "First page content here.Second page content here.Third page.";
867
935
 
@@ -897,6 +965,8 @@ mod tests {
897
965
  overlap: 10,
898
966
  trim: true,
899
967
  chunker_type: ChunkerType::Text,
968
+ embedding: None,
969
+ preset: None,
900
970
  };
901
971
  let text = "All content on single page fits in one chunk.";
902
972
 
@@ -919,6 +989,8 @@ mod tests {
919
989
  overlap: 0,
920
990
  trim: false,
921
991
  chunker_type: ChunkerType::Text,
992
+ embedding: None,
993
+ preset: None,
922
994
  };
923
995
  let text = "AAAAA BBBBB CCCCC DDDDD";
924
996
 
@@ -952,6 +1024,8 @@ mod tests {
952
1024
  overlap: 5,
953
1025
  trim: true,
954
1026
  chunker_type: ChunkerType::Text,
1027
+ embedding: None,
1028
+ preset: None,
955
1029
  };
956
1030
  let text = "Page One Content Here.Page Two.";
957
1031
 
@@ -982,6 +1056,8 @@ mod tests {
982
1056
  overlap: 2,
983
1057
  trim: false,
984
1058
  chunker_type: ChunkerType::Text,
1059
+ embedding: None,
1060
+ preset: None,
985
1061
  };
986
1062
  let text = "0123456789ABCDEFGHIJ";
987
1063
 
@@ -60,7 +60,7 @@ pub mod validation;
60
60
 
61
61
  // Re-export submodule types and functions
62
62
  pub use boundaries::{calculate_page_range, validate_page_boundaries};
63
- pub use config::{ChunkerType, ChunkingConfig, ChunkingResult};
63
+ pub use config::{ChunkerType, ChunkingConfig, ChunkingResult}; // ChunkingConfig re-exported from core::config::processing
64
64
  pub use core::{chunk_text, chunk_text_with_type, chunk_texts_batch};
65
65
  pub use processor::ChunkingProcessor;
66
66
  pub use validation::{ADAPTIVE_VALIDATION_THRESHOLD, precompute_utf8_boundaries, validate_utf8_boundaries};
@@ -54,14 +54,7 @@ impl PostProcessor for ChunkingProcessor {
54
54
  None => return Ok(()),
55
55
  };
56
56
 
57
- let chunk_config = crate::chunking::ChunkingConfig {
58
- max_characters: chunking_config.max_chars,
59
- overlap: chunking_config.max_overlap,
60
- trim: true,
61
- chunker_type: crate::chunking::ChunkerType::Text,
62
- };
63
-
64
- let chunking_result = crate::chunking::chunk_text(&result.content, &chunk_config, None)
57
+ let chunking_result = crate::chunking::chunk_text(&result.content, chunking_config, None)
65
58
  .map_err(|e| KreuzbergError::Other(format!("Chunking failed: {}", e)))?;
66
59
  result.chunks = Some(chunking_result.chunks);
67
60
 
@@ -87,14 +80,17 @@ mod tests {
87
80
  use super::*;
88
81
  use crate::core::config::ChunkingConfig;
89
82
  use crate::types::Metadata;
83
+ use std::borrow::Cow;
90
84
 
91
85
  #[tokio::test]
92
86
  async fn test_chunking_processor() {
93
87
  let processor = ChunkingProcessor;
94
88
  let config = ExtractionConfig {
95
89
  chunking: Some(ChunkingConfig {
96
- max_chars: 100,
97
- max_overlap: 10,
90
+ max_characters: 100,
91
+ overlap: 10,
92
+ trim: true,
93
+ chunker_type: crate::chunking::ChunkerType::Text,
98
94
  embedding: None,
99
95
  preset: None,
100
96
  }),
@@ -103,7 +99,7 @@ mod tests {
103
99
 
104
100
  let mut result = ExtractionResult {
105
101
  content: "This is a longer text that should be split into multiple chunks to test the chunking processor functionality.".to_string(),
106
- mime_type: "text/plain".to_string(),
102
+ mime_type: Cow::Borrowed("text/plain"),
107
103
  metadata: Metadata::default(),
108
104
  tables: vec![],
109
105
  detected_languages: None,
@@ -128,7 +124,7 @@ mod tests {
128
124
 
129
125
  let mut result = ExtractionResult {
130
126
  content: "Some text".to_string(),
131
- mime_type: "text/plain".to_string(),
127
+ mime_type: Cow::Borrowed("text/plain"),
132
128
  metadata: Metadata::default(),
133
129
  tables: vec![],
134
130
  detected_languages: None,
@@ -165,7 +161,7 @@ mod tests {
165
161
 
166
162
  let result = ExtractionResult {
167
163
  content: "Sample text".to_string(),
168
- mime_type: "text/plain".to_string(),
164
+ mime_type: Cow::Borrowed("text/plain"),
169
165
  metadata: Metadata::default(),
170
166
  tables: vec![],
171
167
  detected_languages: None,
@@ -178,8 +174,10 @@ mod tests {
178
174
 
179
175
  let config_with_chunking = ExtractionConfig {
180
176
  chunking: Some(crate::core::config::ChunkingConfig {
181
- max_chars: 100,
182
- max_overlap: 10,
177
+ max_characters: 100,
178
+ overlap: 10,
179
+ trim: true,
180
+ chunker_type: crate::chunking::ChunkerType::Text,
183
181
  embedding: None,
184
182
  preset: None,
185
183
  }),
@@ -197,7 +195,7 @@ mod tests {
197
195
 
198
196
  let short_result = ExtractionResult {
199
197
  content: "Short".to_string(),
200
- mime_type: "text/plain".to_string(),
198
+ mime_type: Cow::Borrowed("text/plain"),
201
199
  metadata: Metadata::default(),
202
200
  tables: vec![],
203
201
  detected_languages: None,
@@ -210,7 +208,7 @@ mod tests {
210
208
 
211
209
  let long_result = ExtractionResult {
212
210
  content: "a".repeat(100000),
213
- mime_type: "text/plain".to_string(),
211
+ mime_type: Cow::Borrowed("text/plain"),
214
212
  metadata: Metadata::default(),
215
213
  tables: vec![],
216
214
  detected_languages: None,
@@ -94,8 +94,10 @@ impl ExtractionConfig {
94
94
 
95
95
  if self.chunking.is_none() {
96
96
  self.chunking = Some(ChunkingConfig {
97
- max_chars: 1000,
98
- max_overlap: 200,
97
+ max_characters: 1000,
98
+ overlap: 200,
99
+ trim: true,
100
+ chunker_type: super::super::processing::ChunkerType::Text,
99
101
  embedding: None,
100
102
  preset: None,
101
103
  });
@@ -103,8 +105,8 @@ impl ExtractionConfig {
103
105
 
104
106
  if let Some(ref mut chunking) = self.chunking {
105
107
  // Validate against current overlap before updating
106
- validate_chunking_params(max_chars, chunking.max_overlap)?;
107
- chunking.max_chars = max_chars;
108
+ validate_chunking_params(max_chars, chunking.overlap)?;
109
+ chunking.max_characters = max_chars;
108
110
  }
109
111
  }
110
112
 
@@ -120,17 +122,19 @@ impl ExtractionConfig {
120
122
 
121
123
  if self.chunking.is_none() {
122
124
  self.chunking = Some(ChunkingConfig {
123
- max_chars: 1000,
124
- max_overlap: 200,
125
+ max_characters: 1000,
126
+ overlap: 200,
127
+ trim: true,
128
+ chunker_type: super::super::processing::ChunkerType::Text,
125
129
  embedding: None,
126
130
  preset: None,
127
131
  });
128
132
  }
129
133
 
130
134
  if let Some(ref mut chunking) = self.chunking {
131
- // Validate against current max_chars before updating
132
- validate_chunking_params(chunking.max_chars, max_overlap)?;
133
- chunking.max_overlap = max_overlap;
135
+ // Validate against current max_characters before updating
136
+ validate_chunking_params(chunking.max_characters, max_overlap)?;
137
+ chunking.overlap = max_overlap;
134
138
  }
135
139
  }
136
140