kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2

data/vendor/kreuzberg/src/api/handlers.rs

@@ -1,14 +1,11 @@
 //! API request handlers.

-use axum::{
-    Json,
-    extract::{Multipart, State},
-};
+use axum::{Json, extract::State};

 use crate::{batch_extract_bytes, cache, extract_bytes};

 use super::{
-    error::{ApiError, JsonApi},
+    error::{ApiError, JsonApi, MultipartApi},
     types::{
         ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest, EmbedResponse,
         ExtractResponse, HealthResponse, InfoResponse,
@@ -84,19 +81,18 @@ pub async fn info_handler() -> Json<InfoResponse> {
 ///
 /// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
 /// is used as the base, and any per-request config overrides those defaults.
-
-
-
-
-
-
-
-
-
-
-
-
-// )]
+#[utoipa::path(
+    post,
+    path = "/extract",
+    tag = "extraction",
+    request_body(content_type = "multipart/form-data"),
+    responses(
+        (status = 200, description = "Extraction successful", body = ExtractResponse),
+        (status = 400, description = "Bad request", body = crate::api::types::ErrorResponse),
+        (status = 413, description = "Payload too large", body = crate::api::types::ErrorResponse),
+        (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
+    )
+)]
 #[cfg_attr(
     feature = "otel",
     tracing::instrument(
@@ -107,10 +103,10 @@ pub async fn info_handler() -> Json<InfoResponse> {
 )]
 pub async fn extract_handler(
     State(state): State<ApiState>,
-    mut multipart:
+    MultipartApi(mut multipart): MultipartApi,
 ) -> Result<Json<ExtractResponse>, ApiError> {
     let mut files = Vec::new();
-    let mut config =
+    let mut config: Option<crate::core::config::ExtractionConfig> = None;

     while let Some(field) = multipart
         .next_field()
@@ -138,12 +134,12 @@ pub async fn extract_handler(
                     .await
                     .map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;

-                config = serde_json::from_str(&config_str).map_err(|e| {
+                config = Some(serde_json::from_str(&config_str).map_err(|e| {
                     ApiError::validation(crate::error::KreuzbergError::validation(format!(
                         "Invalid extraction configuration: {}",
                         e
                     )))
-                })
+                })?);
             }
             "output_format" => {
                 let format_str = field
@@ -151,7 +147,9 @@ pub async fn extract_handler(
                     .await
                     .map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;

-                config
+                // Ensure config exists before modifying output_format
+                let cfg = config.get_or_insert_with(|| (*state.default_config).clone());
+                cfg.output_format = match format_str.to_lowercase().as_str() {
                     "plain" => crate::core::config::OutputFormat::Plain,
                     "markdown" => crate::core::config::OutputFormat::Markdown,
                     "djot" => crate::core::config::OutputFormat::Djot,
@@ -177,18 +175,21 @@ pub async fn extract_handler(
     #[cfg(feature = "otel")]
     tracing::Span::current().record("files_count", files.len());

+    // Use provided config or fall back to default from state
+    let final_config = config.as_ref().unwrap_or(&state.default_config);
+
     if files.len() == 1 {
         let (data, mime_type, _file_name) = files
             .into_iter()
             .next()
             .expect("files.len() == 1 guarantees one element exists");
-        let result = extract_bytes(&data, mime_type.as_str(),
+        let result = extract_bytes(&data, mime_type.as_str(), final_config).await?;
         return Ok(Json(vec![result]));
     }

     let files_data: Vec<(Vec<u8>, String)> = files.into_iter().map(|(data, mime, _name)| (data, mime)).collect();

-    let results = batch_extract_bytes(files_data,
+    let results = batch_extract_bytes(files_data, final_config).await?;
     Ok(Json(results))
 }

@@ -492,6 +493,8 @@ pub async fn chunk_handler(JsonApi(request): JsonApi<ChunkRequest>) -> Result<Js
         overlap,
         trim: cfg.trim.unwrap_or(true),
         chunker_type,
+        embedding: None,
+        preset: None,
     };

     // Perform chunking - convert any remaining errors to validation errors since they're likely config issues
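
The extract_handler change above replaces a required multipart config with an optional per-request config that falls back to the server default. Below is a minimal, self-contained sketch of that pattern; Settings and ServerState are illustrative stand-ins, not the crate's real types.

use std::sync::Arc;

// Illustrative stand-ins only; these are not the crate's real types.
#[derive(Clone, Debug)]
struct Settings {
    output_format: String,
}

struct ServerState {
    default_config: Arc<Settings>,
}

// Per-request overrides win; otherwise the shared server default is used.
fn resolve_config(state: &ServerState, request_config: Option<Settings>, output_format: Option<&str>) -> Settings {
    let mut config = request_config;

    if let Some(fmt) = output_format {
        // Lazily clone the server default only when a field actually needs overriding,
        // mirroring the handler's get_or_insert_with call.
        let cfg = config.get_or_insert_with(|| (*state.default_config).clone());
        cfg.output_format = fmt.to_string();
    }

    // No override supplied: fall back to the default.
    config.unwrap_or_else(|| (*state.default_config).clone())
}

fn main() {
    let state = ServerState {
        default_config: Arc::new(Settings { output_format: "markdown".to_string() }),
    };
    assert_eq!(resolve_config(&state, None, None).output_format, "markdown");
    assert_eq!(resolve_config(&state, None, Some("plain")).output_format, "plain");
    println!("per-request overrides fall back to the server default");
}

The real handler goes one step further and borrows rather than clones the default at the end, via config.as_ref().unwrap_or(&state.default_config).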

data/vendor/kreuzberg/src/api/openapi.rs

@@ -32,7 +32,7 @@ use utoipa::OpenApi;
     paths(
         crate::api::handlers::health_handler,
         crate::api::handlers::info_handler,
-
+        crate::api::handlers::extract_handler,
         crate::api::handlers::cache_stats_handler,
         crate::api::handlers::cache_clear_handler,
         crate::api::handlers::embed_handler,
@@ -53,6 +53,19 @@ use utoipa::OpenApi;
             crate::api::types::ChunkItem,
             crate::api::types::ChunkingConfigRequest,
             crate::api::types::ChunkingConfigResponse,
+            crate::types::extraction::ExtractionResult,
+            crate::types::extraction::Chunk,
+            crate::types::extraction::ChunkMetadata,
+            crate::types::extraction::ExtractedImage,
+            crate::types::extraction::Element,
+            crate::types::extraction::ElementMetadata,
+            crate::types::extraction::ElementId,
+            crate::types::extraction::ElementType,
+            crate::types::extraction::BoundingBox,
+            crate::types::metadata::Metadata,
+            crate::types::tables::Table,
+            crate::types::page::PageContent,
+            crate::types::djot::DjotContent,
         )
     ),
     tags(
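
The openapi.rs change registers extract_handler and the extraction result types so the generated document covers the multipart endpoint and its response schemas. A small illustrative utoipa sketch of that wiring follows; it assumes the utoipa crate with its derive feature, and ExampleResponse/example_handler are placeholder names rather than the crate's items.

// Illustrative sketch: wiring a handler path and its response schema into a
// utoipa OpenAPI document. Every schema referenced by a path must also appear
// in components(schemas(...)), which is what the diff above adds.
use utoipa::{OpenApi, ToSchema};

#[derive(ToSchema)]
#[allow(dead_code)]
struct ExampleResponse {
    content: String,
}

/// Hypothetical handler; in the real crate this role is played by extract_handler.
#[utoipa::path(
    post,
    path = "/extract",
    responses((status = 200, description = "OK", body = ExampleResponse))
)]
#[allow(dead_code)]
async fn example_handler() {}

#[derive(OpenApi)]
#[openapi(
    paths(example_handler),
    components(schemas(ExampleResponse))
)]
struct ApiDoc;

fn main() {
    let doc = ApiDoc::openapi();
    println!("{}", doc.to_pretty_json().unwrap());
}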

data/vendor/kreuzberg/src/chunking/config.rs

@@ -2,43 +2,8 @@

 use serde::{Deserialize, Serialize};

-
-
-/// # Fields
-///
-/// * `max_characters` - Maximum number of characters per chunk (default: 2000)
-/// * `overlap` - Number of characters to overlap between consecutive chunks (default: 100)
-/// * `trim` - Whether to trim whitespace from chunk boundaries (default: true)
-/// * `chunker_type` - Type of chunker to use (Text or Markdown) (default: Text)
-pub struct ChunkingConfig {
-    pub max_characters: usize,
-    pub overlap: usize,
-    pub trim: bool,
-    pub chunker_type: ChunkerType,
-}
-
-impl Default for ChunkingConfig {
-    fn default() -> Self {
-        Self {
-            max_characters: 2000,
-            overlap: 100,
-            trim: true,
-            chunker_type: ChunkerType::Text,
-        }
-    }
-}
-
-/// Type of text chunker to use.
-///
-/// # Variants
-///
-/// * `Text` - Generic text splitter, splits on whitespace and punctuation
-/// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub enum ChunkerType {
-    Text,
-    Markdown,
-}
+// Re-export ChunkingConfig and ChunkerType from core config (canonical location)
+pub use crate::core::config::processing::{ChunkerType, ChunkingConfig};

 /// Result of a text chunking operation.
 ///
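
The config.rs change deletes the duplicate definitions and re-exports the canonical types from core::config::processing. A tiny sketch of why a pub use re-export keeps existing call sites compiling; the module and field names here are illustrative only.

// Illustrative sketch: a `pub use` re-export keeps the old import path working
// even though the type now lives in a single canonical module.
mod core_config {
    #[derive(Debug, Default)]
    pub struct ChunkingConfig {
        pub max_characters: usize,
        pub overlap: usize,
    }
}

mod chunking {
    // Old callers keep writing `chunking::ChunkingConfig`.
    pub use super::core_config::ChunkingConfig;
}

fn main() {
    let via_old_path = chunking::ChunkingConfig { max_characters: 1000, overlap: 200 };
    let via_new_path = core_config::ChunkingConfig::default();
    println!("{:?} / {:?}", via_old_path, via_new_path);
}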

data/vendor/kreuzberg/src/chunking/core.rs

@@ -118,6 +118,8 @@ pub fn chunk_text_with_type(
         overlap,
         trim,
         chunker_type,
+        embedding: None,
+        preset: None,
     };
     chunk_text(text, &config, None)
 }
@@ -177,6 +179,8 @@ mod tests {
             overlap: 10,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "This is a short text.";
         let result = chunk_text(text, &config, None).unwrap();
@@ -192,6 +196,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
         let result = chunk_text(text, &config, None).unwrap();
@@ -207,6 +213,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "abcdefghijklmnopqrstuvwxyz0123456789";
         let result = chunk_text(text, &config, None).unwrap();
@@ -230,6 +238,8 @@ mod tests {
             overlap: 10,
             trim: true,
             chunker_type: ChunkerType::Markdown,
+            embedding: None,
+            preset: None,
         };
         let markdown = "# Title\n\nParagraph one.\n\n## Section\n\nParagraph two.";
         let result = chunk_text(markdown, &config, None).unwrap();
@@ -244,6 +254,8 @@ mod tests {
             overlap: 10,
             trim: true,
             chunker_type: ChunkerType::Markdown,
+            embedding: None,
+            preset: None,
         };
         let markdown = "# Code Example\n\n```python\nprint('hello')\n```\n\nSome text after code.";
         let result = chunk_text(markdown, &config, None).unwrap();
@@ -258,6 +270,8 @@ mod tests {
             overlap: 10,
             trim: true,
             chunker_type: ChunkerType::Markdown,
+            embedding: None,
+            preset: None,
         };
         let markdown = "Check out [this link](https://example.com) for more info.";
         let result = chunk_text(markdown, &config, None).unwrap();
@@ -272,6 +286,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = " Leading and trailing spaces should be trimmed ";
         let result = chunk_text(text, &config, None).unwrap();
@@ -286,6 +302,8 @@ mod tests {
             overlap: 5,
             trim: false,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = " Text with spaces ";
         let result = chunk_text(text, &config, None).unwrap();
@@ -300,6 +318,8 @@ mod tests {
             overlap: 20,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let result = chunk_text("Some text", &config, None);
         assert!(result.is_err());
@@ -337,6 +357,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let texts = vec!["First text", "Second text", "Third text"];
         let results = chunk_texts_batch(&texts, &config).unwrap();
@@ -351,6 +373,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let texts = vec![
             "Short",
@@ -371,6 +395,8 @@ mod tests {
             overlap: 20,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let texts = vec!["Text one", "Text two"];
         let result = chunk_texts_batch(&texts, &config);
@@ -380,8 +406,8 @@ mod tests {
     #[test]
     fn test_chunking_config_default() {
         let config = ChunkingConfig::default();
-        assert_eq!(config.max_characters,
-        assert_eq!(config.overlap,
+        assert_eq!(config.max_characters, 1000);
+        assert_eq!(config.overlap, 200);
         assert!(config.trim);
         assert_eq!(config.chunker_type, ChunkerType::Text);
     }
@@ -393,6 +419,8 @@ mod tests {
             overlap: 20,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "a".repeat(1000);
         let result = chunk_text(&text, &config, None).unwrap();
@@ -407,6 +435,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Line one\nLine two\nLine three\nLine four\nLine five";
         let result = chunk_text(text, &config, None).unwrap();
@@ -420,6 +450,8 @@ mod tests {
             overlap: 10,
             trim: true,
             chunker_type: ChunkerType::Markdown,
+            embedding: None,
+            preset: None,
         };
         let markdown = "# List Example\n\n- Item 1\n- Item 2\n- Item 3\n\nMore text.";
         let result = chunk_text(markdown, &config, None).unwrap();
@@ -434,6 +466,8 @@ mod tests {
             overlap: 10,
             trim: true,
             chunker_type: ChunkerType::Markdown,
+            embedding: None,
+            preset: None,
         };
         let markdown = "# Table\n\n| Col1 | Col2 |\n|------|------|\n| A | B |\n| C | D |";
         let result = chunk_text(markdown, &config, None).unwrap();
@@ -448,6 +482,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Special chars: @#$%^&*()[]{}|\\<>?/~`";
         let result = chunk_text(text, &config, None).unwrap();
@@ -462,6 +498,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Unicode: 你好世界 🌍 café résumé";
         let result = chunk_text(text, &config, None).unwrap();
@@ -477,6 +515,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "日本語のテキストです。これは長い文章で、複数のチャンクに分割されるべきです。";
         let result = chunk_text(text, &config, None).unwrap();
@@ -490,6 +530,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "English text mixed with 中文文本 and some français";
         let result = chunk_text(text, &config, None).unwrap();
@@ -503,6 +545,8 @@ mod tests {
             overlap: 5,
             trim: false,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
         let result = chunk_text(text, &config, None).unwrap();
@@ -555,6 +599,8 @@ mod tests {
             overlap: 0,
             trim: false,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
         let result = chunk_text(text, &config, None).unwrap();
@@ -581,6 +627,8 @@ mod tests {
             overlap: 3,
             trim: false,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "0123456789 ABCDEFGHIJ KLMNOPQRST UVWXYZ";
         let result = chunk_text(text, &config, None).unwrap();
@@ -615,6 +663,8 @@ mod tests {
             overlap,
             trim: false,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Word ".repeat(30);
         let result = chunk_text(&text, &config, None).unwrap();
@@ -647,6 +697,8 @@ mod tests {
             overlap: 5,
             trim: false,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "AAAAA BBBBB CCCCC DDDDD EEEEE";
         let result = chunk_text(text, &config, None).unwrap();
@@ -674,6 +726,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Page one content here. Page two starts here and continues.";

@@ -706,6 +760,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "This is some test content that should be split into multiple chunks.";

@@ -725,6 +781,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Some text content here.";
         let boundaries: Vec<PageBoundary> = vec![];
@@ -743,6 +801,8 @@ mod tests {
             overlap: 5,
             trim: false,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "0123456789 AAAAAAAAAA 1111111111 BBBBBBBBBB 2222222222";

@@ -779,6 +839,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Page one content here. Page two content.";

@@ -802,6 +864,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Page one content here. Page two content.";

@@ -832,6 +896,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Page one content here. Page two content.";

@@ -862,6 +928,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "First page content here.Second page content here.Third page.";

@@ -897,6 +965,8 @@ mod tests {
             overlap: 10,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "All content on single page fits in one chunk.";

@@ -919,6 +989,8 @@ mod tests {
             overlap: 0,
             trim: false,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "AAAAA BBBBB CCCCC DDDDD";

@@ -952,6 +1024,8 @@ mod tests {
             overlap: 5,
             trim: true,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "Page One Content Here.Page Two.";

@@ -982,6 +1056,8 @@ mod tests {
             overlap: 2,
             trim: false,
             chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
         };
         let text = "0123456789ABCDEFGHIJ";

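
Most of the core.rs churn above is the same two new optional fields (embedding, preset) added to every ChunkingConfig literal in the tests. The following is a hedged sketch of a general Rust alternative that absorbs such additions via Default and struct-update syntax; it is a pattern illustration, not necessarily what the crate does, and the field types are stand-ins.

// Illustrative sketch: `..Default::default()` lets test fixtures pick up newly
// added optional fields without editing every literal.
#[derive(Debug, Clone)]
struct ChunkingConfig {
    max_characters: usize,
    overlap: usize,
    trim: bool,
    embedding: Option<String>, // stand-in for the real embedding config type
    preset: Option<String>,    // stand-in for the real preset type
}

impl Default for ChunkingConfig {
    fn default() -> Self {
        Self {
            max_characters: 1000,
            overlap: 200,
            trim: true,
            embedding: None,
            preset: None,
        }
    }
}

fn main() {
    // Only override what a test cares about; the new `None` fields come from Default.
    let config = ChunkingConfig {
        max_characters: 20,
        overlap: 10,
        ..Default::default()
    };
    assert_eq!(config.embedding, None);
    assert_eq!(config.preset, None);
    println!("{:?}", config);
}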

data/vendor/kreuzberg/src/chunking/mod.rs

@@ -60,7 +60,7 @@ pub mod validation;

 // Re-export submodule types and functions
 pub use boundaries::{calculate_page_range, validate_page_boundaries};
-pub use config::{ChunkerType, ChunkingConfig, ChunkingResult};
+pub use config::{ChunkerType, ChunkingConfig, ChunkingResult}; // ChunkingConfig re-exported from core::config::processing
 pub use core::{chunk_text, chunk_text_with_type, chunk_texts_batch};
 pub use processor::ChunkingProcessor;
 pub use validation::{ADAPTIVE_VALIDATION_THRESHOLD, precompute_utf8_boundaries, validate_utf8_boundaries};

data/vendor/kreuzberg/src/chunking/processor.rs

@@ -54,14 +54,7 @@ impl PostProcessor for ChunkingProcessor {
             None => return Ok(()),
         };

-        let
-            max_characters: chunking_config.max_chars,
-            overlap: chunking_config.max_overlap,
-            trim: true,
-            chunker_type: crate::chunking::ChunkerType::Text,
-        };
-
-        let chunking_result = crate::chunking::chunk_text(&result.content, &chunk_config, None)
+        let chunking_result = crate::chunking::chunk_text(&result.content, chunking_config, None)
             .map_err(|e| KreuzbergError::Other(format!("Chunking failed: {}", e)))?;
         result.chunks = Some(chunking_result.chunks);

@@ -87,14 +80,17 @@ mod tests {
     use super::*;
     use crate::core::config::ChunkingConfig;
     use crate::types::Metadata;
+    use std::borrow::Cow;

     #[tokio::test]
     async fn test_chunking_processor() {
         let processor = ChunkingProcessor;
         let config = ExtractionConfig {
             chunking: Some(ChunkingConfig {
-
-
+                max_characters: 100,
+                overlap: 10,
+                trim: true,
+                chunker_type: crate::chunking::ChunkerType::Text,
                 embedding: None,
                 preset: None,
             }),
@@ -103,7 +99,7 @@ mod tests {

         let mut result = ExtractionResult {
             content: "This is a longer text that should be split into multiple chunks to test the chunking processor functionality.".to_string(),
-            mime_type: "text/plain"
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -128,7 +124,7 @@ mod tests {

         let mut result = ExtractionResult {
             content: "Some text".to_string(),
-            mime_type: "text/plain"
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -165,7 +161,7 @@ mod tests {

         let result = ExtractionResult {
             content: "Sample text".to_string(),
-            mime_type: "text/plain"
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -178,8 +174,10 @@ mod tests {

         let config_with_chunking = ExtractionConfig {
             chunking: Some(crate::core::config::ChunkingConfig {
-
-
+                max_characters: 100,
+                overlap: 10,
+                trim: true,
+                chunker_type: crate::chunking::ChunkerType::Text,
                 embedding: None,
                 preset: None,
             }),
@@ -197,7 +195,7 @@ mod tests {

         let short_result = ExtractionResult {
             content: "Short".to_string(),
-            mime_type: "text/plain"
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -210,7 +208,7 @@ mod tests {

         let long_result = ExtractionResult {
             content: "a".repeat(100000),
-            mime_type: "text/plain"
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
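
The processor.rs tests now build mime_type as Cow::Borrowed("text/plain") instead of an owned string. Below is a short illustrative sketch of why Cow<'static, str> fits that kind of field; ExtractionOutcome is a stand-in type, not the crate's ExtractionResult.

use std::borrow::Cow;

// Static, well-known values borrow with no allocation; values computed at
// runtime can still be stored as owned strings in the same field.
struct ExtractionOutcome {
    mime_type: Cow<'static, str>,
}

fn static_result() -> ExtractionOutcome {
    // No heap allocation: the &'static str is borrowed.
    ExtractionOutcome { mime_type: Cow::Borrowed("text/plain") }
}

fn sniffed_result(detected: String) -> ExtractionOutcome {
    // Runtime-detected types are stored owned.
    ExtractionOutcome { mime_type: Cow::Owned(detected) }
}

fn main() {
    assert_eq!(static_result().mime_type, "text/plain");
    assert_eq!(sniffed_result("application/pdf".to_string()).mime_type, "application/pdf");
    println!("Cow keeps static mime types allocation-free");
}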

data/vendor/kreuzberg/src/core/config/extraction/env.rs

@@ -94,8 +94,10 @@ impl ExtractionConfig {

         if self.chunking.is_none() {
             self.chunking = Some(ChunkingConfig {
-
-
+                max_characters: 1000,
+                overlap: 200,
+                trim: true,
+                chunker_type: super::super::processing::ChunkerType::Text,
                 embedding: None,
                 preset: None,
             });
@@ -103,8 +105,8 @@ impl ExtractionConfig {

         if let Some(ref mut chunking) = self.chunking {
             // Validate against current overlap before updating
-            validate_chunking_params(max_chars, chunking.
-            chunking.
+            validate_chunking_params(max_chars, chunking.overlap)?;
+            chunking.max_characters = max_chars;
         }
     }

@@ -120,17 +122,19 @@ impl ExtractionConfig {

         if self.chunking.is_none() {
             self.chunking = Some(ChunkingConfig {
-
-
+                max_characters: 1000,
+                overlap: 200,
+                trim: true,
+                chunker_type: super::super::processing::ChunkerType::Text,
                 embedding: None,
                 preset: None,
             });
         }

         if let Some(ref mut chunking) = self.chunking {
-            // Validate against current
-            validate_chunking_params(chunking.
-            chunking.
+            // Validate against current max_characters before updating
+            validate_chunking_params(chunking.max_characters, max_overlap)?;
+            chunking.overlap = max_overlap;
         }
     }

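
The setters above validate the new chunking value against the other current parameter before mutating, so the pair never lands in an inconsistent state. A hedged sketch of that validate-then-update shape follows; the rule overlap < max_characters is assumed for illustration only, and the crate's actual validate_chunking_params may check more than this.

#[derive(Debug)]
struct Chunking {
    max_characters: usize,
    overlap: usize,
}

// Assumed rule for illustration: overlap must stay smaller than max_characters.
fn validate_chunking_params(max_characters: usize, overlap: usize) -> Result<(), String> {
    if overlap >= max_characters {
        return Err(format!("overlap {overlap} must be smaller than max_characters {max_characters}"));
    }
    Ok(())
}

fn set_max_overlap(chunking: &mut Chunking, max_overlap: usize) -> Result<(), String> {
    // Validate against the current max_characters before updating overlap.
    validate_chunking_params(chunking.max_characters, max_overlap)?;
    chunking.overlap = max_overlap;
    Ok(())
}

fn main() {
    let mut chunking = Chunking { max_characters: 1000, overlap: 200 };
    assert!(set_max_overlap(&mut chunking, 100).is_ok());
    assert!(set_max_overlap(&mut chunking, 5000).is_err());
    println!("{:?}", chunking);
}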