kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
|
@@ -44,10 +44,10 @@ impl ExtractionConfig {
|
|
|
44
44
|
let config: Self = toml::from_str(&content)
|
|
45
45
|
.map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))?;
|
|
46
46
|
|
|
47
|
-
let config_arc = Arc::new(config
|
|
48
|
-
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
|
|
47
|
+
let config_arc = Arc::new(config);
|
|
48
|
+
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
|
|
49
49
|
|
|
50
|
-
Ok(
|
|
50
|
+
Ok((*config_arc).clone())
|
|
51
51
|
}
|
|
52
52
|
|
|
53
53
|
/// Load configuration from a YAML file.
|
|
@@ -72,10 +72,10 @@ impl ExtractionConfig {
|
|
|
72
72
|
let config: Self = serde_yaml_ng::from_str(&content)
|
|
73
73
|
.map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))?;
|
|
74
74
|
|
|
75
|
-
let config_arc = Arc::new(config
|
|
76
|
-
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
|
|
75
|
+
let config_arc = Arc::new(config);
|
|
76
|
+
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
|
|
77
77
|
|
|
78
|
-
Ok(
|
|
78
|
+
Ok((*config_arc).clone())
|
|
79
79
|
}
|
|
80
80
|
|
|
81
81
|
/// Load configuration from a JSON file.
|
|
@@ -100,10 +100,10 @@ impl ExtractionConfig {
|
|
|
100
100
|
let config: Self = serde_json::from_str(&content)
|
|
101
101
|
.map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))?;
|
|
102
102
|
|
|
103
|
-
let config_arc = Arc::new(config
|
|
104
|
-
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
|
|
103
|
+
let config_arc = Arc::new(config);
|
|
104
|
+
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
|
|
105
105
|
|
|
106
|
-
Ok(
|
|
106
|
+
Ok((*config_arc).clone())
|
|
107
107
|
}
|
|
108
108
|
|
|
109
109
|
/// Load configuration from a file, auto-detecting format by extension.
|
|
@@ -169,10 +169,10 @@ impl ExtractionConfig {
|
|
|
169
169
|
}
|
|
170
170
|
};
|
|
171
171
|
|
|
172
|
-
let config_arc = Arc::new(config
|
|
173
|
-
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
|
|
172
|
+
let config_arc = Arc::new(config);
|
|
173
|
+
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
|
|
174
174
|
|
|
175
|
-
Ok(
|
|
175
|
+
Ok((*config_arc).clone())
|
|
176
176
|
}
|
|
177
177
|
|
|
178
178
|
/// Discover configuration file in parent directories.
|
|
@@ -17,4 +17,4 @@ pub use ocr::OcrConfig;
|
|
|
17
17
|
pub use page::PageConfig;
|
|
18
18
|
#[cfg(feature = "pdf")]
|
|
19
19
|
pub use pdf::{HierarchyConfig, PdfConfig};
|
|
20
|
-
pub use processing::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, PostProcessorConfig};
|
|
20
|
+
pub use processing::{ChunkerType, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, PostProcessorConfig};
|
|
@@ -7,6 +7,19 @@ use serde::{Deserialize, Serialize};
|
|
|
7
7
|
use std::collections::HashSet;
|
|
8
8
|
use std::path::PathBuf;
|
|
9
9
|
|
|
10
|
+
/// Type of text chunker to use.
|
|
11
|
+
///
|
|
12
|
+
/// # Variants
|
|
13
|
+
///
|
|
14
|
+
/// * `Text` - Generic text splitter, splits on whitespace and punctuation
|
|
15
|
+
/// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
|
|
16
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
|
|
17
|
+
pub enum ChunkerType {
|
|
18
|
+
#[default]
|
|
19
|
+
Text,
|
|
20
|
+
Markdown,
|
|
21
|
+
}
|
|
22
|
+
|
|
10
23
|
/// Post-processor configuration.
|
|
11
24
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
12
25
|
pub struct PostProcessorConfig {
|
|
@@ -59,15 +72,34 @@ impl Default for PostProcessorConfig {
|
|
|
59
72
|
}
|
|
60
73
|
|
|
61
74
|
/// Chunking configuration.
|
|
75
|
+
///
|
|
76
|
+
/// Configures text chunking for document content, including chunk size,
|
|
77
|
+
/// overlap, trimming behavior, and optional embeddings.
|
|
62
78
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
63
79
|
pub struct ChunkingConfig {
|
|
64
80
|
/// Maximum characters per chunk
|
|
65
|
-
|
|
66
|
-
|
|
81
|
+
///
|
|
82
|
+
/// Default: 1000
|
|
83
|
+
#[serde(default = "default_chunk_size", rename = "max_chars", alias = "max_characters")]
|
|
84
|
+
pub max_characters: usize,
|
|
67
85
|
|
|
68
86
|
/// Overlap between chunks in characters
|
|
69
|
-
|
|
70
|
-
|
|
87
|
+
///
|
|
88
|
+
/// Default: 200
|
|
89
|
+
#[serde(default = "default_chunk_overlap", rename = "max_overlap", alias = "overlap")]
|
|
90
|
+
pub overlap: usize,
|
|
91
|
+
|
|
92
|
+
/// Whether to trim whitespace from chunk boundaries
|
|
93
|
+
///
|
|
94
|
+
/// Default: true
|
|
95
|
+
#[serde(default = "default_trim")]
|
|
96
|
+
pub trim: bool,
|
|
97
|
+
|
|
98
|
+
/// Type of chunker to use (Text or Markdown)
|
|
99
|
+
///
|
|
100
|
+
/// Default: Text
|
|
101
|
+
#[serde(default = "default_chunker_type")]
|
|
102
|
+
pub chunker_type: ChunkerType,
|
|
71
103
|
|
|
72
104
|
/// Optional embedding configuration for chunk embeddings
|
|
73
105
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
@@ -78,6 +110,19 @@ pub struct ChunkingConfig {
|
|
|
78
110
|
pub preset: Option<String>,
|
|
79
111
|
}
|
|
80
112
|
|
|
113
|
+
impl Default for ChunkingConfig {
|
|
114
|
+
fn default() -> Self {
|
|
115
|
+
Self {
|
|
116
|
+
max_characters: 1000,
|
|
117
|
+
overlap: 200,
|
|
118
|
+
trim: true,
|
|
119
|
+
chunker_type: ChunkerType::Text,
|
|
120
|
+
embedding: None,
|
|
121
|
+
preset: None,
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
|
|
81
126
|
/// Embedding configuration for text chunks.
|
|
82
127
|
///
|
|
83
128
|
/// Configures embedding generation using ONNX models via fastembed-rs.
|
|
@@ -149,6 +194,14 @@ fn default_chunk_overlap() -> usize {
|
|
|
149
194
|
200
|
|
150
195
|
}
|
|
151
196
|
|
|
197
|
+
fn default_trim() -> bool {
|
|
198
|
+
true
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
fn default_chunker_type() -> ChunkerType {
|
|
202
|
+
ChunkerType::Text
|
|
203
|
+
}
|
|
204
|
+
|
|
152
205
|
fn default_normalize() -> bool {
|
|
153
206
|
true
|
|
154
207
|
}
|
|
@@ -196,13 +249,17 @@ mod tests {
|
|
|
196
249
|
#[test]
|
|
197
250
|
fn test_chunking_config_defaults() {
|
|
198
251
|
let config = ChunkingConfig {
|
|
199
|
-
|
|
200
|
-
|
|
252
|
+
max_characters: 1000,
|
|
253
|
+
overlap: 200,
|
|
254
|
+
trim: true,
|
|
255
|
+
chunker_type: ChunkerType::Text,
|
|
201
256
|
embedding: None,
|
|
202
257
|
preset: None,
|
|
203
258
|
};
|
|
204
|
-
assert_eq!(config.
|
|
205
|
-
assert_eq!(config.
|
|
259
|
+
assert_eq!(config.max_characters, 1000);
|
|
260
|
+
assert_eq!(config.overlap, 200);
|
|
261
|
+
assert!(config.trim);
|
|
262
|
+
assert_eq!(config.chunker_type, ChunkerType::Text);
|
|
206
263
|
}
|
|
207
264
|
|
|
208
265
|
#[test]
|
|
@@ -141,6 +141,14 @@ mod tests {
|
|
|
141
141
|
assert!(validate_language_code("DEU").is_ok());
|
|
142
142
|
}
|
|
143
143
|
|
|
144
|
+
#[test]
|
|
145
|
+
fn test_validate_language_code_all_keyword() {
|
|
146
|
+
assert!(validate_language_code("all").is_ok());
|
|
147
|
+
assert!(validate_language_code("ALL").is_ok());
|
|
148
|
+
assert!(validate_language_code("All").is_ok());
|
|
149
|
+
assert!(validate_language_code("*").is_ok());
|
|
150
|
+
}
|
|
151
|
+
|
|
144
152
|
#[test]
|
|
145
153
|
fn test_validate_language_code_invalid() {
|
|
146
154
|
let result = validate_language_code("invalid");
|
|
@@ -167,6 +167,11 @@ pub fn validate_ocr_backend(backend: &str) -> Result<()> {
|
|
|
167
167
|
pub fn validate_language_code(code: &str) -> Result<()> {
|
|
168
168
|
let code_lower = code.to_lowercase();
|
|
169
169
|
|
|
170
|
+
// Accept "all" and "*" as special values to auto-detect installed languages
|
|
171
|
+
if code_lower == "all" || code_lower == "*" {
|
|
172
|
+
return Ok(());
|
|
173
|
+
}
|
|
174
|
+
|
|
170
175
|
if VALID_LANGUAGE_CODES.contains(&code_lower.as_str()) {
|
|
171
176
|
return Ok(());
|
|
172
177
|
}
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
use crate::core::config::ExtractionConfig;
|
|
7
7
|
use crate::types::{ErrorMetadata, ExtractionResult, Metadata};
|
|
8
8
|
use crate::{KreuzbergError, Result};
|
|
9
|
+
use std::borrow::Cow;
|
|
9
10
|
use std::path::Path;
|
|
10
11
|
use std::sync::Arc;
|
|
11
12
|
|
|
@@ -65,9 +66,9 @@ pub async fn batch_extract_file(
|
|
|
65
66
|
return Ok(vec![]);
|
|
66
67
|
}
|
|
67
68
|
|
|
68
|
-
let
|
|
69
|
+
let config_arc = Arc::new(config.clone());
|
|
69
70
|
|
|
70
|
-
let max_concurrent =
|
|
71
|
+
let max_concurrent = config_arc
|
|
71
72
|
.max_concurrent_extractions
|
|
72
73
|
.unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
|
|
73
74
|
let semaphore = Arc::new(Semaphore::new(max_concurrent));
|
|
@@ -76,7 +77,7 @@ pub async fn batch_extract_file(
|
|
|
76
77
|
|
|
77
78
|
for (index, path) in paths.into_iter().enumerate() {
|
|
78
79
|
let path_buf = path.as_ref().to_path_buf();
|
|
79
|
-
let config_clone = Arc::clone(&
|
|
80
|
+
let config_clone = Arc::clone(&config_arc);
|
|
80
81
|
let semaphore_clone = Arc::clone(&semaphore);
|
|
81
82
|
|
|
82
83
|
tasks.spawn(async move {
|
|
@@ -108,7 +109,7 @@ pub async fn batch_extract_file(
|
|
|
108
109
|
|
|
109
110
|
results[index] = Some(ExtractionResult {
|
|
110
111
|
content: format!("Error: {}", e),
|
|
111
|
-
mime_type: "text/plain"
|
|
112
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
112
113
|
metadata,
|
|
113
114
|
tables: vec![],
|
|
114
115
|
detected_languages: None,
|
|
@@ -180,10 +181,9 @@ pub async fn batch_extract_bytes(
|
|
|
180
181
|
return Ok(vec![]);
|
|
181
182
|
}
|
|
182
183
|
|
|
183
|
-
let
|
|
184
|
-
let config = Arc::new(batch_config);
|
|
184
|
+
let config_arc = Arc::new(config.clone());
|
|
185
185
|
|
|
186
|
-
let max_concurrent =
|
|
186
|
+
let max_concurrent = config_arc
|
|
187
187
|
.max_concurrent_extractions
|
|
188
188
|
.unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
|
|
189
189
|
let semaphore = Arc::new(Semaphore::new(max_concurrent));
|
|
@@ -191,7 +191,7 @@ pub async fn batch_extract_bytes(
|
|
|
191
191
|
let mut tasks = JoinSet::new();
|
|
192
192
|
|
|
193
193
|
for (index, (bytes, mime_type)) in contents.into_iter().enumerate() {
|
|
194
|
-
let config_clone = Arc::clone(&
|
|
194
|
+
let config_clone = Arc::clone(&config_arc);
|
|
195
195
|
let semaphore_clone = Arc::clone(&semaphore);
|
|
196
196
|
|
|
197
197
|
tasks.spawn(async move {
|
|
@@ -224,7 +224,7 @@ pub async fn batch_extract_bytes(
|
|
|
224
224
|
|
|
225
225
|
results[index] = Some(ExtractionResult {
|
|
226
226
|
content: format!("Error: {}", e),
|
|
227
|
-
mime_type: "text/plain"
|
|
227
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
228
228
|
metadata,
|
|
229
229
|
tables: vec![],
|
|
230
230
|
detected_languages: None,
|
|
@@ -18,6 +18,8 @@ use crate::types::ExtractionResult;
|
|
|
18
18
|
use crate::types::LibreOfficeConversionResult;
|
|
19
19
|
#[cfg(feature = "office")]
|
|
20
20
|
use serde_json::json;
|
|
21
|
+
#[cfg(feature = "office")]
|
|
22
|
+
use std::borrow::Cow;
|
|
21
23
|
use std::path::Path;
|
|
22
24
|
|
|
23
25
|
#[cfg(feature = "office")]
|
|
@@ -226,9 +228,9 @@ pub(in crate::core::extractor) fn apply_libreoffice_metadata(
|
|
|
226
228
|
legacy_mime: &str,
|
|
227
229
|
conversion: &LibreOfficeConversionResult,
|
|
228
230
|
) {
|
|
229
|
-
result.mime_type = pool_mime_type(legacy_mime);
|
|
231
|
+
result.mime_type = pool_mime_type(legacy_mime).into();
|
|
230
232
|
result.metadata.additional.insert(
|
|
231
|
-
"libreoffice_conversion"
|
|
233
|
+
Cow::Borrowed("libreoffice_conversion"),
|
|
232
234
|
json!({
|
|
233
235
|
"converter": "libreoffice",
|
|
234
236
|
"original_format": conversion.original_format,
|
|
@@ -24,18 +24,18 @@
|
|
|
24
24
|
/// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
|
|
25
25
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
26
26
|
pub(super) fn extract_bytes_sync_impl(
|
|
27
|
-
content:
|
|
28
|
-
mime_type: Option
|
|
29
|
-
config: Option
|
|
27
|
+
content: &[u8],
|
|
28
|
+
mime_type: Option<&str>,
|
|
29
|
+
config: Option<&crate::core::config::ExtractionConfig>,
|
|
30
30
|
) -> crate::Result<crate::types::ExtractionResult> {
|
|
31
31
|
use crate::KreuzbergError;
|
|
32
32
|
use crate::core::extractor::helpers::get_extractor;
|
|
33
33
|
use crate::core::mime;
|
|
34
34
|
|
|
35
|
-
let
|
|
35
|
+
let cfg = config.cloned().unwrap_or_default();
|
|
36
36
|
|
|
37
37
|
let validated_mime = if let Some(mime) = mime_type {
|
|
38
|
-
mime::validate_mime_type(
|
|
38
|
+
mime::validate_mime_type(mime)?
|
|
39
39
|
} else {
|
|
40
40
|
return Err(KreuzbergError::Validation {
|
|
41
41
|
message: "MIME type is required for synchronous extraction".to_string(),
|
|
@@ -54,9 +54,9 @@ pub(super) fn extract_bytes_sync_impl(
|
|
|
54
54
|
))
|
|
55
55
|
})?;
|
|
56
56
|
|
|
57
|
-
let mut result = sync_extractor.extract_sync(
|
|
57
|
+
let mut result = sync_extractor.extract_sync(content, &validated_mime, &cfg)?;
|
|
58
58
|
|
|
59
|
-
result = crate::core::pipeline::run_pipeline_sync(result, &
|
|
59
|
+
result = crate::core::pipeline::run_pipeline_sync(result, &cfg)?;
|
|
60
60
|
|
|
61
61
|
Ok(result)
|
|
62
62
|
}
|
|
@@ -107,7 +107,7 @@ pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionCo
|
|
|
107
107
|
/// It calls `extract_bytes_sync_impl()` to perform the extraction.
|
|
108
108
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
109
109
|
pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
110
|
-
super::legacy::extract_bytes_sync_impl(content
|
|
110
|
+
super::legacy::extract_bytes_sync_impl(content, Some(mime_type), Some(config))
|
|
111
111
|
}
|
|
112
112
|
|
|
113
113
|
/// Synchronous wrapper for `batch_extract_file`.
|
|
@@ -180,14 +180,14 @@ pub fn batch_extract_bytes_sync(
|
|
|
180
180
|
config: &ExtractionConfig,
|
|
181
181
|
) -> Result<Vec<ExtractionResult>> {
|
|
182
182
|
use crate::types::{ErrorMetadata, Metadata};
|
|
183
|
-
use
|
|
183
|
+
use std::borrow::Cow;
|
|
184
184
|
|
|
185
185
|
let mut results = Vec::with_capacity(contents.len());
|
|
186
186
|
for (content, mime_type) in contents {
|
|
187
187
|
let result = extract_bytes_sync(&content, &mime_type, config);
|
|
188
188
|
results.push(result.unwrap_or_else(|e| ExtractionResult {
|
|
189
189
|
content: format!("Error: {}", e),
|
|
190
|
-
mime_type:
|
|
190
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
191
191
|
metadata: Metadata {
|
|
192
192
|
error: Some(ErrorMetadata {
|
|
193
193
|
error_type: format!("{:?}", e),
|
|
@@ -7,6 +7,7 @@ use crate::core::config::ExtractionConfig;
|
|
|
7
7
|
use crate::plugins::ProcessingStage;
|
|
8
8
|
use crate::types::ExtractionResult;
|
|
9
9
|
use crate::{KreuzbergError, Result};
|
|
10
|
+
use std::borrow::Cow;
|
|
10
11
|
|
|
11
12
|
/// Execute all registered post-processors by stage.
|
|
12
13
|
pub(super) async fn execute_processors(
|
|
@@ -37,7 +38,7 @@ pub(super) async fn execute_processors(
|
|
|
37
38
|
}
|
|
38
39
|
Err(err) => {
|
|
39
40
|
result.metadata.additional.insert(
|
|
40
|
-
format!("processing_error_{processor_name}"),
|
|
41
|
+
Cow::Owned(format!("processing_error_{processor_name}")),
|
|
41
42
|
serde_json::Value::String(err.to_string()),
|
|
42
43
|
);
|
|
43
44
|
}
|
|
@@ -6,27 +6,21 @@
|
|
|
6
6
|
use crate::Result;
|
|
7
7
|
use crate::core::config::ExtractionConfig;
|
|
8
8
|
use crate::types::ExtractionResult;
|
|
9
|
+
use std::borrow::Cow;
|
|
9
10
|
|
|
10
11
|
/// Execute chunking if configured.
|
|
11
12
|
pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
12
13
|
#[cfg(feature = "chunking")]
|
|
13
14
|
if let Some(ref chunking_config) = config.chunking {
|
|
14
|
-
let chunk_config = crate::chunking::ChunkingConfig {
|
|
15
|
-
max_characters: chunking_config.max_chars,
|
|
16
|
-
overlap: chunking_config.max_overlap,
|
|
17
|
-
trim: true,
|
|
18
|
-
chunker_type: crate::chunking::ChunkerType::Text,
|
|
19
|
-
};
|
|
20
|
-
|
|
21
15
|
let page_boundaries = result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref());
|
|
22
16
|
|
|
23
|
-
match crate::chunking::chunk_text(&result.content,
|
|
17
|
+
match crate::chunking::chunk_text(&result.content, chunking_config, page_boundaries) {
|
|
24
18
|
Ok(chunking_result) => {
|
|
25
19
|
result.chunks = Some(chunking_result.chunks);
|
|
26
20
|
|
|
27
21
|
if let Some(ref chunks) = result.chunks {
|
|
28
22
|
result.metadata.additional.insert(
|
|
29
|
-
"chunk_count"
|
|
23
|
+
Cow::Borrowed("chunk_count"),
|
|
30
24
|
serde_json::Value::Number(serde_json::Number::from(chunks.len())),
|
|
31
25
|
);
|
|
32
26
|
}
|
|
@@ -40,13 +34,13 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
|
|
|
40
34
|
result
|
|
41
35
|
.metadata
|
|
42
36
|
.additional
|
|
43
|
-
.insert("embeddings_generated"
|
|
37
|
+
.insert(Cow::Borrowed("embeddings_generated"), serde_json::Value::Bool(true));
|
|
44
38
|
}
|
|
45
39
|
Err(e) => {
|
|
46
|
-
result
|
|
47
|
-
|
|
48
|
-
.
|
|
49
|
-
|
|
40
|
+
result.metadata.additional.insert(
|
|
41
|
+
Cow::Borrowed("embedding_error"),
|
|
42
|
+
serde_json::Value::String(e.to_string()),
|
|
43
|
+
);
|
|
50
44
|
}
|
|
51
45
|
}
|
|
52
46
|
}
|
|
@@ -54,16 +48,16 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
|
|
|
54
48
|
#[cfg(not(feature = "embeddings"))]
|
|
55
49
|
if chunking_config.embedding.is_some() {
|
|
56
50
|
result.metadata.additional.insert(
|
|
57
|
-
"embedding_error"
|
|
51
|
+
Cow::Borrowed("embedding_error"),
|
|
58
52
|
serde_json::Value::String("Embeddings feature not enabled".to_string()),
|
|
59
53
|
);
|
|
60
54
|
}
|
|
61
55
|
}
|
|
62
56
|
Err(e) => {
|
|
63
|
-
result
|
|
64
|
-
|
|
65
|
-
.
|
|
66
|
-
|
|
57
|
+
result.metadata.additional.insert(
|
|
58
|
+
Cow::Borrowed("chunking_error"),
|
|
59
|
+
serde_json::Value::String(e.to_string()),
|
|
60
|
+
);
|
|
67
61
|
}
|
|
68
62
|
}
|
|
69
63
|
}
|
|
@@ -71,7 +65,7 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
|
|
|
71
65
|
#[cfg(not(feature = "chunking"))]
|
|
72
66
|
if config.chunking.is_some() {
|
|
73
67
|
result.metadata.additional.insert(
|
|
74
|
-
"chunking_error"
|
|
68
|
+
Cow::Borrowed("chunking_error"),
|
|
75
69
|
serde_json::Value::String("Chunking feature not enabled".to_string()),
|
|
76
70
|
);
|
|
77
71
|
}
|
|
@@ -89,7 +83,7 @@ pub(super) fn execute_language_detection(result: &mut ExtractionResult, config:
|
|
|
89
83
|
}
|
|
90
84
|
Err(e) => {
|
|
91
85
|
result.metadata.additional.insert(
|
|
92
|
-
"language_detection_error"
|
|
86
|
+
Cow::Borrowed("language_detection_error"),
|
|
93
87
|
serde_json::Value::String(e.to_string()),
|
|
94
88
|
);
|
|
95
89
|
}
|
|
@@ -99,7 +93,7 @@ pub(super) fn execute_language_detection(result: &mut ExtractionResult, config:
|
|
|
99
93
|
#[cfg(not(feature = "language-detection"))]
|
|
100
94
|
if config.language_detection.is_some() {
|
|
101
95
|
result.metadata.additional.insert(
|
|
102
|
-
"language_detection_error"
|
|
96
|
+
Cow::Borrowed("language_detection_error"),
|
|
103
97
|
serde_json::Value::String("Language detection feature not enabled".to_string()),
|
|
104
98
|
);
|
|
105
99
|
}
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
use crate::core::config::OutputFormat;
|
|
7
7
|
use crate::types::ExtractionResult;
|
|
8
|
+
use std::borrow::Cow;
|
|
8
9
|
|
|
9
10
|
/// Apply output format conversion to the extraction result.
|
|
10
11
|
///
|
|
@@ -23,7 +24,7 @@ use crate::types::ExtractionResult;
|
|
|
23
24
|
/// * `output_format` - The desired output format
|
|
24
25
|
pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputFormat) {
|
|
25
26
|
// Check if content was already formatted during extraction
|
|
26
|
-
let already_formatted = match result.mime_type
|
|
27
|
+
let already_formatted = match &*result.mime_type {
|
|
27
28
|
"text/markdown" if output_format == OutputFormat::Markdown => true,
|
|
28
29
|
"text/djot" if output_format == OutputFormat::Djot => true,
|
|
29
30
|
_ => false,
|
|
@@ -46,7 +47,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
|
|
|
46
47
|
Err(e) => {
|
|
47
48
|
// Keep original content on error, record error in metadata
|
|
48
49
|
result.metadata.additional.insert(
|
|
49
|
-
"output_format_error"
|
|
50
|
+
Cow::Borrowed("output_format_error"),
|
|
50
51
|
serde_json::Value::String(format!("Failed to convert to djot: {}", e)),
|
|
51
52
|
);
|
|
52
53
|
}
|
|
@@ -66,7 +67,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
|
|
|
66
67
|
Err(e) => {
|
|
67
68
|
// Keep original content on error, record error in metadata
|
|
68
69
|
result.metadata.additional.insert(
|
|
69
|
-
"output_format_error"
|
|
70
|
+
Cow::Borrowed("output_format_error"),
|
|
70
71
|
serde_json::Value::String(format!("Failed to convert to markdown: {}", e)),
|
|
71
72
|
);
|
|
72
73
|
}
|
|
@@ -87,7 +88,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
|
|
|
87
88
|
Err(e) => {
|
|
88
89
|
// Keep original content on error, record error in metadata
|
|
89
90
|
result.metadata.additional.insert(
|
|
90
|
-
"output_format_error"
|
|
91
|
+
Cow::Borrowed("output_format_error"),
|
|
91
92
|
serde_json::Value::String(format!("Failed to convert djot to HTML: {}", e)),
|
|
92
93
|
);
|
|
93
94
|
}
|
|
@@ -96,7 +97,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
|
|
|
96
97
|
Err(e) => {
|
|
97
98
|
// Keep original content on error, record error in metadata
|
|
98
99
|
result.metadata.additional.insert(
|
|
99
|
-
"output_format_error"
|
|
100
|
+
Cow::Borrowed("output_format_error"),
|
|
100
101
|
serde_json::Value::String(format!("Failed to generate djot for HTML conversion: {}", e)),
|
|
101
102
|
);
|
|
102
103
|
}
|
|
@@ -128,7 +129,7 @@ mod tests {
|
|
|
128
129
|
fn test_apply_output_format_plain() {
|
|
129
130
|
let mut result = ExtractionResult {
|
|
130
131
|
content: "Hello World".to_string(),
|
|
131
|
-
mime_type: "text/plain"
|
|
132
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
132
133
|
metadata: Metadata::default(),
|
|
133
134
|
tables: vec![],
|
|
134
135
|
detected_languages: None,
|
|
@@ -151,7 +152,7 @@ mod tests {
|
|
|
151
152
|
|
|
152
153
|
let mut result = ExtractionResult {
|
|
153
154
|
content: "Hello World".to_string(),
|
|
154
|
-
mime_type: "text/djot"
|
|
155
|
+
mime_type: Cow::Borrowed("text/djot"),
|
|
155
156
|
metadata: Metadata::default(),
|
|
156
157
|
tables: vec![],
|
|
157
158
|
detected_languages: None,
|
|
@@ -180,7 +181,7 @@ mod tests {
|
|
|
180
181
|
images: vec![],
|
|
181
182
|
links: vec![],
|
|
182
183
|
footnotes: vec![],
|
|
183
|
-
attributes:
|
|
184
|
+
attributes: Vec::new(),
|
|
184
185
|
}),
|
|
185
186
|
};
|
|
186
187
|
|
|
@@ -194,7 +195,7 @@ mod tests {
|
|
|
194
195
|
fn test_apply_output_format_djot_without_djot_content() {
|
|
195
196
|
let mut result = ExtractionResult {
|
|
196
197
|
content: "Hello World".to_string(),
|
|
197
|
-
mime_type: "text/plain"
|
|
198
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
198
199
|
metadata: Metadata::default(),
|
|
199
200
|
tables: vec![],
|
|
200
201
|
detected_languages: None,
|
|
@@ -216,7 +217,7 @@ mod tests {
|
|
|
216
217
|
fn test_apply_output_format_html() {
|
|
217
218
|
let mut result = ExtractionResult {
|
|
218
219
|
content: "Hello World".to_string(),
|
|
219
|
-
mime_type: "text/plain"
|
|
220
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
220
221
|
metadata: Metadata::default(),
|
|
221
222
|
tables: vec![],
|
|
222
223
|
detected_languages: None,
|
|
@@ -239,7 +240,7 @@ mod tests {
|
|
|
239
240
|
fn test_apply_output_format_html_escapes_special_chars() {
|
|
240
241
|
let mut result = ExtractionResult {
|
|
241
242
|
content: "<script>alert('XSS')</script>".to_string(),
|
|
242
|
-
mime_type: "text/plain"
|
|
243
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
243
244
|
metadata: Metadata::default(),
|
|
244
245
|
tables: vec![],
|
|
245
246
|
detected_languages: None,
|
|
@@ -262,7 +263,7 @@ mod tests {
|
|
|
262
263
|
fn test_apply_output_format_markdown() {
|
|
263
264
|
let mut result = ExtractionResult {
|
|
264
265
|
content: "Hello World".to_string(),
|
|
265
|
-
mime_type: "text/plain"
|
|
266
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
266
267
|
metadata: Metadata::default(),
|
|
267
268
|
tables: vec![],
|
|
268
269
|
detected_languages: None,
|
|
@@ -281,8 +282,9 @@ mod tests {
|
|
|
281
282
|
|
|
282
283
|
#[test]
|
|
283
284
|
fn test_apply_output_format_preserves_metadata() {
|
|
284
|
-
|
|
285
|
-
additional
|
|
285
|
+
use ahash::AHashMap;
|
|
286
|
+
let mut additional = AHashMap::new();
|
|
287
|
+
additional.insert(Cow::Borrowed("custom_key"), serde_json::json!("custom_value"));
|
|
286
288
|
let metadata = Metadata {
|
|
287
289
|
title: Some("Test Title".to_string()),
|
|
288
290
|
additional,
|
|
@@ -291,7 +293,7 @@ mod tests {
|
|
|
291
293
|
|
|
292
294
|
let mut result = ExtractionResult {
|
|
293
295
|
content: "Hello World".to_string(),
|
|
294
|
-
mime_type: "text/plain"
|
|
296
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
295
297
|
metadata,
|
|
296
298
|
tables: vec![],
|
|
297
299
|
detected_languages: None,
|
|
@@ -324,7 +326,7 @@ mod tests {
|
|
|
324
326
|
|
|
325
327
|
let mut result = ExtractionResult {
|
|
326
328
|
content: "Hello World".to_string(),
|
|
327
|
-
mime_type: "text/plain"
|
|
329
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
328
330
|
metadata: Metadata::default(),
|
|
329
331
|
tables: vec![table],
|
|
330
332
|
detected_languages: None,
|
|
@@ -367,12 +369,12 @@ mod tests {
|
|
|
367
369
|
images: vec![],
|
|
368
370
|
links: vec![],
|
|
369
371
|
footnotes: vec![],
|
|
370
|
-
attributes:
|
|
372
|
+
attributes: Vec::new(),
|
|
371
373
|
};
|
|
372
374
|
|
|
373
375
|
let mut result = ExtractionResult {
|
|
374
376
|
content: "test".to_string(),
|
|
375
|
-
mime_type: "text/djot"
|
|
377
|
+
mime_type: Cow::Borrowed("text/djot"),
|
|
376
378
|
metadata: Metadata::default(),
|
|
377
379
|
tables: vec![],
|
|
378
380
|
detected_languages: None,
|