kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -5
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +32 -11
- data/vendor/kreuzberg/README.md +54 -8
- data/vendor/kreuzberg/build.rs +549 -132
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
- data/vendor/kreuzberg/src/mcp/server.rs +120 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/lib.rs +1 -0
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +13 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
//! Text chunking post-processor.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a PostProcessor plugin that chunks text content in
|
|
4
|
+
//! extraction results.
|
|
5
|
+
|
|
6
|
+
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
7
|
+
use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
|
|
8
|
+
use async_trait::async_trait;
|
|
9
|
+
|
|
10
|
+
/// Post-processor that chunks text in document content.
|
|
11
|
+
///
|
|
12
|
+
/// This processor:
|
|
13
|
+
/// - Runs in the Middle processing stage
|
|
14
|
+
/// - Only processes when `config.chunking` is configured
|
|
15
|
+
/// - Stores chunks in `result.chunks`
|
|
16
|
+
/// - Uses configurable chunk size and overlap
|
|
17
|
+
///
|
|
18
|
+
/// # Example
|
|
19
|
+
///
|
|
20
|
+
/// ```rust,no_run
|
|
21
|
+
/// use kreuzberg::plugins::{Plugin, PostProcessor};
|
|
22
|
+
/// use kreuzberg::chunking::processor::ChunkingProcessor;
|
|
23
|
+
///
|
|
24
|
+
/// let processor = ChunkingProcessor;
|
|
25
|
+
/// assert_eq!(processor.name(), "text-chunking");
|
|
26
|
+
/// ```
|
|
27
|
+
#[derive(Debug, Clone, Copy)]
|
|
28
|
+
pub struct ChunkingProcessor;
|
|
29
|
+
|
|
30
|
+
impl Plugin for ChunkingProcessor {
|
|
31
|
+
fn name(&self) -> &str {
|
|
32
|
+
"text-chunking"
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
fn version(&self) -> String {
|
|
36
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
fn initialize(&self) -> Result<()> {
|
|
40
|
+
Ok(())
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
fn shutdown(&self) -> Result<()> {
|
|
44
|
+
Ok(())
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
49
|
+
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
50
|
+
impl PostProcessor for ChunkingProcessor {
|
|
51
|
+
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
52
|
+
let chunking_config = match &config.chunking {
|
|
53
|
+
Some(cfg) => cfg,
|
|
54
|
+
None => return Ok(()),
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
let chunk_config = crate::chunking::ChunkingConfig {
|
|
58
|
+
max_characters: chunking_config.max_chars,
|
|
59
|
+
overlap: chunking_config.max_overlap,
|
|
60
|
+
trim: true,
|
|
61
|
+
chunker_type: crate::chunking::ChunkerType::Text,
|
|
62
|
+
};
|
|
63
|
+
|
|
64
|
+
let chunking_result = crate::chunking::chunk_text(&result.content, &chunk_config, None)
|
|
65
|
+
.map_err(|e| KreuzbergError::Other(format!("Chunking failed: {}", e)))?;
|
|
66
|
+
result.chunks = Some(chunking_result.chunks);
|
|
67
|
+
|
|
68
|
+
Ok(())
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
72
|
+
ProcessingStage::Middle
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
76
|
+
config.chunking.is_some()
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
|
|
80
|
+
let text_length = result.content.len();
|
|
81
|
+
// Chunking is fast: ~1ms per 10KB
|
|
82
|
+
(text_length / 10240).max(1) as u64
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
#[cfg(test)]
|
|
87
|
+
mod tests {
|
|
88
|
+
use super::*;
|
|
89
|
+
use crate::core::config::ChunkingConfig;
|
|
90
|
+
use crate::types::Metadata;
|
|
91
|
+
|
|
92
|
+
#[tokio::test]
|
|
93
|
+
async fn test_chunking_processor() {
|
|
94
|
+
let processor = ChunkingProcessor;
|
|
95
|
+
let config = ExtractionConfig {
|
|
96
|
+
chunking: Some(ChunkingConfig {
|
|
97
|
+
max_chars: 100,
|
|
98
|
+
max_overlap: 10,
|
|
99
|
+
embedding: None,
|
|
100
|
+
preset: None,
|
|
101
|
+
}),
|
|
102
|
+
..Default::default()
|
|
103
|
+
};
|
|
104
|
+
|
|
105
|
+
let mut result = ExtractionResult {
|
|
106
|
+
content: "This is a longer text that should be split into multiple chunks to test the chunking processor functionality.".to_string(),
|
|
107
|
+
mime_type: "text/plain".to_string(),
|
|
108
|
+
metadata: Metadata::default(),
|
|
109
|
+
tables: vec![],
|
|
110
|
+
detected_languages: None,
|
|
111
|
+
chunks: None,
|
|
112
|
+
images: None,
|
|
113
|
+
pages: None,
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
processor.process(&mut result, &config).await.unwrap();
|
|
117
|
+
|
|
118
|
+
assert!(result.chunks.is_some());
|
|
119
|
+
let chunks = result.chunks.unwrap();
|
|
120
|
+
assert!(!chunks.is_empty());
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
#[tokio::test]
|
|
124
|
+
async fn test_chunking_processor_no_config() {
|
|
125
|
+
let processor = ChunkingProcessor;
|
|
126
|
+
let config = ExtractionConfig::default();
|
|
127
|
+
|
|
128
|
+
let mut result = ExtractionResult {
|
|
129
|
+
content: "Some text".to_string(),
|
|
130
|
+
mime_type: "text/plain".to_string(),
|
|
131
|
+
metadata: Metadata::default(),
|
|
132
|
+
tables: vec![],
|
|
133
|
+
detected_languages: None,
|
|
134
|
+
chunks: None,
|
|
135
|
+
images: None,
|
|
136
|
+
pages: None,
|
|
137
|
+
};
|
|
138
|
+
|
|
139
|
+
processor.process(&mut result, &config).await.unwrap();
|
|
140
|
+
|
|
141
|
+
assert!(result.chunks.is_none());
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
#[test]
|
|
145
|
+
fn test_chunking_processor_plugin_interface() {
|
|
146
|
+
let processor = ChunkingProcessor;
|
|
147
|
+
assert_eq!(processor.name(), "text-chunking");
|
|
148
|
+
assert!(!processor.version().is_empty());
|
|
149
|
+
assert!(processor.initialize().is_ok());
|
|
150
|
+
assert!(processor.shutdown().is_ok());
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
#[test]
|
|
154
|
+
fn test_chunking_processor_stage() {
|
|
155
|
+
let processor = ChunkingProcessor;
|
|
156
|
+
assert_eq!(processor.processing_stage(), ProcessingStage::Middle);
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
#[test]
|
|
160
|
+
fn test_chunking_processor_should_process() {
|
|
161
|
+
let processor = ChunkingProcessor;
|
|
162
|
+
|
|
163
|
+
let result = ExtractionResult {
|
|
164
|
+
content: "Sample text".to_string(),
|
|
165
|
+
mime_type: "text/plain".to_string(),
|
|
166
|
+
metadata: Metadata::default(),
|
|
167
|
+
tables: vec![],
|
|
168
|
+
detected_languages: None,
|
|
169
|
+
chunks: None,
|
|
170
|
+
images: None,
|
|
171
|
+
pages: None,
|
|
172
|
+
};
|
|
173
|
+
|
|
174
|
+
let config_with_chunking = ExtractionConfig {
|
|
175
|
+
chunking: Some(crate::core::config::ChunkingConfig {
|
|
176
|
+
max_chars: 100,
|
|
177
|
+
max_overlap: 10,
|
|
178
|
+
embedding: None,
|
|
179
|
+
preset: None,
|
|
180
|
+
}),
|
|
181
|
+
..Default::default()
|
|
182
|
+
};
|
|
183
|
+
assert!(processor.should_process(&result, &config_with_chunking));
|
|
184
|
+
|
|
185
|
+
let config_without_chunking = ExtractionConfig::default();
|
|
186
|
+
assert!(!processor.should_process(&result, &config_without_chunking));
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
#[test]
|
|
190
|
+
fn test_chunking_processor_estimated_duration() {
|
|
191
|
+
let processor = ChunkingProcessor;
|
|
192
|
+
|
|
193
|
+
let short_result = ExtractionResult {
|
|
194
|
+
content: "Short".to_string(),
|
|
195
|
+
mime_type: "text/plain".to_string(),
|
|
196
|
+
metadata: Metadata::default(),
|
|
197
|
+
tables: vec![],
|
|
198
|
+
detected_languages: None,
|
|
199
|
+
chunks: None,
|
|
200
|
+
images: None,
|
|
201
|
+
pages: None,
|
|
202
|
+
};
|
|
203
|
+
|
|
204
|
+
let long_result = ExtractionResult {
|
|
205
|
+
content: "a".repeat(100000),
|
|
206
|
+
mime_type: "text/plain".to_string(),
|
|
207
|
+
metadata: Metadata::default(),
|
|
208
|
+
tables: vec![],
|
|
209
|
+
detected_languages: None,
|
|
210
|
+
chunks: None,
|
|
211
|
+
images: None,
|
|
212
|
+
pages: None,
|
|
213
|
+
};
|
|
214
|
+
|
|
215
|
+
let short_duration = processor.estimated_duration_ms(&short_result);
|
|
216
|
+
let long_duration = processor.estimated_duration_ms(&long_result);
|
|
217
|
+
|
|
218
|
+
assert!(long_duration > short_duration);
|
|
219
|
+
}
|
|
220
|
+
}
|
|
@@ -7,6 +7,40 @@ use crate::{KreuzbergError, Result};
|
|
|
7
7
|
use serde::{Deserialize, Serialize};
|
|
8
8
|
use std::path::Path;
|
|
9
9
|
|
|
10
|
+
/// Page extraction and tracking configuration.
|
|
11
|
+
///
|
|
12
|
+
/// Controls how pages are extracted, tracked, and represented in the extraction results.
|
|
13
|
+
/// When `None`, page tracking is disabled.
|
|
14
|
+
///
|
|
15
|
+
/// Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
|
|
16
|
+
/// when page boundaries are available and chunking is configured.
|
|
17
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
18
|
+
#[serde(default)]
|
|
19
|
+
pub struct PageConfig {
|
|
20
|
+
/// Extract pages as separate array (ExtractionResult.pages)
|
|
21
|
+
#[serde(default)]
|
|
22
|
+
pub extract_pages: bool,
|
|
23
|
+
|
|
24
|
+
/// Insert page markers in main content string
|
|
25
|
+
#[serde(default)]
|
|
26
|
+
pub insert_page_markers: bool,
|
|
27
|
+
|
|
28
|
+
/// Page marker format (use {page_num} placeholder)
|
|
29
|
+
/// Default: "\n\n<!-- PAGE {page_num} -->\n\n"
|
|
30
|
+
#[serde(default = "default_page_marker_format")]
|
|
31
|
+
pub marker_format: String,
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
impl Default for PageConfig {
|
|
35
|
+
fn default() -> Self {
|
|
36
|
+
Self {
|
|
37
|
+
extract_pages: false,
|
|
38
|
+
insert_page_markers: false,
|
|
39
|
+
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
10
44
|
/// Main extraction configuration.
|
|
11
45
|
///
|
|
12
46
|
/// This struct contains all configuration options for the extraction process.
|
|
@@ -50,6 +84,7 @@ pub struct ExtractionConfig {
|
|
|
50
84
|
pub images: Option<ImageExtractionConfig>,
|
|
51
85
|
|
|
52
86
|
/// PDF-specific options (None = use defaults)
|
|
87
|
+
#[cfg(feature = "pdf")]
|
|
53
88
|
#[serde(default)]
|
|
54
89
|
pub pdf_options: Option<PdfConfig>,
|
|
55
90
|
|
|
@@ -61,6 +96,10 @@ pub struct ExtractionConfig {
|
|
|
61
96
|
#[serde(default)]
|
|
62
97
|
pub language_detection: Option<LanguageDetectionConfig>,
|
|
63
98
|
|
|
99
|
+
/// Page extraction configuration (None = no page tracking)
|
|
100
|
+
#[serde(default)]
|
|
101
|
+
pub pages: Option<PageConfig>,
|
|
102
|
+
|
|
64
103
|
/// Keyword extraction configuration (None = no keyword extraction)
|
|
65
104
|
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
66
105
|
#[serde(default)]
|
|
@@ -225,6 +264,7 @@ pub struct ImageExtractionConfig {
|
|
|
225
264
|
}
|
|
226
265
|
|
|
227
266
|
/// PDF-specific configuration.
|
|
267
|
+
#[cfg(feature = "pdf")]
|
|
228
268
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
229
269
|
pub struct PdfConfig {
|
|
230
270
|
/// Extract images from PDF
|
|
@@ -277,6 +317,9 @@ fn default_eng() -> String {
|
|
|
277
317
|
fn default_tesseract_backend() -> String {
|
|
278
318
|
"tesseract".to_string()
|
|
279
319
|
}
|
|
320
|
+
fn default_page_marker_format() -> String {
|
|
321
|
+
"\n\n<!-- PAGE {page_num} -->\n\n".to_string()
|
|
322
|
+
}
|
|
280
323
|
fn default_chunk_size() -> usize {
|
|
281
324
|
1000
|
|
282
325
|
}
|
|
@@ -317,9 +360,11 @@ impl Default for ExtractionConfig {
|
|
|
317
360
|
force_ocr: false,
|
|
318
361
|
chunking: None,
|
|
319
362
|
images: None,
|
|
363
|
+
#[cfg(feature = "pdf")]
|
|
320
364
|
pdf_options: None,
|
|
321
365
|
token_reduction: None,
|
|
322
366
|
language_detection: None,
|
|
367
|
+
pages: None,
|
|
323
368
|
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
324
369
|
keywords: None,
|
|
325
370
|
postprocessor: None,
|
|
@@ -647,6 +692,7 @@ max_dpi = 600
|
|
|
647
692
|
}
|
|
648
693
|
|
|
649
694
|
#[test]
|
|
695
|
+
#[cfg(feature = "pdf")]
|
|
650
696
|
fn test_config_with_pdf_options() {
|
|
651
697
|
let dir = tempdir().unwrap();
|
|
652
698
|
let config_path = dir.path().join("kreuzberg.toml");
|
|
@@ -770,9 +816,10 @@ enabled = true
|
|
|
770
816
|
assert!(config.ocr.is_some());
|
|
771
817
|
assert!(config.chunking.is_some());
|
|
772
818
|
assert!(config.images.is_some());
|
|
773
|
-
assert!(config.pdf_options.is_some());
|
|
774
819
|
assert!(config.token_reduction.is_some());
|
|
775
820
|
assert!(config.language_detection.is_some());
|
|
821
|
+
#[cfg(feature = "pdf")]
|
|
822
|
+
assert!(config.pdf_options.is_some());
|
|
776
823
|
}
|
|
777
824
|
|
|
778
825
|
#[test]
|
|
@@ -838,6 +885,7 @@ enabled = true
|
|
|
838
885
|
}
|
|
839
886
|
|
|
840
887
|
#[test]
|
|
888
|
+
#[cfg(feature = "pdf")]
|
|
841
889
|
fn test_pdf_config_defaults() {
|
|
842
890
|
let dir = tempdir().unwrap();
|
|
843
891
|
let config_path = dir.path().join("kreuzberg.toml");
|
|
@@ -20,6 +20,7 @@ use crate::types::ExtractionResult;
|
|
|
20
20
|
#[cfg(feature = "office")]
|
|
21
21
|
use crate::types::LibreOfficeConversionResult;
|
|
22
22
|
use crate::{KreuzbergError, Result};
|
|
23
|
+
#[cfg(feature = "tokio-runtime")]
|
|
23
24
|
use once_cell::sync::Lazy;
|
|
24
25
|
#[cfg(feature = "office")]
|
|
25
26
|
use serde_json::json;
|
|
@@ -97,6 +98,12 @@ fn sanitize_path(path: &Path) -> String {
|
|
|
97
98
|
/// 2. If runtime creation fails, the process is already in a critical state
|
|
98
99
|
/// 3. This is a one-time initialization - if it fails, nothing will work
|
|
99
100
|
/// 4. Better to fail fast than return errors from every sync operation
|
|
101
|
+
///
|
|
102
|
+
/// # Availability
|
|
103
|
+
///
|
|
104
|
+
/// This static is only available when the `tokio-runtime` feature is enabled.
|
|
105
|
+
/// For WASM targets, use the truly synchronous extraction functions instead.
|
|
106
|
+
#[cfg(feature = "tokio-runtime")]
|
|
100
107
|
static GLOBAL_RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
|
|
101
108
|
tokio::runtime::Builder::new_multi_thread()
|
|
102
109
|
.enable_all()
|
|
@@ -310,13 +317,13 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
310
317
|
///
|
|
311
318
|
/// Individual file errors are captured in the result metadata. System errors
|
|
312
319
|
/// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
|
|
320
|
+
#[cfg(feature = "tokio-runtime")]
|
|
313
321
|
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
314
322
|
skip(config, paths),
|
|
315
323
|
fields(
|
|
316
324
|
extraction.batch_size = paths.len(),
|
|
317
325
|
)
|
|
318
326
|
))]
|
|
319
|
-
#[cfg(feature = "tokio-runtime")]
|
|
320
327
|
pub async fn batch_extract_file(
|
|
321
328
|
paths: Vec<impl AsRef<Path>>,
|
|
322
329
|
config: &ExtractionConfig,
|
|
@@ -380,6 +387,7 @@ pub async fn batch_extract_file(
|
|
|
380
387
|
detected_languages: None,
|
|
381
388
|
chunks: None,
|
|
382
389
|
images: None,
|
|
390
|
+
pages: None,
|
|
383
391
|
});
|
|
384
392
|
}
|
|
385
393
|
Err(join_err) => {
|
|
@@ -407,13 +415,13 @@ pub async fn batch_extract_file(
|
|
|
407
415
|
/// # Returns
|
|
408
416
|
///
|
|
409
417
|
/// A vector of `ExtractionResult` in the same order as the input.
|
|
418
|
+
#[cfg(feature = "tokio-runtime")]
|
|
410
419
|
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
411
420
|
skip(config, contents),
|
|
412
421
|
fields(
|
|
413
422
|
extraction.batch_size = contents.len(),
|
|
414
423
|
)
|
|
415
424
|
))]
|
|
416
|
-
#[cfg(feature = "tokio-runtime")]
|
|
417
425
|
pub async fn batch_extract_bytes(
|
|
418
426
|
contents: Vec<(&[u8], &str)>,
|
|
419
427
|
config: &ExtractionConfig,
|
|
@@ -483,6 +491,7 @@ pub async fn batch_extract_bytes(
|
|
|
483
491
|
detected_languages: None,
|
|
484
492
|
chunks: None,
|
|
485
493
|
images: None,
|
|
494
|
+
pages: None,
|
|
486
495
|
});
|
|
487
496
|
}
|
|
488
497
|
Err(join_err) => {
|
|
@@ -502,6 +511,10 @@ pub async fn batch_extract_bytes(
|
|
|
502
511
|
///
|
|
503
512
|
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
504
513
|
/// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
|
|
514
|
+
///
|
|
515
|
+
/// This function is only available with the `tokio-runtime` feature. For WASM targets,
|
|
516
|
+
/// use a truly synchronous extraction approach instead.
|
|
517
|
+
#[cfg(feature = "tokio-runtime")]
|
|
505
518
|
pub fn extract_file_sync(
|
|
506
519
|
path: impl AsRef<Path>,
|
|
507
520
|
mime_type: Option<&str>,
|
|
@@ -514,14 +527,31 @@ pub fn extract_file_sync(
|
|
|
514
527
|
///
|
|
515
528
|
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
516
529
|
/// a new runtime per call.
|
|
530
|
+
///
|
|
531
|
+
/// With the `tokio-runtime` feature, this blocks the current thread using the global
|
|
532
|
+
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
|
|
533
|
+
#[cfg(feature = "tokio-runtime")]
|
|
517
534
|
pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
518
535
|
GLOBAL_RUNTIME.block_on(extract_bytes(content, mime_type, config))
|
|
519
536
|
}
|
|
520
537
|
|
|
538
|
+
/// Synchronous wrapper for `extract_bytes` (WASM-compatible version).
|
|
539
|
+
///
|
|
540
|
+
/// This is a truly synchronous implementation without tokio runtime dependency.
|
|
541
|
+
/// It calls `extract_bytes_sync_impl()` to perform the extraction.
|
|
542
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
543
|
+
pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
544
|
+
extract_bytes_sync_impl(content.to_vec(), Some(mime_type.to_string()), Some(config.clone()))
|
|
545
|
+
}
|
|
546
|
+
|
|
521
547
|
/// Synchronous wrapper for `batch_extract_file`.
|
|
522
548
|
///
|
|
523
549
|
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
524
550
|
/// a new runtime per call.
|
|
551
|
+
///
|
|
552
|
+
/// This function is only available with the `tokio-runtime` feature. For WASM targets,
|
|
553
|
+
/// use a truly synchronous extraction approach instead.
|
|
554
|
+
#[cfg(feature = "tokio-runtime")]
|
|
525
555
|
pub fn batch_extract_file_sync(
|
|
526
556
|
paths: Vec<impl AsRef<Path>>,
|
|
527
557
|
config: &ExtractionConfig,
|
|
@@ -533,6 +563,11 @@ pub fn batch_extract_file_sync(
|
|
|
533
563
|
///
|
|
534
564
|
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
535
565
|
/// a new runtime per call.
|
|
566
|
+
///
|
|
567
|
+
/// With the `tokio-runtime` feature, this blocks the current thread using the global
|
|
568
|
+
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
|
|
569
|
+
/// that iterates through items and calls `extract_bytes_sync()`.
|
|
570
|
+
#[cfg(feature = "tokio-runtime")]
|
|
536
571
|
pub fn batch_extract_bytes_sync(
|
|
537
572
|
contents: Vec<(&[u8], &str)>,
|
|
538
573
|
config: &ExtractionConfig,
|
|
@@ -540,6 +575,103 @@ pub fn batch_extract_bytes_sync(
|
|
|
540
575
|
GLOBAL_RUNTIME.block_on(batch_extract_bytes(contents, config))
|
|
541
576
|
}
|
|
542
577
|
|
|
578
|
+
/// Synchronous wrapper for `batch_extract_bytes` (WASM-compatible version).
|
|
579
|
+
///
|
|
580
|
+
/// This is a truly synchronous implementation that iterates through items
|
|
581
|
+
/// and calls `extract_bytes_sync()` for each.
|
|
582
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
583
|
+
pub fn batch_extract_bytes_sync(
|
|
584
|
+
contents: Vec<(&[u8], &str)>,
|
|
585
|
+
config: &ExtractionConfig,
|
|
586
|
+
) -> Result<Vec<ExtractionResult>> {
|
|
587
|
+
let mut results = Vec::with_capacity(contents.len());
|
|
588
|
+
for (content, mime_type) in contents {
|
|
589
|
+
let result = extract_bytes_sync(content, mime_type, config);
|
|
590
|
+
results.push(result.unwrap_or_else(|e| {
|
|
591
|
+
use crate::types::{ErrorMetadata, Metadata};
|
|
592
|
+
ExtractionResult {
|
|
593
|
+
content: format!("Error: {}", e),
|
|
594
|
+
mime_type: "text/plain".to_string(),
|
|
595
|
+
metadata: Metadata {
|
|
596
|
+
error: Some(ErrorMetadata {
|
|
597
|
+
error_type: format!("{:?}", e),
|
|
598
|
+
message: e.to_string(),
|
|
599
|
+
}),
|
|
600
|
+
..Default::default()
|
|
601
|
+
},
|
|
602
|
+
tables: vec![],
|
|
603
|
+
detected_languages: None,
|
|
604
|
+
chunks: None,
|
|
605
|
+
images: None,
|
|
606
|
+
pages: None,
|
|
607
|
+
}
|
|
608
|
+
}));
|
|
609
|
+
}
|
|
610
|
+
Ok(results)
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
/// Synchronous extraction implementation for WASM compatibility.
|
|
614
|
+
///
|
|
615
|
+
/// This function performs extraction without requiring a tokio runtime.
|
|
616
|
+
/// It calls the sync extractor methods directly.
|
|
617
|
+
///
|
|
618
|
+
/// # Arguments
|
|
619
|
+
///
|
|
620
|
+
/// * `content` - The byte content to extract
|
|
621
|
+
/// * `mime_type` - Optional MIME type to validate/use
|
|
622
|
+
/// * `config` - Optional extraction configuration
|
|
623
|
+
///
|
|
624
|
+
/// # Returns
|
|
625
|
+
///
|
|
626
|
+
/// An `ExtractionResult` or a `KreuzbergError`
|
|
627
|
+
///
|
|
628
|
+
/// # Implementation Notes
|
|
629
|
+
///
|
|
630
|
+
/// This is called when the `tokio-runtime` feature is disabled.
|
|
631
|
+
/// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
|
|
632
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
633
|
+
fn extract_bytes_sync_impl(
|
|
634
|
+
content: Vec<u8>,
|
|
635
|
+
mime_type: Option<String>,
|
|
636
|
+
config: Option<ExtractionConfig>,
|
|
637
|
+
) -> Result<ExtractionResult> {
|
|
638
|
+
use crate::core::mime;
|
|
639
|
+
|
|
640
|
+
let config = config.unwrap_or_default();
|
|
641
|
+
|
|
642
|
+
// Validate MIME type if provided
|
|
643
|
+
let validated_mime = if let Some(mime) = mime_type {
|
|
644
|
+
mime::validate_mime_type(&mime)?
|
|
645
|
+
} else {
|
|
646
|
+
return Err(KreuzbergError::Validation {
|
|
647
|
+
message: "MIME type is required for synchronous extraction".to_string(),
|
|
648
|
+
source: None,
|
|
649
|
+
});
|
|
650
|
+
};
|
|
651
|
+
|
|
652
|
+
// Ensure extractors are initialized
|
|
653
|
+
crate::extractors::ensure_initialized()?;
|
|
654
|
+
|
|
655
|
+
// Get the appropriate extractor
|
|
656
|
+
let extractor = get_extractor(&validated_mime)?;
|
|
657
|
+
|
|
658
|
+
// Check if extractor supports synchronous extraction
|
|
659
|
+
let sync_extractor = extractor.as_sync_extractor().ok_or_else(|| {
|
|
660
|
+
KreuzbergError::UnsupportedFormat(format!(
|
|
661
|
+
"Extractor for '{}' does not support synchronous extraction",
|
|
662
|
+
validated_mime
|
|
663
|
+
))
|
|
664
|
+
})?;
|
|
665
|
+
|
|
666
|
+
// Call the sync extract method
|
|
667
|
+
let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
|
|
668
|
+
|
|
669
|
+
// Run post-processing pipeline (sync version)
|
|
670
|
+
result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
|
|
671
|
+
|
|
672
|
+
Ok(result)
|
|
673
|
+
}
|
|
674
|
+
|
|
543
675
|
async fn extract_file_with_extractor(
|
|
544
676
|
path: &Path,
|
|
545
677
|
mime_type: &str,
|
|
@@ -37,9 +37,11 @@ pub mod mime;
|
|
|
37
37
|
pub mod pipeline;
|
|
38
38
|
|
|
39
39
|
pub use config::{
|
|
40
|
-
ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig,
|
|
41
|
-
TokenReductionConfig,
|
|
40
|
+
ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, TokenReductionConfig,
|
|
42
41
|
};
|
|
42
|
+
|
|
43
|
+
#[cfg(feature = "pdf")]
|
|
44
|
+
pub use config::PdfConfig;
|
|
43
45
|
#[cfg(feature = "tokio-runtime")]
|
|
44
46
|
pub use extractor::{batch_extract_bytes, batch_extract_file};
|
|
45
47
|
pub use extractor::{extract_bytes, extract_file};
|