kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -44,10 +44,10 @@ impl ExtractionConfig {
44
44
  let config: Self = toml::from_str(&content)
45
45
  .map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))?;
46
46
 
47
- let config_arc = Arc::new(config.clone());
48
- CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
47
+ let config_arc = Arc::new(config);
48
+ CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
49
49
 
50
- Ok(config)
50
+ Ok((*config_arc).clone())
51
51
  }
52
52
 
53
53
  /// Load configuration from a YAML file.
@@ -72,10 +72,10 @@ impl ExtractionConfig {
72
72
  let config: Self = serde_yaml_ng::from_str(&content)
73
73
  .map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))?;
74
74
 
75
- let config_arc = Arc::new(config.clone());
76
- CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
75
+ let config_arc = Arc::new(config);
76
+ CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
77
77
 
78
- Ok(config)
78
+ Ok((*config_arc).clone())
79
79
  }
80
80
 
81
81
  /// Load configuration from a JSON file.
@@ -100,10 +100,10 @@ impl ExtractionConfig {
100
100
  let config: Self = serde_json::from_str(&content)
101
101
  .map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))?;
102
102
 
103
- let config_arc = Arc::new(config.clone());
104
- CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
103
+ let config_arc = Arc::new(config);
104
+ CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
105
105
 
106
- Ok(config)
106
+ Ok((*config_arc).clone())
107
107
  }
108
108
 
109
109
  /// Load configuration from a file, auto-detecting format by extension.
@@ -169,10 +169,10 @@ impl ExtractionConfig {
169
169
  }
170
170
  };
171
171
 
172
- let config_arc = Arc::new(config.clone());
173
- CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
172
+ let config_arc = Arc::new(config);
173
+ CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
174
174
 
175
- Ok(config)
175
+ Ok((*config_arc).clone())
176
176
  }
177
177
 
178
178
  /// Discover configuration file in parent directories.
@@ -17,4 +17,4 @@ pub use ocr::OcrConfig;
17
17
  pub use page::PageConfig;
18
18
  #[cfg(feature = "pdf")]
19
19
  pub use pdf::{HierarchyConfig, PdfConfig};
20
- pub use processing::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, PostProcessorConfig};
20
+ pub use processing::{ChunkerType, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, PostProcessorConfig};
@@ -7,6 +7,19 @@ use serde::{Deserialize, Serialize};
7
7
  use std::collections::HashSet;
8
8
  use std::path::PathBuf;
9
9
 
10
+ /// Type of text chunker to use.
11
+ ///
12
+ /// # Variants
13
+ ///
14
+ /// * `Text` - Generic text splitter, splits on whitespace and punctuation
15
+ /// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
16
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
17
+ pub enum ChunkerType {
18
+ #[default]
19
+ Text,
20
+ Markdown,
21
+ }
22
+
10
23
  /// Post-processor configuration.
11
24
  #[derive(Debug, Clone, Serialize, Deserialize)]
12
25
  pub struct PostProcessorConfig {
@@ -59,15 +72,34 @@ impl Default for PostProcessorConfig {
59
72
  }
60
73
 
61
74
  /// Chunking configuration.
75
+ ///
76
+ /// Configures text chunking for document content, including chunk size,
77
+ /// overlap, trimming behavior, and optional embeddings.
62
78
  #[derive(Debug, Clone, Serialize, Deserialize)]
63
79
  pub struct ChunkingConfig {
64
80
  /// Maximum characters per chunk
65
- #[serde(default = "default_chunk_size")]
66
- pub max_chars: usize,
81
+ ///
82
+ /// Default: 1000
83
+ #[serde(default = "default_chunk_size", rename = "max_chars", alias = "max_characters")]
84
+ pub max_characters: usize,
67
85
 
68
86
  /// Overlap between chunks in characters
69
- #[serde(default = "default_chunk_overlap")]
70
- pub max_overlap: usize,
87
+ ///
88
+ /// Default: 200
89
+ #[serde(default = "default_chunk_overlap", rename = "max_overlap", alias = "overlap")]
90
+ pub overlap: usize,
91
+
92
+ /// Whether to trim whitespace from chunk boundaries
93
+ ///
94
+ /// Default: true
95
+ #[serde(default = "default_trim")]
96
+ pub trim: bool,
97
+
98
+ /// Type of chunker to use (Text or Markdown)
99
+ ///
100
+ /// Default: Text
101
+ #[serde(default = "default_chunker_type")]
102
+ pub chunker_type: ChunkerType,
71
103
 
72
104
  /// Optional embedding configuration for chunk embeddings
73
105
  #[serde(skip_serializing_if = "Option::is_none")]
@@ -78,6 +110,19 @@ pub struct ChunkingConfig {
78
110
  pub preset: Option<String>,
79
111
  }
80
112
 
113
+ impl Default for ChunkingConfig {
114
+ fn default() -> Self {
115
+ Self {
116
+ max_characters: 1000,
117
+ overlap: 200,
118
+ trim: true,
119
+ chunker_type: ChunkerType::Text,
120
+ embedding: None,
121
+ preset: None,
122
+ }
123
+ }
124
+ }
125
+
81
126
  /// Embedding configuration for text chunks.
82
127
  ///
83
128
  /// Configures embedding generation using ONNX models via fastembed-rs.
@@ -149,6 +194,14 @@ fn default_chunk_overlap() -> usize {
149
194
  200
150
195
  }
151
196
 
197
+ fn default_trim() -> bool {
198
+ true
199
+ }
200
+
201
+ fn default_chunker_type() -> ChunkerType {
202
+ ChunkerType::Text
203
+ }
204
+
152
205
  fn default_normalize() -> bool {
153
206
  true
154
207
  }
@@ -196,13 +249,17 @@ mod tests {
196
249
  #[test]
197
250
  fn test_chunking_config_defaults() {
198
251
  let config = ChunkingConfig {
199
- max_chars: 1000,
200
- max_overlap: 200,
252
+ max_characters: 1000,
253
+ overlap: 200,
254
+ trim: true,
255
+ chunker_type: ChunkerType::Text,
201
256
  embedding: None,
202
257
  preset: None,
203
258
  };
204
- assert_eq!(config.max_chars, 1000);
205
- assert_eq!(config.max_overlap, 200);
259
+ assert_eq!(config.max_characters, 1000);
260
+ assert_eq!(config.overlap, 200);
261
+ assert!(config.trim);
262
+ assert_eq!(config.chunker_type, ChunkerType::Text);
206
263
  }
207
264
 
208
265
  #[test]
@@ -141,6 +141,14 @@ mod tests {
141
141
  assert!(validate_language_code("DEU").is_ok());
142
142
  }
143
143
 
144
+ #[test]
145
+ fn test_validate_language_code_all_keyword() {
146
+ assert!(validate_language_code("all").is_ok());
147
+ assert!(validate_language_code("ALL").is_ok());
148
+ assert!(validate_language_code("All").is_ok());
149
+ assert!(validate_language_code("*").is_ok());
150
+ }
151
+
144
152
  #[test]
145
153
  fn test_validate_language_code_invalid() {
146
154
  let result = validate_language_code("invalid");
@@ -167,6 +167,11 @@ pub fn validate_ocr_backend(backend: &str) -> Result<()> {
167
167
  pub fn validate_language_code(code: &str) -> Result<()> {
168
168
  let code_lower = code.to_lowercase();
169
169
 
170
+ // Accept "all" and "*" as special values to auto-detect installed languages
171
+ if code_lower == "all" || code_lower == "*" {
172
+ return Ok(());
173
+ }
174
+
170
175
  if VALID_LANGUAGE_CODES.contains(&code_lower.as_str()) {
171
176
  return Ok(());
172
177
  }
@@ -6,6 +6,7 @@
6
6
  use crate::core::config::ExtractionConfig;
7
7
  use crate::types::{ErrorMetadata, ExtractionResult, Metadata};
8
8
  use crate::{KreuzbergError, Result};
9
+ use std::borrow::Cow;
9
10
  use std::path::Path;
10
11
  use std::sync::Arc;
11
12
 
@@ -65,9 +66,9 @@ pub async fn batch_extract_file(
65
66
  return Ok(vec![]);
66
67
  }
67
68
 
68
- let config = Arc::new(config.clone());
69
+ let config_arc = Arc::new(config.clone());
69
70
 
70
- let max_concurrent = config
71
+ let max_concurrent = config_arc
71
72
  .max_concurrent_extractions
72
73
  .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
73
74
  let semaphore = Arc::new(Semaphore::new(max_concurrent));
@@ -76,7 +77,7 @@ pub async fn batch_extract_file(
76
77
 
77
78
  for (index, path) in paths.into_iter().enumerate() {
78
79
  let path_buf = path.as_ref().to_path_buf();
79
- let config_clone = Arc::clone(&config);
80
+ let config_clone = Arc::clone(&config_arc);
80
81
  let semaphore_clone = Arc::clone(&semaphore);
81
82
 
82
83
  tasks.spawn(async move {
@@ -108,7 +109,7 @@ pub async fn batch_extract_file(
108
109
 
109
110
  results[index] = Some(ExtractionResult {
110
111
  content: format!("Error: {}", e),
111
- mime_type: "text/plain".to_string(),
112
+ mime_type: Cow::Borrowed("text/plain"),
112
113
  metadata,
113
114
  tables: vec![],
114
115
  detected_languages: None,
@@ -180,10 +181,9 @@ pub async fn batch_extract_bytes(
180
181
  return Ok(vec![]);
181
182
  }
182
183
 
183
- let batch_config = config.clone();
184
- let config = Arc::new(batch_config);
184
+ let config_arc = Arc::new(config.clone());
185
185
 
186
- let max_concurrent = config
186
+ let max_concurrent = config_arc
187
187
  .max_concurrent_extractions
188
188
  .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
189
189
  let semaphore = Arc::new(Semaphore::new(max_concurrent));
@@ -191,7 +191,7 @@ pub async fn batch_extract_bytes(
191
191
  let mut tasks = JoinSet::new();
192
192
 
193
193
  for (index, (bytes, mime_type)) in contents.into_iter().enumerate() {
194
- let config_clone = Arc::clone(&config);
194
+ let config_clone = Arc::clone(&config_arc);
195
195
  let semaphore_clone = Arc::clone(&semaphore);
196
196
 
197
197
  tasks.spawn(async move {
@@ -224,7 +224,7 @@ pub async fn batch_extract_bytes(
224
224
 
225
225
  results[index] = Some(ExtractionResult {
226
226
  content: format!("Error: {}", e),
227
- mime_type: "text/plain".to_string(),
227
+ mime_type: Cow::Borrowed("text/plain"),
228
228
  metadata,
229
229
  tables: vec![],
230
230
  detected_languages: None,
@@ -18,6 +18,8 @@ use crate::types::ExtractionResult;
18
18
  use crate::types::LibreOfficeConversionResult;
19
19
  #[cfg(feature = "office")]
20
20
  use serde_json::json;
21
+ #[cfg(feature = "office")]
22
+ use std::borrow::Cow;
21
23
  use std::path::Path;
22
24
 
23
25
  #[cfg(feature = "office")]
@@ -226,9 +228,9 @@ pub(in crate::core::extractor) fn apply_libreoffice_metadata(
226
228
  legacy_mime: &str,
227
229
  conversion: &LibreOfficeConversionResult,
228
230
  ) {
229
- result.mime_type = pool_mime_type(legacy_mime);
231
+ result.mime_type = pool_mime_type(legacy_mime).into();
230
232
  result.metadata.additional.insert(
231
- "libreoffice_conversion".to_string(),
233
+ Cow::Borrowed("libreoffice_conversion"),
232
234
  json!({
233
235
  "converter": "libreoffice",
234
236
  "original_format": conversion.original_format,
@@ -24,18 +24,18 @@
24
24
  /// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
25
25
  #[cfg(not(feature = "tokio-runtime"))]
26
26
  pub(super) fn extract_bytes_sync_impl(
27
- content: Vec<u8>,
28
- mime_type: Option<String>,
29
- config: Option<crate::core::config::ExtractionConfig>,
27
+ content: &[u8],
28
+ mime_type: Option<&str>,
29
+ config: Option<&crate::core::config::ExtractionConfig>,
30
30
  ) -> crate::Result<crate::types::ExtractionResult> {
31
31
  use crate::KreuzbergError;
32
32
  use crate::core::extractor::helpers::get_extractor;
33
33
  use crate::core::mime;
34
34
 
35
- let config = config.unwrap_or_default();
35
+ let cfg = config.cloned().unwrap_or_default();
36
36
 
37
37
  let validated_mime = if let Some(mime) = mime_type {
38
- mime::validate_mime_type(&mime)?
38
+ mime::validate_mime_type(mime)?
39
39
  } else {
40
40
  return Err(KreuzbergError::Validation {
41
41
  message: "MIME type is required for synchronous extraction".to_string(),
@@ -54,9 +54,9 @@ pub(super) fn extract_bytes_sync_impl(
54
54
  ))
55
55
  })?;
56
56
 
57
- let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
57
+ let mut result = sync_extractor.extract_sync(content, &validated_mime, &cfg)?;
58
58
 
59
- result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
59
+ result = crate::core::pipeline::run_pipeline_sync(result, &cfg)?;
60
60
 
61
61
  Ok(result)
62
62
  }
@@ -107,7 +107,7 @@ pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionCo
107
107
  /// It calls `extract_bytes_sync_impl()` to perform the extraction.
108
108
  #[cfg(not(feature = "tokio-runtime"))]
109
109
  pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
110
- super::legacy::extract_bytes_sync_impl(content.to_vec(), Some(mime_type.to_string()), Some(config.clone()))
110
+ super::legacy::extract_bytes_sync_impl(content, Some(mime_type), Some(config))
111
111
  }
112
112
 
113
113
  /// Synchronous wrapper for `batch_extract_file`.
@@ -180,14 +180,14 @@ pub fn batch_extract_bytes_sync(
180
180
  config: &ExtractionConfig,
181
181
  ) -> Result<Vec<ExtractionResult>> {
182
182
  use crate::types::{ErrorMetadata, Metadata};
183
- use crate::utils::intern_mime_type;
183
+ use std::borrow::Cow;
184
184
 
185
185
  let mut results = Vec::with_capacity(contents.len());
186
186
  for (content, mime_type) in contents {
187
187
  let result = extract_bytes_sync(&content, &mime_type, config);
188
188
  results.push(result.unwrap_or_else(|e| ExtractionResult {
189
189
  content: format!("Error: {}", e),
190
- mime_type: intern_mime_type("text/plain").to_string(),
190
+ mime_type: Cow::Borrowed("text/plain"),
191
191
  metadata: Metadata {
192
192
  error: Some(ErrorMetadata {
193
193
  error_type: format!("{:?}", e),
@@ -7,6 +7,7 @@ use crate::core::config::ExtractionConfig;
7
7
  use crate::plugins::ProcessingStage;
8
8
  use crate::types::ExtractionResult;
9
9
  use crate::{KreuzbergError, Result};
10
+ use std::borrow::Cow;
10
11
 
11
12
  /// Execute all registered post-processors by stage.
12
13
  pub(super) async fn execute_processors(
@@ -37,7 +38,7 @@ pub(super) async fn execute_processors(
37
38
  }
38
39
  Err(err) => {
39
40
  result.metadata.additional.insert(
40
- format!("processing_error_{processor_name}"),
41
+ Cow::Owned(format!("processing_error_{processor_name}")),
41
42
  serde_json::Value::String(err.to_string()),
42
43
  );
43
44
  }
@@ -6,27 +6,21 @@
6
6
  use crate::Result;
7
7
  use crate::core::config::ExtractionConfig;
8
8
  use crate::types::ExtractionResult;
9
+ use std::borrow::Cow;
9
10
 
10
11
  /// Execute chunking if configured.
11
12
  pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
12
13
  #[cfg(feature = "chunking")]
13
14
  if let Some(ref chunking_config) = config.chunking {
14
- let chunk_config = crate::chunking::ChunkingConfig {
15
- max_characters: chunking_config.max_chars,
16
- overlap: chunking_config.max_overlap,
17
- trim: true,
18
- chunker_type: crate::chunking::ChunkerType::Text,
19
- };
20
-
21
15
  let page_boundaries = result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref());
22
16
 
23
- match crate::chunking::chunk_text(&result.content, &chunk_config, page_boundaries) {
17
+ match crate::chunking::chunk_text(&result.content, chunking_config, page_boundaries) {
24
18
  Ok(chunking_result) => {
25
19
  result.chunks = Some(chunking_result.chunks);
26
20
 
27
21
  if let Some(ref chunks) = result.chunks {
28
22
  result.metadata.additional.insert(
29
- "chunk_count".to_string(),
23
+ Cow::Borrowed("chunk_count"),
30
24
  serde_json::Value::Number(serde_json::Number::from(chunks.len())),
31
25
  );
32
26
  }
@@ -40,13 +34,13 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
40
34
  result
41
35
  .metadata
42
36
  .additional
43
- .insert("embeddings_generated".to_string(), serde_json::Value::Bool(true));
37
+ .insert(Cow::Borrowed("embeddings_generated"), serde_json::Value::Bool(true));
44
38
  }
45
39
  Err(e) => {
46
- result
47
- .metadata
48
- .additional
49
- .insert("embedding_error".to_string(), serde_json::Value::String(e.to_string()));
40
+ result.metadata.additional.insert(
41
+ Cow::Borrowed("embedding_error"),
42
+ serde_json::Value::String(e.to_string()),
43
+ );
50
44
  }
51
45
  }
52
46
  }
@@ -54,16 +48,16 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
54
48
  #[cfg(not(feature = "embeddings"))]
55
49
  if chunking_config.embedding.is_some() {
56
50
  result.metadata.additional.insert(
57
- "embedding_error".to_string(),
51
+ Cow::Borrowed("embedding_error"),
58
52
  serde_json::Value::String("Embeddings feature not enabled".to_string()),
59
53
  );
60
54
  }
61
55
  }
62
56
  Err(e) => {
63
- result
64
- .metadata
65
- .additional
66
- .insert("chunking_error".to_string(), serde_json::Value::String(e.to_string()));
57
+ result.metadata.additional.insert(
58
+ Cow::Borrowed("chunking_error"),
59
+ serde_json::Value::String(e.to_string()),
60
+ );
67
61
  }
68
62
  }
69
63
  }
@@ -71,7 +65,7 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
71
65
  #[cfg(not(feature = "chunking"))]
72
66
  if config.chunking.is_some() {
73
67
  result.metadata.additional.insert(
74
- "chunking_error".to_string(),
68
+ Cow::Borrowed("chunking_error"),
75
69
  serde_json::Value::String("Chunking feature not enabled".to_string()),
76
70
  );
77
71
  }
@@ -89,7 +83,7 @@ pub(super) fn execute_language_detection(result: &mut ExtractionResult, config:
89
83
  }
90
84
  Err(e) => {
91
85
  result.metadata.additional.insert(
92
- "language_detection_error".to_string(),
86
+ Cow::Borrowed("language_detection_error"),
93
87
  serde_json::Value::String(e.to_string()),
94
88
  );
95
89
  }
@@ -99,7 +93,7 @@ pub(super) fn execute_language_detection(result: &mut ExtractionResult, config:
99
93
  #[cfg(not(feature = "language-detection"))]
100
94
  if config.language_detection.is_some() {
101
95
  result.metadata.additional.insert(
102
- "language_detection_error".to_string(),
96
+ Cow::Borrowed("language_detection_error"),
103
97
  serde_json::Value::String("Language detection feature not enabled".to_string()),
104
98
  );
105
99
  }
@@ -5,6 +5,7 @@
5
5
 
6
6
  use crate::core::config::OutputFormat;
7
7
  use crate::types::ExtractionResult;
8
+ use std::borrow::Cow;
8
9
 
9
10
  /// Apply output format conversion to the extraction result.
10
11
  ///
@@ -23,7 +24,7 @@ use crate::types::ExtractionResult;
23
24
  /// * `output_format` - The desired output format
24
25
  pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputFormat) {
25
26
  // Check if content was already formatted during extraction
26
- let already_formatted = match result.mime_type.as_str() {
27
+ let already_formatted = match &*result.mime_type {
27
28
  "text/markdown" if output_format == OutputFormat::Markdown => true,
28
29
  "text/djot" if output_format == OutputFormat::Djot => true,
29
30
  _ => false,
@@ -46,7 +47,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
46
47
  Err(e) => {
47
48
  // Keep original content on error, record error in metadata
48
49
  result.metadata.additional.insert(
49
- "output_format_error".to_string(),
50
+ Cow::Borrowed("output_format_error"),
50
51
  serde_json::Value::String(format!("Failed to convert to djot: {}", e)),
51
52
  );
52
53
  }
@@ -66,7 +67,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
66
67
  Err(e) => {
67
68
  // Keep original content on error, record error in metadata
68
69
  result.metadata.additional.insert(
69
- "output_format_error".to_string(),
70
+ Cow::Borrowed("output_format_error"),
70
71
  serde_json::Value::String(format!("Failed to convert to markdown: {}", e)),
71
72
  );
72
73
  }
@@ -87,7 +88,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
87
88
  Err(e) => {
88
89
  // Keep original content on error, record error in metadata
89
90
  result.metadata.additional.insert(
90
- "output_format_error".to_string(),
91
+ Cow::Borrowed("output_format_error"),
91
92
  serde_json::Value::String(format!("Failed to convert djot to HTML: {}", e)),
92
93
  );
93
94
  }
@@ -96,7 +97,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
96
97
  Err(e) => {
97
98
  // Keep original content on error, record error in metadata
98
99
  result.metadata.additional.insert(
99
- "output_format_error".to_string(),
100
+ Cow::Borrowed("output_format_error"),
100
101
  serde_json::Value::String(format!("Failed to generate djot for HTML conversion: {}", e)),
101
102
  );
102
103
  }
@@ -128,7 +129,7 @@ mod tests {
128
129
  fn test_apply_output_format_plain() {
129
130
  let mut result = ExtractionResult {
130
131
  content: "Hello World".to_string(),
131
- mime_type: "text/plain".to_string(),
132
+ mime_type: Cow::Borrowed("text/plain"),
132
133
  metadata: Metadata::default(),
133
134
  tables: vec![],
134
135
  detected_languages: None,
@@ -151,7 +152,7 @@ mod tests {
151
152
 
152
153
  let mut result = ExtractionResult {
153
154
  content: "Hello World".to_string(),
154
- mime_type: "text/djot".to_string(),
155
+ mime_type: Cow::Borrowed("text/djot"),
155
156
  metadata: Metadata::default(),
156
157
  tables: vec![],
157
158
  detected_languages: None,
@@ -180,7 +181,7 @@ mod tests {
180
181
  images: vec![],
181
182
  links: vec![],
182
183
  footnotes: vec![],
183
- attributes: std::collections::HashMap::new(),
184
+ attributes: Vec::new(),
184
185
  }),
185
186
  };
186
187
 
@@ -194,7 +195,7 @@ mod tests {
194
195
  fn test_apply_output_format_djot_without_djot_content() {
195
196
  let mut result = ExtractionResult {
196
197
  content: "Hello World".to_string(),
197
- mime_type: "text/plain".to_string(),
198
+ mime_type: Cow::Borrowed("text/plain"),
198
199
  metadata: Metadata::default(),
199
200
  tables: vec![],
200
201
  detected_languages: None,
@@ -216,7 +217,7 @@ mod tests {
216
217
  fn test_apply_output_format_html() {
217
218
  let mut result = ExtractionResult {
218
219
  content: "Hello World".to_string(),
219
- mime_type: "text/plain".to_string(),
220
+ mime_type: Cow::Borrowed("text/plain"),
220
221
  metadata: Metadata::default(),
221
222
  tables: vec![],
222
223
  detected_languages: None,
@@ -239,7 +240,7 @@ mod tests {
239
240
  fn test_apply_output_format_html_escapes_special_chars() {
240
241
  let mut result = ExtractionResult {
241
242
  content: "<script>alert('XSS')</script>".to_string(),
242
- mime_type: "text/plain".to_string(),
243
+ mime_type: Cow::Borrowed("text/plain"),
243
244
  metadata: Metadata::default(),
244
245
  tables: vec![],
245
246
  detected_languages: None,
@@ -262,7 +263,7 @@ mod tests {
262
263
  fn test_apply_output_format_markdown() {
263
264
  let mut result = ExtractionResult {
264
265
  content: "Hello World".to_string(),
265
- mime_type: "text/plain".to_string(),
266
+ mime_type: Cow::Borrowed("text/plain"),
266
267
  metadata: Metadata::default(),
267
268
  tables: vec![],
268
269
  detected_languages: None,
@@ -281,8 +282,9 @@ mod tests {
281
282
 
282
283
  #[test]
283
284
  fn test_apply_output_format_preserves_metadata() {
284
- let mut additional = std::collections::HashMap::new();
285
- additional.insert("custom_key".to_string(), serde_json::json!("custom_value"));
285
+ use ahash::AHashMap;
286
+ let mut additional = AHashMap::new();
287
+ additional.insert(Cow::Borrowed("custom_key"), serde_json::json!("custom_value"));
286
288
  let metadata = Metadata {
287
289
  title: Some("Test Title".to_string()),
288
290
  additional,
@@ -291,7 +293,7 @@ mod tests {
291
293
 
292
294
  let mut result = ExtractionResult {
293
295
  content: "Hello World".to_string(),
294
- mime_type: "text/plain".to_string(),
296
+ mime_type: Cow::Borrowed("text/plain"),
295
297
  metadata,
296
298
  tables: vec![],
297
299
  detected_languages: None,
@@ -324,7 +326,7 @@ mod tests {
324
326
 
325
327
  let mut result = ExtractionResult {
326
328
  content: "Hello World".to_string(),
327
- mime_type: "text/plain".to_string(),
329
+ mime_type: Cow::Borrowed("text/plain"),
328
330
  metadata: Metadata::default(),
329
331
  tables: vec![table],
330
332
  detected_languages: None,
@@ -367,12 +369,12 @@ mod tests {
367
369
  images: vec![],
368
370
  links: vec![],
369
371
  footnotes: vec![],
370
- attributes: std::collections::HashMap::new(),
372
+ attributes: Vec::new(),
371
373
  };
372
374
 
373
375
  let mut result = ExtractionResult {
374
376
  content: "test".to_string(),
375
- mime_type: "text/djot".to_string(),
377
+ mime_type: Cow::Borrowed("text/djot"),
376
378
  metadata: Metadata::default(),
377
379
  tables: vec![],
378
380
  detected_languages: None,