kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -5,7 +5,9 @@ use crate::core::config::ExtractionConfig;
5
5
  use crate::extractors::SyncExtractor;
6
6
  use crate::plugins::{DocumentExtractor, Plugin};
7
7
  use crate::types::{EmailMetadata, ExtractionResult, Metadata};
8
+ use ahash::AHashMap;
8
9
  use async_trait::async_trait;
10
+ use std::borrow::Cow;
9
11
  #[cfg(feature = "tokio-runtime")]
10
12
  use std::path::Path;
11
13
 
@@ -66,14 +68,14 @@ impl SyncExtractor for EmailExtractor {
66
68
  attachments: attachment_names,
67
69
  };
68
70
 
69
- let mut additional = std::collections::HashMap::new();
71
+ let mut additional = AHashMap::new();
70
72
  for (key, value) in &email_result.metadata {
71
- additional.insert(key.clone(), serde_json::json!(value));
73
+ additional.insert(Cow::Owned(key.clone()), serde_json::json!(value));
72
74
  }
73
75
 
74
76
  Ok(ExtractionResult {
75
77
  content: text,
76
- mime_type: mime_type.to_string(),
78
+ mime_type: mime_type.to_string().into(),
77
79
  metadata: Metadata {
78
80
  format: Some(crate::types::FormatMetadata::Email(email_metadata)),
79
81
  subject: email_result.subject.clone(),
@@ -27,24 +27,24 @@ pub(super) fn extract_metadata(opf_xml: &str) -> Result<(OepbMetadata, BTreeMap<
27
27
 
28
28
  let (epub_metadata, _) = parse_opf(opf_xml)?;
29
29
 
30
- if let Some(identifier) = epub_metadata.identifier.clone() {
31
- additional_metadata.insert("identifier".to_string(), serde_json::json!(identifier));
30
+ if let Some(ref identifier) = epub_metadata.identifier {
31
+ additional_metadata.insert("identifier".to_string(), serde_json::json!(identifier.clone()));
32
32
  }
33
33
 
34
- if let Some(publisher) = epub_metadata.publisher.clone() {
35
- additional_metadata.insert("publisher".to_string(), serde_json::json!(publisher));
34
+ if let Some(ref publisher) = epub_metadata.publisher {
35
+ additional_metadata.insert("publisher".to_string(), serde_json::json!(publisher.clone()));
36
36
  }
37
37
 
38
- if let Some(subject) = epub_metadata.subject.clone() {
39
- additional_metadata.insert("subject".to_string(), serde_json::json!(subject));
38
+ if let Some(ref subject) = epub_metadata.subject {
39
+ additional_metadata.insert("subject".to_string(), serde_json::json!(subject.clone()));
40
40
  }
41
41
 
42
- if let Some(description) = epub_metadata.description.clone() {
43
- additional_metadata.insert("description".to_string(), serde_json::json!(description));
42
+ if let Some(ref description) = epub_metadata.description {
43
+ additional_metadata.insert("description".to_string(), serde_json::json!(description.clone()));
44
44
  }
45
45
 
46
- if let Some(rights) = epub_metadata.rights.clone() {
47
- additional_metadata.insert("rights".to_string(), serde_json::json!(rights));
46
+ if let Some(ref rights) = epub_metadata.rights {
47
+ additional_metadata.insert("rights".to_string(), serde_json::json!(rights.clone()));
48
48
  }
49
49
 
50
50
  Ok((epub_metadata, additional_metadata))
@@ -19,7 +19,9 @@ use crate::Result;
19
19
  use crate::core::config::ExtractionConfig;
20
20
  use crate::plugins::{DocumentExtractor, Plugin};
21
21
  use crate::types::{ExtractionResult, Metadata};
22
+ use ahash::AHashMap;
22
23
  use async_trait::async_trait;
24
+ use std::borrow::Cow;
23
25
  use std::io::Cursor;
24
26
  use zip::ZipArchive;
25
27
 
@@ -112,12 +114,14 @@ impl DocumentExtractor for EpubExtractor {
112
114
  let extracted_content = extract_content(&mut archive, &opf_path, &manifest_dir)?;
113
115
 
114
116
  let (epub_metadata, additional_metadata) = extract_metadata(&opf_xml)?;
115
- let metadata_map: std::collections::HashMap<String, serde_json::Value> =
116
- additional_metadata.into_iter().collect();
117
+ let metadata_map: AHashMap<Cow<'static, str>, serde_json::Value> = additional_metadata
118
+ .into_iter()
119
+ .map(|(k, v)| (Cow::Owned(k), v))
120
+ .collect();
117
121
 
118
122
  Ok(ExtractionResult {
119
123
  content: extracted_content,
120
- mime_type: mime_type.to_string(),
124
+ mime_type: mime_type.to_string().into(),
121
125
  metadata: Metadata {
122
126
  title: epub_metadata.title,
123
127
  authors: epub_metadata.creator.map(|c| vec![c]),
@@ -4,7 +4,9 @@ use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
5
  use crate::plugins::{DocumentExtractor, Plugin};
6
6
  use crate::types::{ExcelMetadata, ExtractionResult, Metadata, Table};
7
+ use ahash::AHashMap;
7
8
  use async_trait::async_trait;
9
+ use std::borrow::Cow;
8
10
  use std::path::Path;
9
11
 
10
12
  /// Excel spreadsheet extractor using calamine.
@@ -120,16 +122,16 @@ impl DocumentExtractor for ExcelExtractor {
120
122
  sheet_names,
121
123
  };
122
124
 
123
- let mut additional = std::collections::HashMap::new();
125
+ let mut additional = AHashMap::new();
124
126
  for (key, value) in &workbook.metadata {
125
127
  if key != "sheet_count" && key != "sheet_names" {
126
- additional.insert(key.clone(), serde_json::json!(value));
128
+ additional.insert(Cow::Owned(key.clone()), serde_json::json!(value));
127
129
  }
128
130
  }
129
131
 
130
132
  Ok(ExtractionResult {
131
133
  content: markdown,
132
- mime_type: mime_type.to_string(),
134
+ mime_type: mime_type.to_string().into(),
133
135
  metadata: Metadata {
134
136
  format: Some(crate::types::FormatMetadata::Excel(excel_metadata)),
135
137
  additional,
@@ -166,16 +168,16 @@ impl DocumentExtractor for ExcelExtractor {
166
168
  sheet_names,
167
169
  };
168
170
 
169
- let mut additional = std::collections::HashMap::new();
171
+ let mut additional = AHashMap::new();
170
172
  for (key, value) in &workbook.metadata {
171
173
  if key != "sheet_count" && key != "sheet_names" {
172
- additional.insert(key.clone(), serde_json::json!(value));
174
+ additional.insert(Cow::Owned(key.clone()), serde_json::json!(value));
173
175
  }
174
176
  }
175
177
 
176
178
  Ok(ExtractionResult {
177
179
  content: markdown,
178
- mime_type: mime_type.to_string(),
180
+ mime_type: mime_type.to_string().into(),
179
181
  metadata: Metadata {
180
182
  format: Some(crate::types::FormatMetadata::Excel(excel_metadata)),
181
183
  additional,
@@ -434,7 +434,7 @@ impl DocumentExtractor for FictionBookExtractor {
434
434
 
435
435
  Ok(ExtractionResult {
436
436
  content: extracted_content,
437
- mime_type: mime_type.to_string(),
437
+ mime_type: mime_type.to_string().into(),
438
438
  metadata,
439
439
  tables: vec![],
440
440
  detected_languages: None,
@@ -9,6 +9,7 @@
9
9
  use crate::types::Metadata;
10
10
 
11
11
  use serde_yaml_ng::Value as YamlValue;
12
+ use std::borrow::Cow;
12
13
 
13
14
  /// Extract YAML frontmatter from document content.
14
15
  ///
@@ -126,12 +127,12 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
126
127
 
127
128
  // Title
128
129
  if let Some(title) = yaml.get("title").and_then(|v| v.as_str()) {
129
- metadata.additional.insert("title".to_string(), title.into());
130
+ metadata.additional.insert(Cow::Borrowed("title"), title.into());
130
131
  }
131
132
 
132
133
  // Author
133
134
  if let Some(author) = yaml.get("author").and_then(|v| v.as_str()) {
134
- metadata.additional.insert("author".to_string(), author.into());
135
+ metadata.additional.insert(Cow::Borrowed("author"), author.into());
135
136
  }
136
137
 
137
138
  // Date (map to created_at)
@@ -143,11 +144,13 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
143
144
  if let Some(keywords) = yaml.get("keywords") {
144
145
  match keywords {
145
146
  YamlValue::String(s) => {
146
- metadata.additional.insert("keywords".to_string(), s.clone().into());
147
+ metadata.additional.insert(Cow::Borrowed("keywords"), s.clone().into());
147
148
  }
148
149
  YamlValue::Sequence(seq) => {
149
150
  let keywords_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
150
- metadata.additional.insert("keywords".to_string(), keywords_str.into());
151
+ metadata
152
+ .additional
153
+ .insert(Cow::Borrowed("keywords"), keywords_str.into());
151
154
  }
152
155
  _ => {}
153
156
  }
@@ -160,7 +163,9 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
160
163
 
161
164
  // Abstract
162
165
  if let Some(abstract_text) = yaml.get("abstract").and_then(|v| v.as_str()) {
163
- metadata.additional.insert("abstract".to_string(), abstract_text.into());
166
+ metadata
167
+ .additional
168
+ .insert(Cow::Borrowed("abstract"), abstract_text.into());
164
169
  }
165
170
 
166
171
  // Subject (overrides description if both present)
@@ -170,18 +175,18 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
170
175
 
171
176
  // Category
172
177
  if let Some(category) = yaml.get("category").and_then(|v| v.as_str()) {
173
- metadata.additional.insert("category".to_string(), category.into());
178
+ metadata.additional.insert(Cow::Borrowed("category"), category.into());
174
179
  }
175
180
 
176
181
  // Tags (support both string and array)
177
182
  if let Some(tags) = yaml.get("tags") {
178
183
  match tags {
179
184
  YamlValue::String(s) => {
180
- metadata.additional.insert("tags".to_string(), s.clone().into());
185
+ metadata.additional.insert(Cow::Borrowed("tags"), s.clone().into());
181
186
  }
182
187
  YamlValue::Sequence(seq) => {
183
188
  let tags_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
184
- metadata.additional.insert("tags".to_string(), tags_str.into());
189
+ metadata.additional.insert(Cow::Borrowed("tags"), tags_str.into());
185
190
  }
186
191
  _ => {}
187
192
  }
@@ -189,12 +194,12 @@ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
189
194
 
190
195
  // Language
191
196
  if let Some(language) = yaml.get("language").and_then(|v| v.as_str()) {
192
- metadata.additional.insert("language".to_string(), language.into());
197
+ metadata.additional.insert(Cow::Borrowed("language"), language.into());
193
198
  }
194
199
 
195
200
  // Version
196
201
  if let Some(version) = yaml.get("version").and_then(|v| v.as_str()) {
197
- metadata.additional.insert("version".to_string(), version.into());
202
+ metadata.additional.insert(Cow::Borrowed("version"), version.into());
198
203
  }
199
204
 
200
205
  metadata
@@ -219,7 +219,7 @@ impl SyncExtractor for HtmlExtractor {
219
219
 
220
220
  Ok(ExtractionResult {
221
221
  content: content_text,
222
- mime_type: result_mime_type.to_string(),
222
+ mime_type: result_mime_type.to_string().into(),
223
223
  metadata: Metadata {
224
224
  format: html_metadata.map(|m| crate::types::FormatMetadata::Html(Box::new(m))),
225
225
  ..Default::default()
@@ -128,7 +128,7 @@ impl DocumentExtractor for ImageExtractor {
128
128
  let mut ocr_result = self.extract_with_ocr(content, mime_type, config).await?;
129
129
 
130
130
  ocr_result.metadata.format = Some(crate::types::FormatMetadata::Image(image_metadata));
131
- ocr_result.mime_type = mime_type.to_string();
131
+ ocr_result.mime_type = mime_type.to_string().into();
132
132
 
133
133
  return Ok(ocr_result);
134
134
  }
@@ -141,7 +141,7 @@ impl DocumentExtractor for ImageExtractor {
141
141
 
142
142
  return Ok(ExtractionResult {
143
143
  content: content_text,
144
- mime_type: mime_type.to_string(),
144
+ mime_type: mime_type.to_string().into(),
145
145
  metadata: Metadata {
146
146
  format: Some(crate::types::FormatMetadata::Image(image_metadata)),
147
147
  ..Default::default()
@@ -161,7 +161,7 @@ impl DocumentExtractor for ImageExtractor {
161
161
  "Image: {} {}x{}",
162
162
  extraction_metadata.format, extraction_metadata.width, extraction_metadata.height
163
163
  ),
164
- mime_type: mime_type.to_string(),
164
+ mime_type: mime_type.to_string().into(),
165
165
  metadata: Metadata {
166
166
  format: Some(crate::types::FormatMetadata::Image(image_metadata)),
167
167
  ..Default::default()
@@ -159,7 +159,7 @@ impl DocumentExtractor for JatsExtractor {
159
159
 
160
160
  Ok(ExtractionResult {
161
161
  content: extracted_content,
162
- mime_type: mime_type.to_string(),
162
+ mime_type: mime_type.to_string().into(),
163
163
  metadata,
164
164
  tables,
165
165
  detected_languages: None,
@@ -18,11 +18,13 @@ use crate::plugins::{DocumentExtractor, Plugin};
18
18
  #[cfg(feature = "office")]
19
19
  use crate::types::{ExtractionResult, Metadata};
20
20
  #[cfg(feature = "office")]
21
+ use ahash::AHashMap;
22
+ #[cfg(feature = "office")]
21
23
  use async_trait::async_trait;
22
24
  #[cfg(feature = "office")]
23
25
  use serde_json::{Value, json};
24
26
  #[cfg(feature = "office")]
25
- use std::collections::HashMap;
27
+ use std::borrow::Cow;
26
28
 
27
29
  /// Jupyter Notebook extractor.
28
30
  ///
@@ -42,32 +44,32 @@ impl JupyterExtractor {
42
44
  }
43
45
 
44
46
  /// Extract content from a Jupyter notebook.
45
- fn extract_notebook(content: &[u8]) -> Result<(String, HashMap<String, Value>)> {
47
+ fn extract_notebook(content: &[u8]) -> Result<(String, AHashMap<Cow<'static, str>, Value>)> {
46
48
  let notebook: Value = serde_json::from_slice(content)
47
49
  .map_err(|e| crate::KreuzbergError::parsing(format!("Failed to parse JSON: {}", e)))?;
48
50
 
49
51
  let mut extracted_content = String::new();
50
- let mut metadata = HashMap::new();
52
+ let mut metadata = AHashMap::new();
51
53
 
52
54
  if let Some(notebook_metadata) = notebook.get("metadata").and_then(|m| m.as_object()) {
53
55
  if let Some(kernelspec) = notebook_metadata.get("kernelspec")
54
56
  && let Some(name) = kernelspec.get("name").and_then(|n| n.as_str())
55
57
  {
56
58
  extracted_content.push_str(&format!("Kernelspec: {}\n", name));
57
- metadata.insert("kernelspec".to_string(), kernelspec.clone());
59
+ metadata.insert(Cow::Borrowed("kernelspec"), kernelspec.clone());
58
60
  }
59
61
 
60
62
  if let Some(language_info) = notebook_metadata.get("language_info")
61
63
  && let Some(name) = language_info.get("name").and_then(|n| n.as_str())
62
64
  {
63
65
  extracted_content.push_str(&format!("Language: {}\n", name));
64
- metadata.insert("language_info".to_string(), language_info.clone());
66
+ metadata.insert(Cow::Borrowed("language_info"), language_info.clone());
65
67
  }
66
68
  }
67
69
 
68
70
  if let Some(nbformat) = notebook.get("nbformat") {
69
71
  extracted_content.push_str(&format!("NBFormat: {}\n", nbformat));
70
- metadata.insert("nbformat".to_string(), nbformat.clone());
72
+ metadata.insert(Cow::Borrowed("nbformat"), nbformat.clone());
71
73
  }
72
74
 
73
75
  extracted_content.push('\n');
@@ -86,7 +88,7 @@ impl JupyterExtractor {
86
88
  cell: &Value,
87
89
  cell_idx: usize,
88
90
  content: &mut String,
89
- _metadata: &mut HashMap<String, Value>,
91
+ _metadata: &mut AHashMap<Cow<'static, str>, Value>,
90
92
  ) -> Result<()> {
91
93
  let cell_type = cell.get("cell_type").and_then(|t| t.as_str()).unwrap_or("unknown");
92
94
 
@@ -324,14 +326,14 @@ impl DocumentExtractor for JupyterExtractor {
324
326
  ) -> Result<ExtractionResult> {
325
327
  let (extracted_content, additional_metadata) = Self::extract_notebook(content)?;
326
328
 
327
- let mut metadata_additional = HashMap::new();
329
+ let mut metadata_additional = AHashMap::new();
328
330
  for (key, value) in additional_metadata {
329
331
  metadata_additional.insert(key, json!(value));
330
332
  }
331
333
 
332
334
  Ok(ExtractionResult {
333
335
  content: extracted_content,
334
- mime_type: mime_type.to_string(),
336
+ mime_type: mime_type.to_string().into(),
335
337
  metadata: Metadata {
336
338
  additional: metadata_additional,
337
339
  ..Default::default()
@@ -5,6 +5,7 @@
5
5
 
6
6
  use super::utilities::extract_braced;
7
7
  use crate::types::Metadata;
8
+ use std::borrow::Cow;
8
9
 
9
10
  /// Extracts metadata from a LaTeX line.
10
11
  ///
@@ -13,15 +14,15 @@ use crate::types::Metadata;
13
14
  pub fn extract_metadata_from_line(line: &str, metadata: &mut Metadata) {
14
15
  if line.starts_with("\\title{") {
15
16
  if let Some(title) = extract_braced(line, "title") {
16
- metadata.additional.insert("title".to_string(), title.into());
17
+ metadata.additional.insert(Cow::Borrowed("title"), title.into());
17
18
  }
18
19
  } else if line.starts_with("\\author{") {
19
20
  if let Some(author) = extract_braced(line, "author") {
20
- metadata.additional.insert("author".to_string(), author.into());
21
+ metadata.additional.insert(Cow::Borrowed("author"), author.into());
21
22
  }
22
23
  } else if line.starts_with("\\date{")
23
24
  && let Some(date) = extract_braced(line, "date")
24
25
  {
25
- metadata.additional.insert("date".to_string(), date.into());
26
+ metadata.additional.insert(Cow::Borrowed("date"), date.into());
26
27
  }
27
28
  }
@@ -95,7 +95,7 @@ impl DocumentExtractor for LatexExtractor {
95
95
 
96
96
  Ok(ExtractionResult {
97
97
  content: text,
98
- mime_type: mime_type.to_string(),
98
+ mime_type: mime_type.to_string().into(),
99
99
  metadata,
100
100
  tables,
101
101
  detected_languages: None,
@@ -28,6 +28,8 @@ use crate::types::{ExtractionResult, Metadata, Table};
28
28
  use async_trait::async_trait;
29
29
  #[cfg(feature = "office")]
30
30
  use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
31
+ #[cfg(feature = "office")]
32
+ use std::borrow::Cow;
31
33
 
32
34
  /// Enhanced Markdown extractor with metadata and table support.
33
35
  ///
@@ -113,7 +115,7 @@ impl MarkdownExtractor {
113
115
  if !current_row.is_empty()
114
116
  && let Some((ref mut rows, _)) = current_table
115
117
  {
116
- rows.push(current_row.clone());
118
+ rows.push(std::mem::take(&mut current_row));
117
119
  }
118
120
  current_row = Vec::new();
119
121
  }
@@ -121,7 +123,7 @@ impl MarkdownExtractor {
121
123
  if !current_row.is_empty()
122
124
  && let Some((ref mut rows, _)) = current_table
123
125
  {
124
- rows.push(current_row.clone());
126
+ rows.push(std::mem::take(&mut current_row));
125
127
  }
126
128
  current_row = Vec::new();
127
129
  }
@@ -211,7 +213,7 @@ impl DocumentExtractor for MarkdownExtractor {
211
213
  if !metadata.additional.contains_key("title")
212
214
  && let Some(title) = extract_title_from_content(&remaining_content)
213
215
  {
214
- metadata.additional.insert("title".to_string(), title.into());
216
+ metadata.additional.insert(Cow::Borrowed("title"), title.into());
215
217
  }
216
218
 
217
219
  let parser = Parser::new_ext(&remaining_content, Options::ENABLE_TABLES);
@@ -223,7 +225,7 @@ impl DocumentExtractor for MarkdownExtractor {
223
225
 
224
226
  Ok(ExtractionResult {
225
227
  content: extracted_text,
226
- mime_type: mime_type.to_string(),
228
+ mime_type: mime_type.to_string().into(),
227
229
  metadata,
228
230
  tables,
229
231
  detected_languages: None,
@@ -9,8 +9,10 @@ use crate::core::config::ExtractionConfig;
9
9
  use crate::extraction::{cells_to_markdown, office_metadata};
10
10
  use crate::plugins::{DocumentExtractor, Plugin};
11
11
  use crate::types::{ExtractionResult, Metadata, Table};
12
+ use ahash::AHashMap;
12
13
  use async_trait::async_trait;
13
14
  use roxmltree::Document;
15
+ use std::borrow::Cow;
14
16
  use std::io::Cursor;
15
17
 
16
18
  /// High-performance ODT extractor using native Rust XML parsing.
@@ -475,7 +477,7 @@ impl DocumentExtractor for OdtExtractor {
475
477
  (combined_text, tables)
476
478
  };
477
479
 
478
- let mut metadata_map = std::collections::HashMap::new();
480
+ let mut metadata_map = AHashMap::new();
479
481
 
480
482
  let cursor = Cursor::new(content_owned.clone());
481
483
  let mut archive = zip::ZipArchive::new(cursor).map_err(|e| {
@@ -484,80 +486,95 @@ impl DocumentExtractor for OdtExtractor {
484
486
 
485
487
  if let Ok(odt_props) = office_metadata::extract_odt_properties(&mut archive) {
486
488
  if let Some(title) = odt_props.title {
487
- metadata_map.insert("title".to_string(), serde_json::Value::String(title));
489
+ metadata_map.insert(Cow::Borrowed("title"), serde_json::Value::String(title));
488
490
  }
489
491
  if let Some(creator) = odt_props.creator {
490
492
  metadata_map.insert(
491
- "authors".to_string(),
493
+ Cow::Borrowed("authors"),
492
494
  serde_json::Value::Array(vec![serde_json::Value::String(creator.clone())]),
493
495
  );
494
- metadata_map.insert("created_by".to_string(), serde_json::Value::String(creator));
496
+ metadata_map.insert(Cow::Borrowed("created_by"), serde_json::Value::String(creator));
495
497
  }
496
498
  if let Some(initial_creator) = odt_props.initial_creator {
497
499
  metadata_map.insert(
498
- "initial_creator".to_string(),
500
+ Cow::Borrowed("initial_creator"),
499
501
  serde_json::Value::String(initial_creator),
500
502
  );
501
503
  }
502
504
  if let Some(subject) = odt_props.subject {
503
- metadata_map.insert("subject".to_string(), serde_json::Value::String(subject));
505
+ metadata_map.insert(Cow::Borrowed("subject"), serde_json::Value::String(subject));
504
506
  }
505
507
  if let Some(keywords) = odt_props.keywords {
506
- metadata_map.insert("keywords".to_string(), serde_json::Value::String(keywords));
508
+ metadata_map.insert(Cow::Borrowed("keywords"), serde_json::Value::String(keywords));
507
509
  }
508
510
  if let Some(description) = odt_props.description {
509
- metadata_map.insert("description".to_string(), serde_json::Value::String(description));
511
+ metadata_map.insert(Cow::Borrowed("description"), serde_json::Value::String(description));
510
512
  }
511
513
  if let Some(creation_date) = odt_props.creation_date {
512
- metadata_map.insert("created_at".to_string(), serde_json::Value::String(creation_date));
514
+ metadata_map.insert(Cow::Borrowed("created_at"), serde_json::Value::String(creation_date));
513
515
  }
514
516
  if let Some(date) = odt_props.date {
515
- metadata_map.insert("modified_at".to_string(), serde_json::Value::String(date));
517
+ metadata_map.insert(Cow::Borrowed("modified_at"), serde_json::Value::String(date));
516
518
  }
517
519
  if let Some(language) = odt_props.language {
518
- metadata_map.insert("language".to_string(), serde_json::Value::String(language));
520
+ metadata_map.insert(Cow::Borrowed("language"), serde_json::Value::String(language));
519
521
  }
520
522
  if let Some(generator) = odt_props.generator {
521
- metadata_map.insert("generator".to_string(), serde_json::Value::String(generator));
523
+ metadata_map.insert(Cow::Borrowed("generator"), serde_json::Value::String(generator));
522
524
  }
523
525
  if let Some(editing_duration) = odt_props.editing_duration {
524
526
  metadata_map.insert(
525
- "editing_duration".to_string(),
527
+ Cow::Borrowed("editing_duration"),
526
528
  serde_json::Value::String(editing_duration),
527
529
  );
528
530
  }
529
531
  if let Some(editing_cycles) = odt_props.editing_cycles {
530
- metadata_map.insert("editing_cycles".to_string(), serde_json::Value::String(editing_cycles));
532
+ metadata_map.insert(
533
+ Cow::Borrowed("editing_cycles"),
534
+ serde_json::Value::String(editing_cycles),
535
+ );
531
536
  }
532
537
  if let Some(page_count) = odt_props.page_count {
533
- metadata_map.insert("page_count".to_string(), serde_json::Value::Number(page_count.into()));
538
+ metadata_map.insert(
539
+ Cow::Borrowed("page_count"),
540
+ serde_json::Value::Number(page_count.into()),
541
+ );
534
542
  }
535
543
  if let Some(word_count) = odt_props.word_count {
536
- metadata_map.insert("word_count".to_string(), serde_json::Value::Number(word_count.into()));
544
+ metadata_map.insert(
545
+ Cow::Borrowed("word_count"),
546
+ serde_json::Value::Number(word_count.into()),
547
+ );
537
548
  }
538
549
  if let Some(character_count) = odt_props.character_count {
539
550
  metadata_map.insert(
540
- "character_count".to_string(),
551
+ Cow::Borrowed("character_count"),
541
552
  serde_json::Value::Number(character_count.into()),
542
553
  );
543
554
  }
544
555
  if let Some(paragraph_count) = odt_props.paragraph_count {
545
556
  metadata_map.insert(
546
- "paragraph_count".to_string(),
557
+ Cow::Borrowed("paragraph_count"),
547
558
  serde_json::Value::Number(paragraph_count.into()),
548
559
  );
549
560
  }
550
561
  if let Some(table_count) = odt_props.table_count {
551
- metadata_map.insert("table_count".to_string(), serde_json::Value::Number(table_count.into()));
562
+ metadata_map.insert(
563
+ Cow::Borrowed("table_count"),
564
+ serde_json::Value::Number(table_count.into()),
565
+ );
552
566
  }
553
567
  if let Some(image_count) = odt_props.image_count {
554
- metadata_map.insert("image_count".to_string(), serde_json::Value::Number(image_count.into()));
568
+ metadata_map.insert(
569
+ Cow::Borrowed("image_count"),
570
+ serde_json::Value::Number(image_count.into()),
571
+ );
555
572
  }
556
573
  }
557
574
 
558
575
  Ok(ExtractionResult {
559
576
  content: text,
560
- mime_type: mime_type.to_string(),
577
+ mime_type: mime_type.to_string().into(),
561
578
  metadata: Metadata {
562
579
  additional: metadata_map,
563
580
  ..Default::default()
@@ -79,7 +79,7 @@ impl DocumentExtractor for OpmlExtractor {
79
79
 
80
80
  Ok(ExtractionResult {
81
81
  content: extracted_content,
82
- mime_type: mime_type.to_string(),
82
+ mime_type: mime_type.to_string().into(),
83
83
  metadata: Metadata {
84
84
  additional: metadata_map,
85
85
  ..Default::default()