kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -26,65 +26,6 @@ use serde_json::json;
26
26
  use std::path::Path;
27
27
  use std::sync::Arc;
28
28
 
29
- /// Record error information in the current OpenTelemetry span.
30
- ///
31
- /// This function records error details in the current span when the `otel` feature is enabled.
32
- /// It marks the span with `otel.status_code=ERROR` and adds error type and message fields.
33
- ///
34
- /// # Arguments
35
- ///
36
- /// * `error` - The error to record in the span
37
- ///
38
- /// # Example
39
- ///
40
- /// ```rust,ignore
41
- /// let result = extract_file("doc.pdf", None, &config).await;
42
- /// #[cfg(feature = "otel")]
43
- /// if let Err(ref e) = result {
44
- /// record_error(e);
45
- /// }
46
- /// result
47
- /// ```
48
- #[cfg(feature = "otel")]
49
- fn record_error(error: &KreuzbergError) {
50
- let span = tracing::Span::current();
51
- span.record("otel.status_code", "ERROR");
52
- span.record("error.type", format!("{:?}", error));
53
- span.record("error.message", error.to_string());
54
- }
55
-
56
- /// Sanitize a file path to return only the filename.
57
- ///
58
- /// This function extracts the filename from a path to avoid recording
59
- /// potentially sensitive full file paths in telemetry data.
60
- ///
61
- /// # Arguments
62
- ///
63
- /// * `path` - The path to sanitize
64
- ///
65
- /// # Returns
66
- ///
67
- /// The filename as a string, or "unknown" if extraction fails
68
- ///
69
- /// # Security
70
- ///
71
- /// This prevents PII (personally identifiable information) from appearing in
72
- /// traces by only recording filenames instead of full paths.
73
- ///
74
- /// # Example
75
- ///
76
- /// ```rust,ignore
77
- /// let path = Path::new("/home/user/documents/secret.pdf");
78
- /// assert_eq!(sanitize_path(path), "secret.pdf");
79
- /// ```
80
- #[cfg(feature = "otel")]
81
- fn sanitize_path(path: &Path) -> String {
82
- path.file_name()
83
- .and_then(|n| n.to_str())
84
- .unwrap_or("unknown")
85
- .to_string()
86
- }
87
-
88
29
  /// Global Tokio runtime for synchronous operations.
89
30
  ///
90
31
  /// This runtime is lazily initialized on first use and shared across all sync wrappers.
@@ -160,12 +101,6 @@ fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
160
101
  /// # Ok(())
161
102
  /// # }
162
103
  /// ```
163
- #[cfg_attr(feature = "otel", tracing::instrument(
164
- skip(config, path),
165
- fields(
166
- extraction.filename = tracing::field::Empty,
167
- )
168
- ))]
169
104
  pub async fn extract_file(
170
105
  path: impl AsRef<Path>,
171
106
  mime_type: Option<&str>,
@@ -175,119 +110,86 @@ pub async fn extract_file(
175
110
 
176
111
  let path = path.as_ref();
177
112
 
178
- #[cfg(feature = "otel")]
179
- {
180
- let span = tracing::Span::current();
181
- span.record("extraction.filename", sanitize_path(path));
182
- }
183
-
184
- let result = async {
185
- io::validate_file_exists(path)?;
113
+ io::validate_file_exists(path)?;
186
114
 
187
- let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
115
+ let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
188
116
 
189
- match detected_mime.as_str() {
190
- #[cfg(feature = "office")]
191
- LEGACY_WORD_MIME_TYPE => {
192
- let original_bytes = tokio::fs::read(path).await?;
193
- let conversion = convert_doc_to_docx(&original_bytes).await?;
194
- let mut result =
195
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
196
- apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
197
- return Ok(result);
198
- }
199
- #[cfg(not(feature = "office"))]
200
- LEGACY_WORD_MIME_TYPE => {
201
- return Err(KreuzbergError::UnsupportedFormat(
202
- "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
203
- ));
204
- }
205
- #[cfg(feature = "office")]
206
- LEGACY_POWERPOINT_MIME_TYPE => {
207
- let original_bytes = tokio::fs::read(path).await?;
208
- let conversion = convert_ppt_to_pptx(&original_bytes).await?;
209
- let mut result =
210
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
211
- apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
212
- return Ok(result);
213
- }
214
- #[cfg(not(feature = "office"))]
215
- LEGACY_POWERPOINT_MIME_TYPE => {
216
- return Err(KreuzbergError::UnsupportedFormat(
217
- "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
218
- ));
219
- }
220
- _ => {}
117
+ match detected_mime.as_str() {
118
+ #[cfg(feature = "office")]
119
+ LEGACY_WORD_MIME_TYPE => {
120
+ let original_bytes = tokio::fs::read(path).await?;
121
+ let conversion = convert_doc_to_docx(&original_bytes).await?;
122
+ let mut result =
123
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
124
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
125
+ return Ok(result);
221
126
  }
222
-
223
- extract_file_with_extractor(path, &detected_mime, config).await
224
- }
225
- .await;
226
-
227
- #[cfg(feature = "otel")]
228
- if let Err(ref e) = result {
229
- record_error(e);
127
+ #[cfg(not(feature = "office"))]
128
+ LEGACY_WORD_MIME_TYPE => {
129
+ return Err(KreuzbergError::UnsupportedFormat(
130
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
131
+ ));
132
+ }
133
+ #[cfg(feature = "office")]
134
+ LEGACY_POWERPOINT_MIME_TYPE => {
135
+ let original_bytes = tokio::fs::read(path).await?;
136
+ let conversion = convert_ppt_to_pptx(&original_bytes).await?;
137
+ let mut result =
138
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
139
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
140
+ return Ok(result);
141
+ }
142
+ #[cfg(not(feature = "office"))]
143
+ LEGACY_POWERPOINT_MIME_TYPE => {
144
+ return Err(KreuzbergError::UnsupportedFormat(
145
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
146
+ ));
147
+ }
148
+ _ => {}
230
149
  }
231
150
 
232
- result
151
+ extract_file_with_extractor(path, &detected_mime, config).await
233
152
  }
234
153
 
235
154
  /// Extract content from a byte array.
236
- #[cfg_attr(feature = "otel", tracing::instrument(
237
- skip(config, content),
238
- fields(
239
- extraction.mime_type = mime_type,
240
- extraction.size_bytes = content.len(),
241
- )
242
- ))]
243
155
  pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
244
156
  use crate::core::mime;
245
157
 
246
- let result = async {
247
- let validated_mime = mime::validate_mime_type(mime_type)?;
248
-
249
- match validated_mime.as_str() {
250
- #[cfg(feature = "office")]
251
- LEGACY_WORD_MIME_TYPE => {
252
- let conversion = convert_doc_to_docx(content).await?;
253
- let mut result =
254
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
255
- apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
256
- return Ok(result);
257
- }
258
- #[cfg(not(feature = "office"))]
259
- LEGACY_WORD_MIME_TYPE => {
260
- return Err(KreuzbergError::UnsupportedFormat(
261
- "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
262
- ));
263
- }
264
- #[cfg(feature = "office")]
265
- LEGACY_POWERPOINT_MIME_TYPE => {
266
- let conversion = convert_ppt_to_pptx(content).await?;
267
- let mut result =
268
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
269
- apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
270
- return Ok(result);
271
- }
272
- #[cfg(not(feature = "office"))]
273
- LEGACY_POWERPOINT_MIME_TYPE => {
274
- return Err(KreuzbergError::UnsupportedFormat(
275
- "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
276
- ));
277
- }
278
- _ => {}
279
- }
280
-
281
- extract_bytes_with_extractor(content, &validated_mime, config).await
282
- }
283
- .await;
158
+ let validated_mime = mime::validate_mime_type(mime_type)?;
284
159
 
285
- #[cfg(feature = "otel")]
286
- if let Err(ref e) = result {
287
- record_error(e);
160
+ match validated_mime.as_str() {
161
+ #[cfg(feature = "office")]
162
+ LEGACY_WORD_MIME_TYPE => {
163
+ let conversion = convert_doc_to_docx(content).await?;
164
+ let mut result =
165
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
166
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
167
+ return Ok(result);
168
+ }
169
+ #[cfg(not(feature = "office"))]
170
+ LEGACY_WORD_MIME_TYPE => {
171
+ return Err(KreuzbergError::UnsupportedFormat(
172
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
173
+ ));
174
+ }
175
+ #[cfg(feature = "office")]
176
+ LEGACY_POWERPOINT_MIME_TYPE => {
177
+ let conversion = convert_ppt_to_pptx(content).await?;
178
+ let mut result =
179
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
180
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
181
+ return Ok(result);
182
+ }
183
+ #[cfg(not(feature = "office"))]
184
+ LEGACY_POWERPOINT_MIME_TYPE => {
185
+ return Err(KreuzbergError::UnsupportedFormat(
186
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
187
+ ));
188
+ }
189
+ _ => {}
288
190
  }
289
191
 
290
- result
192
+ extract_bytes_with_extractor(content, &validated_mime, config).await
291
193
  }
292
194
 
293
195
  /// Extract content from multiple files concurrently.
@@ -310,13 +212,6 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
310
212
  ///
311
213
  /// Individual file errors are captured in the result metadata. System errors
312
214
  /// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
313
- #[cfg_attr(feature = "otel", tracing::instrument(
314
- skip(config, paths),
315
- fields(
316
- extraction.batch_size = paths.len(),
317
- )
318
- ))]
319
- #[cfg(feature = "tokio-runtime")]
320
215
  pub async fn batch_extract_file(
321
216
  paths: Vec<impl AsRef<Path>>,
322
217
  config: &ExtractionConfig,
@@ -407,13 +302,6 @@ pub async fn batch_extract_file(
407
302
  /// # Returns
408
303
  ///
409
304
  /// A vector of `ExtractionResult` in the same order as the input.
410
- #[cfg_attr(feature = "otel", tracing::instrument(
411
- skip(config, contents),
412
- fields(
413
- extraction.batch_size = contents.len(),
414
- )
415
- ))]
416
- #[cfg(feature = "tokio-runtime")]
417
305
  pub async fn batch_extract_bytes(
418
306
  contents: Vec<(&[u8], &str)>,
419
307
  config: &ExtractionConfig,
@@ -592,10 +480,6 @@ mod tests {
592
480
  use std::io::Write;
593
481
  use tempfile::tempdir;
594
482
 
595
- fn assert_text_content(actual: &str, expected: &str) {
596
- assert_eq!(actual.trim_end_matches('\n'), expected);
597
- }
598
-
599
483
  #[tokio::test]
600
484
  async fn test_extract_file_basic() {
601
485
  let dir = tempdir().unwrap();
@@ -608,7 +492,7 @@ mod tests {
608
492
 
609
493
  assert!(result.is_ok());
610
494
  let result = result.unwrap();
611
- assert_text_content(&result.content, "Hello, world!");
495
+ assert_eq!(result.content, "Hello, world!");
612
496
  assert_eq!(result.mime_type, "text/plain");
613
497
  }
614
498
 
@@ -641,7 +525,7 @@ mod tests {
641
525
 
642
526
  assert!(result.is_ok());
643
527
  let result = result.unwrap();
644
- assert_text_content(&result.content, "test content");
528
+ assert_eq!(result.content, "test content");
645
529
  assert_eq!(result.mime_type, "text/plain");
646
530
  }
647
531
 
@@ -669,8 +553,8 @@ mod tests {
669
553
  assert!(results.is_ok());
670
554
  let results = results.unwrap();
671
555
  assert_eq!(results.len(), 2);
672
- assert_text_content(&results[0].content, "content 1");
673
- assert_text_content(&results[1].content, "content 2");
556
+ assert_eq!(results[0].content, "content 1");
557
+ assert_eq!(results[1].content, "content 2");
674
558
  }
675
559
 
676
560
  #[tokio::test]
@@ -695,8 +579,8 @@ mod tests {
695
579
  assert!(results.is_ok());
696
580
  let results = results.unwrap();
697
581
  assert_eq!(results.len(), 2);
698
- assert_text_content(&results[0].content, "content 1");
699
- assert_text_content(&results[1].content, "content 2");
582
+ assert_eq!(results[0].content, "content 1");
583
+ assert_eq!(results[1].content, "content 2");
700
584
  }
701
585
 
702
586
  #[test]
@@ -709,8 +593,7 @@ mod tests {
709
593
 
710
594
  let result = extract_file_sync(&file_path, None, &config);
711
595
  assert!(result.is_ok());
712
- let result = result.unwrap();
713
- assert_text_content(&result.content, "sync test");
596
+ assert_eq!(result.unwrap().content, "sync test");
714
597
 
715
598
  let result = extract_bytes_sync(b"test", "text/plain", &config);
716
599
  assert!(result.is_ok());
@@ -722,14 +605,12 @@ mod tests {
722
605
 
723
606
  let result1 = extract_bytes(b"test 1", "text/plain", &config).await;
724
607
  assert!(result1.is_ok());
725
- let result1 = result1.unwrap();
726
608
 
727
609
  let result2 = extract_bytes(b"test 2", "text/plain", &config).await;
728
610
  assert!(result2.is_ok());
729
- let result2 = result2.unwrap();
730
611
 
731
- assert_text_content(&result1.content, "test 1");
732
- assert_text_content(&result2.content, "test 2");
612
+ assert_eq!(result1.unwrap().content, "test 1");
613
+ assert_eq!(result2.unwrap().content, "test 2");
733
614
 
734
615
  let result3 = extract_bytes(b"# test 3", "text/markdown", &config).await;
735
616
  assert!(result3.is_ok());
@@ -795,8 +676,7 @@ mod tests {
795
676
  let result = extract_file(&file_path, None, &config).await;
796
677
 
797
678
  assert!(result.is_ok());
798
- let result = result.unwrap();
799
- assert_text_content(&result.content, "content");
679
+ assert_eq!(result.unwrap().content, "content");
800
680
  }
801
681
 
802
682
  #[tokio::test]
@@ -836,7 +716,7 @@ mod tests {
836
716
  assert!(results.is_ok());
837
717
  let results = results.unwrap();
838
718
  assert_eq!(results.len(), 2);
839
- assert_text_content(&results[0].content, "valid content");
719
+ assert_eq!(results[0].content, "valid content");
840
720
  assert!(results[1].metadata.error.is_some());
841
721
  }
842
722
 
@@ -853,9 +733,9 @@ mod tests {
853
733
  assert!(results.is_ok());
854
734
  let results = results.unwrap();
855
735
  assert_eq!(results.len(), 3);
856
- assert_text_content(&results[0].content, "valid 1");
736
+ assert_eq!(results[0].content, "valid 1");
857
737
  assert!(results[1].metadata.error.is_some());
858
- assert_text_content(&results[2].content, "valid 2");
738
+ assert_eq!(results[2].content, "valid 2");
859
739
  }
860
740
 
861
741
  #[tokio::test]
@@ -882,8 +762,7 @@ mod tests {
882
762
 
883
763
  assert!(result.is_ok());
884
764
  let result = result.unwrap();
885
- let trimmed_len = result.content.trim_end_matches('\n').len();
886
- assert_eq!(trimmed_len, 10_000_000);
765
+ assert_eq!(result.content.len(), 10_000_000);
887
766
  }
888
767
 
889
768
  #[tokio::test]
@@ -908,7 +787,7 @@ mod tests {
908
787
  assert_eq!(results.len(), 100);
909
788
 
910
789
  for (i, result) in results.iter().enumerate() {
911
- assert_text_content(&result.content, &format!("content {}", i));
790
+ assert_eq!(result.content, format!("content {}", i));
912
791
  }
913
792
  }
914
793
 
@@ -4,6 +4,7 @@
4
4
 
5
5
  use crate::{KreuzbergError, Result};
6
6
  use std::path::Path;
7
+ use tokio::fs;
7
8
 
8
9
  /// Read a file asynchronously.
9
10
  ///
@@ -18,9 +19,8 @@ use std::path::Path;
18
19
  /// # Errors
19
20
  ///
20
21
  /// Returns `KreuzbergError::Io` for I/O errors (these always bubble up).
21
- #[cfg(feature = "tokio-runtime")]
22
22
  pub async fn read_file_async(path: impl AsRef<Path>) -> Result<Vec<u8>> {
23
- tokio::fs::read(path.as_ref()).await.map_err(KreuzbergError::Io)
23
+ fs::read(path.as_ref()).await.map_err(KreuzbergError::Io)
24
24
  }
25
25
 
26
26
  /// Read a file synchronously.
@@ -181,7 +181,6 @@ mod tests {
181
181
  use std::io::Write;
182
182
  use tempfile::tempdir;
183
183
 
184
- #[cfg(feature = "tokio-runtime")]
185
184
  #[tokio::test]
186
185
  async fn test_read_file_async() {
187
186
  let dir = tempdir().unwrap();
@@ -312,7 +311,6 @@ mod tests {
312
311
  assert!(result.is_err());
313
312
  }
314
313
 
315
- #[cfg(feature = "tokio-runtime")]
316
314
  #[tokio::test]
317
315
  async fn test_read_file_async_io_error() {
318
316
  let result = read_file_async("/nonexistent/file.txt").await;
@@ -152,7 +152,6 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
152
152
  set.insert("application/x-ipynb+json");
153
153
  set.insert("application/x-jats+xml");
154
154
  set.insert("application/x-latex");
155
- set.insert("application/xml+opml");
156
155
  set.insert("application/x-opml+xml");
157
156
  set.insert("application/x-research-info-systems");
158
157
  set.insert("application/x-typst");
@@ -165,7 +164,6 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
165
164
  set.insert("text/x-markdown-extra");
166
165
  set.insert("text/x-mdoc");
167
166
  set.insert("text/x-multimarkdown");
168
- set.insert("text/x-opml");
169
167
  set.insert("text/x-org");
170
168
  set.insert("text/x-pod");
171
169
  set.insert("text/x-rst");
@@ -329,35 +327,43 @@ pub fn detect_or_validate(path: Option<&Path>, mime_type: Option<&str>) -> Resul
329
327
  ///
330
328
  /// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
331
329
  pub fn detect_mime_type_from_bytes(content: &[u8]) -> Result<String> {
330
+ // Use infer crate for magic byte detection
332
331
  if let Some(kind) = infer::get(content) {
333
332
  let mime_type = kind.mime_type();
334
333
 
334
+ // Validate that it's a supported type
335
335
  if SUPPORTED_MIME_TYPES.contains(mime_type) || mime_type.starts_with("image/") {
336
336
  return Ok(mime_type.to_string());
337
337
  }
338
338
  }
339
339
 
340
+ // Try to detect text-based formats
340
341
  if let Ok(text) = std::str::from_utf8(content) {
341
342
  let trimmed = text.trim_start();
342
343
 
344
+ // Detect JSON
343
345
  if (trimmed.starts_with('{') || trimmed.starts_with('['))
344
346
  && serde_json::from_str::<serde_json::Value>(text).is_ok()
345
347
  {
346
348
  return Ok(JSON_MIME_TYPE.to_string());
347
349
  }
348
350
 
351
+ // Detect XML
349
352
  if trimmed.starts_with("<?xml") || trimmed.starts_with('<') {
350
353
  return Ok(XML_MIME_TYPE.to_string());
351
354
  }
352
355
 
356
+ // Detect HTML
353
357
  if trimmed.starts_with("<!DOCTYPE html") || trimmed.starts_with("<html") {
354
358
  return Ok(HTML_MIME_TYPE.to_string());
355
359
  }
356
360
 
361
+ // Detect PDF header
357
362
  if trimmed.starts_with("%PDF") {
358
363
  return Ok(PDF_MIME_TYPE.to_string());
359
364
  }
360
365
 
366
+ // Default to plain text for valid UTF-8
361
367
  return Ok(PLAIN_TEXT_MIME_TYPE.to_string());
362
368
  }
363
369
 
@@ -392,21 +398,25 @@ pub fn detect_mime_type_from_bytes(content: &[u8]) -> Result<String> {
392
398
  pub fn get_extensions_for_mime(mime_type: &str) -> Result<Vec<String>> {
393
399
  let mut extensions = Vec::new();
394
400
 
401
+ // Search through EXT_TO_MIME for matching MIME types
395
402
  for (ext, mime) in EXT_TO_MIME.iter() {
396
403
  if *mime == mime_type {
397
404
  extensions.push(ext.to_string());
398
405
  }
399
406
  }
400
407
 
408
+ // If we found extensions, return them
401
409
  if !extensions.is_empty() {
402
410
  return Ok(extensions);
403
411
  }
404
412
 
413
+ // Try using mime_guess crate for reverse lookup
405
414
  let guessed = mime_guess::get_mime_extensions_str(mime_type);
406
415
  if let Some(exts) = guessed {
407
416
  return Ok(exts.iter().map(|s| s.to_string()).collect());
408
417
  }
409
418
 
419
+ // No extensions found
410
420
  Err(KreuzbergError::UnsupportedFormat(format!(
411
421
  "No known extensions for MIME type: {}",
412
422
  mime_type
@@ -28,7 +28,6 @@
28
28
  //! # }
29
29
  //! ```
30
30
 
31
- #[cfg(feature = "tokio-runtime")]
32
31
  pub(crate) mod batch_mode;
33
32
  pub mod config;
34
33
  pub mod extractor;
@@ -40,6 +39,4 @@ pub use config::{
40
39
  ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, PdfConfig,
41
40
  TokenReductionConfig,
42
41
  };
43
- #[cfg(feature = "tokio-runtime")]
44
- pub use extractor::{batch_extract_bytes, batch_extract_file};
45
- pub use extractor::{extract_bytes, extract_file};
42
+ pub use extractor::{batch_extract_bytes, batch_extract_file, extract_bytes, extract_file};