kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -26,65 +26,6 @@ use serde_json::json;
|
|
|
26
26
|
use std::path::Path;
|
|
27
27
|
use std::sync::Arc;
|
|
28
28
|
|
|
29
|
-
/// Record error information in the current OpenTelemetry span.
|
|
30
|
-
///
|
|
31
|
-
/// This function records error details in the current span when the `otel` feature is enabled.
|
|
32
|
-
/// It marks the span with `otel.status_code=ERROR` and adds error type and message fields.
|
|
33
|
-
///
|
|
34
|
-
/// # Arguments
|
|
35
|
-
///
|
|
36
|
-
/// * `error` - The error to record in the span
|
|
37
|
-
///
|
|
38
|
-
/// # Example
|
|
39
|
-
///
|
|
40
|
-
/// ```rust,ignore
|
|
41
|
-
/// let result = extract_file("doc.pdf", None, &config).await;
|
|
42
|
-
/// #[cfg(feature = "otel")]
|
|
43
|
-
/// if let Err(ref e) = result {
|
|
44
|
-
/// record_error(e);
|
|
45
|
-
/// }
|
|
46
|
-
/// result
|
|
47
|
-
/// ```
|
|
48
|
-
#[cfg(feature = "otel")]
|
|
49
|
-
fn record_error(error: &KreuzbergError) {
|
|
50
|
-
let span = tracing::Span::current();
|
|
51
|
-
span.record("otel.status_code", "ERROR");
|
|
52
|
-
span.record("error.type", format!("{:?}", error));
|
|
53
|
-
span.record("error.message", error.to_string());
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
/// Sanitize a file path to return only the filename.
|
|
57
|
-
///
|
|
58
|
-
/// This function extracts the filename from a path to avoid recording
|
|
59
|
-
/// potentially sensitive full file paths in telemetry data.
|
|
60
|
-
///
|
|
61
|
-
/// # Arguments
|
|
62
|
-
///
|
|
63
|
-
/// * `path` - The path to sanitize
|
|
64
|
-
///
|
|
65
|
-
/// # Returns
|
|
66
|
-
///
|
|
67
|
-
/// The filename as a string, or "unknown" if extraction fails
|
|
68
|
-
///
|
|
69
|
-
/// # Security
|
|
70
|
-
///
|
|
71
|
-
/// This prevents PII (personally identifiable information) from appearing in
|
|
72
|
-
/// traces by only recording filenames instead of full paths.
|
|
73
|
-
///
|
|
74
|
-
/// # Example
|
|
75
|
-
///
|
|
76
|
-
/// ```rust,ignore
|
|
77
|
-
/// let path = Path::new("/home/user/documents/secret.pdf");
|
|
78
|
-
/// assert_eq!(sanitize_path(path), "secret.pdf");
|
|
79
|
-
/// ```
|
|
80
|
-
#[cfg(feature = "otel")]
|
|
81
|
-
fn sanitize_path(path: &Path) -> String {
|
|
82
|
-
path.file_name()
|
|
83
|
-
.and_then(|n| n.to_str())
|
|
84
|
-
.unwrap_or("unknown")
|
|
85
|
-
.to_string()
|
|
86
|
-
}
|
|
87
|
-
|
|
88
29
|
/// Global Tokio runtime for synchronous operations.
|
|
89
30
|
///
|
|
90
31
|
/// This runtime is lazily initialized on first use and shared across all sync wrappers.
|
|
@@ -160,12 +101,6 @@ fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
|
|
|
160
101
|
/// # Ok(())
|
|
161
102
|
/// # }
|
|
162
103
|
/// ```
|
|
163
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
164
|
-
skip(config, path),
|
|
165
|
-
fields(
|
|
166
|
-
extraction.filename = tracing::field::Empty,
|
|
167
|
-
)
|
|
168
|
-
))]
|
|
169
104
|
pub async fn extract_file(
|
|
170
105
|
path: impl AsRef<Path>,
|
|
171
106
|
mime_type: Option<&str>,
|
|
@@ -175,119 +110,86 @@ pub async fn extract_file(
|
|
|
175
110
|
|
|
176
111
|
let path = path.as_ref();
|
|
177
112
|
|
|
178
|
-
|
|
179
|
-
{
|
|
180
|
-
let span = tracing::Span::current();
|
|
181
|
-
span.record("extraction.filename", sanitize_path(path));
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
let result = async {
|
|
185
|
-
io::validate_file_exists(path)?;
|
|
113
|
+
io::validate_file_exists(path)?;
|
|
186
114
|
|
|
187
|
-
|
|
115
|
+
let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
|
|
188
116
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
}
|
|
199
|
-
#[cfg(not(feature = "office"))]
|
|
200
|
-
LEGACY_WORD_MIME_TYPE => {
|
|
201
|
-
return Err(KreuzbergError::UnsupportedFormat(
|
|
202
|
-
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
203
|
-
));
|
|
204
|
-
}
|
|
205
|
-
#[cfg(feature = "office")]
|
|
206
|
-
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
207
|
-
let original_bytes = tokio::fs::read(path).await?;
|
|
208
|
-
let conversion = convert_ppt_to_pptx(&original_bytes).await?;
|
|
209
|
-
let mut result =
|
|
210
|
-
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
211
|
-
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
212
|
-
return Ok(result);
|
|
213
|
-
}
|
|
214
|
-
#[cfg(not(feature = "office"))]
|
|
215
|
-
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
216
|
-
return Err(KreuzbergError::UnsupportedFormat(
|
|
217
|
-
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
218
|
-
));
|
|
219
|
-
}
|
|
220
|
-
_ => {}
|
|
117
|
+
match detected_mime.as_str() {
|
|
118
|
+
#[cfg(feature = "office")]
|
|
119
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
120
|
+
let original_bytes = tokio::fs::read(path).await?;
|
|
121
|
+
let conversion = convert_doc_to_docx(&original_bytes).await?;
|
|
122
|
+
let mut result =
|
|
123
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
124
|
+
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
125
|
+
return Ok(result);
|
|
221
126
|
}
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
127
|
+
#[cfg(not(feature = "office"))]
|
|
128
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
129
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
130
|
+
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
131
|
+
));
|
|
132
|
+
}
|
|
133
|
+
#[cfg(feature = "office")]
|
|
134
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
135
|
+
let original_bytes = tokio::fs::read(path).await?;
|
|
136
|
+
let conversion = convert_ppt_to_pptx(&original_bytes).await?;
|
|
137
|
+
let mut result =
|
|
138
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
139
|
+
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
140
|
+
return Ok(result);
|
|
141
|
+
}
|
|
142
|
+
#[cfg(not(feature = "office"))]
|
|
143
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
144
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
145
|
+
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
146
|
+
));
|
|
147
|
+
}
|
|
148
|
+
_ => {}
|
|
230
149
|
}
|
|
231
150
|
|
|
232
|
-
|
|
151
|
+
extract_file_with_extractor(path, &detected_mime, config).await
|
|
233
152
|
}
|
|
234
153
|
|
|
235
154
|
/// Extract content from a byte array.
|
|
236
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
237
|
-
skip(config, content),
|
|
238
|
-
fields(
|
|
239
|
-
extraction.mime_type = mime_type,
|
|
240
|
-
extraction.size_bytes = content.len(),
|
|
241
|
-
)
|
|
242
|
-
))]
|
|
243
155
|
pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
244
156
|
use crate::core::mime;
|
|
245
157
|
|
|
246
|
-
let
|
|
247
|
-
let validated_mime = mime::validate_mime_type(mime_type)?;
|
|
248
|
-
|
|
249
|
-
match validated_mime.as_str() {
|
|
250
|
-
#[cfg(feature = "office")]
|
|
251
|
-
LEGACY_WORD_MIME_TYPE => {
|
|
252
|
-
let conversion = convert_doc_to_docx(content).await?;
|
|
253
|
-
let mut result =
|
|
254
|
-
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
255
|
-
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
256
|
-
return Ok(result);
|
|
257
|
-
}
|
|
258
|
-
#[cfg(not(feature = "office"))]
|
|
259
|
-
LEGACY_WORD_MIME_TYPE => {
|
|
260
|
-
return Err(KreuzbergError::UnsupportedFormat(
|
|
261
|
-
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
262
|
-
));
|
|
263
|
-
}
|
|
264
|
-
#[cfg(feature = "office")]
|
|
265
|
-
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
266
|
-
let conversion = convert_ppt_to_pptx(content).await?;
|
|
267
|
-
let mut result =
|
|
268
|
-
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
269
|
-
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
270
|
-
return Ok(result);
|
|
271
|
-
}
|
|
272
|
-
#[cfg(not(feature = "office"))]
|
|
273
|
-
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
274
|
-
return Err(KreuzbergError::UnsupportedFormat(
|
|
275
|
-
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
276
|
-
));
|
|
277
|
-
}
|
|
278
|
-
_ => {}
|
|
279
|
-
}
|
|
280
|
-
|
|
281
|
-
extract_bytes_with_extractor(content, &validated_mime, config).await
|
|
282
|
-
}
|
|
283
|
-
.await;
|
|
158
|
+
let validated_mime = mime::validate_mime_type(mime_type)?;
|
|
284
159
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
160
|
+
match validated_mime.as_str() {
|
|
161
|
+
#[cfg(feature = "office")]
|
|
162
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
163
|
+
let conversion = convert_doc_to_docx(content).await?;
|
|
164
|
+
let mut result =
|
|
165
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
166
|
+
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
167
|
+
return Ok(result);
|
|
168
|
+
}
|
|
169
|
+
#[cfg(not(feature = "office"))]
|
|
170
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
171
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
172
|
+
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
173
|
+
));
|
|
174
|
+
}
|
|
175
|
+
#[cfg(feature = "office")]
|
|
176
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
177
|
+
let conversion = convert_ppt_to_pptx(content).await?;
|
|
178
|
+
let mut result =
|
|
179
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
180
|
+
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
181
|
+
return Ok(result);
|
|
182
|
+
}
|
|
183
|
+
#[cfg(not(feature = "office"))]
|
|
184
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
185
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
186
|
+
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
187
|
+
));
|
|
188
|
+
}
|
|
189
|
+
_ => {}
|
|
288
190
|
}
|
|
289
191
|
|
|
290
|
-
|
|
192
|
+
extract_bytes_with_extractor(content, &validated_mime, config).await
|
|
291
193
|
}
|
|
292
194
|
|
|
293
195
|
/// Extract content from multiple files concurrently.
|
|
@@ -310,13 +212,6 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
310
212
|
///
|
|
311
213
|
/// Individual file errors are captured in the result metadata. System errors
|
|
312
214
|
/// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
|
|
313
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
314
|
-
skip(config, paths),
|
|
315
|
-
fields(
|
|
316
|
-
extraction.batch_size = paths.len(),
|
|
317
|
-
)
|
|
318
|
-
))]
|
|
319
|
-
#[cfg(feature = "tokio-runtime")]
|
|
320
215
|
pub async fn batch_extract_file(
|
|
321
216
|
paths: Vec<impl AsRef<Path>>,
|
|
322
217
|
config: &ExtractionConfig,
|
|
@@ -407,13 +302,6 @@ pub async fn batch_extract_file(
|
|
|
407
302
|
/// # Returns
|
|
408
303
|
///
|
|
409
304
|
/// A vector of `ExtractionResult` in the same order as the input.
|
|
410
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
411
|
-
skip(config, contents),
|
|
412
|
-
fields(
|
|
413
|
-
extraction.batch_size = contents.len(),
|
|
414
|
-
)
|
|
415
|
-
))]
|
|
416
|
-
#[cfg(feature = "tokio-runtime")]
|
|
417
305
|
pub async fn batch_extract_bytes(
|
|
418
306
|
contents: Vec<(&[u8], &str)>,
|
|
419
307
|
config: &ExtractionConfig,
|
|
@@ -592,10 +480,6 @@ mod tests {
|
|
|
592
480
|
use std::io::Write;
|
|
593
481
|
use tempfile::tempdir;
|
|
594
482
|
|
|
595
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
596
|
-
assert_eq!(actual.trim_end_matches('\n'), expected);
|
|
597
|
-
}
|
|
598
|
-
|
|
599
483
|
#[tokio::test]
|
|
600
484
|
async fn test_extract_file_basic() {
|
|
601
485
|
let dir = tempdir().unwrap();
|
|
@@ -608,7 +492,7 @@ mod tests {
|
|
|
608
492
|
|
|
609
493
|
assert!(result.is_ok());
|
|
610
494
|
let result = result.unwrap();
|
|
611
|
-
|
|
495
|
+
assert_eq!(result.content, "Hello, world!");
|
|
612
496
|
assert_eq!(result.mime_type, "text/plain");
|
|
613
497
|
}
|
|
614
498
|
|
|
@@ -641,7 +525,7 @@ mod tests {
|
|
|
641
525
|
|
|
642
526
|
assert!(result.is_ok());
|
|
643
527
|
let result = result.unwrap();
|
|
644
|
-
|
|
528
|
+
assert_eq!(result.content, "test content");
|
|
645
529
|
assert_eq!(result.mime_type, "text/plain");
|
|
646
530
|
}
|
|
647
531
|
|
|
@@ -669,8 +553,8 @@ mod tests {
|
|
|
669
553
|
assert!(results.is_ok());
|
|
670
554
|
let results = results.unwrap();
|
|
671
555
|
assert_eq!(results.len(), 2);
|
|
672
|
-
|
|
673
|
-
|
|
556
|
+
assert_eq!(results[0].content, "content 1");
|
|
557
|
+
assert_eq!(results[1].content, "content 2");
|
|
674
558
|
}
|
|
675
559
|
|
|
676
560
|
#[tokio::test]
|
|
@@ -695,8 +579,8 @@ mod tests {
|
|
|
695
579
|
assert!(results.is_ok());
|
|
696
580
|
let results = results.unwrap();
|
|
697
581
|
assert_eq!(results.len(), 2);
|
|
698
|
-
|
|
699
|
-
|
|
582
|
+
assert_eq!(results[0].content, "content 1");
|
|
583
|
+
assert_eq!(results[1].content, "content 2");
|
|
700
584
|
}
|
|
701
585
|
|
|
702
586
|
#[test]
|
|
@@ -709,8 +593,7 @@ mod tests {
|
|
|
709
593
|
|
|
710
594
|
let result = extract_file_sync(&file_path, None, &config);
|
|
711
595
|
assert!(result.is_ok());
|
|
712
|
-
|
|
713
|
-
assert_text_content(&result.content, "sync test");
|
|
596
|
+
assert_eq!(result.unwrap().content, "sync test");
|
|
714
597
|
|
|
715
598
|
let result = extract_bytes_sync(b"test", "text/plain", &config);
|
|
716
599
|
assert!(result.is_ok());
|
|
@@ -722,14 +605,12 @@ mod tests {
|
|
|
722
605
|
|
|
723
606
|
let result1 = extract_bytes(b"test 1", "text/plain", &config).await;
|
|
724
607
|
assert!(result1.is_ok());
|
|
725
|
-
let result1 = result1.unwrap();
|
|
726
608
|
|
|
727
609
|
let result2 = extract_bytes(b"test 2", "text/plain", &config).await;
|
|
728
610
|
assert!(result2.is_ok());
|
|
729
|
-
let result2 = result2.unwrap();
|
|
730
611
|
|
|
731
|
-
|
|
732
|
-
|
|
612
|
+
assert_eq!(result1.unwrap().content, "test 1");
|
|
613
|
+
assert_eq!(result2.unwrap().content, "test 2");
|
|
733
614
|
|
|
734
615
|
let result3 = extract_bytes(b"# test 3", "text/markdown", &config).await;
|
|
735
616
|
assert!(result3.is_ok());
|
|
@@ -795,8 +676,7 @@ mod tests {
|
|
|
795
676
|
let result = extract_file(&file_path, None, &config).await;
|
|
796
677
|
|
|
797
678
|
assert!(result.is_ok());
|
|
798
|
-
|
|
799
|
-
assert_text_content(&result.content, "content");
|
|
679
|
+
assert_eq!(result.unwrap().content, "content");
|
|
800
680
|
}
|
|
801
681
|
|
|
802
682
|
#[tokio::test]
|
|
@@ -836,7 +716,7 @@ mod tests {
|
|
|
836
716
|
assert!(results.is_ok());
|
|
837
717
|
let results = results.unwrap();
|
|
838
718
|
assert_eq!(results.len(), 2);
|
|
839
|
-
|
|
719
|
+
assert_eq!(results[0].content, "valid content");
|
|
840
720
|
assert!(results[1].metadata.error.is_some());
|
|
841
721
|
}
|
|
842
722
|
|
|
@@ -853,9 +733,9 @@ mod tests {
|
|
|
853
733
|
assert!(results.is_ok());
|
|
854
734
|
let results = results.unwrap();
|
|
855
735
|
assert_eq!(results.len(), 3);
|
|
856
|
-
|
|
736
|
+
assert_eq!(results[0].content, "valid 1");
|
|
857
737
|
assert!(results[1].metadata.error.is_some());
|
|
858
|
-
|
|
738
|
+
assert_eq!(results[2].content, "valid 2");
|
|
859
739
|
}
|
|
860
740
|
|
|
861
741
|
#[tokio::test]
|
|
@@ -882,8 +762,7 @@ mod tests {
|
|
|
882
762
|
|
|
883
763
|
assert!(result.is_ok());
|
|
884
764
|
let result = result.unwrap();
|
|
885
|
-
|
|
886
|
-
assert_eq!(trimmed_len, 10_000_000);
|
|
765
|
+
assert_eq!(result.content.len(), 10_000_000);
|
|
887
766
|
}
|
|
888
767
|
|
|
889
768
|
#[tokio::test]
|
|
@@ -908,7 +787,7 @@ mod tests {
|
|
|
908
787
|
assert_eq!(results.len(), 100);
|
|
909
788
|
|
|
910
789
|
for (i, result) in results.iter().enumerate() {
|
|
911
|
-
|
|
790
|
+
assert_eq!(result.content, format!("content {}", i));
|
|
912
791
|
}
|
|
913
792
|
}
|
|
914
793
|
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::{KreuzbergError, Result};
|
|
6
6
|
use std::path::Path;
|
|
7
|
+
use tokio::fs;
|
|
7
8
|
|
|
8
9
|
/// Read a file asynchronously.
|
|
9
10
|
///
|
|
@@ -18,9 +19,8 @@ use std::path::Path;
|
|
|
18
19
|
/// # Errors
|
|
19
20
|
///
|
|
20
21
|
/// Returns `KreuzbergError::Io` for I/O errors (these always bubble up).
|
|
21
|
-
#[cfg(feature = "tokio-runtime")]
|
|
22
22
|
pub async fn read_file_async(path: impl AsRef<Path>) -> Result<Vec<u8>> {
|
|
23
|
-
|
|
23
|
+
fs::read(path.as_ref()).await.map_err(KreuzbergError::Io)
|
|
24
24
|
}
|
|
25
25
|
|
|
26
26
|
/// Read a file synchronously.
|
|
@@ -181,7 +181,6 @@ mod tests {
|
|
|
181
181
|
use std::io::Write;
|
|
182
182
|
use tempfile::tempdir;
|
|
183
183
|
|
|
184
|
-
#[cfg(feature = "tokio-runtime")]
|
|
185
184
|
#[tokio::test]
|
|
186
185
|
async fn test_read_file_async() {
|
|
187
186
|
let dir = tempdir().unwrap();
|
|
@@ -312,7 +311,6 @@ mod tests {
|
|
|
312
311
|
assert!(result.is_err());
|
|
313
312
|
}
|
|
314
313
|
|
|
315
|
-
#[cfg(feature = "tokio-runtime")]
|
|
316
314
|
#[tokio::test]
|
|
317
315
|
async fn test_read_file_async_io_error() {
|
|
318
316
|
let result = read_file_async("/nonexistent/file.txt").await;
|
|
@@ -152,7 +152,6 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
152
152
|
set.insert("application/x-ipynb+json");
|
|
153
153
|
set.insert("application/x-jats+xml");
|
|
154
154
|
set.insert("application/x-latex");
|
|
155
|
-
set.insert("application/xml+opml");
|
|
156
155
|
set.insert("application/x-opml+xml");
|
|
157
156
|
set.insert("application/x-research-info-systems");
|
|
158
157
|
set.insert("application/x-typst");
|
|
@@ -165,7 +164,6 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
165
164
|
set.insert("text/x-markdown-extra");
|
|
166
165
|
set.insert("text/x-mdoc");
|
|
167
166
|
set.insert("text/x-multimarkdown");
|
|
168
|
-
set.insert("text/x-opml");
|
|
169
167
|
set.insert("text/x-org");
|
|
170
168
|
set.insert("text/x-pod");
|
|
171
169
|
set.insert("text/x-rst");
|
|
@@ -329,35 +327,43 @@ pub fn detect_or_validate(path: Option<&Path>, mime_type: Option<&str>) -> Resul
|
|
|
329
327
|
///
|
|
330
328
|
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
|
|
331
329
|
pub fn detect_mime_type_from_bytes(content: &[u8]) -> Result<String> {
|
|
330
|
+
// Use infer crate for magic byte detection
|
|
332
331
|
if let Some(kind) = infer::get(content) {
|
|
333
332
|
let mime_type = kind.mime_type();
|
|
334
333
|
|
|
334
|
+
// Validate that it's a supported type
|
|
335
335
|
if SUPPORTED_MIME_TYPES.contains(mime_type) || mime_type.starts_with("image/") {
|
|
336
336
|
return Ok(mime_type.to_string());
|
|
337
337
|
}
|
|
338
338
|
}
|
|
339
339
|
|
|
340
|
+
// Try to detect text-based formats
|
|
340
341
|
if let Ok(text) = std::str::from_utf8(content) {
|
|
341
342
|
let trimmed = text.trim_start();
|
|
342
343
|
|
|
344
|
+
// Detect JSON
|
|
343
345
|
if (trimmed.starts_with('{') || trimmed.starts_with('['))
|
|
344
346
|
&& serde_json::from_str::<serde_json::Value>(text).is_ok()
|
|
345
347
|
{
|
|
346
348
|
return Ok(JSON_MIME_TYPE.to_string());
|
|
347
349
|
}
|
|
348
350
|
|
|
351
|
+
// Detect XML
|
|
349
352
|
if trimmed.starts_with("<?xml") || trimmed.starts_with('<') {
|
|
350
353
|
return Ok(XML_MIME_TYPE.to_string());
|
|
351
354
|
}
|
|
352
355
|
|
|
356
|
+
// Detect HTML
|
|
353
357
|
if trimmed.starts_with("<!DOCTYPE html") || trimmed.starts_with("<html") {
|
|
354
358
|
return Ok(HTML_MIME_TYPE.to_string());
|
|
355
359
|
}
|
|
356
360
|
|
|
361
|
+
// Detect PDF header
|
|
357
362
|
if trimmed.starts_with("%PDF") {
|
|
358
363
|
return Ok(PDF_MIME_TYPE.to_string());
|
|
359
364
|
}
|
|
360
365
|
|
|
366
|
+
// Default to plain text for valid UTF-8
|
|
361
367
|
return Ok(PLAIN_TEXT_MIME_TYPE.to_string());
|
|
362
368
|
}
|
|
363
369
|
|
|
@@ -392,21 +398,25 @@ pub fn detect_mime_type_from_bytes(content: &[u8]) -> Result<String> {
|
|
|
392
398
|
pub fn get_extensions_for_mime(mime_type: &str) -> Result<Vec<String>> {
|
|
393
399
|
let mut extensions = Vec::new();
|
|
394
400
|
|
|
401
|
+
// Search through EXT_TO_MIME for matching MIME types
|
|
395
402
|
for (ext, mime) in EXT_TO_MIME.iter() {
|
|
396
403
|
if *mime == mime_type {
|
|
397
404
|
extensions.push(ext.to_string());
|
|
398
405
|
}
|
|
399
406
|
}
|
|
400
407
|
|
|
408
|
+
// If we found extensions, return them
|
|
401
409
|
if !extensions.is_empty() {
|
|
402
410
|
return Ok(extensions);
|
|
403
411
|
}
|
|
404
412
|
|
|
413
|
+
// Try using mime_guess crate for reverse lookup
|
|
405
414
|
let guessed = mime_guess::get_mime_extensions_str(mime_type);
|
|
406
415
|
if let Some(exts) = guessed {
|
|
407
416
|
return Ok(exts.iter().map(|s| s.to_string()).collect());
|
|
408
417
|
}
|
|
409
418
|
|
|
419
|
+
// No extensions found
|
|
410
420
|
Err(KreuzbergError::UnsupportedFormat(format!(
|
|
411
421
|
"No known extensions for MIME type: {}",
|
|
412
422
|
mime_type
|
|
@@ -28,7 +28,6 @@
|
|
|
28
28
|
//! # }
|
|
29
29
|
//! ```
|
|
30
30
|
|
|
31
|
-
#[cfg(feature = "tokio-runtime")]
|
|
32
31
|
pub(crate) mod batch_mode;
|
|
33
32
|
pub mod config;
|
|
34
33
|
pub mod extractor;
|
|
@@ -40,6 +39,4 @@ pub use config::{
|
|
|
40
39
|
ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, PdfConfig,
|
|
41
40
|
TokenReductionConfig,
|
|
42
41
|
};
|
|
43
|
-
|
|
44
|
-
pub use extractor::{batch_extract_bytes, batch_extract_file};
|
|
45
|
-
pub use extractor::{extract_bytes, extract_file};
|
|
42
|
+
pub use extractor::{batch_extract_bytes, batch_extract_file, extract_bytes, extract_file};
|