kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
data/vendor/kreuzberg/src/extraction/pandoc/version.rs (new file):

```diff
@@ -0,0 +1,162 @@
+use crate::error::{KreuzbergError, Result};
+use once_cell::sync::OnceCell;
+use regex::Regex;
+use tokio::process::Command;
+
+static PANDOC_VALIDATED: OnceCell<bool> = OnceCell::new();
+
+/// Validate that Pandoc version 2 or above is installed and available
+pub async fn validate_pandoc_version() -> Result<()> {
+    if PANDOC_VALIDATED.get().is_some() {
+        return Ok(());
+    }
+
+    let output = Command::new("pandoc").arg("--version").output().await.map_err(|e| {
+        KreuzbergError::MissingDependency(format!(
+            "Pandoc version 2 or above is required but not found in PATH: {}",
+            e
+        ))
+    })?;
+
+    if !output.status.success() {
+        return Err(KreuzbergError::MissingDependency(
+            "Pandoc version 2 or above is required but command failed".to_string(),
+        ));
+    }
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+
+    let version = extract_version(&stdout).ok_or_else(|| {
+        KreuzbergError::MissingDependency(format!("Could not parse Pandoc version from output: {}", stdout))
+    })?;
+
+    if version.major < 2 {
+        return Err(KreuzbergError::MissingDependency(format!(
+            "Pandoc version 2 or above is required, found version {}.{}.{}",
+            version.major, version.minor, version.patch
+        )));
+    }
+
+    let _ = PANDOC_VALIDATED.set(true);
+
+    Ok(())
+}
+
+#[derive(Debug, Clone)]
+struct Version {
+    major: u32,
+    minor: u32,
+    patch: u32,
+}
+
+fn extract_version(output: &str) -> Option<Version> {
+    let patterns = [
+        r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?",
+        r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)",
+        r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?",
+        r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?",
+        r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)",
+    ];
+
+    for pattern in &patterns {
+        if let Ok(re) = Regex::new(pattern)
+            && let Some(caps) = re.captures(output)
+        {
+            let major = caps.get(1)?.as_str().parse().ok()?;
+            let minor = caps.get(2)?.as_str().parse().ok()?;
+            let patch = caps.get(3).and_then(|m| m.as_str().parse().ok()).unwrap_or(0);
+
+            return Some(Version { major, minor, patch });
+        }
+    }
+
+    for line in output.lines() {
+        for token in line.split_whitespace() {
+            if let Some(version) = parse_version_token(token) {
+                return Some(version);
+            }
+        }
+    }
+
+    None
+}
+
+fn parse_version_token(token: &str) -> Option<Version> {
+    let parts: Vec<&str> = token.split('.').collect();
+    if parts.len() >= 2
+        && let (Ok(major), Ok(minor)) = (parts[0].parse(), parts[1].parse())
+    {
+        let patch = parts.get(2).and_then(|p| p.parse().ok()).unwrap_or(0);
+        return Some(Version { major, minor, patch });
+    }
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_version_standard_format() {
+        let output = "pandoc 3.1.2";
+        let version = extract_version(output).unwrap();
+        assert_eq!(version.major, 3);
+        assert_eq!(version.minor, 1);
+        assert_eq!(version.patch, 2);
+    }
+
+    #[test]
+    fn test_extract_version_with_parens() {
+        let output = "pandoc (version 2.19.2)";
+        let version = extract_version(output).unwrap();
+        assert_eq!(version.major, 2);
+        assert_eq!(version.minor, 19);
+    }
+
+    #[test]
+    fn test_extract_version_with_exe() {
+        let output = "pandoc.exe 3.0";
+        let version = extract_version(output).unwrap();
+        assert_eq!(version.major, 3);
+        assert_eq!(version.minor, 0);
+    }
+
+    #[test]
+    fn test_extract_version_multiline() {
+        let output = "pandoc 3.1.2\nCopyright (C) 2006-2023 John MacFarlane";
+        let version = extract_version(output).unwrap();
+        assert_eq!(version.major, 3);
+        assert_eq!(version.minor, 1);
+    }
+
+    #[test]
+    fn test_extract_version_no_patch() {
+        let output = "pandoc 2.5";
+        let version = extract_version(output).unwrap();
+        assert_eq!(version.major, 2);
+        assert_eq!(version.minor, 5);
+        assert_eq!(version.patch, 0);
+    }
+
+    #[test]
+    fn test_parse_version_token() {
+        let version = parse_version_token("2.19.2").unwrap();
+        assert_eq!(version.major, 2);
+        assert_eq!(version.minor, 19);
+        assert_eq!(version.patch, 2);
+    }
+
+    #[test]
+    fn test_parse_version_token_no_patch() {
+        let version = parse_version_token("3.1").unwrap();
+        assert_eq!(version.major, 3);
+        assert_eq!(version.minor, 1);
+        assert_eq!(version.patch, 0);
+    }
+
+    #[test]
+    fn test_parse_version_token_invalid() {
+        let version = parse_version_token("abc");
+        assert!(version.is_none());
+    }
+}
```
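The new module above (the 162-line addition, matching data/vendor/kreuzberg/src/extraction/pandoc/version.rs in the file list) probes `pandoc --version` once, caches the outcome in a `OnceCell`, and rejects anything older than Pandoc 2. Below is a minimal standalone sketch of the same probe using only `std::process::Command` and none of the crate's types; the function name is illustrative and not part of the gem:

```rust
// Standalone sketch: run `pandoc --version`, take the first numeric token's
// leading component as the major version, and require at least 2.
// The vendored module uses tokio + regex instead; this is an assumption-free
// approximation for illustration only.
use std::process::Command;

fn pandoc_major_version() -> Result<u32, String> {
    let output = Command::new("pandoc")
        .arg("--version")
        .output()
        .map_err(|e| format!("pandoc not found in PATH: {e}"))?;
    if !output.status.success() {
        return Err("pandoc --version exited with an error".to_string());
    }
    let stdout = String::from_utf8_lossy(&output.stdout);
    // First line is typically "pandoc 3.1.2"; scan tokens until one parses.
    stdout
        .split_whitespace()
        .find_map(|tok| tok.split('.').next()?.parse::<u32>().ok())
        .ok_or_else(|| format!("could not parse a version from: {stdout}"))
}

fn main() {
    match pandoc_major_version() {
        Ok(major) if major >= 2 => println!("pandoc {major}.x is usable"),
        Ok(major) => eprintln!("pandoc {major}.x is too old, need >= 2"),
        Err(e) => eprintln!("{e}"),
    }
}
```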
data/vendor/kreuzberg/src/extractors/archive.rs:

```diff
@@ -126,13 +126,6 @@ impl Plugin for ZipExtractor {
 
 #[async_trait]
 impl DocumentExtractor for ZipExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
@@ -204,13 +197,6 @@ impl Plugin for TarExtractor {
 
 #[async_trait]
 impl DocumentExtractor for TarExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
@@ -287,13 +273,6 @@ impl Plugin for SevenZExtractor {
 
 #[async_trait]
 impl DocumentExtractor for SevenZExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
```
data/vendor/kreuzberg/src/extractors/docx.rs:

```diff
@@ -4,7 +4,7 @@
 
 use crate::Result;
 use crate::core::config::ExtractionConfig;
-use crate::extraction::
+use crate::extraction::office_metadata;
 use crate::plugins::{DocumentExtractor, Plugin};
 use crate::types::{ExtractionResult, Metadata, Table};
 use async_trait::async_trait;
@@ -15,6 +15,7 @@ use std::io::Cursor;
 /// This extractor provides:
 /// - Fast text extraction via streaming XML parsing (~160 MB/s average)
 /// - Comprehensive metadata extraction (core.xml, app.xml, custom.xml)
+/// - ~400x faster than Pandoc subprocess approach
 pub struct DocxExtractor;
 
 impl DocxExtractor {
@@ -65,6 +66,7 @@ impl Plugin for DocxExtractor {
 /// # Returns
 /// * `Table` - Converted table with cells and markdown representation
 fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize) -> Table {
+    // Extract cells as 2D vector
     let cells: Vec<Vec<String>> = docx_table
         .rows
         .iter()
@@ -72,6 +74,7 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
             row.cells
                 .iter()
                 .map(|cell| {
+                    // Extract text from all paragraphs in the cell
                     cell.paragraphs
                         .iter()
                         .map(|para| para.to_text())
@@ -84,12 +87,13 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
         })
         .collect();
 
+    // Generate markdown representation
     let markdown = cells_to_markdown(&cells);
 
     Table {
         cells,
         markdown,
-        page_number: table_index + 1,
+        page_number: table_index + 1, // 1-indexed
     }
 }
 
@@ -100,33 +104,82 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
 ///
 /// # Returns
 /// * `String` - Markdown formatted table
+fn cells_to_markdown(cells: &[Vec<String>]) -> String {
+    if cells.is_empty() {
+        return String::new();
+    }
+
+    let mut markdown = String::new();
+
+    // Determine number of columns from first row
+    let num_cols = cells.first().map(|r| r.len()).unwrap_or(0);
+    if num_cols == 0 {
+        return String::new();
+    }
+
+    // Header row (first row)
+    if let Some(header) = cells.first() {
+        markdown.push_str("| ");
+        for cell in header {
+            // Escape pipe characters in cell content
+            let escaped = cell.replace('|', "\\|");
+            markdown.push_str(&escaped);
+            markdown.push_str(" | ");
+        }
+        markdown.push('\n');
+
+        // Separator row
+        markdown.push('|');
+        for _ in 0..num_cols {
+            markdown.push_str("------|");
+        }
+        markdown.push('\n');
+    }
+
+    // Data rows (skip first row as it's the header)
+    for row in cells.iter().skip(1) {
+        markdown.push_str("| ");
+        for (idx, cell) in row.iter().enumerate() {
+            if idx >= num_cols {
+                break; // Handle irregular tables
+            }
+            // Escape pipe characters in cell content
+            let escaped = cell.replace('|', "\\|");
+            markdown.push_str(&escaped);
+            markdown.push_str(" | ");
+        }
+        // Pad with empty cells if row is shorter than expected
+        for _ in row.len()..num_cols {
+            markdown.push_str(" | ");
+        }
+        markdown.push('\n');
+    }
+
+    markdown
+}
 
 #[async_trait]
 impl DocumentExtractor for DocxExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
         mime_type: &str,
         _config: &ExtractionConfig,
     ) -> Result<ExtractionResult> {
+        // Parse the DOCX document to extract both text and tables
         let (text, tables) = if crate::core::batch_mode::is_batch_mode() {
+            // Batch mode: Use spawn_blocking for parallelism
             let content_owned = content.to_vec();
-            let span = tracing::Span::current();
             tokio::task::spawn_blocking(move || -> crate::error::Result<(String, Vec<Table>)> {
-
+                // Parse document structure
                 let cursor = Cursor::new(&content_owned);
                 let doc = docx_lite::parse_document(cursor)
                     .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
 
+                // Extract text
                 let text = doc.extract_text();
 
+                // Extract tables
                 let tables: Vec<Table> = doc
                     .tables
                     .iter()
@@ -139,12 +192,15 @@ impl DocumentExtractor for DocxExtractor {
             .await
             .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
         } else {
+            // Single-file mode: Direct extraction (no spawn overhead)
             let cursor = Cursor::new(content);
             let doc = docx_lite::parse_document(cursor)
                 .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
 
+            // Extract text
             let text = doc.extract_text();
 
+            // Extract tables
             let tables: Vec<Table> = doc
                 .tables
                 .iter()
```
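The hunks above switch the DOCX extractor to branch on a batch-mode flag: in batch mode the CPU-bound `docx_lite` parse is pushed onto Tokio's blocking pool (at the cost of copying the input buffer) so many documents can be decoded in parallel, while single-file calls parse inline and skip the spawn. A self-contained sketch of that pattern, assuming a Tokio runtime; `parse_sync` and `extract` are stand-ins for illustration, not crate APIs:

```rust
// Sketch of the batch-mode branching used above. `parse_sync` stands in for
// the real CPU-bound parser and is not part of the crate's API.
fn parse_sync(bytes: &[u8]) -> Result<usize, String> {
    // Pretend "parsing" is just measuring the payload.
    Ok(bytes.len())
}

async fn extract(bytes: &[u8], batch_mode: bool) -> Result<usize, String> {
    if batch_mode {
        // Batch mode: copy the buffer and parse on the blocking pool so other
        // extractions can keep running on the async workers.
        let owned = bytes.to_vec();
        tokio::task::spawn_blocking(move || parse_sync(&owned))
            .await
            .map_err(|e| format!("join error: {e}"))?
    } else {
        // Single-file mode: parse inline, avoiding the spawn and the copy.
        parse_sync(bytes)
    }
}

#[tokio::main]
async fn main() {
    let data = vec![0u8; 1024];
    println!("{:?}", extract(&data, true).await);
    println!("{:?}", extract(&data, false).await);
}
```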
data/vendor/kreuzberg/src/extractors/docx.rs (continued):

```diff
@@ -155,11 +211,11 @@ impl DocumentExtractor for DocxExtractor {
             (text, tables)
         };
 
+        // Extract metadata using existing office_metadata module
         let mut archive = if crate::core::batch_mode::is_batch_mode() {
+            // Batch mode: Use spawn_blocking for parallelism
             let content_owned = content.to_vec();
-            let span = tracing::Span::current();
             tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
-                let _guard = span.entered();
                 let cursor = Cursor::new(content_owned);
                 zip::ZipArchive::new(cursor)
                     .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))
@@ -167,6 +223,8 @@ impl DocumentExtractor for DocxExtractor {
             .await
             .map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
         } else {
+            // Single-file mode: Direct extraction (no spawn overhead)
+            // Note: We still need to clone for ZipArchive type consistency with batch mode
             let content_owned = content.to_vec();
             let cursor = Cursor::new(content_owned);
             zip::ZipArchive::new(cursor)
@@ -175,6 +233,7 @@ impl DocumentExtractor for DocxExtractor {
 
         let mut metadata_map = std::collections::HashMap::new();
 
+        // Extract core properties (title, creator, dates, keywords, etc.)
         if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
             if let Some(title) = core.title {
                 metadata_map.insert("title".to_string(), serde_json::Value::String(title));
@@ -218,6 +277,7 @@ impl DocumentExtractor for DocxExtractor {
             }
         }
 
+        // Extract app properties (page count, word count, etc.)
         if let Ok(app) = office_metadata::extract_docx_app_properties(&mut archive) {
             if let Some(pages) = app.pages {
                 metadata_map.insert("page_count".to_string(), serde_json::Value::Number(pages.into()));
@@ -254,6 +314,7 @@ impl DocumentExtractor for DocxExtractor {
             }
         }
 
+        // Extract custom properties
         if let Ok(custom) = office_metadata::extract_custom_properties(&mut archive) {
             for (key, value) in custom {
                 metadata_map.insert(format!("custom_{}", key), value);
@@ -279,7 +340,7 @@ impl DocumentExtractor for DocxExtractor {
     }
 
     fn priority(&self) -> i32 {
-        50
+        50 // Higher priority than Pandoc (40) to take precedence
     }
 }
 
@@ -319,12 +380,61 @@ mod tests {
         assert!(extractor.shutdown().is_ok());
     }
 
+    #[test]
+    fn test_cells_to_markdown_basic_table() {
+        let cells = vec![
+            vec!["Header1".to_string(), "Header2".to_string()],
+            vec!["Row1Col1".to_string(), "Row1Col2".to_string()],
+            vec!["Row2Col1".to_string(), "Row2Col2".to_string()],
+        ];
+
+        let markdown = cells_to_markdown(&cells);
+
+        assert!(markdown.contains("| Header1 | Header2 |"));
+        assert!(markdown.contains("|------|------|"));
+        assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
+        assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
+    }
+
+    #[test]
+    fn test_cells_to_markdown_empty() {
+        let cells: Vec<Vec<String>> = vec![];
+        let markdown = cells_to_markdown(&cells);
+        assert_eq!(markdown, "");
+    }
+
+    #[test]
+    fn test_cells_to_markdown_escape_pipes() {
+        let cells = vec![vec!["Header".to_string()], vec!["Cell with | pipe".to_string()]];
+
+        let markdown = cells_to_markdown(&cells);
+        assert!(markdown.contains("Cell with \\| pipe"));
+    }
+
+    #[test]
+    fn test_cells_to_markdown_irregular_rows() {
+        let cells = vec![
+            vec!["H1".to_string(), "H2".to_string(), "H3".to_string()],
+            vec!["R1C1".to_string(), "R1C2".to_string()], // Missing third column
+            vec!["R2C1".to_string(), "R2C2".to_string(), "R2C3".to_string()],
+        ];
+
+        let markdown = cells_to_markdown(&cells);
+
+        // Should have 3 columns in header
+        assert!(markdown.contains("| H1 | H2 | H3 |"));
+        // Should pad short rows
+        assert!(markdown.contains("| R1C1 | R1C2 | |"));
+    }
+
     #[test]
     fn test_convert_docx_table_to_table() {
         use docx_lite::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
 
+        // Create a simple docx-lite table
         let mut table = DocxTable::new();
 
+        // Header row
         let mut header_row = TableRow::default();
         let mut cell1 = TableCell::default();
         let mut para1 = Paragraph::new();
@@ -340,6 +450,7 @@ mod tests {
 
         table.rows.push(header_row);
 
+        // Data row
         let mut data_row = TableRow::default();
         let mut cell3 = TableCell::default();
         let mut para3 = Paragraph::new();
@@ -355,10 +466,11 @@ mod tests {
 
         table.rows.push(data_row);
 
+        // Convert to Kreuzberg Table
        let result = convert_docx_table_to_table(&table, 0);
 
-        assert_eq!(result.page_number, 1);
-        assert_eq!(result.cells.len(), 2);
+        assert_eq!(result.page_number, 1); // 0 + 1 = 1 (1-indexed)
+        assert_eq!(result.cells.len(), 2); // 2 rows
         assert_eq!(result.cells[0], vec!["Name", "Age"]);
         assert_eq!(result.cells[1], vec!["Alice", "30"]);
         assert!(result.markdown.contains("| Name | Age |"));
```
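The `cells_to_markdown` helper added above renders the first row as a header, emits a `|------|` separator, escapes literal pipes as `\|`, and pads short rows, which is exactly what the new tests assert with `contains()`. A condensed standalone restatement of that output shape, for illustration only (not the vendored function):

```rust
// Illustrative restatement of the table-to-markdown shape checked by the
// tests above; not the vendored cells_to_markdown. Each row keeps a trailing
// space, which is why the tests use contains() rather than exact equality.
fn table_markdown(cells: &[Vec<String>]) -> String {
    let Some(header) = cells.first() else { return String::new() };
    let cols = header.len();
    let render_row = |row: &[String]| {
        let mut line = String::from("| ");
        for i in 0..cols {
            let cell = row.get(i).map(String::as_str).unwrap_or("");
            line.push_str(&cell.replace('|', "\\|")); // escape literal pipes
            line.push_str(" | ");
        }
        line.push('\n');
        line
    };
    let mut out = render_row(header.as_slice());
    out.push('|');
    out.push_str(&"------|".repeat(cols)); // separator row
    out.push('\n');
    for row in &cells[1..] {
        out.push_str(&render_row(row.as_slice())); // missing cells render as empty
    }
    out
}

fn main() {
    let cells = vec![
        vec!["Name".to_string(), "Age".to_string()],
        vec!["Alice".to_string(), "30".to_string()],
    ];
    let md = table_markdown(&cells);
    assert!(md.contains("| Name | Age |"));
    assert!(md.contains("|------|------|"));
    assert!(md.contains("| Alice | 30 |"));
    print!("{md}");
}
```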
data/vendor/kreuzberg/src/extractors/email.rs:

```diff
@@ -44,13 +44,6 @@ impl Plugin for EmailExtractor {
 
 #[async_trait]
 impl DocumentExtractor for EmailExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
@@ -99,13 +92,6 @@ impl DocumentExtractor for EmailExtractor {
         })
     }
 
-    #[cfg(feature = "tokio-runtime")]
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, path, config),
-        fields(
-            extractor.name = self.name(),
-        )
-    ))]
     async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
         let bytes = tokio::fs::read(path).await?;
         self.extract_bytes(&bytes, mime_type, config).await
```
data/vendor/kreuzberg/src/extractors/excel.rs:

```diff
@@ -31,34 +31,46 @@ impl ExcelExtractor {
         let mut tables = Vec::with_capacity(workbook.sheets.len());
 
         for (sheet_index, sheet) in workbook.sheets.iter().enumerate() {
+            // Skip empty sheets
             if sheet.row_count == 0 || sheet.col_count == 0 {
                 continue;
             }
 
+            // We need to re-parse the sheet to get structured cell data
+            // The workbook.sheets only contains markdown, not raw cell data
+            // So we'll extract from the markdown table representation
+
+            // Parse cells from markdown
             let lines: Vec<&str> = sheet.markdown.lines().collect();
             let mut cells: Vec<Vec<String>> = Vec::new();
 
+            // Find the table content (skip header line "## Sheet Name" and blank line)
             let table_start = lines.iter().position(|line| line.starts_with("| "));
 
             if let Some(start_idx) = table_start {
                 for line in lines.iter().skip(start_idx) {
                     if line.starts_with("| ") && !line.contains("---") {
+                        // Parse table row
                         let row: Vec<String> = line
                             .trim_start_matches("| ")
                             .trim_end_matches(" |")
                             .split(" | ")
-                            .map(|cell|
+                            .map(|cell| {
+                                // Unescape markdown pipes and backslashes
+                                cell.replace("\\|", "|").replace("\\\\", "\\")
+                            })
                             .collect();
                         cells.push(row);
                     }
                 }
             }
 
+            // Only create table if we have data
             if !cells.is_empty() {
                 tables.push(Table {
                     cells,
                     markdown: sheet.markdown.clone(),
-                    page_number: sheet_index + 1,
+                    page_number: sheet_index + 1, // 1-indexed
                 });
             }
         }
```
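Because `ExcelSheet` carries only the rendered markdown, the hunk above rebuilds structured cells by splitting each `| a | b |` row and reversing the `\|` and `\\` escaping. A small standalone sketch of that row round trip; `parse_markdown_row` is illustrative, not the crate's code:

```rust
// Sketch of the markdown-row round trip used above: split a rendered
// "| a | b |" line back into cells and undo the pipe/backslash escaping.
fn parse_markdown_row(line: &str) -> Vec<String> {
    line.trim_start_matches("| ")
        .trim_end_matches(" |")
        .split(" | ")
        .map(|cell| cell.replace("\\|", "|").replace("\\\\", "\\"))
        .collect()
}

fn main() {
    let row = r"| Alice | 30 | a \| b |";
    // The escaped pipe comes back as a literal "|" inside the third cell.
    assert_eq!(parse_markdown_row(row), vec!["Alice", "30", "a | b"]);
    println!("{:?}", parse_markdown_row(row));
}
```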
data/vendor/kreuzberg/src/extractors/excel.rs (continued):

```diff
@@ -87,13 +99,6 @@ impl Plugin for ExcelExtractor {
 
 #[async_trait]
 impl DocumentExtractor for ExcelExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
@@ -112,17 +117,18 @@ impl DocumentExtractor for ExcelExtractor {
             _ => ".xlsx",
         };
 
+        // Extract workbook
         let workbook = if crate::core::batch_mode::is_batch_mode() {
+            // Batch mode: Use spawn_blocking for parallelism
             let content_owned = content.to_vec();
             let extension_owned = extension.to_string();
-            let span = tracing::Span::current();
             tokio::task::spawn_blocking(move || {
-                let _guard = span.entered();
                 crate::extraction::excel::read_excel_bytes(&content_owned, &extension_owned)
             })
             .await
             .map_err(|e| crate::error::KreuzbergError::parsing(format!("Excel extraction task failed: {}", e)))??
         } else {
+            // Single-file mode: Direct extraction (no spawn overhead)
             crate::extraction::excel::read_excel_bytes(content, extension)?
         };
 
@@ -157,12 +163,6 @@ impl DocumentExtractor for ExcelExtractor {
         })
     }
 
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, path, _config),
-        fields(
-            extractor.name = self.name(),
-        )
-    ))]
     async fn extract_file(&self, path: &Path, mime_type: &str, _config: &ExtractionConfig) -> Result<ExtractionResult> {
         let path_str = path
             .to_str()
@@ -244,6 +244,7 @@ mod tests {
         use crate::types::ExcelSheet;
         use std::collections::HashMap;
 
+        // Create a mock workbook with a single sheet
        let sheet = ExcelSheet {
            name: "TestSheet".to_string(),
            markdown: r#"## TestSheet
@@ -268,7 +269,7 @@
 
         assert_eq!(tables.len(), 1);
         assert_eq!(tables[0].page_number, 1);
-        assert_eq!(tables[0].cells.len(), 3);
+        assert_eq!(tables[0].cells.len(), 3); // Header + 2 data rows
         assert_eq!(tables[0].cells[0], vec!["Name", "Age", "City"]);
         assert_eq!(tables[0].cells[1], vec!["Alice", "30", "NYC"]);
         assert_eq!(tables[0].cells[2], vec!["Bob", "25", "LA"]);
@@ -293,7 +294,7 @@
         };
 
         let tables = ExcelExtractor::sheets_to_tables(&workbook);
-        assert_eq!(tables.len(), 0);
+        assert_eq!(tables.len(), 0); // Empty sheets should not create tables
     }
 
     #[test]
     fn test_
```