kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
//! Pandoc-based extractors for various document formats.
|
|
2
|
+
//!
|
|
3
|
+
//! Supports: DOCX, ODT, EPUB, LaTeX, RST, RTF, and many more formats via Pandoc.
|
|
4
|
+
|
|
5
|
+
use crate::Result;
|
|
6
|
+
use crate::core::config::ExtractionConfig;
|
|
7
|
+
use crate::extraction::pandoc::extract_bytes_from_mime;
|
|
8
|
+
use crate::plugins::{DocumentExtractor, Plugin};
|
|
9
|
+
use crate::types::{ExtractionResult, Metadata};
|
|
10
|
+
use async_trait::async_trait;
|
|
11
|
+
|
|
12
|
+
/// Generic Pandoc extractor for all Pandoc-supported formats.
///
/// This extractor handles all document formats supported by Pandoc, including:
/// - Microsoft Word (DOCX)
/// - OpenDocument Text (ODT)
/// - EPUB
/// - LaTeX
/// - reStructuredText (RST)
/// - RTF
/// - And many more
///
/// The type is a field-less unit struct, so every value is interchangeable.
/// It derives `Debug` so it can appear in diagnostics, and `Clone`/`Copy`
/// because duplicating a value with no fields is free.
#[derive(Debug, Clone, Copy)]
pub struct PandocExtractor;
|
|
23
|
+
|
|
24
|
+
impl PandocExtractor {
|
|
25
|
+
/// Create a new Pandoc extractor.
|
|
26
|
+
pub fn new() -> Self {
|
|
27
|
+
Self
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
impl Default for PandocExtractor {
|
|
32
|
+
fn default() -> Self {
|
|
33
|
+
Self::new()
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
impl Plugin for PandocExtractor {
|
|
38
|
+
fn name(&self) -> &str {
|
|
39
|
+
"pandoc-extractor"
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
fn version(&self) -> String {
|
|
43
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
fn initialize(&self) -> Result<()> {
|
|
47
|
+
Ok(())
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
fn shutdown(&self) -> Result<()> {
|
|
51
|
+
Ok(())
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
fn description(&self) -> &str {
|
|
55
|
+
"Extracts content from Pandoc-supported formats (DOCX, ODT, EPUB, LaTeX, RST, RTF, etc.)"
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
fn author(&self) -> &str {
|
|
59
|
+
"Kreuzberg Team"
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
#[async_trait]
|
|
64
|
+
impl DocumentExtractor for PandocExtractor {
|
|
65
|
+
async fn extract_bytes(
|
|
66
|
+
&self,
|
|
67
|
+
content: &[u8],
|
|
68
|
+
mime_type: &str,
|
|
69
|
+
_config: &ExtractionConfig,
|
|
70
|
+
) -> Result<ExtractionResult> {
|
|
71
|
+
let pandoc_result = extract_bytes_from_mime(content, mime_type).await?;
|
|
72
|
+
|
|
73
|
+
let mut additional = std::collections::HashMap::new();
|
|
74
|
+
for (key, value) in pandoc_result.metadata {
|
|
75
|
+
additional.insert(key, value);
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
Ok(ExtractionResult {
|
|
79
|
+
content: pandoc_result.content,
|
|
80
|
+
mime_type: mime_type.to_string(),
|
|
81
|
+
metadata: Metadata {
|
|
82
|
+
additional,
|
|
83
|
+
..Default::default()
|
|
84
|
+
},
|
|
85
|
+
tables: vec![],
|
|
86
|
+
detected_languages: None,
|
|
87
|
+
chunks: None,
|
|
88
|
+
images: None,
|
|
89
|
+
})
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
fn supported_mime_types(&self) -> &[&str] {
|
|
93
|
+
&[
|
|
94
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
95
|
+
"application/vnd.oasis.opendocument.text",
|
|
96
|
+
"application/epub+zip",
|
|
97
|
+
"application/x-latex",
|
|
98
|
+
"text/x-tex",
|
|
99
|
+
"text/x-rst",
|
|
100
|
+
"text/prs.fallenstein.rst",
|
|
101
|
+
"application/rtf",
|
|
102
|
+
"text/rtf",
|
|
103
|
+
"application/x-typst",
|
|
104
|
+
"application/x-ipynb+json",
|
|
105
|
+
"application/x-fictionbook+xml",
|
|
106
|
+
"text/x-org",
|
|
107
|
+
"text/x-commonmark",
|
|
108
|
+
"text/x-gfm",
|
|
109
|
+
"text/x-multimarkdown",
|
|
110
|
+
"text/x-markdown-extra",
|
|
111
|
+
"application/docbook+xml",
|
|
112
|
+
"application/x-jats+xml",
|
|
113
|
+
"application/x-opml+xml",
|
|
114
|
+
]
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
fn priority(&self) -> i32 {
|
|
118
|
+
40
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::extraction::pandoc::validate_pandoc_version;

    // The checks below that only inspect static plugin data never await anything,
    // so they are plain `#[test]` functions rather than `#[tokio::test]`.

    /// The plugin reports the expected name, version and priority, and a
    /// non-empty MIME type list.
    #[test]
    fn test_pandoc_extractor_plugin_interface() {
        let extractor = PandocExtractor::new();
        assert_eq!(extractor.name(), "pandoc-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 40);
        assert!(!extractor.supported_mime_types().is_empty());
    }

    #[test]
    fn test_pandoc_extractor_supports_docx() {
        let extractor = PandocExtractor::new();
        assert!(
            extractor
                .supported_mime_types()
                .contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document")
        );
    }

    #[test]
    fn test_pandoc_extractor_supports_odt() {
        let extractor = PandocExtractor::new();
        assert!(
            extractor
                .supported_mime_types()
                .contains(&"application/vnd.oasis.opendocument.text")
        );
    }

    #[test]
    fn test_pandoc_extractor_supports_epub() {
        let extractor = PandocExtractor::new();
        assert!(extractor.supported_mime_types().contains(&"application/epub+zip"));
    }

    #[test]
    fn test_pandoc_extractor_supports_latex() {
        let extractor = PandocExtractor::new();
        assert!(extractor.supported_mime_types().contains(&"application/x-latex"));
    }

    #[test]
    fn test_pandoc_extractor_supports_rst() {
        let extractor = PandocExtractor::new();
        assert!(extractor.supported_mime_types().contains(&"text/x-rst"));
    }

    /// Markdown input is submitted under a Markdown MIME type (previously it was
    /// sent as `text/x-rst`, so the test did not exercise what its name says).
    #[tokio::test]
    async fn test_pandoc_extractor_markdown() {
        // Nothing to exercise when the Pandoc version check fails.
        if validate_pandoc_version().await.is_err() {
            return;
        }

        let extractor = PandocExtractor::new();
        let markdown = b"# Hello World\n\nThis is a test.";
        let config = ExtractionConfig::default();
        let mime_type = "text/x-commonmark";

        // Conversion stays best-effort, as before: a Pandoc failure does not fail
        // the test. A successful conversion must echo the requested MIME type.
        if let Ok(extraction) = extractor.extract_bytes(markdown, mime_type, &config).await {
            assert_eq!(extraction.mime_type, mime_type);
        }
    }

    /// `Default::default()` yields a working extractor (previously the test built
    /// the unit struct directly and never called `default`).
    #[test]
    fn test_pandoc_extractor_default() {
        let extractor = PandocExtractor::default();
        assert_eq!(extractor.name(), "pandoc-extractor");
    }

    #[test]
    fn test_pandoc_extractor_initialize_shutdown() {
        let extractor = PandocExtractor::new();
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }
}
|
|
@@ -147,25 +147,31 @@ fn extract_tables_from_document(
|
|
|
147
147
|
|
|
148
148
|
let mut all_tables = Vec::new();
|
|
149
149
|
|
|
150
|
+
// Process each page
|
|
150
151
|
for (page_index, page) in document.pages().iter().enumerate() {
|
|
151
|
-
|
|
152
|
+
// Extract words with positions from the page
|
|
153
|
+
let words = extract_words_from_page(&page, 0.0)?; // Use 0.0 confidence for PDF (always high quality)
|
|
152
154
|
|
|
153
155
|
if words.is_empty() {
|
|
154
156
|
continue;
|
|
155
157
|
}
|
|
156
158
|
|
|
159
|
+
// Use existing table reconstruction logic
|
|
160
|
+
// These thresholds match the defaults from TesseractConfig
|
|
157
161
|
let column_threshold = 50;
|
|
158
162
|
let row_threshold_ratio = 0.5;
|
|
159
163
|
|
|
164
|
+
// Reconstruct table from positioned words
|
|
160
165
|
let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio, true);
|
|
161
166
|
|
|
162
167
|
if !table_cells.is_empty() {
|
|
168
|
+
// Generate markdown representation
|
|
163
169
|
let markdown = table_to_markdown(&table_cells);
|
|
164
170
|
|
|
165
171
|
all_tables.push(Table {
|
|
166
172
|
cells: table_cells,
|
|
167
173
|
markdown,
|
|
168
|
-
page_number: page_index + 1,
|
|
174
|
+
page_number: page_index + 1, // 1-indexed
|
|
169
175
|
});
|
|
170
176
|
}
|
|
171
177
|
}
|
|
@@ -281,13 +287,6 @@ impl Plugin for PdfExtractor {
|
|
|
281
287
|
|
|
282
288
|
#[async_trait]
|
|
283
289
|
impl DocumentExtractor for PdfExtractor {
|
|
284
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
285
|
-
skip(self, content, config),
|
|
286
|
-
fields(
|
|
287
|
-
extractor.name = self.name(),
|
|
288
|
-
content.size_bytes = content.len(),
|
|
289
|
-
)
|
|
290
|
-
))]
|
|
291
290
|
async fn extract_bytes(
|
|
292
291
|
&self,
|
|
293
292
|
content: &[u8],
|
|
@@ -296,10 +295,9 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
296
295
|
) -> Result<ExtractionResult> {
|
|
297
296
|
#[cfg(feature = "pdf")]
|
|
298
297
|
let (pdf_metadata, native_text, tables) = if crate::core::batch_mode::is_batch_mode() {
|
|
298
|
+
// Batch mode: Move PDF extraction to blocking thread pool to enable parallelism
|
|
299
299
|
let content_owned = content.to_vec();
|
|
300
|
-
let span = tracing::Span::current();
|
|
301
300
|
tokio::task::spawn_blocking(move || {
|
|
302
|
-
let _guard = span.entered();
|
|
303
301
|
let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
|
|
304
302
|
.or_else(|_| Pdfium::bind_to_system_library())
|
|
305
303
|
.map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
@@ -318,6 +316,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
318
316
|
let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
|
|
319
317
|
let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
|
|
320
318
|
|
|
319
|
+
// Extract tables from native PDF text (when not using OCR)
|
|
321
320
|
let tables = extract_tables_from_document(&document, &metadata)?;
|
|
322
321
|
|
|
323
322
|
Ok::<_, crate::error::KreuzbergError>((metadata, native_text, tables))
|
|
@@ -325,6 +324,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
325
324
|
.await
|
|
326
325
|
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
|
|
327
326
|
} else {
|
|
327
|
+
// Single-file mode: Direct extraction (no spawn overhead)
|
|
328
328
|
let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
|
|
329
329
|
.or_else(|_| Pdfium::bind_to_system_library())
|
|
330
330
|
.map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
@@ -343,6 +343,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
343
343
|
let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
|
|
344
344
|
let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
|
|
345
345
|
|
|
346
|
+
// Extract tables from native PDF text (when not using OCR)
|
|
346
347
|
let tables = extract_tables_from_document(&document, &metadata)?;
|
|
347
348
|
|
|
348
349
|
(metadata, native_text, tables)
|
|
@@ -415,6 +416,9 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
415
416
|
None
|
|
416
417
|
};
|
|
417
418
|
|
|
419
|
+
// Tables were extracted during metadata/text extraction phase
|
|
420
|
+
// (see extract_tables_from_document function below)
|
|
421
|
+
|
|
418
422
|
Ok(ExtractionResult {
|
|
419
423
|
content: text,
|
|
420
424
|
mime_type: mime_type.to_string(),
|
|
@@ -430,7 +434,6 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
430
434
|
})
|
|
431
435
|
}
|
|
432
436
|
|
|
433
|
-
#[cfg(feature = "tokio-runtime")]
|
|
434
437
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
435
438
|
let bytes = tokio::fs::read(path).await?;
|
|
436
439
|
self.extract_bytes(&bytes, mime_type, config).await
|
|
@@ -43,10 +43,8 @@ impl PptxExtractor {
|
|
|
43
43
|
for image in &mut images {
|
|
44
44
|
let image_data = image.data.clone();
|
|
45
45
|
let tess_config_clone = tess_config.clone();
|
|
46
|
-
let span = tracing::Span::current();
|
|
47
46
|
|
|
48
47
|
let ocr_result = tokio::task::spawn_blocking(move || {
|
|
49
|
-
let _guard = span.entered();
|
|
50
48
|
let cache_dir = std::env::var("KREUZBERG_CACHE_DIR").ok().map(std::path::PathBuf::from);
|
|
51
49
|
|
|
52
50
|
let proc = OcrProcessor::new(cache_dir)?;
|
|
@@ -102,13 +100,6 @@ impl Plugin for PptxExtractor {
|
|
|
102
100
|
|
|
103
101
|
#[async_trait]
|
|
104
102
|
impl DocumentExtractor for PptxExtractor {
|
|
105
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
106
|
-
skip(self, content, config),
|
|
107
|
-
fields(
|
|
108
|
-
extractor.name = self.name(),
|
|
109
|
-
content.size_bytes = content.len(),
|
|
110
|
-
)
|
|
111
|
-
))]
|
|
112
103
|
async fn extract_bytes(
|
|
113
104
|
&self,
|
|
114
105
|
content: &[u8],
|
|
@@ -117,16 +108,17 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
117
108
|
) -> Result<ExtractionResult> {
|
|
118
109
|
let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
|
|
119
110
|
|
|
111
|
+
// Extract PPTX content
|
|
120
112
|
let pptx_result = if crate::core::batch_mode::is_batch_mode() {
|
|
113
|
+
// Batch mode: Use spawn_blocking for parallelism
|
|
121
114
|
let content_owned = content.to_vec();
|
|
122
|
-
let span = tracing::Span::current();
|
|
123
115
|
tokio::task::spawn_blocking(move || {
|
|
124
|
-
let _guard = span.entered();
|
|
125
116
|
crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images)
|
|
126
117
|
})
|
|
127
118
|
.await
|
|
128
119
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("PPTX extraction task failed: {}", e)))??
|
|
129
120
|
} else {
|
|
121
|
+
// Single-file mode: Direct extraction (no spawn overhead)
|
|
130
122
|
crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images)?
|
|
131
123
|
};
|
|
132
124
|
|
|
@@ -164,12 +156,6 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
164
156
|
})
|
|
165
157
|
}
|
|
166
158
|
|
|
167
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
168
|
-
skip(self, path, config),
|
|
169
|
-
fields(
|
|
170
|
-
extractor.name = self.name(),
|
|
171
|
-
)
|
|
172
|
-
))]
|
|
173
159
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
174
160
|
let path_str = path
|
|
175
161
|
.to_str()
|
|
@@ -42,13 +42,6 @@ impl Plugin for StructuredExtractor {
|
|
|
42
42
|
|
|
43
43
|
#[async_trait]
|
|
44
44
|
impl DocumentExtractor for StructuredExtractor {
|
|
45
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
46
|
-
skip(self, content, _config),
|
|
47
|
-
fields(
|
|
48
|
-
extractor.name = self.name(),
|
|
49
|
-
content.size_bytes = content.len(),
|
|
50
|
-
)
|
|
51
|
-
))]
|
|
52
45
|
async fn extract_bytes(
|
|
53
46
|
&self,
|
|
54
47
|
content: &[u8],
|
|
@@ -87,13 +80,6 @@ impl DocumentExtractor for StructuredExtractor {
|
|
|
87
80
|
})
|
|
88
81
|
}
|
|
89
82
|
|
|
90
|
-
#[cfg(feature = "tokio-runtime")]
|
|
91
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
92
|
-
skip(self, path, config),
|
|
93
|
-
fields(
|
|
94
|
-
extractor.name = self.name(),
|
|
95
|
-
)
|
|
96
|
-
))]
|
|
97
83
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
98
84
|
let bytes = tokio::fs::read(path).await?;
|
|
99
85
|
self.extract_bytes(&bytes, mime_type, config).await
|
|
@@ -53,33 +53,22 @@ impl Plugin for PlainTextExtractor {
|
|
|
53
53
|
|
|
54
54
|
#[async_trait]
|
|
55
55
|
impl DocumentExtractor for PlainTextExtractor {
|
|
56
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
57
|
-
skip(self, content, _config),
|
|
58
|
-
fields(
|
|
59
|
-
extractor.name = self.name(),
|
|
60
|
-
content.size_bytes = content.len(),
|
|
61
|
-
)
|
|
62
|
-
))]
|
|
63
56
|
async fn extract_bytes(
|
|
64
57
|
&self,
|
|
65
58
|
content: &[u8],
|
|
66
59
|
mime_type: &str,
|
|
67
60
|
_config: &ExtractionConfig,
|
|
68
61
|
) -> Result<ExtractionResult> {
|
|
69
|
-
let
|
|
70
|
-
let text = text.trim_end_matches('\n').trim_end_matches('\r').to_string();
|
|
71
|
-
let line_count = text.lines().count();
|
|
72
|
-
let word_count = text.split_whitespace().count();
|
|
73
|
-
let character_count = text.len();
|
|
62
|
+
let text_result = parse_text(content, false)?;
|
|
74
63
|
|
|
75
64
|
Ok(ExtractionResult {
|
|
76
|
-
content:
|
|
65
|
+
content: text_result.content,
|
|
77
66
|
mime_type: mime_type.to_string(),
|
|
78
67
|
metadata: crate::types::Metadata {
|
|
79
68
|
format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
|
|
80
|
-
line_count,
|
|
81
|
-
word_count,
|
|
82
|
-
character_count,
|
|
69
|
+
line_count: text_result.line_count,
|
|
70
|
+
word_count: text_result.word_count,
|
|
71
|
+
character_count: text_result.character_count,
|
|
83
72
|
headers: None,
|
|
84
73
|
links: None,
|
|
85
74
|
code_blocks: None,
|
|
@@ -149,13 +138,6 @@ impl Plugin for MarkdownExtractor {
|
|
|
149
138
|
|
|
150
139
|
#[async_trait]
|
|
151
140
|
impl DocumentExtractor for MarkdownExtractor {
|
|
152
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
153
|
-
skip(self, content, _config),
|
|
154
|
-
fields(
|
|
155
|
-
extractor.name = self.name(),
|
|
156
|
-
content.size_bytes = content.len(),
|
|
157
|
-
)
|
|
158
|
-
))]
|
|
159
141
|
async fn extract_bytes(
|
|
160
142
|
&self,
|
|
161
143
|
content: &[u8],
|
|
@@ -53,13 +53,6 @@ impl Plugin for XmlExtractor {
|
|
|
53
53
|
|
|
54
54
|
#[async_trait]
|
|
55
55
|
impl DocumentExtractor for XmlExtractor {
|
|
56
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
57
|
-
skip(self, content, _config),
|
|
58
|
-
fields(
|
|
59
|
-
extractor.name = self.name(),
|
|
60
|
-
content.size_bytes = content.len(),
|
|
61
|
-
)
|
|
62
|
-
))]
|
|
63
56
|
async fn extract_bytes(
|
|
64
57
|
&self,
|
|
65
58
|
content: &[u8],
|
|
@@ -248,6 +248,7 @@ mod tests {
|
|
|
248
248
|
let english_text = "Natural language processing is a subfield of artificial intelligence.";
|
|
249
249
|
let config = KeywordConfig::rake().with_language("fr");
|
|
250
250
|
let keywords = extract_keywords_rake(english_text, &config).unwrap();
|
|
251
|
+
dbg!(&keywords);
|
|
251
252
|
assert!(
|
|
252
253
|
!keywords.is_empty(),
|
|
253
254
|
"Should fall back to English stopwords and extract keywords"
|
data/vendor/kreuzberg/src/lib.rs
CHANGED
|
@@ -39,7 +39,6 @@ pub mod core;
|
|
|
39
39
|
pub mod error;
|
|
40
40
|
pub mod extraction;
|
|
41
41
|
pub mod extractors;
|
|
42
|
-
pub mod panic_context;
|
|
43
42
|
pub mod plugins;
|
|
44
43
|
pub mod text;
|
|
45
44
|
pub mod types;
|
|
@@ -80,9 +79,7 @@ pub mod pdf;
|
|
|
80
79
|
pub use error::{KreuzbergError, Result};
|
|
81
80
|
pub use types::*;
|
|
82
81
|
|
|
83
|
-
|
|
84
|
-
pub use core::extractor::{batch_extract_bytes, batch_extract_file};
|
|
85
|
-
pub use core::extractor::{extract_bytes, extract_file};
|
|
82
|
+
pub use core::extractor::{batch_extract_bytes, batch_extract_file, extract_bytes, extract_file};
|
|
86
83
|
|
|
87
84
|
pub use core::extractor::{batch_extract_bytes_sync, batch_extract_file_sync, extract_bytes_sync, extract_file_sync};
|
|
88
85
|
|
|
@@ -428,12 +428,12 @@ impl Default for KreuzbergMcp {
|
|
|
428
428
|
/// use kreuzberg::mcp::start_mcp_server;
|
|
429
429
|
///
|
|
430
430
|
/// #[tokio::main]
|
|
431
|
-
/// async fn main() -> Result<()
|
|
431
|
+
/// async fn main() -> anyhow::Result<()> {
|
|
432
432
|
/// start_mcp_server().await?;
|
|
433
433
|
/// Ok(())
|
|
434
434
|
/// }
|
|
435
435
|
/// ```
|
|
436
|
-
pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
436
|
+
pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error>> {
|
|
437
437
|
let service = KreuzbergMcp::new()?.serve(stdio()).await?;
|
|
438
438
|
|
|
439
439
|
service.waiting().await?;
|
|
@@ -444,9 +444,7 @@ pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send +
|
|
|
444
444
|
///
|
|
445
445
|
/// This variant allows specifying a custom extraction configuration
|
|
446
446
|
/// (e.g., loaded from a file) instead of using defaults.
|
|
447
|
-
pub async fn start_mcp_server_with_config(
|
|
448
|
-
config: ExtractionConfig,
|
|
449
|
-
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
447
|
+
pub async fn start_mcp_server_with_config(config: ExtractionConfig) -> Result<(), Box<dyn std::error::Error>> {
|
|
450
448
|
let service = KreuzbergMcp::with_config(config).serve(stdio()).await?;
|
|
451
449
|
|
|
452
450
|
service.waiting().await?;
|
|
@@ -51,14 +51,6 @@ impl OcrProcessor {
|
|
|
51
51
|
Ok(Self { cache })
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
55
|
-
skip(self, image_bytes),
|
|
56
|
-
fields(
|
|
57
|
-
ocr.backend = "tesseract",
|
|
58
|
-
ocr.language = %config.language,
|
|
59
|
-
image.size_bytes = image_bytes.len(),
|
|
60
|
-
)
|
|
61
|
-
))]
|
|
62
54
|
pub fn process_image(&self, image_bytes: &[u8], config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
|
|
63
55
|
config.validate().map_err(OcrError::InvalidConfiguration)?;
|
|
64
56
|
|
|
@@ -72,14 +64,9 @@ impl OcrProcessor {
|
|
|
72
64
|
if config.use_cache
|
|
73
65
|
&& let Some(cached_result) = self.cache.get_cached_result(&image_hash, "tesseract", &config_str)?
|
|
74
66
|
{
|
|
75
|
-
#[cfg(feature = "otel")]
|
|
76
|
-
tracing::Span::current().record("cache.hit", true);
|
|
77
67
|
return Ok(cached_result);
|
|
78
68
|
}
|
|
79
69
|
|
|
80
|
-
#[cfg(feature = "otel")]
|
|
81
|
-
tracing::Span::current().record("cache.hit", false);
|
|
82
|
-
|
|
83
70
|
let result = self.perform_ocr(image_bytes, config)?;
|
|
84
71
|
|
|
85
72
|
if config.use_cache {
|
|
@@ -241,6 +228,7 @@ impl OcrProcessor {
|
|
|
241
228
|
});
|
|
242
229
|
|
|
243
230
|
// Validate language before initializing to prevent segfault ~keep
|
|
231
|
+
// tesseract-rs can crash on empty language or missing language files
|
|
244
232
|
if config.language.trim().is_empty() {
|
|
245
233
|
return Err(OcrError::TesseractInitializationFailed(
|
|
246
234
|
"Language cannot be empty. Please specify a valid language code (e.g., 'eng')".to_string(),
|
|
@@ -248,6 +236,7 @@ impl OcrProcessor {
|
|
|
248
236
|
}
|
|
249
237
|
|
|
250
238
|
// Validate language file exists before initializing to prevent segfault ~keep
|
|
239
|
+
// tesseract-rs can crash if language file is missing instead of returning error
|
|
251
240
|
if !tessdata_path.is_empty() {
|
|
252
241
|
let languages: Vec<&str> = config.language.split('+').collect();
|
|
253
242
|
for lang in languages {
|
|
@@ -373,11 +362,6 @@ impl OcrProcessor {
|
|
|
373
362
|
)
|
|
374
363
|
});
|
|
375
364
|
|
|
376
|
-
api.recognize()
|
|
377
|
-
.map_err(|e| OcrError::ProcessingFailed(format!("Failed to recognize text: {}", e)))?;
|
|
378
|
-
|
|
379
|
-
log_ci_debug(ci_debug_enabled, "recognize", || "completed".to_string());
|
|
380
|
-
|
|
381
365
|
let tsv_data_for_tables = if config.enable_table_detection || config.output_format == "tsv" {
|
|
382
366
|
Some(
|
|
383
367
|
api.get_tsv_text(0)
|
|
@@ -40,7 +40,7 @@ impl std::error::Error for PdfError {}
|
|
|
40
40
|
impl From<lopdf::Error> for PdfError {
|
|
41
41
|
fn from(err: lopdf::Error) -> Self {
|
|
42
42
|
match err {
|
|
43
|
-
lopdf::Error::IO(
|
|
43
|
+
lopdf::Error::IO(_) => panic!("lopdf IO errors should not be converted to PdfError - let them bubble up"),
|
|
44
44
|
_ => PdfError::InvalidPdf(err.to_string()),
|
|
45
45
|
}
|
|
46
46
|
}
|