kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -3
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +25 -11
- data/vendor/kreuzberg/README.md +13 -8
- data/vendor/kreuzberg/build.rs +17 -6
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
- data/vendor/kreuzberg/src/mcp/server.rs +14 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/bin/release.sh +9 -8
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +11 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -23,6 +23,7 @@ use crate::types::{ExtractionResult, Metadata, Table};
|
|
|
23
23
|
use async_trait::async_trait;
|
|
24
24
|
use quick_xml::Reader;
|
|
25
25
|
use quick_xml::events::Event;
|
|
26
|
+
#[cfg(feature = "tokio-runtime")]
|
|
26
27
|
use std::path::Path;
|
|
27
28
|
|
|
28
29
|
/// Strip namespace prefix from XML tag names.
|
|
@@ -403,6 +404,7 @@ impl DocumentExtractor for DocbookExtractor {
|
|
|
403
404
|
detected_languages: None,
|
|
404
405
|
chunks: None,
|
|
405
406
|
images: None,
|
|
407
|
+
pages: None,
|
|
406
408
|
})
|
|
407
409
|
}
|
|
408
410
|
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#![cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
2
|
+
|
|
1
3
|
//! DOCX extractor using docx-lite for high-performance text extraction.
|
|
2
4
|
//!
|
|
3
5
|
//! Supports: Microsoft Word (.docx)
|
|
@@ -6,7 +8,7 @@ use crate::Result;
|
|
|
6
8
|
use crate::core::config::ExtractionConfig;
|
|
7
9
|
use crate::extraction::{cells_to_markdown, office_metadata};
|
|
8
10
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
9
|
-
use crate::types::{ExtractionResult, Metadata, Table};
|
|
11
|
+
use crate::types::{ExtractionResult, Metadata, PageBoundary, PageInfo, PageStructure, PageUnitType, Table};
|
|
10
12
|
use async_trait::async_trait;
|
|
11
13
|
use std::io::Cursor;
|
|
12
14
|
|
|
@@ -116,26 +118,30 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
116
118
|
mime_type: &str,
|
|
117
119
|
_config: &ExtractionConfig,
|
|
118
120
|
) -> Result<ExtractionResult> {
|
|
119
|
-
let (text, tables) = if crate::core::batch_mode::is_batch_mode() {
|
|
121
|
+
let (text, tables, page_boundaries) = if crate::core::batch_mode::is_batch_mode() {
|
|
120
122
|
let content_owned = content.to_vec();
|
|
121
123
|
let span = tracing::Span::current();
|
|
122
|
-
tokio::task::spawn_blocking(
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
124
|
+
tokio::task::spawn_blocking(
|
|
125
|
+
move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
|
|
126
|
+
let _guard = span.entered();
|
|
127
|
+
let cursor = Cursor::new(&content_owned);
|
|
128
|
+
let doc = docx_lite::parse_document(cursor)
|
|
129
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
|
|
127
130
|
|
|
128
|
-
|
|
131
|
+
let text = doc.extract_text();
|
|
129
132
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
133
|
+
let tables: Vec<Table> = doc
|
|
134
|
+
.tables
|
|
135
|
+
.iter()
|
|
136
|
+
.enumerate()
|
|
137
|
+
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
|
|
138
|
+
.collect();
|
|
136
139
|
|
|
137
|
-
|
|
138
|
-
|
|
140
|
+
let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(&content_owned)?;
|
|
141
|
+
|
|
142
|
+
Ok((text, tables, page_boundaries))
|
|
143
|
+
},
|
|
144
|
+
)
|
|
139
145
|
.await
|
|
140
146
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
|
|
141
147
|
} else {
|
|
@@ -152,7 +158,9 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
152
158
|
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
|
|
153
159
|
.collect();
|
|
154
160
|
|
|
155
|
-
(
|
|
161
|
+
let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
|
|
162
|
+
|
|
163
|
+
(text, tables, page_boundaries)
|
|
156
164
|
};
|
|
157
165
|
|
|
158
166
|
let mut archive = if crate::core::batch_mode::is_batch_mode() {
|
|
@@ -260,13 +268,38 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
260
268
|
}
|
|
261
269
|
}
|
|
262
270
|
|
|
271
|
+
let page_structure = if let Some(boundaries) = page_boundaries {
|
|
272
|
+
let total_count = boundaries.len();
|
|
273
|
+
Some(PageStructure {
|
|
274
|
+
total_count,
|
|
275
|
+
unit_type: PageUnitType::Page,
|
|
276
|
+
boundaries: Some(boundaries),
|
|
277
|
+
pages: Some(
|
|
278
|
+
(1..=total_count)
|
|
279
|
+
.map(|page_num| PageInfo {
|
|
280
|
+
number: page_num,
|
|
281
|
+
title: None,
|
|
282
|
+
dimensions: None,
|
|
283
|
+
image_count: None,
|
|
284
|
+
table_count: None,
|
|
285
|
+
hidden: None,
|
|
286
|
+
})
|
|
287
|
+
.collect(),
|
|
288
|
+
),
|
|
289
|
+
})
|
|
290
|
+
} else {
|
|
291
|
+
None
|
|
292
|
+
};
|
|
293
|
+
|
|
263
294
|
Ok(ExtractionResult {
|
|
264
295
|
content: text,
|
|
265
296
|
mime_type: mime_type.to_string(),
|
|
266
297
|
metadata: Metadata {
|
|
298
|
+
pages: page_structure,
|
|
267
299
|
additional: metadata_map,
|
|
268
300
|
..Default::default()
|
|
269
301
|
},
|
|
302
|
+
pages: None,
|
|
270
303
|
tables,
|
|
271
304
|
detected_languages: None,
|
|
272
305
|
chunks: None,
|
|
@@ -2,9 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
use crate::Result;
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
|
+
use crate::extractors::SyncExtractor;
|
|
5
6
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
6
7
|
use crate::types::{EmailMetadata, ExtractionResult, Metadata};
|
|
7
8
|
use async_trait::async_trait;
|
|
9
|
+
#[cfg(feature = "tokio-runtime")]
|
|
8
10
|
use std::path::Path;
|
|
9
11
|
|
|
10
12
|
/// Email message extractor.
|
|
@@ -42,21 +44,8 @@ impl Plugin for EmailExtractor {
|
|
|
42
44
|
}
|
|
43
45
|
}
|
|
44
46
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
48
|
-
skip(self, content, _config),
|
|
49
|
-
fields(
|
|
50
|
-
extractor.name = self.name(),
|
|
51
|
-
content.size_bytes = content.len(),
|
|
52
|
-
)
|
|
53
|
-
))]
|
|
54
|
-
async fn extract_bytes(
|
|
55
|
-
&self,
|
|
56
|
-
content: &[u8],
|
|
57
|
-
mime_type: &str,
|
|
58
|
-
_config: &ExtractionConfig,
|
|
59
|
-
) -> Result<ExtractionResult> {
|
|
47
|
+
impl SyncExtractor for EmailExtractor {
|
|
48
|
+
fn extract_sync(&self, content: &[u8], mime_type: &str, _config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
60
49
|
let email_result = crate::extraction::email::extract_email_content(content, mime_type)?;
|
|
61
50
|
|
|
62
51
|
let text = crate::extraction::email::build_email_text_output(&email_result);
|
|
@@ -96,8 +85,28 @@ impl DocumentExtractor for EmailExtractor {
|
|
|
96
85
|
detected_languages: None,
|
|
97
86
|
chunks: None,
|
|
98
87
|
images: None,
|
|
88
|
+
pages: None,
|
|
99
89
|
})
|
|
100
90
|
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
#[async_trait]
|
|
94
|
+
impl DocumentExtractor for EmailExtractor {
|
|
95
|
+
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
96
|
+
skip(self, content, config),
|
|
97
|
+
fields(
|
|
98
|
+
extractor.name = self.name(),
|
|
99
|
+
content.size_bytes = content.len(),
|
|
100
|
+
)
|
|
101
|
+
))]
|
|
102
|
+
async fn extract_bytes(
|
|
103
|
+
&self,
|
|
104
|
+
content: &[u8],
|
|
105
|
+
mime_type: &str,
|
|
106
|
+
config: &ExtractionConfig,
|
|
107
|
+
) -> Result<ExtractionResult> {
|
|
108
|
+
self.extract_sync(content, mime_type, config)
|
|
109
|
+
}
|
|
101
110
|
|
|
102
111
|
#[cfg(feature = "tokio-runtime")]
|
|
103
112
|
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
@@ -106,6 +115,7 @@ impl DocumentExtractor for EmailExtractor {
|
|
|
106
115
|
extractor.name = self.name(),
|
|
107
116
|
)
|
|
108
117
|
))]
|
|
118
|
+
#[cfg(feature = "tokio-runtime")]
|
|
109
119
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
110
120
|
let bytes = tokio::fs::read(path).await?;
|
|
111
121
|
self.extract_bytes(&bytes, mime_type, config).await
|
|
@@ -118,6 +128,10 @@ impl DocumentExtractor for EmailExtractor {
|
|
|
118
128
|
fn priority(&self) -> i32 {
|
|
119
129
|
50
|
|
120
130
|
}
|
|
131
|
+
|
|
132
|
+
fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
|
|
133
|
+
Some(self)
|
|
134
|
+
}
|
|
121
135
|
}
|
|
122
136
|
|
|
123
137
|
#[cfg(test)]
|
|
@@ -150,6 +150,7 @@ impl DocumentExtractor for ExcelExtractor {
|
|
|
150
150
|
additional,
|
|
151
151
|
..Default::default()
|
|
152
152
|
},
|
|
153
|
+
pages: None,
|
|
153
154
|
tables,
|
|
154
155
|
detected_languages: None,
|
|
155
156
|
chunks: None,
|
|
@@ -193,6 +194,7 @@ impl DocumentExtractor for ExcelExtractor {
|
|
|
193
194
|
additional,
|
|
194
195
|
..Default::default()
|
|
195
196
|
},
|
|
197
|
+
pages: None,
|
|
196
198
|
tables,
|
|
197
199
|
detected_languages: None,
|
|
198
200
|
chunks: None,
|
|
@@ -2,9 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
use crate::Result;
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
|
+
use crate::extractors::SyncExtractor;
|
|
5
6
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
6
7
|
use crate::types::{ExtractionResult, Metadata, Table};
|
|
7
8
|
use async_trait::async_trait;
|
|
9
|
+
#[cfg(feature = "tokio-runtime")]
|
|
8
10
|
use std::path::Path;
|
|
9
11
|
|
|
10
12
|
// NOTE: scraper dependency has been removed in favor of html-to-markdown-rs
|
|
@@ -193,21 +195,8 @@ impl Plugin for HtmlExtractor {
|
|
|
193
195
|
}
|
|
194
196
|
}
|
|
195
197
|
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
199
|
-
skip(self, content, config),
|
|
200
|
-
fields(
|
|
201
|
-
extractor.name = self.name(),
|
|
202
|
-
content.size_bytes = content.len(),
|
|
203
|
-
)
|
|
204
|
-
))]
|
|
205
|
-
async fn extract_bytes(
|
|
206
|
-
&self,
|
|
207
|
-
content: &[u8],
|
|
208
|
-
mime_type: &str,
|
|
209
|
-
config: &ExtractionConfig,
|
|
210
|
-
) -> Result<ExtractionResult> {
|
|
198
|
+
impl SyncExtractor for HtmlExtractor {
|
|
199
|
+
fn extract_sync(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
211
200
|
let html = std::str::from_utf8(content)
|
|
212
201
|
.map(|s| s.to_string())
|
|
213
202
|
.unwrap_or_else(|_| String::from_utf8_lossy(content).to_string());
|
|
@@ -225,12 +214,32 @@ impl DocumentExtractor for HtmlExtractor {
|
|
|
225
214
|
format: html_metadata.map(|m| crate::types::FormatMetadata::Html(Box::new(m))),
|
|
226
215
|
..Default::default()
|
|
227
216
|
},
|
|
217
|
+
pages: None,
|
|
228
218
|
tables,
|
|
229
219
|
detected_languages: None,
|
|
230
220
|
chunks: None,
|
|
231
221
|
images: None,
|
|
232
222
|
})
|
|
233
223
|
}
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
#[async_trait]
|
|
227
|
+
impl DocumentExtractor for HtmlExtractor {
|
|
228
|
+
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
229
|
+
skip(self, content, config),
|
|
230
|
+
fields(
|
|
231
|
+
extractor.name = self.name(),
|
|
232
|
+
content.size_bytes = content.len(),
|
|
233
|
+
)
|
|
234
|
+
))]
|
|
235
|
+
async fn extract_bytes(
|
|
236
|
+
&self,
|
|
237
|
+
content: &[u8],
|
|
238
|
+
mime_type: &str,
|
|
239
|
+
config: &ExtractionConfig,
|
|
240
|
+
) -> Result<ExtractionResult> {
|
|
241
|
+
self.extract_sync(content, mime_type, config)
|
|
242
|
+
}
|
|
234
243
|
|
|
235
244
|
#[cfg(feature = "tokio-runtime")]
|
|
236
245
|
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
@@ -239,6 +248,7 @@ impl DocumentExtractor for HtmlExtractor {
|
|
|
239
248
|
extractor.name = self.name(),
|
|
240
249
|
)
|
|
241
250
|
))]
|
|
251
|
+
#[cfg(feature = "tokio-runtime")]
|
|
242
252
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
243
253
|
let bytes = tokio::fs::read(path).await?;
|
|
244
254
|
self.extract_bytes(&bytes, mime_type, config).await
|
|
@@ -251,6 +261,10 @@ impl DocumentExtractor for HtmlExtractor {
|
|
|
251
261
|
fn priority(&self) -> i32 {
|
|
252
262
|
50
|
|
253
263
|
}
|
|
264
|
+
|
|
265
|
+
fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
|
|
266
|
+
Some(self)
|
|
267
|
+
}
|
|
254
268
|
}
|
|
255
269
|
|
|
256
270
|
#[cfg(test)]
|
|
@@ -20,9 +20,14 @@ impl ImageExtractor {
|
|
|
20
20
|
Self
|
|
21
21
|
}
|
|
22
22
|
|
|
23
|
-
/// Extract text from image using OCR.
|
|
23
|
+
/// Extract text from image using OCR with optional page tracking for multi-frame TIFFs.
|
|
24
24
|
#[cfg(feature = "ocr")]
|
|
25
|
-
async fn extract_with_ocr(
|
|
25
|
+
async fn extract_with_ocr(
|
|
26
|
+
&self,
|
|
27
|
+
content: &[u8],
|
|
28
|
+
mime_type: &str,
|
|
29
|
+
config: &ExtractionConfig,
|
|
30
|
+
) -> Result<ExtractionResult> {
|
|
26
31
|
use crate::plugins::registry::get_ocr_backend_registry;
|
|
27
32
|
|
|
28
33
|
let ocr_config = config.ocr.as_ref().ok_or_else(|| crate::KreuzbergError::Parsing {
|
|
@@ -39,7 +44,21 @@ impl ImageExtractor {
|
|
|
39
44
|
registry.get(&ocr_config.backend)?
|
|
40
45
|
};
|
|
41
46
|
|
|
42
|
-
backend.process_image(content, ocr_config).await
|
|
47
|
+
let ocr_result = backend.process_image(content, ocr_config).await?;
|
|
48
|
+
|
|
49
|
+
let ocr_text = ocr_result.content.clone();
|
|
50
|
+
let ocr_extraction_result = crate::extraction::image::extract_text_from_image_with_ocr(
|
|
51
|
+
content,
|
|
52
|
+
mime_type,
|
|
53
|
+
ocr_text,
|
|
54
|
+
config.pages.as_ref(),
|
|
55
|
+
)?;
|
|
56
|
+
|
|
57
|
+
let mut result = ocr_result;
|
|
58
|
+
result.content = ocr_extraction_result.content;
|
|
59
|
+
result.pages = ocr_extraction_result.page_contents;
|
|
60
|
+
|
|
61
|
+
Ok(result)
|
|
43
62
|
}
|
|
44
63
|
}
|
|
45
64
|
|
|
@@ -102,7 +121,7 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
102
121
|
if config.ocr.is_some() {
|
|
103
122
|
#[cfg(feature = "ocr")]
|
|
104
123
|
{
|
|
105
|
-
let mut ocr_result = self.extract_with_ocr(content, config).await?;
|
|
124
|
+
let mut ocr_result = self.extract_with_ocr(content, mime_type, config).await?;
|
|
106
125
|
|
|
107
126
|
ocr_result.metadata.format = Some(crate::types::FormatMetadata::Image(image_metadata));
|
|
108
127
|
ocr_result.mime_type = mime_type.to_string();
|
|
@@ -123,6 +142,7 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
123
142
|
format: Some(crate::types::FormatMetadata::Image(image_metadata)),
|
|
124
143
|
..Default::default()
|
|
125
144
|
},
|
|
145
|
+
pages: None,
|
|
126
146
|
tables: vec![],
|
|
127
147
|
detected_languages: None,
|
|
128
148
|
chunks: None,
|
|
@@ -141,6 +161,7 @@ impl DocumentExtractor for ImageExtractor {
|
|
|
141
161
|
format: Some(crate::types::FormatMetadata::Image(image_metadata)),
|
|
142
162
|
..Default::default()
|
|
143
163
|
},
|
|
164
|
+
pages: None,
|
|
144
165
|
tables: vec![],
|
|
145
166
|
detected_languages: None,
|
|
146
167
|
chunks: None,
|
|
@@ -21,6 +21,7 @@ use crate::types::{ExtractionResult, Metadata, Table};
|
|
|
21
21
|
use async_trait::async_trait;
|
|
22
22
|
use quick_xml::Reader;
|
|
23
23
|
use quick_xml::events::Event;
|
|
24
|
+
#[cfg(feature = "tokio-runtime")]
|
|
24
25
|
use std::path::Path;
|
|
25
26
|
|
|
26
27
|
/// JATS document extractor.
|
|
@@ -569,6 +570,7 @@ impl DocumentExtractor for JatsExtractor {
|
|
|
569
570
|
detected_languages: None,
|
|
570
571
|
chunks: None,
|
|
571
572
|
images: None,
|
|
573
|
+
pages: None,
|
|
572
574
|
})
|
|
573
575
|
}
|
|
574
576
|
|
|
@@ -582,6 +584,7 @@ impl DocumentExtractor for JatsExtractor {
|
|
|
582
584
|
)
|
|
583
585
|
)
|
|
584
586
|
)]
|
|
587
|
+
#[cfg(feature = "tokio-runtime")]
|
|
585
588
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
586
589
|
let bytes = tokio::fs::read(path).await?;
|
|
587
590
|
self.extract_bytes(&bytes, mime_type, config).await
|
|
@@ -4,14 +4,69 @@
|
|
|
4
4
|
//! All extractors implement the `DocumentExtractor` plugin trait.
|
|
5
5
|
|
|
6
6
|
use crate::Result;
|
|
7
|
+
use crate::core::config::ExtractionConfig;
|
|
7
8
|
use crate::plugins::registry::get_document_extractor_registry;
|
|
9
|
+
use crate::types::ExtractionResult;
|
|
8
10
|
use once_cell::sync::Lazy;
|
|
9
11
|
use std::sync::Arc;
|
|
10
12
|
|
|
11
|
-
|
|
13
|
+
/// Trait for extractors that can work synchronously (WASM-compatible).
|
|
14
|
+
///
|
|
15
|
+
/// This trait defines the synchronous extraction interface for WASM targets and other
|
|
16
|
+
/// environments where async/tokio runtimes are not available or desirable.
|
|
17
|
+
///
|
|
18
|
+
/// # Implementation
|
|
19
|
+
///
|
|
20
|
+
/// Extractors that need to support WASM should implement this trait in addition to
|
|
21
|
+
/// the async `DocumentExtractor` trait. This allows the same extractor to work in both
|
|
22
|
+
/// environments by delegating to the sync implementation.
|
|
23
|
+
///
|
|
24
|
+
/// # MIME Type Validation
|
|
25
|
+
///
|
|
26
|
+
/// The `mime_type` parameter is guaranteed to be already validated.
|
|
27
|
+
///
|
|
28
|
+
/// # Example
|
|
29
|
+
///
|
|
30
|
+
/// ```rust,ignore
|
|
31
|
+
/// impl SyncExtractor for PlainTextExtractor {
|
|
32
|
+
/// fn extract_sync(&self, content: &[u8], config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
33
|
+
/// let text = String::from_utf8_lossy(content).to_string();
|
|
34
|
+
/// Ok(ExtractionResult {
|
|
35
|
+
/// content: text,
|
|
36
|
+
/// mime_type: "text/plain".to_string(),
|
|
37
|
+
/// metadata: Metadata::default(),
|
|
38
|
+
/// tables: vec![],
|
|
39
|
+
/// detected_languages: None,
|
|
40
|
+
/// chunks: None,
|
|
41
|
+
/// images: None,
|
|
42
|
+
/// })
|
|
43
|
+
/// }
|
|
44
|
+
/// }
|
|
45
|
+
/// ```
|
|
46
|
+
pub trait SyncExtractor {
|
|
47
|
+
/// Extract content from a byte array synchronously.
|
|
48
|
+
///
|
|
49
|
+
/// This method performs extraction without requiring an async runtime.
|
|
50
|
+
/// It is called by `extract_bytes_sync()` when the `tokio-runtime` feature is disabled.
|
|
51
|
+
///
|
|
52
|
+
/// # Arguments
|
|
53
|
+
///
|
|
54
|
+
/// * `content` - Raw document bytes
|
|
55
|
+
/// * `mime_type` - MIME type of the document (already validated)
|
|
56
|
+
/// * `config` - Extraction configuration
|
|
57
|
+
///
|
|
58
|
+
/// # Returns
|
|
59
|
+
///
|
|
60
|
+
/// An `ExtractionResult` containing the extracted content and metadata.
|
|
61
|
+
fn extract_sync(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult>;
|
|
62
|
+
}
|
|
63
|
+
|
|
12
64
|
pub mod structured;
|
|
13
65
|
pub mod text;
|
|
14
66
|
|
|
67
|
+
#[cfg(feature = "archives")]
|
|
68
|
+
pub mod security;
|
|
69
|
+
|
|
15
70
|
#[cfg(feature = "ocr")]
|
|
16
71
|
pub mod image;
|
|
17
72
|
|
|
@@ -30,7 +85,7 @@ pub mod html;
|
|
|
30
85
|
#[cfg(feature = "office")]
|
|
31
86
|
pub mod bibtex;
|
|
32
87
|
|
|
33
|
-
#[cfg(feature = "office")]
|
|
88
|
+
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
34
89
|
pub mod docx;
|
|
35
90
|
|
|
36
91
|
#[cfg(feature = "office")]
|
|
@@ -54,7 +109,7 @@ pub mod jupyter;
|
|
|
54
109
|
#[cfg(feature = "office")]
|
|
55
110
|
pub mod orgmode;
|
|
56
111
|
|
|
57
|
-
#[cfg(feature = "office")]
|
|
112
|
+
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
58
113
|
pub mod odt;
|
|
59
114
|
|
|
60
115
|
#[cfg(feature = "office")]
|
|
@@ -69,7 +124,7 @@ pub mod jats;
|
|
|
69
124
|
#[cfg(feature = "pdf")]
|
|
70
125
|
pub mod pdf;
|
|
71
126
|
|
|
72
|
-
#[cfg(feature = "office")]
|
|
127
|
+
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
73
128
|
pub mod pptx;
|
|
74
129
|
|
|
75
130
|
#[cfg(feature = "office")]
|
|
@@ -102,7 +157,7 @@ pub use html::HtmlExtractor;
|
|
|
102
157
|
#[cfg(feature = "office")]
|
|
103
158
|
pub use bibtex::BibtexExtractor;
|
|
104
159
|
|
|
105
|
-
#[cfg(feature = "office")]
|
|
160
|
+
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
106
161
|
pub use docx::DocxExtractor;
|
|
107
162
|
|
|
108
163
|
#[cfg(feature = "office")]
|
|
@@ -126,7 +181,7 @@ pub use jupyter::JupyterExtractor;
|
|
|
126
181
|
#[cfg(feature = "office")]
|
|
127
182
|
pub use orgmode::OrgModeExtractor;
|
|
128
183
|
|
|
129
|
-
#[cfg(feature = "office")]
|
|
184
|
+
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
130
185
|
pub use odt::OdtExtractor;
|
|
131
186
|
|
|
132
187
|
#[cfg(feature = "xml")]
|
|
@@ -141,7 +196,7 @@ pub use typst::TypstExtractor;
|
|
|
141
196
|
#[cfg(feature = "pdf")]
|
|
142
197
|
pub use pdf::PdfExtractor;
|
|
143
198
|
|
|
144
|
-
#[cfg(feature = "office")]
|
|
199
|
+
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
145
200
|
pub use pptx::PptxExtractor;
|
|
146
201
|
|
|
147
202
|
#[cfg(feature = "office")]
|
|
@@ -230,11 +285,8 @@ pub fn register_default_extractors() -> Result<()> {
|
|
|
230
285
|
{
|
|
231
286
|
registry.register(Arc::new(EnhancedMarkdownExtractor::new()))?;
|
|
232
287
|
registry.register(Arc::new(BibtexExtractor::new()))?;
|
|
233
|
-
registry.register(Arc::new(DocxExtractor::new()))?;
|
|
234
288
|
registry.register(Arc::new(EpubExtractor::new()))?;
|
|
235
289
|
registry.register(Arc::new(FictionBookExtractor::new()))?;
|
|
236
|
-
registry.register(Arc::new(PptxExtractor::new()))?;
|
|
237
|
-
registry.register(Arc::new(OdtExtractor::new()))?;
|
|
238
290
|
registry.register(Arc::new(RtfExtractor::new()))?;
|
|
239
291
|
registry.register(Arc::new(RstExtractor::new()))?;
|
|
240
292
|
registry.register(Arc::new(LatexExtractor::new()))?;
|
|
@@ -244,6 +296,13 @@ pub fn register_default_extractors() -> Result<()> {
|
|
|
244
296
|
registry.register(Arc::new(TypstExtractor::new()))?;
|
|
245
297
|
}
|
|
246
298
|
|
|
299
|
+
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
300
|
+
{
|
|
301
|
+
registry.register(Arc::new(DocxExtractor::new()))?;
|
|
302
|
+
registry.register(Arc::new(PptxExtractor::new()))?;
|
|
303
|
+
registry.register(Arc::new(OdtExtractor::new()))?;
|
|
304
|
+
}
|
|
305
|
+
|
|
247
306
|
#[cfg(feature = "email")]
|
|
248
307
|
registry.register(Arc::new(EmailExtractor::new()))?;
|
|
249
308
|
|
|
@@ -313,14 +372,11 @@ mod tests {
|
|
|
313
372
|
|
|
314
373
|
#[cfg(feature = "office")]
|
|
315
374
|
{
|
|
316
|
-
expected_count +=
|
|
375
|
+
expected_count += 10;
|
|
317
376
|
assert!(extractor_names.contains(&"markdown-extractor".to_string()));
|
|
318
377
|
assert!(extractor_names.contains(&"bibtex-extractor".to_string()));
|
|
319
|
-
assert!(extractor_names.contains(&"docx-extractor".to_string()));
|
|
320
378
|
assert!(extractor_names.contains(&"epub-extractor".to_string()));
|
|
321
379
|
assert!(extractor_names.contains(&"fictionbook-extractor".to_string()));
|
|
322
|
-
assert!(extractor_names.contains(&"pptx-extractor".to_string()));
|
|
323
|
-
assert!(extractor_names.contains(&"odt-extractor".to_string()));
|
|
324
380
|
assert!(extractor_names.contains(&"rtf-extractor".to_string()));
|
|
325
381
|
assert!(extractor_names.contains(&"rst-extractor".to_string()));
|
|
326
382
|
assert!(extractor_names.contains(&"latex-extractor".to_string()));
|
|
@@ -330,6 +386,14 @@ mod tests {
|
|
|
330
386
|
assert!(extractor_names.contains(&"typst-extractor".to_string()));
|
|
331
387
|
}
|
|
332
388
|
|
|
389
|
+
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
390
|
+
{
|
|
391
|
+
expected_count += 3;
|
|
392
|
+
assert!(extractor_names.contains(&"docx-extractor".to_string()));
|
|
393
|
+
assert!(extractor_names.contains(&"pptx-extractor".to_string()));
|
|
394
|
+
assert!(extractor_names.contains(&"odt-extractor".to_string()));
|
|
395
|
+
}
|
|
396
|
+
|
|
333
397
|
#[cfg(feature = "email")]
|
|
334
398
|
{
|
|
335
399
|
expected_count += 1;
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#![cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
2
|
+
|
|
1
3
|
//! ODT (OpenDocument Text) extractor using native Rust parsing.
|
|
2
4
|
//!
|
|
3
5
|
//! Supports: OpenDocument Text (.odt)
|
|
@@ -169,13 +171,10 @@ fn extract_content_text(archive: &mut zip::ZipArchive<Cursor<Vec<u8>>>) -> crate
|
|
|
169
171
|
|
|
170
172
|
let mut text_parts: Vec<String> = Vec::new();
|
|
171
173
|
|
|
172
|
-
// Find the office:text or text body element - this is the main document body
|
|
173
174
|
for body_child in root.children() {
|
|
174
175
|
if body_child.tag_name().name() == "body" {
|
|
175
|
-
// Process the text element inside body
|
|
176
176
|
for text_elem in body_child.children() {
|
|
177
177
|
if text_elem.tag_name().name() == "text" {
|
|
178
|
-
// Now process only direct children of the text element
|
|
179
178
|
process_document_elements(text_elem, &mut text_parts);
|
|
180
179
|
}
|
|
181
180
|
}
|
|
@@ -563,6 +562,7 @@ impl DocumentExtractor for OdtExtractor {
|
|
|
563
562
|
additional: metadata_map,
|
|
564
563
|
..Default::default()
|
|
565
564
|
},
|
|
565
|
+
pages: None,
|
|
566
566
|
tables,
|
|
567
567
|
detected_languages: None,
|
|
568
568
|
chunks: None,
|