kreuzberg 4.2.14 → 4.2.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/kreuzberg.gemspec +6 -5
- data/lib/kreuzberg/config.rb +8 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +1 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/mime.rs +2 -0
- data/vendor/kreuzberg/src/extraction/email.rs +57 -9
- data/vendor/kreuzberg/src/extractors/docx.rs +85 -50
- data/vendor/kreuzberg/src/extractors/mod.rs +6 -10
- data/vendor/kreuzberg/src/extractors/odt.rs +95 -30
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +22 -147
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -18
- data/vendor/kreuzberg/tests/content_parity_debug.rs +280 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +9 -7
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +10 -7
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '09cd6cd5af8800892e58b09ade91500bac99dd1149569bde1721195cc52e94a2'
|
|
4
|
+
data.tar.gz: 7e4c00ce10c8ee8b576f9b6a1634112b9d3b21daee1bbb69f73756027bcb8876
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4eaa814c5d1d2ab357df797f39614c8fcc0013fac82863ac5bd306c61e05a6eb8a026e2b463876540e9c4c74422f6bcd058e8d279fa1ccbfcfa3fc94a1e9b815
|
|
7
|
+
data.tar.gz: b3068c76b4f6410640b5f9110a8e8cbf52b9ee8395fa11f0a2fe078ae1ecf8ce77070edd8786a063f835320ec2d1e80f6cb7f7439eacb794377d6073c8956af5
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.14)
|
|
4
|
+
kreuzberg (4.2.15)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -209,7 +209,7 @@ CHECKSUMS
|
|
|
209
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
210
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
211
211
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
212
|
-
kreuzberg (4.2.14)
|
|
212
|
+
kreuzberg (4.2.15)
|
|
213
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
214
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
215
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.14" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.15" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/kreuzberg.gemspec
CHANGED
|
@@ -169,11 +169,12 @@ Gem::Specification.new do |spec|
|
|
|
169
169
|
spec.authors = ['Na\'aman Hirschfeld']
|
|
170
170
|
spec.email = ['nhirschfeld@gmail.com']
|
|
171
171
|
|
|
172
|
-
spec.summary = '
|
|
172
|
+
spec.summary = 'Document intelligence library — extract text from PDFs, Office docs, images, and 62+ formats'
|
|
173
173
|
spec.description = <<~DESC
|
|
174
|
-
Kreuzberg is a
|
|
175
|
-
|
|
176
|
-
including PDF, DOCX, PPTX, XLSX, images, and more.
|
|
174
|
+
Kreuzberg is a high-performance document intelligence library with a Rust core and native
|
|
175
|
+
Ruby bindings via Magnus. Extract text, metadata, and structured data from 62+ file formats
|
|
176
|
+
including PDF, DOCX, PPTX, XLSX, HTML, RTF, images (with OCR), email, archives, and more.
|
|
177
|
+
Features async/sync APIs, text chunking, language detection, and keyword extraction.
|
|
177
178
|
DESC
|
|
178
179
|
spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
|
|
179
180
|
spec.license = 'MIT'
|
|
@@ -186,7 +187,7 @@ Gem::Specification.new do |spec|
|
|
|
186
187
|
'documentation_uri' => 'https://docs.kreuzberg.dev',
|
|
187
188
|
'bug_tracker_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/issues',
|
|
188
189
|
'rubygems_mfa_required' => 'true',
|
|
189
|
-
'keywords' => 'document-intelligence,document-extraction,ocr,rust,
|
|
190
|
+
'keywords' => 'document-intelligence,document-extraction,text-extraction,ocr,pdf,rust,native-extension,nlp,rag'
|
|
190
191
|
}
|
|
191
192
|
|
|
192
193
|
spec.files = files
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -707,7 +707,8 @@ module Kreuzberg
|
|
|
707
707
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
708
708
|
:images, :postprocessor,
|
|
709
709
|
:token_reduction, :keywords, :html_options, :pages,
|
|
710
|
-
:max_concurrent_extractions, :output_format, :result_format
|
|
710
|
+
:max_concurrent_extractions, :output_format, :result_format,
|
|
711
|
+
:security_limits
|
|
711
712
|
|
|
712
713
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
713
714
|
alias image_extraction images
|
|
@@ -732,6 +733,7 @@ module Kreuzberg
|
|
|
732
733
|
language_detection pdf_options image_extraction
|
|
733
734
|
postprocessor token_reduction keywords html_options pages
|
|
734
735
|
max_concurrent_extractions output_format result_format
|
|
736
|
+
security_limits
|
|
735
737
|
].freeze
|
|
736
738
|
|
|
737
739
|
# Aliases for backward compatibility
|
|
@@ -804,7 +806,8 @@ module Kreuzberg
|
|
|
804
806
|
pages: nil,
|
|
805
807
|
max_concurrent_extractions: nil,
|
|
806
808
|
output_format: nil,
|
|
807
|
-
result_format: nil
|
|
809
|
+
result_format: nil,
|
|
810
|
+
security_limits: nil)
|
|
808
811
|
kwargs = {
|
|
809
812
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
810
813
|
force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
@@ -812,7 +815,8 @@ module Kreuzberg
|
|
|
812
815
|
postprocessor: postprocessor,
|
|
813
816
|
token_reduction: token_reduction, keywords: keywords, html_options: html_options,
|
|
814
817
|
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
815
|
-
output_format: output_format, result_format: result_format
|
|
818
|
+
output_format: output_format, result_format: result_format,
|
|
819
|
+
security_limits: security_limits
|
|
816
820
|
}
|
|
817
821
|
extracted = extract_from_hash(hash, kwargs)
|
|
818
822
|
|
|
@@ -843,6 +847,7 @@ module Kreuzberg
|
|
|
843
847
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
844
848
|
@output_format = validate_output_format(params[:output_format])
|
|
845
849
|
@result_format = validate_result_format(params[:result_format])
|
|
850
|
+
@security_limits = params[:security_limits]
|
|
846
851
|
end
|
|
847
852
|
|
|
848
853
|
def validate_output_format(value)
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.14"
|
|
3
|
+
version = "4.2.15"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
7
|
-
description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and
|
|
7
|
+
description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 62+ formats with async/sync APIs."
|
|
8
8
|
license = "MIT"
|
|
9
9
|
repository = "https://github.com/kreuzberg-dev/kreuzberg"
|
|
10
10
|
homepage = "https://kreuzberg.dev"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.14 Release**
|
|
20
|
+
> **🚀 Version 4.2.15 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
data/vendor/kreuzberg/src/core/mime.rs
CHANGED
|
@@ -118,6 +118,8 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
|
|
|
118
118
|
m.insert("fb2", "application/x-fictionbook+xml");
|
|
119
119
|
m.insert("opml", "application/xml+opml");
|
|
120
120
|
m.insert("dbk", "application/docbook+xml");
|
|
121
|
+
m.insert("docbook", "application/docbook+xml");
|
|
122
|
+
m.insert("jats", "application/x-jats+xml");
|
|
121
123
|
m.insert("ipynb", "application/x-ipynb+json");
|
|
122
124
|
m.insert("tex", "application/x-latex");
|
|
123
125
|
m.insert("latex", "application/x-latex");
|
data/vendor/kreuzberg/src/extraction/email.rs
CHANGED
|
@@ -54,10 +54,61 @@ fn whitespace_regex() -> &'static Regex {
|
|
|
54
54
|
WHITESPACE_RE.get_or_init(|| Regex::new(r"\s+").unwrap())
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
+
/// Detect UTF-16 encoding (with or without BOM) and transcode to UTF-8 if needed.
|
|
58
|
+
///
|
|
59
|
+
/// `mail_parser` expects ASCII/UTF-8 input. If the EML file is encoded as
|
|
60
|
+
/// UTF-16, we transcode it to UTF-8 first.
|
|
61
|
+
///
|
|
62
|
+
/// Detection strategy:
|
|
63
|
+
/// 1. Check for BOM (`FF FE` = LE, `FE FF` = BE)
|
|
64
|
+
/// 2. If no BOM, use heuristic: EML files start with ASCII headers, so
|
|
65
|
+
/// alternating zero bytes indicate UTF-16 encoding.
|
|
66
|
+
fn maybe_transcode_utf16(data: &[u8]) -> Option<Vec<u8>> {
|
|
67
|
+
if data.len() < 4 {
|
|
68
|
+
return None;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
let (is_le, skip) = if data[0] == 0xFF && data[1] == 0xFE {
|
|
72
|
+
(true, 2)
|
|
73
|
+
} else if data[0] == 0xFE && data[1] == 0xFF {
|
|
74
|
+
(false, 2)
|
|
75
|
+
} else if data[1] == 0x00 && data[3] == 0x00 && data[0] != 0x00 && data[2] != 0x00 {
|
|
76
|
+
// No BOM, but looks like UTF-16 LE (e.g. "M\0I\0M\0E\0")
|
|
77
|
+
(true, 0)
|
|
78
|
+
} else if data[0] == 0x00 && data[2] == 0x00 && data[1] != 0x00 && data[3] != 0x00 {
|
|
79
|
+
// No BOM, but looks like UTF-16 BE (e.g. "\0M\0I\0M\0E")
|
|
80
|
+
(false, 0)
|
|
81
|
+
} else {
|
|
82
|
+
return None;
|
|
83
|
+
};
|
|
84
|
+
|
|
85
|
+
let payload = &data[skip..];
|
|
86
|
+
let even_len = payload.len() & !1;
|
|
87
|
+
let u16_iter = (0..even_len).step_by(2).map(|i| {
|
|
88
|
+
if is_le {
|
|
89
|
+
u16::from_le_bytes([payload[i], payload[i + 1]])
|
|
90
|
+
} else {
|
|
91
|
+
u16::from_be_bytes([payload[i], payload[i + 1]])
|
|
92
|
+
}
|
|
93
|
+
});
|
|
94
|
+
|
|
95
|
+
match String::from_utf16(&u16_iter.collect::<Vec<u16>>()) {
|
|
96
|
+
Ok(s) => Some(s.into_bytes()),
|
|
97
|
+
Err(_) => None,
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
57
101
|
/// Parse .eml file content (RFC822 format)
|
|
58
102
|
pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
|
|
103
|
+
// Transcode UTF-16 to UTF-8 if UTF-16 input is detected (via BOM or the BOM-less heuristic)
|
|
104
|
+
let data = if let Some(transcoded) = maybe_transcode_utf16(data) {
|
|
105
|
+
std::borrow::Cow::Owned(transcoded)
|
|
106
|
+
} else {
|
|
107
|
+
std::borrow::Cow::Borrowed(data)
|
|
108
|
+
};
|
|
109
|
+
|
|
59
110
|
let message = mail_parser::MessageParser::default()
|
|
60
|
-
.parse(data)
|
|
111
|
+
.parse(&data)
|
|
61
112
|
.ok_or_else(|| KreuzbergError::parsing("Failed to parse EML file: invalid email format".to_string()))?;
|
|
62
113
|
|
|
63
114
|
let subject = message.subject().map(|s| s.to_string());
|
|
@@ -293,14 +344,11 @@ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
|
|
|
293
344
|
if let Some(ref msg_id) = message_id {
|
|
294
345
|
metadata.insert("message_id".to_string(), msg_id.to_string());
|
|
295
346
|
}
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
.collect();
|
|
302
|
-
metadata.insert("attachments".to_string(), attachment_names.join(", "));
|
|
303
|
-
}
|
|
347
|
+
// NOTE: Do NOT insert "attachments" into the metadata HashMap here.
|
|
348
|
+
// The attachments are already stored in EmailMetadata.attachments (Vec<String>).
|
|
349
|
+
// Since both `format` and `additional` use #[serde(flatten)], inserting a
|
|
350
|
+
// comma-joined string here would overwrite the structured array, breaking
|
|
351
|
+
// deserialization in Go, C#, and other typed bindings.
|
|
304
352
|
|
|
305
353
|
Ok(EmailExtractionResult {
|
|
306
354
|
subject,
|
data/vendor/kreuzberg/src/extractors/docx.rs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#![cfg(
|
|
1
|
+
#![cfg(feature = "office")]
|
|
2
2
|
|
|
3
3
|
//! DOCX extractor for high-performance text extraction.
|
|
4
4
|
//!
|
|
@@ -8,7 +8,9 @@ use crate::Result;
|
|
|
8
8
|
use crate::core::config::ExtractionConfig;
|
|
9
9
|
use crate::extraction::{cells_to_markdown, office_metadata};
|
|
10
10
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
11
|
-
|
|
11
|
+
#[cfg(feature = "tokio-runtime")]
|
|
12
|
+
use crate::types::PageBoundary;
|
|
13
|
+
use crate::types::{ExtractionResult, Metadata, PageInfo, PageStructure, PageUnitType, Table};
|
|
12
14
|
use ahash::AHashMap;
|
|
13
15
|
use async_trait::async_trait;
|
|
14
16
|
use std::borrow::Cow;
|
|
@@ -112,63 +114,96 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
112
114
|
mime_type: &str,
|
|
113
115
|
_config: &ExtractionConfig,
|
|
114
116
|
) -> Result<ExtractionResult> {
|
|
115
|
-
let (text, tables, page_boundaries) =
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
117
|
+
let (text, tables, page_boundaries) = {
|
|
118
|
+
#[cfg(feature = "tokio-runtime")]
|
|
119
|
+
if crate::core::batch_mode::is_batch_mode() {
|
|
120
|
+
let content_owned = content.to_vec();
|
|
121
|
+
let span = tracing::Span::current();
|
|
122
|
+
tokio::task::spawn_blocking(
|
|
123
|
+
move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
|
|
124
|
+
let _guard = span.entered();
|
|
125
|
+
let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
|
|
126
|
+
|
|
127
|
+
let text = doc.extract_text();
|
|
128
|
+
|
|
129
|
+
let tables: Vec<Table> = doc
|
|
130
|
+
.tables
|
|
131
|
+
.iter()
|
|
132
|
+
.enumerate()
|
|
133
|
+
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
|
|
134
|
+
.collect();
|
|
135
|
+
|
|
136
|
+
let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(&content_owned)?;
|
|
137
|
+
|
|
138
|
+
Ok((text, tables, page_boundaries))
|
|
139
|
+
},
|
|
140
|
+
)
|
|
141
|
+
.await
|
|
142
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
|
|
143
|
+
} else {
|
|
144
|
+
let doc = crate::extraction::docx::parser::parse_document(content)?;
|
|
145
|
+
|
|
146
|
+
let text = doc.extract_text();
|
|
147
|
+
|
|
148
|
+
let tables: Vec<Table> = doc
|
|
149
|
+
.tables
|
|
150
|
+
.iter()
|
|
151
|
+
.enumerate()
|
|
152
|
+
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
|
|
153
|
+
.collect();
|
|
154
|
+
|
|
155
|
+
let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
|
|
156
|
+
|
|
157
|
+
(text, tables, page_boundaries)
|
|
158
|
+
}
|
|
133
159
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
.await
|
|
138
|
-
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
|
|
139
|
-
} else {
|
|
140
|
-
let doc = crate::extraction::docx::parser::parse_document(content)?;
|
|
160
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
161
|
+
{
|
|
162
|
+
let doc = crate::extraction::docx::parser::parse_document(content)?;
|
|
141
163
|
|
|
142
|
-
|
|
164
|
+
let text = doc.extract_text();
|
|
143
165
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
166
|
+
let tables: Vec<Table> = doc
|
|
167
|
+
.tables
|
|
168
|
+
.iter()
|
|
169
|
+
.enumerate()
|
|
170
|
+
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
|
|
171
|
+
.collect();
|
|
150
172
|
|
|
151
|
-
|
|
173
|
+
let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
|
|
152
174
|
|
|
153
|
-
|
|
175
|
+
(text, tables, page_boundaries)
|
|
176
|
+
}
|
|
154
177
|
};
|
|
155
178
|
|
|
156
|
-
let mut archive =
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
let
|
|
179
|
+
let mut archive = {
|
|
180
|
+
#[cfg(feature = "tokio-runtime")]
|
|
181
|
+
if crate::core::batch_mode::is_batch_mode() {
|
|
182
|
+
let content_owned = content.to_vec();
|
|
183
|
+
let span = tracing::Span::current();
|
|
184
|
+
tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
|
|
185
|
+
let _guard = span.entered();
|
|
186
|
+
let cursor = Cursor::new(content_owned);
|
|
187
|
+
zip::ZipArchive::new(cursor).map_err(|e| {
|
|
188
|
+
crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e))
|
|
189
|
+
})
|
|
190
|
+
})
|
|
191
|
+
.await
|
|
192
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
|
|
193
|
+
} else {
|
|
194
|
+
let content_owned = content.to_vec();
|
|
161
195
|
let cursor = Cursor::new(content_owned);
|
|
162
196
|
zip::ZipArchive::new(cursor)
|
|
163
|
-
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
197
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
201
|
+
{
|
|
202
|
+
let content_owned = content.to_vec();
|
|
203
|
+
let cursor = Cursor::new(content_owned);
|
|
204
|
+
zip::ZipArchive::new(cursor)
|
|
205
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
|
|
206
|
+
}
|
|
172
207
|
};
|
|
173
208
|
|
|
174
209
|
let mut metadata_map = AHashMap::new();
|
data/vendor/kreuzberg/src/extractors/mod.rs
CHANGED
|
@@ -91,7 +91,7 @@ pub mod bibtex;
|
|
|
91
91
|
#[cfg(feature = "office")]
|
|
92
92
|
pub mod citation;
|
|
93
93
|
|
|
94
|
-
#[cfg(
|
|
94
|
+
#[cfg(feature = "office")]
|
|
95
95
|
pub mod docx;
|
|
96
96
|
|
|
97
97
|
#[cfg(feature = "office")]
|
|
@@ -115,7 +115,7 @@ pub mod jupyter;
|
|
|
115
115
|
#[cfg(feature = "office")]
|
|
116
116
|
pub mod orgmode;
|
|
117
117
|
|
|
118
|
-
#[cfg(
|
|
118
|
+
#[cfg(feature = "office")]
|
|
119
119
|
pub mod odt;
|
|
120
120
|
|
|
121
121
|
#[cfg(feature = "office")]
|
|
@@ -130,7 +130,7 @@ pub mod jats;
|
|
|
130
130
|
#[cfg(feature = "pdf")]
|
|
131
131
|
pub mod pdf;
|
|
132
132
|
|
|
133
|
-
#[cfg(
|
|
133
|
+
#[cfg(feature = "office")]
|
|
134
134
|
pub mod pptx;
|
|
135
135
|
|
|
136
136
|
#[cfg(feature = "office")]
|
|
@@ -166,7 +166,7 @@ pub use bibtex::BibtexExtractor;
|
|
|
166
166
|
#[cfg(feature = "office")]
|
|
167
167
|
pub use citation::CitationExtractor;
|
|
168
168
|
|
|
169
|
-
#[cfg(
|
|
169
|
+
#[cfg(feature = "office")]
|
|
170
170
|
pub use docx::DocxExtractor;
|
|
171
171
|
|
|
172
172
|
#[cfg(feature = "office")]
|
|
@@ -192,7 +192,7 @@ pub use jupyter::JupyterExtractor;
|
|
|
192
192
|
#[cfg(feature = "office")]
|
|
193
193
|
pub use orgmode::OrgModeExtractor;
|
|
194
194
|
|
|
195
|
-
#[cfg(
|
|
195
|
+
#[cfg(feature = "office")]
|
|
196
196
|
pub use odt::OdtExtractor;
|
|
197
197
|
|
|
198
198
|
#[cfg(feature = "xml")]
|
|
@@ -207,7 +207,7 @@ pub use typst::TypstExtractor;
|
|
|
207
207
|
#[cfg(feature = "pdf")]
|
|
208
208
|
pub use pdf::PdfExtractor;
|
|
209
209
|
|
|
210
|
-
#[cfg(
|
|
210
|
+
#[cfg(feature = "office")]
|
|
211
211
|
pub use pptx::PptxExtractor;
|
|
212
212
|
|
|
213
213
|
#[cfg(feature = "office")]
|
|
@@ -312,10 +312,6 @@ pub fn register_default_extractors() -> Result<()> {
|
|
|
312
312
|
registry.register(Arc::new(OrgModeExtractor::new()))?;
|
|
313
313
|
registry.register(Arc::new(OpmlExtractor::new()))?;
|
|
314
314
|
registry.register(Arc::new(TypstExtractor::new()))?;
|
|
315
|
-
}
|
|
316
|
-
|
|
317
|
-
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
318
|
-
{
|
|
319
315
|
registry.register(Arc::new(DocxExtractor::new()))?;
|
|
320
316
|
registry.register(Arc::new(PptxExtractor::new()))?;
|
|
321
317
|
registry.register(Arc::new(OdtExtractor::new()))?;
|