kreuzberg 4.2.14 → 4.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da61e06dfa4643e485c13636998888f03699816b7462087c9df6c9639d53fc45
4
- data.tar.gz: 20a9c88f3eac809d2d158e15ea3747c425d47b3af0e2bf93825c831c9aa11aa9
3
+ metadata.gz: '09cd6cd5af8800892e58b09ade91500bac99dd1149569bde1721195cc52e94a2'
4
+ data.tar.gz: 7e4c00ce10c8ee8b576f9b6a1634112b9d3b21daee1bbb69f73756027bcb8876
5
5
  SHA512:
6
- metadata.gz: 7be55db6494d45de03b3fee1271e1bc151193709098bdfe94fb7a5fb33159dd9a0b8b08fffd5ed2d3b24f3f3766c0bb1e81b42319d25b529088afe7e6a4c52d6
7
- data.tar.gz: bd94796f90094ca64775c0ded247bc216fdfe6ee50d4c6258685ccc5b5b33e1a69546cd892ac7dce77eee72123cbaf6ce239ee932a117c8ad5dfe801a9a548bf
6
+ metadata.gz: 4eaa814c5d1d2ab357df797f39614c8fcc0013fac82863ac5bd306c61e05a6eb8a026e2b463876540e9c4c74422f6bcd058e8d279fa1ccbfcfa3fc94a1e9b815
7
+ data.tar.gz: b3068c76b4f6410640b5f9110a8e8cbf52b9ee8395fa11f0a2fe078ae1ecf8ce77070edd8786a063f835320ec2d1e80f6cb7f7439eacb794377d6073c8956af5
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.14)
4
+ kreuzberg (4.2.15)
5
5
  rb_sys (~> 0.9.119)
6
6
 
7
7
  GEM
@@ -209,7 +209,7 @@ CHECKSUMS
209
209
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
210
210
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
211
211
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
212
- kreuzberg (4.2.14)
212
+ kreuzberg (4.2.15)
213
213
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
214
214
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
215
215
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.14" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.15" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.2.14"
40
+ version = "4.2.15"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
data/kreuzberg.gemspec CHANGED
@@ -169,11 +169,12 @@ Gem::Specification.new do |spec|
169
169
  spec.authors = ['Na\'aman Hirschfeld']
170
170
  spec.email = ['nhirschfeld@gmail.com']
171
171
 
172
- spec.summary = 'High-performance document intelligence framework'
172
+ spec.summary = 'Document intelligence library — extract text from PDFs, Office docs, images, and 62+ formats'
173
173
  spec.description = <<~DESC
174
- Kreuzberg is a multi-language document intelligence framework with a high-performance
175
- Rust core. Supports extraction, OCR, chunking, and language detection for 30+ file formats
176
- including PDF, DOCX, PPTX, XLSX, images, and more.
174
+ Kreuzberg is a high-performance document intelligence library with a Rust core and native
175
+ Ruby bindings via Magnus. Extract text, metadata, and structured data from 62+ file formats
176
+ including PDF, DOCX, PPTX, XLSX, HTML, RTF, images (with OCR), email, archives, and more.
177
+ Features async/sync APIs, text chunking, language detection, and keyword extraction.
177
178
  DESC
178
179
  spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
179
180
  spec.license = 'MIT'
@@ -186,7 +187,7 @@ Gem::Specification.new do |spec|
186
187
  'documentation_uri' => 'https://docs.kreuzberg.dev',
187
188
  'bug_tracker_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/issues',
188
189
  'rubygems_mfa_required' => 'true',
189
- 'keywords' => 'document-intelligence,document-extraction,ocr,rust,bindings'
190
+ 'keywords' => 'document-intelligence,document-extraction,text-extraction,ocr,pdf,rust,native-extension,nlp,rag'
190
191
  }
191
192
 
192
193
  spec.files = files
@@ -707,7 +707,8 @@ module Kreuzberg
707
707
  :ocr, :chunking, :language_detection, :pdf_options,
708
708
  :images, :postprocessor,
709
709
  :token_reduction, :keywords, :html_options, :pages,
710
- :max_concurrent_extractions, :output_format, :result_format
710
+ :max_concurrent_extractions, :output_format, :result_format,
711
+ :security_limits
711
712
 
712
713
  # Alias for backward compatibility - image_extraction is the canonical name
713
714
  alias image_extraction images
@@ -732,6 +733,7 @@ module Kreuzberg
732
733
  language_detection pdf_options image_extraction
733
734
  postprocessor token_reduction keywords html_options pages
734
735
  max_concurrent_extractions output_format result_format
736
+ security_limits
735
737
  ].freeze
736
738
 
737
739
  # Aliases for backward compatibility
@@ -804,7 +806,8 @@ module Kreuzberg
804
806
  pages: nil,
805
807
  max_concurrent_extractions: nil,
806
808
  output_format: nil,
807
- result_format: nil)
809
+ result_format: nil,
810
+ security_limits: nil)
808
811
  kwargs = {
809
812
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
810
813
  force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
@@ -812,7 +815,8 @@ module Kreuzberg
812
815
  postprocessor: postprocessor,
813
816
  token_reduction: token_reduction, keywords: keywords, html_options: html_options,
814
817
  pages: pages, max_concurrent_extractions: max_concurrent_extractions,
815
- output_format: output_format, result_format: result_format
818
+ output_format: output_format, result_format: result_format,
819
+ security_limits: security_limits
816
820
  }
817
821
  extracted = extract_from_hash(hash, kwargs)
818
822
 
@@ -843,6 +847,7 @@ module Kreuzberg
843
847
  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
844
848
  @output_format = validate_output_format(params[:output_format])
845
849
  @result_format = validate_result_format(params[:result_format])
850
+ @security_limits = params[:security_limits]
846
851
  end
847
852
 
848
853
  def validate_output_format(value)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.14'
4
+ VERSION = '4.2.15'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -242,6 +242,7 @@ module Kreuzberg
242
242
  attr_reader max_concurrent_extractions: Integer?
243
243
  attr_reader output_format: String?
244
244
  attr_reader result_format: String?
245
+ attr_reader security_limits: Hash[String, Integer]?
245
246
 
246
247
  alias image_extraction images
247
248
 
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.14"
6
+ version = "4.2.15"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,10 +1,10 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.14"
3
+ version = "4.2.15"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
7
- description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats with async/sync APIs."
7
+ description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 62+ formats with async/sync APIs."
8
8
  license = "MIT"
9
9
  repository = "https://github.com/kreuzberg-dev/kreuzberg"
10
10
  homepage = "https://kreuzberg.dev"
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.14 Release**
20
+ > **🚀 Version 4.2.15 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -118,6 +118,8 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
118
118
  m.insert("fb2", "application/x-fictionbook+xml");
119
119
  m.insert("opml", "application/xml+opml");
120
120
  m.insert("dbk", "application/docbook+xml");
121
+ m.insert("docbook", "application/docbook+xml");
122
+ m.insert("jats", "application/x-jats+xml");
121
123
  m.insert("ipynb", "application/x-ipynb+json");
122
124
  m.insert("tex", "application/x-latex");
123
125
  m.insert("latex", "application/x-latex");
@@ -54,10 +54,61 @@ fn whitespace_regex() -> &'static Regex {
54
54
  WHITESPACE_RE.get_or_init(|| Regex::new(r"\s+").unwrap())
55
55
  }
56
56
 
57
/// Detect UTF-16 encoding (with or without BOM) and transcode to UTF-8 if needed.
///
/// `mail_parser` expects ASCII/UTF-8 input. If the EML file is encoded as
/// UTF-16, we transcode it to UTF-8 first.
///
/// Detection strategy:
/// 1. Check for BOM (`FF FE` = LE, `FE FF` = BE)
/// 2. If no BOM, use heuristic: EML files start with ASCII headers, so
///    alternating zero bytes indicate UTF-16 encoding.
///
/// # Returns
///
/// * `Some(bytes)` - the UTF-8 re-encoding of `data` (BOM stripped) when the
///   input looks like UTF-16 and decodes without error.
/// * `None` - when `data` is shorter than four bytes, does not look like
///   UTF-16, or contains an invalid UTF-16 sequence (e.g. an unpaired
///   surrogate). Callers should then use the original bytes unchanged.
///
/// A trailing odd byte (half of a code unit) is ignored.
fn maybe_transcode_utf16(data: &[u8]) -> Option<Vec<u8>> {
    // Four bytes are the minimum needed by the BOM-less heuristic; shorter
    // inputs are never treated as UTF-16, even if they start with a BOM.
    if data.len() < 4 {
        return None;
    }

    let (is_le, skip) = match data {
        [0xFF, 0xFE, ..] => (true, 2),
        [0xFE, 0xFF, ..] => (false, 2),
        // No BOM, but looks like UTF-16 LE (e.g. "M\0I\0M\0E\0")
        [a, 0x00, b, 0x00, ..] if *a != 0x00 && *b != 0x00 => (true, 0),
        // No BOM, but looks like UTF-16 BE (e.g. "\0M\0I\0M\0E")
        [0x00, a, 0x00, b, ..] if *a != 0x00 && *b != 0x00 => (false, 0),
        _ => return None,
    };

    // Choose the byte-order decoder once instead of branching per code unit.
    let decode: fn([u8; 2]) -> u16 = if is_le {
        u16::from_le_bytes
    } else {
        u16::from_be_bytes
    };

    // `chunks_exact(2)` yields only complete code units, silently dropping a
    // dangling final byte.
    let code_units = data[skip..]
        .chunks_exact(2)
        .map(|pair| decode([pair[0], pair[1]]));

    // Decode straight into a `String`; the first invalid sequence aborts the
    // collection and yields `None`.
    char::decode_utf16(code_units)
        .collect::<Result<String, _>>()
        .ok()
        .map(String::into_bytes)
}
100
+
57
101
  /// Parse .eml file content (RFC822 format)
58
102
  pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
103
+ // Transcode UTF-16 to UTF-8 when detected (via BOM or the BOM-less zero-byte heuristic)
104
+ let data = if let Some(transcoded) = maybe_transcode_utf16(data) {
105
+ std::borrow::Cow::Owned(transcoded)
106
+ } else {
107
+ std::borrow::Cow::Borrowed(data)
108
+ };
109
+
59
110
  let message = mail_parser::MessageParser::default()
60
- .parse(data)
111
+ .parse(&data)
61
112
  .ok_or_else(|| KreuzbergError::parsing("Failed to parse EML file: invalid email format".to_string()))?;
62
113
 
63
114
  let subject = message.subject().map(|s| s.to_string());
@@ -293,14 +344,11 @@ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
293
344
  if let Some(ref msg_id) = message_id {
294
345
  metadata.insert("message_id".to_string(), msg_id.to_string());
295
346
  }
296
- if !attachments.is_empty() {
297
- let attachment_names: Vec<String> = attachments
298
- .iter()
299
- .filter_map(|a| a.filename.as_ref())
300
- .cloned()
301
- .collect();
302
- metadata.insert("attachments".to_string(), attachment_names.join(", "));
303
- }
347
+ // NOTE: Do NOT insert "attachments" into the metadata HashMap here.
348
+ // The attachments are already stored in EmailMetadata.attachments (Vec<String>).
349
+ // Since both `format` and `additional` use #[serde(flatten)], inserting a
350
+ // comma-joined string here would overwrite the structured array, breaking
351
+ // deserialization in Go, C#, and other typed bindings.
304
352
 
305
353
  Ok(EmailExtractionResult {
306
354
  subject,
@@ -1,4 +1,4 @@
1
- #![cfg(all(feature = "tokio-runtime", feature = "office"))]
1
+ #![cfg(feature = "office")]
2
2
 
3
3
  //! DOCX extractor for high-performance text extraction.
4
4
  //!
@@ -8,7 +8,9 @@ use crate::Result;
8
8
  use crate::core::config::ExtractionConfig;
9
9
  use crate::extraction::{cells_to_markdown, office_metadata};
10
10
  use crate::plugins::{DocumentExtractor, Plugin};
11
- use crate::types::{ExtractionResult, Metadata, PageBoundary, PageInfo, PageStructure, PageUnitType, Table};
11
+ #[cfg(feature = "tokio-runtime")]
12
+ use crate::types::PageBoundary;
13
+ use crate::types::{ExtractionResult, Metadata, PageInfo, PageStructure, PageUnitType, Table};
12
14
  use ahash::AHashMap;
13
15
  use async_trait::async_trait;
14
16
  use std::borrow::Cow;
@@ -112,63 +114,96 @@ impl DocumentExtractor for DocxExtractor {
112
114
  mime_type: &str,
113
115
  _config: &ExtractionConfig,
114
116
  ) -> Result<ExtractionResult> {
115
- let (text, tables, page_boundaries) = if crate::core::batch_mode::is_batch_mode() {
116
- let content_owned = content.to_vec();
117
- let span = tracing::Span::current();
118
- tokio::task::spawn_blocking(
119
- move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
120
- let _guard = span.entered();
121
- let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
122
-
123
- let text = doc.extract_text();
124
-
125
- let tables: Vec<Table> = doc
126
- .tables
127
- .iter()
128
- .enumerate()
129
- .map(|(idx, table)| convert_docx_table_to_table(table, idx))
130
- .collect();
131
-
132
- let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(&content_owned)?;
117
+ let (text, tables, page_boundaries) = {
118
+ #[cfg(feature = "tokio-runtime")]
119
+ if crate::core::batch_mode::is_batch_mode() {
120
+ let content_owned = content.to_vec();
121
+ let span = tracing::Span::current();
122
+ tokio::task::spawn_blocking(
123
+ move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
124
+ let _guard = span.entered();
125
+ let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
126
+
127
+ let text = doc.extract_text();
128
+
129
+ let tables: Vec<Table> = doc
130
+ .tables
131
+ .iter()
132
+ .enumerate()
133
+ .map(|(idx, table)| convert_docx_table_to_table(table, idx))
134
+ .collect();
135
+
136
+ let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(&content_owned)?;
137
+
138
+ Ok((text, tables, page_boundaries))
139
+ },
140
+ )
141
+ .await
142
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
143
+ } else {
144
+ let doc = crate::extraction::docx::parser::parse_document(content)?;
145
+
146
+ let text = doc.extract_text();
147
+
148
+ let tables: Vec<Table> = doc
149
+ .tables
150
+ .iter()
151
+ .enumerate()
152
+ .map(|(idx, table)| convert_docx_table_to_table(table, idx))
153
+ .collect();
154
+
155
+ let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
156
+
157
+ (text, tables, page_boundaries)
158
+ }
133
159
 
134
- Ok((text, tables, page_boundaries))
135
- },
136
- )
137
- .await
138
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
139
- } else {
140
- let doc = crate::extraction::docx::parser::parse_document(content)?;
160
+ #[cfg(not(feature = "tokio-runtime"))]
161
+ {
162
+ let doc = crate::extraction::docx::parser::parse_document(content)?;
141
163
 
142
- let text = doc.extract_text();
164
+ let text = doc.extract_text();
143
165
 
144
- let tables: Vec<Table> = doc
145
- .tables
146
- .iter()
147
- .enumerate()
148
- .map(|(idx, table)| convert_docx_table_to_table(table, idx))
149
- .collect();
166
+ let tables: Vec<Table> = doc
167
+ .tables
168
+ .iter()
169
+ .enumerate()
170
+ .map(|(idx, table)| convert_docx_table_to_table(table, idx))
171
+ .collect();
150
172
 
151
- let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
173
+ let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
152
174
 
153
- (text, tables, page_boundaries)
175
+ (text, tables, page_boundaries)
176
+ }
154
177
  };
155
178
 
156
- let mut archive = if crate::core::batch_mode::is_batch_mode() {
157
- let content_owned = content.to_vec();
158
- let span = tracing::Span::current();
159
- tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
160
- let _guard = span.entered();
179
+ let mut archive = {
180
+ #[cfg(feature = "tokio-runtime")]
181
+ if crate::core::batch_mode::is_batch_mode() {
182
+ let content_owned = content.to_vec();
183
+ let span = tracing::Span::current();
184
+ tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
185
+ let _guard = span.entered();
186
+ let cursor = Cursor::new(content_owned);
187
+ zip::ZipArchive::new(cursor).map_err(|e| {
188
+ crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e))
189
+ })
190
+ })
191
+ .await
192
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
193
+ } else {
194
+ let content_owned = content.to_vec();
161
195
  let cursor = Cursor::new(content_owned);
162
196
  zip::ZipArchive::new(cursor)
163
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))
164
- })
165
- .await
166
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
167
- } else {
168
- let content_owned = content.to_vec();
169
- let cursor = Cursor::new(content_owned);
170
- zip::ZipArchive::new(cursor)
171
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
197
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
198
+ }
199
+
200
+ #[cfg(not(feature = "tokio-runtime"))]
201
+ {
202
+ let content_owned = content.to_vec();
203
+ let cursor = Cursor::new(content_owned);
204
+ zip::ZipArchive::new(cursor)
205
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
206
+ }
172
207
  };
173
208
 
174
209
  let mut metadata_map = AHashMap::new();
@@ -91,7 +91,7 @@ pub mod bibtex;
91
91
  #[cfg(feature = "office")]
92
92
  pub mod citation;
93
93
 
94
- #[cfg(all(feature = "tokio-runtime", feature = "office"))]
94
+ #[cfg(feature = "office")]
95
95
  pub mod docx;
96
96
 
97
97
  #[cfg(feature = "office")]
@@ -115,7 +115,7 @@ pub mod jupyter;
115
115
  #[cfg(feature = "office")]
116
116
  pub mod orgmode;
117
117
 
118
- #[cfg(all(feature = "tokio-runtime", feature = "office"))]
118
+ #[cfg(feature = "office")]
119
119
  pub mod odt;
120
120
 
121
121
  #[cfg(feature = "office")]
@@ -130,7 +130,7 @@ pub mod jats;
130
130
  #[cfg(feature = "pdf")]
131
131
  pub mod pdf;
132
132
 
133
- #[cfg(all(feature = "tokio-runtime", feature = "office"))]
133
+ #[cfg(feature = "office")]
134
134
  pub mod pptx;
135
135
 
136
136
  #[cfg(feature = "office")]
@@ -166,7 +166,7 @@ pub use bibtex::BibtexExtractor;
166
166
  #[cfg(feature = "office")]
167
167
  pub use citation::CitationExtractor;
168
168
 
169
- #[cfg(all(feature = "tokio-runtime", feature = "office"))]
169
+ #[cfg(feature = "office")]
170
170
  pub use docx::DocxExtractor;
171
171
 
172
172
  #[cfg(feature = "office")]
@@ -192,7 +192,7 @@ pub use jupyter::JupyterExtractor;
192
192
  #[cfg(feature = "office")]
193
193
  pub use orgmode::OrgModeExtractor;
194
194
 
195
- #[cfg(all(feature = "tokio-runtime", feature = "office"))]
195
+ #[cfg(feature = "office")]
196
196
  pub use odt::OdtExtractor;
197
197
 
198
198
  #[cfg(feature = "xml")]
@@ -207,7 +207,7 @@ pub use typst::TypstExtractor;
207
207
  #[cfg(feature = "pdf")]
208
208
  pub use pdf::PdfExtractor;
209
209
 
210
- #[cfg(all(feature = "tokio-runtime", feature = "office"))]
210
+ #[cfg(feature = "office")]
211
211
  pub use pptx::PptxExtractor;
212
212
 
213
213
  #[cfg(feature = "office")]
@@ -312,10 +312,6 @@ pub fn register_default_extractors() -> Result<()> {
312
312
  registry.register(Arc::new(OrgModeExtractor::new()))?;
313
313
  registry.register(Arc::new(OpmlExtractor::new()))?;
314
314
  registry.register(Arc::new(TypstExtractor::new()))?;
315
- }
316
-
317
- #[cfg(all(feature = "tokio-runtime", feature = "office"))]
318
- {
319
315
  registry.register(Arc::new(DocxExtractor::new()))?;
320
316
  registry.register(Arc::new(PptxExtractor::new()))?;
321
317
  registry.register(Arc::new(OdtExtractor::new()))?;