kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -0,0 +1,201 @@
1
+ //! Pandoc-based extractors for various document formats.
2
+ //!
3
+ //! Supports: DOCX, ODT, EPUB, LaTeX, RST, RTF, and many more formats via Pandoc.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::ExtractionConfig;
7
+ use crate::extraction::pandoc::extract_bytes_from_mime;
8
+ use crate::plugins::{DocumentExtractor, Plugin};
9
+ use crate::types::{ExtractionResult, Metadata};
10
+ use async_trait::async_trait;
11
+
12
+ /// Generic Pandoc extractor for all Pandoc-supported formats.
13
+ ///
14
+ /// This extractor handles all document formats supported by Pandoc, including:
15
+ /// - Microsoft Word (DOCX)
16
+ /// - OpenDocument Text (ODT)
17
+ /// - EPUB
18
+ /// - LaTeX
19
+ /// - reStructuredText (RST)
20
+ /// - RTF
21
+ /// - And many more
22
+ pub struct PandocExtractor;
23
+
24
+ impl PandocExtractor {
25
+ /// Create a new Pandoc extractor.
26
+ pub fn new() -> Self {
27
+ Self
28
+ }
29
+ }
30
+
31
+ impl Default for PandocExtractor {
32
+ fn default() -> Self {
33
+ Self::new()
34
+ }
35
+ }
36
+
37
+ impl Plugin for PandocExtractor {
38
+ fn name(&self) -> &str {
39
+ "pandoc-extractor"
40
+ }
41
+
42
+ fn version(&self) -> String {
43
+ env!("CARGO_PKG_VERSION").to_string()
44
+ }
45
+
46
+ fn initialize(&self) -> Result<()> {
47
+ Ok(())
48
+ }
49
+
50
+ fn shutdown(&self) -> Result<()> {
51
+ Ok(())
52
+ }
53
+
54
+ fn description(&self) -> &str {
55
+ "Extracts content from Pandoc-supported formats (DOCX, ODT, EPUB, LaTeX, RST, RTF, etc.)"
56
+ }
57
+
58
+ fn author(&self) -> &str {
59
+ "Kreuzberg Team"
60
+ }
61
+ }
62
+
63
+ #[async_trait]
64
+ impl DocumentExtractor for PandocExtractor {
65
+ async fn extract_bytes(
66
+ &self,
67
+ content: &[u8],
68
+ mime_type: &str,
69
+ _config: &ExtractionConfig,
70
+ ) -> Result<ExtractionResult> {
71
+ let pandoc_result = extract_bytes_from_mime(content, mime_type).await?;
72
+
73
+ let mut additional = std::collections::HashMap::new();
74
+ for (key, value) in pandoc_result.metadata {
75
+ additional.insert(key, value);
76
+ }
77
+
78
+ Ok(ExtractionResult {
79
+ content: pandoc_result.content,
80
+ mime_type: mime_type.to_string(),
81
+ metadata: Metadata {
82
+ additional,
83
+ ..Default::default()
84
+ },
85
+ tables: vec![],
86
+ detected_languages: None,
87
+ chunks: None,
88
+ images: None,
89
+ })
90
+ }
91
+
92
+ fn supported_mime_types(&self) -> &[&str] {
93
+ &[
94
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
95
+ "application/vnd.oasis.opendocument.text",
96
+ "application/epub+zip",
97
+ "application/x-latex",
98
+ "text/x-tex",
99
+ "text/x-rst",
100
+ "text/prs.fallenstein.rst",
101
+ "application/rtf",
102
+ "text/rtf",
103
+ "application/x-typst",
104
+ "application/x-ipynb+json",
105
+ "application/x-fictionbook+xml",
106
+ "text/x-org",
107
+ "text/x-commonmark",
108
+ "text/x-gfm",
109
+ "text/x-multimarkdown",
110
+ "text/x-markdown-extra",
111
+ "application/docbook+xml",
112
+ "application/x-jats+xml",
113
+ "application/x-opml+xml",
114
+ ]
115
+ }
116
+
117
+ fn priority(&self) -> i32 {
118
+ 40
119
+ }
120
+ }
121
+
122
+ #[cfg(test)]
123
+ mod tests {
124
+ use super::*;
125
+ use crate::extraction::pandoc::validate_pandoc_version;
126
+
127
+ #[tokio::test]
128
+ async fn test_pandoc_extractor_plugin_interface() {
129
+ let extractor = PandocExtractor::new();
130
+ assert_eq!(extractor.name(), "pandoc-extractor");
131
+ assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
132
+ assert_eq!(extractor.priority(), 40);
133
+ assert!(!extractor.supported_mime_types().is_empty());
134
+ }
135
+
136
+ #[tokio::test]
137
+ async fn test_pandoc_extractor_supports_docx() {
138
+ let extractor = PandocExtractor::new();
139
+ assert!(
140
+ extractor
141
+ .supported_mime_types()
142
+ .contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document")
143
+ );
144
+ }
145
+
146
+ #[tokio::test]
147
+ async fn test_pandoc_extractor_supports_odt() {
148
+ let extractor = PandocExtractor::new();
149
+ assert!(
150
+ extractor
151
+ .supported_mime_types()
152
+ .contains(&"application/vnd.oasis.opendocument.text")
153
+ );
154
+ }
155
+
156
+ #[tokio::test]
157
+ async fn test_pandoc_extractor_supports_epub() {
158
+ let extractor = PandocExtractor::new();
159
+ assert!(extractor.supported_mime_types().contains(&"application/epub+zip"));
160
+ }
161
+
162
+ #[tokio::test]
163
+ async fn test_pandoc_extractor_supports_latex() {
164
+ let extractor = PandocExtractor::new();
165
+ assert!(extractor.supported_mime_types().contains(&"application/x-latex"));
166
+ }
167
+
168
+ #[tokio::test]
169
+ async fn test_pandoc_extractor_supports_rst() {
170
+ let extractor = PandocExtractor::new();
171
+ assert!(extractor.supported_mime_types().contains(&"text/x-rst"));
172
+ }
173
+
174
+ #[tokio::test]
175
+ async fn test_pandoc_extractor_markdown() {
176
+ if validate_pandoc_version().await.is_err() {
177
+ return;
178
+ }
179
+
180
+ let extractor = PandocExtractor::new();
181
+ let markdown = b"# Hello World\n\nThis is a test.";
182
+ let config = ExtractionConfig::default();
183
+
184
+ let result = extractor.extract_bytes(markdown, "text/x-rst", &config).await;
185
+
186
+ let _ = result;
187
+ }
188
+
189
+ #[tokio::test]
190
+ async fn test_pandoc_extractor_default() {
191
+ let extractor = PandocExtractor;
192
+ assert_eq!(extractor.name(), "pandoc-extractor");
193
+ }
194
+
195
+ #[tokio::test]
196
+ async fn test_pandoc_extractor_initialize_shutdown() {
197
+ let extractor = PandocExtractor::new();
198
+ assert!(extractor.initialize().is_ok());
199
+ assert!(extractor.shutdown().is_ok());
200
+ }
201
+ }
@@ -147,25 +147,31 @@ fn extract_tables_from_document(
147
147
 
148
148
  let mut all_tables = Vec::new();
149
149
 
150
+ // Process each page
150
151
  for (page_index, page) in document.pages().iter().enumerate() {
151
- let words = extract_words_from_page(&page, 0.0)?;
152
+ // Extract words with positions from the page
153
+ let words = extract_words_from_page(&page, 0.0)?; // Use 0.0 confidence for PDF (always high quality)
152
154
 
153
155
  if words.is_empty() {
154
156
  continue;
155
157
  }
156
158
 
159
+ // Use existing table reconstruction logic
160
+ // These thresholds match the defaults from TesseractConfig
157
161
  let column_threshold = 50;
158
162
  let row_threshold_ratio = 0.5;
159
163
 
164
+ // Reconstruct table from positioned words
160
165
  let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio, true);
161
166
 
162
167
  if !table_cells.is_empty() {
168
+ // Generate markdown representation
163
169
  let markdown = table_to_markdown(&table_cells);
164
170
 
165
171
  all_tables.push(Table {
166
172
  cells: table_cells,
167
173
  markdown,
168
- page_number: page_index + 1,
174
+ page_number: page_index + 1, // 1-indexed
169
175
  });
170
176
  }
171
177
  }
@@ -281,13 +287,6 @@ impl Plugin for PdfExtractor {
281
287
 
282
288
  #[async_trait]
283
289
  impl DocumentExtractor for PdfExtractor {
284
- #[cfg_attr(feature = "otel", tracing::instrument(
285
- skip(self, content, config),
286
- fields(
287
- extractor.name = self.name(),
288
- content.size_bytes = content.len(),
289
- )
290
- ))]
291
290
  async fn extract_bytes(
292
291
  &self,
293
292
  content: &[u8],
@@ -296,10 +295,9 @@ impl DocumentExtractor for PdfExtractor {
296
295
  ) -> Result<ExtractionResult> {
297
296
  #[cfg(feature = "pdf")]
298
297
  let (pdf_metadata, native_text, tables) = if crate::core::batch_mode::is_batch_mode() {
298
+ // Batch mode: Move PDF extraction to blocking thread pool to enable parallelism
299
299
  let content_owned = content.to_vec();
300
- let span = tracing::Span::current();
301
300
  tokio::task::spawn_blocking(move || {
302
- let _guard = span.entered();
303
301
  let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
304
302
  .or_else(|_| Pdfium::bind_to_system_library())
305
303
  .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
@@ -318,6 +316,7 @@ impl DocumentExtractor for PdfExtractor {
318
316
  let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
319
317
  let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
320
318
 
319
+ // Extract tables from native PDF text (when not using OCR)
321
320
  let tables = extract_tables_from_document(&document, &metadata)?;
322
321
 
323
322
  Ok::<_, crate::error::KreuzbergError>((metadata, native_text, tables))
@@ -325,6 +324,7 @@ impl DocumentExtractor for PdfExtractor {
325
324
  .await
326
325
  .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
327
326
  } else {
327
+ // Single-file mode: Direct extraction (no spawn overhead)
328
328
  let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
329
329
  .or_else(|_| Pdfium::bind_to_system_library())
330
330
  .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
@@ -343,6 +343,7 @@ impl DocumentExtractor for PdfExtractor {
343
343
  let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
344
344
  let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
345
345
 
346
+ // Extract tables from native PDF text (when not using OCR)
346
347
  let tables = extract_tables_from_document(&document, &metadata)?;
347
348
 
348
349
  (metadata, native_text, tables)
@@ -415,6 +416,9 @@ impl DocumentExtractor for PdfExtractor {
415
416
  None
416
417
  };
417
418
 
419
+ // Tables were extracted during metadata/text extraction phase
420
+ // (see extract_tables_from_document function below)
421
+
418
422
  Ok(ExtractionResult {
419
423
  content: text,
420
424
  mime_type: mime_type.to_string(),
@@ -430,7 +434,6 @@ impl DocumentExtractor for PdfExtractor {
430
434
  })
431
435
  }
432
436
 
433
- #[cfg(feature = "tokio-runtime")]
434
437
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
435
438
  let bytes = tokio::fs::read(path).await?;
436
439
  self.extract_bytes(&bytes, mime_type, config).await
@@ -43,10 +43,8 @@ impl PptxExtractor {
43
43
  for image in &mut images {
44
44
  let image_data = image.data.clone();
45
45
  let tess_config_clone = tess_config.clone();
46
- let span = tracing::Span::current();
47
46
 
48
47
  let ocr_result = tokio::task::spawn_blocking(move || {
49
- let _guard = span.entered();
50
48
  let cache_dir = std::env::var("KREUZBERG_CACHE_DIR").ok().map(std::path::PathBuf::from);
51
49
 
52
50
  let proc = OcrProcessor::new(cache_dir)?;
@@ -102,13 +100,6 @@ impl Plugin for PptxExtractor {
102
100
 
103
101
  #[async_trait]
104
102
  impl DocumentExtractor for PptxExtractor {
105
- #[cfg_attr(feature = "otel", tracing::instrument(
106
- skip(self, content, config),
107
- fields(
108
- extractor.name = self.name(),
109
- content.size_bytes = content.len(),
110
- )
111
- ))]
112
103
  async fn extract_bytes(
113
104
  &self,
114
105
  content: &[u8],
@@ -117,16 +108,17 @@ impl DocumentExtractor for PptxExtractor {
117
108
  ) -> Result<ExtractionResult> {
118
109
  let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
119
110
 
111
+ // Extract PPTX content
120
112
  let pptx_result = if crate::core::batch_mode::is_batch_mode() {
113
+ // Batch mode: Use spawn_blocking for parallelism
121
114
  let content_owned = content.to_vec();
122
- let span = tracing::Span::current();
123
115
  tokio::task::spawn_blocking(move || {
124
- let _guard = span.entered();
125
116
  crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images)
126
117
  })
127
118
  .await
128
119
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("PPTX extraction task failed: {}", e)))??
129
120
  } else {
121
+ // Single-file mode: Direct extraction (no spawn overhead)
130
122
  crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images)?
131
123
  };
132
124
 
@@ -164,12 +156,6 @@ impl DocumentExtractor for PptxExtractor {
164
156
  })
165
157
  }
166
158
 
167
- #[cfg_attr(feature = "otel", tracing::instrument(
168
- skip(self, path, config),
169
- fields(
170
- extractor.name = self.name(),
171
- )
172
- ))]
173
159
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
174
160
  let path_str = path
175
161
  .to_str()
@@ -42,13 +42,6 @@ impl Plugin for StructuredExtractor {
42
42
 
43
43
  #[async_trait]
44
44
  impl DocumentExtractor for StructuredExtractor {
45
- #[cfg_attr(feature = "otel", tracing::instrument(
46
- skip(self, content, _config),
47
- fields(
48
- extractor.name = self.name(),
49
- content.size_bytes = content.len(),
50
- )
51
- ))]
52
45
  async fn extract_bytes(
53
46
  &self,
54
47
  content: &[u8],
@@ -87,13 +80,6 @@ impl DocumentExtractor for StructuredExtractor {
87
80
  })
88
81
  }
89
82
 
90
- #[cfg(feature = "tokio-runtime")]
91
- #[cfg_attr(feature = "otel", tracing::instrument(
92
- skip(self, path, config),
93
- fields(
94
- extractor.name = self.name(),
95
- )
96
- ))]
97
83
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
98
84
  let bytes = tokio::fs::read(path).await?;
99
85
  self.extract_bytes(&bytes, mime_type, config).await
@@ -53,33 +53,22 @@ impl Plugin for PlainTextExtractor {
53
53
 
54
54
  #[async_trait]
55
55
  impl DocumentExtractor for PlainTextExtractor {
56
- #[cfg_attr(feature = "otel", tracing::instrument(
57
- skip(self, content, _config),
58
- fields(
59
- extractor.name = self.name(),
60
- content.size_bytes = content.len(),
61
- )
62
- ))]
63
56
  async fn extract_bytes(
64
57
  &self,
65
58
  content: &[u8],
66
59
  mime_type: &str,
67
60
  _config: &ExtractionConfig,
68
61
  ) -> Result<ExtractionResult> {
69
- let text = String::from_utf8_lossy(content).into_owned();
70
- let text = text.trim_end_matches('\n').trim_end_matches('\r').to_string();
71
- let line_count = text.lines().count();
72
- let word_count = text.split_whitespace().count();
73
- let character_count = text.len();
62
+ let text_result = parse_text(content, false)?;
74
63
 
75
64
  Ok(ExtractionResult {
76
- content: text,
65
+ content: text_result.content,
77
66
  mime_type: mime_type.to_string(),
78
67
  metadata: crate::types::Metadata {
79
68
  format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
80
- line_count,
81
- word_count,
82
- character_count,
69
+ line_count: text_result.line_count,
70
+ word_count: text_result.word_count,
71
+ character_count: text_result.character_count,
83
72
  headers: None,
84
73
  links: None,
85
74
  code_blocks: None,
@@ -149,13 +138,6 @@ impl Plugin for MarkdownExtractor {
149
138
 
150
139
  #[async_trait]
151
140
  impl DocumentExtractor for MarkdownExtractor {
152
- #[cfg_attr(feature = "otel", tracing::instrument(
153
- skip(self, content, _config),
154
- fields(
155
- extractor.name = self.name(),
156
- content.size_bytes = content.len(),
157
- )
158
- ))]
159
141
  async fn extract_bytes(
160
142
  &self,
161
143
  content: &[u8],
@@ -53,13 +53,6 @@ impl Plugin for XmlExtractor {
53
53
 
54
54
  #[async_trait]
55
55
  impl DocumentExtractor for XmlExtractor {
56
- #[cfg_attr(feature = "otel", tracing::instrument(
57
- skip(self, content, _config),
58
- fields(
59
- extractor.name = self.name(),
60
- content.size_bytes = content.len(),
61
- )
62
- ))]
63
56
  async fn extract_bytes(
64
57
  &self,
65
58
  content: &[u8],
@@ -248,6 +248,7 @@ mod tests {
248
248
  let english_text = "Natural language processing is a subfield of artificial intelligence.";
249
249
  let config = KeywordConfig::rake().with_language("fr");
250
250
  let keywords = extract_keywords_rake(english_text, &config).unwrap();
251
+ dbg!(&keywords);
251
252
  assert!(
252
253
  !keywords.is_empty(),
253
254
  "Should fall back to English stopwords and extract keywords"
@@ -39,7 +39,6 @@ pub mod core;
39
39
  pub mod error;
40
40
  pub mod extraction;
41
41
  pub mod extractors;
42
- pub mod panic_context;
43
42
  pub mod plugins;
44
43
  pub mod text;
45
44
  pub mod types;
@@ -80,9 +79,7 @@ pub mod pdf;
80
79
  pub use error::{KreuzbergError, Result};
81
80
  pub use types::*;
82
81
 
83
- #[cfg(feature = "tokio-runtime")]
84
- pub use core::extractor::{batch_extract_bytes, batch_extract_file};
85
- pub use core::extractor::{extract_bytes, extract_file};
82
+ pub use core::extractor::{batch_extract_bytes, batch_extract_file, extract_bytes, extract_file};
86
83
 
87
84
  pub use core::extractor::{batch_extract_bytes_sync, batch_extract_file_sync, extract_bytes_sync, extract_file_sync};
88
85
 
@@ -16,7 +16,7 @@
16
16
  //! use kreuzberg::mcp::start_mcp_server;
17
17
  //!
18
18
  //! #[tokio::main]
19
- //! async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
19
+ //! async fn main() -> anyhow::Result<()> {
20
20
  //! start_mcp_server().await?;
21
21
  //! Ok(())
22
22
  //! }
@@ -428,12 +428,12 @@ impl Default for KreuzbergMcp {
428
428
  /// use kreuzberg::mcp::start_mcp_server;
429
429
  ///
430
430
  /// #[tokio::main]
431
- /// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
431
+ /// async fn main() -> anyhow::Result<()> {
432
432
  /// start_mcp_server().await?;
433
433
  /// Ok(())
434
434
  /// }
435
435
  /// ```
436
- pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
436
+ pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error>> {
437
437
  let service = KreuzbergMcp::new()?.serve(stdio()).await?;
438
438
 
439
439
  service.waiting().await?;
@@ -444,9 +444,7 @@ pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send +
444
444
  ///
445
445
  /// This variant allows specifying a custom extraction configuration
446
446
  /// (e.g., loaded from a file) instead of using defaults.
447
- pub async fn start_mcp_server_with_config(
448
- config: ExtractionConfig,
449
- ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
447
+ pub async fn start_mcp_server_with_config(config: ExtractionConfig) -> Result<(), Box<dyn std::error::Error>> {
450
448
  let service = KreuzbergMcp::with_config(config).serve(stdio()).await?;
451
449
 
452
450
  service.waiting().await?;
@@ -51,14 +51,6 @@ impl OcrProcessor {
51
51
  Ok(Self { cache })
52
52
  }
53
53
 
54
- #[cfg_attr(feature = "otel", tracing::instrument(
55
- skip(self, image_bytes),
56
- fields(
57
- ocr.backend = "tesseract",
58
- ocr.language = %config.language,
59
- image.size_bytes = image_bytes.len(),
60
- )
61
- ))]
62
54
  pub fn process_image(&self, image_bytes: &[u8], config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
63
55
  config.validate().map_err(OcrError::InvalidConfiguration)?;
64
56
 
@@ -72,14 +64,9 @@ impl OcrProcessor {
72
64
  if config.use_cache
73
65
  && let Some(cached_result) = self.cache.get_cached_result(&image_hash, "tesseract", &config_str)?
74
66
  {
75
- #[cfg(feature = "otel")]
76
- tracing::Span::current().record("cache.hit", true);
77
67
  return Ok(cached_result);
78
68
  }
79
69
 
80
- #[cfg(feature = "otel")]
81
- tracing::Span::current().record("cache.hit", false);
82
-
83
70
  let result = self.perform_ocr(image_bytes, config)?;
84
71
 
85
72
  if config.use_cache {
@@ -241,6 +228,7 @@ impl OcrProcessor {
241
228
  });
242
229
 
243
230
  // Validate language before initializing to prevent segfault ~keep
231
+ // tesseract-rs can crash on empty language or missing language files
244
232
  if config.language.trim().is_empty() {
245
233
  return Err(OcrError::TesseractInitializationFailed(
246
234
  "Language cannot be empty. Please specify a valid language code (e.g., 'eng')".to_string(),
@@ -248,6 +236,7 @@ impl OcrProcessor {
248
236
  }
249
237
 
250
238
  // Validate language file exists before initializing to prevent segfault ~keep
239
+ // tesseract-rs can crash if language file is missing instead of returning error
251
240
  if !tessdata_path.is_empty() {
252
241
  let languages: Vec<&str> = config.language.split('+').collect();
253
242
  for lang in languages {
@@ -373,11 +362,6 @@ impl OcrProcessor {
373
362
  )
374
363
  });
375
364
 
376
- api.recognize()
377
- .map_err(|e| OcrError::ProcessingFailed(format!("Failed to recognize text: {}", e)))?;
378
-
379
- log_ci_debug(ci_debug_enabled, "recognize", || "completed".to_string());
380
-
381
365
  let tsv_data_for_tables = if config.enable_table_detection || config.output_format == "tsv" {
382
366
  Some(
383
367
  api.get_tsv_text(0)
@@ -40,7 +40,7 @@ impl std::error::Error for PdfError {}
40
40
  impl From<lopdf::Error> for PdfError {
41
41
  fn from(err: lopdf::Error) -> Self {
42
42
  match err {
43
- lopdf::Error::IO(io_err) => PdfError::IOError(io_err.to_string()),
43
+ lopdf::Error::IO(_) => panic!("lopdf IO errors should not be converted to PdfError - let them bubble up"),
44
44
  _ => PdfError::InvalidPdf(err.to_string()),
45
45
  }
46
46
  }