kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
data/vendor/kreuzberg/src/extraction/pandoc/version.rs
@@ -0,0 +1,162 @@
+ use crate::error::{KreuzbergError, Result};
+ use once_cell::sync::OnceCell;
+ use regex::Regex;
+ use tokio::process::Command;
+
+ static PANDOC_VALIDATED: OnceCell<bool> = OnceCell::new();
+
+ /// Validate that Pandoc version 2 or above is installed and available
+ pub async fn validate_pandoc_version() -> Result<()> {
+     if PANDOC_VALIDATED.get().is_some() {
+         return Ok(());
+     }
+
+     let output = Command::new("pandoc").arg("--version").output().await.map_err(|e| {
+         KreuzbergError::MissingDependency(format!(
+             "Pandoc version 2 or above is required but not found in PATH: {}",
+             e
+         ))
+     })?;
+
+     if !output.status.success() {
+         return Err(KreuzbergError::MissingDependency(
+             "Pandoc version 2 or above is required but command failed".to_string(),
+         ));
+     }
+
+     let stdout = String::from_utf8_lossy(&output.stdout);
+
+     let version = extract_version(&stdout).ok_or_else(|| {
+         KreuzbergError::MissingDependency(format!("Could not parse Pandoc version from output: {}", stdout))
+     })?;
+
+     if version.major < 2 {
+         return Err(KreuzbergError::MissingDependency(format!(
+             "Pandoc version 2 or above is required, found version {}.{}.{}",
+             version.major, version.minor, version.patch
+         )));
+     }
+
+     let _ = PANDOC_VALIDATED.set(true);
+
+     Ok(())
+ }
+
+ #[derive(Debug, Clone)]
+ struct Version {
+     major: u32,
+     minor: u32,
+     patch: u32,
+ }
+
+ fn extract_version(output: &str) -> Option<Version> {
+     let patterns = [
+         r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?",
+         r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)",
+         r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?",
+         r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?",
+         r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)",
+     ];
+
+     for pattern in &patterns {
+         if let Ok(re) = Regex::new(pattern)
+             && let Some(caps) = re.captures(output)
+         {
+             let major = caps.get(1)?.as_str().parse().ok()?;
+             let minor = caps.get(2)?.as_str().parse().ok()?;
+             let patch = caps.get(3).and_then(|m| m.as_str().parse().ok()).unwrap_or(0);
+
+             return Some(Version { major, minor, patch });
+         }
+     }
+
+     for line in output.lines() {
+         for token in line.split_whitespace() {
+             if let Some(version) = parse_version_token(token) {
+                 return Some(version);
+             }
+         }
+     }
+
+     None
+ }
+
+ fn parse_version_token(token: &str) -> Option<Version> {
+     let parts: Vec<&str> = token.split('.').collect();
+     if parts.len() >= 2
+         && let (Ok(major), Ok(minor)) = (parts[0].parse(), parts[1].parse())
+     {
+         let patch = parts.get(2).and_then(|p| p.parse().ok()).unwrap_or(0);
+         return Some(Version { major, minor, patch });
+     }
+     None
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+
+     #[test]
+     fn test_extract_version_standard_format() {
+         let output = "pandoc 3.1.2";
+         let version = extract_version(output).unwrap();
+         assert_eq!(version.major, 3);
+         assert_eq!(version.minor, 1);
+         assert_eq!(version.patch, 2);
+     }
+
+     #[test]
+     fn test_extract_version_with_parens() {
+         let output = "pandoc (version 2.19.2)";
+         let version = extract_version(output).unwrap();
+         assert_eq!(version.major, 2);
+         assert_eq!(version.minor, 19);
+     }
+
+     #[test]
+     fn test_extract_version_with_exe() {
+         let output = "pandoc.exe 3.0";
+         let version = extract_version(output).unwrap();
+         assert_eq!(version.major, 3);
+         assert_eq!(version.minor, 0);
+     }
+
+     #[test]
+     fn test_extract_version_multiline() {
+         let output = "pandoc 3.1.2\nCopyright (C) 2006-2023 John MacFarlane";
+         let version = extract_version(output).unwrap();
+         assert_eq!(version.major, 3);
+         assert_eq!(version.minor, 1);
+     }
+
+     #[test]
+     fn test_extract_version_no_patch() {
+         let output = "pandoc 2.5";
+         let version = extract_version(output).unwrap();
+         assert_eq!(version.major, 2);
+         assert_eq!(version.minor, 5);
+         assert_eq!(version.patch, 0);
+     }
+
+     #[test]
+     fn test_parse_version_token() {
+         let version = parse_version_token("2.19.2").unwrap();
+         assert_eq!(version.major, 2);
+         assert_eq!(version.minor, 19);
+         assert_eq!(version.patch, 2);
+     }
+
+     #[test]
+     fn test_parse_version_token_no_patch() {
+         let version = parse_version_token("3.1").unwrap();
+         assert_eq!(version.major, 3);
+         assert_eq!(version.minor, 1);
+         assert_eq!(version.patch, 0);
+     }
+
+     #[test]
+     fn test_parse_version_token_invalid() {
+         let version = parse_version_token("abc");
+         assert!(version.is_none());
+     }
+ }
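For context, a minimal standalone sketch of the version-parsing behaviour the new module exercises (illustrative only, not part of the published diff; it assumes only the regex crate and mirrors the first pattern in extract_version above):

    // Hedged sketch: parse "pandoc X.Y.Z" from `pandoc --version` output and
    // enforce the same "version 2 or above" rule as validate_pandoc_version().
    fn main() {
        let output = "pandoc 3.1.2\nCopyright (C) 2006-2023 John MacFarlane";
        // Mirrors the leading pattern in extract_version(); a missing patch defaults to 0.
        let re = regex::Regex::new(r"pandoc(?:\.exe)?\s+(\d+)\.(\d+)(?:\.(\d+))?").unwrap();
        let caps = re.captures(output).expect("no version found");
        let major: u32 = caps[1].parse().unwrap();
        let minor: u32 = caps[2].parse().unwrap();
        let patch: u32 = caps.get(3).map_or(0, |m| m.as_str().parse().unwrap());
        assert!(major >= 2, "Pandoc >= 2 required, found {}.{}.{}", major, minor, patch);
        println!("found pandoc {}.{}.{}", major, minor, patch);
    }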
data/vendor/kreuzberg/src/extractors/archive.rs
@@ -126,13 +126,6 @@ impl Plugin for ZipExtractor

  #[async_trait]
  impl DocumentExtractor for ZipExtractor {
-     #[cfg_attr(feature = "otel", tracing::instrument(
-         skip(self, content, _config),
-         fields(
-             extractor.name = self.name(),
-             content.size_bytes = content.len(),
-         )
-     ))]
      async fn extract_bytes(
          &self,
          content: &[u8],
@@ -204,13 +197,6 @@ impl Plugin for TarExtractor

  #[async_trait]
  impl DocumentExtractor for TarExtractor {
-     #[cfg_attr(feature = "otel", tracing::instrument(
-         skip(self, content, _config),
-         fields(
-             extractor.name = self.name(),
-             content.size_bytes = content.len(),
-         )
-     ))]
      async fn extract_bytes(
          &self,
          content: &[u8],
@@ -287,13 +273,6 @@ impl Plugin for SevenZExtractor

  #[async_trait]
  impl DocumentExtractor for SevenZExtractor {
-     #[cfg_attr(feature = "otel", tracing::instrument(
-         skip(self, content, _config),
-         fields(
-             extractor.name = self.name(),
-             content.size_bytes = content.len(),
-         )
-     ))]
      async fn extract_bytes(
          &self,
          content: &[u8],
data/vendor/kreuzberg/src/extractors/docx.rs
@@ -4,7 +4,7 @@

  use crate::Result;
  use crate::core::config::ExtractionConfig;
- use crate::extraction::{cells_to_markdown, office_metadata};
+ use crate::extraction::office_metadata;
  use crate::plugins::{DocumentExtractor, Plugin};
  use crate::types::{ExtractionResult, Metadata, Table};
  use async_trait::async_trait;
@@ -15,6 +15,7 @@ use std::io::Cursor;
  /// This extractor provides:
  /// - Fast text extraction via streaming XML parsing (~160 MB/s average)
  /// - Comprehensive metadata extraction (core.xml, app.xml, custom.xml)
+ /// - ~400x faster than Pandoc subprocess approach
  pub struct DocxExtractor;

  impl DocxExtractor {
@@ -65,6 +66,7 @@ impl Plugin for DocxExtractor {
  /// # Returns
  /// * `Table` - Converted table with cells and markdown representation
  fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize) -> Table {
+     // Extract cells as 2D vector
      let cells: Vec<Vec<String>> = docx_table
          .rows
          .iter()
@@ -72,6 +74,7 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
              row.cells
                  .iter()
                  .map(|cell| {
+                     // Extract text from all paragraphs in the cell
                      cell.paragraphs
                          .iter()
                          .map(|para| para.to_text())
@@ -84,12 +87,13 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
          })
          .collect();

+     // Generate markdown representation
      let markdown = cells_to_markdown(&cells);

      Table {
          cells,
          markdown,
-         page_number: table_index + 1,
+         page_number: table_index + 1, // 1-indexed
      }
  }

@@ -100,33 +104,82 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
  ///
  /// # Returns
  /// * `String` - Markdown formatted table
+ fn cells_to_markdown(cells: &[Vec<String>]) -> String {
+     if cells.is_empty() {
+         return String::new();
+     }
+
+     let mut markdown = String::new();
+
+     // Determine number of columns from first row
+     let num_cols = cells.first().map(|r| r.len()).unwrap_or(0);
+     if num_cols == 0 {
+         return String::new();
+     }
+
+     // Header row (first row)
+     if let Some(header) = cells.first() {
+         markdown.push_str("| ");
+         for cell in header {
+             // Escape pipe characters in cell content
+             let escaped = cell.replace('|', "\\|");
+             markdown.push_str(&escaped);
+             markdown.push_str(" | ");
+         }
+         markdown.push('\n');
+
+         // Separator row
+         markdown.push('|');
+         for _ in 0..num_cols {
+             markdown.push_str("------|");
+         }
+         markdown.push('\n');
+     }
+
+     // Data rows (skip first row as it's the header)
+     for row in cells.iter().skip(1) {
+         markdown.push_str("| ");
+         for (idx, cell) in row.iter().enumerate() {
+             if idx >= num_cols {
+                 break; // Handle irregular tables
+             }
+             // Escape pipe characters in cell content
+             let escaped = cell.replace('|', "\\|");
+             markdown.push_str(&escaped);
+             markdown.push_str(" | ");
+         }
+         // Pad with empty cells if row is shorter than expected
+         for _ in row.len()..num_cols {
+             markdown.push_str(" | ");
+         }
+         markdown.push('\n');
+     }
+
+     markdown
+ }

  #[async_trait]
  impl DocumentExtractor for DocxExtractor {
-     #[cfg_attr(feature = "otel", tracing::instrument(
-         skip(self, content, _config),
-         fields(
-             extractor.name = self.name(),
-             content.size_bytes = content.len(),
-         )
-     ))]
      async fn extract_bytes(
          &self,
          content: &[u8],
          mime_type: &str,
          _config: &ExtractionConfig,
      ) -> Result<ExtractionResult> {
+         // Parse the DOCX document to extract both text and tables
          let (text, tables) = if crate::core::batch_mode::is_batch_mode() {
+             // Batch mode: Use spawn_blocking for parallelism
              let content_owned = content.to_vec();
-             let span = tracing::Span::current();
              tokio::task::spawn_blocking(move || -> crate::error::Result<(String, Vec<Table>)> {
-                 let _guard = span.entered();
+                 // Parse document structure
                  let cursor = Cursor::new(&content_owned);
                  let doc = docx_lite::parse_document(cursor)
                      .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;

+                 // Extract text
                  let text = doc.extract_text();

+                 // Extract tables
                  let tables: Vec<Table> = doc
                      .tables
                      .iter()
@@ -139,12 +192,15 @@ impl DocumentExtractor for DocxExtractor {
              .await
              .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
          } else {
+             // Single-file mode: Direct extraction (no spawn overhead)
              let cursor = Cursor::new(content);
              let doc = docx_lite::parse_document(cursor)
                  .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;

+             // Extract text
              let text = doc.extract_text();

+             // Extract tables
              let tables: Vec<Table> = doc
                  .tables
                  .iter()
@@ -155,11 +211,11 @@
              (text, tables)
          };

+         // Extract metadata using existing office_metadata module
          let mut archive = if crate::core::batch_mode::is_batch_mode() {
+             // Batch mode: Use spawn_blocking for parallelism
              let content_owned = content.to_vec();
-             let span = tracing::Span::current();
              tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
-                 let _guard = span.entered();
                  let cursor = Cursor::new(content_owned);
                  zip::ZipArchive::new(cursor)
                      .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))
@@ -167,6 +223,8 @@ impl DocumentExtractor for DocxExtractor {
              .await
              .map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
          } else {
+             // Single-file mode: Direct extraction (no spawn overhead)
+             // Note: We still need to clone for ZipArchive type consistency with batch mode
              let content_owned = content.to_vec();
              let cursor = Cursor::new(content_owned);
              zip::ZipArchive::new(cursor)
@@ -175,6 +233,7 @@ impl DocumentExtractor for DocxExtractor {
                  .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
          };
          let mut metadata_map = std::collections::HashMap::new();
+         // Extract core properties (title, creator, dates, keywords, etc.)
          if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
              if let Some(title) = core.title {
                  metadata_map.insert("title".to_string(), serde_json::Value::String(title));
@@ -218,6 +277,7 @@ impl DocumentExtractor for DocxExtractor {
              }
          }

+         // Extract app properties (page count, word count, etc.)
          if let Ok(app) = office_metadata::extract_docx_app_properties(&mut archive) {
              if let Some(pages) = app.pages {
                  metadata_map.insert("page_count".to_string(), serde_json::Value::Number(pages.into()));
@@ -254,6 +314,7 @@ impl DocumentExtractor for DocxExtractor {
              }
          }

+         // Extract custom properties
          if let Ok(custom) = office_metadata::extract_custom_properties(&mut archive) {
              for (key, value) in custom {
                  metadata_map.insert(format!("custom_{}", key), value);
@@ -279,7 +340,7 @@ impl DocumentExtractor for DocxExtractor {
      }

      fn priority(&self) -> i32 {
-         50
+         50 // Higher priority than Pandoc (40) to take precedence
      }
  }

@@ -319,12 +380,61 @@ mod tests {
          assert!(extractor.shutdown().is_ok());
      }

+     #[test]
+     fn test_cells_to_markdown_basic_table() {
+         let cells = vec![
+             vec!["Header1".to_string(), "Header2".to_string()],
+             vec!["Row1Col1".to_string(), "Row1Col2".to_string()],
+             vec!["Row2Col1".to_string(), "Row2Col2".to_string()],
+         ];
+
+         let markdown = cells_to_markdown(&cells);
+
+         assert!(markdown.contains("| Header1 | Header2 |"));
+         assert!(markdown.contains("|------|------|"));
+         assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
+         assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
+     }
+
+     #[test]
+     fn test_cells_to_markdown_empty() {
+         let cells: Vec<Vec<String>> = vec![];
+         let markdown = cells_to_markdown(&cells);
+         assert_eq!(markdown, "");
+     }
+
+     #[test]
+     fn test_cells_to_markdown_escape_pipes() {
+         let cells = vec![vec!["Header".to_string()], vec!["Cell with | pipe".to_string()]];
+
+         let markdown = cells_to_markdown(&cells);
+         assert!(markdown.contains("Cell with \\| pipe"));
+     }
+
+     #[test]
+     fn test_cells_to_markdown_irregular_rows() {
+         let cells = vec![
+             vec!["H1".to_string(), "H2".to_string(), "H3".to_string()],
+             vec!["R1C1".to_string(), "R1C2".to_string()], // Missing third column
+             vec!["R2C1".to_string(), "R2C2".to_string(), "R2C3".to_string()],
+         ];
+
+         let markdown = cells_to_markdown(&cells);
+
+         // Should have 3 columns in header
+         assert!(markdown.contains("| H1 | H2 | H3 |"));
+         // Should pad short rows
+         assert!(markdown.contains("| R1C1 | R1C2 | |"));
+     }
+
      #[test]
      fn test_convert_docx_table_to_table() {
          use docx_lite::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};

+         // Create a simple docx-lite table
          let mut table = DocxTable::new();

+         // Header row
          let mut header_row = TableRow::default();
          let mut cell1 = TableCell::default();
          let mut para1 = Paragraph::new();
@@ -340,6 +450,7 @@ mod tests {

          table.rows.push(header_row);

+         // Data row
          let mut data_row = TableRow::default();
          let mut cell3 = TableCell::default();
          let mut para3 = Paragraph::new();
@@ -355,10 +466,11 @@

          table.rows.push(data_row);

+         // Convert to Kreuzberg Table
          let result = convert_docx_table_to_table(&table, 0);

-         assert_eq!(result.page_number, 1);
-         assert_eq!(result.cells.len(), 2);
+         assert_eq!(result.page_number, 1); // 0 + 1 = 1 (1-indexed)
+         assert_eq!(result.cells.len(), 2); // 2 rows
          assert_eq!(result.cells[0], vec!["Name", "Age"]);
          assert_eq!(result.cells[1], vec!["Alice", "30"]);
          assert!(result.markdown.contains("| Name | Age |"));
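As a reading aid, a self-contained sketch of the table shape the new cells_to_markdown helper emits (this re-derives the format from the function and tests above rather than calling the crate; pipe-escaping and row padding are simplified):

    // Hedged sketch: expected markdown for a 2x2 table, per the assertions in
    // test_cells_to_markdown_basic_table (header row, dashed separator, data rows).
    fn main() {
        let cells = vec![
            vec!["Name".to_string(), "Age".to_string()],
            vec!["Alice".to_string(), "30".to_string()],
        ];
        let header = format!("| {} | ", cells[0].join(" | "));
        let separator = format!("|{}", "------|".repeat(cells[0].len()));
        let row = format!("| {} | ", cells[1].join(" | "));
        // Prints (modulo trailing spaces):
        // | Name | Age |
        // |------|------|
        // | Alice | 30 |
        println!("{}\n{}\n{}", header, separator, row);
    }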
data/vendor/kreuzberg/src/extractors/email.rs
@@ -44,13 +44,6 @@ impl Plugin for EmailExtractor

  #[async_trait]
  impl DocumentExtractor for EmailExtractor {
-     #[cfg_attr(feature = "otel", tracing::instrument(
-         skip(self, content, _config),
-         fields(
-             extractor.name = self.name(),
-             content.size_bytes = content.len(),
-         )
-     ))]
      async fn extract_bytes(
          &self,
          content: &[u8],
@@ -99,13 +92,6 @@
          })
      }

-     #[cfg(feature = "tokio-runtime")]
-     #[cfg_attr(feature = "otel", tracing::instrument(
-         skip(self, path, config),
-         fields(
-             extractor.name = self.name(),
-         )
-     ))]
      async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
          let bytes = tokio::fs::read(path).await?;
          self.extract_bytes(&bytes, mime_type, config).await
data/vendor/kreuzberg/src/extractors/excel.rs
@@ -31,34 +31,46 @@ impl ExcelExtractor {
          let mut tables = Vec::with_capacity(workbook.sheets.len());

          for (sheet_index, sheet) in workbook.sheets.iter().enumerate() {
+             // Skip empty sheets
              if sheet.row_count == 0 || sheet.col_count == 0 {
                  continue;
              }

+             // We need to re-parse the sheet to get structured cell data
+             // The workbook.sheets only contains markdown, not raw cell data
+             // So we'll extract from the markdown table representation
+
+             // Parse cells from markdown
              let lines: Vec<&str> = sheet.markdown.lines().collect();
              let mut cells: Vec<Vec<String>> = Vec::new();

+             // Find the table content (skip header line "## Sheet Name" and blank line)
              let table_start = lines.iter().position(|line| line.starts_with("| "));

              if let Some(start_idx) = table_start {
                  for line in lines.iter().skip(start_idx) {
                      if line.starts_with("| ") && !line.contains("---") {
+                         // Parse table row
                          let row: Vec<String> = line
                              .trim_start_matches("| ")
                              .trim_end_matches(" |")
                              .split(" | ")
-                             .map(|cell| cell.replace("\\|", "|").replace("\\\\", "\\"))
+                             .map(|cell| {
+                                 // Unescape markdown pipes and backslashes
+                                 cell.replace("\\|", "|").replace("\\\\", "\\")
+                             })
                              .collect();
                          cells.push(row);
                      }
                  }
              }

+             // Only create table if we have data
              if !cells.is_empty() {
                  tables.push(Table {
                      cells,
                      markdown: sheet.markdown.clone(),
-                     page_number: sheet_index + 1,
+                     page_number: sheet_index + 1, // 1-indexed
                  });
              }
          }
@@ -87,13 +99,6 @@

  #[async_trait]
  impl DocumentExtractor for ExcelExtractor {
-     #[cfg_attr(feature = "otel", tracing::instrument(
-         skip(self, content, _config),
-         fields(
-             extractor.name = self.name(),
-             content.size_bytes = content.len(),
-         )
-     ))]
      async fn extract_bytes(
          &self,
          content: &[u8],
@@ -112,17 +117,18 @@ impl DocumentExtractor for ExcelExtractor {
              _ => ".xlsx",
          };

+         // Extract workbook
          let workbook = if crate::core::batch_mode::is_batch_mode() {
+             // Batch mode: Use spawn_blocking for parallelism
              let content_owned = content.to_vec();
              let extension_owned = extension.to_string();
-             let span = tracing::Span::current();
              tokio::task::spawn_blocking(move || {
-                 let _guard = span.entered();
                  crate::extraction::excel::read_excel_bytes(&content_owned, &extension_owned)
              })
              .await
              .map_err(|e| crate::error::KreuzbergError::parsing(format!("Excel extraction task failed: {}", e)))??
          } else {
+             // Single-file mode: Direct extraction (no spawn overhead)
              crate::extraction::excel::read_excel_bytes(content, extension)?
          };

@@ -157,12 +163,6 @@
          })
      }

-     #[cfg_attr(feature = "otel", tracing::instrument(
-         skip(self, path, _config),
-         fields(
-             extractor.name = self.name(),
-         )
-     ))]
      async fn extract_file(&self, path: &Path, mime_type: &str, _config: &ExtractionConfig) -> Result<ExtractionResult> {
          let path_str = path
              .to_str()
@@ -244,6 +244,7 @@ mod tests {
          use crate::types::ExcelSheet;
          use std::collections::HashMap;

+         // Create a mock workbook with a single sheet
          let sheet = ExcelSheet {
              name: "TestSheet".to_string(),
              markdown: r#"## TestSheet
@@ -268,7 +269,7 @@

          assert_eq!(tables.len(), 1);
          assert_eq!(tables[0].page_number, 1);
-         assert_eq!(tables[0].cells.len(), 3);
+         assert_eq!(tables[0].cells.len(), 3); // Header + 2 data rows
          assert_eq!(tables[0].cells[0], vec!["Name", "Age", "City"]);
          assert_eq!(tables[0].cells[1], vec!["Alice", "30", "NYC"]);
          assert_eq!(tables[0].cells[2], vec!["Bob", "25", "LA"]);
@@ -293,7 +294,7 @@
          };

          let tables = ExcelExtractor::sheets_to_tables(&workbook);
-         assert_eq!(tables.len(), 0);
+         assert_eq!(tables.len(), 0); // Empty sheets should not create tables
      }

      #[test]