kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -5,10 +5,9 @@ use crate::core::config::ExtractionConfig;
5
5
  use crate::plugins::{DocumentExtractor, Plugin};
6
6
  use crate::types::{ExtractionResult, Metadata, Table};
7
7
  use async_trait::async_trait;
8
+ use scraper::{Html, Selector};
8
9
  use std::path::Path;
9
10
 
10
- // NOTE: scraper dependency has been removed in favor of html-to-markdown-rs
11
-
12
11
  /// HTML document extractor using html-to-markdown.
13
12
  pub struct HtmlExtractor;
14
13
 
@@ -24,152 +23,134 @@ impl HtmlExtractor {
24
23
  }
25
24
  }
26
25
 
27
- /// Extract all tables from HTML content using html-to-markdown-rs.
28
- ///
29
- /// Uses html-to-markdown-rs to convert HTML to Markdown, which preserves
30
- /// table structure in markdown format. Tables are then parsed from the
31
- /// resulting markdown to maintain compatibility with existing Table API.
26
+ /// Extract all tables from HTML content.
32
27
  ///
33
- /// This approach eliminates the need for the `scraper` dependency as
34
- /// html-to-markdown-rs already handles all table parsing.
28
+ /// Parses HTML to find `<table>` elements and extracts their structure
29
+ /// into `Table` objects with cells and markdown representation.
35
30
  fn extract_html_tables(html: &str) -> Result<Vec<Table>> {
36
- let markdown = crate::extraction::html::convert_html_to_markdown(html, None)?;
31
+ let document = Html::parse_document(html);
32
+ let table_selector = Selector::parse("table")
33
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse table selector: {}", e)))?;
34
+ let row_selector = Selector::parse("tr")
35
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse row selector: {}", e)))?;
36
+ let header_selector = Selector::parse("th")
37
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse header selector: {}", e)))?;
38
+ let cell_selector = Selector::parse("td")
39
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse cell selector: {}", e)))?;
37
40
 
38
- let tables = parse_markdown_tables(&markdown);
41
+ let mut tables = Vec::new();
39
42
 
40
- Ok(tables)
41
- }
43
+ for (table_index, table_elem) in document.select(&table_selector).enumerate() {
44
+ let mut cells: Vec<Vec<String>> = Vec::new();
45
+
46
+ for row in table_elem.select(&row_selector) {
47
+ let mut row_cells = Vec::new();
48
+
49
+ // Try headers first (th elements)
50
+ let headers: Vec<_> = row.select(&header_selector).collect();
51
+ if !headers.is_empty() {
52
+ for header in headers {
53
+ let text = header
54
+ .text()
55
+ .collect::<Vec<_>>()
56
+ .join(" ")
57
+ .split_whitespace()
58
+ .collect::<Vec<_>>()
59
+ .join(" ");
60
+ row_cells.push(text);
61
+ }
62
+ } else {
63
+ // Use data cells (td elements)
64
+ for cell in row.select(&cell_selector) {
65
+ let text = cell
66
+ .text()
67
+ .collect::<Vec<_>>()
68
+ .join(" ")
69
+ .split_whitespace()
70
+ .collect::<Vec<_>>()
71
+ .join(" ");
72
+ row_cells.push(text);
73
+ }
74
+ }
42
75
 
43
- /// Parse markdown tables from HTML-converted markdown.
44
- ///
45
- /// Extracts table data from markdown pipe-delimited format.
46
- /// This maintains the existing Table structure API.
47
- fn parse_markdown_tables(markdown: &str) -> Vec<Table> {
48
- let mut tables = Vec::new();
49
- let mut table_index = 0;
50
- let lines: Vec<&str> = markdown.lines().collect();
51
- let mut i = 0;
52
-
53
- while i < lines.len() {
54
- if lines[i].trim_start().starts_with('|')
55
- && let Some((cells, end_idx)) = extract_markdown_table(&lines, i)
56
- && !cells.is_empty()
57
- {
58
- let markdown_table = reconstruct_markdown_table(&cells);
76
+ if !row_cells.is_empty() {
77
+ cells.push(row_cells);
78
+ }
79
+ }
80
+
81
+ // Only create a table if it has content
82
+ if !cells.is_empty() {
83
+ let markdown = cells_to_markdown(&cells);
59
84
  tables.push(Table {
60
85
  cells,
61
- markdown: markdown_table,
62
- page_number: table_index + 1,
86
+ markdown,
87
+ page_number: table_index + 1, // 1-indexed
63
88
  });
64
- table_index += 1;
65
- i = end_idx;
66
- continue;
67
89
  }
68
- i += 1;
69
90
  }
70
91
 
71
- tables
92
+ Ok(tables)
72
93
  }
73
94
 
74
- /// Extract a single markdown table from lines.
95
+ /// Convert table cells to markdown format.
75
96
  ///
76
- /// Returns the parsed table cells and the index after the table ends.
77
- fn extract_markdown_table(lines: &[&str], start_idx: usize) -> Option<(Vec<Vec<String>>, usize)> {
78
- let header_line = lines.get(start_idx)?;
79
-
80
- if !header_line.trim_start().starts_with('|') {
81
- return None;
82
- }
83
-
84
- let mut cells = Vec::new();
85
- let mut i = start_idx;
86
-
87
- if let Some(header_cells) = parse_markdown_table_row(header_line) {
88
- cells.push(header_cells);
89
- i += 1;
90
- } else {
91
- return None;
92
- }
93
-
94
- if i < lines.len() {
95
- let sep_line = lines[i];
96
- if is_markdown_table_separator(sep_line) {
97
- i += 1;
98
- }
99
- }
100
-
101
- while i < lines.len() {
102
- let line = lines[i];
103
- if let Some(row_cells) = parse_markdown_table_row(line) {
104
- cells.push(row_cells);
105
- i += 1;
106
- } else if !line.trim_start().starts_with('|') {
107
- break;
108
- } else {
109
- i += 1;
110
- }
111
- }
112
-
113
- if cells.len() > 1 { Some((cells, i)) } else { None }
114
- }
115
-
116
- /// Parse a single markdown table row into cell contents.
117
- fn parse_markdown_table_row(line: &str) -> Option<Vec<String>> {
118
- let trimmed = line.trim_start();
119
-
120
- if !trimmed.starts_with('|') || !trimmed.contains('|') {
121
- return None;
122
- }
123
-
124
- let cells: Vec<String> = trimmed
125
- .split('|')
126
- .skip(1)
127
- .map(|cell| cell.trim().to_string())
128
- .filter(|cell| !cell.is_empty())
129
- .collect();
130
-
131
- if cells.is_empty() { None } else { Some(cells) }
132
- }
133
-
134
- /// Check if a line is a markdown table separator.
135
- fn is_markdown_table_separator(line: &str) -> bool {
136
- let trimmed = line.trim_start();
137
- if !trimmed.starts_with('|') {
138
- return false;
139
- }
140
-
141
- trimmed
142
- .split('|')
143
- .all(|cell| cell.trim().chars().all(|c| c == '-' || c == ':' || c.is_whitespace()))
144
- }
145
-
146
- /// Reconstruct markdown table from cells.
97
+ /// Reuses the same logic as DOCX extractor for consistency.
98
+ /// First row is treated as header, remaining rows as data.
147
99
  ///
148
- /// Takes parsed table cells and creates a properly formatted markdown table string.
149
- fn reconstruct_markdown_table(cells: &[Vec<String>]) -> String {
100
+ /// # Arguments
101
+ /// * `cells` - 2D vector of cell strings (rows × columns)
102
+ ///
103
+ /// # Returns
104
+ /// * `String` - Markdown formatted table
105
+ fn cells_to_markdown(cells: &[Vec<String>]) -> String {
150
106
  if cells.is_empty() {
151
107
  return String::new();
152
108
  }
153
109
 
154
110
  let mut markdown = String::new();
155
111
 
156
- for (row_idx, row) in cells.iter().enumerate() {
112
+ // Determine number of columns from first row
113
+ let num_cols = cells.first().map(|r| r.len()).unwrap_or(0);
114
+ if num_cols == 0 {
115
+ return String::new();
116
+ }
117
+
118
+ // Header row (first row)
119
+ if let Some(header) = cells.first() {
120
+ markdown.push_str("| ");
121
+ for cell in header {
122
+ // Escape pipe characters in cell content
123
+ let escaped = cell.replace('|', "\\|");
124
+ markdown.push_str(&escaped);
125
+ markdown.push_str(" | ");
126
+ }
127
+ markdown.push('\n');
128
+
129
+ // Separator row
157
130
  markdown.push('|');
158
- for cell in row {
159
- markdown.push(' ');
160
- markdown.push_str(cell);
161
- markdown.push(' ');
162
- markdown.push('|');
131
+ for _ in 0..num_cols {
132
+ markdown.push_str("------|");
163
133
  }
164
134
  markdown.push('\n');
135
+ }
165
136
 
166
- if row_idx == 0 {
167
- markdown.push('|');
168
- for _ in row {
169
- markdown.push_str("------|");
137
+ // Data rows (skip first row as it's the header)
138
+ for row in cells.iter().skip(1) {
139
+ markdown.push_str("| ");
140
+ for (idx, cell) in row.iter().enumerate() {
141
+ if idx >= num_cols {
142
+ break; // Handle irregular tables
170
143
  }
171
- markdown.push('\n');
144
+ // Escape pipe characters in cell content
145
+ let escaped = cell.replace('|', "\\|");
146
+ markdown.push_str(&escaped);
147
+ markdown.push_str(" | ");
148
+ }
149
+ // Pad with empty cells if row is shorter than expected
150
+ for _ in row.len()..num_cols {
151
+ markdown.push_str(" | ");
172
152
  }
153
+ markdown.push('\n');
173
154
  }
174
155
 
175
156
  markdown
@@ -195,13 +176,6 @@ impl Plugin for HtmlExtractor {
195
176
 
196
177
  #[async_trait]
197
178
  impl DocumentExtractor for HtmlExtractor {
198
- #[cfg_attr(feature = "otel", tracing::instrument(
199
- skip(self, content, config),
200
- fields(
201
- extractor.name = self.name(),
202
- content.size_bytes = content.len(),
203
- )
204
- ))]
205
179
  async fn extract_bytes(
206
180
  &self,
207
181
  content: &[u8],
@@ -212,6 +186,7 @@ impl DocumentExtractor for HtmlExtractor {
212
186
  .map(|s| s.to_string())
213
187
  .unwrap_or_else(|_| String::from_utf8_lossy(content).to_string());
214
188
 
189
+ // Extract tables from HTML
215
190
  let tables = extract_html_tables(&html)?;
216
191
 
217
192
  let markdown = crate::extraction::html::convert_html_to_markdown(&html, config.html_options.clone())?;
@@ -232,13 +207,6 @@ impl DocumentExtractor for HtmlExtractor {
232
207
  })
233
208
  }
234
209
 
235
- #[cfg(feature = "tokio-runtime")]
236
- #[cfg_attr(feature = "otel", tracing::instrument(
237
- skip(self, path, config),
238
- fields(
239
- extractor.name = self.name(),
240
- )
241
- ))]
242
210
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
243
211
  let bytes = tokio::fs::read(path).await?;
244
212
  self.extract_bytes(&bytes, mime_type, config).await
@@ -294,6 +262,7 @@ mod tests {
294
262
  assert_eq!(table.cells[2], vec!["Row2Col1", "Row2Col2"]);
295
263
  assert_eq!(table.page_number, 1);
296
264
 
265
+ // Check markdown format
297
266
  assert!(table.markdown.contains("| Header1 | Header2 |"));
298
267
  assert!(table.markdown.contains("|------|------|"));
299
268
  assert!(table.markdown.contains("| Row1Col1 | Row1Col2 |"));
@@ -357,8 +326,56 @@ mod tests {
357
326
  assert_eq!(tables.len(), 1);
358
327
 
359
328
  let table = &tables[0];
360
- assert_eq!(table.cells[0][0], "Header **Bold**");
361
- assert_eq!(table.cells[1][0], "Data with *emphasis*");
329
+ // Whitespace is normalized during text extraction
330
+ assert_eq!(table.cells[0][0], "Header Bold");
331
+ assert_eq!(table.cells[1][0], "Data with emphasis");
332
+ }
333
+
334
+ #[test]
335
+ fn test_cells_to_markdown_basic() {
336
+ let cells = vec![
337
+ vec!["Header1".to_string(), "Header2".to_string()],
338
+ vec!["Row1Col1".to_string(), "Row1Col2".to_string()],
339
+ vec!["Row2Col1".to_string(), "Row2Col2".to_string()],
340
+ ];
341
+
342
+ let markdown = cells_to_markdown(&cells);
343
+
344
+ assert!(markdown.contains("| Header1 | Header2 |"));
345
+ assert!(markdown.contains("|------|------|"));
346
+ assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
347
+ assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
348
+ }
349
+
350
+ #[test]
351
+ fn test_cells_to_markdown_empty() {
352
+ let cells: Vec<Vec<String>> = vec![];
353
+ let markdown = cells_to_markdown(&cells);
354
+ assert_eq!(markdown, "");
355
+ }
356
+
357
+ #[test]
358
+ fn test_cells_to_markdown_escape_pipes() {
359
+ let cells = vec![vec!["Header".to_string()], vec!["Cell with | pipe".to_string()]];
360
+
361
+ let markdown = cells_to_markdown(&cells);
362
+ assert!(markdown.contains("Cell with \\| pipe"));
363
+ }
364
+
365
+ #[test]
366
+ fn test_cells_to_markdown_irregular_rows() {
367
+ let cells = vec![
368
+ vec!["H1".to_string(), "H2".to_string(), "H3".to_string()],
369
+ vec!["R1C1".to_string(), "R1C2".to_string()], // Missing third column
370
+ vec!["R2C1".to_string(), "R2C2".to_string(), "R2C3".to_string()],
371
+ ];
372
+
373
+ let markdown = cells_to_markdown(&cells);
374
+
375
+ // Should have 3 columns in header
376
+ assert!(markdown.contains("| H1 | H2 | H3 |"));
377
+ // Should pad short rows
378
+ assert!(markdown.contains("| R1C1 | R1C2 | |"));
362
379
  }
363
380
 
364
381
  #[tokio::test]
@@ -39,6 +39,7 @@ impl ImageExtractor {
39
39
  registry.get(&ocr_config.backend)?
40
40
  };
41
41
 
42
+ // Process image using the backend - returns full ExtractionResult with tables/metadata
42
43
  backend.process_image(content, ocr_config).await
43
44
  }
44
45
  }
@@ -77,13 +78,6 @@ impl Plugin for ImageExtractor {
77
78
 
78
79
  #[async_trait]
79
80
  impl DocumentExtractor for ImageExtractor {
80
- #[cfg_attr(feature = "otel", tracing::instrument(
81
- skip(self, content, config),
82
- fields(
83
- extractor.name = self.name(),
84
- content.size_bytes = content.len(),
85
- )
86
- ))]
87
81
  async fn extract_bytes(
88
82
  &self,
89
83
  content: &[u8],
@@ -99,11 +93,13 @@ impl DocumentExtractor for ImageExtractor {
99
93
  exif: extraction_metadata.exif_data,
100
94
  };
101
95
 
96
+ // If OCR is enabled, use OCR result (which includes tables and OCR-specific metadata)
102
97
  if config.ocr.is_some() {
103
98
  #[cfg(feature = "ocr")]
104
99
  {
105
100
  let mut ocr_result = self.extract_with_ocr(content, config).await?;
106
101
 
102
+ // Add image metadata to the OCR result
107
103
  ocr_result.metadata.format = Some(crate::types::FormatMetadata::Image(image_metadata));
108
104
  ocr_result.mime_type = mime_type.to_string();
109
105
 
@@ -131,6 +127,7 @@ impl DocumentExtractor for ImageExtractor {
131
127
  }
132
128
  }
133
129
 
130
+ // No OCR - just return image dimensions
134
131
  Ok(ExtractionResult {
135
132
  content: format!(
136
133
  "Image: {} {}x{}",
@@ -8,7 +8,6 @@ use crate::plugins::registry::get_document_extractor_registry;
8
8
  use once_cell::sync::Lazy;
9
9
  use std::sync::Arc;
10
10
 
11
- pub mod security;
12
11
  pub mod structured;
13
12
  pub mod text;
14
13
 
@@ -27,44 +26,11 @@ pub mod excel;
27
26
  #[cfg(feature = "html")]
28
27
  pub mod html;
29
28
 
30
- #[cfg(feature = "office")]
31
- pub mod bibtex;
32
-
33
29
  #[cfg(feature = "office")]
34
30
  pub mod docx;
35
31
 
36
32
  #[cfg(feature = "office")]
37
- pub mod epub;
38
-
39
- #[cfg(feature = "office")]
40
- pub mod fictionbook;
41
-
42
- #[cfg(feature = "office")]
43
- pub mod markdown;
44
-
45
- #[cfg(feature = "office")]
46
- pub mod rst;
47
-
48
- #[cfg(feature = "office")]
49
- pub mod latex;
50
-
51
- #[cfg(feature = "office")]
52
- pub mod jupyter;
53
-
54
- #[cfg(feature = "office")]
55
- pub mod orgmode;
56
-
57
- #[cfg(feature = "office")]
58
- pub mod odt;
59
-
60
- #[cfg(feature = "office")]
61
- pub mod opml;
62
-
63
- #[cfg(feature = "office")]
64
- pub mod typst;
65
-
66
- #[cfg(feature = "xml")]
67
- pub mod jats;
33
+ pub mod pandoc;
68
34
 
69
35
  #[cfg(feature = "pdf")]
70
36
  pub mod pdf;
@@ -72,15 +38,9 @@ pub mod pdf;
72
38
  #[cfg(feature = "office")]
73
39
  pub mod pptx;
74
40
 
75
- #[cfg(feature = "office")]
76
- pub mod rtf;
77
-
78
41
  #[cfg(feature = "xml")]
79
42
  pub mod xml;
80
43
 
81
- #[cfg(feature = "xml")]
82
- pub mod docbook;
83
-
84
44
  pub use structured::StructuredExtractor;
85
45
  pub use text::{MarkdownExtractor, PlainTextExtractor};
86
46
 
@@ -99,44 +59,11 @@ pub use excel::ExcelExtractor;
99
59
  #[cfg(feature = "html")]
100
60
  pub use html::HtmlExtractor;
101
61
 
102
- #[cfg(feature = "office")]
103
- pub use bibtex::BibtexExtractor;
104
-
105
62
  #[cfg(feature = "office")]
106
63
  pub use docx::DocxExtractor;
107
64
 
108
65
  #[cfg(feature = "office")]
109
- pub use epub::EpubExtractor;
110
-
111
- #[cfg(feature = "office")]
112
- pub use fictionbook::FictionBookExtractor;
113
-
114
- #[cfg(feature = "office")]
115
- pub use markdown::MarkdownExtractor as EnhancedMarkdownExtractor;
116
-
117
- #[cfg(feature = "office")]
118
- pub use rst::RstExtractor;
119
-
120
- #[cfg(feature = "office")]
121
- pub use latex::LatexExtractor;
122
-
123
- #[cfg(feature = "office")]
124
- pub use jupyter::JupyterExtractor;
125
-
126
- #[cfg(feature = "office")]
127
- pub use orgmode::OrgModeExtractor;
128
-
129
- #[cfg(feature = "office")]
130
- pub use odt::OdtExtractor;
131
-
132
- #[cfg(feature = "xml")]
133
- pub use jats::JatsExtractor;
134
-
135
- #[cfg(feature = "office")]
136
- pub use opml::OpmlExtractor;
137
-
138
- #[cfg(feature = "office")]
139
- pub use typst::TypstExtractor;
66
+ pub use pandoc::PandocExtractor;
140
67
 
141
68
  #[cfg(feature = "pdf")]
142
69
  pub use pdf::PdfExtractor;
@@ -144,15 +71,9 @@ pub use pdf::PdfExtractor;
144
71
  #[cfg(feature = "office")]
145
72
  pub use pptx::PptxExtractor;
146
73
 
147
- #[cfg(feature = "office")]
148
- pub use rtf::RtfExtractor;
149
-
150
74
  #[cfg(feature = "xml")]
151
75
  pub use xml::XmlExtractor;
152
76
 
153
- #[cfg(feature = "xml")]
154
- pub use docbook::DocbookExtractor;
155
-
156
77
  /// Lazy-initialized flag that ensures extractors are registered exactly once.
157
78
  ///
158
79
  /// This static is accessed on first extraction operation to automatically
@@ -165,6 +86,7 @@ static EXTRACTORS_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_default_ext
165
86
  /// It's safe to call multiple times - registration only happens once,
166
87
  /// unless the registry was cleared, in which case extractors are re-registered.
167
88
  pub fn ensure_initialized() -> Result<()> {
89
+ // First, try the lazy initialization
168
90
  EXTRACTORS_INITIALIZED
169
91
  .as_ref()
170
92
  .map(|_| ())
@@ -173,12 +95,15 @@ pub fn ensure_initialized() -> Result<()> {
173
95
  plugin_name: "built-in-extractors".to_string(),
174
96
  })?;
175
97
 
98
+ // Check if registry is empty (e.g., after clear_document_extractors)
99
+ // If so, re-register the default extractors
176
100
  let registry = get_document_extractor_registry();
177
101
  let registry_guard = registry
178
102
  .read()
179
103
  .map_err(|e| crate::KreuzbergError::Other(format!("Document extractor registry lock poisoned: {}", e)))?;
180
104
 
181
105
  if registry_guard.list().is_empty() {
106
+ // Drop read lock before acquiring write lock
182
107
  drop(registry_guard);
183
108
  register_default_extractors()?;
184
109
  }
@@ -228,20 +153,9 @@ pub fn register_default_extractors() -> Result<()> {
228
153
 
229
154
  #[cfg(feature = "office")]
230
155
  {
231
- registry.register(Arc::new(EnhancedMarkdownExtractor::new()))?;
232
- registry.register(Arc::new(BibtexExtractor::new()))?;
233
156
  registry.register(Arc::new(DocxExtractor::new()))?;
234
- registry.register(Arc::new(EpubExtractor::new()))?;
235
- registry.register(Arc::new(FictionBookExtractor::new()))?;
236
157
  registry.register(Arc::new(PptxExtractor::new()))?;
237
- registry.register(Arc::new(OdtExtractor::new()))?;
238
- registry.register(Arc::new(RtfExtractor::new()))?;
239
- registry.register(Arc::new(RstExtractor::new()))?;
240
- registry.register(Arc::new(LatexExtractor::new()))?;
241
- registry.register(Arc::new(JupyterExtractor::new()))?;
242
- registry.register(Arc::new(OrgModeExtractor::new()))?;
243
- registry.register(Arc::new(OpmlExtractor::new()))?;
244
- registry.register(Arc::new(TypstExtractor::new()))?;
158
+ registry.register(Arc::new(PandocExtractor::new()))?;
245
159
  }
246
160
 
247
161
  #[cfg(feature = "email")]
@@ -313,21 +227,10 @@ mod tests {
313
227
 
314
228
  #[cfg(feature = "office")]
315
229
  {
316
- expected_count += 13;
317
- assert!(extractor_names.contains(&"markdown-extractor".to_string()));
318
- assert!(extractor_names.contains(&"bibtex-extractor".to_string()));
230
+ expected_count += 3;
319
231
  assert!(extractor_names.contains(&"docx-extractor".to_string()));
320
- assert!(extractor_names.contains(&"epub-extractor".to_string()));
321
- assert!(extractor_names.contains(&"fictionbook-extractor".to_string()));
322
232
  assert!(extractor_names.contains(&"pptx-extractor".to_string()));
323
- assert!(extractor_names.contains(&"odt-extractor".to_string()));
324
- assert!(extractor_names.contains(&"rtf-extractor".to_string()));
325
- assert!(extractor_names.contains(&"rst-extractor".to_string()));
326
- assert!(extractor_names.contains(&"latex-extractor".to_string()));
327
- assert!(extractor_names.contains(&"jupyter-extractor".to_string()));
328
- assert!(extractor_names.contains(&"orgmode-extractor".to_string()));
329
- assert!(extractor_names.contains(&"opml-extractor".to_string()));
330
- assert!(extractor_names.contains(&"typst-extractor".to_string()));
233
+ assert!(extractor_names.contains(&"pandoc-extractor".to_string()));
331
234
  }
332
235
 
333
236
  #[cfg(feature = "email")]