kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
data/vendor/kreuzberg/src/pdf/table.rs
@@ -4,7 +4,7 @@
  //! allowing us to reuse the existing table reconstruction logic.

  use super::error::{PdfError, Result};
- use html_to_markdown_rs::hocr::HocrWord;
+ use crate::ocr::table::HocrWord;
  use pdfium_render::prelude::*;

  /// Spacing threshold for word boundary detection (in PDF units).
@@ -35,24 +35,25 @@ const MIN_WORD_LENGTH: usize = 1;
  /// use kreuzberg::pdf::table::extract_words_from_page;
  /// use pdfium_render::prelude::*;
  ///
- /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
  /// let pdfium = Pdfium::default();
  /// let document = pdfium.load_pdf_from_file("example.pdf", None)?;
  /// let page = document.pages().get(0)?;
  /// let words = extract_words_from_page(&page, 90.0)?;
- /// # Ok(())
- /// # }
  /// ```
  pub fn extract_words_from_page(page: &PdfPage, min_confidence: f64) -> Result<Vec<HocrWord>> {
+ // Get page dimensions for coordinate system
  let page_width = page.width().value as i32;
  let page_height = page.height().value as i32;

+ // Get all text from page
  let page_text = page
  .text()
  .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get page text: {}", e)))?;

+ // Extract character-level information
  let chars = page_text.chars();

+ // Group characters into words based on spacing
  let words = group_chars_into_words(chars, page_width, page_height, min_confidence)?;

  Ok(words)
@@ -90,22 +91,26 @@ fn group_chars_into_words(
  let mut current_word_chars: Vec<CharInfo> = Vec::new();

  for pdf_char in chars.iter() {
+ // Get character bounds (use loose_bounds for table detection)
  let bounds = pdf_char
  .loose_bounds()
  .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get char bounds: {}", e)))?;

+ // Get unicode character (skip if invalid)
  let Some(ch) = pdf_char.unicode_char() else {
  continue;
  };

+ // Extract character information
  let char_info = CharInfo {
  text: ch,
  x: bounds.left().value,
- y: bounds.bottom().value,
+ y: bounds.bottom().value, // PDF coordinates: bottom-left origin
  width: bounds.width().value,
  height: bounds.height().value,
  };

+ // Skip whitespace characters (they're used for word boundaries)
  if char_info.text.is_whitespace() {
  if !current_word_chars.is_empty() {
  if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
@@ -116,6 +121,7 @@ fn group_chars_into_words(
  continue;
  }

+ // Check if this character should start a new word
  if should_start_new_word(&current_word_chars, &char_info) && !current_word_chars.is_empty() {
  if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
  words.push(word);
@@ -126,10 +132,11 @@
  current_word_chars.push(char_info);
  }

- if !current_word_chars.is_empty()
- && let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence)
- {
- words.push(word);
+ // Finalize last word
+ if !current_word_chars.is_empty() {
+ if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
+ words.push(word);
+ }
  }

  Ok(words)
@@ -146,11 +153,13 @@ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -

  let last_char = &current_word_chars[current_word_chars.len() - 1];

+ // Check vertical distance (different lines)
  let vertical_distance = (new_char.y - last_char.y).abs();
  if vertical_distance > last_char.height * 0.5 {
  return true;
  }

+ // Check horizontal distance (word spacing)
  let horizontal_gap = new_char.x - (last_char.x + last_char.width);
  horizontal_gap > WORD_SPACING_THRESHOLD
  }
@@ -164,12 +173,14 @@ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> O
  return None;
  }

+ // Build word text
  let text: String = chars.iter().map(|c| c.text).collect();

  if text.len() < MIN_WORD_LENGTH {
  return None;
  }

+ // Calculate bounding box (encompassing all characters)
  let left = chars
  .iter()
  .map(|c| c.x)
@@ -194,10 +205,14 @@ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> O
  let width = (right - left).round() as i32;
  let height = (top - bottom).round() as i32;

+ // Convert PDF coordinates (bottom-left origin) to image coordinates (top-left origin)
+ // HocrWord expects top-left origin like images/OCR output
  let top_in_image_coords = (page_height as f32 - top).round() as i32;

+ // PDF text has high confidence (no OCR uncertainty)
  let confidence = 95.0;

+ // Apply confidence threshold
  if confidence < min_confidence {
  return None;
  }
@@ -255,18 +270,20 @@ mod tests {
  height: 12.0,
  }];

+ // Close character - same word
  let close_char = CharInfo {
  text: 'B',
- x: 111.0,
+ x: 111.0, // 1 unit gap
  y: 50.0,
  width: 10.0,
  height: 12.0,
  };
  assert!(!should_start_new_word(&chars, &close_char));

+ // Far character - new word
  let far_char = CharInfo {
  text: 'C',
- x: 120.0,
+ x: 120.0, // 10 unit gap (> WORD_SPACING_THRESHOLD)
  y: 50.0,
  width: 10.0,
  height: 12.0,
@@ -284,10 +301,11 @@ mod tests {
  height: 12.0,
  }];

+ // Character on different line
  let new_line_char = CharInfo {
  text: 'B',
  x: 100.0,
- y: 70.0,
+ y: 70.0, // Different y
  width: 10.0,
  height: 12.0,
  };
@@ -318,7 +336,7 @@ mod tests {

  assert_eq!(word.text, "Hi");
  assert_eq!(word.left, 100);
- assert_eq!(word.width, 18);
+ assert_eq!(word.width, 18); // 110 + 8 - 100
  assert_eq!(word.height, 12);
  assert_eq!(word.confidence, 95.0);
  }
@@ -340,19 +358,22 @@ mod tests {
  height: 12.0,
  }];

+ // Low threshold - should pass
  let word = finalize_word(&chars, 800, 90.0);
  assert!(word.is_some());

+ // High threshold - should fail
  let word = finalize_word(&chars, 800, 96.0);
  assert!(word.is_none());
  }

  #[test]
  fn test_coordinate_conversion() {
+ // Test PDF coordinate (bottom-left origin) to image coordinate (top-left origin)
  let chars = vec![CharInfo {
  text: 'A',
  x: 100.0,
- y: 700.0,
+ y: 700.0, // PDF coordinates: bottom-left origin
  width: 10.0,
  height: 12.0,
  }];
@@ -360,11 +381,13 @@ mod tests {
  let page_height = 800;
  let word = finalize_word(&chars, page_height, 0.0).unwrap();

+ // top_in_image_coords = page_height - (y + height) = 800 - (700 + 12) = 88
  assert_eq!(word.top, 88);
  }

  #[test]
  fn test_word_bounding_box() {
+ // Test that bounding box encompasses all characters
  let chars = vec![
  CharInfo {
  text: 'A',
@@ -376,18 +399,22 @@ mod tests {
  CharInfo {
  text: 'B',
  x: 110.0,
- y: 51.0,
+ y: 51.0, // Slightly different y
  width: 10.0,
- height: 13.0,
+ height: 13.0, // Slightly different height
  },
  ];

  let word = finalize_word(&chars, 800, 0.0).unwrap();

+ // Left should be minimum x
  assert_eq!(word.left, 100);

- assert_eq!(word.width, 20);
+ // Width should span from leftmost to rightmost character
+ assert_eq!(word.width, 20); // 120 - 100

+ // Height should encompass both characters
+ // max(y+height) - min(y) = max(51+13, 50+12) - 50 = 64 - 50 = 14
  assert_eq!(word.height, 14);
  }
  }
data/vendor/kreuzberg/src/pdf/text.rs
@@ -92,6 +92,8 @@ pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]
  pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<String> {
  let page_count = document.pages().len() as usize;

+ // Pre-allocate capacity based on estimated page size (average 2KB per page)
+ // This reduces memory reallocations during string concatenation
  let estimated_size = page_count * 2048;
  let mut content = String::with_capacity(estimated_size);

@@ -108,6 +110,7 @@ pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<Stri
  content.push_str(&page_text);
  }

+ // Shrink to actual size to free unused capacity
  content.shrink_to_fit();

  Ok(content)
data/vendor/kreuzberg/src/plugins/extractor.rs
@@ -361,6 +361,8 @@ pub trait DocumentExtractor: Plugin {
  }
  }

+ // Public registration APIs
+
  /// Register a document extractor with the global registry.
  ///
  /// The extractor will be registered for all MIME types it supports and will be
@@ -536,7 +538,6 @@ pub fn clear_extractors() -> crate::Result<()> {
  #[cfg(test)]
  mod tests {
  use super::*;
- use serial_test::serial;

  struct MockExtractor {
  mime_types: Vec<&'static str>,
@@ -829,8 +830,9 @@ mod tests {
  assert_eq!(result.mime_type, "application/json");
  }

+ // Tests for public registration APIs
+
  #[test]
- #[serial]
  fn test_register_extractor() {
  use std::sync::Arc;

@@ -845,7 +847,6 @@ mod tests {
  }

  #[test]
- #[serial]
  fn test_unregister_extractor() {
  use std::sync::Arc;

@@ -860,14 +861,12 @@ mod tests {
  }

  #[test]
- #[serial]
  fn test_unregister_nonexistent_extractor() {
  let result = super::unregister_extractor("nonexistent-extractor-xyz");
  assert!(result.is_ok());
  }

  #[test]
- #[serial]
  fn test_list_extractors() {
  use std::sync::Arc;

@@ -889,6 +888,7 @@
  super::register_extractor(extractor2).unwrap();

  let list = super::list_extractors().unwrap();
+ // Both extractors have the same name, so only one will be registered
  assert_eq!(list.len(), 1);
  assert!(list.contains(&"mock-extractor".to_string()));

@@ -896,7 +896,6 @@
  }

  #[test]
- #[serial]
  fn test_clear_extractors() {
  use std::sync::Arc;

@@ -922,7 +921,6 @@
  }

  #[test]
- #[serial]
  fn test_register_extractor_with_invalid_name() {
  use std::sync::Arc;

@@ -967,7 +965,6 @@
  }

  #[test]
- #[serial]
  fn test_register_extractor_with_empty_name() {
  use std::sync::Arc;

data/vendor/kreuzberg/src/plugins/ocr.rs
@@ -81,8 +81,7 @@ pub enum OcrBackendType {
  /// }
  /// }
  /// ```
- #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
- #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
+ #[async_trait]
  pub trait OcrBackend: Plugin {
  /// Process an image and extract text via OCR.
  ///
@@ -253,6 +252,8 @@ pub trait OcrBackend: Plugin {
  }
  }

+ // Public registration APIs
+
  /// Register an OCR backend with the global registry.
  ///
  /// The OCR backend will be registered with its name from the `name()` method
@@ -319,6 +320,8 @@ pub fn register_ocr_backend(backend: Arc<dyn OcrBackend>) -> crate::Result<()> {

  let registry = get_ocr_backend_registry();
  // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
+ // This is a critical runtime error (similar to OOM) that should bubble up
+ // as it indicates the registry is in an inconsistent state.
  let mut registry = registry
  .write()
  .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -354,6 +357,8 @@ pub fn unregister_ocr_backend(name: &str) -> crate::Result<()> {

  let registry = get_ocr_backend_registry();
  // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
+ // This is a critical runtime error (similar to OOM) that should bubble up
+ // as it indicates the registry is in an inconsistent state.
  let mut registry = registry
  .write()
  .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -387,6 +392,8 @@ pub fn list_ocr_backends() -> crate::Result<Vec<String>> {

  let registry = get_ocr_backend_registry();
  // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
+ // This is a critical runtime error (similar to OOM) that should bubble up
+ // as it indicates the registry is in an inconsistent state.
  let registry = registry
  .read()
  .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -418,6 +425,8 @@ pub fn clear_ocr_backends() -> crate::Result<()> {

  let registry = get_ocr_backend_registry();
  // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
+ // This is a critical runtime error (similar to OOM) that should bubble up
+ // as it indicates the registry is in an inconsistent state.
  let mut registry = registry
  .write()
  .expect("OCR backend registry lock poisoned - critical runtime error");
data/vendor/kreuzberg/src/plugins/processor.rs
@@ -105,8 +105,7 @@ pub enum ProcessingStage {
  /// }
  /// }
  /// ```
- #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
- #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
+ #[async_trait]
  pub trait PostProcessor: Plugin {
  /// Process an extraction result.
  ///
data/vendor/kreuzberg/src/plugins/registry.rs
@@ -264,19 +264,10 @@ impl DocumentExtractorRegistry {
  /// # Returns
  ///
  /// The highest priority extractor, or an error if none found.
- #[cfg_attr(feature = "otel", tracing::instrument(
- skip(self),
- fields(
- registry.mime_type = %mime_type,
- registry.found = tracing::field::Empty,
- )
- ))]
  pub fn get(&self, mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
  if let Some(priority_map) = self.extractors.get(mime_type)
  && let Some((_priority, extractor)) = priority_map.iter().next_back()
  {
- #[cfg(feature = "otel")]
- tracing::Span::current().record("registry.found", true);
  return Ok(Arc::clone(extractor));
  }

@@ -302,13 +293,9 @@ impl DocumentExtractorRegistry {
  }

  if let Some((_priority, extractor)) = best_match {
- #[cfg(feature = "otel")]
- tracing::Span::current().record("registry.found", true);
  return Ok(extractor);
  }

- #[cfg(feature = "otel")]
- tracing::Span::current().record("registry.found", false);
  Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
  }

data/vendor/kreuzberg/src/plugins/validator.rs
@@ -68,8 +68,7 @@ use std::sync::Arc;
  /// }
  /// }
  /// ```
- #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
- #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
+ #[async_trait]
  pub trait Validator: Plugin {
  /// Validate an extraction result.
  ///
@@ -276,6 +275,8 @@ pub trait Validator: Plugin {
  }
  }

+ // Public registration APIs
+
  /// Register a validator with the global registry.
  ///
  /// The validator will be registered with its default priority and will be called
@@ -811,8 +812,9 @@ mod tests {
  assert!(validator.validate(&result, &config).await.is_ok());
  }

+ // Tests for public registration APIs
+
  #[test]
- #[serial_test::serial]
  fn test_register_validator() {
  use std::sync::Arc;

@@ -824,7 +826,6 @@ mod tests {
  }

  #[test]
- #[serial_test::serial]
  fn test_unregister_validator() {
  use std::sync::Arc;

@@ -836,20 +837,19 @@ mod tests {
  }

  #[test]
- #[serial_test::serial]
  fn test_unregister_nonexistent_validator() {
  let result = super::unregister_validator("nonexistent-validator-xyz");
  assert!(result.is_ok());
  }

  #[test]
- #[serial_test::serial]
  fn test_list_validators() {
  use std::sync::Arc;

  super::clear_validators().unwrap();

  let validator1 = Arc::new(MockValidator { should_fail: false });
+ // Both validators have the same name, so only one will be registered
  let validator2 = Arc::new(MockValidator { should_fail: false });

  let list_before = super::list_validators().unwrap();
@@ -859,6 +859,7 @@
  super::register_validator(validator2).unwrap();

  let list = super::list_validators().unwrap();
+ // Only 1 validator registered since they have the same name
  assert_eq!(list.len(), 1);
  assert!(list.contains(&"mock-validator".to_string()));

@@ -866,7 +867,6 @@
  }

  #[test]
- #[serial_test::serial]
  fn test_clear_validators() {
  use std::sync::Arc;

@@ -878,6 +878,7 @@
  super::register_validator(validator1).unwrap();
  super::register_validator(validator2).unwrap();

+ // Verify at least one validator is registered
  let list_before = super::list_validators().unwrap();
  assert!(!list_before.is_empty());

@@ -889,7 +890,6 @@
  }

  #[test]
- #[serial_test::serial]
  fn test_register_validator_with_invalid_name() {
  use std::sync::Arc;

@@ -922,7 +922,6 @@
  }

  #[test]
- #[serial_test::serial]
  fn test_register_validator_with_empty_name() {
  use std::sync::Arc;

data/vendor/kreuzberg/src/stopwords/mod.rs
@@ -100,7 +100,7 @@ macro_rules! embed_stopwords {
  panic!(
  "Failed to parse embedded stopwords for language '{}': {}. \
  This indicates corrupted or malformed JSON in the embedded stopwords data. \
- Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
+ Please report this issue at https://github.com/Goldziher/kreuzberg/issues",
  $lang, e
  );
  }
@@ -1437,7 +1437,7 @@ mod tests {
  let duration = start.elapsed();

  assert!(
- duration.as_millis() < 500,
+ duration.as_millis() < 100,
  "30,000 lookups took too long: {:?}",
  duration
  );
data/vendor/kreuzberg/src/types.rs
@@ -844,6 +844,18 @@ pub struct CacheStats {
  pub newest_file_age_days: f64,
  }

+ /// Pandoc extraction result.
+ ///
+ /// Result of extracting content from a document using Pandoc,
+ /// including text and any metadata Pandoc was able to extract.
+ #[derive(Debug, Clone, Serialize, Deserialize)]
+ pub struct PandocExtractionResult {
+ /// Extracted text content
+ pub content: String,
+ /// Metadata extracted by Pandoc (varies by format)
+ pub metadata: HashMap<String, serde_json::Value>,
+ }
+
  /// LibreOffice conversion result.
  ///
  /// Result of converting a legacy office document (e.g., .doc, .ppt)
@@ -859,45 +871,3 @@ pub struct LibreOfficeConversionResult {
  /// Target MIME type after conversion
  pub target_mime: String,
  }
-
- #[cfg(test)]
- mod tests {
- use super::*;
-
- #[test]
- fn test_metadata_serialization_with_format() {
- let mut metadata = Metadata {
- format: Some(FormatMetadata::Text(TextMetadata {
- line_count: 1,
- word_count: 2,
- character_count: 13,
- headers: None,
- links: None,
- code_blocks: None,
- })),
- ..Default::default()
- };
-
- metadata
- .additional
- .insert("quality_score".to_string(), serde_json::json!(1.0));
-
- let json = serde_json::to_value(&metadata).unwrap();
- println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
-
- // Check that format_type is present
- assert!(
- json.get("format_type").is_some(),
- "format_type should be present in serialized JSON"
- );
- assert_eq!(json.get("format_type").unwrap(), "text");
-
- // Check that Text metadata fields are present
- assert_eq!(json.get("line_count").unwrap(), 1);
- assert_eq!(json.get("word_count").unwrap(), 2);
- assert_eq!(json.get("character_count").unwrap(), 13);
-
- // Check that additional field is merged
- assert_eq!(json.get("quality_score").unwrap(), 1.0);
- }
- }
data/vendor/kreuzberg/tests/batch_orchestration.rs
@@ -19,18 +19,6 @@ use kreuzberg::core::extractor::extract_file_sync;

  mod helpers;

- fn trim_trailing_newlines(value: &str) -> &str {
- value.trim_end_matches(['\n', '\r'])
- }
-
- fn assert_text_content(actual: &str, expected: &str) {
- assert_eq!(
- trim_trailing_newlines(actual),
- expected,
- "Content mismatch after trimming trailing newlines"
- );
- }
-
  /// Test that batch extraction processes documents in parallel.
  ///
  /// Validates:
@@ -317,8 +305,7 @@ async fn test_batch_bytes_parallel_processing() {
  assert_eq!(results.len(), 30);

  for (i, result) in results.iter().enumerate() {
- let expected = format!("Test content number {}", i);
- assert_text_content(&result.content, &expected);
+ assert_eq!(result.content, format!("Test content number {}", i));
  }

  println!("Batch processed 30 byte arrays in {:?}", duration);
@@ -343,9 +330,9 @@ async fn test_batch_bytes_mixed_valid_invalid() {
  let results = results.unwrap();
  assert_eq!(results.len(), 5);

- assert_text_content(&results[0].content, "valid content 1");
- assert_text_content(&results[2].content, "valid content 2");
- assert_text_content(&results[4].content, "valid content 3");
+ assert_eq!(results[0].content, "valid content 1");
+ assert_eq!(results[2].content, "valid content 2");
+ assert_eq!(results[4].content, "valid content 3");

  assert!(results[1].metadata.error.is_some());
  assert!(results[3].metadata.error.is_some());
@@ -547,8 +534,7 @@ async fn test_batch_accuracy_under_load() {
  for (i, result) in results.iter().enumerate() {
  let expected = format!("Document number {} with unique content", i);
  assert_eq!(
- trim_trailing_newlines(&result.content),
- expected,
+ result.content, expected,
  "Document {} content mismatch - possible cross-contamination",
  i
  );