kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -0,0 +1,503 @@
1
+ //! Pandoc integration tests.
2
+ //!
3
+ //! Tests for Pandoc-based document extraction (RST, LaTeX, ODT, RTF).
4
+ //! Validates that Pandoc integration works when available and degrades gracefully when missing.
5
+ //!
6
+ //! Note: These tests require the `office` feature to be enabled.
7
+
8
+ #![cfg(feature = "office")]
9
+
10
+ use kreuzberg::core::config::ExtractionConfig;
11
+ use kreuzberg::core::extractor::extract_bytes;
12
+ use kreuzberg::extraction::pandoc::validate_pandoc_version;
13
+
14
+ mod helpers;
15
+
16
+ /// Check if Pandoc is installed and available.
17
+ async fn is_pandoc_available() -> bool {
18
+ validate_pandoc_version().await.is_ok()
19
+ }
20
+
21
+ /// Test reStructuredText (RST) extraction.
22
+ #[tokio::test]
23
+ async fn test_rst_extraction() {
24
+ if !is_pandoc_available().await {
25
+ println!("Skipping test: Pandoc not installed");
26
+ return;
27
+ }
28
+
29
+ let config = ExtractionConfig::default();
30
+
31
+ let rst_content = b"Title
32
+ =====
33
+
34
+ This is a paragraph in reStructuredText.
35
+
36
+ Section Heading
37
+ ---------------
38
+
39
+ - Bullet point 1
40
+ - Bullet point 2
41
+ - Bullet point 3
42
+
43
+ **Bold text** and *italic text*.";
44
+
45
+ let result = extract_bytes(rst_content, "text/x-rst", &config).await;
46
+
47
+ assert!(result.is_ok(), "RST extraction should succeed");
48
+ let extraction = result.unwrap();
49
+
50
+ assert_eq!(extraction.mime_type, "text/x-rst");
51
+
52
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
53
+ assert!(
54
+ extraction.chunks.is_none(),
55
+ "Chunks should be None without chunking config"
56
+ );
57
+ assert!(
58
+ extraction.detected_languages.is_none(),
59
+ "Language detection not enabled"
60
+ );
61
+ assert!(extraction.tables.is_empty(), "RST should not extract tables");
62
+
63
+ assert!(extraction.content.contains("Title"), "Should extract title");
64
+ assert!(
65
+ extraction.content.contains("paragraph"),
66
+ "Should extract paragraph text"
67
+ );
68
+ assert!(
69
+ extraction.content.contains("Section Heading"),
70
+ "Should extract section heading"
71
+ );
72
+
73
+ assert!(
74
+ extraction.content.contains("Bullet point 1") || extraction.content.contains("point 1"),
75
+ "Should extract bullet points"
76
+ );
77
+
78
+ assert!(
79
+ extraction.content.contains("Bold text") || extraction.content.contains("italic text"),
80
+ "Should extract formatted text content"
81
+ );
82
+
83
+ let content_lower = extraction.content.to_lowercase();
84
+ assert!(content_lower.contains("title"), "Should extract title");
85
+ assert!(content_lower.contains("section"), "Should extract section heading");
86
+ assert!(content_lower.contains("bullet"), "Should extract bullet list");
87
+ }
88
+
89
+ /// Test LaTeX extraction.
90
+ #[tokio::test]
91
+ async fn test_latex_extraction() {
92
+ if !is_pandoc_available().await {
93
+ println!("Skipping test: Pandoc not installed");
94
+ return;
95
+ }
96
+
97
+ let config = ExtractionConfig::default();
98
+
99
+ let latex_content = b"\\documentclass{article}
100
+ \\begin{document}
101
+
102
+ \\title{Test Document}
103
+ \\author{Test Author}
104
+ \\maketitle
105
+
106
+ \\section{Introduction}
107
+
108
+ This is a test LaTeX document with \\textbf{bold} and \\textit{italic} text.
109
+
110
+ \\subsection{Subsection}
111
+
112
+ Some content in a subsection.
113
+
114
+ \\end{document}";
115
+
116
+ let result = extract_bytes(latex_content, "application/x-latex", &config).await;
117
+
118
+ assert!(result.is_ok(), "LaTeX extraction should succeed");
119
+ let extraction = result.unwrap();
120
+
121
+ assert_eq!(extraction.mime_type, "application/x-latex");
122
+
123
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
124
+ assert!(
125
+ extraction.chunks.is_none(),
126
+ "Chunks should be None without chunking config"
127
+ );
128
+ assert!(
129
+ extraction.detected_languages.is_none(),
130
+ "Language detection not enabled"
131
+ );
132
+ assert!(
133
+ extraction.tables.is_empty(),
134
+ "LaTeX should not extract tables in this test"
135
+ );
136
+
137
+ assert!(
138
+ extraction.content.contains("Test Document"),
139
+ "Should extract document title"
140
+ );
141
+
142
+ assert!(
143
+ extraction.content.contains("Introduction"),
144
+ "Should extract section heading"
145
+ );
146
+ assert!(
147
+ extraction.content.contains("Subsection"),
148
+ "Should extract subsection heading"
149
+ );
150
+
151
+ assert!(
152
+ extraction.content.contains("test LaTeX document"),
153
+ "Should extract paragraph text"
154
+ );
155
+
156
+ assert!(
157
+ !extraction.content.contains("\\textbf") && !extraction.content.contains("\\section"),
158
+ "LaTeX commands should be stripped, not included in output"
159
+ );
160
+ }
161
+
162
+ /// Test OpenDocument Text (ODT) extraction.
163
+ #[tokio::test]
164
+ async fn test_odt_extraction() {
165
+ if !is_pandoc_available().await {
166
+ println!("Skipping test: Pandoc not installed");
167
+ return;
168
+ }
169
+
170
+ let config = ExtractionConfig::default();
171
+
172
+ let invalid_odt = b"This is not a valid ODT file";
173
+
174
+ let result = extract_bytes(invalid_odt, "application/vnd.oasis.opendocument.text", &config).await;
175
+
176
+ assert!(result.is_err(), "Invalid ODT should fail gracefully");
177
+
178
+ let error = result.unwrap_err();
179
+ match error {
180
+ kreuzberg::KreuzbergError::Parsing { .. } => {}
181
+ kreuzberg::KreuzbergError::Io(_) => {}
182
+ other => panic!("Expected Parsing or Io error, got: {:?}", other),
183
+ }
184
+ }
185
+
186
+ /// Test Rich Text Format (RTF) extraction.
187
+ #[tokio::test]
188
+ async fn test_rtf_extraction() {
189
+ if !is_pandoc_available().await {
190
+ println!("Skipping test: Pandoc not installed");
191
+ return;
192
+ }
193
+
194
+ let config = ExtractionConfig::default();
195
+
196
+ let rtf_content = b"{\\rtf1\\ansi\\deff0
197
+ {\\fonttbl{\\f0 Times New Roman;}}
198
+ \\f0\\fs24 This is a test RTF document.\\par
199
+ \\b Bold text\\b0 and \\i italic text\\i0.\\par
200
+ }";
201
+
202
+ let result = extract_bytes(rtf_content, "application/rtf", &config).await;
203
+
204
+ assert!(result.is_ok(), "RTF extraction should succeed");
205
+ let extraction = result.unwrap();
206
+
207
+ assert_eq!(extraction.mime_type, "application/rtf");
208
+
209
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
210
+ assert!(
211
+ extraction.chunks.is_none(),
212
+ "Chunks should be None without chunking config"
213
+ );
214
+ assert!(
215
+ extraction.detected_languages.is_none(),
216
+ "Language detection not enabled"
217
+ );
218
+ assert!(
219
+ extraction.tables.is_empty(),
220
+ "RTF should not extract tables in this test"
221
+ );
222
+
223
+ assert!(
224
+ extraction.content.contains("test RTF document"),
225
+ "Should extract main paragraph"
226
+ );
227
+ assert!(
228
+ extraction.content.contains("Bold text") || extraction.content.contains("Bold"),
229
+ "Should extract bold text"
230
+ );
231
+ assert!(
232
+ extraction.content.contains("italic text") || extraction.content.contains("italic"),
233
+ "Should extract italic text"
234
+ );
235
+
236
+ assert!(
237
+ !extraction.content.contains("\\rtf") && !extraction.content.contains("\\par"),
238
+ "RTF control codes should be stripped from output"
239
+ );
240
+ }
241
+
242
+ /// Test graceful degradation when Pandoc is not installed.
243
+ #[tokio::test]
244
+ async fn test_pandoc_not_installed() {
245
+ let validation_result = validate_pandoc_version().await;
246
+
247
+ if validation_result.is_ok() {
248
+ println!("Pandoc is installed - skipping 'not installed' test");
249
+ return;
250
+ }
251
+
252
+ assert!(
253
+ validation_result.is_err(),
254
+ "Should return error when Pandoc not installed"
255
+ );
256
+ }
257
+
258
+ /// Test Pandoc conversion error handling.
259
+ #[tokio::test]
260
+ async fn test_pandoc_conversion_error() {
261
+ if !is_pandoc_available().await {
262
+ println!("Skipping test: Pandoc not installed");
263
+ return;
264
+ }
265
+
266
+ let config = ExtractionConfig::default();
267
+
268
+ let malformed_rst = b"===\nThis is malformed\n===\n===";
269
+
270
+ let result = extract_bytes(malformed_rst, "text/x-rst", &config).await;
271
+
272
+ assert!(
273
+ result.is_ok() || result.is_err(),
274
+ "Should handle malformed content gracefully"
275
+ );
276
+ }
277
+
278
+ /// Test EPUB extraction (ebook format).
279
+ #[tokio::test]
280
+ async fn test_epub_extraction() {
281
+ if !is_pandoc_available().await {
282
+ println!("Skipping test: Pandoc not installed");
283
+ return;
284
+ }
285
+
286
+ let config = ExtractionConfig::default();
287
+
288
+ let invalid_epub = b"This is not a valid EPUB file";
289
+
290
+ let result = extract_bytes(invalid_epub, "application/epub+zip", &config).await;
291
+
292
+ assert!(result.is_err(), "Invalid EPUB should fail gracefully");
293
+
294
+ let error = result.unwrap_err();
295
+ match error {
296
+ kreuzberg::KreuzbergError::Parsing { .. } => {}
297
+ kreuzberg::KreuzbergError::Io(_) => {}
298
+ other => panic!("Expected Parsing or Io error for invalid EPUB, got: {:?}", other),
299
+ }
300
+ }
301
+
302
+ /// Test Org mode extraction.
303
+ #[tokio::test]
304
+ async fn test_org_mode_extraction() {
305
+ if !is_pandoc_available().await {
306
+ println!("Skipping test: Pandoc not installed");
307
+ return;
308
+ }
309
+
310
+ let config = ExtractionConfig::default();
311
+
312
+ let org_content = b"* Top Level Heading
313
+
314
+ This is a paragraph in Org mode.
315
+
316
+ ** Second Level Heading
317
+
318
+ - Item 1
319
+ - Item 2
320
+ - Item 3
321
+
322
+ *bold text* and /italic text/";
323
+
324
+ let result = extract_bytes(org_content, "text/x-org", &config).await;
325
+
326
+ assert!(result.is_ok(), "Org mode extraction should succeed");
327
+ let extraction = result.unwrap();
328
+
329
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
330
+ assert!(
331
+ extraction.chunks.is_none(),
332
+ "Chunks should be None without chunking config"
333
+ );
334
+ assert!(
335
+ extraction.detected_languages.is_none(),
336
+ "Language detection not enabled"
337
+ );
338
+ assert!(
339
+ extraction.tables.is_empty(),
340
+ "Org mode should not extract tables in this test"
341
+ );
342
+
343
+ assert!(
344
+ extraction.content.contains("Top Level") || extraction.content.contains("paragraph"),
345
+ "Org mode content should be extracted"
346
+ );
347
+
348
+ assert!(
349
+ extraction.content.contains("paragraph") || extraction.content.contains("Heading"),
350
+ "Text content should be present"
351
+ );
352
+ }
353
+
354
+ /// Test Typst extraction (new document format).
355
+ #[tokio::test]
356
+ async fn test_typst_extraction() {
357
+ if !is_pandoc_available().await {
358
+ println!("Skipping test: Pandoc not installed");
359
+ return;
360
+ }
361
+
362
+ let config = ExtractionConfig::default();
363
+
364
+ let typst_content = b"= Heading
365
+
366
+ This is a paragraph in Typst.
367
+
368
+ == Subheading
369
+
370
+ #strong[Bold text] and #emph[italic text].";
371
+
372
+ let result = extract_bytes(typst_content, "application/x-typst", &config).await;
373
+
374
+ assert!(
375
+ result.is_ok() || result.is_err(),
376
+ "Should handle Typst gracefully (may not be supported in all Pandoc versions)"
377
+ );
378
+ }
379
+
380
+ /// Test CommonMark extraction.
381
+ #[tokio::test]
382
+ async fn test_commonmark_extraction() {
383
+ if !is_pandoc_available().await {
384
+ println!("Skipping test: Pandoc not installed");
385
+ return;
386
+ }
387
+
388
+ let config = ExtractionConfig::default();
389
+
390
+ let commonmark_content = b"# Heading
391
+
392
+ This is a paragraph in CommonMark.
393
+
394
+ ## Subheading
395
+
396
+ - List item 1
397
+ - List item 2
398
+
399
+ **Bold** and *italic* text.";
400
+
401
+ let result = extract_bytes(commonmark_content, "text/x-commonmark", &config).await;
402
+
403
+ assert!(result.is_ok(), "CommonMark extraction should succeed");
404
+ let extraction = result.unwrap();
405
+
406
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
407
+ assert!(
408
+ extraction.chunks.is_none(),
409
+ "Chunks should be None without chunking config"
410
+ );
411
+ assert!(
412
+ extraction.detected_languages.is_none(),
413
+ "Language detection not enabled"
414
+ );
415
+ assert!(
416
+ extraction.tables.is_empty(),
417
+ "CommonMark should not extract tables in this test"
418
+ );
419
+
420
+ assert!(
421
+ extraction.content.contains("Heading") || extraction.content.contains("paragraph"),
422
+ "CommonMark content should be extracted"
423
+ );
424
+
425
+ let content_lower = extraction.content.to_lowercase();
426
+ assert!(
427
+ content_lower.contains("heading") || content_lower.contains("paragraph"),
428
+ "Should extract text"
429
+ );
430
+ assert!(
431
+ content_lower.contains("list") || content_lower.contains("item"),
432
+ "Should extract list items"
433
+ );
434
+ }
435
+
436
+ /// Test empty content.
437
+ #[tokio::test]
438
+ async fn test_pandoc_empty_content() {
439
+ if !is_pandoc_available().await {
440
+ println!("Skipping test: Pandoc not installed");
441
+ return;
442
+ }
443
+
444
+ let config = ExtractionConfig::default();
445
+
446
+ let empty_rst = b"";
447
+
448
+ let result = extract_bytes(empty_rst, "text/x-rst", &config).await;
449
+
450
+ if let Ok(extraction) = result {
451
+ assert!(
452
+ extraction.content.is_empty() || extraction.content.trim().is_empty(),
453
+ "Empty input should produce empty or minimal output"
454
+ );
455
+ }
456
+ }
457
+
458
+ /// Test Unicode content in Pandoc formats.
459
+ #[tokio::test]
460
+ async fn test_pandoc_unicode_content() {
461
+ if !is_pandoc_available().await {
462
+ println!("Skipping test: Pandoc not installed");
463
+ return;
464
+ }
465
+
466
+ let config = ExtractionConfig::default();
467
+
468
+ let unicode_rst = "Title with Unicode
469
+ ==================
470
+
471
+ This document contains Unicode: 你好世界 🌍 café
472
+
473
+ Section
474
+ -------
475
+
476
+ Arabic: مرحبا
477
+ Emoji: 🎉 ✅ 🚀"
478
+ .as_bytes();
479
+
480
+ let result = extract_bytes(unicode_rst, "text/x-rst", &config).await;
481
+
482
+ assert!(result.is_ok(), "Unicode content should be handled");
483
+ let extraction = result.unwrap();
484
+
485
+ assert!(!extraction.content.is_empty(), "Content should be extracted");
486
+ assert!(
487
+ extraction.chunks.is_none(),
488
+ "Chunks should be None without chunking config"
489
+ );
490
+ assert!(
491
+ extraction.detected_languages.is_none(),
492
+ "Language detection not enabled"
493
+ );
494
+ assert!(
495
+ extraction.tables.is_empty(),
496
+ "RST should not extract tables in this test"
497
+ );
498
+
499
+ assert!(
500
+ extraction.content.len() > 20,
501
+ "Should have substantial extracted content"
502
+ );
503
+ }
@@ -850,6 +850,7 @@ async fn test_pipeline_multiple_processor_errors() {
850
850
 
851
851
  let result = run_pipeline(result, &config).await;
852
852
  assert!(result.is_err(), "Expected pipeline to return error");
853
+ // First failing processor (fail1 in Early stage) will cause pipeline to fail
853
854
  match result {
854
855
  Err(KreuzbergError::Plugin { message, plugin_name }) => {
855
856
  assert_eq!(message, "fail1 error");
@@ -433,6 +433,7 @@ fn test_postprocessor_error_handling() {
433
433
  let result = extract_file_sync(test_file, None, &config);
434
434
 
435
435
  // NOTE: Plugin errors now bubble up and fail the extraction (design change)
436
+ // Other error types (non-IO, non-Plugin) are caught and recorded in metadata
436
437
  assert!(
437
438
  result.is_err(),
438
439
  "Extraction should fail when postprocessor returns Plugin error"
@@ -15,6 +15,8 @@ use kreuzberg::{KreuzbergError, Result};
15
15
  use std::path::Path;
16
16
  use std::sync::Arc;
17
17
 
18
+ // ===== Mock Validators =====
19
+
18
20
  struct MockValidator {
19
21
  name: String,
20
22
  should_fail: bool,
@@ -85,6 +87,8 @@ impl Validator for FailingInitValidator {
85
87
  }
86
88
  }
87
89
 
90
+ // ===== Mock Extractors =====
91
+
88
92
  struct MockExtractor {
89
93
  name: String,
90
94
  mime_types: Vec<&'static str>,
@@ -142,6 +146,8 @@ impl DocumentExtractor for MockExtractor {
142
146
  }
143
147
  }
144
148
 
149
+ // ===== Validator Registry Tests =====
150
+
145
151
  /// Test validator registration and listing.
146
152
  #[test]
147
153
  fn test_validator_registration_succeeds() {
@@ -274,10 +280,13 @@ fn test_validator_registration_with_failed_init_fails() {
274
280
  assert!(result.is_err(), "Registration with failed init should fail");
275
281
 
276
282
  match result {
277
- Err(KreuzbergError::Plugin { .. }) => {}
283
+ Err(KreuzbergError::Plugin { .. }) => {
284
+ // Expected error type
285
+ }
278
286
  _ => panic!("Expected Plugin error"),
279
287
  }
280
288
 
289
+ // Validator should not be in the list
281
290
  assert_eq!(registry.list().len(), 0, "Failed validator should not be registered");
282
291
  }
283
292
 
@@ -286,6 +295,7 @@ fn test_validator_registration_with_failed_init_fails() {
286
295
  fn test_clear_validators_succeeds() {
287
296
  let mut registry = ValidatorRegistry::new();
288
297
 
298
+ // Register multiple validators
289
299
  let v1 = Arc::new(MockValidator {
290
300
  name: "validator-1".to_string(),
291
301
  should_fail: false,
@@ -299,6 +309,7 @@ fn test_clear_validators_succeeds() {
299
309
  registry.register(v2).unwrap();
300
310
  assert_eq!(registry.list().len(), 2);
301
311
 
312
+ // Clear all
302
313
  let result = registry.shutdown_all();
303
314
  assert!(result.is_ok(), "Clear should succeed");
304
315
  assert_eq!(registry.list().len(), 0, "Registry should be empty after clear");
@@ -359,11 +370,14 @@ fn test_get_all_validators_respects_priority() {
359
370
  let all = registry.get_all();
360
371
  assert_eq!(all.len(), 3, "Should have three validators");
361
372
 
373
+ // Should be in descending priority order
362
374
  assert_eq!(all[0].name(), "high-priority");
363
375
  assert_eq!(all[1].name(), "medium-priority");
364
376
  assert_eq!(all[2].name(), "low-priority");
365
377
  }
366
378
 
379
+ // ===== Extractor Registry Tests =====
380
+
367
381
  /// Test extractor registration and retrieval.
368
382
  #[test]
369
383
  fn test_extractor_registration_succeeds() {
@@ -437,6 +451,7 @@ fn test_extractor_priority_selection() {
437
451
  registry.register(low_priority).unwrap();
438
452
  registry.register(high_priority).unwrap();
439
453
 
454
+ // Should get the high priority extractor
440
455
  let result = registry.get("text/plain").unwrap();
441
456
  assert_eq!(
442
457
  result.name(),
@@ -458,14 +473,17 @@ fn test_extractor_wildcard_mime_matching() {
458
473
 
459
474
  registry.register(extractor).unwrap();
460
475
 
476
+ // Should match text/plain
461
477
  let result = registry.get("text/plain");
462
478
  assert!(result.is_ok(), "Should match text/plain with text/*");
463
479
  assert_eq!(result.unwrap().name(), "text-extractor");
464
480
 
481
+ // Should match text/html
465
482
  let result = registry.get("text/html");
466
483
  assert!(result.is_ok(), "Should match text/html with text/*");
467
484
  assert_eq!(result.unwrap().name(), "text-extractor");
468
485
 
486
+ // Should not match application/pdf
469
487
  let result = registry.get("application/pdf");
470
488
  assert!(result.is_err(), "Should not match application/pdf with text/*");
471
489
  }
@@ -488,6 +506,7 @@ fn test_extractor_unregistration_succeeds() {
488
506
  assert!(result.is_ok(), "Unregistration should succeed");
489
507
  assert_eq!(registry.list().len(), 0, "Registry should be empty after removal");
490
508
 
509
+ // Should no longer find extractor for MIME type
491
510
  let lookup_result = registry.get("text/plain");
492
511
  assert!(lookup_result.is_err(), "Should not find extractor after removal");
493
512
  }
@@ -505,10 +524,12 @@ fn test_extractor_multiple_mime_types() {
505
524
 
506
525
  registry.register(extractor).unwrap();
507
526
 
527
+ // Should find for all MIME types
508
528
  assert!(registry.get("application/pdf").is_ok());
509
529
  assert!(registry.get("application/vnd.ms-excel").is_ok());
510
530
  assert!(registry.get("text/csv").is_ok());
511
531
 
532
+ // All should return the same extractor
512
533
  assert_eq!(
513
534
  registry.get("application/pdf").unwrap().name(),
514
535
  "multi-format-extractor"
@@ -12,17 +12,6 @@ use kreuzberg::core::extractor::{extract_bytes_sync, extract_file_sync};
12
12
  use std::io::Write;
13
13
  use tempfile::NamedTempFile;
14
14
 
15
- fn trim_trailing_newlines(value: &str) -> &str {
16
- value.trim_end_matches(['\n', '\r'])
17
- }
18
-
19
- fn assert_text_content(actual: &str, expected: &str) {
20
- assert_eq!(
21
- trim_trailing_newlines(actual),
22
- expected,
23
- "Content mismatch after trimming trailing newlines"
24
- );
25
- }
26
15
  #[test]
27
16
  fn test_archive_zip_bomb_detection() {
28
17
  let mut cursor = std::io::Cursor::new(Vec::new());
@@ -277,7 +266,7 @@ fn test_resource_single_byte_file() {
277
266
 
278
267
  assert!(result.is_ok());
279
268
  if let Ok(extracted) = result {
280
- assert_text_content(&extracted.content, "a");
269
+ assert_eq!(extracted.content, "a");
281
270
  }
282
271
  }
283
272