kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -12,18 +12,6 @@ use std::path::PathBuf;
12
12
  mod helpers;
13
13
  use helpers::{get_test_documents_dir, get_test_file_path, skip_if_missing, test_documents_available};
14
14
 
15
- fn trim_trailing_newlines(value: &str) -> &str {
16
- value.trim_end_matches(['\n', '\r'])
17
- }
18
-
19
- fn assert_text_content(actual: &str, expected: &str) {
20
- assert_eq!(
21
- trim_trailing_newlines(actual),
22
- expected,
23
- "Content mismatch after trimming trailing newlines"
24
- );
25
- }
26
-
27
15
  /// Test batch extraction with multiple file formats (PDF, DOCX, TXT).
28
16
  #[tokio::test]
29
17
  async fn test_batch_extract_file_multiple_formats() {
@@ -134,7 +122,7 @@ async fn test_batch_extract_bytes_multiple() {
134
122
 
135
123
  assert_eq!(results.len(), 3);
136
124
 
137
- assert_text_content(&results[0].content, "This is plain text content");
125
+ assert_eq!(results[0].content, "This is plain text content");
138
126
  assert_eq!(results[0].mime_type, "text/plain");
139
127
 
140
128
  assert!(results[1].content.contains("Markdown Header"));
@@ -310,7 +298,7 @@ fn test_batch_extract_bytes_sync_variant() {
310
298
  let results = results.unwrap();
311
299
 
312
300
  assert_eq!(results.len(), 3);
313
- assert_text_content(&results[0].content, "content 1");
314
- assert_text_content(&results[1].content, "content 2");
301
+ assert_eq!(results[0].content, "content 1");
302
+ assert_eq!(results[1].content, "content 2");
315
303
  assert!(results[2].content.contains("content 3"));
316
304
  }
@@ -0,0 +1,92 @@
1
+ #[cfg(feature = "chunking")]
2
+ #[test]
3
+ fn demonstrate_correct_offset_calculation() {
4
+ use kreuzberg::chunking::{ChunkerType, ChunkingConfig, chunk_text};
5
+
6
+ println!("\n=== Demonstrating Correct Chunking Offset Calculation ===\n");
7
+
8
+ let config_with_overlap = ChunkingConfig {
9
+ max_characters: 20,
10
+ overlap: 5,
11
+ trim: false,
12
+ chunker_type: ChunkerType::Text,
13
+ };
14
+
15
+ let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
16
+ println!("Text: \"{}\"", text);
17
+ println!(
18
+ "Max characters: {}, Overlap: {}\n",
19
+ config_with_overlap.max_characters, config_with_overlap.overlap
20
+ );
21
+
22
+ let result = chunk_text(text, &config_with_overlap).unwrap();
23
+
24
+ println!("WITH OVERLAP (5 chars):");
25
+ for (i, chunk) in result.chunks.iter().enumerate() {
26
+ println!(
27
+ " Chunk {}: [{:3} - {:3}] = \"{}\"",
28
+ i,
29
+ chunk.metadata.char_start,
30
+ chunk.metadata.char_end,
31
+ chunk.content.replace('\n', "\\n")
32
+ );
33
+ }
34
+
35
+ println!("\nOverlap verification:");
36
+ for i in 0..result.chunks.len() - 1 {
37
+ let current = &result.chunks[i];
38
+ let next = &result.chunks[i + 1];
39
+ let overlap_size = current.metadata.char_end - next.metadata.char_start;
40
+ println!(
41
+ " Chunks {} and {}: overlap = {} chars (next starts at {} while current ends at {})",
42
+ i,
43
+ i + 1,
44
+ overlap_size,
45
+ next.metadata.char_start,
46
+ current.metadata.char_end
47
+ );
48
+ assert!(
49
+ overlap_size > 0 && overlap_size <= config_with_overlap.overlap + 10,
50
+ "Overlap should exist and be reasonable"
51
+ );
52
+ }
53
+
54
+ println!("\n\n=== Without Overlap ===\n");
55
+ let config_no_overlap = ChunkingConfig {
56
+ max_characters: 20,
57
+ overlap: 0,
58
+ trim: false,
59
+ chunker_type: ChunkerType::Text,
60
+ };
61
+
62
+ let result_no_overlap = chunk_text(text, &config_no_overlap).unwrap();
63
+
64
+ println!("WITHOUT OVERLAP:");
65
+ for (i, chunk) in result_no_overlap.chunks.iter().enumerate() {
66
+ println!(
67
+ " Chunk {}: [{:3} - {:3}] = \"{}\"",
68
+ i,
69
+ chunk.metadata.char_start,
70
+ chunk.metadata.char_end,
71
+ chunk.content.replace('\n', "\\n")
72
+ );
73
+ }
74
+
75
+ println!("\nAdjacency verification:");
76
+ for i in 0..result_no_overlap.chunks.len() - 1 {
77
+ let current = &result_no_overlap.chunks[i];
78
+ let next = &result_no_overlap.chunks[i + 1];
79
+ let gap = next.metadata.char_start as i32 - current.metadata.char_end as i32;
80
+ println!(
81
+ " Chunks {} and {}: gap = {} (next starts at {}, current ends at {})",
82
+ i,
83
+ i + 1,
84
+ gap,
85
+ next.metadata.char_start,
86
+ current.metadata.char_end
87
+ );
88
+ assert!(gap >= 0, "Should have no overlap (gap >= 0)");
89
+ }
90
+
91
+ println!("\n✓ All offset calculations are correct!");
92
+ }
@@ -30,18 +30,6 @@ use tokio::time::timeout;
30
30
 
31
31
  mod helpers;
32
32
 
33
- fn trim_trailing_newlines(value: &str) -> &str {
34
- value.trim_end_matches(['\n', '\r'])
35
- }
36
-
37
- fn assert_text_content(actual: &str, expected: &str) {
38
- assert_eq!(
39
- trim_trailing_newlines(actual),
40
- expected,
41
- "Content mismatch after trimming trailing newlines"
42
- );
43
- }
44
-
45
33
  /// Test many concurrent extractions of different MIME types.
46
34
  ///
47
35
  /// Validates that:
@@ -156,7 +144,7 @@ async fn test_concurrent_extractions_with_cache() {
156
144
  let result = handle.await.expect("Task should not panic");
157
145
  assert!(result.is_ok(), "Cache read should succeed");
158
146
  let extraction = result.unwrap();
159
- assert_text_content(&extraction.content, expected_content);
147
+ assert_eq!(extraction.content, expected_content);
160
148
  }
161
149
  }
162
150
 
@@ -171,10 +159,6 @@ async fn test_concurrent_extractions_with_cache() {
171
159
  async fn test_concurrent_ocr_processing() {
172
160
  use helpers::{get_test_file_path, skip_if_missing};
173
161
 
174
- if cfg!(windows) {
175
- return;
176
- }
177
-
178
162
  if skip_if_missing("images/ocr_image.jpg") {
179
163
  tracing::debug!("Skipping concurrent OCR test: test file not available");
180
164
  return;
@@ -484,15 +484,8 @@ async fn test_quality_processing_disabled() {
484
484
  }
485
485
 
486
486
  /// Test chunking with embeddings using balanced preset.
487
- ///
488
- /// This test requires ONNX Runtime to be installed as a system dependency.
489
- /// On macOS with Homebrew: `brew install onnxruntime`
490
- /// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
491
- /// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
492
487
  #[tokio::test]
493
488
  #[cfg(feature = "embeddings")]
494
- #[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
495
- #[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
496
489
  async fn test_chunking_with_embeddings() {
497
490
  use kreuzberg::core::config::EmbeddingConfig;
498
491
 
@@ -550,15 +543,8 @@ async fn test_chunking_with_embeddings() {
550
543
  }
551
544
 
552
545
  /// Test chunking with fast embedding preset.
553
- ///
554
- /// This test requires ONNX Runtime to be installed as a system dependency.
555
- /// On macOS with Homebrew: `brew install onnxruntime`
556
- /// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
557
- /// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
558
546
  #[tokio::test]
559
547
  #[cfg(feature = "embeddings")]
560
- #[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
561
- #[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
562
548
  async fn test_chunking_with_fast_embeddings() {
563
549
  use kreuzberg::core::config::{EmbeddingConfig, EmbeddingModelType};
564
550
 
@@ -587,10 +573,6 @@ async fn test_chunking_with_fast_embeddings() {
587
573
  let chunks = result.chunks.expect("Should have chunks");
588
574
  assert!(!chunks.is_empty(), "Should have at least one chunk");
589
575
 
590
- if let Some(error) = result.metadata.additional.get("embedding_error") {
591
- panic!("Embedding generation failed: {}", error);
592
- }
593
-
594
576
  for chunk in &chunks {
595
577
  let embedding = chunk.embedding.as_ref().expect("Should have embedding");
596
578
  assert_eq!(embedding.len(), 384, "Fast preset should produce 384-dim embeddings");
@@ -124,6 +124,7 @@ ocr:
124
124
  fn test_from_file_nonexistent_path_fails() {
125
125
  let result = ExtractionConfig::from_file("/nonexistent/path/config.toml");
126
126
  assert!(result.is_err(), "Should fail for nonexistent path: {:?}", result);
127
+ // Error can be Io or other types depending on the implementation
127
128
  }
128
129
 
129
130
  /// Test from_file with malformed TOML fails.
@@ -141,6 +142,7 @@ enabled = true
141
142
 
142
143
  let result = ExtractionConfig::from_file(&config_path);
143
144
  assert!(result.is_err(), "Should fail for malformed TOML: {:?}", result);
145
+ // Error handling varies - just ensure it failed
144
146
  }
145
147
 
146
148
  /// Test from_file with malformed JSON fails.
@@ -162,6 +164,7 @@ fn test_from_file_malformed_json_fails() {
162
164
 
163
165
  let result = ExtractionConfig::from_file(&config_path);
164
166
  assert!(result.is_err(), "Should fail for malformed JSON: {:?}", result);
167
+ // Error handling varies - just ensure it failed
165
168
  }
166
169
 
167
170
  /// Test from_file with malformed YAML fails.
@@ -180,6 +183,7 @@ ocr:
180
183
 
181
184
  let result = ExtractionConfig::from_file(&config_path);
182
185
  assert!(result.is_err(), "Should fail for malformed YAML: {:?}", result);
186
+ // Error handling varies - just ensure it failed
183
187
  }
184
188
 
185
189
  /// Test from_file with empty file uses defaults.
@@ -194,6 +198,7 @@ fn test_from_file_empty_file_uses_defaults() {
194
198
  assert!(config.is_ok(), "Should load empty file successfully");
195
199
 
196
200
  let config = config.unwrap();
201
+ // Should have default values
197
202
  assert!(config.ocr.is_none(), "Default config should have no OCR");
198
203
  assert!(config.chunking.is_none(), "Default config should have no chunking");
199
204
  }
@@ -209,18 +214,22 @@ fn test_from_file_unsupported_extension_fails() {
209
214
  let result = ExtractionConfig::from_file(&config_path);
210
215
  assert!(result.is_err(), "Should fail for unsupported extension: {:?}", result);
211
216
 
212
- if let Err(KreuzbergError::Validation { message, .. }) = result {
213
- assert!(
214
- message.contains("format") || message.contains("extension") || message.contains("Unsupported"),
215
- "Error should mention format/extension: {}",
216
- message
217
- );
217
+ match result {
218
+ Err(KreuzbergError::Validation { message, .. }) => {
219
+ assert!(
220
+ message.contains("format") || message.contains("extension") || message.contains("Unsupported"),
221
+ "Error should mention format/extension: {}",
222
+ message
223
+ );
224
+ }
225
+ _ => {
226
+ // Some other error is also acceptable
227
+ }
218
228
  }
219
229
  }
220
230
 
221
231
  /// Test discover() finds config in current directory.
222
232
  #[test]
223
- #[serial_test::serial]
224
233
  fn test_discover_finds_config_in_current_dir() {
225
234
  let temp_dir = TempDir::new().unwrap();
226
235
  let config_path = temp_dir.path().join("kreuzberg.toml");
@@ -232,11 +241,13 @@ enabled = true
232
241
 
233
242
  fs::write(&config_path, toml_content).unwrap();
234
243
 
244
+ // Change to temp directory
235
245
  let original_dir = std::env::current_dir().unwrap();
236
246
  std::env::set_current_dir(temp_dir.path()).unwrap();
237
247
 
238
248
  let result = ExtractionConfig::discover();
239
249
 
250
+ // Restore original directory
240
251
  std::env::set_current_dir(original_dir).unwrap();
241
252
 
242
253
  assert!(result.is_ok(), "Discover should succeed");
@@ -247,7 +258,6 @@ enabled = true
247
258
 
248
259
  /// Test discover() finds config in parent directory.
249
260
  #[test]
250
- #[serial_test::serial]
251
261
  fn test_discover_finds_config_in_parent_dir() {
252
262
  let temp_dir = TempDir::new().unwrap();
253
263
  let config_path = temp_dir.path().join("kreuzberg.toml");
@@ -259,14 +269,17 @@ enabled = true
259
269
 
260
270
  fs::write(&config_path, toml_content).unwrap();
261
271
 
272
+ // Create subdirectory
262
273
  let sub_dir = temp_dir.path().join("subdir");
263
274
  fs::create_dir(&sub_dir).unwrap();
264
275
 
276
+ // Change to subdirectory
265
277
  let original_dir = std::env::current_dir().unwrap();
266
278
  std::env::set_current_dir(&sub_dir).unwrap();
267
279
 
268
280
  let result = ExtractionConfig::discover();
269
281
 
282
+ // Restore original directory
270
283
  std::env::set_current_dir(original_dir).unwrap();
271
284
 
272
285
  assert!(result.is_ok(), "Discover should succeed");
@@ -277,39 +290,44 @@ enabled = true
277
290
 
278
291
  /// Test discover() returns None when no config found.
279
292
  #[test]
280
- #[serial_test::serial]
281
293
  fn test_discover_returns_none_when_not_found() {
282
294
  let temp_dir = TempDir::new().unwrap();
283
295
  let sub_dir = temp_dir.path().join("subdir");
284
296
  fs::create_dir(&sub_dir).unwrap();
285
297
 
298
+ // Change to subdirectory (no config files)
286
299
  let original_dir = std::env::current_dir().unwrap();
287
300
  std::env::set_current_dir(&sub_dir).unwrap();
288
301
 
289
302
  let result = ExtractionConfig::discover();
290
303
 
304
+ // Restore original directory
291
305
  std::env::set_current_dir(original_dir).unwrap();
292
306
 
293
307
  assert!(result.is_ok(), "Discover should succeed even when no config found");
294
308
  let _config = result.unwrap();
309
+ // May return None or may find a config in parent directories (e.g., repository root)
310
+ // Just verify it doesn't error - the specific behavior depends on the directory structure
295
311
  }
296
312
 
297
313
  /// Test discover() prefers certain file names.
298
314
  #[test]
299
- #[serial_test::serial]
300
315
  fn test_discover_file_name_preference() {
301
316
  let temp_dir = TempDir::new().unwrap();
302
317
 
318
+ // Create multiple config files
303
319
  fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true").unwrap();
304
320
  fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false").unwrap();
305
321
 
306
322
  let original_dir = std::env::current_dir().unwrap();
307
323
  if std::env::set_current_dir(temp_dir.path()).is_err() {
324
+ // Skip this test if we can't change directory
308
325
  return;
309
326
  }
310
327
 
311
328
  let result = ExtractionConfig::discover();
312
329
 
330
+ // Always restore directory even if test fails
313
331
  let _ = std::env::set_current_dir(original_dir);
314
332
 
315
333
  assert!(result.is_ok(), "Discover should succeed");
@@ -319,7 +337,6 @@ fn test_discover_file_name_preference() {
319
337
 
320
338
  /// Test discover() with nested directories.
321
339
  #[test]
322
- #[serial_test::serial]
323
340
  fn test_discover_with_nested_directories() {
324
341
  let temp_dir = TempDir::new().unwrap();
325
342
  let config_path = temp_dir.path().join("kreuzberg.toml");
@@ -331,18 +348,22 @@ enabled = true
331
348
 
332
349
  fs::write(&config_path, toml_content).unwrap();
333
350
 
351
+ // Create nested subdirectories
334
352
  let level1 = temp_dir.path().join("level1");
335
353
  let level2 = level1.join("level2");
336
354
  let level3 = level2.join("level3");
337
355
  fs::create_dir_all(&level3).unwrap();
338
356
 
357
+ // Change to deepest directory
339
358
  let original_dir = std::env::current_dir().unwrap();
340
359
  if std::env::set_current_dir(&level3).is_err() {
360
+ // Skip this test if we can't change directory
341
361
  return;
342
362
  }
343
363
 
344
364
  let result = ExtractionConfig::discover();
345
365
 
366
+ // Always restore directory even if test fails
346
367
  let _ = std::env::set_current_dir(&original_dir);
347
368
 
348
369
  assert!(result.is_ok(), "Discover should succeed");
@@ -398,6 +419,7 @@ fn test_from_file_with_invalid_values() {
398
419
  let temp_dir = TempDir::new().unwrap();
399
420
  let config_path = temp_dir.path().join("config.toml");
400
421
 
422
+ // Negative values should be rejected during deserialization or validation
401
423
  let toml_content = r#"
402
424
  [chunking]
403
425
  max_chars = -1000
@@ -407,9 +429,11 @@ max_overlap = -100
407
429
  fs::write(&config_path, toml_content).unwrap();
408
430
 
409
431
  let result = ExtractionConfig::from_file(&config_path);
410
- if let Ok(config) = result
411
- && let Some(chunking) = config.chunking
412
- {
413
- assert!(chunking.max_chars > 0, "max_chars should be positive");
432
+ // Should either fail parsing or have clamped values
433
+ if let Ok(config) = result {
434
+ // If it succeeds, values should be reasonable
435
+ if let Some(chunking) = config.chunking {
436
+ assert!(chunking.max_chars > 0, "max_chars should be positive");
437
+ }
414
438
  }
415
439
  }
@@ -11,18 +11,6 @@ use std::fs::{self, File};
11
11
  use std::io::Write;
12
12
  use tempfile::tempdir;
13
13
 
14
- fn trim_trailing_newlines(value: &str) -> &str {
15
- value.trim_end_matches(['\n', '\r'])
16
- }
17
-
18
- fn assert_text_content(actual: &str, expected: &str) {
19
- assert_eq!(
20
- trim_trailing_newlines(actual),
21
- expected,
22
- "Content mismatch after trimming trailing newlines"
23
- );
24
- }
25
-
26
14
  /// Test basic file extraction with MIME detection.
27
15
  #[tokio::test]
28
16
  async fn test_extract_file_basic() {
@@ -37,7 +25,7 @@ async fn test_extract_file_basic() {
37
25
  assert!(result.is_ok(), "Basic file extraction should succeed");
38
26
  let result = result.unwrap();
39
27
 
40
- assert_text_content(&result.content, "Hello, Kreuzberg!");
28
+ assert_eq!(result.content, "Hello, Kreuzberg!");
41
29
  assert_eq!(result.mime_type, "text/plain");
42
30
  assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
43
31
  assert!(result.detected_languages.is_none(), "Language detection not enabled");
@@ -193,12 +181,7 @@ async fn test_batch_extract_bytes_concurrency() {
193
181
 
194
182
  for (i, result) in results.iter().enumerate() {
195
183
  let expected_content = format!("content {}", i + 1);
196
- assert_eq!(
197
- trim_trailing_newlines(&result.content),
198
- expected_content,
199
- "Content mismatch for item {}",
200
- i
201
- );
184
+ assert_eq!(result.content, expected_content, "Content mismatch for item {}", i);
202
185
  assert_eq!(result.mime_type, "text/plain", "MIME type should be text/plain");
203
186
  assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
204
187
  assert!(result.detected_languages.is_none(), "Language detection not enabled");
@@ -218,13 +201,13 @@ fn test_sync_wrappers() {
218
201
  let result = extract_file_sync(&file_path, None, &config);
219
202
  assert!(result.is_ok(), "Sync file extraction should succeed");
220
203
  let extraction = result.unwrap();
221
- assert_text_content(&extraction.content, "sync content");
204
+ assert_eq!(extraction.content, "sync content");
222
205
  assert!(extraction.chunks.is_none(), "Chunks should be None");
223
206
 
224
207
  let result = extract_bytes_sync(b"test bytes", "text/plain", &config);
225
208
  assert!(result.is_ok(), "Sync bytes extraction should succeed");
226
209
  let extraction = result.unwrap();
227
- assert_text_content(&extraction.content, "test bytes");
210
+ assert_eq!(extraction.content, "test bytes");
228
211
  assert!(extraction.chunks.is_none(), "Chunks should be None");
229
212
 
230
213
  let paths = vec![file_path];
@@ -232,7 +215,7 @@ fn test_sync_wrappers() {
232
215
  assert!(results.is_ok(), "Batch sync file should succeed");
233
216
  let results = results.unwrap();
234
217
  assert_eq!(results.len(), 1);
235
- assert_text_content(&results[0].content, "sync content");
218
+ assert_eq!(results[0].content, "sync content");
236
219
  assert!(results[0].chunks.is_none(), "Chunks should be None");
237
220
 
238
221
  let contents = vec![(b"test".as_slice(), "text/plain")];
@@ -240,7 +223,7 @@ fn test_sync_wrappers() {
240
223
  assert!(results.is_ok(), "Batch bytes sync should succeed");
241
224
  let results = results.unwrap();
242
225
  assert_eq!(results.len(), 1);
243
- assert_text_content(&results[0].content, "test");
226
+ assert_eq!(results[0].content, "test");
244
227
  assert!(results[0].chunks.is_none(), "Chunks should be None");
245
228
  }
246
229
 
@@ -432,7 +415,7 @@ async fn test_pipeline_execution() {
432
415
  assert!(result.is_ok(), "Pipeline execution should succeed");
433
416
 
434
417
  let result = result.unwrap();
435
- assert_text_content(&result.content, "pipeline content");
418
+ assert_eq!(result.content, "pipeline content");
436
419
  assert_eq!(result.mime_type, "text/plain");
437
420
  assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
438
421
  assert!(result.detected_languages.is_none(), "Language detection not enabled");