kreuzberg 4.1.1 → 4.2.0

This diff shows the changes between package versions as they were released to one of the supported public registries. It is provided for informational purposes only.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +8 -5
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
  7. data/kreuzberg.gemspec +14 -2
  8. data/lib/kreuzberg/api_proxy.rb +0 -1
  9. data/lib/kreuzberg/cli_proxy.rb +0 -1
  10. data/lib/kreuzberg/config.rb +70 -35
  11. data/lib/kreuzberg/mcp_proxy.rb +0 -1
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/sig/kreuzberg.rbs +5 -1
  14. data/spec/binding/batch_operations_spec.rb +80 -0
  15. data/spec/binding/metadata_types_spec.rb +77 -57
  16. data/spec/serialization_spec.rb +134 -0
  17. data/spec/unit/config/output_format_spec.rb +380 -0
  18. data/vendor/Cargo.toml +1 -1
  19. data/vendor/kreuzberg/Cargo.toml +3 -3
  20. data/vendor/kreuzberg/README.md +1 -1
  21. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  22. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  23. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  24. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  25. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  26. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  27. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  28. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  29. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  30. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  31. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  32. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  33. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  34. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  35. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  36. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  37. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  38. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  39. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  40. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  41. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  42. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  43. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  44. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  45. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  46. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  47. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  48. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  49. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  50. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  51. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  52. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  53. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  54. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  55. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  56. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  57. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  58. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  59. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  60. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  61. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  62. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  63. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  64. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  65. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  67. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  68. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  69. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  70. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  71. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  72. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  73. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  74. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  75. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  76. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  77. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  78. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  79. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  80. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  81. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  82. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  83. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  84. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  85. data/vendor/kreuzberg-tesseract/build.rs +4 -4
  86. data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
  87. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
  88. metadata +13 -2
data/vendor/kreuzberg/tests/serialization_integration.rs (new file)
@@ -0,0 +1,112 @@
+ //! Cross-language serialization integration tests.
+ //!
+ //! These tests validate that ExtractionConfig serializes correctly
+ //! and that the serialized output can be used for cross-language comparison.
+
+ use kreuzberg::core::config::ExtractionConfig;
+
+ #[test]
+ fn test_extraction_config_minimal_serialization() {
+     let config = ExtractionConfig::default();
+     let json = serde_json::to_value(&config).expect("Failed to serialize config");
+
+     // Validate that all expected fields are present
+     assert!(json.get("use_cache").is_some(), "Missing use_cache field");
+     assert!(
+         json.get("enable_quality_processing").is_some(),
+         "Missing enable_quality_processing field"
+     );
+     assert!(json.get("force_ocr").is_some(), "Missing force_ocr field");
+ }
+
+ #[test]
+ fn test_extraction_config_serialization_round_trip() {
+     let original = ExtractionConfig {
+         use_cache: true,
+         enable_quality_processing: false,
+         force_ocr: true,
+         ..Default::default()
+     };
+
+     // Serialize to JSON
+     let json = serde_json::to_value(&original).expect("Failed to serialize");
+
+     // Deserialize back
+     let restored: ExtractionConfig = serde_json::from_value(json).expect("Failed to deserialize");
+
+     // Validate that key fields are preserved
+     assert_eq!(original.use_cache, restored.use_cache, "use_cache field not preserved");
+     assert_eq!(
+         original.enable_quality_processing, restored.enable_quality_processing,
+         "enable_quality_processing field not preserved"
+     );
+     assert_eq!(original.force_ocr, restored.force_ocr, "force_ocr field not preserved");
+ }
+
+ #[test]
+ fn test_extraction_config_nested_serialization() {
+     let config = ExtractionConfig {
+         use_cache: true,
+         enable_quality_processing: true,
+         force_ocr: false,
+         // Note: Nested fields like ocr, chunking, etc. would be set here
+         // This test focuses on the basic serialization structure
+         ..Default::default()
+     };
+
+     let json = serde_json::to_value(&config).expect("Failed to serialize");
+
+     // Ensure it's a proper JSON object
+     assert!(json.is_object(), "Serialized output should be a JSON object");
+
+     // Validate that core fields are present
+     assert!(json.get("use_cache").is_some());
+     assert!(json.get("enable_quality_processing").is_some());
+     assert!(json.get("force_ocr").is_some());
+ }
+
+ #[test]
+ fn test_extraction_config_json_format() {
+     let config = ExtractionConfig::default();
+     let json_string = serde_json::to_string(&config).expect("Failed to serialize to string");
+
+     // Validate that output is valid JSON
+     let parsed: serde_json::Value = serde_json::from_str(&json_string).expect("Invalid JSON output");
+     assert!(parsed.is_object(), "JSON should be an object");
+ }
+
+ #[test]
+ fn test_extraction_config_pretty_print() {
+     let config = ExtractionConfig::default();
+     let pretty_json = serde_json::to_string_pretty(&config).expect("Failed to serialize");
+
+     // Validate that pretty-printed JSON is parseable
+     let _parsed: serde_json::Value = serde_json::from_str(&pretty_json).expect("Invalid pretty-printed JSON");
+
+     // Pretty JSON should have newlines
+     assert!(pretty_json.contains('\n'), "Pretty JSON should have newlines");
+ }
+
+ #[test]
+ fn test_extraction_config_field_consistency() {
+     let configs = vec![
+         ExtractionConfig::default(),
+         ExtractionConfig {
+             use_cache: true,
+             ..Default::default()
+         },
+         ExtractionConfig {
+             enable_quality_processing: false,
+             ..Default::default()
+         },
+     ];
+
+     for config in configs {
+         let json = serde_json::to_value(&config).expect("Failed to serialize");
+
+         // All configs should have the same set of top-level fields
+         assert!(json.get("use_cache").is_some());
+         assert!(json.get("enable_quality_processing").is_some());
+         assert!(json.get("force_ocr").is_some());
+     }
+ }
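
The new test file above exercises plain serde round-tripping of ExtractionConfig. As a minimal sketch of the same pattern outside the test harness (the field names are taken from the diff; everything else is illustrative, not part of the release):

    use kreuzberg::core::config::ExtractionConfig;

    fn main() {
        // Build a config and serialize it to pretty-printed JSON,
        // mirroring test_extraction_config_pretty_print above.
        let config = ExtractionConfig {
            use_cache: true,
            force_ocr: false,
            ..Default::default()
        };
        let json = serde_json::to_string_pretty(&config).expect("Failed to serialize");

        // Round-trip it back; the boolean fields should survive unchanged.
        let restored: ExtractionConfig = serde_json::from_str(&json).expect("Failed to deserialize");
        assert_eq!(config.use_cache, restored.use_cache);
        println!("{json}");
    }

Per the module doc comment, this serialized JSON is presumably the artifact that cross-language checks, such as the new data/spec/serialization_spec.rb on the Ruby side, compare against.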
data/vendor/kreuzberg/tests/stopwords_integration_test.rs
@@ -67,7 +67,7 @@ fn test_stopwords_removed_during_moderate_token_reduction() {
  };

  let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  assert!(!result.contains(" the "), "Should remove 'the'. Result: {}", result);
  assert!(!result.contains(" is "), "Should remove 'is'. Result: {}", result);
@@ -103,7 +103,7 @@ fn test_stopwords_across_reduction_levels() {
  use_simd: false,
  ..Default::default()
  };
- let light_result = reduce_tokens(text, &light_config, Some("en")).unwrap();
+ let light_result = reduce_tokens(text, &light_config, Some("en")).expect("Operation failed");

  let light_stopwords = count_stopwords(&light_result, "en");
  assert!(light_stopwords > 0, "Light reduction should preserve some stopwords");
@@ -113,7 +113,7 @@ fn test_stopwords_across_reduction_levels() {
  use_simd: false,
  ..Default::default()
  };
- let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).unwrap();
+ let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).expect("Operation failed");

  let moderate_stopwords = count_stopwords(&moderate_result, "en");
  assert!(
@@ -128,7 +128,7 @@ fn test_stopwords_across_reduction_levels() {
  use_simd: false,
  ..Default::default()
  };
- let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).unwrap();
+ let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).expect("Operation failed");

  assert!(
  aggressive_result.len() <= moderate_result.len(),
@@ -146,7 +146,7 @@ fn test_stopwords_preserve_semantic_meaning() {

  let input =
  "The artificial intelligence system is processing the natural language text for extracting meaningful insights";
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  let content_words = extract_content_words(&result, "en");

@@ -185,7 +185,7 @@ fn test_stopwords_with_multiple_languages() {
  ..Default::default()
  };
  let en_input = "The computer science program is very comprehensive and includes many courses";
- let en_result = reduce_tokens(en_input, &en_config, Some("en")).unwrap();
+ let en_result = reduce_tokens(en_input, &en_config, Some("en")).expect("Operation failed");

  let en_original_stopwords = count_stopwords(en_input, "en");
  let en_result_stopwords = count_stopwords(&en_result, "en");
@@ -200,7 +200,7 @@ fn test_stopwords_with_multiple_languages() {
  ..Default::default()
  };
  let es_input = "El programa de ciencias de la computación es muy completo y tiene muchos cursos";
- let es_result = reduce_tokens(es_input, &es_config, Some("es")).unwrap();
+ let es_result = reduce_tokens(es_input, &es_config, Some("es")).expect("Operation failed");

  let es_original_stopwords = count_stopwords(es_input, "es");
  let es_result_stopwords = count_stopwords(&es_result, "es");
@@ -221,7 +221,7 @@ fn test_stopwords_with_multiple_languages() {
  ..Default::default()
  };
  let de_input = "Die künstliche Intelligenz ist ein wichtiges Forschungsgebiet der Informatik";
- let de_result = reduce_tokens(de_input, &de_config, Some("de")).unwrap();
+ let de_result = reduce_tokens(de_input, &de_config, Some("de")).expect("Operation failed");

  let de_original_stopwords = count_stopwords(de_input, "de");
  let de_result_stopwords = count_stopwords(&de_result, "de");
@@ -240,7 +240,7 @@ fn test_language_fallback_to_english_stopwords() {
  };

  let input = "The system is processing the data with the algorithm";
- let result = reduce_tokens(input, &config, Some("xyz")).unwrap();
+ let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");

  let original_stopwords = count_stopwords(input, "en");
  let result_stopwords = count_stopwords(&result, "en");
@@ -267,7 +267,7 @@ fn test_custom_stopwords_integration() {
  };

  let input = "The algorithm processes the data in the system efficiently";
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  assert!(
  !result.contains("algorithm"),
@@ -301,7 +301,7 @@ fn test_stopwords_with_chinese_text() {
  };

  let input = "这个人工智能系统可以处理自然语言";
- let result = reduce_tokens(input, &config, Some("zh")).unwrap();
+ let result = reduce_tokens(input, &config, Some("zh")).expect("Operation failed");

  assert!(
  !result.is_empty(),
@@ -325,7 +325,7 @@ fn test_stopwords_with_mixed_cjk_english() {
  };

  let input = "The machine learning model 机器学习模型 is processing data efficiently";
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  assert!(
  !result.contains(" the ") && !result.contains("The "),
@@ -355,7 +355,7 @@ fn test_stopwords_with_japanese_text() {
  };

  let input = "人工知能技術の研究開発";
- let result = reduce_tokens(input, &config, Some("ja")).unwrap();
+ let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");

  assert!(
  !result.is_empty(),
@@ -373,7 +373,7 @@ fn test_stopwords_with_korean_text() {
  };

  let input = "인공 지능 기술 개발";
- let result = reduce_tokens(input, &config, Some("ko")).unwrap();
+ let result = reduce_tokens(input, &config, Some("ko")).expect("Operation failed");

  assert!(
  !result.is_empty(),
@@ -391,7 +391,7 @@ fn test_stopwords_excluded_from_rake_keywords() {

  let config = KeywordConfig::rake().with_language("en").with_max_keywords(10);

- let keywords = extract_keywords(text, &config).unwrap();
+ let keywords = extract_keywords(text, &config).expect("Operation failed");

  assert!(!keywords.is_empty(), "Should extract keywords");

@@ -439,7 +439,7 @@ fn test_stopwords_excluded_from_yake_keywords() {

  let config = KeywordConfig::yake().with_language("en").with_max_keywords(10);

- let keywords = extract_keywords(text, &config).unwrap();
+ let keywords = extract_keywords(text, &config).expect("Operation failed");

  assert!(!keywords.is_empty(), "Should extract keywords");

@@ -472,7 +472,7 @@ fn test_keywords_respect_language_specific_stopwords() {

  let config = KeywordConfig::rake().with_language("es").with_max_keywords(8);

- let keywords = extract_keywords(spanish_text, &config).unwrap();
+ let keywords = extract_keywords(spanish_text, &config).expect("Operation failed");

  assert!(!keywords.is_empty(), "Should extract Spanish keywords");

@@ -516,7 +516,7 @@ fn test_all_stopwords_text_reduction() {
  };

  let input = "the is a an and or but of to in for on at by";
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  assert!(
  result.len() < input.len(),
@@ -533,7 +533,7 @@ fn test_no_stopwords_text_reduction() {
  };

  let input = "PyTorch TensorFlow CUDA GPU optimization benchmark performance metrics";
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  let input_words: Vec<&str> = input.split_whitespace().collect();
  let result_lower = result.to_lowercase();
@@ -558,7 +558,7 @@ fn test_mixed_case_stopwords_removal() {
  };

  let input = "The SYSTEM Is Processing The DATA With The ALGORITHM";
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  let result_words: Vec<&str> = result.split_whitespace().collect();
  assert!(
@@ -594,7 +594,7 @@ fn test_reduce_tokens_function_with_stopwords() {
  };

  let text = "The artificial intelligence system processes the natural language efficiently";
- let result = reduce_tokens(text, &config, Some("en")).unwrap();
+ let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");

  let original_stopwords = count_stopwords(text, "en");
  let result_stopwords = count_stopwords(&result, "en");
@@ -622,7 +622,7 @@ fn test_stopwords_with_punctuation() {
  };

  let input = "The system, which is processing the data, uses the algorithm.";
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  assert!(
  !result.contains(" the ") || result.split_whitespace().filter(|w| w.contains("the")).count() < 3,
@@ -646,7 +646,7 @@ fn test_stopwords_with_numbers() {
  };

  let input = "The model has 100 layers and processes the data in 10 seconds";
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  assert!(
  result.contains("100"),
@@ -672,9 +672,9 @@ fn test_stopwords_removal_consistency_across_calls() {

  let input = "The machine learning model is trained on the dataset";

- let result1 = reduce_tokens(input, &config, Some("en")).unwrap();
- let result2 = reduce_tokens(input, &config, Some("en")).unwrap();
- let result3 = reduce_tokens(input, &config, Some("en")).unwrap();
+ let result1 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+ let result2 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+ let result3 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

  assert_eq!(result1, result2, "Results should be consistent across calls");
  assert_eq!(result2, result3, "Results should be consistent across calls");
@@ -694,7 +694,7 @@ fn test_stopwords_with_long_text() {
  The system processes the data efficiently and achieves the best performance. ";
  let input = paragraph.repeat(10);

- let result = reduce_tokens(&input, &config, Some("en")).unwrap();
+ let result = reduce_tokens(&input, &config, Some("en")).expect("Operation failed");

  assert!(
  result.len() < input.len(),
@@ -719,9 +719,9 @@ fn test_get_stopwords_with_fallback_in_reduction() {
  let primary_stopwords = get_stopwords_with_fallback("xyz", "en");
  assert!(primary_stopwords.is_some(), "Should fallback to English");

- let en_stopwords = get_stopwords("en").unwrap();
+ let en_stopwords = get_stopwords("en").expect("Operation failed");
  assert_eq!(
- primary_stopwords.unwrap().len(),
+ primary_stopwords.expect("Operation failed").len(),
  en_stopwords.len(),
  "Fallback should return English stopwords"
  );
@@ -733,7 +733,7 @@ fn test_get_stopwords_with_fallback_in_reduction() {
  };

  let input = "The system is processing the data";
- let result = reduce_tokens(input, &config, Some("xyz")).unwrap();
+ let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");

  assert!(
  !result.contains(" the ") && !result.contains(" is "),
@@ -789,7 +789,7 @@ fn test_token_reduction_handles_multibyte_utf8() {
  };

  let input = "品質管理は重要です。🚀 高速抽出と漢字処理が求められています。";
- let result = reduce_tokens(input, &config, Some("ja")).unwrap();
+ let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");

  assert!(
  result.contains("品質管理") || result.contains("漢字処理"),
@@ -814,7 +814,7 @@ fn test_token_reduction_concurrent_access() {
  for _ in 0..8 {
  let cfg = Arc::clone(&config);
  scope.spawn(move || {
- let reduced = reduce_tokens(input, &cfg, Some("en")).unwrap();
+ let reduced = reduce_tokens(input, &cfg, Some("en")).expect("Operation failed");
  assert!(!reduced.is_empty());
  });
  }
@@ -831,7 +831,7 @@ fn demo_stopwords_effectiveness() {
  use_simd: false,
  ..Default::default()
  };
- let en_result = reduce_tokens(en_text, &en_config, Some("en")).unwrap();
+ let en_result = reduce_tokens(en_text, &en_config, Some("en")).expect("Operation failed");

  println!("\n=== English Example ===");
  println!("BEFORE: {} chars", en_text.len());
@@ -849,7 +849,7 @@ fn demo_stopwords_effectiveness() {
  use_simd: false,
  ..Default::default()
  };
- let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).unwrap();
+ let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).expect("Operation failed");

  println!("\n=== Chinese Example ===");
  println!("BEFORE: {}", zh_text);
@@ -870,7 +870,7 @@ fn demo_stopwords_effectiveness() {
  use_simd: false,
  ..Default::default()
  };
- let result = reduce_tokens(text, &config, Some("en")).unwrap();
+ let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
  println!(
  "{:?}: {} chars -> {} chars ({}% reduction)",
  level,
@@ -881,7 +881,7 @@ fn demo_stopwords_effectiveness() {
  println!(" {}", result);
  }

- let stopwords = get_stopwords("en").unwrap();
+ let stopwords = get_stopwords("en").expect("Operation failed");
  println!("\n=== Stopwords Stats ===");
  println!("English stopwords: {}", stopwords.len());
  println!("Sample stopwords: {:?}", stopwords.iter().take(10).collect::<Vec<_>>());
data/vendor/kreuzberg/tests/test_fastembed.rs
@@ -26,7 +26,7 @@ async fn test_fastembed_embedding_generation() {
  let result = model.embed(texts.clone(), None);
  assert!(result.is_ok(), "Failed to generate embeddings: {:?}", result.err());

- let embeddings = result.unwrap();
+ let embeddings = result.expect("Operation failed");
  assert_eq!(embeddings.len(), 3, "Expected 3 embeddings");

  for (i, embedding) in embeddings.iter().enumerate() {
@@ -64,7 +64,7 @@ async fn test_fastembed_batch_processing() {

  assert!(result.is_ok(), "Batch embedding failed: {:?}", result.err());

- let embeddings = result.unwrap();
+ let embeddings = result.expect("Operation failed");
  assert_eq!(embeddings.len(), 50, "Expected 50 embeddings");

  println!(
@@ -96,7 +96,7 @@ async fn test_fastembed_different_models() {
  let result = m.embed(test_text.clone(), None);
  assert!(result.is_ok(), "Failed to generate embedding for {}", description);

- let embeddings = result.unwrap();
+ let embeddings = result.expect("Operation failed");
  assert_eq!(embeddings.len(), 1);
  assert_eq!(
  embeddings[0].len(),
@@ -197,7 +197,7 @@ async fn test_generate_embeddings_for_chunks_basic() {
  for (i, chunk) in chunks.iter().enumerate() {
  assert!(chunk.embedding.is_some(), "Chunk {} missing embedding", i);

- let embedding = chunk.embedding.as_ref().unwrap();
+ let embedding = chunk.embedding.as_ref().expect("Operation failed");
  assert_eq!(embedding.len(), 384, "Chunk {} has wrong embedding dimensions", i);

  let sum: f32 = embedding.iter().sum();
@@ -269,8 +269,8 @@ async fn test_generate_embeddings_for_chunks_normalization() {

  generate_embeddings_for_chunks(&mut chunks_norm, &config_norm).expect("Failed to generate normalized embeddings");

- let embedding_no_norm = chunks_no_norm[0].embedding.as_ref().unwrap();
- let embedding_norm = chunks_norm[0].embedding.as_ref().unwrap();
+ let embedding_no_norm = chunks_no_norm[0].embedding.as_ref().expect("Operation failed");
+ let embedding_norm = chunks_norm[0].embedding.as_ref().expect("Operation failed");

  let magnitude_no_norm: f32 = embedding_no_norm.iter().map(|x| x * x).sum::<f32>().sqrt();
  let magnitude_norm: f32 = embedding_norm.iter().map(|x| x * x).sum::<f32>().sqrt();
@@ -560,7 +560,7 @@ async fn test_generate_embeddings_for_chunks_batch_size() {
  i
  );
  assert_eq!(
- chunk.embedding.as_ref().unwrap().len(),
+ chunk.embedding.as_ref().expect("Operation failed").len(),
  384,
  "Chunk {} has wrong dimensions",
  i
@@ -612,7 +612,7 @@ async fn test_generate_embeddings_chunking_integration() {
  for (i, chunk) in chunking_result.chunks.iter().enumerate() {
  assert!(chunk.embedding.is_some(), "Chunk {} missing embedding", i);

- let embedding = chunk.embedding.as_ref().unwrap();
+ let embedding = chunk.embedding.as_ref().expect("Operation failed");
  assert_eq!(embedding.len(), 384, "Chunk {} has wrong embedding dimensions", i);

  let magnitude: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
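
The normalization assertions in these tests hinge on the L2 magnitude of the embedding vector. Restated as a self-contained check (the 384-dimension figure comes from the tests; the helper name is ours):

    /// A normalized embedding should have approximately unit L2 magnitude.
    fn l2_magnitude(embedding: &[f32]) -> f32 {
        embedding.iter().map(|x| x * x).sum::<f32>().sqrt()
    }

    fn main() {
        let normalized = vec![0.6_f32, 0.8]; // stand-in for a 384-dim embedding
        assert!((l2_magnitude(&normalized) - 1.0).abs() < 1e-6);
    }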
data/vendor/kreuzberg/tests/typst_extractor_tests.rs
@@ -56,7 +56,7 @@ async fn test_simple_typst_document_extraction() {
  return;
  }

- let extraction = result.unwrap();
+ let extraction = result.expect("Operation failed");

  assert_eq!(extraction.mime_type, "text/x-typst", "MIME type should be preserved");

@@ -145,7 +145,7 @@ async fn test_minimal_typst_document_extraction() {
  return;
  }

- let extraction = result.unwrap();
+ let extraction = result.expect("Operation failed");

  assert!(
  !extraction.content.is_empty(),
@@ -189,7 +189,7 @@ async fn test_heading_hierarchy_extraction() {
  return;
  }

- let extraction = result.unwrap();
+ let extraction = result.expect("Operation failed");

  assert!(!extraction.content.is_empty(), "Document should extract content");

@@ -269,7 +269,7 @@ async fn test_metadata_extraction() {
  return;
  }

- let extraction = result.unwrap();
+ let extraction = result.expect("Operation failed");

  if let Some(title) = extraction.metadata.additional.get("title") {
  assert!(
@@ -330,7 +330,7 @@ async fn test_advanced_typst_document_extraction() {
  return;
  }

- let extraction = result.unwrap();
+ let extraction = result.expect("Operation failed");

  assert!(
  extraction.metadata.additional.contains_key("title"),
@@ -411,7 +411,7 @@ async fn test_typst_reader_extraction() {
  return;
  }

- let extraction = result.unwrap();
+ let extraction = result.expect("Operation failed");

  assert!(
  !extraction.content.is_empty(),
@@ -454,7 +454,7 @@ async fn test_undergradmath_extraction() {
  return;
  }

- let extraction = result.unwrap();
+ let extraction = result.expect("Operation failed");

  assert!(
  !extraction.content.is_empty(),
@@ -534,7 +534,7 @@ async fn test_formatting_preservation() {
  return;
  }

- let extraction = result.unwrap();
+ let extraction = result.expect("Operation failed");

  assert!(
  extraction.content.contains("*") || extraction.content.contains("bold"),
@@ -576,7 +576,7 @@ async fn test_large_document_extraction() {
  return;
  }

- let extraction = result.unwrap();
+ let extraction = result.expect("Operation failed");

  assert!(
  !extraction.content.is_empty(),
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs
@@ -7,9 +7,9 @@ use kreuzberg::extraction::excel::read_excel_file;
  fn test_xlsx_full_metadata_extraction() {
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
  .parent()
- .unwrap()
+ .expect("Operation failed")
  .parent()
- .unwrap();
+ .expect("Operation failed");
  let test_file = workspace_root.join("test_documents/office/excel.xlsx");

  if !test_file.exists() {
@@ -17,7 +17,8 @@ fn test_xlsx_full_metadata_extraction() {
  return;
  }

- let result = read_excel_file(test_file.to_str().unwrap()).expect("Should extract XLSX successfully");
+ let file_path = test_file.to_str().expect("File path should be valid UTF-8");
+ let result = read_excel_file(file_path).expect("Should extract XLSX successfully");

  assert!(!result.sheets.is_empty(), "Should have at least one sheet");

@@ -34,9 +35,9 @@
  fn test_xlsx_multi_sheet_metadata() {
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
  .parent()
- .unwrap()
+ .expect("Operation failed")
  .parent()
- .unwrap();
+ .expect("Operation failed");
  let test_file = workspace_root.join("test_documents/spreadsheets/excel_multi_sheet.xlsx");

  if !test_file.exists() {
@@ -44,7 +45,8 @@ fn test_xlsx_multi_sheet_metadata() {
  return;
  }

- let result = read_excel_file(test_file.to_str().unwrap()).expect("Should extract multi-sheet XLSX successfully");
+ let file_path = test_file.to_str().expect("File path should be valid UTF-8");
+ let result = read_excel_file(file_path).expect("Should extract multi-sheet XLSX successfully");

  assert!(
  result.sheets.len() > 1,
@@ -65,9 +67,9 @@ fn test_xlsx_multi_sheet_metadata() {
  fn test_xlsx_minimal_metadata_extraction() {
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
  .parent()
- .unwrap()
+ .expect("Operation failed")
  .parent()
- .unwrap();
+ .expect("Operation failed");
  let test_file = workspace_root.join("test_documents/spreadsheets/test_01.xlsx");

  if !test_file.exists() {
@@ -75,7 +77,8 @@ fn test_xlsx_minimal_metadata_extraction() {
  return;
  }

- let result = read_excel_file(test_file.to_str().unwrap()).expect("Should extract XLSX successfully");
+ let file_path = test_file.to_str().expect("File path should be valid UTF-8");
+ let result = read_excel_file(file_path).expect("Should extract XLSX successfully");

  assert!(!result.sheets.is_empty(), "Content should not be empty");
  assert!(
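
Beyond the expect() sweep, the xlsx changes bind the path conversion to a named file_path variable with a descriptive message. The conversion is fallible in the first place because Path wraps an OS string that need not be valid UTF-8; a minimal restatement (path is illustrative):

    use std::path::Path;

    fn main() {
        let test_file = Path::new("test_documents/office/excel.xlsx");
        // Path::to_str returns Option<&str>: None when the path is not valid UTF-8.
        let file_path = test_file.to_str().expect("File path should be valid UTF-8");
        println!("{file_path}");
    }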
data/vendor/kreuzberg-tesseract/Cargo.toml
@@ -1,6 +1,6 @@
  [package]
  name = "kreuzberg-tesseract"
- version = "4.1.1"
+ version = "4.2.0"
  edition = "2024"
  rust-version = "1.91"
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -21,10 +21,10 @@ libc = { workspace = true }
  thiserror = { workspace = true }

  [dev-dependencies]
- image = { workspace = true }
+ image = { workspace = true, features = ["png"] }

  [build-dependencies]
- cc = { version = "^1.2.53", optional = true }
+ cc = { version = "^1.2.54", optional = true }
  cmake = { version = "0.1.57", optional = true }
  zip = { version = "7.2.0", optional = true }

data/vendor/kreuzberg-tesseract/build.rs
@@ -38,7 +38,7 @@ mod build_tesseract {
  return None;
  }
  }
- Some(path.join("tesseract-rs-cache"))
+ Some(path.join("kreuzberg-tesseract-cache"))
  }

  fn get_preferred_out_dir() -> PathBuf {
@@ -63,14 +63,14 @@ mod build_tesseract {
  PathBuf::from(home_dir)
  .join("Library")
  .join("Application Support")
- .join("tesseract-rs")
+ .join("kreuzberg-tesseract")
  } else if cfg!(target_os = "linux") {
  let home_dir = env::var("HOME").unwrap_or_else(|_| {
  env::var("USER")
  .map(|user| format!("/home/{}", user))
  .expect("Neither HOME nor USER environment variable set")
  });
- PathBuf::from(home_dir).join(".tesseract-rs")
+ PathBuf::from(home_dir).join(".kreuzberg-tesseract")
  } else {
  panic!("Unsupported operating system");
  }
@@ -117,7 +117,7 @@ mod build_tesseract {
  "cargo:warning=Failed to create cache dir {:?}: {}. Falling back to temp dir.",
  preferred, err
  );
- let fallback = env::temp_dir().join("tesseract-rs-cache");
+ let fallback = env::temp_dir().join("kreuzberg-tesseract-cache");
  fs::create_dir_all(&fallback).expect("Failed to create fallback cache directory in temp dir");
  fallback
  }
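
Taken together, the build.rs hunks rename every cache location from the upstream tesseract-rs names to kreuzberg-tesseract ones. A condensed sketch of the resolution order these hunks imply (directory names are from the diff; control flow is simplified, and the Windows branch, which the hunks do not show, is omitted):

    use std::{env, fs, path::PathBuf};

    fn cache_dir() -> PathBuf {
        let preferred = if cfg!(target_os = "macos") {
            PathBuf::from(env::var("HOME").expect("HOME not set"))
                .join("Library")
                .join("Application Support")
                .join("kreuzberg-tesseract")
        } else if cfg!(target_os = "linux") {
            // The real script falls back to /home/$USER when HOME is unset.
            PathBuf::from(env::var("HOME").expect("HOME not set")).join(".kreuzberg-tesseract")
        } else {
            panic!("Unsupported operating system");
        };
        // On mkdir failure the build script warns and retries in a temp-dir cache.
        match fs::create_dir_all(&preferred) {
            Ok(()) => preferred,
            Err(_) => {
                let fallback = env::temp_dir().join("kreuzberg-tesseract-cache");
                fs::create_dir_all(&fallback).expect("Failed to create fallback cache directory in temp dir");
                fallback
            }
        }
    }

    fn main() {
        println!("cache dir: {:?}", cache_dir());
    }

One practical consequence of the rename: existing tesseract-rs caches are no longer picked up, so the first build after upgrading repopulates the new kreuzberg-tesseract directories.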