kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -74,7 +74,7 @@ async fn test_rtf_accent_extraction() {
74
74
  let result = extract_file(&path, Some("application/rtf"), &config).await;
75
75
 
76
76
  assert!(result.is_ok(), "RTF extraction should succeed for accent.rtf");
77
- let extraction = result.unwrap();
77
+ let extraction = result.expect("Operation failed");
78
78
 
79
79
  assert_eq!(extraction.mime_type, "application/rtf");
80
80
 
@@ -112,7 +112,7 @@ async fn test_rtf_bookmark_extraction() {
112
112
  let result = extract_file(&path, Some("application/rtf"), &config).await;
113
113
 
114
114
  assert!(result.is_ok(), "RTF extraction should succeed for bookmark.rtf");
115
- let extraction = result.unwrap();
115
+ let extraction = result.expect("Operation failed");
116
116
 
117
117
  let content = extraction.content.to_lowercase();
118
118
 
@@ -137,7 +137,7 @@ async fn test_rtf_footnote_extraction() {
137
137
  let result = extract_file(&path, Some("application/rtf"), &config).await;
138
138
 
139
139
  assert!(result.is_ok(), "RTF extraction should succeed for footnote.rtf");
140
- let extraction = result.unwrap();
140
+ let extraction = result.expect("Operation failed");
141
141
 
142
142
  assert!(!extraction.content.is_empty(), "Content should not be empty");
143
143
 
@@ -176,7 +176,7 @@ async fn test_rtf_formatting_extraction() {
176
176
  let result = extract_file(&path, Some("application/rtf"), &config).await;
177
177
 
178
178
  assert!(result.is_ok(), "RTF extraction should succeed for formatting.rtf");
179
- let extraction = result.unwrap();
179
+ let extraction = result.expect("Operation failed");
180
180
 
181
181
  assert!(!extraction.content.is_empty(), "Content should not be empty");
182
182
 
@@ -223,7 +223,7 @@ async fn test_rtf_heading_extraction() {
223
223
  let result = extract_file(&path, Some("application/rtf"), &config).await;
224
224
 
225
225
  assert!(result.is_ok(), "RTF extraction should succeed for heading.rtf");
226
- let extraction = result.unwrap();
226
+ let extraction = result.expect("Operation failed");
227
227
 
228
228
  assert!(!extraction.content.is_empty(), "Content should not be empty");
229
229
 
@@ -269,7 +269,7 @@ async fn test_rtf_image_extraction() {
269
269
  let result = extract_file(&path, Some("application/rtf"), &config).await;
270
270
 
271
271
  assert!(result.is_ok(), "RTF extraction should succeed for image.rtf");
272
- let extraction = result.unwrap();
272
+ let extraction = result.expect("Operation failed");
273
273
 
274
274
  assert!(!extraction.content.is_empty(), "Content should not be empty");
275
275
 
@@ -301,7 +301,7 @@ async fn test_rtf_link_extraction() {
301
301
  let result = extract_file(&path, Some("application/rtf"), &config).await;
302
302
 
303
303
  assert!(result.is_ok(), "RTF extraction should succeed for link.rtf");
304
- let extraction = result.unwrap();
304
+ let extraction = result.expect("Operation failed");
305
305
 
306
306
  assert!(!extraction.content.is_empty(), "Content should not be empty");
307
307
 
@@ -328,7 +328,7 @@ async fn test_rtf_list_complex_extraction() {
328
328
  let result = extract_file(&path, Some("application/rtf"), &config).await;
329
329
 
330
330
  assert!(result.is_ok(), "RTF extraction should succeed for list_complex.rtf");
331
- let extraction = result.unwrap();
331
+ let extraction = result.expect("Operation failed");
332
332
 
333
333
  assert!(!extraction.content.is_empty(), "Content should not be empty");
334
334
 
@@ -381,7 +381,7 @@ async fn test_rtf_list_simple_extraction() {
381
381
  let result = extract_file(&path, Some("application/rtf"), &config).await;
382
382
 
383
383
  assert!(result.is_ok(), "RTF extraction should succeed for list_simple.rtf");
384
- let extraction = result.unwrap();
384
+ let extraction = result.expect("Operation failed");
385
385
 
386
386
  assert!(!extraction.content.is_empty(), "Content should not be empty");
387
387
 
@@ -422,7 +422,7 @@ async fn test_rtf_table_error_codes_extraction() {
422
422
  result.is_ok(),
423
423
  "RTF extraction should succeed for table_error_codes.rtf"
424
424
  );
425
- let extraction = result.unwrap();
425
+ let extraction = result.expect("Operation failed");
426
426
 
427
427
  assert!(
428
428
  extraction.mime_type == "application/rtf",
@@ -448,7 +448,7 @@ async fn test_rtf_table_simple_extraction() {
448
448
  let result = extract_file(&path, Some("application/rtf"), &config).await;
449
449
 
450
450
  assert!(result.is_ok(), "RTF extraction should succeed for table_simple.rtf");
451
- let extraction = result.unwrap();
451
+ let extraction = result.expect("Operation failed");
452
452
 
453
453
  assert!(
454
454
  extraction.mime_type == "application/rtf",
@@ -470,7 +470,7 @@ async fn test_rtf_unicode_extraction() {
470
470
  let result = extract_file(&path, Some("application/rtf"), &config).await;
471
471
 
472
472
  assert!(result.is_ok(), "RTF extraction should succeed for unicode.rtf");
473
- let extraction = result.unwrap();
473
+ let extraction = result.expect("Operation failed");
474
474
 
475
475
  assert!(!extraction.content.is_empty(), "Content should not be empty");
476
476
 
@@ -493,8 +493,8 @@ async fn test_rtf_extraction_deterministic_unicode() {
493
493
 
494
494
  assert!(result1.is_ok() && result2.is_ok(), "Both extractions should succeed");
495
495
 
496
- let extraction1 = result1.unwrap();
497
- let extraction2 = result2.unwrap();
496
+ let extraction1 = result1.expect("Operation failed");
497
+ let extraction2 = result2.expect("Operation failed");
498
498
 
499
499
  assert_eq!(
500
500
  extraction1.content, extraction2.content,
@@ -514,8 +514,8 @@ async fn test_rtf_extraction_deterministic_list_complex() {
514
514
 
515
515
  assert!(result1.is_ok() && result2.is_ok(), "Both extractions should succeed");
516
516
 
517
- let extraction1 = result1.unwrap();
518
- let extraction2 = result2.unwrap();
517
+ let extraction1 = result1.expect("Operation failed");
518
+ let extraction2 = result2.expect("Operation failed");
519
519
 
520
520
  assert_eq!(
521
521
  extraction1.content, extraction2.content,
@@ -551,7 +551,7 @@ async fn test_rtf_no_critical_content_loss() {
551
551
  filename
552
552
  );
553
553
 
554
- let extraction = result.unwrap();
554
+ let extraction = result.expect("Operation failed");
555
555
  assert!(
556
556
  !extraction.content.is_empty(),
557
557
  "FAIL: CRITICAL - Extracted 0 bytes from {}. RTF extractor lost all content.",
@@ -582,7 +582,7 @@ async fn test_rtf_mime_type_preservation() {
582
582
 
583
583
  assert!(result.is_ok(), "Extraction should succeed for {}", filename);
584
584
 
585
- let extraction = result.unwrap();
585
+ let extraction = result.expect("Operation failed");
586
586
  assert_eq!(
587
587
  extraction.mime_type, "application/rtf",
588
588
  "FAIL: MIME type not preserved for {}",
@@ -31,11 +31,11 @@ fn test_archive_zip_bomb_detection() {
31
31
  let mut zip = ZipWriter::new(&mut cursor);
32
32
  let options = FileOptions::<'_, ()>::default();
33
33
 
34
- zip.start_file("large.txt", options).unwrap();
34
+ zip.start_file("large.txt", options).expect("Operation failed");
35
35
  let zeros = vec![0u8; 10 * 1024 * 1024];
36
- zip.write_all(&zeros).unwrap();
36
+ zip.write_all(&zeros).expect("Operation failed");
37
37
 
38
- zip.finish().unwrap();
38
+ zip.finish().expect("Operation failed");
39
39
  }
40
40
 
41
41
  let bytes = cursor.into_inner();
@@ -57,10 +57,10 @@ fn test_archive_path_traversal_zip() {
57
57
  let mut zip = ZipWriter::new(&mut cursor);
58
58
  let options = FileOptions::<'_, ()>::default();
59
59
 
60
- zip.start_file("../../etc/passwd", options).unwrap();
61
- zip.write_all(b"malicious content").unwrap();
60
+ zip.start_file("../../etc/passwd", options).expect("Operation failed");
61
+ zip.write_all(b"malicious content").expect("Operation failed");
62
62
 
63
- zip.finish().unwrap();
63
+ zip.finish().expect("Operation failed");
64
64
  }
65
65
 
66
66
  let bytes = cursor.into_inner();
@@ -97,10 +97,10 @@ fn test_archive_absolute_paths_rejected() {
97
97
  let mut zip = ZipWriter::new(&mut cursor);
98
98
  let options = FileOptions::<'_, ()>::default();
99
99
 
100
- zip.start_file("/tmp/malicious.txt", options).unwrap();
101
- zip.write_all(b"malicious content").unwrap();
100
+ zip.start_file("/tmp/malicious.txt", options).expect("Operation failed");
101
+ zip.write_all(b"malicious content").expect("Operation failed");
102
102
 
103
- zip.finish().unwrap();
103
+ zip.finish().expect("Operation failed");
104
104
  }
105
105
 
106
106
  let bytes = cursor.into_inner();
@@ -125,10 +125,10 @@ fn test_archive_deeply_nested_directories() {
125
125
  let deep_path = (0..100).map(|i| format!("dir{}", i)).collect::<Vec<_>>().join("/");
126
126
  let file_path = format!("{}/file.txt", deep_path);
127
127
 
128
- zip.start_file(&file_path, options).unwrap();
129
- zip.write_all(b"deep content").unwrap();
128
+ zip.start_file(&file_path, options).expect("Operation failed");
129
+ zip.write_all(b"deep content").expect("Operation failed");
130
130
 
131
- zip.finish().unwrap();
131
+ zip.finish().expect("Operation failed");
132
132
  }
133
133
 
134
134
  let bytes = cursor.into_inner();
@@ -149,11 +149,12 @@ fn test_archive_many_small_files() {
149
149
  let options = FileOptions::<'_, ()>::default();
150
150
 
151
151
  for i in 0..1000 {
152
- zip.start_file(format!("file{}.txt", i), options).unwrap();
153
- zip.write_all(b"small content").unwrap();
152
+ zip.start_file(format!("file{}.txt", i), options)
153
+ .expect("Operation failed");
154
+ zip.write_all(b"small content").expect("Operation failed");
154
155
  }
155
156
 
156
- zip.finish().unwrap();
157
+ zip.finish().expect("Operation failed");
157
158
  }
158
159
 
159
160
  let bytes = cursor.into_inner();
@@ -404,13 +405,13 @@ fn test_security_directory_instead_of_file() {
404
405
 
405
406
  #[test]
406
407
  fn test_security_special_file_handling() {
407
- let mut tmpfile = NamedTempFile::new().unwrap();
408
- tmpfile.write_all(b"test content").unwrap();
409
- tmpfile.flush().unwrap();
408
+ let mut tmpfile = NamedTempFile::new().expect("Operation failed");
409
+ tmpfile.write_all(b"test content").expect("Operation failed");
410
+ tmpfile.flush().expect("Operation failed");
410
411
  let path = tmpfile.path();
411
412
 
412
413
  let config = ExtractionConfig::default();
413
- let result = extract_file_sync(path.to_str().unwrap(), None, &config);
414
+ let result = extract_file_sync(path.to_str().expect("Operation failed"), None, &config);
414
415
 
415
416
  assert!(result.is_ok() || result.is_err());
416
417
  }
@@ -0,0 +1,112 @@
1
+ //! Cross-language serialization integration tests.
2
+ //!
3
+ //! These tests validate that ExtractionConfig serializes correctly
4
+ //! and that the serialized output can be used for cross-language comparison.
5
+
6
+ use kreuzberg::core::config::ExtractionConfig;
7
+
8
+ #[test]
9
+ fn test_extraction_config_minimal_serialization() {
10
+ let config = ExtractionConfig::default();
11
+ let json = serde_json::to_value(&config).expect("Failed to serialize config");
12
+
13
+ // Validate that all expected fields are present
14
+ assert!(json.get("use_cache").is_some(), "Missing use_cache field");
15
+ assert!(
16
+ json.get("enable_quality_processing").is_some(),
17
+ "Missing enable_quality_processing field"
18
+ );
19
+ assert!(json.get("force_ocr").is_some(), "Missing force_ocr field");
20
+ }
21
+
22
+ #[test]
23
+ fn test_extraction_config_serialization_round_trip() {
24
+ let original = ExtractionConfig {
25
+ use_cache: true,
26
+ enable_quality_processing: false,
27
+ force_ocr: true,
28
+ ..Default::default()
29
+ };
30
+
31
+ // Serialize to JSON
32
+ let json = serde_json::to_value(&original).expect("Failed to serialize");
33
+
34
+ // Deserialize back
35
+ let restored: ExtractionConfig = serde_json::from_value(json).expect("Failed to deserialize");
36
+
37
+ // Validate that key fields are preserved
38
+ assert_eq!(original.use_cache, restored.use_cache, "use_cache field not preserved");
39
+ assert_eq!(
40
+ original.enable_quality_processing, restored.enable_quality_processing,
41
+ "enable_quality_processing field not preserved"
42
+ );
43
+ assert_eq!(original.force_ocr, restored.force_ocr, "force_ocr field not preserved");
44
+ }
45
+
46
+ #[test]
47
+ fn test_extraction_config_nested_serialization() {
48
+ let config = ExtractionConfig {
49
+ use_cache: true,
50
+ enable_quality_processing: true,
51
+ force_ocr: false,
52
+ // Note: Nested fields like ocr, chunking, etc. would be set here
53
+ // This test focuses on the basic serialization structure
54
+ ..Default::default()
55
+ };
56
+
57
+ let json = serde_json::to_value(&config).expect("Failed to serialize");
58
+
59
+ // Ensure it's a proper JSON object
60
+ assert!(json.is_object(), "Serialized output should be a JSON object");
61
+
62
+ // Validate that core fields are present
63
+ assert!(json.get("use_cache").is_some());
64
+ assert!(json.get("enable_quality_processing").is_some());
65
+ assert!(json.get("force_ocr").is_some());
66
+ }
67
+
68
+ #[test]
69
+ fn test_extraction_config_json_format() {
70
+ let config = ExtractionConfig::default();
71
+ let json_string = serde_json::to_string(&config).expect("Failed to serialize to string");
72
+
73
+ // Validate that output is valid JSON
74
+ let parsed: serde_json::Value = serde_json::from_str(&json_string).expect("Invalid JSON output");
75
+ assert!(parsed.is_object(), "JSON should be an object");
76
+ }
77
+
78
+ #[test]
79
+ fn test_extraction_config_pretty_print() {
80
+ let config = ExtractionConfig::default();
81
+ let pretty_json = serde_json::to_string_pretty(&config).expect("Failed to serialize");
82
+
83
+ // Validate that pretty-printed JSON is parseable
84
+ let _parsed: serde_json::Value = serde_json::from_str(&pretty_json).expect("Invalid pretty-printed JSON");
85
+
86
+ // Pretty JSON should have newlines
87
+ assert!(pretty_json.contains('\n'), "Pretty JSON should have newlines");
88
+ }
89
+
90
+ #[test]
91
+ fn test_extraction_config_field_consistency() {
92
+ let configs = vec![
93
+ ExtractionConfig::default(),
94
+ ExtractionConfig {
95
+ use_cache: true,
96
+ ..Default::default()
97
+ },
98
+ ExtractionConfig {
99
+ enable_quality_processing: false,
100
+ ..Default::default()
101
+ },
102
+ ];
103
+
104
+ for config in configs {
105
+ let json = serde_json::to_value(&config).expect("Failed to serialize");
106
+
107
+ // All configs should have the same set of top-level fields
108
+ assert!(json.get("use_cache").is_some());
109
+ assert!(json.get("enable_quality_processing").is_some());
110
+ assert!(json.get("force_ocr").is_some());
111
+ }
112
+ }
@@ -67,7 +67,7 @@ fn test_stopwords_removed_during_moderate_token_reduction() {
67
67
  };
68
68
 
69
69
  let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
70
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
70
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
71
71
 
72
72
  assert!(!result.contains(" the "), "Should remove 'the'. Result: {}", result);
73
73
  assert!(!result.contains(" is "), "Should remove 'is'. Result: {}", result);
@@ -103,7 +103,7 @@ fn test_stopwords_across_reduction_levels() {
103
103
  use_simd: false,
104
104
  ..Default::default()
105
105
  };
106
- let light_result = reduce_tokens(text, &light_config, Some("en")).unwrap();
106
+ let light_result = reduce_tokens(text, &light_config, Some("en")).expect("Operation failed");
107
107
 
108
108
  let light_stopwords = count_stopwords(&light_result, "en");
109
109
  assert!(light_stopwords > 0, "Light reduction should preserve some stopwords");
@@ -113,7 +113,7 @@ fn test_stopwords_across_reduction_levels() {
113
113
  use_simd: false,
114
114
  ..Default::default()
115
115
  };
116
- let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).unwrap();
116
+ let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).expect("Operation failed");
117
117
 
118
118
  let moderate_stopwords = count_stopwords(&moderate_result, "en");
119
119
  assert!(
@@ -128,7 +128,7 @@ fn test_stopwords_across_reduction_levels() {
128
128
  use_simd: false,
129
129
  ..Default::default()
130
130
  };
131
- let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).unwrap();
131
+ let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).expect("Operation failed");
132
132
 
133
133
  assert!(
134
134
  aggressive_result.len() <= moderate_result.len(),
@@ -146,7 +146,7 @@ fn test_stopwords_preserve_semantic_meaning() {
146
146
 
147
147
  let input =
148
148
  "The artificial intelligence system is processing the natural language text for extracting meaningful insights";
149
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
149
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
150
150
 
151
151
  let content_words = extract_content_words(&result, "en");
152
152
 
@@ -185,7 +185,7 @@ fn test_stopwords_with_multiple_languages() {
185
185
  ..Default::default()
186
186
  };
187
187
  let en_input = "The computer science program is very comprehensive and includes many courses";
188
- let en_result = reduce_tokens(en_input, &en_config, Some("en")).unwrap();
188
+ let en_result = reduce_tokens(en_input, &en_config, Some("en")).expect("Operation failed");
189
189
 
190
190
  let en_original_stopwords = count_stopwords(en_input, "en");
191
191
  let en_result_stopwords = count_stopwords(&en_result, "en");
@@ -200,7 +200,7 @@ fn test_stopwords_with_multiple_languages() {
200
200
  ..Default::default()
201
201
  };
202
202
  let es_input = "El programa de ciencias de la computación es muy completo y tiene muchos cursos";
203
- let es_result = reduce_tokens(es_input, &es_config, Some("es")).unwrap();
203
+ let es_result = reduce_tokens(es_input, &es_config, Some("es")).expect("Operation failed");
204
204
 
205
205
  let es_original_stopwords = count_stopwords(es_input, "es");
206
206
  let es_result_stopwords = count_stopwords(&es_result, "es");
@@ -221,7 +221,7 @@ fn test_stopwords_with_multiple_languages() {
221
221
  ..Default::default()
222
222
  };
223
223
  let de_input = "Die künstliche Intelligenz ist ein wichtiges Forschungsgebiet der Informatik";
224
- let de_result = reduce_tokens(de_input, &de_config, Some("de")).unwrap();
224
+ let de_result = reduce_tokens(de_input, &de_config, Some("de")).expect("Operation failed");
225
225
 
226
226
  let de_original_stopwords = count_stopwords(de_input, "de");
227
227
  let de_result_stopwords = count_stopwords(&de_result, "de");
@@ -240,7 +240,7 @@ fn test_language_fallback_to_english_stopwords() {
240
240
  };
241
241
 
242
242
  let input = "The system is processing the data with the algorithm";
243
- let result = reduce_tokens(input, &config, Some("xyz")).unwrap();
243
+ let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
244
244
 
245
245
  let original_stopwords = count_stopwords(input, "en");
246
246
  let result_stopwords = count_stopwords(&result, "en");
@@ -267,7 +267,7 @@ fn test_custom_stopwords_integration() {
267
267
  };
268
268
 
269
269
  let input = "The algorithm processes the data in the system efficiently";
270
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
270
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
271
271
 
272
272
  assert!(
273
273
  !result.contains("algorithm"),
@@ -301,7 +301,7 @@ fn test_stopwords_with_chinese_text() {
301
301
  };
302
302
 
303
303
  let input = "这个人工智能系统可以处理自然语言";
304
- let result = reduce_tokens(input, &config, Some("zh")).unwrap();
304
+ let result = reduce_tokens(input, &config, Some("zh")).expect("Operation failed");
305
305
 
306
306
  assert!(
307
307
  !result.is_empty(),
@@ -325,7 +325,7 @@ fn test_stopwords_with_mixed_cjk_english() {
325
325
  };
326
326
 
327
327
  let input = "The machine learning model 机器学习模型 is processing data efficiently";
328
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
328
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
329
329
 
330
330
  assert!(
331
331
  !result.contains(" the ") && !result.contains("The "),
@@ -355,7 +355,7 @@ fn test_stopwords_with_japanese_text() {
355
355
  };
356
356
 
357
357
  let input = "人工知能技術の研究開発";
358
- let result = reduce_tokens(input, &config, Some("ja")).unwrap();
358
+ let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
359
359
 
360
360
  assert!(
361
361
  !result.is_empty(),
@@ -373,7 +373,7 @@ fn test_stopwords_with_korean_text() {
373
373
  };
374
374
 
375
375
  let input = "인공 지능 기술 개발";
376
- let result = reduce_tokens(input, &config, Some("ko")).unwrap();
376
+ let result = reduce_tokens(input, &config, Some("ko")).expect("Operation failed");
377
377
 
378
378
  assert!(
379
379
  !result.is_empty(),
@@ -391,7 +391,7 @@ fn test_stopwords_excluded_from_rake_keywords() {
391
391
 
392
392
  let config = KeywordConfig::rake().with_language("en").with_max_keywords(10);
393
393
 
394
- let keywords = extract_keywords(text, &config).unwrap();
394
+ let keywords = extract_keywords(text, &config).expect("Operation failed");
395
395
 
396
396
  assert!(!keywords.is_empty(), "Should extract keywords");
397
397
 
@@ -439,7 +439,7 @@ fn test_stopwords_excluded_from_yake_keywords() {
439
439
 
440
440
  let config = KeywordConfig::yake().with_language("en").with_max_keywords(10);
441
441
 
442
- let keywords = extract_keywords(text, &config).unwrap();
442
+ let keywords = extract_keywords(text, &config).expect("Operation failed");
443
443
 
444
444
  assert!(!keywords.is_empty(), "Should extract keywords");
445
445
 
@@ -472,7 +472,7 @@ fn test_keywords_respect_language_specific_stopwords() {
472
472
 
473
473
  let config = KeywordConfig::rake().with_language("es").with_max_keywords(8);
474
474
 
475
- let keywords = extract_keywords(spanish_text, &config).unwrap();
475
+ let keywords = extract_keywords(spanish_text, &config).expect("Operation failed");
476
476
 
477
477
  assert!(!keywords.is_empty(), "Should extract Spanish keywords");
478
478
 
@@ -516,7 +516,7 @@ fn test_all_stopwords_text_reduction() {
516
516
  };
517
517
 
518
518
  let input = "the is a an and or but of to in for on at by";
519
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
519
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
520
520
 
521
521
  assert!(
522
522
  result.len() < input.len(),
@@ -533,7 +533,7 @@ fn test_no_stopwords_text_reduction() {
533
533
  };
534
534
 
535
535
  let input = "PyTorch TensorFlow CUDA GPU optimization benchmark performance metrics";
536
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
536
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
537
537
 
538
538
  let input_words: Vec<&str> = input.split_whitespace().collect();
539
539
  let result_lower = result.to_lowercase();
@@ -558,7 +558,7 @@ fn test_mixed_case_stopwords_removal() {
558
558
  };
559
559
 
560
560
  let input = "The SYSTEM Is Processing The DATA With The ALGORITHM";
561
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
561
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
562
562
 
563
563
  let result_words: Vec<&str> = result.split_whitespace().collect();
564
564
  assert!(
@@ -594,7 +594,7 @@ fn test_reduce_tokens_function_with_stopwords() {
594
594
  };
595
595
 
596
596
  let text = "The artificial intelligence system processes the natural language efficiently";
597
- let result = reduce_tokens(text, &config, Some("en")).unwrap();
597
+ let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
598
598
 
599
599
  let original_stopwords = count_stopwords(text, "en");
600
600
  let result_stopwords = count_stopwords(&result, "en");
@@ -622,7 +622,7 @@ fn test_stopwords_with_punctuation() {
622
622
  };
623
623
 
624
624
  let input = "The system, which is processing the data, uses the algorithm.";
625
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
625
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
626
626
 
627
627
  assert!(
628
628
  !result.contains(" the ") || result.split_whitespace().filter(|w| w.contains("the")).count() < 3,
@@ -646,7 +646,7 @@ fn test_stopwords_with_numbers() {
646
646
  };
647
647
 
648
648
  let input = "The model has 100 layers and processes the data in 10 seconds";
649
- let result = reduce_tokens(input, &config, Some("en")).unwrap();
649
+ let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
650
650
 
651
651
  assert!(
652
652
  result.contains("100"),
@@ -672,9 +672,9 @@ fn test_stopwords_removal_consistency_across_calls() {
672
672
 
673
673
  let input = "The machine learning model is trained on the dataset";
674
674
 
675
- let result1 = reduce_tokens(input, &config, Some("en")).unwrap();
676
- let result2 = reduce_tokens(input, &config, Some("en")).unwrap();
677
- let result3 = reduce_tokens(input, &config, Some("en")).unwrap();
675
+ let result1 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
676
+ let result2 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
677
+ let result3 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
678
678
 
679
679
  assert_eq!(result1, result2, "Results should be consistent across calls");
680
680
  assert_eq!(result2, result3, "Results should be consistent across calls");
@@ -694,7 +694,7 @@ fn test_stopwords_with_long_text() {
694
694
  The system processes the data efficiently and achieves the best performance. ";
695
695
  let input = paragraph.repeat(10);
696
696
 
697
- let result = reduce_tokens(&input, &config, Some("en")).unwrap();
697
+ let result = reduce_tokens(&input, &config, Some("en")).expect("Operation failed");
698
698
 
699
699
  assert!(
700
700
  result.len() < input.len(),
@@ -719,9 +719,9 @@ fn test_get_stopwords_with_fallback_in_reduction() {
719
719
  let primary_stopwords = get_stopwords_with_fallback("xyz", "en");
720
720
  assert!(primary_stopwords.is_some(), "Should fallback to English");
721
721
 
722
- let en_stopwords = get_stopwords("en").unwrap();
722
+ let en_stopwords = get_stopwords("en").expect("Operation failed");
723
723
  assert_eq!(
724
- primary_stopwords.unwrap().len(),
724
+ primary_stopwords.expect("Operation failed").len(),
725
725
  en_stopwords.len(),
726
726
  "Fallback should return English stopwords"
727
727
  );
@@ -733,7 +733,7 @@ fn test_get_stopwords_with_fallback_in_reduction() {
733
733
  };
734
734
 
735
735
  let input = "The system is processing the data";
736
- let result = reduce_tokens(input, &config, Some("xyz")).unwrap();
736
+ let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
737
737
 
738
738
  assert!(
739
739
  !result.contains(" the ") && !result.contains(" is "),
@@ -789,7 +789,7 @@ fn test_token_reduction_handles_multibyte_utf8() {
789
789
  };
790
790
 
791
791
  let input = "品質管理は重要です。🚀 高速抽出と漢字処理が求められています。";
792
- let result = reduce_tokens(input, &config, Some("ja")).unwrap();
792
+ let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
793
793
 
794
794
  assert!(
795
795
  result.contains("品質管理") || result.contains("漢字処理"),
@@ -814,7 +814,7 @@ fn test_token_reduction_concurrent_access() {
814
814
  for _ in 0..8 {
815
815
  let cfg = Arc::clone(&config);
816
816
  scope.spawn(move || {
817
- let reduced = reduce_tokens(input, &cfg, Some("en")).unwrap();
817
+ let reduced = reduce_tokens(input, &cfg, Some("en")).expect("Operation failed");
818
818
  assert!(!reduced.is_empty());
819
819
  });
820
820
  }
@@ -831,7 +831,7 @@ fn demo_stopwords_effectiveness() {
831
831
  use_simd: false,
832
832
  ..Default::default()
833
833
  };
834
- let en_result = reduce_tokens(en_text, &en_config, Some("en")).unwrap();
834
+ let en_result = reduce_tokens(en_text, &en_config, Some("en")).expect("Operation failed");
835
835
 
836
836
  println!("\n=== English Example ===");
837
837
  println!("BEFORE: {} chars", en_text.len());
@@ -849,7 +849,7 @@ fn demo_stopwords_effectiveness() {
849
849
  use_simd: false,
850
850
  ..Default::default()
851
851
  };
852
- let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).unwrap();
852
+ let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).expect("Operation failed");
853
853
 
854
854
  println!("\n=== Chinese Example ===");
855
855
  println!("BEFORE: {}", zh_text);
@@ -870,7 +870,7 @@ fn demo_stopwords_effectiveness() {
870
870
  use_simd: false,
871
871
  ..Default::default()
872
872
  };
873
- let result = reduce_tokens(text, &config, Some("en")).unwrap();
873
+ let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
874
874
  println!(
875
875
  "{:?}: {} chars -> {} chars ({}% reduction)",
876
876
  level,
@@ -881,7 +881,7 @@ fn demo_stopwords_effectiveness() {
881
881
  println!(" {}", result);
882
882
  }
883
883
 
884
- let stopwords = get_stopwords("en").unwrap();
884
+ let stopwords = get_stopwords("en").expect("Operation failed");
885
885
  println!("\n=== Stopwords Stats ===");
886
886
  println!("English stopwords: {}", stopwords.len());
887
887
  println!("Sample stopwords: {:?}", stopwords.iter().take(10).collect::<Vec<_>>());