kreuzberg 4.1.1 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +8 -5
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
- data/kreuzberg.gemspec +14 -2
- data/lib/kreuzberg/api_proxy.rb +0 -1
- data/lib/kreuzberg/cli_proxy.rb +0 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/mcp_proxy.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- data/vendor/kreuzberg-tesseract/build.rs +4 -4
- data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
- metadata +13 -2
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
//! Cross-language serialization integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! These tests validate that ExtractionConfig serializes correctly
|
|
4
|
+
//! and that the serialized output can be used for cross-language comparison.
|
|
5
|
+
|
|
6
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
+
|
|
8
|
+
#[test]
|
|
9
|
+
fn test_extraction_config_minimal_serialization() {
|
|
10
|
+
let config = ExtractionConfig::default();
|
|
11
|
+
let json = serde_json::to_value(&config).expect("Failed to serialize config");
|
|
12
|
+
|
|
13
|
+
// Validate that all expected fields are present
|
|
14
|
+
assert!(json.get("use_cache").is_some(), "Missing use_cache field");
|
|
15
|
+
assert!(
|
|
16
|
+
json.get("enable_quality_processing").is_some(),
|
|
17
|
+
"Missing enable_quality_processing field"
|
|
18
|
+
);
|
|
19
|
+
assert!(json.get("force_ocr").is_some(), "Missing force_ocr field");
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
#[test]
|
|
23
|
+
fn test_extraction_config_serialization_round_trip() {
|
|
24
|
+
let original = ExtractionConfig {
|
|
25
|
+
use_cache: true,
|
|
26
|
+
enable_quality_processing: false,
|
|
27
|
+
force_ocr: true,
|
|
28
|
+
..Default::default()
|
|
29
|
+
};
|
|
30
|
+
|
|
31
|
+
// Serialize to JSON
|
|
32
|
+
let json = serde_json::to_value(&original).expect("Failed to serialize");
|
|
33
|
+
|
|
34
|
+
// Deserialize back
|
|
35
|
+
let restored: ExtractionConfig = serde_json::from_value(json).expect("Failed to deserialize");
|
|
36
|
+
|
|
37
|
+
// Validate that key fields are preserved
|
|
38
|
+
assert_eq!(original.use_cache, restored.use_cache, "use_cache field not preserved");
|
|
39
|
+
assert_eq!(
|
|
40
|
+
original.enable_quality_processing, restored.enable_quality_processing,
|
|
41
|
+
"enable_quality_processing field not preserved"
|
|
42
|
+
);
|
|
43
|
+
assert_eq!(original.force_ocr, restored.force_ocr, "force_ocr field not preserved");
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
#[test]
|
|
47
|
+
fn test_extraction_config_nested_serialization() {
|
|
48
|
+
let config = ExtractionConfig {
|
|
49
|
+
use_cache: true,
|
|
50
|
+
enable_quality_processing: true,
|
|
51
|
+
force_ocr: false,
|
|
52
|
+
// Note: Nested fields like ocr, chunking, etc. would be set here
|
|
53
|
+
// This test focuses on the basic serialization structure
|
|
54
|
+
..Default::default()
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
let json = serde_json::to_value(&config).expect("Failed to serialize");
|
|
58
|
+
|
|
59
|
+
// Ensure it's a proper JSON object
|
|
60
|
+
assert!(json.is_object(), "Serialized output should be a JSON object");
|
|
61
|
+
|
|
62
|
+
// Validate that core fields are present
|
|
63
|
+
assert!(json.get("use_cache").is_some());
|
|
64
|
+
assert!(json.get("enable_quality_processing").is_some());
|
|
65
|
+
assert!(json.get("force_ocr").is_some());
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
#[test]
|
|
69
|
+
fn test_extraction_config_json_format() {
|
|
70
|
+
let config = ExtractionConfig::default();
|
|
71
|
+
let json_string = serde_json::to_string(&config).expect("Failed to serialize to string");
|
|
72
|
+
|
|
73
|
+
// Validate that output is valid JSON
|
|
74
|
+
let parsed: serde_json::Value = serde_json::from_str(&json_string).expect("Invalid JSON output");
|
|
75
|
+
assert!(parsed.is_object(), "JSON should be an object");
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
#[test]
|
|
79
|
+
fn test_extraction_config_pretty_print() {
|
|
80
|
+
let config = ExtractionConfig::default();
|
|
81
|
+
let pretty_json = serde_json::to_string_pretty(&config).expect("Failed to serialize");
|
|
82
|
+
|
|
83
|
+
// Validate that pretty-printed JSON is parseable
|
|
84
|
+
let _parsed: serde_json::Value = serde_json::from_str(&pretty_json).expect("Invalid pretty-printed JSON");
|
|
85
|
+
|
|
86
|
+
// Pretty JSON should have newlines
|
|
87
|
+
assert!(pretty_json.contains('\n'), "Pretty JSON should have newlines");
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
#[test]
|
|
91
|
+
fn test_extraction_config_field_consistency() {
|
|
92
|
+
let configs = vec![
|
|
93
|
+
ExtractionConfig::default(),
|
|
94
|
+
ExtractionConfig {
|
|
95
|
+
use_cache: true,
|
|
96
|
+
..Default::default()
|
|
97
|
+
},
|
|
98
|
+
ExtractionConfig {
|
|
99
|
+
enable_quality_processing: false,
|
|
100
|
+
..Default::default()
|
|
101
|
+
},
|
|
102
|
+
];
|
|
103
|
+
|
|
104
|
+
for config in configs {
|
|
105
|
+
let json = serde_json::to_value(&config).expect("Failed to serialize");
|
|
106
|
+
|
|
107
|
+
// All configs should have the same set of top-level fields
|
|
108
|
+
assert!(json.get("use_cache").is_some());
|
|
109
|
+
assert!(json.get("enable_quality_processing").is_some());
|
|
110
|
+
assert!(json.get("force_ocr").is_some());
|
|
111
|
+
}
|
|
112
|
+
}
|
|
@@ -67,7 +67,7 @@ fn test_stopwords_removed_during_moderate_token_reduction() {
|
|
|
67
67
|
};
|
|
68
68
|
|
|
69
69
|
let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
|
|
70
|
-
let result = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
70
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
71
71
|
|
|
72
72
|
assert!(!result.contains(" the "), "Should remove 'the'. Result: {}", result);
|
|
73
73
|
assert!(!result.contains(" is "), "Should remove 'is'. Result: {}", result);
|
|
@@ -103,7 +103,7 @@ fn test_stopwords_across_reduction_levels() {
|
|
|
103
103
|
use_simd: false,
|
|
104
104
|
..Default::default()
|
|
105
105
|
};
|
|
106
|
-
let light_result = reduce_tokens(text, &light_config, Some("en")).unwrap();
|
|
106
|
+
let light_result = reduce_tokens(text, &light_config, Some("en")).expect("Operation failed");
|
|
107
107
|
|
|
108
108
|
let light_stopwords = count_stopwords(&light_result, "en");
|
|
109
109
|
assert!(light_stopwords > 0, "Light reduction should preserve some stopwords");
|
|
@@ -113,7 +113,7 @@ fn test_stopwords_across_reduction_levels() {
|
|
|
113
113
|
use_simd: false,
|
|
114
114
|
..Default::default()
|
|
115
115
|
};
|
|
116
|
-
let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).unwrap();
|
|
116
|
+
let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).expect("Operation failed");
|
|
117
117
|
|
|
118
118
|
let moderate_stopwords = count_stopwords(&moderate_result, "en");
|
|
119
119
|
assert!(
|
|
@@ -128,7 +128,7 @@ fn test_stopwords_across_reduction_levels() {
|
|
|
128
128
|
use_simd: false,
|
|
129
129
|
..Default::default()
|
|
130
130
|
};
|
|
131
|
-
let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).unwrap();
|
|
131
|
+
let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).expect("Operation failed");
|
|
132
132
|
|
|
133
133
|
assert!(
|
|
134
134
|
aggressive_result.len() <= moderate_result.len(),
|
|
@@ -146,7 +146,7 @@ fn test_stopwords_preserve_semantic_meaning() {
|
|
|
146
146
|
|
|
147
147
|
let input =
|
|
148
148
|
"The artificial intelligence system is processing the natural language text for extracting meaningful insights";
|
|
149
|
-
let result = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
149
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
150
150
|
|
|
151
151
|
let content_words = extract_content_words(&result, "en");
|
|
152
152
|
|
|
@@ -185,7 +185,7 @@ fn test_stopwords_with_multiple_languages() {
|
|
|
185
185
|
..Default::default()
|
|
186
186
|
};
|
|
187
187
|
let en_input = "The computer science program is very comprehensive and includes many courses";
|
|
188
|
-
let en_result = reduce_tokens(en_input, &en_config, Some("en")).unwrap();
|
|
188
|
+
let en_result = reduce_tokens(en_input, &en_config, Some("en")).expect("Operation failed");
|
|
189
189
|
|
|
190
190
|
let en_original_stopwords = count_stopwords(en_input, "en");
|
|
191
191
|
let en_result_stopwords = count_stopwords(&en_result, "en");
|
|
@@ -200,7 +200,7 @@ fn test_stopwords_with_multiple_languages() {
|
|
|
200
200
|
..Default::default()
|
|
201
201
|
};
|
|
202
202
|
let es_input = "El programa de ciencias de la computación es muy completo y tiene muchos cursos";
|
|
203
|
-
let es_result = reduce_tokens(es_input, &es_config, Some("es")).unwrap();
|
|
203
|
+
let es_result = reduce_tokens(es_input, &es_config, Some("es")).expect("Operation failed");
|
|
204
204
|
|
|
205
205
|
let es_original_stopwords = count_stopwords(es_input, "es");
|
|
206
206
|
let es_result_stopwords = count_stopwords(&es_result, "es");
|
|
@@ -221,7 +221,7 @@ fn test_stopwords_with_multiple_languages() {
|
|
|
221
221
|
..Default::default()
|
|
222
222
|
};
|
|
223
223
|
let de_input = "Die künstliche Intelligenz ist ein wichtiges Forschungsgebiet der Informatik";
|
|
224
|
-
let de_result = reduce_tokens(de_input, &de_config, Some("de")).unwrap();
|
|
224
|
+
let de_result = reduce_tokens(de_input, &de_config, Some("de")).expect("Operation failed");
|
|
225
225
|
|
|
226
226
|
let de_original_stopwords = count_stopwords(de_input, "de");
|
|
227
227
|
let de_result_stopwords = count_stopwords(&de_result, "de");
|
|
@@ -240,7 +240,7 @@ fn test_language_fallback_to_english_stopwords() {
|
|
|
240
240
|
};
|
|
241
241
|
|
|
242
242
|
let input = "The system is processing the data with the algorithm";
|
|
243
|
-
let result = reduce_tokens(input, &config, Some("xyz")).unwrap();
|
|
243
|
+
let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
|
|
244
244
|
|
|
245
245
|
let original_stopwords = count_stopwords(input, "en");
|
|
246
246
|
let result_stopwords = count_stopwords(&result, "en");
|
|
@@ -267,7 +267,7 @@ fn test_custom_stopwords_integration() {
|
|
|
267
267
|
};
|
|
268
268
|
|
|
269
269
|
let input = "The algorithm processes the data in the system efficiently";
|
|
270
|
-
let result = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
270
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
271
271
|
|
|
272
272
|
assert!(
|
|
273
273
|
!result.contains("algorithm"),
|
|
@@ -301,7 +301,7 @@ fn test_stopwords_with_chinese_text() {
|
|
|
301
301
|
};
|
|
302
302
|
|
|
303
303
|
let input = "这个人工智能系统可以处理自然语言";
|
|
304
|
-
let result = reduce_tokens(input, &config, Some("zh")).unwrap();
|
|
304
|
+
let result = reduce_tokens(input, &config, Some("zh")).expect("Operation failed");
|
|
305
305
|
|
|
306
306
|
assert!(
|
|
307
307
|
!result.is_empty(),
|
|
@@ -325,7 +325,7 @@ fn test_stopwords_with_mixed_cjk_english() {
|
|
|
325
325
|
};
|
|
326
326
|
|
|
327
327
|
let input = "The machine learning model 机器学习模型 is processing data efficiently";
|
|
328
|
-
let result = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
328
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
329
329
|
|
|
330
330
|
assert!(
|
|
331
331
|
!result.contains(" the ") && !result.contains("The "),
|
|
@@ -355,7 +355,7 @@ fn test_stopwords_with_japanese_text() {
|
|
|
355
355
|
};
|
|
356
356
|
|
|
357
357
|
let input = "人工知能技術の研究開発";
|
|
358
|
-
let result = reduce_tokens(input, &config, Some("ja")).unwrap();
|
|
358
|
+
let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
|
|
359
359
|
|
|
360
360
|
assert!(
|
|
361
361
|
!result.is_empty(),
|
|
@@ -373,7 +373,7 @@ fn test_stopwords_with_korean_text() {
|
|
|
373
373
|
};
|
|
374
374
|
|
|
375
375
|
let input = "인공 지능 기술 개발";
|
|
376
|
-
let result = reduce_tokens(input, &config, Some("ko")).unwrap();
|
|
376
|
+
let result = reduce_tokens(input, &config, Some("ko")).expect("Operation failed");
|
|
377
377
|
|
|
378
378
|
assert!(
|
|
379
379
|
!result.is_empty(),
|
|
@@ -391,7 +391,7 @@ fn test_stopwords_excluded_from_rake_keywords() {
|
|
|
391
391
|
|
|
392
392
|
let config = KeywordConfig::rake().with_language("en").with_max_keywords(10);
|
|
393
393
|
|
|
394
|
-
let keywords = extract_keywords(text, &config).unwrap();
|
|
394
|
+
let keywords = extract_keywords(text, &config).expect("Operation failed");
|
|
395
395
|
|
|
396
396
|
assert!(!keywords.is_empty(), "Should extract keywords");
|
|
397
397
|
|
|
@@ -439,7 +439,7 @@ fn test_stopwords_excluded_from_yake_keywords() {
|
|
|
439
439
|
|
|
440
440
|
let config = KeywordConfig::yake().with_language("en").with_max_keywords(10);
|
|
441
441
|
|
|
442
|
-
let keywords = extract_keywords(text, &config).unwrap();
|
|
442
|
+
let keywords = extract_keywords(text, &config).expect("Operation failed");
|
|
443
443
|
|
|
444
444
|
assert!(!keywords.is_empty(), "Should extract keywords");
|
|
445
445
|
|
|
@@ -472,7 +472,7 @@ fn test_keywords_respect_language_specific_stopwords() {
|
|
|
472
472
|
|
|
473
473
|
let config = KeywordConfig::rake().with_language("es").with_max_keywords(8);
|
|
474
474
|
|
|
475
|
-
let keywords = extract_keywords(spanish_text, &config).unwrap();
|
|
475
|
+
let keywords = extract_keywords(spanish_text, &config).expect("Operation failed");
|
|
476
476
|
|
|
477
477
|
assert!(!keywords.is_empty(), "Should extract Spanish keywords");
|
|
478
478
|
|
|
@@ -516,7 +516,7 @@ fn test_all_stopwords_text_reduction() {
|
|
|
516
516
|
};
|
|
517
517
|
|
|
518
518
|
let input = "the is a an and or but of to in for on at by";
|
|
519
|
-
let result = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
519
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
520
520
|
|
|
521
521
|
assert!(
|
|
522
522
|
result.len() < input.len(),
|
|
@@ -533,7 +533,7 @@ fn test_no_stopwords_text_reduction() {
|
|
|
533
533
|
};
|
|
534
534
|
|
|
535
535
|
let input = "PyTorch TensorFlow CUDA GPU optimization benchmark performance metrics";
|
|
536
|
-
let result = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
536
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
537
537
|
|
|
538
538
|
let input_words: Vec<&str> = input.split_whitespace().collect();
|
|
539
539
|
let result_lower = result.to_lowercase();
|
|
@@ -558,7 +558,7 @@ fn test_mixed_case_stopwords_removal() {
|
|
|
558
558
|
};
|
|
559
559
|
|
|
560
560
|
let input = "The SYSTEM Is Processing The DATA With The ALGORITHM";
|
|
561
|
-
let result = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
561
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
562
562
|
|
|
563
563
|
let result_words: Vec<&str> = result.split_whitespace().collect();
|
|
564
564
|
assert!(
|
|
@@ -594,7 +594,7 @@ fn test_reduce_tokens_function_with_stopwords() {
|
|
|
594
594
|
};
|
|
595
595
|
|
|
596
596
|
let text = "The artificial intelligence system processes the natural language efficiently";
|
|
597
|
-
let result = reduce_tokens(text, &config, Some("en")).unwrap();
|
|
597
|
+
let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
|
|
598
598
|
|
|
599
599
|
let original_stopwords = count_stopwords(text, "en");
|
|
600
600
|
let result_stopwords = count_stopwords(&result, "en");
|
|
@@ -622,7 +622,7 @@ fn test_stopwords_with_punctuation() {
|
|
|
622
622
|
};
|
|
623
623
|
|
|
624
624
|
let input = "The system, which is processing the data, uses the algorithm.";
|
|
625
|
-
let result = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
625
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
626
626
|
|
|
627
627
|
assert!(
|
|
628
628
|
!result.contains(" the ") || result.split_whitespace().filter(|w| w.contains("the")).count() < 3,
|
|
@@ -646,7 +646,7 @@ fn test_stopwords_with_numbers() {
|
|
|
646
646
|
};
|
|
647
647
|
|
|
648
648
|
let input = "The model has 100 layers and processes the data in 10 seconds";
|
|
649
|
-
let result = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
649
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
650
650
|
|
|
651
651
|
assert!(
|
|
652
652
|
result.contains("100"),
|
|
@@ -672,9 +672,9 @@ fn test_stopwords_removal_consistency_across_calls() {
|
|
|
672
672
|
|
|
673
673
|
let input = "The machine learning model is trained on the dataset";
|
|
674
674
|
|
|
675
|
-
let result1 = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
676
|
-
let result2 = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
677
|
-
let result3 = reduce_tokens(input, &config, Some("en")).unwrap();
|
|
675
|
+
let result1 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
676
|
+
let result2 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
677
|
+
let result3 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
678
678
|
|
|
679
679
|
assert_eq!(result1, result2, "Results should be consistent across calls");
|
|
680
680
|
assert_eq!(result2, result3, "Results should be consistent across calls");
|
|
@@ -694,7 +694,7 @@ fn test_stopwords_with_long_text() {
|
|
|
694
694
|
The system processes the data efficiently and achieves the best performance. ";
|
|
695
695
|
let input = paragraph.repeat(10);
|
|
696
696
|
|
|
697
|
-
let result = reduce_tokens(&input, &config, Some("en")).unwrap();
|
|
697
|
+
let result = reduce_tokens(&input, &config, Some("en")).expect("Operation failed");
|
|
698
698
|
|
|
699
699
|
assert!(
|
|
700
700
|
result.len() < input.len(),
|
|
@@ -719,9 +719,9 @@ fn test_get_stopwords_with_fallback_in_reduction() {
|
|
|
719
719
|
let primary_stopwords = get_stopwords_with_fallback("xyz", "en");
|
|
720
720
|
assert!(primary_stopwords.is_some(), "Should fallback to English");
|
|
721
721
|
|
|
722
|
-
let en_stopwords = get_stopwords("en").unwrap();
|
|
722
|
+
let en_stopwords = get_stopwords("en").expect("Operation failed");
|
|
723
723
|
assert_eq!(
|
|
724
|
-
primary_stopwords.unwrap().len(),
|
|
724
|
+
primary_stopwords.expect("Operation failed").len(),
|
|
725
725
|
en_stopwords.len(),
|
|
726
726
|
"Fallback should return English stopwords"
|
|
727
727
|
);
|
|
@@ -733,7 +733,7 @@ fn test_get_stopwords_with_fallback_in_reduction() {
|
|
|
733
733
|
};
|
|
734
734
|
|
|
735
735
|
let input = "The system is processing the data";
|
|
736
|
-
let result = reduce_tokens(input, &config, Some("xyz")).unwrap();
|
|
736
|
+
let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
|
|
737
737
|
|
|
738
738
|
assert!(
|
|
739
739
|
!result.contains(" the ") && !result.contains(" is "),
|
|
@@ -789,7 +789,7 @@ fn test_token_reduction_handles_multibyte_utf8() {
|
|
|
789
789
|
};
|
|
790
790
|
|
|
791
791
|
let input = "品質管理は重要です。🚀 高速抽出と漢字処理が求められています。";
|
|
792
|
-
let result = reduce_tokens(input, &config, Some("ja")).unwrap();
|
|
792
|
+
let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
|
|
793
793
|
|
|
794
794
|
assert!(
|
|
795
795
|
result.contains("品質管理") || result.contains("漢字処理"),
|
|
@@ -814,7 +814,7 @@ fn test_token_reduction_concurrent_access() {
|
|
|
814
814
|
for _ in 0..8 {
|
|
815
815
|
let cfg = Arc::clone(&config);
|
|
816
816
|
scope.spawn(move || {
|
|
817
|
-
let reduced = reduce_tokens(input, &cfg, Some("en")).unwrap();
|
|
817
|
+
let reduced = reduce_tokens(input, &cfg, Some("en")).expect("Operation failed");
|
|
818
818
|
assert!(!reduced.is_empty());
|
|
819
819
|
});
|
|
820
820
|
}
|
|
@@ -831,7 +831,7 @@ fn demo_stopwords_effectiveness() {
|
|
|
831
831
|
use_simd: false,
|
|
832
832
|
..Default::default()
|
|
833
833
|
};
|
|
834
|
-
let en_result = reduce_tokens(en_text, &en_config, Some("en")).unwrap();
|
|
834
|
+
let en_result = reduce_tokens(en_text, &en_config, Some("en")).expect("Operation failed");
|
|
835
835
|
|
|
836
836
|
println!("\n=== English Example ===");
|
|
837
837
|
println!("BEFORE: {} chars", en_text.len());
|
|
@@ -849,7 +849,7 @@ fn demo_stopwords_effectiveness() {
|
|
|
849
849
|
use_simd: false,
|
|
850
850
|
..Default::default()
|
|
851
851
|
};
|
|
852
|
-
let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).unwrap();
|
|
852
|
+
let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).expect("Operation failed");
|
|
853
853
|
|
|
854
854
|
println!("\n=== Chinese Example ===");
|
|
855
855
|
println!("BEFORE: {}", zh_text);
|
|
@@ -870,7 +870,7 @@ fn demo_stopwords_effectiveness() {
|
|
|
870
870
|
use_simd: false,
|
|
871
871
|
..Default::default()
|
|
872
872
|
};
|
|
873
|
-
let result = reduce_tokens(text, &config, Some("en")).unwrap();
|
|
873
|
+
let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
|
|
874
874
|
println!(
|
|
875
875
|
"{:?}: {} chars -> {} chars ({}% reduction)",
|
|
876
876
|
level,
|
|
@@ -881,7 +881,7 @@ fn demo_stopwords_effectiveness() {
|
|
|
881
881
|
println!(" {}", result);
|
|
882
882
|
}
|
|
883
883
|
|
|
884
|
-
let stopwords = get_stopwords("en").unwrap();
|
|
884
|
+
let stopwords = get_stopwords("en").expect("Operation failed");
|
|
885
885
|
println!("\n=== Stopwords Stats ===");
|
|
886
886
|
println!("English stopwords: {}", stopwords.len());
|
|
887
887
|
println!("Sample stopwords: {:?}", stopwords.iter().take(10).collect::<Vec<_>>());
|
|
@@ -26,7 +26,7 @@ async fn test_fastembed_embedding_generation() {
|
|
|
26
26
|
let result = model.embed(texts.clone(), None);
|
|
27
27
|
assert!(result.is_ok(), "Failed to generate embeddings: {:?}", result.err());
|
|
28
28
|
|
|
29
|
-
let embeddings = result.unwrap();
|
|
29
|
+
let embeddings = result.expect("Operation failed");
|
|
30
30
|
assert_eq!(embeddings.len(), 3, "Expected 3 embeddings");
|
|
31
31
|
|
|
32
32
|
for (i, embedding) in embeddings.iter().enumerate() {
|
|
@@ -64,7 +64,7 @@ async fn test_fastembed_batch_processing() {
|
|
|
64
64
|
|
|
65
65
|
assert!(result.is_ok(), "Batch embedding failed: {:?}", result.err());
|
|
66
66
|
|
|
67
|
-
let embeddings = result.unwrap();
|
|
67
|
+
let embeddings = result.expect("Operation failed");
|
|
68
68
|
assert_eq!(embeddings.len(), 50, "Expected 50 embeddings");
|
|
69
69
|
|
|
70
70
|
println!(
|
|
@@ -96,7 +96,7 @@ async fn test_fastembed_different_models() {
|
|
|
96
96
|
let result = m.embed(test_text.clone(), None);
|
|
97
97
|
assert!(result.is_ok(), "Failed to generate embedding for {}", description);
|
|
98
98
|
|
|
99
|
-
let embeddings = result.unwrap();
|
|
99
|
+
let embeddings = result.expect("Operation failed");
|
|
100
100
|
assert_eq!(embeddings.len(), 1);
|
|
101
101
|
assert_eq!(
|
|
102
102
|
embeddings[0].len(),
|
|
@@ -197,7 +197,7 @@ async fn test_generate_embeddings_for_chunks_basic() {
|
|
|
197
197
|
for (i, chunk) in chunks.iter().enumerate() {
|
|
198
198
|
assert!(chunk.embedding.is_some(), "Chunk {} missing embedding", i);
|
|
199
199
|
|
|
200
|
-
let embedding = chunk.embedding.as_ref().unwrap();
|
|
200
|
+
let embedding = chunk.embedding.as_ref().expect("Operation failed");
|
|
201
201
|
assert_eq!(embedding.len(), 384, "Chunk {} has wrong embedding dimensions", i);
|
|
202
202
|
|
|
203
203
|
let sum: f32 = embedding.iter().sum();
|
|
@@ -269,8 +269,8 @@ async fn test_generate_embeddings_for_chunks_normalization() {
|
|
|
269
269
|
|
|
270
270
|
generate_embeddings_for_chunks(&mut chunks_norm, &config_norm).expect("Failed to generate normalized embeddings");
|
|
271
271
|
|
|
272
|
-
let embedding_no_norm = chunks_no_norm[0].embedding.as_ref().unwrap();
|
|
273
|
-
let embedding_norm = chunks_norm[0].embedding.as_ref().unwrap();
|
|
272
|
+
let embedding_no_norm = chunks_no_norm[0].embedding.as_ref().expect("Operation failed");
|
|
273
|
+
let embedding_norm = chunks_norm[0].embedding.as_ref().expect("Operation failed");
|
|
274
274
|
|
|
275
275
|
let magnitude_no_norm: f32 = embedding_no_norm.iter().map(|x| x * x).sum::<f32>().sqrt();
|
|
276
276
|
let magnitude_norm: f32 = embedding_norm.iter().map(|x| x * x).sum::<f32>().sqrt();
|
|
@@ -560,7 +560,7 @@ async fn test_generate_embeddings_for_chunks_batch_size() {
|
|
|
560
560
|
i
|
|
561
561
|
);
|
|
562
562
|
assert_eq!(
|
|
563
|
-
chunk.embedding.as_ref().unwrap().len(),
|
|
563
|
+
chunk.embedding.as_ref().expect("Operation failed").len(),
|
|
564
564
|
384,
|
|
565
565
|
"Chunk {} has wrong dimensions",
|
|
566
566
|
i
|
|
@@ -612,7 +612,7 @@ async fn test_generate_embeddings_chunking_integration() {
|
|
|
612
612
|
for (i, chunk) in chunking_result.chunks.iter().enumerate() {
|
|
613
613
|
assert!(chunk.embedding.is_some(), "Chunk {} missing embedding", i);
|
|
614
614
|
|
|
615
|
-
let embedding = chunk.embedding.as_ref().unwrap();
|
|
615
|
+
let embedding = chunk.embedding.as_ref().expect("Operation failed");
|
|
616
616
|
assert_eq!(embedding.len(), 384, "Chunk {} has wrong embedding dimensions", i);
|
|
617
617
|
|
|
618
618
|
let magnitude: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
|
|
@@ -56,7 +56,7 @@ async fn test_simple_typst_document_extraction() {
|
|
|
56
56
|
return;
|
|
57
57
|
}
|
|
58
58
|
|
|
59
|
-
let extraction = result.unwrap();
|
|
59
|
+
let extraction = result.expect("Operation failed");
|
|
60
60
|
|
|
61
61
|
assert_eq!(extraction.mime_type, "text/x-typst", "MIME type should be preserved");
|
|
62
62
|
|
|
@@ -145,7 +145,7 @@ async fn test_minimal_typst_document_extraction() {
|
|
|
145
145
|
return;
|
|
146
146
|
}
|
|
147
147
|
|
|
148
|
-
let extraction = result.unwrap();
|
|
148
|
+
let extraction = result.expect("Operation failed");
|
|
149
149
|
|
|
150
150
|
assert!(
|
|
151
151
|
!extraction.content.is_empty(),
|
|
@@ -189,7 +189,7 @@ async fn test_heading_hierarchy_extraction() {
|
|
|
189
189
|
return;
|
|
190
190
|
}
|
|
191
191
|
|
|
192
|
-
let extraction = result.unwrap();
|
|
192
|
+
let extraction = result.expect("Operation failed");
|
|
193
193
|
|
|
194
194
|
assert!(!extraction.content.is_empty(), "Document should extract content");
|
|
195
195
|
|
|
@@ -269,7 +269,7 @@ async fn test_metadata_extraction() {
|
|
|
269
269
|
return;
|
|
270
270
|
}
|
|
271
271
|
|
|
272
|
-
let extraction = result.unwrap();
|
|
272
|
+
let extraction = result.expect("Operation failed");
|
|
273
273
|
|
|
274
274
|
if let Some(title) = extraction.metadata.additional.get("title") {
|
|
275
275
|
assert!(
|
|
@@ -330,7 +330,7 @@ async fn test_advanced_typst_document_extraction() {
|
|
|
330
330
|
return;
|
|
331
331
|
}
|
|
332
332
|
|
|
333
|
-
let extraction = result.unwrap();
|
|
333
|
+
let extraction = result.expect("Operation failed");
|
|
334
334
|
|
|
335
335
|
assert!(
|
|
336
336
|
extraction.metadata.additional.contains_key("title"),
|
|
@@ -411,7 +411,7 @@ async fn test_typst_reader_extraction() {
|
|
|
411
411
|
return;
|
|
412
412
|
}
|
|
413
413
|
|
|
414
|
-
let extraction = result.unwrap();
|
|
414
|
+
let extraction = result.expect("Operation failed");
|
|
415
415
|
|
|
416
416
|
assert!(
|
|
417
417
|
!extraction.content.is_empty(),
|
|
@@ -454,7 +454,7 @@ async fn test_undergradmath_extraction() {
|
|
|
454
454
|
return;
|
|
455
455
|
}
|
|
456
456
|
|
|
457
|
-
let extraction = result.unwrap();
|
|
457
|
+
let extraction = result.expect("Operation failed");
|
|
458
458
|
|
|
459
459
|
assert!(
|
|
460
460
|
!extraction.content.is_empty(),
|
|
@@ -534,7 +534,7 @@ async fn test_formatting_preservation() {
|
|
|
534
534
|
return;
|
|
535
535
|
}
|
|
536
536
|
|
|
537
|
-
let extraction = result.unwrap();
|
|
537
|
+
let extraction = result.expect("Operation failed");
|
|
538
538
|
|
|
539
539
|
assert!(
|
|
540
540
|
extraction.content.contains("*") || extraction.content.contains("bold"),
|
|
@@ -576,7 +576,7 @@ async fn test_large_document_extraction() {
|
|
|
576
576
|
return;
|
|
577
577
|
}
|
|
578
578
|
|
|
579
|
-
let extraction = result.
|
|
579
|
+
let extraction = result.expect("Operation failed");
|
|
580
580
|
|
|
581
581
|
assert!(
|
|
582
582
|
!extraction.content.is_empty(),
|
|
@@ -7,9 +7,9 @@ use kreuzberg::extraction::excel::read_excel_file;
|
|
|
7
7
|
fn test_xlsx_full_metadata_extraction() {
|
|
8
8
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
9
9
|
.parent()
|
|
10
|
-
.
|
|
10
|
+
.expect("Operation failed")
|
|
11
11
|
.parent()
|
|
12
|
-
.
|
|
12
|
+
.expect("Operation failed");
|
|
13
13
|
let test_file = workspace_root.join("test_documents/office/excel.xlsx");
|
|
14
14
|
|
|
15
15
|
if !test_file.exists() {
|
|
@@ -17,7 +17,8 @@ fn test_xlsx_full_metadata_extraction() {
|
|
|
17
17
|
return;
|
|
18
18
|
}
|
|
19
19
|
|
|
20
|
-
let
|
|
20
|
+
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|
21
|
+
let result = read_excel_file(file_path).expect("Should extract XLSX successfully");
|
|
21
22
|
|
|
22
23
|
assert!(!result.sheets.is_empty(), "Should have at least one sheet");
|
|
23
24
|
|
|
@@ -34,9 +35,9 @@ fn test_xlsx_full_metadata_extraction() {
|
|
|
34
35
|
fn test_xlsx_multi_sheet_metadata() {
|
|
35
36
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
36
37
|
.parent()
|
|
37
|
-
.
|
|
38
|
+
.expect("Operation failed")
|
|
38
39
|
.parent()
|
|
39
|
-
.
|
|
40
|
+
.expect("Operation failed");
|
|
40
41
|
let test_file = workspace_root.join("test_documents/spreadsheets/excel_multi_sheet.xlsx");
|
|
41
42
|
|
|
42
43
|
if !test_file.exists() {
|
|
@@ -44,7 +45,8 @@ fn test_xlsx_multi_sheet_metadata() {
|
|
|
44
45
|
return;
|
|
45
46
|
}
|
|
46
47
|
|
|
47
|
-
let
|
|
48
|
+
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|
49
|
+
let result = read_excel_file(file_path).expect("Should extract multi-sheet XLSX successfully");
|
|
48
50
|
|
|
49
51
|
assert!(
|
|
50
52
|
result.sheets.len() > 1,
|
|
@@ -65,9 +67,9 @@ fn test_xlsx_multi_sheet_metadata() {
|
|
|
65
67
|
fn test_xlsx_minimal_metadata_extraction() {
|
|
66
68
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
67
69
|
.parent()
|
|
68
|
-
.
|
|
70
|
+
.expect("Operation failed")
|
|
69
71
|
.parent()
|
|
70
|
-
.
|
|
72
|
+
.expect("Operation failed");
|
|
71
73
|
let test_file = workspace_root.join("test_documents/spreadsheets/test_01.xlsx");
|
|
72
74
|
|
|
73
75
|
if !test_file.exists() {
|
|
@@ -75,7 +77,8 @@ fn test_xlsx_minimal_metadata_extraction() {
|
|
|
75
77
|
return;
|
|
76
78
|
}
|
|
77
79
|
|
|
78
|
-
let
|
|
80
|
+
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|
81
|
+
let result = read_excel_file(file_path).expect("Should extract XLSX successfully");
|
|
79
82
|
|
|
80
83
|
assert!(!result.sheets.is_empty(), "Content should not be empty");
|
|
81
84
|
assert!(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-tesseract"
|
|
3
|
-
version = "4.
|
|
3
|
+
version = "4.2.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -21,10 +21,10 @@ libc = { workspace = true }
|
|
|
21
21
|
thiserror = { workspace = true }
|
|
22
22
|
|
|
23
23
|
[dev-dependencies]
|
|
24
|
-
image = { workspace = true }
|
|
24
|
+
image = { workspace = true, features = ["png"] }
|
|
25
25
|
|
|
26
26
|
[build-dependencies]
|
|
27
|
-
cc = { version = "^1.2.
|
|
27
|
+
cc = { version = "^1.2.54", optional = true }
|
|
28
28
|
cmake = { version = "0.1.57", optional = true }
|
|
29
29
|
zip = { version = "7.2.0", optional = true }
|
|
30
30
|
|
|
@@ -38,7 +38,7 @@ mod build_tesseract {
|
|
|
38
38
|
return None;
|
|
39
39
|
}
|
|
40
40
|
}
|
|
41
|
-
Some(path.join("tesseract-
|
|
41
|
+
Some(path.join("kreuzberg-tesseract-cache"))
|
|
42
42
|
}
|
|
43
43
|
|
|
44
44
|
fn get_preferred_out_dir() -> PathBuf {
|
|
@@ -63,14 +63,14 @@ mod build_tesseract {
|
|
|
63
63
|
PathBuf::from(home_dir)
|
|
64
64
|
.join("Library")
|
|
65
65
|
.join("Application Support")
|
|
66
|
-
.join("tesseract
|
|
66
|
+
.join("kreuzberg-tesseract")
|
|
67
67
|
} else if cfg!(target_os = "linux") {
|
|
68
68
|
let home_dir = env::var("HOME").unwrap_or_else(|_| {
|
|
69
69
|
env::var("USER")
|
|
70
70
|
.map(|user| format!("/home/{}", user))
|
|
71
71
|
.expect("Neither HOME nor USER environment variable set")
|
|
72
72
|
});
|
|
73
|
-
PathBuf::from(home_dir).join(".tesseract
|
|
73
|
+
PathBuf::from(home_dir).join(".kreuzberg-tesseract")
|
|
74
74
|
} else {
|
|
75
75
|
panic!("Unsupported operating system");
|
|
76
76
|
}
|
|
@@ -117,7 +117,7 @@ mod build_tesseract {
|
|
|
117
117
|
"cargo:warning=Failed to create cache dir {:?}: {}. Falling back to temp dir.",
|
|
118
118
|
preferred, err
|
|
119
119
|
);
|
|
120
|
-
let fallback = env::temp_dir().join("tesseract-
|
|
120
|
+
let fallback = env::temp_dir().join("kreuzberg-tesseract-cache");
|
|
121
121
|
fs::create_dir_all(&fallback).expect("Failed to create fallback cache directory in temp dir");
|
|
122
122
|
fallback
|
|
123
123
|
}
|