kreuzberg 4.1.2 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +121 -39
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +28 -12
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +57 -57
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +12 -2
|
@@ -74,7 +74,7 @@ async fn test_rtf_accent_extraction() {
|
|
|
74
74
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
75
75
|
|
|
76
76
|
assert!(result.is_ok(), "RTF extraction should succeed for accent.rtf");
|
|
77
|
-
let extraction = result.
|
|
77
|
+
let extraction = result.expect("Operation failed");
|
|
78
78
|
|
|
79
79
|
assert_eq!(extraction.mime_type, "application/rtf");
|
|
80
80
|
|
|
@@ -112,7 +112,7 @@ async fn test_rtf_bookmark_extraction() {
|
|
|
112
112
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
113
113
|
|
|
114
114
|
assert!(result.is_ok(), "RTF extraction should succeed for bookmark.rtf");
|
|
115
|
-
let extraction = result.
|
|
115
|
+
let extraction = result.expect("Operation failed");
|
|
116
116
|
|
|
117
117
|
let content = extraction.content.to_lowercase();
|
|
118
118
|
|
|
@@ -137,7 +137,7 @@ async fn test_rtf_footnote_extraction() {
|
|
|
137
137
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
138
138
|
|
|
139
139
|
assert!(result.is_ok(), "RTF extraction should succeed for footnote.rtf");
|
|
140
|
-
let extraction = result.
|
|
140
|
+
let extraction = result.expect("Operation failed");
|
|
141
141
|
|
|
142
142
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
143
143
|
|
|
@@ -176,7 +176,7 @@ async fn test_rtf_formatting_extraction() {
|
|
|
176
176
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
177
177
|
|
|
178
178
|
assert!(result.is_ok(), "RTF extraction should succeed for formatting.rtf");
|
|
179
|
-
let extraction = result.
|
|
179
|
+
let extraction = result.expect("Operation failed");
|
|
180
180
|
|
|
181
181
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
182
182
|
|
|
@@ -223,7 +223,7 @@ async fn test_rtf_heading_extraction() {
|
|
|
223
223
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
224
224
|
|
|
225
225
|
assert!(result.is_ok(), "RTF extraction should succeed for heading.rtf");
|
|
226
|
-
let extraction = result.
|
|
226
|
+
let extraction = result.expect("Operation failed");
|
|
227
227
|
|
|
228
228
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
229
229
|
|
|
@@ -269,7 +269,7 @@ async fn test_rtf_image_extraction() {
|
|
|
269
269
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
270
270
|
|
|
271
271
|
assert!(result.is_ok(), "RTF extraction should succeed for image.rtf");
|
|
272
|
-
let extraction = result.
|
|
272
|
+
let extraction = result.expect("Operation failed");
|
|
273
273
|
|
|
274
274
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
275
275
|
|
|
@@ -301,7 +301,7 @@ async fn test_rtf_link_extraction() {
|
|
|
301
301
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
302
302
|
|
|
303
303
|
assert!(result.is_ok(), "RTF extraction should succeed for link.rtf");
|
|
304
|
-
let extraction = result.
|
|
304
|
+
let extraction = result.expect("Operation failed");
|
|
305
305
|
|
|
306
306
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
307
307
|
|
|
@@ -328,7 +328,7 @@ async fn test_rtf_list_complex_extraction() {
|
|
|
328
328
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
329
329
|
|
|
330
330
|
assert!(result.is_ok(), "RTF extraction should succeed for list_complex.rtf");
|
|
331
|
-
let extraction = result.
|
|
331
|
+
let extraction = result.expect("Operation failed");
|
|
332
332
|
|
|
333
333
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
334
334
|
|
|
@@ -381,7 +381,7 @@ async fn test_rtf_list_simple_extraction() {
|
|
|
381
381
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
382
382
|
|
|
383
383
|
assert!(result.is_ok(), "RTF extraction should succeed for list_simple.rtf");
|
|
384
|
-
let extraction = result.
|
|
384
|
+
let extraction = result.expect("Operation failed");
|
|
385
385
|
|
|
386
386
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
387
387
|
|
|
@@ -422,7 +422,7 @@ async fn test_rtf_table_error_codes_extraction() {
|
|
|
422
422
|
result.is_ok(),
|
|
423
423
|
"RTF extraction should succeed for table_error_codes.rtf"
|
|
424
424
|
);
|
|
425
|
-
let extraction = result.
|
|
425
|
+
let extraction = result.expect("Operation failed");
|
|
426
426
|
|
|
427
427
|
assert!(
|
|
428
428
|
extraction.mime_type == "application/rtf",
|
|
@@ -448,7 +448,7 @@ async fn test_rtf_table_simple_extraction() {
|
|
|
448
448
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
449
449
|
|
|
450
450
|
assert!(result.is_ok(), "RTF extraction should succeed for table_simple.rtf");
|
|
451
|
-
let extraction = result.
|
|
451
|
+
let extraction = result.expect("Operation failed");
|
|
452
452
|
|
|
453
453
|
assert!(
|
|
454
454
|
extraction.mime_type == "application/rtf",
|
|
@@ -470,7 +470,7 @@ async fn test_rtf_unicode_extraction() {
|
|
|
470
470
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
471
471
|
|
|
472
472
|
assert!(result.is_ok(), "RTF extraction should succeed for unicode.rtf");
|
|
473
|
-
let extraction = result.
|
|
473
|
+
let extraction = result.expect("Operation failed");
|
|
474
474
|
|
|
475
475
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
476
476
|
|
|
@@ -493,8 +493,8 @@ async fn test_rtf_extraction_deterministic_unicode() {
|
|
|
493
493
|
|
|
494
494
|
assert!(result1.is_ok() && result2.is_ok(), "Both extractions should succeed");
|
|
495
495
|
|
|
496
|
-
let extraction1 = result1.
|
|
497
|
-
let extraction2 = result2.
|
|
496
|
+
let extraction1 = result1.expect("Operation failed");
|
|
497
|
+
let extraction2 = result2.expect("Operation failed");
|
|
498
498
|
|
|
499
499
|
assert_eq!(
|
|
500
500
|
extraction1.content, extraction2.content,
|
|
@@ -514,8 +514,8 @@ async fn test_rtf_extraction_deterministic_list_complex() {
|
|
|
514
514
|
|
|
515
515
|
assert!(result1.is_ok() && result2.is_ok(), "Both extractions should succeed");
|
|
516
516
|
|
|
517
|
-
let extraction1 = result1.
|
|
518
|
-
let extraction2 = result2.
|
|
517
|
+
let extraction1 = result1.expect("Operation failed");
|
|
518
|
+
let extraction2 = result2.expect("Operation failed");
|
|
519
519
|
|
|
520
520
|
assert_eq!(
|
|
521
521
|
extraction1.content, extraction2.content,
|
|
@@ -551,7 +551,7 @@ async fn test_rtf_no_critical_content_loss() {
|
|
|
551
551
|
filename
|
|
552
552
|
);
|
|
553
553
|
|
|
554
|
-
let extraction = result.
|
|
554
|
+
let extraction = result.expect("Operation failed");
|
|
555
555
|
assert!(
|
|
556
556
|
!extraction.content.is_empty(),
|
|
557
557
|
"FAIL: CRITICAL - Extracted 0 bytes from {}. RTF extractor lost all content.",
|
|
@@ -582,7 +582,7 @@ async fn test_rtf_mime_type_preservation() {
|
|
|
582
582
|
|
|
583
583
|
assert!(result.is_ok(), "Extraction should succeed for {}", filename);
|
|
584
584
|
|
|
585
|
-
let extraction = result.
|
|
585
|
+
let extraction = result.expect("Operation failed");
|
|
586
586
|
assert_eq!(
|
|
587
587
|
extraction.mime_type, "application/rtf",
|
|
588
588
|
"FAIL: MIME type not preserved for {}",
|
|
@@ -31,11 +31,11 @@ fn test_archive_zip_bomb_detection() {
|
|
|
31
31
|
let mut zip = ZipWriter::new(&mut cursor);
|
|
32
32
|
let options = FileOptions::<'_, ()>::default();
|
|
33
33
|
|
|
34
|
-
zip.start_file("large.txt", options).
|
|
34
|
+
zip.start_file("large.txt", options).expect("Operation failed");
|
|
35
35
|
let zeros = vec![0u8; 10 * 1024 * 1024];
|
|
36
|
-
zip.write_all(&zeros).
|
|
36
|
+
zip.write_all(&zeros).expect("Operation failed");
|
|
37
37
|
|
|
38
|
-
zip.finish().
|
|
38
|
+
zip.finish().expect("Operation failed");
|
|
39
39
|
}
|
|
40
40
|
|
|
41
41
|
let bytes = cursor.into_inner();
|
|
@@ -57,10 +57,10 @@ fn test_archive_path_traversal_zip() {
|
|
|
57
57
|
let mut zip = ZipWriter::new(&mut cursor);
|
|
58
58
|
let options = FileOptions::<'_, ()>::default();
|
|
59
59
|
|
|
60
|
-
zip.start_file("../../etc/passwd", options).
|
|
61
|
-
zip.write_all(b"malicious content").
|
|
60
|
+
zip.start_file("../../etc/passwd", options).expect("Operation failed");
|
|
61
|
+
zip.write_all(b"malicious content").expect("Operation failed");
|
|
62
62
|
|
|
63
|
-
zip.finish().
|
|
63
|
+
zip.finish().expect("Operation failed");
|
|
64
64
|
}
|
|
65
65
|
|
|
66
66
|
let bytes = cursor.into_inner();
|
|
@@ -97,10 +97,10 @@ fn test_archive_absolute_paths_rejected() {
|
|
|
97
97
|
let mut zip = ZipWriter::new(&mut cursor);
|
|
98
98
|
let options = FileOptions::<'_, ()>::default();
|
|
99
99
|
|
|
100
|
-
zip.start_file("/tmp/malicious.txt", options).
|
|
101
|
-
zip.write_all(b"malicious content").
|
|
100
|
+
zip.start_file("/tmp/malicious.txt", options).expect("Operation failed");
|
|
101
|
+
zip.write_all(b"malicious content").expect("Operation failed");
|
|
102
102
|
|
|
103
|
-
zip.finish().
|
|
103
|
+
zip.finish().expect("Operation failed");
|
|
104
104
|
}
|
|
105
105
|
|
|
106
106
|
let bytes = cursor.into_inner();
|
|
@@ -125,10 +125,10 @@ fn test_archive_deeply_nested_directories() {
|
|
|
125
125
|
let deep_path = (0..100).map(|i| format!("dir{}", i)).collect::<Vec<_>>().join("/");
|
|
126
126
|
let file_path = format!("{}/file.txt", deep_path);
|
|
127
127
|
|
|
128
|
-
zip.start_file(&file_path, options).
|
|
129
|
-
zip.write_all(b"deep content").
|
|
128
|
+
zip.start_file(&file_path, options).expect("Operation failed");
|
|
129
|
+
zip.write_all(b"deep content").expect("Operation failed");
|
|
130
130
|
|
|
131
|
-
zip.finish().
|
|
131
|
+
zip.finish().expect("Operation failed");
|
|
132
132
|
}
|
|
133
133
|
|
|
134
134
|
let bytes = cursor.into_inner();
|
|
@@ -149,11 +149,12 @@ fn test_archive_many_small_files() {
|
|
|
149
149
|
let options = FileOptions::<'_, ()>::default();
|
|
150
150
|
|
|
151
151
|
for i in 0..1000 {
|
|
152
|
-
zip.start_file(format!("file{}.txt", i), options)
|
|
153
|
-
|
|
152
|
+
zip.start_file(format!("file{}.txt", i), options)
|
|
153
|
+
.expect("Operation failed");
|
|
154
|
+
zip.write_all(b"small content").expect("Operation failed");
|
|
154
155
|
}
|
|
155
156
|
|
|
156
|
-
zip.finish().
|
|
157
|
+
zip.finish().expect("Operation failed");
|
|
157
158
|
}
|
|
158
159
|
|
|
159
160
|
let bytes = cursor.into_inner();
|
|
@@ -404,13 +405,13 @@ fn test_security_directory_instead_of_file() {
|
|
|
404
405
|
|
|
405
406
|
#[test]
|
|
406
407
|
fn test_security_special_file_handling() {
|
|
407
|
-
let mut tmpfile = NamedTempFile::new().
|
|
408
|
-
tmpfile.write_all(b"test content").
|
|
409
|
-
tmpfile.flush().
|
|
408
|
+
let mut tmpfile = NamedTempFile::new().expect("Operation failed");
|
|
409
|
+
tmpfile.write_all(b"test content").expect("Operation failed");
|
|
410
|
+
tmpfile.flush().expect("Operation failed");
|
|
410
411
|
let path = tmpfile.path();
|
|
411
412
|
|
|
412
413
|
let config = ExtractionConfig::default();
|
|
413
|
-
let result = extract_file_sync(path.to_str().
|
|
414
|
+
let result = extract_file_sync(path.to_str().expect("Operation failed"), None, &config);
|
|
414
415
|
|
|
415
416
|
assert!(result.is_ok() || result.is_err());
|
|
416
417
|
}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
//! Cross-language serialization integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! These tests validate that ExtractionConfig serializes correctly
|
|
4
|
+
//! and that the serialized output can be used for cross-language comparison.
|
|
5
|
+
|
|
6
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
+
|
|
8
|
+
#[test]
|
|
9
|
+
fn test_extraction_config_minimal_serialization() {
|
|
10
|
+
let config = ExtractionConfig::default();
|
|
11
|
+
let json = serde_json::to_value(&config).expect("Failed to serialize config");
|
|
12
|
+
|
|
13
|
+
// Validate that all expected fields are present
|
|
14
|
+
assert!(json.get("use_cache").is_some(), "Missing use_cache field");
|
|
15
|
+
assert!(
|
|
16
|
+
json.get("enable_quality_processing").is_some(),
|
|
17
|
+
"Missing enable_quality_processing field"
|
|
18
|
+
);
|
|
19
|
+
assert!(json.get("force_ocr").is_some(), "Missing force_ocr field");
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
#[test]
|
|
23
|
+
fn test_extraction_config_serialization_round_trip() {
|
|
24
|
+
let original = ExtractionConfig {
|
|
25
|
+
use_cache: true,
|
|
26
|
+
enable_quality_processing: false,
|
|
27
|
+
force_ocr: true,
|
|
28
|
+
..Default::default()
|
|
29
|
+
};
|
|
30
|
+
|
|
31
|
+
// Serialize to JSON
|
|
32
|
+
let json = serde_json::to_value(&original).expect("Failed to serialize");
|
|
33
|
+
|
|
34
|
+
// Deserialize back
|
|
35
|
+
let restored: ExtractionConfig = serde_json::from_value(json).expect("Failed to deserialize");
|
|
36
|
+
|
|
37
|
+
// Validate that key fields are preserved
|
|
38
|
+
assert_eq!(original.use_cache, restored.use_cache, "use_cache field not preserved");
|
|
39
|
+
assert_eq!(
|
|
40
|
+
original.enable_quality_processing, restored.enable_quality_processing,
|
|
41
|
+
"enable_quality_processing field not preserved"
|
|
42
|
+
);
|
|
43
|
+
assert_eq!(original.force_ocr, restored.force_ocr, "force_ocr field not preserved");
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
#[test]
|
|
47
|
+
fn test_extraction_config_nested_serialization() {
|
|
48
|
+
let config = ExtractionConfig {
|
|
49
|
+
use_cache: true,
|
|
50
|
+
enable_quality_processing: true,
|
|
51
|
+
force_ocr: false,
|
|
52
|
+
// Note: Nested fields like ocr, chunking, etc. would be set here
|
|
53
|
+
// This test focuses on the basic serialization structure
|
|
54
|
+
..Default::default()
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
let json = serde_json::to_value(&config).expect("Failed to serialize");
|
|
58
|
+
|
|
59
|
+
// Ensure it's a proper JSON object
|
|
60
|
+
assert!(json.is_object(), "Serialized output should be a JSON object");
|
|
61
|
+
|
|
62
|
+
// Validate that core fields are present
|
|
63
|
+
assert!(json.get("use_cache").is_some());
|
|
64
|
+
assert!(json.get("enable_quality_processing").is_some());
|
|
65
|
+
assert!(json.get("force_ocr").is_some());
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
#[test]
|
|
69
|
+
fn test_extraction_config_json_format() {
|
|
70
|
+
let config = ExtractionConfig::default();
|
|
71
|
+
let json_string = serde_json::to_string(&config).expect("Failed to serialize to string");
|
|
72
|
+
|
|
73
|
+
// Validate that output is valid JSON
|
|
74
|
+
let parsed: serde_json::Value = serde_json::from_str(&json_string).expect("Invalid JSON output");
|
|
75
|
+
assert!(parsed.is_object(), "JSON should be an object");
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
#[test]
|
|
79
|
+
fn test_extraction_config_pretty_print() {
|
|
80
|
+
let config = ExtractionConfig::default();
|
|
81
|
+
let pretty_json = serde_json::to_string_pretty(&config).expect("Failed to serialize");
|
|
82
|
+
|
|
83
|
+
// Validate that pretty-printed JSON is parseable
|
|
84
|
+
let _parsed: serde_json::Value = serde_json::from_str(&pretty_json).expect("Invalid pretty-printed JSON");
|
|
85
|
+
|
|
86
|
+
// Pretty JSON should have newlines
|
|
87
|
+
assert!(pretty_json.contains('\n'), "Pretty JSON should have newlines");
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
#[test]
|
|
91
|
+
fn test_extraction_config_field_consistency() {
|
|
92
|
+
let configs = vec![
|
|
93
|
+
ExtractionConfig::default(),
|
|
94
|
+
ExtractionConfig {
|
|
95
|
+
use_cache: true,
|
|
96
|
+
..Default::default()
|
|
97
|
+
},
|
|
98
|
+
ExtractionConfig {
|
|
99
|
+
enable_quality_processing: false,
|
|
100
|
+
..Default::default()
|
|
101
|
+
},
|
|
102
|
+
];
|
|
103
|
+
|
|
104
|
+
for config in configs {
|
|
105
|
+
let json = serde_json::to_value(&config).expect("Failed to serialize");
|
|
106
|
+
|
|
107
|
+
// All configs should have the same set of top-level fields
|
|
108
|
+
assert!(json.get("use_cache").is_some());
|
|
109
|
+
assert!(json.get("enable_quality_processing").is_some());
|
|
110
|
+
assert!(json.get("force_ocr").is_some());
|
|
111
|
+
}
|
|
112
|
+
}
|
|
@@ -67,7 +67,7 @@ fn test_stopwords_removed_during_moderate_token_reduction() {
|
|
|
67
67
|
};
|
|
68
68
|
|
|
69
69
|
let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
|
|
70
|
-
let result = reduce_tokens(input, &config, Some("en")).
|
|
70
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
71
71
|
|
|
72
72
|
assert!(!result.contains(" the "), "Should remove 'the'. Result: {}", result);
|
|
73
73
|
assert!(!result.contains(" is "), "Should remove 'is'. Result: {}", result);
|
|
@@ -103,7 +103,7 @@ fn test_stopwords_across_reduction_levels() {
|
|
|
103
103
|
use_simd: false,
|
|
104
104
|
..Default::default()
|
|
105
105
|
};
|
|
106
|
-
let light_result = reduce_tokens(text, &light_config, Some("en")).
|
|
106
|
+
let light_result = reduce_tokens(text, &light_config, Some("en")).expect("Operation failed");
|
|
107
107
|
|
|
108
108
|
let light_stopwords = count_stopwords(&light_result, "en");
|
|
109
109
|
assert!(light_stopwords > 0, "Light reduction should preserve some stopwords");
|
|
@@ -113,7 +113,7 @@ fn test_stopwords_across_reduction_levels() {
|
|
|
113
113
|
use_simd: false,
|
|
114
114
|
..Default::default()
|
|
115
115
|
};
|
|
116
|
-
let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).
|
|
116
|
+
let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).expect("Operation failed");
|
|
117
117
|
|
|
118
118
|
let moderate_stopwords = count_stopwords(&moderate_result, "en");
|
|
119
119
|
assert!(
|
|
@@ -128,7 +128,7 @@ fn test_stopwords_across_reduction_levels() {
|
|
|
128
128
|
use_simd: false,
|
|
129
129
|
..Default::default()
|
|
130
130
|
};
|
|
131
|
-
let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).
|
|
131
|
+
let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).expect("Operation failed");
|
|
132
132
|
|
|
133
133
|
assert!(
|
|
134
134
|
aggressive_result.len() <= moderate_result.len(),
|
|
@@ -146,7 +146,7 @@ fn test_stopwords_preserve_semantic_meaning() {
|
|
|
146
146
|
|
|
147
147
|
let input =
|
|
148
148
|
"The artificial intelligence system is processing the natural language text for extracting meaningful insights";
|
|
149
|
-
let result = reduce_tokens(input, &config, Some("en")).
|
|
149
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
150
150
|
|
|
151
151
|
let content_words = extract_content_words(&result, "en");
|
|
152
152
|
|
|
@@ -185,7 +185,7 @@ fn test_stopwords_with_multiple_languages() {
|
|
|
185
185
|
..Default::default()
|
|
186
186
|
};
|
|
187
187
|
let en_input = "The computer science program is very comprehensive and includes many courses";
|
|
188
|
-
let en_result = reduce_tokens(en_input, &en_config, Some("en")).
|
|
188
|
+
let en_result = reduce_tokens(en_input, &en_config, Some("en")).expect("Operation failed");
|
|
189
189
|
|
|
190
190
|
let en_original_stopwords = count_stopwords(en_input, "en");
|
|
191
191
|
let en_result_stopwords = count_stopwords(&en_result, "en");
|
|
@@ -200,7 +200,7 @@ fn test_stopwords_with_multiple_languages() {
|
|
|
200
200
|
..Default::default()
|
|
201
201
|
};
|
|
202
202
|
let es_input = "El programa de ciencias de la computación es muy completo y tiene muchos cursos";
|
|
203
|
-
let es_result = reduce_tokens(es_input, &es_config, Some("es")).
|
|
203
|
+
let es_result = reduce_tokens(es_input, &es_config, Some("es")).expect("Operation failed");
|
|
204
204
|
|
|
205
205
|
let es_original_stopwords = count_stopwords(es_input, "es");
|
|
206
206
|
let es_result_stopwords = count_stopwords(&es_result, "es");
|
|
@@ -221,7 +221,7 @@ fn test_stopwords_with_multiple_languages() {
|
|
|
221
221
|
..Default::default()
|
|
222
222
|
};
|
|
223
223
|
let de_input = "Die künstliche Intelligenz ist ein wichtiges Forschungsgebiet der Informatik";
|
|
224
|
-
let de_result = reduce_tokens(de_input, &de_config, Some("de")).
|
|
224
|
+
let de_result = reduce_tokens(de_input, &de_config, Some("de")).expect("Operation failed");
|
|
225
225
|
|
|
226
226
|
let de_original_stopwords = count_stopwords(de_input, "de");
|
|
227
227
|
let de_result_stopwords = count_stopwords(&de_result, "de");
|
|
@@ -240,7 +240,7 @@ fn test_language_fallback_to_english_stopwords() {
|
|
|
240
240
|
};
|
|
241
241
|
|
|
242
242
|
let input = "The system is processing the data with the algorithm";
|
|
243
|
-
let result = reduce_tokens(input, &config, Some("xyz")).
|
|
243
|
+
let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
|
|
244
244
|
|
|
245
245
|
let original_stopwords = count_stopwords(input, "en");
|
|
246
246
|
let result_stopwords = count_stopwords(&result, "en");
|
|
@@ -267,7 +267,7 @@ fn test_custom_stopwords_integration() {
|
|
|
267
267
|
};
|
|
268
268
|
|
|
269
269
|
let input = "The algorithm processes the data in the system efficiently";
|
|
270
|
-
let result = reduce_tokens(input, &config, Some("en")).
|
|
270
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
271
271
|
|
|
272
272
|
assert!(
|
|
273
273
|
!result.contains("algorithm"),
|
|
@@ -301,7 +301,7 @@ fn test_stopwords_with_chinese_text() {
|
|
|
301
301
|
};
|
|
302
302
|
|
|
303
303
|
let input = "这个人工智能系统可以处理自然语言";
|
|
304
|
-
let result = reduce_tokens(input, &config, Some("zh")).
|
|
304
|
+
let result = reduce_tokens(input, &config, Some("zh")).expect("Operation failed");
|
|
305
305
|
|
|
306
306
|
assert!(
|
|
307
307
|
!result.is_empty(),
|
|
@@ -325,7 +325,7 @@ fn test_stopwords_with_mixed_cjk_english() {
|
|
|
325
325
|
};
|
|
326
326
|
|
|
327
327
|
let input = "The machine learning model 机器学习模型 is processing data efficiently";
|
|
328
|
-
let result = reduce_tokens(input, &config, Some("en")).
|
|
328
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
329
329
|
|
|
330
330
|
assert!(
|
|
331
331
|
!result.contains(" the ") && !result.contains("The "),
|
|
@@ -355,7 +355,7 @@ fn test_stopwords_with_japanese_text() {
|
|
|
355
355
|
};
|
|
356
356
|
|
|
357
357
|
let input = "人工知能技術の研究開発";
|
|
358
|
-
let result = reduce_tokens(input, &config, Some("ja")).
|
|
358
|
+
let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
|
|
359
359
|
|
|
360
360
|
assert!(
|
|
361
361
|
!result.is_empty(),
|
|
@@ -373,7 +373,7 @@ fn test_stopwords_with_korean_text() {
|
|
|
373
373
|
};
|
|
374
374
|
|
|
375
375
|
let input = "인공 지능 기술 개발";
|
|
376
|
-
let result = reduce_tokens(input, &config, Some("ko")).
|
|
376
|
+
let result = reduce_tokens(input, &config, Some("ko")).expect("Operation failed");
|
|
377
377
|
|
|
378
378
|
assert!(
|
|
379
379
|
!result.is_empty(),
|
|
@@ -391,7 +391,7 @@ fn test_stopwords_excluded_from_rake_keywords() {
|
|
|
391
391
|
|
|
392
392
|
let config = KeywordConfig::rake().with_language("en").with_max_keywords(10);
|
|
393
393
|
|
|
394
|
-
let keywords = extract_keywords(text, &config).
|
|
394
|
+
let keywords = extract_keywords(text, &config).expect("Operation failed");
|
|
395
395
|
|
|
396
396
|
assert!(!keywords.is_empty(), "Should extract keywords");
|
|
397
397
|
|
|
@@ -439,7 +439,7 @@ fn test_stopwords_excluded_from_yake_keywords() {
|
|
|
439
439
|
|
|
440
440
|
let config = KeywordConfig::yake().with_language("en").with_max_keywords(10);
|
|
441
441
|
|
|
442
|
-
let keywords = extract_keywords(text, &config).
|
|
442
|
+
let keywords = extract_keywords(text, &config).expect("Operation failed");
|
|
443
443
|
|
|
444
444
|
assert!(!keywords.is_empty(), "Should extract keywords");
|
|
445
445
|
|
|
@@ -472,7 +472,7 @@ fn test_keywords_respect_language_specific_stopwords() {
|
|
|
472
472
|
|
|
473
473
|
let config = KeywordConfig::rake().with_language("es").with_max_keywords(8);
|
|
474
474
|
|
|
475
|
-
let keywords = extract_keywords(spanish_text, &config).
|
|
475
|
+
let keywords = extract_keywords(spanish_text, &config).expect("Operation failed");
|
|
476
476
|
|
|
477
477
|
assert!(!keywords.is_empty(), "Should extract Spanish keywords");
|
|
478
478
|
|
|
@@ -516,7 +516,7 @@ fn test_all_stopwords_text_reduction() {
|
|
|
516
516
|
};
|
|
517
517
|
|
|
518
518
|
let input = "the is a an and or but of to in for on at by";
|
|
519
|
-
let result = reduce_tokens(input, &config, Some("en")).
|
|
519
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
520
520
|
|
|
521
521
|
assert!(
|
|
522
522
|
result.len() < input.len(),
|
|
@@ -533,7 +533,7 @@ fn test_no_stopwords_text_reduction() {
|
|
|
533
533
|
};
|
|
534
534
|
|
|
535
535
|
let input = "PyTorch TensorFlow CUDA GPU optimization benchmark performance metrics";
|
|
536
|
-
let result = reduce_tokens(input, &config, Some("en")).
|
|
536
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
537
537
|
|
|
538
538
|
let input_words: Vec<&str> = input.split_whitespace().collect();
|
|
539
539
|
let result_lower = result.to_lowercase();
|
|
@@ -558,7 +558,7 @@ fn test_mixed_case_stopwords_removal() {
|
|
|
558
558
|
};
|
|
559
559
|
|
|
560
560
|
let input = "The SYSTEM Is Processing The DATA With The ALGORITHM";
|
|
561
|
-
let result = reduce_tokens(input, &config, Some("en")).
|
|
561
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
562
562
|
|
|
563
563
|
let result_words: Vec<&str> = result.split_whitespace().collect();
|
|
564
564
|
assert!(
|
|
@@ -594,7 +594,7 @@ fn test_reduce_tokens_function_with_stopwords() {
|
|
|
594
594
|
};
|
|
595
595
|
|
|
596
596
|
let text = "The artificial intelligence system processes the natural language efficiently";
|
|
597
|
-
let result = reduce_tokens(text, &config, Some("en")).
|
|
597
|
+
let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
|
|
598
598
|
|
|
599
599
|
let original_stopwords = count_stopwords(text, "en");
|
|
600
600
|
let result_stopwords = count_stopwords(&result, "en");
|
|
@@ -622,7 +622,7 @@ fn test_stopwords_with_punctuation() {
|
|
|
622
622
|
};
|
|
623
623
|
|
|
624
624
|
let input = "The system, which is processing the data, uses the algorithm.";
|
|
625
|
-
let result = reduce_tokens(input, &config, Some("en")).
|
|
625
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
626
626
|
|
|
627
627
|
assert!(
|
|
628
628
|
!result.contains(" the ") || result.split_whitespace().filter(|w| w.contains("the")).count() < 3,
|
|
@@ -646,7 +646,7 @@ fn test_stopwords_with_numbers() {
|
|
|
646
646
|
};
|
|
647
647
|
|
|
648
648
|
let input = "The model has 100 layers and processes the data in 10 seconds";
|
|
649
|
-
let result = reduce_tokens(input, &config, Some("en")).
|
|
649
|
+
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
650
650
|
|
|
651
651
|
assert!(
|
|
652
652
|
result.contains("100"),
|
|
@@ -672,9 +672,9 @@ fn test_stopwords_removal_consistency_across_calls() {
|
|
|
672
672
|
|
|
673
673
|
let input = "The machine learning model is trained on the dataset";
|
|
674
674
|
|
|
675
|
-
let result1 = reduce_tokens(input, &config, Some("en")).
|
|
676
|
-
let result2 = reduce_tokens(input, &config, Some("en")).
|
|
677
|
-
let result3 = reduce_tokens(input, &config, Some("en")).
|
|
675
|
+
let result1 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
676
|
+
let result2 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
677
|
+
let result3 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
|
|
678
678
|
|
|
679
679
|
assert_eq!(result1, result2, "Results should be consistent across calls");
|
|
680
680
|
assert_eq!(result2, result3, "Results should be consistent across calls");
|
|
@@ -694,7 +694,7 @@ fn test_stopwords_with_long_text() {
|
|
|
694
694
|
The system processes the data efficiently and achieves the best performance. ";
|
|
695
695
|
let input = paragraph.repeat(10);
|
|
696
696
|
|
|
697
|
-
let result = reduce_tokens(&input, &config, Some("en")).
|
|
697
|
+
let result = reduce_tokens(&input, &config, Some("en")).expect("Operation failed");
|
|
698
698
|
|
|
699
699
|
assert!(
|
|
700
700
|
result.len() < input.len(),
|
|
@@ -719,9 +719,9 @@ fn test_get_stopwords_with_fallback_in_reduction() {
|
|
|
719
719
|
let primary_stopwords = get_stopwords_with_fallback("xyz", "en");
|
|
720
720
|
assert!(primary_stopwords.is_some(), "Should fallback to English");
|
|
721
721
|
|
|
722
|
-
let en_stopwords = get_stopwords("en").
|
|
722
|
+
let en_stopwords = get_stopwords("en").expect("Operation failed");
|
|
723
723
|
assert_eq!(
|
|
724
|
-
primary_stopwords.
|
|
724
|
+
primary_stopwords.expect("Operation failed").len(),
|
|
725
725
|
en_stopwords.len(),
|
|
726
726
|
"Fallback should return English stopwords"
|
|
727
727
|
);
|
|
@@ -733,7 +733,7 @@ fn test_get_stopwords_with_fallback_in_reduction() {
|
|
|
733
733
|
};
|
|
734
734
|
|
|
735
735
|
let input = "The system is processing the data";
|
|
736
|
-
let result = reduce_tokens(input, &config, Some("xyz")).
|
|
736
|
+
let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
|
|
737
737
|
|
|
738
738
|
assert!(
|
|
739
739
|
!result.contains(" the ") && !result.contains(" is "),
|
|
@@ -789,7 +789,7 @@ fn test_token_reduction_handles_multibyte_utf8() {
|
|
|
789
789
|
};
|
|
790
790
|
|
|
791
791
|
let input = "品質管理は重要です。🚀 高速抽出と漢字処理が求められています。";
|
|
792
|
-
let result = reduce_tokens(input, &config, Some("ja")).
|
|
792
|
+
let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
|
|
793
793
|
|
|
794
794
|
assert!(
|
|
795
795
|
result.contains("品質管理") || result.contains("漢字処理"),
|
|
@@ -814,7 +814,7 @@ fn test_token_reduction_concurrent_access() {
|
|
|
814
814
|
for _ in 0..8 {
|
|
815
815
|
let cfg = Arc::clone(&config);
|
|
816
816
|
scope.spawn(move || {
|
|
817
|
-
let reduced = reduce_tokens(input, &cfg, Some("en")).
|
|
817
|
+
let reduced = reduce_tokens(input, &cfg, Some("en")).expect("Operation failed");
|
|
818
818
|
assert!(!reduced.is_empty());
|
|
819
819
|
});
|
|
820
820
|
}
|
|
@@ -831,7 +831,7 @@ fn demo_stopwords_effectiveness() {
|
|
|
831
831
|
use_simd: false,
|
|
832
832
|
..Default::default()
|
|
833
833
|
};
|
|
834
|
-
let en_result = reduce_tokens(en_text, &en_config, Some("en")).
|
|
834
|
+
let en_result = reduce_tokens(en_text, &en_config, Some("en")).expect("Operation failed");
|
|
835
835
|
|
|
836
836
|
println!("\n=== English Example ===");
|
|
837
837
|
println!("BEFORE: {} chars", en_text.len());
|
|
@@ -849,7 +849,7 @@ fn demo_stopwords_effectiveness() {
|
|
|
849
849
|
use_simd: false,
|
|
850
850
|
..Default::default()
|
|
851
851
|
};
|
|
852
|
-
let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).
|
|
852
|
+
let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).expect("Operation failed");
|
|
853
853
|
|
|
854
854
|
println!("\n=== Chinese Example ===");
|
|
855
855
|
println!("BEFORE: {}", zh_text);
|
|
@@ -870,7 +870,7 @@ fn demo_stopwords_effectiveness() {
|
|
|
870
870
|
use_simd: false,
|
|
871
871
|
..Default::default()
|
|
872
872
|
};
|
|
873
|
-
let result = reduce_tokens(text, &config, Some("en")).
|
|
873
|
+
let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
|
|
874
874
|
println!(
|
|
875
875
|
"{:?}: {} chars -> {} chars ({}% reduction)",
|
|
876
876
|
level,
|
|
@@ -881,7 +881,7 @@ fn demo_stopwords_effectiveness() {
|
|
|
881
881
|
println!(" {}", result);
|
|
882
882
|
}
|
|
883
883
|
|
|
884
|
-
let stopwords = get_stopwords("en").
|
|
884
|
+
let stopwords = get_stopwords("en").expect("Operation failed");
|
|
885
885
|
println!("\n=== Stopwords Stats ===");
|
|
886
886
|
println!("English stopwords: {}", stopwords.len());
|
|
887
887
|
println!("Sample stopwords: {:?}", stopwords.iter().take(10).collect::<Vec<_>>());
|