kreuzberg 4.1.1 → 4.2.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as published in their public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +8 -5
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
- data/kreuzberg.gemspec +14 -2
- data/lib/kreuzberg/api_proxy.rb +0 -1
- data/lib/kreuzberg/cli_proxy.rb +0 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/mcp_proxy.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- data/vendor/kreuzberg-tesseract/build.rs +4 -4
- data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
- metadata +13 -2
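The test-suite hunks below share one recurring pattern: fallible results that were previously unwrapped on lines that are truncated in this view (presumably bare `.unwrap()` calls, though the removed lines are cut off) are now consumed with `.expect("Operation failed")`, and several single-line assertions are split across multiple lines. A minimal sketch of that pattern, assuming the old style was a bare unwrap; the helper name and fixture path here are illustrative only:

```rust
use std::fs;
use std::path::Path;

// Hypothetical helper mirroring the fixture-writing code in the updated tests.
fn write_fixture(path: &Path) {
    // Before (4.1.1 style, assumed): fs::write(path, b"test content").unwrap();
    // After (4.2.0 style, as seen in the hunks below): panic with a message on failure.
    fs::write(path, b"test content").expect("Operation failed");
}

fn main() {
    let dir = std::env::temp_dir();
    write_fixture(&dir.join("kreuzberg_fixture.txt"));
}
```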
@@ -48,12 +48,17 @@ async fn test_mime_detection_by_extension() {
 let temp_dir = TempDir::new().expect("Should create temp dir");
 let temp_path = temp_dir.path().join(filename);

-std::fs::write(&temp_path, b"test content").
+std::fs::write(&temp_path, b"test content").expect("Operation failed");

 let detected = detect_mime_type(&temp_path, true);

 assert!(detected.is_ok(), "Should detect MIME type for {}", filename);
-assert_eq!(
+assert_eq!(
+detected.expect("Operation failed"),
+expected_mime,
+"MIME type mismatch for {}",
+filename
+);
 }
 }

@@ -76,11 +81,11 @@ async fn test_mime_detection_case_insensitive() {
 let temp_dir = TempDir::new().expect("Should create temp dir");
 let temp_path = temp_dir.path().join(filename);

-std::fs::write(&temp_path, b"test").
+std::fs::write(&temp_path, b"test").expect("Operation failed");

 let detected = detect_mime_type(&temp_path, true);
 assert!(detected.is_ok(), "Should handle {} (case insensitive)", filename);
-assert_eq!(detected.
+assert_eq!(detected.expect("Operation failed"), expected_mime);
 }
 }

@@ -118,11 +123,15 @@ async fn test_mime_detection_by_content() {

 for test_case in test_cases {
 let mut temp_file = NamedTempFile::new().expect("Should create temp file");
-let temp_path = temp_file
+let temp_path = temp_file
+.path()
+.parent()
+.expect("Operation failed")
+.join(test_case.filename);

-temp_file.write_all(&test_case.content).
-temp_file.flush().
-std::fs::copy(temp_file.path(), &temp_path).
+temp_file.write_all(&test_case.content).expect("Operation failed");
+temp_file.flush().expect("Operation failed");
+std::fs::copy(temp_file.path(), &temp_path).expect("Operation failed");

 let detected = detect_mime_type(&temp_path, true);

@@ -170,7 +179,7 @@ async fn test_mime_type_validation() {
 for mime_type in supported {
 let result = validate_mime_type(mime_type);
 assert!(result.is_ok(), "Should validate supported MIME type: {}", mime_type);
-assert_eq!(result.
+assert_eq!(result.expect("Operation failed"), mime_type);
 }
 }

@@ -222,18 +231,24 @@ async fn test_unknown_mime_type() {
 #[tokio::test]
 async fn test_mime_mismatch_warning() {
 let mut temp_file = NamedTempFile::new().expect("Should create temp file");
-let temp_path = temp_file
-
-
-
-
+let temp_path = temp_file
+.path()
+.parent()
+.expect("Operation failed")
+.join("document.pdf");
+
+temp_file
+.write_all(&[0x50, 0x4B, 0x03, 0x04])
+.expect("Operation failed");
+temp_file.flush().expect("Operation failed");
+std::fs::copy(temp_file.path(), &temp_path).expect("Operation failed");

 let detected = detect_mime_type(&temp_path, true);

 assert!(detected.is_ok(), "Should detect MIME type even with mismatch");

 assert_eq!(
-detected.
+detected.expect("Operation failed"),
 "application/pdf",
 "Extension-based detection should take precedence"
 );
@@ -245,18 +260,22 @@ async fn test_mime_mismatch_warning() {
 #[tokio::test]
 async fn test_extension_content_mismatch() {
 let mut temp_file = NamedTempFile::new().expect("Should create temp file");
-let temp_path = temp_file
+let temp_path = temp_file
+.path()
+.parent()
+.expect("Operation failed")
+.join("document.txt");

-temp_file.write_all(b"%PDF-1.4\n").
-temp_file.flush().
-std::fs::copy(temp_file.path(), &temp_path).
+temp_file.write_all(b"%PDF-1.4\n").expect("Operation failed");
+temp_file.flush().expect("Operation failed");
+std::fs::copy(temp_file.path(), &temp_path).expect("Operation failed");

 let detected = detect_mime_type(&temp_path, true);

 assert!(detected.is_ok(), "Should detect MIME type");

 assert_eq!(
-detected.
+detected.expect("Operation failed"),
 "text/plain",
 "Should use extension for MIME detection"
 );
@@ -268,11 +287,11 @@ async fn test_extension_content_mismatch() {
 #[tokio::test]
 async fn test_no_extension() {
 let mut temp_file = NamedTempFile::new().expect("Should create temp file");
-let temp_path = temp_file.path().parent().
+let temp_path = temp_file.path().parent().expect("Operation failed").join("testfile");

-temp_file.write_all(b"test content").
-temp_file.flush().
-std::fs::copy(temp_file.path(), &temp_path).
+temp_file.write_all(b"test content").expect("Operation failed");
+temp_file.flush().expect("Operation failed");
+std::fs::copy(temp_file.path(), &temp_path).expect("Operation failed");

 let detected = detect_mime_type(&temp_path, true);

@@ -322,23 +341,31 @@ async fn test_mime_detection_skip_existence_check() {
 let result = detect_mime_type(nonexistent_path, false);

 assert!(result.is_ok(), "Should succeed when skipping existence check");
-assert_eq!(result.
+assert_eq!(result.expect("Operation failed"), "application/pdf");
 }

 /// Test multiple dots in filename.
 #[tokio::test]
 async fn test_filename_multiple_dots() {
 let mut temp_file = NamedTempFile::new().expect("Should create temp file");
-let temp_path = temp_file
+let temp_path = temp_file
+.path()
+.parent()
+.expect("Operation failed")
+.join("my.backup.file.pdf");

-temp_file.write_all(b"test").
-temp_file.flush().
-std::fs::copy(temp_file.path(), &temp_path).
+temp_file.write_all(b"test").expect("Operation failed");
+temp_file.flush().expect("Operation failed");
+std::fs::copy(temp_file.path(), &temp_path).expect("Operation failed");

 let detected = detect_mime_type(&temp_path, true);

 assert!(detected.is_ok(), "Should handle multiple dots in filename");
-assert_eq!(
+assert_eq!(
+detected.expect("Operation failed"),
+"application/pdf",
+"Should use last extension"
+);

 let _ = std::fs::remove_file(&temp_path);
 }
@@ -347,16 +374,20 @@ async fn test_filename_multiple_dots() {
 #[tokio::test]
 async fn test_filename_special_characters() {
 let mut temp_file = NamedTempFile::new().expect("Should create temp file");
-let temp_path = temp_file
+let temp_path = temp_file
+.path()
+.parent()
+.expect("Operation failed")
+.join("文档 (copy) [v2].pdf");

-temp_file.write_all(b"test").
-temp_file.flush().
-std::fs::copy(temp_file.path(), &temp_path).
+temp_file.write_all(b"test").expect("Operation failed");
+temp_file.flush().expect("Operation failed");
+std::fs::copy(temp_file.path(), &temp_path).expect("Operation failed");

 let detected = detect_mime_type(&temp_path, true);

 assert!(detected.is_ok(), "Should handle special characters in filename");
-assert_eq!(detected.
+assert_eq!(detected.expect("Operation failed"), "application/pdf");

 let _ = std::fs::remove_file(&temp_path);
 }
@@ -382,11 +413,11 @@ async fn test_pandoc_formats_mime_detection() {

 for (filename, expected_mime) in pandoc_formats {
 let mut temp_file = NamedTempFile::new().expect("Should create temp file");
-let temp_path = temp_file.path().parent().
+let temp_path = temp_file.path().parent().expect("Operation failed").join(filename);

-temp_file.write_all(b"test content").
-temp_file.flush().
-std::fs::copy(temp_file.path(), &temp_path).
+temp_file.write_all(b"test content").expect("Operation failed");
+temp_file.flush().expect("Operation failed");
+std::fs::copy(temp_file.path(), &temp_path).expect("Operation failed");

 let detected = detect_mime_type(&temp_path, true);

@@ -396,7 +427,7 @@ async fn test_pandoc_formats_mime_detection() {
 filename
 );
 assert_eq!(
-detected.
+detected.expect("Operation failed"),
 expected_mime,
 "MIME type mismatch for Pandoc format: {}",
 filename
@@ -424,6 +455,6 @@ async fn test_pandoc_mime_validation() {
 for mime_type in pandoc_mimes {
 let result = validate_mime_type(mime_type);
 assert!(result.is_ok(), "Pandoc MIME type should be supported: {}", mime_type);
-assert_eq!(result.
+assert_eq!(result.expect("Operation failed"), mime_type);
 }
 }
@@ -492,8 +492,8 @@ fn test_ocr_cache_disabled_then_enabled() {
 }
 assert!(result2.is_ok(), "Second extraction should succeed");

-assert_non_empty_content(&result1.
-assert_non_empty_content(&result2.
+assert_non_empty_content(&result1.expect("Operation failed"));
+assert_non_empty_content(&result2.expect("Operation failed"));
 }

 #[test]
@@ -548,7 +548,10 @@ fn test_ocr_concurrent_same_file() {
 handles.push(handle);
 }

-let successes: usize = handles
+let successes: usize = handles
+.into_iter()
+.map(|h| if h.join().expect("Iterator failed") { 1 } else { 0 })
+.sum();

 tracing::debug!("Concurrent processing: {}/5 threads succeeded", successes);

@@ -615,7 +618,10 @@ fn test_ocr_concurrent_different_files() {
 handles.push(handle);
 }

-let successes: usize = handles
+let successes: usize = handles
+.into_iter()
+.map(|h| if h.join().expect("Iterator failed") { 1 } else { 0 })
+.sum();

 assert_eq!(
 successes, 2,
@@ -120,7 +120,7 @@ fn test_registry_singleton_behavior() {
 #[test]
 fn test_easyocr_special_languages() {
 let registry = LanguageRegistry::new();
-let languages = registry.get_supported_languages("easyocr").
+let languages = registry.get_supported_languages("easyocr").expect("Operation failed");

 let special_langs = vec!["ch_sim", "ch_tra", "rs_cyrillic", "rs_latin"];

@@ -56,10 +56,10 @@ fn test_rayon_batch_stress_many_images() {
 success_count
 );

-let first_content = results[0].result.as_ref().
+let first_content = results[0].result.as_ref().expect("Operation failed").content.clone();
 for (i, result) in results.iter().enumerate().skip(1) {
 assert!(result.success, "Result {} should succeed", i);
-let content = &result.result.as_ref().
+let content = &result.result.as_ref().expect("Operation failed").content;
 assert_eq!(
 content, &first_content,
 "Result {} content differs - possible race condition",
@@ -220,7 +220,7 @@ fn test_tesseract_api_thread_safety() {
 thread_id,
 result.err()
 );
-result.
+result.expect("Operation failed")
 }));
 }

@@ -26,9 +26,9 @@ mod helpers;
 fn get_test_file_path(filename: &str) -> PathBuf {
 let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
 .parent()
-.
+.expect("Operation failed")
 .parent()
-.
+.expect("Operation failed");
 workspace_root.join(format!("test_documents/odt/{}", filename))
 }

@@ -48,9 +48,9 @@ fn ensure_test_file_exists(path: &Path) -> bool {
 async fn test_odt_metadata_extraction() {
 let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
 .parent()
-.
+.expect("Operation failed")
 .parent()
-.
+.expect("Operation failed");
 let test_file = workspace_root.join("test_documents/metadata_test.odt");

 if !ensure_test_file_exists(&test_file) {
@@ -615,9 +615,9 @@ async fn test_odt_table_no_duplicate_content() {
 async fn test_odt_comprehensive_table_extraction() {
 let test_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
 .parent()
-.
+.expect("Operation failed")
 .parent()
-.
+.expect("Operation failed")
 .join("test_documents/extraction_test.odt");

 if !test_file.exists() {
@@ -22,9 +22,9 @@ mod helpers;
 fn get_test_opml_path(filename: &str) -> PathBuf {
 let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
 .parent()
-.
+.expect("Operation failed")
 .parent()
-.
+.expect("Operation failed");
 workspace_root.join(format!("test_documents/opml/{}", filename))
 }

@@ -24,9 +24,9 @@ use std::path::PathBuf;
 fn get_test_orgmode_path(filename: &str) -> PathBuf {
 let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
 .parent()
-.
+.expect("Operation failed")
 .parent()
-.
+.expect("Operation failed");
 workspace_root.join(format!("test_documents/orgmode/{}", filename))
 }

@@ -202,7 +202,7 @@ fn test_marker_appears_before_content() {
 assert!(marker_pos.is_some(), "Marker should be present");

 // Marker should be very early in the content (within first 50 chars)
-let pos = marker_pos.
+let pos = marker_pos.expect("Operation failed");
 assert!(
 pos < 50,
 "Marker for page 1 should appear at the start, but found at position {}",
@@ -64,7 +64,7 @@ async fn test_full_hierarchy_extraction() {
 "Pages should be extracted when extract_pages is enabled"
 );

-let pages = result.pages.as_ref().
+let pages = result.pages.as_ref().expect("Operation failed");
 assert!(!pages.is_empty(), "At least one page should be extracted");

 // Check that the first page has hierarchy information
@@ -74,7 +74,7 @@ async fn test_full_hierarchy_extraction() {
 "First page should have hierarchy information when hierarchy extraction is enabled"
 );

-let hierarchy = first_page.hierarchy.as_ref().
+let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed");

 // Verify hierarchy structure
 assert!(hierarchy.block_count > 0, "Hierarchy should contain at least one block");
@@ -172,7 +172,7 @@ async fn test_hierarchy_disabled() {
 // Verify that pages were extracted
 assert!(result.pages.is_some(), "Pages should be extracted");

-let pages = result.pages.as_ref().
+let pages = result.pages.as_ref().expect("Operation failed");
 assert!(!pages.is_empty(), "At least one page should be extracted");

 // Check that the first page does NOT have hierarchy information when disabled
@@ -227,7 +227,7 @@ async fn test_hierarchy_with_explicit_disabled() {
 // Verify that pages were extracted
 assert!(result.pages.is_some(), "Pages should be extracted");

-let pages = result.pages.as_ref().
+let pages = result.pages.as_ref().expect("Operation failed");
 assert!(!pages.is_empty(), "At least one page should be extracted");

 // Check that the first page does NOT have hierarchy information when disabled
@@ -282,7 +282,7 @@ async fn test_hierarchy_different_k_clusters() {

 assert!(result.pages.is_some(), "Pages should be extracted");

-let pages = result.pages.as_ref().
+let pages = result.pages.as_ref().expect("Operation failed");
 assert!(!pages.is_empty(), "At least one page should be extracted");

 let first_page = &pages[0];
@@ -292,7 +292,7 @@ async fn test_hierarchy_different_k_clusters() {
 k
 );

-let hierarchy = first_page.hierarchy.as_ref().
+let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed");
 eprintln!("K={}: {} hierarchy blocks extracted", k, hierarchy.block_count);
 assert!(hierarchy.block_count > 0, "Should have blocks with k={}", k);
 }
@@ -29,7 +29,7 @@ fn test_extract_chars_basic() {
 // Load PDF
 let pdfium = Pdfium;
 let document = pdfium
-.load_pdf_from_file(pdf_path.to_str().
+.load_pdf_from_file(pdf_path.to_str().expect("Operation failed"), None)
 .expect("Failed to load test PDF");

 // Get first page
@@ -62,7 +62,7 @@ fn test_extract_chars_preserves_order() {
 // Load PDF
 let pdfium = Pdfium;
 let document = pdfium
-.load_pdf_from_file(pdf_path.to_str().
+.load_pdf_from_file(pdf_path.to_str().expect("Operation failed"), None)
 .expect("Failed to load test PDF");

 // Get first page