kreuzberg 4.1.2 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +10 -2
|
@@ -25,9 +25,9 @@ use zip::write::{FileOptions, ZipWriter};
|
|
|
25
25
|
async fn test_ppsx_slideshow_extraction() {
|
|
26
26
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
27
27
|
.parent()
|
|
28
|
-
.
|
|
28
|
+
.expect("Operation failed")
|
|
29
29
|
.parent()
|
|
30
|
-
.
|
|
30
|
+
.expect("Operation failed");
|
|
31
31
|
let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
|
|
32
32
|
|
|
33
33
|
if !test_file.exists() {
|
|
@@ -69,9 +69,9 @@ async fn test_ppsx_slideshow_extraction() {
|
|
|
69
69
|
async fn test_ppsx_with_explicit_mime_type() {
|
|
70
70
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
71
71
|
.parent()
|
|
72
|
-
.
|
|
72
|
+
.expect("Operation failed")
|
|
73
73
|
.parent()
|
|
74
|
-
.
|
|
74
|
+
.expect("Operation failed");
|
|
75
75
|
let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
|
|
76
76
|
|
|
77
77
|
if !test_file.exists() {
|
|
@@ -120,24 +120,26 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
|
|
|
120
120
|
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
121
121
|
|
|
122
122
|
// Add [Content_Types].xml
|
|
123
|
-
zip.start_file("[Content_Types].xml", options)
|
|
123
|
+
zip.start_file("[Content_Types].xml", options)
|
|
124
|
+
.expect("Operation failed");
|
|
124
125
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
125
126
|
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
126
127
|
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
127
128
|
<Default Extension="xml" ContentType="application/xml"/>
|
|
128
129
|
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
|
|
129
130
|
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
130
|
-
</Types>"#).
|
|
131
|
+
</Types>"#).expect("Operation failed");
|
|
131
132
|
|
|
132
133
|
// Add _rels/.rels
|
|
133
|
-
zip.start_file("_rels/.rels", options).
|
|
134
|
+
zip.start_file("_rels/.rels", options).expect("Operation failed");
|
|
134
135
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
135
136
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
136
137
|
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
137
|
-
</Relationships>"#).
|
|
138
|
+
</Relationships>"#).expect("Operation failed");
|
|
138
139
|
|
|
139
140
|
// Add ppt/presentation.xml
|
|
140
|
-
zip.start_file("ppt/presentation.xml", options)
|
|
141
|
+
zip.start_file("ppt/presentation.xml", options)
|
|
142
|
+
.expect("Operation failed");
|
|
141
143
|
zip.write_all(
|
|
142
144
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
143
145
|
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
@@ -148,18 +150,20 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
|
|
|
148
150
|
</p:sldIdLst>
|
|
149
151
|
</p:presentation>"#,
|
|
150
152
|
)
|
|
151
|
-
.
|
|
153
|
+
.expect("Operation failed");
|
|
152
154
|
|
|
153
155
|
// Add ppt/_rels/presentation.xml.rels
|
|
154
|
-
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
156
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
157
|
+
.expect("Operation failed");
|
|
155
158
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
156
159
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
157
160
|
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
158
|
-
</Relationships>"#).
|
|
161
|
+
</Relationships>"#).expect("Operation failed");
|
|
159
162
|
|
|
160
163
|
// Add ppt/slides/slide1.xml with a shape WITHOUT txBody (image placeholder)
|
|
161
164
|
// This is the critical test case - a <p:sp> element with no <p:txBody>
|
|
162
|
-
zip.start_file("ppt/slides/slide1.xml", options)
|
|
165
|
+
zip.start_file("ppt/slides/slide1.xml", options)
|
|
166
|
+
.expect("Operation failed");
|
|
163
167
|
zip.write_all(
|
|
164
168
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
165
169
|
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
@@ -259,18 +263,19 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
|
|
|
259
263
|
</p:cSld>
|
|
260
264
|
</p:sld>"#,
|
|
261
265
|
)
|
|
262
|
-
.
|
|
266
|
+
.expect("Operation failed");
|
|
263
267
|
|
|
264
268
|
// Add ppt/slides/_rels/slide1.xml.rels (empty)
|
|
265
|
-
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
269
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
270
|
+
.expect("Operation failed");
|
|
266
271
|
zip.write_all(
|
|
267
272
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
268
273
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
269
274
|
</Relationships>"#,
|
|
270
275
|
)
|
|
271
|
-
.
|
|
276
|
+
.expect("Operation failed");
|
|
272
277
|
|
|
273
|
-
zip.finish().
|
|
278
|
+
zip.finish().expect("Operation failed");
|
|
274
279
|
}
|
|
275
280
|
|
|
276
281
|
// Extract the PPTX file
|
|
@@ -336,24 +341,26 @@ async fn test_pptx_mixed_shapes_extraction() {
|
|
|
336
341
|
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
337
342
|
|
|
338
343
|
// Add [Content_Types].xml
|
|
339
|
-
zip.start_file("[Content_Types].xml", options)
|
|
344
|
+
zip.start_file("[Content_Types].xml", options)
|
|
345
|
+
.expect("Operation failed");
|
|
340
346
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
341
347
|
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
342
348
|
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
343
349
|
<Default Extension="xml" ContentType="application/xml"/>
|
|
344
350
|
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
|
|
345
351
|
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
346
|
-
</Types>"#).
|
|
352
|
+
</Types>"#).expect("Operation failed");
|
|
347
353
|
|
|
348
354
|
// Add _rels/.rels
|
|
349
|
-
zip.start_file("_rels/.rels", options).
|
|
355
|
+
zip.start_file("_rels/.rels", options).expect("Operation failed");
|
|
350
356
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
351
357
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
352
358
|
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
353
|
-
</Relationships>"#).
|
|
359
|
+
</Relationships>"#).expect("Operation failed");
|
|
354
360
|
|
|
355
361
|
// Add ppt/presentation.xml
|
|
356
|
-
zip.start_file("ppt/presentation.xml", options)
|
|
362
|
+
zip.start_file("ppt/presentation.xml", options)
|
|
363
|
+
.expect("Operation failed");
|
|
357
364
|
zip.write_all(
|
|
358
365
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
359
366
|
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
@@ -364,17 +371,19 @@ async fn test_pptx_mixed_shapes_extraction() {
|
|
|
364
371
|
</p:sldIdLst>
|
|
365
372
|
</p:presentation>"#,
|
|
366
373
|
)
|
|
367
|
-
.
|
|
374
|
+
.expect("Operation failed");
|
|
368
375
|
|
|
369
376
|
// Add ppt/_rels/presentation.xml.rels
|
|
370
|
-
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
377
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
378
|
+
.expect("Operation failed");
|
|
371
379
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
372
380
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
373
381
|
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
374
|
-
</Relationships>"#).
|
|
382
|
+
</Relationships>"#).expect("Operation failed");
|
|
375
383
|
|
|
376
384
|
// Add slide with various shapes - some with txBody, some without
|
|
377
|
-
zip.start_file("ppt/slides/slide1.xml", options)
|
|
385
|
+
zip.start_file("ppt/slides/slide1.xml", options)
|
|
386
|
+
.expect("Operation failed");
|
|
378
387
|
zip.write_all(
|
|
379
388
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
380
389
|
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
@@ -453,18 +462,19 @@ async fn test_pptx_mixed_shapes_extraction() {
|
|
|
453
462
|
</p:cSld>
|
|
454
463
|
</p:sld>"#,
|
|
455
464
|
)
|
|
456
|
-
.
|
|
465
|
+
.expect("Operation failed");
|
|
457
466
|
|
|
458
467
|
// Add empty rels
|
|
459
|
-
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
468
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
469
|
+
.expect("Operation failed");
|
|
460
470
|
zip.write_all(
|
|
461
471
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
462
472
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
463
473
|
</Relationships>"#,
|
|
464
474
|
)
|
|
465
|
-
.
|
|
475
|
+
.expect("Operation failed");
|
|
466
476
|
|
|
467
|
-
zip.finish().
|
|
477
|
+
zip.finish().expect("Operation failed");
|
|
468
478
|
}
|
|
469
479
|
|
|
470
480
|
let result = extract_file(
|
|
@@ -184,9 +184,9 @@ fn test_register_multiple_validators_succeeds() {
|
|
|
184
184
|
should_fail: true,
|
|
185
185
|
});
|
|
186
186
|
|
|
187
|
-
registry.register(v1).
|
|
188
|
-
registry.register(v2).
|
|
189
|
-
registry.register(v3).
|
|
187
|
+
registry.register(v1).expect("Operation failed");
|
|
188
|
+
registry.register(v2).expect("Operation failed");
|
|
189
|
+
registry.register(v3).expect("Operation failed");
|
|
190
190
|
|
|
191
191
|
let list = registry.list();
|
|
192
192
|
assert_eq!(list.len(), 3, "Should have three validators");
|
|
@@ -205,7 +205,7 @@ fn test_validator_unregistration_succeeds() {
|
|
|
205
205
|
should_fail: false,
|
|
206
206
|
});
|
|
207
207
|
|
|
208
|
-
registry.register(validator).
|
|
208
|
+
registry.register(validator).expect("Operation failed");
|
|
209
209
|
assert_eq!(registry.list().len(), 1);
|
|
210
210
|
|
|
211
211
|
let result = registry.remove("temp-validator");
|
|
@@ -298,8 +298,8 @@ fn test_clear_validators_succeeds() {
|
|
|
298
298
|
should_fail: false,
|
|
299
299
|
});
|
|
300
300
|
|
|
301
|
-
registry.register(v1).
|
|
302
|
-
registry.register(v2).
|
|
301
|
+
registry.register(v1).expect("Operation failed");
|
|
302
|
+
registry.register(v2).expect("Operation failed");
|
|
303
303
|
assert_eq!(registry.list().len(), 2);
|
|
304
304
|
|
|
305
305
|
let result = registry.shutdown_all();
|
|
@@ -355,9 +355,9 @@ fn test_get_all_validators_respects_priority() {
|
|
|
355
355
|
priority: 100,
|
|
356
356
|
});
|
|
357
357
|
|
|
358
|
-
registry.register(medium).
|
|
359
|
-
registry.register(low).
|
|
360
|
-
registry.register(high).
|
|
358
|
+
registry.register(medium).expect("Operation failed");
|
|
359
|
+
registry.register(low).expect("Operation failed");
|
|
360
|
+
registry.register(high).expect("Operation failed");
|
|
361
361
|
|
|
362
362
|
let all = registry.get_all();
|
|
363
363
|
assert_eq!(all.len(), 3, "Should have three validators");
|
|
@@ -397,11 +397,11 @@ fn test_get_extractor_by_mime_type_succeeds() {
|
|
|
397
397
|
priority: 50,
|
|
398
398
|
});
|
|
399
399
|
|
|
400
|
-
registry.register(extractor).
|
|
400
|
+
registry.register(extractor).expect("Operation failed");
|
|
401
401
|
|
|
402
402
|
let result = registry.get("application/pdf");
|
|
403
403
|
assert!(result.is_ok(), "Should find extractor for PDF");
|
|
404
|
-
assert_eq!(result.
|
|
404
|
+
assert_eq!(result.expect("Operation failed").name(), "pdf-extractor");
|
|
405
405
|
}
|
|
406
406
|
|
|
407
407
|
/// Test extractor not found for unsupported MIME type.
|
|
@@ -437,10 +437,10 @@ fn test_extractor_priority_selection() {
|
|
|
437
437
|
priority: 100,
|
|
438
438
|
});
|
|
439
439
|
|
|
440
|
-
registry.register(low_priority).
|
|
441
|
-
registry.register(high_priority).
|
|
440
|
+
registry.register(low_priority).expect("Operation failed");
|
|
441
|
+
registry.register(high_priority).expect("Operation failed");
|
|
442
442
|
|
|
443
|
-
let result = registry.get("text/plain").
|
|
443
|
+
let result = registry.get("text/plain").expect("Value not found");
|
|
444
444
|
assert_eq!(
|
|
445
445
|
result.name(),
|
|
446
446
|
"high-priority-extractor",
|
|
@@ -459,15 +459,15 @@ fn test_extractor_wildcard_mime_matching() {
|
|
|
459
459
|
priority: 50,
|
|
460
460
|
});
|
|
461
461
|
|
|
462
|
-
registry.register(extractor).
|
|
462
|
+
registry.register(extractor).expect("Operation failed");
|
|
463
463
|
|
|
464
464
|
let result = registry.get("text/plain");
|
|
465
465
|
assert!(result.is_ok(), "Should match text/plain with text/*");
|
|
466
|
-
assert_eq!(result.
|
|
466
|
+
assert_eq!(result.expect("Operation failed").name(), "text-extractor");
|
|
467
467
|
|
|
468
468
|
let result = registry.get("text/html");
|
|
469
469
|
assert!(result.is_ok(), "Should match text/html with text/*");
|
|
470
|
-
assert_eq!(result.
|
|
470
|
+
assert_eq!(result.expect("Operation failed").name(), "text-extractor");
|
|
471
471
|
|
|
472
472
|
let result = registry.get("application/pdf");
|
|
473
473
|
assert!(result.is_err(), "Should not match application/pdf with text/*");
|
|
@@ -484,7 +484,7 @@ fn test_extractor_unregistration_succeeds() {
|
|
|
484
484
|
priority: 50,
|
|
485
485
|
});
|
|
486
486
|
|
|
487
|
-
registry.register(extractor).
|
|
487
|
+
registry.register(extractor).expect("Operation failed");
|
|
488
488
|
assert_eq!(registry.list().len(), 1);
|
|
489
489
|
|
|
490
490
|
let result = registry.remove("temp-extractor");
|
|
@@ -506,17 +506,20 @@ fn test_extractor_multiple_mime_types() {
|
|
|
506
506
|
priority: 50,
|
|
507
507
|
});
|
|
508
508
|
|
|
509
|
-
registry.register(extractor).
|
|
509
|
+
registry.register(extractor).expect("Operation failed");
|
|
510
510
|
|
|
511
511
|
assert!(registry.get("application/pdf").is_ok());
|
|
512
512
|
assert!(registry.get("application/vnd.ms-excel").is_ok());
|
|
513
513
|
assert!(registry.get("text/csv").is_ok());
|
|
514
514
|
|
|
515
515
|
assert_eq!(
|
|
516
|
-
registry.get("application/pdf").
|
|
516
|
+
registry.get("application/pdf").expect("Value not found").name(),
|
|
517
|
+
"multi-format-extractor"
|
|
518
|
+
);
|
|
519
|
+
assert_eq!(
|
|
520
|
+
registry.get("text/csv").expect("Value not found").name(),
|
|
517
521
|
"multi-format-extractor"
|
|
518
522
|
);
|
|
519
|
-
assert_eq!(registry.get("text/csv").unwrap().name(), "multi-format-extractor");
|
|
520
523
|
}
|
|
521
524
|
|
|
522
525
|
/// Test clearing all extractors.
|
|
@@ -535,8 +538,8 @@ fn test_clear_extractors_succeeds() {
|
|
|
535
538
|
priority: 50,
|
|
536
539
|
});
|
|
537
540
|
|
|
538
|
-
registry.register(e1).
|
|
539
|
-
registry.register(e2).
|
|
541
|
+
registry.register(e1).expect("Operation failed");
|
|
542
|
+
registry.register(e2).expect("Operation failed");
|
|
540
543
|
assert_eq!(registry.list().len(), 2);
|
|
541
544
|
|
|
542
545
|
let result = registry.shutdown_all();
|
|
@@ -686,7 +686,7 @@ async fn test_rst_extraction_no_errors() {
|
|
|
686
686
|
result.err()
|
|
687
687
|
);
|
|
688
688
|
|
|
689
|
-
let extraction = result.
|
|
689
|
+
let extraction = result.expect("Operation failed");
|
|
690
690
|
|
|
691
691
|
assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
|
|
692
692
|
|
|
@@ -74,7 +74,7 @@ async fn test_rtf_accent_extraction() {
|
|
|
74
74
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
75
75
|
|
|
76
76
|
assert!(result.is_ok(), "RTF extraction should succeed for accent.rtf");
|
|
77
|
-
let extraction = result.
|
|
77
|
+
let extraction = result.expect("Operation failed");
|
|
78
78
|
|
|
79
79
|
assert_eq!(extraction.mime_type, "application/rtf");
|
|
80
80
|
|
|
@@ -112,7 +112,7 @@ async fn test_rtf_bookmark_extraction() {
|
|
|
112
112
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
113
113
|
|
|
114
114
|
assert!(result.is_ok(), "RTF extraction should succeed for bookmark.rtf");
|
|
115
|
-
let extraction = result.
|
|
115
|
+
let extraction = result.expect("Operation failed");
|
|
116
116
|
|
|
117
117
|
let content = extraction.content.to_lowercase();
|
|
118
118
|
|
|
@@ -137,7 +137,7 @@ async fn test_rtf_footnote_extraction() {
|
|
|
137
137
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
138
138
|
|
|
139
139
|
assert!(result.is_ok(), "RTF extraction should succeed for footnote.rtf");
|
|
140
|
-
let extraction = result.
|
|
140
|
+
let extraction = result.expect("Operation failed");
|
|
141
141
|
|
|
142
142
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
143
143
|
|
|
@@ -176,7 +176,7 @@ async fn test_rtf_formatting_extraction() {
|
|
|
176
176
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
177
177
|
|
|
178
178
|
assert!(result.is_ok(), "RTF extraction should succeed for formatting.rtf");
|
|
179
|
-
let extraction = result.
|
|
179
|
+
let extraction = result.expect("Operation failed");
|
|
180
180
|
|
|
181
181
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
182
182
|
|
|
@@ -223,7 +223,7 @@ async fn test_rtf_heading_extraction() {
|
|
|
223
223
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
224
224
|
|
|
225
225
|
assert!(result.is_ok(), "RTF extraction should succeed for heading.rtf");
|
|
226
|
-
let extraction = result.
|
|
226
|
+
let extraction = result.expect("Operation failed");
|
|
227
227
|
|
|
228
228
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
229
229
|
|
|
@@ -269,7 +269,7 @@ async fn test_rtf_image_extraction() {
|
|
|
269
269
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
270
270
|
|
|
271
271
|
assert!(result.is_ok(), "RTF extraction should succeed for image.rtf");
|
|
272
|
-
let extraction = result.
|
|
272
|
+
let extraction = result.expect("Operation failed");
|
|
273
273
|
|
|
274
274
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
275
275
|
|
|
@@ -301,7 +301,7 @@ async fn test_rtf_link_extraction() {
|
|
|
301
301
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
302
302
|
|
|
303
303
|
assert!(result.is_ok(), "RTF extraction should succeed for link.rtf");
|
|
304
|
-
let extraction = result.
|
|
304
|
+
let extraction = result.expect("Operation failed");
|
|
305
305
|
|
|
306
306
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
307
307
|
|
|
@@ -328,7 +328,7 @@ async fn test_rtf_list_complex_extraction() {
|
|
|
328
328
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
329
329
|
|
|
330
330
|
assert!(result.is_ok(), "RTF extraction should succeed for list_complex.rtf");
|
|
331
|
-
let extraction = result.
|
|
331
|
+
let extraction = result.expect("Operation failed");
|
|
332
332
|
|
|
333
333
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
334
334
|
|
|
@@ -381,7 +381,7 @@ async fn test_rtf_list_simple_extraction() {
|
|
|
381
381
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
382
382
|
|
|
383
383
|
assert!(result.is_ok(), "RTF extraction should succeed for list_simple.rtf");
|
|
384
|
-
let extraction = result.
|
|
384
|
+
let extraction = result.expect("Operation failed");
|
|
385
385
|
|
|
386
386
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
387
387
|
|
|
@@ -422,7 +422,7 @@ async fn test_rtf_table_error_codes_extraction() {
|
|
|
422
422
|
result.is_ok(),
|
|
423
423
|
"RTF extraction should succeed for table_error_codes.rtf"
|
|
424
424
|
);
|
|
425
|
-
let extraction = result.
|
|
425
|
+
let extraction = result.expect("Operation failed");
|
|
426
426
|
|
|
427
427
|
assert!(
|
|
428
428
|
extraction.mime_type == "application/rtf",
|
|
@@ -448,7 +448,7 @@ async fn test_rtf_table_simple_extraction() {
|
|
|
448
448
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
449
449
|
|
|
450
450
|
assert!(result.is_ok(), "RTF extraction should succeed for table_simple.rtf");
|
|
451
|
-
let extraction = result.
|
|
451
|
+
let extraction = result.expect("Operation failed");
|
|
452
452
|
|
|
453
453
|
assert!(
|
|
454
454
|
extraction.mime_type == "application/rtf",
|
|
@@ -470,7 +470,7 @@ async fn test_rtf_unicode_extraction() {
|
|
|
470
470
|
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
471
471
|
|
|
472
472
|
assert!(result.is_ok(), "RTF extraction should succeed for unicode.rtf");
|
|
473
|
-
let extraction = result.
|
|
473
|
+
let extraction = result.expect("Operation failed");
|
|
474
474
|
|
|
475
475
|
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
476
476
|
|
|
@@ -493,8 +493,8 @@ async fn test_rtf_extraction_deterministic_unicode() {
|
|
|
493
493
|
|
|
494
494
|
assert!(result1.is_ok() && result2.is_ok(), "Both extractions should succeed");
|
|
495
495
|
|
|
496
|
-
let extraction1 = result1.
|
|
497
|
-
let extraction2 = result2.
|
|
496
|
+
let extraction1 = result1.expect("Operation failed");
|
|
497
|
+
let extraction2 = result2.expect("Operation failed");
|
|
498
498
|
|
|
499
499
|
assert_eq!(
|
|
500
500
|
extraction1.content, extraction2.content,
|
|
@@ -514,8 +514,8 @@ async fn test_rtf_extraction_deterministic_list_complex() {
|
|
|
514
514
|
|
|
515
515
|
assert!(result1.is_ok() && result2.is_ok(), "Both extractions should succeed");
|
|
516
516
|
|
|
517
|
-
let extraction1 = result1.
|
|
518
|
-
let extraction2 = result2.
|
|
517
|
+
let extraction1 = result1.expect("Operation failed");
|
|
518
|
+
let extraction2 = result2.expect("Operation failed");
|
|
519
519
|
|
|
520
520
|
assert_eq!(
|
|
521
521
|
extraction1.content, extraction2.content,
|
|
@@ -551,7 +551,7 @@ async fn test_rtf_no_critical_content_loss() {
|
|
|
551
551
|
filename
|
|
552
552
|
);
|
|
553
553
|
|
|
554
|
-
let extraction = result.
|
|
554
|
+
let extraction = result.expect("Operation failed");
|
|
555
555
|
assert!(
|
|
556
556
|
!extraction.content.is_empty(),
|
|
557
557
|
"FAIL: CRITICAL - Extracted 0 bytes from {}. RTF extractor lost all content.",
|
|
@@ -582,7 +582,7 @@ async fn test_rtf_mime_type_preservation() {
|
|
|
582
582
|
|
|
583
583
|
assert!(result.is_ok(), "Extraction should succeed for {}", filename);
|
|
584
584
|
|
|
585
|
-
let extraction = result.
|
|
585
|
+
let extraction = result.expect("Operation failed");
|
|
586
586
|
assert_eq!(
|
|
587
587
|
extraction.mime_type, "application/rtf",
|
|
588
588
|
"FAIL: MIME type not preserved for {}",
|
|
@@ -31,11 +31,11 @@ fn test_archive_zip_bomb_detection() {
|
|
|
31
31
|
let mut zip = ZipWriter::new(&mut cursor);
|
|
32
32
|
let options = FileOptions::<'_, ()>::default();
|
|
33
33
|
|
|
34
|
-
zip.start_file("large.txt", options).
|
|
34
|
+
zip.start_file("large.txt", options).expect("Operation failed");
|
|
35
35
|
let zeros = vec![0u8; 10 * 1024 * 1024];
|
|
36
|
-
zip.write_all(&zeros).
|
|
36
|
+
zip.write_all(&zeros).expect("Operation failed");
|
|
37
37
|
|
|
38
|
-
zip.finish().
|
|
38
|
+
zip.finish().expect("Operation failed");
|
|
39
39
|
}
|
|
40
40
|
|
|
41
41
|
let bytes = cursor.into_inner();
|
|
@@ -57,10 +57,10 @@ fn test_archive_path_traversal_zip() {
|
|
|
57
57
|
let mut zip = ZipWriter::new(&mut cursor);
|
|
58
58
|
let options = FileOptions::<'_, ()>::default();
|
|
59
59
|
|
|
60
|
-
zip.start_file("../../etc/passwd", options).
|
|
61
|
-
zip.write_all(b"malicious content").
|
|
60
|
+
zip.start_file("../../etc/passwd", options).expect("Operation failed");
|
|
61
|
+
zip.write_all(b"malicious content").expect("Operation failed");
|
|
62
62
|
|
|
63
|
-
zip.finish().
|
|
63
|
+
zip.finish().expect("Operation failed");
|
|
64
64
|
}
|
|
65
65
|
|
|
66
66
|
let bytes = cursor.into_inner();
|
|
@@ -97,10 +97,10 @@ fn test_archive_absolute_paths_rejected() {
|
|
|
97
97
|
let mut zip = ZipWriter::new(&mut cursor);
|
|
98
98
|
let options = FileOptions::<'_, ()>::default();
|
|
99
99
|
|
|
100
|
-
zip.start_file("/tmp/malicious.txt", options).
|
|
101
|
-
zip.write_all(b"malicious content").
|
|
100
|
+
zip.start_file("/tmp/malicious.txt", options).expect("Operation failed");
|
|
101
|
+
zip.write_all(b"malicious content").expect("Operation failed");
|
|
102
102
|
|
|
103
|
-
zip.finish().
|
|
103
|
+
zip.finish().expect("Operation failed");
|
|
104
104
|
}
|
|
105
105
|
|
|
106
106
|
let bytes = cursor.into_inner();
|
|
@@ -125,10 +125,10 @@ fn test_archive_deeply_nested_directories() {
|
|
|
125
125
|
let deep_path = (0..100).map(|i| format!("dir{}", i)).collect::<Vec<_>>().join("/");
|
|
126
126
|
let file_path = format!("{}/file.txt", deep_path);
|
|
127
127
|
|
|
128
|
-
zip.start_file(&file_path, options).
|
|
129
|
-
zip.write_all(b"deep content").
|
|
128
|
+
zip.start_file(&file_path, options).expect("Operation failed");
|
|
129
|
+
zip.write_all(b"deep content").expect("Operation failed");
|
|
130
130
|
|
|
131
|
-
zip.finish().
|
|
131
|
+
zip.finish().expect("Operation failed");
|
|
132
132
|
}
|
|
133
133
|
|
|
134
134
|
let bytes = cursor.into_inner();
|
|
@@ -149,11 +149,12 @@ fn test_archive_many_small_files() {
|
|
|
149
149
|
let options = FileOptions::<'_, ()>::default();
|
|
150
150
|
|
|
151
151
|
for i in 0..1000 {
|
|
152
|
-
zip.start_file(format!("file{}.txt", i), options)
|
|
153
|
-
|
|
152
|
+
zip.start_file(format!("file{}.txt", i), options)
|
|
153
|
+
.expect("Operation failed");
|
|
154
|
+
zip.write_all(b"small content").expect("Operation failed");
|
|
154
155
|
}
|
|
155
156
|
|
|
156
|
-
zip.finish().
|
|
157
|
+
zip.finish().expect("Operation failed");
|
|
157
158
|
}
|
|
158
159
|
|
|
159
160
|
let bytes = cursor.into_inner();
|
|
@@ -404,13 +405,13 @@ fn test_security_directory_instead_of_file() {
|
|
|
404
405
|
|
|
405
406
|
#[test]
|
|
406
407
|
fn test_security_special_file_handling() {
|
|
407
|
-
let mut tmpfile = NamedTempFile::new().
|
|
408
|
-
tmpfile.write_all(b"test content").
|
|
409
|
-
tmpfile.flush().
|
|
408
|
+
let mut tmpfile = NamedTempFile::new().expect("Operation failed");
|
|
409
|
+
tmpfile.write_all(b"test content").expect("Operation failed");
|
|
410
|
+
tmpfile.flush().expect("Operation failed");
|
|
410
411
|
let path = tmpfile.path();
|
|
411
412
|
|
|
412
413
|
let config = ExtractionConfig::default();
|
|
413
|
-
let result = extract_file_sync(path.to_str().
|
|
414
|
+
let result = extract_file_sync(path.to_str().expect("Operation failed"), None, &config);
|
|
414
415
|
|
|
415
416
|
assert!(result.is_ok() || result.is_err());
|
|
416
417
|
}
|