kreuzberg 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +534 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +157 -0
- data/README.md +421 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +340 -0
- data/ext/kreuzberg_rb/extconf.rb +35 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +17 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +105 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +45 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +684 -0
- data/lib/kreuzberg/errors.rb +50 -0
- data/lib/kreuzberg/extraction_api.rb +84 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +79 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +82 -0
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +468 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +87 -0
- data/spec/binding/cli_spec.rb +54 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +42 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +134 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/build.rs +460 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +903 -0
- data/vendor/kreuzberg/src/core/io.rs +327 -0
- data/vendor/kreuzberg/src/core/mime.rs +615 -0
- data/vendor/kreuzberg/src/core/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
- data/vendor/kreuzberg/src/embeddings.rs +323 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
- data/vendor/kreuzberg/src/extractors/email.rs +129 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
- data/vendor/kreuzberg/src/extractors/html.rs +410 -0
- data/vendor/kreuzberg/src/extractors/image.rs +195 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
- data/vendor/kreuzberg/src/extractors/text.rs +242 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +102 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +420 -0
- data/vendor/kreuzberg/src/pdf/text.rs +161 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +873 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
- data/vendor/kreuzberg/tests/config_features.rs +580 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
- data/vendor/kreuzberg/tests/core_integration.rs +493 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
- data/vendor/kreuzberg/tests/security_validation.rs +404 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- metadata +471 -0
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
//! CSV and spreadsheet integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests for CSV and TSV extraction via Pandoc.
|
|
4
|
+
//! Validates data extraction, custom delimiters, quoted fields, and edge cases.
|
|
5
|
+
|
|
6
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
+
use kreuzberg::core::extractor::extract_bytes;
|
|
8
|
+
|
|
9
|
+
mod helpers;
|
|
10
|
+
|
|
11
|
+
/// Test basic CSV extraction - simple comma-separated values.
|
|
12
|
+
#[tokio::test]
|
|
13
|
+
async fn test_csv_basic_extraction() {
|
|
14
|
+
let config = ExtractionConfig::default();
|
|
15
|
+
|
|
16
|
+
let csv_content = b"Name,Age,City\nAlice,30,NYC\nBob,25,LA";
|
|
17
|
+
|
|
18
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
19
|
+
|
|
20
|
+
if result.is_err() {
|
|
21
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
22
|
+
return;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
let extraction = result.unwrap();
|
|
26
|
+
|
|
27
|
+
assert_eq!(extraction.mime_type, "text/csv");
|
|
28
|
+
assert!(
|
|
29
|
+
extraction.chunks.is_none(),
|
|
30
|
+
"Chunks should be None without chunking config"
|
|
31
|
+
);
|
|
32
|
+
assert!(
|
|
33
|
+
extraction.detected_languages.is_none(),
|
|
34
|
+
"Language detection not enabled"
|
|
35
|
+
);
|
|
36
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
37
|
+
|
|
38
|
+
assert!(extraction.content.contains("Name"), "Should contain 'Name' header");
|
|
39
|
+
assert!(extraction.content.contains("Age"), "Should contain 'Age' header");
|
|
40
|
+
assert!(extraction.content.contains("City"), "Should contain 'City' header");
|
|
41
|
+
|
|
42
|
+
assert!(extraction.content.contains("Alice"), "Should contain Alice row");
|
|
43
|
+
assert!(extraction.content.contains("30"), "Should contain Alice's age");
|
|
44
|
+
assert!(extraction.content.contains("NYC"), "Should contain Alice's city");
|
|
45
|
+
|
|
46
|
+
assert!(extraction.content.contains("Bob"), "Should contain Bob row");
|
|
47
|
+
assert!(extraction.content.contains("25"), "Should contain Bob's age");
|
|
48
|
+
assert!(extraction.content.contains("LA"), "Should contain Bob's city");
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/// Test CSV with headers - first row as headers.
|
|
52
|
+
#[tokio::test]
|
|
53
|
+
async fn test_csv_with_headers() {
|
|
54
|
+
let config = ExtractionConfig::default();
|
|
55
|
+
|
|
56
|
+
let csv_content = b"Product,Price,Quantity\nApple,1.50,100\nBanana,0.75,200\nOrange,2.00,150";
|
|
57
|
+
|
|
58
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
59
|
+
|
|
60
|
+
if result.is_err() {
|
|
61
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
62
|
+
return;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
let extraction = result.unwrap();
|
|
66
|
+
|
|
67
|
+
assert!(
|
|
68
|
+
extraction.chunks.is_none(),
|
|
69
|
+
"Chunks should be None without chunking config"
|
|
70
|
+
);
|
|
71
|
+
assert!(
|
|
72
|
+
extraction.detected_languages.is_none(),
|
|
73
|
+
"Language detection not enabled"
|
|
74
|
+
);
|
|
75
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
76
|
+
|
|
77
|
+
assert!(extraction.content.contains("Product"), "Should contain Product header");
|
|
78
|
+
assert!(extraction.content.contains("Price"), "Should contain Price header");
|
|
79
|
+
assert!(
|
|
80
|
+
extraction.content.contains("Quantity"),
|
|
81
|
+
"Should contain Quantity header"
|
|
82
|
+
);
|
|
83
|
+
|
|
84
|
+
assert!(
|
|
85
|
+
extraction.content.contains("Apple")
|
|
86
|
+
&& extraction.content.contains("1.50")
|
|
87
|
+
&& extraction.content.contains("100")
|
|
88
|
+
);
|
|
89
|
+
assert!(
|
|
90
|
+
extraction.content.contains("Banana")
|
|
91
|
+
&& extraction.content.contains("0.75")
|
|
92
|
+
&& extraction.content.contains("200")
|
|
93
|
+
);
|
|
94
|
+
assert!(
|
|
95
|
+
extraction.content.contains("Orange")
|
|
96
|
+
&& extraction.content.contains("2.00")
|
|
97
|
+
&& extraction.content.contains("150")
|
|
98
|
+
);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/// Test CSV with custom delimiter - tab and semicolon.
|
|
102
|
+
#[tokio::test]
|
|
103
|
+
async fn test_csv_custom_delimiter() {
|
|
104
|
+
let config = ExtractionConfig::default();
|
|
105
|
+
|
|
106
|
+
let csv_content = b"Name;Age;City\nAlice;30;NYC\nBob;25;LA";
|
|
107
|
+
|
|
108
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
109
|
+
|
|
110
|
+
if result.is_err() {
|
|
111
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
112
|
+
return;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
let extraction = result.unwrap();
|
|
116
|
+
|
|
117
|
+
assert!(
|
|
118
|
+
extraction.chunks.is_none(),
|
|
119
|
+
"Chunks should be None without chunking config"
|
|
120
|
+
);
|
|
121
|
+
assert!(
|
|
122
|
+
extraction.detected_languages.is_none(),
|
|
123
|
+
"Language detection not enabled"
|
|
124
|
+
);
|
|
125
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
126
|
+
|
|
127
|
+
assert!(!extraction.content.is_empty(), "Content should be extracted");
|
|
128
|
+
|
|
129
|
+
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
|
130
|
+
assert!(extraction.content.contains("30"), "Should contain age");
|
|
131
|
+
assert!(extraction.content.contains("NYC"), "Should contain city");
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
/// Test TSV (Tab-Separated Values) file.
|
|
135
|
+
#[tokio::test]
|
|
136
|
+
async fn test_tsv_file() {
|
|
137
|
+
let config = ExtractionConfig::default();
|
|
138
|
+
|
|
139
|
+
let tsv_content = b"Name\tAge\tCity\nAlice\t30\tNYC\nBob\t25\tLA";
|
|
140
|
+
|
|
141
|
+
let result = extract_bytes(tsv_content, "text/tab-separated-values", &config).await;
|
|
142
|
+
|
|
143
|
+
if result.is_err() {
|
|
144
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
145
|
+
return;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
let extraction = result.unwrap();
|
|
149
|
+
|
|
150
|
+
assert_eq!(extraction.mime_type, "text/tab-separated-values");
|
|
151
|
+
assert!(
|
|
152
|
+
extraction.chunks.is_none(),
|
|
153
|
+
"Chunks should be None without chunking config"
|
|
154
|
+
);
|
|
155
|
+
assert!(
|
|
156
|
+
extraction.detected_languages.is_none(),
|
|
157
|
+
"Language detection not enabled"
|
|
158
|
+
);
|
|
159
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
160
|
+
|
|
161
|
+
assert!(extraction.content.contains("Name"), "Should contain Name header");
|
|
162
|
+
assert!(extraction.content.contains("Age"), "Should contain Age header");
|
|
163
|
+
assert!(extraction.content.contains("City"), "Should contain City header");
|
|
164
|
+
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
|
165
|
+
assert!(extraction.content.contains("Bob"), "Should contain Bob");
|
|
166
|
+
assert!(extraction.content.contains("30") && extraction.content.contains("NYC"));
|
|
167
|
+
assert!(extraction.content.contains("25") && extraction.content.contains("LA"));
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
/// Test CSV with quoted fields - fields containing commas.
|
|
171
|
+
#[tokio::test]
|
|
172
|
+
async fn test_csv_quoted_fields() {
|
|
173
|
+
let config = ExtractionConfig::default();
|
|
174
|
+
|
|
175
|
+
let csv_content =
|
|
176
|
+
b"Name,Description,Price\n\"Smith, John\",\"Product A, premium\",100\n\"Doe, Jane\",\"Product B, standard\",50";
|
|
177
|
+
|
|
178
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
179
|
+
|
|
180
|
+
if result.is_err() {
|
|
181
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
182
|
+
return;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
let extraction = result.unwrap();
|
|
186
|
+
|
|
187
|
+
assert!(
|
|
188
|
+
extraction.chunks.is_none(),
|
|
189
|
+
"Chunks should be None without chunking config"
|
|
190
|
+
);
|
|
191
|
+
assert!(
|
|
192
|
+
extraction.detected_languages.is_none(),
|
|
193
|
+
"Language detection not enabled"
|
|
194
|
+
);
|
|
195
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
196
|
+
|
|
197
|
+
assert!(extraction.content.contains("Smith"), "Should contain Smith");
|
|
198
|
+
assert!(extraction.content.contains("John"), "Should contain John");
|
|
199
|
+
assert!(extraction.content.contains("Doe"), "Should contain Doe");
|
|
200
|
+
assert!(extraction.content.contains("Jane"), "Should contain Jane");
|
|
201
|
+
|
|
202
|
+
assert!(extraction.content.contains("Product A") || extraction.content.contains("premium"));
|
|
203
|
+
assert!(extraction.content.contains("Product B") || extraction.content.contains("standard"));
|
|
204
|
+
|
|
205
|
+
assert!(extraction.content.contains("100") && extraction.content.contains("50"));
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
/// Test CSV with special characters - Unicode, newlines in fields.
|
|
209
|
+
#[tokio::test]
|
|
210
|
+
async fn test_csv_special_characters() {
|
|
211
|
+
let config = ExtractionConfig::default();
|
|
212
|
+
|
|
213
|
+
let csv_content = "Name,City,Emoji\nAlice,Tokyo 東京,🎉\nBob,París,✅\nCarlos,Москва,🌍".as_bytes();
|
|
214
|
+
|
|
215
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
216
|
+
|
|
217
|
+
if result.is_err() {
|
|
218
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
219
|
+
return;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
let extraction = result.unwrap();
|
|
223
|
+
|
|
224
|
+
assert!(
|
|
225
|
+
extraction.chunks.is_none(),
|
|
226
|
+
"Chunks should be None without chunking config"
|
|
227
|
+
);
|
|
228
|
+
assert!(
|
|
229
|
+
extraction.detected_languages.is_none(),
|
|
230
|
+
"Language detection not enabled"
|
|
231
|
+
);
|
|
232
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
233
|
+
|
|
234
|
+
assert!(!extraction.content.is_empty(), "Special characters should be handled");
|
|
235
|
+
|
|
236
|
+
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
|
237
|
+
assert!(extraction.content.contains("Bob"), "Should contain Bob");
|
|
238
|
+
assert!(extraction.content.contains("Carlos"), "Should contain Carlos");
|
|
239
|
+
|
|
240
|
+
assert!(extraction.content.contains("Tokyo") || extraction.content.contains("東京"));
|
|
241
|
+
assert!(extraction.content.contains("París") || extraction.content.contains("Paris"));
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
/// Test CSV with large file - 10,000+ rows (streaming).
|
|
245
|
+
#[tokio::test]
|
|
246
|
+
async fn test_csv_large_file() {
|
|
247
|
+
let config = ExtractionConfig::default();
|
|
248
|
+
|
|
249
|
+
let mut csv_content = "ID,Name,Value\n".to_string();
|
|
250
|
+
for i in 1..=10_000 {
|
|
251
|
+
csv_content.push_str(&format!("{},Item{},{}.00\n", i, i, i * 10));
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
let result = extract_bytes(csv_content.as_bytes(), "text/csv", &config).await;
|
|
255
|
+
|
|
256
|
+
if result.is_err() {
|
|
257
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
258
|
+
return;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
let extraction = result.unwrap();
|
|
262
|
+
|
|
263
|
+
assert!(
|
|
264
|
+
extraction.chunks.is_none(),
|
|
265
|
+
"Chunks should be None without chunking config"
|
|
266
|
+
);
|
|
267
|
+
assert!(
|
|
268
|
+
extraction.detected_languages.is_none(),
|
|
269
|
+
"Language detection not enabled"
|
|
270
|
+
);
|
|
271
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
272
|
+
|
|
273
|
+
assert!(!extraction.content.is_empty(), "Large CSV should be processed");
|
|
274
|
+
|
|
275
|
+
assert!(
|
|
276
|
+
extraction.content.len() > 1000,
|
|
277
|
+
"Large CSV content should be substantial"
|
|
278
|
+
);
|
|
279
|
+
|
|
280
|
+
assert!(extraction.content.contains("Item1") || extraction.content.contains("10.00"));
|
|
281
|
+
|
|
282
|
+
assert!(extraction.content.contains("Item5000") || extraction.content.contains("50000.00"));
|
|
283
|
+
|
|
284
|
+
assert!(extraction.content.contains("Item10000") || extraction.content.contains("100000.00"));
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
/// Test malformed CSV - inconsistent columns.
|
|
288
|
+
#[tokio::test]
|
|
289
|
+
async fn test_csv_malformed() {
|
|
290
|
+
let config = ExtractionConfig::default();
|
|
291
|
+
|
|
292
|
+
let csv_content = b"Name,Age,City\nAlice,30\nBob,25,LA,Extra\nCarlos,35,SF";
|
|
293
|
+
|
|
294
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
295
|
+
|
|
296
|
+
assert!(
|
|
297
|
+
result.is_ok() || result.is_err(),
|
|
298
|
+
"Should handle malformed CSV gracefully"
|
|
299
|
+
);
|
|
300
|
+
|
|
301
|
+
if let Ok(extraction) = result {
|
|
302
|
+
assert!(!extraction.content.is_empty());
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
/// Test empty CSV file.
|
|
307
|
+
#[tokio::test]
|
|
308
|
+
async fn test_csv_empty() {
|
|
309
|
+
let config = ExtractionConfig::default();
|
|
310
|
+
|
|
311
|
+
let empty_csv = b"";
|
|
312
|
+
|
|
313
|
+
let result = extract_bytes(empty_csv, "text/csv", &config).await;
|
|
314
|
+
|
|
315
|
+
assert!(result.is_ok() || result.is_err(), "Should handle empty CSV gracefully");
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
/// Test CSV with only headers.
|
|
319
|
+
#[tokio::test]
|
|
320
|
+
async fn test_csv_headers_only() {
|
|
321
|
+
let config = ExtractionConfig::default();
|
|
322
|
+
|
|
323
|
+
let csv_content = b"Name,Age,City";
|
|
324
|
+
|
|
325
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
326
|
+
|
|
327
|
+
if result.is_err() {
|
|
328
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
329
|
+
return;
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
let extraction = result.unwrap();
|
|
333
|
+
|
|
334
|
+
assert!(
|
|
335
|
+
extraction.chunks.is_none(),
|
|
336
|
+
"Chunks should be None without chunking config"
|
|
337
|
+
);
|
|
338
|
+
assert!(
|
|
339
|
+
extraction.detected_languages.is_none(),
|
|
340
|
+
"Language detection not enabled"
|
|
341
|
+
);
|
|
342
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
343
|
+
|
|
344
|
+
assert!(
|
|
345
|
+
extraction.content.contains("Name") || !extraction.content.is_empty(),
|
|
346
|
+
"Headers should be extracted"
|
|
347
|
+
);
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
/// Test CSV with blank lines.
|
|
351
|
+
#[tokio::test]
|
|
352
|
+
async fn test_csv_blank_lines() {
|
|
353
|
+
let config = ExtractionConfig::default();
|
|
354
|
+
|
|
355
|
+
let csv_content = b"Name,Age\nAlice,30\n\nBob,25\n\nCarlos,35";
|
|
356
|
+
|
|
357
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
358
|
+
|
|
359
|
+
if result.is_err() {
|
|
360
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
361
|
+
return;
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
let extraction = result.unwrap();
|
|
365
|
+
|
|
366
|
+
assert!(
|
|
367
|
+
extraction.chunks.is_none(),
|
|
368
|
+
"Chunks should be None without chunking config"
|
|
369
|
+
);
|
|
370
|
+
assert!(
|
|
371
|
+
extraction.detected_languages.is_none(),
|
|
372
|
+
"Language detection not enabled"
|
|
373
|
+
);
|
|
374
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
375
|
+
|
|
376
|
+
assert!(extraction.content.contains("Alice") || extraction.content.contains("Bob"));
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
/// Test CSV with numeric data.
|
|
380
|
+
#[tokio::test]
|
|
381
|
+
async fn test_csv_numeric_data() {
|
|
382
|
+
let config = ExtractionConfig::default();
|
|
383
|
+
|
|
384
|
+
let csv_content = b"ID,Price,Quantity,Discount\n1,19.99,100,0.15\n2,29.99,50,0.20\n3,9.99,200,0.10";
|
|
385
|
+
|
|
386
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
387
|
+
|
|
388
|
+
if result.is_err() {
|
|
389
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
390
|
+
return;
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
let extraction = result.unwrap();
|
|
394
|
+
|
|
395
|
+
assert!(
|
|
396
|
+
extraction.chunks.is_none(),
|
|
397
|
+
"Chunks should be None without chunking config"
|
|
398
|
+
);
|
|
399
|
+
assert!(
|
|
400
|
+
extraction.detected_languages.is_none(),
|
|
401
|
+
"Language detection not enabled"
|
|
402
|
+
);
|
|
403
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
404
|
+
|
|
405
|
+
assert!(extraction.content.contains("Price"), "Should contain Price header");
|
|
406
|
+
assert!(
|
|
407
|
+
extraction.content.contains("Quantity"),
|
|
408
|
+
"Should contain Quantity header"
|
|
409
|
+
);
|
|
410
|
+
assert!(
|
|
411
|
+
extraction.content.contains("Discount"),
|
|
412
|
+
"Should contain Discount header"
|
|
413
|
+
);
|
|
414
|
+
|
|
415
|
+
assert!(extraction.content.contains("19.99"), "Should contain first price");
|
|
416
|
+
assert!(extraction.content.contains("100"), "Should contain first quantity");
|
|
417
|
+
assert!(extraction.content.contains("0.15"), "Should contain first discount");
|
|
418
|
+
|
|
419
|
+
assert!(extraction.content.contains("29.99"), "Should contain second price");
|
|
420
|
+
assert!(extraction.content.contains("50"), "Should contain second quantity");
|
|
421
|
+
|
|
422
|
+
assert!(extraction.content.contains("9.99"), "Should contain third price");
|
|
423
|
+
assert!(extraction.content.contains("200"), "Should contain third quantity");
|
|
424
|
+
}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
//! End-to-end integration test for DOCX metadata extraction
|
|
2
|
+
|
|
3
|
+
#![cfg(feature = "office")]
|
|
4
|
+
|
|
5
|
+
use kreuzberg::extraction::pandoc::extract_file;
|
|
6
|
+
|
|
7
|
+
#[tokio::test]
|
|
8
|
+
async fn test_docx_full_metadata_extraction() {
|
|
9
|
+
if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
|
|
10
|
+
println!("Skipping test: Pandoc not available");
|
|
11
|
+
return;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
15
|
+
.parent()
|
|
16
|
+
.unwrap()
|
|
17
|
+
.parent()
|
|
18
|
+
.unwrap();
|
|
19
|
+
let test_file = workspace_root.join("test_documents/documents/word_sample.docx");
|
|
20
|
+
|
|
21
|
+
if !test_file.exists() {
|
|
22
|
+
println!("Skipping test: Test file not found at {:?}", test_file);
|
|
23
|
+
return;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
let result = extract_file(&test_file, "docx")
|
|
27
|
+
.await
|
|
28
|
+
.expect("Should extract DOCX successfully");
|
|
29
|
+
|
|
30
|
+
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
31
|
+
assert!(
|
|
32
|
+
result.content.to_lowercase().contains("swim"),
|
|
33
|
+
"Content should contain 'swim'"
|
|
34
|
+
);
|
|
35
|
+
|
|
36
|
+
assert_eq!(
|
|
37
|
+
result.metadata.get("created_by").and_then(|v| v.as_str()),
|
|
38
|
+
Some("Christoph Auer"),
|
|
39
|
+
"Should have correct creator"
|
|
40
|
+
);
|
|
41
|
+
assert_eq!(
|
|
42
|
+
result.metadata.get("modified_by").and_then(|v| v.as_str()),
|
|
43
|
+
Some("Maxim Lysak"),
|
|
44
|
+
"Should have correct last modified by"
|
|
45
|
+
);
|
|
46
|
+
assert_eq!(
|
|
47
|
+
result.metadata.get("created_at").and_then(|v| v.as_str()),
|
|
48
|
+
Some("2024-10-09T12:43:00Z"),
|
|
49
|
+
"Should have correct creation date"
|
|
50
|
+
);
|
|
51
|
+
assert_eq!(
|
|
52
|
+
result.metadata.get("revision").and_then(|v| v.as_str()),
|
|
53
|
+
Some("7"),
|
|
54
|
+
"Should have revision number"
|
|
55
|
+
);
|
|
56
|
+
|
|
57
|
+
assert_eq!(
|
|
58
|
+
result.metadata.get("page_count").and_then(|v| v.as_i64()),
|
|
59
|
+
Some(2),
|
|
60
|
+
"Should have 2 pages"
|
|
61
|
+
);
|
|
62
|
+
assert_eq!(
|
|
63
|
+
result.metadata.get("word_count").and_then(|v| v.as_i64()),
|
|
64
|
+
Some(108),
|
|
65
|
+
"Should have 108 words"
|
|
66
|
+
);
|
|
67
|
+
assert_eq!(
|
|
68
|
+
result.metadata.get("character_count").and_then(|v| v.as_i64()),
|
|
69
|
+
Some(620),
|
|
70
|
+
"Should have 620 characters"
|
|
71
|
+
);
|
|
72
|
+
assert_eq!(
|
|
73
|
+
result.metadata.get("line_count").and_then(|v| v.as_i64()),
|
|
74
|
+
Some(5),
|
|
75
|
+
"Should have 5 lines"
|
|
76
|
+
);
|
|
77
|
+
assert_eq!(
|
|
78
|
+
result.metadata.get("paragraph_count").and_then(|v| v.as_i64()),
|
|
79
|
+
Some(1),
|
|
80
|
+
"Should have 1 paragraph"
|
|
81
|
+
);
|
|
82
|
+
|
|
83
|
+
println!("✅ DOCX metadata extraction test passed!");
|
|
84
|
+
println!(" Found {} metadata fields", result.metadata.len());
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
#[tokio::test]
|
|
88
|
+
async fn test_docx_minimal_metadata_extraction() {
|
|
89
|
+
if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
|
|
90
|
+
println!("Skipping test: Pandoc not available");
|
|
91
|
+
return;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
95
|
+
.parent()
|
|
96
|
+
.unwrap()
|
|
97
|
+
.parent()
|
|
98
|
+
.unwrap();
|
|
99
|
+
let test_file = workspace_root.join("test_documents/documents/lorem_ipsum.docx");
|
|
100
|
+
|
|
101
|
+
if !test_file.exists() {
|
|
102
|
+
println!("Skipping test: Test file not found at {:?}", test_file);
|
|
103
|
+
return;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
let result = extract_file(&test_file, "docx")
|
|
107
|
+
.await
|
|
108
|
+
.expect("Should extract DOCX successfully");
|
|
109
|
+
|
|
110
|
+
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
111
|
+
|
|
112
|
+
assert_eq!(
|
|
113
|
+
result.metadata.get("page_count").and_then(|v| v.as_i64()),
|
|
114
|
+
Some(1),
|
|
115
|
+
"Should have 1 page"
|
|
116
|
+
);
|
|
117
|
+
assert_eq!(
|
|
118
|
+
result.metadata.get("word_count").and_then(|v| v.as_i64()),
|
|
119
|
+
Some(520),
|
|
120
|
+
"Should have 520 words"
|
|
121
|
+
);
|
|
122
|
+
|
|
123
|
+
println!("✅ DOCX minimal metadata extraction test passed!");
|
|
124
|
+
}
|