kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -124,6 +124,7 @@ ocr:
|
|
|
124
124
|
fn test_from_file_nonexistent_path_fails() {
|
|
125
125
|
let result = ExtractionConfig::from_file("/nonexistent/path/config.toml");
|
|
126
126
|
assert!(result.is_err(), "Should fail for nonexistent path: {:?}", result);
|
|
127
|
+
// Error can be Io or other types depending on the implementation
|
|
127
128
|
}
|
|
128
129
|
|
|
129
130
|
/// Test from_file with malformed TOML fails.
|
|
@@ -141,6 +142,7 @@ enabled = true
|
|
|
141
142
|
|
|
142
143
|
let result = ExtractionConfig::from_file(&config_path);
|
|
143
144
|
assert!(result.is_err(), "Should fail for malformed TOML: {:?}", result);
|
|
145
|
+
// Error handling varies - just ensure it failed
|
|
144
146
|
}
|
|
145
147
|
|
|
146
148
|
/// Test from_file with malformed JSON fails.
|
|
@@ -162,6 +164,7 @@ fn test_from_file_malformed_json_fails() {
|
|
|
162
164
|
|
|
163
165
|
let result = ExtractionConfig::from_file(&config_path);
|
|
164
166
|
assert!(result.is_err(), "Should fail for malformed JSON: {:?}", result);
|
|
167
|
+
// Error handling varies - just ensure it failed
|
|
165
168
|
}
|
|
166
169
|
|
|
167
170
|
/// Test from_file with malformed YAML fails.
|
|
@@ -180,6 +183,7 @@ ocr:
|
|
|
180
183
|
|
|
181
184
|
let result = ExtractionConfig::from_file(&config_path);
|
|
182
185
|
assert!(result.is_err(), "Should fail for malformed YAML: {:?}", result);
|
|
186
|
+
// Error handling varies - just ensure it failed
|
|
183
187
|
}
|
|
184
188
|
|
|
185
189
|
/// Test from_file with empty file uses defaults.
|
|
@@ -194,6 +198,7 @@ fn test_from_file_empty_file_uses_defaults() {
|
|
|
194
198
|
assert!(config.is_ok(), "Should load empty file successfully");
|
|
195
199
|
|
|
196
200
|
let config = config.unwrap();
|
|
201
|
+
// Should have default values
|
|
197
202
|
assert!(config.ocr.is_none(), "Default config should have no OCR");
|
|
198
203
|
assert!(config.chunking.is_none(), "Default config should have no chunking");
|
|
199
204
|
}
|
|
@@ -209,18 +214,22 @@ fn test_from_file_unsupported_extension_fails() {
|
|
|
209
214
|
let result = ExtractionConfig::from_file(&config_path);
|
|
210
215
|
assert!(result.is_err(), "Should fail for unsupported extension: {:?}", result);
|
|
211
216
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
217
|
+
match result {
|
|
218
|
+
Err(KreuzbergError::Validation { message, .. }) => {
|
|
219
|
+
assert!(
|
|
220
|
+
message.contains("format") || message.contains("extension") || message.contains("Unsupported"),
|
|
221
|
+
"Error should mention format/extension: {}",
|
|
222
|
+
message
|
|
223
|
+
);
|
|
224
|
+
}
|
|
225
|
+
_ => {
|
|
226
|
+
// Some other error is also acceptable
|
|
227
|
+
}
|
|
218
228
|
}
|
|
219
229
|
}
|
|
220
230
|
|
|
221
231
|
/// Test discover() finds config in current directory.
|
|
222
232
|
#[test]
|
|
223
|
-
#[serial_test::serial]
|
|
224
233
|
fn test_discover_finds_config_in_current_dir() {
|
|
225
234
|
let temp_dir = TempDir::new().unwrap();
|
|
226
235
|
let config_path = temp_dir.path().join("kreuzberg.toml");
|
|
@@ -232,11 +241,13 @@ enabled = true
|
|
|
232
241
|
|
|
233
242
|
fs::write(&config_path, toml_content).unwrap();
|
|
234
243
|
|
|
244
|
+
// Change to temp directory
|
|
235
245
|
let original_dir = std::env::current_dir().unwrap();
|
|
236
246
|
std::env::set_current_dir(temp_dir.path()).unwrap();
|
|
237
247
|
|
|
238
248
|
let result = ExtractionConfig::discover();
|
|
239
249
|
|
|
250
|
+
// Restore original directory
|
|
240
251
|
std::env::set_current_dir(original_dir).unwrap();
|
|
241
252
|
|
|
242
253
|
assert!(result.is_ok(), "Discover should succeed");
|
|
@@ -247,7 +258,6 @@ enabled = true
|
|
|
247
258
|
|
|
248
259
|
/// Test discover() finds config in parent directory.
|
|
249
260
|
#[test]
|
|
250
|
-
#[serial_test::serial]
|
|
251
261
|
fn test_discover_finds_config_in_parent_dir() {
|
|
252
262
|
let temp_dir = TempDir::new().unwrap();
|
|
253
263
|
let config_path = temp_dir.path().join("kreuzberg.toml");
|
|
@@ -259,14 +269,17 @@ enabled = true
|
|
|
259
269
|
|
|
260
270
|
fs::write(&config_path, toml_content).unwrap();
|
|
261
271
|
|
|
272
|
+
// Create subdirectory
|
|
262
273
|
let sub_dir = temp_dir.path().join("subdir");
|
|
263
274
|
fs::create_dir(&sub_dir).unwrap();
|
|
264
275
|
|
|
276
|
+
// Change to subdirectory
|
|
265
277
|
let original_dir = std::env::current_dir().unwrap();
|
|
266
278
|
std::env::set_current_dir(&sub_dir).unwrap();
|
|
267
279
|
|
|
268
280
|
let result = ExtractionConfig::discover();
|
|
269
281
|
|
|
282
|
+
// Restore original directory
|
|
270
283
|
std::env::set_current_dir(original_dir).unwrap();
|
|
271
284
|
|
|
272
285
|
assert!(result.is_ok(), "Discover should succeed");
|
|
@@ -277,39 +290,44 @@ enabled = true
|
|
|
277
290
|
|
|
278
291
|
/// Test discover() returns None when no config found.
|
|
279
292
|
#[test]
|
|
280
|
-
#[serial_test::serial]
|
|
281
293
|
fn test_discover_returns_none_when_not_found() {
|
|
282
294
|
let temp_dir = TempDir::new().unwrap();
|
|
283
295
|
let sub_dir = temp_dir.path().join("subdir");
|
|
284
296
|
fs::create_dir(&sub_dir).unwrap();
|
|
285
297
|
|
|
298
|
+
// Change to subdirectory (no config files)
|
|
286
299
|
let original_dir = std::env::current_dir().unwrap();
|
|
287
300
|
std::env::set_current_dir(&sub_dir).unwrap();
|
|
288
301
|
|
|
289
302
|
let result = ExtractionConfig::discover();
|
|
290
303
|
|
|
304
|
+
// Restore original directory
|
|
291
305
|
std::env::set_current_dir(original_dir).unwrap();
|
|
292
306
|
|
|
293
307
|
assert!(result.is_ok(), "Discover should succeed even when no config found");
|
|
294
308
|
let _config = result.unwrap();
|
|
309
|
+
// May return None or may find a config in parent directories (e.g., repository root)
|
|
310
|
+
// Just verify it doesn't error - the specific behavior depends on the directory structure
|
|
295
311
|
}
|
|
296
312
|
|
|
297
313
|
/// Test discover() prefers certain file names.
|
|
298
314
|
#[test]
|
|
299
|
-
#[serial_test::serial]
|
|
300
315
|
fn test_discover_file_name_preference() {
|
|
301
316
|
let temp_dir = TempDir::new().unwrap();
|
|
302
317
|
|
|
318
|
+
// Create multiple config files
|
|
303
319
|
fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true").unwrap();
|
|
304
320
|
fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false").unwrap();
|
|
305
321
|
|
|
306
322
|
let original_dir = std::env::current_dir().unwrap();
|
|
307
323
|
if std::env::set_current_dir(temp_dir.path()).is_err() {
|
|
324
|
+
// Skip this test if we can't change directory
|
|
308
325
|
return;
|
|
309
326
|
}
|
|
310
327
|
|
|
311
328
|
let result = ExtractionConfig::discover();
|
|
312
329
|
|
|
330
|
+
// Always restore directory even if test fails
|
|
313
331
|
let _ = std::env::set_current_dir(original_dir);
|
|
314
332
|
|
|
315
333
|
assert!(result.is_ok(), "Discover should succeed");
|
|
@@ -319,7 +337,6 @@ fn test_discover_file_name_preference() {
|
|
|
319
337
|
|
|
320
338
|
/// Test discover() with nested directories.
|
|
321
339
|
#[test]
|
|
322
|
-
#[serial_test::serial]
|
|
323
340
|
fn test_discover_with_nested_directories() {
|
|
324
341
|
let temp_dir = TempDir::new().unwrap();
|
|
325
342
|
let config_path = temp_dir.path().join("kreuzberg.toml");
|
|
@@ -331,18 +348,22 @@ enabled = true
|
|
|
331
348
|
|
|
332
349
|
fs::write(&config_path, toml_content).unwrap();
|
|
333
350
|
|
|
351
|
+
// Create nested subdirectories
|
|
334
352
|
let level1 = temp_dir.path().join("level1");
|
|
335
353
|
let level2 = level1.join("level2");
|
|
336
354
|
let level3 = level2.join("level3");
|
|
337
355
|
fs::create_dir_all(&level3).unwrap();
|
|
338
356
|
|
|
357
|
+
// Change to deepest directory
|
|
339
358
|
let original_dir = std::env::current_dir().unwrap();
|
|
340
359
|
if std::env::set_current_dir(&level3).is_err() {
|
|
360
|
+
// Skip this test if we can't change directory
|
|
341
361
|
return;
|
|
342
362
|
}
|
|
343
363
|
|
|
344
364
|
let result = ExtractionConfig::discover();
|
|
345
365
|
|
|
366
|
+
// Always restore directory even if test fails
|
|
346
367
|
let _ = std::env::set_current_dir(&original_dir);
|
|
347
368
|
|
|
348
369
|
assert!(result.is_ok(), "Discover should succeed");
|
|
@@ -389,7 +410,6 @@ extract_images = true
|
|
|
389
410
|
"Should have language detection config"
|
|
390
411
|
);
|
|
391
412
|
assert!(config.images.is_some(), "Should have image extraction config");
|
|
392
|
-
#[cfg(feature = "pdf")]
|
|
393
413
|
assert!(config.pdf_options.is_some(), "Should have PDF config");
|
|
394
414
|
}
|
|
395
415
|
|
|
@@ -399,6 +419,7 @@ fn test_from_file_with_invalid_values() {
|
|
|
399
419
|
let temp_dir = TempDir::new().unwrap();
|
|
400
420
|
let config_path = temp_dir.path().join("config.toml");
|
|
401
421
|
|
|
422
|
+
// Negative values should be rejected during deserialization or validation
|
|
402
423
|
let toml_content = r#"
|
|
403
424
|
[chunking]
|
|
404
425
|
max_chars = -1000
|
|
@@ -408,9 +429,11 @@ max_overlap = -100
|
|
|
408
429
|
fs::write(&config_path, toml_content).unwrap();
|
|
409
430
|
|
|
410
431
|
let result = ExtractionConfig::from_file(&config_path);
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
432
|
+
// Should either fail parsing or have clamped values
|
|
433
|
+
if let Ok(config) = result {
|
|
434
|
+
// If it succeeds, values should be reasonable
|
|
435
|
+
if let Some(chunking) = config.chunking {
|
|
436
|
+
assert!(chunking.max_chars > 0, "max_chars should be positive");
|
|
437
|
+
}
|
|
415
438
|
}
|
|
416
439
|
}
|
|
@@ -11,18 +11,6 @@ use std::fs::{self, File};
|
|
|
11
11
|
use std::io::Write;
|
|
12
12
|
use tempfile::tempdir;
|
|
13
13
|
|
|
14
|
-
fn trim_trailing_newlines(value: &str) -> &str {
|
|
15
|
-
value.trim_end_matches(['\n', '\r'])
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
19
|
-
assert_eq!(
|
|
20
|
-
trim_trailing_newlines(actual),
|
|
21
|
-
expected,
|
|
22
|
-
"Content mismatch after trimming trailing newlines"
|
|
23
|
-
);
|
|
24
|
-
}
|
|
25
|
-
|
|
26
14
|
/// Test basic file extraction with MIME detection.
|
|
27
15
|
#[tokio::test]
|
|
28
16
|
async fn test_extract_file_basic() {
|
|
@@ -37,7 +25,7 @@ async fn test_extract_file_basic() {
|
|
|
37
25
|
assert!(result.is_ok(), "Basic file extraction should succeed");
|
|
38
26
|
let result = result.unwrap();
|
|
39
27
|
|
|
40
|
-
|
|
28
|
+
assert_eq!(result.content, "Hello, Kreuzberg!");
|
|
41
29
|
assert_eq!(result.mime_type, "text/plain");
|
|
42
30
|
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
43
31
|
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
@@ -185,12 +173,7 @@ async fn test_batch_extract_bytes_concurrency() {
|
|
|
185
173
|
(b"content 5".as_slice(), "text/plain"),
|
|
186
174
|
];
|
|
187
175
|
|
|
188
|
-
let
|
|
189
|
-
.into_iter()
|
|
190
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
191
|
-
.collect();
|
|
192
|
-
|
|
193
|
-
let results = batch_extract_bytes(owned_contents, &config).await;
|
|
176
|
+
let results = batch_extract_bytes(contents, &config).await;
|
|
194
177
|
assert!(results.is_ok());
|
|
195
178
|
|
|
196
179
|
let results = results.unwrap();
|
|
@@ -198,12 +181,7 @@ async fn test_batch_extract_bytes_concurrency() {
|
|
|
198
181
|
|
|
199
182
|
for (i, result) in results.iter().enumerate() {
|
|
200
183
|
let expected_content = format!("content {}", i + 1);
|
|
201
|
-
assert_eq!(
|
|
202
|
-
trim_trailing_newlines(&result.content),
|
|
203
|
-
expected_content,
|
|
204
|
-
"Content mismatch for item {}",
|
|
205
|
-
i
|
|
206
|
-
);
|
|
184
|
+
assert_eq!(result.content, expected_content, "Content mismatch for item {}", i);
|
|
207
185
|
assert_eq!(result.mime_type, "text/plain", "MIME type should be text/plain");
|
|
208
186
|
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
209
187
|
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
@@ -223,13 +201,13 @@ fn test_sync_wrappers() {
|
|
|
223
201
|
let result = extract_file_sync(&file_path, None, &config);
|
|
224
202
|
assert!(result.is_ok(), "Sync file extraction should succeed");
|
|
225
203
|
let extraction = result.unwrap();
|
|
226
|
-
|
|
204
|
+
assert_eq!(extraction.content, "sync content");
|
|
227
205
|
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
228
206
|
|
|
229
207
|
let result = extract_bytes_sync(b"test bytes", "text/plain", &config);
|
|
230
208
|
assert!(result.is_ok(), "Sync bytes extraction should succeed");
|
|
231
209
|
let extraction = result.unwrap();
|
|
232
|
-
|
|
210
|
+
assert_eq!(extraction.content, "test bytes");
|
|
233
211
|
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
234
212
|
|
|
235
213
|
let paths = vec![file_path];
|
|
@@ -237,19 +215,15 @@ fn test_sync_wrappers() {
|
|
|
237
215
|
assert!(results.is_ok(), "Batch sync file should succeed");
|
|
238
216
|
let results = results.unwrap();
|
|
239
217
|
assert_eq!(results.len(), 1);
|
|
240
|
-
|
|
218
|
+
assert_eq!(results[0].content, "sync content");
|
|
241
219
|
assert!(results[0].chunks.is_none(), "Chunks should be None");
|
|
242
220
|
|
|
243
221
|
let contents = vec![(b"test".as_slice(), "text/plain")];
|
|
244
|
-
let
|
|
245
|
-
.into_iter()
|
|
246
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
247
|
-
.collect();
|
|
248
|
-
let results = batch_extract_bytes_sync(owned_contents, &config);
|
|
222
|
+
let results = batch_extract_bytes_sync(contents, &config);
|
|
249
223
|
assert!(results.is_ok(), "Batch bytes sync should succeed");
|
|
250
224
|
let results = results.unwrap();
|
|
251
225
|
assert_eq!(results.len(), 1);
|
|
252
|
-
|
|
226
|
+
assert_eq!(results[0].content, "test");
|
|
253
227
|
assert!(results[0].chunks.is_none(), "Chunks should be None");
|
|
254
228
|
}
|
|
255
229
|
|
|
@@ -441,7 +415,7 @@ async fn test_pipeline_execution() {
|
|
|
441
415
|
assert!(result.is_ok(), "Pipeline execution should succeed");
|
|
442
416
|
|
|
443
417
|
let result = result.unwrap();
|
|
444
|
-
|
|
418
|
+
assert_eq!(result.content, "pipeline content");
|
|
445
419
|
assert_eq!(result.mime_type, "text/plain");
|
|
446
420
|
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
447
421
|
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
//! CSV and spreadsheet integration tests.
|
|
2
2
|
//!
|
|
3
|
-
//! Tests for CSV and TSV extraction.
|
|
3
|
+
//! Tests for CSV and TSV extraction via Pandoc.
|
|
4
4
|
//! Validates data extraction, custom delimiters, quoted fields, and edge cases.
|
|
5
5
|
|
|
6
6
|
use kreuzberg::core::config::ExtractionConfig;
|
|
@@ -15,13 +15,14 @@ async fn test_csv_basic_extraction() {
|
|
|
15
15
|
|
|
16
16
|
let csv_content = b"Name,Age,City\nAlice,30,NYC\nBob,25,LA";
|
|
17
17
|
|
|
18
|
-
let
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
18
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
19
|
+
|
|
20
|
+
if result.is_err() {
|
|
21
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
22
|
+
return;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
let extraction = result.unwrap();
|
|
25
26
|
|
|
26
27
|
assert_eq!(extraction.mime_type, "text/csv");
|
|
27
28
|
assert!(
|
|
@@ -54,13 +55,14 @@ async fn test_csv_with_headers() {
|
|
|
54
55
|
|
|
55
56
|
let csv_content = b"Product,Price,Quantity\nApple,1.50,100\nBanana,0.75,200\nOrange,2.00,150";
|
|
56
57
|
|
|
57
|
-
let
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
59
|
+
|
|
60
|
+
if result.is_err() {
|
|
61
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
62
|
+
return;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
let extraction = result.unwrap();
|
|
64
66
|
|
|
65
67
|
assert!(
|
|
66
68
|
extraction.chunks.is_none(),
|
|
@@ -103,13 +105,14 @@ async fn test_csv_custom_delimiter() {
|
|
|
103
105
|
|
|
104
106
|
let csv_content = b"Name;Age;City\nAlice;30;NYC\nBob;25;LA";
|
|
105
107
|
|
|
106
|
-
let
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
108
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
109
|
+
|
|
110
|
+
if result.is_err() {
|
|
111
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
112
|
+
return;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
let extraction = result.unwrap();
|
|
113
116
|
|
|
114
117
|
assert!(
|
|
115
118
|
extraction.chunks.is_none(),
|
|
@@ -135,13 +138,14 @@ async fn test_tsv_file() {
|
|
|
135
138
|
|
|
136
139
|
let tsv_content = b"Name\tAge\tCity\nAlice\t30\tNYC\nBob\t25\tLA";
|
|
137
140
|
|
|
138
|
-
let
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
141
|
+
let result = extract_bytes(tsv_content, "text/tab-separated-values", &config).await;
|
|
142
|
+
|
|
143
|
+
if result.is_err() {
|
|
144
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
145
|
+
return;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
let extraction = result.unwrap();
|
|
145
149
|
|
|
146
150
|
assert_eq!(extraction.mime_type, "text/tab-separated-values");
|
|
147
151
|
assert!(
|
|
@@ -171,13 +175,14 @@ async fn test_csv_quoted_fields() {
|
|
|
171
175
|
let csv_content =
|
|
172
176
|
b"Name,Description,Price\n\"Smith, John\",\"Product A, premium\",100\n\"Doe, Jane\",\"Product B, standard\",50";
|
|
173
177
|
|
|
174
|
-
let
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
178
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
179
|
+
|
|
180
|
+
if result.is_err() {
|
|
181
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
182
|
+
return;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
let extraction = result.unwrap();
|
|
181
186
|
|
|
182
187
|
assert!(
|
|
183
188
|
extraction.chunks.is_none(),
|
|
@@ -207,13 +212,14 @@ async fn test_csv_special_characters() {
|
|
|
207
212
|
|
|
208
213
|
let csv_content = "Name,City,Emoji\nAlice,Tokyo 東京,🎉\nBob,París,✅\nCarlos,Москва,🌍".as_bytes();
|
|
209
214
|
|
|
210
|
-
let
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
215
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
216
|
+
|
|
217
|
+
if result.is_err() {
|
|
218
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
219
|
+
return;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
let extraction = result.unwrap();
|
|
217
223
|
|
|
218
224
|
assert!(
|
|
219
225
|
extraction.chunks.is_none(),
|
|
@@ -245,13 +251,14 @@ async fn test_csv_large_file() {
|
|
|
245
251
|
csv_content.push_str(&format!("{},Item{},{}.00\n", i, i, i * 10));
|
|
246
252
|
}
|
|
247
253
|
|
|
248
|
-
let
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
254
|
+
let result = extract_bytes(csv_content.as_bytes(), "text/csv", &config).await;
|
|
255
|
+
|
|
256
|
+
if result.is_err() {
|
|
257
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
258
|
+
return;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
let extraction = result.unwrap();
|
|
255
262
|
|
|
256
263
|
assert!(
|
|
257
264
|
extraction.chunks.is_none(),
|
|
@@ -315,13 +322,14 @@ async fn test_csv_headers_only() {
|
|
|
315
322
|
|
|
316
323
|
let csv_content = b"Name,Age,City";
|
|
317
324
|
|
|
318
|
-
let
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
326
|
+
|
|
327
|
+
if result.is_err() {
|
|
328
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
329
|
+
return;
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
let extraction = result.unwrap();
|
|
325
333
|
|
|
326
334
|
assert!(
|
|
327
335
|
extraction.chunks.is_none(),
|
|
@@ -346,13 +354,14 @@ async fn test_csv_blank_lines() {
|
|
|
346
354
|
|
|
347
355
|
let csv_content = b"Name,Age\nAlice,30\n\nBob,25\n\nCarlos,35";
|
|
348
356
|
|
|
349
|
-
let
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
357
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
358
|
+
|
|
359
|
+
if result.is_err() {
|
|
360
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
361
|
+
return;
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
let extraction = result.unwrap();
|
|
356
365
|
|
|
357
366
|
assert!(
|
|
358
367
|
extraction.chunks.is_none(),
|
|
@@ -374,13 +383,14 @@ async fn test_csv_numeric_data() {
|
|
|
374
383
|
|
|
375
384
|
let csv_content = b"ID,Price,Quantity,Discount\n1,19.99,100,0.15\n2,29.99,50,0.20\n3,9.99,200,0.10";
|
|
376
385
|
|
|
377
|
-
let
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
386
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
387
|
+
|
|
388
|
+
if result.is_err() {
|
|
389
|
+
println!("Skipping test: Pandoc may not be installed");
|
|
390
|
+
return;
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
let extraction = result.unwrap();
|
|
384
394
|
|
|
385
395
|
assert!(
|
|
386
396
|
extraction.chunks.is_none(),
|
|
@@ -2,10 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
#![cfg(feature = "office")]
|
|
4
4
|
|
|
5
|
-
use kreuzberg::
|
|
5
|
+
use kreuzberg::extraction::pandoc::extract_file;
|
|
6
6
|
|
|
7
7
|
#[tokio::test]
|
|
8
8
|
async fn test_docx_full_metadata_extraction() {
|
|
9
|
+
if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
|
|
10
|
+
println!("Skipping test: Pandoc not available");
|
|
11
|
+
return;
|
|
12
|
+
}
|
|
13
|
+
|
|
9
14
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
10
15
|
.parent()
|
|
11
16
|
.unwrap()
|
|
@@ -18,7 +23,7 @@ async fn test_docx_full_metadata_extraction() {
|
|
|
18
23
|
return;
|
|
19
24
|
}
|
|
20
25
|
|
|
21
|
-
let result = extract_file(&test_file,
|
|
26
|
+
let result = extract_file(&test_file, "docx")
|
|
22
27
|
.await
|
|
23
28
|
.expect("Should extract DOCX successfully");
|
|
24
29
|
|
|
@@ -29,66 +34,63 @@ async fn test_docx_full_metadata_extraction() {
|
|
|
29
34
|
);
|
|
30
35
|
|
|
31
36
|
assert_eq!(
|
|
32
|
-
result.metadata.
|
|
37
|
+
result.metadata.get("created_by").and_then(|v| v.as_str()),
|
|
33
38
|
Some("Christoph Auer"),
|
|
34
39
|
"Should have correct creator"
|
|
35
40
|
);
|
|
36
41
|
assert_eq!(
|
|
37
|
-
result.metadata.
|
|
42
|
+
result.metadata.get("modified_by").and_then(|v| v.as_str()),
|
|
38
43
|
Some("Maxim Lysak"),
|
|
39
44
|
"Should have correct last modified by"
|
|
40
45
|
);
|
|
41
46
|
assert_eq!(
|
|
42
|
-
result.metadata.
|
|
47
|
+
result.metadata.get("created_at").and_then(|v| v.as_str()),
|
|
43
48
|
Some("2024-10-09T12:43:00Z"),
|
|
44
49
|
"Should have correct creation date"
|
|
45
50
|
);
|
|
46
51
|
assert_eq!(
|
|
47
|
-
result.metadata.
|
|
52
|
+
result.metadata.get("revision").and_then(|v| v.as_str()),
|
|
48
53
|
Some("7"),
|
|
49
54
|
"Should have revision number"
|
|
50
55
|
);
|
|
51
56
|
|
|
52
57
|
assert_eq!(
|
|
53
|
-
result.metadata.
|
|
58
|
+
result.metadata.get("page_count").and_then(|v| v.as_i64()),
|
|
54
59
|
Some(2),
|
|
55
60
|
"Should have 2 pages"
|
|
56
61
|
);
|
|
57
62
|
assert_eq!(
|
|
58
|
-
result.metadata.
|
|
63
|
+
result.metadata.get("word_count").and_then(|v| v.as_i64()),
|
|
59
64
|
Some(108),
|
|
60
65
|
"Should have 108 words"
|
|
61
66
|
);
|
|
62
67
|
assert_eq!(
|
|
63
|
-
result
|
|
64
|
-
.metadata
|
|
65
|
-
.additional
|
|
66
|
-
.get("character_count")
|
|
67
|
-
.and_then(|v| v.as_i64()),
|
|
68
|
+
result.metadata.get("character_count").and_then(|v| v.as_i64()),
|
|
68
69
|
Some(620),
|
|
69
70
|
"Should have 620 characters"
|
|
70
71
|
);
|
|
71
72
|
assert_eq!(
|
|
72
|
-
result.metadata.
|
|
73
|
+
result.metadata.get("line_count").and_then(|v| v.as_i64()),
|
|
73
74
|
Some(5),
|
|
74
75
|
"Should have 5 lines"
|
|
75
76
|
);
|
|
76
77
|
assert_eq!(
|
|
77
|
-
result
|
|
78
|
-
.metadata
|
|
79
|
-
.additional
|
|
80
|
-
.get("paragraph_count")
|
|
81
|
-
.and_then(|v| v.as_i64()),
|
|
78
|
+
result.metadata.get("paragraph_count").and_then(|v| v.as_i64()),
|
|
82
79
|
Some(1),
|
|
83
80
|
"Should have 1 paragraph"
|
|
84
81
|
);
|
|
85
82
|
|
|
86
83
|
println!("✅ DOCX metadata extraction test passed!");
|
|
87
|
-
println!(" Found {} metadata fields", result.metadata.
|
|
84
|
+
println!(" Found {} metadata fields", result.metadata.len());
|
|
88
85
|
}
|
|
89
86
|
|
|
90
87
|
#[tokio::test]
|
|
91
88
|
async fn test_docx_minimal_metadata_extraction() {
|
|
89
|
+
if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
|
|
90
|
+
println!("Skipping test: Pandoc not available");
|
|
91
|
+
return;
|
|
92
|
+
}
|
|
93
|
+
|
|
92
94
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
93
95
|
.parent()
|
|
94
96
|
.unwrap()
|
|
@@ -101,19 +103,19 @@ async fn test_docx_minimal_metadata_extraction() {
|
|
|
101
103
|
return;
|
|
102
104
|
}
|
|
103
105
|
|
|
104
|
-
let result = extract_file(&test_file,
|
|
106
|
+
let result = extract_file(&test_file, "docx")
|
|
105
107
|
.await
|
|
106
108
|
.expect("Should extract DOCX successfully");
|
|
107
109
|
|
|
108
110
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
109
111
|
|
|
110
112
|
assert_eq!(
|
|
111
|
-
result.metadata.
|
|
113
|
+
result.metadata.get("page_count").and_then(|v| v.as_i64()),
|
|
112
114
|
Some(1),
|
|
113
115
|
"Should have 1 page"
|
|
114
116
|
);
|
|
115
117
|
assert_eq!(
|
|
116
|
-
result.metadata.
|
|
118
|
+
result.metadata.get("word_count").and_then(|v| v.as_i64()),
|
|
117
119
|
Some(520),
|
|
118
120
|
"Should have 520 words"
|
|
119
121
|
);
|