kreuzberg 4.0.0.pre.rc.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +157 -0
- data/README.md +426 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +341 -0
- data/ext/kreuzberg_rb/extconf.rb +45 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +15 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +148 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +46 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +691 -0
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -0
- data/lib/kreuzberg/extraction_api.rb +85 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +80 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +103 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +520 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +204 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -0
- data/vendor/kreuzberg/src/core/mime.rs +605 -0
- data/vendor/kreuzberg/src/core/mod.rs +45 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
- data/vendor/kreuzberg/src/embeddings.rs +432 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
- data/vendor/kreuzberg/src/extractors/email.rs +143 -0
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -0
- data/vendor/kreuzberg/src/extractors/image.rs +198 -0
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
- data/vendor/kreuzberg/src/extractors/text.rs +260 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +105 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +393 -0
- data/vendor/kreuzberg/src/pdf/text.rs +158 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +903 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
- data/vendor/kreuzberg/tests/config_features.rs +598 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
- data/vendor/kreuzberg/tests/core_integration.rs +510 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +536 -0
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
//! Configuration loading integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests the config loading APIs:
|
|
4
|
+
//! - from_file() with TOML/YAML/JSON
|
|
5
|
+
//! - discover() for searching parent directories
|
|
6
|
+
//! - Error handling for invalid configs
|
|
7
|
+
|
|
8
|
+
use kreuzberg::KreuzbergError;
|
|
9
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
10
|
+
use std::fs;
|
|
11
|
+
use tempfile::TempDir;
|
|
12
|
+
|
|
13
|
+
/// Verifies that `ExtractionConfig::from_file` parses a `.toml` file.
///
/// A config with `[ocr]` and `[chunking]` tables is written into a temporary
/// directory and loaded back; both sections must be present and the chunking
/// limits must match the values that were written.
#[test]
fn test_from_file_toml_succeeds() {
    let contents = r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 1000
max_overlap = 100
"#;

    let workspace = TempDir::new().unwrap();
    let toml_path = workspace.path().join("config.toml");
    fs::write(&toml_path, contents).unwrap();

    let loaded = ExtractionConfig::from_file(&toml_path);
    assert!(loaded.is_ok(), "Should load TOML config successfully");

    let parsed = loaded.unwrap();
    assert!(parsed.ocr.is_some(), "Should have OCR config");
    assert!(parsed.chunking.is_some(), "Should have chunking config");

    let limits = parsed.chunking.unwrap();
    assert_eq!(limits.max_chars, 1000);
    assert_eq!(limits.max_overlap, 100);
}
|
|
42
|
+
|
|
43
|
+
/// Verifies that `ExtractionConfig::from_file` parses a `.yaml` file.
///
/// A config with `ocr` and `chunking` mappings is written into a temporary
/// directory and loaded back; both sections must be present and the chunking
/// limits must match the values that were written.
#[test]
fn test_from_file_yaml_succeeds() {
    let contents = r#"
ocr:
  enabled: true
  backend: tesseract
chunking:
  max_chars: 1000
  max_overlap: 100
"#;

    let workspace = TempDir::new().unwrap();
    let yaml_path = workspace.path().join("config.yaml");
    fs::write(&yaml_path, contents).unwrap();

    let loaded = ExtractionConfig::from_file(&yaml_path);
    assert!(loaded.is_ok(), "Should load YAML config successfully");

    let parsed = loaded.unwrap();
    assert!(parsed.ocr.is_some(), "Should have OCR config");
    assert!(parsed.chunking.is_some(), "Should have chunking config");

    let limits = parsed.chunking.unwrap();
    assert_eq!(limits.max_chars, 1000);
    assert_eq!(limits.max_overlap, 100);
}
|
|
71
|
+
|
|
72
|
+
/// Verifies that `ExtractionConfig::from_file` parses a `.json` file.
///
/// A config with `ocr` and `chunking` objects is written into a temporary
/// directory and loaded back; both sections must be present and the chunking
/// limits must match the values that were written.
#[test]
fn test_from_file_json_succeeds() {
    let contents = r#"
{
  "ocr": {
    "enabled": true,
    "backend": "tesseract"
  },
  "chunking": {
    "max_chars": 1000,
    "max_overlap": 100
  }
}
"#;

    let workspace = TempDir::new().unwrap();
    let json_path = workspace.path().join("config.json");
    fs::write(&json_path, contents).unwrap();

    let loaded = ExtractionConfig::from_file(&json_path);
    assert!(loaded.is_ok(), "Should load JSON config successfully");

    let parsed = loaded.unwrap();
    assert!(parsed.ocr.is_some(), "Should have OCR config");
    assert!(parsed.chunking.is_some(), "Should have chunking config");

    let limits = parsed.chunking.unwrap();
    assert_eq!(limits.max_chars, 1000);
    assert_eq!(limits.max_overlap, 100);
}
|
|
104
|
+
|
|
105
|
+
/// Verifies that `ExtractionConfig::from_file` accepts the `.yml` extension.
///
/// A minimal YAML config containing only an `ocr` mapping is written into a
/// temporary directory under the name `config.yml`. The load must succeed and
/// the resulting config must contain the OCR section.
#[test]
fn test_from_file_yml_extension_succeeds() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("config.yml");

    let yml_content = r#"
ocr:
  enabled: true
"#;

    fs::write(&config_path, yml_content).unwrap();

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load .yml config successfully");

    // A successful load alone does not prove the file was interpreted as
    // YAML; check that the `ocr` section actually made it into the config.
    let config = config.unwrap();
    assert!(config.ocr.is_some(), "Should have OCR config");
}
|
|
121
|
+
|
|
122
|
+
/// Verifies that `ExtractionConfig::from_file` returns an error when the
/// given path does not point at an existing file.
#[test]
fn test_from_file_nonexistent_path_fails() {
    let missing_path = "/nonexistent/path/config.toml";

    let outcome = ExtractionConfig::from_file(missing_path);

    assert!(outcome.is_err(), "Should fail for nonexistent path: {:?}", outcome);
}
|
|
128
|
+
|
|
129
|
+
/// Verifies that `ExtractionConfig::from_file` rejects syntactically broken
/// TOML.
///
/// The fixture contains a table header that is never closed (`[ocr`).
#[test]
fn test_from_file_malformed_toml_fails() {
    let broken_contents = r#"
[ocr
enabled = true
"#;

    let workspace = TempDir::new().unwrap();
    let toml_path = workspace.path().join("config.toml");
    fs::write(&toml_path, broken_contents).unwrap();

    let outcome = ExtractionConfig::from_file(&toml_path);

    assert!(outcome.is_err(), "Should fail for malformed TOML: {:?}", outcome);
}
|
|
145
|
+
|
|
146
|
+
/// Verifies that `ExtractionConfig::from_file` rejects syntactically broken
/// JSON.
///
/// The fixture omits the comma between the `"ocr"` and `"chunking"` members.
#[test]
fn test_from_file_malformed_json_fails() {
    let broken_contents = r#"
{
  "ocr": {
    "enabled": true
  }
  "chunking": {}
}
"#;

    let workspace = TempDir::new().unwrap();
    let json_path = workspace.path().join("config.json");
    fs::write(&json_path, broken_contents).unwrap();

    let outcome = ExtractionConfig::from_file(&json_path);

    assert!(outcome.is_err(), "Should fail for malformed JSON: {:?}", outcome);
}
|
|
166
|
+
|
|
167
|
+
/// Verifies that `ExtractionConfig::from_file` rejects syntactically broken
/// YAML.
///
/// The fixture mixes a mapping entry and a sequence entry under the same
/// `ocr` key.
#[test]
fn test_from_file_malformed_yaml_fails() {
    let broken_contents = r#"
ocr:
  enabled: true
  - invalid_list
"#;

    let workspace = TempDir::new().unwrap();
    let yaml_path = workspace.path().join("config.yaml");
    fs::write(&yaml_path, broken_contents).unwrap();

    let outcome = ExtractionConfig::from_file(&yaml_path);

    assert!(outcome.is_err(), "Should fail for malformed YAML: {:?}", outcome);
}
|
|
184
|
+
|
|
185
|
+
/// Verifies that loading a zero-byte `.toml` file succeeds and yields a
/// config with neither an OCR section nor a chunking section.
#[test]
fn test_from_file_empty_file_uses_defaults() {
    let workspace = TempDir::new().unwrap();
    let empty_path = workspace.path().join("config.toml");
    fs::write(&empty_path, "").unwrap();

    let loaded = ExtractionConfig::from_file(&empty_path);
    assert!(loaded.is_ok(), "Should load empty file successfully");

    let defaults = loaded.unwrap();
    assert!(defaults.ocr.is_none(), "Default config should have no OCR");
    assert!(defaults.chunking.is_none(), "Default config should have no chunking");
}
|
|
200
|
+
|
|
201
|
+
/// Verifies that `ExtractionConfig::from_file` rejects a file whose extension
/// is not a recognised config format (`.txt` here).
///
/// When the error is a `KreuzbergError::Validation`, its message must also
/// mention the format or extension. Other error variants are accepted
/// without inspecting their message.
#[test]
fn test_from_file_unsupported_extension_fails() {
    let workspace = TempDir::new().unwrap();
    let txt_path = workspace.path().join("config.txt");
    fs::write(&txt_path, "ocr:\n enabled: true").unwrap();

    let outcome = ExtractionConfig::from_file(&txt_path);
    assert!(outcome.is_err(), "Should fail for unsupported extension: {:?}", outcome);

    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        let mentions_cause = ["format", "extension", "Unsupported"]
            .iter()
            .any(|needle| message.contains(*needle));
        assert!(mentions_cause, "Error should mention format/extension: {}", message);
    }
}
|
|
220
|
+
|
|
221
|
+
/// Verifies that `ExtractionConfig::discover` picks up a `kreuzberg.toml`
/// located in the current working directory.
///
/// The test changes the process working directory, so it is marked
/// `serial`. The original directory is restored before any assertion runs.
#[test]
#[serial_test::serial]
fn test_discover_finds_config_in_current_dir() {
    let contents = r#"
[ocr]
enabled = true
"#;

    let workspace = TempDir::new().unwrap();
    fs::write(workspace.path().join("kreuzberg.toml"), contents).unwrap();

    let previous_dir = std::env::current_dir().unwrap();
    std::env::set_current_dir(workspace.path()).unwrap();
    let discovered = ExtractionConfig::discover();
    std::env::set_current_dir(previous_dir).unwrap();

    assert!(discovered.is_ok(), "Discover should succeed");

    let found = discovered.unwrap();
    assert!(found.is_some(), "Should find config in current directory");

    let parsed = found.unwrap();
    assert!(parsed.ocr.is_some(), "Should have OCR config");
}
|
|
247
|
+
|
|
248
|
+
/// Verifies that `ExtractionConfig::discover` finds a `kreuzberg.toml` that
/// lives one level above the current working directory.
///
/// The test changes the process working directory, so it is marked
/// `serial`. The original directory is restored before any assertion runs.
#[test]
#[serial_test::serial]
fn test_discover_finds_config_in_parent_dir() {
    let contents = r#"
[ocr]
enabled = true
"#;

    let workspace = TempDir::new().unwrap();
    fs::write(workspace.path().join("kreuzberg.toml"), contents).unwrap();

    let child_dir = workspace.path().join("subdir");
    fs::create_dir(&child_dir).unwrap();

    let previous_dir = std::env::current_dir().unwrap();
    std::env::set_current_dir(&child_dir).unwrap();
    let discovered = ExtractionConfig::discover();
    std::env::set_current_dir(previous_dir).unwrap();

    assert!(discovered.is_ok(), "Discover should succeed");

    let found = discovered.unwrap();
    assert!(found.is_some(), "Should find config in parent directory");

    let parsed = found.unwrap();
    assert!(parsed.ocr.is_some(), "Should have OCR config");
}
|
|
277
|
+
|
|
278
|
+
/// Verifies that `ExtractionConfig::discover` does not fail when run from a
/// directory tree in which the test created no config file.
///
/// Only the absence of an error is checked; the discovered value itself is
/// not inspected.
/// NOTE(review): presumably because an ancestor of the temp directory could
/// legitimately contain a config file — confirm before tightening this.
#[test]
#[serial_test::serial]
fn test_discover_returns_none_when_not_found() {
    let workspace = TempDir::new().unwrap();
    let child_dir = workspace.path().join("subdir");
    fs::create_dir(&child_dir).unwrap();

    let previous_dir = std::env::current_dir().unwrap();
    std::env::set_current_dir(&child_dir).unwrap();
    let discovered = ExtractionConfig::discover();
    std::env::set_current_dir(previous_dir).unwrap();

    assert!(discovered.is_ok(), "Discover should succeed even when no config found");
    let _found = discovered.unwrap();
}
|
|
296
|
+
|
|
297
|
+
/// Verifies that `ExtractionConfig::discover` finds a config when both
/// `kreuzberg.toml` and `.kreuzberg.toml` exist in the working directory.
///
/// Only the presence of a result is asserted, not which of the two files was
/// chosen. If the working directory cannot be changed the test returns early
/// without asserting anything, and a failure to restore it is ignored.
#[test]
#[serial_test::serial]
fn test_discover_file_name_preference() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path();

    fs::write(root.join("kreuzberg.toml"), "[ocr]\nenabled = true").unwrap();
    fs::write(root.join(".kreuzberg.toml"), "[ocr]\nenabled = false").unwrap();

    let previous_dir = std::env::current_dir().unwrap();
    let Ok(()) = std::env::set_current_dir(root) else {
        return;
    };

    let discovered = ExtractionConfig::discover();

    // Best-effort restore: a failure here must not mask the assertions below.
    let _ = std::env::set_current_dir(previous_dir);

    assert!(discovered.is_ok(), "Discover should succeed");
    let found = discovered.unwrap();
    assert!(found.is_some(), "Should find a config file");
}
|
|
319
|
+
|
|
320
|
+
/// Verifies that `ExtractionConfig::discover` walks up several directory
/// levels: the config sits in the temp root while the working directory is
/// `level1/level2/level3` beneath it.
///
/// If the working directory cannot be changed the test returns early without
/// asserting anything, and a failure to restore it is ignored.
#[test]
#[serial_test::serial]
fn test_discover_with_nested_directories() {
    let contents = r#"
[ocr]
enabled = true
"#;

    let workspace = TempDir::new().unwrap();
    fs::write(workspace.path().join("kreuzberg.toml"), contents).unwrap();

    let deepest_dir = workspace.path().join("level1").join("level2").join("level3");
    fs::create_dir_all(&deepest_dir).unwrap();

    let previous_dir = std::env::current_dir().unwrap();
    let Ok(()) = std::env::set_current_dir(&deepest_dir) else {
        return;
    };

    let discovered = ExtractionConfig::discover();

    // Best-effort restore: a failure here must not mask the assertions below.
    let _ = std::env::set_current_dir(&previous_dir);

    assert!(discovered.is_ok(), "Discover should succeed");

    let found = discovered.unwrap();
    assert!(found.is_some(), "Should find config in ancestor directory");

    let parsed = found.unwrap();
    assert!(parsed.ocr.is_some(), "Should have OCR config");
}
|
|
353
|
+
|
|
354
|
+
/// Verifies that a TOML file populating several sections at once (`ocr`,
/// `chunking`, `language_detection`, `images`, `pdf_options`) loads
/// successfully and that every one of those sections is present afterwards.
#[test]
fn test_from_file_comprehensive_config() {
    let contents = r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 2000
max_overlap = 200

[language_detection]
enabled = true

[images]
enabled = true

[pdf_options]
extract_images = true
"#;

    let workspace = TempDir::new().unwrap();
    let toml_path = workspace.path().join("config.toml");
    fs::write(&toml_path, contents).unwrap();

    let loaded = ExtractionConfig::from_file(&toml_path);
    assert!(loaded.is_ok(), "Should load comprehensive config successfully");

    let parsed = loaded.unwrap();
    assert!(parsed.ocr.is_some(), "Should have OCR config");
    assert!(parsed.chunking.is_some(), "Should have chunking config");
    assert!(
        parsed.language_detection.is_some(),
        "Should have language detection config"
    );
    assert!(parsed.images.is_some(), "Should have image extraction config");
    assert!(parsed.pdf_options.is_some(), "Should have PDF config");
}
|
|
394
|
+
|
|
395
|
+
/// Exercises loading a TOML file whose chunking limits are negative.
///
/// The loader may reject the file or accept it, and either outcome passes.
/// Only when loading succeeds and a chunking section is present does the test
/// require `max_chars` to be positive.
#[test]
fn test_from_file_with_invalid_values() {
    let contents = r#"
[chunking]
max_chars = -1000
max_overlap = -100
"#;

    let workspace = TempDir::new().unwrap();
    let toml_path = workspace.path().join("config.toml");
    fs::write(&toml_path, contents).unwrap();

    let loaded = ExtractionConfig::from_file(&toml_path);

    // A failed load or a missing chunking section skips the check.
    if let Some(limits) = loaded.ok().and_then(|parsed| parsed.chunking) {
        assert!(limits.max_chars > 0, "max_chars should be positive");
    }
}
|