kreuzberg 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +534 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +157 -0
- data/README.md +421 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +340 -0
- data/ext/kreuzberg_rb/extconf.rb +35 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +17 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +105 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +45 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +684 -0
- data/lib/kreuzberg/errors.rb +50 -0
- data/lib/kreuzberg/extraction_api.rb +84 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +79 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +82 -0
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +468 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +87 -0
- data/spec/binding/cli_spec.rb +54 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +42 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +134 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/build.rs +460 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +903 -0
- data/vendor/kreuzberg/src/core/io.rs +327 -0
- data/vendor/kreuzberg/src/core/mime.rs +615 -0
- data/vendor/kreuzberg/src/core/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
- data/vendor/kreuzberg/src/embeddings.rs +323 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
- data/vendor/kreuzberg/src/extractors/email.rs +129 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
- data/vendor/kreuzberg/src/extractors/html.rs +410 -0
- data/vendor/kreuzberg/src/extractors/image.rs +195 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
- data/vendor/kreuzberg/src/extractors/text.rs +242 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +102 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +420 -0
- data/vendor/kreuzberg/src/pdf/text.rs +161 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +873 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
- data/vendor/kreuzberg/tests/config_features.rs +580 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
- data/vendor/kreuzberg/tests/core_integration.rs +493 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
- data/vendor/kreuzberg/tests/security_validation.rs +404 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- metadata +471 -0
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
//! Configuration loading integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests the config loading APIs:
|
|
4
|
+
//! - from_file() with TOML/YAML/JSON
|
|
5
|
+
//! - discover() for searching parent directories
|
|
6
|
+
//! - Error handling for invalid configs
|
|
7
|
+
|
|
8
|
+
use kreuzberg::KreuzbergError;
|
|
9
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
10
|
+
use std::fs;
|
|
11
|
+
use tempfile::TempDir;
|
|
12
|
+
|
|
13
|
+
/// Verifies that `ExtractionConfig::from_file` accepts a `.toml` file that
/// defines `[ocr]` and `[chunking]` tables, and that the chunking limits read
/// back match the values written to disk.
#[test]
fn test_from_file_toml_succeeds() {
    let workspace = TempDir::new().unwrap();
    let toml_path = workspace.path().join("config.toml");

    // Fixture: OCR on the tesseract backend plus explicit chunking limits.
    fs::write(
        &toml_path,
        r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 1000
max_overlap = 100
"#,
    )
    .unwrap();

    let loaded = ExtractionConfig::from_file(&toml_path);
    assert!(loaded.is_ok(), "Should load TOML config successfully");

    let settings = loaded.unwrap();
    assert!(settings.ocr.is_some(), "Should have OCR config");
    assert!(settings.chunking.is_some(), "Should have chunking config");

    // The numeric limits must survive the round trip unchanged.
    let limits = settings.chunking.unwrap();
    assert_eq!(limits.max_chars, 1000);
    assert_eq!(limits.max_overlap, 100);
}
|
|
42
|
+
|
|
43
|
+
/// Verifies that `ExtractionConfig::from_file` accepts a `.yaml` file with
/// `ocr` and `chunking` sections, and that the chunking limits read back match
/// the values written to disk.
#[test]
fn test_from_file_yaml_succeeds() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("config.yaml");

    // YAML nesting is expressed purely through indentation: the settings must
    // be indented under `ocr:` / `chunking:` to belong to those mappings.
    // Written flush-left they would become unrelated top-level keys and both
    // sections would be null.
    let yaml_content = r#"
ocr:
  enabled: true
  backend: tesseract
chunking:
  max_chars: 1000
  max_overlap: 100
"#;

    fs::write(&config_path, yaml_content).unwrap();

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load YAML config successfully");

    let config = config.unwrap();
    assert!(config.ocr.is_some(), "Should have OCR config");
    assert!(config.chunking.is_some(), "Should have chunking config");

    // The numeric limits must survive the round trip unchanged.
    let chunking = config.chunking.unwrap();
    assert_eq!(chunking.max_chars, 1000);
    assert_eq!(chunking.max_overlap, 100);
}
|
|
71
|
+
|
|
72
|
+
/// Verifies that `ExtractionConfig::from_file` accepts a `.json` file with
/// `ocr` and `chunking` objects, and that the chunking limits read back match
/// the values written to disk.
#[test]
fn test_from_file_json_succeeds() {
    let workspace = TempDir::new().unwrap();
    let json_path = workspace.path().join("config.json");

    // Fixture: OCR on the tesseract backend plus explicit chunking limits.
    fs::write(
        &json_path,
        r#"
{
"ocr": {
"enabled": true,
"backend": "tesseract"
},
"chunking": {
"max_chars": 1000,
"max_overlap": 100
}
}
"#,
    )
    .unwrap();

    let loaded = ExtractionConfig::from_file(&json_path);
    assert!(loaded.is_ok(), "Should load JSON config successfully");

    let settings = loaded.unwrap();
    assert!(settings.ocr.is_some(), "Should have OCR config");
    assert!(settings.chunking.is_some(), "Should have chunking config");

    // The numeric limits must survive the round trip unchanged.
    let limits = settings.chunking.unwrap();
    assert_eq!(limits.max_chars, 1000);
    assert_eq!(limits.max_overlap, 100);
}
|
|
104
|
+
|
|
105
|
+
/// Verifies that `ExtractionConfig::from_file` also accepts the short `.yml`
/// extension. Only successful loading is asserted.
#[test]
fn test_from_file_yml_extension_succeeds() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("config.yml");

    // `enabled` must be indented to be a child of `ocr`; flush-left it would
    // be a separate top-level key and `ocr` would be null.
    let yml_content = r#"
ocr:
  enabled: true
"#;

    fs::write(&config_path, yml_content).unwrap();

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load .yml config successfully");
}
|
|
121
|
+
|
|
122
|
+
/// Verifies that `ExtractionConfig::from_file` returns an error when the path
/// does not exist.
#[test]
fn test_from_file_nonexistent_path_fails() {
    let missing_path = "/nonexistent/path/config.toml";
    let outcome = ExtractionConfig::from_file(missing_path);

    // Only failure itself is asserted; the error variant is deliberately left
    // unchecked because it may be an I/O error or another kind.
    assert!(outcome.is_err(), "Should fail for nonexistent path: {:?}", outcome);
}
|
|
129
|
+
|
|
130
|
+
/// Verifies that `ExtractionConfig::from_file` returns an error for a `.toml`
/// file whose table header is missing its closing bracket.
#[test]
fn test_from_file_malformed_toml_fails() {
    let workspace = TempDir::new().unwrap();
    let toml_path = workspace.path().join("config.toml");

    // `[ocr` is never closed, so the document is not valid TOML.
    fs::write(
        &toml_path,
        r#"
[ocr
enabled = true
"#,
    )
    .unwrap();

    let outcome = ExtractionConfig::from_file(&toml_path);

    // Only failure itself is asserted; the error variant is not inspected.
    assert!(outcome.is_err(), "Should fail for malformed TOML: {:?}", outcome);
}
|
|
147
|
+
|
|
148
|
+
/// Verifies that `ExtractionConfig::from_file` returns an error for a `.json`
/// file that is missing the comma between two object members.
#[test]
fn test_from_file_malformed_json_fails() {
    let workspace = TempDir::new().unwrap();
    let json_path = workspace.path().join("config.json");

    // No comma separates the "ocr" and "chunking" members, which is invalid JSON.
    fs::write(
        &json_path,
        r#"
{
"ocr": {
"enabled": true
}
"chunking": {}
}
"#,
    )
    .unwrap();

    let outcome = ExtractionConfig::from_file(&json_path);

    // Only failure itself is asserted; the error variant is not inspected.
    assert!(outcome.is_err(), "Should fail for malformed JSON: {:?}", outcome);
}
|
|
169
|
+
|
|
170
|
+
/// Verifies that `ExtractionConfig::from_file` returns an error for a `.yaml`
/// file that mixes mapping entries with a stray sequence item.
#[test]
fn test_from_file_malformed_yaml_fails() {
    let workspace = TempDir::new().unwrap();
    let yaml_path = workspace.path().join("config.yaml");

    // A `- item` line appears where another mapping key is required.
    fs::write(
        &yaml_path,
        r#"
ocr:
enabled: true
- invalid_list
"#,
    )
    .unwrap();

    let outcome = ExtractionConfig::from_file(&yaml_path);

    // Only failure itself is asserted; the error variant is not inspected.
    assert!(outcome.is_err(), "Should fail for malformed YAML: {:?}", outcome);
}
|
|
188
|
+
|
|
189
|
+
/// Verifies that an empty `.toml` file loads successfully and yields a config
/// with neither an OCR section nor a chunking section.
#[test]
fn test_from_file_empty_file_uses_defaults() {
    let workspace = TempDir::new().unwrap();
    let empty_path = workspace.path().join("config.toml");
    fs::write(&empty_path, "").unwrap();

    let loaded = ExtractionConfig::from_file(&empty_path);
    assert!(loaded.is_ok(), "Should load empty file successfully");

    // With nothing specified, the optional sections are expected to be absent.
    let defaults = loaded.unwrap();
    assert!(defaults.ocr.is_none(), "Default config should have no OCR");
    assert!(defaults.chunking.is_none(), "Default config should have no chunking");
}
|
|
205
|
+
|
|
206
|
+
/// Verifies that `ExtractionConfig::from_file` rejects a file whose extension
/// (`.txt`) is not a recognised config format.
///
/// If the failure is a `KreuzbergError::Validation`, its message must mention
/// the format or extension; any other error variant is accepted as-is.
#[test]
fn test_from_file_unsupported_extension_fails() {
    let workspace = TempDir::new().unwrap();
    let txt_path = workspace.path().join("config.txt");
    fs::write(&txt_path, "ocr:\n enabled: true").unwrap();

    let outcome = ExtractionConfig::from_file(&txt_path);
    assert!(outcome.is_err(), "Should fail for unsupported extension: {:?}", outcome);

    // Some other error is also acceptable, so only the validation variant is inspected.
    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        let mentions_format = ["format", "extension", "Unsupported"]
            .iter()
            .any(|needle| message.contains(*needle));
        assert!(
            mentions_format,
            "Error should mention format/extension: {}",
            message
        );
    }
}
|
|
230
|
+
|
|
231
|
+
/// Serializes every test that changes the process-wide working directory.
///
/// The test harness runs `#[test]` functions on several threads of a single
/// process, and the current directory is shared by all of them. Without this
/// lock, `set_current_dir` calls from different tests can interleave.
static CWD_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

/// Holds [`CWD_LOCK`] while the working directory is switched, and restores
/// the previous directory on drop (including during a panic unwind).
struct CwdGuard {
    /// Directory that was current before the switch.
    previous: std::path::PathBuf,
    /// Released only after `Drop::drop` has restored the directory, because
    /// fields are dropped after the `Drop` implementation runs.
    _lock: std::sync::MutexGuard<'static, ()>,
}

impl CwdGuard {
    /// Acquires the lock, remembers the current directory and switches to `dir`.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the current directory cannot be read or `dir`
    /// cannot be entered; the lock is released in that case.
    fn enter(dir: &std::path::Path) -> std::io::Result<Self> {
        // A panic in another guarded test poisons the mutex. The `()` payload
        // cannot be left in an inconsistent state, so the poison is ignored.
        let lock = CWD_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
        let previous = std::env::current_dir()?;
        std::env::set_current_dir(dir)?;
        Ok(Self { previous, _lock: lock })
    }
}

impl Drop for CwdGuard {
    fn drop(&mut self) {
        // Best effort: panicking inside `drop` during an unwind would abort.
        let _ = std::env::set_current_dir(&self.previous);
    }
}

/// Runs `f` with `dir` as the working directory and returns its result.
///
/// The previous directory is restored before this function returns, and no
/// other guarded test can change the directory while `f` runs.
///
/// # Errors
///
/// Returns the I/O error if the directory could not be entered; `f` is not
/// called in that case.
fn run_in_dir<T>(dir: &std::path::Path, f: impl FnOnce() -> T) -> std::io::Result<T> {
    let _guard = CwdGuard::enter(dir)?;
    Ok(f())
}

/// Test discover() finds config in current directory.
#[test]
fn test_discover_finds_config_in_current_dir() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).unwrap();

    // Discover while the temp directory is the working directory.
    let result = run_in_dir(temp_dir.path(), ExtractionConfig::discover)
        .expect("should be able to enter the temporary directory");

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.unwrap();
    assert!(config.is_some(), "Should find config in current directory");
    assert!(config.unwrap().ocr.is_some(), "Should have OCR config");
}

/// Test discover() finds config in parent directory.
#[test]
fn test_discover_finds_config_in_parent_dir() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).unwrap();

    // Create subdirectory
    let sub_dir = temp_dir.path().join("subdir");
    fs::create_dir(&sub_dir).unwrap();

    // Discover from the subdirectory; the config lives one level up.
    let result = run_in_dir(&sub_dir, ExtractionConfig::discover)
        .expect("should be able to enter the subdirectory");

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.unwrap();
    assert!(config.is_some(), "Should find config in parent directory");
    assert!(config.unwrap().ocr.is_some(), "Should have OCR config");
}

/// Test discover() does not error when no config file was created.
#[test]
fn test_discover_returns_none_when_not_found() {
    let temp_dir = TempDir::new().unwrap();
    let sub_dir = temp_dir.path().join("subdir");
    fs::create_dir(&sub_dir).unwrap();

    // Discover from the subdirectory (no config files)
    let result = run_in_dir(&sub_dir, ExtractionConfig::discover)
        .expect("should be able to enter the subdirectory");

    assert!(result.is_ok(), "Discover should succeed even when no config found");
    let _config = result.unwrap();
    // May return None or may find a config in parent directories (e.g., repository root)
    // Just verify it doesn't error - the specific behavior depends on the directory structure
}

/// Test discover() succeeds when several candidate file names are present.
///
/// Which of the two files wins is not asserted.
#[test]
fn test_discover_file_name_preference() {
    let temp_dir = TempDir::new().unwrap();

    // Create multiple config files
    fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true").unwrap();
    fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false").unwrap();

    let Ok(result) = run_in_dir(temp_dir.path(), ExtractionConfig::discover) else {
        // Skip this test if we can't change directory
        return;
    };

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.unwrap();
    assert!(config.is_some(), "Should find a config file");
}

/// Test discover() with nested directories.
#[test]
fn test_discover_with_nested_directories() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).unwrap();

    // Create nested subdirectories
    let level1 = temp_dir.path().join("level1");
    let level2 = level1.join("level2");
    let level3 = level2.join("level3");
    fs::create_dir_all(&level3).unwrap();

    // Discover from the deepest directory; the config is three levels up.
    let Ok(result) = run_in_dir(&level3, ExtractionConfig::discover) else {
        // Skip this test if we can't change directory
        return;
    };

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.unwrap();
    assert!(config.is_some(), "Should find config in ancestor directory");
    assert!(config.unwrap().ocr.is_some(), "Should have OCR config");
}
|
|
374
|
+
|
|
375
|
+
/// Verifies that a `.toml` file defining the `ocr`, `chunking`,
/// `language_detection`, `images` and `pdf_options` tables loads
/// successfully, and that each corresponding section is present afterwards.
#[test]
fn test_from_file_comprehensive_config() {
    let workspace = TempDir::new().unwrap();
    let toml_path = workspace.path().join("config.toml");

    // Fixture: one table per section checked below.
    fs::write(
        &toml_path,
        r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 2000
max_overlap = 200

[language_detection]
enabled = true

[images]
enabled = true

[pdf_options]
extract_images = true
"#,
    )
    .unwrap();

    let loaded = ExtractionConfig::from_file(&toml_path);
    assert!(loaded.is_ok(), "Should load comprehensive config successfully");

    let settings = loaded.unwrap();
    assert!(settings.ocr.is_some(), "Should have OCR config");
    assert!(settings.chunking.is_some(), "Should have chunking config");
    assert!(
        settings.language_detection.is_some(),
        "Should have language detection config"
    );
    assert!(settings.images.is_some(), "Should have image extraction config");
    assert!(settings.pdf_options.is_some(), "Should have PDF config");
}
|
|
415
|
+
|
|
416
|
+
/// Checks how `ExtractionConfig::from_file` handles negative chunking limits.
///
/// The loader may reject the file outright. If it succeeds and produces a
/// chunking section, `max_chars` must be positive. A successful load without
/// a chunking section is also accepted.
#[test]
fn test_from_file_with_invalid_values() {
    let workspace = TempDir::new().unwrap();
    let toml_path = workspace.path().join("config.toml");

    // Negative values should be rejected during deserialization or validation
    fs::write(
        &toml_path,
        r#"
[chunking]
max_chars = -1000
max_overlap = -100
"#,
    )
    .unwrap();

    let outcome = ExtractionConfig::from_file(&toml_path);

    // Should either fail parsing or have clamped values: a failed load and a
    // missing chunking section both skip the assertion.
    if let Some(chunking) = outcome.ok().and_then(|config| config.chunking) {
        assert!(chunking.max_chars > 0, "max_chars should be positive");
    }
}
|