kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,439 @@
1
+ //! Configuration loading integration tests.
2
+ //!
3
+ //! Tests the config loading APIs:
4
+ //! - from_file() with TOML/YAML/JSON
5
+ //! - discover() for searching parent directories
6
+ //! - Error handling for invalid configs
7
+
8
+ use kreuzberg::KreuzbergError;
9
+ use kreuzberg::core::config::ExtractionConfig;
10
+ use std::fs;
11
+ use tempfile::TempDir;
12
+
13
/// Loads a TOML file containing `[ocr]` and `[chunking]` tables and checks
/// that both sections are present and the chunking limits round-trip.
#[test]
fn test_from_file_toml_succeeds() {
    let workspace = TempDir::new().unwrap();
    let toml_file = workspace.path().join("config.toml");

    fs::write(
        &toml_file,
        r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 1000
max_overlap = 100
"#,
    )
    .unwrap();

    let loaded = ExtractionConfig::from_file(&toml_file);
    assert!(loaded.is_ok(), "Should load TOML config successfully");
    let loaded = loaded.unwrap();

    assert!(loaded.ocr.is_some(), "Should have OCR config");
    match loaded.chunking {
        Some(limits) => {
            assert_eq!(limits.max_chars, 1000);
            assert_eq!(limits.max_overlap, 100);
        }
        None => panic!("Should have chunking config"),
    }
}
42
+
43
/// Test loading config from YAML file.
///
/// Writes a YAML document with nested `ocr` and `chunking` mappings, loads it
/// through `ExtractionConfig::from_file`, and checks that both sections are
/// present and the chunking limits match what was written.
#[test]
fn test_from_file_yaml_succeeds() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("config.yaml");

    // YAML nesting is indentation-sensitive: the settings must be indented
    // under their section key, otherwise they become unrelated top-level keys.
    let yaml_content = r#"
ocr:
  enabled: true
  backend: tesseract
chunking:
  max_chars: 1000
  max_overlap: 100
"#;

    fs::write(&config_path, yaml_content).unwrap();

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load YAML config successfully");

    let config = config.unwrap();
    assert!(config.ocr.is_some(), "Should have OCR config");
    assert!(config.chunking.is_some(), "Should have chunking config");

    let chunking = config.chunking.unwrap();
    assert_eq!(chunking.max_chars, 1000);
    assert_eq!(chunking.max_overlap, 100);
}
71
+
72
/// Loads a JSON file containing `ocr` and `chunking` objects and checks that
/// both sections are present and the chunking limits round-trip.
#[test]
fn test_from_file_json_succeeds() {
    let workspace = TempDir::new().unwrap();
    let json_file = workspace.path().join("config.json");

    fs::write(
        &json_file,
        r#"
{
"ocr": {
"enabled": true,
"backend": "tesseract"
},
"chunking": {
"max_chars": 1000,
"max_overlap": 100
}
}
"#,
    )
    .unwrap();

    let loaded = ExtractionConfig::from_file(&json_file);
    assert!(loaded.is_ok(), "Should load JSON config successfully");
    let loaded = loaded.unwrap();

    assert!(loaded.ocr.is_some(), "Should have OCR config");
    assert!(loaded.chunking.is_some(), "Should have chunking config");

    let limits = loaded.chunking.unwrap();
    assert_eq!(limits.max_chars, 1000);
    assert_eq!(limits.max_overlap, 100);
}
104
+
105
/// Test loading config from .yml extension.
///
/// Uses the same YAML syntax as the `.yaml` case but with the short `.yml`
/// suffix, and only requires that loading succeeds.
#[test]
fn test_from_file_yml_extension_succeeds() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("config.yml");

    // `enabled` must be indented so that it is a child of `ocr` rather than a
    // separate top-level key.
    let yml_content = r#"
ocr:
  enabled: true
"#;

    fs::write(&config_path, yml_content).unwrap();

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load .yml config successfully");
}
121
+
122
/// Loading from a path that does not exist must return an error.
#[test]
fn test_from_file_nonexistent_path_fails() {
    let outcome = ExtractionConfig::from_file("/nonexistent/path/config.toml");

    // Only failure is required here; the concrete error variant (Io or
    // otherwise) depends on the implementation.
    if outcome.is_ok() {
        panic!("Should fail for nonexistent path: {:?}", outcome);
    }
}
129
+
130
/// A TOML file whose table header is missing its closing bracket must be
/// rejected by `from_file`.
#[test]
fn test_from_file_malformed_toml_fails() {
    let workspace = TempDir::new().unwrap();
    let broken_file = workspace.path().join("config.toml");

    fs::write(
        &broken_file,
        r#"
[ocr
enabled = true
"#,
    )
    .unwrap();

    let outcome = ExtractionConfig::from_file(&broken_file);

    // The error variant is not pinned down; the load just has to fail.
    assert!(outcome.is_err(), "Should fail for malformed TOML: {:?}", outcome);
}
147
+
148
/// A JSON file with a missing comma between two members must be rejected by
/// `from_file`.
#[test]
fn test_from_file_malformed_json_fails() {
    let workspace = TempDir::new().unwrap();
    let broken_file = workspace.path().join("config.json");

    fs::write(
        &broken_file,
        r#"
{
"ocr": {
"enabled": true
}
"chunking": {}
}
"#,
    )
    .unwrap();

    let outcome = ExtractionConfig::from_file(&broken_file);

    // The error variant is not pinned down; the load just has to fail.
    assert!(outcome.is_err(), "Should fail for malformed JSON: {:?}", outcome);
}
169
+
170
/// Test from_file with malformed YAML fails.
///
/// The fixture places a sequence item (`- invalid_list`) inside the body of
/// the `ocr` mapping, which is not a valid YAML block structure.
#[test]
fn test_from_file_malformed_yaml_fails() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("config.yaml");

    // Both lines are indented under `ocr` so the stray list item sits at the
    // same level as the `enabled` key, mixing a mapping and a sequence.
    let malformed_yaml = r#"
ocr:
  enabled: true
  - invalid_list
"#;

    fs::write(&config_path, malformed_yaml).unwrap();

    let result = ExtractionConfig::from_file(&config_path);
    // Error handling varies - just ensure it failed
    assert!(result.is_err(), "Should fail for malformed YAML: {:?}", result);
}
188
+
189
/// An empty TOML file must load successfully and leave the optional `ocr` and
/// `chunking` sections unset.
#[test]
fn test_from_file_empty_file_uses_defaults() {
    let workspace = TempDir::new().unwrap();
    let empty_file = workspace.path().join("config.toml");
    fs::write(&empty_file, "").unwrap();

    let loaded = ExtractionConfig::from_file(&empty_file);
    assert!(loaded.is_ok(), "Should load empty file successfully");

    // With nothing in the file, every optional section is expected to be absent.
    let defaults = loaded.unwrap();
    assert!(defaults.ocr.is_none(), "Default config should have no OCR");
    assert!(defaults.chunking.is_none(), "Default config should have no chunking");
}
205
+
206
/// A file with an extension other than the supported config formats must be
/// rejected. When the error is a `Validation` error, its message must mention
/// the format or extension.
#[test]
fn test_from_file_unsupported_extension_fails() {
    let workspace = TempDir::new().unwrap();
    let text_file = workspace.path().join("config.txt");
    fs::write(&text_file, "ocr:\n enabled: true").unwrap();

    let outcome = ExtractionConfig::from_file(&text_file);
    assert!(outcome.is_err(), "Should fail for unsupported extension: {:?}", outcome);

    // Any other error variant is also acceptable, so only the `Validation`
    // case is inspected.
    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        let mentions_format = ["format", "extension", "Unsupported"]
            .iter()
            .any(|needle| message.contains(*needle));
        assert!(mentions_format, "Error should mention format/extension: {}", message);
    }
}
230
+
231
/// Serializes every test that changes the process-wide current directory.
///
/// The test harness runs `#[test]` functions on parallel threads inside a
/// single process, and the current directory is shared by all of them, so
/// unsynchronized `set_current_dir` calls would make the `discover()` tests
/// observe each other's directories.
static CWD_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

/// Runs `body` with the current directory set to `dir`, then restores the
/// previous directory.
///
/// The previous directory is restored even if `body` panics, and the whole
/// operation holds [`CWD_LOCK`] so concurrent callers cannot interleave.
///
/// # Errors
///
/// Returns the I/O error if the current directory cannot be read or changed;
/// `body` is not run in that case.
fn with_current_dir<T>(dir: &std::path::Path, body: impl FnOnce() -> T) -> std::io::Result<T> {
    /// Restores the saved directory when dropped (including during unwinding).
    struct Restore(std::path::PathBuf);

    impl Drop for Restore {
        fn drop(&mut self) {
            // Best effort: there is nothing useful to do if restoring fails.
            let _ = std::env::set_current_dir(&self.0);
        }
    }

    // A poisoned lock only means another test panicked while holding it; the
    // guarded state is `()`, so it is always safe to continue.
    let _serialized = CWD_LOCK
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    let original_dir = std::env::current_dir()?;
    std::env::set_current_dir(dir)?;
    // Declared after the lock guard so it is dropped first: the directory is
    // restored before the lock is released.
    let _restore = Restore(original_dir);

    Ok(body())
}

/// Test discover() finds config in current directory.
#[test]
fn test_discover_finds_config_in_current_dir() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).unwrap();

    // Run discovery from inside the temp directory; the original directory is
    // restored before any assertion runs.
    let result = with_current_dir(temp_dir.path(), || ExtractionConfig::discover()).unwrap();

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.unwrap();
    assert!(config.is_some(), "Should find config in current directory");
    assert!(config.unwrap().ocr.is_some(), "Should have OCR config");
}

/// Test discover() finds config in parent directory.
#[test]
fn test_discover_finds_config_in_parent_dir() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).unwrap();

    // Create subdirectory
    let sub_dir = temp_dir.path().join("subdir");
    fs::create_dir(&sub_dir).unwrap();

    // Run discovery from the subdirectory; the config lives one level up.
    let result = with_current_dir(&sub_dir, || ExtractionConfig::discover()).unwrap();

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.unwrap();
    assert!(config.is_some(), "Should find config in parent directory");
    assert!(config.unwrap().ocr.is_some(), "Should have OCR config");
}

/// Test discover() returns None when no config found.
#[test]
fn test_discover_returns_none_when_not_found() {
    let temp_dir = TempDir::new().unwrap();
    let sub_dir = temp_dir.path().join("subdir");
    fs::create_dir(&sub_dir).unwrap();

    // Run discovery from a subdirectory that contains no config files.
    let result = with_current_dir(&sub_dir, || ExtractionConfig::discover()).unwrap();

    assert!(result.is_ok(), "Discover should succeed even when no config found");
    let _config = result.unwrap();
    // May return None or may find a config in parent directories (e.g., repository root)
    // Just verify it doesn't error - the specific behavior depends on the directory structure
}

/// Test discover() prefers certain file names.
#[test]
fn test_discover_file_name_preference() {
    let temp_dir = TempDir::new().unwrap();

    // Create multiple config files
    fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true").unwrap();
    fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false").unwrap();

    let result = match with_current_dir(temp_dir.path(), || ExtractionConfig::discover()) {
        Ok(result) => result,
        // Skip this test if we can't change directory
        Err(_) => return,
    };

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.unwrap();
    assert!(config.is_some(), "Should find a config file");
}

/// Test discover() with nested directories.
#[test]
fn test_discover_with_nested_directories() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).unwrap();

    // Create nested subdirectories
    let level1 = temp_dir.path().join("level1");
    let level2 = level1.join("level2");
    let level3 = level2.join("level3");
    fs::create_dir_all(&level3).unwrap();

    // Run discovery from the deepest directory; the config is three levels up.
    let result = match with_current_dir(&level3, || ExtractionConfig::discover()) {
        Ok(result) => result,
        // Skip this test if we can't change directory
        Err(_) => return,
    };

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.unwrap();
    assert!(config.is_some(), "Should find config in ancestor directory");
    assert!(config.unwrap().ocr.is_some(), "Should have OCR config");
}
374
+
375
/// Loads a TOML file that sets the OCR, chunking, language-detection, image
/// and PDF sections and checks that each one is present after loading.
#[test]
fn test_from_file_comprehensive_config() {
    let workspace = TempDir::new().unwrap();
    let toml_file = workspace.path().join("config.toml");

    fs::write(
        &toml_file,
        r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 2000
max_overlap = 200

[language_detection]
enabled = true

[images]
enabled = true

[pdf_options]
extract_images = true
"#,
    )
    .unwrap();

    let loaded = ExtractionConfig::from_file(&toml_file);
    assert!(loaded.is_ok(), "Should load comprehensive config successfully");
    let loaded = loaded.unwrap();

    // One presence check per section written above.
    assert!(loaded.ocr.is_some(), "Should have OCR config");
    assert!(loaded.chunking.is_some(), "Should have chunking config");
    assert!(
        loaded.language_detection.is_some(),
        "Should have language detection config"
    );
    assert!(loaded.images.is_some(), "Should have image extraction config");
    assert!(loaded.pdf_options.is_some(), "Should have PDF config");
}
415
+
416
/// Feeds negative chunking limits to `from_file`. Loading may fail outright;
/// if it succeeds and a chunking section is present, `max_chars` must be
/// positive.
#[test]
fn test_from_file_with_invalid_values() {
    let workspace = TempDir::new().unwrap();
    let toml_file = workspace.path().join("config.toml");

    // Negative values should be rejected during deserialization or validation
    fs::write(
        &toml_file,
        r#"
[chunking]
max_chars = -1000
max_overlap = -100
"#,
    )
    .unwrap();

    // A load error and a missing chunking section are both acceptable outcomes;
    // only a loaded chunking section is inspected.
    let loaded_chunking = ExtractionConfig::from_file(&toml_file)
        .ok()
        .and_then(|config| config.chunking);

    if let Some(chunking) = loaded_chunking {
        assert!(chunking.max_chars > 0, "max_chars should be positive");
    }
}