kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,753 +0,0 @@
1
- //! Comprehensive integration tests for ServerConfig precedence order system.
2
- //!
3
- //! Tests verify the precedence order: CLI > Env > File > Default
4
- //! These tests use real config files and environment variables.
5
-
6
- #![cfg(feature = "api")]
7
-
8
- use kreuzberg::ServerConfig;
9
- use std::fs;
10
- use tempfile::tempdir;
11
-
12
- // Helper function to cleanup environment variables
13
- #[allow(unsafe_code)]
14
- fn cleanup_env_vars() {
15
- unsafe {
16
- std::env::remove_var("KREUZBERG_HOST");
17
- std::env::remove_var("KREUZBERG_PORT");
18
- std::env::remove_var("KREUZBERG_CORS_ORIGINS");
19
- std::env::remove_var("KREUZBERG_MAX_REQUEST_BODY_BYTES");
20
- std::env::remove_var("KREUZBERG_MAX_MULTIPART_FIELD_BYTES");
21
- std::env::remove_var("KREUZBERG_MAX_UPLOAD_SIZE_MB");
22
- }
23
- }
24
-
25
- // Helper function to set environment variables
26
- #[allow(unsafe_code)]
27
- fn set_env(key: &str, value: &str) {
28
- unsafe {
29
- std::env::set_var(key, value);
30
- }
31
- }
32
-
33
- // Helper function to get and store original environment variables
34
- fn save_env(keys: &[&str]) -> Vec<(String, Option<String>)> {
35
- keys.iter()
36
- .map(|key| (key.to_string(), std::env::var(key).ok()))
37
- .collect()
38
- }
39
-
40
- // Helper function to restore environment variables
41
- #[allow(unsafe_code)]
42
- fn restore_env(saved: Vec<(String, Option<String>)>) {
43
- unsafe {
44
- for (key, value) in saved {
45
- if let Some(v) = value {
46
- std::env::set_var(&key, v);
47
- } else {
48
- std::env::remove_var(&key);
49
- }
50
- }
51
- }
52
- }
53
-
54
- // Test 1: Config precedence order - Env wins over File
55
- #[test]
56
- #[serial_test::serial]
57
- fn test_config_precedence_env_over_file() {
58
- let saved = save_env(&["KREUZBERG_HOST", "KREUZBERG_PORT"]);
59
-
60
- let dir = tempdir().unwrap();
61
- let config_path = dir.path().join("config.toml");
62
-
63
- // Create config file with file values
64
- fs::write(
65
- &config_path,
66
- r#"
67
- host = "file-host"
68
- port = 8001
69
- "#,
70
- )
71
- .unwrap();
72
-
73
- // Set env vars (should override file)
74
- set_env("KREUZBERG_HOST", "env-host");
75
- set_env("KREUZBERG_PORT", "8002");
76
-
77
- // Load and apply
78
- let mut config = ServerConfig::from_file(&config_path).unwrap();
79
- assert_eq!(config.host, "file-host");
80
- assert_eq!(config.port, 8001);
81
-
82
- // Apply env overrides
83
- config.apply_env_overrides().unwrap();
84
-
85
- // Verify env vars won (Env > File)
86
- assert_eq!(config.host, "env-host", "Env HOST should override file HOST");
87
- assert_eq!(config.port, 8002, "Env PORT should override file PORT");
88
-
89
- cleanup_env_vars();
90
- restore_env(saved);
91
- }
92
-
93
- // Test 2: File-only configuration
94
- #[test]
95
- fn test_file_only_configuration() {
96
- let dir = tempdir().unwrap();
97
- let config_path = dir.path().join("config.toml");
98
-
99
- // Create config with specific values
100
- fs::write(
101
- &config_path,
102
- r#"
103
- host = "192.168.1.100"
104
- port = 9000
105
- cors_origins = ["https://app.example.com"]
106
- max_request_body_bytes = 50000000
107
- max_multipart_field_bytes = 75000000
108
- "#,
109
- )
110
- .unwrap();
111
-
112
- let config = ServerConfig::from_file(&config_path).unwrap();
113
-
114
- assert_eq!(config.host, "192.168.1.100");
115
- assert_eq!(config.port, 9000);
116
- assert_eq!(config.cors_origins.len(), 1);
117
- assert_eq!(config.cors_origins[0], "https://app.example.com");
118
- assert_eq!(config.max_request_body_bytes, 50_000_000);
119
- assert_eq!(config.max_multipart_field_bytes, 75_000_000);
120
- }
121
-
122
- // Test 3: Env-only configuration (no config file)
123
- #[test]
124
- #[serial_test::serial]
125
- fn test_env_only_configuration() {
126
- let saved = save_env(&["KREUZBERG_HOST", "KREUZBERG_PORT", "KREUZBERG_CORS_ORIGINS"]);
127
-
128
- set_env("KREUZBERG_HOST", "0.0.0.0");
129
- set_env("KREUZBERG_PORT", "3000");
130
- set_env(
131
- "KREUZBERG_CORS_ORIGINS",
132
- "https://api.example.com, https://app.example.com",
133
- );
134
-
135
- // Create default config
136
- let mut config = ServerConfig::default();
137
-
138
- // Verify defaults initially
139
- assert_eq!(config.host, "127.0.0.1");
140
- assert_eq!(config.port, 8000);
141
-
142
- // Apply env overrides
143
- config.apply_env_overrides().unwrap();
144
-
145
- // Verify env vars are used
146
- assert_eq!(config.host, "0.0.0.0");
147
- assert_eq!(config.port, 3000);
148
- assert_eq!(config.cors_origins.len(), 2);
149
- assert!(config.cors_origins.contains(&"https://api.example.com".to_string()));
150
- assert!(config.cors_origins.contains(&"https://app.example.com".to_string()));
151
-
152
- cleanup_env_vars();
153
- restore_env(saved);
154
- }
155
-
156
- // Test 4: Default configuration
157
- #[test]
158
- fn test_default_configuration() {
159
- let config = ServerConfig::default();
160
-
161
- // Verify defaults
162
- assert_eq!(config.host, "127.0.0.1");
163
- assert_eq!(config.port, 8000);
164
- assert!(config.cors_origins.is_empty());
165
- assert_eq!(config.max_request_body_bytes, 104_857_600); // 100 MB
166
- assert_eq!(config.max_multipart_field_bytes, 104_857_600); // 100 MB
167
- assert!(config.max_upload_mb.is_none());
168
- assert_eq!(config.listen_addr(), "127.0.0.1:8000");
169
- }
170
-
171
- // Test 5: Backward compatibility - file without [server] section
172
- #[test]
173
- fn test_backward_compatibility_no_server_section() {
174
- let dir = tempdir().unwrap();
175
- let config_path = dir.path().join("config.toml");
176
-
177
- // Create config with only extraction settings (no [server] section)
178
- fs::write(
179
- &config_path,
180
- r#"
181
- # No [server] section - extraction-only config
182
- use_cache = false
183
- enable_quality_processing = true
184
- "#,
185
- )
186
- .unwrap();
187
-
188
- // ServerConfig::from_file should load with defaults for missing [server] section
189
- let config = ServerConfig::from_file(&config_path).unwrap();
190
-
191
- // Verify ServerConfig fields have defaults
192
- assert_eq!(config.host, "127.0.0.1");
193
- assert_eq!(config.port, 8000);
194
- assert!(config.cors_origins.is_empty());
195
- }
196
-
197
- // Test 6: All three formats - TOML
198
- #[test]
199
- fn test_config_format_toml() {
200
- let dir = tempdir().unwrap();
201
- let config_path = dir.path().join("config.toml");
202
-
203
- fs::write(
204
- &config_path,
205
- r#"
206
- host = "10.0.0.1"
207
- port = 7000
208
- cors_origins = ["https://test.com"]
209
- "#,
210
- )
211
- .unwrap();
212
-
213
- let config = ServerConfig::from_file(&config_path).unwrap();
214
- assert_eq!(config.host, "10.0.0.1");
215
- assert_eq!(config.port, 7000);
216
- }
217
-
218
- // Test 7: All three formats - YAML
219
- #[test]
220
- fn test_config_format_yaml() {
221
- let dir = tempdir().unwrap();
222
- let config_path = dir.path().join("config.yaml");
223
-
224
- fs::write(
225
- &config_path,
226
- r#"
227
- host: 10.0.0.2
228
- port: 7001
229
- cors_origins:
230
- - https://test.com
231
- "#,
232
- )
233
- .unwrap();
234
-
235
- let config = ServerConfig::from_file(&config_path).unwrap();
236
- assert_eq!(config.host, "10.0.0.2");
237
- assert_eq!(config.port, 7001);
238
- }
239
-
240
- // Test 8: All three formats - JSON
241
- #[test]
242
- fn test_config_format_json() {
243
- let dir = tempdir().unwrap();
244
- let config_path = dir.path().join("config.json");
245
-
246
- fs::write(
247
- &config_path,
248
- r#"{
249
- "host": "10.0.0.3",
250
- "port": 7002,
251
- "cors_origins": ["https://test.com"]
252
- }
253
- "#,
254
- )
255
- .unwrap();
256
-
257
- let config = ServerConfig::from_file(&config_path).unwrap();
258
- assert_eq!(config.host, "10.0.0.3");
259
- assert_eq!(config.port, 7002);
260
- }
261
-
262
- // Test 9: CORS configuration - empty (allow all)
263
- #[test]
264
- fn test_cors_configuration_allow_all() {
265
- let dir = tempdir().unwrap();
266
- let config_path = dir.path().join("config.toml");
267
-
268
- fs::write(
269
- &config_path,
270
- r#"
271
- host = "127.0.0.1"
272
- port = 8000
273
- # Empty cors_origins means allow all
274
- "#,
275
- )
276
- .unwrap();
277
-
278
- let config = ServerConfig::from_file(&config_path).unwrap();
279
-
280
- assert!(config.cors_allows_all(), "Empty cors_origins should allow all");
281
- assert!(config.is_origin_allowed("https://any.com"));
282
- assert!(config.is_origin_allowed("http://localhost:3000"));
283
- }
284
-
285
- // Test 10: CORS configuration - specific origins
286
- #[test]
287
- fn test_cors_configuration_specific_origins() {
288
- let dir = tempdir().unwrap();
289
- let config_path = dir.path().join("config.toml");
290
-
291
- fs::write(
292
- &config_path,
293
- r#"
294
- host = "127.0.0.1"
295
- port = 8000
296
- cors_origins = ["https://app1.com", "https://app2.com"]
297
- "#,
298
- )
299
- .unwrap();
300
-
301
- let config = ServerConfig::from_file(&config_path).unwrap();
302
-
303
- assert!(!config.cors_allows_all(), "Specific origins should not allow all");
304
- assert!(config.is_origin_allowed("https://app1.com"));
305
- assert!(config.is_origin_allowed("https://app2.com"));
306
- assert!(!config.is_origin_allowed("https://app3.com"));
307
- }
308
-
309
- // Test 11: CORS precedence - env overrides file
310
- #[test]
311
- #[serial_test::serial]
312
- fn test_cors_precedence_env_over_file() {
313
- let saved = save_env(&["KREUZBERG_CORS_ORIGINS"]);
314
-
315
- let dir = tempdir().unwrap();
316
- let config_path = dir.path().join("config.toml");
317
-
318
- fs::write(
319
- &config_path,
320
- r#"
321
- cors_origins = ["https://file.com"]
322
- "#,
323
- )
324
- .unwrap();
325
-
326
- set_env("KREUZBERG_CORS_ORIGINS", "https://env1.com, https://env2.com");
327
-
328
- let mut config = ServerConfig::from_file(&config_path).unwrap();
329
- assert_eq!(config.cors_origins.len(), 1);
330
- assert_eq!(config.cors_origins[0], "https://file.com");
331
-
332
- config.apply_env_overrides().unwrap();
333
-
334
- assert_eq!(config.cors_origins.len(), 2);
335
- assert!(config.cors_origins.contains(&"https://env1.com".to_string()));
336
- assert!(config.cors_origins.contains(&"https://env2.com".to_string()));
337
-
338
- cleanup_env_vars();
339
- restore_env(saved);
340
- }
341
-
342
- // Test 12: Legacy max_upload_mb backward compatibility
343
- #[test]
344
- fn test_legacy_max_upload_mb_in_file() {
345
- let dir = tempdir().unwrap();
346
- let config_path = dir.path().join("config.toml");
347
-
348
- fs::write(
349
- &config_path,
350
- r#"
351
- host = "127.0.0.1"
352
- port = 8000
353
- max_upload_mb = 50
354
- "#,
355
- )
356
- .unwrap();
357
-
358
- let config = ServerConfig::from_file(&config_path).unwrap();
359
-
360
- assert_eq!(config.max_upload_mb, Some(50));
361
- // Should be converted to bytes
362
- assert_eq!(config.max_multipart_field_bytes, 50 * 1_048_576);
363
- }
364
-
365
- // Test 13: Legacy max_upload_mb env override
366
- #[test]
367
- #[serial_test::serial]
368
- fn test_legacy_max_upload_mb_env_override() {
369
- let saved = save_env(&["KREUZBERG_MAX_UPLOAD_SIZE_MB"]);
370
-
371
- set_env("KREUZBERG_MAX_UPLOAD_SIZE_MB", "75");
372
-
373
- let mut config = ServerConfig::default();
374
- assert!(config.max_upload_mb.is_none());
375
-
376
- config.apply_env_overrides().unwrap();
377
-
378
- assert_eq!(config.max_upload_mb, Some(75));
379
- assert_eq!(config.max_multipart_field_bytes, 75 * 1_048_576);
380
-
381
- cleanup_env_vars();
382
- restore_env(saved);
383
- }
384
-
385
- // Test 14: Invalid env var values - invalid port
386
- #[test]
387
- #[serial_test::serial]
388
- fn test_invalid_env_port() {
389
- let saved = save_env(&["KREUZBERG_PORT"]);
390
-
391
- set_env("KREUZBERG_PORT", "not_a_number");
392
-
393
- let mut config = ServerConfig::default();
394
- let result = config.apply_env_overrides();
395
-
396
- assert!(result.is_err());
397
- let err_msg = result.unwrap_err().to_string();
398
- assert!(err_msg.contains("KREUZBERG_PORT"));
399
- assert!(err_msg.contains("valid u16"));
400
-
401
- cleanup_env_vars();
402
- restore_env(saved);
403
- }
404
-
405
- // Test 15: Invalid env var values - invalid max_request_body_bytes
406
- #[test]
407
- #[serial_test::serial]
408
- fn test_invalid_env_max_request_body_bytes() {
409
- let saved = save_env(&["KREUZBERG_MAX_REQUEST_BODY_BYTES"]);
410
-
411
- set_env("KREUZBERG_MAX_REQUEST_BODY_BYTES", "invalid_number");
412
-
413
- let mut config = ServerConfig::default();
414
- let result = config.apply_env_overrides();
415
-
416
- assert!(result.is_err());
417
- let err_msg = result.unwrap_err().to_string();
418
- assert!(err_msg.contains("KREUZBERG_MAX_REQUEST_BODY_BYTES"));
419
-
420
- cleanup_env_vars();
421
- restore_env(saved);
422
- }
423
-
424
- // Test 16: Partial overrides - only host, not port
425
- #[test]
426
- #[serial_test::serial]
427
- fn test_partial_overrides_host_only() {
428
- let saved = save_env(&["KREUZBERG_HOST", "KREUZBERG_PORT"]);
429
-
430
- let dir = tempdir().unwrap();
431
- let config_path = dir.path().join("config.toml");
432
-
433
- fs::write(
434
- &config_path,
435
- r#"
436
- host = "file-host"
437
- port = 8001
438
- "#,
439
- )
440
- .unwrap();
441
-
442
- set_env("KREUZBERG_HOST", "env-host");
443
- // Explicitly don't set KREUZBERG_PORT
444
-
445
- let mut config = ServerConfig::from_file(&config_path).unwrap();
446
- config.apply_env_overrides().unwrap();
447
-
448
- assert_eq!(config.host, "env-host", "Host should be overridden by env");
449
- assert_eq!(config.port, 8001, "Port should keep file value");
450
-
451
- cleanup_env_vars();
452
- restore_env(saved);
453
- }
454
-
455
- // Test 17: Partial overrides - only port, not host
456
- #[test]
457
- #[serial_test::serial]
458
- fn test_partial_overrides_port_only() {
459
- let saved = save_env(&["KREUZBERG_HOST", "KREUZBERG_PORT"]);
460
-
461
- let dir = tempdir().unwrap();
462
- let config_path = dir.path().join("config.toml");
463
-
464
- fs::write(
465
- &config_path,
466
- r#"
467
- host = "file-host"
468
- port = 8001
469
- "#,
470
- )
471
- .unwrap();
472
-
473
- set_env("KREUZBERG_PORT", "9000");
474
- // Explicitly don't set KREUZBERG_HOST
475
-
476
- let mut config = ServerConfig::from_file(&config_path).unwrap();
477
- config.apply_env_overrides().unwrap();
478
-
479
- assert_eq!(config.host, "file-host", "Host should keep file value");
480
- assert_eq!(config.port, 9000, "Port should be overridden by env");
481
-
482
- cleanup_env_vars();
483
- restore_env(saved);
484
- }
485
-
486
- // Test 18: Complex scenario with multiple settings
487
- #[test]
488
- #[serial_test::serial]
489
- fn test_complex_scenario_multiple_settings() {
490
- let saved = save_env(&[
491
- "KREUZBERG_HOST",
492
- "KREUZBERG_PORT",
493
- "KREUZBERG_CORS_ORIGINS",
494
- "KREUZBERG_MAX_REQUEST_BODY_BYTES",
495
- ]);
496
-
497
- let dir = tempdir().unwrap();
498
- let config_path = dir.path().join("config.toml");
499
-
500
- fs::write(
501
- &config_path,
502
- r#"
503
- host = "127.0.0.1"
504
- port = 8000
505
- cors_origins = ["https://file.com"]
506
- max_request_body_bytes = 50000000
507
- max_multipart_field_bytes = 75000000
508
- "#,
509
- )
510
- .unwrap();
511
-
512
- // Override some settings
513
- set_env("KREUZBERG_HOST", "0.0.0.0");
514
- set_env("KREUZBERG_PORT", "3000");
515
- set_env("KREUZBERG_CORS_ORIGINS", "https://env.com");
516
- // Don't set max_request_body_bytes - should keep file value
517
-
518
- let mut config = ServerConfig::from_file(&config_path).unwrap();
519
- config.apply_env_overrides().unwrap();
520
-
521
- assert_eq!(config.host, "0.0.0.0");
522
- assert_eq!(config.port, 3000);
523
- assert_eq!(config.cors_origins.len(), 1);
524
- assert_eq!(config.cors_origins[0], "https://env.com");
525
- assert_eq!(config.max_request_body_bytes, 50_000_000, "File value should persist");
526
- assert_eq!(config.max_multipart_field_bytes, 75_000_000);
527
-
528
- cleanup_env_vars();
529
- restore_env(saved);
530
- }
531
-
532
- // Test 19: listen_addr helper method
533
- #[test]
534
- fn test_listen_addr_helper() {
535
- let mut config = ServerConfig::default();
536
- assert_eq!(config.listen_addr(), "127.0.0.1:8000");
537
-
538
- config.host = "0.0.0.0".to_string();
539
- config.port = 3000;
540
- assert_eq!(config.listen_addr(), "0.0.0.0:3000");
541
- }
542
-
543
- // Test 20: Upload limits conversion to MB
544
- #[test]
545
- fn test_upload_limits_to_mb_conversion() {
546
- let mut config = ServerConfig::default();
547
-
548
- // Test request body MB
549
- assert_eq!(config.max_request_body_mb(), 100);
550
-
551
- config.max_request_body_bytes = 1_048_576; // 1 MB
552
- assert_eq!(config.max_request_body_mb(), 1);
553
-
554
- config.max_request_body_bytes = 1_048_577; // 1 MB + 1 byte - should round up
555
- assert_eq!(config.max_request_body_mb(), 2);
556
-
557
- // Test multipart field MB
558
- config.max_multipart_field_bytes = 1_048_576;
559
- assert_eq!(config.max_multipart_field_mb(), 1);
560
-
561
- config.max_multipart_field_bytes = 52_428_800; // 50 MB
562
- assert_eq!(config.max_multipart_field_mb(), 50);
563
- }
564
-
565
- // Test 21: Serialization consistency
566
- #[test]
567
- fn test_serialization_consistency() {
568
- let dir = tempdir().unwrap();
569
- let config_path = dir.path().join("config.toml");
570
-
571
- let original = r#"
572
- host = "192.168.1.100"
573
- port = 9000
574
- cors_origins = ["https://app.com"]
575
- max_request_body_bytes = 50000000
576
- max_multipart_field_bytes = 75000000
577
- "#;
578
-
579
- fs::write(&config_path, original).unwrap();
580
-
581
- let config = ServerConfig::from_file(&config_path).unwrap();
582
-
583
- // Serialize back
584
- let serialized = toml::to_string(&config).unwrap();
585
-
586
- // Deserialize again
587
- let config2: ServerConfig = toml::from_str(&serialized).unwrap();
588
-
589
- // Verify consistency
590
- assert_eq!(config.host, config2.host);
591
- assert_eq!(config.port, config2.port);
592
- assert_eq!(config.cors_origins, config2.cors_origins);
593
- assert_eq!(config.max_request_body_bytes, config2.max_request_body_bytes);
594
- assert_eq!(config.max_multipart_field_bytes, config2.max_multipart_field_bytes);
595
- }
596
-
597
- // Test 22: Empty CORS origins with env override
598
- #[test]
599
- #[serial_test::serial]
600
- fn test_empty_cors_to_specific_via_env() {
601
- let saved = save_env(&["KREUZBERG_CORS_ORIGINS"]);
602
-
603
- let dir = tempdir().unwrap();
604
- let config_path = dir.path().join("config.toml");
605
-
606
- fs::write(
607
- &config_path,
608
- r#"
609
- host = "127.0.0.1"
610
- port = 8000
611
- "#,
612
- )
613
- .unwrap();
614
-
615
- let mut config = ServerConfig::from_file(&config_path).unwrap();
616
- assert!(config.cors_allows_all(), "File config allows all origins");
617
-
618
- // Override with specific origins
619
- set_env("KREUZBERG_CORS_ORIGINS", "https://restricted.com");
620
- config.apply_env_overrides().unwrap();
621
-
622
- assert!(!config.cors_allows_all(), "Should now restrict to specific origin");
623
- assert!(config.is_origin_allowed("https://restricted.com"));
624
- assert!(!config.is_origin_allowed("https://other.com"));
625
-
626
- cleanup_env_vars();
627
- restore_env(saved);
628
- }
629
-
630
- // Test 23: Max upload limits in different formats
631
- #[test]
632
- fn test_max_limits_across_formats() {
633
- let dir = tempdir().unwrap();
634
-
635
- // Test TOML
636
- let toml_path = dir.path().join("config.toml");
637
- fs::write(
638
- &toml_path,
639
- r#"
640
- max_request_body_bytes = 100000000
641
- max_multipart_field_bytes = 200000000
642
- "#,
643
- )
644
- .unwrap();
645
-
646
- let toml_config = ServerConfig::from_file(&toml_path).unwrap();
647
- assert_eq!(toml_config.max_request_body_bytes, 100_000_000);
648
- assert_eq!(toml_config.max_multipart_field_bytes, 200_000_000);
649
-
650
- // Test YAML
651
- let yaml_path = dir.path().join("config.yaml");
652
- fs::write(
653
- &yaml_path,
654
- r#"
655
- max_request_body_bytes: 100000000
656
- max_multipart_field_bytes: 200000000
657
- "#,
658
- )
659
- .unwrap();
660
-
661
- let yaml_config = ServerConfig::from_file(&yaml_path).unwrap();
662
- assert_eq!(yaml_config.max_request_body_bytes, 100_000_000);
663
- assert_eq!(yaml_config.max_multipart_field_bytes, 200_000_000);
664
-
665
- // Test JSON
666
- let json_path = dir.path().join("config.json");
667
- fs::write(
668
- &json_path,
669
- r#"{
670
- "max_request_body_bytes": 100000000,
671
- "max_multipart_field_bytes": 200000000
672
- }
673
- "#,
674
- )
675
- .unwrap();
676
-
677
- let json_config = ServerConfig::from_file(&json_path).unwrap();
678
- assert_eq!(json_config.max_request_body_bytes, 100_000_000);
679
- assert_eq!(json_config.max_multipart_field_bytes, 200_000_000);
680
- }
681
-
682
- // Test 24: Port validation at bounds
683
- #[test]
684
- #[serial_test::serial]
685
- fn test_port_validation_bounds() {
686
- let saved = save_env(&["KREUZBERG_PORT"]);
687
-
688
- // Valid port: 0
689
- set_env("KREUZBERG_PORT", "0");
690
- let mut config = ServerConfig::default();
691
- config.apply_env_overrides().unwrap();
692
- assert_eq!(config.port, 0);
693
-
694
- // Valid port: 65535 (max u16)
695
- set_env("KREUZBERG_PORT", "65535");
696
- let mut config = ServerConfig::default();
697
- config.apply_env_overrides().unwrap();
698
- assert_eq!(config.port, 65535);
699
-
700
- // Invalid port: too large
701
- set_env("KREUZBERG_PORT", "65536");
702
- let mut config = ServerConfig::default();
703
- let result = config.apply_env_overrides();
704
- assert!(result.is_err());
705
-
706
- cleanup_env_vars();
707
- restore_env(saved);
708
- }
709
-
710
- // Test 25: Multiple env var overrides at once
711
- #[test]
712
- #[serial_test::serial]
713
- fn test_multiple_env_overrides_simultaneous() {
714
- let saved = save_env(&[
715
- "KREUZBERG_HOST",
716
- "KREUZBERG_PORT",
717
- "KREUZBERG_CORS_ORIGINS",
718
- "KREUZBERG_MAX_REQUEST_BODY_BYTES",
719
- "KREUZBERG_MAX_MULTIPART_FIELD_BYTES",
720
- ]);
721
-
722
- let dir = tempdir().unwrap();
723
- let config_path = dir.path().join("config.toml");
724
-
725
- fs::write(
726
- &config_path,
727
- r#"
728
- host = "127.0.0.1"
729
- port = 8000
730
- "#,
731
- )
732
- .unwrap();
733
-
734
- // Set all env vars
735
- set_env("KREUZBERG_HOST", "192.168.1.1");
736
- set_env("KREUZBERG_PORT", "5000");
737
- set_env("KREUZBERG_CORS_ORIGINS", "https://api.com, https://app.com");
738
- set_env("KREUZBERG_MAX_REQUEST_BODY_BYTES", "150000000");
739
- set_env("KREUZBERG_MAX_MULTIPART_FIELD_BYTES", "250000000");
740
-
741
- let mut config = ServerConfig::from_file(&config_path).unwrap();
742
- config.apply_env_overrides().unwrap();
743
-
744
- // All should be overridden
745
- assert_eq!(config.host, "192.168.1.1");
746
- assert_eq!(config.port, 5000);
747
- assert_eq!(config.cors_origins.len(), 2);
748
- assert_eq!(config.max_request_body_bytes, 150_000_000);
749
- assert_eq!(config.max_multipart_field_bytes, 250_000_000);
750
-
751
- cleanup_env_vars();
752
- restore_env(saved);
753
- }