kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,90 @@
1
+ //! Environment variable overrides for server configuration.
2
+ //!
3
+ //! This module provides functionality to override server configuration values
4
+ //! using environment variables. All settings can be overridden at runtime.
5
+
6
+ use crate::{KreuzbergError, Result};
7
+
8
+ /// Apply environment variable overrides to a ServerConfig.
9
+ ///
10
+ /// Reads the following environment variables and overrides config values if set:
11
+ ///
12
+ /// - `KREUZBERG_HOST` - Server host address
13
+ /// - `KREUZBERG_PORT` - Server port number (parsed as u16)
14
+ /// - `KREUZBERG_CORS_ORIGINS` - Comma-separated list of allowed origins
15
+ /// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` - Max request body size in bytes
16
+ /// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` - Max multipart field size in bytes
17
+ /// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` - Max upload size in MB (legacy)
18
+ ///
19
+ /// # Errors
20
+ ///
21
+ /// Returns `KreuzbergError::Validation` if:
22
+ /// - `KREUZBERG_PORT` cannot be parsed as u16
23
+ /// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` cannot be parsed as usize
24
+ /// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` cannot be parsed as usize
25
+ /// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` cannot be parsed as usize
26
+ pub fn apply_env_overrides(
27
+ host: &mut String,
28
+ port: &mut u16,
29
+ cors_origins: &mut Vec<String>,
30
+ max_request_body_bytes: &mut usize,
31
+ max_multipart_field_bytes: &mut usize,
32
+ max_upload_mb: &mut Option<usize>,
33
+ ) -> Result<()> {
34
+ // Host override
35
+ if let Ok(env_host) = std::env::var("KREUZBERG_HOST") {
36
+ *host = env_host;
37
+ }
38
+
39
+ // Port override
40
+ if let Ok(port_str) = std::env::var("KREUZBERG_PORT") {
41
+ *port = port_str.parse::<u16>().map_err(|e| {
42
+ KreuzbergError::validation(format!(
43
+ "KREUZBERG_PORT must be a valid u16 number, got '{}': {}",
44
+ port_str, e
45
+ ))
46
+ })?;
47
+ }
48
+
49
+ // CORS origins override (comma-separated)
50
+ if let Ok(origins_str) = std::env::var("KREUZBERG_CORS_ORIGINS") {
51
+ *cors_origins = origins_str
52
+ .split(',')
53
+ .map(|s| s.trim().to_string())
54
+ .filter(|s| !s.is_empty())
55
+ .collect();
56
+ }
57
+
58
+ // Max request body bytes override
59
+ if let Ok(bytes_str) = std::env::var("KREUZBERG_MAX_REQUEST_BODY_BYTES") {
60
+ *max_request_body_bytes = bytes_str.parse::<usize>().map_err(|e| {
61
+ KreuzbergError::validation(format!(
62
+ "KREUZBERG_MAX_REQUEST_BODY_BYTES must be a valid usize, got '{}': {}",
63
+ bytes_str, e
64
+ ))
65
+ })?;
66
+ }
67
+
68
+ // Max multipart field bytes override
69
+ if let Ok(bytes_str) = std::env::var("KREUZBERG_MAX_MULTIPART_FIELD_BYTES") {
70
+ *max_multipart_field_bytes = bytes_str.parse::<usize>().map_err(|e| {
71
+ KreuzbergError::validation(format!(
72
+ "KREUZBERG_MAX_MULTIPART_FIELD_BYTES must be a valid usize, got '{}': {}",
73
+ bytes_str, e
74
+ ))
75
+ })?;
76
+ }
77
+
78
+ // Legacy max upload size override (in MB)
79
+ if let Ok(mb_str) = std::env::var("KREUZBERG_MAX_UPLOAD_SIZE_MB") {
80
+ let mb = mb_str.parse::<usize>().map_err(|e| {
81
+ KreuzbergError::validation(format!(
82
+ "KREUZBERG_MAX_UPLOAD_SIZE_MB must be a valid usize, got '{}': {}",
83
+ mb_str, e
84
+ ))
85
+ })?;
86
+ *max_upload_mb = Some(mb);
87
+ }
88
+
89
+ Ok(())
90
+ }
@@ -0,0 +1,202 @@
1
+ //! File loading logic for server configuration.
2
+ //!
3
+ //! This module provides functionality to load server configuration from various
4
+ //! file formats (TOML, YAML, JSON) with support for both flat and nested formats.
5
+
6
+ use crate::{KreuzbergError, Result};
7
+ use serde::Deserialize;
8
+ use std::path::Path;
9
+
10
+ use super::ServerConfig;
11
+
12
+ /// Load server configuration from a file.
13
+ ///
14
+ /// Automatically detects the file format based on extension:
15
+ /// - `.toml` - TOML format
16
+ /// - `.yaml` or `.yml` - YAML format
17
+ /// - `.json` - JSON format
18
+ ///
19
+ /// This function handles two config file formats:
20
+ /// 1. Flat format: Server config at root level
21
+ /// 2. Nested format: Server config under `[server]` section (combined with ExtractionConfig)
22
+ ///
23
+ /// # Arguments
24
+ ///
25
+ /// * `path` - Path to the configuration file
26
+ ///
27
+ /// # Errors
28
+ ///
29
+ /// Returns `KreuzbergError::Validation` if:
30
+ /// - File doesn't exist or cannot be read
31
+ /// - File extension is not recognized
32
+ /// - File content is invalid for the detected format
33
+ pub fn from_file(path: impl AsRef<Path>) -> Result<ServerConfig> {
34
+ let path = path.as_ref();
35
+
36
+ let content = std::fs::read_to_string(path)
37
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
38
+
39
+ let extension = path.extension().and_then(|ext| ext.to_str()).ok_or_else(|| {
40
+ KreuzbergError::validation(format!(
41
+ "Cannot determine file format: no extension found in {}",
42
+ path.display()
43
+ ))
44
+ })?;
45
+
46
+ let mut config = match extension.to_lowercase().as_str() {
47
+ "toml" => from_toml_str(&content, path)?,
48
+ "yaml" | "yml" => from_yaml_str(&content, path)?,
49
+ "json" => from_json_str(&content, path)?,
50
+ _ => {
51
+ return Err(KreuzbergError::validation(format!(
52
+ "Unsupported config file format: .{}. Supported formats: .toml, .yaml, .yml, .json",
53
+ extension
54
+ )));
55
+ }
56
+ };
57
+
58
+ // Normalize legacy fields
59
+ config.normalize_legacy_fields();
60
+
61
+ Ok(config)
62
+ }
63
+
64
+ /// Load server configuration from a TOML file.
65
+ ///
66
+ /// # Arguments
67
+ ///
68
+ /// * `path` - Path to the TOML file
69
+ ///
70
+ /// # Errors
71
+ ///
72
+ /// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid TOML.
73
+ pub fn from_toml_file(path: impl AsRef<Path>) -> Result<ServerConfig> {
74
+ let path = path.as_ref();
75
+
76
+ let content = std::fs::read_to_string(path)
77
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
78
+
79
+ let mut config: ServerConfig = toml::from_str(&content)
80
+ .map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))?;
81
+
82
+ config.normalize_legacy_fields();
83
+
84
+ Ok(config)
85
+ }
86
+
87
+ /// Load server configuration from a YAML file.
88
+ ///
89
+ /// # Arguments
90
+ ///
91
+ /// * `path` - Path to the YAML file
92
+ ///
93
+ /// # Errors
94
+ ///
95
+ /// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid YAML.
96
+ pub fn from_yaml_file(path: impl AsRef<Path>) -> Result<ServerConfig> {
97
+ let path = path.as_ref();
98
+
99
+ let content = std::fs::read_to_string(path)
100
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
101
+
102
+ let mut config: ServerConfig = serde_yaml_ng::from_str(&content)
103
+ .map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))?;
104
+
105
+ config.normalize_legacy_fields();
106
+
107
+ Ok(config)
108
+ }
109
+
110
+ /// Load server configuration from a JSON file.
111
+ ///
112
+ /// # Arguments
113
+ ///
114
+ /// * `path` - Path to the JSON file
115
+ ///
116
+ /// # Errors
117
+ ///
118
+ /// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid JSON.
119
+ pub fn from_json_file(path: impl AsRef<Path>) -> Result<ServerConfig> {
120
+ let path = path.as_ref();
121
+
122
+ let content = std::fs::read_to_string(path)
123
+ .map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
124
+
125
+ let mut config: ServerConfig = serde_json::from_str(&content)
126
+ .map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))?;
127
+
128
+ config.normalize_legacy_fields();
129
+
130
+ Ok(config)
131
+ }
132
+
133
+ // Helper functions for parsing different formats
134
+
135
+ fn from_toml_str(content: &str, path: &Path) -> Result<ServerConfig> {
136
+ // Try nested format first (with [server] section)
137
+ #[derive(Deserialize)]
138
+ struct RootConfig {
139
+ #[serde(default)]
140
+ server: Option<ServerConfig>,
141
+ }
142
+
143
+ if let Ok(root) = toml::from_str::<RootConfig>(content) {
144
+ if let Some(server) = root.server {
145
+ return Ok(server);
146
+ } else {
147
+ // No [server] section, try flat format
148
+ return toml::from_str::<ServerConfig>(content)
149
+ .map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)));
150
+ }
151
+ }
152
+
153
+ // Fall back to flat format
154
+ toml::from_str::<ServerConfig>(content)
155
+ .map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))
156
+ }
157
+
158
+ fn from_yaml_str(content: &str, path: &Path) -> Result<ServerConfig> {
159
+ // Try nested format first (with server: section)
160
+ #[derive(Deserialize)]
161
+ struct RootConfig {
162
+ #[serde(default)]
163
+ server: Option<ServerConfig>,
164
+ }
165
+
166
+ if let Ok(root) = serde_yaml_ng::from_str::<RootConfig>(content) {
167
+ if let Some(server) = root.server {
168
+ return Ok(server);
169
+ } else {
170
+ // No server section, try flat format
171
+ return serde_yaml_ng::from_str::<ServerConfig>(content)
172
+ .map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)));
173
+ }
174
+ }
175
+
176
+ // Fall back to flat format
177
+ serde_yaml_ng::from_str::<ServerConfig>(content)
178
+ .map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))
179
+ }
180
+
181
+ fn from_json_str(content: &str, path: &Path) -> Result<ServerConfig> {
182
+ // Try nested format first (with "server" key)
183
+ #[derive(Deserialize)]
184
+ struct RootConfig {
185
+ #[serde(default)]
186
+ server: Option<ServerConfig>,
187
+ }
188
+
189
+ if let Ok(root) = serde_json::from_str::<RootConfig>(content) {
190
+ if let Some(server) = root.server {
191
+ return Ok(server);
192
+ } else {
193
+ // No server key, try flat format
194
+ return serde_json::from_str::<ServerConfig>(content)
195
+ .map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)));
196
+ }
197
+ }
198
+
199
+ // Fall back to flat format
200
+ serde_json::from_str::<ServerConfig>(content)
201
+ .map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))
202
+ }
@@ -0,0 +1,380 @@
1
+ //! Server configuration for the Kreuzberg API.
2
+ //!
3
+ //! This module provides the `ServerConfig` struct for managing API server settings
4
+ //! including host, port, CORS, and upload size limits. Configuration can be loaded
5
+ //! from TOML, YAML, or JSON files and can be overridden by environment variables.
6
+ //!
7
+ //! # Features
8
+ //!
9
+ //! - **Multi-format support**: Load configuration from TOML, YAML, or JSON files
10
+ //! - **Environment overrides**: All settings can be overridden via environment variables
11
+ //! - **Backward compatibility**: Supports legacy `max_upload_mb` field for smooth migrations
12
+ //! - **Sensible defaults**: All fields have reasonable defaults matching current behavior
13
+ //! - **Flexible CORS**: Support for all origins (default) or specific origin lists
14
+ //!
15
+ //! # Example
16
+ //!
17
+ //! ```rust,no_run
18
+ //! use kreuzberg::core::ServerConfig;
19
+ //!
20
+ //! # fn example() -> kreuzberg::Result<()> {
21
+ //! // Create with defaults
22
+ //! let mut config = ServerConfig::default();
23
+ //!
24
+ //! // Or load from file
25
+ //! let mut config = ServerConfig::from_file("kreuzberg.toml")?;
26
+ //!
27
+ //! // Apply environment variable overrides
28
+ //! config.apply_env_overrides()?;
29
+ //!
30
+ //! # Ok(())
31
+ //! # }
32
+ //! ```
33
+
34
+ use crate::Result;
35
+ use serde::{Deserialize, Serialize};
36
+ use std::path::Path;
37
+
38
+ mod env;
39
+ mod loader;
40
+ mod validation;
41
+
42
+ #[cfg(test)]
43
+ mod tests;
44
+
45
/// Host the API server uses when none is configured
const DEFAULT_HOST: &str = "127.0.0.1";

/// Port the API server uses when none is configured
const DEFAULT_PORT: u16 = 8000;

/// Request body size limit used when none is configured: 100 MB
const DEFAULT_MAX_REQUEST_BODY_BYTES: usize = 100 * 1024 * 1024;

/// Multipart field size limit used when none is configured: 100 MB
const DEFAULT_MAX_MULTIPART_FIELD_BYTES: usize = 100 * 1024 * 1024;
56
+
57
/// API server configuration.
///
/// This struct holds all configuration options for the Kreuzberg API server,
/// including host/port settings, CORS configuration, and upload limits.
///
/// Every field is optional when deserializing: the container-level
/// `#[serde(default)]` and the per-field default functions fill in anything
/// the input omits.
///
/// # Defaults
///
/// - `host`: "127.0.0.1" (localhost only)
/// - `port`: 8000
/// - `cors_origins`: empty vector (allows all origins)
/// - `max_request_body_bytes`: 104_857_600 (100 MB)
/// - `max_multipart_field_bytes`: 104_857_600 (100 MB)
/// - `max_upload_mb`: None (legacy field; when set it overrides
///   `max_multipart_field_bytes` during normalization, see
///   `normalize_legacy_fields`)
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct ServerConfig {
    /// Server host address (e.g., "127.0.0.1", "0.0.0.0")
    #[serde(default = "default_host")]
    pub host: String,

    /// Server port number
    #[serde(default = "default_port")]
    pub port: u16,

    /// CORS allowed origins. Empty vector means allow all origins.
    ///
    /// If this is an empty vector, the server will accept requests from any origin.
    /// If populated with specific origins (e.g., ["https://example.com"]), only
    /// those origins will be allowed.
    #[serde(default)]
    pub cors_origins: Vec<String>,

    /// Maximum size of request body in bytes (default: 100 MB)
    #[serde(default = "default_max_request_body_bytes")]
    pub max_request_body_bytes: usize,

    /// Maximum size of multipart fields in bytes (default: 100 MB)
    #[serde(default = "default_max_multipart_field_bytes")]
    pub max_multipart_field_bytes: usize,

    /// Legacy upload size limit in MB (for backward compatibility).
    ///
    /// This field is deprecated and only used for backward compatibility.
    /// If set, it will override `max_multipart_field_bytes` during normalization.
    /// New configurations should use `max_multipart_field_bytes` directly.
    ///
    /// Omitted from serialized output when it is `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_upload_mb: Option<usize>,
}
105
+
106
+ impl Default for ServerConfig {
107
+ fn default() -> Self {
108
+ Self {
109
+ host: default_host(),
110
+ port: default_port(),
111
+ cors_origins: Vec::new(),
112
+ max_request_body_bytes: default_max_request_body_bytes(),
113
+ max_multipart_field_bytes: default_max_multipart_field_bytes(),
114
+ max_upload_mb: None,
115
+ }
116
+ }
117
+ }
118
+
119
+ // Default value functions for serde
120
+ fn default_host() -> String {
121
+ DEFAULT_HOST.to_string()
122
+ }
123
+
124
+ fn default_port() -> u16 {
125
+ DEFAULT_PORT
126
+ }
127
+
128
+ fn default_max_request_body_bytes() -> usize {
129
+ DEFAULT_MAX_REQUEST_BODY_BYTES
130
+ }
131
+
132
+ fn default_max_multipart_field_bytes() -> usize {
133
+ DEFAULT_MAX_MULTIPART_FIELD_BYTES
134
+ }
135
+
136
+ impl ServerConfig {
137
/// Create a new `ServerConfig` with default values.
///
/// Equivalent to `ServerConfig::default()`; the individual default values
/// are listed in the struct-level documentation.
pub fn new() -> Self {
    Self::default()
}
141
+
142
+ /// Get the server listen address (host:port).
143
+ ///
144
+ /// # Example
145
+ ///
146
+ /// ```rust
147
+ /// use kreuzberg::core::ServerConfig;
148
+ ///
149
+ /// let config = ServerConfig::default();
150
+ /// assert_eq!(config.listen_addr(), "127.0.0.1:8000");
151
+ /// ```
152
+ pub fn listen_addr(&self) -> String {
153
+ format!("{}:{}", self.host, self.port)
154
+ }
155
+
156
/// Check if CORS allows all origins.
///
/// Returns `true` if the `cors_origins` vector is empty, meaning all origins
/// are allowed. Returns `false` if specific origins are configured.
///
/// `ServerConfig::default()` starts with an empty list, so a default
/// configuration allows all origins.
///
/// # Example
///
/// ```rust
/// use kreuzberg::core::ServerConfig;
///
/// let mut config = ServerConfig::default();
/// assert!(config.cors_allows_all());
///
/// config.cors_origins.push("https://example.com".to_string());
/// assert!(!config.cors_allows_all());
/// ```
pub fn cors_allows_all(&self) -> bool {
    self.cors_origins.is_empty()
}
175
+
176
+ /// Check if a given origin is allowed by CORS configuration.
177
+ ///
178
+ /// Returns `true` if:
179
+ /// - CORS allows all origins (empty origins list), or
180
+ /// - The given origin is in the allowed origins list
181
+ ///
182
+ /// # Arguments
183
+ ///
184
+ /// * `origin` - The origin to check (e.g., "https://example.com")
185
+ ///
186
+ /// # Example
187
+ ///
188
+ /// ```rust
189
+ /// use kreuzberg::core::ServerConfig;
190
+ ///
191
+ /// let mut config = ServerConfig::default();
192
+ /// assert!(config.is_origin_allowed("https://example.com"));
193
+ ///
194
+ /// config.cors_origins.push("https://allowed.com".to_string());
195
+ /// assert!(config.is_origin_allowed("https://allowed.com"));
196
+ /// assert!(!config.is_origin_allowed("https://denied.com"));
197
+ /// ```
198
+ pub fn is_origin_allowed(&self, origin: &str) -> bool {
199
+ self.cors_origins.is_empty() || self.cors_origins.contains(&origin.to_string())
200
+ }
201
+
202
+ /// Get maximum request body size in megabytes (rounded up).
203
+ ///
204
+ /// # Example
205
+ ///
206
+ /// ```rust
207
+ /// use kreuzberg::core::ServerConfig;
208
+ ///
209
+ /// let mut config = ServerConfig::default();
210
+ /// assert_eq!(config.max_request_body_mb(), 100);
211
+ /// ```
212
+ pub fn max_request_body_mb(&self) -> usize {
213
+ self.max_request_body_bytes.div_ceil(1_048_576)
214
+ }
215
+
216
+ /// Get maximum multipart field size in megabytes (rounded up).
217
+ ///
218
+ /// # Example
219
+ ///
220
+ /// ```rust
221
+ /// use kreuzberg::core::ServerConfig;
222
+ ///
223
+ /// let mut config = ServerConfig::default();
224
+ /// assert_eq!(config.max_multipart_field_mb(), 100);
225
+ /// ```
226
+ pub fn max_multipart_field_mb(&self) -> usize {
227
+ self.max_multipart_field_bytes.div_ceil(1_048_576)
228
+ }
229
+
230
+ /// Normalize legacy field values for backward compatibility.
231
+ ///
232
+ /// If `max_upload_mb` is set, it will be converted to bytes and used to
233
+ /// override `max_multipart_field_bytes`. This allows old configurations
234
+ /// using the legacy field to continue working.
235
+ ///
236
+ /// This method is automatically called by `apply_env_overrides()`.
237
+ pub fn normalize_legacy_fields(&mut self) {
238
+ validation::normalize_legacy_fields(self.max_upload_mb, &mut self.max_multipart_field_bytes);
239
+ }
240
+
241
+ /// Apply environment variable overrides to the configuration.
242
+ ///
243
+ /// Reads the following environment variables and overrides config values if set:
244
+ ///
245
+ /// - `KREUZBERG_HOST` - Server host address
246
+ /// - `KREUZBERG_PORT` - Server port number (parsed as u16)
247
+ /// - `KREUZBERG_CORS_ORIGINS` - Comma-separated list of allowed origins
248
+ /// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` - Max request body size in bytes
249
+ /// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` - Max multipart field size in bytes
250
+ /// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` - Max upload size in MB (legacy)
251
+ ///
252
+ /// # Errors
253
+ ///
254
+ /// Returns `KreuzbergError::Validation` if:
255
+ /// - `KREUZBERG_PORT` cannot be parsed as u16
256
+ /// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` cannot be parsed as usize
257
+ /// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` cannot be parsed as usize
258
+ /// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` cannot be parsed as usize
259
+ ///
260
+ /// # Example
261
+ ///
262
+ /// ```rust,no_run
263
+ /// use kreuzberg::core::ServerConfig;
264
+ ///
265
+ /// # fn example() -> kreuzberg::Result<()> {
266
+ /// unsafe {
267
+ /// std::env::set_var("KREUZBERG_HOST", "0.0.0.0");
268
+ /// std::env::set_var("KREUZBERG_PORT", "3000");
269
+ /// }
270
+ ///
271
+ /// let mut config = ServerConfig::default();
272
+ /// config.apply_env_overrides()?;
273
+ ///
274
+ /// assert_eq!(config.host, "0.0.0.0");
275
+ /// assert_eq!(config.port, 3000);
276
+ /// # Ok(())
277
+ /// # }
278
+ /// ```
279
+ pub fn apply_env_overrides(&mut self) -> Result<()> {
280
+ env::apply_env_overrides(
281
+ &mut self.host,
282
+ &mut self.port,
283
+ &mut self.cors_origins,
284
+ &mut self.max_request_body_bytes,
285
+ &mut self.max_multipart_field_bytes,
286
+ &mut self.max_upload_mb,
287
+ )?;
288
+
289
+ // Apply legacy field normalization
290
+ self.normalize_legacy_fields();
291
+
292
+ Ok(())
293
+ }
294
+
295
/// Load server configuration from a file.
///
/// Automatically detects the file format based on extension:
/// - `.toml` - TOML format
/// - `.yaml` or `.yml` - YAML format
/// - `.json` - JSON format
///
/// This function handles two config file formats:
/// 1. Flat format: Server config at root level
/// 2. Nested format: Server config under `[server]` section (combined with ExtractionConfig)
///
/// The work is delegated to the `loader` module.
///
/// # Arguments
///
/// * `path` - Path to the configuration file
///
/// # Errors
///
/// Returns `KreuzbergError::Validation` if:
/// - File doesn't exist or cannot be read
/// - File extension is not recognized
/// - File content is invalid for the detected format
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::ServerConfig;
///
/// # fn example() -> kreuzberg::Result<()> {
/// let config = ServerConfig::from_file("kreuzberg.toml")?;
/// # Ok(())
/// # }
/// ```
pub fn from_file(path: impl AsRef<Path>) -> Result<Self> {
    // NOTE(review): the module-level example calls `apply_env_overrides()`
    // after loading, so presumably loading does not apply environment
    // overrides itself; confirm against `loader::from_file`.
    loader::from_file(path)
}
330
+
331
/// Load server configuration from a TOML file.
///
/// The content is treated as TOML; parsing is delegated to
/// `loader::from_toml_file`.
///
/// # Arguments
///
/// * `path` - Path to the TOML file
///
/// # Errors
///
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid TOML.
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::ServerConfig;
///
/// # fn example() -> kreuzberg::Result<()> {
/// let config = ServerConfig::from_toml_file("kreuzberg.toml")?;
/// # Ok(())
/// # }
/// ```
pub fn from_toml_file(path: impl AsRef<Path>) -> Result<Self> {
    loader::from_toml_file(path)
}
354
+
355
/// Load server configuration from a YAML file.
///
/// The content is treated as YAML; parsing is delegated to
/// `loader::from_yaml_file`.
///
/// # Arguments
///
/// * `path` - Path to the YAML file
///
/// # Errors
///
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid YAML.
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::ServerConfig;
///
/// # fn example() -> kreuzberg::Result<()> {
/// let config = ServerConfig::from_yaml_file("kreuzberg.yaml")?;
/// # Ok(())
/// # }
/// ```
pub fn from_yaml_file(path: impl AsRef<Path>) -> Result<Self> {
    loader::from_yaml_file(path)
}
367
+
368
/// Load server configuration from a JSON file.
///
/// The content is treated as JSON; parsing is delegated to
/// `loader::from_json_file`.
///
/// # Arguments
///
/// * `path` - Path to the JSON file
///
/// # Errors
///
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid JSON.
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::ServerConfig;
///
/// # fn example() -> kreuzberg::Result<()> {
/// let config = ServerConfig::from_json_file("kreuzberg.json")?;
/// # Ok(())
/// # }
/// ```
pub fn from_json_file(path: impl AsRef<Path>) -> Result<Self> {
    loader::from_json_file(path)
}
380
+ }