kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
data/sig/kreuzberg.rbs CHANGED
@@ -250,7 +250,56 @@ module Kreuzberg
250
250
  tables: Array[table_hash]?,
251
251
  detected_languages: Array[String]?,
252
252
  chunks: Array[chunk_hash]?,
253
- images: Array[image_hash]?
253
+ images: Array[image_hash]?,
254
+ elements: Array[element_hash]?,
255
+ djot_content: djot_content_hash?
256
+ }
257
+
258
+ type djot_content_hash = {
259
+ plain_text: String,
260
+ blocks: Array[formatted_block_hash],
261
+ metadata_json: String,
262
+ tables: Array[table_hash],
263
+ images: Array[djot_image_hash],
264
+ links: Array[djot_link_hash],
265
+ footnotes: Array[footnote_hash],
266
+ attributes: Hash[String, attributes_hash]?
267
+ }
268
+
269
+ type formatted_block_hash = {
270
+ block_type: String,
271
+ level: Integer?,
272
+ content: String?,
273
+ children: Array[formatted_block_hash]?,
274
+ attributes: attributes_hash?
275
+ }
276
+
277
+ type djot_image_hash = {
278
+ url: String,
279
+ alt: String?,
280
+ title: String?,
281
+ attributes: attributes_hash?
282
+ }
283
+
284
+ type djot_link_hash = {
285
+ url: String,
286
+ text: String,
287
+ title: String?,
288
+ link_type: String?
289
+ }
290
+
291
+ type footnote_hash = {
292
+ label: String,
293
+ content: String
294
+ }
295
+
296
+ type attributes_hash = Hash[String, String | Integer | bool | Array[String] | nil]
297
+
298
+ type element_hash = {
299
+ element_id: String,
300
+ element_type: String,
301
+ text: String,
302
+ metadata: Hash[String, untyped]?
254
303
  }
255
304
 
256
305
  type table_hash = {
@@ -359,6 +408,60 @@ module Kreuzberg
359
408
  def to_h: () -> image_hash
360
409
  end
361
410
 
411
+ # Structured Djot document representation
412
+ class DjotContent
413
+ attr_reader plain_text: String
414
+ attr_reader blocks: Array[DjotContent::FormattedBlock]
415
+ attr_reader metadata: Hash[untyped, untyped]
416
+ attr_reader tables: Array[Table]
417
+ attr_reader images: Array[DjotContent::DjotImage]
418
+ attr_reader links: Array[DjotContent::DjotLink]
419
+ attr_reader footnotes: Array[DjotContent::Footnote]
420
+ attr_reader attributes: Hash[String, untyped]?
421
+
422
+ def initialize: (djot_content_hash hash) -> void
423
+ def to_h: () -> djot_content_hash
424
+
425
+ class FormattedBlock
426
+ attr_reader block_type: String
427
+ attr_reader level: Integer?
428
+ attr_reader content: String?
429
+ attr_reader children: Array[FormattedBlock]?
430
+ attr_reader attributes: Hash[String, untyped]?
431
+
432
+ def initialize: (formatted_block_hash hash) -> void
433
+ def to_h: () -> formatted_block_hash
434
+ end
435
+
436
+ class DjotImage
437
+ attr_reader url: String
438
+ attr_reader alt: String?
439
+ attr_reader title: String?
440
+ attr_reader attributes: Hash[String, untyped]?
441
+
442
+ def initialize: (djot_image_hash hash) -> void
443
+ def to_h: () -> djot_image_hash
444
+ end
445
+
446
+ class DjotLink
447
+ attr_reader url: String
448
+ attr_reader text: String
449
+ attr_reader title: String?
450
+ attr_reader link_type: String?
451
+
452
+ def initialize: (djot_link_hash hash) -> void
453
+ def to_h: () -> djot_link_hash
454
+ end
455
+
456
+ class Footnote
457
+ attr_reader label: String
458
+ attr_reader content: String
459
+
460
+ def initialize: (label: String, content: String) -> void
461
+ def to_h: () -> footnote_hash
462
+ end
463
+ end
464
+
362
465
  attr_reader content: String
363
466
  attr_reader mime_type: String
364
467
  attr_reader metadata: Hash[untyped, untyped]
@@ -367,6 +470,7 @@ module Kreuzberg
367
470
  attr_reader detected_languages: Array[String]?
368
471
  attr_reader chunks: Array[Chunk]?
369
472
  attr_reader images: Array[Image]?
473
+ attr_reader djot_content: DjotContent?
370
474
 
371
475
  def initialize: (extraction_result_hash hash) -> void
372
476
  def to_h: () -> Hash[Symbol, untyped]
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.0.8"
6
+ version = "4.1.0"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -29,7 +29,7 @@ serde = { version = "1.0.228", features = ["derive"] }
29
29
  serde_json = "1.0.149"
30
30
 
31
31
  # Error handling
32
- thiserror = "2.0.17"
32
+ thiserror = "2.0.18"
33
33
  anyhow = "1.0"
34
34
 
35
35
  # Async utilities
@@ -47,7 +47,7 @@ hex = "0.4.3"
47
47
  toml = "0.9.11"
48
48
  num_cpus = "1.17.0"
49
49
  once_cell = "1.21.3"
50
- html-to-markdown-rs = { version = "2.22.5", default-features = false }
50
+ html-to-markdown-rs = { version = "2.23.4", default-features = false }
51
51
  reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
52
52
  image = { version = "0.25.9", default-features = false }
53
53
  lzma-rust2 = { version = "0.15.7" }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.0.8"
3
+ version = "4.1.0"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -136,6 +136,7 @@ regex = "1.12.2"
136
136
  serde = { workspace = true }
137
137
  serde_json = { workspace = true }
138
138
  serde_yaml_ng = "0.10.0"
139
+ jotdown = "0.9"
139
140
  toml = { workspace = true }
140
141
  mime_guess = "2.0"
141
142
  rmp-serde = "1.3"
@@ -152,7 +153,7 @@ lopdf = { version = "0.39.0", optional = true }
152
153
  calamine = { version = "0.32.0", features = ["dates"], optional = true }
153
154
  polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
154
155
  roxmltree = { version = "0.21.1", optional = true }
155
- zip = { version = "7.1.0", optional = true }
156
+ zip = { version = "7.2.0", optional = true }
156
157
  mail-parser = { version = "0.11.1", optional = true }
157
158
  msg_parser = { version = "0.1.1", optional = true }
158
159
  html-to-markdown-rs = { workspace = true, features = [
@@ -215,7 +216,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
215
216
  tempfile = { workspace = true }
216
217
  filetime = "0.2"
217
218
  tar = "0.4.44"
218
- zip = "7.1.0"
219
+ zip = "7.2.0"
219
220
  serial_test = "3.3.1"
220
221
  anyhow = { workspace = true }
221
222
  tokio-test = "0.4"
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.0.0 Release Candidate**
20
+ > **🚀 Version 4.1.0 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -0,0 +1,69 @@
1
+ //! API server configuration loading.
2
+
3
+ use crate::{Result, core::ServerConfig};
4
+
5
+ /// Load a ServerConfig from an optional file (or defaults), then apply environment overrides.
6
+ ///
7
+ /// Configuration sources, from highest to lowest precedence:
8
+ /// 1. Environment variables (applied last, via `apply_env_overrides`)
9
+ /// 2. The config file at `config_path` (if provided)
10
+ /// 3. `ServerConfig::default()` (starting point when no file is provided)
11
+ ///
12
+ /// The config file can be in flat format (server settings at root) or nested format
13
+ /// (server settings under [server] section alongside other configs like [ocr]).
14
+ ///
15
+ /// # Arguments
16
+ ///
17
+ /// * `config_path` - Optional path to a ServerConfig file (TOML, YAML, or JSON)
18
+ ///
19
+ /// # Returns
20
+ ///
21
+ /// The resulting ServerConfig; its key settings are also logged at info level.
22
+ ///
23
+ /// # Errors
24
+ ///
25
+ /// Returns an error if:
26
+ /// - The config file path is provided but cannot be read
27
+ /// - The config file contains invalid server configuration
28
+ /// - Environment variable overrides contain invalid values
29
+ ///
30
+ /// # Examples
31
+ ///
32
+ /// ```no_run
33
+ /// use kreuzberg::api::load_server_config;
34
+ /// use std::path::Path;
35
+ ///
36
+ /// # fn example() -> kreuzberg::Result<()> {
37
+ /// // Load from file with env overrides
38
+ /// let config = load_server_config(Some(Path::new("server.toml")))?;
39
+ ///
40
+ /// // Or use defaults with env overrides
41
+ /// let config = load_server_config(None)?;
42
+ /// # Ok(())
43
+ /// # }
44
+ /// ```
45
+ pub fn load_server_config(config_path: Option<&std::path::Path>) -> Result<ServerConfig> {
46
+ let mut config = if let Some(path) = config_path {
47
+ ServerConfig::from_file(path)?
48
+ } else {
49
+ ServerConfig::default()
50
+ };
51
+
52
+ // Env overrides run after the file/default values are in place, so they are layered on top
53
+ config.apply_env_overrides()?;
54
+
55
+ tracing::info!(
56
+ "Server configuration loaded: host={}, port={}, request_body_limit={} MB, multipart_field_limit={} MB, CORS={}",
57
+ config.host,
58
+ config.port,
59
+ config.max_request_body_mb(),
60
+ config.max_multipart_field_mb(),
61
+ if config.cors_allows_all() {
62
+ "allow all origins".to_string()
63
+ } else {
64
+ format!("{} specific origins", config.cors_origins.len())
65
+ }
66
+ );
67
+
68
+ Ok(config)
69
+ }
@@ -10,8 +10,8 @@ use crate::{batch_extract_bytes, cache, extract_bytes};
10
10
  use super::{
11
11
  error::ApiError,
12
12
  types::{
13
- ApiState, CacheClearResponse, CacheStatsResponse, EmbedRequest, EmbedResponse, ExtractResponse, HealthResponse,
14
- InfoResponse,
13
+ ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest, EmbedResponse,
14
+ ExtractResponse, HealthResponse, InfoResponse,
15
15
  },
16
16
  };
17
17
 
@@ -85,6 +85,25 @@ pub async fn extract_handler(
85
85
  )))
86
86
  })?;
87
87
  }
88
+ "output_format" => {
89
+ let format_str = field
90
+ .text()
91
+ .await
92
+ .map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
93
+
94
+ config.output_format = match format_str.to_lowercase().as_str() {
95
+ "plain" => crate::core::config::OutputFormat::Plain,
96
+ "markdown" => crate::core::config::OutputFormat::Markdown,
97
+ "djot" => crate::core::config::OutputFormat::Djot,
98
+ "html" => crate::core::config::OutputFormat::Html,
99
+ _ => {
100
+ return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
101
+ "Invalid output_format: '{}'. Valid values: 'plain', 'markdown', 'djot', 'html'",
102
+ format_str
103
+ ))));
104
+ }
105
+ };
106
+ }
88
107
  _ => {}
89
108
  }
90
109
  }
@@ -318,3 +337,81 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
318
337
  "Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
319
338
  )))
320
339
  }
340
+
341
+ /// Chunk text endpoint handler.
342
+ ///
343
+ /// POST /chunk
344
+ ///
345
+ /// Accepts JSON body with text and optional configuration.
346
+ /// Returns chunks with metadata.
347
+ #[cfg_attr(
348
+ feature = "otel",
349
+ tracing::instrument(
350
+ name = "api.chunk",
351
+ skip(request),
352
+ fields(text_length = request.text.len(), chunker_type = request.chunker_type.as_str())
353
+ )
354
+ )]
355
+ pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
356
+ use super::types::{ChunkItem, ChunkingConfigResponse};
357
+ use crate::chunking::{ChunkerType, ChunkingConfig, chunk_text};
358
+
359
+ // Validate input
360
+ if request.text.is_empty() {
361
+ return Err(ApiError::validation(crate::error::KreuzbergError::validation(
362
+ "Text cannot be empty",
363
+ )));
364
+ }
365
+
366
+ // Parse chunker_type
367
+ let chunker_type = match request.chunker_type.to_lowercase().as_str() {
368
+ "text" | "" => ChunkerType::Text,
369
+ "markdown" => ChunkerType::Markdown,
370
+ other => {
371
+ return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
372
+ "Invalid chunker_type: '{}'. Valid values: 'text', 'markdown'",
373
+ other
374
+ ))));
375
+ }
376
+ };
377
+
378
+ // Build config with defaults
379
+ let cfg = request.config.unwrap_or_default();
380
+ let config = ChunkingConfig {
381
+ max_characters: cfg.max_characters.unwrap_or(2000),
382
+ overlap: cfg.overlap.unwrap_or(100),
383
+ trim: cfg.trim.unwrap_or(true),
384
+ chunker_type,
385
+ };
386
+
387
+ // Perform chunking
388
+ let result = chunk_text(&request.text, &config, None).map_err(ApiError::internal)?;
389
+
390
+ // Transform to response
391
+ let chunks = result
392
+ .chunks
393
+ .into_iter()
394
+ .map(|chunk| ChunkItem {
395
+ content: chunk.content,
396
+ byte_start: chunk.metadata.byte_start,
397
+ byte_end: chunk.metadata.byte_end,
398
+ chunk_index: chunk.metadata.chunk_index,
399
+ total_chunks: chunk.metadata.total_chunks,
400
+ first_page: chunk.metadata.first_page,
401
+ last_page: chunk.metadata.last_page,
402
+ })
403
+ .collect();
404
+
405
+ Ok(Json(ChunkResponse {
406
+ chunks,
407
+ chunk_count: result.chunk_count,
408
+ config: ChunkingConfigResponse {
409
+ max_characters: config.max_characters,
410
+ overlap: config.overlap,
411
+ trim: config.trim,
412
+ chunker_type: format!("{:?}", config.chunker_type).to_lowercase(),
413
+ },
414
+ input_size_bytes: request.text.len(),
415
+ chunker_type: request.chunker_type.to_lowercase(),
416
+ }))
417
+ }
@@ -7,6 +7,7 @@
7
7
  //!
8
8
  //! - `POST /extract` - Extract text from uploaded files (multipart form data)
9
9
  //! - `POST /embed` - Generate embeddings for text (JSON body with texts array)
10
+ //! - `POST /chunk` - Chunk text into smaller pieces (JSON body with text and config)
10
11
  //! - `GET /health` - Health check endpoint
11
12
  //! - `GET /info` - Server information
12
13
  //! - `GET /cache/stats` - Get cache statistics
@@ -76,19 +77,25 @@
76
77
  //! curl -X POST http://localhost:8000/embed \
77
78
  //! -H "Content-Type: application/json" \
78
79
  //! -d '{"texts":["Hello world","Second text"]}'
80
+ //!
81
+ //! # Chunk text
82
+ //! curl -X POST http://localhost:8000/chunk \
83
+ //! -H "Content-Type: application/json" \
84
+ //! -d '{"text":"Long text to chunk...","chunker_type":"text"}'
79
85
  //! ```
80
86
 
87
+ mod config;
81
88
  mod error;
82
89
  mod handlers;
83
- mod server;
90
+ mod router;
91
+ mod startup;
84
92
  mod types;
85
93
 
94
+ pub use config::load_server_config;
86
95
  pub use error::ApiError;
87
- pub use server::{
88
- create_router, create_router_with_limits, create_router_with_limits_and_server_config, load_server_config, serve,
89
- serve_default, serve_with_config, serve_with_config_and_limits, serve_with_server_config,
90
- };
96
+ pub use router::{create_router, create_router_with_limits, create_router_with_limits_and_server_config};
97
+ pub use startup::{serve, serve_default, serve_with_config, serve_with_config_and_limits, serve_with_server_config};
91
98
  pub use types::{
92
- ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, EmbedRequest, EmbedResponse, ErrorResponse,
93
- ExtractResponse, HealthResponse, InfoResponse,
99
+ ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest,
100
+ EmbedResponse, ErrorResponse, ExtractResponse, HealthResponse, InfoResponse,
94
101
  };
@@ -0,0 +1,214 @@
1
+ //! API router setup and configuration.
2
+
3
+ use std::sync::Arc;
4
+
5
+ use axum::{
6
+ Router,
7
+ extract::DefaultBodyLimit,
8
+ routing::{delete, get, post},
9
+ };
10
+ use tower_http::{
11
+ cors::{AllowOrigin, Any, CorsLayer},
12
+ limit::RequestBodyLimitLayer,
13
+ trace::TraceLayer,
14
+ };
15
+
16
+ use crate::{ExtractionConfig, core::ServerConfig};
17
+
18
+ use super::{
19
+ handlers::{
20
+ cache_clear_handler, cache_stats_handler, chunk_handler, embed_handler, extract_handler, health_handler,
21
+ info_handler,
22
+ },
23
+ types::{ApiSizeLimits, ApiState},
24
+ };
25
+
26
+ /// Create the API router with all routes configured.
27
+ ///
28
+ /// This is public to allow users to embed the router in their own applications.
29
+ ///
30
+ /// # Arguments
31
+ ///
32
+ /// * `config` - Default extraction configuration. Per-request configs override these defaults.
33
+ ///
34
+ /// # Examples
35
+ ///
36
+ /// ```no_run
37
+ /// use kreuzberg::{ExtractionConfig, api::create_router};
38
+ ///
39
+ /// # #[tokio::main]
40
+ /// # async fn main() {
41
+ /// // Create router with default config and size limits
42
+ /// let config = ExtractionConfig::default();
43
+ /// let router = create_router(config);
44
+ /// # }
45
+ /// ```
46
+ pub fn create_router(config: ExtractionConfig) -> Router {
47
+ create_router_with_limits(config, ApiSizeLimits::default())
48
+ }
49
+
50
+ /// Create the API router with custom size limits.
51
+ ///
52
+ /// This allows fine-grained control over request body and multipart field size limits.
53
+ ///
54
+ /// # Arguments
55
+ ///
56
+ /// * `config` - Default extraction configuration. Per-request configs override these defaults.
57
+ /// * `limits` - Size limits for request bodies and multipart uploads.
58
+ ///
59
+ /// # Examples
60
+ ///
61
+ /// ```no_run
62
+ /// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
63
+ ///
64
+ /// # #[tokio::main]
65
+ /// # async fn main() {
66
+ /// // Create router with 50 MB limits
67
+ /// let config = ExtractionConfig::default();
68
+ /// let limits = ApiSizeLimits::from_mb(50, 50);
69
+ /// let router = create_router_with_limits(config, limits);
70
+ /// # }
71
+ /// ```
72
+ ///
73
+ /// ```no_run
74
+ /// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
75
+ /// use tower_http::limit::RequestBodyLimitLayer;
76
+ ///
77
+ /// # #[tokio::main]
78
+ /// # async fn main() {
79
+ /// // Custom limits for very large documents (500 MB)
80
+ /// let config = ExtractionConfig::default();
81
+ /// let limits = ApiSizeLimits::from_mb(500, 500);
82
+ /// let router = create_router_with_limits(config, limits);
83
+ /// # }
84
+ /// ```
85
+ pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits) -> Router {
86
+ create_router_with_limits_and_server_config(config, limits, ServerConfig::default())
87
+ }
88
+
89
+ /// Create the API router with custom size limits and server configuration.
90
+ ///
91
+ /// This function provides full control over request limits, CORS, and server settings via ServerConfig.
92
+ ///
93
+ /// # Arguments
94
+ ///
95
+ /// * `config` - Default extraction configuration. Per-request configs override these defaults.
96
+ /// * `limits` - Size limits for request bodies and multipart uploads.
97
+ /// * `server_config` - Server configuration including host, port, and CORS settings.
98
+ ///
99
+ /// # Examples
100
+ ///
101
+ /// ```no_run
102
+ /// use kreuzberg::{ExtractionConfig, api::create_router_with_limits, core::ServerConfig};
103
+ ///
104
+ /// # #[tokio::main]
105
+ /// # async fn main() -> kreuzberg::Result<()> {
106
+ /// let extraction_config = ExtractionConfig::default();
107
+ /// let mut server_config = ServerConfig::default();
108
+ /// server_config.cors_origins = vec!["https://example.com".to_string()];
109
+ /// let router = create_router_with_limits_and_server_config(
110
+ /// extraction_config,
111
+ /// Default::default(),
112
+ /// server_config
113
+ /// );
114
+ /// # Ok(())
115
+ /// # }
116
+ /// ```
117
+ pub fn create_router_with_limits_and_server_config(
118
+ config: ExtractionConfig,
119
+ limits: ApiSizeLimits,
120
+ server_config: ServerConfig,
121
+ ) -> Router {
122
+ let state = ApiState {
123
+ default_config: Arc::new(config),
124
+ };
125
+
126
+ // CORS configuration based on ServerConfig
127
+ let cors_layer = if server_config.cors_allows_all() {
128
+ tracing::warn!(
129
+ "CORS configured to allow all origins (default). This permits CSRF attacks. \
130
+ For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
131
+ list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
132
+ );
133
+ CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
134
+ } else {
135
+ let origins: Vec<_> = server_config
136
+ .cors_origins
137
+ .iter()
138
+ .filter_map(|s| s.trim().parse::<axum::http::HeaderValue>().ok())
139
+ .collect();
140
+
141
+ if !origins.is_empty() {
142
+ tracing::info!("CORS configured with {} explicit allowed origin(s)", origins.len());
143
+ CorsLayer::new()
144
+ .allow_origin(AllowOrigin::list(origins))
145
+ .allow_methods(Any)
146
+ .allow_headers(Any)
147
+ } else {
148
+ tracing::warn!(
149
+ "CORS origins configured but empty/invalid - falling back to permissive CORS. \
150
+ This allows CSRF attacks. Set explicit origins for production."
151
+ );
152
+ CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
153
+ }
154
+ };
155
+
156
+ Router::new()
157
+ .route("/extract", post(extract_handler))
158
+ .route("/embed", post(embed_handler))
159
+ .route("/chunk", post(chunk_handler))
160
+ .route("/health", get(health_handler))
161
+ .route("/info", get(info_handler))
162
+ .route("/cache/stats", get(cache_stats_handler))
163
+ .route("/cache/clear", delete(cache_clear_handler))
164
+ .layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
165
+ .layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
166
+ .layer(cors_layer)
167
+ .layer(TraceLayer::new_for_http())
168
+ .with_state(state)
169
+ }
170
+
171
#[cfg(test)]
mod tests {
    use super::*;

    /// Building a router from the default extraction config must not panic.
    #[test]
    fn test_create_router() {
        let _router = create_router(ExtractionConfig::default());
    }

    /// Smoke check only: asserts that the built router value has a non-zero
    /// size. It does not inspect which routes are registered.
    #[test]
    fn test_router_has_routes() {
        let router = create_router(ExtractionConfig::default());
        let router_size = std::mem::size_of_val(&router);
        assert!(router_size > 0);
    }

    /// Building a router with explicit 50 MB limits must not panic.
    #[test]
    fn test_create_router_with_limits() {
        let _router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(50, 50));
    }

    /// Building a router from explicit limits plus a default server config must not panic.
    #[test]
    fn test_create_router_with_server_config() {
        let _router = create_router_with_limits_and_server_config(
            ExtractionConfig::default(),
            ApiSizeLimits::from_mb(100, 100),
            ServerConfig::default(),
        );
    }

    /// Building a router with one explicit CORS origin must not panic.
    #[test]
    fn test_server_config_cors_handling() {
        let allowed_origins = vec!["https://example.com".to_string()];
        let server_config = ServerConfig {
            cors_origins: allowed_origins,
            ..Default::default()
        };
        let _router = create_router_with_limits_and_server_config(
            ExtractionConfig::default(),
            ApiSizeLimits::default(),
            server_config,
        );
    }
}