kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,214 @@
1
+ //! API router setup and configuration.
2
+
3
+ use std::sync::Arc;
4
+
5
+ use axum::{
6
+ Router,
7
+ extract::DefaultBodyLimit,
8
+ routing::{delete, get, post},
9
+ };
10
+ use tower_http::{
11
+ cors::{AllowOrigin, Any, CorsLayer},
12
+ limit::RequestBodyLimitLayer,
13
+ trace::TraceLayer,
14
+ };
15
+
16
+ use crate::{ExtractionConfig, core::ServerConfig};
17
+
18
+ use super::{
19
+ handlers::{
20
+ cache_clear_handler, cache_stats_handler, chunk_handler, embed_handler, extract_handler, health_handler,
21
+ info_handler,
22
+ },
23
+ types::{ApiSizeLimits, ApiState},
24
+ };
25
+
26
+ /// Create the API router with all routes configured.
27
+ ///
28
+ /// This is public to allow users to embed the router in their own applications.
29
+ ///
30
+ /// # Arguments
31
+ ///
32
+ /// * `config` - Default extraction configuration. Per-request configs override these defaults.
33
+ ///
34
+ /// # Examples
35
+ ///
36
+ /// ```no_run
37
+ /// use kreuzberg::{ExtractionConfig, api::create_router};
38
+ ///
39
+ /// # #[tokio::main]
40
+ /// # async fn main() {
41
+ /// // Create router with default config and size limits
42
+ /// let config = ExtractionConfig::default();
43
+ /// let router = create_router(config);
44
+ /// # }
45
+ /// ```
46
+ pub fn create_router(config: ExtractionConfig) -> Router {
47
+ create_router_with_limits(config, ApiSizeLimits::default())
48
+ }
49
+
50
+ /// Create the API router with custom size limits.
51
+ ///
52
+ /// This allows fine-grained control over request body and multipart field size limits.
53
+ ///
54
+ /// # Arguments
55
+ ///
56
+ /// * `config` - Default extraction configuration. Per-request configs override these defaults.
57
+ /// * `limits` - Size limits for request bodies and multipart uploads.
58
+ ///
59
+ /// # Examples
60
+ ///
61
+ /// ```no_run
62
+ /// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
63
+ ///
64
+ /// # #[tokio::main]
65
+ /// # async fn main() {
66
+ /// // Create router with 50 MB limits
67
+ /// let config = ExtractionConfig::default();
68
+ /// let limits = ApiSizeLimits::from_mb(50, 50);
69
+ /// let router = create_router_with_limits(config, limits);
70
+ /// # }
71
+ /// ```
72
+ ///
73
+ /// ```no_run
74
+ /// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
75
+ /// use tower_http::limit::RequestBodyLimitLayer;
76
+ ///
77
+ /// # #[tokio::main]
78
+ /// # async fn main() {
79
+ /// // Custom limits for very large documents (500 MB)
80
+ /// let config = ExtractionConfig::default();
81
+ /// let limits = ApiSizeLimits::from_mb(500, 500);
82
+ /// let router = create_router_with_limits(config, limits);
83
+ /// # }
84
+ /// ```
85
+ pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits) -> Router {
86
+ create_router_with_limits_and_server_config(config, limits, ServerConfig::default())
87
+ }
88
+
89
+ /// Create the API router with custom size limits and server configuration.
90
+ ///
91
+ /// This function provides full control over request limits, CORS, and server settings via ServerConfig.
92
+ ///
93
+ /// # Arguments
94
+ ///
95
+ /// * `config` - Default extraction configuration. Per-request configs override these defaults.
96
+ /// * `limits` - Size limits for request bodies and multipart uploads.
97
+ /// * `server_config` - Server configuration including host, port, and CORS settings.
98
+ ///
99
+ /// # Examples
100
+ ///
101
+ /// ```no_run
102
+ /// use kreuzberg::{ExtractionConfig, api::create_router_with_limits, core::ServerConfig};
103
+ ///
104
+ /// # #[tokio::main]
105
+ /// # async fn main() -> kreuzberg::Result<()> {
106
+ /// let extraction_config = ExtractionConfig::default();
107
+ /// let mut server_config = ServerConfig::default();
108
+ /// server_config.cors_origins = vec!["https://example.com".to_string()];
109
+ /// let router = create_router_with_limits_and_server_config(
110
+ /// extraction_config,
111
+ /// Default::default(),
112
+ /// server_config
113
+ /// );
114
+ /// # Ok(())
115
+ /// # }
116
+ /// ```
117
+ pub fn create_router_with_limits_and_server_config(
118
+ config: ExtractionConfig,
119
+ limits: ApiSizeLimits,
120
+ server_config: ServerConfig,
121
+ ) -> Router {
122
+ let state = ApiState {
123
+ default_config: Arc::new(config),
124
+ };
125
+
126
+ // CORS configuration based on ServerConfig
127
+ let cors_layer = if server_config.cors_allows_all() {
128
+ tracing::warn!(
129
+ "CORS configured to allow all origins (default). This permits CSRF attacks. \
130
+ For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
131
+ list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
132
+ );
133
+ CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
134
+ } else {
135
+ let origins: Vec<_> = server_config
136
+ .cors_origins
137
+ .iter()
138
+ .filter_map(|s| s.trim().parse::<axum::http::HeaderValue>().ok())
139
+ .collect();
140
+
141
+ if !origins.is_empty() {
142
+ tracing::info!("CORS configured with {} explicit allowed origin(s)", origins.len());
143
+ CorsLayer::new()
144
+ .allow_origin(AllowOrigin::list(origins))
145
+ .allow_methods(Any)
146
+ .allow_headers(Any)
147
+ } else {
148
+ tracing::warn!(
149
+ "CORS origins configured but empty/invalid - falling back to permissive CORS. \
150
+ This allows CSRF attacks. Set explicit origins for production."
151
+ );
152
+ CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
153
+ }
154
+ };
155
+
156
+ Router::new()
157
+ .route("/extract", post(extract_handler))
158
+ .route("/embed", post(embed_handler))
159
+ .route("/chunk", post(chunk_handler))
160
+ .route("/health", get(health_handler))
161
+ .route("/info", get(info_handler))
162
+ .route("/cache/stats", get(cache_stats_handler))
163
+ .route("/cache/clear", delete(cache_clear_handler))
164
+ .layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
165
+ .layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
166
+ .layer(cors_layer)
167
+ .layer(TraceLayer::new_for_http())
168
+ .with_state(state)
169
+ }
170
+
171
#[cfg(test)]
mod tests {
    use super::*;

    /// Building a router from the default extraction config must not panic.
    #[test]
    fn test_create_router() {
        let _router = create_router(ExtractionConfig::default());
    }

    /// The constructed router is a non-zero-sized value.
    #[test]
    fn test_router_has_routes() {
        let router = create_router(ExtractionConfig::default());
        let router_size = std::mem::size_of_val(&router);
        assert!(router_size > 0);
    }

    /// Building a router with explicit 50 MB limits must not panic.
    #[test]
    fn test_create_router_with_limits() {
        let _router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(50, 50));
    }

    /// Building a router with explicit limits and a default server config must not panic.
    #[test]
    fn test_create_router_with_server_config() {
        let _router = create_router_with_limits_and_server_config(
            ExtractionConfig::default(),
            ApiSizeLimits::from_mb(100, 100),
            ServerConfig::default(),
        );
    }

    /// Building a router with one explicit CORS origin must not panic.
    #[test]
    fn test_server_config_cors_handling() {
        let with_origin = ServerConfig {
            cors_origins: vec![String::from("https://example.com")],
            ..Default::default()
        };
        let _router = create_router_with_limits_and_server_config(
            ExtractionConfig::default(),
            ApiSizeLimits::default(),
            with_origin,
        );
    }
}
@@ -0,0 +1,243 @@
1
+ //! API server startup functions.
2
+
3
+ use std::net::{IpAddr, SocketAddr};
4
+
5
+ use crate::{ExtractionConfig, Result, core::ServerConfig};
6
+
7
+ use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
8
+
9
+ /// Start the API server with config file discovery.
10
+ ///
11
+ /// Searches for kreuzberg.toml/yaml/json in current and parent directories.
12
+ /// If no config file is found, uses default configuration.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
17
+ /// * `port` - Port number to bind to (e.g., 8000)
18
+ ///
19
+ /// # Examples
20
+ ///
21
+ /// ```no_run
22
+ /// use kreuzberg::api::serve;
23
+ ///
24
+ /// #[tokio::main]
25
+ /// async fn main() -> kreuzberg::Result<()> {
26
+ /// // Local development
27
+ /// serve("127.0.0.1", 8000).await?;
28
+ /// Ok(())
29
+ /// }
30
+ /// ```
31
+ ///
32
+ /// ```no_run
33
+ /// use kreuzberg::api::serve;
34
+ ///
35
+ /// #[tokio::main]
36
+ /// async fn main() -> kreuzberg::Result<()> {
37
+ /// // Docker/production (listen on all interfaces)
38
+ /// serve("0.0.0.0", 8000).await?;
39
+ /// Ok(())
40
+ /// }
41
+ /// ```
42
+ ///
43
+ /// # Environment Variables
44
+ ///
45
+ /// ```bash
46
+ /// # Python/Docker usage
47
+ /// export KREUZBERG_HOST=0.0.0.0
48
+ /// export KREUZBERG_PORT=8000
49
+ ///
50
+ /// # CORS configuration (IMPORTANT for production security)
51
+ /// # Default: allows all origins (permits CSRF attacks)
52
+ /// # Production: set to comma-separated list of allowed origins
53
+ /// export KREUZBERG_CORS_ORIGINS="https://app.example.com,https://api.example.com"
54
+ ///
55
+ /// # Upload size limits (default: 100 MB)
56
+ /// # Modern approach (in bytes):
57
+ /// export KREUZBERG_MAX_REQUEST_BODY_BYTES=104857600 # 100 MB
58
+ /// export KREUZBERG_MAX_MULTIPART_FIELD_BYTES=104857600 # 100 MB per file
59
+ ///
60
+ /// # Legacy approach (in MB, applies to both limits):
61
+ /// export KREUZBERG_MAX_UPLOAD_SIZE_MB=100 # 100 MB
62
+ ///
63
+ /// python -m kreuzberg.api
64
+ /// ```
65
+ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
66
+ let extraction_config = match ExtractionConfig::discover()? {
67
+ Some(config) => {
68
+ tracing::info!("Loaded extraction config from discovered file");
69
+ config
70
+ }
71
+ None => {
72
+ tracing::info!("No config file found, using default configuration");
73
+ ExtractionConfig::default()
74
+ }
75
+ };
76
+
77
+ let server_config = load_server_config(None)?;
78
+ let limits = ApiSizeLimits::new(
79
+ server_config.max_request_body_bytes,
80
+ server_config.max_multipart_field_bytes,
81
+ );
82
+
83
+ serve_with_config_and_limits(host, port, extraction_config, limits).await
84
+ }
85
+
86
+ /// Start the API server with explicit config.
87
+ ///
88
+ /// Uses default size limits (100 MB). For custom limits, use `serve_with_config_and_limits`.
89
+ ///
90
+ /// # Arguments
91
+ ///
92
+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
93
+ /// * `port` - Port number to bind to (e.g., 8000)
94
+ /// * `config` - Default extraction configuration for all requests
95
+ ///
96
+ /// # Examples
97
+ ///
98
+ /// ```no_run
99
+ /// use kreuzberg::{ExtractionConfig, api::serve_with_config};
100
+ ///
101
+ /// #[tokio::main]
102
+ /// async fn main() -> kreuzberg::Result<()> {
103
+ /// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
104
+ /// serve_with_config("127.0.0.1", 8000, config).await?;
105
+ /// Ok(())
106
+ /// }
107
+ /// ```
108
+ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: ExtractionConfig) -> Result<()> {
109
+ let limits = ApiSizeLimits::default();
110
+ tracing::info!(
111
+ "Upload size limit: 100 MB (default, {} bytes)",
112
+ limits.max_request_body_bytes
113
+ );
114
+ serve_with_config_and_limits(host, port, config, limits).await
115
+ }
116
+
117
+ /// Start the API server with explicit config and size limits.
118
+ ///
119
+ /// # Arguments
120
+ ///
121
+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
122
+ /// * `port` - Port number to bind to (e.g., 8000)
123
+ /// * `config` - Default extraction configuration for all requests
124
+ /// * `limits` - Size limits for request bodies and multipart uploads
125
+ ///
126
+ /// # Examples
127
+ ///
128
+ /// ```no_run
129
+ /// use kreuzberg::{ExtractionConfig, api::{serve_with_config_and_limits, ApiSizeLimits}};
130
+ ///
131
+ /// #[tokio::main]
132
+ /// async fn main() -> kreuzberg::Result<()> {
133
+ /// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
134
+ /// let limits = ApiSizeLimits::from_mb(200, 200);
135
+ /// serve_with_config_and_limits("127.0.0.1", 8000, config, limits).await?;
136
+ /// Ok(())
137
+ /// }
138
+ /// ```
139
+ pub async fn serve_with_config_and_limits(
140
+ host: impl AsRef<str>,
141
+ port: u16,
142
+ config: ExtractionConfig,
143
+ limits: ApiSizeLimits,
144
+ ) -> Result<()> {
145
+ let ip: IpAddr = host
146
+ .as_ref()
147
+ .parse()
148
+ .map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
149
+
150
+ let server_config = ServerConfig {
151
+ host: host.as_ref().to_string(),
152
+ port,
153
+ max_request_body_bytes: limits.max_request_body_bytes,
154
+ max_multipart_field_bytes: limits.max_multipart_field_bytes,
155
+ ..Default::default()
156
+ };
157
+
158
+ let addr = SocketAddr::new(ip, port);
159
+ let app = create_router_with_limits_and_server_config(config, limits, server_config);
160
+
161
+ tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
162
+
163
+ let listener = tokio::net::TcpListener::bind(addr)
164
+ .await
165
+ .map_err(crate::error::KreuzbergError::Io)?;
166
+
167
+ axum::serve(listener, app)
168
+ .await
169
+ .map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
170
+
171
+ Ok(())
172
+ }
173
+
174
+ /// Start the API server with explicit extraction config and server config.
175
+ ///
176
+ /// This function accepts a fully-configured ServerConfig, including CORS origins,
177
+ /// size limits, host, and port. It respects all ServerConfig fields without
178
+ /// re-parsing environment variables, making it ideal for CLI usage where
179
+ /// configuration precedence has already been applied.
180
+ ///
181
+ /// # Arguments
182
+ ///
183
+ /// * `extraction_config` - Default extraction configuration for all requests
184
+ /// * `server_config` - Server configuration including host, port, CORS, and size limits
185
+ ///
186
+ /// # Examples
187
+ ///
188
+ /// ```no_run
189
+ /// use kreuzberg::{ExtractionConfig, api::serve_with_server_config, core::ServerConfig};
190
+ ///
191
+ /// #[tokio::main]
192
+ /// async fn main() -> kreuzberg::Result<()> {
193
+ /// let extraction_config = ExtractionConfig::default();
194
+ /// let mut server_config = ServerConfig::default();
195
+ /// server_config.host = "0.0.0.0".to_string();
196
+ /// server_config.port = 3000;
197
+ /// server_config.cors_origins = vec!["https://example.com".to_string()];
198
+ ///
199
+ /// serve_with_server_config(extraction_config, server_config).await?;
200
+ /// Ok(())
201
+ /// }
202
+ /// ```
203
+ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, server_config: ServerConfig) -> Result<()> {
204
+ let ip: IpAddr = server_config
205
+ .host
206
+ .parse()
207
+ .map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
208
+
209
+ let limits = ApiSizeLimits::new(
210
+ server_config.max_request_body_bytes,
211
+ server_config.max_multipart_field_bytes,
212
+ );
213
+
214
+ let addr = SocketAddr::new(ip, server_config.port);
215
+ let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
216
+
217
+ tracing::info!(
218
+ "Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
219
+ ip,
220
+ server_config.port,
221
+ server_config.max_request_body_mb(),
222
+ server_config.max_multipart_field_mb()
223
+ );
224
+
225
+ let listener = tokio::net::TcpListener::bind(addr)
226
+ .await
227
+ .map_err(crate::error::KreuzbergError::Io)?;
228
+
229
+ axum::serve(listener, app)
230
+ .await
231
+ .map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
232
+
233
+ Ok(())
234
+ }
235
+
236
+ /// Start the API server with default host and port.
237
+ ///
238
+ /// Defaults: host = "127.0.0.1", port = 8000
239
+ ///
240
+ /// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
241
+ pub async fn serve_default() -> Result<()> {
242
+ serve("127.0.0.1", 8000).await
243
+ }
@@ -204,3 +204,81 @@ pub struct EmbedResponse {
204
204
  /// Number of embeddings generated
205
205
  pub count: usize,
206
206
  }
207
+
208
/// Chunker type used when none is supplied: `"text"`.
fn default_chunker_type() -> String {
    String::from("text")
}
212
+
213
+ /// Chunk request with text and configuration.
214
+ #[derive(Debug, Clone, Serialize, Deserialize)]
215
+ pub struct ChunkRequest {
216
+ /// Text to chunk
217
+ pub text: String,
218
+ /// Optional chunking configuration
219
+ #[serde(skip_serializing_if = "Option::is_none")]
220
+ pub config: Option<ChunkingConfigRequest>,
221
+ /// Chunker type (text or markdown)
222
+ #[serde(default = "default_chunker_type")]
223
+ pub chunker_type: String,
224
+ }
225
+
226
+ /// Chunking configuration request.
227
+ #[derive(Debug, Clone, Default, Serialize, Deserialize)]
228
+ pub struct ChunkingConfigRequest {
229
+ /// Maximum characters per chunk
230
+ pub max_characters: Option<usize>,
231
+ /// Overlap between chunks in characters
232
+ pub overlap: Option<usize>,
233
+ /// Whether to trim whitespace
234
+ pub trim: Option<bool>,
235
+ }
236
+
237
+ /// Chunk response with chunks and metadata.
238
+ #[derive(Debug, Clone, Serialize, Deserialize)]
239
+ pub struct ChunkResponse {
240
+ /// List of chunks
241
+ pub chunks: Vec<ChunkItem>,
242
+ /// Total number of chunks
243
+ pub chunk_count: usize,
244
+ /// Configuration used for chunking
245
+ pub config: ChunkingConfigResponse,
246
+ /// Input text size in bytes
247
+ pub input_size_bytes: usize,
248
+ /// Chunker type used for chunking
249
+ pub chunker_type: String,
250
+ }
251
+
252
+ /// Individual chunk item with metadata.
253
+ #[derive(Debug, Clone, Serialize, Deserialize)]
254
+ pub struct ChunkItem {
255
+ /// Chunk content
256
+ pub content: String,
257
+ /// Byte offset start position
258
+ pub byte_start: usize,
259
+ /// Byte offset end position
260
+ pub byte_end: usize,
261
+ /// Index of this chunk (0-based)
262
+ pub chunk_index: usize,
263
+ /// Total number of chunks
264
+ pub total_chunks: usize,
265
+ /// First page number (optional, for PDF chunking)
266
+ #[serde(skip_serializing_if = "Option::is_none")]
267
+ pub first_page: Option<usize>,
268
+ /// Last page number (optional, for PDF chunking)
269
+ #[serde(skip_serializing_if = "Option::is_none")]
270
+ pub last_page: Option<usize>,
271
+ }
272
+
273
+ /// Chunking configuration response.
274
+ #[derive(Debug, Clone, Serialize, Deserialize)]
275
+ pub struct ChunkingConfigResponse {
276
+ /// Maximum characters per chunk
277
+ pub max_characters: usize,
278
+ /// Overlap between chunks in characters
279
+ pub overlap: usize,
280
+ /// Whether whitespace was trimmed
281
+ pub trim: bool,
282
+ /// Type of chunker used
283
+ pub chunker_type: String,
284
+ }