kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -1,518 +0,0 @@
1
- //! API server setup and configuration.
2
-
3
- use std::{net::SocketAddr, sync::Arc};
4
-
5
- use axum::{
6
- Router,
7
- extract::DefaultBodyLimit,
8
- routing::{delete, get, post},
9
- };
10
- use tower_http::{
11
- cors::{AllowOrigin, Any, CorsLayer},
12
- limit::RequestBodyLimitLayer,
13
- trace::TraceLayer,
14
- };
15
-
16
- use crate::{ExtractionConfig, Result, core::ServerConfig};
17
-
18
- use super::{
19
- handlers::{
20
- cache_clear_handler, cache_stats_handler, embed_handler, extract_handler, health_handler, info_handler,
21
- },
22
- types::{ApiSizeLimits, ApiState},
23
- };
24
-
25
- /// Load ServerConfig with proper precedence order.
26
- ///
27
- /// This function implements the configuration hierarchy:
28
- /// 1. File (if provided)
29
- /// 2. Environment variables (via apply_env_overrides)
30
- /// 3. Defaults
31
- ///
32
- /// The config file can be in flat format (server settings at root) or nested format
33
- /// (server settings under [server] section alongside other configs like [ocr]).
34
- ///
35
- /// # Arguments
36
- ///
37
- /// * `config_path` - Optional path to a ServerConfig file (TOML, YAML, or JSON)
38
- ///
39
- /// # Returns
40
- ///
41
- /// A configured ServerConfig with proper precedence applied.
42
- ///
43
- /// # Errors
44
- ///
45
- /// Returns an error if:
46
- /// - The config file path is provided but cannot be read
47
- /// - The config file contains invalid server configuration
48
- /// - Environment variable overrides contain invalid values
49
- ///
50
- /// # Examples
51
- ///
52
- /// ```no_run
53
- /// use kreuzberg::api::load_server_config;
54
- /// use std::path::Path;
55
- ///
56
- /// # fn example() -> kreuzberg::Result<()> {
57
- /// // Load from file with env overrides
58
- /// let config = load_server_config(Some(Path::new("server.toml")))?;
59
- ///
60
- /// // Or use defaults with env overrides
61
- /// let config = load_server_config(None)?;
62
- /// # Ok(())
63
- /// # }
64
- /// ```
65
- pub fn load_server_config(config_path: Option<&std::path::Path>) -> Result<ServerConfig> {
66
- let mut config = if let Some(path) = config_path {
67
- ServerConfig::from_file(path)?
68
- } else {
69
- ServerConfig::default()
70
- };
71
-
72
- // Apply environment variable overrides with proper logging
73
- config.apply_env_overrides()?;
74
-
75
- tracing::info!(
76
- "Server configuration loaded: host={}, port={}, request_body_limit={} MB, multipart_field_limit={} MB, CORS={}",
77
- config.host,
78
- config.port,
79
- config.max_request_body_mb(),
80
- config.max_multipart_field_mb(),
81
- if config.cors_allows_all() {
82
- "allow all origins".to_string()
83
- } else {
84
- format!("{} specific origins", config.cors_origins.len())
85
- }
86
- );
87
-
88
- Ok(config)
89
- }
90
-
91
- /// Create the API router with all routes configured.
92
- ///
93
- /// This is public to allow users to embed the router in their own applications.
94
- ///
95
- /// # Arguments
96
- ///
97
- /// * `config` - Default extraction configuration. Per-request configs override these defaults.
98
- ///
99
- /// # Examples
100
- ///
101
- /// ```no_run
102
- /// use kreuzberg::{ExtractionConfig, api::create_router};
103
- ///
104
- /// # #[tokio::main]
105
- /// # async fn main() {
106
- /// // Create router with default config and size limits
107
- /// let config = ExtractionConfig::default();
108
- /// let router = create_router(config);
109
- /// # }
110
- /// ```
111
- pub fn create_router(config: ExtractionConfig) -> Router {
112
- create_router_with_limits(config, ApiSizeLimits::default())
113
- }
114
-
115
- /// Create the API router with custom size limits.
116
- ///
117
- /// This allows fine-grained control over request body and multipart field size limits.
118
- ///
119
- /// # Arguments
120
- ///
121
- /// * `config` - Default extraction configuration. Per-request configs override these defaults.
122
- /// * `limits` - Size limits for request bodies and multipart uploads.
123
- ///
124
- /// # Examples
125
- ///
126
- /// ```no_run
127
- /// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
128
- ///
129
- /// # #[tokio::main]
130
- /// # async fn main() {
131
- /// // Create router with 50 MB limits
132
- /// let config = ExtractionConfig::default();
133
- /// let limits = ApiSizeLimits::from_mb(50, 50);
134
- /// let router = create_router_with_limits(config, limits);
135
- /// # }
136
- /// ```
137
- ///
138
- /// ```no_run
139
- /// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
140
- /// use tower_http::limit::RequestBodyLimitLayer;
141
- ///
142
- /// # #[tokio::main]
143
- /// # async fn main() {
144
- /// // Custom limits for very large documents (500 MB)
145
- /// let config = ExtractionConfig::default();
146
- /// let limits = ApiSizeLimits::from_mb(500, 500);
147
- /// let router = create_router_with_limits(config, limits);
148
- /// # }
149
- /// ```
150
- pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits) -> Router {
151
- create_router_with_limits_and_server_config(config, limits, ServerConfig::default())
152
- }
153
-
154
- /// Create the API router with custom size limits and server configuration.
155
- ///
156
- /// This function provides full control over request limits, CORS, and server settings via ServerConfig.
157
- ///
158
- /// # Arguments
159
- ///
160
- /// * `config` - Default extraction configuration. Per-request configs override these defaults.
161
- /// * `limits` - Size limits for request bodies and multipart uploads.
162
- /// * `server_config` - Server configuration including host, port, and CORS settings.
163
- ///
164
- /// # Examples
165
- ///
166
- /// ```no_run
167
- /// use kreuzberg::{ExtractionConfig, api::create_router_with_limits, core::ServerConfig};
168
- ///
169
- /// # #[tokio::main]
170
- /// # async fn main() -> kreuzberg::Result<()> {
171
- /// let extraction_config = ExtractionConfig::default();
172
- /// let mut server_config = ServerConfig::default();
173
- /// server_config.cors_origins = vec!["https://example.com".to_string()];
174
- /// let router = create_router_with_limits_and_server_config(
175
- /// extraction_config,
176
- /// Default::default(),
177
- /// server_config
178
- /// );
179
- /// # Ok(())
180
- /// # }
181
- /// ```
182
- pub fn create_router_with_limits_and_server_config(
183
- config: ExtractionConfig,
184
- limits: ApiSizeLimits,
185
- server_config: ServerConfig,
186
- ) -> Router {
187
- let state = ApiState {
188
- default_config: Arc::new(config),
189
- };
190
-
191
- // CORS configuration based on ServerConfig
192
- let cors_layer = if server_config.cors_allows_all() {
193
- tracing::warn!(
194
- "CORS configured to allow all origins (default). This permits CSRF attacks. \
195
- For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
196
- list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
197
- );
198
- CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
199
- } else {
200
- let origins: Vec<_> = server_config
201
- .cors_origins
202
- .iter()
203
- .filter_map(|s| s.trim().parse::<axum::http::HeaderValue>().ok())
204
- .collect();
205
-
206
- if !origins.is_empty() {
207
- tracing::info!("CORS configured with {} explicit allowed origin(s)", origins.len());
208
- CorsLayer::new()
209
- .allow_origin(AllowOrigin::list(origins))
210
- .allow_methods(Any)
211
- .allow_headers(Any)
212
- } else {
213
- tracing::warn!(
214
- "CORS origins configured but empty/invalid - falling back to permissive CORS. \
215
- This allows CSRF attacks. Set explicit origins for production."
216
- );
217
- CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
218
- }
219
- };
220
-
221
- Router::new()
222
- .route("/extract", post(extract_handler))
223
- .route("/embed", post(embed_handler))
224
- .route("/health", get(health_handler))
225
- .route("/info", get(info_handler))
226
- .route("/cache/stats", get(cache_stats_handler))
227
- .route("/cache/clear", delete(cache_clear_handler))
228
- .layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
229
- .layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
230
- .layer(cors_layer)
231
- .layer(TraceLayer::new_for_http())
232
- .with_state(state)
233
- }
234
-
235
- /// Start the API server with config file discovery.
236
- ///
237
- /// Searches for kreuzberg.toml/yaml/json in current and parent directories.
238
- /// If no config file is found, uses default configuration.
239
- ///
240
- /// # Arguments
241
- ///
242
- /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
243
- /// * `port` - Port number to bind to (e.g., 8000)
244
- ///
245
- /// # Examples
246
- ///
247
- /// ```no_run
248
- /// use kreuzberg::api::serve;
249
- ///
250
- /// #[tokio::main]
251
- /// async fn main() -> kreuzberg::Result<()> {
252
- /// // Local development
253
- /// serve("127.0.0.1", 8000).await?;
254
- /// Ok(())
255
- /// }
256
- /// ```
257
- ///
258
- /// ```no_run
259
- /// use kreuzberg::api::serve;
260
- ///
261
- /// #[tokio::main]
262
- /// async fn main() -> kreuzberg::Result<()> {
263
- /// // Docker/production (listen on all interfaces)
264
- /// serve("0.0.0.0", 8000).await?;
265
- /// Ok(())
266
- /// }
267
- /// ```
268
- ///
269
- /// # Environment Variables
270
- ///
271
- /// ```bash
272
- /// # Python/Docker usage
273
- /// export KREUZBERG_HOST=0.0.0.0
274
- /// export KREUZBERG_PORT=8000
275
- ///
276
- /// # CORS configuration (IMPORTANT for production security)
277
- /// # Default: allows all origins (permits CSRF attacks)
278
- /// # Production: set to comma-separated list of allowed origins
279
- /// export KREUZBERG_CORS_ORIGINS="https://app.example.com,https://api.example.com"
280
- ///
281
- /// # Upload size limits (default: 100 MB)
282
- /// # Modern approach (in bytes):
283
- /// export KREUZBERG_MAX_REQUEST_BODY_BYTES=104857600 # 100 MB
284
- /// export KREUZBERG_MAX_MULTIPART_FIELD_BYTES=104857600 # 100 MB per file
285
- ///
286
- /// # Legacy approach (in MB, applies to both limits):
287
- /// export KREUZBERG_MAX_UPLOAD_SIZE_MB=100 # 100 MB
288
- ///
289
- /// python -m kreuzberg.api
290
- /// ```
291
- pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
292
- let extraction_config = match ExtractionConfig::discover()? {
293
- Some(config) => {
294
- tracing::info!("Loaded extraction config from discovered file");
295
- config
296
- }
297
- None => {
298
- tracing::info!("No config file found, using default configuration");
299
- ExtractionConfig::default()
300
- }
301
- };
302
-
303
- let server_config = load_server_config(None)?;
304
- let limits = ApiSizeLimits::new(
305
- server_config.max_request_body_bytes,
306
- server_config.max_multipart_field_bytes,
307
- );
308
-
309
- serve_with_config_and_limits(host, port, extraction_config, limits).await
310
- }
311
-
312
- /// Start the API server with explicit config.
313
- ///
314
- /// Uses default size limits (100 MB). For custom limits, use `serve_with_config_and_limits`.
315
- ///
316
- /// # Arguments
317
- ///
318
- /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
319
- /// * `port` - Port number to bind to (e.g., 8000)
320
- /// * `config` - Default extraction configuration for all requests
321
- ///
322
- /// # Examples
323
- ///
324
- /// ```no_run
325
- /// use kreuzberg::{ExtractionConfig, api::serve_with_config};
326
- ///
327
- /// #[tokio::main]
328
- /// async fn main() -> kreuzberg::Result<()> {
329
- /// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
330
- /// serve_with_config("127.0.0.1", 8000, config).await?;
331
- /// Ok(())
332
- /// }
333
- /// ```
334
- pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: ExtractionConfig) -> Result<()> {
335
- let limits = ApiSizeLimits::default();
336
- tracing::info!(
337
- "Upload size limit: 100 MB (default, {} bytes)",
338
- limits.max_request_body_bytes
339
- );
340
- serve_with_config_and_limits(host, port, config, limits).await
341
- }
342
-
343
- /// Start the API server with explicit config and size limits.
344
- ///
345
- /// # Arguments
346
- ///
347
- /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
348
- /// * `port` - Port number to bind to (e.g., 8000)
349
- /// * `config` - Default extraction configuration for all requests
350
- /// * `limits` - Size limits for request bodies and multipart uploads
351
- ///
352
- /// # Examples
353
- ///
354
- /// ```no_run
355
- /// use kreuzberg::{ExtractionConfig, api::{serve_with_config_and_limits, ApiSizeLimits}};
356
- ///
357
- /// #[tokio::main]
358
- /// async fn main() -> kreuzberg::Result<()> {
359
- /// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
360
- /// let limits = ApiSizeLimits::from_mb(200, 200);
361
- /// serve_with_config_and_limits("127.0.0.1", 8000, config, limits).await?;
362
- /// Ok(())
363
- /// }
364
- /// ```
365
- pub async fn serve_with_config_and_limits(
366
- host: impl AsRef<str>,
367
- port: u16,
368
- config: ExtractionConfig,
369
- limits: ApiSizeLimits,
370
- ) -> Result<()> {
371
- use std::net::IpAddr;
372
-
373
- let ip: IpAddr = host
374
- .as_ref()
375
- .parse()
376
- .map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
377
-
378
- let server_config = ServerConfig {
379
- host: host.as_ref().to_string(),
380
- port,
381
- max_request_body_bytes: limits.max_request_body_bytes,
382
- max_multipart_field_bytes: limits.max_multipart_field_bytes,
383
- ..Default::default()
384
- };
385
-
386
- let addr = SocketAddr::new(ip, port);
387
- let app = create_router_with_limits_and_server_config(config, limits, server_config);
388
-
389
- tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
390
-
391
- let listener = tokio::net::TcpListener::bind(addr)
392
- .await
393
- .map_err(crate::error::KreuzbergError::Io)?;
394
-
395
- axum::serve(listener, app)
396
- .await
397
- .map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
398
-
399
- Ok(())
400
- }
401
-
402
- /// Start the API server with explicit extraction config and server config.
403
- ///
404
- /// This function accepts a fully-configured ServerConfig, including CORS origins,
405
- /// size limits, host, and port. It respects all ServerConfig fields without
406
- /// re-parsing environment variables, making it ideal for CLI usage where
407
- /// configuration precedence has already been applied.
408
- ///
409
- /// # Arguments
410
- ///
411
- /// * `extraction_config` - Default extraction configuration for all requests
412
- /// * `server_config` - Server configuration including host, port, CORS, and size limits
413
- ///
414
- /// # Examples
415
- ///
416
- /// ```no_run
417
- /// use kreuzberg::{ExtractionConfig, api::serve_with_server_config, core::ServerConfig};
418
- ///
419
- /// #[tokio::main]
420
- /// async fn main() -> kreuzberg::Result<()> {
421
- /// let extraction_config = ExtractionConfig::default();
422
- /// let mut server_config = ServerConfig::default();
423
- /// server_config.host = "0.0.0.0".to_string();
424
- /// server_config.port = 3000;
425
- /// server_config.cors_origins = vec!["https://example.com".to_string()];
426
- ///
427
- /// serve_with_server_config(extraction_config, server_config).await?;
428
- /// Ok(())
429
- /// }
430
- /// ```
431
- pub async fn serve_with_server_config(extraction_config: ExtractionConfig, server_config: ServerConfig) -> Result<()> {
432
- use std::net::IpAddr;
433
-
434
- let ip: IpAddr = server_config
435
- .host
436
- .parse()
437
- .map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
438
-
439
- let limits = ApiSizeLimits::new(
440
- server_config.max_request_body_bytes,
441
- server_config.max_multipart_field_bytes,
442
- );
443
-
444
- let addr = SocketAddr::new(ip, server_config.port);
445
- let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
446
-
447
- tracing::info!(
448
- "Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
449
- ip,
450
- server_config.port,
451
- server_config.max_request_body_mb(),
452
- server_config.max_multipart_field_mb()
453
- );
454
-
455
- let listener = tokio::net::TcpListener::bind(addr)
456
- .await
457
- .map_err(crate::error::KreuzbergError::Io)?;
458
-
459
- axum::serve(listener, app)
460
- .await
461
- .map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
462
-
463
- Ok(())
464
- }
465
-
466
- /// Start the API server with default host and port.
467
- ///
468
- /// Defaults: host = "127.0.0.1", port = 8000
469
- ///
470
- /// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
471
- pub async fn serve_default() -> Result<()> {
472
- serve("127.0.0.1", 8000).await
473
- }
474
-
475
#[cfg(test)]
#[allow(unsafe_code)]
mod tests {
    use super::*;

    /// Building a router from the default extraction config completes without panicking.
    #[test]
    fn test_create_router() {
        let _router = create_router(ExtractionConfig::default());
    }

    /// The router built from the default config is a value of non-zero size.
    #[test]
    fn test_router_has_routes() {
        let router = create_router(ExtractionConfig::default());
        let router_size = size_of_val(&router);
        assert!(router_size > 0);
    }

    /// Building a router with custom 50 MB limits completes without panicking.
    #[test]
    fn test_create_router_with_limits() {
        let _router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(50, 50));
    }

    /// Building a router from explicit limits plus a default server config completes without panicking.
    #[test]
    fn test_create_router_with_server_config() {
        let _router = create_router_with_limits_and_server_config(
            ExtractionConfig::default(),
            ApiSizeLimits::from_mb(100, 100),
            ServerConfig::default(),
        );
    }

    /// Building a router from a server config with one explicit CORS origin completes without panicking.
    #[test]
    fn test_server_config_cors_handling() {
        let extraction = ExtractionConfig::default();
        let size_limits = ApiSizeLimits::default();
        let with_cors = ServerConfig {
            cors_origins: vec!["https://example.com".to_string()],
            ..Default::default()
        };
        let _router = create_router_with_limits_and_server_config(extraction, size_limits, with_cors);
    }
}