kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,312 @@
1
+ //! Post-processing and chunking configuration.
2
+ //!
3
+ //! Defines configuration for post-processing pipelines, text chunking,
4
+ //! and embedding generation.
5
+
6
+ use serde::{Deserialize, Serialize};
7
+ use std::collections::HashSet;
8
+ use std::path::PathBuf;
9
+
10
+ /// Post-processor configuration.
11
+ #[derive(Debug, Clone, Serialize, Deserialize)]
12
+ pub struct PostProcessorConfig {
13
+ /// Enable post-processors
14
+ #[serde(default = "default_true")]
15
+ pub enabled: bool,
16
+
17
+ /// Whitelist of processor names to run (None = all enabled)
18
+ #[serde(default)]
19
+ pub enabled_processors: Option<Vec<String>>,
20
+
21
+ /// Blacklist of processor names to skip (None = none disabled)
22
+ #[serde(default)]
23
+ pub disabled_processors: Option<Vec<String>>,
24
+
25
+ /// Pre-computed HashSet for O(1) enabled processor lookup
26
+ #[serde(skip)]
27
+ pub enabled_set: Option<HashSet<String>>,
28
+
29
+ /// Pre-computed HashSet for O(1) disabled processor lookup
30
+ #[serde(skip)]
31
+ pub disabled_set: Option<HashSet<String>>,
32
+ }
33
+
34
+ impl PostProcessorConfig {
35
+ /// Pre-compute HashSets for O(1) processor name lookups.
36
+ ///
37
+ /// This method converts the enabled/disabled processor Vec to HashSet
38
+ /// for constant-time lookups in the pipeline.
39
+ pub fn build_lookup_sets(&mut self) {
40
+ if let Some(ref enabled) = self.enabled_processors {
41
+ self.enabled_set = Some(enabled.iter().cloned().collect());
42
+ }
43
+ if let Some(ref disabled) = self.disabled_processors {
44
+ self.disabled_set = Some(disabled.iter().cloned().collect());
45
+ }
46
+ }
47
+ }
48
+
49
+ impl Default for PostProcessorConfig {
50
+ fn default() -> Self {
51
+ Self {
52
+ enabled: true,
53
+ enabled_processors: None,
54
+ disabled_processors: None,
55
+ enabled_set: None,
56
+ disabled_set: None,
57
+ }
58
+ }
59
+ }
60
+
61
+ /// Chunking configuration.
62
+ #[derive(Debug, Clone, Serialize, Deserialize)]
63
+ pub struct ChunkingConfig {
64
+ /// Maximum characters per chunk
65
+ #[serde(default = "default_chunk_size")]
66
+ pub max_chars: usize,
67
+
68
+ /// Overlap between chunks in characters
69
+ #[serde(default = "default_chunk_overlap")]
70
+ pub max_overlap: usize,
71
+
72
+ /// Optional embedding configuration for chunk embeddings
73
+ #[serde(skip_serializing_if = "Option::is_none")]
74
+ pub embedding: Option<EmbeddingConfig>,
75
+
76
+ /// Use a preset configuration (overrides individual settings if provided)
77
+ #[serde(skip_serializing_if = "Option::is_none")]
78
+ pub preset: Option<String>,
79
+ }
80
+
81
+ /// Embedding configuration for text chunks.
82
+ ///
83
+ /// Configures embedding generation using ONNX models via fastembed-rs.
84
+ /// Requires the `embeddings` feature to be enabled.
85
+ #[derive(Debug, Clone, Serialize, Deserialize)]
86
+ pub struct EmbeddingConfig {
87
+ /// The embedding model to use
88
+ pub model: EmbeddingModelType,
89
+
90
+ /// Whether to normalize embedding vectors (recommended for cosine similarity)
91
+ #[serde(default = "default_normalize")]
92
+ pub normalize: bool,
93
+
94
+ /// Batch size for embedding generation
95
+ #[serde(default = "default_batch_size")]
96
+ pub batch_size: usize,
97
+
98
+ /// Show model download progress
99
+ #[serde(default)]
100
+ pub show_download_progress: bool,
101
+
102
+ /// Custom cache directory for model files
103
+ ///
104
+ /// Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
105
+ /// Allows full customization of model download location.
106
+ #[serde(skip_serializing_if = "Option::is_none")]
107
+ pub cache_dir: Option<PathBuf>,
108
+ }
109
+
110
+ impl Default for EmbeddingConfig {
111
+ fn default() -> Self {
112
+ Self {
113
+ model: EmbeddingModelType::Preset {
114
+ name: "balanced".to_string(),
115
+ },
116
+ normalize: true,
117
+ batch_size: 32,
118
+ show_download_progress: false,
119
+ cache_dir: None,
120
+ }
121
+ }
122
+ }
123
+
124
+ /// Embedding model types supported by Kreuzberg.
125
+ #[derive(Debug, Clone, Serialize, Deserialize)]
126
+ #[serde(tag = "type", rename_all = "snake_case")]
127
+ pub enum EmbeddingModelType {
128
+ /// Use a preset model configuration (recommended)
129
+ Preset { name: String },
130
+
131
+ /// Use a specific fastembed model by name
132
+ #[cfg(feature = "embeddings")]
133
+ FastEmbed { model: String, dimensions: usize },
134
+
135
+ /// Use a custom ONNX model from HuggingFace
136
+ Custom { model_id: String, dimensions: usize },
137
+ }
138
+
139
/// Serde default for boolean fields that are on unless explicitly disabled.
fn default_true() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
142
+
143
/// Serde default for `ChunkingConfig::max_chars`, in characters.
fn default_chunk_size() -> usize {
    const DEFAULT_MAX_CHARS: usize = 1_000;
    DEFAULT_MAX_CHARS
}
146
+
147
/// Serde default for `ChunkingConfig::max_overlap`, in characters.
fn default_chunk_overlap() -> usize {
    const DEFAULT_OVERLAP_CHARS: usize = 200;
    DEFAULT_OVERLAP_CHARS
}
150
+
151
/// Serde default for `EmbeddingConfig::normalize`: vectors are normalized
/// unless the input says otherwise.
fn default_normalize() -> bool {
    const NORMALIZE_BY_DEFAULT: bool = true;
    NORMALIZE_BY_DEFAULT
}
154
+
155
/// Serde default for `EmbeddingConfig::batch_size`.
fn default_batch_size() -> usize {
    const DEFAULT_BATCH_SIZE: usize = 32;
    DEFAULT_BATCH_SIZE
}
158
+
159
#[cfg(test)]
mod tests {
    use super::*;

    /// The default post-processor config is enabled, unfiltered, and has no
    /// lookup caches built.
    #[test]
    fn test_postprocessor_config_default() {
        let config = PostProcessorConfig::default();
        assert!(config.enabled);
        assert!(config.enabled_processors.is_none());
        assert!(config.disabled_processors.is_none());
        assert!(config.enabled_set.is_none());
        assert!(config.disabled_set.is_none());
    }

    /// `build_lookup_sets` mirrors both name lists into their `HashSet` caches.
    #[test]
    fn test_postprocessor_config_build_lookup_sets() {
        let mut config = PostProcessorConfig {
            enabled: true,
            enabled_processors: Some(vec!["a".to_string(), "b".to_string()]),
            disabled_processors: Some(vec!["c".to_string()]),
            enabled_set: None,
            disabled_set: None,
        };

        config.build_lookup_sets();

        let enabled_set = config.enabled_set.expect("enabled_set should be built");
        assert_eq!(enabled_set.len(), 2);
        assert!(enabled_set.contains("a"));
        assert!(enabled_set.contains("b"));

        let disabled_set = config.disabled_set.expect("disabled_set should be built");
        assert_eq!(disabled_set.len(), 1);
        assert!(disabled_set.contains("c"));
    }

    /// An empty JSON object must pick up every serde default of `ChunkingConfig`.
    #[test]
    fn test_chunking_config_defaults() {
        let config: ChunkingConfig = serde_json::from_str("{}").unwrap();
        assert_eq!(config.max_chars, 1000);
        assert_eq!(config.max_overlap, 200);
        assert!(config.embedding.is_none());
        assert!(config.preset.is_none());
    }

    /// `EmbeddingConfig::default()` selects the "balanced" preset with the
    /// documented scalar defaults.
    #[test]
    fn test_embedding_config_default() {
        let config = EmbeddingConfig::default();
        match &config.model {
            EmbeddingModelType::Preset { name } => assert_eq!(name, "balanced"),
            _ => panic!("Expected Preset variant"),
        }
        assert!(config.normalize);
        assert_eq!(config.batch_size, 32);
        assert!(!config.show_download_progress);
        assert!(config.cache_dir.is_none());
    }

    /// Tests that EmbeddingModelType::Preset serializes with "type" field (internally-tagged).
    /// This validates the API schema matches the documented format:
    /// `{"type": "preset", "name": "fast"}` NOT `{"preset": {"name": "fast"}}`
    #[test]
    fn test_embedding_model_type_preset_serialization() {
        let model = EmbeddingModelType::Preset {
            name: "fast".to_string(),
        };
        let value = serde_json::to_value(&model).unwrap();

        // Compare structurally so key order and whitespace cannot matter.
        assert_eq!(
            value,
            serde_json::json!({"type": "preset", "name": "fast"}),
            "Should use internally-tagged format with a type discriminator"
        );

        // `{"preset": {...}}` would be serde's externally-tagged representation.
        assert!(
            value.get("preset").is_none(),
            "Should NOT use externally-tagged format"
        );
    }

    /// Tests that EmbeddingModelType::Preset deserializes from the documented API format.
    /// API documentation shows: `{"type": "preset", "name": "fast"}`
    #[test]
    fn test_embedding_model_type_preset_deserialization() {
        // This is the documented API format that users should send
        let json = r#"{"type": "preset", "name": "fast"}"#;
        let model: EmbeddingModelType = serde_json::from_str(json).unwrap();

        match model {
            EmbeddingModelType::Preset { name } => {
                assert_eq!(name, "fast");
            }
            _ => panic!("Expected Preset variant"),
        }
    }

    /// Tests that the wrong format (externally-tagged) is rejected.
    /// This ensures the API doesn't accept the old/wrong documentation format.
    #[test]
    fn test_embedding_model_type_rejects_wrong_format() {
        // This is the WRONG format that was in the old documentation
        let wrong_json = r#"{"preset": {"name": "fast"}}"#;
        let result: Result<EmbeddingModelType, _> = serde_json::from_str(wrong_json);

        // Should fail to parse - there is no "type" discriminator
        assert!(result.is_err(), "Should reject externally-tagged format");
    }

    /// Tests round-trip serialization/deserialization of EmbeddingConfig.
    #[test]
    fn test_embedding_config_roundtrip() {
        let config = EmbeddingConfig {
            model: EmbeddingModelType::Preset {
                name: "balanced".to_string(),
            },
            normalize: true,
            batch_size: 64,
            show_download_progress: false,
            cache_dir: None,
        };

        let json = serde_json::to_string(&config).unwrap();
        let deserialized: EmbeddingConfig = serde_json::from_str(&json).unwrap();

        match deserialized.model {
            EmbeddingModelType::Preset { name } => {
                assert_eq!(name, "balanced");
            }
            _ => panic!("Expected Preset variant"),
        }
        assert!(deserialized.normalize);
        assert_eq!(deserialized.batch_size, 64);
        assert!(!deserialized.show_download_progress);
        assert!(deserialized.cache_dir.is_none());
    }

    /// Tests Custom model type serialization format.
    #[test]
    fn test_embedding_model_type_custom_serialization() {
        let model = EmbeddingModelType::Custom {
            model_id: "sentence-transformers/all-MiniLM-L6-v2".to_string(),
            dimensions: 384,
        };
        let value = serde_json::to_value(&model).unwrap();

        assert_eq!(
            value,
            serde_json::json!({
                "type": "custom",
                "model_id": "sentence-transformers/all-MiniLM-L6-v2",
                "dimensions": 384
            }),
            "Should contain type:custom, model_id and dimensions fields"
        );
    }

    /// Tests Custom model type deserialization.
    #[test]
    fn test_embedding_model_type_custom_deserialization() {
        let json = r#"{"type": "custom", "model_id": "test/model", "dimensions": 512}"#;
        let model: EmbeddingModelType = serde_json::from_str(json).unwrap();

        match model {
            EmbeddingModelType::Custom { model_id, dimensions } => {
                assert_eq!(model_id, "test/model");
                assert_eq!(dimensions, 512);
            }
            _ => panic!("Expected Custom variant"),
        }
    }
}
@@ -0,0 +1,187 @@
1
+ //! Cross-section dependency validation.
2
+ //!
3
+ //! This module contains validation functions that check dependencies and relationships
4
+ //! between different configuration sections. These validators ensure that related
5
+ //! configuration values are consistent and compatible with each other.
6
+
7
+ use crate::{KreuzbergError, Result};
8
+
9
+ /// Validate a port number for server configuration.
10
+ ///
11
+ /// Port must be in the range 1-65535. While ports 1-1023 are privileged and may require
12
+ /// special permissions on some systems, they are still valid port numbers.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `port` - The port number to validate
17
+ ///
18
+ /// # Returns
19
+ ///
20
+ /// `Ok(())` if the port is valid, or a `ValidationError` with details about valid ranges.
21
+ ///
22
+ /// # Examples
23
+ ///
24
+ /// ```rust
25
+ /// use kreuzberg::core::config_validation::validate_port;
26
+ ///
27
+ /// assert!(validate_port(8000).is_ok());
28
+ /// assert!(validate_port(80).is_ok());
29
+ /// assert!(validate_port(1).is_ok());
30
+ /// assert!(validate_port(65535).is_ok());
31
+ /// assert!(validate_port(0).is_err());
32
+ /// ```
33
+ pub fn validate_port(port: u16) -> Result<()> {
34
+ if port > 0 {
35
+ Ok(())
36
+ } else {
37
+ Err(KreuzbergError::Validation {
38
+ message: format!("Port must be 1-65535, got {}", port),
39
+ source: None,
40
+ })
41
+ }
42
+ }
43
+
44
/// Validate a host/IP address string for server configuration.
///
/// Accepts valid IPv4 addresses (e.g., "127.0.0.1", "0.0.0.0"), valid IPv6 addresses
/// (e.g., "::1", "::"), and hostnames (e.g., "localhost", "example.com").
///
/// Surrounding whitespace is ignored. A hostname is checked label by label
/// (labels are the parts separated by dots):
///
/// * every label must be non-empty, so ".", "a..b" and ".example.com" are rejected;
/// * a label may contain only alphanumeric characters and hyphens;
/// * a label must not start or end with a hyphen;
/// * a single trailing dot (fully-qualified form, "example.com.") is allowed;
/// * a name made up solely of numeric labels is rejected, because it looks like a
///   malformed IPv4 address (e.g., "256.1.1.1").
///
/// # Arguments
///
/// * `host` - The host/IP address string to validate
///
/// # Returns
///
/// `Ok(())` if the host is valid, or a `KreuzbergError::Validation` whose message
/// quotes the (trimmed) rejected value.
///
/// # Examples
///
/// ```rust
/// use kreuzberg::core::config_validation::validate_host;
///
/// assert!(validate_host("127.0.0.1").is_ok());
/// assert!(validate_host("0.0.0.0").is_ok());
/// assert!(validate_host("::1").is_ok());
/// assert!(validate_host("::").is_ok());
/// assert!(validate_host("localhost").is_ok());
/// assert!(validate_host("example.com").is_ok());
/// assert!(validate_host("").is_err());
/// assert!(validate_host("a..b").is_err());
/// assert!(validate_host("foo.-bar.com").is_err());
/// ```
pub fn validate_host(host: &str) -> Result<()> {
    let host = host.trim();

    // Single place that builds the rejection, so every failure path reports the
    // same message format (an empty host renders as `''`).
    let invalid = || KreuzbergError::Validation {
        message: format!("Invalid host '{}': must be a valid IP address or hostname", host),
        source: None,
    };

    if host.is_empty() {
        return Err(invalid());
    }

    // `IpAddr` parses both IPv4 and IPv6 textual forms.
    if host.parse::<std::net::IpAddr>().is_ok() {
        return Ok(());
    }

    // A single trailing dot marks a fully-qualified name; ignore it for label checks.
    let name = host.strip_suffix('.').unwrap_or(host);

    // All-numeric dotted names that failed IP parsing are malformed IPv4 addresses
    // (e.g. "256.1.1.1" or "1.2.3"), not hostnames.
    let looks_like_ipv4 = name
        .split('.')
        .all(|label| !label.is_empty() && label.chars().all(|c| c.is_numeric()));

    // Validate per label rather than over the whole string, so that empty labels
    // and hyphens at label boundaries are caught.
    let labels_are_valid = name.split('.').all(|label| {
        !label.is_empty()
            && !label.starts_with('-')
            && !label.ends_with('-')
            && label.chars().all(|c| c.is_alphanumeric() || c == '-')
    });

    if !looks_like_ipv4 && labels_are_valid {
        Ok(())
    } else {
        Err(invalid())
    }
}
109
+
110
+ /// Validate a CORS (Cross-Origin Resource Sharing) origin URL.
111
+ ///
112
+ /// Accepts valid HTTP/HTTPS URLs (e.g., "https://example.com") or the wildcard "*"
113
+ /// to allow all origins. URLs must start with "http://" or "https://", or be exactly "*".
114
+ ///
115
+ /// # Arguments
116
+ ///
117
+ /// * `origin` - The CORS origin URL to validate
118
+ ///
119
+ /// # Returns
120
+ ///
121
+ /// `Ok(())` if the origin is valid, or a `ValidationError` with details about valid formats.
122
+ ///
123
+ /// # Examples
124
+ ///
125
+ /// ```rust
126
+ /// use kreuzberg::core::config_validation::validate_cors_origin;
127
+ ///
128
+ /// assert!(validate_cors_origin("https://example.com").is_ok());
129
+ /// assert!(validate_cors_origin("http://localhost:3000").is_ok());
130
+ /// assert!(validate_cors_origin("*").is_ok());
131
+ /// assert!(validate_cors_origin("not-a-url").is_err());
132
+ /// assert!(validate_cors_origin("ftp://example.com").is_err());
133
+ /// ```
134
+ pub fn validate_cors_origin(origin: &str) -> Result<()> {
135
+ let origin = origin.trim();
136
+
137
+ if origin == "*" {
138
+ return Ok(());
139
+ }
140
+
141
+ if origin.starts_with("http://") || origin.starts_with("https://") {
142
+ // Basic validation: ensure there's something after the protocol
143
+ if origin.len() > 8 && (origin.starts_with("http://") && origin.len() > 7 || origin.starts_with("https://")) {
144
+ return Ok(());
145
+ }
146
+ }
147
+
148
+ Err(KreuzbergError::Validation {
149
+ message: format!(
150
+ "Invalid CORS origin '{}': must be a valid HTTP/HTTPS URL or '*'",
151
+ origin
152
+ ),
153
+ source: None,
154
+ })
155
+ }
156
+
157
+ /// Validate an upload size limit for server configuration.
158
+ ///
159
+ /// Upload size must be greater than 0 (measured in bytes).
160
+ ///
161
+ /// # Arguments
162
+ ///
163
+ /// * `size` - The maximum upload size in bytes to validate
164
+ ///
165
+ /// # Returns
166
+ ///
167
+ /// `Ok(())` if the size is valid, or a `ValidationError` with details about constraints.
168
+ ///
169
+ /// # Examples
170
+ ///
171
+ /// ```rust
172
+ /// use kreuzberg::core::config_validation::validate_upload_size;
173
+ ///
174
+ /// assert!(validate_upload_size(1024).is_ok());
175
+ /// assert!(validate_upload_size(1_000_000).is_ok());
176
+ /// assert!(validate_upload_size(0).is_err());
177
+ /// ```
178
+ pub fn validate_upload_size(size: usize) -> Result<()> {
179
+ if size > 0 {
180
+ Ok(())
181
+ } else {
182
+ Err(KreuzbergError::Validation {
183
+ message: format!("Upload size must be greater than 0, got {}", size),
184
+ source: None,
185
+ })
186
+ }
187
+ }