kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,156 @@
1
+ //! Cache utilities for key generation and disk space management.
2
+
3
+ use crate::error::Result;
4
+ use ahash::AHasher;
5
+ use std::hash::{Hash, Hasher};
6
+
7
+ #[cfg(unix)]
8
+ use crate::error::KreuzbergError;
9
+ #[cfg(unix)]
10
+ use std::path::Path;
11
+
12
+ /// Width of a cache key in hex characters (a 64-bit hash is 16 hex digits, zero-padded to 32)
13
+ const CACHE_KEY_HASH_WIDTH: usize = 32;
14
+
15
+ /// Generate a deterministic cache key from configuration parameters.
16
+ ///
17
+ /// # Algorithm
18
+ ///
19
+ /// Uses ahash (non-cryptographic 64-bit hash) for performance. Cache keys are
20
+ /// generated by:
21
+ /// 1. Sorting key-value pairs by key (for determinism)
22
+ /// 2. Concatenating as "key1=val1&key2=val2&..."
23
+ /// 3. Hashing with ahash and formatting as 32-character hex
24
+ ///
25
+ /// # Collision Probability
26
+ ///
27
+ /// AHash produces 64-bit hashes, leading to birthday paradox collisions:
28
+ /// - **~0.01%** probability at 1 million cache entries
29
+ /// - **~1%** probability at 100 million entries
30
+ /// - **~50%** probability at 4.3 billion (2^32) entries
31
+ ///
32
+ /// For context: P(collision) ≈ n^2 / (2 * 2^64) where n = number of entries.
33
+ ///
34
+ /// # Performance vs Security Trade-off
35
+ ///
36
+ /// - **ahash**: ~10x faster than SHA256, sufficient for cache keys
37
+ /// - **SHA256**: Collision-resistant but overkill for caching
38
+ /// - **Practical risk**: Low for typical usage (< 1M entries)
39
+ ///
40
+ /// # Impact of Collisions
41
+ ///
42
+ /// If two different configurations hash to the same key:
43
+ /// - One configuration reads the other's cached data
44
+ /// - Results in incorrect data served from cache
45
+ /// - Detected via metadata validation (size/mtime checks)
46
+ ///
47
+ /// # Recommendations
48
+ ///
49
+ /// - **< 1M entries**: ahash is safe and fast
50
+ /// - **> 100M entries**: Monitor cache size, consider periodic clearing
51
+ /// - **Critical data**: If collision risk is unacceptable, add SHA256 option
52
+ ///
53
+ /// # Example
54
+ ///
55
+ /// ```rust
56
+ /// use kreuzberg::cache::generate_cache_key;
57
+ ///
58
+ /// let parts = [("format", "pdf"), ("ocr", "true"), ("lang", "en")];
59
+ /// let key = generate_cache_key(&parts);
60
+ /// assert_eq!(key.len(), 32); // 64-bit hash as hex
61
+ /// ```
62
+ pub fn generate_cache_key(parts: &[(&str, &str)]) -> String {
63
+ if parts.is_empty() {
64
+ return "empty".to_string();
65
+ }
66
+
67
+ let mut sorted_parts: Vec<_> = parts.to_vec();
68
+ sorted_parts.sort_by_key(|(k, _)| *k);
69
+
70
+ let estimated_size = sorted_parts.iter().map(|(k, v)| k.len() + v.len() + 2).sum::<usize>();
71
+ let mut cache_str = String::with_capacity(estimated_size);
72
+
73
+ for (i, (key, val)) in sorted_parts.iter().enumerate() {
74
+ if i > 0 {
75
+ cache_str.push('&');
76
+ }
77
+ cache_str.push_str(&format!("{}={}", key, val));
78
+ }
79
+
80
+ let mut hasher = AHasher::default();
81
+ cache_str.hash(&mut hasher);
82
+ let hash = hasher.finish();
83
+
84
+ format!("{:0width$x}", hash, width = CACHE_KEY_HASH_WIDTH)
85
+ }
86
+
87
+ #[allow(unsafe_code)]
88
+ pub fn get_available_disk_space(path: &str) -> Result<f64> {
89
+ #[cfg(unix)]
90
+ {
91
+ let path = Path::new(path);
92
+ let check_path = if path.exists() {
93
+ path
94
+ } else if let Some(parent) = path.parent() {
95
+ parent
96
+ } else {
97
+ Path::new("/")
98
+ };
99
+
100
+ use libc::{statvfs, statvfs as statvfs_struct};
101
+ use std::ffi::CString;
102
+
103
+ let path_str = check_path
104
+ .to_str()
105
+ .ok_or_else(|| KreuzbergError::validation("Path contains invalid UTF-8".to_string()))?;
106
+ let c_path = CString::new(path_str).map_err(|e| KreuzbergError::validation(format!("Invalid path: {}", e)))?;
107
+
108
+ let mut stat: statvfs_struct = unsafe { std::mem::zeroed() };
109
+
110
+ let result = unsafe { statvfs(c_path.as_ptr(), &mut stat) };
111
+
112
+ if result == 0 {
113
+ #[allow(clippy::unnecessary_cast)]
114
+ let available_bytes = stat.f_bavail as u64 * stat.f_frsize as u64;
115
+ Ok(available_bytes as f64 / (1024.0 * 1024.0))
116
+ } else {
117
+ tracing::debug!("Failed to get disk stats for {}: errno {}", path_str, result);
118
+ Ok(10000.0)
119
+ }
120
+ }
121
+
122
+ #[cfg(not(unix))]
123
+ {
124
+ let _ = path;
125
+ Ok(10000.0)
126
+ }
127
+ }
128
+
129
+ pub fn fast_hash(data: &[u8]) -> u64 {
130
+ let mut hasher = AHasher::default();
131
+ data.hash(&mut hasher);
132
+ hasher.finish()
133
+ }
134
+
135
/// Check whether `key` has the shape of a hashed cache key.
///
/// Returns `true` only when `key` is exactly 32 bytes long and every one of
/// them is an ASCII hexadecimal digit (upper or lower case).
pub fn validate_cache_key(key: &str) -> bool {
    if key.len() != 32 {
        return false;
    }
    // Any non-ASCII character contributes bytes >= 0x80, none of which is a
    // hex digit, so checking bytes is equivalent to checking chars.
    key.bytes().all(|byte| byte.is_ascii_hexdigit())
}
138
+
139
/// Return the indices of cache entries that are older than `max_age_seconds`.
///
/// `cache_times` holds one timestamp per entry. An entry is reported when
/// `current_time - timestamp` is strictly greater than `max_age_seconds`;
/// indices are returned in ascending order.
pub fn filter_old_cache_entries(cache_times: &[f64], current_time: f64, max_age_seconds: f64) -> Vec<usize> {
    let mut stale_indices = Vec::new();
    for (position, timestamp) in cache_times.iter().enumerate() {
        let age = current_time - *timestamp;
        if age > max_age_seconds {
            stale_indices.push(position);
        }
    }
    stale_indices
}
152
+
153
/// Order cache keys from least to most recently accessed.
///
/// Takes `(key, access_time)` pairs and returns just the keys, sorted by
/// ascending access time. Entries with equal times keep their input order.
///
/// Times are compared with `f64::total_cmp`, which is a total order even when
/// a NaN is present (a positive NaN sorts after every other value). Comparing
/// with `partial_cmp` and treating "incomparable" as equal is not a total
/// order, which leaves the result unspecified and may make `sort_by` panic.
pub fn sort_cache_by_access_time(mut entries: Vec<(String, f64)>) -> Vec<String> {
    entries.sort_by(|a, b| a.1.total_cmp(&b.1));
    entries.into_iter().map(|(key, _)| key).collect()
}
@@ -0,0 +1,301 @@
1
+ //! Page boundary handling and page range calculation for chunked text.
2
+ //!
3
+ //! This module provides functions to track which pages text chunks span,
4
+ //! enabling accurate page-level metadata for document processing.
5
+
6
+ use crate::error::{KreuzbergError, Result};
7
+ use crate::types::PageBoundary;
8
+
9
+ /// Validates the consistency and correctness of page boundaries.
10
+ ///
11
+ /// # Validation Rules
12
+ ///
13
+ /// 1. Boundaries must be sorted by byte_start (monotonically increasing)
14
+ /// 2. Boundaries must not overlap (byte_end[i] <= byte_start[i+1])
15
+ /// 3. Each boundary must have byte_start < byte_end
16
+ ///
17
+ /// # Arguments
18
+ ///
19
+ /// * `boundaries` - Page boundary markers to validate
20
+ ///
21
+ /// # Returns
22
+ ///
23
+ /// Returns `Ok(())` if all boundaries are valid.
24
+ /// Returns `KreuzbergError::Validation` if any boundary is invalid.
25
+ pub fn validate_page_boundaries(boundaries: &[PageBoundary]) -> Result<()> {
26
+ if boundaries.is_empty() {
27
+ return Ok(());
28
+ }
29
+
30
+ for (idx, boundary) in boundaries.iter().enumerate() {
31
+ if boundary.byte_start >= boundary.byte_end {
32
+ return Err(KreuzbergError::validation(format!(
33
+ "Invalid boundary range at index {}: byte_start ({}) must be < byte_end ({})",
34
+ idx, boundary.byte_start, boundary.byte_end
35
+ )));
36
+ }
37
+ }
38
+
39
+ for i in 0..boundaries.len() - 1 {
40
+ let current = &boundaries[i];
41
+ let next = &boundaries[i + 1];
42
+
43
+ if current.byte_start > next.byte_start {
44
+ return Err(KreuzbergError::validation(format!(
45
+ "Page boundaries not sorted: boundary at index {} (byte_start={}) comes after boundary at index {} (byte_start={})",
46
+ i,
47
+ current.byte_start,
48
+ i + 1,
49
+ next.byte_start
50
+ )));
51
+ }
52
+
53
+ if current.byte_end > next.byte_start {
54
+ return Err(KreuzbergError::validation(format!(
55
+ "Overlapping page boundaries: boundary {} ends at {} but boundary {} starts at {}",
56
+ i,
57
+ current.byte_end,
58
+ i + 1,
59
+ next.byte_start
60
+ )));
61
+ }
62
+ }
63
+
64
+ Ok(())
65
+ }
66
+
67
+ /// Calculate which pages a byte range spans.
68
+ ///
69
+ /// # Arguments
70
+ ///
71
+ /// * `byte_start` - Starting byte offset of the chunk
72
+ /// * `byte_end` - Ending byte offset of the chunk
73
+ /// * `boundaries` - Page boundary markers from the document
74
+ ///
75
+ /// # Returns
76
+ ///
77
+ /// A tuple of (first_page, last_page) where page numbers are 1-indexed.
78
+ /// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
79
+ ///
80
+ /// # Errors
81
+ ///
82
+ /// Returns `KreuzbergError::Validation` if boundaries are invalid.
83
+ ///
84
+ /// # Examples
85
+ ///
86
+ /// ```rust,ignore
87
+ /// use kreuzberg::chunking::boundaries::calculate_page_range;
88
+ /// use kreuzberg::types::PageBoundary;
89
+ ///
90
+ /// let boundaries = vec![
91
+ /// PageBoundary { byte_start: 0, byte_end: 100, page_number: 1 },
92
+ /// PageBoundary { byte_start: 100, byte_end: 200, page_number: 2 },
93
+ /// ];
94
+ ///
95
+ /// let (first, last) = calculate_page_range(50, 150, &boundaries)?;
96
+ /// assert_eq!(first, Some(1));
97
+ /// assert_eq!(last, Some(2));
98
+ /// # Ok::<(), kreuzberg::Result<()>>(())
99
+ /// ```
100
+ pub fn calculate_page_range(
101
+ byte_start: usize,
102
+ byte_end: usize,
103
+ boundaries: &[PageBoundary],
104
+ ) -> Result<(Option<usize>, Option<usize>)> {
105
+ if boundaries.is_empty() {
106
+ return Ok((None, None));
107
+ }
108
+
109
+ validate_page_boundaries(boundaries)?;
110
+
111
+ let mut first_page = None;
112
+ let mut last_page = None;
113
+
114
+ for boundary in boundaries {
115
+ if byte_start < boundary.byte_end && byte_end > boundary.byte_start {
116
+ if first_page.is_none() {
117
+ first_page = Some(boundary.page_number);
118
+ }
119
+ last_page = Some(boundary.page_number);
120
+ }
121
+ }
122
+
123
+ Ok((first_page, last_page))
124
+ }
125
+
126
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a `PageBoundary` in the tests below.
    fn page(byte_start: usize, byte_end: usize, page_number: usize) -> PageBoundary {
        PageBoundary {
            byte_start,
            byte_end,
            page_number,
        }
    }

    #[test]
    fn test_validate_page_boundaries_valid() {
        let boundaries = vec![page(0, 20, 1), page(20, 40, 2), page(40, 60, 3)];

        let result = validate_page_boundaries(&boundaries);
        assert!(result.is_ok());
    }

    #[test]
    fn test_validate_page_boundaries_empty() {
        let boundaries: Vec<PageBoundary> = vec![];
        let result = validate_page_boundaries(&boundaries);
        assert!(result.is_ok());
    }

    // Covers the ordering rule: a later boundary starts before an earlier one.
    #[test]
    fn test_validate_page_boundaries_unsorted() {
        let boundaries = vec![page(20, 40, 2), page(0, 20, 1)];

        let result = validate_page_boundaries(&boundaries);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("Page boundaries not sorted"));
    }

    // Covers the overlap rule: sorted boundaries whose ranges intersect.
    #[test]
    fn test_validate_page_boundaries_overlapping() {
        let boundaries = vec![page(0, 30, 1), page(20, 40, 2)];

        let result = validate_page_boundaries(&boundaries);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("Overlapping page boundaries"));
    }

    #[test]
    fn test_calculate_page_range_within_page() {
        let boundaries = vec![page(0, 100, 1), page(100, 200, 2)];

        let (first, last) = calculate_page_range(10, 50, &boundaries).unwrap();
        assert_eq!(first, Some(1));
        assert_eq!(last, Some(1));
    }

    #[test]
    fn test_calculate_page_range_spanning_pages() {
        let boundaries = vec![page(0, 100, 1), page(100, 200, 2)];

        let (first, last) = calculate_page_range(50, 150, &boundaries).unwrap();
        assert_eq!(first, Some(1));
        assert_eq!(last, Some(2));
    }

    #[test]
    fn test_calculate_page_range_empty_boundaries() {
        let boundaries: Vec<PageBoundary> = vec![];

        let (first, last) = calculate_page_range(0, 50, &boundaries).unwrap();
        assert_eq!(first, None);
        assert_eq!(last, None);
    }

    #[test]
    fn test_calculate_page_range_no_overlap() {
        let boundaries = vec![page(0, 100, 1), page(100, 200, 2)];

        let (first, last) = calculate_page_range(200, 250, &boundaries).unwrap();
        assert_eq!(first, None);
        assert_eq!(last, None);
    }

    #[test]
    fn test_calculate_page_range_three_pages() {
        let boundaries = vec![page(0, 100, 1), page(100, 200, 2), page(200, 300, 3)];

        let (first, last) = calculate_page_range(50, 250, &boundaries).unwrap();
        assert_eq!(first, Some(1));
        assert_eq!(last, Some(3));
    }

    #[test]
    fn test_calculate_page_range_with_invalid_boundaries() {
        let boundaries = vec![page(15, 10, 1)];

        let result = calculate_page_range(0, 20, &boundaries);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("Invalid boundary range"));
    }

    // Gaps between consecutive pages are allowed; only overlap is rejected.
    #[test]
    fn test_page_boundaries_with_gaps() {
        let boundaries = vec![page(0, 10, 1), page(15, 25, 2)];

        let result = validate_page_boundaries(&boundaries);
        assert!(result.is_ok());
    }

    // A boundary whose start equals its end is an empty range and is rejected.
    #[test]
    fn test_chunk_with_same_start_and_end() {
        let boundaries = vec![page(10, 10, 1)];

        let result = validate_page_boundaries(&boundaries);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("Invalid boundary range"));
    }
}