kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,156 @@
1
+ //! Cache utilities for key generation and disk space management.
2
+
3
+ use crate::error::Result;
4
+ use ahash::AHasher;
5
+ use std::hash::{Hash, Hasher};
6
+
7
+ #[cfg(unix)]
8
+ use crate::error::KreuzbergError;
9
+ #[cfg(unix)]
10
+ use std::path::Path;
11
+
12
+ /// Cache key hash format width (32 hex digits for u64 hash)
13
+ const CACHE_KEY_HASH_WIDTH: usize = 32;
14
+
15
+ /// Generate a deterministic cache key from configuration parameters.
16
+ ///
17
+ /// # Algorithm
18
+ ///
19
+ /// Uses ahash (non-cryptographic 64-bit hash) for performance. Cache keys are
20
+ /// generated by:
21
+ /// 1. Sorting key-value pairs by key (for determinism)
22
+ /// 2. Concatenating as "key1=val1&key2=val2&..."
23
+ /// 3. Hashing with ahash and formatting as 32-character hex
24
+ ///
25
+ /// # Collision Probability
26
+ ///
27
+ /// AHash produces 64-bit hashes, leading to birthday paradox collisions:
28
+ /// - **~0.01%** probability at 1 million cache entries
29
+ /// - **~1%** probability at 100 million entries
30
+ /// - **~50%** probability at 4.3 billion (2^32) entries
31
+ ///
32
+ /// For context: P(collision) ≈ n^2 / (2 * 2^64) where n = number of entries.
33
+ ///
34
+ /// # Performance vs Security Trade-off
35
+ ///
36
+ /// - **ahash**: ~10x faster than SHA256, sufficient for cache keys
37
+ /// - **SHA256**: Collision-resistant but overkill for caching
38
+ /// - **Practical risk**: Low for typical usage (< 1M entries)
39
+ ///
40
+ /// # Impact of Collisions
41
+ ///
42
+ /// If two different configurations hash to the same key:
43
+ /// - One configuration reads the other's cached data
44
+ /// - Results in incorrect data served from cache
45
+ /// - Detected via metadata validation (size/mtime checks)
46
+ ///
47
+ /// # Recommendations
48
+ ///
49
+ /// - **< 1M entries**: ahash is safe and fast
50
+ /// - **> 100M entries**: Monitor cache size, consider periodic clearing
51
+ /// - **Critical data**: If collision risk is unacceptable, add SHA256 option
52
+ ///
53
+ /// # Example
54
+ ///
55
+ /// ```rust
56
+ /// use kreuzberg::cache::generate_cache_key;
57
+ ///
58
+ /// let parts = [("format", "pdf"), ("ocr", "true"), ("lang", "en")];
59
+ /// let key = generate_cache_key(&parts);
60
+ /// assert_eq!(key.len(), 32); // 64-bit hash as hex
61
+ /// ```
62
+ pub fn generate_cache_key(parts: &[(&str, &str)]) -> String {
63
+ if parts.is_empty() {
64
+ return "empty".to_string();
65
+ }
66
+
67
+ let mut sorted_parts: Vec<_> = parts.to_vec();
68
+ sorted_parts.sort_by_key(|(k, _)| *k);
69
+
70
+ let estimated_size = sorted_parts.iter().map(|(k, v)| k.len() + v.len() + 2).sum::<usize>();
71
+ let mut cache_str = String::with_capacity(estimated_size);
72
+
73
+ for (i, (key, val)) in sorted_parts.iter().enumerate() {
74
+ if i > 0 {
75
+ cache_str.push('&');
76
+ }
77
+ cache_str.push_str(&format!("{}={}", key, val));
78
+ }
79
+
80
+ let mut hasher = AHasher::default();
81
+ cache_str.hash(&mut hasher);
82
+ let hash = hasher.finish();
83
+
84
+ format!("{:0width$x}", hash, width = CACHE_KEY_HASH_WIDTH)
85
+ }
86
+
87
+ #[allow(unsafe_code)]
88
+ pub fn get_available_disk_space(path: &str) -> Result<f64> {
89
+ #[cfg(unix)]
90
+ {
91
+ let path = Path::new(path);
92
+ let check_path = if path.exists() {
93
+ path
94
+ } else if let Some(parent) = path.parent() {
95
+ parent
96
+ } else {
97
+ Path::new("/")
98
+ };
99
+
100
+ use libc::{statvfs, statvfs as statvfs_struct};
101
+ use std::ffi::CString;
102
+
103
+ let path_str = check_path
104
+ .to_str()
105
+ .ok_or_else(|| KreuzbergError::validation("Path contains invalid UTF-8".to_string()))?;
106
+ let c_path = CString::new(path_str).map_err(|e| KreuzbergError::validation(format!("Invalid path: {}", e)))?;
107
+
108
+ let mut stat: statvfs_struct = unsafe { std::mem::zeroed() };
109
+
110
+ let result = unsafe { statvfs(c_path.as_ptr(), &mut stat) };
111
+
112
+ if result == 0 {
113
+ #[allow(clippy::unnecessary_cast)]
114
+ let available_bytes = stat.f_bavail as u64 * stat.f_frsize as u64;
115
+ Ok(available_bytes as f64 / (1024.0 * 1024.0))
116
+ } else {
117
+ tracing::debug!("Failed to get disk stats for {}: errno {}", path_str, result);
118
+ Ok(10000.0)
119
+ }
120
+ }
121
+
122
+ #[cfg(not(unix))]
123
+ {
124
+ let _ = path;
125
+ Ok(10000.0)
126
+ }
127
+ }
128
+
129
+ pub fn fast_hash(data: &[u8]) -> u64 {
130
+ let mut hasher = AHasher::default();
131
+ data.hash(&mut hasher);
132
+ hasher.finish()
133
+ }
134
+
135
/// Check that `key` has the shape of a generated cache key: exactly 32 bytes,
/// every one of them an ASCII hex digit (either case).
///
/// Returns `false` for anything else, including the empty string.
pub fn validate_cache_key(key: &str) -> bool {
    if key.len() != 32 {
        return false;
    }
    // Any byte of a multi-byte UTF-8 character is >= 0x80 and therefore never a hex
    // digit, so a byte-wise scan accepts exactly the same strings as a char-wise one.
    key.bytes().all(|byte| byte.is_ascii_hexdigit())
}
138
+
139
/// Return the indices of the entries in `cache_times` that are older than `max_age_seconds`.
///
/// An entry counts as old when `current_time - time` is strictly greater than
/// `max_age_seconds`. Indices are returned in ascending order. Entries for which
/// that comparison is false (including any NaN timestamp) are not reported.
pub fn filter_old_cache_entries(cache_times: &[f64], current_time: f64, max_age_seconds: f64) -> Vec<usize> {
    let mut expired = Vec::new();

    for (position, stamp) in cache_times.iter().enumerate() {
        let age = current_time - *stamp;
        if age > max_age_seconds {
            expired.push(position);
        }
    }

    expired
}
152
+
153
/// Order cache keys by access time, oldest (smallest timestamp) first.
///
/// Takes `(key, access_time)` pairs and returns just the keys in ascending
/// timestamp order. The sort is stable, so entries with identical timestamps keep
/// their input order.
///
/// Timestamps are compared with `f64::total_cmp`, which is a true total order:
/// NaN values are placed deterministically (a positive NaN sorts after every
/// number) instead of being treated as "equal to everything", which would leave
/// the result unsorted and violate the ordering contract `sort_by` relies on.
pub fn sort_cache_by_access_time(mut entries: Vec<(String, f64)>) -> Vec<String> {
    entries.sort_by(|a, b| a.1.total_cmp(&b.1));
    entries.into_iter().map(|(key, _)| key).collect()
}
@@ -0,0 +1,301 @@
1
+ //! Page boundary handling and page range calculation for chunked text.
2
+ //!
3
+ //! This module provides functions to track which pages text chunks span,
4
+ //! enabling accurate page-level metadata for document processing.
5
+
6
+ use crate::error::{KreuzbergError, Result};
7
+ use crate::types::PageBoundary;
8
+
9
+ /// Validates the consistency and correctness of page boundaries.
10
+ ///
11
+ /// # Validation Rules
12
+ ///
13
+ /// 1. Boundaries must be sorted by byte_start (monotonically increasing)
14
+ /// 2. Boundaries must not overlap (byte_end[i] <= byte_start[i+1])
15
+ /// 3. Each boundary must have byte_start < byte_end
16
+ ///
17
+ /// # Arguments
18
+ ///
19
+ /// * `boundaries` - Page boundary markers to validate
20
+ ///
21
+ /// # Returns
22
+ ///
23
+ /// Returns `Ok(())` if all boundaries are valid.
24
+ /// Returns `KreuzbergError::Validation` if any boundary is invalid.
25
+ pub fn validate_page_boundaries(boundaries: &[PageBoundary]) -> Result<()> {
26
+ if boundaries.is_empty() {
27
+ return Ok(());
28
+ }
29
+
30
+ for (idx, boundary) in boundaries.iter().enumerate() {
31
+ if boundary.byte_start >= boundary.byte_end {
32
+ return Err(KreuzbergError::validation(format!(
33
+ "Invalid boundary range at index {}: byte_start ({}) must be < byte_end ({})",
34
+ idx, boundary.byte_start, boundary.byte_end
35
+ )));
36
+ }
37
+ }
38
+
39
+ for i in 0..boundaries.len() - 1 {
40
+ let current = &boundaries[i];
41
+ let next = &boundaries[i + 1];
42
+
43
+ if current.byte_start > next.byte_start {
44
+ return Err(KreuzbergError::validation(format!(
45
+ "Page boundaries not sorted: boundary at index {} (byte_start={}) comes after boundary at index {} (byte_start={})",
46
+ i,
47
+ current.byte_start,
48
+ i + 1,
49
+ next.byte_start
50
+ )));
51
+ }
52
+
53
+ if current.byte_end > next.byte_start {
54
+ return Err(KreuzbergError::validation(format!(
55
+ "Overlapping page boundaries: boundary {} ends at {} but boundary {} starts at {}",
56
+ i,
57
+ current.byte_end,
58
+ i + 1,
59
+ next.byte_start
60
+ )));
61
+ }
62
+ }
63
+
64
+ Ok(())
65
+ }
66
+
67
+ /// Calculate which pages a byte range spans.
68
+ ///
69
+ /// # Arguments
70
+ ///
71
+ /// * `byte_start` - Starting byte offset of the chunk
72
+ /// * `byte_end` - Ending byte offset of the chunk
73
+ /// * `boundaries` - Page boundary markers from the document
74
+ ///
75
+ /// # Returns
76
+ ///
77
+ /// A tuple of (first_page, last_page) where page numbers are 1-indexed.
78
+ /// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
79
+ ///
80
+ /// # Errors
81
+ ///
82
+ /// Returns `KreuzbergError::Validation` if boundaries are invalid.
83
+ ///
84
+ /// # Examples
85
+ ///
86
+ /// ```rust,ignore
87
+ /// use kreuzberg::chunking::boundaries::calculate_page_range;
88
+ /// use kreuzberg::types::PageBoundary;
89
+ ///
90
+ /// let boundaries = vec![
91
+ /// PageBoundary { byte_start: 0, byte_end: 100, page_number: 1 },
92
+ /// PageBoundary { byte_start: 100, byte_end: 200, page_number: 2 },
93
+ /// ];
94
+ ///
95
+ /// let (first, last) = calculate_page_range(50, 150, &boundaries)?;
96
+ /// assert_eq!(first, Some(1));
97
+ /// assert_eq!(last, Some(2));
98
+ /// # Ok::<(), kreuzberg::Result<()>>(())
99
+ /// ```
100
+ pub fn calculate_page_range(
101
+ byte_start: usize,
102
+ byte_end: usize,
103
+ boundaries: &[PageBoundary],
104
+ ) -> Result<(Option<usize>, Option<usize>)> {
105
+ if boundaries.is_empty() {
106
+ return Ok((None, None));
107
+ }
108
+
109
+ validate_page_boundaries(boundaries)?;
110
+
111
+ let mut first_page = None;
112
+ let mut last_page = None;
113
+
114
+ for boundary in boundaries {
115
+ if byte_start < boundary.byte_end && byte_end > boundary.byte_start {
116
+ if first_page.is_none() {
117
+ first_page = Some(boundary.page_number);
118
+ }
119
+ last_page = Some(boundary.page_number);
120
+ }
121
+ }
122
+
123
+ Ok((first_page, last_page))
124
+ }
125
+
126
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand constructor so each fixture reads as `(start, end, page)` triples.
    fn page(byte_start: usize, byte_end: usize, page_number: usize) -> PageBoundary {
        PageBoundary {
            byte_start,
            byte_end,
            page_number,
        }
    }

    /// Two adjacent 100-byte pages, shared by several range tests.
    fn two_pages() -> Vec<PageBoundary> {
        vec![page(0, 100, 1), page(100, 200, 2)]
    }

    #[test]
    fn test_validate_page_boundaries_valid() {
        let boundaries = vec![page(0, 20, 1), page(20, 40, 2), page(40, 60, 3)];

        assert!(validate_page_boundaries(&boundaries).is_ok());
    }

    #[test]
    fn test_validate_page_boundaries_empty() {
        let boundaries: Vec<PageBoundary> = Vec::new();

        assert!(validate_page_boundaries(&boundaries).is_ok());
    }

    #[test]
    fn test_calculate_page_range_within_page() {
        let range = calculate_page_range(10, 50, &two_pages()).unwrap();

        assert_eq!(range, (Some(1), Some(1)));
    }

    #[test]
    fn test_calculate_page_range_spanning_pages() {
        let range = calculate_page_range(50, 150, &two_pages()).unwrap();

        assert_eq!(range, (Some(1), Some(2)));
    }

    #[test]
    fn test_calculate_page_range_empty_boundaries() {
        let boundaries: Vec<PageBoundary> = Vec::new();

        let range = calculate_page_range(0, 50, &boundaries).unwrap();

        assert_eq!(range, (None, None));
    }

    #[test]
    fn test_calculate_page_range_no_overlap() {
        // The chunk starts exactly where the last page ends, so nothing is touched.
        let range = calculate_page_range(200, 250, &two_pages()).unwrap();

        assert_eq!(range, (None, None));
    }

    #[test]
    fn test_calculate_page_range_three_pages() {
        let boundaries = vec![page(0, 100, 1), page(100, 200, 2), page(200, 300, 3)];

        let range = calculate_page_range(50, 250, &boundaries).unwrap();

        assert_eq!(range, (Some(1), Some(3)));
    }

    #[test]
    fn test_calculate_page_range_with_invalid_boundaries() {
        // Inverted range: start lies after end.
        let boundaries = vec![page(15, 10, 1)];

        let outcome = calculate_page_range(0, 20, &boundaries);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Invalid boundary range"));
    }

    #[test]
    fn test_page_boundaries_with_gaps() {
        // Bytes 10..15 belong to no page; gaps are permitted.
        let boundaries = vec![page(0, 10, 1), page(15, 25, 2)];

        assert!(validate_page_boundaries(&boundaries).is_ok());
    }

    #[test]
    fn test_chunk_with_same_start_and_end() {
        // A zero-length boundary is rejected.
        let boundaries = vec![page(10, 10, 1)];

        let outcome = validate_page_boundaries(&boundaries);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Invalid boundary range"));
    }
}