kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,428 @@
1
+ //! Core cache implementation with GenericCache struct.
2
+ //!
3
+ //! # Lock Poisoning Handling
4
+ //!
5
+ //! This module uses `Arc<Mutex<T>>` for thread-safe state management and implements
6
+ //! explicit lock poisoning recovery throughout all public methods:
7
+ //!
8
+ //! **What is lock poisoning?**
9
+ //! - When a thread panics while holding a Mutex, the lock becomes "poisoned"
10
+ //! - Rust marks the Mutex to indicate data may be in an inconsistent state
11
+ //! - Subsequent lock attempts return `Err(PoisonError)` instead of acquiring the lock
12
+ //!
13
+ //! **Recovery strategy:**
14
+ //! - All `.lock()` calls use `.map_err()` to convert `PoisonError` into `KreuzbergError::LockPoisoned`
15
+ //! - The error propagates to callers via `Result` returns (never `.unwrap()` on locks)
16
+ //! - Provides clear error messages indicating which mutex is poisoned
17
+ //! - Follows CLAUDE.md requirement: "Lock poisoning must be handled - never `.unwrap()` on Mutex/RwLock"
18
+ //!
19
+ //! **Affected state:**
20
+ //! - `processing_locks`: Tracks cache keys currently being processed (3 lock sites)
21
+ //! - `deleting_files`: Prevents read-during-delete race conditions (3 lock sites)
22
+ //!
23
+ //! This approach ensures that lock poisoning (rare in practice) is surfaced to users
24
+ //! rather than causing panics, maintaining system stability during concurrent operations.
25
+
26
+ use crate::error::{KreuzbergError, Result};
27
+ use serde::{Deserialize, Serialize};
28
+ use std::collections::HashSet;
29
+ use std::fs;
30
+ use std::path::{Path, PathBuf};
31
+ use std::sync::atomic::{AtomicUsize, Ordering};
32
+ use std::sync::{Arc, Mutex};
33
+ use std::time::{SystemTime, UNIX_EPOCH};
34
+
35
+ use super::cleanup::smart_cleanup_cache;
36
+
37
+ #[derive(Debug, Clone, Serialize, Deserialize)]
38
+ pub struct CacheStats {
39
+ pub total_files: usize,
40
+ pub total_size_mb: f64,
41
+ pub available_space_mb: f64,
42
+ pub oldest_file_age_days: f64,
43
+ pub newest_file_age_days: f64,
44
+ }
45
+
46
+ #[derive(Debug, Clone)]
47
+ pub(super) struct CacheEntry {
48
+ pub(super) path: PathBuf,
49
+ pub(super) size: u64,
50
+ pub(super) modified: SystemTime,
51
+ }
52
+
53
+ pub(super) struct CacheScanResult {
54
+ pub(super) stats: CacheStats,
55
+ pub(super) entries: Vec<CacheEntry>,
56
+ }
57
+
58
/// File-backed cache storing one `<key>.msgpack` payload (plus an optional `<key>.meta`
/// sidecar describing the source file) per entry inside a single directory.
pub struct GenericCache {
    /// Directory that holds this cache's files (`<root>/<cache_type>`).
    cache_dir: PathBuf,
    /// Name of the cache; also the last component of `cache_dir`.
    cache_type: String,
    /// Entries whose payload file is older than this are treated as invalid; also handed to
    /// the periodic cleanup.
    max_age_days: f64,
    /// Size limit handed to the periodic cleanup.
    max_cache_size_mb: f64,
    /// Free-space threshold handed to the periodic cleanup.
    min_free_space_mb: f64,
    /// Cache keys registered through `mark_processing` and not yet released by `mark_complete`.
    processing_locks: Arc<Mutex<HashSet<String>>>,
    /// Payload file paths that `clear` is currently deleting; `get` reports a miss for them
    /// to avoid read-during-delete races.
    deleting_files: Arc<Mutex<HashSet<PathBuf>>>,
    /// Number of successful `set` calls, used to trigger a cleanup once per 100 writes.
    write_counter: Arc<AtomicUsize>,
}
70
+
71
+ impl GenericCache {
72
+ pub fn new(
73
+ cache_type: String,
74
+ cache_dir: Option<String>,
75
+ max_age_days: f64,
76
+ max_cache_size_mb: f64,
77
+ min_free_space_mb: f64,
78
+ ) -> Result<Self> {
79
+ let cache_dir_path = if let Some(dir) = cache_dir {
80
+ PathBuf::from(dir).join(&cache_type)
81
+ } else {
82
+ // OSError/RuntimeError must bubble up - system errors need user reports ~keep
83
+ std::env::current_dir()?.join(".kreuzberg").join(&cache_type)
84
+ };
85
+
86
+ fs::create_dir_all(&cache_dir_path)
87
+ .map_err(|e| KreuzbergError::cache(format!("Failed to create cache directory: {}", e)))?;
88
+
89
+ Ok(Self {
90
+ cache_dir: cache_dir_path,
91
+ cache_type,
92
+ max_age_days,
93
+ max_cache_size_mb,
94
+ min_free_space_mb,
95
+ processing_locks: Arc::new(Mutex::new(HashSet::new())),
96
+ deleting_files: Arc::new(Mutex::new(HashSet::new())),
97
+ write_counter: Arc::new(AtomicUsize::new(0)),
98
+ })
99
+ }
100
+
101
+ fn get_cache_path(&self, cache_key: &str) -> PathBuf {
102
+ self.cache_dir.join(format!("{}.msgpack", cache_key))
103
+ }
104
+
105
+ fn get_metadata_path(&self, cache_key: &str) -> PathBuf {
106
+ self.cache_dir.join(format!("{}.meta", cache_key))
107
+ }
108
+
109
+ fn is_valid(&self, cache_path: &Path, source_file: Option<&str>) -> bool {
110
+ if !cache_path.exists() {
111
+ return false;
112
+ }
113
+
114
+ if let Ok(metadata) = fs::metadata(cache_path)
115
+ && let Ok(modified) = metadata.modified()
116
+ && let Ok(elapsed) = SystemTime::now().duration_since(modified)
117
+ {
118
+ let age_days = elapsed.as_secs() as f64 / (24.0 * 3600.0);
119
+ if age_days > self.max_age_days {
120
+ return false;
121
+ }
122
+ }
123
+
124
+ if let Some(source_path) = source_file {
125
+ let Some(file_stem) = cache_path.file_stem().and_then(|s| s.to_str()) else {
126
+ return false;
127
+ };
128
+ let meta_path = self.get_metadata_path(file_stem);
129
+
130
+ if meta_path.exists() {
131
+ if let Ok(meta_metadata) = fs::metadata(&meta_path)
132
+ && meta_metadata.len() == 16
133
+ && let Ok(cached_meta_bytes) = fs::read(&meta_path)
134
+ {
135
+ let cached_size = u64::from_le_bytes([
136
+ cached_meta_bytes[0],
137
+ cached_meta_bytes[1],
138
+ cached_meta_bytes[2],
139
+ cached_meta_bytes[3],
140
+ cached_meta_bytes[4],
141
+ cached_meta_bytes[5],
142
+ cached_meta_bytes[6],
143
+ cached_meta_bytes[7],
144
+ ]);
145
+ let cached_mtime = u64::from_le_bytes([
146
+ cached_meta_bytes[8],
147
+ cached_meta_bytes[9],
148
+ cached_meta_bytes[10],
149
+ cached_meta_bytes[11],
150
+ cached_meta_bytes[12],
151
+ cached_meta_bytes[13],
152
+ cached_meta_bytes[14],
153
+ cached_meta_bytes[15],
154
+ ]);
155
+
156
+ if let Ok(source_metadata) = fs::metadata(source_path) {
157
+ let current_size = source_metadata.len();
158
+ let Some(current_mtime) = source_metadata
159
+ .modified()
160
+ .ok()
161
+ .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
162
+ .map(|d| d.as_secs())
163
+ else {
164
+ return false;
165
+ };
166
+
167
+ return cached_size == current_size && cached_mtime == current_mtime;
168
+ }
169
+ }
170
+ return false;
171
+ }
172
+ }
173
+
174
+ true
175
+ }
176
+
177
+ fn save_metadata(&self, cache_key: &str, source_file: Option<&str>) {
178
+ if let Some(source_path) = source_file
179
+ && let Ok(metadata) = fs::metadata(source_path)
180
+ {
181
+ let size = metadata.len();
182
+ let Some(mtime) = metadata
183
+ .modified()
184
+ .ok()
185
+ .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
186
+ .map(|d| d.as_secs())
187
+ else {
188
+ return;
189
+ };
190
+
191
+ let mut bytes = Vec::with_capacity(16);
192
+ bytes.extend_from_slice(&size.to_le_bytes());
193
+ bytes.extend_from_slice(&mtime.to_le_bytes());
194
+
195
+ let meta_path = self.get_metadata_path(cache_key);
196
+ // Cache metadata write failure - safe to ignore, cache is optional fallback ~keep
197
+ let _ = fs::write(meta_path, bytes);
198
+ }
199
+ }
200
+
201
+ #[cfg_attr(feature = "otel", tracing::instrument(
202
+ skip(self),
203
+ fields(
204
+ cache.hit = tracing::field::Empty,
205
+ cache.key = %cache_key,
206
+ )
207
+ ))]
208
+ pub fn get(&self, cache_key: &str, source_file: Option<&str>) -> Result<Option<Vec<u8>>> {
209
+ let cache_path = self.get_cache_path(cache_key);
210
+
211
+ {
212
+ let deleting = self
213
+ .deleting_files
214
+ .lock()
215
+ .map_err(|e| KreuzbergError::LockPoisoned(format!("Deleting files mutex poisoned: {}", e)))?;
216
+ if deleting.contains(&cache_path) {
217
+ #[cfg(feature = "otel")]
218
+ tracing::Span::current().record("cache.hit", false);
219
+ return Ok(None);
220
+ }
221
+ }
222
+
223
+ if !self.is_valid(&cache_path, source_file) {
224
+ #[cfg(feature = "otel")]
225
+ tracing::Span::current().record("cache.hit", false);
226
+ return Ok(None);
227
+ }
228
+
229
+ match fs::read(&cache_path) {
230
+ Ok(content) => {
231
+ #[cfg(feature = "otel")]
232
+ tracing::Span::current().record("cache.hit", true);
233
+ Ok(Some(content))
234
+ }
235
+ Err(_) => {
236
+ // Best-effort cleanup of corrupted cache files ~keep
237
+ if let Err(e) = fs::remove_file(&cache_path) {
238
+ tracing::debug!("Failed to remove corrupted cache file: {}", e);
239
+ }
240
+ if let Err(e) = fs::remove_file(self.get_metadata_path(cache_key)) {
241
+ tracing::debug!("Failed to remove corrupted metadata file: {}", e);
242
+ }
243
+ #[cfg(feature = "otel")]
244
+ tracing::Span::current().record("cache.hit", false);
245
+ Ok(None)
246
+ }
247
+ }
248
+ }
249
+
250
+ #[cfg_attr(feature = "otel", tracing::instrument(
251
+ skip(self, data),
252
+ fields(
253
+ cache.key = %cache_key,
254
+ cache.size_bytes = data.len(),
255
+ )
256
+ ))]
257
+ pub fn set(&self, cache_key: &str, data: Vec<u8>, source_file: Option<&str>) -> Result<()> {
258
+ let cache_path = self.get_cache_path(cache_key);
259
+
260
+ fs::write(&cache_path, &data)
261
+ .map_err(|e| KreuzbergError::cache(format!("Failed to write cache file: {}", e)))?;
262
+
263
+ self.save_metadata(cache_key, source_file);
264
+
265
+ let count = self.write_counter.fetch_add(1, Ordering::Relaxed);
266
+ if count.is_multiple_of(100)
267
+ && let Some(cache_path_str) = self.cache_dir.to_str()
268
+ {
269
+ // Cache cleanup failure - safe to ignore, cache is optional fallback ~keep
270
+ let _ = smart_cleanup_cache(
271
+ cache_path_str,
272
+ self.max_age_days,
273
+ self.max_cache_size_mb,
274
+ self.min_free_space_mb,
275
+ );
276
+ }
277
+
278
+ Ok(())
279
+ }
280
+
281
+ pub fn is_processing(&self, cache_key: &str) -> Result<bool> {
282
+ // OSError/RuntimeError must bubble up - system errors need user reports ~keep
283
+ let locks = self
284
+ .processing_locks
285
+ .lock()
286
+ .map_err(|e| KreuzbergError::LockPoisoned(format!("Processing locks mutex poisoned: {}", e)))?;
287
+ Ok(locks.contains(cache_key))
288
+ }
289
+
290
+ pub fn mark_processing(&self, cache_key: String) -> Result<()> {
291
+ // OSError/RuntimeError must bubble up - system errors need user reports ~keep
292
+ let mut locks = self
293
+ .processing_locks
294
+ .lock()
295
+ .map_err(|e| KreuzbergError::LockPoisoned(format!("Processing locks mutex poisoned: {}", e)))?;
296
+ locks.insert(cache_key);
297
+ Ok(())
298
+ }
299
+
300
+ pub fn mark_complete(&self, cache_key: &str) -> Result<()> {
301
+ // OSError/RuntimeError must bubble up - system errors need user reports ~keep
302
+ let mut locks = self
303
+ .processing_locks
304
+ .lock()
305
+ .map_err(|e| KreuzbergError::LockPoisoned(format!("Processing locks mutex poisoned: {}", e)))?;
306
+ locks.remove(cache_key);
307
+ Ok(())
308
+ }
309
+
310
+ /// Mark a file path as being deleted to prevent concurrent reads.
311
+ ///
312
+ /// # TOCTOU Race Condition
313
+ ///
314
+ /// There is a Time-Of-Check-To-Time-Of-Use (TOCTOU) race condition between:
315
+ /// 1. Iterating directory entries in `clear()` (getting path/metadata)
316
+ /// 2. Marking the file for deletion here
317
+ /// 3. Actually deleting the file
318
+ ///
319
+ /// **Race scenario:**
320
+ /// - Thread A: Begins iterating in `clear()`, gets path
321
+ /// - Thread B: Calls `get()`, checks `deleting_files` (not marked yet), proceeds
322
+ /// - Thread A: Calls `mark_for_deletion()` here
323
+ /// - Thread A: Deletes file with `fs::remove_file()`
324
+ /// - Thread B: Tries to read file, but it's already deleted
325
+ ///
326
+ /// **Why this is acceptable:**
327
+ /// - Cache operations are best-effort optimizations, not critical
328
+ /// - `get()` already handles file read failures gracefully (treats as cache miss)
329
+ /// - The worst case is a failed read → cache miss → recomputation
330
+ /// - No data corruption or invariant violations occur
331
+ /// - Alternative (atomic operation) would require complex locking impacting performance
332
+ fn mark_for_deletion(&self, path: &Path) -> Result<()> {
333
+ let mut deleting = self
334
+ .deleting_files
335
+ .lock()
336
+ .map_err(|e| KreuzbergError::LockPoisoned(format!("Deleting files mutex poisoned: {}", e)))?;
337
+ deleting.insert(path.to_path_buf());
338
+ Ok(())
339
+ }
340
+
341
+ /// Remove a file path from the deletion set
342
+ fn unmark_deletion(&self, path: &Path) -> Result<()> {
343
+ let mut deleting = self
344
+ .deleting_files
345
+ .lock()
346
+ .map_err(|e| KreuzbergError::LockPoisoned(format!("Deleting files mutex poisoned: {}", e)))?;
347
+ deleting.remove(&path.to_path_buf());
348
+ Ok(())
349
+ }
350
+
351
+ pub fn clear(&self) -> Result<(usize, f64)> {
352
+ let dir_path = &self.cache_dir;
353
+
354
+ if !dir_path.exists() {
355
+ return Ok((0, 0.0));
356
+ }
357
+
358
+ let mut removed_count = 0;
359
+ let mut removed_size = 0.0;
360
+
361
+ let read_dir = fs::read_dir(dir_path)
362
+ .map_err(|e| KreuzbergError::cache(format!("Failed to read cache directory: {}", e)))?;
363
+
364
+ for entry in read_dir {
365
+ let entry = match entry {
366
+ Ok(e) => e,
367
+ Err(e) => {
368
+ tracing::debug!("Error reading entry: {}", e);
369
+ continue;
370
+ }
371
+ };
372
+
373
+ let metadata = match entry.metadata() {
374
+ Ok(m) if m.is_file() => m,
375
+ _ => continue,
376
+ };
377
+
378
+ let path = entry.path();
379
+ if path.extension().and_then(|s| s.to_str()) != Some("msgpack") {
380
+ continue;
381
+ }
382
+
383
+ let size_mb = metadata.len() as f64 / (1024.0 * 1024.0);
384
+
385
+ // Mark file for deletion to prevent concurrent access ~keep
386
+ if let Err(e) = self.mark_for_deletion(&path) {
387
+ tracing::debug!("Failed to mark file for deletion: {} (continuing anyway)", e);
388
+ }
389
+
390
+ match fs::remove_file(&path) {
391
+ Ok(_) => {
392
+ removed_count += 1;
393
+ removed_size += size_mb;
394
+ // Unmark after successful deletion ~keep
395
+ if let Err(e) = self.unmark_deletion(&path) {
396
+ tracing::debug!("Failed to unmark deleted file: {} (non-critical)", e);
397
+ }
398
+ }
399
+ Err(e) => {
400
+ tracing::debug!("Failed to remove {:?}: {}", path, e);
401
+ // Unmark after failed deletion to allow retries ~keep
402
+ if let Err(e) = self.unmark_deletion(&path) {
403
+ tracing::debug!("Failed to unmark file after deletion error: {} (non-critical)", e);
404
+ }
405
+ }
406
+ }
407
+ }
408
+
409
+ Ok((removed_count, removed_size))
410
+ }
411
+
412
+ pub fn get_stats(&self) -> Result<CacheStats> {
413
+ use super::cleanup::get_cache_metadata;
414
+ let cache_path_str = self
415
+ .cache_dir
416
+ .to_str()
417
+ .ok_or_else(|| KreuzbergError::validation("Cache directory path contains invalid UTF-8".to_string()))?;
418
+ get_cache_metadata(cache_path_str)
419
+ }
420
+
421
+ pub fn cache_dir(&self) -> &Path {
422
+ &self.cache_dir
423
+ }
424
+
425
+ pub fn cache_type(&self) -> &str {
426
+ &self.cache_type
427
+ }
428
+ }