kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,179 @@
1
+ //! Cache management MCP tools.
2
+
3
+ use crate::{cache, mcp::errors::map_kreuzberg_error_to_mcp};
4
+ use rmcp::{
5
+ ErrorData as McpError,
6
+ handler::server::wrapper::Parameters,
7
+ model::{CallToolResult, Content, RawContent},
8
+ tool,
9
+ };
10
+
11
+ /// MCP tool methods for cache management.
12
+ pub(in crate::mcp) trait CacheTool {
13
+ /// Get cache statistics.
14
+ ///
15
+ /// This tool returns statistics about the cache including total files, size, and disk space.
16
+ #[tool(
17
+ description = "Get cache statistics including total files, size, and available disk space.",
18
+ annotations(title = "Cache Stats", read_only_hint = true, idempotent_hint = true)
19
+ )]
20
+ fn cache_stats(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
21
+ let cache_dir = std::env::current_dir()
22
+ .unwrap_or_else(|_| std::path::PathBuf::from("."))
23
+ .join(".kreuzberg");
24
+
25
+ let stats = cache::get_cache_metadata(cache_dir.to_str().unwrap_or(".")).map_err(map_kreuzberg_error_to_mcp)?;
26
+
27
+ let response = format!(
28
+ "Cache Statistics\n\
29
+ ================\n\
30
+ Directory: {}\n\
31
+ Total files: {}\n\
32
+ Total size: {:.2} MB\n\
33
+ Available space: {:.2} MB\n\
34
+ Oldest file age: {:.2} days\n\
35
+ Newest file age: {:.2} days",
36
+ cache_dir.to_string_lossy(),
37
+ stats.total_files,
38
+ stats.total_size_mb,
39
+ stats.available_space_mb,
40
+ stats.oldest_file_age_days,
41
+ stats.newest_file_age_days
42
+ );
43
+
44
+ Ok(CallToolResult::success(vec![Content::text(response)]))
45
+ }
46
+
47
+ /// Clear the cache.
48
+ ///
49
+ /// This tool removes all cached files and returns the number of files removed and space freed.
50
+ #[tool(
51
+ description = "Clear all cached files. Returns the number of files removed and space freed in MB.",
52
+ annotations(title = "Clear Cache", destructive_hint = true)
53
+ )]
54
+ fn cache_clear(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
55
+ let cache_dir = std::env::current_dir()
56
+ .unwrap_or_else(|_| std::path::PathBuf::from("."))
57
+ .join(".kreuzberg");
58
+
59
+ let (removed_files, freed_mb) =
60
+ cache::clear_cache_directory(cache_dir.to_str().unwrap_or(".")).map_err(map_kreuzberg_error_to_mcp)?;
61
+
62
+ let response = format!(
63
+ "Cache cleared successfully\n\
64
+ Directory: {}\n\
65
+ Removed files: {}\n\
66
+ Freed space: {:.2} MB",
67
+ cache_dir.to_string_lossy(),
68
+ removed_files,
69
+ freed_mb
70
+ );
71
+
72
+ Ok(CallToolResult::success(vec![Content::text(response)]))
73
+ }
74
+ }
75
+
76
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal implementor; `CacheTool` supplies default bodies for both tools.
    struct TestMcpServer;

    impl CacheTool for TestMcpServer {}

    /// Returns the text of the first content item of `result`.
    ///
    /// Panics when the result has no content or the first item is not text, so a
    /// test can never pass vacuously on an empty or non-text response.
    fn first_text(result: &CallToolResult) -> &str {
        let content = result.content.first().expect("Expected content in result");
        match &content.raw {
            RawContent::Text(text) => &text.text,
            _ => panic!("Expected text content"),
        }
    }

    #[tokio::test]
    async fn test_cache_stats_returns_statistics() {
        let server = TestMcpServer;

        let result = server.cache_stats(Parameters(()));

        assert!(result.is_ok());
        let call_result = result.unwrap();
        let text = first_text(&call_result);
        assert!(text.contains("Cache Statistics"));
        assert!(text.contains("Directory:"));
        assert!(text.contains("Total files:"));
        assert!(text.contains("Total size:"));
        assert!(text.contains("Available space:"));
    }

    #[tokio::test]
    async fn test_cache_clear_returns_result() {
        let server = TestMcpServer;

        let result = server.cache_clear(Parameters(()));

        assert!(result.is_ok());
        let call_result = result.unwrap();
        let text = first_text(&call_result);
        assert!(text.contains("Cache cleared"));
        assert!(text.contains("Directory:"));
        assert!(text.contains("Removed files:"));
        assert!(text.contains("Freed space:"));
    }

    #[tokio::test]
    async fn test_cache_clear_is_idempotent() {
        let server = TestMcpServer;

        // Clearing an already-cleared cache must still succeed.
        let result1 = server.cache_clear(Parameters(()));
        assert!(result1.is_ok());

        let result2 = server.cache_clear(Parameters(()));
        assert!(result2.is_ok());
    }

    #[tokio::test]
    async fn test_cache_clear_returns_metrics() {
        let server = TestMcpServer;

        let result = server.cache_clear(Parameters(()));

        assert!(result.is_ok());
        let call_result = result.unwrap();
        let text = first_text(&call_result);
        assert!(text.contains("Removed files:"));
        assert!(text.contains("Freed space:"));
    }

    #[tokio::test]
    async fn test_cache_stats_returns_valid_data() {
        let server = TestMcpServer;

        let result = server.cache_stats(Parameters(()));

        assert!(result.is_ok());
        let call_result = result.unwrap();
        let text = first_text(&call_result);
        assert!(text.contains("Cache Statistics"));
        assert!(text.contains("Directory:"));
        assert!(text.contains("Total files:"));
        assert!(text.contains("Total size:"));
        assert!(text.contains("Available space:"));
        assert!(text.contains("Oldest file age:"));
        assert!(text.contains("Newest file age:"));
    }
}
@@ -0,0 +1,403 @@
1
+ //! Document extraction MCP tools.
2
+
3
+ use base64::prelude::*;
4
+ use crate::{
5
+ ExtractionConfig, batch_extract_file, batch_extract_file_sync, extract_bytes, extract_bytes_sync, extract_file,
6
+ extract_file_sync, mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
7
+ mcp::params::{BatchExtractFilesParams, ExtractBytesParams, ExtractFileParams},
8
+ };
9
+ use rmcp::{
10
+ ErrorData as McpError,
11
+ handler::server::wrapper::Parameters,
12
+ model::{CallToolResult, Content, RawContent},
13
+ tool,
14
+ };
15
+
16
+ /// MCP tool methods for document extraction.
17
+ pub(in crate::mcp) trait ExtractionTool {
18
+ /// Get reference to default config
19
+ fn default_config(&self) -> &std::sync::Arc<ExtractionConfig>;
20
+
21
+ /// Extract content from a file.
22
+ ///
23
+ /// This tool extracts text, metadata, and tables from documents in various formats
24
+ /// including PDFs, Word documents, Excel spreadsheets, images (with OCR), and more.
25
+ #[tool(
26
+ description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more.",
27
+ annotations(title = "Extract File", read_only_hint = true, idempotent_hint = true)
28
+ )]
29
+ async fn extract_file(
30
+ &self,
31
+ Parameters(params): Parameters<ExtractFileParams>,
32
+ ) -> Result<CallToolResult, McpError> {
33
+ let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
34
+
35
+ let result = if params.r#async {
36
+ extract_file(&params.path, params.mime_type.as_deref(), &config)
37
+ .await
38
+ .map_err(map_kreuzberg_error_to_mcp)?
39
+ } else {
40
+ extract_file_sync(&params.path, params.mime_type.as_deref(), &config).map_err(map_kreuzberg_error_to_mcp)?
41
+ };
42
+
43
+ let response = format_extraction_result(&result);
44
+ Ok(CallToolResult::success(vec![Content::text(response)]))
45
+ }
46
+
47
+ /// Extract content from base64-encoded bytes.
48
+ ///
49
+ /// This tool extracts text, metadata, and tables from base64-encoded document data.
50
+ #[tool(
51
+ description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables.",
52
+ annotations(title = "Extract Bytes", read_only_hint = true, idempotent_hint = true)
53
+ )]
54
+ async fn extract_bytes(
55
+ &self,
56
+ Parameters(params): Parameters<ExtractBytesParams>,
57
+ ) -> Result<CallToolResult, McpError> {
58
+ let bytes = BASE64_STANDARD
59
+ .decode(&params.data)
60
+ .map_err(|e| McpError::invalid_params(format!("Invalid base64: {}", e), None))?;
61
+
62
+ let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
63
+
64
+ let mime_type = params.mime_type.as_deref().unwrap_or("");
65
+
66
+ let result = if params.r#async {
67
+ extract_bytes(&bytes, mime_type, &config)
68
+ .await
69
+ .map_err(map_kreuzberg_error_to_mcp)?
70
+ } else {
71
+ extract_bytes_sync(&bytes, mime_type, &config).map_err(map_kreuzberg_error_to_mcp)?
72
+ };
73
+
74
+ let response = format_extraction_result(&result);
75
+ Ok(CallToolResult::success(vec![Content::text(response)]))
76
+ }
77
+
78
+ /// Extract content from multiple files in parallel.
79
+ ///
80
+ /// This tool efficiently processes multiple documents simultaneously, useful for batch operations.
81
+ #[tool(
82
+ description = "Extract content from multiple files in parallel. Returns results for all files.",
83
+ annotations(title = "Batch Extract Files", read_only_hint = true, idempotent_hint = true)
84
+ )]
85
+ async fn batch_extract_files(
86
+ &self,
87
+ Parameters(params): Parameters<BatchExtractFilesParams>,
88
+ ) -> Result<CallToolResult, McpError> {
89
+ let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
90
+
91
+ let results = if params.r#async {
92
+ batch_extract_file(params.paths.clone(), &config)
93
+ .await
94
+ .map_err(map_kreuzberg_error_to_mcp)?
95
+ } else {
96
+ batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
97
+ };
98
+
99
+ let mut response = String::new();
100
+ for (i, result) in results.iter().enumerate() {
101
+ response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
102
+ response.push_str(&format_extraction_result(result));
103
+ response.push_str("\n\n");
104
+ }
105
+
106
+ Ok(CallToolResult::success(vec![Content::text(response)]))
107
+ }
108
+ }
109
+
110
+ #[cfg(test)]
111
+ mod tests {
112
+ use super::*;
113
+ use std::path::PathBuf;
114
+
115
+ /// Get the path to a test document relative to workspace root.
116
+ fn get_test_path(relative_path: &str) -> String {
117
+ let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
118
+ .parent()
119
+ .unwrap()
120
+ .parent()
121
+ .unwrap()
122
+ .to_path_buf();
123
+
124
+ workspace_root
125
+ .join("test_documents")
126
+ .join(relative_path)
127
+ .to_string_lossy()
128
+ .to_string()
129
+ }
130
+
131
+ // Simple test struct for trait implementation
132
+ struct TestMcpServer {
133
+ config: std::sync::Arc<ExtractionConfig>,
134
+ }
135
+
136
+ impl TestMcpServer {
137
+ fn new() -> Self {
138
+ Self {
139
+ config: std::sync::Arc::new(ExtractionConfig::default()),
140
+ }
141
+ }
142
+ }
143
+
144
+ impl ExtractionTool for TestMcpServer {
145
+ fn default_config(&self) -> &std::sync::Arc<ExtractionConfig> {
146
+ &self.config
147
+ }
148
+ }
149
+
150
+ #[tokio::test]
151
+ async fn test_extract_file_sync_with_valid_pdf() {
152
+ let server = TestMcpServer::new();
153
+ let params = ExtractFileParams {
154
+ path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
155
+ mime_type: None,
156
+ enable_ocr: false,
157
+ force_ocr: false,
158
+ r#async: true,
159
+ };
160
+
161
+ let result = server.extract_file(Parameters(params)).await;
162
+
163
+ assert!(result.is_ok());
164
+ let call_result = result.unwrap();
165
+ if let Some(content) = call_result.content.first() {
166
+ match &content.raw {
167
+ RawContent::Text(text) => {
168
+ assert!(!text.text.is_empty());
169
+ assert!(text.text.contains("Content"));
170
+ }
171
+ _ => panic!("Expected text content"),
172
+ }
173
+ } else {
174
+ panic!("Expected content in result");
175
+ }
176
+ }
177
+
178
+ #[tokio::test]
179
+ async fn test_extract_file_async_with_valid_pdf() {
180
+ let server = TestMcpServer::new();
181
+ let params = ExtractFileParams {
182
+ path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
183
+ mime_type: None,
184
+ enable_ocr: false,
185
+ force_ocr: false,
186
+ r#async: true,
187
+ };
188
+
189
+ let result = server.extract_file(Parameters(params)).await;
190
+
191
+ assert!(result.is_ok());
192
+ let call_result = result.unwrap();
193
+ if let Some(content) = call_result.content.first() {
194
+ match &content.raw {
195
+ RawContent::Text(text) => {
196
+ assert!(!text.text.is_empty());
197
+ }
198
+ _ => panic!("Expected text content"),
199
+ }
200
+ } else {
201
+ panic!("Expected content in result");
202
+ }
203
+ }
204
+
205
+ #[tokio::test]
206
+ async fn test_extract_file_with_invalid_path() {
207
+ let server = TestMcpServer::new();
208
+ let params = ExtractFileParams {
209
+ path: "/nonexistent/file.pdf".to_string(),
210
+ mime_type: None,
211
+ enable_ocr: false,
212
+ force_ocr: false,
213
+ r#async: true,
214
+ };
215
+
216
+ let result = server.extract_file(Parameters(params)).await;
217
+
218
+ assert!(result.is_err());
219
+ let error = result.unwrap_err();
220
+ assert!(error.code.0 == -32602 || error.code.0 == -32603);
221
+ }
222
+
223
+ #[tokio::test]
224
+ async fn test_extract_file_with_mime_type_hint() {
225
+ let server = TestMcpServer::new();
226
+ let params = ExtractFileParams {
227
+ path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
228
+ mime_type: Some("application/pdf".to_string()),
229
+ enable_ocr: false,
230
+ force_ocr: false,
231
+ r#async: true,
232
+ };
233
+
234
+ let result = server.extract_file(Parameters(params)).await;
235
+
236
+ assert!(result.is_ok());
237
+ }
238
+
239
+ #[tokio::test]
240
+ async fn test_extract_bytes_sync_with_valid_data() {
241
+ let server = TestMcpServer::new();
242
+
243
+ let text_content = b"Hello, world!";
244
+ let encoded = BASE64_STANDARD.encode(text_content);
245
+
246
+ let params = ExtractBytesParams {
247
+ data: encoded,
248
+ mime_type: Some("text/plain".to_string()),
249
+ enable_ocr: false,
250
+ force_ocr: false,
251
+ r#async: true,
252
+ };
253
+
254
+ let result = server.extract_bytes(Parameters(params)).await;
255
+
256
+ assert!(result.is_ok());
257
+ let call_result = result.unwrap();
258
+ if let Some(content) = call_result.content.first() {
259
+ match &content.raw {
260
+ RawContent::Text(text) => {
261
+ assert!(text.text.contains("Hello, world!"));
262
+ }
263
+ _ => panic!("Expected text content"),
264
+ }
265
+ } else {
266
+ panic!("Expected content in result");
267
+ }
268
+ }
269
+
270
+ #[tokio::test]
271
+ async fn test_extract_bytes_with_invalid_base64() {
272
+ let server = TestMcpServer::new();
273
+
274
+ let params = ExtractBytesParams {
275
+ data: "not-valid-base64!!!".to_string(),
276
+ mime_type: None,
277
+ enable_ocr: false,
278
+ force_ocr: false,
279
+ r#async: true,
280
+ };
281
+
282
+ let result = server.extract_bytes(Parameters(params)).await;
283
+
284
+ assert!(result.is_err());
285
+ let error = result.unwrap_err();
286
+ assert_eq!(error.code.0, -32602);
287
+ assert!(error.message.contains("Invalid base64"));
288
+ }
289
+
290
+ #[tokio::test]
291
+ async fn test_batch_extract_files_sync_with_valid_files() {
292
+ let server = TestMcpServer::new();
293
+ let params = BatchExtractFilesParams {
294
+ paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
295
+ enable_ocr: false,
296
+ force_ocr: false,
297
+ r#async: true,
298
+ };
299
+
300
+ let result = server.batch_extract_files(Parameters(params)).await;
301
+
302
+ assert!(result.is_ok());
303
+ let call_result = result.unwrap();
304
+ if let Some(content) = call_result.content.first() {
305
+ match &content.raw {
306
+ RawContent::Text(text) => {
307
+ assert!(text.text.contains("Document 1"));
308
+ assert!(text.text.contains("tiny.pdf"));
309
+ }
310
+ _ => panic!("Expected text content"),
311
+ }
312
+ } else {
313
+ panic!("Expected content in result");
314
+ }
315
+ }
316
+
317
+ #[tokio::test]
318
+ async fn test_batch_extract_files_with_empty_list() {
319
+ let server = TestMcpServer::new();
320
+ let params = BatchExtractFilesParams {
321
+ paths: vec![],
322
+ enable_ocr: false,
323
+ force_ocr: false,
324
+ r#async: true,
325
+ };
326
+
327
+ let result = server.batch_extract_files(Parameters(params)).await;
328
+
329
+ assert!(result.is_ok());
330
+ let call_result = result.unwrap();
331
+ if let Some(content) = call_result.content.first() {
332
+ match &content.raw {
333
+ RawContent::Text(text) => {
334
+ assert!(text.text.is_empty() || text.text.trim().is_empty());
335
+ }
336
+ _ => panic!("Expected text content"),
337
+ }
338
+ } else {
339
+ panic!("Expected content in result");
340
+ }
341
+ }
342
+
343
+ #[tokio::test]
344
+ async fn test_response_includes_metadata() {
345
+ let server = TestMcpServer::new();
346
+
347
+ let test_file = get_test_path("pdfs_with_tables/tiny.pdf");
348
+
349
+ if std::path::Path::new(&test_file).exists() {
350
+ let params = ExtractFileParams {
351
+ path: test_file.to_string(),
352
+ mime_type: None,
353
+ enable_ocr: false,
354
+ force_ocr: false,
355
+ r#async: true,
356
+ };
357
+
358
+ let result = server.extract_file(Parameters(params)).await;
359
+
360
+ assert!(result.is_ok());
361
+ let call_result = result.unwrap();
362
+
363
+ if let Some(content) = call_result.content.first()
364
+ && let RawContent::Text(text) = &content.raw
365
+ {
366
+ assert!(text.text.contains("Metadata:"));
367
+ }
368
+ }
369
+ }
370
+
371
+ #[tokio::test]
372
+ async fn test_batch_extract_preserves_file_order() {
373
+ let server = TestMcpServer::new();
374
+
375
+ let file1 = get_test_path("pdfs_with_tables/tiny.pdf");
376
+ let file2 = get_test_path("pdfs_with_tables/medium.pdf");
377
+
378
+ if std::path::Path::new(&file1).exists() && std::path::Path::new(&file2).exists() {
379
+ let params = BatchExtractFilesParams {
380
+ paths: vec![file1.to_string(), file2.to_string()],
381
+ enable_ocr: false,
382
+ force_ocr: false,
383
+ r#async: true,
384
+ };
385
+
386
+ let result = server.batch_extract_files(Parameters(params)).await;
387
+
388
+ if let Ok(call_result) = result
389
+ && let Some(content) = call_result.content.first()
390
+ && let RawContent::Text(text) = &content.raw
391
+ {
392
+ assert!(text.text.contains("Document 1"));
393
+ assert!(text.text.contains("Document 2"));
394
+
395
+ let doc1_pos = text.text.find("Document 1");
396
+ let doc2_pos = text.text.find("Document 2");
397
+ if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
398
+ assert!(pos1 < pos2, "Documents should be in order");
399
+ }
400
+ }
401
+ }
402
+ }
403
+ }