kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,179 @@
1
+ //! Cache management MCP tools.
2
+
3
+ use crate::{cache, mcp::errors::map_kreuzberg_error_to_mcp};
4
+ use rmcp::{
5
+ ErrorData as McpError,
6
+ handler::server::wrapper::Parameters,
7
+ model::{CallToolResult, Content, RawContent},
8
+ tool,
9
+ };
10
+
11
+ /// MCP tool methods for cache management.
12
+ pub(in crate::mcp) trait CacheTool {
13
+ /// Get cache statistics.
14
+ ///
15
+ /// This tool returns statistics about the cache including total files, size, and disk space.
16
+ #[tool(
17
+ description = "Get cache statistics including total files, size, and available disk space.",
18
+ annotations(title = "Cache Stats", read_only_hint = true, idempotent_hint = true)
19
+ )]
20
+ fn cache_stats(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
21
+ let cache_dir = std::env::current_dir()
22
+ .unwrap_or_else(|_| std::path::PathBuf::from("."))
23
+ .join(".kreuzberg");
24
+
25
+ let stats = cache::get_cache_metadata(cache_dir.to_str().unwrap_or(".")).map_err(map_kreuzberg_error_to_mcp)?;
26
+
27
+ let response = format!(
28
+ "Cache Statistics\n\
29
+ ================\n\
30
+ Directory: {}\n\
31
+ Total files: {}\n\
32
+ Total size: {:.2} MB\n\
33
+ Available space: {:.2} MB\n\
34
+ Oldest file age: {:.2} days\n\
35
+ Newest file age: {:.2} days",
36
+ cache_dir.to_string_lossy(),
37
+ stats.total_files,
38
+ stats.total_size_mb,
39
+ stats.available_space_mb,
40
+ stats.oldest_file_age_days,
41
+ stats.newest_file_age_days
42
+ );
43
+
44
+ Ok(CallToolResult::success(vec![Content::text(response)]))
45
+ }
46
+
47
+ /// Clear the cache.
48
+ ///
49
+ /// This tool removes all cached files and returns the number of files removed and space freed.
50
+ #[tool(
51
+ description = "Clear all cached files. Returns the number of files removed and space freed in MB.",
52
+ annotations(title = "Clear Cache", destructive_hint = true)
53
+ )]
54
+ fn cache_clear(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
55
+ let cache_dir = std::env::current_dir()
56
+ .unwrap_or_else(|_| std::path::PathBuf::from("."))
57
+ .join(".kreuzberg");
58
+
59
+ let (removed_files, freed_mb) =
60
+ cache::clear_cache_directory(cache_dir.to_str().unwrap_or(".")).map_err(map_kreuzberg_error_to_mcp)?;
61
+
62
+ let response = format!(
63
+ "Cache cleared successfully\n\
64
+ Directory: {}\n\
65
+ Removed files: {}\n\
66
+ Freed space: {:.2} MB",
67
+ cache_dir.to_string_lossy(),
68
+ removed_files,
69
+ freed_mb
70
+ );
71
+
72
+ Ok(CallToolResult::success(vec![Content::text(response)]))
73
+ }
74
+ }
75
+
76
#[cfg(test)]
mod tests {
    use super::*;

    // Simple test struct for trait implementation
    struct TestMcpServer;

    impl CacheTool for TestMcpServer {}

    /// Unwrap a tool result and return the text of its first content item.
    ///
    /// Panics when the call failed, when the result carries no content, or when the
    /// first content item is not text, so no test can pass without checking output.
    fn expect_text(result: Result<CallToolResult, McpError>) -> String {
        assert!(result.is_ok());
        let call_result = result.unwrap();
        let Some(content) = call_result.content.first() else {
            panic!("Expected content in result");
        };
        match &content.raw {
            RawContent::Text(text) => text.text.clone(),
            _ => panic!("Expected text content"),
        }
    }

    /// Assert that every label in `labels` occurs in `text`.
    fn assert_contains_all(text: &str, labels: &[&str]) {
        for label in labels {
            assert!(text.contains(label), "missing {label:?} in output: {text}");
        }
    }

    #[tokio::test]
    async fn test_cache_stats_returns_statistics() {
        let server = TestMcpServer;

        let text = expect_text(server.cache_stats(Parameters(())));

        assert_contains_all(
            &text,
            &[
                "Cache Statistics",
                "Directory:",
                "Total files:",
                "Total size:",
                "Available space:",
            ],
        );
    }

    #[tokio::test]
    async fn test_cache_clear_returns_result() {
        let server = TestMcpServer;

        let text = expect_text(server.cache_clear(Parameters(())));

        assert_contains_all(
            &text,
            &["Cache cleared", "Directory:", "Removed files:", "Freed space:"],
        );
    }

    #[tokio::test]
    async fn test_cache_clear_is_idempotent() {
        let server = TestMcpServer;

        let result1 = server.cache_clear(Parameters(()));
        assert!(result1.is_ok());

        let result2 = server.cache_clear(Parameters(()));
        assert!(result2.is_ok());
    }

    #[tokio::test]
    async fn test_cache_clear_returns_metrics() {
        let server = TestMcpServer;

        // `expect_text` fails the test when there is no text content, instead of
        // silently skipping the assertions below.
        let text = expect_text(server.cache_clear(Parameters(())));

        assert_contains_all(&text, &["Removed files:", "Freed space:"]);
    }

    #[tokio::test]
    async fn test_cache_stats_returns_valid_data() {
        let server = TestMcpServer;

        // `expect_text` fails the test when there is no text content, instead of
        // silently skipping the assertions below.
        let text = expect_text(server.cache_stats(Parameters(())));

        assert_contains_all(
            &text,
            &[
                "Cache Statistics",
                "Directory:",
                "Total files:",
                "Total size:",
                "Available space:",
                "Oldest file age:",
                "Newest file age:",
            ],
        );
    }
}
@@ -0,0 +1,403 @@
1
+ //! Document extraction MCP tools.
2
+
3
+ use base64::prelude::*;
4
+ use crate::{
5
+ ExtractionConfig, batch_extract_file, batch_extract_file_sync, extract_bytes, extract_bytes_sync, extract_file,
6
+ extract_file_sync, mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
7
+ mcp::params::{BatchExtractFilesParams, ExtractBytesParams, ExtractFileParams},
8
+ };
9
+ use rmcp::{
10
+ ErrorData as McpError,
11
+ handler::server::wrapper::Parameters,
12
+ model::{CallToolResult, Content, RawContent},
13
+ tool,
14
+ };
15
+
16
+ /// MCP tool methods for document extraction.
17
+ pub(in crate::mcp) trait ExtractionTool {
18
+ /// Get reference to default config
19
+ fn default_config(&self) -> &std::sync::Arc<ExtractionConfig>;
20
+
21
+ /// Extract content from a file.
22
+ ///
23
+ /// This tool extracts text, metadata, and tables from documents in various formats
24
+ /// including PDFs, Word documents, Excel spreadsheets, images (with OCR), and more.
25
+ #[tool(
26
+ description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more.",
27
+ annotations(title = "Extract File", read_only_hint = true, idempotent_hint = true)
28
+ )]
29
+ async fn extract_file(
30
+ &self,
31
+ Parameters(params): Parameters<ExtractFileParams>,
32
+ ) -> Result<CallToolResult, McpError> {
33
+ let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
34
+
35
+ let result = if params.r#async {
36
+ extract_file(&params.path, params.mime_type.as_deref(), &config)
37
+ .await
38
+ .map_err(map_kreuzberg_error_to_mcp)?
39
+ } else {
40
+ extract_file_sync(&params.path, params.mime_type.as_deref(), &config).map_err(map_kreuzberg_error_to_mcp)?
41
+ };
42
+
43
+ let response = format_extraction_result(&result);
44
+ Ok(CallToolResult::success(vec![Content::text(response)]))
45
+ }
46
+
47
+ /// Extract content from base64-encoded bytes.
48
+ ///
49
+ /// This tool extracts text, metadata, and tables from base64-encoded document data.
50
+ #[tool(
51
+ description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables.",
52
+ annotations(title = "Extract Bytes", read_only_hint = true, idempotent_hint = true)
53
+ )]
54
+ async fn extract_bytes(
55
+ &self,
56
+ Parameters(params): Parameters<ExtractBytesParams>,
57
+ ) -> Result<CallToolResult, McpError> {
58
+ let bytes = BASE64_STANDARD
59
+ .decode(&params.data)
60
+ .map_err(|e| McpError::invalid_params(format!("Invalid base64: {}", e), None))?;
61
+
62
+ let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
63
+
64
+ let mime_type = params.mime_type.as_deref().unwrap_or("");
65
+
66
+ let result = if params.r#async {
67
+ extract_bytes(&bytes, mime_type, &config)
68
+ .await
69
+ .map_err(map_kreuzberg_error_to_mcp)?
70
+ } else {
71
+ extract_bytes_sync(&bytes, mime_type, &config).map_err(map_kreuzberg_error_to_mcp)?
72
+ };
73
+
74
+ let response = format_extraction_result(&result);
75
+ Ok(CallToolResult::success(vec![Content::text(response)]))
76
+ }
77
+
78
+ /// Extract content from multiple files in parallel.
79
+ ///
80
+ /// This tool efficiently processes multiple documents simultaneously, useful for batch operations.
81
+ #[tool(
82
+ description = "Extract content from multiple files in parallel. Returns results for all files.",
83
+ annotations(title = "Batch Extract Files", read_only_hint = true, idempotent_hint = true)
84
+ )]
85
+ async fn batch_extract_files(
86
+ &self,
87
+ Parameters(params): Parameters<BatchExtractFilesParams>,
88
+ ) -> Result<CallToolResult, McpError> {
89
+ let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
90
+
91
+ let results = if params.r#async {
92
+ batch_extract_file(params.paths.clone(), &config)
93
+ .await
94
+ .map_err(map_kreuzberg_error_to_mcp)?
95
+ } else {
96
+ batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
97
+ };
98
+
99
+ let mut response = String::new();
100
+ for (i, result) in results.iter().enumerate() {
101
+ response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
102
+ response.push_str(&format_extraction_result(result));
103
+ response.push_str("\n\n");
104
+ }
105
+
106
+ Ok(CallToolResult::success(vec![Content::text(response)]))
107
+ }
108
+ }
109
+
110
+ #[cfg(test)]
111
+ mod tests {
112
+ use super::*;
113
+ use std::path::PathBuf;
114
+
115
+ /// Get the path to a test document relative to workspace root.
116
+ fn get_test_path(relative_path: &str) -> String {
117
+ let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
118
+ .parent()
119
+ .unwrap()
120
+ .parent()
121
+ .unwrap()
122
+ .to_path_buf();
123
+
124
+ workspace_root
125
+ .join("test_documents")
126
+ .join(relative_path)
127
+ .to_string_lossy()
128
+ .to_string()
129
+ }
130
+
131
+ // Simple test struct for trait implementation
132
+ struct TestMcpServer {
133
+ config: std::sync::Arc<ExtractionConfig>,
134
+ }
135
+
136
+ impl TestMcpServer {
137
+ fn new() -> Self {
138
+ Self {
139
+ config: std::sync::Arc::new(ExtractionConfig::default()),
140
+ }
141
+ }
142
+ }
143
+
144
+ impl ExtractionTool for TestMcpServer {
145
+ fn default_config(&self) -> &std::sync::Arc<ExtractionConfig> {
146
+ &self.config
147
+ }
148
+ }
149
+
150
+ #[tokio::test]
151
+ async fn test_extract_file_sync_with_valid_pdf() {
152
+ let server = TestMcpServer::new();
153
+ let params = ExtractFileParams {
154
+ path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
155
+ mime_type: None,
156
+ enable_ocr: false,
157
+ force_ocr: false,
158
+ r#async: true,
159
+ };
160
+
161
+ let result = server.extract_file(Parameters(params)).await;
162
+
163
+ assert!(result.is_ok());
164
+ let call_result = result.unwrap();
165
+ if let Some(content) = call_result.content.first() {
166
+ match &content.raw {
167
+ RawContent::Text(text) => {
168
+ assert!(!text.text.is_empty());
169
+ assert!(text.text.contains("Content"));
170
+ }
171
+ _ => panic!("Expected text content"),
172
+ }
173
+ } else {
174
+ panic!("Expected content in result");
175
+ }
176
+ }
177
+
178
+ #[tokio::test]
179
+ async fn test_extract_file_async_with_valid_pdf() {
180
+ let server = TestMcpServer::new();
181
+ let params = ExtractFileParams {
182
+ path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
183
+ mime_type: None,
184
+ enable_ocr: false,
185
+ force_ocr: false,
186
+ r#async: true,
187
+ };
188
+
189
+ let result = server.extract_file(Parameters(params)).await;
190
+
191
+ assert!(result.is_ok());
192
+ let call_result = result.unwrap();
193
+ if let Some(content) = call_result.content.first() {
194
+ match &content.raw {
195
+ RawContent::Text(text) => {
196
+ assert!(!text.text.is_empty());
197
+ }
198
+ _ => panic!("Expected text content"),
199
+ }
200
+ } else {
201
+ panic!("Expected content in result");
202
+ }
203
+ }
204
+
205
+ #[tokio::test]
206
+ async fn test_extract_file_with_invalid_path() {
207
+ let server = TestMcpServer::new();
208
+ let params = ExtractFileParams {
209
+ path: "/nonexistent/file.pdf".to_string(),
210
+ mime_type: None,
211
+ enable_ocr: false,
212
+ force_ocr: false,
213
+ r#async: true,
214
+ };
215
+
216
+ let result = server.extract_file(Parameters(params)).await;
217
+
218
+ assert!(result.is_err());
219
+ let error = result.unwrap_err();
220
+ assert!(error.code.0 == -32602 || error.code.0 == -32603);
221
+ }
222
+
223
+ #[tokio::test]
224
+ async fn test_extract_file_with_mime_type_hint() {
225
+ let server = TestMcpServer::new();
226
+ let params = ExtractFileParams {
227
+ path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
228
+ mime_type: Some("application/pdf".to_string()),
229
+ enable_ocr: false,
230
+ force_ocr: false,
231
+ r#async: true,
232
+ };
233
+
234
+ let result = server.extract_file(Parameters(params)).await;
235
+
236
+ assert!(result.is_ok());
237
+ }
238
+
239
+ #[tokio::test]
240
+ async fn test_extract_bytes_sync_with_valid_data() {
241
+ let server = TestMcpServer::new();
242
+
243
+ let text_content = b"Hello, world!";
244
+ let encoded = BASE64_STANDARD.encode(text_content);
245
+
246
+ let params = ExtractBytesParams {
247
+ data: encoded,
248
+ mime_type: Some("text/plain".to_string()),
249
+ enable_ocr: false,
250
+ force_ocr: false,
251
+ r#async: true,
252
+ };
253
+
254
+ let result = server.extract_bytes(Parameters(params)).await;
255
+
256
+ assert!(result.is_ok());
257
+ let call_result = result.unwrap();
258
+ if let Some(content) = call_result.content.first() {
259
+ match &content.raw {
260
+ RawContent::Text(text) => {
261
+ assert!(text.text.contains("Hello, world!"));
262
+ }
263
+ _ => panic!("Expected text content"),
264
+ }
265
+ } else {
266
+ panic!("Expected content in result");
267
+ }
268
+ }
269
+
270
+ #[tokio::test]
271
+ async fn test_extract_bytes_with_invalid_base64() {
272
+ let server = TestMcpServer::new();
273
+
274
+ let params = ExtractBytesParams {
275
+ data: "not-valid-base64!!!".to_string(),
276
+ mime_type: None,
277
+ enable_ocr: false,
278
+ force_ocr: false,
279
+ r#async: true,
280
+ };
281
+
282
+ let result = server.extract_bytes(Parameters(params)).await;
283
+
284
+ assert!(result.is_err());
285
+ let error = result.unwrap_err();
286
+ assert_eq!(error.code.0, -32602);
287
+ assert!(error.message.contains("Invalid base64"));
288
+ }
289
+
290
+ #[tokio::test]
291
+ async fn test_batch_extract_files_sync_with_valid_files() {
292
+ let server = TestMcpServer::new();
293
+ let params = BatchExtractFilesParams {
294
+ paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
295
+ enable_ocr: false,
296
+ force_ocr: false,
297
+ r#async: true,
298
+ };
299
+
300
+ let result = server.batch_extract_files(Parameters(params)).await;
301
+
302
+ assert!(result.is_ok());
303
+ let call_result = result.unwrap();
304
+ if let Some(content) = call_result.content.first() {
305
+ match &content.raw {
306
+ RawContent::Text(text) => {
307
+ assert!(text.text.contains("Document 1"));
308
+ assert!(text.text.contains("tiny.pdf"));
309
+ }
310
+ _ => panic!("Expected text content"),
311
+ }
312
+ } else {
313
+ panic!("Expected content in result");
314
+ }
315
+ }
316
+
317
+ #[tokio::test]
318
+ async fn test_batch_extract_files_with_empty_list() {
319
+ let server = TestMcpServer::new();
320
+ let params = BatchExtractFilesParams {
321
+ paths: vec![],
322
+ enable_ocr: false,
323
+ force_ocr: false,
324
+ r#async: true,
325
+ };
326
+
327
+ let result = server.batch_extract_files(Parameters(params)).await;
328
+
329
+ assert!(result.is_ok());
330
+ let call_result = result.unwrap();
331
+ if let Some(content) = call_result.content.first() {
332
+ match &content.raw {
333
+ RawContent::Text(text) => {
334
+ assert!(text.text.is_empty() || text.text.trim().is_empty());
335
+ }
336
+ _ => panic!("Expected text content"),
337
+ }
338
+ } else {
339
+ panic!("Expected content in result");
340
+ }
341
+ }
342
+
343
+ #[tokio::test]
344
+ async fn test_response_includes_metadata() {
345
+ let server = TestMcpServer::new();
346
+
347
+ let test_file = get_test_path("pdfs_with_tables/tiny.pdf");
348
+
349
+ if std::path::Path::new(&test_file).exists() {
350
+ let params = ExtractFileParams {
351
+ path: test_file.to_string(),
352
+ mime_type: None,
353
+ enable_ocr: false,
354
+ force_ocr: false,
355
+ r#async: true,
356
+ };
357
+
358
+ let result = server.extract_file(Parameters(params)).await;
359
+
360
+ assert!(result.is_ok());
361
+ let call_result = result.unwrap();
362
+
363
+ if let Some(content) = call_result.content.first()
364
+ && let RawContent::Text(text) = &content.raw
365
+ {
366
+ assert!(text.text.contains("Metadata:"));
367
+ }
368
+ }
369
+ }
370
+
371
+ #[tokio::test]
372
+ async fn test_batch_extract_preserves_file_order() {
373
+ let server = TestMcpServer::new();
374
+
375
+ let file1 = get_test_path("pdfs_with_tables/tiny.pdf");
376
+ let file2 = get_test_path("pdfs_with_tables/medium.pdf");
377
+
378
+ if std::path::Path::new(&file1).exists() && std::path::Path::new(&file2).exists() {
379
+ let params = BatchExtractFilesParams {
380
+ paths: vec![file1.to_string(), file2.to_string()],
381
+ enable_ocr: false,
382
+ force_ocr: false,
383
+ r#async: true,
384
+ };
385
+
386
+ let result = server.batch_extract_files(Parameters(params)).await;
387
+
388
+ if let Ok(call_result) = result
389
+ && let Some(content) = call_result.content.first()
390
+ && let RawContent::Text(text) = &content.raw
391
+ {
392
+ assert!(text.text.contains("Document 1"));
393
+ assert!(text.text.contains("Document 2"));
394
+
395
+ let doc1_pos = text.text.find("Document 1");
396
+ let doc2_pos = text.text.find("Document 2");
397
+ if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
398
+ assert!(pos1 < pos2, "Documents should be in order");
399
+ }
400
+ }
401
+ }
402
+ }
403
+ }