kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,391 @@
1
+ //! Document extractor plugin trait.
2
+ //!
3
+ //! This module defines the trait for implementing custom document extractors.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::ExtractionConfig;
7
+ use crate::plugins::Plugin;
8
+ use crate::types::ExtractionResult;
9
+ use async_trait::async_trait;
10
+ use std::path::Path;
11
+
12
+ #[cfg(not(feature = "tokio-runtime"))]
13
+ use crate::KreuzbergError;
14
+
15
+ /// Trait for document extractor plugins.
16
+ ///
17
+ /// Implement this trait to add support for new document formats or to override
18
+ /// built-in extraction behavior with custom logic.
19
+ ///
20
+ /// # Priority System
21
+ ///
22
+ /// When multiple extractors support the same MIME type, the registry selects
23
+ /// the extractor with the highest priority value. Use this to:
24
+ /// - Override built-in extractors (priority > 50)
25
+ /// - Provide fallback extractors (priority < 50)
26
+ /// - Implement specialized extractors for specific use cases
27
+ ///
28
+ /// Default priority is 50.
29
+ ///
30
+ /// # Thread Safety
31
+ ///
32
+ /// Extractors must be thread-safe (`Send + Sync`) to support concurrent extraction.
33
+ ///
34
+ /// # Example
35
+ ///
36
+ /// ```rust
37
+ /// use kreuzberg::plugins::{Plugin, DocumentExtractor};
38
+ /// use kreuzberg::{Result, ExtractionConfig};
39
+ /// use kreuzberg::types::{ExtractionResult, Metadata};
40
+ /// use async_trait::async_trait;
41
+ /// use std::path::Path;
42
+ ///
43
+ /// /// Custom PDF extractor with premium features
44
+ /// struct PremiumPdfExtractor;
45
+ ///
46
+ /// impl Plugin for PremiumPdfExtractor {
47
+ /// fn name(&self) -> &str { "premium-pdf" }
48
+ /// fn version(&self) -> String { "2.0.0".to_string() }
49
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
50
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
51
+ /// }
52
+ ///
53
+ /// #[async_trait]
54
+ /// impl DocumentExtractor for PremiumPdfExtractor {
55
+ /// async fn extract_bytes(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
56
+ /// -> Result<ExtractionResult> {
57
+ /// // Premium extraction logic with better accuracy
58
+ /// Ok(ExtractionResult {
59
+ /// content: "Premium extracted content".to_string(),
60
+ /// mime_type: mime_type.to_string(),
61
+ /// metadata: Metadata::default(),
62
+ /// tables: vec![],
63
+ /// detected_languages: None,
64
+ /// chunks: None,
65
+ /// images: None,
66
+ /// djot_content: None,
67
+ /// pages: None,
68
+ /// elements: None,
69
+ /// })
70
+ /// }
71
+ ///
72
+ /// async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig)
73
+ /// -> Result<ExtractionResult> {
74
+ /// let bytes = std::fs::read(path)?;
75
+ /// self.extract_bytes(&bytes, mime_type, config).await
76
+ /// }
77
+ ///
78
+ /// fn supported_mime_types(&self) -> &[&str] {
79
+ /// &["application/pdf"]
80
+ /// }
81
+ ///
82
+ /// fn priority(&self) -> i32 {
83
+ /// 100 // Higher than default (50) - will be preferred
84
+ /// }
85
+ /// }
86
+ /// ```
87
+ #[async_trait]
88
+ pub trait DocumentExtractor: Plugin {
89
+ /// Extract content from a byte array.
90
+ ///
91
+ /// This is the core extraction method that processes in-memory document data.
92
+ ///
93
+ /// # Arguments
94
+ ///
95
+ /// * `content` - Raw document bytes
96
+ /// * `mime_type` - MIME type of the document (already validated)
97
+ /// * `config` - Extraction configuration
98
+ ///
99
+ /// # Returns
100
+ ///
101
+ /// An `ExtractionResult` containing the extracted content, metadata, and tables.
102
+ ///
103
+ /// # Errors
104
+ ///
105
+ /// - `KreuzbergError::Parsing` - Document parsing failed
106
+ /// - `KreuzbergError::Validation` - Invalid document structure
107
+ /// - `KreuzbergError::Io` - I/O errors (these always bubble up)
108
+ /// - `KreuzbergError::MissingDependency` - Required dependency not available
109
+ ///
110
+ /// # Example
111
+ ///
112
+ /// ```rust,no_run
113
+ /// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
114
+ /// # use kreuzberg::{Result, ExtractionConfig};
115
+ /// # use kreuzberg::types::{ExtractionResult, Metadata};
116
+ /// # use async_trait::async_trait;
117
+ /// # use std::path::Path;
118
+ /// # struct MyExtractor;
119
+ /// # impl Plugin for MyExtractor {
120
+ /// # fn name(&self) -> &str { "my-extractor" }
121
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
122
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
123
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
124
+ /// # }
125
+ /// # #[async_trait]
126
+ /// # impl DocumentExtractor for MyExtractor {
127
+ /// # fn supported_mime_types(&self) -> &[&str] { &["text/plain"] }
128
+ /// # fn priority(&self) -> i32 { 50 }
129
+ /// # async fn extract_file(&self, _: &Path, _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
130
+ /// async fn extract_bytes(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
131
+ /// -> Result<ExtractionResult> {
132
+ /// // Parse document
133
+ /// let text = String::from_utf8_lossy(content).to_string();
134
+ ///
135
+ /// // Extract metadata
136
+ /// let mut metadata = Metadata::default();
137
+ /// metadata.additional.insert("byte_count".to_string(), serde_json::json!(content.len()));
138
+ ///
139
+ /// Ok(ExtractionResult {
140
+ /// content: text,
141
+ /// mime_type: mime_type.to_string(),
142
+ /// metadata,
143
+ /// tables: vec![],
144
+ /// detected_languages: None,
145
+ /// chunks: None,
146
+ /// images: None,
147
+ /// djot_content: None,
148
+ /// pages: None,
149
+ /// elements: None,
150
+ /// })
151
+ /// }
152
+ /// # }
153
+ /// ```
154
+ async fn extract_bytes( // required method: declared without a default body
155
+ &self,
156
+ content: &[u8],
157
+ mime_type: &str,
158
+ config: &ExtractionConfig,
159
+ ) -> Result<ExtractionResult>;
160
+
161
+ /// Extract content from a file.
162
+ ///
163
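+ /// With the `tokio-runtime` feature enabled, the default implementation reads the file and calls `extract_bytes`; without that feature, the default implementation returns `KreuzbergError::Other`.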
164
+ /// Override for custom file handling, streaming, or memory optimizations.
165
+ ///
166
+ /// # Arguments
167
+ ///
168
+ /// * `path` - Path to the document file
169
+ /// * `mime_type` - MIME type of the document (already validated)
170
+ /// * `config` - Extraction configuration
171
+ ///
172
+ /// # Errors
173
+ ///
174
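+ /// Same as `extract_bytes`, plus file I/O errors. The default implementation also returns `KreuzbergError::Other` when the `tokio-runtime` feature is disabled.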
175
+ ///
176
+ /// # Example - Custom File Handling
177
+ ///
178
+ /// ```rust,no_run
179
+ /// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
180
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
181
+ /// # use kreuzberg::types::Metadata;
182
+ /// # use async_trait::async_trait;
183
+ /// # use std::path::Path;
184
+ /// # struct StreamingExtractor;
185
+ /// # impl Plugin for StreamingExtractor {
186
+ /// # fn name(&self) -> &str { "streaming" }
187
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
188
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
189
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
190
+ /// # }
191
+ /// # #[async_trait]
192
+ /// # impl DocumentExtractor for StreamingExtractor {
193
+ /// # fn supported_mime_types(&self) -> &[&str] { &["text/plain"] }
194
+ /// # fn priority(&self) -> i32 { 50 }
195
+ /// # async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
196
+ /// /// Override for memory-efficient streaming extraction
197
+ /// async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig)
198
+ /// -> Result<ExtractionResult> {
199
+ /// // Stream large files instead of loading entirely into memory
200
+ /// let mut content = String::new();
201
+ ///
202
+ /// // Use buffered reader for streaming
203
+ /// use std::io::{BufRead, BufReader};
204
+ /// let file = std::fs::File::open(path)?;
205
+ /// let reader = BufReader::new(file);
206
+ ///
207
+ /// for line in reader.lines() {
208
+ /// content.push_str(&line?);
209
+ /// content.push('\n');
210
+ /// }
211
+ ///
212
+ /// Ok(ExtractionResult {
213
+ /// content,
214
+ /// mime_type: mime_type.to_string(),
215
+ /// metadata: Metadata::default(),
216
+ /// tables: vec![],
217
+ /// detected_languages: None,
218
+ /// chunks: None,
219
+ /// images: None,
220
+ /// djot_content: None,
221
+ /// pages: None,
222
+ /// elements: None,
223
+ /// })
224
+ /// }
225
+ /// # }
226
+ /// ```
227
+ async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
228
+ #[cfg(feature = "tokio-runtime")] // compiled only with `tokio-runtime`: read the file, then delegate to `extract_bytes`
229
+ {
230
+ use crate::core::io;
231
+ let bytes = io::read_file_async(path).await?; // a read failure is propagated to the caller by `?`
232
+ self.extract_bytes(&bytes, mime_type, config).await
233
+ }
234
+ #[cfg(not(feature = "tokio-runtime"))] // compiled only without `tokio-runtime`: this default always returns an error
235
+ {
236
+ let _ = (path, mime_type, config); // references the otherwise-unused parameters (presumably to avoid unused-variable warnings)
237
+ Err(KreuzbergError::Other(
238
+ "File-based extraction requires the tokio-runtime feature".to_string(),
239
+ ))
240
+ }
241
+ }
242
+
243
+ /// Get the list of MIME types supported by this extractor.
244
+ ///
245
+ /// Can include exact MIME types and prefix patterns:
246
+ /// - Exact: `"application/pdf"`, `"text/plain"`
247
+ /// - Prefix: `"image/*"` (matches any image type)
248
+ ///
249
+ /// # Returns
250
+ ///
251
+ /// A slice of MIME type strings.
252
+ ///
253
+ /// # Example
254
+ ///
255
+ /// ```rust
256
+ /// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
257
+ /// # use kreuzberg::Result;
258
+ /// # use async_trait::async_trait;
259
+ /// # use std::path::Path;
260
+ /// # struct MultiFormatExtractor;
261
+ /// # impl Plugin for MultiFormatExtractor {
262
+ /// # fn name(&self) -> &str { "multi-format" }
263
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
264
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
265
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
266
+ /// # }
267
+ /// # use kreuzberg::{ExtractionResult, ExtractionConfig};
268
+ /// # #[async_trait]
269
+ /// # impl DocumentExtractor for MultiFormatExtractor {
270
+ /// # fn priority(&self) -> i32 { 50 }
271
+ /// # async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
272
+ /// # async fn extract_file(&self, _: &Path, _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
273
+ /// fn supported_mime_types(&self) -> &[&str] {
274
+ /// &[
275
+ /// "text/plain",
276
+ /// "text/markdown",
277
+ /// "application/json",
278
+ /// "application/xml",
279
+ /// "text/html",
280
+ /// ]
281
+ /// }
282
+ /// # }
283
+ /// ```
284
+ fn supported_mime_types(&self) -> &[&str]; // required method: declared without a default body
285
+
286
+ /// Get the priority of this extractor.
287
+ ///
288
+ /// Higher priority extractors are preferred when multiple extractors
289
+ /// support the same MIME type.
290
+ ///
291
+ /// # Priority Guidelines
292
+ ///
293
+ /// - **0-25**: Fallback/low-quality extractors
294
+ /// - **26-49**: Alternative extractors
295
+ /// - **50**: Default priority (built-in extractors)
296
+ /// - **51-75**: Premium/enhanced extractors
297
+ /// - **76-100**: Specialized/high-priority extractors
298
+ ///
299
+ /// # Returns
300
+ ///
301
+ /// Priority value (default: 50)
302
+ ///
303
+ /// # Example
304
+ ///
305
+ /// ```rust
306
+ /// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
307
+ /// # use kreuzberg::Result;
308
+ /// # use async_trait::async_trait;
309
+ /// # use std::path::Path;
310
+ /// # struct FallbackExtractor;
311
+ /// # impl Plugin for FallbackExtractor {
312
+ /// # fn name(&self) -> &str { "fallback" }
313
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
314
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
315
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
316
+ /// # }
317
+ /// # use kreuzberg::{ExtractionResult, ExtractionConfig};
318
+ /// # #[async_trait]
319
+ /// # impl DocumentExtractor for FallbackExtractor {
320
+ /// # fn supported_mime_types(&self) -> &[&str] { &["text/plain"] }
321
+ /// # async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
322
+ /// # async fn extract_file(&self, _: &Path, _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
323
+ /// fn priority(&self) -> i32 {
324
+ /// 10 // Low priority - only used as fallback
325
+ /// }
326
+ /// # }
327
+ /// ```
328
+ fn priority(&self) -> i32 {
329
+ 50 // default priority; the guidelines above list 50 as the value for built-in extractors
330
+ }
331
+
332
+ /// Optional: Check if this extractor can handle a specific file.
333
+ ///
334
+ /// Allows for more sophisticated detection beyond MIME types.
335
+ /// Defaults to `true` (rely on MIME type matching).
336
+ ///
337
+ /// # Arguments
338
+ ///
339
+ /// * `path` - Path to the file to check
340
+ /// * `mime_type` - Detected MIME type
341
+ ///
342
+ /// # Returns
343
+ ///
344
+ /// `true` if the extractor can handle this file, `false` otherwise.
345
+ ///
346
+ /// # Example
347
+ ///
348
+ /// ```rust,no_run
349
+ /// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
350
+ /// # use kreuzberg::Result;
351
+ /// # use async_trait::async_trait;
352
+ /// # use std::path::Path;
353
+ /// # struct SmartExtractor;
354
+ /// # impl Plugin for SmartExtractor {
355
+ /// # fn name(&self) -> &str { "smart" }
356
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
357
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
358
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
359
+ /// # }
360
+ /// # use kreuzberg::{ExtractionResult, ExtractionConfig};
361
+ /// # #[async_trait]
362
+ /// # impl DocumentExtractor for SmartExtractor {
363
+ /// # fn supported_mime_types(&self) -> &[&str] { &["application/pdf"] }
364
+ /// # fn priority(&self) -> i32 { 50 }
365
+ /// # async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
366
+ /// # async fn extract_file(&self, _: &Path, _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
367
+ /// /// Only handle PDFs that are searchable (have text layer)
368
+ /// fn can_handle(&self, path: &Path, mime_type: &str) -> bool {
369
+ /// if mime_type != "application/pdf" {
370
+ /// return false;
371
+ /// }
372
+ ///
373
+ /// // Check if PDF has text layer (simplified example)
374
+ /// // In real implementation, analyze PDF structure here
375
+ /// let _ = path; // Use path for PDF analysis
376
+ /// true // Simplified - always accept
377
+ /// }
378
+ /// # }
379
+ /// ```
380
+ fn can_handle(&self, _path: &Path, _mime_type: &str) -> bool {
381
+ true // default accepts every file; both parameters are intentionally ignored here
382
+ }
383
+
384
+ /// Attempt to get a reference to this extractor as a SyncExtractor.
385
+ ///
386
+ /// Returns `None` (the default) if the extractor doesn't support synchronous extraction.
387
+ /// This is used for WASM and other sync-only environments.
388
+ fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
389
+ None // default: no synchronous view; an implementor would need to override this to return one
390
+ }
391
+ }
@@ -48,6 +48,8 @@
48
48
  //! # chunks: None,
49
49
  //! # images: None,
50
50
  //! # pages: None,
51
+ //! # djot_content: None,
52
+ //! # elements: None,
51
53
  //! # })
52
54
  //! # }
53
55
  //! # async fn extract_file(&self, _: &std::path::Path, _: &str, _: &kreuzberg::ExtractionConfig)
@@ -61,6 +63,8 @@
61
63
  //! # chunks: None,
62
64
  //! # images: None,
63
65
  //! # pages: None,
66
+ //! # djot_content: None,
67
+ //! # elements: None,
64
68
  //! # })
65
69
  //! # }
66
70
  //! # fn supported_mime_types(&self) -> &[&str] { &[] }
@@ -122,7 +126,9 @@
122
126
  //! detected_languages: None,
123
127
  //! chunks: None,
124
128
  //! images: None,
129
+ //! djot_content: None,
125
130
  //! pages: None,
131
+ //! elements: None,
126
132
  //! })
127
133
  //! }
128
134
  //!
@@ -210,3 +216,10 @@ pub use ocr::{
210
216
  pub use processor::{PostProcessor, ProcessingStage, list_post_processors};
211
217
  pub use traits::Plugin;
212
218
  pub use validator::{Validator, clear_validators, list_validators, register_validator, unregister_validator};
219
+
220
+ // Re-export registry items for backward compatibility
221
+ pub use registry::{
222
+ DOCUMENT_EXTRACTOR_REGISTRY, DocumentExtractorRegistry, OCR_BACKEND_REGISTRY, OcrBackendRegistry,
223
+ POST_PROCESSOR_REGISTRY, PostProcessorRegistry, VALIDATOR_REGISTRY, ValidatorRegistry,
224
+ get_document_extractor_registry, get_ocr_backend_registry, get_post_processor_registry, get_validator_registry,
225
+ };
@@ -67,7 +67,9 @@ pub enum OcrBackendType {
67
67
  /// detected_languages: None,
68
68
  /// chunks: None,
69
69
  /// images: None,
70
+ /// djot_content: None,
70
71
  /// pages: None,
72
+ /// elements: None,
71
73
  /// })
72
74
  /// }
73
75
  ///
@@ -146,7 +148,9 @@ pub trait OcrBackend: Plugin {
146
148
  /// detected_languages: None,
147
149
  /// chunks: None,
148
150
  /// images: None,
151
+ /// djot_content: None,
149
152
  /// pages: None,
153
+ /// elements: None,
150
154
  /// })
151
155
  /// }
152
156
  /// # }
@@ -317,7 +321,9 @@ pub trait OcrBackend: Plugin {
317
321
  /// detected_languages: None,
318
322
  /// chunks: None,
319
323
  /// images: None,
324
+ /// djot_content: None,
320
325
  /// pages: None,
326
+ /// elements: None,
321
327
  /// })
322
328
  /// }
323
329
  /// fn supports_language(&self, _: &str) -> bool { true }
@@ -478,7 +484,9 @@ mod tests {
478
484
  detected_languages: None,
479
485
  chunks: None,
480
486
  images: None,
487
+ djot_content: None,
481
488
  pages: None,
489
+ elements: None,
482
490
  })
483
491
  }
484
492
 
@@ -505,6 +513,7 @@ mod tests {
505
513
  backend: "mock".to_string(),
506
514
  language: "eng".to_string(),
507
515
  tesseract_config: None,
516
+ output_format: None,
508
517
  };
509
518
 
510
519
  let result = backend.process_image(b"fake image data", &config).await.unwrap();
@@ -592,6 +601,7 @@ mod tests {
592
601
  backend: "mock".to_string(),
593
602
  language: "eng".to_string(),
594
603
  tesseract_config: None,
604
+ output_format: None,
595
605
  };
596
606
 
597
607
  let result = backend.process_file(path, &config).await.unwrap();
@@ -629,6 +639,7 @@ mod tests {
629
639
  backend: "mock".to_string(),
630
640
  language: "eng".to_string(),
631
641
  tesseract_config: None,
642
+ output_format: None,
632
643
  };
633
644
 
634
645
  let result = backend.process_image(b"", &config).await;