kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,434 @@
1
+ //! Document extractor registry management.
2
+ //!
3
+ //! This module provides functions for managing the global extractor registry.
4
+
5
+ use super::r#trait::DocumentExtractor;
6
+ use std::sync::Arc;
7
+
8
+ /// Register a document extractor with the global registry.
9
+ ///
10
+ /// The extractor will be registered for all MIME types it supports and will be
11
+ /// available for document extraction. The extractor's `name()` method is used as
12
+ /// the registration name.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `extractor` - The extractor implementation wrapped in Arc
17
+ ///
18
+ /// # Returns
19
+ ///
20
+ /// - `Ok(())` if registration succeeded
21
+ /// - `Err(...)` if validation failed or initialization failed
22
+ ///
23
+ /// # Errors
24
+ ///
25
+ /// - `KreuzbergError::Validation` - Invalid extractor name (empty or contains whitespace)
26
+ /// - Any error from the extractor's `initialize()` method
27
+ ///
28
+ /// # Example
29
+ ///
30
+ /// ```rust
31
+ /// use kreuzberg::plugins::{Plugin, DocumentExtractor, register_extractor};
32
+ /// use kreuzberg::{Result, ExtractionConfig};
33
+ /// use kreuzberg::types::{ExtractionResult, Metadata};
34
+ /// use async_trait::async_trait;
35
+ /// use std::sync::Arc;
36
+ /// use std::path::Path;
37
+ ///
38
+ /// struct CustomExtractor;
39
+ ///
40
+ /// impl Plugin for CustomExtractor {
41
+ /// fn name(&self) -> &str { "custom-extractor" }
42
+ /// fn version(&self) -> String { "1.0.0".to_string() }
43
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
44
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
45
+ /// }
46
+ ///
47
+ /// #[async_trait]
48
+ /// impl DocumentExtractor for CustomExtractor {
49
+ /// async fn extract_bytes(&self, content: &[u8], mime_type: &str, _: &ExtractionConfig)
50
+ /// -> Result<ExtractionResult> {
51
+ /// Ok(ExtractionResult {
52
+ /// content: String::from_utf8_lossy(content).to_string(),
53
+ /// mime_type: mime_type.to_string(),
54
+ /// metadata: Metadata::default(),
55
+ /// tables: vec![],
56
+ /// detected_languages: None,
57
+ /// chunks: None,
58
+ /// images: None,
59
+ /// djot_content: None,
60
+ /// pages: None,
61
+ /// elements: None,
62
+ /// })
63
+ /// }
64
+ ///
65
+ /// fn supported_mime_types(&self) -> &[&str] {
66
+ /// &["text/custom"]
67
+ /// }
68
+ /// }
69
+ ///
70
+ /// # tokio_test::block_on(async {
71
+ /// let extractor = Arc::new(CustomExtractor);
72
+ /// register_extractor(extractor)?;
73
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
74
+ /// # });
75
+ /// ```
76
+ pub fn register_extractor(extractor: Arc<dyn DocumentExtractor>) -> crate::Result<()> {
77
+ use crate::plugins::registry::get_document_extractor_registry;
78
+
79
+ let registry = get_document_extractor_registry();
80
+ let mut registry = registry
81
+ .write()
82
+ .expect("~keep Failed to acquire write lock on extractor registry"); // ~keep
83
+
84
+ registry.register(extractor)
85
+ }
86
+
87
+ /// Unregister a document extractor by name.
88
+ ///
89
+ /// Removes the extractor from the global registry and calls its `shutdown()` method.
90
+ ///
91
+ /// # Arguments
92
+ ///
93
+ /// * `name` - Name of the extractor to unregister
94
+ ///
95
+ /// # Returns
96
+ ///
97
+ /// - `Ok(())` if the extractor was unregistered or didn't exist
98
+ /// - `Err(...)` if the shutdown method failed
99
+ ///
100
+ /// # Example
101
+ ///
102
+ /// ```rust
103
+ /// use kreuzberg::plugins::unregister_extractor;
104
+ ///
105
+ /// # tokio_test::block_on(async {
106
+ /// unregister_extractor("custom-extractor")?;
107
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
108
+ /// # });
109
+ /// ```
110
+ pub fn unregister_extractor(name: &str) -> crate::Result<()> {
111
+ use crate::plugins::registry::get_document_extractor_registry;
112
+
113
+ let registry = get_document_extractor_registry();
114
+ let mut registry = registry
115
+ .write()
116
+ .expect("~keep Failed to acquire write lock on extractor registry"); // ~keep
117
+
118
+ registry.remove(name)
119
+ }
120
+
121
+ /// List all registered extractors.
122
+ ///
123
+ /// Returns the names of all extractors currently registered in the global registry.
124
+ ///
125
+ /// # Returns
126
+ ///
127
+ /// A vector of extractor names.
128
+ ///
129
+ /// # Example
130
+ ///
131
+ /// ```rust
132
+ /// use kreuzberg::plugins::list_extractors;
133
+ ///
134
+ /// # tokio_test::block_on(async {
135
+ /// let extractors = list_extractors()?;
136
+ /// for name in extractors {
137
+ /// println!("Registered extractor: {}", name);
138
+ /// }
139
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
140
+ /// # });
141
+ /// ```
142
+ pub fn list_extractors() -> crate::Result<Vec<String>> {
143
+ use crate::plugins::registry::get_document_extractor_registry;
144
+
145
+ let registry = get_document_extractor_registry();
146
+ let registry = registry
147
+ .read()
148
+ .expect("~keep Failed to acquire read lock on extractor registry"); // ~keep
149
+
150
+ Ok(registry.list())
151
+ }
152
+
153
+ /// Clear all extractors from the global registry.
154
+ ///
155
+ /// Removes all extractors and calls their `shutdown()` methods.
156
+ ///
157
+ /// # Returns
158
+ ///
159
+ /// - `Ok(())` if all extractors were cleared successfully
160
+ /// - `Err(...)` if any shutdown method failed
161
+ ///
162
+ /// # Example
163
+ ///
164
+ /// ```rust
165
+ /// use kreuzberg::plugins::clear_extractors;
166
+ ///
167
+ /// # tokio_test::block_on(async {
168
+ /// clear_extractors()?;
169
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
170
+ /// # });
171
+ /// ```
172
+ pub fn clear_extractors() -> crate::Result<()> {
173
+ use crate::plugins::registry::get_document_extractor_registry;
174
+
175
+ let registry = get_document_extractor_registry();
176
+ let mut registry = registry
177
+ .write()
178
+ .expect("~keep Failed to acquire write lock on extractor registry"); // ~keep
179
+
180
+ registry.shutdown_all()
181
+ }
182
+
183
#[cfg(test)]
mod tests {
    //! Tests for the global extractor registry helpers.
    //!
    //! Every test is marked `#[serial]` because they all mutate the same
    //! process-wide registry.

    use super::*;
    use crate::Result;
    use crate::core::config::ExtractionConfig;
    use crate::plugins::Plugin;
    use crate::types::ExtractionResult;
    use async_trait::async_trait;
    use serial_test::serial;

    /// Name reported by every `MockExtractor`, regardless of its MIME types.
    const MOCK_NAME: &str = "mock-extractor";

    /// Build an `ExtractionResult` carrying only content and a MIME type; all
    /// other fields are default, empty, or `None`.
    fn bare_result(content: String, mime_type: String) -> ExtractionResult {
        ExtractionResult {
            content,
            mime_type,
            metadata: crate::types::Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            djot_content: None,
            pages: None,
            elements: None,
        }
    }

    /// Well-formed extractor with configurable MIME types and priority.
    struct MockExtractor {
        mime_types: Vec<&'static str>,
        priority: i32,
    }

    impl Plugin for MockExtractor {
        fn name(&self) -> &str {
            MOCK_NAME
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl DocumentExtractor for MockExtractor {
        async fn extract_bytes(
            &self,
            content: &[u8],
            mime_type: &str,
            _config: &ExtractionConfig,
        ) -> Result<ExtractionResult> {
            Ok(bare_result(
                String::from_utf8_lossy(content).to_string(),
                mime_type.to_string(),
            ))
        }

        fn supported_mime_types(&self) -> &[&str] {
            &self.mime_types
        }

        fn priority(&self) -> i32 {
            self.priority
        }
    }

    /// Extractor whose reported name is whatever string it wraps; used to feed
    /// names into `register_extractor` that are expected to fail validation.
    struct NamedExtractor(&'static str);

    impl Plugin for NamedExtractor {
        fn name(&self) -> &str {
            self.0
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl DocumentExtractor for NamedExtractor {
        async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
            Ok(bare_result(String::new(), String::new()))
        }

        fn supported_mime_types(&self) -> &[&str] {
            &["text/plain"]
        }
    }

    /// Shorthand for an `Arc`-wrapped `MockExtractor` handling a single MIME type.
    fn mock(mime_type: &'static str, priority: i32) -> Arc<MockExtractor> {
        Arc::new(MockExtractor {
            mime_types: vec![mime_type],
            priority,
        })
    }

    #[test]
    #[serial]
    fn test_register_extractor() {
        let outcome = register_extractor(mock("text/test-register", 50));
        assert!(outcome.is_ok());

        // Best-effort cleanup so later tests start without this extractor.
        let _ = unregister_extractor(MOCK_NAME);
    }

    #[test]
    #[serial]
    fn test_unregister_extractor() {
        register_extractor(mock("text/test-unregister", 50)).unwrap();

        let outcome = unregister_extractor(MOCK_NAME);
        assert!(outcome.is_ok());
    }

    #[test]
    #[serial]
    fn test_unregister_nonexistent_extractor() {
        // Removing an unknown name is expected to succeed.
        let outcome = unregister_extractor("nonexistent-extractor-xyz");
        assert!(outcome.is_ok());
    }

    #[test]
    #[serial]
    fn test_list_extractors() {
        clear_extractors().unwrap();

        let first = mock("text/test-list-1", 50);
        let second = mock("text/test-list-2", 51);

        let names_before = list_extractors().unwrap();
        assert_eq!(names_before.len(), 0);

        register_extractor(first).unwrap();
        register_extractor(second).unwrap();

        // Both mocks report the same name, so only one entry is expected.
        let names = list_extractors().unwrap();
        assert_eq!(names.len(), 1);
        assert!(names.contains(&MOCK_NAME.to_string()));

        unregister_extractor(MOCK_NAME).unwrap();
    }

    #[test]
    #[serial]
    fn test_clear_extractors() {
        clear_extractors().unwrap();

        let first = mock("text/test-clear-1", 50);
        let second = mock("text/test-clear-2", 51);

        register_extractor(first).unwrap();
        register_extractor(second).unwrap();

        let outcome = clear_extractors();
        assert!(outcome.is_ok());

        let names = list_extractors().unwrap();
        assert_eq!(names.len(), 0);
    }

    #[test]
    #[serial]
    fn test_register_extractor_with_invalid_name() {
        let extractor = Arc::new(NamedExtractor("invalid name with spaces"));
        let outcome = register_extractor(extractor);
        assert!(matches!(outcome, Err(crate::KreuzbergError::Validation { .. })));
    }

    #[test]
    #[serial]
    fn test_register_extractor_with_empty_name() {
        let extractor = Arc::new(NamedExtractor(""));
        let outcome = register_extractor(extractor);
        assert!(matches!(outcome, Err(crate::KreuzbergError::Validation { .. })));
    }
}