kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,293 @@
1
+ //! OCR backend registry.
2
+
3
+ use crate::plugins::OcrBackend;
4
+ use crate::{KreuzbergError, Result};
5
+ use std::collections::HashMap;
6
+ use std::sync::Arc;
7
+
8
+ /// Registry for OCR backend plugins.
9
+ ///
10
+ /// Manages OCR backends with backend type and language-based selection.
11
+ ///
12
+ /// # Thread Safety
13
+ ///
14
+ /// The registry is thread-safe and can be accessed concurrently from multiple threads.
15
+ ///
16
+ /// # Example
17
+ ///
18
+ /// ```rust,no_run
19
+ /// use kreuzberg::plugins::registry::OcrBackendRegistry;
20
+ /// use std::sync::Arc;
21
+ ///
22
+ /// let registry = OcrBackendRegistry::new();
23
+ /// // Register OCR backends
24
+ /// // registry.register(Arc::new(TesseractBackend::new()));
25
+ /// ```
26
+ pub struct OcrBackendRegistry {
27
+ pub(super) backends: HashMap<String, Arc<dyn OcrBackend>>,
28
+ }
29
+
30
+ impl OcrBackendRegistry {
31
+ /// Create a new OCR backend registry with default backends.
32
+ ///
33
+ /// Registers the Tesseract backend by default if the "ocr" feature is enabled.
34
+ pub fn new() -> Self {
35
+ #[cfg(feature = "ocr")]
36
+ let mut registry = Self {
37
+ backends: HashMap::new(),
38
+ };
39
+
40
+ #[cfg(not(feature = "ocr"))]
41
+ let registry = Self {
42
+ backends: HashMap::new(),
43
+ };
44
+
45
+ #[cfg(feature = "ocr")]
46
+ {
47
+ use crate::ocr::tesseract_backend::TesseractBackend;
48
+ if let Ok(backend) = TesseractBackend::new() {
49
+ let _ = registry.register(Arc::new(backend));
50
+ }
51
+ }
52
+
53
+ registry
54
+ }
55
+
56
+ /// Create a new empty OCR backend registry without default backends.
57
+ ///
58
+ /// This is useful for testing or when you want full control over backend registration.
59
+ pub fn new_empty() -> Self {
60
+ Self {
61
+ backends: HashMap::new(),
62
+ }
63
+ }
64
+
65
+ /// Register an OCR backend.
66
+ ///
67
+ /// # Arguments
68
+ ///
69
+ /// * `backend` - The OCR backend to register
70
+ ///
71
+ /// # Returns
72
+ ///
73
+ /// - `Ok(())` if registration succeeded
74
+ /// - `Err(...)` if initialization failed
75
+ ///
76
+ /// # Example
77
+ ///
78
+ /// ```rust,no_run
79
+ /// # use kreuzberg::plugins::registry::OcrBackendRegistry;
80
+ /// # use std::sync::Arc;
81
+ /// let mut registry = OcrBackendRegistry::new();
82
+ /// // let backend = Arc::new(MyOcrBackend::new());
83
+ /// // registry.register(backend)?;
84
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
85
+ /// ```
86
+ pub fn register(&mut self, backend: Arc<dyn OcrBackend>) -> Result<()> {
87
+ let name = backend.name().to_string();
88
+
89
+ super::validate_plugin_name(&name)?;
90
+
91
+ backend.initialize()?;
92
+
93
+ self.backends.insert(name, backend);
94
+ Ok(())
95
+ }
96
+
97
+ /// Get an OCR backend by name.
98
+ ///
99
+ /// # Arguments
100
+ ///
101
+ /// * `name` - Backend name
102
+ ///
103
+ /// # Returns
104
+ ///
105
+ /// The backend if found, or an error if not registered.
106
+ pub fn get(&self, name: &str) -> Result<Arc<dyn OcrBackend>> {
107
+ self.backends.get(name).cloned().ok_or_else(|| KreuzbergError::Plugin {
108
+ message: format!("OCR backend '{}' not registered", name),
109
+ plugin_name: name.to_string(),
110
+ })
111
+ }
112
+
113
+ /// Get an OCR backend that supports a specific language.
114
+ ///
115
+ /// Returns the first backend that supports the language.
116
+ ///
117
+ /// # Arguments
118
+ ///
119
+ /// * `language` - Language code (e.g., "eng", "deu")
120
+ ///
121
+ /// # Returns
122
+ ///
123
+ /// The first backend supporting the language, or an error if none found.
124
+ pub fn get_for_language(&self, language: &str) -> Result<Arc<dyn OcrBackend>> {
125
+ self.backends
126
+ .values()
127
+ .find(|backend| backend.supports_language(language))
128
+ .cloned()
129
+ .ok_or_else(|| KreuzbergError::Plugin {
130
+ message: format!("No OCR backend supports language '{}'", language),
131
+ plugin_name: language.to_string(),
132
+ })
133
+ }
134
+
135
+ /// List all registered backend names.
136
+ pub fn list(&self) -> Vec<String> {
137
+ self.backends.keys().cloned().collect()
138
+ }
139
+
140
+ /// Remove a backend from the registry.
141
+ ///
142
+ /// Calls `shutdown()` on the backend before removing.
143
+ pub fn remove(&mut self, name: &str) -> Result<()> {
144
+ if let Some(backend) = self.backends.remove(name) {
145
+ backend.shutdown()?;
146
+ }
147
+ Ok(())
148
+ }
149
+
150
+ /// Shutdown all backends and clear the registry.
151
+ pub fn shutdown_all(&mut self) -> Result<()> {
152
+ let names: Vec<_> = self.backends.keys().cloned().collect();
153
+ for name in names {
154
+ self.remove(&name)?;
155
+ }
156
+ Ok(())
157
+ }
158
+ }
159
+
160
+ impl Default for OcrBackendRegistry {
161
+ fn default() -> Self {
162
+ Self::new()
163
+ }
164
+ }
165
+
166
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::OcrConfig;
    use crate::plugins::{OcrBackend, Plugin};
    use crate::types::ExtractionResult;
    use async_trait::async_trait;

    /// Stand-in backend with a configurable name and set of languages.
    struct MockOcrBackend {
        name: String,
        languages: Vec<String>,
    }

    /// Builds a shareable mock backend from borrowed name/language strings.
    fn mock_backend(name: &str, languages: &[&str]) -> Arc<MockOcrBackend> {
        Arc::new(MockOcrBackend {
            name: name.to_owned(),
            languages: languages.iter().map(|code| (*code).to_owned()).collect(),
        })
    }

    impl Plugin for MockOcrBackend {
        fn name(&self) -> &str {
            self.name.as_str()
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl OcrBackend for MockOcrBackend {
        async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
            let result = ExtractionResult {
                content: "test".to_string(),
                mime_type: "text/plain".to_string(),
                metadata: crate::types::Metadata::default(),
                tables: vec![],
                detected_languages: None,
                chunks: None,
                images: None,
                djot_content: None,
                pages: None,
                elements: None,
            };
            Ok(result)
        }

        fn supports_language(&self, lang: &str) -> bool {
            self.languages.iter().any(|known| known.as_str() == lang)
        }

        fn backend_type(&self) -> crate::plugins::ocr::OcrBackendType {
            crate::plugins::ocr::OcrBackendType::Custom
        }
    }

    /// Registering a backend makes it reachable by name, by language and in
    /// the listing.
    #[test]
    fn test_ocr_backend_registry() {
        let mut registry = OcrBackendRegistry::new_empty();
        registry.register(mock_backend("test-ocr", &["eng", "deu"])).unwrap();

        let by_name = registry.get("test-ocr").unwrap();
        assert_eq!(by_name.name(), "test-ocr");

        let by_language = registry.get_for_language("eng").unwrap();
        assert_eq!(by_language.name(), "test-ocr");

        let listed = registry.list();
        assert_eq!(listed.len(), 1);
        assert!(listed.iter().any(|entry| entry == "test-ocr"));
    }

    /// An empty registry lists nothing.
    #[test]
    fn test_ocr_backend_registry_new_empty() {
        let registry = OcrBackendRegistry::new_empty();
        assert!(registry.list().is_empty());
    }

    /// Looking up an unknown name is an error.
    #[test]
    fn test_ocr_backend_get_missing() {
        let registry = OcrBackendRegistry::new_empty();
        assert!(registry.get("nonexistent").is_err());
    }

    /// Looking up a language nobody supports is an error.
    #[test]
    fn test_ocr_backend_get_for_language_missing() {
        let registry = OcrBackendRegistry::new_empty();
        assert!(registry.get_for_language("fra").is_err());
    }

    /// Removing the only backend leaves the registry empty.
    #[test]
    fn test_ocr_backend_remove() {
        let mut registry = OcrBackendRegistry::new_empty();
        registry.register(mock_backend("test-backend", &["eng"])).unwrap();

        registry.remove("test-backend").unwrap();
        assert!(registry.list().is_empty());
    }

    /// `shutdown_all` removes every registered backend.
    #[test]
    fn test_ocr_backend_shutdown_all() {
        let mut registry = OcrBackendRegistry::new_empty();
        registry.register(mock_backend("backend1", &["eng"])).unwrap();
        registry.register(mock_backend("backend2", &["deu"])).unwrap();

        registry.shutdown_all().unwrap();
        assert!(registry.list().is_empty());
    }
}
@@ -0,0 +1,304 @@
1
+ //! Post-processor registry implementation.
2
+
3
+ use crate::Result;
4
+ use crate::plugins::{PostProcessor, ProcessingStage};
5
+ use std::collections::{BTreeMap, HashMap};
6
+ use std::sync::Arc;
7
+
8
+ /// Registry for post-processor plugins.
9
+ ///
10
+ /// Manages post-processors organized by processing stage.
11
+ pub struct PostProcessorRegistry {
12
+ processors: HashMap<ProcessingStage, BTreeMap<i32, Vec<Arc<dyn PostProcessor>>>>,
13
+ name_index: HashMap<String, (ProcessingStage, i32)>,
14
+ }
15
+
16
+ impl PostProcessorRegistry {
17
+ /// Create a new empty post-processor registry.
18
+ pub fn new() -> Self {
19
+ Self {
20
+ processors: HashMap::new(),
21
+ name_index: HashMap::new(),
22
+ }
23
+ }
24
+
25
+ /// Register a post-processor.
26
+ ///
27
+ /// # Arguments
28
+ ///
29
+ /// * `processor` - The post-processor to register
30
+ /// * `priority` - Execution priority (higher = runs first within stage)
31
+ pub fn register(&mut self, processor: Arc<dyn PostProcessor>, priority: i32) -> Result<()> {
32
+ let name = processor.name().to_string();
33
+ let stage = processor.processing_stage();
34
+
35
+ super::validate_plugin_name(&name)?;
36
+
37
+ processor.initialize()?;
38
+
39
+ if self.name_index.contains_key(&name) {
40
+ self.remove(&name)?;
41
+ }
42
+
43
+ self.processors
44
+ .entry(stage)
45
+ .or_default()
46
+ .entry(priority)
47
+ .or_default()
48
+ .push(Arc::clone(&processor));
49
+
50
+ self.name_index.insert(name, (stage, priority));
51
+
52
+ Ok(())
53
+ }
54
+
55
+ /// Get all processors for a specific stage, in priority order.
56
+ ///
57
+ /// # Arguments
58
+ ///
59
+ /// * `stage` - The processing stage
60
+ ///
61
+ /// # Returns
62
+ ///
63
+ /// Vector of processors in priority order (highest first).
64
+ pub fn get_for_stage(&self, stage: ProcessingStage) -> Vec<Arc<dyn PostProcessor>> {
65
+ let mut result = Vec::new();
66
+
67
+ if let Some(priority_map) = self.processors.get(&stage) {
68
+ for (_priority, processors) in priority_map.iter().rev() {
69
+ for processor in processors {
70
+ result.push(Arc::clone(processor));
71
+ }
72
+ }
73
+ }
74
+
75
+ result
76
+ }
77
+
78
+ /// List all registered processor names.
79
+ pub fn list(&self) -> Vec<String> {
80
+ self.name_index.keys().cloned().collect()
81
+ }
82
+
83
+ /// Remove a processor from the registry.
84
+ pub fn remove(&mut self, name: &str) -> Result<()> {
85
+ let (stage, priority) = match self.name_index.remove(name) {
86
+ Some(location) => location,
87
+ None => return Ok(()),
88
+ };
89
+
90
+ let processor_to_shutdown = if let Some(priority_map) = self.processors.get_mut(&stage) {
91
+ let processor = priority_map.get_mut(&priority).and_then(|processors| {
92
+ processors
93
+ .iter()
94
+ .position(|p| p.name() == name)
95
+ .map(|pos| processors.remove(pos))
96
+ });
97
+
98
+ if let Some(processors) = priority_map.get(&priority)
99
+ && processors.is_empty()
100
+ {
101
+ priority_map.remove(&priority);
102
+ }
103
+
104
+ if priority_map.is_empty() {
105
+ self.processors.remove(&stage);
106
+ }
107
+ processor
108
+ } else {
109
+ None
110
+ };
111
+
112
+ if let Some(processor) = processor_to_shutdown {
113
+ processor.shutdown()?;
114
+ }
115
+
116
+ Ok(())
117
+ }
118
+
119
+ /// Shutdown all processors and clear the registry.
120
+ pub fn shutdown_all(&mut self) -> Result<()> {
121
+ let names = self.list();
122
+ for name in names {
123
+ self.remove(&name)?;
124
+ }
125
+ Ok(())
126
+ }
127
+ }
128
+
129
+ impl Default for PostProcessorRegistry {
130
+ fn default() -> Self {
131
+ Self::new()
132
+ }
133
+ }
134
+
135
#[cfg(test)]
mod tests {
    use super::*;
    use crate::KreuzbergError;
    use crate::core::config::ExtractionConfig;
    use crate::plugins::Plugin;
    use crate::types::ExtractionResult;
    use async_trait::async_trait;

    /// Do-nothing post-processor with a configurable name and stage.
    struct MockPostProcessor {
        name: String,
        stage: ProcessingStage,
    }

    impl MockPostProcessor {
        /// Build a reference-counted mock ready to hand to `register`.
        fn shared(name: &str, stage: ProcessingStage) -> Arc<Self> {
            Arc::new(Self {
                name: name.to_string(),
                stage,
            })
        }
    }

    impl Plugin for MockPostProcessor {
        fn name(&self) -> &str {
            &self.name
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl PostProcessor for MockPostProcessor {
        async fn process(&self, _result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn processing_stage(&self) -> ProcessingStage {
            self.stage
        }
    }

    /// Processors registered for different stages are retrievable per stage and by name.
    #[test]
    fn test_post_processor_registry() {
        let mut registry = PostProcessorRegistry::new();

        registry
            .register(MockPostProcessor::shared("early-processor", ProcessingStage::Early), 100)
            .unwrap();
        registry
            .register(MockPostProcessor::shared("middle-processor", ProcessingStage::Middle), 50)
            .unwrap();

        let early = registry.get_for_stage(ProcessingStage::Early);
        assert_eq!(early.len(), 1);
        assert_eq!(early[0].name(), "early-processor");

        assert_eq!(registry.get_for_stage(ProcessingStage::Middle).len(), 1);
        assert_eq!(registry.list().len(), 2);
    }

    /// Removing a processor by name takes it out of its stage.
    #[test]
    fn test_post_processor_registry_remove() {
        let mut registry = PostProcessorRegistry::new();

        registry
            .register(MockPostProcessor::shared("test-processor", ProcessingStage::Early), 50)
            .unwrap();
        assert_eq!(registry.get_for_stage(ProcessingStage::Early).len(), 1);

        registry.remove("test-processor").unwrap();
        assert_eq!(registry.get_for_stage(ProcessingStage::Early).len(), 0);
    }

    /// The `Default` implementation yields an empty registry.
    #[test]
    fn test_post_processor_registry_default() {
        let registry = PostProcessorRegistry::default();
        assert_eq!(registry.list().len(), 0);
    }

    /// An empty plugin name is rejected with a validation error.
    #[test]
    fn test_post_processor_registry_invalid_name_empty() {
        let mut registry = PostProcessorRegistry::new();

        let outcome = registry.register(MockPostProcessor::shared("", ProcessingStage::Early), 50);

        assert!(matches!(outcome, Err(KreuzbergError::Validation { .. })));
    }

    /// A plugin name containing whitespace is rejected with a validation error.
    #[test]
    fn test_post_processor_registry_invalid_name_whitespace() {
        let mut registry = PostProcessorRegistry::new();

        let outcome = registry.register(MockPostProcessor::shared("my processor", ProcessingStage::Early), 50);

        assert!(matches!(outcome, Err(KreuzbergError::Validation { .. })));
    }

    /// `shutdown_all` empties a registry holding processors from several stages.
    #[test]
    fn test_post_processor_registry_shutdown_all() {
        let mut registry = PostProcessorRegistry::new();

        registry
            .register(MockPostProcessor::shared("early", ProcessingStage::Early), 100)
            .unwrap();
        registry
            .register(MockPostProcessor::shared("late", ProcessingStage::Late), 50)
            .unwrap();
        assert_eq!(registry.list().len(), 2);

        registry.shutdown_all().unwrap();
        assert_eq!(registry.list().len(), 0);
    }

    /// Within one stage, higher priorities come first regardless of registration order.
    #[test]
    fn test_post_processor_registry_priority_order() {
        let mut registry = PostProcessorRegistry::new();

        registry
            .register(MockPostProcessor::shared("low-priority", ProcessingStage::Early), 10)
            .unwrap();
        registry
            .register(MockPostProcessor::shared("high-priority", ProcessingStage::Early), 100)
            .unwrap();

        let ordered = registry.get_for_stage(ProcessingStage::Early);
        assert_eq!(ordered.len(), 2);
        assert_eq!(ordered[0].name(), "high-priority");
        assert_eq!(ordered[1].name(), "low-priority");
    }

    /// Querying a stage with no registrations returns an empty list.
    #[test]
    fn test_post_processor_registry_empty_stage() {
        let registry = PostProcessorRegistry::new();

        assert_eq!(registry.get_for_stage(ProcessingStage::Late).len(), 0);
    }
}