kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,434 @@
1
+ //! Document extractor registry management.
2
+ //!
3
+ //! This module provides functions for managing the global extractor registry.
4
+
5
+ use super::r#trait::DocumentExtractor;
6
+ use std::sync::Arc;
7
+
8
+ /// Register a document extractor with the global registry.
9
+ ///
10
+ /// The extractor will be registered for all MIME types it supports and will be
11
+ /// available for document extraction. The extractor's `name()` method is used as
12
+ /// the registration name.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `extractor` - The extractor implementation wrapped in Arc
17
+ ///
18
+ /// # Returns
19
+ ///
20
+ /// - `Ok(())` if registration succeeded
21
+ /// - `Err(...)` if validation failed or initialization failed
22
+ ///
23
+ /// # Errors
24
+ ///
25
+ /// - `KreuzbergError::Validation` - Invalid extractor name (empty or contains whitespace)
26
+ /// - Any error from the extractor's `initialize()` method
27
+ ///
28
+ /// # Example
29
+ ///
30
+ /// ```rust
31
+ /// use kreuzberg::plugins::{Plugin, DocumentExtractor, register_extractor};
32
+ /// use kreuzberg::{Result, ExtractionConfig};
33
+ /// use kreuzberg::types::{ExtractionResult, Metadata};
34
+ /// use async_trait::async_trait;
35
+ /// use std::sync::Arc;
36
+ /// use std::path::Path;
37
+ ///
38
+ /// struct CustomExtractor;
39
+ ///
40
+ /// impl Plugin for CustomExtractor {
41
+ /// fn name(&self) -> &str { "custom-extractor" }
42
+ /// fn version(&self) -> String { "1.0.0".to_string() }
43
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
44
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
45
+ /// }
46
+ ///
47
+ /// #[async_trait]
48
+ /// impl DocumentExtractor for CustomExtractor {
49
+ /// async fn extract_bytes(&self, content: &[u8], mime_type: &str, _: &ExtractionConfig)
50
+ /// -> Result<ExtractionResult> {
51
+ /// Ok(ExtractionResult {
52
+ /// content: String::from_utf8_lossy(content).to_string(),
53
+ /// mime_type: mime_type.to_string(),
54
+ /// metadata: Metadata::default(),
55
+ /// tables: vec![],
56
+ /// detected_languages: None,
57
+ /// chunks: None,
58
+ /// images: None,
59
+ /// djot_content: None,
60
+ /// pages: None,
61
+ /// elements: None,
62
+ /// })
63
+ /// }
64
+ ///
65
+ /// fn supported_mime_types(&self) -> &[&str] {
66
+ /// &["text/custom"]
67
+ /// }
68
+ /// }
69
+ ///
70
+ /// # tokio_test::block_on(async {
71
+ /// let extractor = Arc::new(CustomExtractor);
72
+ /// register_extractor(extractor)?;
73
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
74
+ /// # });
75
+ /// ```
76
+ pub fn register_extractor(extractor: Arc<dyn DocumentExtractor>) -> crate::Result<()> {
77
+ use crate::plugins::registry::get_document_extractor_registry;
78
+
79
+ let registry = get_document_extractor_registry();
80
+ let mut registry = registry
81
+ .write()
82
+ .expect("~keep Failed to acquire write lock on extractor registry"); // ~keep
83
+
84
+ registry.register(extractor)
85
+ }
86
+
87
+ /// Unregister a document extractor by name.
88
+ ///
89
+ /// Removes the extractor from the global registry and calls its `shutdown()` method.
90
+ ///
91
+ /// # Arguments
92
+ ///
93
+ /// * `name` - Name of the extractor to unregister
94
+ ///
95
+ /// # Returns
96
+ ///
97
+ /// - `Ok(())` if the extractor was unregistered or didn't exist
98
+ /// - `Err(...)` if the shutdown method failed
99
+ ///
100
+ /// # Example
101
+ ///
102
+ /// ```rust
103
+ /// use kreuzberg::plugins::unregister_extractor;
104
+ ///
105
+ /// # tokio_test::block_on(async {
106
+ /// unregister_extractor("custom-extractor")?;
107
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
108
+ /// # });
109
+ /// ```
110
+ pub fn unregister_extractor(name: &str) -> crate::Result<()> {
111
+ use crate::plugins::registry::get_document_extractor_registry;
112
+
113
+ let registry = get_document_extractor_registry();
114
+ let mut registry = registry
115
+ .write()
116
+ .expect("~keep Failed to acquire write lock on extractor registry"); // ~keep
117
+
118
+ registry.remove(name)
119
+ }
120
+
121
+ /// List all registered extractors.
122
+ ///
123
+ /// Returns the names of all extractors currently registered in the global registry.
124
+ ///
125
+ /// # Returns
126
+ ///
127
+ /// A vector of extractor names.
128
+ ///
129
+ /// # Example
130
+ ///
131
+ /// ```rust
132
+ /// use kreuzberg::plugins::list_extractors;
133
+ ///
134
+ /// # tokio_test::block_on(async {
135
+ /// let extractors = list_extractors()?;
136
+ /// for name in extractors {
137
+ /// println!("Registered extractor: {}", name);
138
+ /// }
139
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
140
+ /// # });
141
+ /// ```
142
+ pub fn list_extractors() -> crate::Result<Vec<String>> {
143
+ use crate::plugins::registry::get_document_extractor_registry;
144
+
145
+ let registry = get_document_extractor_registry();
146
+ let registry = registry
147
+ .read()
148
+ .expect("~keep Failed to acquire read lock on extractor registry"); // ~keep
149
+
150
+ Ok(registry.list())
151
+ }
152
+
153
+ /// Clear all extractors from the global registry.
154
+ ///
155
+ /// Removes all extractors and calls their `shutdown()` methods.
156
+ ///
157
+ /// # Returns
158
+ ///
159
+ /// - `Ok(())` if all extractors were cleared successfully
160
+ /// - `Err(...)` if any shutdown method failed
161
+ ///
162
+ /// # Example
163
+ ///
164
+ /// ```rust
165
+ /// use kreuzberg::plugins::clear_extractors;
166
+ ///
167
+ /// # tokio_test::block_on(async {
168
+ /// clear_extractors()?;
169
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
170
+ /// # });
171
+ /// ```
172
+ pub fn clear_extractors() -> crate::Result<()> {
173
+ use crate::plugins::registry::get_document_extractor_registry;
174
+
175
+ let registry = get_document_extractor_registry();
176
+ let mut registry = registry
177
+ .write()
178
+ .expect("~keep Failed to acquire write lock on extractor registry"); // ~keep
179
+
180
+ registry.shutdown_all()
181
+ }
182
+
#[cfg(test)]
mod tests {
    //! Tests for the global extractor registry helpers.
    //!
    //! Every test is `#[serial]` because they all mutate the same global
    //! registry.

    use super::*;
    use crate::Result;
    use crate::core::config::ExtractionConfig;
    use crate::plugins::Plugin;
    use crate::types::ExtractionResult;
    use async_trait::async_trait;
    use serial_test::serial;
    use std::sync::Arc;

    /// Name under which every `MockExtractor` registers itself.
    const MOCK_NAME: &str = "mock-extractor";

    /// Build an `ExtractionResult` that carries only content and a MIME type;
    /// every other field is left at its empty value.
    fn bare_result(content: String, mime_type: String) -> ExtractionResult {
        ExtractionResult {
            content,
            mime_type,
            metadata: crate::types::Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            djot_content: None,
            pages: None,
            elements: None,
        }
    }

    /// Well-formed extractor with configurable MIME types and priority.
    struct MockExtractor {
        mime_types: Vec<&'static str>,
        priority: i32,
    }

    impl MockExtractor {
        /// Create a shared mock that supports a single MIME type.
        fn shared(mime_type: &'static str, priority: i32) -> Arc<Self> {
            Arc::new(Self {
                mime_types: vec![mime_type],
                priority,
            })
        }
    }

    impl Plugin for MockExtractor {
        fn name(&self) -> &str {
            MOCK_NAME
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl DocumentExtractor for MockExtractor {
        async fn extract_bytes(
            &self,
            content: &[u8],
            mime_type: &str,
            _config: &ExtractionConfig,
        ) -> Result<ExtractionResult> {
            Ok(bare_result(
                String::from_utf8_lossy(content).to_string(),
                mime_type.to_string(),
            ))
        }

        fn supported_mime_types(&self) -> &[&str] {
            &self.mime_types
        }

        fn priority(&self) -> i32 {
            self.priority
        }
    }

    /// Extractor whose plugin name is chosen by the test, used to exercise
    /// name validation during registration.
    struct NamedExtractor(&'static str);

    impl Plugin for NamedExtractor {
        fn name(&self) -> &str {
            self.0
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl DocumentExtractor for NamedExtractor {
        async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
            Ok(bare_result(String::new(), String::new()))
        }

        fn supported_mime_types(&self) -> &[&str] {
            &["text/plain"]
        }
    }

    /// Assert that registering an extractor called `name` fails validation.
    fn assert_name_rejected(name: &'static str) {
        let outcome = register_extractor(Arc::new(NamedExtractor(name)));
        assert!(matches!(outcome, Err(crate::KreuzbergError::Validation { .. })));
    }

    #[test]
    #[serial]
    fn test_register_extractor() {
        let outcome = register_extractor(MockExtractor::shared("text/test-register", 50));
        assert!(outcome.is_ok());

        // Best-effort cleanup; the result is intentionally ignored.
        let _ = unregister_extractor(MOCK_NAME);
    }

    #[test]
    #[serial]
    fn test_unregister_extractor() {
        register_extractor(MockExtractor::shared("text/test-unregister", 50)).unwrap();

        let outcome = unregister_extractor(MOCK_NAME);
        assert!(outcome.is_ok());
    }

    #[test]
    #[serial]
    fn test_unregister_nonexistent_extractor() {
        // Removing an unknown name is expected to succeed.
        let outcome = unregister_extractor("nonexistent-extractor-xyz");
        assert!(outcome.is_ok());
    }

    #[test]
    #[serial]
    fn test_list_extractors() {
        clear_extractors().unwrap();

        let first = MockExtractor::shared("text/test-list-1", 50);
        let second = MockExtractor::shared("text/test-list-2", 51);

        let before = list_extractors().unwrap();
        assert_eq!(before.len(), 0);

        register_extractor(first).unwrap();
        register_extractor(second).unwrap();

        // Both mocks report the same name, so the listing holds one entry.
        let after = list_extractors().unwrap();
        assert_eq!(after.len(), 1);
        assert!(after.iter().any(|entry| entry.as_str() == MOCK_NAME));

        unregister_extractor(MOCK_NAME).unwrap();
    }

    #[test]
    #[serial]
    fn test_clear_extractors() {
        clear_extractors().unwrap();

        let first = MockExtractor::shared("text/test-clear-1", 50);
        let second = MockExtractor::shared("text/test-clear-2", 51);

        register_extractor(first).unwrap();
        register_extractor(second).unwrap();

        let outcome = clear_extractors();
        assert!(outcome.is_ok());

        let remaining = list_extractors().unwrap();
        assert_eq!(remaining.len(), 0);
    }

    #[test]
    #[serial]
    fn test_register_extractor_with_invalid_name() {
        assert_name_rejected("invalid name with spaces");
    }

    #[test]
    #[serial]
    fn test_register_extractor_with_empty_name() {
        assert_name_rejected("");
    }
}