kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,586 @@
1
+ //! Plugin registry integration tests.
2
+ //!
3
+ //! Tests the core registry APIs for all plugin types:
4
+ //! - Validator registration/unregistration
5
+ //! - Extractor registration/unregistration
6
+ //! - Registry clearing and listing
7
+ //! - Error handling and edge cases
8
+
9
+ use async_trait::async_trait;
10
+ use kreuzberg::core::config::ExtractionConfig;
11
+ use kreuzberg::plugins::registry::{DocumentExtractorRegistry, ValidatorRegistry};
12
+ use kreuzberg::plugins::{DocumentExtractor, Plugin, Validator};
13
+ use kreuzberg::types::{ExtractionResult, Metadata};
14
+ use kreuzberg::{KreuzbergError, Result};
15
+ use std::path::Path;
16
+ use std::sync::Arc;
17
+
18
+ struct MockValidator {
19
+ name: String,
20
+ should_fail: bool,
21
+ }
22
+
23
+ impl Plugin for MockValidator {
24
+ fn name(&self) -> &str {
25
+ &self.name
26
+ }
27
+
28
+ fn version(&self) -> String {
29
+ "1.0.0".to_string()
30
+ }
31
+
32
+ fn initialize(&self) -> Result<()> {
33
+ Ok(())
34
+ }
35
+
36
+ fn shutdown(&self) -> Result<()> {
37
+ Ok(())
38
+ }
39
+ }
40
+
41
+ #[async_trait]
42
+ impl Validator for MockValidator {
43
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
44
+ if self.should_fail {
45
+ Err(KreuzbergError::validation("Mock validation failed"))
46
+ } else {
47
+ Ok(())
48
+ }
49
+ }
50
+
51
+ fn priority(&self) -> i32 {
52
+ 50
53
+ }
54
+ }
55
+
56
+ struct FailingInitValidator {
57
+ name: String,
58
+ }
59
+
60
+ impl Plugin for FailingInitValidator {
61
+ fn name(&self) -> &str {
62
+ &self.name
63
+ }
64
+
65
+ fn version(&self) -> String {
66
+ "1.0.0".to_string()
67
+ }
68
+
69
+ fn initialize(&self) -> Result<()> {
70
+ Err(KreuzbergError::Plugin {
71
+ message: "Initialization failed".to_string(),
72
+ plugin_name: self.name.clone(),
73
+ })
74
+ }
75
+
76
+ fn shutdown(&self) -> Result<()> {
77
+ Ok(())
78
+ }
79
+ }
80
+
81
+ #[async_trait]
82
+ impl Validator for FailingInitValidator {
83
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
84
+ Ok(())
85
+ }
86
+ }
87
+
88
+ struct MockExtractor {
89
+ name: String,
90
+ mime_types: Vec<&'static str>,
91
+ priority: i32,
92
+ }
93
+
94
+ impl Plugin for MockExtractor {
95
+ fn name(&self) -> &str {
96
+ &self.name
97
+ }
98
+
99
+ fn version(&self) -> String {
100
+ "1.0.0".to_string()
101
+ }
102
+
103
+ fn initialize(&self) -> Result<()> {
104
+ Ok(())
105
+ }
106
+
107
+ fn shutdown(&self) -> Result<()> {
108
+ Ok(())
109
+ }
110
+ }
111
+
112
+ #[async_trait]
113
+ impl DocumentExtractor for MockExtractor {
114
+ async fn extract_bytes(
115
+ &self,
116
+ content: &[u8],
117
+ mime_type: &str,
118
+ _config: &ExtractionConfig,
119
+ ) -> Result<ExtractionResult> {
120
+ Ok(ExtractionResult {
121
+ content: format!("Extracted by {}: {}", self.name, String::from_utf8_lossy(content)),
122
+ mime_type: mime_type.to_string(),
123
+ metadata: Metadata::default(),
124
+ tables: vec![],
125
+ detected_languages: None,
126
+ chunks: None,
127
+ images: None,
128
+ })
129
+ }
130
+
131
+ async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
132
+ let content = std::fs::read(path)?;
133
+ self.extract_bytes(&content, mime_type, config).await
134
+ }
135
+
136
+ fn supported_mime_types(&self) -> &[&str] {
137
+ &self.mime_types
138
+ }
139
+
140
+ fn priority(&self) -> i32 {
141
+ self.priority
142
+ }
143
+ }
144
+
145
/// Registers one validator and checks that `list()` then reports exactly
/// that validator's name.
#[test]
fn test_validator_registration_succeeds() {
    let mut registry = ValidatorRegistry::new();

    let outcome = registry.register(Arc::new(MockValidator {
        name: String::from("test-validator"),
        should_fail: false,
    }));
    assert!(outcome.is_ok(), "Validator registration should succeed");

    let names = registry.list();
    assert_eq!(names.len(), 1, "Should have one validator");

    let expected = String::from("test-validator");
    assert!(names.contains(&expected), "Should contain registered validator");
}
165
+
166
/// Registers three distinctly named validators (one of them configured to
/// fail validation) and checks that all three names are listed.
#[test]
fn test_register_multiple_validators_succeeds() {
    let mut registry = ValidatorRegistry::new();

    // (name, should_fail) — registration order matches the listing below.
    let specs = [
        ("validator-1", false),
        ("validator-2", false),
        ("validator-3", true),
    ];
    for &(name, should_fail) in specs.iter() {
        registry
            .register(Arc::new(MockValidator {
                name: String::from(name),
                should_fail,
            }))
            .unwrap();
    }

    let names = registry.list();
    assert_eq!(names.len(), 3, "Should have three validators");
    for &(name, _) in specs.iter() {
        assert!(names.contains(&String::from(name)));
    }
}
194
+
195
/// Registers a validator, removes it by name, and checks that the registry
/// lists nothing afterwards.
#[test]
fn test_validator_unregistration_succeeds() {
    let mut registry = ValidatorRegistry::new();

    registry
        .register(Arc::new(MockValidator {
            name: String::from("temp-validator"),
            should_fail: false,
        }))
        .unwrap();
    assert_eq!(registry.list().len(), 1);

    let removal = registry.remove("temp-validator");
    assert!(removal.is_ok(), "Unregistration should succeed");

    let remaining = registry.list().len();
    assert_eq!(remaining, 0, "Registry should be empty after removal");
}
212
+
213
/// Removing a name that was never registered is expected to return `Ok`
/// (treated as a no-op) rather than an error.
#[test]
fn test_unregister_nonexistent_validator_succeeds() {
    let mut registry = ValidatorRegistry::new();

    let removal = registry.remove("nonexistent-validator");

    assert!(
        removal.is_ok(),
        "Removing non-existent validator should succeed (no-op)"
    );
}
221
+
222
/// Registering a validator whose name is the empty string must fail with a
/// `Validation` error whose message mentions "empty".
#[test]
fn test_validator_registration_with_empty_name_fails() {
    let mut registry = ValidatorRegistry::new();

    let outcome = registry.register(Arc::new(MockValidator {
        name: String::new(),
        should_fail: false,
    }));
    assert!(outcome.is_err(), "Registration with empty name should fail");

    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        assert!(message.contains("empty"), "Error should mention empty name");
    } else {
        panic!("Expected Validation error");
    }
}
242
+
243
/// Registering a validator whose name contains spaces must fail with a
/// `Validation` error whose message mentions "whitespace".
#[test]
fn test_validator_registration_with_whitespace_fails() {
    let mut registry = ValidatorRegistry::new();

    let outcome = registry.register(Arc::new(MockValidator {
        name: String::from("validator with spaces"),
        should_fail: false,
    }));
    assert!(outcome.is_err(), "Registration with whitespace should fail");

    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        assert!(message.contains("whitespace"), "Error should mention whitespace");
    } else {
        panic!("Expected Validation error");
    }
}
263
+
264
/// Registering a validator whose `initialize` hook fails must surface a
/// `Plugin` error and leave the registry empty.
#[test]
fn test_validator_registration_with_failed_init_fails() {
    let mut registry = ValidatorRegistry::new();

    let outcome = registry.register(Arc::new(FailingInitValidator {
        name: String::from("failing-validator"),
    }));
    assert!(outcome.is_err(), "Registration with failed init should fail");

    // Only the error variant matters here; its payload is not inspected.
    if !matches!(outcome, Err(KreuzbergError::Plugin { .. })) {
        panic!("Expected Plugin error");
    }

    let registered = registry.list().len();
    assert_eq!(registered, 0, "Failed validator should not be registered");
}
283
+
284
/// Registers two validators, calls `shutdown_all()`, and checks that the
/// call succeeds and the registry lists nothing afterwards.
#[test]
fn test_clear_validators_succeeds() {
    let mut registry = ValidatorRegistry::new();

    let names = ["validator-1", "validator-2"];
    for &name in names.iter() {
        registry
            .register(Arc::new(MockValidator {
                name: String::from(name),
                should_fail: false,
            }))
            .unwrap();
    }
    assert_eq!(registry.list().len(), 2);

    let outcome = registry.shutdown_all();
    assert!(outcome.is_ok(), "Clear should succeed");

    let remaining = registry.list().len();
    assert_eq!(remaining, 0, "Registry should be empty after clear");
}
306
+
307
/// Registers validators with priorities 50, 10 and 100 (in that order) and
/// checks that `get_all()` yields them from highest to lowest priority.
#[test]
fn test_get_all_validators_respects_priority() {
    /// Validator whose priority is configurable per instance.
    struct PriorityValidator {
        name: String,
        priority: i32,
    }

    impl Plugin for PriorityValidator {
        fn name(&self) -> &str {
            self.name.as_str()
        }
        fn version(&self) -> String {
            String::from("1.0.0")
        }
        fn initialize(&self) -> Result<()> {
            Ok(())
        }
        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for PriorityValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            Ok(())
        }
        fn priority(&self) -> i32 {
            self.priority
        }
    }

    let mut registry = ValidatorRegistry::new();

    // Deliberately registered out of priority order.
    let registration_order = [
        ("medium-priority", 50),
        ("low-priority", 10),
        ("high-priority", 100),
    ];
    for &(name, priority) in registration_order.iter() {
        registry
            .register(Arc::new(PriorityValidator {
                name: String::from(name),
                priority,
            }))
            .unwrap();
    }

    let all = registry.get_all();
    assert_eq!(all.len(), 3, "Should have three validators");

    let expected_order = ["high-priority", "medium-priority", "low-priority"];
    for (position, expected) in expected_order.iter().enumerate() {
        assert_eq!(all[position].name(), *expected);
    }
}
366
+
367
/// Registers one extractor and checks that `list()` then reports exactly
/// that extractor's name.
#[test]
fn test_extractor_registration_succeeds() {
    let mut registry = DocumentExtractorRegistry::new();

    let outcome = registry.register(Arc::new(MockExtractor {
        name: String::from("test-extractor"),
        mime_types: vec!["text/plain"],
        priority: 50,
    }));
    assert!(outcome.is_ok(), "Extractor registration should succeed");

    let names = registry.list();
    assert_eq!(names.len(), 1, "Should have one extractor");

    let expected = String::from("test-extractor");
    assert!(names.contains(&expected));
}
385
+
386
/// Registers an extractor for `application/pdf` and checks that looking up
/// that exact MIME type returns it.
#[test]
fn test_get_extractor_by_mime_type_succeeds() {
    let mut registry = DocumentExtractorRegistry::new();

    registry
        .register(Arc::new(MockExtractor {
            name: String::from("pdf-extractor"),
            mime_types: vec!["application/pdf"],
            priority: 50,
        }))
        .unwrap();

    let lookup = registry.get("application/pdf");
    assert!(lookup.is_ok(), "Should find extractor for PDF");

    let found = lookup.unwrap();
    assert_eq!(found.name(), "pdf-extractor");
}
403
+
404
/// Looking up a MIME type in an empty registry must fail with
/// `UnsupportedFormat` carrying the requested MIME type.
#[test]
fn test_get_extractor_for_unsupported_mime_fails() {
    let registry = DocumentExtractorRegistry::new();

    let lookup = registry.get("application/nonexistent");
    assert!(lookup.is_err(), "Should not find extractor for unsupported MIME type");

    if let Err(KreuzbergError::UnsupportedFormat(mime)) = lookup {
        assert_eq!(mime, "application/nonexistent");
    } else {
        panic!("Expected UnsupportedFormat error");
    }
}
419
+
420
/// Registers two extractors for the same MIME type with priorities 10 and
/// 100 and checks that lookup returns the one with priority 100.
#[test]
fn test_extractor_priority_selection() {
    let mut registry = DocumentExtractorRegistry::new();

    // (name, priority) — the lower-priority extractor is registered first.
    let candidates = [
        ("low-priority-extractor", 10),
        ("high-priority-extractor", 100),
    ];
    for &(name, priority) in candidates.iter() {
        registry
            .register(Arc::new(MockExtractor {
                name: String::from(name),
                mime_types: vec!["text/plain"],
                priority,
            }))
            .unwrap();
    }

    let selected = registry.get("text/plain").unwrap();
    assert_eq!(
        selected.name(),
        "high-priority-extractor",
        "Should select highest priority extractor"
    );
}
447
+
448
/// Verifies that an extractor registered for the wildcard `text/*` is found
/// for concrete `text/...` MIME types and is not found for `application/pdf`.
#[test]
fn test_extractor_wildcard_mime_matching() {
    let mut extractors = DocumentExtractorRegistry::new();

    let wildcard = Arc::new(MockExtractor {
        priority: 50,
        mime_types: vec!["text/*"],
        name: "text-extractor".to_string(),
    });
    extractors.register(wildcard).unwrap();

    // Each concrete text subtype must resolve to the wildcard extractor.
    let expected_matches = [
        ("text/plain", "Should match text/plain with text/*"),
        ("text/html", "Should match text/html with text/*"),
    ];
    for (mime, failure_message) in expected_matches {
        let lookup = extractors.get(mime);
        assert!(lookup.is_ok(), "{}", failure_message);
        assert_eq!(lookup.unwrap().name(), "text-extractor");
    }

    // A MIME type outside the `text/` family must not match.
    let unrelated = extractors.get("application/pdf");
    assert!(unrelated.is_err(), "Should not match application/pdf with text/*");
}
472
+
473
/// Verifies that removing a registered extractor by name succeeds, empties
/// the listing, and makes later lookups for its MIME type fail.
#[test]
fn test_extractor_unregistration_succeeds() {
    let mut extractors = DocumentExtractorRegistry::new();

    let temporary = Arc::new(MockExtractor {
        priority: 50,
        mime_types: vec!["text/plain"],
        name: "temp-extractor".to_string(),
    });
    extractors.register(temporary).unwrap();
    assert_eq!(extractors.list().len(), 1);

    let removal = extractors.remove("temp-extractor");
    assert!(removal.is_ok(), "Unregistration should succeed");

    let remaining = extractors.list().len();
    assert_eq!(remaining, 0, "Registry should be empty after removal");

    // The MIME type index must no longer resolve to the removed extractor.
    assert!(
        extractors.get("text/plain").is_err(),
        "Should not find extractor after removal"
    );
}
494
+
495
/// Verifies that one extractor declaring several MIME types can be looked up
/// through each of them, and that the lookups return that same extractor.
#[test]
fn test_extractor_multiple_mime_types() {
    let mut registry = DocumentExtractorRegistry::new();

    let multi = Arc::new(MockExtractor {
        priority: 50,
        mime_types: vec!["application/pdf", "application/vnd.ms-excel", "text/csv"],
        name: "multi-format-extractor".to_string(),
    });
    registry.register(multi).unwrap();

    // Every declared MIME type must be resolvable.
    assert!(registry.get("application/pdf").is_ok());
    assert!(registry.get("application/vnd.ms-excel").is_ok());
    assert!(registry.get("text/csv").is_ok());

    // Spot-check that the resolved extractor is the one that was registered.
    let pdf_handler = registry.get("application/pdf").unwrap();
    assert_eq!(pdf_handler.name(), "multi-format-extractor");

    let csv_handler = registry.get("text/csv").unwrap();
    assert_eq!(csv_handler.name(), "multi-format-extractor");
}
518
+
519
/// Verifies that `shutdown_all` succeeds on a registry holding two extractors
/// and leaves the registry listing empty.
#[test]
fn test_clear_extractors_succeeds() {
    let mut extractors = DocumentExtractorRegistry::new();

    let first = Arc::new(MockExtractor {
        priority: 50,
        mime_types: vec!["text/plain"],
        name: "extractor-1".to_string(),
    });
    let second = Arc::new(MockExtractor {
        priority: 50,
        mime_types: vec!["application/pdf"],
        name: "extractor-2".to_string(),
    });

    // Register in the same order as declared: extractor-1, then extractor-2.
    for extractor in [first, second] {
        extractors.register(extractor).unwrap();
    }
    assert_eq!(extractors.list().len(), 2);

    let cleared = extractors.shutdown_all();
    assert!(cleared.is_ok(), "Clear should succeed");

    let remaining = extractors.list().len();
    assert_eq!(remaining, 0, "Registry should be empty after clear");
}
543
+
544
/// Verifies that registering an extractor whose name is the empty string is
/// rejected with a `KreuzbergError::Validation` whose message mentions "empty".
#[test]
fn test_extractor_registration_with_empty_name_fails() {
    let mut extractors = DocumentExtractorRegistry::new();

    let nameless = Arc::new(MockExtractor {
        priority: 50,
        mime_types: vec!["text/plain"],
        name: "".to_string(),
    });

    let outcome = extractors.register(nameless);
    assert!(outcome.is_err(), "Registration with empty name should fail");

    // Only a Validation error is acceptable here.
    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        assert!(message.contains("empty"), "Error should mention empty name");
    } else {
        panic!("Expected Validation error");
    }
}
565
+
566
/// Verifies that registering an extractor whose name contains spaces is
/// rejected with a `KreuzbergError::Validation` whose message mentions
/// "whitespace".
#[test]
fn test_extractor_registration_with_whitespace_fails() {
    let mut extractors = DocumentExtractorRegistry::new();

    let spaced = Arc::new(MockExtractor {
        priority: 50,
        mime_types: vec!["text/plain"],
        name: "extractor with spaces".to_string(),
    });

    let outcome = extractors.register(spaced);
    assert!(outcome.is_err(), "Registration with whitespace should fail");

    // Only a Validation error is acceptable here.
    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        assert!(message.contains("whitespace"), "Error should mention whitespace");
    } else {
        panic!("Expected Validation error");
    }
}