kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,237 @@
1
+ //! Keyword extraction module.
2
+ //!
3
+ //! Provides unified keyword extraction interface supporting multiple algorithms:
4
+ //! - YAKE (Yet Another Keyword Extractor) - statistical approach
5
+ //! - RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based
6
+ //!
7
+ //! # Feature Flags
8
+ //!
9
+ //! - `keywords-yake`: Enable YAKE algorithm
10
+ //! - `keywords-rake`: Enable RAKE algorithm
11
+ //! - `keywords`: Enable both algorithms (default in `full` feature)
12
+ //!
13
+ //! # Examples
14
+ //!
15
+ //! ```rust,no_run
16
+ //! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
17
+ //! let text = "Rust is a systems programming language focused on safety and performance.";
18
+ //!
19
+ //! // Use default algorithm (YAKE if available)
20
+ //! let config = KeywordConfig::default();
21
+ //! let keywords = extract_keywords(text, &config).unwrap();
22
+ //!
23
+ //! for keyword in keywords {
24
+ //! println!("{}: {:.3}", keyword.text, keyword.score);
25
+ //! }
26
+ //! ```
27
+ //!
28
+ //! ```rust,no_run
29
+ //! # #[cfg(feature = "keywords-rake")]
30
+ //! # {
31
+ //! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
32
+ //! // Use RAKE algorithm explicitly
33
+ //! let text = "Machine learning models require large datasets.";
34
+ //! let config = KeywordConfig::rake()
35
+ //! .with_max_keywords(5)
36
+ //! .with_min_score(0.3);
37
+ //!
38
+ //! let keywords = extract_keywords(text, &config).unwrap();
39
+ //! # }
40
+ //! ```
41
+
42
+ use crate::Result;
43
+ use crate::plugins::registry::get_post_processor_registry;
44
+ use once_cell::sync::Lazy;
45
+ use std::sync::Arc;
46
+
47
+ pub mod config;
48
+ pub mod processor;
49
+ pub mod types;
50
+
51
+ #[cfg(feature = "keywords-yake")]
52
+ mod yake;
53
+
54
+ #[cfg(feature = "keywords-rake")]
55
+ mod rake;
56
+
57
+ pub use config::KeywordConfig;
58
+ pub use processor::KeywordExtractor;
59
+
60
+ #[cfg(feature = "keywords-rake")]
61
+ pub use config::RakeParams;
62
+
63
+ #[cfg(feature = "keywords-yake")]
64
+ pub use config::YakeParams;
65
+ pub use types::{Keyword, KeywordAlgorithm};
66
+
67
+ /// Extract keywords from text using the specified algorithm.
68
+ ///
69
+ /// This is the unified entry point for keyword extraction. The algorithm
70
+ /// used is determined by `config.algorithm`.
71
+ ///
72
+ /// # Arguments
73
+ ///
74
+ /// * `text` - The text to extract keywords from
75
+ /// * `config` - Keyword extraction configuration
76
+ ///
77
+ /// # Returns
78
+ ///
79
+ /// A vector of keywords sorted by relevance (highest score first).
80
+ ///
81
+ /// # Errors
82
+ ///
83
+ /// Returns an error if:
84
+ /// - The specified algorithm feature is not enabled
85
+ /// - Keyword extraction fails
86
+ ///
87
+ /// # Examples
88
+ ///
89
+ /// ```rust,no_run
90
+ /// # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
91
+ /// let text = "Document intelligence with Rust provides memory safety.";
92
+ /// let config = KeywordConfig::default()
93
+ /// .with_max_keywords(10)
94
+ /// .with_language("en");
95
+ ///
96
+ /// let keywords = extract_keywords(text, &config)?;
97
+ ///
98
+ /// for keyword in keywords {
99
+ /// println!("{}: {:.3}", keyword.text, keyword.score);
100
+ /// }
101
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
102
+ /// ```
103
+ pub fn extract_keywords(text: &str, config: &KeywordConfig) -> Result<Vec<Keyword>> {
104
+ match config.algorithm {
105
+ #[cfg(feature = "keywords-yake")]
106
+ KeywordAlgorithm::Yake => yake::extract_keywords_yake(text, config),
107
+
108
+ #[cfg(feature = "keywords-rake")]
109
+ KeywordAlgorithm::Rake => rake::extract_keywords_rake(text, config),
110
+
111
+ #[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
112
+ _ => Err(crate::KreuzbergError::Other(
113
+ "No keyword extraction algorithm feature enabled".to_string(),
114
+ )),
115
+ }
116
+ }
117
+
118
+ /// Lazy-initialized flag that ensures keyword processor is registered exactly once.
119
+ ///
120
+ /// This static is accessed on first use to automatically register the
121
+ /// keyword extraction processor with the plugin registry.
122
+ static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_keyword_processor);
123
+
124
+ /// Ensure the keyword processor is registered.
125
+ ///
126
+ /// This function is called automatically when needed.
127
+ /// It's safe to call multiple times - registration only happens once.
128
+ pub fn ensure_initialized() -> Result<()> {
129
+ PROCESSOR_INITIALIZED
130
+ .as_ref()
131
+ .map(|_| ())
132
+ .map_err(|e| crate::KreuzbergError::Plugin {
133
+ message: format!("Failed to register keyword processor: {}", e),
134
+ plugin_name: "keyword-extraction".to_string(),
135
+ })
136
+ }
137
+
138
+ /// Register the keyword extraction processor with the global registry.
139
+ ///
140
+ /// This function should be called once at application startup to register
141
+ /// the keyword extraction post-processor.
142
+ ///
143
+ /// **Note:** This is called automatically on first use.
144
+ /// Explicit calling is optional.
145
+ ///
146
+ /// # Example
147
+ ///
148
+ /// ```rust
149
+ /// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
150
+ /// use kreuzberg::keywords::register_keyword_processor;
151
+ ///
152
+ /// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
153
+ /// # fn main() -> kreuzberg::Result<()> {
154
+ /// register_keyword_processor()?;
155
+ /// # Ok(())
156
+ /// # }
157
+ /// # #[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
158
+ /// # fn main() {}
159
+ /// ```
160
+ pub fn register_keyword_processor() -> Result<()> {
161
+ let registry = get_post_processor_registry();
162
+ let mut registry = registry
163
+ .write()
164
+ .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
165
+
166
+ registry.register(Arc::new(KeywordExtractor), 50)?;
167
+
168
+ Ok(())
169
+ }
170
+
171
+ #[cfg(test)]
172
+ mod tests {
173
+ use super::*;
174
+
175
+ #[test]
176
+ fn test_extract_keywords_default_algorithm() {
177
+ let text = "Rust programming language provides memory safety and performance.";
178
+ let config = KeywordConfig::default();
179
+
180
+ let keywords = extract_keywords(text, &config).unwrap();
181
+
182
+ assert!(!keywords.is_empty(), "Should extract keywords");
183
+ assert!(keywords.len() <= config.max_keywords);
184
+ }
185
+
186
+ #[cfg(feature = "keywords-yake")]
187
+ #[test]
188
+ fn test_extract_keywords_yake() {
189
+ let text = "Natural language processing using Rust is efficient and safe.";
190
+ let config = KeywordConfig::yake();
191
+
192
+ let keywords = extract_keywords(text, &config).unwrap();
193
+
194
+ assert!(!keywords.is_empty());
195
+ assert_eq!(keywords[0].algorithm, KeywordAlgorithm::Yake);
196
+ }
197
+
198
+ #[cfg(feature = "keywords-rake")]
199
+ #[test]
200
+ fn test_extract_keywords_rake() {
201
+ let text = "Natural language processing using Rust is efficient and safe.";
202
+ let config = KeywordConfig::rake();
203
+
204
+ let keywords = extract_keywords(text, &config).unwrap();
205
+
206
+ assert!(!keywords.is_empty());
207
+ assert_eq!(keywords[0].algorithm, KeywordAlgorithm::Rake);
208
+ }
209
+
210
+ #[cfg(all(feature = "keywords-yake", feature = "keywords-rake"))]
211
+ #[test]
212
+ fn test_compare_algorithms() {
213
+ let text = "Machine learning and artificial intelligence are transforming technology. \
214
+ Deep learning models require substantial computational resources.";
215
+
216
+ let yake_config = KeywordConfig::yake().with_max_keywords(5);
217
+ let yake_keywords = extract_keywords(text, &yake_config).unwrap();
218
+
219
+ let rake_config = KeywordConfig::rake().with_max_keywords(5);
220
+ let rake_keywords = extract_keywords(text, &rake_config).unwrap();
221
+
222
+ assert!(!yake_keywords.is_empty());
223
+ assert!(!rake_keywords.is_empty());
224
+
225
+ assert!(yake_keywords.iter().all(|k| k.algorithm == KeywordAlgorithm::Yake));
226
+ assert!(rake_keywords.iter().all(|k| k.algorithm == KeywordAlgorithm::Rake));
227
+
228
+ println!(
229
+ "YAKE keywords: {:?}",
230
+ yake_keywords.iter().map(|k| &k.text).collect::<Vec<_>>()
231
+ );
232
+ println!(
233
+ "RAKE keywords: {:?}",
234
+ rake_keywords.iter().map(|k| &k.text).collect::<Vec<_>>()
235
+ );
236
+ }
237
+ }
@@ -0,0 +1,267 @@
1
+ //! Keyword extraction post-processor.
2
+ //!
3
+ //! This module provides a PostProcessor plugin that extracts keywords from
4
+ //! extraction results and stores them in metadata.
5
+
6
+ use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
7
+ use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
8
+ use async_trait::async_trait;
9
+
10
+ /// Post-processor that extracts keywords from document content.
11
+ ///
12
+ /// This processor:
13
+ /// - Runs in the Middle processing stage
14
+ /// - Only processes when `config.keywords` is configured
15
+ /// - Stores extracted keywords in `metadata.additional["keywords"]`
16
+ /// - Uses the configured algorithm (YAKE or RAKE)
17
+ ///
18
+ /// # Example
19
+ ///
20
+ /// ```rust,no_run
21
+ /// use kreuzberg::plugins::{Plugin, PostProcessor};
22
+ /// use kreuzberg::keywords::processor::KeywordExtractor;
23
+ ///
24
+ /// let processor = KeywordExtractor;
25
+ /// assert_eq!(processor.name(), "keyword-extraction");
26
+ /// ```
27
+ #[derive(Debug, Clone, Copy)]
28
+ pub struct KeywordExtractor;
29
+
30
+ impl Plugin for KeywordExtractor {
31
+ fn name(&self) -> &str {
32
+ "keyword-extraction"
33
+ }
34
+
35
+ fn version(&self) -> String {
36
+ env!("CARGO_PKG_VERSION").to_string()
37
+ }
38
+
39
+ fn initialize(&self) -> Result<()> {
40
+ Ok(())
41
+ }
42
+
43
+ fn shutdown(&self) -> Result<()> {
44
+ Ok(())
45
+ }
46
+ }
47
+
48
+ #[async_trait]
49
+ impl PostProcessor for KeywordExtractor {
50
+ async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
51
+ let keyword_config = match &config.keywords {
52
+ Some(cfg) => cfg,
53
+ None => return Ok(()),
54
+ };
55
+
56
+ let word_count = result.content.split_whitespace().count();
57
+ if word_count < 10 {
58
+ return Ok(());
59
+ }
60
+
61
+ let keywords = super::extract_keywords(&result.content, keyword_config)
62
+ .map_err(|e| KreuzbergError::Other(format!("Keyword extraction failed: {}", e)))?;
63
+
64
+ result
65
+ .metadata
66
+ .additional
67
+ .insert("keywords".to_string(), serde_json::to_value(&keywords)?);
68
+
69
+ Ok(())
70
+ }
71
+
72
+ fn processing_stage(&self) -> ProcessingStage {
73
+ ProcessingStage::Middle
74
+ }
75
+
76
+ fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
77
+ config.keywords.is_some()
78
+ }
79
+
80
+ fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
81
+ let word_count = result.content.split_whitespace().count();
82
+ (word_count as u64) / 100 + 10
83
+ }
84
+ }
85
+
86
+ #[cfg(test)]
87
+ mod tests {
88
+ use super::*;
89
+ use crate::keywords::KeywordConfig;
90
+ use crate::types::Metadata;
91
+
92
+ const TEST_TEXT: &str = r#"
93
+ Machine learning is a branch of artificial intelligence that focuses on
94
+ building systems that can learn from data. Deep learning is a subset of
95
+ machine learning that uses neural networks with multiple layers.
96
+ "#;
97
+
98
+ #[tokio::test]
99
+ #[cfg(feature = "keywords-yake")]
100
+ async fn test_keyword_processor_with_yake() {
101
+ let processor = KeywordExtractor;
102
+ let config = ExtractionConfig {
103
+ keywords: Some(KeywordConfig::yake()),
104
+ ..Default::default()
105
+ };
106
+
107
+ let mut result = ExtractionResult {
108
+ content: TEST_TEXT.to_string(),
109
+ mime_type: "text/plain".to_string(),
110
+ metadata: Metadata::default(),
111
+ tables: vec![],
112
+ detected_languages: None,
113
+ chunks: None,
114
+ images: None,
115
+ };
116
+
117
+ processor.process(&mut result, &config).await.unwrap();
118
+
119
+ assert!(result.metadata.additional.contains_key("keywords"));
120
+ let keywords = result.metadata.additional.get("keywords").unwrap();
121
+ assert!(keywords.is_array());
122
+ let kw_array = keywords.as_array().unwrap();
123
+ assert!(!kw_array.is_empty());
124
+ }
125
+
126
+ #[tokio::test]
127
+ #[cfg(feature = "keywords-rake")]
128
+ async fn test_keyword_processor_with_rake() {
129
+ let processor = KeywordExtractor;
130
+ let config = ExtractionConfig {
131
+ keywords: Some(KeywordConfig::rake()),
132
+ ..Default::default()
133
+ };
134
+
135
+ let mut result = ExtractionResult {
136
+ content: TEST_TEXT.to_string(),
137
+ mime_type: "text/plain".to_string(),
138
+ metadata: Metadata::default(),
139
+ tables: vec![],
140
+ detected_languages: None,
141
+ chunks: None,
142
+ images: None,
143
+ };
144
+
145
+ processor.process(&mut result, &config).await.unwrap();
146
+
147
+ assert!(result.metadata.additional.contains_key("keywords"));
148
+ let keywords = result.metadata.additional.get("keywords").unwrap();
149
+ assert!(keywords.is_array());
150
+ let kw_array = keywords.as_array().unwrap();
151
+ assert!(!kw_array.is_empty());
152
+ }
153
+
154
+ #[tokio::test]
155
+ async fn test_keyword_processor_no_config() {
156
+ let processor = KeywordExtractor;
157
+ let config = ExtractionConfig::default();
158
+
159
+ let mut result = ExtractionResult {
160
+ content: TEST_TEXT.to_string(),
161
+ mime_type: "text/plain".to_string(),
162
+ metadata: Metadata::default(),
163
+ tables: vec![],
164
+ detected_languages: None,
165
+ chunks: None,
166
+ images: None,
167
+ };
168
+
169
+ processor.process(&mut result, &config).await.unwrap();
170
+
171
+ assert!(!result.metadata.additional.contains_key("keywords"));
172
+ }
173
+
174
+ #[tokio::test]
175
+ #[cfg(feature = "keywords-yake")]
176
+ async fn test_keyword_processor_short_content() {
177
+ let processor = KeywordExtractor;
178
+ let config = ExtractionConfig {
179
+ keywords: Some(KeywordConfig::yake()),
180
+ ..Default::default()
181
+ };
182
+
183
+ let mut result = ExtractionResult {
184
+ content: "Short text".to_string(),
185
+ mime_type: "text/plain".to_string(),
186
+ metadata: Metadata::default(),
187
+ tables: vec![],
188
+ detected_languages: None,
189
+ chunks: None,
190
+ images: None,
191
+ };
192
+
193
+ processor.process(&mut result, &config).await.unwrap();
194
+
195
+ assert!(!result.metadata.additional.contains_key("keywords"));
196
+ }
197
+
198
+ #[test]
199
+ fn test_keyword_processor_plugin_interface() {
200
+ let processor = KeywordExtractor;
201
+ assert_eq!(processor.name(), "keyword-extraction");
202
+ assert!(!processor.version().is_empty());
203
+ assert!(processor.initialize().is_ok());
204
+ assert!(processor.shutdown().is_ok());
205
+ }
206
+
207
+ #[test]
208
+ fn test_keyword_processor_stage() {
209
+ let processor = KeywordExtractor;
210
+ assert_eq!(processor.processing_stage(), ProcessingStage::Middle);
211
+ }
212
+
213
+ #[test]
214
+ #[cfg(feature = "keywords-yake")]
215
+ fn test_keyword_processor_should_process() {
216
+ let processor = KeywordExtractor;
217
+
218
+ let result = ExtractionResult {
219
+ content: TEST_TEXT.to_string(),
220
+ mime_type: "text/plain".to_string(),
221
+ metadata: Metadata::default(),
222
+ tables: vec![],
223
+ detected_languages: None,
224
+ chunks: None,
225
+ images: None,
226
+ };
227
+
228
+ let config_with_keywords = ExtractionConfig {
229
+ keywords: Some(KeywordConfig::yake()),
230
+ ..Default::default()
231
+ };
232
+ assert!(processor.should_process(&result, &config_with_keywords));
233
+
234
+ let config_without_keywords = ExtractionConfig::default();
235
+ assert!(!processor.should_process(&result, &config_without_keywords));
236
+ }
237
+
238
+ #[test]
239
+ fn test_keyword_processor_estimated_duration() {
240
+ let processor = KeywordExtractor;
241
+
242
+ let short_result = ExtractionResult {
243
+ content: "Short text with just a few words".to_string(),
244
+ mime_type: "text/plain".to_string(),
245
+ metadata: Metadata::default(),
246
+ tables: vec![],
247
+ detected_languages: None,
248
+ chunks: None,
249
+ images: None,
250
+ };
251
+
252
+ let long_result = ExtractionResult {
253
+ content: "word ".repeat(1000),
254
+ mime_type: "text/plain".to_string(),
255
+ metadata: Metadata::default(),
256
+ tables: vec![],
257
+ detected_languages: None,
258
+ chunks: None,
259
+ images: None,
260
+ };
261
+
262
+ let short_duration = processor.estimated_duration_ms(&short_result);
263
+ let long_duration = processor.estimated_duration_ms(&long_result);
264
+
265
+ assert!(long_duration > short_duration);
266
+ }
267
+ }