kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,417 @@
1
+ use crate::error::{KreuzbergError, Result};
2
+ use crate::types::{ExtractionConfig, ImagePreprocessingMetadata};
3
+ use image::{DynamicImage, ImageBuffer, Rgb};
4
+
5
+ use super::dpi::calculate_smart_dpi;
6
+ use super::resize::resize_image;
7
+
8
+ const PDF_POINTS_PER_INCH: f64 = 72.0;
9
+
10
+ /// Result of image normalization
11
+ pub struct NormalizeResult {
12
+ /// Processed RGB image data (height * width * 3 bytes)
13
+ pub rgb_data: Vec<u8>,
14
+ /// Image dimensions (width, height)
15
+ pub dimensions: (usize, usize),
16
+ /// Preprocessing metadata
17
+ pub metadata: ImagePreprocessingMetadata,
18
+ }
19
+
20
+ /// Normalize image DPI based on extraction configuration
21
+ ///
22
+ /// # Arguments
23
+ /// * `rgb_data` - RGB image data as a flat `Vec<u8>` (height * width * 3 bytes, row-major)
24
+ /// * `width` - Image width in pixels
25
+ /// * `height` - Image height in pixels
26
+ /// * `config` - Extraction configuration containing DPI settings
27
+ /// * `current_dpi` - Optional current DPI of the image (defaults to 72 if None)
28
+ ///
29
+ /// # Returns
30
+ /// * `NormalizeResult` containing processed image data and metadata
31
+ pub fn normalize_image_dpi(
32
+ rgb_data: &[u8],
33
+ width: usize,
34
+ height: usize,
35
+ config: &ExtractionConfig,
36
+ current_dpi: Option<f64>,
37
+ ) -> Result<NormalizeResult> {
38
+ if width > 65536 || height > 65536 {
39
+ return Err(KreuzbergError::validation(format!(
40
+ "Image dimensions {}x{} exceed maximum 65536x65536",
41
+ width, height
42
+ )));
43
+ }
44
+
45
+ let expected_size = height * width * 3;
46
+ if rgb_data.len() != expected_size {
47
+ return Err(KreuzbergError::validation(format!(
48
+ "RGB data size {} does not match expected size {} for {}x{} image",
49
+ rgb_data.len(),
50
+ expected_size,
51
+ width,
52
+ height
53
+ )));
54
+ }
55
+
56
+ let current_dpi = current_dpi.unwrap_or(PDF_POINTS_PER_INCH);
57
+ let original_dpi = (current_dpi, current_dpi);
58
+ let max_memory_mb = 2048.0;
59
+
60
+ let (target_dpi, auto_adjusted, calculated_dpi) =
61
+ calculate_target_dpi(width as u32, height as u32, current_dpi, config, max_memory_mb);
62
+
63
+ let scale_factor = f64::from(target_dpi) / current_dpi;
64
+
65
+ if !needs_resize(width as u32, height as u32, scale_factor, config) {
66
+ return Ok(create_skip_result(
67
+ rgb_data.to_vec(),
68
+ width,
69
+ height,
70
+ original_dpi,
71
+ config,
72
+ target_dpi,
73
+ scale_factor,
74
+ auto_adjusted,
75
+ calculated_dpi,
76
+ ));
77
+ }
78
+
79
+ let (new_width, new_height, final_scale, dimension_clamped) =
80
+ calculate_new_dimensions(width as u32, height as u32, scale_factor, config);
81
+
82
+ perform_resize(
83
+ rgb_data,
84
+ width as u32,
85
+ height as u32,
86
+ new_width,
87
+ new_height,
88
+ final_scale,
89
+ original_dpi,
90
+ target_dpi,
91
+ auto_adjusted,
92
+ dimension_clamped,
93
+ calculated_dpi,
94
+ config,
95
+ )
96
+ }
97
+
98
+ /// Calculate target DPI based on configuration
99
+ fn calculate_target_dpi(
100
+ width: u32,
101
+ height: u32,
102
+ current_dpi: f64,
103
+ config: &ExtractionConfig,
104
+ max_memory_mb: f64,
105
+ ) -> (i32, bool, Option<i32>) {
106
+ if config.auto_adjust_dpi {
107
+ let approx_width_points = f64::from(width) * PDF_POINTS_PER_INCH / current_dpi;
108
+ let approx_height_points = f64::from(height) * PDF_POINTS_PER_INCH / current_dpi;
109
+
110
+ let optimal_dpi = calculate_smart_dpi(
111
+ approx_width_points,
112
+ approx_height_points,
113
+ config.target_dpi,
114
+ config.max_image_dimension,
115
+ max_memory_mb,
116
+ );
117
+
118
+ (optimal_dpi, optimal_dpi != config.target_dpi, Some(optimal_dpi))
119
+ } else {
120
+ (config.target_dpi, false, None)
121
+ }
122
+ }
123
+
124
+ /// Check if resize is needed
125
+ fn needs_resize(width: u32, height: u32, scale_factor: f64, config: &ExtractionConfig) -> bool {
126
+ let max_dimension = width.max(height);
127
+ let exceeds_max = i32::try_from(max_dimension).map_or(true, |dim| dim > config.max_image_dimension);
128
+
129
+ (scale_factor - 1.0).abs() >= 0.05 || exceeds_max
130
+ }
131
+
132
+ /// Calculate new dimensions after scaling
133
+ #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
134
+ fn calculate_new_dimensions(
135
+ original_width: u32,
136
+ original_height: u32,
137
+ scale_factor: f64,
138
+ config: &ExtractionConfig,
139
+ ) -> (u32, u32, f64, bool) {
140
+ let mut new_width = (f64::from(original_width) * scale_factor).round() as u32;
141
+ let mut new_height = (f64::from(original_height) * scale_factor).round() as u32;
142
+ let mut final_scale = scale_factor;
143
+ let mut dimension_clamped = false;
144
+
145
+ let max_new_dimension = new_width.max(new_height);
146
+ if let Ok(max_dim_i32) = i32::try_from(max_new_dimension)
147
+ && max_dim_i32 > config.max_image_dimension
148
+ {
149
+ let dimension_scale = f64::from(config.max_image_dimension) / f64::from(max_new_dimension);
150
+ new_width = (f64::from(new_width) * dimension_scale).round() as u32;
151
+ new_height = (f64::from(new_height) * dimension_scale).round() as u32;
152
+ final_scale *= dimension_scale;
153
+ dimension_clamped = true;
154
+ }
155
+
156
+ (new_width, new_height, final_scale, dimension_clamped)
157
+ }
158
+
159
+ /// Create result when resize is skipped
160
+ #[allow(clippy::too_many_arguments)]
161
+ fn create_skip_result(
162
+ rgb_data: Vec<u8>,
163
+ width: usize,
164
+ height: usize,
165
+ original_dpi: (f64, f64),
166
+ config: &ExtractionConfig,
167
+ target_dpi: i32,
168
+ scale_factor: f64,
169
+ auto_adjusted: bool,
170
+ calculated_dpi: Option<i32>,
171
+ ) -> NormalizeResult {
172
+ NormalizeResult {
173
+ rgb_data,
174
+ dimensions: (width, height),
175
+ metadata: ImagePreprocessingMetadata {
176
+ original_dimensions: (width, height),
177
+ original_dpi,
178
+ target_dpi: config.target_dpi,
179
+ scale_factor,
180
+ auto_adjusted,
181
+ final_dpi: target_dpi,
182
+ new_dimensions: None,
183
+ resample_method: "NONE".to_string(),
184
+ dimension_clamped: false,
185
+ calculated_dpi,
186
+ skipped_resize: true,
187
+ resize_error: None,
188
+ },
189
+ }
190
+ }
191
+
192
+ /// Perform the actual resize operation
193
+ #[allow(clippy::too_many_arguments)]
194
+ fn perform_resize(
195
+ rgb_data: &[u8],
196
+ original_width: u32,
197
+ original_height: u32,
198
+ new_width: u32,
199
+ new_height: u32,
200
+ final_scale: f64,
201
+ original_dpi: (f64, f64),
202
+ target_dpi: i32,
203
+ auto_adjusted: bool,
204
+ dimension_clamped: bool,
205
+ calculated_dpi: Option<i32>,
206
+ config: &ExtractionConfig,
207
+ ) -> Result<NormalizeResult> {
208
+ let img_buffer = ImageBuffer::<Rgb<u8>, Vec<u8>>::from_raw(original_width, original_height, rgb_data.to_vec())
209
+ .ok_or_else(|| {
210
+ KreuzbergError::parsing(format!(
211
+ "Failed to create image buffer from {}x{} RGB data",
212
+ original_width, original_height
213
+ ))
214
+ })?;
215
+
216
+ let image = DynamicImage::ImageRgb8(img_buffer);
217
+
218
+ let resized = resize_image(&image, new_width, new_height, final_scale)?;
219
+
220
+ let rgb_image = resized.to_rgb8();
221
+ let result_rgb_data = rgb_image.into_raw();
222
+
223
+ let metadata = ImagePreprocessingMetadata {
224
+ original_dimensions: (original_width as usize, original_height as usize),
225
+ original_dpi,
226
+ target_dpi: config.target_dpi,
227
+ scale_factor: final_scale,
228
+ auto_adjusted,
229
+ final_dpi: target_dpi,
230
+ new_dimensions: Some((new_width as usize, new_height as usize)),
231
+ resample_method: if final_scale < 1.0 { "LANCZOS3" } else { "CATMULLROM" }.to_string(),
232
+ dimension_clamped,
233
+ calculated_dpi,
234
+ skipped_resize: false,
235
+ resize_error: None,
236
+ };
237
+
238
+ Ok(NormalizeResult {
239
+ rgb_data: result_rgb_data,
240
+ dimensions: (new_width as usize, new_height as usize),
241
+ metadata,
242
+ })
243
+ }
244
+
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a config with the DPI bounds shared by every test here (72 to 600).
    fn config_with(target_dpi: i32, max_image_dimension: i32, auto_adjust_dpi: bool) -> ExtractionConfig {
        ExtractionConfig {
            target_dpi,
            max_image_dimension,
            auto_adjust_dpi,
            min_dpi: 72,
            max_dpi: 600,
        }
    }

    /// Solid red RGB buffer of the requested size.
    fn create_test_rgb_data(width: usize, height: usize) -> Vec<u8> {
        [255u8, 0, 0].repeat(width * height)
    }

    #[test]
    fn test_normalize_image_dpi_skip_resize() {
        let config = config_with(72, 4096, false);
        let pixels = create_test_rgb_data(100, 100);

        let normalized =
            normalize_image_dpi(&pixels, 100, 100, &config, Some(72.0)).expect("normalization should succeed");

        assert_eq!(normalized.dimensions, (100, 100));
        assert!(normalized.metadata.skipped_resize);
    }

    #[test]
    fn test_normalize_image_dpi_upscale() {
        let config = config_with(300, 4096, false);
        let pixels = create_test_rgb_data(100, 100);

        let normalized =
            normalize_image_dpi(&pixels, 100, 100, &config, Some(72.0)).expect("normalization should succeed");

        assert!(!normalized.metadata.skipped_resize);
        let (out_w, out_h) = normalized.dimensions;
        assert!(out_w > 100);
        assert!(out_h > 100);
    }

    #[test]
    fn test_normalize_image_dpi_downscale() {
        let config = config_with(72, 4096, false);
        let pixels = create_test_rgb_data(1000, 1000);

        let normalized =
            normalize_image_dpi(&pixels, 1000, 1000, &config, Some(300.0)).expect("normalization should succeed");

        assert!(!normalized.metadata.skipped_resize);
        let (out_w, out_h) = normalized.dimensions;
        assert!(out_w < 1000);
        assert!(out_h < 1000);
    }

    #[test]
    fn test_normalize_image_dpi_dimension_clamp() {
        let config = config_with(300, 500, false);
        let pixels = create_test_rgb_data(1000, 1000);

        let normalized =
            normalize_image_dpi(&pixels, 1000, 1000, &config, Some(300.0)).expect("normalization should succeed");

        assert!(normalized.metadata.dimension_clamped);
        let (out_w, out_h) = normalized.dimensions;
        assert!(out_w <= 500);
        assert!(out_h <= 500);
    }

    #[test]
    fn test_normalize_image_dpi_auto_adjust() {
        let config = config_with(300, 4096, true);
        let pixels = create_test_rgb_data(100, 100);

        let normalized =
            normalize_image_dpi(&pixels, 100, 100, &config, Some(72.0)).expect("normalization should succeed");

        assert!(normalized.metadata.calculated_dpi.is_some());
    }

    #[test]
    fn test_normalize_image_dpi_invalid_dimensions() {
        let config = ExtractionConfig::default();
        let pixels = create_test_rgb_data(100, 100);

        assert!(normalize_image_dpi(&pixels, 100000, 100000, &config, None).is_err());
    }

    #[test]
    fn test_normalize_image_dpi_invalid_data_size() {
        let config = ExtractionConfig::default();
        let pixels = vec![0u8; 100];

        assert!(normalize_image_dpi(&pixels, 100, 100, &config, None).is_err());
    }

    #[test]
    fn test_needs_resize_threshold() {
        let config = config_with(300, 4096, false);

        // A 2% change is below the threshold; a 10% change is above it.
        assert!(!needs_resize(100, 100, 1.02, &config));
        assert!(needs_resize(100, 100, 1.10, &config));
    }

    #[test]
    fn test_calculate_new_dimensions_no_clamp() {
        let config = ExtractionConfig::default();

        let (new_w, new_h, scale, clamped) = calculate_new_dimensions(100, 100, 2.0, &config);

        assert_eq!(new_w, 200);
        assert_eq!(new_h, 200);
        assert!((scale - 2.0).abs() < 0.01);
        assert!(!clamped);
    }

    #[test]
    fn test_calculate_new_dimensions_with_clamp() {
        let config = config_with(300, 100, false);

        let (new_w, new_h, _scale, clamped) = calculate_new_dimensions(100, 100, 2.0, &config);

        assert!(new_w <= 100);
        assert!(new_h <= 100);
        assert!(clamped);
    }
}
@@ -0,0 +1,89 @@
1
+ use crate::error::{KreuzbergError, Result};
2
+ use fast_image_resize::{FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer, images::Image as FirImage};
3
+ use image::{DynamicImage, ImageBuffer, Rgb};
4
+
5
+ /// Resize an image using fast_image_resize with appropriate algorithm based on scale factor
6
+ pub fn resize_image(image: &DynamicImage, new_width: u32, new_height: u32, scale_factor: f64) -> Result<DynamicImage> {
7
+ let rgb_image = image.to_rgb8();
8
+ let (width, height) = rgb_image.dimensions();
9
+
10
+ let src_image = FirImage::from_vec_u8(width, height, rgb_image.into_raw(), PixelType::U8x3)
11
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to create source image: {e:?}")))?;
12
+
13
+ let mut dst_image = FirImage::new(new_width, new_height, PixelType::U8x3);
14
+
15
+ let algorithm = if scale_factor < 1.0 {
16
+ ResizeAlg::Convolution(FilterType::Lanczos3)
17
+ } else {
18
+ ResizeAlg::Convolution(FilterType::CatmullRom)
19
+ };
20
+
21
+ let mut resizer = Resizer::new();
22
+ resizer
23
+ .resize(&src_image, &mut dst_image, &ResizeOptions::new().resize_alg(algorithm))
24
+ .map_err(|e| KreuzbergError::parsing(format!("Resize failed: {e:?}")))?;
25
+
26
+ let buffer = dst_image.into_vec();
27
+ let img_buffer = ImageBuffer::<Rgb<u8>, Vec<u8>>::from_raw(new_width, new_height, buffer)
28
+ .ok_or_else(|| KreuzbergError::parsing("Failed to create image buffer".to_string()))?;
29
+
30
+ Ok(DynamicImage::ImageRgb8(img_buffer))
31
+ }
32
+
33
#[cfg(test)]
mod tests {
    use super::*;
    use image::Rgb;

    /// Builds the shared fixture: a 100x100 RGB image filled with pure red.
    fn create_test_image() -> DynamicImage {
        DynamicImage::ImageRgb8(ImageBuffer::from_pixel(100, 100, Rgb([255u8, 0u8, 0u8])))
    }

    /// Resizes the fixture to `width` x `height` and checks that the call
    /// succeeds and the result reports exactly those dimensions.
    fn assert_resizes_to(width: u32, height: u32, scale_factor: f64) {
        let source = create_test_image();
        let outcome = resize_image(&source, width, height, scale_factor);
        assert!(outcome.is_ok());

        let resized = outcome.unwrap();
        assert_eq!(resized.width(), width);
        assert_eq!(resized.height(), height);
    }

    #[test]
    fn test_resize_image_downscale() {
        assert_resizes_to(50, 50, 0.5);
    }

    #[test]
    fn test_resize_image_upscale() {
        assert_resizes_to(200, 200, 2.0);
    }

    #[test]
    fn test_resize_image_no_scale() {
        assert_resizes_to(100, 100, 1.0);
    }

    #[test]
    fn test_resize_preserves_aspect_ratio() {
        let source = create_test_image();
        let outcome = resize_image(&source, 50, 50, 0.5);
        assert!(outcome.is_ok());
        let resized = outcome.unwrap();

        // Width-to-height ratio, compared before and after the resize.
        let aspect_of = |picture: &DynamicImage| picture.width() as f64 / picture.height() as f64;
        assert!((aspect_of(&source) - aspect_of(&resized)).abs() < 0.01);
    }
}
@@ -0,0 +1,154 @@
1
+ //! Configuration for keyword extraction.
2
+
3
+ use super::types::KeywordAlgorithm;
4
+ use serde::{Deserialize, Serialize};
5
+
6
/// Tuning parameters that apply only to the YAKE algorithm.
///
/// Compiled in only when the `keywords-yake` feature is enabled.
#[cfg(feature = "keywords-yake")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct YakeParams {
    /// Size of the context window used for co-occurrence statistics.
    ///
    /// The `Default` implementation sets this to `2`.
    pub window_size: usize,
}
15
+
16
#[cfg(feature = "keywords-yake")]
impl Default for YakeParams {
    /// Returns the YAKE defaults: a co-occurrence window of `2`.
    fn default() -> Self {
        YakeParams { window_size: 2 }
    }
}
22
+
23
/// Tuning parameters that apply only to the RAKE algorithm.
///
/// Compiled in only when the `keywords-rake` feature is enabled.
#[cfg(feature = "keywords-rake")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RakeParams {
    /// Shortest word length that is still considered.
    ///
    /// The `Default` implementation sets this to `1`.
    pub min_word_length: usize,

    /// Upper bound on the number of words in a single keyword phrase.
    ///
    /// The `Default` implementation sets this to `3`.
    pub max_words_per_phrase: usize,
}
33
+
34
#[cfg(feature = "keywords-rake")]
impl Default for RakeParams {
    /// Returns the RAKE defaults: minimum word length `1`, at most `3` words per phrase.
    fn default() -> Self {
        RakeParams {
            min_word_length: 1,
            max_words_per_phrase: 3,
        }
    }
}
43
+
44
+ /// Keyword extraction configuration.
45
+ #[derive(Debug, Clone, Serialize, Deserialize)]
46
+ pub struct KeywordConfig {
47
+ /// Algorithm to use for extraction.
48
+ pub algorithm: KeywordAlgorithm,
49
+
50
+ /// Maximum number of keywords to extract (default: 10).
51
+ pub max_keywords: usize,
52
+
53
+ /// Minimum score threshold (0.0-1.0, default: 0.0).
54
+ ///
55
+ /// Keywords with scores below this threshold are filtered out.
56
+ /// Note: Score ranges differ between algorithms.
57
+ pub min_score: f32,
58
+
59
+ /// N-gram range for keyword extraction (min, max).
60
+ ///
61
+ /// (1, 1) = unigrams only
62
+ /// (1, 2) = unigrams and bigrams
63
+ /// (1, 3) = unigrams, bigrams, and trigrams (default)
64
+ pub ngram_range: (usize, usize),
65
+
66
+ /// Language code for stopword filtering (e.g., "en", "de", "fr").
67
+ ///
68
+ /// If None, no stopword filtering is applied.
69
+ pub language: Option<String>,
70
+
71
+ /// YAKE-specific tuning parameters.
72
+ #[cfg(feature = "keywords-yake")]
73
+ #[serde(skip_serializing_if = "Option::is_none")]
74
+ pub yake_params: Option<YakeParams>,
75
+
76
+ /// RAKE-specific tuning parameters.
77
+ #[cfg(feature = "keywords-rake")]
78
+ #[serde(skip_serializing_if = "Option::is_none")]
79
+ pub rake_params: Option<RakeParams>,
80
+ }
81
+
82
+ impl Default for KeywordConfig {
83
+ fn default() -> Self {
84
+ Self {
85
+ algorithm: KeywordAlgorithm::default(),
86
+ max_keywords: 10,
87
+ min_score: 0.0,
88
+ ngram_range: (1, 3),
89
+ language: Some("en".to_string()),
90
+ #[cfg(feature = "keywords-yake")]
91
+ yake_params: None,
92
+ #[cfg(feature = "keywords-rake")]
93
+ rake_params: None,
94
+ }
95
+ }
96
+ }
97
+
98
+ impl KeywordConfig {
99
+ /// Create a new configuration with YAKE algorithm.
100
+ #[cfg(feature = "keywords-yake")]
101
+ pub fn yake() -> Self {
102
+ Self {
103
+ algorithm: KeywordAlgorithm::Yake,
104
+ ..Default::default()
105
+ }
106
+ }
107
+
108
+ /// Create a new configuration with RAKE algorithm.
109
+ #[cfg(feature = "keywords-rake")]
110
+ pub fn rake() -> Self {
111
+ Self {
112
+ algorithm: KeywordAlgorithm::Rake,
113
+ ..Default::default()
114
+ }
115
+ }
116
+
117
+ /// Set maximum number of keywords to extract.
118
+ pub fn with_max_keywords(mut self, max: usize) -> Self {
119
+ self.max_keywords = max;
120
+ self
121
+ }
122
+
123
+ /// Set minimum score threshold.
124
+ pub fn with_min_score(mut self, score: f32) -> Self {
125
+ self.min_score = score;
126
+ self
127
+ }
128
+
129
+ /// Set n-gram range.
130
+ pub fn with_ngram_range(mut self, min: usize, max: usize) -> Self {
131
+ self.ngram_range = (min, max);
132
+ self
133
+ }
134
+
135
+ /// Set language for stopword filtering.
136
+ pub fn with_language(mut self, lang: impl Into<String>) -> Self {
137
+ self.language = Some(lang.into());
138
+ self
139
+ }
140
+
141
+ /// Set YAKE-specific parameters.
142
+ #[cfg(feature = "keywords-yake")]
143
+ pub fn with_yake_params(mut self, params: YakeParams) -> Self {
144
+ self.yake_params = Some(params);
145
+ self
146
+ }
147
+
148
+ /// Set RAKE-specific parameters.
149
+ #[cfg(feature = "keywords-rake")]
150
+ pub fn with_rake_params(mut self, params: RakeParams) -> Self {
151
+ self.rake_params = Some(params);
152
+ self
153
+ }
154
+ }