kreuzberg 4.0.0.rc2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +391 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,1470 +1,1470 @@
1
- //! Stopwords management for text processing.
2
- //!
3
- //! Provides language-specific stopword collections used by keyword extraction
4
- //! and token reduction features. Stopwords are common words (the, is, and, etc.)
5
- //! that should be filtered out from text analysis.
6
- //!
7
- //! # Supported Languages
8
- //!
9
- //! Supports 64 languages with embedded stopword lists:
10
- //! - Afrikaans (af), Arabic (ar), Bulgarian (bg), Bengali (bn), Breton (br)
11
- //! - Catalan (ca), Czech (cs), Danish (da), German (de), Greek (el)
12
- //! - English (en), Esperanto (eo), Spanish (es), Estonian (et), Basque (eu)
13
- //! - Persian (fa), Finnish (fi), French (fr), Irish (ga), Galician (gl)
14
- //! - Gujarati (gu), Hausa (ha), Hebrew (he), Hindi (hi), Croatian (hr)
15
- //! - Hungarian (hu), Armenian (hy), Indonesian (id), Italian (it), Japanese (ja)
16
- //! - Kannada (kn), Korean (ko), Kurdish (ku), Latin (la), Lithuanian (lt)
17
- //! - Latvian (lv), Malayalam (ml), Marathi (mr), Malay (ms), Nepali (ne)
18
- //! - Dutch (nl), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro)
19
- //! - Russian (ru), Sinhala (si), Slovak (sk), Slovenian (sl), Somali (so)
20
- //! - Sesotho (st), Swedish (sv), Swahili (sw), Tamil (ta), Telugu (te)
21
- //! - Thai (th), Tagalog (tl), Turkish (tr), Ukrainian (uk), Urdu (ur)
22
- //! - Vietnamese (vi), Yoruba (yo), Chinese (zh), Zulu (zu)
23
- //!
24
- //! All stopword lists are embedded in the binary at compile time for zero-overhead access.
25
- //!
26
- //! # Usage
27
- //!
28
- //! ```rust
29
- //! use kreuzberg::stopwords::{get_stopwords, get_stopwords_with_fallback};
30
- //!
31
- //! // Get English stopwords with normalization
32
- //! if let Some(en_stopwords) = get_stopwords("en") {
33
- //! assert!(en_stopwords.contains("the"));
34
- //!
35
- //! // Check if a word is a stopword
36
- //! if en_stopwords.contains("the") {
37
- //! println!("'the' is a stopword");
38
- //! }
39
- //! }
40
- //!
41
- //! // Case-insensitive - all of these work
42
- //! assert!(get_stopwords("EN").is_some());
43
- //! assert!(get_stopwords("En").is_some());
44
- //!
45
- //! // Locale codes are normalized to language code (first 2 chars)
46
- //! if let Some(en_us) = get_stopwords("en-US") {
47
- //! if let Some(en_gb) = get_stopwords("en_GB") {
48
- //! // Both point to "en" stopwords
49
- //! assert_eq!(en_us.len(), en_gb.len());
50
- //! }
51
- //! }
52
- //!
53
- //! // Spanish with locale
54
- //! if let Some(es_stopwords) = get_stopwords("es-ES") {
55
- //! assert!(es_stopwords.contains("el"));
56
- //! }
57
- //!
58
- //! // Fallback for unsupported languages
59
- //! if let Some(stopwords) = get_stopwords_with_fallback("unknown", "en") {
60
- //! // Will use English stopwords since "unknown" isn't supported
61
- //! assert!(stopwords.contains("the"));
62
- //! }
63
- //! ```
64
- //!
65
- //! # Direct Access (Advanced)
66
- //!
67
- //! For advanced use cases where you need direct access to the HashMap or want to
68
- //! iterate over all languages, you can use the `STOPWORDS` static directly:
69
- //!
70
- //! ```rust
71
- //! use kreuzberg::stopwords::STOPWORDS;
72
- //!
73
- //! // Direct access (case-sensitive, no normalization)
74
- //! let en_stopwords = STOPWORDS.get("en");
75
- //!
76
- //! // List all available languages
77
- //! for lang in STOPWORDS.keys() {
78
- //! println!("Available language: {}", lang);
79
- //! }
80
- //! ```
81
-
82
- use ahash::{AHashMap, AHashSet};
83
- use once_cell::sync::Lazy;
84
-
85
- /// Macro to generate embedded stopwords for all languages.
86
- ///
87
- /// This macro embeds the JSON files at compile time using `include_str!()` and
88
- /// generates code to parse and insert them into the stopwords map.
89
- macro_rules! embed_stopwords {
90
- ($map:expr, $($lang:literal),* $(,)?) => {
91
- $(
92
- {
93
- const JSON: &str = include_str!(concat!("../../stopwords/", $lang, "_stopwords.json"));
94
- match serde_json::from_str::<Vec<String>>(JSON) {
95
- Ok(words) => {
96
- let set: AHashSet<String> = words.into_iter().collect();
97
- $map.insert($lang.to_string(), set);
98
- }
99
- Err(e) => {
100
- panic!(
101
- "Failed to parse embedded stopwords for language '{}': {}. \
102
- This indicates corrupted or malformed JSON in the embedded stopwords data. \
103
- Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
104
- $lang, e
105
- );
106
- }
107
- }
108
- }
109
- )*
110
- };
111
- }
112
-
113
- /// Global stopwords registry.
114
- ///
115
- /// A lazy-initialized map of language codes to stopword sets.
116
- /// All stopword lists are embedded in the binary at compile time for
117
- /// zero-overhead access and no runtime I/O dependencies.
118
- ///
119
- /// Supports 64 languages with comprehensive stopword coverage.
120
- ///
121
- /// # Note
122
- ///
123
- /// For most use cases, prefer [`get_stopwords()`] which provides language code
124
- /// normalization (case-insensitive, locale handling). Direct access to STOPWORDS
125
- /// is case-sensitive and requires exact language codes (lowercase, 2-letter ISO 639-1).
126
- ///
127
- /// # Examples
128
- ///
129
- /// ```rust
130
- /// use kreuzberg::stopwords::STOPWORDS;
131
- ///
132
- /// // Direct access (case-sensitive, no normalization)
133
- /// let en_stopwords = STOPWORDS.get("en");
134
- /// assert!(en_stopwords.is_some());
135
- ///
136
- /// // Case-sensitive - these return None
137
- /// assert!(STOPWORDS.get("EN").is_none());
138
- /// assert!(STOPWORDS.get("en-US").is_none());
139
- ///
140
- /// // List all available languages
141
- /// assert_eq!(STOPWORDS.len(), 64);
142
- /// for lang in STOPWORDS.keys() {
143
- /// println!("Available: {}", lang);
144
- /// }
145
- /// ```
146
- pub static STOPWORDS: Lazy<AHashMap<String, AHashSet<String>>> = Lazy::new(|| {
147
- let mut map = AHashMap::new();
148
-
149
- embed_stopwords!(
150
- map, "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi",
151
- "fr", "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt",
152
- "lv", "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw",
153
- "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
154
- );
155
-
156
- apply_stopword_whitelist(&mut map);
157
-
158
- map
159
- });
160
-
161
- fn apply_stopword_whitelist(map: &mut AHashMap<String, AHashSet<String>>) {
162
- const STOPWORD_REMOVALS: &[(&str, &[&str])] = &[("en", &["hello", "test", "world", "working", "great"])];
163
-
164
- for (lang, words) in STOPWORD_REMOVALS {
165
- if let Some(set) = map.get_mut(*lang) {
166
- for &word in *words {
167
- set.remove(word);
168
- }
169
- }
170
- }
171
- }
172
-
173
- /// Get stopwords for a language with normalization.
174
- ///
175
- /// This function provides a user-friendly interface to the stopwords registry with:
176
- /// - **Case-insensitive lookup**: "EN", "en", "En" all work
177
- /// - **Locale normalization**: "en-US", "en_GB", "es-ES" extract to "en", "es"
178
- /// - **Consistent behavior**: Returns `None` for unsupported languages
179
- ///
180
- /// # Language Code Format
181
- ///
182
- /// Accepts multiple formats:
183
- /// - ISO 639-1 two-letter codes: `"en"`, `"es"`, `"de"`, etc.
184
- /// - Uppercase variants: `"EN"`, `"ES"`, `"DE"`
185
- /// - Locale codes with hyphen: `"en-US"`, `"es-ES"`, `"pt-BR"`
186
- /// - Locale codes with underscore: `"en_US"`, `"es_ES"`, `"pt_BR"`
187
- ///
188
- /// All formats are normalized to lowercase two-letter ISO 639-1 codes.
189
- ///
190
- /// # Returns
191
- ///
192
- /// - `Some(&HashSet<String>)` if the language is supported (64 languages available)
193
- /// - `None` if the language is not supported
194
- ///
195
- /// # Examples
196
- ///
197
- /// ```rust
198
- /// use kreuzberg::stopwords::get_stopwords;
199
- ///
200
- /// // Simple language codes
201
- /// if let Some(en) = get_stopwords("en") {
202
- /// assert!(en.contains("the"));
203
- /// }
204
- ///
205
- /// // Case-insensitive
206
- /// assert!(get_stopwords("EN").is_some());
207
- /// assert!(get_stopwords("En").is_some());
208
- /// assert!(get_stopwords("eN").is_some());
209
- ///
210
- /// // Locale codes normalized to language code
211
- /// if let (Some(en_us), Some(en_gb), Some(en_lowercase)) =
212
- /// (get_stopwords("en-US"), get_stopwords("en_GB"), get_stopwords("en"))
213
- /// {
214
- /// // All point to the same stopwords set
215
- /// assert_eq!(en_us.len(), en_gb.len());
216
- /// assert_eq!(en_us.len(), en_lowercase.len());
217
- /// }
218
- ///
219
- /// // Spanish with various formats
220
- /// assert!(get_stopwords("es").is_some());
221
- /// assert!(get_stopwords("ES").is_some());
222
- /// assert!(get_stopwords("es-ES").is_some());
223
- /// assert!(get_stopwords("es_MX").is_some());
224
- ///
225
- /// // Unsupported language returns None
226
- /// assert!(get_stopwords("xx").is_none());
227
- /// assert!(get_stopwords("zzzz").is_none());
228
- /// ```
229
- ///
230
- /// # Performance
231
- ///
232
- /// This function performs two operations:
233
- /// 1. String normalization (lowercase + truncate) - O(1) for typical language codes
234
- /// 2. HashMap lookup in STOPWORDS - O(1) average case
235
- ///
236
- /// Total overhead is negligible (~10-50ns on modern CPUs).
237
- pub fn get_stopwords(lang: &str) -> Option<&'static AHashSet<String>> {
238
- let normalized = lang.to_lowercase();
239
-
240
- let lang_code = if let Some(pos) = normalized.find(&['-', '_'][..]) {
241
- &normalized[..pos]
242
- } else if normalized.len() >= 2 {
243
- &normalized[..2]
244
- } else {
245
- &normalized
246
- };
247
-
248
- STOPWORDS.get(lang_code)
249
- }
250
-
251
- /// Get stopwords for a language with fallback support.
252
- ///
253
- /// This function attempts to retrieve stopwords for the primary language,
254
- /// and if not available, falls back to a secondary language. This is useful
255
- /// for handling scenarios where:
256
- /// - A detected language isn't supported
257
- /// - You want to use English as a fallback for unknown languages
258
- /// - You need graceful degradation for multilingual content
259
- ///
260
- /// Both language codes support the same normalization as [`get_stopwords()`]:
261
- /// - Case-insensitive lookup (EN, en, En all work)
262
- /// - Locale codes normalized (en-US, en_GB extract to "en")
263
- ///
264
- /// # Arguments
265
- ///
266
- /// * `language` - Primary language code to try first
267
- /// * `fallback` - Fallback language code to use if primary not available
268
- ///
269
- /// # Returns
270
- ///
271
- /// - `Some(&HashSet<String>)` if either language is supported
272
- /// - `None` if neither language is supported
273
- ///
274
- /// # Examples
275
- ///
276
- /// ```rust
277
- /// use kreuzberg::stopwords::get_stopwords_with_fallback;
278
- ///
279
- /// // Detected language is Esperanto, fallback to English
280
- /// if let Some(stopwords) = get_stopwords_with_fallback("eo", "en") {
281
- /// // Will use Esperanto stopwords (supported)
282
- /// assert!(stopwords.contains("la"));
283
- /// }
284
- ///
285
- /// // Unsupported language, fallback to English
286
- /// if let Some(stopwords) = get_stopwords_with_fallback("xx", "en") {
287
- /// // Will use English stopwords (fallback)
288
- /// assert!(stopwords.contains("the"));
289
- /// }
290
- ///
291
- /// // Case-insensitive and locale-aware
292
- /// let result = get_stopwords_with_fallback("es-MX", "EN-US");
293
- /// assert!(result.is_some());
294
- ///
295
- /// // Both unsupported returns None
296
- /// assert!(get_stopwords_with_fallback("xx", "zz").is_none());
297
- /// ```
298
- ///
299
- /// # Common Patterns
300
- ///
301
- /// ```rust
302
- /// use kreuzberg::stopwords::get_stopwords_with_fallback;
303
- ///
304
- /// // English fallback for unknown languages
305
- /// let detected_lang = "xyz"; // Unknown language
306
- /// let stopwords = get_stopwords_with_fallback(detected_lang, "en")
307
- /// .expect("English fallback should always be available");
308
- ///
309
- /// // Multi-language content with English fallback
310
- /// for lang in ["de", "fr", "unknown", "es"] {
311
- /// if let Some(stopwords) = get_stopwords_with_fallback(lang, "en") {
312
- /// println!("Using stopwords for: {}", lang);
313
- /// }
314
- /// }
315
- /// ```
316
- ///
317
- /// # Performance
318
- ///
319
- /// This function performs at most two HashMap lookups:
320
- /// 1. Try primary language (O(1) average case)
321
- /// 2. If None, try fallback language (O(1) average case)
322
- ///
323
- /// Total overhead is negligible (~10-100ns on modern CPUs).
324
- pub fn get_stopwords_with_fallback(language: &str, fallback: &str) -> Option<&'static AHashSet<String>> {
325
- get_stopwords(language).or_else(|| get_stopwords(fallback))
326
- }
327
-
328
- #[cfg(test)]
329
- mod tests {
330
- use super::*;
331
-
332
- #[test]
333
- fn test_stopwords_lazy_initialization() {
334
- let stopwords = &*STOPWORDS;
335
- assert!(stopwords.contains_key("en"));
336
- assert!(stopwords.contains_key("es"));
337
- assert!(!stopwords.get("en").unwrap().is_empty());
338
- assert!(!stopwords.get("es").unwrap().is_empty());
339
- }
340
-
341
- #[test]
342
- fn test_english_stopwords() {
343
- let en_stopwords = STOPWORDS.get("en").unwrap();
344
-
345
- assert!(en_stopwords.contains("the"));
346
- assert!(en_stopwords.contains("is"));
347
- assert!(en_stopwords.contains("and"));
348
- assert!(en_stopwords.contains("a"));
349
- assert!(en_stopwords.contains("of"));
350
-
351
- assert!(en_stopwords.len() >= 70);
352
- }
353
-
354
- #[test]
355
- fn test_spanish_stopwords() {
356
- let es_stopwords = STOPWORDS.get("es").unwrap();
357
-
358
- assert!(es_stopwords.contains("el"));
359
- assert!(es_stopwords.contains("la"));
360
- assert!(es_stopwords.contains("es"));
361
- assert!(es_stopwords.contains("en"));
362
- assert!(es_stopwords.contains("de"));
363
-
364
- assert!(es_stopwords.len() >= 200);
365
- }
366
-
367
- #[test]
368
- fn test_all_64_languages_loaded() {
369
- let expected_languages = [
370
- "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr",
371
- "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt", "lv",
372
- "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta",
373
- "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
374
- ];
375
-
376
- for lang in &expected_languages {
377
- assert!(
378
- STOPWORDS.contains_key(*lang),
379
- "Missing stopwords for language: {}",
380
- lang
381
- );
382
- assert!(
383
- !STOPWORDS.get(*lang).unwrap().is_empty(),
384
- "Empty stopwords for language: {}",
385
- lang
386
- );
387
- }
388
-
389
- assert_eq!(STOPWORDS.len(), 64, "Expected 64 languages, found {}", STOPWORDS.len());
390
- }
391
-
392
- #[test]
393
- fn test_german_stopwords() {
394
- let de_stopwords = STOPWORDS.get("de").unwrap();
395
- assert!(de_stopwords.contains("der"));
396
- assert!(de_stopwords.contains("die"));
397
- assert!(de_stopwords.contains("und"));
398
- }
399
-
400
- #[test]
401
- fn test_french_stopwords() {
402
- let fr_stopwords = STOPWORDS.get("fr").unwrap();
403
- assert!(fr_stopwords.contains("le"));
404
- assert!(fr_stopwords.contains("de"));
405
- assert!(fr_stopwords.contains("un"));
406
- }
407
-
408
- #[test]
409
- fn test_chinese_stopwords() {
410
- let zh_stopwords = STOPWORDS.get("zh").unwrap();
411
- assert!(!zh_stopwords.is_empty());
412
- }
413
-
414
- #[test]
415
- fn test_arabic_stopwords() {
416
- let ar_stopwords = STOPWORDS.get("ar").unwrap();
417
- assert!(!ar_stopwords.is_empty());
418
- }
419
-
420
- #[test]
421
- fn test_unknown_language_returns_none() {
422
- assert!(!STOPWORDS.contains_key("xx"));
423
- assert!(STOPWORDS.get("unknown").is_none());
424
- }
425
-
426
- #[test]
427
- fn test_get_stopwords_lowercase() {
428
- assert!(get_stopwords("en").is_some());
429
- assert!(get_stopwords("es").is_some());
430
- assert!(get_stopwords("de").is_some());
431
- assert!(get_stopwords("fr").is_some());
432
- }
433
-
434
- #[test]
435
- fn test_get_stopwords_uppercase() {
436
- let en_upper = get_stopwords("EN");
437
- let en_lower = get_stopwords("en");
438
-
439
- assert!(en_upper.is_some());
440
- assert!(en_lower.is_some());
441
-
442
- assert_eq!(en_upper.unwrap().len(), en_lower.unwrap().len());
443
- }
444
-
445
- #[test]
446
- fn test_get_stopwords_mixed_case() {
447
- assert!(get_stopwords("En").is_some());
448
- assert!(get_stopwords("eN").is_some());
449
- assert!(get_stopwords("ES").is_some());
450
- assert!(get_stopwords("Es").is_some());
451
- assert!(get_stopwords("DE").is_some());
452
- assert!(get_stopwords("De").is_some());
453
- }
454
-
455
- #[test]
456
- fn test_get_stopwords_locale_hyphen() {
457
- let en_us = get_stopwords("en-US");
458
- let en_gb = get_stopwords("en-GB");
459
- let en = get_stopwords("en");
460
-
461
- assert!(en_us.is_some());
462
- assert!(en_gb.is_some());
463
-
464
- assert_eq!(en_us.unwrap().len(), en.unwrap().len());
465
- assert_eq!(en_gb.unwrap().len(), en.unwrap().len());
466
- }
467
-
468
- #[test]
469
- fn test_get_stopwords_locale_underscore() {
470
- let es_es = get_stopwords("es_ES");
471
- let es_mx = get_stopwords("es_MX");
472
- let es = get_stopwords("es");
473
-
474
- assert!(es_es.is_some());
475
- assert!(es_mx.is_some());
476
-
477
- assert_eq!(es_es.unwrap().len(), es.unwrap().len());
478
- assert_eq!(es_mx.unwrap().len(), es.unwrap().len());
479
- }
480
-
481
- #[test]
482
- fn test_get_stopwords_locale_uppercase() {
483
- let en_us_upper = get_stopwords("EN-US");
484
- let es_es_upper = get_stopwords("ES_ES");
485
- let pt_br_mixed = get_stopwords("Pt-BR");
486
-
487
- assert!(en_us_upper.is_some());
488
- assert!(es_es_upper.is_some());
489
- assert!(pt_br_mixed.is_some());
490
-
491
- assert!(en_us_upper.unwrap().contains("the"));
492
- assert!(es_es_upper.unwrap().contains("el"));
493
- assert!(pt_br_mixed.unwrap().contains("o"));
494
- }
495
-
496
- #[test]
497
- fn test_get_stopwords_all_supported_languages() {
498
- let languages = [
499
- "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr",
500
- "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt", "lv",
501
- "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta",
502
- "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
503
- ];
504
-
505
- for lang in &languages {
506
- assert!(
507
- get_stopwords(lang).is_some(),
508
- "Language {} should be available via get_stopwords",
509
- lang
510
- );
511
- }
512
- }
513
-
514
- #[test]
515
- fn test_get_stopwords_unsupported_language() {
516
- assert!(get_stopwords("xx").is_none());
517
- assert!(get_stopwords("zz").is_none());
518
- assert!(get_stopwords("xyz").is_none());
519
- assert!(get_stopwords("unknown").is_none());
520
- }
521
-
522
- #[test]
523
- fn test_get_stopwords_empty_string() {
524
- assert!(get_stopwords("").is_none());
525
- }
526
-
527
- #[test]
528
- fn test_get_stopwords_single_char() {
529
- assert!(get_stopwords("e").is_none());
530
- assert!(get_stopwords("z").is_none());
531
- }
532
-
533
- #[test]
534
- fn test_get_stopwords_long_locale() {
535
- let zh_cn_hans = get_stopwords("zh-CN-Hans");
536
- let pt_br_utf8 = get_stopwords("pt_BR.UTF-8");
537
-
538
- assert!(zh_cn_hans.is_some());
539
- assert!(pt_br_utf8.is_some());
540
-
541
- assert_eq!(zh_cn_hans.unwrap().len(), get_stopwords("zh").unwrap().len());
542
- assert_eq!(pt_br_utf8.unwrap().len(), get_stopwords("pt").unwrap().len());
543
- }
544
-
545
- #[test]
546
- fn test_get_stopwords_content_verification() {
547
- let en = get_stopwords("en").expect("English stopwords should exist");
548
- assert!(en.contains("the"));
549
- assert!(en.contains("is"));
550
- assert!(en.contains("and"));
551
-
552
- let es = get_stopwords("es").expect("Spanish stopwords should exist");
553
- assert!(es.contains("el"));
554
- assert!(es.contains("la"));
555
- assert!(es.contains("es"));
556
-
557
- let de = get_stopwords("de").expect("German stopwords should exist");
558
- assert!(de.contains("der"));
559
- assert!(de.contains("die"));
560
- assert!(de.contains("und"));
561
-
562
- let fr = get_stopwords("fr").expect("French stopwords should exist");
563
- assert!(fr.contains("le"));
564
- assert!(fr.contains("de"));
565
- assert!(fr.contains("un"));
566
- }
567
-
568
- #[test]
569
- fn test_get_stopwords_vs_direct_access() {
570
- let en_normalized = get_stopwords("en").unwrap();
571
- let en_direct = STOPWORDS.get("en").unwrap();
572
-
573
- assert_eq!(en_normalized.len(), en_direct.len());
574
-
575
- for word in en_direct {
576
- assert!(en_normalized.contains(word));
577
- }
578
- }
579
-
580
- #[test]
581
- fn test_get_stopwords_with_fallback_primary_available() {
582
- let result = get_stopwords_with_fallback("en", "es");
583
- assert!(result.is_some());
584
- let stopwords = result.unwrap();
585
- assert!(stopwords.contains("the"));
586
- assert!(!stopwords.contains("el"));
587
- }
588
-
589
- #[test]
590
- fn test_get_stopwords_with_fallback_use_fallback() {
591
- let result = get_stopwords_with_fallback("xx", "en");
592
- assert!(result.is_some());
593
- let stopwords = result.unwrap();
594
- assert!(stopwords.contains("the"));
595
- }
596
-
597
- #[test]
598
- fn test_get_stopwords_with_fallback_both_unavailable() {
599
- let result = get_stopwords_with_fallback("xx", "zz");
600
- assert!(result.is_none());
601
- }
602
-
603
- #[test]
604
- fn test_get_stopwords_with_fallback_case_insensitive() {
605
- let result1 = get_stopwords_with_fallback("EN", "es");
606
- let result2 = get_stopwords_with_fallback("xx", "ES");
607
- assert!(result1.is_some());
608
- assert!(result2.is_some());
609
- }
610
-
611
- #[test]
612
- fn test_get_stopwords_with_fallback_locale_codes() {
613
- let result = get_stopwords_with_fallback("es-MX", "en-US");
614
- assert!(result.is_some());
615
- let stopwords = result.unwrap();
616
- assert!(stopwords.contains("el"));
617
- }
618
-
619
- #[test]
620
- fn test_get_stopwords_with_fallback_esperanto_to_english() {
621
- let result = get_stopwords_with_fallback("eo", "en");
622
- assert!(result.is_some());
623
- let stopwords = result.unwrap();
624
- assert!(stopwords.contains("la"));
625
- }
626
-
627
- #[test]
628
- fn test_get_stopwords_with_fallback_unknown_to_english() {
629
- let result = get_stopwords_with_fallback("xyz", "en");
630
- assert!(result.is_some());
631
- let stopwords = result.unwrap();
632
- assert!(stopwords.contains("the"));
633
- }
634
-
635
- #[test]
636
- fn test_get_stopwords_with_fallback_same_as_chained_or_else() {
637
- let manual = get_stopwords("xx").or_else(|| get_stopwords("en"));
638
- let helper = get_stopwords_with_fallback("xx", "en");
639
- assert_eq!(manual.is_some(), helper.is_some());
640
- if let (Some(m), Some(h)) = (manual, helper) {
641
- assert_eq!(m.len(), h.len());
642
- }
643
- }
644
-
645
- #[test]
646
- fn test_get_stopwords_invalid_language_codes() {
647
- assert!(get_stopwords("invalid_lang").is_none());
648
- assert!(get_stopwords("xyz").is_none());
649
- assert!(get_stopwords("zzzz").is_none());
650
- assert!(get_stopwords("abc123").is_none());
651
- assert!(get_stopwords("!!!").is_none());
652
- }
653
-
654
- #[test]
655
- fn test_get_stopwords_edge_case_empty_and_whitespace() {
656
- assert!(get_stopwords("").is_none());
657
- assert!(get_stopwords(" ").is_none());
658
- assert!(get_stopwords(" ").is_none());
659
- assert!(get_stopwords("\t").is_none());
660
- assert!(get_stopwords("\n").is_none());
661
- }
662
-
663
- #[test]
664
- fn test_get_stopwords_special_characters() {
665
- assert!(get_stopwords("@#").is_none());
666
- assert!(get_stopwords("$%").is_none());
667
- assert!(get_stopwords("!!!").is_none());
668
-
669
- let result = get_stopwords("en!");
670
- assert!(result.is_some());
671
- if let Some(stopwords) = result {
672
- assert!(stopwords.contains("the"));
673
- }
674
-
675
- let result = get_stopwords("es@");
676
- assert!(result.is_some());
677
- if let Some(stopwords) = result {
678
- assert!(stopwords.contains("el"));
679
- }
680
-
681
- let result = get_stopwords("de#fr");
682
- assert!(result.is_some());
683
- if let Some(stopwords) = result {
684
- assert!(stopwords.contains("der"));
685
- }
686
- }
687
-
688
- #[test]
689
- fn test_get_stopwords_numeric_codes() {
690
- assert!(get_stopwords("12").is_none());
691
- assert!(get_stopwords("99").is_none());
692
- assert!(get_stopwords("123").is_none());
693
- assert!(get_stopwords("0").is_none());
694
- }
695
-
696
- #[test]
697
- fn test_get_stopwords_single_character_edge_cases() {
698
- assert!(get_stopwords("a").is_none());
699
- assert!(get_stopwords("e").is_none());
700
- assert!(get_stopwords("z").is_none());
701
- assert!(get_stopwords("1").is_none());
702
- assert!(get_stopwords("_").is_none());
703
- }
704
-
705
- #[test]
706
- fn test_get_stopwords_invalid_locale_formats() {
707
- assert!(get_stopwords("xx-YY").is_none());
708
- assert!(get_stopwords("zz_ZZ").is_none());
709
- assert!(get_stopwords("invalid-US").is_none());
710
- assert!(get_stopwords("aa_BB_CC").is_none());
711
- }
712
-
713
- #[test]
714
- fn test_get_stopwords_mixed_valid_invalid() {
715
- let result = get_stopwords("en123");
716
- assert!(result.is_some(), "Should extract 'en' from 'en123'");
717
-
718
- assert!(get_stopwords("12en").is_none());
719
- assert!(get_stopwords("@@en").is_none());
720
- }
721
-
722
- #[test]
723
- fn test_get_stopwords_case_sensitivity_validation() {
724
- let lower = get_stopwords("en");
725
- let upper = get_stopwords("EN");
726
- let mixed1 = get_stopwords("En");
727
- let mixed2 = get_stopwords("eN");
728
-
729
- assert!(lower.is_some());
730
- assert!(upper.is_some());
731
- assert!(mixed1.is_some());
732
- assert!(mixed2.is_some());
733
-
734
- if let (Some(l), Some(u), Some(m1), Some(m2)) = (lower, upper, mixed1, mixed2) {
735
- assert_eq!(l.len(), u.len());
736
- assert_eq!(l.len(), m1.len());
737
- assert_eq!(l.len(), m2.len());
738
- }
739
- }
740
-
741
- #[test]
742
- fn test_get_stopwords_none_return_safety() {
743
- let result = get_stopwords("invalid").and_then(|_| get_stopwords("also_invalid"));
744
- assert!(result.is_none());
745
-
746
- let chained = get_stopwords("xxx")
747
- .or_else(|| get_stopwords("yyy"))
748
- .or_else(|| get_stopwords("zzz"));
749
- assert!(chained.is_none());
750
- }
751
-
752
- #[test]
753
- fn test_get_stopwords_with_fallback_both_invalid() {
754
- assert!(get_stopwords_with_fallback("invalid", "also_invalid").is_none());
755
- assert!(get_stopwords_with_fallback("xxx", "yyy").is_none());
756
- assert!(get_stopwords_with_fallback("", "").is_none());
757
- assert!(get_stopwords_with_fallback("123", "456").is_none());
758
- }
759
-
760
- #[test]
761
- fn test_get_stopwords_with_fallback_invalid_primary_valid_fallback() {
762
- let result = get_stopwords_with_fallback("invalid_lang", "en");
763
- assert!(result.is_some());
764
- if let Some(stopwords) = result {
765
- assert!(stopwords.contains("the"));
766
- }
767
-
768
- let result2 = get_stopwords_with_fallback("xyz", "es");
769
- assert!(result2.is_some());
770
- if let Some(stopwords) = result2 {
771
- assert!(stopwords.contains("el"));
772
- }
773
- }
774
-
775
- #[test]
776
- fn test_get_stopwords_with_fallback_valid_primary_invalid_fallback() {
777
- let result = get_stopwords_with_fallback("en", "invalid_fallback");
778
- assert!(result.is_some());
779
- if let Some(stopwords) = result {
780
- assert!(stopwords.contains("the"));
781
- }
782
-
783
- let result2 = get_stopwords_with_fallback("es", "zzz");
784
- assert!(result2.is_some());
785
- if let Some(stopwords) = result2 {
786
- assert!(stopwords.contains("el"));
787
- }
788
- }
789
-
790
- #[test]
791
- fn test_get_stopwords_with_fallback_empty_strings() {
792
- assert!(get_stopwords_with_fallback("", "en").is_some());
793
- assert!(get_stopwords_with_fallback("en", "").is_some());
794
- assert!(get_stopwords_with_fallback("", "").is_none());
795
- }
796
-
797
- #[test]
798
- fn test_get_stopwords_with_fallback_special_characters() {
799
- assert!(get_stopwords_with_fallback("@#$", "en").is_some());
800
- assert!(get_stopwords_with_fallback("en", "!!!").is_some());
801
- assert!(get_stopwords_with_fallback("@#$", "!!!").is_none());
802
- }
803
-
804
- #[test]
805
- fn test_get_stopwords_with_fallback_case_insensitive_validation() {
806
- let result1 = get_stopwords_with_fallback("INVALID", "en");
807
- let result2 = get_stopwords_with_fallback("invalid", "EN");
808
- let result3 = get_stopwords_with_fallback("INVALID", "EN");
809
-
810
- assert!(result1.is_some());
811
- assert!(result2.is_some());
812
- assert!(result3.is_some());
813
-
814
- if let (Some(r1), Some(r2), Some(r3)) = (result1, result2, result3) {
815
- assert!(r1.contains("the"));
816
- assert!(r2.contains("the"));
817
- assert!(r3.contains("the"));
818
- }
819
- }
820
-
821
- #[test]
822
- fn test_direct_stopwords_access_invalid_keys() {
823
- assert!(STOPWORDS.get("invalid").is_none());
824
- assert!(STOPWORDS.get("EN").is_none());
825
- assert!(STOPWORDS.get("en-US").is_none());
826
- assert!(STOPWORDS.get("xyz").is_none());
827
- assert!(STOPWORDS.get("").is_none());
828
- }
829
-
830
- #[test]
831
- fn test_stopwords_case_sensitivity_direct_vs_normalized() {
832
- assert!(STOPWORDS.get("EN").is_none());
833
- assert!(get_stopwords("EN").is_some());
834
-
835
- assert!(STOPWORDS.get("Es").is_none());
836
- assert!(get_stopwords("Es").is_some());
837
-
838
- assert!(STOPWORDS.get("DE").is_none());
839
- assert!(get_stopwords("DE").is_some());
840
- }
841
-
842
- #[test]
843
- fn test_get_stopwords_unicode_characters() {
844
- // NOTE: Current implementation has a limitation - it uses byte slicing which can panic
845
-
846
- let result = get_stopwords("zh-中文");
847
- assert!(result.is_some());
848
-
849
- let result = get_stopwords("ar-العربية");
850
- assert!(result.is_some());
851
-
852
- let result = get_stopwords("ja_日本");
853
- assert!(result.is_some());
854
-
855
- assert!(get_stopwords("xx").is_none());
856
- assert!(get_stopwords("yy").is_none());
857
-
858
- // NOTE: The following would panic due to byte slicing on multi-byte chars:
859
- }
860
-
861
- #[test]
862
- fn test_get_stopwords_very_long_strings() {
863
- let long_string = "x".repeat(1000);
864
- assert!(get_stopwords(&long_string).is_none());
865
-
866
- let long_locale = "en-".to_string() + &"X".repeat(100);
867
- let result = get_stopwords(&long_locale);
868
- assert!(result.is_some());
869
- }
870
-
871
- #[test]
872
- fn test_get_stopwords_null_bytes() {
873
- assert!(get_stopwords("\0").is_none());
874
- assert!(get_stopwords("en\0").is_some());
875
- assert!(get_stopwords("\0en").is_none());
876
- }
877
-
878
- #[test]
879
- fn test_get_stopwords_boundary_conditions() {
880
- assert!(get_stopwords("e").is_none());
881
- assert!(get_stopwords("en").is_some());
882
- assert!(get_stopwords("eng").is_some());
883
-
884
- let result = get_stopwords("en-");
885
- assert!(result.is_some());
886
- }
887
-
888
- #[test]
889
- fn test_get_stopwords_multiple_separators() {
890
- assert!(get_stopwords("en-US-utf8").is_some());
891
- assert!(get_stopwords("es_MX_special").is_some());
892
- assert!(get_stopwords("pt-BR_variant").is_some());
893
- }
894
-
895
- #[test]
896
- fn test_romance_languages() {
897
- let fr = get_stopwords("fr").expect("French stopwords should exist");
898
- assert!(fr.contains("le"), "French should contain 'le'");
899
- assert!(fr.contains("et"), "French should contain 'et'");
900
- assert!(fr.len() >= 150, "French should have substantial stopwords");
901
-
902
- let es = get_stopwords("es").expect("Spanish stopwords should exist");
903
- assert!(es.contains("el"), "Spanish should contain 'el'");
904
- assert!(es.contains("y"), "Spanish should contain 'y'");
905
- assert!(es.len() >= 200, "Spanish should have substantial stopwords");
906
-
907
- let pt = get_stopwords("pt").expect("Portuguese stopwords should exist");
908
- assert!(pt.contains("o"), "Portuguese should contain 'o'");
909
- assert!(pt.contains("e"), "Portuguese should contain 'e'");
910
- assert!(pt.len() >= 150, "Portuguese should have substantial stopwords");
911
-
912
- let it = get_stopwords("it").expect("Italian stopwords should exist");
913
- assert!(it.contains("il"), "Italian should contain 'il'");
914
- assert!(it.contains("e"), "Italian should contain 'e'");
915
- assert!(it.len() >= 150, "Italian should have substantial stopwords");
916
-
917
- let ro = get_stopwords("ro").expect("Romanian stopwords should exist");
918
- assert!(!ro.is_empty(), "Romanian should have stopwords");
919
- assert!(ro.len() >= 100, "Romanian should have substantial stopwords");
920
- }
921
-
922
- #[test]
923
- fn test_germanic_languages() {
924
- let de = get_stopwords("de").expect("German stopwords should exist");
925
- assert!(de.contains("der"), "German should contain 'der'");
926
- assert!(de.contains("die"), "German should contain 'die'");
927
- assert!(de.contains("und"), "German should contain 'und'");
928
- assert!(de.len() >= 200, "German should have substantial stopwords");
929
-
930
- let en = get_stopwords("en").expect("English stopwords should exist");
931
- assert!(en.contains("the"), "English should contain 'the'");
932
- assert!(en.contains("and"), "English should contain 'and'");
933
- assert!(en.len() >= 70, "English should have substantial stopwords");
934
-
935
- let nl = get_stopwords("nl").expect("Dutch stopwords should exist");
936
- assert!(nl.contains("de"), "Dutch should contain 'de'");
937
- assert!(nl.contains("het"), "Dutch should contain 'het'");
938
- assert!(nl.len() >= 100, "Dutch should have substantial stopwords");
939
-
940
- let sv = get_stopwords("sv").expect("Swedish stopwords should exist");
941
- assert!(!sv.is_empty(), "Swedish should have stopwords");
942
- assert!(sv.len() >= 100, "Swedish should have substantial stopwords");
943
-
944
- let no = get_stopwords("no").expect("Norwegian stopwords should exist");
945
- assert!(!no.is_empty(), "Norwegian should have stopwords");
946
-
947
- let da = get_stopwords("da").expect("Danish stopwords should exist");
948
- assert!(!da.is_empty(), "Danish should have stopwords");
949
- }
950
-
951
- #[test]
952
- fn test_slavic_languages() {
953
- let ru = get_stopwords("ru").expect("Russian stopwords should exist");
954
- assert!(!ru.is_empty(), "Russian should have stopwords");
955
- assert!(ru.len() >= 100, "Russian should have substantial stopwords");
956
-
957
- let pl = get_stopwords("pl").expect("Polish stopwords should exist");
958
- assert!(!pl.is_empty(), "Polish should have stopwords");
959
- assert!(pl.len() >= 100, "Polish should have substantial stopwords");
960
-
961
- let cs = get_stopwords("cs").expect("Czech stopwords should exist");
962
- assert!(!cs.is_empty(), "Czech should have stopwords");
963
-
964
- let sk = get_stopwords("sk").expect("Slovak stopwords should exist");
965
- assert!(!sk.is_empty(), "Slovak should have stopwords");
966
-
967
- let bg = get_stopwords("bg").expect("Bulgarian stopwords should exist");
968
- assert!(!bg.is_empty(), "Bulgarian should have stopwords");
969
-
970
- let uk = get_stopwords("uk").expect("Ukrainian stopwords should exist");
971
- assert!(!uk.is_empty(), "Ukrainian should have stopwords");
972
-
973
- let hr = get_stopwords("hr").expect("Croatian stopwords should exist");
974
- assert!(!hr.is_empty(), "Croatian should have stopwords");
975
-
976
- let sl = get_stopwords("sl").expect("Slovenian stopwords should exist");
977
- assert!(!sl.is_empty(), "Slovenian should have stopwords");
978
- }
979
-
980
- #[test]
981
- fn test_asian_languages() {
982
- let zh = get_stopwords("zh").expect("Chinese stopwords should exist");
983
- assert!(!zh.is_empty(), "Chinese should have stopwords");
984
- assert!(zh.len() >= 50, "Chinese should have substantial stopwords");
985
-
986
- let ja = get_stopwords("ja").expect("Japanese stopwords should exist");
987
- assert!(!ja.is_empty(), "Japanese should have stopwords");
988
- assert!(ja.len() >= 50, "Japanese should have substantial stopwords");
989
-
990
- let ko = get_stopwords("ko").expect("Korean stopwords should exist");
991
- assert!(!ko.is_empty(), "Korean should have stopwords");
992
-
993
- let hi = get_stopwords("hi").expect("Hindi stopwords should exist");
994
- assert!(!hi.is_empty(), "Hindi should have stopwords");
995
- assert!(hi.len() >= 100, "Hindi should have substantial stopwords");
996
-
997
- let bn = get_stopwords("bn").expect("Bengali stopwords should exist");
998
- assert!(!bn.is_empty(), "Bengali should have stopwords");
999
-
1000
- let th = get_stopwords("th").expect("Thai stopwords should exist");
1001
- assert!(!th.is_empty(), "Thai should have stopwords");
1002
-
1003
- let vi = get_stopwords("vi").expect("Vietnamese stopwords should exist");
1004
- assert!(!vi.is_empty(), "Vietnamese should have stopwords");
1005
- }
1006
-
1007
- #[test]
1008
- fn test_african_languages() {
1009
- let af = get_stopwords("af").expect("Afrikaans stopwords should exist");
1010
- assert!(!af.is_empty(), "Afrikaans should have stopwords");
1011
-
1012
- let sw = get_stopwords("sw").expect("Swahili stopwords should exist");
1013
- assert!(!sw.is_empty(), "Swahili should have stopwords");
1014
-
1015
- let yo = get_stopwords("yo").expect("Yoruba stopwords should exist");
1016
- assert!(!yo.is_empty(), "Yoruba should have stopwords");
1017
-
1018
- let zu = get_stopwords("zu").expect("Zulu stopwords should exist");
1019
- assert!(!zu.is_empty(), "Zulu should have stopwords");
1020
-
1021
- let ha = get_stopwords("ha").expect("Hausa stopwords should exist");
1022
- assert!(!ha.is_empty(), "Hausa should have stopwords");
1023
-
1024
- let so = get_stopwords("so").expect("Somali stopwords should exist");
1025
- assert!(!so.is_empty(), "Somali should have stopwords");
1026
-
1027
- let st = get_stopwords("st").expect("Sesotho stopwords should exist");
1028
- assert!(!st.is_empty(), "Sesotho should have stopwords");
1029
- }
1030
-
1031
- #[test]
1032
- fn test_indic_languages() {
1033
- let hi = get_stopwords("hi").expect("Hindi stopwords should exist");
1034
- assert!(!hi.is_empty(), "Hindi should have stopwords");
1035
-
1036
- let bn = get_stopwords("bn").expect("Bengali stopwords should exist");
1037
- assert!(!bn.is_empty(), "Bengali should have stopwords");
1038
-
1039
- let gu = get_stopwords("gu").expect("Gujarati stopwords should exist");
1040
- assert!(!gu.is_empty(), "Gujarati should have stopwords");
1041
-
1042
- let kn = get_stopwords("kn").expect("Kannada stopwords should exist");
1043
- assert!(!kn.is_empty(), "Kannada should have stopwords");
1044
-
1045
- let ml = get_stopwords("ml").expect("Malayalam stopwords should exist");
1046
- assert!(!ml.is_empty(), "Malayalam should have stopwords");
1047
-
1048
- let mr = get_stopwords("mr").expect("Marathi stopwords should exist");
1049
- assert!(!mr.is_empty(), "Marathi should have stopwords");
1050
-
1051
- let ta = get_stopwords("ta").expect("Tamil stopwords should exist");
1052
- assert!(!ta.is_empty(), "Tamil should have stopwords");
1053
-
1054
- let te = get_stopwords("te").expect("Telugu stopwords should exist");
1055
- assert!(!te.is_empty(), "Telugu should have stopwords");
1056
-
1057
- let ur = get_stopwords("ur").expect("Urdu stopwords should exist");
1058
- assert!(!ur.is_empty(), "Urdu should have stopwords");
1059
-
1060
- let ne = get_stopwords("ne").expect("Nepali stopwords should exist");
1061
- assert!(!ne.is_empty(), "Nepali should have stopwords");
1062
-
1063
- let si = get_stopwords("si").expect("Sinhala stopwords should exist");
1064
- assert!(!si.is_empty(), "Sinhala should have stopwords");
1065
- }
1066
-
1067
- #[test]
1068
- fn test_middle_eastern_languages() {
1069
- let ar = get_stopwords("ar").expect("Arabic stopwords should exist");
1070
- assert!(!ar.is_empty(), "Arabic should have stopwords");
1071
- assert!(ar.len() >= 100, "Arabic should have substantial stopwords");
1072
-
1073
- let fa = get_stopwords("fa").expect("Persian stopwords should exist");
1074
- assert!(!fa.is_empty(), "Persian should have stopwords");
1075
-
1076
- let he = get_stopwords("he").expect("Hebrew stopwords should exist");
1077
- assert!(!he.is_empty(), "Hebrew should have stopwords");
1078
-
1079
- let tr = get_stopwords("tr").expect("Turkish stopwords should exist");
1080
- assert!(!tr.is_empty(), "Turkish should have stopwords");
1081
-
1082
- let ku = get_stopwords("ku").expect("Kurdish stopwords should exist");
1083
- assert!(!ku.is_empty(), "Kurdish stopwords should exist");
1084
- }
1085
-
1086
- #[test]
1087
- fn test_other_languages() {
1088
- let hy = get_stopwords("hy").expect("Armenian stopwords should exist");
1089
- assert!(!hy.is_empty(), "Armenian should have stopwords");
1090
-
1091
- let eu = get_stopwords("eu").expect("Basque stopwords should exist");
1092
- assert!(!eu.is_empty(), "Basque should have stopwords");
1093
-
1094
- let br = get_stopwords("br").expect("Breton stopwords should exist");
1095
- assert!(!br.is_empty(), "Breton should have stopwords");
1096
-
1097
- let ca = get_stopwords("ca").expect("Catalan stopwords should exist");
1098
- assert!(!ca.is_empty(), "Catalan should have stopwords");
1099
-
1100
- let eo = get_stopwords("eo").expect("Esperanto stopwords should exist");
1101
- assert!(eo.contains("la"), "Esperanto should contain 'la'");
1102
- assert!(!eo.is_empty(), "Esperanto should have stopwords");
1103
-
1104
- let et = get_stopwords("et").expect("Estonian stopwords should exist");
1105
- assert!(!et.is_empty(), "Estonian should have stopwords");
1106
-
1107
- let fi = get_stopwords("fi").expect("Finnish stopwords should exist");
1108
- assert!(!fi.is_empty(), "Finnish should have stopwords");
1109
-
1110
- let gl = get_stopwords("gl").expect("Galician stopwords should exist");
1111
- assert!(!gl.is_empty(), "Galician should have stopwords");
1112
-
1113
- let hu = get_stopwords("hu").expect("Hungarian stopwords should exist");
1114
- assert!(!hu.is_empty(), "Hungarian should have stopwords");
1115
-
1116
- let id = get_stopwords("id").expect("Indonesian stopwords should exist");
1117
- assert!(!id.is_empty(), "Indonesian should have stopwords");
1118
-
1119
- let ga = get_stopwords("ga").expect("Irish stopwords should exist");
1120
- assert!(!ga.is_empty(), "Irish should have stopwords");
1121
-
1122
- let la = get_stopwords("la").expect("Latin stopwords should exist");
1123
- assert!(!la.is_empty(), "Latin should have stopwords");
1124
-
1125
- let lt = get_stopwords("lt").expect("Lithuanian stopwords should exist");
1126
- assert!(!lt.is_empty(), "Lithuanian should have stopwords");
1127
-
1128
- let lv = get_stopwords("lv").expect("Latvian stopwords should exist");
1129
- assert!(!lv.is_empty(), "Latvian should have stopwords");
1130
-
1131
- let ms = get_stopwords("ms").expect("Malay stopwords should exist");
1132
- assert!(!ms.is_empty(), "Malay should have stopwords");
1133
-
1134
- let tl = get_stopwords("tl").expect("Tagalog stopwords should exist");
1135
- assert!(!tl.is_empty(), "Tagalog should have stopwords");
1136
- }
1137
-
1138
- #[test]
1139
- fn test_language_code_variants() {
1140
- let eng = get_stopwords("eng");
1141
- let en = get_stopwords("en");
1142
- assert!(eng.is_some(), "'eng' should extract to 'en'");
1143
- assert!(en.is_some());
1144
- assert_eq!(eng.unwrap().len(), en.unwrap().len());
1145
-
1146
- let spa = get_stopwords("spa");
1147
- assert!(spa.is_none(), "'spa' extracts to 'sp' which is invalid");
1148
-
1149
- let deu = get_stopwords("deu");
1150
- let de = get_stopwords("de");
1151
- assert!(deu.is_some(), "'deu' should extract to 'de'");
1152
- assert_eq!(deu.unwrap().len(), de.unwrap().len());
1153
-
1154
- let fra = get_stopwords("fra");
1155
- let fr = get_stopwords("fr");
1156
- assert!(fra.is_some(), "'fra' should extract to 'fr'");
1157
- assert_eq!(fra.unwrap().len(), fr.unwrap().len());
1158
-
1159
- let zho = get_stopwords("zho");
1160
- let zh = get_stopwords("zh");
1161
- assert!(zho.is_some(), "'zho' should extract to 'zh'");
1162
- assert_eq!(zho.unwrap().len(), zh.unwrap().len());
1163
- }
1164
-
1165
- #[test]
1166
- fn test_stopword_set_sizes() {
1167
- let mut sizes: Vec<(String, usize)> = Vec::new();
1168
-
1169
- for (lang, stopwords) in STOPWORDS.iter() {
1170
- sizes.push((lang.clone(), stopwords.len()));
1171
- assert!(!stopwords.is_empty(), "Language {} has empty stopwords", lang);
1172
- assert!(
1173
- stopwords.len() >= 5,
1174
- "Language {} has suspiciously few stopwords: {}",
1175
- lang,
1176
- stopwords.len()
1177
- );
1178
- assert!(
1179
- stopwords.len() <= 1500,
1180
- "Language {} has suspiciously many stopwords: {}",
1181
- lang,
1182
- stopwords.len()
1183
- );
1184
- }
1185
-
1186
- assert_eq!(sizes.len(), 64, "Should have exactly 64 languages");
1187
-
1188
- let en_size = STOPWORDS.get("en").unwrap().len();
1189
- assert!(
1190
- (70..=1500).contains(&en_size),
1191
- "English stopwords size {} outside expected range",
1192
- en_size
1193
- );
1194
-
1195
- let es_size = STOPWORDS.get("es").unwrap().len();
1196
- assert!(
1197
- (200..=1000).contains(&es_size),
1198
- "Spanish stopwords size {} outside expected range",
1199
- es_size
1200
- );
1201
- }
1202
-
1203
- #[test]
1204
- fn test_stopword_content_quality() {
1205
- let en = get_stopwords("en").expect("English stopwords");
1206
- let english_common = vec![
1207
- "the", "is", "are", "was", "were", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of",
1208
- "with",
1209
- ];
1210
- for word in english_common {
1211
- assert!(en.contains(word), "English missing common stopword: {}", word);
1212
- }
1213
-
1214
- let es = get_stopwords("es").expect("Spanish stopwords");
1215
- let spanish_common = vec![
1216
- "el", "la", "los", "las", "un", "una", "de", "en", "y", "o", "por", "para",
1217
- ];
1218
- for word in spanish_common {
1219
- assert!(es.contains(word), "Spanish missing common stopword: {}", word);
1220
- }
1221
-
1222
- let de = get_stopwords("de").expect("German stopwords");
1223
- let german_common = vec![
1224
- "der", "die", "das", "den", "dem", "des", "und", "oder", "in", "auf", "mit", "von",
1225
- ];
1226
- for word in german_common {
1227
- assert!(de.contains(word), "German missing common stopword: {}", word);
1228
- }
1229
-
1230
- let fr = get_stopwords("fr").expect("French stopwords");
1231
- let french_common = vec![
1232
- "le", "la", "les", "un", "une", "de", "en", "et", "ou", "pour", "avec", "dans",
1233
- ];
1234
- for word in french_common {
1235
- assert!(fr.contains(word), "French missing common stopword: {}", word);
1236
- }
1237
- }
1238
-
1239
- #[test]
1240
- fn test_stopword_deduplication() {
1241
- for (lang, stopwords) in STOPWORDS.iter() {
1242
- let original_len = stopwords.len();
1243
- let unique_len = stopwords.iter().collect::<AHashSet<_>>().len();
1244
- assert_eq!(original_len, unique_len, "Language {} has duplicate stopwords", lang);
1245
- }
1246
- }
1247
-
1248
- #[test]
1249
- fn test_case_normalization_comprehensive() {
1250
- let test_cases = vec![
1251
- ("en", "EN", "En", "eN"),
1252
- ("es", "ES", "Es", "eS"),
1253
- ("de", "DE", "De", "dE"),
1254
- ("fr", "FR", "Fr", "fR"),
1255
- ("zh", "ZH", "Zh", "zH"),
1256
- ("ar", "AR", "Ar", "aR"),
1257
- ];
1258
-
1259
- for (lower, upper, title, mixed) in test_cases {
1260
- let lower_result = get_stopwords(lower);
1261
- let upper_result = get_stopwords(upper);
1262
- let title_result = get_stopwords(title);
1263
- let mixed_result = get_stopwords(mixed);
1264
-
1265
- assert!(lower_result.is_some(), "{} should be valid", lower);
1266
- assert!(upper_result.is_some(), "{} should be valid", upper);
1267
- assert!(title_result.is_some(), "{} should be valid", title);
1268
- assert!(mixed_result.is_some(), "{} should be valid", mixed);
1269
-
1270
- let len = lower_result.unwrap().len();
1271
- assert_eq!(upper_result.unwrap().len(), len);
1272
- assert_eq!(title_result.unwrap().len(), len);
1273
- assert_eq!(mixed_result.unwrap().len(), len);
1274
- }
1275
- }
1276
-
1277
- #[test]
1278
- fn test_locale_code_normalization_comprehensive() {
1279
- let test_cases = vec![
1280
- ("en-US", "en_US", "en-GB", "en_GB", "en"),
1281
- ("es-ES", "es_ES", "es-MX", "es_MX", "es"),
1282
- ("pt-PT", "pt_PT", "pt-BR", "pt_BR", "pt"),
1283
- ("zh-CN", "zh_CN", "zh-TW", "zh_TW", "zh"),
1284
- ("fr-FR", "fr_FR", "fr-CA", "fr_CA", "fr"),
1285
- ];
1286
-
1287
- for (hyphen1, underscore1, hyphen2, underscore2, base) in test_cases {
1288
- let base_result = get_stopwords(base).unwrap_or_else(|| panic!("{} should be valid", base));
1289
-
1290
- let h1 = get_stopwords(hyphen1);
1291
- let u1 = get_stopwords(underscore1);
1292
- let h2 = get_stopwords(hyphen2);
1293
- let u2 = get_stopwords(underscore2);
1294
-
1295
- assert!(h1.is_some(), "{} should be valid", hyphen1);
1296
- assert!(u1.is_some(), "{} should be valid", underscore1);
1297
- assert!(h2.is_some(), "{} should be valid", hyphen2);
1298
- assert!(u2.is_some(), "{} should be valid", underscore2);
1299
-
1300
- let len = base_result.len();
1301
- assert_eq!(h1.unwrap().len(), len, "{} should match {}", hyphen1, base);
1302
- assert_eq!(u1.unwrap().len(), len, "{} should match {}", underscore1, base);
1303
- assert_eq!(h2.unwrap().len(), len, "{} should match {}", hyphen2, base);
1304
- assert_eq!(u2.unwrap().len(), len, "{} should match {}", underscore2, base);
1305
- }
1306
- }
1307
-
1308
- #[test]
1309
- fn test_fallback_chains() {
1310
- let scenarios = vec![
1311
- ("en", "es", true, "en"),
1312
- ("xx", "en", true, "en"),
1313
- ("xx", "yy", false, ""),
1314
- ("es", "xx", true, "es"),
1315
- ];
1316
-
1317
- for (primary, fallback, should_succeed, expected_lang) in scenarios {
1318
- let result = get_stopwords_with_fallback(primary, fallback);
1319
- assert_eq!(
1320
- result.is_some(),
1321
- should_succeed,
1322
- "Fallback({}, {}) should {}",
1323
- primary,
1324
- fallback,
1325
- if should_succeed { "succeed" } else { "fail" }
1326
- );
1327
-
1328
- if should_succeed {
1329
- let stopwords = result.unwrap();
1330
- let expected = get_stopwords(expected_lang).unwrap();
1331
- assert_eq!(
1332
- stopwords.len(),
1333
- expected.len(),
1334
- "Fallback should return {} stopwords",
1335
- expected_lang
1336
- );
1337
- }
1338
- }
1339
- }
1340
-
1341
- #[test]
1342
- fn test_stopword_string_types() {
1343
- for (lang, stopwords) in STOPWORDS.iter() {
1344
- for word in stopwords {
1345
- assert!(!word.is_empty(), "Language {} has empty stopword", lang);
1346
- assert!(
1347
- word.len() <= 100,
1348
- "Language {} has suspiciously long stopword: {} ({} bytes)",
1349
- lang,
1350
- word,
1351
- word.len()
1352
- );
1353
- assert!(word.chars().count() > 0, "Language {} has invalid UTF-8 stopword", lang);
1354
- }
1355
- }
1356
- }
1357
-
1358
- #[test]
1359
- fn test_concurrent_access() {
1360
- use std::thread;
1361
-
1362
- let languages = vec!["en", "es", "de", "fr", "zh", "ar", "ru", "ja"];
1363
- let mut handles = vec![];
1364
-
1365
- for lang in languages {
1366
- let handle = thread::spawn(move || {
1367
- let stopwords = get_stopwords(lang);
1368
- assert!(stopwords.is_some(), "Language {} should be available", lang);
1369
- stopwords.unwrap().len()
1370
- });
1371
- handles.push(handle);
1372
- }
1373
-
1374
- for handle in handles {
1375
- let len = handle.join().expect("Thread should not panic");
1376
- assert!(len > 0, "Stopwords should not be empty");
1377
- }
1378
- }
1379
-
1380
- #[test]
1381
- fn test_stopwords_immutability() {
1382
- let en1 = get_stopwords("en").unwrap();
1383
- let en2 = get_stopwords("en").unwrap();
1384
-
1385
- assert_eq!(en1.len(), en2.len());
1386
-
1387
- for word in en1 {
1388
- assert!(
1389
- en2.contains(word),
1390
- "Stopword '{}' should exist in both references",
1391
- word
1392
- );
1393
- }
1394
- }
1395
-
1396
- #[test]
1397
- fn test_edge_case_separator_positions() {
1398
- let test_cases = vec![
1399
- ("en-", true),
1400
- ("-en", false),
1401
- ("e-n", false),
1402
- ("en--US", true),
1403
- ("en_-US", true),
1404
- ("_en", false),
1405
- ("en_", true),
1406
- ];
1407
-
1408
- for (code, should_find_en) in test_cases {
1409
- let result = get_stopwords(code);
1410
- if should_find_en {
1411
- assert!(result.is_some(), "Code '{}' should extract 'en'", code);
1412
- if let Some(stopwords) = result {
1413
- assert!(
1414
- stopwords.contains("the"),
1415
- "Code '{}' should return English stopwords",
1416
- code
1417
- );
1418
- }
1419
- } else {
1420
- let _ = result;
1421
- }
1422
- }
1423
- }
1424
-
1425
- #[test]
1426
- fn test_performance_characteristics() {
1427
- use std::time::Instant;
1428
-
1429
- let _ = get_stopwords("en");
1430
-
1431
- let start = Instant::now();
1432
- for _ in 0..10000 {
1433
- let _ = get_stopwords("en");
1434
- let _ = get_stopwords("es");
1435
- let _ = get_stopwords("de");
1436
- }
1437
- let duration = start.elapsed();
1438
-
1439
- assert!(
1440
- duration.as_millis() < 500,
1441
- "30,000 lookups took too long: {:?}",
1442
- duration
1443
- );
1444
- }
1445
-
1446
- #[test]
1447
- fn test_language_completeness() {
1448
- let documented = vec![
1449
- "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr",
1450
- "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt", "lv",
1451
- "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta",
1452
- "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
1453
- ];
1454
-
1455
- assert_eq!(documented.len(), 64, "Documentation lists 64 languages");
1456
-
1457
- for lang in documented {
1458
- assert!(
1459
- STOPWORDS.contains_key(lang),
1460
- "Documented language '{}' is missing from STOPWORDS",
1461
- lang
1462
- );
1463
- assert!(
1464
- get_stopwords(lang).is_some(),
1465
- "Documented language '{}' not accessible via get_stopwords",
1466
- lang
1467
- );
1468
- }
1469
- }
1470
- }
1
+ //! Stopwords management for text processing.
2
+ //!
3
+ //! Provides language-specific stopword collections used by keyword extraction
4
+ //! and token reduction features. Stopwords are common words (the, is, and, etc.)
5
+ //! that should be filtered out from text analysis.
6
+ //!
7
+ //! # Supported Languages
8
+ //!
9
+ //! Supports 64 languages with embedded stopword lists:
10
+ //! - Afrikaans (af), Arabic (ar), Bulgarian (bg), Bengali (bn), Breton (br)
11
+ //! - Catalan (ca), Czech (cs), Danish (da), German (de), Greek (el)
12
+ //! - English (en), Esperanto (eo), Spanish (es), Estonian (et), Basque (eu)
13
+ //! - Persian (fa), Finnish (fi), French (fr), Irish (ga), Galician (gl)
14
+ //! - Gujarati (gu), Hausa (ha), Hebrew (he), Hindi (hi), Croatian (hr)
15
+ //! - Hungarian (hu), Armenian (hy), Indonesian (id), Italian (it), Japanese (ja)
16
+ //! - Kannada (kn), Korean (ko), Kurdish (ku), Latin (la), Lithuanian (lt)
17
+ //! - Latvian (lv), Malayalam (ml), Marathi (mr), Malay (ms), Nepali (ne)
18
+ //! - Dutch (nl), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro)
19
+ //! - Russian (ru), Sinhala (si), Slovak (sk), Slovenian (sl), Somali (so)
20
+ //! - Sesotho (st), Swedish (sv), Swahili (sw), Tamil (ta), Telugu (te)
21
+ //! - Thai (th), Tagalog (tl), Turkish (tr), Ukrainian (uk), Urdu (ur)
22
+ //! - Vietnamese (vi), Yoruba (yo), Chinese (zh), Zulu (zu)
23
+ //!
24
+ //! All stopword lists are embedded in the binary at compile time for zero-overhead access.
25
+ //!
26
+ //! # Usage
27
+ //!
28
+ //! ```rust
29
+ //! use kreuzberg::stopwords::{get_stopwords, get_stopwords_with_fallback};
30
+ //!
31
+ //! // Get English stopwords with normalization
32
+ //! if let Some(en_stopwords) = get_stopwords("en") {
33
+ //! assert!(en_stopwords.contains("the"));
34
+ //!
35
+ //! // Check if a word is a stopword
36
+ //! if en_stopwords.contains("the") {
37
+ //! println!("'the' is a stopword");
38
+ //! }
39
+ //! }
40
+ //!
41
+ //! // Case-insensitive - all of these work
42
+ //! assert!(get_stopwords("EN").is_some());
43
+ //! assert!(get_stopwords("En").is_some());
44
+ //!
45
+ //! // Locale codes are reduced to the language part before the '-' or '_' separator
46
+ //! if let Some(en_us) = get_stopwords("en-US") {
47
+ //! if let Some(en_gb) = get_stopwords("en_GB") {
48
+ //! // Both point to "en" stopwords
49
+ //! assert_eq!(en_us.len(), en_gb.len());
50
+ //! }
51
+ //! }
52
+ //!
53
+ //! // Spanish with locale
54
+ //! if let Some(es_stopwords) = get_stopwords("es-ES") {
55
+ //! assert!(es_stopwords.contains("el"));
56
+ //! }
57
+ //!
58
+ //! // Fallback for unsupported languages
59
+ //! if let Some(stopwords) = get_stopwords_with_fallback("unknown", "en") {
60
+ //! // Will use English stopwords since "unknown" isn't supported
61
+ //! assert!(stopwords.contains("the"));
62
+ //! }
63
+ //! ```
64
+ //!
65
+ //! # Direct Access (Advanced)
66
+ //!
67
+ //! For advanced use cases where you need direct access to the HashMap or want to
68
+ //! iterate over all languages, you can use the `STOPWORDS` static directly:
69
+ //!
70
+ //! ```rust
71
+ //! use kreuzberg::stopwords::STOPWORDS;
72
+ //!
73
+ //! // Direct access (case-sensitive, no normalization)
74
+ //! let en_stopwords = STOPWORDS.get("en");
75
+ //!
76
+ //! // List all available languages
77
+ //! for lang in STOPWORDS.keys() {
78
+ //! println!("Available language: {}", lang);
79
+ //! }
80
+ //! ```
81
+
82
+ use ahash::{AHashMap, AHashSet};
83
+ use once_cell::sync::Lazy;
84
+
85
/// Embeds the stopword list of every listed language into a map expression.
///
/// For each language literal, the matching `<lang>_stopwords.json` file is pulled
/// into the binary with `include_str!()`, parsed as a JSON array of strings, and
/// stored in `$map` as an `AHashSet<String>` keyed by the language code.
///
/// # Panics
///
/// The generated code panics if an embedded JSON document cannot be parsed as a
/// list of strings.
macro_rules! embed_stopwords {
    ($map:expr, $($lang:literal),* $(,)?) => {
        $(
            {
                const JSON: &str = include_str!(concat!("../../stopwords/", $lang, "_stopwords.json"));
                // A parse failure means the embedded data itself is broken, so abort loudly.
                let words = serde_json::from_str::<Vec<String>>(JSON).unwrap_or_else(|e| {
                    panic!(
                        "Failed to parse embedded stopwords for language '{}': {}. \
                         This indicates corrupted or malformed JSON in the embedded stopwords data. \
                         Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
                        $lang, e
                    )
                });
                $map.insert($lang.to_string(), words.into_iter().collect::<AHashSet<String>>());
            }
        )*
    };
}
112
+
113
+ /// Global stopwords registry.
114
+ ///
115
+ /// A lazy-initialized map of language codes to stopword sets.
116
+ /// All stopword lists are embedded in the binary at compile time for
117
+ /// zero-overhead access and no runtime I/O dependencies.
118
+ ///
119
+ /// Supports 64 languages with comprehensive stopword coverage.
120
+ ///
121
+ /// # Note
122
+ ///
123
+ /// For most use cases, prefer [`get_stopwords()`] which provides language code
124
+ /// normalization (case-insensitive, locale handling). Direct access to STOPWORDS
125
+ /// is case-sensitive and requires exact language codes (lowercase, 2-letter ISO 639-1).
126
+ ///
127
+ /// # Examples
128
+ ///
129
+ /// ```rust
130
+ /// use kreuzberg::stopwords::STOPWORDS;
131
+ ///
132
+ /// // Direct access (case-sensitive, no normalization)
133
+ /// let en_stopwords = STOPWORDS.get("en");
134
+ /// assert!(en_stopwords.is_some());
135
+ ///
136
+ /// // Case-sensitive - these return None
137
+ /// assert!(STOPWORDS.get("EN").is_none());
138
+ /// assert!(STOPWORDS.get("en-US").is_none());
139
+ ///
140
+ /// // List all available languages
141
+ /// assert_eq!(STOPWORDS.len(), 64);
142
+ /// for lang in STOPWORDS.keys() {
143
+ /// println!("Available: {}", lang);
144
+ /// }
145
+ /// ```
146
+ pub static STOPWORDS: Lazy<AHashMap<String, AHashSet<String>>> = Lazy::new(|| {
147
+ let mut map = AHashMap::new();
148
+
149
+ embed_stopwords!(
150
+ map, "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi",
151
+ "fr", "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt",
152
+ "lv", "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw",
153
+ "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
154
+ );
155
+
156
+ apply_stopword_whitelist(&mut map);
157
+
158
+ map
159
+ });
160
+
161
+ fn apply_stopword_whitelist(map: &mut AHashMap<String, AHashSet<String>>) {
162
+ const STOPWORD_REMOVALS: &[(&str, &[&str])] = &[("en", &["hello", "test", "world", "working", "great"])];
163
+
164
+ for (lang, words) in STOPWORD_REMOVALS {
165
+ if let Some(set) = map.get_mut(*lang) {
166
+ for &word in *words {
167
+ set.remove(word);
168
+ }
169
+ }
170
+ }
171
+ }
172
+
173
+ /// Get stopwords for a language with normalization.
174
+ ///
175
+ /// This function provides a user-friendly interface to the stopwords registry with:
176
+ /// - **Case-insensitive lookup**: "EN", "en", "En" all work
177
+ /// - **Locale normalization**: "en-US", "en_GB", "es-ES" extract to "en", "es"
178
+ /// - **Consistent behavior**: Returns `None` for unsupported languages
179
+ ///
180
+ /// # Language Code Format
181
+ ///
182
+ /// Accepts multiple formats:
183
+ /// - ISO 639-1 two-letter codes: `"en"`, `"es"`, `"de"`, etc.
184
+ /// - Uppercase variants: `"EN"`, `"ES"`, `"DE"`
185
+ /// - Locale codes with hyphen: `"en-US"`, `"es-ES"`, `"pt-BR"`
186
+ /// - Locale codes with underscore: `"en_US"`, `"es_ES"`, `"pt_BR"`
187
+ ///
188
+ /// All formats are normalized to lowercase two-letter ISO 639-1 codes.
189
+ ///
190
+ /// # Returns
191
+ ///
192
+ /// - `Some(&HashSet<String>)` if the language is supported (64 languages available)
193
+ /// - `None` if the language is not supported
194
+ ///
195
+ /// # Examples
196
+ ///
197
+ /// ```rust
198
+ /// use kreuzberg::stopwords::get_stopwords;
199
+ ///
200
+ /// // Simple language codes
201
+ /// if let Some(en) = get_stopwords("en") {
202
+ /// assert!(en.contains("the"));
203
+ /// }
204
+ ///
205
+ /// // Case-insensitive
206
+ /// assert!(get_stopwords("EN").is_some());
207
+ /// assert!(get_stopwords("En").is_some());
208
+ /// assert!(get_stopwords("eN").is_some());
209
+ ///
210
+ /// // Locale codes normalized to language code
211
+ /// if let (Some(en_us), Some(en_gb), Some(en_lowercase)) =
212
+ /// (get_stopwords("en-US"), get_stopwords("en_GB"), get_stopwords("en"))
213
+ /// {
214
+ /// // All point to the same stopwords set
215
+ /// assert_eq!(en_us.len(), en_gb.len());
216
+ /// assert_eq!(en_us.len(), en_lowercase.len());
217
+ /// }
218
+ ///
219
+ /// // Spanish with various formats
220
+ /// assert!(get_stopwords("es").is_some());
221
+ /// assert!(get_stopwords("ES").is_some());
222
+ /// assert!(get_stopwords("es-ES").is_some());
223
+ /// assert!(get_stopwords("es_MX").is_some());
224
+ ///
225
+ /// // Unsupported language returns None
226
+ /// assert!(get_stopwords("xx").is_none());
227
+ /// assert!(get_stopwords("zzzz").is_none());
228
+ /// ```
229
+ ///
230
+ /// # Performance
231
+ ///
232
+ /// This function performs two operations:
233
+ /// 1. String normalization (lowercase + truncate) - O(1) for typical language codes
234
+ /// 2. HashMap lookup in STOPWORDS - O(1) average case
235
+ ///
236
+ /// Total overhead is negligible (~10-50ns on modern CPUs).
237
+ pub fn get_stopwords(lang: &str) -> Option<&'static AHashSet<String>> {
238
+ let normalized = lang.to_lowercase();
239
+
240
+ let lang_code = if let Some(pos) = normalized.find(&['-', '_'][..]) {
241
+ &normalized[..pos]
242
+ } else if normalized.len() >= 2 {
243
+ &normalized[..2]
244
+ } else {
245
+ &normalized
246
+ };
247
+
248
+ STOPWORDS.get(lang_code)
249
+ }
250
+
251
+ /// Get stopwords for a language with fallback support.
252
+ ///
253
+ /// This function attempts to retrieve stopwords for the primary language,
254
+ /// and if not available, falls back to a secondary language. This is useful
255
+ /// for handling scenarios where:
256
+ /// - A detected language isn't supported
257
+ /// - You want to use English as a fallback for unknown languages
258
+ /// - You need graceful degradation for multilingual content
259
+ ///
260
+ /// Both language codes support the same normalization as [`get_stopwords()`]:
261
+ /// - Case-insensitive lookup (EN, en, En all work)
262
+ /// - Locale codes normalized (en-US, en_GB extract to "en")
263
+ ///
264
+ /// # Arguments
265
+ ///
266
+ /// * `language` - Primary language code to try first
267
+ /// * `fallback` - Fallback language code to use if primary not available
268
+ ///
269
+ /// # Returns
270
+ ///
271
+ /// - `Some(&HashSet<String>)` if either language is supported
272
+ /// - `None` if neither language is supported
273
+ ///
274
+ /// # Examples
275
+ ///
276
+ /// ```rust
277
+ /// use kreuzberg::stopwords::get_stopwords_with_fallback;
278
+ ///
279
+ /// // Detected language is Esperanto, fallback to English
280
+ /// if let Some(stopwords) = get_stopwords_with_fallback("eo", "en") {
281
+ /// // Will use Esperanto stopwords (supported)
282
+ /// assert!(stopwords.contains("la"));
283
+ /// }
284
+ ///
285
+ /// // Unsupported language, fallback to English
286
+ /// if let Some(stopwords) = get_stopwords_with_fallback("xx", "en") {
287
+ /// // Will use English stopwords (fallback)
288
+ /// assert!(stopwords.contains("the"));
289
+ /// }
290
+ ///
291
+ /// // Case-insensitive and locale-aware
292
+ /// let result = get_stopwords_with_fallback("es-MX", "EN-US");
293
+ /// assert!(result.is_some());
294
+ ///
295
+ /// // Both unsupported returns None
296
+ /// assert!(get_stopwords_with_fallback("xx", "zz").is_none());
297
+ /// ```
298
+ ///
299
+ /// # Common Patterns
300
+ ///
301
+ /// ```rust
302
+ /// use kreuzberg::stopwords::get_stopwords_with_fallback;
303
+ ///
304
+ /// // English fallback for unknown languages
305
+ /// let detected_lang = "xyz"; // Unknown language
306
+ /// let stopwords = get_stopwords_with_fallback(detected_lang, "en")
307
+ /// .expect("English fallback should always be available");
308
+ ///
309
+ /// // Multi-language content with English fallback
310
+ /// for lang in ["de", "fr", "unknown", "es"] {
311
+ /// if let Some(stopwords) = get_stopwords_with_fallback(lang, "en") {
312
+ /// println!("Using stopwords for: {}", lang);
313
+ /// }
314
+ /// }
315
+ /// ```
316
+ ///
317
+ /// # Performance
318
+ ///
319
+ /// This function performs at most two HashMap lookups:
320
+ /// 1. Try primary language (O(1) average case)
321
+ /// 2. If None, try fallback language (O(1) average case)
322
+ ///
323
+ /// Total overhead is negligible (~10-100ns on modern CPUs).
324
+ pub fn get_stopwords_with_fallback(language: &str, fallback: &str) -> Option<&'static AHashSet<String>> {
325
+ get_stopwords(language).or_else(|| get_stopwords(fallback))
326
+ }
327
+
328
+ #[cfg(test)]
329
+ mod tests {
330
+ use super::*;
331
+
332
#[test]
fn test_stopwords_lazy_initialization() {
    // Dereference the static once and check the two reference languages.
    let table = &*STOPWORDS;
    for code in ["en", "es"] {
        assert!(table.contains_key(code));
    }
    for code in ["en", "es"] {
        assert!(!table.get(code).unwrap().is_empty());
    }
}
340
+
341
#[test]
fn test_english_stopwords() {
    // Spot-check a few common English function words, then the list size.
    let english = STOPWORDS.get("en").unwrap();

    for word in ["the", "is", "and", "a", "of"] {
        assert!(english.contains(word));
    }

    assert!(english.len() >= 70);
}
353
+
354
#[test]
fn test_spanish_stopwords() {
    // Spot-check a few common Spanish function words, then the list size.
    let spanish = STOPWORDS.get("es").unwrap();

    for word in ["el", "la", "es", "en", "de"] {
        assert!(spanish.contains(word));
    }

    assert!(spanish.len() >= 200);
}
366
+
367
#[test]
fn test_all_64_languages_loaded() {
    // Every expected language key must be present with a non-empty set,
    // and the table must contain exactly 64 entries in total.
    let expected_codes = [
        "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr",
        "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt", "lv",
        "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta",
        "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
    ];

    for code in expected_codes.iter() {
        let entry = STOPWORDS.get(*code);
        assert!(entry.is_some(), "Missing stopwords for language: {}", code);
        assert!(!entry.unwrap().is_empty(), "Empty stopwords for language: {}", code);
    }

    let loaded = STOPWORDS.len();
    assert_eq!(loaded, 64, "Expected 64 languages, found {}", loaded);
}
391
+
392
#[test]
fn test_german_stopwords() {
    // German articles and the conjunction "und" must be present.
    let german = STOPWORDS.get("de").unwrap();
    for word in ["der", "die", "und"] {
        assert!(german.contains(word));
    }
}
399
+
400
#[test]
fn test_french_stopwords() {
    // A few common French function words must be present.
    let french = STOPWORDS.get("fr").unwrap();
    for word in ["le", "de", "un"] {
        assert!(french.contains(word));
    }
}
407
+
408
#[test]
fn test_chinese_stopwords() {
    // Only presence and non-emptiness are checked for Chinese.
    let chinese = STOPWORDS.get("zh");
    assert!(!chinese.unwrap().is_empty());
}
413
+
414
#[test]
fn test_arabic_stopwords() {
    // Only presence and non-emptiness are checked for Arabic.
    let arabic = STOPWORDS.get("ar");
    assert!(!arabic.unwrap().is_empty());
}
419
+
420
#[test]
fn test_unknown_language_returns_none() {
    // Direct table access: unknown keys are simply absent.
    let table = &*STOPWORDS;
    assert!(!table.contains_key("xx"));
    assert!(table.get("unknown").is_none());
}
425
+
426
#[test]
fn test_get_stopwords_lowercase() {
    // Plain lowercase two-letter codes resolve directly.
    for code in ["en", "es", "de", "fr"] {
        assert!(get_stopwords(code).is_some());
    }
}
433
+
434
#[test]
fn test_get_stopwords_uppercase() {
    // Uppercase input must resolve to a set of the same size as lowercase input.
    let shouted = get_stopwords("EN");
    let plain = get_stopwords("en");

    for found in [shouted, plain] {
        assert!(found.is_some());
    }

    let (shouted, plain) = (shouted.unwrap(), plain.unwrap());
    assert_eq!(shouted.len(), plain.len());
}
444
+
445
#[test]
fn test_get_stopwords_mixed_case() {
    // Any mix of upper- and lowercase letters is accepted.
    for code in ["En", "eN", "ES", "Es", "DE", "De"] {
        assert!(get_stopwords(code).is_some());
    }
}
454
+
455
#[test]
fn test_get_stopwords_locale_hyphen() {
    // Hyphenated locales must yield a set of the same size as the bare code.
    let american = get_stopwords("en-US");
    let british = get_stopwords("en-GB");
    let plain = get_stopwords("en");

    for regional in [american, british] {
        assert!(regional.is_some());
    }

    for regional in [american, british] {
        assert_eq!(regional.unwrap().len(), plain.unwrap().len());
    }
}
467
+
468
#[test]
fn test_get_stopwords_locale_underscore() {
    // Underscore locales must yield a set of the same size as the bare code.
    let spain = get_stopwords("es_ES");
    let mexico = get_stopwords("es_MX");
    let plain = get_stopwords("es");

    for regional in [spain, mexico] {
        assert!(regional.is_some());
    }

    for regional in [spain, mexico] {
        assert_eq!(regional.unwrap().len(), plain.unwrap().len());
    }
}
480
+
481
#[test]
fn test_get_stopwords_locale_uppercase() {
    // Locale codes in upper or mixed case, paired with a word each set must hold.
    let cases = [
        (get_stopwords("EN-US"), "the"),
        (get_stopwords("ES_ES"), "el"),
        (get_stopwords("Pt-BR"), "o"),
    ];

    for &(found, _) in &cases {
        assert!(found.is_some());
    }

    for &(found, word) in &cases {
        assert!(found.unwrap().contains(word));
    }
}
495
+
496
#[test]
fn test_get_stopwords_all_supported_languages() {
    // Every supported code must be reachable through the normalizing helper.
    let supported = [
        "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr",
        "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt", "lv",
        "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta",
        "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
    ];

    for code in supported.iter().copied() {
        let found = get_stopwords(code);
        assert!(found.is_some(), "Language {} should be available via get_stopwords", code);
    }
}
513
+
514
#[test]
fn test_get_stopwords_unsupported_language() {
    // Codes without a stopword list yield `None`.
    for code in ["xx", "zz", "xyz", "unknown"] {
        assert!(get_stopwords(code).is_none());
    }
}
521
+
522
#[test]
fn test_get_stopwords_empty_string() {
    // The empty string is not a language code.
    let found = get_stopwords("");
    assert!(found.is_none());
}
526
+
527
#[test]
fn test_get_stopwords_single_char() {
    // One-letter inputs are too short to match any language key.
    for code in ["e", "z"] {
        assert!(get_stopwords(code).is_none());
    }
}
532
+
533
#[test]
fn test_get_stopwords_long_locale() {
    // Only the part before the first separator matters, however long the tail is.
    let chinese_script = get_stopwords("zh-CN-Hans");
    let portuguese_posix = get_stopwords("pt_BR.UTF-8");

    for found in [chinese_script, portuguese_posix] {
        assert!(found.is_some());
    }

    let chinese_len = chinese_script.unwrap().len();
    assert_eq!(chinese_len, get_stopwords("zh").unwrap().len());

    let portuguese_len = portuguese_posix.unwrap().len();
    assert_eq!(portuguese_len, get_stopwords("pt").unwrap().len());
}
544
+
545
#[test]
fn test_get_stopwords_content_verification() {
    // (code, message when the language is missing, words that must be present)
    let cases = [
        ("en", "English stopwords should exist", ["the", "is", "and"]),
        ("es", "Spanish stopwords should exist", ["el", "la", "es"]),
        ("de", "German stopwords should exist", ["der", "die", "und"]),
        ("fr", "French stopwords should exist", ["le", "de", "un"]),
    ];

    for (code, missing, expected_words) in cases {
        let words = get_stopwords(code).expect(missing);
        for word in expected_words {
            assert!(words.contains(word));
        }
    }
}
567
+
568
#[test]
fn test_get_stopwords_vs_direct_access() {
    // The helper and the raw table must expose the same English entries.
    let via_helper = get_stopwords("en").unwrap();
    let via_table = STOPWORDS.get("en").unwrap();

    assert_eq!(via_helper.len(), via_table.len());

    assert!(via_table.iter().all(|word| via_helper.contains(word)));
}
579
+
580
#[test]
fn test_get_stopwords_with_fallback_primary_available() {
    // A supported primary language wins; the Spanish fallback must not leak in.
    let found = get_stopwords_with_fallback("en", "es");
    assert!(found.is_some());
    if let Some(words) = found {
        assert!(words.contains("the"));
        assert!(!words.contains("el"));
    }
}
588
+
589
#[test]
fn test_get_stopwords_with_fallback_use_fallback() {
    // An unsupported primary language resolves to the English fallback.
    let found = get_stopwords_with_fallback("xx", "en");
    assert!(found.is_some());
    if let Some(words) = found {
        assert!(words.contains("the"));
    }
}
596
+
597
#[test]
fn test_get_stopwords_with_fallback_both_unavailable() {
    // With neither language supported there is nothing to return.
    assert!(get_stopwords_with_fallback("xx", "zz").is_none());
}
602
+
603
#[test]
fn test_get_stopwords_with_fallback_case_insensitive() {
    // Uppercase is accepted in both the primary and the fallback position.
    let upper_primary = get_stopwords_with_fallback("EN", "es");
    let upper_fallback = get_stopwords_with_fallback("xx", "ES");
    for found in [upper_primary, upper_fallback] {
        assert!(found.is_some());
    }
}
610
+
611
#[test]
fn test_get_stopwords_with_fallback_locale_codes() {
    // Locale codes work for both arguments; the Spanish primary is returned.
    let found = get_stopwords_with_fallback("es-MX", "en-US");
    assert!(found.is_some());
    if let Some(words) = found {
        assert!(words.contains("el"));
    }
}
618
+
619
#[test]
fn test_get_stopwords_with_fallback_esperanto_to_english() {
    // Esperanto is supported, so its own list (containing "la") is returned.
    let found = get_stopwords_with_fallback("eo", "en");
    assert!(found.is_some());
    if let Some(words) = found {
        assert!(words.contains("la"));
    }
}
626
+
627
#[test]
fn test_get_stopwords_with_fallback_unknown_to_english() {
    // An unknown three-letter code falls through to English.
    let found = get_stopwords_with_fallback("xyz", "en");
    assert!(found.is_some());
    if let Some(words) = found {
        assert!(words.contains("the"));
    }
}
634
+
635
#[test]
fn test_get_stopwords_with_fallback_same_as_chained_or_else() {
    // The helper must agree with a hand-written `or_else` chain.
    let by_hand = get_stopwords("xx").or_else(|| get_stopwords("en"));
    let via_helper = get_stopwords_with_fallback("xx", "en");

    assert_eq!(by_hand.is_some(), via_helper.is_some());

    if let Some((left, right)) = by_hand.zip(via_helper) {
        assert_eq!(left.len(), right.len());
    }
}
644
+
645
#[test]
fn test_get_stopwords_invalid_language_codes() {
    // None of these inputs reduces to a known language key.
    for code in ["invalid_lang", "xyz", "zzzz", "abc123", "!!!"] {
        assert!(get_stopwords(code).is_none());
    }
}
653
+
654
#[test]
fn test_get_stopwords_edge_case_empty_and_whitespace() {
    // Empty and whitespace-only inputs never match a language.
    for blank in ["", " ", " ", "\t", "\n"] {
        assert!(get_stopwords(blank).is_none());
    }
}
662
+
663
#[test]
fn test_get_stopwords_special_characters() {
    // Pure punctuation never matches a language.
    for junk in ["@#", "$%", "!!!"] {
        assert!(get_stopwords(junk).is_none());
    }

    // A valid two-letter prefix followed by punctuation still resolves,
    // because only the leading two characters are used without a separator.
    for (input, expected_word) in [("en!", "the"), ("es@", "el"), ("de#fr", "der")] {
        let found = get_stopwords(input);
        assert!(found.is_some());
        assert!(found.unwrap().contains(expected_word));
    }
}
687
+
688
#[test]
fn test_get_stopwords_numeric_codes() {
    // Digits are not language codes.
    for digits in ["12", "99", "123", "0"] {
        assert!(get_stopwords(digits).is_none());
    }
}
695
+
696
#[test]
fn test_get_stopwords_single_character_edge_cases() {
    // Single characters (letter, digit, or a lone separator) never match.
    for single in ["a", "e", "z", "1", "_"] {
        assert!(get_stopwords(single).is_none());
    }
}
704
+
705
#[test]
fn test_get_stopwords_invalid_locale_formats() {
    // Well-formed locale shapes with an unknown language part yield `None`.
    for locale in ["xx-YY", "zz_ZZ", "invalid-US", "aa_BB_CC"] {
        assert!(get_stopwords(locale).is_none());
    }
}
712
+
713
#[test]
fn test_get_stopwords_mixed_valid_invalid() {
    // A valid code followed by junk resolves via its two-character prefix.
    let prefixed = get_stopwords("en123");
    assert!(prefixed.is_some(), "Should extract 'en' from 'en123'");

    // A valid code preceded by junk does not.
    for input in ["12en", "@@en"] {
        assert!(get_stopwords(input).is_none());
    }
}
721
+
722
#[test]
fn test_get_stopwords_case_sensitivity_validation() {
    // Index 0 is the lowercase reference; the rest are case variants of it.
    let variants = ["en", "EN", "En", "eN"].map(get_stopwords);

    for found in variants {
        assert!(found.is_some());
    }

    let sizes = variants.map(|found| found.unwrap().len());
    for other in &sizes[1..] {
        assert_eq!(sizes[0], *other);
    }
}
740
+
741
#[test]
fn test_get_stopwords_none_return_safety() {
    // `None` results compose cleanly with the usual Option combinators.
    let and_then_chain = get_stopwords("invalid").and_then(|_| get_stopwords("also_invalid"));
    assert!(and_then_chain.is_none());

    let or_else_chain = get_stopwords("xxx")
        .or_else(|| get_stopwords("yyy"))
        .or_else(|| get_stopwords("zzz"));
    assert!(or_else_chain.is_none());
}
751
+
752
#[test]
fn test_get_stopwords_with_fallback_both_invalid() {
    // (primary, fallback) pairs where neither side is a known language.
    let pairs = [("invalid", "also_invalid"), ("xxx", "yyy"), ("", ""), ("123", "456")];
    for (primary, fallback) in pairs {
        assert!(get_stopwords_with_fallback(primary, fallback).is_none());
    }
}
759
+
760
#[test]
fn test_get_stopwords_with_fallback_invalid_primary_valid_fallback() {
    // (invalid primary, valid fallback, word expected from the fallback list)
    let cases = [("invalid_lang", "en", "the"), ("xyz", "es", "el")];

    for (primary, fallback, expected_word) in cases {
        let found = get_stopwords_with_fallback(primary, fallback);
        assert!(found.is_some());
        assert!(found.unwrap().contains(expected_word));
    }
}
774
+
775
#[test]
fn test_get_stopwords_with_fallback_valid_primary_invalid_fallback() {
    // (valid primary, invalid fallback, word expected from the primary list)
    let cases = [("en", "invalid_fallback", "the"), ("es", "zzz", "el")];

    for (primary, fallback, expected_word) in cases {
        let found = get_stopwords_with_fallback(primary, fallback);
        assert!(found.is_some());
        assert!(found.unwrap().contains(expected_word));
    }
}
789
+
790
#[test]
fn test_get_stopwords_with_fallback_empty_strings() {
    // An empty string behaves like any other unsupported code.
    let empty_primary = get_stopwords_with_fallback("", "en");
    let empty_fallback = get_stopwords_with_fallback("en", "");
    let both_empty = get_stopwords_with_fallback("", "");

    assert!(empty_primary.is_some());
    assert!(empty_fallback.is_some());
    assert!(both_empty.is_none());
}
796
+
797
#[test]
fn test_get_stopwords_with_fallback_special_characters() {
    // Punctuation-only codes are unsupported on either side.
    let junk_primary = get_stopwords_with_fallback("@#$", "en");
    let junk_fallback = get_stopwords_with_fallback("en", "!!!");
    let junk_both = get_stopwords_with_fallback("@#$", "!!!");

    assert!(junk_primary.is_some());
    assert!(junk_fallback.is_some());
    assert!(junk_both.is_none());
}
803
+
804
#[test]
fn test_get_stopwords_with_fallback_case_insensitive_validation() {
    // The invalid primary stays invalid in any case; the English fallback
    // must be found in any case.
    let outcomes = [
        get_stopwords_with_fallback("INVALID", "en"),
        get_stopwords_with_fallback("invalid", "EN"),
        get_stopwords_with_fallback("INVALID", "EN"),
    ];

    for found in outcomes {
        assert!(found.is_some());
    }

    for found in outcomes {
        assert!(found.unwrap().contains("the"));
    }
}
820
+
821
#[test]
fn test_direct_stopwords_access_invalid_keys() {
    // The raw table does no normalization: only exact lowercase keys exist.
    for key in ["invalid", "EN", "en-US", "xyz", ""] {
        assert!(STOPWORDS.get(key).is_none());
    }
}
829
+
830
#[test]
fn test_stopwords_case_sensitivity_direct_vs_normalized() {
    // Raw table access is case-sensitive; the helper normalizes case first.
    for code in ["EN", "Es", "DE"] {
        assert!(STOPWORDS.get(code).is_none());
        assert!(get_stopwords(code).is_some());
    }
}
841
+
842
#[test]
fn test_get_stopwords_unicode_characters() {
    // Every non-ASCII input here has an ASCII language code followed by a
    // separator, so only the part before the separator is looked up.
    for locale in ["zh-中文", "ar-العربية", "ja_日本"] {
        let found = get_stopwords(locale);
        assert!(found.is_some());
    }

    for code in ["xx", "yy"] {
        assert!(get_stopwords(code).is_none());
    }

    // NOTE: separator-free inputs that begin with multi-byte characters are
    // intentionally not exercised here; the two-byte prefix slice in
    // `get_stopwords` was noted as a panic risk for such input.
}
860
+
861
#[test]
fn test_get_stopwords_very_long_strings() {
    // 1000 x's: the leading "xx" is not a language.
    let filler = "x".repeat(1000);
    assert!(get_stopwords(&filler).is_none());

    // "en-" followed by 100 X's: only "en" is looked up.
    let oversized_locale = format!("en-{}", "X".repeat(100));
    assert!(get_stopwords(&oversized_locale).is_some());
}
870
+
871
#[test]
fn test_get_stopwords_null_bytes() {
    // A NUL byte is just another character: harmless after a valid code,
    // fatal to the match when it comes first.
    let lone_nul = get_stopwords("\0");
    let trailing_nul = get_stopwords("en\0");
    let leading_nul = get_stopwords("\0en");

    assert!(lone_nul.is_none());
    assert!(trailing_nul.is_some());
    assert!(leading_nul.is_none());
}
877
+
878
#[test]
fn test_get_stopwords_boundary_conditions() {
    // One character is too short to match.
    assert!(get_stopwords("e").is_none());

    // Exactly two, three (prefix "en"), and a code with a trailing separator.
    for code in ["en", "eng", "en-"] {
        assert!(get_stopwords(code).is_some());
    }
}
887
+
888
#[test]
fn test_get_stopwords_multiple_separators() {
    // Only the first separator matters, whichever kind follows.
    for locale in ["en-US-utf8", "es_MX_special", "pt-BR_variant"] {
        assert!(get_stopwords(locale).is_some());
    }
}
894
+
895
#[test]
fn test_romance_languages() {
    // (code, missing message, [(word, absent message); 2], minimum size, size message)
    let cases = [
        (
            "fr",
            "French stopwords should exist",
            [("le", "French should contain 'le'"), ("et", "French should contain 'et'")],
            150,
            "French should have substantial stopwords",
        ),
        (
            "es",
            "Spanish stopwords should exist",
            [("el", "Spanish should contain 'el'"), ("y", "Spanish should contain 'y'")],
            200,
            "Spanish should have substantial stopwords",
        ),
        (
            "pt",
            "Portuguese stopwords should exist",
            [("o", "Portuguese should contain 'o'"), ("e", "Portuguese should contain 'e'")],
            150,
            "Portuguese should have substantial stopwords",
        ),
        (
            "it",
            "Italian stopwords should exist",
            [("il", "Italian should contain 'il'"), ("e", "Italian should contain 'e'")],
            150,
            "Italian should have substantial stopwords",
        ),
    ];

    for (code, missing, expected_words, min_len, too_small) in cases {
        let words = get_stopwords(code).expect(missing);
        for (word, absent) in expected_words {
            assert!(words.contains(word), "{}", absent);
        }
        assert!(words.len() >= min_len, "{}", too_small);
    }

    // Romanian has no specific word check, only size checks.
    let romanian = get_stopwords("ro").expect("Romanian stopwords should exist");
    assert!(!romanian.is_empty(), "Romanian should have stopwords");
    assert!(romanian.len() >= 100, "Romanian should have substantial stopwords");
}
921
+
922
#[test]
fn test_germanic_languages() {
    // (code, missing message, [(word, absent message)], minimum size, size message)
    let with_words: [(&str, &str, &[(&str, &str)], usize, &str); 3] = [
        (
            "de",
            "German stopwords should exist",
            &[
                ("der", "German should contain 'der'"),
                ("die", "German should contain 'die'"),
                ("und", "German should contain 'und'"),
            ],
            200,
            "German should have substantial stopwords",
        ),
        (
            "en",
            "English stopwords should exist",
            &[("the", "English should contain 'the'"), ("and", "English should contain 'and'")],
            70,
            "English should have substantial stopwords",
        ),
        (
            "nl",
            "Dutch stopwords should exist",
            &[("de", "Dutch should contain 'de'"), ("het", "Dutch should contain 'het'")],
            100,
            "Dutch should have substantial stopwords",
        ),
    ];

    for (code, missing, expected_words, min_len, too_small) in with_words {
        let words = get_stopwords(code).expect(missing);
        for (word, absent) in expected_words {
            assert!(words.contains(*word), "{}", absent);
        }
        assert!(words.len() >= min_len, "{}", too_small);
    }

    // Swedish: non-empty plus a size floor, no specific words.
    let swedish = get_stopwords("sv").expect("Swedish stopwords should exist");
    assert!(!swedish.is_empty(), "Swedish should have stopwords");
    assert!(swedish.len() >= 100, "Swedish should have substantial stopwords");

    // Norwegian and Danish: existence and non-emptiness only.
    let presence_only = [
        ("no", "Norwegian stopwords should exist", "Norwegian should have stopwords"),
        ("da", "Danish stopwords should exist", "Danish should have stopwords"),
    ];
    for (code, missing, empty) in presence_only {
        let words = get_stopwords(code).expect(missing);
        assert!(!words.is_empty(), "{}", empty);
    }
}
950
+
951
#[test]
fn test_slavic_languages() {
    // (code, missing message, empty message, optional (minimum size, size message))
    let cases = [
        (
            "ru",
            "Russian stopwords should exist",
            "Russian should have stopwords",
            Some((100, "Russian should have substantial stopwords")),
        ),
        (
            "pl",
            "Polish stopwords should exist",
            "Polish should have stopwords",
            Some((100, "Polish should have substantial stopwords")),
        ),
        ("cs", "Czech stopwords should exist", "Czech should have stopwords", None),
        ("sk", "Slovak stopwords should exist", "Slovak should have stopwords", None),
        ("bg", "Bulgarian stopwords should exist", "Bulgarian should have stopwords", None),
        ("uk", "Ukrainian stopwords should exist", "Ukrainian should have stopwords", None),
        ("hr", "Croatian stopwords should exist", "Croatian should have stopwords", None),
        ("sl", "Slovenian stopwords should exist", "Slovenian should have stopwords", None),
    ];

    for (code, missing, empty, size_floor) in cases {
        let words = get_stopwords(code).expect(missing);
        assert!(!words.is_empty(), "{}", empty);
        if let Some((min_len, too_small)) = size_floor {
            assert!(words.len() >= min_len, "{}", too_small);
        }
    }
}
979
+
980
#[test]
fn test_asian_languages() {
    // (code, missing message, empty message, optional (minimum size, size message))
    let cases = [
        (
            "zh",
            "Chinese stopwords should exist",
            "Chinese should have stopwords",
            Some((50, "Chinese should have substantial stopwords")),
        ),
        (
            "ja",
            "Japanese stopwords should exist",
            "Japanese should have stopwords",
            Some((50, "Japanese should have substantial stopwords")),
        ),
        ("ko", "Korean stopwords should exist", "Korean should have stopwords", None),
        (
            "hi",
            "Hindi stopwords should exist",
            "Hindi should have stopwords",
            Some((100, "Hindi should have substantial stopwords")),
        ),
        ("bn", "Bengali stopwords should exist", "Bengali should have stopwords", None),
        ("th", "Thai stopwords should exist", "Thai should have stopwords", None),
        ("vi", "Vietnamese stopwords should exist", "Vietnamese should have stopwords", None),
    ];

    for (code, missing, empty, size_floor) in cases {
        let words = get_stopwords(code).expect(missing);
        assert!(!words.is_empty(), "{}", empty);
        if let Some((min_len, too_small)) = size_floor {
            assert!(words.len() >= min_len, "{}", too_small);
        }
    }
}
1006
+
1007
#[test]
fn test_african_languages() {
    // (code, missing message, empty message)
    let cases = [
        ("af", "Afrikaans stopwords should exist", "Afrikaans should have stopwords"),
        ("sw", "Swahili stopwords should exist", "Swahili should have stopwords"),
        ("yo", "Yoruba stopwords should exist", "Yoruba should have stopwords"),
        ("zu", "Zulu stopwords should exist", "Zulu should have stopwords"),
        ("ha", "Hausa stopwords should exist", "Hausa should have stopwords"),
        ("so", "Somali stopwords should exist", "Somali should have stopwords"),
        ("st", "Sesotho stopwords should exist", "Sesotho should have stopwords"),
    ];

    for (code, missing, empty) in cases {
        let words = get_stopwords(code).expect(missing);
        assert!(!words.is_empty(), "{}", empty);
    }
}
1030
+
1031
#[test]
fn test_indic_languages() {
    // (code, missing message, empty message)
    let cases = [
        ("hi", "Hindi stopwords should exist", "Hindi should have stopwords"),
        ("bn", "Bengali stopwords should exist", "Bengali should have stopwords"),
        ("gu", "Gujarati stopwords should exist", "Gujarati should have stopwords"),
        ("kn", "Kannada stopwords should exist", "Kannada should have stopwords"),
        ("ml", "Malayalam stopwords should exist", "Malayalam should have stopwords"),
        ("mr", "Marathi stopwords should exist", "Marathi should have stopwords"),
        ("ta", "Tamil stopwords should exist", "Tamil should have stopwords"),
        ("te", "Telugu stopwords should exist", "Telugu should have stopwords"),
        ("ur", "Urdu stopwords should exist", "Urdu should have stopwords"),
        ("ne", "Nepali stopwords should exist", "Nepali should have stopwords"),
        ("si", "Sinhala stopwords should exist", "Sinhala should have stopwords"),
    ];

    for (code, missing, empty) in cases {
        let words = get_stopwords(code).expect(missing);
        assert!(!words.is_empty(), "{}", empty);
    }
}
1066
+
1067
#[test]
fn test_middle_eastern_languages() {
    // Arabic additionally has a size floor.
    let ar = get_stopwords("ar").expect("Arabic stopwords should exist");
    assert!(!ar.is_empty(), "Arabic should have stopwords");
    assert!(ar.len() >= 100, "Arabic should have substantial stopwords");

    // The remaining languages are checked for existence and non-emptiness.
    // (code, message when the lookup fails, message when the set is empty)
    //
    // The Kurdish emptiness message previously repeated the lookup message
    // ("Kurdish stopwords should exist"), which would misreport an empty set
    // as a missing language; it now matches the other languages.
    let cases = [
        ("fa", "Persian stopwords should exist", "Persian should have stopwords"),
        ("he", "Hebrew stopwords should exist", "Hebrew should have stopwords"),
        ("tr", "Turkish stopwords should exist", "Turkish should have stopwords"),
        ("ku", "Kurdish stopwords should exist", "Kurdish should have stopwords"),
    ];

    for (code, missing, empty) in cases {
        let words = get_stopwords(code).expect(missing);
        assert!(!words.is_empty(), "{}", empty);
    }
}
1085
+
1086
#[test]
fn test_other_languages() {
    // (code, missing message, optional (word, absent message), empty message)
    let cases = [
        ("hy", "Armenian stopwords should exist", None, "Armenian should have stopwords"),
        ("eu", "Basque stopwords should exist", None, "Basque should have stopwords"),
        ("br", "Breton stopwords should exist", None, "Breton should have stopwords"),
        ("ca", "Catalan stopwords should exist", None, "Catalan should have stopwords"),
        (
            "eo",
            "Esperanto stopwords should exist",
            Some(("la", "Esperanto should contain 'la'")),
            "Esperanto should have stopwords",
        ),
        ("et", "Estonian stopwords should exist", None, "Estonian should have stopwords"),
        ("fi", "Finnish stopwords should exist", None, "Finnish should have stopwords"),
        ("gl", "Galician stopwords should exist", None, "Galician should have stopwords"),
        ("hu", "Hungarian stopwords should exist", None, "Hungarian should have stopwords"),
        ("id", "Indonesian stopwords should exist", None, "Indonesian should have stopwords"),
        ("ga", "Irish stopwords should exist", None, "Irish should have stopwords"),
        ("la", "Latin stopwords should exist", None, "Latin should have stopwords"),
        ("lt", "Lithuanian stopwords should exist", None, "Lithuanian should have stopwords"),
        ("lv", "Latvian stopwords should exist", None, "Latvian should have stopwords"),
        ("ms", "Malay stopwords should exist", None, "Malay should have stopwords"),
        ("tl", "Tagalog stopwords should exist", None, "Tagalog should have stopwords"),
    ];

    for (code, missing, required_word, empty) in cases {
        let words = get_stopwords(code).expect(missing);
        // The specific-word check (Esperanto only) runs before the emptiness check.
        if let Some((word, absent)) = required_word {
            assert!(words.contains(word), "{}", absent);
        }
        assert!(!words.is_empty(), "{}", empty);
    }
}
1137
+
1138
/// Exercises three-letter language codes against their two-letter forms.
///
/// Judging by the assertion messages, a longer code is looked up by its
/// leading two characters, so `eng`, `deu`, `fra` and `zho` are expected to
/// match `en`, `de`, `fr` and `zh`, while `spa` is expected to find nothing.
#[test]
fn test_language_code_variants() {
    let eng = get_stopwords("eng");
    let en = get_stopwords("en");
    assert!(eng.is_some(), "'eng' should extract to 'en'");
    assert!(en.is_some());
    assert_eq!(eng.unwrap().len(), en.unwrap().len());

    assert!(
        get_stopwords("spa").is_none(),
        "'spa' extracts to 'sp' which is invalid"
    );

    // Remaining pairs share one shape: the long code must resolve, and both
    // lookups must return sets of the same size.
    for (long_code, short_code, message) in [
        ("deu", "de", "'deu' should extract to 'de'"),
        ("fra", "fr", "'fra' should extract to 'fr'"),
        ("zho", "zh", "'zho' should extract to 'zh'"),
    ] {
        let from_long = get_stopwords(long_code);
        let from_short = get_stopwords(short_code);
        assert!(from_long.is_some(), "{}", message);
        assert_eq!(from_long.unwrap().len(), from_short.unwrap().len());
    }
}
1164
+
1165
/// Sanity-checks the size of every stopword set in `STOPWORDS`.
///
/// Each language must have between 5 and 1500 entries, exactly 64 languages
/// must be present, and the English and Spanish sets must fall inside their
/// own expected ranges.
#[test]
fn test_stopword_set_sizes() {
    // Only the number of languages is needed afterwards, so count them rather
    // than collecting cloned keys into a temporary vector.
    let mut language_count = 0usize;

    for (lang, stopwords) in STOPWORDS.iter() {
        language_count += 1;
        let size = stopwords.len();

        assert!(!stopwords.is_empty(), "Language {} has empty stopwords", lang);
        assert!(
            size >= 5,
            "Language {} has suspiciously few stopwords: {}",
            lang,
            size
        );
        assert!(
            size <= 1500,
            "Language {} has suspiciously many stopwords: {}",
            lang,
            size
        );
    }

    assert_eq!(language_count, 64, "Should have exactly 64 languages");

    let en_size = STOPWORDS.get("en").unwrap().len();
    assert!(
        (70..=1500).contains(&en_size),
        "English stopwords size {} outside expected range",
        en_size
    );

    let es_size = STOPWORDS.get("es").unwrap().len();
    assert!(
        (200..=1000).contains(&es_size),
        "Spanish stopwords size {} outside expected range",
        es_size
    );
}
1202
+
1203
/// Spot-checks that the English, Spanish, German and French sets contain a
/// handful of very common function words.
#[test]
fn test_stopword_content_quality() {
    let english = get_stopwords("en").expect("English stopwords");
    for word in [
        "the", "is", "are", "was", "were", "a", "an", "and", "or", "but", "in", "on", "at", "to",
        "for", "of", "with",
    ] {
        assert!(english.contains(word), "English missing common stopword: {}", word);
    }

    let spanish = get_stopwords("es").expect("Spanish stopwords");
    for word in [
        "el", "la", "los", "las", "un", "una", "de", "en", "y", "o", "por", "para",
    ] {
        assert!(spanish.contains(word), "Spanish missing common stopword: {}", word);
    }

    let german = get_stopwords("de").expect("German stopwords");
    for word in [
        "der", "die", "das", "den", "dem", "des", "und", "oder", "in", "auf", "mit", "von",
    ] {
        assert!(german.contains(word), "German missing common stopword: {}", word);
    }

    let french = get_stopwords("fr").expect("French stopwords");
    for word in [
        "le", "la", "les", "un", "une", "de", "en", "et", "ou", "pour", "avec", "dans",
    ] {
        assert!(french.contains(word), "French missing common stopword: {}", word);
    }
}
1238
+
1239
/// Ensures no language lists the same stopword twice: collecting a language's
/// entries into a hash set must not reduce their count.
#[test]
fn test_stopword_deduplication() {
    for (lang, stopwords) in STOPWORDS.iter() {
        let distinct: AHashSet<_> = stopwords.iter().collect();
        assert_eq!(
            stopwords.len(),
            distinct.len(),
            "Language {} has duplicate stopwords",
            lang
        );
    }
}
1247
+
1248
/// Confirms that lookups ignore letter case: every casing of a language code
/// must resolve, and must return a set the same size as the lowercase lookup.
#[test]
fn test_case_normalization_comprehensive() {
    let casings = [
        ("en", "EN", "En", "eN"),
        ("es", "ES", "Es", "eS"),
        ("de", "DE", "De", "dE"),
        ("fr", "FR", "Fr", "fR"),
        ("zh", "ZH", "Zh", "zH"),
        ("ar", "AR", "Ar", "aR"),
    ];

    for (lower, upper, title, mixed) in casings {
        // First make sure every spelling is recognised at all...
        for code in [lower, upper, title, mixed] {
            assert!(get_stopwords(code).is_some(), "{} should be valid", code);
        }

        // ...then compare each variant against the lowercase baseline.
        let len = get_stopwords(lower).unwrap().len();
        for code in [upper, title, mixed] {
            assert_eq!(get_stopwords(code).unwrap().len(), len);
        }
    }
}
1276
+
1277
/// Confirms that locale-style codes (`xx-YY` and `xx_YY`) resolve to a set of
/// the same size as the one returned for the bare language code.
#[test]
fn test_locale_code_normalization_comprehensive() {
    let locales = [
        ("en-US", "en_US", "en-GB", "en_GB", "en"),
        ("es-ES", "es_ES", "es-MX", "es_MX", "es"),
        ("pt-PT", "pt_PT", "pt-BR", "pt_BR", "pt"),
        ("zh-CN", "zh_CN", "zh-TW", "zh_TW", "zh"),
        ("fr-FR", "fr_FR", "fr-CA", "fr_CA", "fr"),
    ];

    for (hyphen1, underscore1, hyphen2, underscore2, base) in locales {
        let expected_len = get_stopwords(base)
            .unwrap_or_else(|| panic!("{} should be valid", base))
            .len();

        let variants = [hyphen1, underscore1, hyphen2, underscore2];

        // Every variant has to resolve before any sizes are compared.
        for code in variants {
            assert!(get_stopwords(code).is_some(), "{} should be valid", code);
        }

        for code in variants {
            assert_eq!(
                get_stopwords(code).unwrap().len(),
                expected_len,
                "{} should match {}",
                code,
                base
            );
        }
    }
}
1307
+
1308
/// Exercises `get_stopwords_with_fallback` over primary/fallback pairs.
///
/// Each scenario states whether a result is expected and, if so, which
/// language's set the result should match in size.
#[test]
fn test_fallback_chains() {
    for (primary, fallback, should_succeed, expected_lang) in [
        ("en", "es", true, "en"),
        ("xx", "en", true, "en"),
        ("xx", "yy", false, ""),
        ("es", "xx", true, "es"),
    ] {
        let result = get_stopwords_with_fallback(primary, fallback);
        let outcome = if should_succeed { "succeed" } else { "fail" };
        assert_eq!(
            result.is_some(),
            should_succeed,
            "Fallback({}, {}) should {}",
            primary,
            fallback,
            outcome
        );

        // After the assertion above, a result is present exactly when the
        // scenario expects success.
        if let Some(stopwords) = result {
            let expected = get_stopwords(expected_lang).unwrap();
            assert_eq!(
                stopwords.len(),
                expected.len(),
                "Fallback should return {} stopwords",
                expected_lang
            );
        }
    }
}
1340
+
1341
/// Validates each individual stopword in every language: it must be
/// non-empty, at most 100 bytes long, and contain at least one character.
#[test]
fn test_stopword_string_types() {
    for (lang, stopwords) in STOPWORDS.iter() {
        stopwords.iter().for_each(|word| {
            assert!(!word.is_empty(), "Language {} has empty stopword", lang);

            // `len()` is measured in bytes, matching the wording of the message.
            let byte_len = word.len();
            assert!(
                byte_len <= 100,
                "Language {} has suspiciously long stopword: {} ({} bytes)",
                lang,
                word,
                byte_len
            );

            assert!(
                word.chars().next().is_some(),
                "Language {} has invalid UTF-8 stopword",
                lang
            );
        });
    }
}
1357
+
1358
/// Looks up several languages from separate threads and checks that every
/// thread finishes without panicking and reports a non-empty set.
#[test]
fn test_concurrent_access() {
    use std::thread;

    // Spawn every worker up front so the lookups can overlap.
    let workers: Vec<_> = vec!["en", "es", "de", "fr", "zh", "ar", "ru", "ja"]
        .into_iter()
        .map(|lang| {
            thread::spawn(move || {
                let stopwords = get_stopwords(lang);
                assert!(stopwords.is_some(), "Language {} should be available", lang);
                stopwords.unwrap().len()
            })
        })
        .collect();

    for worker in workers {
        let len = worker.join().expect("Thread should not panic");
        assert!(len > 0, "Stopwords should not be empty");
    }
}
1379
+
1380
/// Fetches the English set twice and checks that both lookups agree: the
/// sizes match and every word from the first is present in the second.
#[test]
fn test_stopwords_immutability() {
    let first = get_stopwords("en").unwrap();
    let second = get_stopwords("en").unwrap();

    assert_eq!(first.len(), second.len());

    first.iter().for_each(|word| {
        assert!(
            second.contains(word),
            "Stopword '{}' should exist in both references",
            word
        );
    });
}
1395
+
1396
/// Probes codes with separators in unusual positions.
///
/// Codes flagged `true` must resolve to the English set (checked via the
/// presence of "the"). Codes flagged `false` are only looked up; no outcome
/// is asserted for them, so they pass as long as the lookup returns.
#[test]
fn test_edge_case_separator_positions() {
    let cases = [
        ("en-", true),
        ("-en", false),
        ("e-n", false),
        ("en--US", true),
        ("en_-US", true),
        ("_en", false),
        ("en_", true),
    ];

    for (code, should_find_en) in cases {
        let result = get_stopwords(code);

        if !should_find_en {
            // Nothing is asserted for these inputs; the lookup above is the test.
            continue;
        }

        let stopwords = result.unwrap_or_else(|| panic!("Code '{}' should extract 'en'", code));
        assert!(
            stopwords.contains("the"),
            "Code '{}' should return English stopwords",
            code
        );
    }
}
1424
+
1425
/// Coarse timing guard: 30,000 lookups across three languages must finish in
/// under 500 milliseconds.
#[test]
fn test_performance_characteristics() {
    use std::time::{Duration, Instant};

    // One untimed lookup first, so any one-off setup cost stays out of the measurement.
    let _ = get_stopwords("en");

    let timer = Instant::now();
    for _ in 0..10000 {
        for lang in ["en", "es", "de"] {
            let _ = get_stopwords(lang);
        }
    }
    let duration = timer.elapsed();

    assert!(
        duration < Duration::from_millis(500),
        "30,000 lookups took too long: {:?}",
        duration
    );
}
1445
+
1446
/// Cross-checks the list of 64 documented language codes against the data:
/// each code must be a key of `STOPWORDS` and must resolve via `get_stopwords`.
#[test]
fn test_language_completeness() {
    // Kept as a slice so the length check below remains a runtime assertion.
    let documented: &[&str] = &[
        "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu",
        "fa", "fi", "fr", "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja",
        "kn", "ko", "ku", "la", "lt", "lv", "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro",
        "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur",
        "vi", "yo", "zh", "zu",
    ];

    assert_eq!(documented.len(), 64, "Documentation lists 64 languages");

    for &lang in documented {
        assert!(
            STOPWORDS.contains_key(lang),
            "Documented language '{}' is missing from STOPWORDS",
            lang
        );
        assert!(
            get_stopwords(lang).is_some(),
            "Documented language '{}' not accessible via get_stopwords",
            lang
        );
    }
}
1470
+ }