RubyGems - kreuzberg - Versions diffs - 4.0.0.rc2 → 4.0.1 - Mend

kreuzberg 4.0.0.rc2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (446) hide show

checksums.yaml +4 -4
data/.gitignore +14 -14
data/.rspec +3 -3
data/.rubocop.yaml +1 -1
data/.rubocop.yml +543 -538
data/Gemfile +8 -8
data/Gemfile.lock +194 -6
data/README.md +391 -426
data/Rakefile +34 -25
data/Steepfile +51 -47
data/examples/async_patterns.rb +283 -341
data/ext/kreuzberg_rb/extconf.rb +65 -45
data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
data/ext/kreuzberg_rb/native/README.md +425 -425
data/ext/kreuzberg_rb/native/build.rs +15 -15
data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
data/ext/kreuzberg_rb/native/include/strings.h +20 -20
data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
data/extconf.rb +60 -28
data/kreuzberg.gemspec +199 -148
data/lib/kreuzberg/api_proxy.rb +126 -142
data/lib/kreuzberg/cache_api.rb +67 -46
data/lib/kreuzberg/cli.rb +47 -55
data/lib/kreuzberg/cli_proxy.rb +117 -127
data/lib/kreuzberg/config.rb +936 -691
data/lib/kreuzberg/error_context.rb +136 -32
data/lib/kreuzberg/errors.rb +116 -118
data/lib/kreuzberg/extraction_api.rb +313 -85
data/lib/kreuzberg/mcp_proxy.rb +177 -186
data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
data/lib/kreuzberg/post_processor_protocol.rb +15 -86
data/lib/kreuzberg/result.rb +334 -216
data/lib/kreuzberg/setup_lib_path.rb +99 -80
data/lib/kreuzberg/types.rb +170 -0
data/lib/kreuzberg/validator_protocol.rb +16 -89
data/lib/kreuzberg/version.rb +5 -5
data/lib/kreuzberg.rb +96 -103
data/lib/libpdfium.so +0 -0
data/sig/kreuzberg/internal.rbs +184 -184
data/sig/kreuzberg.rbs +561 -520
data/spec/binding/async_operations_spec.rb +473 -0
data/spec/binding/batch_operations_spec.rb +595 -0
data/spec/binding/batch_spec.rb +359 -0
data/spec/binding/cache_spec.rb +227 -227
data/spec/binding/cli_proxy_spec.rb +85 -85
data/spec/binding/cli_spec.rb +55 -55
data/spec/binding/config_result_spec.rb +377 -0
data/spec/binding/config_spec.rb +419 -345
data/spec/binding/config_validation_spec.rb +377 -283
data/spec/binding/embeddings_spec.rb +816 -0
data/spec/binding/error_handling_spec.rb +399 -213
data/spec/binding/error_recovery_spec.rb +488 -0
data/spec/binding/errors_spec.rb +66 -66
data/spec/binding/font_config_spec.rb +220 -0
data/spec/binding/images_spec.rb +738 -0
data/spec/binding/keywords_extraction_spec.rb +600 -0
data/spec/binding/metadata_types_spec.rb +1228 -0
data/spec/binding/pages_extraction_spec.rb +471 -0
data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
data/spec/binding/plugins/postprocessor_spec.rb +269 -269
data/spec/binding/plugins/validator_spec.rb +273 -274
data/spec/binding/tables_spec.rb +641 -0
data/spec/fixtures/config.toml +38 -39
data/spec/fixtures/config.yaml +41 -41
data/spec/fixtures/invalid_config.toml +3 -4
data/spec/smoke/package_spec.rb +177 -178
data/spec/spec_helper.rb +40 -42
data/spec/unit/config/chunking_config_spec.rb +213 -0
data/spec/unit/config/embedding_config_spec.rb +343 -0
data/spec/unit/config/extraction_config_spec.rb +438 -0
data/spec/unit/config/font_config_spec.rb +285 -0
data/spec/unit/config/hierarchy_config_spec.rb +314 -0
data/spec/unit/config/image_extraction_config_spec.rb +209 -0
data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
data/spec/unit/config/keyword_config_spec.rb +229 -0
data/spec/unit/config/language_detection_config_spec.rb +258 -0
data/spec/unit/config/ocr_config_spec.rb +171 -0
data/spec/unit/config/page_config_spec.rb +221 -0
data/spec/unit/config/pdf_config_spec.rb +267 -0
data/spec/unit/config/postprocessor_config_spec.rb +290 -0
data/spec/unit/config/tesseract_config_spec.rb +181 -0
data/spec/unit/config/token_reduction_config_spec.rb +251 -0
data/test/metadata_types_test.rb +959 -0
data/vendor/Cargo.toml +61 -0
data/vendor/kreuzberg/Cargo.toml +259 -204
data/vendor/kreuzberg/README.md +263 -175
data/vendor/kreuzberg/build.rs +782 -474
data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
data/vendor/kreuzberg/src/api/error.rs +81 -81
data/vendor/kreuzberg/src/api/handlers.rs +320 -199
data/vendor/kreuzberg/src/api/mod.rs +94 -79
data/vendor/kreuzberg/src/api/server.rs +518 -353
data/vendor/kreuzberg/src/api/types.rs +206 -170
data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
data/vendor/kreuzberg/src/core/config.rs +1914 -1032
data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
data/vendor/kreuzberg/src/core/formats.rs +235 -0
data/vendor/kreuzberg/src/core/io.rs +329 -329
data/vendor/kreuzberg/src/core/mime.rs +605 -605
data/vendor/kreuzberg/src/core/mod.rs +61 -45
data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
data/vendor/kreuzberg/src/embeddings.rs +471 -432
data/vendor/kreuzberg/src/error.rs +431 -431
data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
data/vendor/kreuzberg/src/extraction/email.rs +855 -854
data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
data/vendor/kreuzberg/src/extraction/image.rs +492 -368
data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
data/vendor/kreuzberg/src/extraction/table.rs +329 -328
data/vendor/kreuzberg/src/extraction/text.rs +277 -269
data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
data/vendor/kreuzberg/src/extractors/email.rs +157 -143
data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
data/vendor/kreuzberg/src/extractors/html.rs +419 -393
data/vendor/kreuzberg/src/extractors/image.rs +219 -198
data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
data/vendor/kreuzberg/src/extractors/security.rs +484 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
data/vendor/kreuzberg/src/extractors/text.rs +265 -260
data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
data/vendor/kreuzberg/src/image/dpi.rs +164 -164
data/vendor/kreuzberg/src/image/mod.rs +6 -6
data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
data/vendor/kreuzberg/src/image/resize.rs +89 -89
data/vendor/kreuzberg/src/keywords/config.rs +154 -154
data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
data/vendor/kreuzberg/src/keywords/types.rs +68 -68
data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
data/vendor/kreuzberg/src/lib.rs +114 -105
data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
data/vendor/kreuzberg/src/ocr/error.rs +37 -37
data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
data/vendor/kreuzberg/src/ocr/types.rs +393 -393
data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
data/vendor/kreuzberg/src/panic_context.rs +154 -154
data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
data/vendor/kreuzberg/src/pdf/error.rs +214 -122
data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
data/vendor/kreuzberg/src/pdf/images.rs +139 -139
data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
data/vendor/kreuzberg/src/pdf/table.rs +417 -393
data/vendor/kreuzberg/src/pdf/text.rs +553 -158
data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
data/vendor/kreuzberg/src/text/mod.rs +27 -19
data/vendor/kreuzberg/src/text/quality.rs +710 -697
data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
data/vendor/kreuzberg/src/types.rs +1713 -903
data/vendor/kreuzberg/src/utils/mod.rs +31 -17
data/vendor/kreuzberg/src/utils/pool.rs +503 -0
data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
data/vendor/kreuzberg/src/utils/quality.rs +968 -959
data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
data/vendor/kreuzberg/tests/api_embed.rs +360 -0
data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
data/vendor/kreuzberg/tests/config_features.rs +612 -598
data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
data/vendor/kreuzberg/tests/core_integration.rs +519 -510
data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
data/vendor/kreuzberg/tests/email_integration.rs +327 -325
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
data/vendor/kreuzberg/tests/error_handling.rs +402 -393
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
data/vendor/kreuzberg/tests/format_integration.rs +165 -159
data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
data/vendor/kreuzberg/tests/image_integration.rs +255 -253
data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
data/vendor/kreuzberg/tests/page_markers.rs +297 -0
data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
data/vendor/kreuzberg/tests/security_validation.rs +416 -415
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
data/vendor/kreuzberg-ffi/README.md +851 -0
data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
data/vendor/kreuzberg-ffi/build.rs +168 -0
data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
data/vendor/kreuzberg-ffi/src/error.rs +901 -0
data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
data/vendor/kreuzberg-ffi/src/result.rs +510 -0
data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
data/vendor/kreuzberg-ffi/src/types.rs +363 -0
data/vendor/kreuzberg-ffi/src/util.rs +210 -0
data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
data/vendor/kreuzberg-tesseract/README.md +399 -0
data/vendor/kreuzberg-tesseract/build.rs +1127 -0
data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
metadata +196 -45
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/vendor/kreuzberg/src/stopwords/mod.rs CHANGED Viewed

@@ -1,1470 +1,1470 @@
-//! Stopwords management for text processing.
-//!
-//! Provides language-specific stopword collections used by keyword extraction
-//! and token reduction features. Stopwords are common words (the, is, and, etc.)
-//! that should be filtered out from text analysis.
-//!
-//! # Supported Languages
-//!
-//! Supports 64 languages with embedded stopword lists:
-//! - Afrikaans (af), Arabic (ar), Bulgarian (bg), Bengali (bn), Breton (br)
-//! - Catalan (ca), Czech (cs), Danish (da), German (de), Greek (el)
-//! - English (en), Esperanto (eo), Spanish (es), Estonian (et), Basque (eu)
-//! - Persian (fa), Finnish (fi), French (fr), Irish (ga), Galician (gl)
-//! - Gujarati (gu), Hausa (ha), Hebrew (he), Hindi (hi), Croatian (hr)
-//! - Hungarian (hu), Armenian (hy), Indonesian (id), Italian (it), Japanese (ja)
-//! - Kannada (kn), Korean (ko), Kurdish (ku), Latin (la), Lithuanian (lt)
-//! - Latvian (lv), Malayalam (ml), Marathi (mr), Malay (ms), Nepali (ne)
-//! - Dutch (nl), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro)
-//! - Russian (ru), Sinhala (si), Slovak (sk), Slovenian (sl), Somali (so)
-//! - Sesotho (st), Swedish (sv), Swahili (sw), Tamil (ta), Telugu (te)
-//! - Thai (th), Tagalog (tl), Turkish (tr), Ukrainian (uk), Urdu (ur)
-//! - Vietnamese (vi), Yoruba (yo), Chinese (zh), Zulu (zu)
-//!
-//! All stopword lists are embedded in the binary at compile time for zero-overhead access.
-//!
-//! # Usage
-//!
-//! ```rust
-//! use kreuzberg::stopwords::{get_stopwords, get_stopwords_with_fallback};
-//!
-//! // Get English stopwords with normalization
-//! if let Some(en_stopwords) = get_stopwords("en") {
-//!     assert!(en_stopwords.contains("the"));
-//!
-//!     // Check if a word is a stopword
-//!     if en_stopwords.contains("the") {
-//!         println!("'the' is a stopword");
-//!     }
-//! }
-//!
-//! // Case-insensitive - all of these work
-//! assert!(get_stopwords("EN").is_some());
-//! assert!(get_stopwords("En").is_some());
-//!
-//! // Locale codes are normalized to the language part before the first '-' or '_'
-//! if let Some(en_us) = get_stopwords("en-US") {
-//!     if let Some(en_gb) = get_stopwords("en_GB") {
-//!         // Both point to "en" stopwords
-//!         assert_eq!(en_us.len(), en_gb.len());
-//!     }
-//! }
-//!
-//! // Spanish with locale
-//! if let Some(es_stopwords) = get_stopwords("es-ES") {
-//!     assert!(es_stopwords.contains("el"));
-//! }
-//!
-//! // Fallback for unsupported languages
-//! if let Some(stopwords) = get_stopwords_with_fallback("unknown", "en") {
-//!     // Will use English stopwords since "unknown" isn't supported
-//!     assert!(stopwords.contains("the"));
-//! }
-//! ```
-//!
-//! # Direct Access (Advanced)
-//!
-//! For advanced use cases where you need direct access to the HashMap or want to
-//! iterate over all languages, you can use the `STOPWORDS` static directly:
-//!
-//! ```rust
-//! use kreuzberg::stopwords::STOPWORDS;
-//!
-//! // Direct access (case-sensitive, no normalization)
-//! let en_stopwords = STOPWORDS.get("en");
-//!
-//! // List all available languages
-//! for lang in STOPWORDS.keys() {
-//!     println!("Available language: {}", lang);
-//! }
-//! ```
-use ahash::{AHashMap, AHashSet};
-use once_cell::sync::Lazy;
-/// Macro to generate embedded stopwords for all languages.
-///
-/// This macro embeds the JSON files at compile time using `include_str!()` and
-/// generates code to parse and insert them into the stopwords map.
-macro_rules! embed_stopwords {
-    ($map:expr, $($lang:literal),* $(,)?) => {
-        $(
-            {
-                const JSON: &str = include_str!(concat!("../../stopwords/", $lang, "_stopwords.json"));
-                match serde_json::from_str::<Vec<String>>(JSON) {
-                    Ok(words) => {
-                        let set: AHashSet<String> = words.into_iter().collect();
-                        $map.insert($lang.to_string(), set);
-                    }
-                    Err(e) => {
-                        panic!(
-                            "Failed to parse embedded stopwords for language '{}': {}. \
-                            This indicates corrupted or malformed JSON in the embedded stopwords data. \
-                            Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
-                            $lang, e
-                        );
-                    }
-                }
-            }
-        )*
-    };
-}
-/// Global stopwords registry.
-///
-/// A lazy-initialized map of language codes to stopword sets.
-/// All stopword lists are embedded in the binary at compile time for
-/// zero-overhead access and no runtime I/O dependencies.
-///
-/// Supports 64 languages with comprehensive stopword coverage.
-///
-/// # Note
-///
-/// For most use cases, prefer [`get_stopwords()`] which provides language code
-/// normalization (case-insensitive, locale handling). Direct access to STOPWORDS
-/// is case-sensitive and requires exact language codes (lowercase, 2-letter ISO 639-1).
-///
-/// # Examples
-///
-/// ```rust
-/// use kreuzberg::stopwords::STOPWORDS;
-///
-/// // Direct access (case-sensitive, no normalization)
-/// let en_stopwords = STOPWORDS.get("en");
-/// assert!(en_stopwords.is_some());
-///
-/// // Case-sensitive - these return None
-/// assert!(STOPWORDS.get("EN").is_none());
-/// assert!(STOPWORDS.get("en-US").is_none());
-///
-/// // List all available languages
-/// assert_eq!(STOPWORDS.len(), 64);
-/// for lang in STOPWORDS.keys() {
-///     println!("Available: {}", lang);
-/// }
-/// ```
-pub static STOPWORDS: Lazy<AHashMap<String, AHashSet<String>>> = Lazy::new(|| {
-    let mut map = AHashMap::new();
-    embed_stopwords!(
-        map, "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi",
-        "fr", "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt",
-        "lv", "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw",
-        "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
-    );
-    apply_stopword_whitelist(&mut map);
-    map
-});
-fn apply_stopword_whitelist(map: &mut AHashMap<String, AHashSet<String>>) {
-    const STOPWORD_REMOVALS: &[(&str, &[&str])] = &[("en", &["hello", "test", "world", "working", "great"])];
-    for (lang, words) in STOPWORD_REMOVALS {
-        if let Some(set) = map.get_mut(*lang) {
-            for &word in *words {
-                set.remove(word);
-            }
-        }
-    }
-}
-/// Get stopwords for a language with normalization.
-///
-/// This function provides a user-friendly interface to the stopwords registry with:
-/// - **Case-insensitive lookup**: "EN", "en", "En" all work
-/// - **Locale normalization**: "en-US", "en_GB", "es-ES" extract to "en", "es"
-/// - **Consistent behavior**: Returns `None` for unsupported languages
-///
-/// # Language Code Format
-///
-/// Accepts multiple formats:
-/// - ISO 639-1 two-letter codes: `"en"`, `"es"`, `"de"`, etc.
-/// - Uppercase variants: `"EN"`, `"ES"`, `"DE"`
-/// - Locale codes with hyphen: `"en-US"`, `"es-ES"`, `"pt-BR"`
-/// - Locale codes with underscore: `"en_US"`, `"es_ES"`, `"pt_BR"`
-///
-/// All formats are lowercased and cut at the first `-` or `_`; codes without a separator are truncated to their first two bytes.
-///
-/// # Returns
-///
-/// - `Some(&AHashSet<String>)` if the language is supported (64 languages available)
-/// - `None` if the language is not supported
-///
-/// # Examples
-///
-/// ```rust
-/// use kreuzberg::stopwords::get_stopwords;
-///
-/// // Simple language codes
-/// if let Some(en) = get_stopwords("en") {
-///     assert!(en.contains("the"));
-/// }
-///
-/// // Case-insensitive
-/// assert!(get_stopwords("EN").is_some());
-/// assert!(get_stopwords("En").is_some());
-/// assert!(get_stopwords("eN").is_some());
-///
-/// // Locale codes normalized to language code
-/// if let (Some(en_us), Some(en_gb), Some(en_lowercase)) =
-///     (get_stopwords("en-US"), get_stopwords("en_GB"), get_stopwords("en"))
-/// {
-///     // All point to the same stopwords set
-///     assert_eq!(en_us.len(), en_gb.len());
-///     assert_eq!(en_us.len(), en_lowercase.len());
-/// }
-///
-/// // Spanish with various formats
-/// assert!(get_stopwords("es").is_some());
-/// assert!(get_stopwords("ES").is_some());
-/// assert!(get_stopwords("es-ES").is_some());
-/// assert!(get_stopwords("es_MX").is_some());
-///
-/// // Unsupported language returns None
-/// assert!(get_stopwords("xx").is_none());
-/// assert!(get_stopwords("zzzz").is_none());
-/// ```
-///
-/// # Performance
-///
-/// This function performs two operations:
-/// 1. String normalization (lowercase + truncate) - O(1) for typical language codes
-/// 2. HashMap lookup in STOPWORDS - O(1) average case
-///
-/// Total overhead is negligible (~10-50ns on modern CPUs).
-pub fn get_stopwords(lang: &str) -> Option<&'static AHashSet<String>> {
-    let normalized = lang.to_lowercase();
-    let lang_code = if let Some(pos) = normalized.find(&['-', '_'][..]) {
-        &normalized[..pos]
-    } else if normalized.len() >= 2 {
-        &normalized[..2]
-    } else {
-        &normalized
-    };
-    STOPWORDS.get(lang_code)
-}
-/// Look up stopwords for `language`, falling back to `fallback` when the
-/// primary language has no entry.
-///
-/// Typical uses:
-/// - the detected language is not supported,
-/// - English (or another language) should be the default for unknown input,
-/// - multilingual content needs graceful degradation.
-///
-/// Each code goes through [`get_stopwords()`], so both are matched
-/// case-insensitively ("EN", "en", "En") and locale forms such as "en-US" or
-/// "en_GB" resolve to "en".
-///
-/// # Arguments
-///
-/// * `language` - Language code tried first
-/// * `fallback` - Language code tried only if `language` yields nothing
-///
-/// # Returns
-///
-/// - `Some(..)` with the primary language's set when it is supported
-/// - `Some(..)` with the fallback language's set when only the fallback is supported
-/// - `None` when neither code is supported
-///
-/// # Examples
-///
-/// ```rust
-/// use kreuzberg::stopwords::get_stopwords_with_fallback;
-///
-/// // Esperanto is supported, so the fallback is never consulted
-/// if let Some(stopwords) = get_stopwords_with_fallback("eo", "en") {
-///     assert!(stopwords.contains("la"));
-/// }
-///
-/// // "xx" is unsupported, so the English set is returned
-/// if let Some(stopwords) = get_stopwords_with_fallback("xx", "en") {
-///     assert!(stopwords.contains("the"));
-/// }
-///
-/// // Locale codes and mixed case are accepted for both arguments
-/// assert!(get_stopwords_with_fallback("es-MX", "EN-US").is_some());
-///
-/// // Nothing to return when both codes are unsupported
-/// assert!(get_stopwords_with_fallback("xx", "zz").is_none());
-/// ```
-///
-/// # Common Patterns
-///
-/// ```rust
-/// use kreuzberg::stopwords::get_stopwords_with_fallback;
-///
-/// // Default to English for an unrecognized language
-/// let detected_lang = "xyz";
-/// let stopwords = get_stopwords_with_fallback(detected_lang, "en")
-///     .expect("English fallback should always be available");
-///
-/// // Process several languages, each with an English safety net
-/// for lang in ["de", "fr", "unknown", "es"] {
-///     if let Some(stopwords) = get_stopwords_with_fallback(lang, "en") {
-///         println!("Using stopwords for: {}", lang);
-///     }
-/// }
-/// ```
-///
-/// # Performance
-///
-/// At most two [`get_stopwords()`] calls are made: one for `language` and,
-/// only if that returns `None`, one for `fallback`.
-pub fn get_stopwords_with_fallback(language: &str, fallback: &str) -> Option<&'static AHashSet<String>> {
-    // Prefer the primary language; the fallback is only evaluated on a miss.
-    if let Some(primary) = get_stopwords(language) {
-        return Some(primary);
-    }
-    get_stopwords(fallback)
-}
-#[cfg(test)]
-mod tests {
-    use super::*;
-    #[test]
-    fn test_stopwords_lazy_initialization() {
-        let stopwords = &*STOPWORDS;
-        assert!(stopwords.contains_key("en"));
-        assert!(stopwords.contains_key("es"));
-        assert!(!stopwords.get("en").unwrap().is_empty());
-        assert!(!stopwords.get("es").unwrap().is_empty());
-    }
-    #[test]
-    fn test_english_stopwords() {
-        let en_stopwords = STOPWORDS.get("en").unwrap();
-        assert!(en_stopwords.contains("the"));
-        assert!(en_stopwords.contains("is"));
-        assert!(en_stopwords.contains("and"));
-        assert!(en_stopwords.contains("a"));
-        assert!(en_stopwords.contains("of"));
-        assert!(en_stopwords.len() >= 70);
-    }
-    #[test]
-    fn test_spanish_stopwords() {
-        let es_stopwords = STOPWORDS.get("es").unwrap();
-        assert!(es_stopwords.contains("el"));
-        assert!(es_stopwords.contains("la"));
-        assert!(es_stopwords.contains("es"));
-        assert!(es_stopwords.contains("en"));
-        assert!(es_stopwords.contains("de"));
-        assert!(es_stopwords.len() >= 200);
-    }
-    #[test]
-    fn test_all_64_languages_loaded() {
-        let expected_languages = [
-            "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr",
-            "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt", "lv",
-            "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta",
-            "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
-        ];
-        for lang in &expected_languages {
-            assert!(
-                STOPWORDS.contains_key(*lang),
-                "Missing stopwords for language: {}",
-                lang
-            );
-            assert!(
-                !STOPWORDS.get(*lang).unwrap().is_empty(),
-                "Empty stopwords for language: {}",
-                lang
-            );
-        }
-        assert_eq!(STOPWORDS.len(), 64, "Expected 64 languages, found {}", STOPWORDS.len());
-    }
-    #[test]
-    fn test_german_stopwords() {
-        let de_stopwords = STOPWORDS.get("de").unwrap();
-        assert!(de_stopwords.contains("der"));
-        assert!(de_stopwords.contains("die"));
-        assert!(de_stopwords.contains("und"));
-    }
-    #[test]
-    fn test_french_stopwords() {
-        let fr_stopwords = STOPWORDS.get("fr").unwrap();
-        assert!(fr_stopwords.contains("le"));
-        assert!(fr_stopwords.contains("de"));
-        assert!(fr_stopwords.contains("un"));
-    }
-    #[test]
-    fn test_chinese_stopwords() {
-        let zh_stopwords = STOPWORDS.get("zh").unwrap();
-        assert!(!zh_stopwords.is_empty());
-    }
-    #[test]
-    fn test_arabic_stopwords() {
-        let ar_stopwords = STOPWORDS.get("ar").unwrap();
-        assert!(!ar_stopwords.is_empty());
-    }
-    #[test]
-    fn test_unknown_language_returns_none() {
-        assert!(!STOPWORDS.contains_key("xx"));
-        assert!(STOPWORDS.get("unknown").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_lowercase() {
-        assert!(get_stopwords("en").is_some());
-        assert!(get_stopwords("es").is_some());
-        assert!(get_stopwords("de").is_some());
-        assert!(get_stopwords("fr").is_some());
-    }
-    #[test]
-    fn test_get_stopwords_uppercase() {
-        let en_upper = get_stopwords("EN");
-        let en_lower = get_stopwords("en");
-        assert!(en_upper.is_some());
-        assert!(en_lower.is_some());
-        assert_eq!(en_upper.unwrap().len(), en_lower.unwrap().len());
-    }
-    #[test]
-    fn test_get_stopwords_mixed_case() {
-        assert!(get_stopwords("En").is_some());
-        assert!(get_stopwords("eN").is_some());
-        assert!(get_stopwords("ES").is_some());
-        assert!(get_stopwords("Es").is_some());
-        assert!(get_stopwords("DE").is_some());
-        assert!(get_stopwords("De").is_some());
-    }
-    #[test]
-    fn test_get_stopwords_locale_hyphen() {
-        let en_us = get_stopwords("en-US");
-        let en_gb = get_stopwords("en-GB");
-        let en = get_stopwords("en");
-        assert!(en_us.is_some());
-        assert!(en_gb.is_some());
-        assert_eq!(en_us.unwrap().len(), en.unwrap().len());
-        assert_eq!(en_gb.unwrap().len(), en.unwrap().len());
-    }
-    #[test]
-    fn test_get_stopwords_locale_underscore() {
-        let es_es = get_stopwords("es_ES");
-        let es_mx = get_stopwords("es_MX");
-        let es = get_stopwords("es");
-        assert!(es_es.is_some());
-        assert!(es_mx.is_some());
-        assert_eq!(es_es.unwrap().len(), es.unwrap().len());
-        assert_eq!(es_mx.unwrap().len(), es.unwrap().len());
-    }
-    #[test]
-    fn test_get_stopwords_locale_uppercase() {
-        let en_us_upper = get_stopwords("EN-US");
-        let es_es_upper = get_stopwords("ES_ES");
-        let pt_br_mixed = get_stopwords("Pt-BR");
-        assert!(en_us_upper.is_some());
-        assert!(es_es_upper.is_some());
-        assert!(pt_br_mixed.is_some());
-        assert!(en_us_upper.unwrap().contains("the"));
-        assert!(es_es_upper.unwrap().contains("el"));
-        assert!(pt_br_mixed.unwrap().contains("o"));
-    }
-    #[test]
-    fn test_get_stopwords_all_supported_languages() {
-        let languages = [
-            "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr",
-            "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt", "lv",
-            "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta",
-            "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
-        ];
-        for lang in &languages {
-            assert!(
-                get_stopwords(lang).is_some(),
-                "Language {} should be available via get_stopwords",
-                lang
-            );
-        }
-    }
-    #[test]
-    fn test_get_stopwords_unsupported_language() {
-        assert!(get_stopwords("xx").is_none());
-        assert!(get_stopwords("zz").is_none());
-        assert!(get_stopwords("xyz").is_none());
-        assert!(get_stopwords("unknown").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_empty_string() {
-        assert!(get_stopwords("").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_single_char() {
-        assert!(get_stopwords("e").is_none());
-        assert!(get_stopwords("z").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_long_locale() {
-        let zh_cn_hans = get_stopwords("zh-CN-Hans");
-        let pt_br_utf8 = get_stopwords("pt_BR.UTF-8");
-        assert!(zh_cn_hans.is_some());
-        assert!(pt_br_utf8.is_some());
-        assert_eq!(zh_cn_hans.unwrap().len(), get_stopwords("zh").unwrap().len());
-        assert_eq!(pt_br_utf8.unwrap().len(), get_stopwords("pt").unwrap().len());
-    }
-    #[test]
-    fn test_get_stopwords_content_verification() {
-        let en = get_stopwords("en").expect("English stopwords should exist");
-        assert!(en.contains("the"));
-        assert!(en.contains("is"));
-        assert!(en.contains("and"));
-        let es = get_stopwords("es").expect("Spanish stopwords should exist");
-        assert!(es.contains("el"));
-        assert!(es.contains("la"));
-        assert!(es.contains("es"));
-        let de = get_stopwords("de").expect("German stopwords should exist");
-        assert!(de.contains("der"));
-        assert!(de.contains("die"));
-        assert!(de.contains("und"));
-        let fr = get_stopwords("fr").expect("French stopwords should exist");
-        assert!(fr.contains("le"));
-        assert!(fr.contains("de"));
-        assert!(fr.contains("un"));
-    }
-    #[test]
-    fn test_get_stopwords_vs_direct_access() {
-        let en_normalized = get_stopwords("en").unwrap();
-        let en_direct = STOPWORDS.get("en").unwrap();
-        assert_eq!(en_normalized.len(), en_direct.len());
-        for word in en_direct {
-            assert!(en_normalized.contains(word));
-        }
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_primary_available() {
-        let result = get_stopwords_with_fallback("en", "es");
-        assert!(result.is_some());
-        let stopwords = result.unwrap();
-        assert!(stopwords.contains("the"));
-        assert!(!stopwords.contains("el"));
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_use_fallback() {
-        let result = get_stopwords_with_fallback("xx", "en");
-        assert!(result.is_some());
-        let stopwords = result.unwrap();
-        assert!(stopwords.contains("the"));
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_both_unavailable() {
-        let result = get_stopwords_with_fallback("xx", "zz");
-        assert!(result.is_none());
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_case_insensitive() {
-        let result1 = get_stopwords_with_fallback("EN", "es");
-        let result2 = get_stopwords_with_fallback("xx", "ES");
-        assert!(result1.is_some());
-        assert!(result2.is_some());
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_locale_codes() {
-        let result = get_stopwords_with_fallback("es-MX", "en-US");
-        assert!(result.is_some());
-        let stopwords = result.unwrap();
-        assert!(stopwords.contains("el"));
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_esperanto_to_english() {
-        let result = get_stopwords_with_fallback("eo", "en");
-        assert!(result.is_some());
-        let stopwords = result.unwrap();
-        assert!(stopwords.contains("la"));
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_unknown_to_english() {
-        let result = get_stopwords_with_fallback("xyz", "en");
-        assert!(result.is_some());
-        let stopwords = result.unwrap();
-        assert!(stopwords.contains("the"));
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_same_as_chained_or_else() {
-        let manual = get_stopwords("xx").or_else(|| get_stopwords("en"));
-        let helper = get_stopwords_with_fallback("xx", "en");
-        assert_eq!(manual.is_some(), helper.is_some());
-        if let (Some(m), Some(h)) = (manual, helper) {
-            assert_eq!(m.len(), h.len());
-        }
-    }
-    #[test]
-    fn test_get_stopwords_invalid_language_codes() {
-        assert!(get_stopwords("invalid_lang").is_none());
-        assert!(get_stopwords("xyz").is_none());
-        assert!(get_stopwords("zzzz").is_none());
-        assert!(get_stopwords("abc123").is_none());
-        assert!(get_stopwords("!!!").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_edge_case_empty_and_whitespace() {
-        assert!(get_stopwords("").is_none());
-        assert!(get_stopwords(" ").is_none());
-        assert!(get_stopwords("  ").is_none());
-        assert!(get_stopwords("\t").is_none());
-        assert!(get_stopwords("\n").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_special_characters() {
-        assert!(get_stopwords("@#").is_none());
-        assert!(get_stopwords("$%").is_none());
-        assert!(get_stopwords("!!!").is_none());
-        let result = get_stopwords("en!");
-        assert!(result.is_some());
-        if let Some(stopwords) = result {
-            assert!(stopwords.contains("the"));
-        }
-        let result = get_stopwords("es@");
-        assert!(result.is_some());
-        if let Some(stopwords) = result {
-            assert!(stopwords.contains("el"));
-        }
-        let result = get_stopwords("de#fr");
-        assert!(result.is_some());
-        if let Some(stopwords) = result {
-            assert!(stopwords.contains("der"));
-        }
-    }
-    #[test]
-    fn test_get_stopwords_numeric_codes() {
-        assert!(get_stopwords("12").is_none());
-        assert!(get_stopwords("99").is_none());
-        assert!(get_stopwords("123").is_none());
-        assert!(get_stopwords("0").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_single_character_edge_cases() {
-        assert!(get_stopwords("a").is_none());
-        assert!(get_stopwords("e").is_none());
-        assert!(get_stopwords("z").is_none());
-        assert!(get_stopwords("1").is_none());
-        assert!(get_stopwords("_").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_invalid_locale_formats() {
-        assert!(get_stopwords("xx-YY").is_none());
-        assert!(get_stopwords("zz_ZZ").is_none());
-        assert!(get_stopwords("invalid-US").is_none());
-        assert!(get_stopwords("aa_BB_CC").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_mixed_valid_invalid() {
-        let result = get_stopwords("en123");
-        assert!(result.is_some(), "Should extract 'en' from 'en123'");
-        assert!(get_stopwords("12en").is_none());
-        assert!(get_stopwords("@@en").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_case_sensitivity_validation() {
-        let lower = get_stopwords("en");
-        let upper = get_stopwords("EN");
-        let mixed1 = get_stopwords("En");
-        let mixed2 = get_stopwords("eN");
-        assert!(lower.is_some());
-        assert!(upper.is_some());
-        assert!(mixed1.is_some());
-        assert!(mixed2.is_some());
-        if let (Some(l), Some(u), Some(m1), Some(m2)) = (lower, upper, mixed1, mixed2) {
-            assert_eq!(l.len(), u.len());
-            assert_eq!(l.len(), m1.len());
-            assert_eq!(l.len(), m2.len());
-        }
-    }
-    #[test]
-    fn test_get_stopwords_none_return_safety() {
-        let result = get_stopwords("invalid").and_then(|_| get_stopwords("also_invalid"));
-        assert!(result.is_none());
-        let chained = get_stopwords("xxx")
-            .or_else(|| get_stopwords("yyy"))
-            .or_else(|| get_stopwords("zzz"));
-        assert!(chained.is_none());
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_both_invalid() {
-        assert!(get_stopwords_with_fallback("invalid", "also_invalid").is_none());
-        assert!(get_stopwords_with_fallback("xxx", "yyy").is_none());
-        assert!(get_stopwords_with_fallback("", "").is_none());
-        assert!(get_stopwords_with_fallback("123", "456").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_invalid_primary_valid_fallback() {
-        let result = get_stopwords_with_fallback("invalid_lang", "en");
-        assert!(result.is_some());
-        if let Some(stopwords) = result {
-            assert!(stopwords.contains("the"));
-        }
-        let result2 = get_stopwords_with_fallback("xyz", "es");
-        assert!(result2.is_some());
-        if let Some(stopwords) = result2 {
-            assert!(stopwords.contains("el"));
-        }
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_valid_primary_invalid_fallback() {
-        let result = get_stopwords_with_fallback("en", "invalid_fallback");
-        assert!(result.is_some());
-        if let Some(stopwords) = result {
-            assert!(stopwords.contains("the"));
-        }
-        let result2 = get_stopwords_with_fallback("es", "zzz");
-        assert!(result2.is_some());
-        if let Some(stopwords) = result2 {
-            assert!(stopwords.contains("el"));
-        }
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_empty_strings() {
-        assert!(get_stopwords_with_fallback("", "en").is_some());
-        assert!(get_stopwords_with_fallback("en", "").is_some());
-        assert!(get_stopwords_with_fallback("", "").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_special_characters() {
-        assert!(get_stopwords_with_fallback("@#$", "en").is_some());
-        assert!(get_stopwords_with_fallback("en", "!!!").is_some());
-        assert!(get_stopwords_with_fallback("@#$", "!!!").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_with_fallback_case_insensitive_validation() {
-        let result1 = get_stopwords_with_fallback("INVALID", "en");
-        let result2 = get_stopwords_with_fallback("invalid", "EN");
-        let result3 = get_stopwords_with_fallback("INVALID", "EN");
-        assert!(result1.is_some());
-        assert!(result2.is_some());
-        assert!(result3.is_some());
-        if let (Some(r1), Some(r2), Some(r3)) = (result1, result2, result3) {
-            assert!(r1.contains("the"));
-            assert!(r2.contains("the"));
-            assert!(r3.contains("the"));
-        }
-    }
-    #[test]
-    fn test_direct_stopwords_access_invalid_keys() {
-        assert!(STOPWORDS.get("invalid").is_none());
-        assert!(STOPWORDS.get("EN").is_none());
-        assert!(STOPWORDS.get("en-US").is_none());
-        assert!(STOPWORDS.get("xyz").is_none());
-        assert!(STOPWORDS.get("").is_none());
-    }
-    #[test]
-    fn test_stopwords_case_sensitivity_direct_vs_normalized() {
-        assert!(STOPWORDS.get("EN").is_none());
-        assert!(get_stopwords("EN").is_some());
-        assert!(STOPWORDS.get("Es").is_none());
-        assert!(get_stopwords("Es").is_some());
-        assert!(STOPWORDS.get("DE").is_none());
-        assert!(get_stopwords("DE").is_some());
-    }
-    #[test]
-    fn test_get_stopwords_unicode_characters() {
-        // NOTE: Current implementation has a limitation - it uses byte slicing which can panic
-        let result = get_stopwords("zh-中文");
-        assert!(result.is_some());
-        let result = get_stopwords("ar-العربية");
-        assert!(result.is_some());
-        let result = get_stopwords("ja_日本");
-        assert!(result.is_some());
-        assert!(get_stopwords("xx").is_none());
-        assert!(get_stopwords("yy").is_none());
-        // NOTE: The following would panic due to byte slicing on multi-byte chars:
-    }
-    #[test]
-    fn test_get_stopwords_very_long_strings() {
-        let long_string = "x".repeat(1000);
-        assert!(get_stopwords(&long_string).is_none());
-        let long_locale = "en-".to_string() + &"X".repeat(100);
-        let result = get_stopwords(&long_locale);
-        assert!(result.is_some());
-    }
-    #[test]
-    fn test_get_stopwords_null_bytes() {
-        assert!(get_stopwords("\0").is_none());
-        assert!(get_stopwords("en\0").is_some());
-        assert!(get_stopwords("\0en").is_none());
-    }
-    #[test]
-    fn test_get_stopwords_boundary_conditions() {
-        assert!(get_stopwords("e").is_none());
-        assert!(get_stopwords("en").is_some());
-        assert!(get_stopwords("eng").is_some());
-        let result = get_stopwords("en-");
-        assert!(result.is_some());
-    }
-    #[test]
-    fn test_get_stopwords_multiple_separators() {
-        assert!(get_stopwords("en-US-utf8").is_some());
-        assert!(get_stopwords("es_MX_special").is_some());
-        assert!(get_stopwords("pt-BR_variant").is_some());
-    }
-    #[test]
-    fn test_romance_languages() {
-        let fr = get_stopwords("fr").expect("French stopwords should exist");
-        assert!(fr.contains("le"), "French should contain 'le'");
-        assert!(fr.contains("et"), "French should contain 'et'");
-        assert!(fr.len() >= 150, "French should have substantial stopwords");
-        let es = get_stopwords("es").expect("Spanish stopwords should exist");
-        assert!(es.contains("el"), "Spanish should contain 'el'");
-        assert!(es.contains("y"), "Spanish should contain 'y'");
-        assert!(es.len() >= 200, "Spanish should have substantial stopwords");
-        let pt = get_stopwords("pt").expect("Portuguese stopwords should exist");
-        assert!(pt.contains("o"), "Portuguese should contain 'o'");
-        assert!(pt.contains("e"), "Portuguese should contain 'e'");
-        assert!(pt.len() >= 150, "Portuguese should have substantial stopwords");
-        let it = get_stopwords("it").expect("Italian stopwords should exist");
-        assert!(it.contains("il"), "Italian should contain 'il'");
-        assert!(it.contains("e"), "Italian should contain 'e'");
-        assert!(it.len() >= 150, "Italian should have substantial stopwords");
-        let ro = get_stopwords("ro").expect("Romanian stopwords should exist");
-        assert!(!ro.is_empty(), "Romanian should have stopwords");
-        assert!(ro.len() >= 100, "Romanian should have substantial stopwords");
-    }
-    #[test]
-    fn test_germanic_languages() {
-        let de = get_stopwords("de").expect("German stopwords should exist");
-        assert!(de.contains("der"), "German should contain 'der'");
-        assert!(de.contains("die"), "German should contain 'die'");
-        assert!(de.contains("und"), "German should contain 'und'");
-        assert!(de.len() >= 200, "German should have substantial stopwords");
-        let en = get_stopwords("en").expect("English stopwords should exist");
-        assert!(en.contains("the"), "English should contain 'the'");
-        assert!(en.contains("and"), "English should contain 'and'");
-        assert!(en.len() >= 70, "English should have substantial stopwords");
-        let nl = get_stopwords("nl").expect("Dutch stopwords should exist");
-        assert!(nl.contains("de"), "Dutch should contain 'de'");
-        assert!(nl.contains("het"), "Dutch should contain 'het'");
-        assert!(nl.len() >= 100, "Dutch should have substantial stopwords");
-        let sv = get_stopwords("sv").expect("Swedish stopwords should exist");
-        assert!(!sv.is_empty(), "Swedish should have stopwords");
-        assert!(sv.len() >= 100, "Swedish should have substantial stopwords");
-        let no = get_stopwords("no").expect("Norwegian stopwords should exist");
-        assert!(!no.is_empty(), "Norwegian should have stopwords");
-        let da = get_stopwords("da").expect("Danish stopwords should exist");
-        assert!(!da.is_empty(), "Danish should have stopwords");
-    }
-    #[test]
-    fn test_slavic_languages() {
-        let ru = get_stopwords("ru").expect("Russian stopwords should exist");
-        assert!(!ru.is_empty(), "Russian should have stopwords");
-        assert!(ru.len() >= 100, "Russian should have substantial stopwords");
-        let pl = get_stopwords("pl").expect("Polish stopwords should exist");
-        assert!(!pl.is_empty(), "Polish should have stopwords");
-        assert!(pl.len() >= 100, "Polish should have substantial stopwords");
-        let cs = get_stopwords("cs").expect("Czech stopwords should exist");
-        assert!(!cs.is_empty(), "Czech should have stopwords");
-        let sk = get_stopwords("sk").expect("Slovak stopwords should exist");
-        assert!(!sk.is_empty(), "Slovak should have stopwords");
-        let bg = get_stopwords("bg").expect("Bulgarian stopwords should exist");
-        assert!(!bg.is_empty(), "Bulgarian should have stopwords");
-        let uk = get_stopwords("uk").expect("Ukrainian stopwords should exist");
-        assert!(!uk.is_empty(), "Ukrainian should have stopwords");
-        let hr = get_stopwords("hr").expect("Croatian stopwords should exist");
-        assert!(!hr.is_empty(), "Croatian should have stopwords");
-        let sl = get_stopwords("sl").expect("Slovenian stopwords should exist");
-        assert!(!sl.is_empty(), "Slovenian should have stopwords");
-    }
-    #[test]
-    fn test_asian_languages() {
-        let zh = get_stopwords("zh").expect("Chinese stopwords should exist");
-        assert!(!zh.is_empty(), "Chinese should have stopwords");
-        assert!(zh.len() >= 50, "Chinese should have substantial stopwords");
-        let ja = get_stopwords("ja").expect("Japanese stopwords should exist");
-        assert!(!ja.is_empty(), "Japanese should have stopwords");
-        assert!(ja.len() >= 50, "Japanese should have substantial stopwords");
-        let ko = get_stopwords("ko").expect("Korean stopwords should exist");
-        assert!(!ko.is_empty(), "Korean should have stopwords");
-        let hi = get_stopwords("hi").expect("Hindi stopwords should exist");
-        assert!(!hi.is_empty(), "Hindi should have stopwords");
-        assert!(hi.len() >= 100, "Hindi should have substantial stopwords");
-        let bn = get_stopwords("bn").expect("Bengali stopwords should exist");
-        assert!(!bn.is_empty(), "Bengali should have stopwords");
-        let th = get_stopwords("th").expect("Thai stopwords should exist");
-        assert!(!th.is_empty(), "Thai should have stopwords");
-        let vi = get_stopwords("vi").expect("Vietnamese stopwords should exist");
-        assert!(!vi.is_empty(), "Vietnamese should have stopwords");
-    }
-    #[test]
-    fn test_african_languages() {
-        let af = get_stopwords("af").expect("Afrikaans stopwords should exist");
-        assert!(!af.is_empty(), "Afrikaans should have stopwords");
-        let sw = get_stopwords("sw").expect("Swahili stopwords should exist");
-        assert!(!sw.is_empty(), "Swahili should have stopwords");
-        let yo = get_stopwords("yo").expect("Yoruba stopwords should exist");
-        assert!(!yo.is_empty(), "Yoruba should have stopwords");
-        let zu = get_stopwords("zu").expect("Zulu stopwords should exist");
-        assert!(!zu.is_empty(), "Zulu should have stopwords");
-        let ha = get_stopwords("ha").expect("Hausa stopwords should exist");
-        assert!(!ha.is_empty(), "Hausa should have stopwords");
-        let so = get_stopwords("so").expect("Somali stopwords should exist");
-        assert!(!so.is_empty(), "Somali should have stopwords");
-        let st = get_stopwords("st").expect("Sesotho stopwords should exist");
-        assert!(!st.is_empty(), "Sesotho should have stopwords");
-    }
-    /// Each Indic language code must resolve to a non-empty stopword set.
-    #[test]
-    fn test_indic_languages() {
-        // (language code, display name used in the failure messages)
-        let indic = [
-            ("hi", "Hindi"),
-            ("bn", "Bengali"),
-            ("gu", "Gujarati"),
-            ("kn", "Kannada"),
-            ("ml", "Malayalam"),
-            ("mr", "Marathi"),
-            ("ta", "Tamil"),
-            ("te", "Telugu"),
-            ("ur", "Urdu"),
-            ("ne", "Nepali"),
-            ("si", "Sinhala"),
-        ];
-        for (code, name) in indic.iter().copied() {
-            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
-            assert!(!words.is_empty(), "{} should have stopwords", name);
-        }
-    }
-    /// Middle Eastern language codes must resolve to non-empty sets; Arabic must hold at least 100 words.
-    #[test]
-    fn test_middle_eastern_languages() {
-        // (language code, message when the language is missing, message when its set is empty)
-        let cases = [
-            ("ar", "Arabic stopwords should exist", "Arabic should have stopwords"),
-            ("fa", "Persian stopwords should exist", "Persian should have stopwords"),
-            ("he", "Hebrew stopwords should exist", "Hebrew should have stopwords"),
-            ("tr", "Turkish stopwords should exist", "Turkish should have stopwords"),
-            ("ku", "Kurdish stopwords should exist", "Kurdish stopwords should exist"),
-        ];
-        for (code, missing_msg, empty_msg) in cases.iter().copied() {
-            let words = get_stopwords(code).expect(missing_msg);
-            assert!(!words.is_empty(), "{}", empty_msg);
-            if code == "ar" {
-                assert!(words.len() >= 100, "Arabic should have substantial stopwords");
-            }
-        }
-    }
-    /// Remaining languages must resolve to non-empty sets; Esperanto must also contain "la".
-    #[test]
-    fn test_other_languages() {
-        // (language code, display name used in the failure messages)
-        let others = [
-            ("hy", "Armenian"),
-            ("eu", "Basque"),
-            ("br", "Breton"),
-            ("ca", "Catalan"),
-            ("eo", "Esperanto"),
-            ("et", "Estonian"),
-            ("fi", "Finnish"),
-            ("gl", "Galician"),
-            ("hu", "Hungarian"),
-            ("id", "Indonesian"),
-            ("ga", "Irish"),
-            ("la", "Latin"),
-            ("lt", "Lithuanian"),
-            ("lv", "Latvian"),
-            ("ms", "Malay"),
-            ("tl", "Tagalog"),
-        ];
-        for (code, name) in others.iter().copied() {
-            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
-            if code == "eo" {
-                assert!(words.contains("la"), "Esperanto should contain 'la'");
-            }
-            assert!(!words.is_empty(), "{} should have stopwords", name);
-        }
-    }
-    /// Three-letter codes are looked up by their first two letters, which may or may not be valid.
-    #[test]
-    fn test_language_code_variants() {
-        let (eng, en) = (get_stopwords("eng"), get_stopwords("en"));
-        assert!(eng.is_some(), "'eng' should extract to 'en'");
-        assert!(en.is_some());
-        assert_eq!(eng.unwrap().len(), en.unwrap().len());
-        assert!(
-            get_stopwords("spa").is_none(),
-            "'spa' extracts to 'sp' which is invalid"
-        );
-        // (three-letter code, two-letter code it is expected to resolve to)
-        for (long, short) in [("deu", "de"), ("fra", "fr"), ("zho", "zh")].iter().copied() {
-            let from_long = get_stopwords(long);
-            let from_short = get_stopwords(short);
-            assert!(from_long.is_some(), "'{}' should extract to '{}'", long, short);
-            assert_eq!(from_long.unwrap().len(), from_short.unwrap().len());
-        }
-    }
-    /// Sanity-checks the size of every stopword set and the total number of languages.
-    #[test]
-    fn test_stopword_set_sizes() {
-        let mut language_count = 0usize;
-        for (lang, stopwords) in STOPWORDS.iter() {
-            let size = stopwords.len();
-            language_count += 1;
-            assert!(!stopwords.is_empty(), "Language {} has empty stopwords", lang);
-            assert!(size >= 5, "Language {} has suspiciously few stopwords: {}", lang, size);
-            assert!(size <= 1500, "Language {} has suspiciously many stopwords: {}", lang, size);
-        }
-        assert_eq!(language_count, 64, "Should have exactly 64 languages");
-        // (language code, display name, inclusive bounds on the set size)
-        let bounded = [("en", "English", 70usize..=1500), ("es", "Spanish", 200usize..=1000)];
-        for (code, name, bounds) in bounded.iter() {
-            let size = STOPWORDS.get(*code).unwrap().len();
-            assert!(
-                bounds.contains(&size),
-                "{} stopwords size {} outside expected range",
-                name,
-                size
-            );
-        }
-    }
-    /// Very common function words of English, Spanish, German and French must be present.
-    #[test]
-    fn test_stopword_content_quality() {
-        // (language code, display name, words the set must contain)
-        let expectations: [(&str, &str, &[&str]); 4] = [
-            (
-                "en",
-                "English",
-                &[
-                    "the", "is", "are", "was", "were", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
-                    "of", "with",
-                ],
-            ),
-            (
-                "es",
-                "Spanish",
-                &["el", "la", "los", "las", "un", "una", "de", "en", "y", "o", "por", "para"],
-            ),
-            (
-                "de",
-                "German",
-                &["der", "die", "das", "den", "dem", "des", "und", "oder", "in", "auf", "mit", "von"],
-            ),
-            (
-                "fr",
-                "French",
-                &["le", "la", "les", "un", "une", "de", "en", "et", "ou", "pour", "avec", "dans"],
-            ),
-        ];
-        for (code, name, required) in expectations.iter().copied() {
-            let set = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords", name));
-            for word in required.iter().copied() {
-                assert!(set.contains(word), "{} missing common stopword: {}", name, word);
-            }
-        }
-    }
-    /// Re-collecting a language's words into a fresh set must not change their count.
-    #[test]
-    fn test_stopword_deduplication() {
-        STOPWORDS.iter().for_each(|(lang, words)| {
-            let distinct: AHashSet<&String> = words.iter().collect();
-            assert_eq!(words.len(), distinct.len(), "Language {} has duplicate stopwords", lang);
-        });
-    }
-    /// Every casing of a two-letter code must resolve, and to a set as large as the lowercase one.
-    #[test]
-    fn test_case_normalization_comprehensive() {
-        // Each row: lowercase, uppercase, title case, mixed case.
-        let rows = [
-            ["en", "EN", "En", "eN"],
-            ["es", "ES", "Es", "eS"],
-            ["de", "DE", "De", "dE"],
-            ["fr", "FR", "Fr", "fR"],
-            ["zh", "ZH", "Zh", "zH"],
-            ["ar", "AR", "Ar", "aR"],
-        ];
-        for row in rows.iter() {
-            let lookups: Vec<_> = row.iter().map(|code| get_stopwords(code)).collect();
-            for (code, found) in row.iter().zip(lookups.iter()) {
-                assert!(found.is_some(), "{} should be valid", code);
-            }
-            // The lowercase spelling (first column) is the reference size.
-            let expected = lookups[0].unwrap().len();
-            for found in lookups.iter().skip(1) {
-                assert_eq!(found.unwrap().len(), expected);
-            }
-        }
-    }
-    /// Locale codes with `-` or `_` separators must resolve to a set as large as their base language's.
-    #[test]
-    fn test_locale_code_normalization_comprehensive() {
-        // (base language, [hyphen, underscore, hyphen, underscore] locale variants)
-        let rows = [
-            ("en", ["en-US", "en_US", "en-GB", "en_GB"]),
-            ("es", ["es-ES", "es_ES", "es-MX", "es_MX"]),
-            ("pt", ["pt-PT", "pt_PT", "pt-BR", "pt_BR"]),
-            ("zh", ["zh-CN", "zh_CN", "zh-TW", "zh_TW"]),
-            ("fr", ["fr-FR", "fr_FR", "fr-CA", "fr_CA"]),
-        ];
-        for (base, variants) in rows.iter() {
-            let expected = match get_stopwords(base) {
-                Some(set) => set.len(),
-                None => panic!("{} should be valid", base),
-            };
-            let lookups: Vec<_> = variants.iter().map(|code| get_stopwords(code)).collect();
-            for (code, found) in variants.iter().zip(lookups.iter()) {
-                assert!(found.is_some(), "{} should be valid", code);
-            }
-            for (code, found) in variants.iter().zip(lookups.iter()) {
-                assert_eq!(found.unwrap().len(), expected, "{} should match {}", code, base);
-            }
-        }
-    }
-    /// Exercises primary/fallback combinations of `get_stopwords_with_fallback`.
-    #[test]
-    fn test_fallback_chains() {
-        // (primary, fallback, whether a set is expected, language whose set should come back)
-        let scenarios = [
-            ("en", "es", true, "en"),
-            ("xx", "en", true, "en"),
-            ("xx", "yy", false, ""),
-            ("es", "xx", true, "es"),
-        ];
-        for (primary, fallback, should_succeed, expected_lang) in scenarios.iter().copied() {
-            let outcome = get_stopwords_with_fallback(primary, fallback);
-            let verb = if should_succeed { "succeed" } else { "fail" };
-            assert_eq!(
-                outcome.is_some(),
-                should_succeed,
-                "Fallback({}, {}) should {}",
-                primary,
-                fallback,
-                verb
-            );
-            if !should_succeed {
-                continue;
-            }
-            let expected_len = get_stopwords(expected_lang).unwrap().len();
-            assert_eq!(
-                outcome.unwrap().len(),
-                expected_len,
-                "Fallback should return {} stopwords",
-                expected_lang
-            );
-        }
-    }
-    /// Every stored stopword must be non-empty and at most 100 bytes long.
-    #[test]
-    fn test_stopword_string_types() {
-        let all_words = STOPWORDS
-            .iter()
-            .flat_map(|(lang, set)| set.iter().map(move |word| (lang, word)));
-        for (lang, word) in all_words {
-            assert!(!word.is_empty(), "Language {} has empty stopword", lang);
-            let bytes = word.len();
-            assert!(
-                bytes <= 100,
-                "Language {} has suspiciously long stopword: {} ({} bytes)",
-                lang,
-                word,
-                bytes
-            );
-            assert!(
-                word.chars().next().is_some(),
-                "Language {} has invalid UTF-8 stopword",
-                lang
-            );
-        }
-    }
-    /// Looks up several languages from separate threads; each lookup must return a non-empty set.
-    #[test]
-    fn test_concurrent_access() {
-        // All threads are spawned before any of them is joined.
-        let workers: Vec<_> = vec!["en", "es", "de", "fr", "zh", "ar", "ru", "ja"]
-            .into_iter()
-            .map(|lang| {
-                std::thread::spawn(move || match get_stopwords(lang) {
-                    Some(set) => set.len(),
-                    None => panic!("Language {} should be available", lang),
-                })
-            })
-            .collect();
-        for worker in workers {
-            let size = worker.join().expect("Thread should not panic");
-            assert!(size > 0, "Stopwords should not be empty");
-        }
-    }
-    /// Two lookups of the same language must expose the same words.
-    #[test]
-    fn test_stopwords_immutability() {
-        let first = get_stopwords("en").unwrap();
-        let second = get_stopwords("en").unwrap();
-        assert_eq!(first.len(), second.len());
-        first.iter().for_each(|word| {
-            assert!(
-                second.contains(word),
-                "Stopword '{}' should exist in both references",
-                word
-            )
-        });
-    }
-    /// Checks codes whose `-`/`_` separator sits in an unusual position.
-    #[test]
-    fn test_edge_case_separator_positions() {
-        // (code, whether the lookup is expected to land on the English set)
-        let cases = [
-            ("en-", true),
-            ("-en", false),
-            ("e-n", false),
-            ("en--US", true),
-            ("en_-US", true),
-            ("_en", false),
-            ("en_", true),
-        ];
-        for (code, should_find_en) in cases.iter().copied() {
-            let outcome = get_stopwords(code);
-            if !should_find_en {
-                // Nothing is asserted for these codes; the lookup only has to return.
-                continue;
-            }
-            match outcome {
-                Some(stopwords) => assert!(
-                    stopwords.contains("the"),
-                    "Code '{}' should return English stopwords",
-                    code
-                ),
-                None => panic!("Code '{}' should extract 'en'", code),
-            }
-        }
-    }
-    /// Guards against gross slowdowns: 30,000 lookups must finish in under 500 ms.
-    #[test]
-    fn test_performance_characteristics() {
-        use std::time::{Duration, Instant};
-        // Warm-up lookup before the clock starts, so first-use cost is not timed.
-        let _ = get_stopwords("en");
-        let started = Instant::now();
-        for _ in 0..10000 {
-            for code in ["en", "es", "de"].iter() {
-                let _ = get_stopwords(code);
-            }
-        }
-        let elapsed = started.elapsed();
-        assert!(
-            elapsed < Duration::from_millis(500),
-            "30,000 lookups took too long: {:?}",
-            elapsed
-        );
-    }
-    /// Every documented language must be a registry key and reachable through `get_stopwords`.
-    #[test]
-    fn test_language_completeness() {
-        let documented = [
-            "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa",
-            "fi", "fr", "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko",
-            "ku", "la", "lt", "lv", "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk",
-            "sl", "so", "st", "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
-        ];
-        assert_eq!(documented.len(), 64, "Documentation lists 64 languages");
-        for lang in documented.iter().copied() {
-            if !STOPWORDS.contains_key(lang) {
-                panic!("Documented language '{}' is missing from STOPWORDS", lang);
-            }
-            if get_stopwords(lang).is_none() {
-                panic!("Documented language '{}' not accessible via get_stopwords", lang);
-            }
-        }
-    }
-}
+//! Stopwords management for text processing.
+//!
+//! Provides language-specific stopword collections used by keyword extraction
+//! and token reduction features. Stopwords are common words (the, is, and, etc.)
+//! that should be filtered out from text analysis.
+//!
+//! # Supported Languages
+//!
+//! Supports 64 languages with embedded stopword lists:
+//! - Afrikaans (af), Arabic (ar), Bulgarian (bg), Bengali (bn), Breton (br)
+//! - Catalan (ca), Czech (cs), Danish (da), German (de), Greek (el)
+//! - English (en), Esperanto (eo), Spanish (es), Estonian (et), Basque (eu)
+//! - Persian (fa), Finnish (fi), French (fr), Irish (ga), Galician (gl)
+//! - Gujarati (gu), Hausa (ha), Hebrew (he), Hindi (hi), Croatian (hr)
+//! - Hungarian (hu), Armenian (hy), Indonesian (id), Italian (it), Japanese (ja)
+//! - Kannada (kn), Korean (ko), Kurdish (ku), Latin (la), Lithuanian (lt)
+//! - Latvian (lv), Malayalam (ml), Marathi (mr), Malay (ms), Nepali (ne)
+//! - Dutch (nl), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro)
+//! - Russian (ru), Sinhala (si), Slovak (sk), Slovenian (sl), Somali (so)
+//! - Sesotho (st), Swedish (sv), Swahili (sw), Tamil (ta), Telugu (te)
+//! - Thai (th), Tagalog (tl), Turkish (tr), Ukrainian (uk), Urdu (ur)
+//! - Vietnamese (vi), Yoruba (yo), Chinese (zh), Zulu (zu)
+//!
+//! All stopword lists are embedded in the binary at compile time for zero-overhead access.
+//!
+//! # Usage
+//!
+//! ```rust
+//! use kreuzberg::stopwords::{get_stopwords, get_stopwords_with_fallback};
+//!
+//! // Get English stopwords with normalization
+//! if let Some(en_stopwords) = get_stopwords("en") {
+//!     assert!(en_stopwords.contains("the"));
+//!
+//!     // Check if a word is a stopword
+//!     if en_stopwords.contains("the") {
+//!         println!("'the' is a stopword");
+//!     }
+//! }
+//!
+//! // Case-insensitive - all of these work
+//! assert!(get_stopwords("EN").is_some());
+//! assert!(get_stopwords("En").is_some());
+//!
+//! // Locale codes are reduced to the language part before the `-` or `_` separator
+//! if let Some(en_us) = get_stopwords("en-US") {
+//!     if let Some(en_gb) = get_stopwords("en_GB") {
+//!         // Both point to "en" stopwords
+//!         assert_eq!(en_us.len(), en_gb.len());
+//!     }
+//! }
+//!
+//! // Spanish with locale
+//! if let Some(es_stopwords) = get_stopwords("es-ES") {
+//!     assert!(es_stopwords.contains("el"));
+//! }
+//!
+//! // Fallback for unsupported languages
+//! if let Some(stopwords) = get_stopwords_with_fallback("unknown", "en") {
+//!     // Will use English stopwords since "unknown" isn't supported
+//!     assert!(stopwords.contains("the"));
+//! }
+//! ```
+//!
+//! # Direct Access (Advanced)
+//!
+//! For advanced use cases where you need direct access to the HashMap or want to
+//! iterate over all languages, you can use the `STOPWORDS` static directly:
+//!
+//! ```rust
+//! use kreuzberg::stopwords::STOPWORDS;
+//!
+//! // Direct access (case-sensitive, no normalization)
+//! let en_stopwords = STOPWORDS.get("en");
+//!
+//! // List all available languages
+//! for lang in STOPWORDS.keys() {
+//!     println!("Available language: {}", lang);
+//! }
+//! ```
+use ahash::{AHashMap, AHashSet};
+use once_cell::sync::Lazy;
+/// Embeds one stopword JSON file per language literal and registers it in `$map`.
+///
+/// For each `$lang`, the file `../../stopwords/<lang>_stopwords.json` is compiled into the
+/// binary with `include_str!()`, deserialized as a list of strings, collected into a set and
+/// inserted into `$map` under the language code. A file that fails to deserialize aborts with
+/// a panic that names the language.
+macro_rules! embed_stopwords {
+    ($map:expr, $($lang:literal),* $(,)?) => {
+        $(
+            {
+                const JSON: &str = include_str!(concat!("../../stopwords/", $lang, "_stopwords.json"));
+                let words = serde_json::from_str::<Vec<String>>(JSON).unwrap_or_else(|e| {
+                    panic!(
+                        "Failed to parse embedded stopwords for language '{}': {}. \
+                        This indicates corrupted or malformed JSON in the embedded stopwords data. \
+                        Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
+                        $lang, e
+                    )
+                });
+                $map.insert($lang.to_string(), words.into_iter().collect::<AHashSet<String>>());
+            }
+        )*
+    };
+}
+/// Registry of stopword sets, keyed by lowercase two-letter language code.
+///
+/// The map is built lazily on first access: the embedded JSON list of each of the 64
+/// languages is parsed into a set, and afterwards `apply_stopword_whitelist` removes a few
+/// words from the sets. The word lists are compiled into the binary, so building the map
+/// performs no file or network I/O.
+///
+/// # Note
+///
+/// Keys are matched exactly. Prefer [`get_stopwords()`] for everyday use, since it accepts
+/// uppercase codes and locale codes such as `en-US`; looking those up here returns `None`.
+///
+/// # Examples
+///
+/// ```rust
+/// use kreuzberg::stopwords::STOPWORDS;
+///
+/// // Exact, lowercase key
+/// assert!(STOPWORDS.get("en").is_some());
+///
+/// // No normalization is applied on direct access
+/// assert!(STOPWORDS.get("EN").is_none());
+/// assert!(STOPWORDS.get("en-US").is_none());
+///
+/// // Enumerate the available languages
+/// assert_eq!(STOPWORDS.len(), 64);
+/// for lang in STOPWORDS.keys() {
+///     println!("Available: {}", lang);
+/// }
+/// ```
+pub static STOPWORDS: Lazy<AHashMap<String, AHashSet<String>>> = Lazy::new(|| {
+    let mut registry = AHashMap::new();
+    embed_stopwords!(
+        registry,
+        "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa",
+        "fi", "fr", "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko",
+        "ku", "la", "lt", "lv", "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk",
+        "sl", "so", "st", "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
+    );
+    apply_stopword_whitelist(&mut registry);
+    registry
+});
+/// Removes selected words from the stopword sets in `map`.
+///
+/// The removal table pairs a language code with the words to delete from that language's
+/// set, so those words are no longer reported as stopwords. Languages that are not in `map`
+/// and words that are not in the set are skipped silently.
+fn apply_stopword_whitelist(map: &mut AHashMap<String, AHashSet<String>>) {
+    const STOPWORD_REMOVALS: &[(&str, &[&str])] = &[("en", &["hello", "test", "world", "working", "great"])];
+    for &(code, exempt) in STOPWORD_REMOVALS {
+        let stopwords = match map.get_mut(code) {
+            Some(set) => set,
+            None => continue,
+        };
+        for word in exempt.iter().copied() {
+            stopwords.remove(word);
+        }
+    }
+}
+/// Get stopwords for a language, normalizing the language code first.
+///
+/// The lookup is forgiving about how the language is spelled:
+/// - **Case-insensitive**: `"EN"`, `"en"` and `"En"` are equivalent
+/// - **Locale-aware**: for `"en-US"`, `"en_GB"` or `"es-ES"` only the part before the first
+///   `-` or `_` is looked up
+/// - **Prefix-based**: a code without a separator is cut to its first two characters, so
+///   `"eng"` resolves to `"en"` (while `"spa"` becomes the unsupported `"sp"`)
+///
+/// Truncation happens on character boundaries, so arbitrary input (including non-ASCII text)
+/// never panics; it yields `None` when no language matches.
+///
+/// # Language Code Format
+///
+/// Accepts multiple formats:
+/// - ISO 639-1 two-letter codes: `"en"`, `"es"`, `"de"`, etc.
+/// - Uppercase variants: `"EN"`, `"ES"`, `"DE"`
+/// - Locale codes with hyphen: `"en-US"`, `"es-ES"`, `"pt-BR"`
+/// - Locale codes with underscore: `"en_US"`, `"es_ES"`, `"pt_BR"`
+///
+/// # Returns
+///
+/// - `Some(&AHashSet<String>)` if the normalized code is a key of [`STOPWORDS`]
+/// - `None` if the language is not supported
+///
+/// # Examples
+///
+/// ```rust
+/// use kreuzberg::stopwords::get_stopwords;
+///
+/// // Simple language codes
+/// if let Some(en) = get_stopwords("en") {
+///     assert!(en.contains("the"));
+/// }
+///
+/// // Case-insensitive
+/// assert!(get_stopwords("EN").is_some());
+/// assert!(get_stopwords("En").is_some());
+/// assert!(get_stopwords("eN").is_some());
+///
+/// // Locale codes normalized to language code
+/// if let (Some(en_us), Some(en_gb), Some(en_lowercase)) =
+///     (get_stopwords("en-US"), get_stopwords("en_GB"), get_stopwords("en"))
+/// {
+///     // All point to the same stopwords set
+///     assert_eq!(en_us.len(), en_gb.len());
+///     assert_eq!(en_us.len(), en_lowercase.len());
+/// }
+///
+/// // Spanish with various formats
+/// assert!(get_stopwords("es").is_some());
+/// assert!(get_stopwords("ES").is_some());
+/// assert!(get_stopwords("es-ES").is_some());
+/// assert!(get_stopwords("es_MX").is_some());
+///
+/// // Unsupported language returns None
+/// assert!(get_stopwords("xx").is_none());
+/// assert!(get_stopwords("zzzz").is_none());
+///
+/// // Non-ASCII input is rejected gracefully instead of panicking
+/// assert!(get_stopwords("日本語").is_none());
+/// ```
+///
+/// # Performance
+///
+/// Each call lowercases the input (one short string allocation) and performs a single
+/// hash map lookup in [`STOPWORDS`].
+pub fn get_stopwords(lang: &str) -> Option<&'static AHashSet<String>> {
+    let normalized = lang.to_lowercase();
+    let code_end = normalized
+        // A locale separator ends the language part; `find` returns a valid slice boundary.
+        .find(&['-', '_'][..])
+        // No separator: keep at most two characters. `char_indices` yields byte offsets that
+        // are always valid slice boundaries, unlike a fixed byte index of 2, which panics
+        // when it falls inside a multi-byte character.
+        .or_else(|| normalized.char_indices().nth(2).map(|(offset, _)| offset))
+        .unwrap_or(normalized.len());
+    STOPWORDS.get(&normalized[..code_end])
+}
+/// Get stopwords for a language, falling back to a second language when needed.
+///
+/// The primary language is tried first; only when it has no stopword set is the fallback
+/// looked up. Typical uses:
+/// - A detected language is not supported
+/// - English should be used for unknown languages
+/// - Multilingual content needs graceful degradation
+///
+/// Both codes go through [`get_stopwords()`], so the same normalization applies:
+/// - Case-insensitive lookup (EN, en, En all work)
+/// - Locale codes normalized (en-US, en_GB extract to "en")
+///
+/// # Arguments
+///
+/// * `language` - Primary language code to try first
+/// * `fallback` - Language code to use if the primary one is not available
+///
+/// # Returns
+///
+/// - `Some(&AHashSet<String>)` if either language is supported
+/// - `None` if neither language is supported
+///
+/// # Examples
+///
+/// ```rust
+/// use kreuzberg::stopwords::get_stopwords_with_fallback;
+///
+/// // Detected language is Esperanto, fallback to English
+/// if let Some(stopwords) = get_stopwords_with_fallback("eo", "en") {
+///     // Will use Esperanto stopwords (supported)
+///     assert!(stopwords.contains("la"));
+/// }
+///
+/// // Unsupported language, fallback to English
+/// if let Some(stopwords) = get_stopwords_with_fallback("xx", "en") {
+///     // Will use English stopwords (fallback)
+///     assert!(stopwords.contains("the"));
+/// }
+///
+/// // Case-insensitive and locale-aware
+/// let result = get_stopwords_with_fallback("es-MX", "EN-US");
+/// assert!(result.is_some());
+///
+/// // Both unsupported returns None
+/// assert!(get_stopwords_with_fallback("xx", "zz").is_none());
+/// ```
+///
+/// # Common Patterns
+///
+/// ```rust
+/// use kreuzberg::stopwords::get_stopwords_with_fallback;
+///
+/// // English fallback for unknown languages
+/// let detected_lang = "xyz"; // Unknown language
+/// let stopwords = get_stopwords_with_fallback(detected_lang, "en")
+///     .expect("English fallback should always be available");
+///
+/// // Multi-language content with English fallback
+/// for lang in ["de", "fr", "unknown", "es"] {
+///     if let Some(stopwords) = get_stopwords_with_fallback(lang, "en") {
+///         println!("Using stopwords for: {}", lang);
+///     }
+/// }
+/// ```
+///
+/// # Performance
+///
+/// At most two [`get_stopwords()`] calls are made; the fallback is only looked up when the
+/// primary language yields `None`.
+pub fn get_stopwords_with_fallback(language: &str, fallback: &str) -> Option<&'static AHashSet<String>> {
+    match get_stopwords(language) {
+        Some(primary) => Some(primary),
+        None => get_stopwords(fallback),
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// Dereferencing the lazy static must yield a registry with populated English and Spanish sets.
+    #[test]
+    fn test_stopwords_lazy_initialization() {
+        let registry = &*STOPWORDS;
+        for code in ["en", "es"].iter().copied() {
+            assert!(registry.contains_key(code));
+        }
+        for code in ["en", "es"].iter().copied() {
+            assert!(!registry.get(code).unwrap().is_empty());
+        }
+    }
+    /// The English set must contain basic function words and hold at least 70 entries.
+    #[test]
+    fn test_english_stopwords() {
+        let english = STOPWORDS.get("en").unwrap();
+        for word in ["the", "is", "and", "a", "of"].iter().copied() {
+            assert!(english.contains(word));
+        }
+        assert!(english.len() >= 70);
+    }
+    /// The Spanish set must contain basic function words and hold at least 200 entries.
+    #[test]
+    fn test_spanish_stopwords() {
+        let spanish = STOPWORDS.get("es").unwrap();
+        for word in ["el", "la", "es", "en", "de"].iter().copied() {
+            assert!(spanish.contains(word));
+        }
+        assert!(spanish.len() >= 200);
+    }
+    /// Every expected language must be present with a non-empty set, and the registry must hold exactly 64 keys.
+    #[test]
+    fn test_all_64_languages_loaded() {
+        let expected_languages = [
+            "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa",
+            "fi", "fr", "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko",
+            "ku", "la", "lt", "lv", "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk",
+            "sl", "so", "st", "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
+        ];
+        for lang in expected_languages.iter().copied() {
+            match STOPWORDS.get(lang) {
+                None => panic!("Missing stopwords for language: {}", lang),
+                Some(set) => assert!(!set.is_empty(), "Empty stopwords for language: {}", lang),
+            }
+        }
+        let loaded = STOPWORDS.len();
+        assert_eq!(loaded, 64, "Expected 64 languages, found {}", loaded);
+    }
+    /// The German set must contain a few basic function words.
+    #[test]
+    fn test_german_stopwords() {
+        let german = STOPWORDS.get("de").unwrap();
+        for word in ["der", "die", "und"].iter().copied() {
+            assert!(german.contains(word));
+        }
+    }
+    /// The French set must contain a few basic function words.
+    #[test]
+    fn test_french_stopwords() {
+        let french = STOPWORDS.get("fr").unwrap();
+        for word in ["le", "de", "un"].iter().copied() {
+            assert!(french.contains(word));
+        }
+    }
+    /// The Chinese set must exist and hold at least one word.
+    #[test]
+    fn test_chinese_stopwords() {
+        let word_count = STOPWORDS.get("zh").map(|set| set.len()).unwrap();
+        assert!(word_count > 0);
+    }
+    /// The Arabic set must exist and hold at least one word.
+    #[test]
+    fn test_arabic_stopwords() {
+        let word_count = STOPWORDS.get("ar").map(|set| set.len()).unwrap();
+        assert!(word_count > 0);
+    }
+    /// Keys that are not language codes must be absent from the registry.
+    #[test]
+    fn test_unknown_language_returns_none() {
+        assert!(STOPWORDS.get("xx").is_none());
+        assert!(!STOPWORDS.contains_key("unknown"));
+    }
+    /// Plain lowercase codes must resolve.
+    #[test]
+    fn test_get_stopwords_lowercase() {
+        for code in ["en", "es", "de", "fr"].iter().copied() {
+            assert!(get_stopwords(code).is_some());
+        }
+    }
+    /// An uppercase code must resolve to a set as large as the lowercase code's.
+    #[test]
+    fn test_get_stopwords_uppercase() {
+        let (upper, lower) = (get_stopwords("EN"), get_stopwords("en"));
+        assert!(upper.is_some());
+        assert!(lower.is_some());
+        let (upper_len, lower_len) = (upper.unwrap().len(), lower.unwrap().len());
+        assert_eq!(upper_len, lower_len);
+    }
+    /// Codes in mixed or upper casing must resolve.
+    #[test]
+    fn test_get_stopwords_mixed_case() {
+        for code in ["En", "eN", "ES", "Es", "DE", "De"].iter().copied() {
+            assert!(get_stopwords(code).is_some());
+        }
+    }
+    /// Hyphenated English locales must resolve to a set as large as the plain `en` set.
+    #[test]
+    fn test_get_stopwords_locale_hyphen() {
+        let locales: Vec<_> = ["en-US", "en-GB"].iter().map(|code| get_stopwords(code)).collect();
+        for found in locales.iter() {
+            assert!(found.is_some());
+        }
+        let base_len = get_stopwords("en").unwrap().len();
+        for found in locales.iter() {
+            assert_eq!(found.unwrap().len(), base_len);
+        }
+    }
+    /// Underscore-separated Spanish locales must resolve to a set as large as the plain `es` set.
+    #[test]
+    fn test_get_stopwords_locale_underscore() {
+        let locales: Vec<_> = ["es_ES", "es_MX"].iter().map(|code| get_stopwords(code)).collect();
+        for found in locales.iter() {
+            assert!(found.is_some());
+        }
+        let base_len = get_stopwords("es").unwrap().len();
+        for found in locales.iter() {
+            assert_eq!(found.unwrap().len(), base_len);
+        }
+    }
+    /// Upper- and mixed-case locale strings resolve to the expected language,
+    /// verified by probing one characteristic word per language.
+    #[test]
+    fn test_get_stopwords_locale_uppercase() {
+        // (locale string, word expected in the resolved set)
+        let cases = [("EN-US", "the"), ("ES_ES", "el"), ("Pt-BR", "o")];
+        for (locale, probe) in cases {
+            let words = get_stopwords(locale);
+            assert!(words.is_some(), "{} should resolve", locale);
+            assert!(words.unwrap().contains(probe), "{} should contain '{}'", locale, probe);
+        }
+    }
+    /// Every language code in the supported list resolves through `get_stopwords`.
+    #[test]
+    fn test_get_stopwords_all_supported_languages() {
+        const LANGUAGES: &[&str] = &[
+            "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr",
+            "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt", "lv",
+            "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta",
+            "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
+        ];
+        for &lang in LANGUAGES {
+            let lookup = get_stopwords(lang);
+            assert!(lookup.is_some(), "Language {} should be available via get_stopwords", lang);
+        }
+    }
+    /// Codes that match no supported language yield `None`.
+    #[test]
+    fn test_get_stopwords_unsupported_language() {
+        for code in ["xx", "zz", "xyz", "unknown"] {
+            assert!(get_stopwords(code).is_none(), "{} should not resolve", code);
+        }
+    }
+    /// An empty language string yields `None`.
+    #[test]
+    fn test_get_stopwords_empty_string() {
+        let lookup = get_stopwords("");
+        assert!(lookup.is_none());
+    }
+    /// A single letter is too short to name a language and yields `None`.
+    #[test]
+    fn test_get_stopwords_single_char() {
+        for code in ["e", "z"] {
+            assert!(get_stopwords(code).is_none(), "{} should not resolve", code);
+        }
+    }
+    /// Locale strings with extra trailing components resolve to the same-sized
+    /// set as their bare two-letter language code.
+    #[test]
+    fn test_get_stopwords_long_locale() {
+        // (extended locale string, bare language code it should match)
+        let cases = [("zh-CN-Hans", "zh"), ("pt_BR.UTF-8", "pt")];
+        for (locale, base) in cases {
+            let words = get_stopwords(locale);
+            assert!(words.is_some(), "{} should resolve", locale);
+            assert_eq!(words.unwrap().len(), get_stopwords(base).unwrap().len());
+        }
+    }
+    /// The English, Spanish, German and French sets each contain three
+    /// characteristic function words.
+    #[test]
+    fn test_get_stopwords_content_verification() {
+        // (language code, panic message if the set is missing, words that must be present)
+        let cases = [
+            ("en", "English stopwords should exist", ["the", "is", "and"]),
+            ("es", "Spanish stopwords should exist", ["el", "la", "es"]),
+            ("de", "German stopwords should exist", ["der", "die", "und"]),
+            ("fr", "French stopwords should exist", ["le", "de", "un"]),
+        ];
+        for (code, missing_msg, required) in cases {
+            let words = get_stopwords(code).expect(missing_msg);
+            for word in required {
+                assert!(words.contains(word), "{} should contain '{}'", code, word);
+            }
+        }
+    }
+    /// `get_stopwords("en")` and a direct `STOPWORDS` lookup expose the same
+    /// words: equal sizes, and every directly stored word is found via the helper.
+    #[test]
+    fn test_get_stopwords_vs_direct_access() {
+        let via_helper = get_stopwords("en").unwrap();
+        let via_table = STOPWORDS.get("en").unwrap();
+        assert_eq!(via_helper.len(), via_table.len());
+        assert!(via_table.iter().all(|word| via_helper.contains(word)));
+    }
+    /// When the primary language resolves, its set is returned and the fallback
+    /// language's words are not mixed in.
+    #[test]
+    fn test_get_stopwords_with_fallback_primary_available() {
+        // `unwrap` doubles as the presence check.
+        let words = get_stopwords_with_fallback("en", "es").unwrap();
+        assert!(words.contains("the"));
+        assert!(!words.contains("el"));
+    }
+    /// An unresolvable primary language falls through to the fallback set.
+    #[test]
+    fn test_get_stopwords_with_fallback_use_fallback() {
+        match get_stopwords_with_fallback("xx", "en") {
+            Some(words) => assert!(words.contains("the")),
+            None => panic!("fallback to 'en' should produce a set"),
+        }
+    }
+    /// With neither language resolvable the helper returns `None`.
+    #[test]
+    fn test_get_stopwords_with_fallback_both_unavailable() {
+        assert!(get_stopwords_with_fallback("xx", "zz").is_none());
+    }
+    /// Upper-case codes are accepted in both the primary and the fallback position.
+    #[test]
+    fn test_get_stopwords_with_fallback_case_insensitive() {
+        for (primary, fallback) in [("EN", "es"), ("xx", "ES")] {
+            let lookup = get_stopwords_with_fallback(primary, fallback);
+            assert!(lookup.is_some(), "fallback({}, {}) should resolve", primary, fallback);
+        }
+    }
+    /// Locale-style codes work in both positions; the primary (`es-MX`) wins.
+    #[test]
+    fn test_get_stopwords_with_fallback_locale_codes() {
+        let words = get_stopwords_with_fallback("es-MX", "en-US").unwrap();
+        assert!(words.contains("el"));
+    }
+    /// A lookup with primary `eo` and fallback `en` yields a set containing "la".
+    #[test]
+    fn test_get_stopwords_with_fallback_esperanto_to_english() {
+        let words = get_stopwords_with_fallback("eo", "en").unwrap();
+        assert!(words.contains("la"));
+    }
+    /// An unknown three-letter primary code falls back to English.
+    #[test]
+    fn test_get_stopwords_with_fallback_unknown_to_english() {
+        let words = get_stopwords_with_fallback("xyz", "en").unwrap();
+        assert!(words.contains("the"));
+    }
+    /// The fallback helper agrees with a hand-written `or_else` chain: both are
+    /// present or both absent, and when present the sets have equal size.
+    #[test]
+    fn test_get_stopwords_with_fallback_same_as_chained_or_else() {
+        let manual_len = get_stopwords("xx").or_else(|| get_stopwords("en")).map(|set| set.len());
+        let helper_len = get_stopwords_with_fallback("xx", "en").map(|set| set.len());
+        // Comparing `Option<usize>` covers presence and size in a single check.
+        assert_eq!(manual_len, helper_len);
+    }
+    /// Assorted malformed or unknown codes yield `None`.
+    #[test]
+    fn test_get_stopwords_invalid_language_codes() {
+        for code in ["invalid_lang", "xyz", "zzzz", "abc123", "!!!"] {
+            assert!(get_stopwords(code).is_none(), "{} should not resolve", code);
+        }
+    }
+    /// Empty and whitespace-only inputs yield `None`.
+    #[test]
+    fn test_get_stopwords_edge_case_empty_and_whitespace() {
+        for blank in ["", " ", "  ", "\t", "\n"] {
+            assert!(get_stopwords(blank).is_none(), "{:?} should not resolve", blank);
+        }
+    }
+    /// Pure-punctuation inputs yield `None`, while a valid two-letter code
+    /// followed by punctuation still resolves to that language.
+    #[test]
+    fn test_get_stopwords_special_characters() {
+        for junk in ["@#", "$%", "!!!"] {
+            assert!(get_stopwords(junk).is_none(), "{} should not resolve", junk);
+        }
+
+        // (input with trailing punctuation, word expected in the resolved set)
+        let prefixed = [("en!", "the"), ("es@", "el"), ("de#fr", "der")];
+        for (input, probe) in prefixed {
+            match get_stopwords(input) {
+                Some(words) => assert!(words.contains(probe), "{} should contain '{}'", input, probe),
+                None => panic!("{} should resolve", input),
+            }
+        }
+    }
+    /// Purely numeric inputs yield `None`.
+    #[test]
+    fn test_get_stopwords_numeric_codes() {
+        for digits in ["12", "99", "123", "0"] {
+            assert!(get_stopwords(digits).is_none(), "{} should not resolve", digits);
+        }
+    }
+    /// One-character inputs (letters, a digit, an underscore) yield `None`.
+    #[test]
+    fn test_get_stopwords_single_character_edge_cases() {
+        for single in ["a", "e", "z", "1", "_"] {
+            assert!(get_stopwords(single).is_none(), "{} should not resolve", single);
+        }
+    }
+    /// Locale-shaped strings whose language part is unsupported yield `None`.
+    #[test]
+    fn test_get_stopwords_invalid_locale_formats() {
+        for locale in ["xx-YY", "zz_ZZ", "invalid-US", "aa_BB_CC"] {
+            assert!(get_stopwords(locale).is_none(), "{} should not resolve", locale);
+        }
+    }
+    /// A valid code followed by digits resolves; a valid code preceded by junk does not.
+    #[test]
+    fn test_get_stopwords_mixed_valid_invalid() {
+        let trailing_digits = get_stopwords("en123");
+        assert!(trailing_digits.is_some(), "Should extract 'en' from 'en123'");
+
+        for leading_junk in ["12en", "@@en"] {
+            assert!(get_stopwords(leading_junk).is_none(), "{} should not resolve", leading_junk);
+        }
+    }
+    /// Every capitalisation of `en` resolves, and all resolve to equally sized sets.
+    #[test]
+    fn test_get_stopwords_case_sensitivity_validation() {
+        // `unwrap` doubles as the presence check for each spelling.
+        let reference_len = get_stopwords("en").unwrap().len();
+        for spelling in ["EN", "En", "eN"] {
+            assert_eq!(get_stopwords(spelling).unwrap().len(), reference_len);
+        }
+    }
+    /// `None` results compose safely with `Option` combinators: chaining over
+    /// unresolvable codes stays `None`.
+    #[test]
+    fn test_get_stopwords_none_return_safety() {
+        let via_and_then = get_stopwords("invalid").and_then(|_| get_stopwords("also_invalid"));
+        assert!(via_and_then.is_none());
+
+        // Fold an `or_else` chain over the candidates; each later lookup runs
+        // only while the accumulated value is still `None`.
+        let mut chained = None;
+        for candidate in ["xxx", "yyy", "zzz"] {
+            chained = chained.or_else(|| get_stopwords(candidate));
+        }
+        assert!(chained.is_none());
+    }
+    /// The fallback helper returns `None` when neither argument resolves.
+    #[test]
+    fn test_get_stopwords_with_fallback_both_invalid() {
+        let pairs = [("invalid", "also_invalid"), ("xxx", "yyy"), ("", ""), ("123", "456")];
+        for (primary, fallback) in pairs {
+            let lookup = get_stopwords_with_fallback(primary, fallback);
+            assert!(lookup.is_none(), "fallback({:?}, {:?}) should not resolve", primary, fallback);
+        }
+    }
+    /// With an invalid primary code, the set of the valid fallback language is returned.
+    #[test]
+    fn test_get_stopwords_with_fallback_invalid_primary_valid_fallback() {
+        // (invalid primary, valid fallback, word expected from the fallback language)
+        let cases = [("invalid_lang", "en", "the"), ("xyz", "es", "el")];
+        for (primary, fallback, probe) in cases {
+            let words = get_stopwords_with_fallback(primary, fallback);
+            assert!(words.is_some(), "fallback({}, {}) should resolve", primary, fallback);
+            assert!(words.unwrap().contains(probe));
+        }
+    }
+    /// With a valid primary code, an invalid fallback is irrelevant: the primary set is returned.
+    #[test]
+    fn test_get_stopwords_with_fallback_valid_primary_invalid_fallback() {
+        // (valid primary, invalid fallback, word expected from the primary language)
+        let cases = [("en", "invalid_fallback", "the"), ("es", "zzz", "el")];
+        for (primary, fallback, probe) in cases {
+            let words = get_stopwords_with_fallback(primary, fallback);
+            assert!(words.is_some(), "fallback({}, {}) should resolve", primary, fallback);
+            assert!(words.unwrap().contains(probe));
+        }
+    }
+    /// An empty string in one position is tolerated as long as the other
+    /// position resolves; two empty strings give `None`.
+    #[test]
+    fn test_get_stopwords_with_fallback_empty_strings() {
+        // (primary, fallback, whether a set is expected)
+        let cases = [("", "en", true), ("en", "", true), ("", "", false)];
+        for (primary, fallback, expect_hit) in cases {
+            let lookup = get_stopwords_with_fallback(primary, fallback);
+            assert_eq!(lookup.is_some(), expect_hit, "fallback({:?}, {:?})", primary, fallback);
+        }
+    }
+    /// Punctuation-only input in one position is tolerated as long as the other
+    /// position resolves; punctuation in both gives `None`.
+    #[test]
+    fn test_get_stopwords_with_fallback_special_characters() {
+        // (primary, fallback, whether a set is expected)
+        let cases = [("@#$", "en", true), ("en", "!!!", true), ("@#$", "!!!", false)];
+        for (primary, fallback, expect_hit) in cases {
+            let lookup = get_stopwords_with_fallback(primary, fallback);
+            assert_eq!(lookup.is_some(), expect_hit, "fallback({:?}, {:?})", primary, fallback);
+        }
+    }
+    /// Letter case of either argument does not affect fallback resolution: an
+    /// invalid primary in any case still falls back to English.
+    #[test]
+    fn test_get_stopwords_with_fallback_case_insensitive_validation() {
+        let pairs = [("INVALID", "en"), ("invalid", "EN"), ("INVALID", "EN")];
+        for (primary, fallback) in pairs {
+            // `unwrap` doubles as the presence check.
+            let words = get_stopwords_with_fallback(primary, fallback).unwrap();
+            assert!(words.contains("the"));
+        }
+    }
+    /// Direct `STOPWORDS` lookups are exact-match: unknown codes, upper-case
+    /// codes, locale strings and the empty string all miss.
+    #[test]
+    fn test_direct_stopwords_access_invalid_keys() {
+        for key in ["invalid", "EN", "en-US", "xyz", ""] {
+            assert!(STOPWORDS.get(key).is_none(), "{:?} should not be a table key", key);
+        }
+    }
+    /// Non-lowercase codes miss on a direct `STOPWORDS` lookup but resolve
+    /// through `get_stopwords`.
+    #[test]
+    fn test_stopwords_case_sensitivity_direct_vs_normalized() {
+        for code in ["EN", "Es", "DE"] {
+            assert!(STOPWORDS.get(code).is_none(), "{} should miss on direct lookup", code);
+            assert!(get_stopwords(code).is_some(), "{} should resolve via get_stopwords", code);
+        }
+    }
+    /// Locale strings whose suffix is written in a non-ASCII script still resolve.
+    ///
+    /// Every input below starts with an ASCII two-letter language code; only the
+    /// part after the separator contains multi-byte characters.
+    #[test]
+    fn test_get_stopwords_unicode_characters() {
+        // NOTE(review): the original author flagged that `get_stopwords` slices its
+        // input by bytes and can panic when the leading characters are multi-byte.
+        // That case is deliberately not exercised here; confirm against the
+        // implementation before adding such inputs.
+        for locale in ["zh-中文", "ar-العربية", "ja_日本"] {
+            assert!(get_stopwords(locale).is_some(), "{} should resolve", locale);
+        }
+        for code in ["xx", "yy"] {
+            assert!(get_stopwords(code).is_none(), "{} should not resolve", code);
+        }
+    }
+    /// Very long inputs are handled: 1000 `x` characters do not resolve, while
+    /// `en-` followed by 100 `X` characters resolves.
+    #[test]
+    fn test_get_stopwords_very_long_strings() {
+        let unknown_run = "x".repeat(1000);
+        assert!(get_stopwords(&unknown_run).is_none());
+
+        let padded_locale = format!("en-{}", "X".repeat(100));
+        assert!(get_stopwords(&padded_locale).is_some());
+    }
+    /// NUL bytes: a lone or leading NUL does not resolve; a trailing NUL after a
+    /// valid code does.
+    #[test]
+    fn test_get_stopwords_null_bytes() {
+        // (input, whether a set is expected)
+        let cases = [("\0", false), ("en\0", true), ("\0en", false)];
+        for (input, expect_hit) in cases {
+            assert_eq!(get_stopwords(input).is_some(), expect_hit, "input {:?}", input);
+        }
+    }
+    /// Length boundaries around the two-letter code: one letter misses; two
+    /// letters, three letters and a code with a bare trailing separator resolve.
+    #[test]
+    fn test_get_stopwords_boundary_conditions() {
+        // (input, whether a set is expected)
+        let cases = [("e", false), ("en", true), ("eng", true), ("en-", true)];
+        for (input, expect_hit) in cases {
+            assert_eq!(get_stopwords(input).is_some(), expect_hit, "input {:?}", input);
+        }
+    }
+    /// Locale strings with several (and mixed) separators still resolve.
+    #[test]
+    fn test_get_stopwords_multiple_separators() {
+        for locale in ["en-US-utf8", "es_MX_special", "pt-BR_variant"] {
+            assert!(get_stopwords(locale).is_some(), "{} should resolve", locale);
+        }
+    }
+    /// Romance-language sets exist, contain the expected function words and meet
+    /// their minimum-size thresholds.
+    #[test]
+    fn test_romance_languages() {
+        // (code, display name, words that must be present, minimum set size)
+        let expectations: [(&str, &str, &[&str], usize); 5] = [
+            ("fr", "French", &["le", "et"], 150),
+            ("es", "Spanish", &["el", "y"], 200),
+            ("pt", "Portuguese", &["o", "e"], 150),
+            ("it", "Italian", &["il", "e"], 150),
+            ("ro", "Romanian", &[], 100),
+        ];
+        for (code, name, required, min_len) in expectations {
+            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
+            if required.is_empty() {
+                // No specific words are pinned for this language; require a non-empty set instead.
+                assert!(!words.is_empty(), "{} should have stopwords", name);
+            }
+            for word in required {
+                assert!(words.contains(*word), "{} should contain '{}'", name, word);
+            }
+            assert!(words.len() >= min_len, "{} should have substantial stopwords", name);
+        }
+    }
+    /// Germanic-language sets exist; where pinned, they contain the expected
+    /// function words and meet a minimum size.
+    #[test]
+    fn test_germanic_languages() {
+        // (code, display name, words that must be present, optional minimum set size)
+        let expectations: [(&str, &str, &[&str], Option<usize>); 6] = [
+            ("de", "German", &["der", "die", "und"], Some(200)),
+            ("en", "English", &["the", "and"], Some(70)),
+            ("nl", "Dutch", &["de", "het"], Some(100)),
+            ("sv", "Swedish", &[], Some(100)),
+            ("no", "Norwegian", &[], None),
+            ("da", "Danish", &[], None),
+        ];
+        for (code, name, required, min_len) in expectations {
+            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
+            if required.is_empty() {
+                // No specific words are pinned for this language; require a non-empty set instead.
+                assert!(!words.is_empty(), "{} should have stopwords", name);
+            }
+            for word in required {
+                assert!(words.contains(*word), "{} should contain '{}'", name, word);
+            }
+            if let Some(floor) = min_len {
+                assert!(words.len() >= floor, "{} should have substantial stopwords", name);
+            }
+        }
+    }
+    /// Slavic-language sets exist and are non-empty; Russian and Polish must
+    /// also hold at least 100 words.
+    #[test]
+    fn test_slavic_languages() {
+        // (code, display name, optional minimum set size)
+        let expectations = [
+            ("ru", "Russian", Some(100)),
+            ("pl", "Polish", Some(100)),
+            ("cs", "Czech", None),
+            ("sk", "Slovak", None),
+            ("bg", "Bulgarian", None),
+            ("uk", "Ukrainian", None),
+            ("hr", "Croatian", None),
+            ("sl", "Slovenian", None),
+        ];
+        for (code, name, min_len) in expectations {
+            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
+            assert!(!words.is_empty(), "{} should have stopwords", name);
+            if let Some(floor) = min_len {
+                assert!(words.len() >= floor, "{} should have substantial stopwords", name);
+            }
+        }
+    }
+    /// Asian-language sets exist and are non-empty; Chinese, Japanese and Hindi
+    /// must also meet a minimum size.
+    #[test]
+    fn test_asian_languages() {
+        // (code, display name, optional minimum set size)
+        let expectations = [
+            ("zh", "Chinese", Some(50)),
+            ("ja", "Japanese", Some(50)),
+            ("ko", "Korean", None),
+            ("hi", "Hindi", Some(100)),
+            ("bn", "Bengali", None),
+            ("th", "Thai", None),
+            ("vi", "Vietnamese", None),
+        ];
+        for (code, name, min_len) in expectations {
+            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
+            assert!(!words.is_empty(), "{} should have stopwords", name);
+            if let Some(floor) = min_len {
+                assert!(words.len() >= floor, "{} should have substantial stopwords", name);
+            }
+        }
+    }
+    /// African-language sets exist and are non-empty.
+    #[test]
+    fn test_african_languages() {
+        // (code, display name used in failure messages)
+        let languages = [
+            ("af", "Afrikaans"),
+            ("sw", "Swahili"),
+            ("yo", "Yoruba"),
+            ("zu", "Zulu"),
+            ("ha", "Hausa"),
+            ("so", "Somali"),
+            ("st", "Sesotho"),
+        ];
+        for (code, name) in languages {
+            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
+            assert!(!words.is_empty(), "{} should have stopwords", name);
+        }
+    }
+    /// Indic-language sets exist and are non-empty.
+    #[test]
+    fn test_indic_languages() {
+        // (code, display name used in failure messages)
+        let languages = [
+            ("hi", "Hindi"),
+            ("bn", "Bengali"),
+            ("gu", "Gujarati"),
+            ("kn", "Kannada"),
+            ("ml", "Malayalam"),
+            ("mr", "Marathi"),
+            ("ta", "Tamil"),
+            ("te", "Telugu"),
+            ("ur", "Urdu"),
+            ("ne", "Nepali"),
+            ("si", "Sinhala"),
+        ];
+        for (code, name) in languages {
+            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
+            assert!(!words.is_empty(), "{} should have stopwords", name);
+        }
+    }
+    /// Middle Eastern language sets exist and are non-empty; Arabic must also
+    /// hold at least 100 words.
+    #[test]
+    fn test_middle_eastern_languages() {
+        // (code, display name, optional minimum set size)
+        let expectations = [
+            ("ar", "Arabic", Some(100)),
+            ("fa", "Persian", None),
+            ("he", "Hebrew", None),
+            ("tr", "Turkish", None),
+            ("ku", "Kurdish", None),
+        ];
+        for (code, name, min_len) in expectations {
+            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
+            // The emptiness message is generated uniformly, so Kurdish now reports
+            // "should have stopwords" like its siblings rather than repeating the
+            // existence message for a different condition.
+            assert!(!words.is_empty(), "{} should have stopwords", name);
+            if let Some(floor) = min_len {
+                assert!(words.len() >= floor, "{} should have substantial stopwords", name);
+            }
+        }
+    }
+    /// Remaining supported languages exist and are non-empty; Esperanto must
+    /// additionally contain "la".
+    #[test]
+    fn test_other_languages() {
+        let esperanto = get_stopwords("eo").expect("Esperanto stopwords should exist");
+        assert!(esperanto.contains("la"), "Esperanto should contain 'la'");
+
+        // (code, display name used in failure messages)
+        let languages = [
+            ("hy", "Armenian"),
+            ("eu", "Basque"),
+            ("br", "Breton"),
+            ("ca", "Catalan"),
+            ("eo", "Esperanto"),
+            ("et", "Estonian"),
+            ("fi", "Finnish"),
+            ("gl", "Galician"),
+            ("hu", "Hungarian"),
+            ("id", "Indonesian"),
+            ("ga", "Irish"),
+            ("la", "Latin"),
+            ("lt", "Lithuanian"),
+            ("lv", "Latvian"),
+            ("ms", "Malay"),
+            ("tl", "Tagalog"),
+        ];
+        for (code, name) in languages {
+            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords should exist", name));
+            assert!(!words.is_empty(), "{} should have stopwords", name);
+        }
+    }
+    /// Three-letter codes resolve only when their first two letters form a
+    /// supported two-letter code, and then match that code's set size.
+    #[test]
+    fn test_language_code_variants() {
+        // (three-letter input, two-letter code it is expected to match)
+        let matching = [("eng", "en"), ("deu", "de"), ("fra", "fr"), ("zho", "zh")];
+        for (three_letter, two_letter) in matching {
+            let long_form = get_stopwords(three_letter);
+            assert!(
+                long_form.is_some(),
+                "'{}' should extract to '{}'",
+                three_letter,
+                two_letter
+            );
+            let short_form = get_stopwords(two_letter);
+            assert!(short_form.is_some());
+            assert_eq!(long_form.unwrap().len(), short_form.unwrap().len());
+        }
+
+        let spa = get_stopwords("spa");
+        assert!(spa.is_none(), "'spa' extracts to 'sp' which is invalid");
+    }
+    /// Sanity-checks the size of every registered stopword set.
+    ///
+    /// Each language must hold between 5 and 1500 words, the table must contain
+    /// exactly 64 languages, and the English and Spanish sets must fall inside
+    /// their own expected ranges.
+    #[test]
+    fn test_stopword_set_sizes() {
+        // Count entries while iterating rather than cloning every language code
+        // into a throwaway Vec whose only use was its length.
+        let mut language_count = 0usize;
+        for (lang, stopwords) in STOPWORDS.iter() {
+            language_count += 1;
+            let size = stopwords.len();
+            assert!(!stopwords.is_empty(), "Language {} has empty stopwords", lang);
+            assert!(
+                size >= 5,
+                "Language {} has suspiciously few stopwords: {}",
+                lang,
+                size
+            );
+            assert!(
+                size <= 1500,
+                "Language {} has suspiciously many stopwords: {}",
+                lang,
+                size
+            );
+        }
+        assert_eq!(language_count, 64, "Should have exactly 64 languages");
+
+        let en_size = STOPWORDS.get("en").unwrap().len();
+        assert!(
+            (70..=1500).contains(&en_size),
+            "English stopwords size {} outside expected range",
+            en_size
+        );
+        let es_size = STOPWORDS.get("es").unwrap().len();
+        assert!(
+            (200..=1000).contains(&es_size),
+            "Spanish stopwords size {} outside expected range",
+            es_size
+        );
+    }
+    /// The English, Spanish, German and French sets contain a broad sample of
+    /// very common function words.
+    #[test]
+    fn test_stopword_content_quality() {
+        // Static table instead of per-language `vec![]` literals: the data is
+        // fixed, so no heap allocation is needed.
+        // (code, display name, common words that must be present)
+        let expectations: [(&str, &str, &[&str]); 4] = [
+            (
+                "en",
+                "English",
+                &[
+                    "the", "is", "are", "was", "were", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
+                    "of", "with",
+                ],
+            ),
+            (
+                "es",
+                "Spanish",
+                &["el", "la", "los", "las", "un", "una", "de", "en", "y", "o", "por", "para"],
+            ),
+            (
+                "de",
+                "German",
+                &["der", "die", "das", "den", "dem", "des", "und", "oder", "in", "auf", "mit", "von"],
+            ),
+            (
+                "fr",
+                "French",
+                &["le", "la", "les", "un", "une", "de", "en", "et", "ou", "pour", "avec", "dans"],
+            ),
+        ];
+        for (code, name, common_words) in expectations {
+            let words = get_stopwords(code).unwrap_or_else(|| panic!("{} stopwords", name));
+            for word in common_words {
+                assert!(words.contains(*word), "{} missing common stopword: {}", name, word);
+            }
+        }
+    }
+    /// No language's stopword collection contains the same word twice:
+    /// re-collecting into a set must not shrink it.
+    #[test]
+    fn test_stopword_deduplication() {
+        for (lang, stopwords) in STOPWORDS.iter() {
+            let distinct: AHashSet<_> = stopwords.iter().collect();
+            assert_eq!(
+                stopwords.len(),
+                distinct.len(),
+                "Language {} has duplicate stopwords",
+                lang
+            );
+        }
+    }
+    /// For several languages, every capitalisation of the two-letter code
+    /// resolves, and all capitalisations yield equally sized sets.
+    #[test]
+    fn test_case_normalization_comprehensive() {
+        // Fixed-size array instead of `vec![]`: the cases are constants.
+        // Each row lists the lowercase code first, then its other spellings.
+        let spelling_groups = [
+            ["en", "EN", "En", "eN"],
+            ["es", "ES", "Es", "eS"],
+            ["de", "DE", "De", "dE"],
+            ["fr", "FR", "Fr", "fR"],
+            ["zh", "ZH", "Zh", "zH"],
+            ["ar", "AR", "Ar", "aR"],
+        ];
+        for group in spelling_groups {
+            // First make sure every spelling resolves, so a miss is reported by name.
+            for spelling in group {
+                assert!(get_stopwords(spelling).is_some(), "{} should be valid", spelling);
+            }
+            // Then compare each other spelling against the lowercase reference.
+            let [lower, others @ ..] = group;
+            let expected_len = get_stopwords(lower).unwrap().len();
+            for spelling in others {
+                assert_eq!(get_stopwords(spelling).unwrap().len(), expected_len);
+            }
+        }
+    }
+    /// Hyphen- and underscore-separated locale variants resolve to a set the
+    /// same size as their base language.
+    #[test]
+    fn test_locale_code_normalization_comprehensive() {
+        // Fixed-size array instead of `vec![]`: the cases are constants.
+        // (base language code, locale variants that must match it)
+        let cases = [
+            ("en", ["en-US", "en_US", "en-GB", "en_GB"]),
+            ("es", ["es-ES", "es_ES", "es-MX", "es_MX"]),
+            ("pt", ["pt-PT", "pt_PT", "pt-BR", "pt_BR"]),
+            ("zh", ["zh-CN", "zh_CN", "zh-TW", "zh_TW"]),
+            ("fr", ["fr-FR", "fr_FR", "fr-CA", "fr_CA"]),
+        ];
+        for (base, variants) in cases {
+            let expected_len = get_stopwords(base)
+                .unwrap_or_else(|| panic!("{} should be valid", base))
+                .len();
+            // Check presence for all variants first, so a miss is reported by name.
+            for variant in variants {
+                assert!(get_stopwords(variant).is_some(), "{} should be valid", variant);
+            }
+            for variant in variants {
+                let actual_len = get_stopwords(variant).unwrap().len();
+                assert_eq!(actual_len, expected_len, "{} should match {}", variant, base);
+            }
+        }
+    }
+    /// Exercises `get_stopwords_with_fallback` over primary/fallback combinations.
+    ///
+    /// Each scenario is `(primary, fallback, should_succeed, expected_lang)`.
+    /// The call must return `Some` exactly when `should_succeed` is true, and a
+    /// returned collection must be as long as the one for `expected_lang`.
+    #[test]
+    fn test_fallback_chains() {
+        let scenarios = [
+            ("en", "es", true, "en"),
+            ("xx", "en", true, "en"),
+            ("xx", "yy", false, ""),
+            ("es", "xx", true, "es"),
+        ];
+        for (primary, fallback, should_succeed, expected_lang) in scenarios {
+            let outcome = get_stopwords_with_fallback(primary, fallback);
+            let verb = if should_succeed { "succeed" } else { "fail" };
+            assert_eq!(
+                outcome.is_some(),
+                should_succeed,
+                "Fallback({}, {}) should {}",
+                primary,
+                fallback,
+                verb
+            );
+            // Past the assertion above, `Some` is present exactly for the
+            // scenarios that are expected to succeed.
+            if let Some(stopwords) = outcome {
+                let expected = get_stopwords(expected_lang).unwrap();
+                assert_eq!(
+                    stopwords.len(),
+                    expected.len(),
+                    "Fallback should return {} stopwords",
+                    expected_lang
+                );
+            }
+        }
+    }
+    /// Sanity-checks every stopword of every language in `STOPWORDS`.
+    ///
+    /// Each word must be non-empty, at most 100 bytes long, and contain at
+    /// least one character.
+    #[test]
+    fn test_stopword_string_types() {
+        // Flatten the map into one stream of (language, word) pairs.
+        let entries = STOPWORDS
+            .iter()
+            .flat_map(|(lang, words)| words.iter().map(move |word| (lang, word)));
+        for (lang, word) in entries {
+            let byte_len = word.len();
+            assert!(!word.is_empty(), "Language {} has empty stopword", lang);
+            assert!(
+                byte_len <= 100,
+                "Language {} has suspiciously long stopword: {} ({} bytes)",
+                lang,
+                word,
+                byte_len
+            );
+            // A string that passed the non-empty check always has a first character.
+            assert!(
+                word.chars().next().is_some(),
+                "Language {} has invalid UTF-8 stopword",
+                lang
+            );
+        }
+    }
+    /// Looks up stopwords from several threads at once.
+    ///
+    /// One thread is spawned per language code. Each thread must find a
+    /// collection for its language and reports its length, and every reported
+    /// length must be greater than zero.
+    #[test]
+    fn test_concurrent_access() {
+        use std::thread;
+        let languages = ["en", "es", "de", "fr", "zh", "ar", "ru", "ja"];
+        // Spawn every worker before joining any of them.
+        let workers: Vec<_> = languages
+            .iter()
+            .copied()
+            .map(|lang| {
+                thread::spawn(move || match get_stopwords(lang) {
+                    Some(words) => words.len(),
+                    None => panic!("Language {} should be available", lang),
+                })
+            })
+            .collect();
+        for worker in workers {
+            let count = worker.join().expect("Thread should not panic");
+            assert!(count > 0, "Stopwords should not be empty");
+        }
+    }
+    /// Checks that two separate lookups of `"en"` return the same content.
+    ///
+    /// Both results must have equal length, and every word from the first
+    /// lookup must be contained in the second.
+    #[test]
+    fn test_stopwords_immutability() {
+        let first = get_stopwords("en").unwrap();
+        let second = get_stopwords("en").unwrap();
+        assert_eq!(first.len(), second.len());
+        first.iter().for_each(|word| {
+            assert!(
+                second.contains(word),
+                "Stopword '{}' should exist in both references",
+                word
+            );
+        });
+    }
+    /// Probes codes whose `-` / `_` separators sit in unusual positions.
+    ///
+    /// Each case is `(code, should_find_en)`. When the flag is true the lookup
+    /// must succeed and the result must contain `"the"`. When it is false the
+    /// lookup is still performed, but its result is discarded without any
+    /// assertion.
+    #[test]
+    fn test_edge_case_separator_positions() {
+        let cases = [
+            ("en-", true),
+            ("-en", false),
+            ("e-n", false),
+            ("en--US", true),
+            ("en_-US", true),
+            ("_en", false),
+            ("en_", true),
+        ];
+        for (code, should_find_en) in cases {
+            let lookup = get_stopwords(code);
+            if !should_find_en {
+                // Negative cases only make the call; nothing is checked about the result.
+                continue;
+            }
+            let stopwords = match lookup {
+                Some(words) => words,
+                None => panic!("Code '{}' should extract 'en'", code),
+            };
+            assert!(
+                stopwords.contains("the"),
+                "Code '{}' should return English stopwords",
+                code
+            );
+        }
+    }
+    /// Times 10,000 rounds of three lookups (`"en"`, `"es"`, `"de"`).
+    ///
+    /// The test fails if the 30,000 lookups take 500 ms or longer. One `"en"`
+    /// lookup is made before the timer starts and is not part of the measurement.
+    #[test]
+    fn test_performance_characteristics() {
+        use std::time::Instant;
+        // Untimed lookup ahead of the measured loop.
+        let _ = get_stopwords("en");
+        let started = Instant::now();
+        (0..10000).for_each(|_| {
+            let _ = get_stopwords("en");
+            let _ = get_stopwords("es");
+            let _ = get_stopwords("de");
+        });
+        let elapsed = started.elapsed();
+        let within_budget = elapsed.as_millis() < 500;
+        assert!(within_budget, "30,000 lookups took too long: {:?}", elapsed);
+    }
+    /// Confirms that every documented language code is actually available.
+    ///
+    /// The list must contain exactly 64 codes. Each code must be a key of
+    /// `STOPWORDS` and must also resolve through `get_stopwords`.
+    #[test]
+    fn test_language_completeness() {
+        let documented = [
+            "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fr",
+            "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt", "lv",
+            "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw", "ta",
+            "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
+        ];
+        // Guards against the list drifting from the documented count.
+        assert_eq!(documented.len(), 64, "Documentation lists 64 languages");
+        for lang in documented.iter().copied() {
+            let in_map = STOPWORDS.contains_key(lang);
+            assert!(in_map, "Documented language '{}' is missing from STOPWORDS", lang);
+            let resolvable = get_stopwords(lang).is_some();
+            assert!(
+                resolvable,
+                "Documented language '{}' not accessible via get_stopwords",
+                lang
+            );
+        }
+    }
+}