whatlanguage 1.0.6 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: ccd0489249d639e17bda9ae90c496d33e2e6048a
4
- data.tar.gz: 83a969f3f186de199996e19bd6c6f391ba7cae24
2
+ SHA256:
3
+ metadata.gz: '0939c1109ab44fd32e9e611c20696ee315a980877c0f04d5990fe42f51c17201'
4
+ data.tar.gz: e5002ebde89b8c6acfa313355d2aa588b31cbb7db27870690bbeae761ecaeb2c
5
5
  SHA512:
6
- metadata.gz: a1fbd9f4745e74637c8eddf8a0c4ac74fed02493d00b011c8e3de80887000253b0d6e39260d54afed9cc1918270c3dad227c963c07591840ad4692ecd775682a
7
- data.tar.gz: 59a45845d1f19f073d0f1af57526fcedea5813126f93a7795fe4218842bcf6212d7169f6a34d04d060d587f24c172558c7548ae7d063345f7e5a4800c105f078
6
+ metadata.gz: 2a06fa18163cee6de9278291856c1b04c633880be1a2816476ce4b47b03c374242c9eb0ea012e1fb04322de753d45492a9ca96856ec70582ea1287fb19034c2e
7
+ data.tar.gz: e57243d9fccc9b5c43e5da591db26085d2eedd7776a4bc3e6dfd9b8ddcfcc59112de6309f63a41f0b2f38bc171bce2d80e638b2c3a8645230657aeb643f852ef
data/CHANGELOG.md ADDED
@@ -0,0 +1,41 @@
1
+ # Changelog
2
+
3
+ ## 2.0.0 / 2026-06-10
4
+
5
+ - Rewrote the detection engine from scratch. Bloom-filter dictionary lookups are
6
+ replaced by a script router plus character-trigram out-of-place scoring (a pure-Ruby
7
+ port of whatlang/Franc, public-domain UDHR-derived profiles).
8
+ - Expanded from 20 to 160+ languages across 20+ scripts.
9
+ - Removed the ~5 MB of binary `.lang` filter files and the wordlist corpus; the model
10
+ is now a single ~220 KB JSON file. No dependencies beyond the standard library.
11
+ - Requires Ruby 3.0 or newer.
12
+ - Added `#detect`, `#ranked`, and `#score_hash` for callers that need result metadata
13
+ or runner-up scores. `#scores` and `#process_text` remain as compatibility aliases.
14
+ - Added class-level convenience methods: `WhatLanguage.detect`, `.language`,
15
+ `.language_iso`, `.ranked`, `.score_hash`, and `.languages`.
16
+ - Added `only:` for configured detectors, replacing positional language
17
+ selection as the documented API.
18
+ - Removed the `String#language` and `String#language_iso` monkeypatch.
19
+ - Short shared-script fragments now return `nil` by default; pass `min_chars: 0`
20
+ to preserve the previous best-effort behavior.
21
+ - Removed internals: `BloominSimple`, `BitField`, and the filter-build scripts.
22
+
23
+ ## 1.0.6 / 2016-01-28
24
+
25
+ - Minor test fixes and tweaks
26
+ - New release taking into account a handful of pull requests
27
+
28
+ ## 1.0.5 / 2013-10-05
29
+
30
+ - Many more languages supported
31
+
32
+ ## 1.0.4 / 2013-03-07
33
+
34
+ ## 1.0.1 / 2008-08-22
35
+
36
+ - Public release
37
+ - Removed wordlists from distribution to reduce size
38
+
39
+ ## 1.0.0 / 2007-07-02
40
+
41
+ - First version with pre-built English, French, and Spanish filters
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source 'https://rubygems.org'
1
+ # frozen_string_literal: true
2
2
 
3
- # Specify your gem's dependencies in whatlanguage2.gemspec
3
+ source 'https://rubygems.org'
4
4
  gemspec
data/LICENSE.txt CHANGED
@@ -1,10 +1,43 @@
1
- Copyright (c) 2008-2013 Peter Cooper
2
-
3
1
  MIT License
4
2
 
3
+ Ruby code is Copyright (c) 2007-2026 Peter Cooper
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ 'Software'), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+
24
+ ---
25
+
26
+ lib/whatlanguage/trigrams.json is licensed as per the https://github.com/greyblake/whatlang-rs project.
27
+
28
+ Its license is as follows:
29
+
30
+ (The MIT License)
31
+
32
+ Copyright (c) 2017 Sergey Potapov <blake131313@gmail.com>
33
+ Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>
34
+ Copyright (c) 2008 Kent S Johnson
35
+ Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
36
+ Copyright (c) 2004 Maciej Ceglowski
37
+
5
38
  Permission is hereby granted, free of charge, to any person obtaining
6
39
  a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
40
+ 'Software'), to deal in the Software without restriction, including
8
41
  without limitation the rights to use, copy, modify, merge, publish,
9
42
  distribute, sublicense, and/or sell copies of the Software, and to
10
43
  permit persons to whom the Software is furnished to do so, subject to
@@ -13,10 +46,10 @@ the following conditions:
13
46
  The above copyright notice and this permission notice shall be
14
47
  included in all copies or substantial portions of the Software.
15
48
 
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
49
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
17
50
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
51
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
52
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
53
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
54
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
55
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -1,114 +1,73 @@
1
1
  # whatlanguage
2
2
 
3
- by Peter Cooper
3
+ Pure Ruby natural language detection for 160+ languages.
4
4
 
5
- Text language detection. Quick, fast, memory efficient, and all in pure Ruby. Uses Bloom filters for aforementioned speed and memory benefits. It works well on texts of over 10 words in length (e.g. blog posts or comments) and *very poorly* on short or Twitter-esque text, so be aware.
5
+ ```ruby
6
+ require 'whatlanguage'
6
7
 
7
- Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian, Arabic, Finnish, Greek, Hebrew, Hungarian, Korean, Norwegian, Polish and Spanish out of the box.
8
+ WhatLanguage.language("Que linguagem é essa? É uma pergunta sobre a língua portuguesa.")
9
+ # => :portuguese
10
+ ```
8
11
 
9
- ## Important note
12
+ - `gem install whatlanguage`
13
+ - No runtime dependencies.
14
+ - Supports 20+ writing systems.
15
+ - Ships a compact ~220 KB trigram model.
16
+ - Requires Ruby 3.0+
17
+ - Best on sentence-length text or longer.
10
18
 
11
- This library was first built in 2007 and has received only a few minor updates over the years. There are now more efficient and effective algorithms for doing language detection which I am investigating for a future WhatLanguage.
19
+ ## How it works
12
20
 
13
- This library has been updated to be distributed and to work on modern Ruby implementations but other than that, has had no significant improvements.
21
+ Detection runs in two stages. First, the dominant Unicode script is detected; scripts used by a single language (Greek, Korean, Thai, Japanese using Hiragana/Katakana) resolve immediately. For scripts shared by several languages (e.g. Latin, Cyrillic, Arabic, Hebrew), the text's trigrams are ranked by frequency and compared against the candidate language profiles.
14
22
 
15
- ## Synopsis
23
+ The trigram profiles are vendored from [whatlang](https://github.com/greyblake/whatlang-rs), a port of [Franc](https://github.com/wooorm/franc), whose models are built from the public-domain UDHR corpus (see Credits). The model is a ~220 KB JSON file.
16
24
 
17
- Full Example
25
+ > [!IMPORTANT]
26
+ > v2.0 has many breaking changes because the entire library has been rewritten, though the core `WhatLanguage.language` API remains similar. Versions 1.0.6 and earlier (the 2007-2025 run of the library) used a Bloom-filter technique and needed ~5 MB of binary files to handle ~20 languages. Version 2.0 is more accurate, faster, and supports more languages from a single ~220 KB JSON file :-)
18
27
 
19
- ```ruby
20
- require 'whatlanguage/string'
21
-
22
- texts = []
23
- texts << %q{Deux autres personnes ont été arrêtées durant la nuit}
24
- texts << %q{The links between the attempted car bombings in Glasgow and London are becoming clearer}
25
- texts << %q{En estado de máxima alertaen su nivel de crítico}
26
- texts << %q{Returns the object in enum with the maximum value.}
27
- texts << %q{Propose des données au sujet de la langue espagnole.}
28
- texts << %q{La palabra "mezquita" se usa en español para referirse a todo tipo de edificios dedicados.}
29
- texts << %q{اللغة التي هي هذه؟}
30
- texts << %q{Mitä kieltä tämä on?}
31
- texts << %q{Ποια γλώσσα είναι αυτή;}
32
- texts << %q{באיזו שפה זה?}
33
- texts << %q{Milyen nyelv ez?}
34
- texts << %q{이 어떤 언어인가?}
35
- texts << %q{Hvilket språk er dette?}
36
- texts << %q{W jakim języku to jest?}
37
-
38
- texts.each { |text| puts "#{text[0..18]}... is in #{text.language.to_s.capitalize}" }
39
- ```
28
+ ## Usage
40
29
 
41
- Initialize WhatLanguage with all filters
30
+ Return a full detection result:
42
31
 
43
32
  ```ruby
44
- wl = WhatLanguage.new(:all)
33
+ wl = WhatLanguage.new
34
+ text = "Die Stadt plant neue Investitionen in den öffentlichen Verkehr"
35
+ result = wl.detect(text)
36
+ result.language # => :german
37
+ result.iso # => :de
38
+ result.score # => 79018
39
+ result.ranked # => [[:german, 79018], [:dutch, 77631], ... ]
45
40
  ```
46
41
 
47
- Return language with best score
42
+ Return ranked scores, or the raw score hash:
48
43
 
49
44
  ```ruby
50
- wl.language(text)
45
+ wl.ranked(text) # => [[:german, 79018], [:dutch, 77631], ... ]
46
+ wl.score_hash(text) # => { german: 79018, dutch: 77631, ... }
51
47
  ```
52
48
 
53
- Return hash with scores for all relevant languages
49
+ Restrict candidate languages:
54
50
 
55
51
  ```ruby
56
- wl.process_text(text)
52
+ wl = WhatLanguage.new(only: [:english, :german, :french])
57
53
  ```
58
54
 
59
- Convenience methods on String
55
+ Short shared-script fragments (for example, Latin-script text) return `nil` by default because there is not enough signal to rank shared-script languages reliably. The threshold applies to the statistical trigram stage; scripts that identify a single supported language, such as Greek, Korean, or Thai, can still resolve from shorter text. The threshold can be adjusted:
60
56
 
61
57
  ```ruby
62
- "This is a test".language # => :english
63
- "This is a test".language_iso # => :en
58
+ wl = WhatLanguage.new(min_chars: 0)
64
59
  ```
65
60
 
66
- Initialize WhatLanguage with certain languages
67
-
68
- ```ruby
69
- wl = WhatLanguage.new(:english, :german, :french)
70
- ```
71
-
72
- ## Requirements
73
-
74
- None, minor libraries (BloominSimple and BitField) included with this release.
75
-
76
- ## Installation
77
-
78
- gem install whatlanguage
79
-
80
- To test, go into irb, then:
81
-
82
- ```ruby
83
- require 'whatlanguage'
84
- "Je suis un homme".language
85
- ```
86
-
87
- ## Credits
88
-
89
- Contributions from Konrad Reiche, Salimane Adjao Moustapha, and others appreciated.
90
-
91
- ## License
92
-
93
- MIT License
61
+ ## Known limitations
94
62
 
95
- Copyright (c) 2007-2016 Peter Cooper
63
+ - Short fragments are unreliable. For languages resolved by statistical comparison, text with fewer than 10 significant characters returns `nil` by default.
64
+ - Scores are relative ranking values, not probabilities. Use `#ranked` or `#detect.ranked` when close runners-up matter.
65
+ - Closely related written languages can be hard to separate, especially Norwegian Bokmål/Danish, Hebrew/Yiddish, and similar language pairs.
66
+ - Kanji-only Japanese text can classify as Chinese because Han characters alone do not identify the language.
67
+ - Romanized text is classified by Latin-script trigram profiles; it is not treated as native-script text.
96
68
 
97
- Permission is hereby granted, free of charge, to any person obtaining
98
- a copy of this software and associated documentation files (the
99
- 'Software'), to deal in the Software without restriction, including
100
- without limitation the rights to use, copy, modify, merge, publish,
101
- distribute, sublicense, and/or sell copies of the Software, and to
102
- permit persons to whom the Software is furnished to do so, subject to
103
- the following conditions:
69
+ ## Credits
104
70
 
105
- The above copyright notice and this permission notice shall be
106
- included in all copies or substantial portions of the Software.
71
+ Contributions from Konrad Reiche, Salimane Adjao Moustapha, Andrew Cone, Lasse Skindstad Ebert, Henrik Nyh, Daniel Sandbecker, Michael Hartl, Pedro Lambert, Tobias Preuss, Pepijn Looije, and others appreciated.
107
72
 
108
- THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
109
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
110
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
111
- IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
112
- CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
113
- TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
114
- SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
73
+ The trigram language profiles in `lib/whatlanguage/trigrams.json` are taken from [whatlang](https://github.com/greyblake/whatlang-rs) (MIT, © Sergey Potapov), itself a derivative of [Franc](https://github.com/wooorm/franc) (MIT, © Titus Wormer). Those profiles are derived from the public-domain Universal Declaration of Human Rights translations.
data/Rakefile CHANGED
@@ -1,6 +1,12 @@
1
- require "bundler/gem_tasks"
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
2
4
  require 'rake/testtask'
3
5
 
4
- Rake::TestTask.new
6
+ Rake::TestTask.new do |t|
7
+ t.libs << 'lib' << 'test'
8
+ t.pattern = 'test/**/*_test.rb'
9
+ t.warning = false
10
+ end
5
11
 
6
- task :default => :test
12
+ task default: :test
@@ -0,0 +1,180 @@
1
+ # frozen_string_literal: true
2
+
3
+ # AUTO-GENERATED from the ISO 639-3 registry + whatlang dataset codes.
4
+ # Maps ISO 639-3 code => [language name symbol, ISO 639-1 (or 639-3 fallback) symbol].
5
+ # Original gem languages keep their historical symbols and 2-letter codes.
6
+ class WhatLanguage
7
+ CODE_INFO = {
8
+ "ace" => [:achinese, :ace],
9
+ "ada" => [:adangme, :ada],
10
+ "afr" => [:afrikaans, :af],
11
+ "aka" => [:akan, :ak],
12
+ "amh" => [:amharic, :am],
13
+ "ara" => [:arabic, :ar],
14
+ "hye" => [:armenian, :hy],
15
+ "quy" => [:ayacucho_quechua, :quy],
16
+ "aze" => [:azerbaijani, :az],
17
+ "ban" => [:balinese, :ban],
18
+ "rmn" => [:balkan_romani, :rmn],
19
+ "bam" => [:bambara, :bm],
20
+ "bci" => [:baoule, :bci],
21
+ "bel" => [:belarusian, :be],
22
+ "bem" => [:bemba, :bem],
23
+ "ben" => [:bengali, :bn],
24
+ "bho" => [:bhojpuri, :bho],
25
+ "bin" => [:bini, :bin],
26
+ "bos" => [:bosnian, :bs],
27
+ "bug" => [:buginese, :bug],
28
+ "bul" => [:bulgarian, :bg],
29
+ "mya" => [:burmese, :my],
30
+ "cat" => [:catalan, :ca],
31
+ "ceb" => [:cebuano, :ceb],
32
+ "tzm" => [:central_atlas_tamazight, :tzm],
33
+ "ayr" => [:central_aymara, :ayr],
34
+ "bcl" => [:central_bikol, :bcl],
35
+ "knc" => [:central_kanuri, :knc],
36
+ "nhn" => [:central_nahuatl, :nhn],
37
+ "nya" => [:chichewa, :ny],
38
+ "qug" => [:chimborazo_highland_quichua, :qug],
39
+ "cmn" => [:chinese, :zh],
40
+ "cjk" => [:chokwe, :cjk],
41
+ "hrv" => [:croatian, :hr],
42
+ "quz" => [:cusco_quechua, :quz],
43
+ "ces" => [:czech, :cs],
44
+ "dan" => [:danish, :da],
45
+ "nld" => [:dutch, :nl],
46
+ "dyu" => [:dyula, :dyu],
47
+ "emk" => [:eastern_maninkakan, :emk],
48
+ "eng" => [:english, :en],
49
+ "epo" => [:esperanto, :eo],
50
+ "est" => [:estonian, :et],
51
+ "ewe" => [:ewe, :ee],
52
+ "pes" => [:farsi, :fa],
53
+ "fin" => [:finnish, :fi],
54
+ "fon" => [:fon, :fon],
55
+ "fra" => [:french, :fr],
56
+ "gaa" => [:ga, :gaa],
57
+ "glg" => [:galician, :gl],
58
+ "lug" => [:ganda, :lg],
59
+ "kat" => [:georgian, :ka],
60
+ "deu" => [:german, :de],
61
+ "ell" => [:greek, :el],
62
+ "guj" => [:gujarati, :gu],
63
+ "hat" => [:haitian, :ht],
64
+ "khk" => [:halh_mongolian, :khk],
65
+ "hau" => [:hausa, :ha],
66
+ "heb" => [:hebrew, :he],
67
+ "hil" => [:hiligaynon, :hil],
68
+ "hin" => [:hindi, :hi],
69
+ "hun" => [:hungarian, :hu],
70
+ "ibb" => [:ibibio, :ibb],
71
+ "ibo" => [:igbo, :ig],
72
+ "ilo" => [:iloko, :ilo],
73
+ "ind" => [:indonesian, :id],
74
+ "ita" => [:italian, :it],
75
+ "jpn" => [:japanese, :ja],
76
+ "jav" => [:javanese, :jv],
77
+ "kbp" => [:kabiye, :kbp],
78
+ "kan" => [:kannada, :kn],
79
+ "kaz" => [:kazakh, :kk],
80
+ "khm" => [:khmer, :km],
81
+ "kmb" => [:kimbundu, :kmb],
82
+ "kin" => [:kinyarwanda, :rw],
83
+ "kir" => [:kirghiz, :ky],
84
+ "koi" => [:komi_permyak, :koi],
85
+ "kng" => [:koongo, :kng],
86
+ "kor" => [:korean, :ko],
87
+ "kur" => [:kurdish, :ku],
88
+ "lat" => [:latin, :la],
89
+ "lav" => [:latvian, :lv],
90
+ "lin" => [:lingala, :ln],
91
+ "lit" => [:lithuanian, :lt],
92
+ "src" => [:logudorese_sardinian, :src],
93
+ "nds" => [:low_german, :nds],
94
+ "lua" => [:luba_lulua, :lua],
95
+ "lun" => [:lunda, :lun],
96
+ "mkd" => [:macedonian, :mk],
97
+ "mad" => [:madurese, :mad],
98
+ "mag" => [:magahi, :mag],
99
+ "mai" => [:maithili, :mai],
100
+ "vmw" => [:makhuwa, :vmw],
101
+ "kde" => [:makonde, :kde],
102
+ "mlg" => [:malagasy, :mg],
103
+ "mal" => [:malayalam, :ml],
104
+ "mar" => [:marathi, :mr],
105
+ "men" => [:mende, :men],
106
+ "min" => [:minangkabau, :min],
107
+ "mos" => [:mossi, :mos],
108
+ "ndo" => [:ndonga, :ng],
109
+ "nep" => [:nepali, :ne],
110
+ "dip" => [:northeastern_dinka, :dip],
111
+ "uzn" => [:northern_uzbek, :uzn],
112
+ "nob" => [:norwegian, :no],
113
+ "nno" => [:norwegian_nynorsk, :nn],
114
+ "nyn" => [:nyankole, :nyn],
115
+ "ori" => [:oriya, :or],
116
+ "orm" => [:oromo, :om],
117
+ "pam" => [:pampanga, :pam],
118
+ "pan" => [:panjabi, :pa],
119
+ "nso" => [:pedi, :nso],
120
+ "pol" => [:polish, :pl],
121
+ "por" => [:portuguese, :pt],
122
+ "ron" => [:romanian, :ro],
123
+ "run" => [:rundi, :rn],
124
+ "rus" => [:russian, :ru],
125
+ "sag" => [:sango, :sg],
126
+ "skr" => [:saraiki, :skr],
127
+ "srp" => [:serbian, :sr],
128
+ "sna" => [:shona, :sn],
129
+ "sin" => [:sinhala, :si],
130
+ "snn" => [:siona, :snn],
131
+ "slk" => [:slovak, :sk],
132
+ "slv" => [:slovenian, :sl],
133
+ "som" => [:somali, :so],
134
+ "snk" => [:soninke, :snk],
135
+ "hms" => [:southern_qiandong_miao, :hms],
136
+ "sot" => [:southern_sotho, :st],
137
+ "spa" => [:spanish, :es],
138
+ "suk" => [:sukuma, :suk],
139
+ "sun" => [:sundanese, :su],
140
+ "swh" => [:swahili, :swh],
141
+ "ssw" => [:swati, :ss],
142
+ "swe" => [:swedish, :sv],
143
+ "tgl" => [:tagalog, :tl],
144
+ "tgk" => [:tajik, :tg],
145
+ "tam" => [:tamil, :ta],
146
+ "tat" => [:tatar, :tt],
147
+ "tel" => [:telugu, :te],
148
+ "tha" => [:thai, :th],
149
+ "tir" => [:tigrinya, :ti],
150
+ "tem" => [:timne, :tem],
151
+ "tiv" => [:tiv, :tiv],
152
+ "tpi" => [:tok_pisin, :tpi],
153
+ "toi" => [:tonga, :toi],
154
+ "als" => [:tosk_albanian, :als],
155
+ "tso" => [:tsonga, :ts],
156
+ "tsn" => [:tswana, :tn],
157
+ "tur" => [:turkish, :tr],
158
+ "tuk" => [:turkmen, :tk],
159
+ "uig" => [:uighur, :ug],
160
+ "ukr" => [:ukrainian, :uk],
161
+ "umb" => [:umbundu, :umb],
162
+ "urd" => [:urdu, :ur],
163
+ "uzb" => [:uzbek, :uz],
164
+ "vie" => [:vietnamese, :vi],
165
+ "rmy" => [:vlax_romani, :rmy],
166
+ "war" => [:waray, :war],
167
+ "cym" => [:welsh, :cy],
168
+ "wol" => [:wolof, :wo],
169
+ "xho" => [:xhosa, :xh],
170
+ "yao" => [:yao, :yao],
171
+ "yid" => [:yiddish, :yi],
172
+ "yor" => [:yoruba, :yo],
173
+ "zul" => [:zulu, :zu],
174
+ }.freeze
175
+
176
+ # Convenience aliases for legacy selection symbols that no longer map 1:1.
177
+ NAME_ALIASES = { pinyin: :chinese, mandarin: :chinese }.freeze
178
+
179
+ private_constant :CODE_INFO, :NAME_ALIASES
180
+ end