language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,337 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "languages/public/languages.h"
6
+
7
+ #include "base/string_util.h"
8
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
9
+
10
+
11
+ Language default_language() {return ENGLISH;}
12
+
13
+
14
+ // Language names and codes
15
+
16
// One row of the language table: a name plus up to three codes.
// Rows in this file store NULL for any code that is not available.
// Field order matters: the table rows are positional aggregate initializers.
struct LanguageInfo {
  const char* language_name_;        // name string for the language
  const char* language_code_639_1_;  // ISO-639-1 code for the language
  const char* language_code_639_2_;  // ISO-639-2 code for the language
  const char* language_code_other_;  // nonstandard code for the language
};
22
+
23
+ static const LanguageInfo kLanguageInfoTable[] = {
24
+ { "ENGLISH", "en", "eng", NULL},
25
+ { "DANISH", "da", "dan", NULL},
26
+ { "DUTCH", "nl", "dut", NULL},
27
+ { "FINNISH", "fi", "fin", NULL},
28
+ { "FRENCH", "fr", "fre", NULL},
29
+ { "GERMAN", "de", "ger", NULL},
30
+ { "HEBREW", "he", "heb", NULL},
31
+ { "ITALIAN", "it", "ita", NULL},
32
+ { "Japanese", "ja", "jpn", NULL},
33
+ { "Korean", "ko", "kor", NULL},
34
+ { "NORWEGIAN", "nb", "nor", NULL},
35
+ { "POLISH", "pl", "pol", NULL},
36
+ { "PORTUGUESE", "pt", "por", NULL},
37
+ { "RUSSIAN", "ru", "rus", NULL},
38
+ { "SPANISH", "es", "spa", NULL},
39
+ { "SWEDISH", "sv", "swe", NULL},
40
+ { "Chinese", "zh", "chi", "zh-CN"},
41
+ { "CZECH", "cs", "cze", NULL},
42
+ { "GREEK", "el", "gre", NULL},
43
+ { "ICELANDIC", "is", "ice", NULL},
44
+ { "LATVIAN", "lv", "lav", NULL},
45
+ { "LITHUANIAN", "lt", "lit", NULL},
46
+ { "ROMANIAN", "ro", "rum", NULL},
47
+ { "HUNGARIAN", "hu", "hun", NULL},
48
+ { "ESTONIAN", "et", "est", NULL},
49
+ // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
50
+ // and "Unknown", they are essentially the same. Need to unify them.
51
+ // "un" and "ut" are invented by us, not from ISO-639.
52
+ //
53
+ { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
54
+ { "Unknown", NULL, NULL, "un"},
55
+ { "BULGARIAN", "bg", "bul", NULL},
56
+ { "CROATIAN", "hr", "scr", NULL},
57
+ { "SERBIAN", "sr", "scc", NULL},
58
+ { "IRISH", "ga", "gle", NULL},
59
+ { "GALICIAN", "gl", "glg", NULL},
60
+ // Impossible to tell Tagalog from Filipino at the moment.
61
+ // Use ISO 639-2 code for Filipino here.
62
+ { "TAGALOG", NULL, "fil", NULL},
63
+ { "TURKISH", "tr", "tur", NULL},
64
+ { "UKRAINIAN", "uk", "ukr", NULL},
65
+ { "HINDI", "hi", "hin", NULL},
66
+ { "MACEDONIAN", "mk", "mac", NULL},
67
+ { "BENGALI", "bn", "ben", NULL},
68
+ { "INDONESIAN", "id", "ind", NULL},
69
+ { "LATIN", "la", "lat", NULL},
70
+ { "MALAY", "ms", "may", NULL},
71
+ { "MALAYALAM", "ml", "mal", NULL},
72
+ { "WELSH", "cy", "wel", NULL},
73
+ { "NEPALI", "ne", "nep", NULL},
74
+ { "TELUGU", "te", "tel", NULL},
75
+ { "ALBANIAN", "sq", "alb", NULL},
76
+ { "TAMIL", "ta", "tam", NULL},
77
+ { "BELARUSIAN", "be", "bel", NULL},
78
+ { "JAVANESE", "jw", "jav", NULL},
79
+ { "OCCITAN", "oc", "oci", NULL},
80
+ { "URDU", "ur", "urd", NULL},
81
+ { "BIHARI", "bh", "bih", NULL},
82
+ { "GUJARATI", "gu", "guj", NULL},
83
+ { "THAI", "th", "tha", NULL},
84
+ { "ARABIC", "ar", "ara", NULL},
85
+ { "CATALAN", "ca", "cat", NULL},
86
+ { "ESPERANTO", "eo", "epo", NULL},
87
+ { "BASQUE", "eu", "baq", NULL},
88
+ { "INTERLINGUA", "ia", "ina", NULL},
89
+ { "KANNADA", "kn", "kan", NULL},
90
+ { "PUNJABI", "pa", "pan", NULL},
91
+ { "SCOTS_GAELIC", "gd", "gla", NULL},
92
+ { "SWAHILI", "sw", "swa", NULL},
93
+ { "SLOVENIAN", "sl", "slv", NULL},
94
+ { "MARATHI", "mr", "mar", NULL},
95
+ { "MALTESE", "mt", "mlt", NULL},
96
+ { "VIETNAMESE", "vi", "vie", NULL},
97
+ { "FRISIAN", "fy", "fry", NULL},
98
+ { "SLOVAK", "sk", "slo", NULL},
99
+ { "ChineseT",
100
+ NULL, NULL, // We intentionally set these 2 fields to NULL to avoid
101
+ // confusion between CHINESE_T and CHINESE.
102
+ "zh-TW"},
103
+ { "FAROESE", "fo", "fao", NULL},
104
+ { "SUNDANESE", "su", "sun", NULL},
105
+ { "UZBEK", "uz", "uzb", NULL},
106
+ { "AMHARIC", "am", "amh", NULL},
107
+ { "AZERBAIJANI", "az", "aze", NULL},
108
+ { "GEORGIAN", "ka", "geo", NULL},
109
+ { "TIGRINYA", "ti", "tir", NULL},
110
+ { "PERSIAN", "fa", "per", NULL},
111
+ { "BOSNIAN", "bs", "bos", NULL},
112
+ { "SINHALESE", "si", "sin", NULL},
113
+ { "NORWEGIAN_N", "nn", "nno", NULL},
114
+ { "PORTUGUESE_P", NULL, NULL, "pt-PT"},
115
+ { "PORTUGUESE_B", NULL, NULL, "pt-BR"},
116
+ { "XHOSA", "xh", "xho", NULL},
117
+ { "ZULU", "zu", "zul", NULL},
118
+ { "GUARANI", "gn", "grn", NULL},
119
+ { "SESOTHO", "st", "sot", NULL},
120
+ { "TURKMEN", "tk", "tuk", NULL},
121
+ { "KYRGYZ", "ky", "kir", NULL},
122
+ { "BRETON", "br", "bre", NULL},
123
+ { "TWI", "tw", "twi", NULL},
124
+ { "YIDDISH", "yi", "yid", NULL},
125
+ { "SERBO_CROATIAN", "sh", NULL, NULL},
126
+ { "SOMALI", "so", "som", NULL},
127
+ { "UIGHUR", "ug", "uig", NULL},
128
+ { "KURDISH", "ku", "kur", NULL},
129
+ { "MONGOLIAN", "mn", "mon", NULL},
130
+ { "ARMENIAN", "hy", "arm", NULL},
131
+ { "LAOTHIAN", "lo", "lao", NULL},
132
+ { "SINDHI", "sd", "snd", NULL},
133
+ { "RHAETO_ROMANCE", "rm", "roh", NULL},
134
+ { "AFRIKAANS", "af", "afr", NULL},
135
+ { "LUXEMBOURGISH", "lb", "ltz", NULL},
136
+ { "BURMESE", "my", "bur", NULL},
137
+ // KHMER is known as Cambodian for Google user interfaces.
138
+ { "KHMER", "km", "khm", NULL},
139
+ { "TIBETAN", "bo", "tib", NULL},
140
+ { "DHIVEHI", "dv", "div", NULL},
141
+ { "CHEROKEE", NULL, "chr", NULL},
142
+ { "SYRIAC", NULL, "syr", NULL},
143
+ { "LIMBU", NULL, NULL, "sit-NP"},
144
+ { "ORIYA", "or", "ori", NULL},
145
+ { "ASSAMESE", "as", "asm", NULL},
146
+ { "CORSICAN", "co", "cos", NULL},
147
+ { "INTERLINGUE", "ie", "ine", NULL},
148
+ { "KAZAKH", "kk", "kaz", NULL},
149
+ { "LINGALA", "ln", "lin", NULL},
150
+ { "MOLDAVIAN", "mo", "mol", NULL},
151
+ { "PASHTO", "ps", "pus", NULL},
152
+ { "QUECHUA", "qu", "que", NULL},
153
+ { "SHONA", "sn", "sna", NULL},
154
+ { "TAJIK", "tg", "tgk", NULL},
155
+ { "TATAR", "tt", "tat", NULL},
156
+ { "TONGA", "to", "tog", NULL},
157
+ { "YORUBA", "yo", "yor", NULL},
158
+ { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
159
+ { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL},
160
+ { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
161
+ { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
162
+ { "MAORI", "mi", "mao", NULL},
163
+ { "WOLOF", "wo", "wol", NULL},
164
+ { "ABKHAZIAN", "ab", "abk", NULL},
165
+ { "AFAR", "aa", "aar", NULL},
166
+ { "AYMARA", "ay", "aym", NULL},
167
+ { "BASHKIR", "ba", "bak", NULL},
168
+ { "BISLAMA", "bi", "bis", NULL},
169
+ { "DZONGKHA", "dz", "dzo", NULL},
170
+ { "FIJIAN", "fj", "fij", NULL},
171
+ { "GREENLANDIC", "kl", "kal", NULL},
172
+ { "HAUSA", "ha", "hau", NULL},
173
+ { "HAITIAN_CREOLE", "ht", NULL, NULL},
174
+ { "INUPIAK", "ik", "ipk", NULL},
175
+ { "INUKTITUT", "iu", "iku", NULL},
176
+ { "KASHMIRI", "ks", "kas", NULL},
177
+ { "KINYARWANDA", "rw", "kin", NULL},
178
+ { "MALAGASY", "mg", "mlg", NULL},
179
+ { "NAURU", "na", "nau", NULL},
180
+ { "OROMO", "om", "orm", NULL},
181
+ { "RUNDI", "rn", "run", NULL},
182
+ { "SAMOAN", "sm", "smo", NULL},
183
+ { "SANGO", "sg", "sag", NULL},
184
+ { "SANSKRIT", "sa", "san", NULL},
185
+ { "SISWANT", "ss", "ssw", NULL},
186
+ { "TSONGA", "ts", "tso", NULL},
187
+ { "TSWANA", "tn", "tsn", NULL},
188
+ { "VOLAPUK", "vo", "vol", NULL},
189
+ { "ZHUANG", "za", "zha", NULL},
190
+ { "KHASI", NULL, "kha", NULL},
191
+ { "SCOTS", NULL, "sco", NULL},
192
+ { "GANDA", "lg", "lug", NULL},
193
+ { "MANX", "gv", "glv", NULL},
194
+ { "MONTENEGRIN", NULL, NULL, "sr-ME"},
195
+ { "XX", NULL, NULL, "XX"},
196
+ };
197
+
198
+ COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,
199
+ kLanguageInfoTable_has_incorrect_length);
200
+
201
+
202
+ // LANGUAGE NAMES
203
+
204
+ const char* default_language_name() {
205
+ return kLanguageInfoTable[ENGLISH].language_name_;
206
+ }
207
+
208
// Placeholder name handed back by LanguageName() for values that are not
// valid languages.
static const char* const kInvalidLanguageName = "invalid_language";

// Returns the placeholder name used for invalid languages.
const char* invalid_language_name() {
  const char* const placeholder = kInvalidLanguageName;
  return placeholder;
}
213
+
214
+ const char* LanguageName(Language lang) {
215
+ return IsValidLanguage(lang)
216
+ ? kLanguageInfoTable[lang].language_name_
217
+ : kInvalidLanguageName;
218
+ }
219
+
220
+
221
+
222
+ // LANGUAGE CODES
223
+
224
+
225
// Placeholder code for languages that are invalid or have no code.
// The leading space is deliberate: it keeps the placeholder from ever
// matching a two-letter language code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Returns the placeholder code used for invalid languages.
const char* invalid_language_code() {
  const char* const placeholder = kInvalidLanguageCode;
  return placeholder;
}
233
+
234
+ const char * LanguageCode(Language lang) {
235
+ if (! IsValidLanguage(lang))
236
+ return kInvalidLanguageCode;
237
+ const LanguageInfo& info = kLanguageInfoTable[lang];
238
+ if (info.language_code_639_1_) {
239
+ return info.language_code_639_1_;
240
+ } else if (info.language_code_639_2_) {
241
+ return info.language_code_639_2_;
242
+ } else if (info.language_code_other_) {
243
+ return info.language_code_other_;
244
+ } else {
245
+ return kInvalidLanguageCode;
246
+ }
247
+ }
248
+
249
+ const char* default_language_code() {
250
+ return kLanguageInfoTable[ENGLISH].language_code_639_1_;
251
+ }
252
+
253
+ const char* LanguageCodeISO639_1(Language lang) {
254
+ if (! IsValidLanguage(lang))
255
+ return kInvalidLanguageCode;
256
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_1_)
257
+ return code;
258
+ return kInvalidLanguageCode;
259
+ }
260
+
261
+ const char* LanguageCodeISO639_2(Language lang) {
262
+ if (! IsValidLanguage(lang))
263
+ return kInvalidLanguageCode;
264
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_2_)
265
+ return code;
266
+ return kInvalidLanguageCode;
267
+ }
268
+
269
+ const char* LanguageCodeWithDialects(Language lang) {
270
+ if (lang == CHINESE)
271
+ return "zh-CN";
272
+ return LanguageCode(lang);
273
+ }
274
+
275
+
276
+
277
+ bool LanguageFromCode(const char* lang_code, Language *language) {
278
+ *language = UNKNOWN_LANGUAGE;
279
+ if ( lang_code == NULL ) return false;
280
+
281
+ for ( int i = 0 ; i < kNumLanguages ; i++ ) {
282
+ const LanguageInfo& info = kLanguageInfoTable[i];
283
+ if ((info.language_code_639_1_ &&
284
+ !base::strcasecmp(lang_code, info.language_code_639_1_)) ||
285
+ (info.language_code_639_2_ &&
286
+ !base::strcasecmp(lang_code, info.language_code_639_2_)) ||
287
+ (info.language_code_other_ &&
288
+ !base::strcasecmp(lang_code, info.language_code_other_))) {
289
+ *language = static_cast<Language>(i);
290
+ return true;
291
+ }
292
+ }
293
+
294
+ // For convenience, this function can also parse the non-standard
295
+ // five-letter language codes "zh-cn" and "zh-tw" which are used by
296
+ // front-ends such as GWS to distinguish Simplified from Traditional
297
+ // Chinese.
298
+ if (!base::strcasecmp(lang_code, "zh-cn") ||
299
+ !base::strcasecmp(lang_code, "zh_cn")) {
300
+ *language = CHINESE;
301
+ return true;
302
+ }
303
+ if (!base::strcasecmp(lang_code, "zh-tw") ||
304
+ !base::strcasecmp(lang_code, "zh_tw")) {
305
+ *language = CHINESE_T;
306
+ return true;
307
+ }
308
+ if (!base::strcasecmp(lang_code, "sr-me") ||
309
+ !base::strcasecmp(lang_code, "sr_me")) {
310
+ *language = MONTENEGRIN;
311
+ return true;
312
+ }
313
+
314
+ // Process language-code synonyms.
315
+ if (!base::strcasecmp(lang_code, "he")) {
316
+ *language = HEBREW; // Use "iw".
317
+ return true;
318
+ }
319
+ if (!base::strcasecmp(lang_code, "in")) {
320
+ *language = INDONESIAN; // Use "id".
321
+ return true;
322
+ }
323
+ if (!base::strcasecmp(lang_code, "ji")) {
324
+ *language = YIDDISH; // Use "yi".
325
+ return true;
326
+ }
327
+
328
+ // Process language-detection synonyms.
329
+ // These distinct languages cannot be differentiated by our current
330
+ // language-detection algorithms.
331
+ if (!base::strcasecmp(lang_code, "fil")) {
332
+ *language = TAGALOG;
333
+ return true;
334
+ }
335
+
336
+ return false;
337
+ }
@@ -0,0 +1,179 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef LANGUAGES_PROTO_LANGUAGES_PB_H_
6
+ #define LANGUAGES_PROTO_LANGUAGES_PB_H_
7
+
8
+ enum Language {
9
+ ENGLISH = 0,
10
+ DANISH = 1,
11
+ DUTCH = 2,
12
+ FINNISH = 3,
13
+ FRENCH = 4,
14
+ GERMAN = 5,
15
+ HEBREW = 6,
16
+ ITALIAN = 7,
17
+ JAPANESE = 8,
18
+ KOREAN = 9,
19
+ NORWEGIAN = 10,
20
+ POLISH = 11,
21
+ PORTUGUESE = 12,
22
+ RUSSIAN = 13,
23
+ SPANISH = 14,
24
+ SWEDISH = 15,
25
+ CHINESE = 16,
26
+ CZECH = 17,
27
+ GREEK = 18,
28
+ ICELANDIC = 19,
29
+ LATVIAN = 20,
30
+ LITHUANIAN = 21,
31
+ ROMANIAN = 22,
32
+ HUNGARIAN = 23,
33
+ ESTONIAN = 24,
34
+ TG_UNKNOWN_LANGUAGE = 25,
35
+ UNKNOWN_LANGUAGE = 26,
36
+ BULGARIAN = 27,
37
+ CROATIAN = 28,
38
+ SERBIAN = 29,
39
+ IRISH = 30, // UI only.
40
+ GALICIAN = 31,
41
+ TAGALOG = 32, // Tagalog (tl) + Filipino (fil),
42
+ TURKISH = 33,
43
+ UKRAINIAN = 34,
44
+ HINDI = 35,
45
+ MACEDONIAN = 36,
46
+ BENGALI = 37,
47
+ INDONESIAN = 38,
48
+ LATIN = 39, // UI only.
49
+ MALAY = 40,
50
+ MALAYALAM = 41,
51
+ WELSH = 42, // UI only.
52
+ NEPALI = 43,
53
+ TELUGU = 44,
54
+ ALBANIAN = 45,
55
+ TAMIL = 46,
56
+ BELARUSIAN = 47,
57
+ JAVANESE = 48, // UI only.
58
+ OCCITAN = 49, // UI only.
59
+ URDU = 50,
60
+ BIHARI = 51,
61
+ GUJARATI = 52,
62
+ THAI = 53,
63
+ ARABIC = 54,
64
+ CATALAN = 55,
65
+ ESPERANTO = 56,
66
+ BASQUE = 57,
67
+ INTERLINGUA = 58, // UI only.
68
+ KANNADA = 59,
69
+ PUNJABI = 60,
70
+ SCOTS_GAELIC = 61, // UI only.
71
+ SWAHILI = 62,
72
+ SLOVENIAN = 63,
73
+ MARATHI = 64,
74
+ MALTESE = 65,
75
+ VIETNAMESE = 66,
76
+ FRISIAN = 67, // UI only.
77
+ SLOVAK = 68,
78
+ CHINESE_T = 69, // This is added to solve the problem of
79
+ // distinguishing Traditional and Simplified
80
+ // Chinese when the encoding is UTF8.
81
+ FAROESE = 70, // UI only.
82
+ SUNDANESE = 71, // UI only.
83
+ UZBEK = 72,
84
+ AMHARIC = 73,
85
+ AZERBAIJANI = 74,
86
+ GEORGIAN = 75,
87
+ TIGRINYA = 76, // UI only.
88
+ PERSIAN = 77,
89
+ BOSNIAN = 78, // UI only. LangId language: CROATIAN (28)
90
+ SINHALESE = 79,
91
+ NORWEGIAN_N = 80, // UI only. LangId language: NORWEGIAN (10)
92
+ PORTUGUESE_P = 81, // UI only. LangId language: PORTUGUESE (12)
93
+ PORTUGUESE_B = 82, // UI only. LangId language: PORTUGUESE (12)
94
+ XHOSA = 83, // UI only.
95
+ ZULU = 84, // UI only.
96
+ GUARANI = 85,
97
+ SESOTHO = 86, // UI only.
98
+ TURKMEN = 87, // UI only.
99
+ KYRGYZ = 88,
100
+ BRETON = 89, // UI only.
101
+ TWI = 90, // UI only.
102
+ YIDDISH = 91, // UI only.
103
+ SERBO_CROATIAN= 92, // UI only. LangId language: SERBIAN (29)
104
+ SOMALI = 93, // UI only.
105
+ UIGHUR = 94,
106
+ KURDISH = 95,
107
+ MONGOLIAN = 96,
108
+ ARMENIAN = 97,
109
+ LAOTHIAN = 98,
110
+ SINDHI = 99,
111
+ RHAETO_ROMANCE= 100, // UI only.
112
+ AFRIKAANS = 101,
113
+ LUXEMBOURGISH = 102, // UI only.
114
+ BURMESE = 103,
115
+ KHMER = 104,
116
+ TIBETAN = 105,
117
+ DHIVEHI = 106, // sometimes spelled Divehi, lang of Maldives
118
+ CHEROKEE = 107,
119
+ SYRIAC = 108, // UI only.
120
+ LIMBU = 109, // UI only.
121
+ ORIYA = 110,
122
+ ASSAMESE = 111, // UI only.
123
+ CORSICAN = 112, // UI only.
124
+ INTERLINGUE = 113, // UI only.
125
+ KAZAKH = 114,
126
+ LINGALA = 115, // UI only.
127
+ MOLDAVIAN = 116, // UI only. LangId language: ROMANIAN (22)
128
+ PASHTO = 117,
129
+ QUECHUA = 118, // UI only.
130
+ SHONA = 119, // UI only.
131
+ TAJIK = 120,
132
+ TATAR = 121, // UI only.
133
+ TONGA = 122, // UI only.
134
+ YORUBA = 123, // UI only.
135
+ CREOLES_AND_PIDGINS_ENGLISH_BASED = 124, // UI only.
136
+ CREOLES_AND_PIDGINS_FRENCH_BASED = 125, // UI only.
137
+ CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126, // UI only.
138
+ CREOLES_AND_PIDGINS_OTHER = 127, // UI only.
139
+ MAORI = 128, // UI only.
140
+ WOLOF = 129, // UI only.
141
+ ABKHAZIAN = 130, // UI only.
142
+ AFAR = 131, // UI only.
143
+ AYMARA = 132, // UI only.
144
+ BASHKIR = 133, // UI only.
145
+ BISLAMA = 134, // UI only.
146
+ DZONGKHA = 135, // UI only.
147
+ FIJIAN = 136, // UI only.
148
+ GREENLANDIC = 137, // UI only.
149
+ HAUSA = 138, // UI only.
150
+ HAITIAN_CREOLE= 139, // UI only.
151
+ INUPIAK = 140, // UI only.
152
+ INUKTITUT = 141,
153
+ KASHMIRI = 142, // UI only.
154
+ KINYARWANDA = 143, // UI only.
155
+ MALAGASY = 144, // UI only.
156
+ NAURU = 145, // UI only.
157
+ OROMO = 146, // UI only.
158
+ RUNDI = 147, // UI only.
159
+ SAMOAN = 148, // UI only.
160
+ SANGO = 149, // UI only.
161
+ SANSKRIT = 150,
162
+ SISWANT = 151, // UI only.
163
+ TSONGA = 152, // UI only.
164
+ TSWANA = 153, // UI only.
165
+ VOLAPUK = 154, // UI only.
166
+ ZHUANG = 155, // UI only.
167
+ KHASI = 156, // UI only.
168
+ SCOTS = 157, // UI only.
169
+ GANDA = 158, // UI only.
170
+ MANX = 159, // UI only.
171
+ MONTENEGRIN = 160, // UI only. LangId language: SERBIAN (29)
172
+ NUM_LANGUAGES = 161, // Always keep this at the end. It is not a
173
+ // valid Language enum. It is only used to
174
+ // indicate the total number of Languages.
175
+ // NOTE: If you add a language, you will break a unittest, and the table in
175
+ // languages.cc needs a matching row. (No note exists at the top of this enum.)
177
+ };
178
+
179
+ #endif // LANGUAGES_PROTO_LANGUAGES_PB_H_