language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,337 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "languages/public/languages.h"
|
6
|
+
|
7
|
+
#include "base/string_util.h"
|
8
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
9
|
+
|
10
|
+
|
11
|
+
// Returns the default Language value, which is ENGLISH.
Language default_language() {
  return ENGLISH;
}
|
12
|
+
|
13
|
+
|
14
|
+
// Language names and codes
|
15
|
+
|
16
|
+
// One row of the language table: a display name plus up to three
// alternative codes.  Any of the code fields may be NULL when the language
// has no code of that kind.
struct LanguageInfo {
  // Name of the language.
  const char* language_name_;
  // ISO-639-1 code for the language.
  const char* language_code_639_1_;
  // ISO-639-2 code for the language.
  const char* language_code_639_2_;
  // Some nonstandard code for the language.
  const char* language_code_other_;
};
|
22
|
+
|
23
|
+
// Names and codes for every Language, indexed by the numeric value of the
// Language enum (the lookup functions below use kLanguageInfoTable[lang]).
// The entry order must therefore match the enum exactly.  The table holds
// one extra trailing "XX" entry at index NUM_LANGUAGES.
//
// Each row is { name, ISO-639-1 code, ISO-639-2 code, nonstandard code };
// a NULL field means the language has no code of that kind.
static const LanguageInfo kLanguageInfoTable[] = {
  { "ENGLISH",      "en", "eng", NULL},
  { "DANISH",       "da", "dan", NULL},
  { "DUTCH",        "nl", "dut", NULL},
  { "FINNISH",      "fi", "fin", NULL},
  { "FRENCH",       "fr", "fre", NULL},
  { "GERMAN",       "de", "ger", NULL},
  { "HEBREW",       "he", "heb", NULL},
  { "ITALIAN",      "it", "ita", NULL},
  { "Japanese",     "ja", "jpn", NULL},
  { "Korean",       "ko", "kor", NULL},
  { "NORWEGIAN",    "nb", "nor", NULL},
  { "POLISH",       "pl", "pol", NULL},
  { "PORTUGUESE",   "pt", "por", NULL},
  { "RUSSIAN",      "ru", "rus", NULL},
  { "SPANISH",      "es", "spa", NULL},
  { "SWEDISH",      "sv", "swe", NULL},
  { "Chinese",      "zh", "chi", "zh-CN"},
  { "CZECH",        "cs", "cze", NULL},
  { "GREEK",        "el", "gre", NULL},
  { "ICELANDIC",    "is", "ice", NULL},
  { "LATVIAN",      "lv", "lav", NULL},
  { "LITHUANIAN",   "lt", "lit", NULL},
  { "ROMANIAN",     "ro", "rum", NULL},
  { "HUNGARIAN",    "hu", "hun", NULL},
  { "ESTONIAN",     "et", "est", NULL},
  // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
  // and "Unknown", they are essentially the same. Need to unify them.
  // "un" and "ut" are invented by us, not from ISO-639.
  //
  { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
  { "Unknown",      NULL, NULL, "un"},
  { "BULGARIAN",    "bg", "bul", NULL},
  { "CROATIAN",     "hr", "scr", NULL},
  { "SERBIAN",      "sr", "scc", NULL},
  { "IRISH",        "ga", "gle", NULL},
  { "GALICIAN",     "gl", "glg", NULL},
  // Impossible to tell Tagalog from Filipino at the moment.
  // Use ISO 639-2 code for Filipino here.
  { "TAGALOG",      NULL, "fil", NULL},
  { "TURKISH",      "tr", "tur", NULL},
  { "UKRAINIAN",    "uk", "ukr", NULL},
  { "HINDI",        "hi", "hin", NULL},
  { "MACEDONIAN",   "mk", "mac", NULL},
  { "BENGALI",      "bn", "ben", NULL},
  { "INDONESIAN",   "id", "ind", NULL},
  { "LATIN",        "la", "lat", NULL},
  { "MALAY",        "ms", "may", NULL},
  { "MALAYALAM",    "ml", "mal", NULL},
  { "WELSH",        "cy", "wel", NULL},
  { "NEPALI",       "ne", "nep", NULL},
  { "TELUGU",       "te", "tel", NULL},
  { "ALBANIAN",     "sq", "alb", NULL},
  { "TAMIL",        "ta", "tam", NULL},
  { "BELARUSIAN",   "be", "bel", NULL},
  { "JAVANESE",     "jw", "jav", NULL},
  { "OCCITAN",      "oc", "oci", NULL},
  { "URDU",         "ur", "urd", NULL},
  { "BIHARI",       "bh", "bih", NULL},
  { "GUJARATI",     "gu", "guj", NULL},
  { "THAI",         "th", "tha", NULL},
  { "ARABIC",       "ar", "ara", NULL},
  { "CATALAN",      "ca", "cat", NULL},
  { "ESPERANTO",    "eo", "epo", NULL},
  { "BASQUE",       "eu", "baq", NULL},
  { "INTERLINGUA",  "ia", "ina", NULL},
  { "KANNADA",      "kn", "kan", NULL},
  { "PUNJABI",      "pa", "pan", NULL},
  { "SCOTS_GAELIC", "gd", "gla", NULL},
  { "SWAHILI",      "sw", "swa", NULL},
  { "SLOVENIAN",    "sl", "slv", NULL},
  { "MARATHI",      "mr", "mar", NULL},
  { "MALTESE",      "mt", "mlt", NULL},
  { "VIETNAMESE",   "vi", "vie", NULL},
  { "FRISIAN",      "fy", "fry", NULL},
  { "SLOVAK",       "sk", "slo", NULL},
  { "ChineseT",
    NULL, NULL,  // We intentionally set these 2 fields to NULL to avoid
                 // confusion between CHINESE_T and CHINESE.
    "zh-TW"},
  { "FAROESE",      "fo", "fao", NULL},
  { "SUNDANESE",    "su", "sun", NULL},
  { "UZBEK",        "uz", "uzb", NULL},
  { "AMHARIC",      "am", "amh", NULL},
  { "AZERBAIJANI",  "az", "aze", NULL},
  { "GEORGIAN",     "ka", "geo", NULL},
  { "TIGRINYA",     "ti", "tir", NULL},
  { "PERSIAN",      "fa", "per", NULL},
  { "BOSNIAN",      "bs", "bos", NULL},
  { "SINHALESE",    "si", "sin", NULL},
  { "NORWEGIAN_N",  "nn", "nno", NULL},
  { "PORTUGUESE_P", NULL, NULL, "pt-PT"},
  { "PORTUGUESE_B", NULL, NULL, "pt-BR"},
  { "XHOSA",        "xh", "xho", NULL},
  { "ZULU",         "zu", "zul", NULL},
  { "GUARANI",      "gn", "grn", NULL},
  { "SESOTHO",      "st", "sot", NULL},
  { "TURKMEN",      "tk", "tuk", NULL},
  { "KYRGYZ",       "ky", "kir", NULL},
  { "BRETON",       "br", "bre", NULL},
  { "TWI",          "tw", "twi", NULL},
  { "YIDDISH",      "yi", "yid", NULL},
  { "SERBO_CROATIAN", "sh", NULL, NULL},
  { "SOMALI",       "so", "som", NULL},
  { "UIGHUR",       "ug", "uig", NULL},
  { "KURDISH",      "ku", "kur", NULL},
  { "MONGOLIAN",    "mn", "mon", NULL},
  { "ARMENIAN",     "hy", "arm", NULL},
  { "LAOTHIAN",     "lo", "lao", NULL},
  { "SINDHI",       "sd", "snd", NULL},
  { "RHAETO_ROMANCE", "rm", "roh", NULL},
  { "AFRIKAANS",    "af", "afr", NULL},
  { "LUXEMBOURGISH", "lb", "ltz", NULL},
  { "BURMESE",      "my", "bur", NULL},
  // KHMER is known as Cambodian for Google user interfaces.
  { "KHMER",        "km", "khm", NULL},
  { "TIBETAN",      "bo", "tib", NULL},
  { "DHIVEHI",      "dv", "div", NULL},
  { "CHEROKEE",     NULL, "chr", NULL},
  { "SYRIAC",       NULL, "syr", NULL},
  { "LIMBU",        NULL, NULL, "sit-NP"},
  { "ORIYA",        "or", "ori", NULL},
  { "ASSAMESE",     "as", "asm", NULL},
  { "CORSICAN",     "co", "cos", NULL},
  // ISO 639-2 "ile" is Interlingue; "ine" (used here previously) is the
  // collective code for Indo-European languages.
  { "INTERLINGUE",  "ie", "ile", NULL},
  { "KAZAKH",       "kk", "kaz", NULL},
  { "LINGALA",      "ln", "lin", NULL},
  { "MOLDAVIAN",    "mo", "mol", NULL},
  { "PASHTO",       "ps", "pus", NULL},
  { "QUECHUA",      "qu", "que", NULL},
  { "SHONA",        "sn", "sna", NULL},
  { "TAJIK",        "tg", "tgk", NULL},
  { "TATAR",        "tt", "tat", NULL},
  // ISO 639-1 "to" is Tonga (Tonga Islands), whose ISO 639-2 code is "ton";
  // "tog" (used here previously) is the unrelated Tonga (Nyasa).
  { "TONGA",        "to", "ton", NULL},
  { "YORUBA",       "yo", "yor", NULL},
  { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
  { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL},
  { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
  { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
  { "MAORI",        "mi", "mao", NULL},
  { "WOLOF",        "wo", "wol", NULL},
  { "ABKHAZIAN",    "ab", "abk", NULL},
  { "AFAR",         "aa", "aar", NULL},
  { "AYMARA",       "ay", "aym", NULL},
  { "BASHKIR",      "ba", "bak", NULL},
  { "BISLAMA",      "bi", "bis", NULL},
  { "DZONGKHA",     "dz", "dzo", NULL},
  { "FIJIAN",       "fj", "fij", NULL},
  { "GREENLANDIC",  "kl", "kal", NULL},
  { "HAUSA",        "ha", "hau", NULL},
  { "HAITIAN_CREOLE", "ht", NULL, NULL},
  { "INUPIAK",      "ik", "ipk", NULL},
  { "INUKTITUT",    "iu", "iku", NULL},
  { "KASHMIRI",     "ks", "kas", NULL},
  { "KINYARWANDA",  "rw", "kin", NULL},
  { "MALAGASY",     "mg", "mlg", NULL},
  { "NAURU",        "na", "nau", NULL},
  { "OROMO",        "om", "orm", NULL},
  { "RUNDI",        "rn", "run", NULL},
  { "SAMOAN",       "sm", "smo", NULL},
  { "SANGO",        "sg", "sag", NULL},
  { "SANSKRIT",     "sa", "san", NULL},
  { "SISWANT",      "ss", "ssw", NULL},
  { "TSONGA",       "ts", "tso", NULL},
  { "TSWANA",       "tn", "tsn", NULL},
  { "VOLAPUK",      "vo", "vol", NULL},
  { "ZHUANG",       "za", "zha", NULL},
  { "KHASI",        NULL, "kha", NULL},
  { "SCOTS",        NULL, "sco", NULL},
  { "GANDA",        "lg", "lug", NULL},
  { "MANX",         "gv", "glv", NULL},
  { "MONTENEGRIN",  NULL, NULL, "sr-ME"},
  { "XX",           NULL, NULL, "XX"},
};
|
197
|
+
|
198
|
+
// The table must hold one entry per Language value plus one extra entry
// (the trailing "XX" row).
COMPILE_ASSERT(NUM_LANGUAGES + 1 == arraysize(kLanguageInfoTable),
               kLanguageInfoTable_has_incorrect_length);
|
200
|
+
|
201
|
+
|
202
|
+
// LANGUAGE NAMES
|
203
|
+
|
204
|
+
const char* default_language_name() {
|
205
|
+
return kLanguageInfoTable[ENGLISH].language_name_;
|
206
|
+
}
|
207
|
+
|
208
|
+
// Name reported for a Language value that fails IsValidLanguage().
static const char* const kInvalidLanguageName = "invalid_language";

// Returns the placeholder name used for invalid languages.
const char *invalid_language_name() {
  const char* const placeholder = kInvalidLanguageName;
  return placeholder;
}
|
213
|
+
|
214
|
+
// Returns the table name for |lang|, or kInvalidLanguageName when |lang|
// does not pass IsValidLanguage().
const char* LanguageName(Language lang) {
  if (!IsValidLanguage(lang)) {
    return kInvalidLanguageName;
  }
  return kLanguageInfoTable[lang].language_name_;
}
|
219
|
+
|
220
|
+
|
221
|
+
|
222
|
+
// LANGUAGE CODES
|
223
|
+
|
224
|
+
|
225
|
+
// Code reported when a language has no usable code.  The leading space is
// deliberate: it keeps the string from matching any two-letter language
// code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Returns the placeholder code used for invalid languages.
const char *invalid_language_code() {
  const char* const placeholder = kInvalidLanguageCode;
  return placeholder;
}
|
233
|
+
|
234
|
+
// Returns the preferred code for |lang|: the ISO-639-1 code if the table
// has one, otherwise the ISO-639-2 code, otherwise the nonstandard code.
// Returns kInvalidLanguageCode when |lang| is not valid or the entry has
// no code at all.
const char * LanguageCode(Language lang) {
  if (!IsValidLanguage(lang)) {
    return kInvalidLanguageCode;
  }

  const LanguageInfo& entry = kLanguageInfoTable[lang];
  if (entry.language_code_639_1_ != NULL) {
    return entry.language_code_639_1_;
  }
  if (entry.language_code_639_2_ != NULL) {
    return entry.language_code_639_2_;
  }
  if (entry.language_code_other_ != NULL) {
    return entry.language_code_other_;
  }
  return kInvalidLanguageCode;
}
|
248
|
+
|
249
|
+
const char* default_language_code() {
|
250
|
+
return kLanguageInfoTable[ENGLISH].language_code_639_1_;
|
251
|
+
}
|
252
|
+
|
253
|
+
// Returns the ISO-639-1 code for |lang|, or kInvalidLanguageCode when
// |lang| is not valid or its table entry has no ISO-639-1 code.
const char* LanguageCodeISO639_1(Language lang) {
  const char* iso_code = NULL;
  if (IsValidLanguage(lang)) {
    iso_code = kLanguageInfoTable[lang].language_code_639_1_;
  }
  return iso_code != NULL ? iso_code : kInvalidLanguageCode;
}
|
260
|
+
|
261
|
+
// Returns the ISO-639-2 code for |lang|, or kInvalidLanguageCode when
// |lang| is not valid or its table entry has no ISO-639-2 code.
const char* LanguageCodeISO639_2(Language lang) {
  const char* iso_code = NULL;
  if (IsValidLanguage(lang)) {
    iso_code = kLanguageInfoTable[lang].language_code_639_2_;
  }
  return iso_code != NULL ? iso_code : kInvalidLanguageCode;
}
|
268
|
+
|
269
|
+
// Same as LanguageCode(), except that CHINESE is reported with its
// dialect-qualified code "zh-CN".
const char* LanguageCodeWithDialects(Language lang) {
  return (lang == CHINESE) ? "zh-CN" : LanguageCode(lang);
}
|
274
|
+
|
275
|
+
|
276
|
+
|
277
|
+
bool LanguageFromCode(const char* lang_code, Language *language) {
|
278
|
+
*language = UNKNOWN_LANGUAGE;
|
279
|
+
if ( lang_code == NULL ) return false;
|
280
|
+
|
281
|
+
for ( int i = 0 ; i < kNumLanguages ; i++ ) {
|
282
|
+
const LanguageInfo& info = kLanguageInfoTable[i];
|
283
|
+
if ((info.language_code_639_1_ &&
|
284
|
+
!base::strcasecmp(lang_code, info.language_code_639_1_)) ||
|
285
|
+
(info.language_code_639_2_ &&
|
286
|
+
!base::strcasecmp(lang_code, info.language_code_639_2_)) ||
|
287
|
+
(info.language_code_other_ &&
|
288
|
+
!base::strcasecmp(lang_code, info.language_code_other_))) {
|
289
|
+
*language = static_cast<Language>(i);
|
290
|
+
return true;
|
291
|
+
}
|
292
|
+
}
|
293
|
+
|
294
|
+
// For convenience, this function can also parse the non-standard
|
295
|
+
// five-letter language codes "zh-cn" and "zh-tw" which are used by
|
296
|
+
// front-ends such as GWS to distinguish Simplified from Traditional
|
297
|
+
// Chinese.
|
298
|
+
if (!base::strcasecmp(lang_code, "zh-cn") ||
|
299
|
+
!base::strcasecmp(lang_code, "zh_cn")) {
|
300
|
+
*language = CHINESE;
|
301
|
+
return true;
|
302
|
+
}
|
303
|
+
if (!base::strcasecmp(lang_code, "zh-tw") ||
|
304
|
+
!base::strcasecmp(lang_code, "zh_tw")) {
|
305
|
+
*language = CHINESE_T;
|
306
|
+
return true;
|
307
|
+
}
|
308
|
+
if (!base::strcasecmp(lang_code, "sr-me") ||
|
309
|
+
!base::strcasecmp(lang_code, "sr_me")) {
|
310
|
+
*language = MONTENEGRIN;
|
311
|
+
return true;
|
312
|
+
}
|
313
|
+
|
314
|
+
// Process language-code synonyms.
|
315
|
+
if (!base::strcasecmp(lang_code, "he")) {
|
316
|
+
*language = HEBREW; // Use "iw".
|
317
|
+
return true;
|
318
|
+
}
|
319
|
+
if (!base::strcasecmp(lang_code, "in")) {
|
320
|
+
*language = INDONESIAN; // Use "id".
|
321
|
+
return true;
|
322
|
+
}
|
323
|
+
if (!base::strcasecmp(lang_code, "ji")) {
|
324
|
+
*language = YIDDISH; // Use "yi".
|
325
|
+
return true;
|
326
|
+
}
|
327
|
+
|
328
|
+
// Process language-detection synonyms.
|
329
|
+
// These distinct languages cannot be differentiated by our current
|
330
|
+
// language-detection algorithms.
|
331
|
+
if (!base::strcasecmp(lang_code, "fil")) {
|
332
|
+
*language = TAGALOG;
|
333
|
+
return true;
|
334
|
+
}
|
335
|
+
|
336
|
+
return false;
|
337
|
+
}
|
@@ -0,0 +1,179 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef LANGUAGES_PROTO_LANGUAGES_PB_H_
|
6
|
+
#define LANGUAGES_PROTO_LANGUAGES_PB_H_
|
7
|
+
|
8
|
+
// Identifiers for every language known to the library.
//
// The numeric values are explicit and must stay stable.  Entries marked
// "UI only." carry that annotation from the original source; where a
// "LangId language" is given, it names the language reported by language
// identification for that entry.
enum Language {
  ENGLISH = 0,
  DANISH = 1,
  DUTCH = 2,
  FINNISH = 3,
  FRENCH = 4,
  GERMAN = 5,
  HEBREW = 6,
  ITALIAN = 7,
  JAPANESE = 8,
  KOREAN = 9,
  NORWEGIAN = 10,
  POLISH = 11,
  PORTUGUESE = 12,
  RUSSIAN = 13,
  SPANISH = 14,
  SWEDISH = 15,
  CHINESE = 16,
  CZECH = 17,
  GREEK = 18,
  ICELANDIC = 19,
  LATVIAN = 20,
  LITHUANIAN = 21,
  ROMANIAN = 22,
  HUNGARIAN = 23,
  ESTONIAN = 24,
  TG_UNKNOWN_LANGUAGE = 25,
  UNKNOWN_LANGUAGE = 26,
  BULGARIAN = 27,
  CROATIAN = 28,
  SERBIAN = 29,
  IRISH = 30,                // UI only.
  GALICIAN = 31,
  TAGALOG = 32,              // Tagalog (tl) + Filipino (fil).
  TURKISH = 33,
  UKRAINIAN = 34,
  HINDI = 35,
  MACEDONIAN = 36,
  BENGALI = 37,
  INDONESIAN = 38,
  LATIN = 39,                // UI only.
  MALAY = 40,
  MALAYALAM = 41,
  WELSH = 42,                // UI only.
  NEPALI = 43,
  TELUGU = 44,
  ALBANIAN = 45,
  TAMIL = 46,
  BELARUSIAN = 47,
  JAVANESE = 48,             // UI only.
  OCCITAN = 49,              // UI only.
  URDU = 50,
  BIHARI = 51,
  GUJARATI = 52,
  THAI = 53,
  ARABIC = 54,
  CATALAN = 55,
  ESPERANTO = 56,
  BASQUE = 57,
  INTERLINGUA = 58,          // UI only.
  KANNADA = 59,
  PUNJABI = 60,
  SCOTS_GAELIC = 61,         // UI only.
  SWAHILI = 62,
  SLOVENIAN = 63,
  MARATHI = 64,
  MALTESE = 65,
  VIETNAMESE = 66,
  FRISIAN = 67,              // UI only.
  SLOVAK = 68,
  // CHINESE_T is added to solve the problem of distinguishing Traditional
  // and Simplified Chinese when the encoding is UTF8.
  CHINESE_T = 69,
  FAROESE = 70,              // UI only.
  SUNDANESE = 71,            // UI only.
  UZBEK = 72,
  AMHARIC = 73,
  AZERBAIJANI = 74,
  GEORGIAN = 75,
  TIGRINYA = 76,             // UI only.
  PERSIAN = 77,
  BOSNIAN = 78,              // UI only. LangId language: CROATIAN (28)
  SINHALESE = 79,
  NORWEGIAN_N = 80,          // UI only. LangId language: NORWEGIAN (10)
  PORTUGUESE_P = 81,         // UI only. LangId language: PORTUGUESE (12)
  PORTUGUESE_B = 82,         // UI only. LangId language: PORTUGUESE (12)
  XHOSA = 83,                // UI only.
  ZULU = 84,                 // UI only.
  GUARANI = 85,
  SESOTHO = 86,              // UI only.
  TURKMEN = 87,              // UI only.
  KYRGYZ = 88,
  BRETON = 89,               // UI only.
  TWI = 90,                  // UI only.
  YIDDISH = 91,              // UI only.
  SERBO_CROATIAN = 92,       // UI only. LangId language: SERBIAN (29)
  SOMALI = 93,               // UI only.
  UIGHUR = 94,
  KURDISH = 95,
  MONGOLIAN = 96,
  ARMENIAN = 97,
  LAOTHIAN = 98,
  SINDHI = 99,
  RHAETO_ROMANCE = 100,      // UI only.
  AFRIKAANS = 101,
  LUXEMBOURGISH = 102,       // UI only.
  BURMESE = 103,
  KHMER = 104,
  TIBETAN = 105,
  DHIVEHI = 106,             // sometimes spelled Divehi, lang of Maldives
  CHEROKEE = 107,
  SYRIAC = 108,              // UI only.
  LIMBU = 109,               // UI only.
  ORIYA = 110,
  ASSAMESE = 111,            // UI only.
  CORSICAN = 112,            // UI only.
  INTERLINGUE = 113,         // UI only.
  KAZAKH = 114,
  LINGALA = 115,             // UI only.
  MOLDAVIAN = 116,           // UI only. LangId language: ROMANIAN (22)
  PASHTO = 117,
  QUECHUA = 118,             // UI only.
  SHONA = 119,               // UI only.
  TAJIK = 120,
  TATAR = 121,               // UI only.
  TONGA = 122,               // UI only.
  YORUBA = 123,              // UI only.
  CREOLES_AND_PIDGINS_ENGLISH_BASED = 124,     // UI only.
  CREOLES_AND_PIDGINS_FRENCH_BASED = 125,      // UI only.
  CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126,  // UI only.
  CREOLES_AND_PIDGINS_OTHER = 127,             // UI only.
  MAORI = 128,               // UI only.
  WOLOF = 129,               // UI only.
  ABKHAZIAN = 130,           // UI only.
  AFAR = 131,                // UI only.
  AYMARA = 132,              // UI only.
  BASHKIR = 133,             // UI only.
  BISLAMA = 134,             // UI only.
  DZONGKHA = 135,            // UI only.
  FIJIAN = 136,              // UI only.
  GREENLANDIC = 137,         // UI only.
  HAUSA = 138,               // UI only.
  HAITIAN_CREOLE = 139,      // UI only.
  INUPIAK = 140,             // UI only.
  INUKTITUT = 141,
  KASHMIRI = 142,            // UI only.
  KINYARWANDA = 143,         // UI only.
  MALAGASY = 144,            // UI only.
  NAURU = 145,               // UI only.
  OROMO = 146,               // UI only.
  RUNDI = 147,               // UI only.
  SAMOAN = 148,              // UI only.
  SANGO = 149,               // UI only.
  SANSKRIT = 150,
  SISWANT = 151,             // UI only.
  TSONGA = 152,              // UI only.
  TSWANA = 153,              // UI only.
  VOLAPUK = 154,             // UI only.
  ZHUANG = 155,              // UI only.
  KHASI = 156,               // UI only.
  SCOTS = 157,               // UI only.
  GANDA = 158,               // UI only.
  MANX = 159,                // UI only.
  MONTENEGRIN = 160,         // UI only. LangId language: SERBIAN (29)
  // Always keep NUM_LANGUAGES at the end.  It is not a valid Language enum;
  // it is only used to indicate the total number of Languages.
  // NOTE: If you add a language, you will break a unittest.
  NUM_LANGUAGES = 161,
};
|
178
|
+
|
179
|
+
#endif // LANGUAGES_PROTO_LANGUAGES_PB_H_
|