language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "languages/public/languages.h"
|
|
6
|
+
|
|
7
|
+
#include "base/string_util.h"
|
|
8
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
// Returns the Language value this module treats as its default (ENGLISH).
Language default_language() {
  return ENGLISH;
}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
// Language names and codes
|
|
15
|
+
|
|
16
|
+
// One row of the language table: a name plus up to three codes.
// A code that a language does not have is stored as NULL.
struct LanguageInfo {
  const char* language_name_;        // name string for the language
  const char* language_code_639_1_;  // ISO-639-1 code, or NULL
  const char* language_code_639_2_;  // ISO-639-2 code, or NULL
  const char* language_code_other_;  // nonstandard code, or NULL
};
|
|
22
|
+
|
|
23
|
+
static const LanguageInfo kLanguageInfoTable[] = {
|
|
24
|
+
{ "ENGLISH", "en", "eng", NULL},
|
|
25
|
+
{ "DANISH", "da", "dan", NULL},
|
|
26
|
+
{ "DUTCH", "nl", "dut", NULL},
|
|
27
|
+
{ "FINNISH", "fi", "fin", NULL},
|
|
28
|
+
{ "FRENCH", "fr", "fre", NULL},
|
|
29
|
+
{ "GERMAN", "de", "ger", NULL},
|
|
30
|
+
{ "HEBREW", "he", "heb", NULL},
|
|
31
|
+
{ "ITALIAN", "it", "ita", NULL},
|
|
32
|
+
{ "Japanese", "ja", "jpn", NULL},
|
|
33
|
+
{ "Korean", "ko", "kor", NULL},
|
|
34
|
+
{ "NORWEGIAN", "nb", "nor", NULL},
|
|
35
|
+
{ "POLISH", "pl", "pol", NULL},
|
|
36
|
+
{ "PORTUGUESE", "pt", "por", NULL},
|
|
37
|
+
{ "RUSSIAN", "ru", "rus", NULL},
|
|
38
|
+
{ "SPANISH", "es", "spa", NULL},
|
|
39
|
+
{ "SWEDISH", "sv", "swe", NULL},
|
|
40
|
+
{ "Chinese", "zh", "chi", "zh-CN"},
|
|
41
|
+
{ "CZECH", "cs", "cze", NULL},
|
|
42
|
+
{ "GREEK", "el", "gre", NULL},
|
|
43
|
+
{ "ICELANDIC", "is", "ice", NULL},
|
|
44
|
+
{ "LATVIAN", "lv", "lav", NULL},
|
|
45
|
+
{ "LITHUANIAN", "lt", "lit", NULL},
|
|
46
|
+
{ "ROMANIAN", "ro", "rum", NULL},
|
|
47
|
+
{ "HUNGARIAN", "hu", "hun", NULL},
|
|
48
|
+
{ "ESTONIAN", "et", "est", NULL},
|
|
49
|
+
// TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
|
|
50
|
+
// and "Unknown", they are essentially the same. Need to unify them.
|
|
51
|
+
// "un" and "ut" are invented by us, not from ISO-639.
|
|
52
|
+
//
|
|
53
|
+
{ "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
|
|
54
|
+
{ "Unknown", NULL, NULL, "un"},
|
|
55
|
+
{ "BULGARIAN", "bg", "bul", NULL},
|
|
56
|
+
{ "CROATIAN", "hr", "scr", NULL},
|
|
57
|
+
{ "SERBIAN", "sr", "scc", NULL},
|
|
58
|
+
{ "IRISH", "ga", "gle", NULL},
|
|
59
|
+
{ "GALICIAN", "gl", "glg", NULL},
|
|
60
|
+
// Impossible to tell Tagalog from Filipino at the moment.
|
|
61
|
+
// Use ISO 639-2 code for Filipino here.
|
|
62
|
+
{ "TAGALOG", NULL, "fil", NULL},
|
|
63
|
+
{ "TURKISH", "tr", "tur", NULL},
|
|
64
|
+
{ "UKRAINIAN", "uk", "ukr", NULL},
|
|
65
|
+
{ "HINDI", "hi", "hin", NULL},
|
|
66
|
+
{ "MACEDONIAN", "mk", "mac", NULL},
|
|
67
|
+
{ "BENGALI", "bn", "ben", NULL},
|
|
68
|
+
{ "INDONESIAN", "id", "ind", NULL},
|
|
69
|
+
{ "LATIN", "la", "lat", NULL},
|
|
70
|
+
{ "MALAY", "ms", "may", NULL},
|
|
71
|
+
{ "MALAYALAM", "ml", "mal", NULL},
|
|
72
|
+
{ "WELSH", "cy", "wel", NULL},
|
|
73
|
+
{ "NEPALI", "ne", "nep", NULL},
|
|
74
|
+
{ "TELUGU", "te", "tel", NULL},
|
|
75
|
+
{ "ALBANIAN", "sq", "alb", NULL},
|
|
76
|
+
{ "TAMIL", "ta", "tam", NULL},
|
|
77
|
+
{ "BELARUSIAN", "be", "bel", NULL},
|
|
78
|
+
{ "JAVANESE", "jw", "jav", NULL},
|
|
79
|
+
{ "OCCITAN", "oc", "oci", NULL},
|
|
80
|
+
{ "URDU", "ur", "urd", NULL},
|
|
81
|
+
{ "BIHARI", "bh", "bih", NULL},
|
|
82
|
+
{ "GUJARATI", "gu", "guj", NULL},
|
|
83
|
+
{ "THAI", "th", "tha", NULL},
|
|
84
|
+
{ "ARABIC", "ar", "ara", NULL},
|
|
85
|
+
{ "CATALAN", "ca", "cat", NULL},
|
|
86
|
+
{ "ESPERANTO", "eo", "epo", NULL},
|
|
87
|
+
{ "BASQUE", "eu", "baq", NULL},
|
|
88
|
+
{ "INTERLINGUA", "ia", "ina", NULL},
|
|
89
|
+
{ "KANNADA", "kn", "kan", NULL},
|
|
90
|
+
{ "PUNJABI", "pa", "pan", NULL},
|
|
91
|
+
{ "SCOTS_GAELIC", "gd", "gla", NULL},
|
|
92
|
+
{ "SWAHILI", "sw", "swa", NULL},
|
|
93
|
+
{ "SLOVENIAN", "sl", "slv", NULL},
|
|
94
|
+
{ "MARATHI", "mr", "mar", NULL},
|
|
95
|
+
{ "MALTESE", "mt", "mlt", NULL},
|
|
96
|
+
{ "VIETNAMESE", "vi", "vie", NULL},
|
|
97
|
+
{ "FRISIAN", "fy", "fry", NULL},
|
|
98
|
+
{ "SLOVAK", "sk", "slo", NULL},
|
|
99
|
+
{ "ChineseT",
|
|
100
|
+
NULL, NULL, // We intentionally set these 2 fields to NULL to avoid
|
|
101
|
+
// confusion between CHINESE_T and CHINESE.
|
|
102
|
+
"zh-TW"},
|
|
103
|
+
{ "FAROESE", "fo", "fao", NULL},
|
|
104
|
+
{ "SUNDANESE", "su", "sun", NULL},
|
|
105
|
+
{ "UZBEK", "uz", "uzb", NULL},
|
|
106
|
+
{ "AMHARIC", "am", "amh", NULL},
|
|
107
|
+
{ "AZERBAIJANI", "az", "aze", NULL},
|
|
108
|
+
{ "GEORGIAN", "ka", "geo", NULL},
|
|
109
|
+
{ "TIGRINYA", "ti", "tir", NULL},
|
|
110
|
+
{ "PERSIAN", "fa", "per", NULL},
|
|
111
|
+
{ "BOSNIAN", "bs", "bos", NULL},
|
|
112
|
+
{ "SINHALESE", "si", "sin", NULL},
|
|
113
|
+
{ "NORWEGIAN_N", "nn", "nno", NULL},
|
|
114
|
+
{ "PORTUGUESE_P", NULL, NULL, "pt-PT"},
|
|
115
|
+
{ "PORTUGUESE_B", NULL, NULL, "pt-BR"},
|
|
116
|
+
{ "XHOSA", "xh", "xho", NULL},
|
|
117
|
+
{ "ZULU", "zu", "zul", NULL},
|
|
118
|
+
{ "GUARANI", "gn", "grn", NULL},
|
|
119
|
+
{ "SESOTHO", "st", "sot", NULL},
|
|
120
|
+
{ "TURKMEN", "tk", "tuk", NULL},
|
|
121
|
+
{ "KYRGYZ", "ky", "kir", NULL},
|
|
122
|
+
{ "BRETON", "br", "bre", NULL},
|
|
123
|
+
{ "TWI", "tw", "twi", NULL},
|
|
124
|
+
{ "YIDDISH", "yi", "yid", NULL},
|
|
125
|
+
{ "SERBO_CROATIAN", "sh", NULL, NULL},
|
|
126
|
+
{ "SOMALI", "so", "som", NULL},
|
|
127
|
+
{ "UIGHUR", "ug", "uig", NULL},
|
|
128
|
+
{ "KURDISH", "ku", "kur", NULL},
|
|
129
|
+
{ "MONGOLIAN", "mn", "mon", NULL},
|
|
130
|
+
{ "ARMENIAN", "hy", "arm", NULL},
|
|
131
|
+
{ "LAOTHIAN", "lo", "lao", NULL},
|
|
132
|
+
{ "SINDHI", "sd", "snd", NULL},
|
|
133
|
+
{ "RHAETO_ROMANCE", "rm", "roh", NULL},
|
|
134
|
+
{ "AFRIKAANS", "af", "afr", NULL},
|
|
135
|
+
{ "LUXEMBOURGISH", "lb", "ltz", NULL},
|
|
136
|
+
{ "BURMESE", "my", "bur", NULL},
|
|
137
|
+
// KHMER is known as Cambodian for Google user interfaces.
|
|
138
|
+
{ "KHMER", "km", "khm", NULL},
|
|
139
|
+
{ "TIBETAN", "bo", "tib", NULL},
|
|
140
|
+
{ "DHIVEHI", "dv", "div", NULL},
|
|
141
|
+
{ "CHEROKEE", NULL, "chr", NULL},
|
|
142
|
+
{ "SYRIAC", NULL, "syr", NULL},
|
|
143
|
+
{ "LIMBU", NULL, NULL, "sit-NP"},
|
|
144
|
+
{ "ORIYA", "or", "ori", NULL},
|
|
145
|
+
{ "ASSAMESE", "as", "asm", NULL},
|
|
146
|
+
{ "CORSICAN", "co", "cos", NULL},
|
|
147
|
+
{ "INTERLINGUE", "ie", "ine", NULL},
|
|
148
|
+
{ "KAZAKH", "kk", "kaz", NULL},
|
|
149
|
+
{ "LINGALA", "ln", "lin", NULL},
|
|
150
|
+
{ "MOLDAVIAN", "mo", "mol", NULL},
|
|
151
|
+
{ "PASHTO", "ps", "pus", NULL},
|
|
152
|
+
{ "QUECHUA", "qu", "que", NULL},
|
|
153
|
+
{ "SHONA", "sn", "sna", NULL},
|
|
154
|
+
{ "TAJIK", "tg", "tgk", NULL},
|
|
155
|
+
{ "TATAR", "tt", "tat", NULL},
|
|
156
|
+
{ "TONGA", "to", "tog", NULL},
|
|
157
|
+
{ "YORUBA", "yo", "yor", NULL},
|
|
158
|
+
{ "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
|
|
159
|
+
{ "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL},
|
|
160
|
+
{ "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
|
|
161
|
+
{ "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
|
|
162
|
+
{ "MAORI", "mi", "mao", NULL},
|
|
163
|
+
{ "WOLOF", "wo", "wol", NULL},
|
|
164
|
+
{ "ABKHAZIAN", "ab", "abk", NULL},
|
|
165
|
+
{ "AFAR", "aa", "aar", NULL},
|
|
166
|
+
{ "AYMARA", "ay", "aym", NULL},
|
|
167
|
+
{ "BASHKIR", "ba", "bak", NULL},
|
|
168
|
+
{ "BISLAMA", "bi", "bis", NULL},
|
|
169
|
+
{ "DZONGKHA", "dz", "dzo", NULL},
|
|
170
|
+
{ "FIJIAN", "fj", "fij", NULL},
|
|
171
|
+
{ "GREENLANDIC", "kl", "kal", NULL},
|
|
172
|
+
{ "HAUSA", "ha", "hau", NULL},
|
|
173
|
+
{ "HAITIAN_CREOLE", "ht", NULL, NULL},
|
|
174
|
+
{ "INUPIAK", "ik", "ipk", NULL},
|
|
175
|
+
{ "INUKTITUT", "iu", "iku", NULL},
|
|
176
|
+
{ "KASHMIRI", "ks", "kas", NULL},
|
|
177
|
+
{ "KINYARWANDA", "rw", "kin", NULL},
|
|
178
|
+
{ "MALAGASY", "mg", "mlg", NULL},
|
|
179
|
+
{ "NAURU", "na", "nau", NULL},
|
|
180
|
+
{ "OROMO", "om", "orm", NULL},
|
|
181
|
+
{ "RUNDI", "rn", "run", NULL},
|
|
182
|
+
{ "SAMOAN", "sm", "smo", NULL},
|
|
183
|
+
{ "SANGO", "sg", "sag", NULL},
|
|
184
|
+
{ "SANSKRIT", "sa", "san", NULL},
|
|
185
|
+
{ "SISWANT", "ss", "ssw", NULL},
|
|
186
|
+
{ "TSONGA", "ts", "tso", NULL},
|
|
187
|
+
{ "TSWANA", "tn", "tsn", NULL},
|
|
188
|
+
{ "VOLAPUK", "vo", "vol", NULL},
|
|
189
|
+
{ "ZHUANG", "za", "zha", NULL},
|
|
190
|
+
{ "KHASI", NULL, "kha", NULL},
|
|
191
|
+
{ "SCOTS", NULL, "sco", NULL},
|
|
192
|
+
{ "GANDA", "lg", "lug", NULL},
|
|
193
|
+
{ "MANX", "gv", "glv", NULL},
|
|
194
|
+
{ "MONTENEGRIN", NULL, NULL, "sr-ME"},
|
|
195
|
+
{ "XX", NULL, NULL, "XX"},
|
|
196
|
+
};
|
|
197
|
+
|
|
198
|
+
COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,
|
|
199
|
+
kLanguageInfoTable_has_incorrect_length);
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
// LANGUAGE NAMES
|
|
203
|
+
|
|
204
|
+
const char* default_language_name() {
|
|
205
|
+
return kLanguageInfoTable[ENGLISH].language_name_;
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
// Placeholder name handed out when a Language value is not valid.
static const char* const kInvalidLanguageName = "invalid_language";

// Returns the placeholder name so callers can compare results against it.
const char* invalid_language_name() {
  return kInvalidLanguageName;
}
|
|
213
|
+
|
|
214
|
+
// Returns the table name for lang, or kInvalidLanguageName when
// IsValidLanguage(lang) is false.
const char* LanguageName(Language lang) {
  if (!IsValidLanguage(lang)) {
    return kInvalidLanguageName;
  }
  return kLanguageInfoTable[lang].language_name_;
}
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
// LANGUAGE CODES
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
// Placeholder code handed out when no real code is available.
// The leading space is deliberate: it keeps this string from ever matching
// a two-letter language code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Returns the placeholder code so callers can compare results against it.
const char* invalid_language_code() {
  return kInvalidLanguageCode;
}
|
|
233
|
+
|
|
234
|
+
// Returns the preferred code for lang: the ISO-639-1 code if the table has
// one, otherwise the ISO-639-2 code, otherwise the nonstandard code.
// Returns kInvalidLanguageCode when lang is not valid or its row holds no
// code at all.
const char* LanguageCode(Language lang) {
  if (!IsValidLanguage(lang)) {
    return kInvalidLanguageCode;
  }

  const LanguageInfo& row = kLanguageInfoTable[lang];
  if (row.language_code_639_1_ != NULL) {
    return row.language_code_639_1_;
  }
  if (row.language_code_639_2_ != NULL) {
    return row.language_code_639_2_;
  }
  if (row.language_code_other_ != NULL) {
    return row.language_code_other_;
  }
  return kInvalidLanguageCode;
}
|
|
248
|
+
|
|
249
|
+
const char* default_language_code() {
|
|
250
|
+
return kLanguageInfoTable[ENGLISH].language_code_639_1_;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// Returns the ISO-639-1 code for lang. Returns kInvalidLanguageCode when
// lang is not valid or the table has no ISO-639-1 code for it.
const char* LanguageCodeISO639_1(Language lang) {
  if (IsValidLanguage(lang)) {
    const char* iso1 = kLanguageInfoTable[lang].language_code_639_1_;
    if (iso1 != NULL) {
      return iso1;
    }
  }
  return kInvalidLanguageCode;
}
|
|
260
|
+
|
|
261
|
+
// Returns the ISO-639-2 code for lang. Returns kInvalidLanguageCode when
// lang is not valid or the table has no ISO-639-2 code for it.
const char* LanguageCodeISO639_2(Language lang) {
  if (IsValidLanguage(lang)) {
    const char* iso2 = kLanguageInfoTable[lang].language_code_639_2_;
    if (iso2 != NULL) {
      return iso2;
    }
  }
  return kInvalidLanguageCode;
}
|
|
268
|
+
|
|
269
|
+
// Same as LanguageCode(), except that CHINESE yields the dialect-qualified
// code "zh-CN" instead of its plain table code.
const char* LanguageCodeWithDialects(Language lang) {
  return (lang == CHINESE) ? "zh-CN" : LanguageCode(lang);
}
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
bool LanguageFromCode(const char* lang_code, Language *language) {
|
|
278
|
+
*language = UNKNOWN_LANGUAGE;
|
|
279
|
+
if ( lang_code == NULL ) return false;
|
|
280
|
+
|
|
281
|
+
for ( int i = 0 ; i < kNumLanguages ; i++ ) {
|
|
282
|
+
const LanguageInfo& info = kLanguageInfoTable[i];
|
|
283
|
+
if ((info.language_code_639_1_ &&
|
|
284
|
+
!base::strcasecmp(lang_code, info.language_code_639_1_)) ||
|
|
285
|
+
(info.language_code_639_2_ &&
|
|
286
|
+
!base::strcasecmp(lang_code, info.language_code_639_2_)) ||
|
|
287
|
+
(info.language_code_other_ &&
|
|
288
|
+
!base::strcasecmp(lang_code, info.language_code_other_))) {
|
|
289
|
+
*language = static_cast<Language>(i);
|
|
290
|
+
return true;
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
// For convenience, this function can also parse the non-standard
|
|
295
|
+
// five-letter language codes "zh-cn" and "zh-tw" which are used by
|
|
296
|
+
// front-ends such as GWS to distinguish Simplified from Traditional
|
|
297
|
+
// Chinese.
|
|
298
|
+
if (!base::strcasecmp(lang_code, "zh-cn") ||
|
|
299
|
+
!base::strcasecmp(lang_code, "zh_cn")) {
|
|
300
|
+
*language = CHINESE;
|
|
301
|
+
return true;
|
|
302
|
+
}
|
|
303
|
+
if (!base::strcasecmp(lang_code, "zh-tw") ||
|
|
304
|
+
!base::strcasecmp(lang_code, "zh_tw")) {
|
|
305
|
+
*language = CHINESE_T;
|
|
306
|
+
return true;
|
|
307
|
+
}
|
|
308
|
+
if (!base::strcasecmp(lang_code, "sr-me") ||
|
|
309
|
+
!base::strcasecmp(lang_code, "sr_me")) {
|
|
310
|
+
*language = MONTENEGRIN;
|
|
311
|
+
return true;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
// Process language-code synonyms.
|
|
315
|
+
if (!base::strcasecmp(lang_code, "he")) {
|
|
316
|
+
*language = HEBREW; // Use "iw".
|
|
317
|
+
return true;
|
|
318
|
+
}
|
|
319
|
+
if (!base::strcasecmp(lang_code, "in")) {
|
|
320
|
+
*language = INDONESIAN; // Use "id".
|
|
321
|
+
return true;
|
|
322
|
+
}
|
|
323
|
+
if (!base::strcasecmp(lang_code, "ji")) {
|
|
324
|
+
*language = YIDDISH; // Use "yi".
|
|
325
|
+
return true;
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// Process language-detection synonyms.
|
|
329
|
+
// These distinct languages cannot be differentiated by our current
|
|
330
|
+
// language-detection algorithms.
|
|
331
|
+
if (!base::strcasecmp(lang_code, "fil")) {
|
|
332
|
+
*language = TAGALOG;
|
|
333
|
+
return true;
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
return false;
|
|
337
|
+
}
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef LANGUAGES_PROTO_LANGUAGES_PB_H_
|
|
6
|
+
#define LANGUAGES_PROTO_LANGUAGES_PB_H_
|
|
7
|
+
|
|
8
|
+
// Language identifiers. Values are explicit and contiguous from 0.
// Entries marked "UI only." carry that annotation from the original source;
// where a "LangId language" is given, it names the related detected language
// and its numeric value.
enum Language {
  ENGLISH = 0,
  DANISH = 1,
  DUTCH = 2,
  FINNISH = 3,
  FRENCH = 4,
  GERMAN = 5,
  HEBREW = 6,
  ITALIAN = 7,
  JAPANESE = 8,
  KOREAN = 9,
  NORWEGIAN = 10,
  POLISH = 11,
  PORTUGUESE = 12,
  RUSSIAN = 13,
  SPANISH = 14,
  SWEDISH = 15,
  CHINESE = 16,
  CZECH = 17,
  GREEK = 18,
  ICELANDIC = 19,
  LATVIAN = 20,
  LITHUANIAN = 21,
  ROMANIAN = 22,
  HUNGARIAN = 23,
  ESTONIAN = 24,
  TG_UNKNOWN_LANGUAGE = 25,
  UNKNOWN_LANGUAGE = 26,
  BULGARIAN = 27,
  CROATIAN = 28,
  SERBIAN = 29,
  IRISH = 30,  // UI only.
  GALICIAN = 31,
  TAGALOG = 32,  // Tagalog (tl) + Filipino (fil).
  TURKISH = 33,
  UKRAINIAN = 34,
  HINDI = 35,
  MACEDONIAN = 36,
  BENGALI = 37,
  INDONESIAN = 38,
  LATIN = 39,  // UI only.
  MALAY = 40,
  MALAYALAM = 41,
  WELSH = 42,  // UI only.
  NEPALI = 43,
  TELUGU = 44,
  ALBANIAN = 45,
  TAMIL = 46,
  BELARUSIAN = 47,
  JAVANESE = 48,  // UI only.
  OCCITAN = 49,  // UI only.
  URDU = 50,
  BIHARI = 51,
  GUJARATI = 52,
  THAI = 53,
  ARABIC = 54,
  CATALAN = 55,
  ESPERANTO = 56,
  BASQUE = 57,
  INTERLINGUA = 58,  // UI only.
  KANNADA = 59,
  PUNJABI = 60,
  SCOTS_GAELIC = 61,  // UI only.
  SWAHILI = 62,
  SLOVENIAN = 63,
  MARATHI = 64,
  MALTESE = 65,
  VIETNAMESE = 66,
  FRISIAN = 67,  // UI only.
  SLOVAK = 68,
  // CHINESE_T was added to solve the problem of distinguishing Traditional
  // and Simplified Chinese when the encoding is UTF8.
  CHINESE_T = 69,
  FAROESE = 70,  // UI only.
  SUNDANESE = 71,  // UI only.
  UZBEK = 72,
  AMHARIC = 73,
  AZERBAIJANI = 74,
  GEORGIAN = 75,
  TIGRINYA = 76,  // UI only.
  PERSIAN = 77,
  BOSNIAN = 78,  // UI only. LangId language: CROATIAN (28)
  SINHALESE = 79,
  NORWEGIAN_N = 80,  // UI only. LangId language: NORWEGIAN (10)
  PORTUGUESE_P = 81,  // UI only. LangId language: PORTUGUESE (12)
  PORTUGUESE_B = 82,  // UI only. LangId language: PORTUGUESE (12)
  XHOSA = 83,  // UI only.
  ZULU = 84,  // UI only.
  GUARANI = 85,
  SESOTHO = 86,  // UI only.
  TURKMEN = 87,  // UI only.
  KYRGYZ = 88,
  BRETON = 89,  // UI only.
  TWI = 90,  // UI only.
  YIDDISH = 91,  // UI only.
  SERBO_CROATIAN = 92,  // UI only. LangId language: SERBIAN (29)
  SOMALI = 93,  // UI only.
  UIGHUR = 94,
  KURDISH = 95,
  MONGOLIAN = 96,
  ARMENIAN = 97,
  LAOTHIAN = 98,
  SINDHI = 99,
  RHAETO_ROMANCE = 100,  // UI only.
  AFRIKAANS = 101,
  LUXEMBOURGISH = 102,  // UI only.
  BURMESE = 103,
  KHMER = 104,
  TIBETAN = 105,
  DHIVEHI = 106,  // sometimes spelled Divehi, lang of Maldives
  CHEROKEE = 107,
  SYRIAC = 108,  // UI only.
  LIMBU = 109,  // UI only.
  ORIYA = 110,
  ASSAMESE = 111,  // UI only.
  CORSICAN = 112,  // UI only.
  INTERLINGUE = 113,  // UI only.
  KAZAKH = 114,
  LINGALA = 115,  // UI only.
  MOLDAVIAN = 116,  // UI only. LangId language: ROMANIAN (22)
  PASHTO = 117,
  QUECHUA = 118,  // UI only.
  SHONA = 119,  // UI only.
  TAJIK = 120,
  TATAR = 121,  // UI only.
  TONGA = 122,  // UI only.
  YORUBA = 123,  // UI only.
  CREOLES_AND_PIDGINS_ENGLISH_BASED = 124,  // UI only.
  CREOLES_AND_PIDGINS_FRENCH_BASED = 125,  // UI only.
  CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126,  // UI only.
  CREOLES_AND_PIDGINS_OTHER = 127,  // UI only.
  MAORI = 128,  // UI only.
  WOLOF = 129,  // UI only.
  ABKHAZIAN = 130,  // UI only.
  AFAR = 131,  // UI only.
  AYMARA = 132,  // UI only.
  BASHKIR = 133,  // UI only.
  BISLAMA = 134,  // UI only.
  DZONGKHA = 135,  // UI only.
  FIJIAN = 136,  // UI only.
  GREENLANDIC = 137,  // UI only.
  HAUSA = 138,  // UI only.
  HAITIAN_CREOLE = 139,  // UI only.
  INUPIAK = 140,  // UI only.
  INUKTITUT = 141,
  KASHMIRI = 142,  // UI only.
  KINYARWANDA = 143,  // UI only.
  MALAGASY = 144,  // UI only.
  NAURU = 145,  // UI only.
  OROMO = 146,  // UI only.
  RUNDI = 147,  // UI only.
  SAMOAN = 148,  // UI only.
  SANGO = 149,  // UI only.
  SANSKRIT = 150,
  SISWANT = 151,  // UI only.
  TSONGA = 152,  // UI only.
  TSWANA = 153,  // UI only.
  VOLAPUK = 154,  // UI only.
  ZHUANG = 155,  // UI only.
  KHASI = 156,  // UI only.
  SCOTS = 157,  // UI only.
  GANDA = 158,  // UI only.
  MANX = 159,  // UI only.
  MONTENEGRIN = 160,  // UI only. LangId language: SERBIAN (29)

  // Always keep NUM_LANGUAGES at the end. It is not a valid Language; it
  // only indicates the total number of Languages.
  // NOTE: Adding a language will break a unittest.
  NUM_LANGUAGES = 161,
};
|
|
178
|
+
|
|
179
|
+
#endif // LANGUAGES_PROTO_LANGUAGES_PB_H_
|