language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,545 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
// This file extends lang_enc.cc with additional languages and extended routines
|
6
|
+
// It is current with Unicode 5.1 (beta Jan 2008)
|
7
|
+
//
|
8
|
+
|
9
|
+
#include <stdlib.h>
|
10
|
+
#include <stdio.h>
|
11
|
+
#include <string.h>
|
12
|
+
|
13
|
+
#include "encodings/compact_lang_det/ext_lang_enc.h"
|
14
|
+
#include "encodings/compact_lang_det/win/cld_macros.h"
|
15
|
+
#include "encodings/compact_lang_det/win/cld_strtoint.h"
|
16
|
+
|
17
|
+
// Names of the extended languages, i.e. the ones looked up with
// (lang - EXT_LANGUAGE_BASE). Each string is also the C enum declared name.
// The trailing number is the entry's offset from EXT_LANGUAGE_BASE.
static const char* const kExtLanguageName[] = {
  "X_BORK_BORK_BORK",  // 0
  "X_PIG_LATIN",       // 1
  "X_HACKER",          // 2
  "X_KLINGON",         // 3
  "X_ELMER_FUDD",      // 4

  // Pseudo-languages for Unicode scripts that express a single language
  "X_OGHAM",           // 5
  "X_RUNIC",           // 6
  "X_YI",              // 7
  "X_OLD_ITALIC",      // 8
  "X_GOTHIC",          // 9
  "X_DESERET",         // 10
  "X_HANUNOO",         // 11
  "X_BUHID",           // 12
  "X_TAGBANWA",        // 13
  "X_TAI_LE",          // 14
  "X_LINEAR_B",        // 15
  "X_UGARITIC",        // 16
  "X_SHAVIAN",         // 17
  "X_OSMANYA",         // 18
  "X_CYPRIOT",         // 19
  "X_BUGINESE",        // 20
  "X_COPTIC",          // 21
  "X_NEW_TAI_LUE",     // 22
  "X_GLAGOLITIC",      // 23
  "X_TIFINAGH",        // 24
  "X_SYLOTI_NAGRI",    // 25
  "X_OLD_PERSIAN",     // 26
  "X_KHAROSHTHI",      // 27
  "X_BALINESE",        // 28
  "X_CUNEIFORM",       // 29
  "X_PHOENICIAN",      // 30
  "X_PHAGS_PA",        // 31
  "X_NKO",             // 32

  // Unicode 5.1
  "X_SUDANESE",        // 33
  "X_LEPCHA",          // 34
  "X_OL_CHIKI",        // 35
  "X_VAI",             // 36
  "X_SAURASHTRA",      // 37
  "X_KAYAH_LI",        // 38
  "X_REJANG",          // 39
  "X_LYCIAN",          // 40
  "X_CARIAN",          // 41
  "X_LYDIAN",          // 42
  "X_CHAM",            // 43
};
|
35
|
+
|
36
|
+
|
37
|
+
// C enum declared names of the base languages, for programs that generate
// C code. The array is indexed directly by the Language value; the trailing
// number on each line is that index.
static const char* const kExtLangDeclaredName[] = {
  "ENGLISH",                               // 0
  "DANISH",                                // 1
  "DUTCH",                                 // 2
  "FINNISH",                               // 3
  "FRENCH",                                // 4
  "GERMAN",                                // 5
  "HEBREW",                                // 6
  "ITALIAN",                               // 7
  "JAPANESE",                              // 8
  "KOREAN",                                // 9
  "NORWEGIAN",                             // 10
  "POLISH",                                // 11
  "PORTUGUESE",                            // 12
  "RUSSIAN",                               // 13
  "SPANISH",                               // 14
  "SWEDISH",                               // 15
  "CHINESE",                               // 16
  "CZECH",                                 // 17
  "GREEK",                                 // 18
  "ICELANDIC",                             // 19
  "LATVIAN",                               // 20
  "LITHUANIAN",                            // 21
  "ROMANIAN",                              // 22
  "HUNGARIAN",                             // 23
  "ESTONIAN",                              // 24
  "TG_UNKNOWN_LANGUAGE",                   // 25
  "UNKNOWN_LANGUAGE",                      // 26
  "BULGARIAN",                             // 27
  "CROATIAN",                              // 28
  "SERBIAN",                               // 29
  "IRISH",                                 // 30
  "GALICIAN",                              // 31
  "TAGALOG",                               // 32
  "TURKISH",                               // 33
  "UKRAINIAN",                             // 34
  "HINDI",                                 // 35
  "MACEDONIAN",                            // 36
  "BENGALI",                               // 37
  "INDONESIAN",                            // 38
  "LATIN",                                 // 39
  "MALAY",                                 // 40
  "MALAYALAM",                             // 41
  "WELSH",                                 // 42
  "NEPALI",                                // 43
  "TELUGU",                                // 44
  "ALBANIAN",                              // 45
  "TAMIL",                                 // 46
  "BELARUSIAN",                            // 47
  "JAVANESE",                              // 48
  "OCCITAN",                               // 49
  "URDU",                                  // 50
  "BIHARI",                                // 51
  "GUJARATI",                              // 52
  "THAI",                                  // 53
  "ARABIC",                                // 54
  "CATALAN",                               // 55
  "ESPERANTO",                             // 56
  "BASQUE",                                // 57
  "INTERLINGUA",                           // 58
  "KANNADA",                               // 59
  "PUNJABI",                               // 60
  "SCOTS_GAELIC",                          // 61
  "SWAHILI",                               // 62
  "SLOVENIAN",                             // 63
  "MARATHI",                               // 64
  "MALTESE",                               // 65
  "VIETNAMESE",                            // 66
  "FRISIAN",                               // 67
  "SLOVAK",                                // 68
  "CHINESE_T",                             // 69
  "FAROESE",                               // 70
  "SUNDANESE",                             // 71
  "UZBEK",                                 // 72
  "AMHARIC",                               // 73
  "AZERBAIJANI",                           // 74
  "GEORGIAN",                              // 75
  "TIGRINYA",                              // 76
  "PERSIAN",                               // 77
  "BOSNIAN",                               // 78
  "SINHALESE",                             // 79
  "NORWEGIAN_N",                           // 80
  "PORTUGUESE_P",                          // 81
  "PORTUGUESE_B",                          // 82
  "XHOSA",                                 // 83
  "ZULU",                                  // 84
  "GUARANI",                               // 85
  "SESOTHO",                               // 86
  "TURKMEN",                               // 87
  "KYRGYZ",                                // 88
  "BRETON",                                // 89
  "TWI",                                   // 90
  "YIDDISH",                               // 91
  "SERBO_CROATIAN",                        // 92
  "SOMALI",                                // 93
  "UIGHUR",                                // 94
  "KURDISH",                               // 95
  "MONGOLIAN",                             // 96
  "ARMENIAN",                              // 97
  "LAOTHIAN",                              // 98
  "SINDHI",                                // 99
  "RHAETO_ROMANCE",                        // 100
  "AFRIKAANS",                             // 101
  "LUXEMBOURGISH",                         // 102
  "BURMESE",                               // 103
  "KHMER",                                 // 104
  "TIBETAN",                               // 105
  "DHIVEHI",                               // 106  sometimes spelled Divehi; lang of Maldives
  "CHEROKEE",                              // 107
  "SYRIAC",                                // 108
  "LIMBU",                                 // 109
  "ORIYA",                                 // 110
  "ASSAMESE",                              // 111
  "CORSICAN",                              // 112
  "INTERLINGUE",                           // 113
  "KAZAKH",                                // 114
  "LINGALA",                               // 115
  "MOLDAVIAN",                             // 116
  "PASHTO",                                // 117
  "QUECHUA",                               // 118
  "SHONA",                                 // 119
  "TAJIK",                                 // 120
  "TATAR",                                 // 121
  "TONGA",                                 // 122
  "YORUBA",                                // 123
  "CREOLES_AND_PIDGINS_ENGLISH_BASED",     // 124
  "CREOLES_AND_PIDGINS_FRENCH_BASED",      // 125
  "CREOLES_AND_PIDGINS_PORTUGUESE_BASED",  // 126
  "CREOLES_AND_PIDGINS_OTHER",             // 127
  "MAORI",                                 // 128
  "WOLOF",                                 // 129
  "ABKHAZIAN",                             // 130
  "AFAR",                                  // 131
  "AYMARA",                                // 132
  "BASHKIR",                               // 133
  "BISLAMA",                               // 134
  "DZONGKHA",                              // 135
  "FIJIAN",                                // 136
  "GREENLANDIC",                           // 137
  "HAUSA",                                 // 138
  "HAITIAN_CREOLE",                        // 139
  "INUPIAK",                               // 140
  "INUKTITUT",                             // 141
  "KASHMIRI",                              // 142
  "KINYARWANDA",                           // 143
  "MALAGASY",                              // 144
  "NAURU",                                 // 145
  "OROMO",                                 // 146
  "RUNDI",                                 // 147
  "SAMOAN",                                // 148
  "SANGO",                                 // 149
  "SANSKRIT",                              // 150
  "SISWANT",                               // 151
  "TSONGA",                                // 152
  "TSWANA",                                // 153
  "VOLAPUK",                               // 154
  "ZHUANG",                                // 155
  "KHASI",                                 // 156
  "SCOTS",                                 // 157
  "GANDA",                                 // 158
  "MANX",                                  // 159
  "MONTENEGRIN",                           // 160
  // Add new language declared names just before here
};
|
202
|
+
|
203
|
+
// Build-time guard: the declared-name table must have exactly one entry per
// base language, because it is indexed directly by the Language value.
COMPILE_ASSERT(
    arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
    kExtLangDeclaredName_has_incorrect_length);
|
205
|
+
|
206
|
+
|
207
|
+
// Language codes of the extended languages, in the same order as
// kExtLanguageName (both are indexed with lang - EXT_LANGUAGE_BASE).
// I made all these up, except Klingon from ISO-639-2 (dsites)
// NOTE: zza is a standard name
static const char* const kExtLanguageCode[] = {
  // All Latin script
  "zzb",      // X_BORK_BORK_BORK
  "zzp",      // X_PIG_LATIN
  "zzh",      // X_HACKER
  "tlh",      // X_KLINGON
  "zze",      // X_ELMER_FUDD

  // Pseudo-languages for Unicode scripts that express a single language
  "xx-Ogam",  // X_OGHAM
  "xx-Runr",  // X_RUNIC
  "xx-Yiii",  // X_YI
  "xx-Ital",  // X_OLD_ITALIC
  "xx-Goth",  // X_GOTHIC
  "xx-Dsrt",  // X_DESERET
  "xx-Hano",  // X_HANUNOO
  "xx-Buhd",  // X_BUHID
  "xx-Tagb",  // X_TAGBANWA
  "xx-Tale",  // X_TAI_LE
  "xx-Linb",  // X_LINEAR_B
  "xx-Ugar",  // X_UGARITIC
  "xx-Shaw",  // X_SHAVIAN
  "xx-Osma",  // X_OSMANYA
  "xx-Cprt",  // X_CYPRIOT
  "xx-Bugi",  // X_BUGINESE
  "xx-Copt",  // X_COPTIC
  "xx-Talu",  // X_NEW_TAI_LUE
  "xx-Glag",  // X_GLAGOLITIC
  "xx-Tfng",  // X_TIFINAGH
  "xx-Sylo",  // X_SYLOTI_NAGRI
  "xx-Xpeo",  // X_OLD_PERSIAN
  "xx-Khar",  // X_KHAROSHTHI
  "xx-Bali",  // X_BALINESE
  "xx-Xsux",  // X_CUNEIFORM
  "xx-Phnx",  // X_PHOENICIAN
  "xx-Phag",  // X_PHAGS_PA
  "xx-Nkoo",  // X_NKO

  // Unicode 5.1
  "xx-Sund",  // X_SUDANESE
  "xx-Lepc",  // X_LEPCHA
  "xx-Olck",  // X_OL_CHIKI
  "xx-Vaii",  // X_VAI
  "xx-Saur",  // X_SAURASHTRA
  "xx-Kali",  // X_KAYAH_LI
  "xx-Rjng",  // X_REJANG
  "xx-Lyci",  // X_LYCIAN
  "xx-Cari",  // X_CARIAN
  "xx-Lydi",  // X_LYDIAN
  "xx-Cham",  // X_CHAM
};
|
228
|
+
|
229
|
+
|
230
|
+
// Given the Language, returns its string name used as the output by
|
231
|
+
// the lang/enc identifier, e.g. "Korean"
|
232
|
+
// "invalid_language" if the input is invalid.
|
233
|
+
// TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
|
234
|
+
// used to subtract out HTML, link farms, DNA strings, and alittle English porn
|
235
|
+
const char* ExtLanguageName(const Language lang) {
|
236
|
+
if (lang < 0) {
|
237
|
+
// No-text-at-all result from a Tote
|
238
|
+
return "";
|
239
|
+
}
|
240
|
+
// CompactLanguageDetect extension
|
241
|
+
if (lang == TG_UNKNOWN_LANGUAGE) {
|
242
|
+
return "Ignore";
|
243
|
+
}
|
244
|
+
if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
|
245
|
+
return LanguageName(lang);
|
246
|
+
}
|
247
|
+
if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
|
248
|
+
return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
|
249
|
+
}
|
250
|
+
return invalid_language_name();
|
251
|
+
}
|
252
|
+
|
253
|
+
|
254
|
+
// Given the Language, returns its Language enum spelling, for use by
|
255
|
+
// programs that create C declarations, e.g. "KOREAN"
|
256
|
+
// "UNKNOWN_LANGUAGE" if the input is invalid.
|
257
|
+
const char* ExtLanguageDeclaredName(const Language lang) {
|
258
|
+
if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
|
259
|
+
return kExtLangDeclaredName[lang];
|
260
|
+
}
|
261
|
+
if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
|
262
|
+
return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
|
263
|
+
}
|
264
|
+
return "UNKNOWN_LANGUAGE";
|
265
|
+
}
|
266
|
+
|
267
|
+
// Given the Language, return the language code, e.g. "ko"
|
268
|
+
const char* ExtLanguageCode(const Language lang) {
|
269
|
+
// Hack for ignore/porn pseudo-language
|
270
|
+
if (lang == TG_UNKNOWN_LANGUAGE) {
|
271
|
+
return "xxx";
|
272
|
+
}
|
273
|
+
if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
|
274
|
+
return LanguageCode(lang);
|
275
|
+
}
|
276
|
+
if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
|
277
|
+
return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
|
278
|
+
}
|
279
|
+
return "??";
|
280
|
+
}
|
281
|
+
|
282
|
+
|
283
|
+
// Convert "en-Latn-GB" to ENGLISH
|
284
|
+
// Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
|
285
|
+
// Consider for later: NORWEGIAN, NORWEGIAN_N
|
286
|
+
// Consider for later: SCOTS, SCOTS_GAELIC
|
287
|
+
// Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
|
288
|
+
//
|
289
|
+
Language GetLanguageFromNumberOrName(const char* src) {
|
290
|
+
if (strspn(src, "0123456789") == strlen(src)) {
|
291
|
+
// All digits
|
292
|
+
return static_cast<Language>(strto32(src, NULL, 10));
|
293
|
+
}
|
294
|
+
|
295
|
+
Language retlang = UNKNOWN_LANGUAGE;
|
296
|
+
size_t len = strlen(src);
|
297
|
+
|
298
|
+
if (true /*FLAGS_mergepairs*/) {
|
299
|
+
// Merge sets of langauges pt-xx en-xx fr-xx, NOT bs/hr/sr
|
300
|
+
if (memcmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
|
301
|
+
if (memcmp(src, "en-", 3) == 0) {return ENGLISH;}
|
302
|
+
if (memcmp(src, "fr-", 3) == 0) {return FRENCH;}
|
303
|
+
// Use NormalizeLanguage instead
|
304
|
+
if (memcmp(src, "bs-", 3) == 0) {return CROATIAN;}
|
305
|
+
if (memcmp(src, "hr-", 3) == 0) {return CROATIAN;}
|
306
|
+
if (memcmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
|
307
|
+
if (memcmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
|
308
|
+
if (memcmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
|
309
|
+
if (memcmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}
|
310
|
+
}
|
311
|
+
|
312
|
+
// Extensions
|
313
|
+
if (len >= 3) {
|
314
|
+
// Standin for ignore/porn "language"
|
315
|
+
if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
|
316
|
+
|
317
|
+
if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
|
318
|
+
if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
|
319
|
+
if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
|
320
|
+
if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
|
321
|
+
if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
|
322
|
+
}
|
323
|
+
|
324
|
+
// We have a name like en-Latn-GB or pt-BR
|
325
|
+
// First, get rid of some special cases
|
326
|
+
if (len <= 3) {
|
327
|
+
LanguageFromCode(src, &retlang);
|
328
|
+
} else if (len == 7) {
|
329
|
+
// More Extensions
|
330
|
+
if (memcmp(src, "xx-", 3) == 0) {
|
331
|
+
if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
|
332
|
+
if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
|
333
|
+
if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
|
334
|
+
if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
|
335
|
+
if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
|
336
|
+
if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
|
337
|
+
if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
|
338
|
+
if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
|
339
|
+
if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
|
340
|
+
if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
|
341
|
+
if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
|
342
|
+
if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
|
343
|
+
if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
|
344
|
+
if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
|
345
|
+
if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
|
346
|
+
if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
|
347
|
+
if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
|
348
|
+
if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
|
349
|
+
if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
|
350
|
+
if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
|
351
|
+
if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
|
352
|
+
if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
|
353
|
+
if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
|
354
|
+
if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
|
355
|
+
if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
|
356
|
+
if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
|
357
|
+
if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
|
358
|
+
if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
|
359
|
+
|
360
|
+
// Unicode 5.1
|
361
|
+
if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
|
362
|
+
if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
|
363
|
+
if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
|
364
|
+
if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
|
365
|
+
if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
|
366
|
+
if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
|
367
|
+
if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
|
368
|
+
if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
|
369
|
+
if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
|
370
|
+
if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
|
371
|
+
if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
|
372
|
+
}
|
373
|
+
}
|
374
|
+
// Some other weird ones
|
375
|
+
// Could be Latn or Limb; all our current training data is Latn
|
376
|
+
if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
|
377
|
+
if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}
|
378
|
+
|
379
|
+
// Multi-country langauges
|
380
|
+
if (memcmp(src, "zh", 2) == 0) {
|
381
|
+
if (memcmp(&src[len - 2], "TW", 2) == 0) {return CHINESE_T;}
|
382
|
+
if (memcmp(&src[len - 2], "HK", 2) == 0) {return CHINESE_T;}
|
383
|
+
return CHINESE;
|
384
|
+
}
|
385
|
+
if (memcmp(src, "pt", 2) == 0) {
|
386
|
+
if (memcmp(&src[len - 2], "BR", 2) == 0) {return PORTUGUESE;}
|
387
|
+
return PORTUGUESE;
|
388
|
+
}
|
389
|
+
if (memcmp(src, "fr", 2) == 0) {
|
390
|
+
if (memcmp(&src[len -2], "CA", 2) == 0) {return FRENCH;}
|
391
|
+
return FRENCH;
|
392
|
+
}
|
393
|
+
|
394
|
+
// None of the special cases matched
|
395
|
+
if (src[2] == '-') {
|
396
|
+
char temp[4];
|
397
|
+
memcpy(temp, src, 4);
|
398
|
+
temp[2] = '\0';
|
399
|
+
LanguageFromCode(temp, &retlang);
|
400
|
+
}
|
401
|
+
if (src[3] == '-') {
|
402
|
+
char temp[4];
|
403
|
+
memcpy(temp, src, 4);
|
404
|
+
temp[3] = '\0';
|
405
|
+
LanguageFromCode(temp, &retlang);
|
406
|
+
}
|
407
|
+
if (retlang != UNKNOWN_LANGUAGE) {
|
408
|
+
return retlang;
|
409
|
+
}
|
410
|
+
|
411
|
+
return retlang;
|
412
|
+
}
|
413
|
+
|
414
|
+
typedef struct {
|
415
|
+
const char* name;
|
416
|
+
UnicodeLScript lscript;
|
417
|
+
} NameScriptPair;
|
418
|
+
|
419
|
+
// In alphabetic order for binary search
|
420
|
+
static const NameScriptPair kNameScriptPair[] = {
|
421
|
+
// Unicode 5.1 additional scripts
|
422
|
+
{"Arab", ULScript_Arabic},
|
423
|
+
{"Armn", ULScript_Armenian},
|
424
|
+
{"Bali", ULScript_Balinese},
|
425
|
+
{"Beng", ULScript_Bengali},
|
426
|
+
{"Bugi", ULScript_Buginese},
|
427
|
+
{"Buhd", ULScript_Buhid},
|
428
|
+
{"Cans", ULScript_Canadian_Aboriginal},
|
429
|
+
{"Cari", ULScript_Carian}, // Unicode 5.1
|
430
|
+
{"Cham", ULScript_Cham}, // Unicode 5.1
|
431
|
+
{"Cher", ULScript_Cherokee},
|
432
|
+
{"Copt", ULScript_Coptic},
|
433
|
+
{"Cprt", ULScript_Cypriot},
|
434
|
+
{"Cyrl", ULScript_Cyrillic},
|
435
|
+
{"Deva", ULScript_Devanagari},
|
436
|
+
{"Dsrt", ULScript_Deseret},
|
437
|
+
{"Ethi", ULScript_Ethiopic},
|
438
|
+
{"Geor", ULScript_Georgian},
|
439
|
+
{"Glag", ULScript_Glagolitic},
|
440
|
+
{"Goth", ULScript_Gothic},
|
441
|
+
{"Grek", ULScript_Greek},
|
442
|
+
{"Gujr", ULScript_Gujarati},
|
443
|
+
{"Guru", ULScript_Gurmukhi},
|
444
|
+
{"Hani", ULScript_HanCJK},
|
445
|
+
{"Hano", ULScript_Hanunoo},
|
446
|
+
{"Hebr", ULScript_Hebrew},
|
447
|
+
{"Ital", ULScript_Old_Italic},
|
448
|
+
{"Kali", ULScript_Kayah_Li}, // Unicode 5.1
|
449
|
+
{"Khar", ULScript_Kharoshthi},
|
450
|
+
{"Khmr", ULScript_Khmer},
|
451
|
+
{"Knda", ULScript_Kannada},
|
452
|
+
{"Laoo", ULScript_Lao},
|
453
|
+
{"Latn", ULScript_Latin},
|
454
|
+
{"Lepc", ULScript_Lepcha}, // Unicode 5.1
|
455
|
+
{"Limb", ULScript_Limbu},
|
456
|
+
{"Linb", ULScript_Linear_B},
|
457
|
+
{"Lyci", ULScript_Lycian}, // Unicode 5.1
|
458
|
+
{"Lydi", ULScript_Lydian}, // Unicode 5.1
|
459
|
+
{"Mlym", ULScript_Malayalam},
|
460
|
+
{"Mong", ULScript_Mongolian},
|
461
|
+
{"Mymr", ULScript_Myanmar},
|
462
|
+
{"Nkoo", ULScript_Nko},
|
463
|
+
{"Ogam", ULScript_Ogham},
|
464
|
+
{"Olck", ULScript_Ol_Chiki}, // Unicode 5.1
|
465
|
+
{"Orya", ULScript_Oriya},
|
466
|
+
{"Osma", ULScript_Osmanya},
|
467
|
+
{"Phag", ULScript_Phags_Pa},
|
468
|
+
{"Phnx", ULScript_Phoenician},
|
469
|
+
{"Rjng", ULScript_Rejang}, // Unicode 5.1
|
470
|
+
{"Runr", ULScript_Runic},
|
471
|
+
{"Saur", ULScript_Saurashtra}, // Unicode 5.1
|
472
|
+
{"Shaw", ULScript_Shavian},
|
473
|
+
{"Sinh", ULScript_Sinhala},
|
474
|
+
{"Sund", ULScript_Sundanese}, // Unicode 5.1
|
475
|
+
{"Sylo", ULScript_Syloti_Nagri},
|
476
|
+
{"Syrc", ULScript_Syriac},
|
477
|
+
{"Tagb", ULScript_Tagbanwa},
|
478
|
+
{"Tale", ULScript_Tai_Le},
|
479
|
+
{"Talu", ULScript_New_Tai_Lue},
|
480
|
+
{"Taml", ULScript_Tamil},
|
481
|
+
{"Telu", ULScript_Telugu},
|
482
|
+
{"Tfng", ULScript_Tifinagh},
|
483
|
+
{"Tglg", ULScript_Tagalog},
|
484
|
+
{"Thaa", ULScript_Thaana},
|
485
|
+
{"Thai", ULScript_Thai},
|
486
|
+
{"Tibt", ULScript_Tibetan},
|
487
|
+
{"Ugar", ULScript_Ugaritic},
|
488
|
+
{"Vaii", ULScript_Vai}, // Unicode 5.1 // NOTE: apparently 'Vai '
|
489
|
+
{"Xpeo", ULScript_Old_Persian},
|
490
|
+
{"Xsux", ULScript_Cuneiform},
|
491
|
+
{"Yiii", ULScript_Yi},
|
492
|
+
{"Zyyy", ULScript_Common},
|
493
|
+
{"Zzzz", ULScript_Inherited},
|
494
|
+
};
|
495
|
+
|
496
|
+
// Convert "en-Latn-GB" to ULScript_Latin
|
497
|
+
UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
|
498
|
+
if (strspn(src, "0123456789") == strlen(src)) {
|
499
|
+
// All digits
|
500
|
+
return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
|
501
|
+
}
|
502
|
+
|
503
|
+
if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
|
504
|
+
if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
|
505
|
+
if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
|
506
|
+
if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
|
507
|
+
// Could be Latn or Limb; all our current training data is Latn
|
508
|
+
if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}
|
509
|
+
|
510
|
+
// Isolate just the script field
|
511
|
+
char temp[5];
|
512
|
+
const char* src2 = strchr(src, '-');
|
513
|
+
if (src2 == NULL) {return ULScript_Latin;}
|
514
|
+
src2 += 1; // over the -
|
515
|
+
memcpy(temp, src2, 4);
|
516
|
+
temp[4] = '\0';
|
517
|
+
|
518
|
+
int lo = 0;
|
519
|
+
int hi = ULScript_NUM_SCRIPTS;
|
520
|
+
while (lo < hi) {
|
521
|
+
int mid = (lo + hi) >> 1;
|
522
|
+
if (strcmp(temp, kNameScriptPair[mid].name) < 0) {
|
523
|
+
hi = mid;
|
524
|
+
} else if (strcmp(temp, kNameScriptPair[mid].name) > 0) {
|
525
|
+
lo = mid + 1;
|
526
|
+
} else {
|
527
|
+
return kNameScriptPair[mid].lscript;
|
528
|
+
}
|
529
|
+
}
|
530
|
+
return ULScript_Latin;
|
531
|
+
}
|
532
|
+
|
533
|
+
|
534
|
+
// Merge together some languages, such as bo/hr/sr
|
535
|
+
// Croatian Latin and Serbian Cyrillic now.
|
536
|
+
Language NormalizeLanguage(Language lang) {
|
537
|
+
if (lang == BOSNIAN) {return CROATIAN;}
|
538
|
+
if (lang == SERBO_CROATIAN) {return SERBIAN;}
|
539
|
+
|
540
|
+
if (lang == PORTUGUESE_P) {return PORTUGUESE;}
|
541
|
+
if (lang == PORTUGUESE_B) {return PORTUGUESE;}
|
542
|
+
|
543
|
+
return lang;
|
544
|
+
}
|
545
|
+
|
@@ -0,0 +1,119 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
//
|
5
|
+
// This file extends lang_enc.h with additional languages and extended routines.
|
6
|
+
// It is current with Unicode 5.1 (March 2008)
|
7
|
+
//
|
8
|
+
|
9
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
|
10
|
+
#define ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
|
11
|
+
|
12
|
+
#include "languages/public/languages.h"
|
13
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
14
|
+
|
15
|
+
|
16
|
+
// Leave a small gap after the base languages, so adding one or two is easy.
// Just reduce the gap here (currently 4 entries)

// Montenegrin added, so reducing this from 5 to 4. dsites 2008.10.06
#define EXT_LANGUAGE_BASE (NUM_LANGUAGES + 4)

// Every constant below is wrapped in an outer pair of parentheses so that the
// cast expression behaves as a single operand wherever the macro is expanded.

// Google UI languages
#define X_BORK_BORK_BORK  ((Language)(EXT_LANGUAGE_BASE + 0))
#define X_PIG_LATIN       ((Language)(EXT_LANGUAGE_BASE + 1))
#define X_HACKER          ((Language)(EXT_LANGUAGE_BASE + 2))
#define X_KLINGON         ((Language)(EXT_LANGUAGE_BASE + 3))
#define X_ELMER_FUDD      ((Language)(EXT_LANGUAGE_BASE + 4))

// Pseudo-languages for Unicode scripts that express a single language
#define X_OGHAM           ((Language)(EXT_LANGUAGE_BASE + 5))
#define X_RUNIC           ((Language)(EXT_LANGUAGE_BASE + 6))
#define X_YI              ((Language)(EXT_LANGUAGE_BASE + 7))
#define X_OLD_ITALIC      ((Language)(EXT_LANGUAGE_BASE + 8))
#define X_GOTHIC          ((Language)(EXT_LANGUAGE_BASE + 9))
#define X_DESERET         ((Language)(EXT_LANGUAGE_BASE + 10))
#define X_HANUNOO         ((Language)(EXT_LANGUAGE_BASE + 11))
#define X_BUHID           ((Language)(EXT_LANGUAGE_BASE + 12))
#define X_TAGBANWA        ((Language)(EXT_LANGUAGE_BASE + 13))
#define X_TAI_LE          ((Language)(EXT_LANGUAGE_BASE + 14))
#define X_LINEAR_B        ((Language)(EXT_LANGUAGE_BASE + 15))
#define X_UGARITIC        ((Language)(EXT_LANGUAGE_BASE + 16))
#define X_SHAVIAN         ((Language)(EXT_LANGUAGE_BASE + 17))
#define X_OSMANYA         ((Language)(EXT_LANGUAGE_BASE + 18))
#define X_CYPRIOT         ((Language)(EXT_LANGUAGE_BASE + 19))
#define X_BUGINESE        ((Language)(EXT_LANGUAGE_BASE + 20))
#define X_COPTIC          ((Language)(EXT_LANGUAGE_BASE + 21))
#define X_NEW_TAI_LUE     ((Language)(EXT_LANGUAGE_BASE + 22))
#define X_GLAGOLITIC      ((Language)(EXT_LANGUAGE_BASE + 23))
#define X_TIFINAGH        ((Language)(EXT_LANGUAGE_BASE + 24))
#define X_SYLOTI_NAGRI    ((Language)(EXT_LANGUAGE_BASE + 25))
#define X_OLD_PERSIAN     ((Language)(EXT_LANGUAGE_BASE + 26))
#define X_KHAROSHTHI      ((Language)(EXT_LANGUAGE_BASE + 27))
#define X_BALINESE        ((Language)(EXT_LANGUAGE_BASE + 28))
#define X_CUNEIFORM       ((Language)(EXT_LANGUAGE_BASE + 29))
#define X_PHOENICIAN      ((Language)(EXT_LANGUAGE_BASE + 30))
#define X_PHAGS_PA        ((Language)(EXT_LANGUAGE_BASE + 31))
#define X_NKO             ((Language)(EXT_LANGUAGE_BASE + 32))

// Unicode 5.1
#define X_SUDANESE        ((Language)(EXT_LANGUAGE_BASE + 33))
#define X_LEPCHA          ((Language)(EXT_LANGUAGE_BASE + 34))
#define X_OL_CHIKI        ((Language)(EXT_LANGUAGE_BASE + 35))
#define X_VAI             ((Language)(EXT_LANGUAGE_BASE + 36))
#define X_SAURASHTRA      ((Language)(EXT_LANGUAGE_BASE + 37))
#define X_KAYAH_LI        ((Language)(EXT_LANGUAGE_BASE + 38))
#define X_REJANG          ((Language)(EXT_LANGUAGE_BASE + 39))
#define X_LYCIAN          ((Language)(EXT_LANGUAGE_BASE + 40))
#define X_CARIAN          ((Language)(EXT_LANGUAGE_BASE + 41))
#define X_LYDIAN          ((Language)(EXT_LANGUAGE_BASE + 42))
#define X_CHAM            ((Language)(EXT_LANGUAGE_BASE + 43))

// One past the last extended language.
#define EXT_NUM_LANGUAGES ((Language)(EXT_LANGUAGE_BASE + 44))
|
73
|
+
|
74
|
+
|
75
|
+
|
76
|
+
// ExtLanguageName
|
77
|
+
// ------------
|
78
|
+
// Given the Language, returns its string name used as the output by
|
79
|
+
// the lang/enc identifier, e.g. "Korean"
|
80
|
+
// "invalid_language" if the input is invalid.
|
81
|
+
extern const char* ExtLanguageName(const Language lang);
|
82
|
+
|
83
|
+
// ExtLanguageDeclaredName
|
84
|
+
// ------------
|
85
|
+
// Given the Language, returns its Language enum spelling, for use by
|
86
|
+
// programs that create C declarations, e.g. "KOREAN"
|
87
|
+
// "UNKNOWN_LANGUAGE" if the input is invalid.
|
88
|
+
extern const char* ExtLanguageDeclaredName(const Language lang);
|
89
|
+
|
90
|
+
// ExtLanguageCode
|
91
|
+
// ------------
|
92
|
+
// Given the Language, return the language code, e.g. "ko"
|
93
|
+
// This is determined by
|
94
|
+
// the following (in order of preference):
|
95
|
+
// - ISO-639-1 two-letter language code
|
96
|
+
// (all except those mentioned below)
|
97
|
+
// - ISO-639-2 three-letter bibliographic language code
|
98
|
+
// (Tibetan, Dhivehi, Cherokee, Syriac)
|
99
|
+
// - Google-specific language code
|
100
|
+
// (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
|
101
|
+
// Portuguese-Portugal, Portuguese-Brazil, Limbu)
|
102
|
+
extern const char * ExtLanguageCode(const Language lang);
|
103
|
+
|
104
|
+
|
105
|
+
// Convert "en-Latn-GB" to ENGLISH
|
106
|
+
// Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
|
107
|
+
// Consider for later: NORWEGIAN, NORWEGIAN_N
|
108
|
+
// Consider for later: SCOTS, SCOTS_GAELIC
|
109
|
+
// Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
|
110
|
+
//
|
111
|
+
Language GetLanguageFromNumberOrName(const char* src);
|
112
|
+
|
113
|
+
// Convert "en-Latn-GB" to ULScript_Latin
|
114
|
+
UnicodeLScript GetLScriptFromNumberOrName(const char* src);
|
115
|
+
|
116
|
+
// Merge together some languages, such as bo/hr/sr
|
117
|
+
Language NormalizeLanguage(Language lang);
|
118
|
+
|
119
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
|