compact_enc_det 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
- data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
- data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
- data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
- data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
- data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
- data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
- data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
- data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
- data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
- data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
- data/ext/compact_enc_det/compact_enc_det.cc +100 -0
- data/ext/compact_enc_det/extconf.rb +20 -0
- data/lib/compact_enc_det/version.rb +3 -0
- data/lib/compact_enc_det.rb +2 -0
- metadata +106 -0
@@ -0,0 +1,349 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#include "util/languages/languages.h"
|
18
|
+
|
19
|
+
#include "util/basictypes.h"
|
20
|
+
#include "util/string_util.h"
|
21
|
+
|
22
|
+
|
23
|
+
// Returns the language used as the default: ENGLISH.
Language default_language() {
  return ENGLISH;
}
|
24
|
+
|
25
|
+
|
26
|
+
// Language names and codes
|
27
|
+
|
28
|
+
// One row of the language table: a name plus up to three alternative codes.
// A code that a language does not have is stored as NULL.
struct LanguageInfo {
  // Name of the language.
  const char* language_name_;
  // ISO-639-1 code for the language.
  const char* language_code_639_1_;
  // ISO-639-2 code for the language.
  const char* language_code_639_2_;
  // A nonstandard code for the language.
  const char* language_code_other_;
};
|
34
|
+
|
35
|
+
static const LanguageInfo kLanguageInfoTable[] = {
|
36
|
+
{ "ENGLISH", "en", "eng", NULL},
|
37
|
+
{ "DANISH", "da", "dan", NULL},
|
38
|
+
{ "DUTCH", "nl", "dut", NULL},
|
39
|
+
{ "FINNISH", "fi", "fin", NULL},
|
40
|
+
{ "FRENCH", "fr", "fre", NULL},
|
41
|
+
{ "GERMAN", "de", "ger", NULL},
|
42
|
+
{ "HEBREW", "he", "heb", NULL},
|
43
|
+
{ "ITALIAN", "it", "ita", NULL},
|
44
|
+
{ "Japanese", "ja", "jpn", NULL},
|
45
|
+
{ "Korean", "ko", "kor", NULL},
|
46
|
+
{ "NORWEGIAN", "nb", "nor", NULL},
|
47
|
+
{ "POLISH", "pl", "pol", NULL},
|
48
|
+
{ "PORTUGUESE", "pt", "por", NULL},
|
49
|
+
{ "RUSSIAN", "ru", "rus", NULL},
|
50
|
+
{ "SPANISH", "es", "spa", NULL},
|
51
|
+
{ "SWEDISH", "sv", "swe", NULL},
|
52
|
+
{ "Chinese", "zh", "chi", "zh-CN"},
|
53
|
+
{ "CZECH", "cs", "cze", NULL},
|
54
|
+
{ "GREEK", "el", "gre", NULL},
|
55
|
+
{ "ICELANDIC", "is", "ice", NULL},
|
56
|
+
{ "LATVIAN", "lv", "lav", NULL},
|
57
|
+
{ "LITHUANIAN", "lt", "lit", NULL},
|
58
|
+
{ "ROMANIAN", "ro", "rum", NULL},
|
59
|
+
{ "HUNGARIAN", "hu", "hun", NULL},
|
60
|
+
{ "ESTONIAN", "et", "est", NULL},
|
61
|
+
// TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
|
62
|
+
// and "Unknown", they are essentially the same. Need to unify them.
|
63
|
+
// "un" and "ut" are invented by us, not from ISO-639.
|
64
|
+
//
|
65
|
+
{ "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
|
66
|
+
{ "Unknown", NULL, NULL, "un"},
|
67
|
+
{ "BULGARIAN", "bg", "bul", NULL},
|
68
|
+
{ "CROATIAN", "hr", "scr", NULL},
|
69
|
+
{ "SERBIAN", "sr", "scc", NULL},
|
70
|
+
{ "IRISH", "ga", "gle", NULL},
|
71
|
+
{ "GALICIAN", "gl", "glg", NULL},
|
72
|
+
// Impossible to tell Tagalog from Filipino at the moment.
|
73
|
+
// Use ISO 639-2 code for Filipino here.
|
74
|
+
{ "TAGALOG", NULL, "fil", NULL},
|
75
|
+
{ "TURKISH", "tr", "tur", NULL},
|
76
|
+
{ "UKRAINIAN", "uk", "ukr", NULL},
|
77
|
+
{ "HINDI", "hi", "hin", NULL},
|
78
|
+
{ "MACEDONIAN", "mk", "mac", NULL},
|
79
|
+
{ "BENGALI", "bn", "ben", NULL},
|
80
|
+
{ "INDONESIAN", "id", "ind", NULL},
|
81
|
+
{ "LATIN", "la", "lat", NULL},
|
82
|
+
{ "MALAY", "ms", "may", NULL},
|
83
|
+
{ "MALAYALAM", "ml", "mal", NULL},
|
84
|
+
{ "WELSH", "cy", "wel", NULL},
|
85
|
+
{ "NEPALI", "ne", "nep", NULL},
|
86
|
+
{ "TELUGU", "te", "tel", NULL},
|
87
|
+
{ "ALBANIAN", "sq", "alb", NULL},
|
88
|
+
{ "TAMIL", "ta", "tam", NULL},
|
89
|
+
{ "BELARUSIAN", "be", "bel", NULL},
|
90
|
+
{ "JAVANESE", "jw", "jav", NULL},
|
91
|
+
{ "OCCITAN", "oc", "oci", NULL},
|
92
|
+
{ "URDU", "ur", "urd", NULL},
|
93
|
+
{ "BIHARI", "bh", "bih", NULL},
|
94
|
+
{ "GUJARATI", "gu", "guj", NULL},
|
95
|
+
{ "THAI", "th", "tha", NULL},
|
96
|
+
{ "ARABIC", "ar", "ara", NULL},
|
97
|
+
{ "CATALAN", "ca", "cat", NULL},
|
98
|
+
{ "ESPERANTO", "eo", "epo", NULL},
|
99
|
+
{ "BASQUE", "eu", "baq", NULL},
|
100
|
+
{ "INTERLINGUA", "ia", "ina", NULL},
|
101
|
+
{ "KANNADA", "kn", "kan", NULL},
|
102
|
+
{ "PUNJABI", "pa", "pan", NULL},
|
103
|
+
{ "SCOTS_GAELIC", "gd", "gla", NULL},
|
104
|
+
{ "SWAHILI", "sw", "swa", NULL},
|
105
|
+
{ "SLOVENIAN", "sl", "slv", NULL},
|
106
|
+
{ "MARATHI", "mr", "mar", NULL},
|
107
|
+
{ "MALTESE", "mt", "mlt", NULL},
|
108
|
+
{ "VIETNAMESE", "vi", "vie", NULL},
|
109
|
+
{ "FRISIAN", "fy", "fry", NULL},
|
110
|
+
{ "SLOVAK", "sk", "slo", NULL},
|
111
|
+
{ "ChineseT",
|
112
|
+
NULL, NULL, // We intentionally set these 2 fields to NULL to avoid
|
113
|
+
// confusion between CHINESE_T and CHINESE.
|
114
|
+
"zh-TW"},
|
115
|
+
{ "FAROESE", "fo", "fao", NULL},
|
116
|
+
{ "SUNDANESE", "su", "sun", NULL},
|
117
|
+
{ "UZBEK", "uz", "uzb", NULL},
|
118
|
+
{ "AMHARIC", "am", "amh", NULL},
|
119
|
+
{ "AZERBAIJANI", "az", "aze", NULL},
|
120
|
+
{ "GEORGIAN", "ka", "geo", NULL},
|
121
|
+
{ "TIGRINYA", "ti", "tir", NULL},
|
122
|
+
{ "PERSIAN", "fa", "per", NULL},
|
123
|
+
{ "BOSNIAN", "bs", "bos", NULL},
|
124
|
+
{ "SINHALESE", "si", "sin", NULL},
|
125
|
+
{ "NORWEGIAN_N", "nn", "nno", NULL},
|
126
|
+
{ "PORTUGUESE_P", NULL, NULL, "pt-PT"},
|
127
|
+
{ "PORTUGUESE_B", NULL, NULL, "pt-BR"},
|
128
|
+
{ "XHOSA", "xh", "xho", NULL},
|
129
|
+
{ "ZULU", "zu", "zul", NULL},
|
130
|
+
{ "GUARANI", "gn", "grn", NULL},
|
131
|
+
{ "SESOTHO", "st", "sot", NULL},
|
132
|
+
{ "TURKMEN", "tk", "tuk", NULL},
|
133
|
+
{ "KYRGYZ", "ky", "kir", NULL},
|
134
|
+
{ "BRETON", "br", "bre", NULL},
|
135
|
+
{ "TWI", "tw", "twi", NULL},
|
136
|
+
{ "YIDDISH", "yi", "yid", NULL},
|
137
|
+
{ "SERBO_CROATIAN", "sh", NULL, NULL},
|
138
|
+
{ "SOMALI", "so", "som", NULL},
|
139
|
+
{ "UIGHUR", "ug", "uig", NULL},
|
140
|
+
{ "KURDISH", "ku", "kur", NULL},
|
141
|
+
{ "MONGOLIAN", "mn", "mon", NULL},
|
142
|
+
{ "ARMENIAN", "hy", "arm", NULL},
|
143
|
+
{ "LAOTHIAN", "lo", "lao", NULL},
|
144
|
+
{ "SINDHI", "sd", "snd", NULL},
|
145
|
+
{ "RHAETO_ROMANCE", "rm", "roh", NULL},
|
146
|
+
{ "AFRIKAANS", "af", "afr", NULL},
|
147
|
+
{ "LUXEMBOURGISH", "lb", "ltz", NULL},
|
148
|
+
{ "BURMESE", "my", "bur", NULL},
|
149
|
+
// KHMER is known as Cambodian for Google user interfaces.
|
150
|
+
{ "KHMER", "km", "khm", NULL},
|
151
|
+
{ "TIBETAN", "bo", "tib", NULL},
|
152
|
+
{ "DHIVEHI", "dv", "div", NULL},
|
153
|
+
{ "CHEROKEE", NULL, "chr", NULL},
|
154
|
+
{ "SYRIAC", NULL, "syr", NULL},
|
155
|
+
{ "LIMBU", NULL, NULL, "sit-NP"},
|
156
|
+
{ "ORIYA", "or", "ori", NULL},
|
157
|
+
{ "ASSAMESE", "as", "asm", NULL},
|
158
|
+
{ "CORSICAN", "co", "cos", NULL},
|
159
|
+
{ "INTERLINGUE", "ie", "ine", NULL},
|
160
|
+
{ "KAZAKH", "kk", "kaz", NULL},
|
161
|
+
{ "LINGALA", "ln", "lin", NULL},
|
162
|
+
{ "MOLDAVIAN", "mo", "mol", NULL},
|
163
|
+
{ "PASHTO", "ps", "pus", NULL},
|
164
|
+
{ "QUECHUA", "qu", "que", NULL},
|
165
|
+
{ "SHONA", "sn", "sna", NULL},
|
166
|
+
{ "TAJIK", "tg", "tgk", NULL},
|
167
|
+
{ "TATAR", "tt", "tat", NULL},
|
168
|
+
{ "TONGA", "to", "tog", NULL},
|
169
|
+
{ "YORUBA", "yo", "yor", NULL},
|
170
|
+
{ "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
|
171
|
+
{ "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL},
|
172
|
+
{ "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
|
173
|
+
{ "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
|
174
|
+
{ "MAORI", "mi", "mao", NULL},
|
175
|
+
{ "WOLOF", "wo", "wol", NULL},
|
176
|
+
{ "ABKHAZIAN", "ab", "abk", NULL},
|
177
|
+
{ "AFAR", "aa", "aar", NULL},
|
178
|
+
{ "AYMARA", "ay", "aym", NULL},
|
179
|
+
{ "BASHKIR", "ba", "bak", NULL},
|
180
|
+
{ "BISLAMA", "bi", "bis", NULL},
|
181
|
+
{ "DZONGKHA", "dz", "dzo", NULL},
|
182
|
+
{ "FIJIAN", "fj", "fij", NULL},
|
183
|
+
{ "GREENLANDIC", "kl", "kal", NULL},
|
184
|
+
{ "HAUSA", "ha", "hau", NULL},
|
185
|
+
{ "HAITIAN_CREOLE", "ht", NULL, NULL},
|
186
|
+
{ "INUPIAK", "ik", "ipk", NULL},
|
187
|
+
{ "INUKTITUT", "iu", "iku", NULL},
|
188
|
+
{ "KASHMIRI", "ks", "kas", NULL},
|
189
|
+
{ "KINYARWANDA", "rw", "kin", NULL},
|
190
|
+
{ "MALAGASY", "mg", "mlg", NULL},
|
191
|
+
{ "NAURU", "na", "nau", NULL},
|
192
|
+
{ "OROMO", "om", "orm", NULL},
|
193
|
+
{ "RUNDI", "rn", "run", NULL},
|
194
|
+
{ "SAMOAN", "sm", "smo", NULL},
|
195
|
+
{ "SANGO", "sg", "sag", NULL},
|
196
|
+
{ "SANSKRIT", "sa", "san", NULL},
|
197
|
+
{ "SISWANT", "ss", "ssw", NULL},
|
198
|
+
{ "TSONGA", "ts", "tso", NULL},
|
199
|
+
{ "TSWANA", "tn", "tsn", NULL},
|
200
|
+
{ "VOLAPUK", "vo", "vol", NULL},
|
201
|
+
{ "ZHUANG", "za", "zha", NULL},
|
202
|
+
{ "KHASI", NULL, "kha", NULL},
|
203
|
+
{ "SCOTS", NULL, "sco", NULL},
|
204
|
+
{ "GANDA", "lg", "lug", NULL},
|
205
|
+
{ "MANX", "gv", "glv", NULL},
|
206
|
+
{ "MONTENEGRIN", NULL, NULL, "sr-ME"},
|
207
|
+
{ "XX", NULL, NULL, "XX"},
|
208
|
+
};
|
209
|
+
|
210
|
+
COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,
|
211
|
+
kLanguageInfoTable_has_incorrect_length);
|
212
|
+
|
213
|
+
|
214
|
+
// LANGUAGE NAMES
|
215
|
+
|
216
|
+
const char* default_language_name() {
|
217
|
+
return kLanguageInfoTable[ENGLISH].language_name_;
|
218
|
+
}
|
219
|
+
|
220
|
+
// Placeholder name that LanguageName() returns for a Language value that
// fails IsValidLanguage().
static const char* const kInvalidLanguageName = "invalid_language";

// Returns the placeholder name used for invalid languages.
const char* invalid_language_name() { return kInvalidLanguageName; }
|
225
|
+
|
226
|
+
// Returns the table name for `lang`, or the invalid-language placeholder
// when `lang` fails IsValidLanguage().
const char* LanguageName(Language lang) {
  if (!IsValidLanguage(lang)) {
    return kInvalidLanguageName;
  }
  return kLanguageInfoTable[lang].language_name_;
}
|
231
|
+
|
232
|
+
|
233
|
+
|
234
|
+
// LANGUAGE CODES
|
235
|
+
|
236
|
+
|
237
|
+
// Placeholder code returned when a language has no usable code. The leading
// space is deliberate: it keeps the placeholder from ever matching a
// two-letter language code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Returns the placeholder code used for invalid languages.
const char* invalid_language_code() { return kInvalidLanguageCode; }
|
245
|
+
|
246
|
+
// Returns the preferred code for `lang`: the ISO-639-1 code if the table has
// one, otherwise the ISO-639-2 code, otherwise the nonstandard code. Returns
// the invalid-code placeholder when `lang` fails IsValidLanguage() or the row
// has no code at all.
const char* LanguageCode(Language lang) {
  if (!IsValidLanguage(lang)) {
    return kInvalidLanguageCode;
  }

  const LanguageInfo& entry = kLanguageInfoTable[lang];
  if (entry.language_code_639_1_) {
    return entry.language_code_639_1_;
  }
  if (entry.language_code_639_2_) {
    return entry.language_code_639_2_;
  }
  if (entry.language_code_other_) {
    return entry.language_code_other_;
  }
  return kInvalidLanguageCode;
}
|
260
|
+
|
261
|
+
const char* default_language_code() {
|
262
|
+
return kLanguageInfoTable[ENGLISH].language_code_639_1_;
|
263
|
+
}
|
264
|
+
|
265
|
+
// Returns the ISO-639-1 code for `lang`, or the invalid-code placeholder when
// `lang` fails IsValidLanguage() or its row has no ISO-639-1 code.
const char* LanguageCodeISO639_1(Language lang) {
  const char* iso_code = NULL;
  if (IsValidLanguage(lang)) {
    iso_code = kLanguageInfoTable[lang].language_code_639_1_;
  }
  return iso_code != NULL ? iso_code : kInvalidLanguageCode;
}
|
272
|
+
|
273
|
+
// Returns the ISO-639-2 code for `lang`, or the invalid-code placeholder when
// `lang` fails IsValidLanguage() or its row has no ISO-639-2 code.
const char* LanguageCodeISO639_2(Language lang) {
  const char* iso_code = NULL;
  if (IsValidLanguage(lang)) {
    iso_code = kLanguageInfoTable[lang].language_code_639_2_;
  }
  return iso_code != NULL ? iso_code : kInvalidLanguageCode;
}
|
280
|
+
|
281
|
+
// Same as LanguageCode(), except that CHINESE is reported as "zh-CN".
const char* LanguageCodeWithDialects(Language lang) {
  return (lang == CHINESE) ? "zh-CN" : LanguageCode(lang);
}
|
286
|
+
|
287
|
+
|
288
|
+
|
289
|
+
bool LanguageFromCode(const char* lang_code, Language *language) {
|
290
|
+
*language = UNKNOWN_LANGUAGE;
|
291
|
+
if ( lang_code == NULL ) return false;
|
292
|
+
|
293
|
+
for ( int i = 0 ; i < kNumLanguages ; i++ ) {
|
294
|
+
const LanguageInfo& info = kLanguageInfoTable[i];
|
295
|
+
if ((info.language_code_639_1_ &&
|
296
|
+
!base::strcasecmp(lang_code, info.language_code_639_1_)) ||
|
297
|
+
(info.language_code_639_2_ &&
|
298
|
+
!base::strcasecmp(lang_code, info.language_code_639_2_)) ||
|
299
|
+
(info.language_code_other_ &&
|
300
|
+
!base::strcasecmp(lang_code, info.language_code_other_))) {
|
301
|
+
*language = static_cast<Language>(i);
|
302
|
+
return true;
|
303
|
+
}
|
304
|
+
}
|
305
|
+
|
306
|
+
// For convenience, this function can also parse the non-standard
|
307
|
+
// five-letter language codes "zh-cn" and "zh-tw" which are used by
|
308
|
+
// front-ends such as GWS to distinguish Simplified from Traditional
|
309
|
+
// Chinese.
|
310
|
+
if (!base::strcasecmp(lang_code, "zh-cn") ||
|
311
|
+
!base::strcasecmp(lang_code, "zh_cn")) {
|
312
|
+
*language = CHINESE;
|
313
|
+
return true;
|
314
|
+
}
|
315
|
+
if (!base::strcasecmp(lang_code, "zh-tw") ||
|
316
|
+
!base::strcasecmp(lang_code, "zh_tw")) {
|
317
|
+
*language = CHINESE_T;
|
318
|
+
return true;
|
319
|
+
}
|
320
|
+
if (!base::strcasecmp(lang_code, "sr-me") ||
|
321
|
+
!base::strcasecmp(lang_code, "sr_me")) {
|
322
|
+
*language = MONTENEGRIN;
|
323
|
+
return true;
|
324
|
+
}
|
325
|
+
|
326
|
+
// Process language-code synonyms.
|
327
|
+
if (!base::strcasecmp(lang_code, "he")) {
|
328
|
+
*language = HEBREW; // Use "iw".
|
329
|
+
return true;
|
330
|
+
}
|
331
|
+
if (!base::strcasecmp(lang_code, "in")) {
|
332
|
+
*language = INDONESIAN; // Use "id".
|
333
|
+
return true;
|
334
|
+
}
|
335
|
+
if (!base::strcasecmp(lang_code, "ji")) {
|
336
|
+
*language = YIDDISH; // Use "yi".
|
337
|
+
return true;
|
338
|
+
}
|
339
|
+
|
340
|
+
// Process language-detection synonyms.
|
341
|
+
// These distinct languages cannot be differentiated by our current
|
342
|
+
// language-detection algorithms.
|
343
|
+
if (!base::strcasecmp(lang_code, "fil")) {
|
344
|
+
*language = TAGALOG;
|
345
|
+
return true;
|
346
|
+
}
|
347
|
+
|
348
|
+
return false;
|
349
|
+
}
|