compact_enc_det 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33)
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,349 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include "util/languages/languages.h"
18
+
19
+ #include "util/basictypes.h"
20
+ #include "util/string_util.h"
21
+
22
+
23
+ Language default_language() {return ENGLISH;}
24
+
25
+
26
+ // Language names and codes
27
+
28
// One row of language metadata: a name plus up to three codes.
// A code that a language does not have is stored as NULL.
struct LanguageInfo {
  const char* language_name_;        // name of the language
  const char* language_code_639_1_;  // ISO-639-1 code, if any
  const char* language_code_639_2_;  // ISO-639-2 code, if any
  const char* language_code_other_;  // nonstandard code, if any
};
34
+
35
+ static const LanguageInfo kLanguageInfoTable[] = { // Positional table: entry i describes static_cast<Language>(i), so row order must not change.
36
+ { "ENGLISH", "en", "eng", NULL},
37
+ { "DANISH", "da", "dan", NULL},
38
+ { "DUTCH", "nl", "dut", NULL},
39
+ { "FINNISH", "fi", "fin", NULL},
40
+ { "FRENCH", "fr", "fre", NULL},
41
+ { "GERMAN", "de", "ger", NULL},
42
+ { "HEBREW", "he", "heb", NULL},
43
+ { "ITALIAN", "it", "ita", NULL},
44
+ { "Japanese", "ja", "jpn", NULL},
45
+ { "Korean", "ko", "kor", NULL},
46
+ { "NORWEGIAN", "nb", "nor", NULL},
47
+ { "POLISH", "pl", "pol", NULL},
48
+ { "PORTUGUESE", "pt", "por", NULL},
49
+ { "RUSSIAN", "ru", "rus", NULL},
50
+ { "SPANISH", "es", "spa", NULL},
51
+ { "SWEDISH", "sv", "swe", NULL},
52
+ { "Chinese", "zh", "chi", "zh-CN"},
53
+ { "CZECH", "cs", "cze", NULL},
54
+ { "GREEK", "el", "gre", NULL},
55
+ { "ICELANDIC", "is", "ice", NULL},
56
+ { "LATVIAN", "lv", "lav", NULL},
57
+ { "LITHUANIAN", "lt", "lit", NULL},
58
+ { "ROMANIAN", "ro", "rum", NULL},
59
+ { "HUNGARIAN", "hu", "hun", NULL},
60
+ { "ESTONIAN", "et", "est", NULL},
61
+ // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
62
+ // and "Unknown", they are essentially the same. Need to unify them.
63
+ // The codes "ut" and "un" below were invented by us; they are not from ISO-639.
64
+ //
65
+ { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
66
+ { "Unknown", NULL, NULL, "un"},
67
+ { "BULGARIAN", "bg", "bul", NULL},
68
+ { "CROATIAN", "hr", "scr", NULL},
69
+ { "SERBIAN", "sr", "scc", NULL},
70
+ { "IRISH", "ga", "gle", NULL},
71
+ { "GALICIAN", "gl", "glg", NULL},
72
+ // Impossible to tell Tagalog from Filipino at the moment, so the
73
+ // TAGALOG entry carries the ISO 639-2 code for Filipino ("fil").
74
+ { "TAGALOG", NULL, "fil", NULL},
75
+ { "TURKISH", "tr", "tur", NULL},
76
+ { "UKRAINIAN", "uk", "ukr", NULL},
77
+ { "HINDI", "hi", "hin", NULL},
78
+ { "MACEDONIAN", "mk", "mac", NULL},
79
+ { "BENGALI", "bn", "ben", NULL},
80
+ { "INDONESIAN", "id", "ind", NULL},
81
+ { "LATIN", "la", "lat", NULL},
82
+ { "MALAY", "ms", "may", NULL},
83
+ { "MALAYALAM", "ml", "mal", NULL},
84
+ { "WELSH", "cy", "wel", NULL},
85
+ { "NEPALI", "ne", "nep", NULL},
86
+ { "TELUGU", "te", "tel", NULL},
87
+ { "ALBANIAN", "sq", "alb", NULL},
88
+ { "TAMIL", "ta", "tam", NULL},
89
+ { "BELARUSIAN", "be", "bel", NULL},
90
+ { "JAVANESE", "jw", "jav", NULL},
91
+ { "OCCITAN", "oc", "oci", NULL},
92
+ { "URDU", "ur", "urd", NULL},
93
+ { "BIHARI", "bh", "bih", NULL},
94
+ { "GUJARATI", "gu", "guj", NULL},
95
+ { "THAI", "th", "tha", NULL},
96
+ { "ARABIC", "ar", "ara", NULL},
97
+ { "CATALAN", "ca", "cat", NULL},
98
+ { "ESPERANTO", "eo", "epo", NULL},
99
+ { "BASQUE", "eu", "baq", NULL},
100
+ { "INTERLINGUA", "ia", "ina", NULL},
101
+ { "KANNADA", "kn", "kan", NULL},
102
+ { "PUNJABI", "pa", "pan", NULL},
103
+ { "SCOTS_GAELIC", "gd", "gla", NULL},
104
+ { "SWAHILI", "sw", "swa", NULL},
105
+ { "SLOVENIAN", "sl", "slv", NULL},
106
+ { "MARATHI", "mr", "mar", NULL},
107
+ { "MALTESE", "mt", "mlt", NULL},
108
+ { "VIETNAMESE", "vi", "vie", NULL},
109
+ { "FRISIAN", "fy", "fry", NULL},
110
+ { "SLOVAK", "sk", "slo", NULL},
111
+ { "ChineseT",
112
+ NULL, NULL, // Both ISO fields are intentionally NULL to avoid
113
+ // confusion between CHINESE_T and CHINESE.
114
+ "zh-TW"},
115
+ { "FAROESE", "fo", "fao", NULL},
116
+ { "SUNDANESE", "su", "sun", NULL},
117
+ { "UZBEK", "uz", "uzb", NULL},
118
+ { "AMHARIC", "am", "amh", NULL},
119
+ { "AZERBAIJANI", "az", "aze", NULL},
120
+ { "GEORGIAN", "ka", "geo", NULL},
121
+ { "TIGRINYA", "ti", "tir", NULL},
122
+ { "PERSIAN", "fa", "per", NULL},
123
+ { "BOSNIAN", "bs", "bos", NULL},
124
+ { "SINHALESE", "si", "sin", NULL},
125
+ { "NORWEGIAN_N", "nn", "nno", NULL},
126
+ { "PORTUGUESE_P", NULL, NULL, "pt-PT"},
127
+ { "PORTUGUESE_B", NULL, NULL, "pt-BR"},
128
+ { "XHOSA", "xh", "xho", NULL},
129
+ { "ZULU", "zu", "zul", NULL},
130
+ { "GUARANI", "gn", "grn", NULL},
131
+ { "SESOTHO", "st", "sot", NULL},
132
+ { "TURKMEN", "tk", "tuk", NULL},
133
+ { "KYRGYZ", "ky", "kir", NULL},
134
+ { "BRETON", "br", "bre", NULL},
135
+ { "TWI", "tw", "twi", NULL},
136
+ { "YIDDISH", "yi", "yid", NULL},
137
+ { "SERBO_CROATIAN", "sh", NULL, NULL},
138
+ { "SOMALI", "so", "som", NULL},
139
+ { "UIGHUR", "ug", "uig", NULL},
140
+ { "KURDISH", "ku", "kur", NULL},
141
+ { "MONGOLIAN", "mn", "mon", NULL},
142
+ { "ARMENIAN", "hy", "arm", NULL},
143
+ { "LAOTHIAN", "lo", "lao", NULL},
144
+ { "SINDHI", "sd", "snd", NULL},
145
+ { "RHAETO_ROMANCE", "rm", "roh", NULL},
146
+ { "AFRIKAANS", "af", "afr", NULL},
147
+ { "LUXEMBOURGISH", "lb", "ltz", NULL},
148
+ { "BURMESE", "my", "bur", NULL},
149
+ // KHMER is known as Cambodian for Google user interfaces.
150
+ { "KHMER", "km", "khm", NULL},
151
+ { "TIBETAN", "bo", "tib", NULL},
152
+ { "DHIVEHI", "dv", "div", NULL},
153
+ { "CHEROKEE", NULL, "chr", NULL},
154
+ { "SYRIAC", NULL, "syr", NULL},
155
+ { "LIMBU", NULL, NULL, "sit-NP"},
156
+ { "ORIYA", "or", "ori", NULL},
157
+ { "ASSAMESE", "as", "asm", NULL},
158
+ { "CORSICAN", "co", "cos", NULL},
159
+ { "INTERLINGUE", "ie", "ine", NULL}, // NOTE(review): ISO 639-2 appears to list "ile" for Interlingue ("ine" is the Indo-European collective code) - confirm before changing.
160
+ { "KAZAKH", "kk", "kaz", NULL},
161
+ { "LINGALA", "ln", "lin", NULL},
162
+ { "MOLDAVIAN", "mo", "mol", NULL},
163
+ { "PASHTO", "ps", "pus", NULL},
164
+ { "QUECHUA", "qu", "que", NULL},
165
+ { "SHONA", "sn", "sna", NULL},
166
+ { "TAJIK", "tg", "tgk", NULL},
167
+ { "TATAR", "tt", "tat", NULL},
168
+ { "TONGA", "to", "tog", NULL},
169
+ { "YORUBA", "yo", "yor", NULL},
170
+ { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
171
+ { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL},
172
+ { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
173
+ { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
174
+ { "MAORI", "mi", "mao", NULL},
175
+ { "WOLOF", "wo", "wol", NULL},
176
+ { "ABKHAZIAN", "ab", "abk", NULL},
177
+ { "AFAR", "aa", "aar", NULL},
178
+ { "AYMARA", "ay", "aym", NULL},
179
+ { "BASHKIR", "ba", "bak", NULL},
180
+ { "BISLAMA", "bi", "bis", NULL},
181
+ { "DZONGKHA", "dz", "dzo", NULL},
182
+ { "FIJIAN", "fj", "fij", NULL},
183
+ { "GREENLANDIC", "kl", "kal", NULL},
184
+ { "HAUSA", "ha", "hau", NULL},
185
+ { "HAITIAN_CREOLE", "ht", NULL, NULL},
186
+ { "INUPIAK", "ik", "ipk", NULL},
187
+ { "INUKTITUT", "iu", "iku", NULL},
188
+ { "KASHMIRI", "ks", "kas", NULL},
189
+ { "KINYARWANDA", "rw", "kin", NULL},
190
+ { "MALAGASY", "mg", "mlg", NULL},
191
+ { "NAURU", "na", "nau", NULL},
192
+ { "OROMO", "om", "orm", NULL},
193
+ { "RUNDI", "rn", "run", NULL},
194
+ { "SAMOAN", "sm", "smo", NULL},
195
+ { "SANGO", "sg", "sag", NULL},
196
+ { "SANSKRIT", "sa", "san", NULL},
197
+ { "SISWANT", "ss", "ssw", NULL},
198
+ { "TSONGA", "ts", "tso", NULL},
199
+ { "TSWANA", "tn", "tsn", NULL},
200
+ { "VOLAPUK", "vo", "vol", NULL},
201
+ { "ZHUANG", "za", "zha", NULL},
202
+ { "KHASI", NULL, "kha", NULL},
203
+ { "SCOTS", NULL, "sco", NULL},
204
+ { "GANDA", "lg", "lug", NULL},
205
+ { "MANX", "gv", "glv", NULL},
206
+ { "MONTENEGRIN", NULL, NULL, "sr-ME"},
207
+ { "XX", NULL, NULL, "XX"}, // NOTE(review): presumably the extra row behind the "+ 1" in the table-length assert - confirm against the Language enum.
208
+ };
209
+
210
+ COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,
211
+ kLanguageInfoTable_has_incorrect_length); // Compile-time check that the table has exactly NUM_LANGUAGES + 1 rows.
212
+
213
+
214
+ // LANGUAGE NAMES
215
+
216
+ const char* default_language_name() {
217
+ return kLanguageInfoTable[ENGLISH].language_name_;
218
+ }
219
+
220
// Name handed out for any Language value that fails IsValidLanguage().
static const char* const kInvalidLanguageName = "invalid_language";

// Accessor for the placeholder name used for invalid languages.
const char* invalid_language_name() {
  return kInvalidLanguageName;
}
225
+
226
+ const char* LanguageName(Language lang) {
227
+ return IsValidLanguage(lang)
228
+ ? kLanguageInfoTable[lang].language_name_
229
+ : kInvalidLanguageName;
230
+ }
231
+
232
+
233
+
234
+ // LANGUAGE CODES
235
+
236
+
237
// Placeholder code returned when a language has no usable code.  The
// leading space is deliberate: it keeps the placeholder from matching any
// two-letter language code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Accessor for the placeholder code used for invalid languages.
const char* invalid_language_code() {
  return kInvalidLanguageCode;
}
245
+
246
+ const char * LanguageCode(Language lang) {
247
+ if (! IsValidLanguage(lang))
248
+ return kInvalidLanguageCode;
249
+ const LanguageInfo& info = kLanguageInfoTable[lang];
250
+ if (info.language_code_639_1_) {
251
+ return info.language_code_639_1_;
252
+ } else if (info.language_code_639_2_) {
253
+ return info.language_code_639_2_;
254
+ } else if (info.language_code_other_) {
255
+ return info.language_code_other_;
256
+ } else {
257
+ return kInvalidLanguageCode;
258
+ }
259
+ }
260
+
261
+ const char* default_language_code() {
262
+ return kLanguageInfoTable[ENGLISH].language_code_639_1_;
263
+ }
264
+
265
+ const char* LanguageCodeISO639_1(Language lang) {
266
+ if (! IsValidLanguage(lang))
267
+ return kInvalidLanguageCode;
268
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_1_)
269
+ return code;
270
+ return kInvalidLanguageCode;
271
+ }
272
+
273
+ const char* LanguageCodeISO639_2(Language lang) {
274
+ if (! IsValidLanguage(lang))
275
+ return kInvalidLanguageCode;
276
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_2_)
277
+ return code;
278
+ return kInvalidLanguageCode;
279
+ }
280
+
281
+ const char* LanguageCodeWithDialects(Language lang) {
282
+ if (lang == CHINESE)
283
+ return "zh-CN";
284
+ return LanguageCode(lang);
285
+ }
286
+
287
+
288
+
289
+ bool LanguageFromCode(const char* lang_code, Language *language) {
290
+ *language = UNKNOWN_LANGUAGE;
291
+ if ( lang_code == NULL ) return false;
292
+
293
+ for ( int i = 0 ; i < kNumLanguages ; i++ ) {
294
+ const LanguageInfo& info = kLanguageInfoTable[i];
295
+ if ((info.language_code_639_1_ &&
296
+ !base::strcasecmp(lang_code, info.language_code_639_1_)) ||
297
+ (info.language_code_639_2_ &&
298
+ !base::strcasecmp(lang_code, info.language_code_639_2_)) ||
299
+ (info.language_code_other_ &&
300
+ !base::strcasecmp(lang_code, info.language_code_other_))) {
301
+ *language = static_cast<Language>(i);
302
+ return true;
303
+ }
304
+ }
305
+
306
+ // For convenience, this function can also parse the non-standard
307
+ // five-letter language codes "zh-cn" and "zh-tw" which are used by
308
+ // front-ends such as GWS to distinguish Simplified from Traditional
309
+ // Chinese.
310
+ if (!base::strcasecmp(lang_code, "zh-cn") ||
311
+ !base::strcasecmp(lang_code, "zh_cn")) {
312
+ *language = CHINESE;
313
+ return true;
314
+ }
315
+ if (!base::strcasecmp(lang_code, "zh-tw") ||
316
+ !base::strcasecmp(lang_code, "zh_tw")) {
317
+ *language = CHINESE_T;
318
+ return true;
319
+ }
320
+ if (!base::strcasecmp(lang_code, "sr-me") ||
321
+ !base::strcasecmp(lang_code, "sr_me")) {
322
+ *language = MONTENEGRIN;
323
+ return true;
324
+ }
325
+
326
+ // Process language-code synonyms.
327
+ if (!base::strcasecmp(lang_code, "he")) {
328
+ *language = HEBREW; // Use "iw".
329
+ return true;
330
+ }
331
+ if (!base::strcasecmp(lang_code, "in")) {
332
+ *language = INDONESIAN; // Use "id".
333
+ return true;
334
+ }
335
+ if (!base::strcasecmp(lang_code, "ji")) {
336
+ *language = YIDDISH; // Use "yi".
337
+ return true;
338
+ }
339
+
340
+ // Process language-detection synonyms.
341
+ // These distinct languages cannot be differentiated by our current
342
+ // language-detection algorithms.
343
+ if (!base::strcasecmp(lang_code, "fil")) {
344
+ *language = TAGALOG;
345
+ return true;
346
+ }
347
+
348
+ return false;
349
+ }