compact_enc_det 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,349 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include "util/languages/languages.h"
18
+
19
+ #include "util/basictypes.h"
20
+ #include "util/string_util.h"
21
+
22
+
23
+ Language default_language() {return ENGLISH;}
24
+
25
+
26
+ // Language names and codes
27
+
28
// One row of the language table: a display name plus up to three codes.
// Any of the code fields may be NULL when no such code is recorded.
struct LanguageInfo {
  const char * language_name_;
  const char * language_code_639_1_;   // the ISO-639-1 code for the language
  const char * language_code_639_2_;   // the ISO-639-2 code for the language
  const char * language_code_other_;   // some nonstandard code for the language
};

// Name/code table indexed by Language enum value (the accessors in this file
// look rows up as kLanguageInfoTable[lang]), so row order must never change.
// The length is checked at compile time against NUM_LANGUAGES + 1.
static const LanguageInfo kLanguageInfoTable[] = {
  { "ENGLISH",    "en", "eng", NULL},
  { "DANISH",     "da", "dan", NULL},
  { "DUTCH",      "nl", "dut", NULL},
  { "FINNISH",    "fi", "fin", NULL},
  { "FRENCH",     "fr", "fre", NULL},
  { "GERMAN",     "de", "ger", NULL},
  { "HEBREW",     "he", "heb", NULL},
  { "ITALIAN",    "it", "ita", NULL},
  { "Japanese",   "ja", "jpn", NULL},
  { "Korean",     "ko", "kor", NULL},
  { "NORWEGIAN",  "nb", "nor", NULL},
  { "POLISH",     "pl", "pol", NULL},
  { "PORTUGUESE", "pt", "por", NULL},
  { "RUSSIAN",    "ru", "rus", NULL},
  { "SPANISH",    "es", "spa", NULL},
  { "SWEDISH",    "sv", "swe", NULL},
  { "Chinese",    "zh", "chi", "zh-CN"},
  { "CZECH",      "cs", "cze", NULL},
  { "GREEK",      "el", "gre", NULL},
  { "ICELANDIC",  "is", "ice", NULL},
  { "LATVIAN",    "lv", "lav", NULL},
  { "LITHUANIAN", "lt", "lit", NULL},
  { "ROMANIAN",   "ro", "rum", NULL},
  { "HUNGARIAN",  "hu", "hun", NULL},
  { "ESTONIAN",   "et", "est", NULL},
  // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
  // and "Unknown", they are essentially the same. Need to unify them.
  // "un" and "ut" are invented by us, not from ISO-639.
  //
  { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
  { "Unknown",    NULL, NULL,  "un"},
  { "BULGARIAN",  "bg", "bul", NULL},
  { "CROATIAN",   "hr", "scr", NULL},
  { "SERBIAN",    "sr", "scc", NULL},
  { "IRISH",      "ga", "gle", NULL},
  { "GALICIAN",   "gl", "glg", NULL},
  // Impossible to tell Tagalog from Filipino at the moment.
  // Use ISO 639-2 code for Filipino here.
  { "TAGALOG",    NULL, "fil", NULL},
  { "TURKISH",    "tr", "tur", NULL},
  { "UKRAINIAN",  "uk", "ukr", NULL},
  { "HINDI",      "hi", "hin", NULL},
  { "MACEDONIAN", "mk", "mac", NULL},
  { "BENGALI",    "bn", "ben", NULL},
  { "INDONESIAN", "id", "ind", NULL},
  { "LATIN",      "la", "lat", NULL},
  { "MALAY",      "ms", "may", NULL},
  { "MALAYALAM",  "ml", "mal", NULL},
  { "WELSH",      "cy", "wel", NULL},
  { "NEPALI",     "ne", "nep", NULL},
  { "TELUGU",     "te", "tel", NULL},
  { "ALBANIAN",   "sq", "alb", NULL},
  { "TAMIL",      "ta", "tam", NULL},
  { "BELARUSIAN", "be", "bel", NULL},
  { "JAVANESE",   "jw", "jav", NULL},
  { "OCCITAN",    "oc", "oci", NULL},
  { "URDU",       "ur", "urd", NULL},
  { "BIHARI",     "bh", "bih", NULL},
  { "GUJARATI",   "gu", "guj", NULL},
  { "THAI",       "th", "tha", NULL},
  { "ARABIC",     "ar", "ara", NULL},
  { "CATALAN",    "ca", "cat", NULL},
  { "ESPERANTO",  "eo", "epo", NULL},
  { "BASQUE",     "eu", "baq", NULL},
  { "INTERLINGUA", "ia", "ina", NULL},
  { "KANNADA",    "kn", "kan", NULL},
  { "PUNJABI",    "pa", "pan", NULL},
  { "SCOTS_GAELIC", "gd", "gla", NULL},
  { "SWAHILI",    "sw", "swa", NULL},
  { "SLOVENIAN",  "sl", "slv", NULL},
  { "MARATHI",    "mr", "mar", NULL},
  { "MALTESE",    "mt", "mlt", NULL},
  { "VIETNAMESE", "vi", "vie", NULL},
  { "FRISIAN",    "fy", "fry", NULL},
  { "SLOVAK",     "sk", "slo", NULL},
  { "ChineseT",
    NULL, NULL,  // We intentionally set these 2 fields to NULL to avoid
                 // confusion between CHINESE_T and CHINESE.
    "zh-TW"},
  { "FAROESE",    "fo", "fao", NULL},
  { "SUNDANESE",  "su", "sun", NULL},
  { "UZBEK",      "uz", "uzb", NULL},
  { "AMHARIC",    "am", "amh", NULL},
  { "AZERBAIJANI", "az", "aze", NULL},
  { "GEORGIAN",   "ka", "geo", NULL},
  { "TIGRINYA",   "ti", "tir", NULL},
  { "PERSIAN",    "fa", "per", NULL},
  { "BOSNIAN",    "bs", "bos", NULL},
  { "SINHALESE",  "si", "sin", NULL},
  { "NORWEGIAN_N", "nn", "nno", NULL},
  { "PORTUGUESE_P", NULL, NULL, "pt-PT"},
  { "PORTUGUESE_B", NULL, NULL, "pt-BR"},
  { "XHOSA",      "xh", "xho", NULL},
  { "ZULU",       "zu", "zul", NULL},
  { "GUARANI",    "gn", "grn", NULL},
  { "SESOTHO",    "st", "sot", NULL},
  { "TURKMEN",    "tk", "tuk", NULL},
  { "KYRGYZ",     "ky", "kir", NULL},
  { "BRETON",     "br", "bre", NULL},
  { "TWI",        "tw", "twi", NULL},
  { "YIDDISH",    "yi", "yid", NULL},
  { "SERBO_CROATIAN", "sh", NULL, NULL},
  { "SOMALI",     "so", "som", NULL},
  { "UIGHUR",     "ug", "uig", NULL},
  { "KURDISH",    "ku", "kur", NULL},
  { "MONGOLIAN",  "mn", "mon", NULL},
  { "ARMENIAN",   "hy", "arm", NULL},
  { "LAOTHIAN",   "lo", "lao", NULL},
  { "SINDHI",     "sd", "snd", NULL},
  { "RHAETO_ROMANCE", "rm", "roh", NULL},
  { "AFRIKAANS",  "af", "afr", NULL},
  { "LUXEMBOURGISH", "lb", "ltz", NULL},
  { "BURMESE",    "my", "bur", NULL},
  // KHMER is known as Cambodian for Google user interfaces.
  { "KHMER",      "km", "khm", NULL},
  { "TIBETAN",    "bo", "tib", NULL},
  { "DHIVEHI",    "dv", "div", NULL},
  { "CHEROKEE",   NULL, "chr", NULL},
  { "SYRIAC",     NULL, "syr", NULL},
  { "LIMBU",      NULL, NULL,  "sit-NP"},
  { "ORIYA",      "or", "ori", NULL},
  { "ASSAMESE",   "as", "asm", NULL},
  { "CORSICAN",   "co", "cos", NULL},
  // ISO 639-2 assigns "ile" to Interlingue; "ine" is the collective code
  // for Indo-European languages and does not correspond to "ie".
  { "INTERLINGUE", "ie", "ile", NULL},
  { "KAZAKH",     "kk", "kaz", NULL},
  { "LINGALA",    "ln", "lin", NULL},
  { "MOLDAVIAN",  "mo", "mol", NULL},
  { "PASHTO",     "ps", "pus", NULL},
  { "QUECHUA",    "qu", "que", NULL},
  { "SHONA",      "sn", "sna", NULL},
  { "TAJIK",      "tg", "tgk", NULL},
  { "TATAR",      "tt", "tat", NULL},
  // ISO 639-1 "to" is Tonga (Tonga Islands), whose ISO 639-2 code is "ton";
  // "tog" belongs to the unrelated Tonga (Nyasa).
  { "TONGA",      "to", "ton", NULL},
  { "YORUBA",     "yo", "yor", NULL},
  { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
  { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL},
  { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
  { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
  { "MAORI",      "mi", "mao", NULL},
  { "WOLOF",      "wo", "wol", NULL},
  { "ABKHAZIAN",  "ab", "abk", NULL},
  { "AFAR",       "aa", "aar", NULL},
  { "AYMARA",     "ay", "aym", NULL},
  { "BASHKIR",    "ba", "bak", NULL},
  { "BISLAMA",    "bi", "bis", NULL},
  { "DZONGKHA",   "dz", "dzo", NULL},
  { "FIJIAN",     "fj", "fij", NULL},
  { "GREENLANDIC", "kl", "kal", NULL},
  { "HAUSA",      "ha", "hau", NULL},
  { "HAITIAN_CREOLE", "ht", NULL, NULL},
  { "INUPIAK",    "ik", "ipk", NULL},
  { "INUKTITUT",  "iu", "iku", NULL},
  { "KASHMIRI",   "ks", "kas", NULL},
  { "KINYARWANDA", "rw", "kin", NULL},
  { "MALAGASY",   "mg", "mlg", NULL},
  { "NAURU",      "na", "nau", NULL},
  { "OROMO",      "om", "orm", NULL},
  { "RUNDI",      "rn", "run", NULL},
  { "SAMOAN",     "sm", "smo", NULL},
  { "SANGO",      "sg", "sag", NULL},
  { "SANSKRIT",   "sa", "san", NULL},
  { "SISWANT",    "ss", "ssw", NULL},
  { "TSONGA",     "ts", "tso", NULL},
  { "TSWANA",     "tn", "tsn", NULL},
  { "VOLAPUK",    "vo", "vol", NULL},
  { "ZHUANG",     "za", "zha", NULL},
  { "KHASI",      NULL, "kha", NULL},
  { "SCOTS",      NULL, "sco", NULL},
  { "GANDA",      "lg", "lug", NULL},
  { "MANX",       "gv", "glv", NULL},
  { "MONTENEGRIN", NULL, NULL, "sr-ME"},
  { "XX",         NULL, NULL,  "XX"},
};
209
+
210
+ COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,
211
+ kLanguageInfoTable_has_incorrect_length);
212
+
213
+
214
+ // LANGUAGE NAMES
215
+
216
+ const char* default_language_name() {
217
+ return kLanguageInfoTable[ENGLISH].language_name_;
218
+ }
219
+
220
// Placeholder name returned by LanguageName() when the Language value does
// not pass IsValidLanguage().
static const char* const kInvalidLanguageName = "invalid_language";

// Gives callers access to the placeholder so they can compare against it.
const char* invalid_language_name() {
  return kInvalidLanguageName;
}
225
+
226
+ const char* LanguageName(Language lang) {
227
+ return IsValidLanguage(lang)
228
+ ? kLanguageInfoTable[lang].language_name_
229
+ : kInvalidLanguageName;
230
+ }
231
+
232
+
233
+
234
+ // LANGUAGE CODES
235
+
236
+
237
// Placeholder code returned when no real code is available. The leading
// space is deliberate: it keeps the placeholder from ever matching a
// two-letter language code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Gives callers access to the placeholder so they can compare against it.
const char* invalid_language_code() {
  return kInvalidLanguageCode;
}
245
+
246
+ const char * LanguageCode(Language lang) {
247
+ if (! IsValidLanguage(lang))
248
+ return kInvalidLanguageCode;
249
+ const LanguageInfo& info = kLanguageInfoTable[lang];
250
+ if (info.language_code_639_1_) {
251
+ return info.language_code_639_1_;
252
+ } else if (info.language_code_639_2_) {
253
+ return info.language_code_639_2_;
254
+ } else if (info.language_code_other_) {
255
+ return info.language_code_other_;
256
+ } else {
257
+ return kInvalidLanguageCode;
258
+ }
259
+ }
260
+
261
+ const char* default_language_code() {
262
+ return kLanguageInfoTable[ENGLISH].language_code_639_1_;
263
+ }
264
+
265
+ const char* LanguageCodeISO639_1(Language lang) {
266
+ if (! IsValidLanguage(lang))
267
+ return kInvalidLanguageCode;
268
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_1_)
269
+ return code;
270
+ return kInvalidLanguageCode;
271
+ }
272
+
273
+ const char* LanguageCodeISO639_2(Language lang) {
274
+ if (! IsValidLanguage(lang))
275
+ return kInvalidLanguageCode;
276
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_2_)
277
+ return code;
278
+ return kInvalidLanguageCode;
279
+ }
280
+
281
+ const char* LanguageCodeWithDialects(Language lang) {
282
+ if (lang == CHINESE)
283
+ return "zh-CN";
284
+ return LanguageCode(lang);
285
+ }
286
+
287
+
288
+
289
+ bool LanguageFromCode(const char* lang_code, Language *language) {
290
+ *language = UNKNOWN_LANGUAGE;
291
+ if ( lang_code == NULL ) return false;
292
+
293
+ for ( int i = 0 ; i < kNumLanguages ; i++ ) {
294
+ const LanguageInfo& info = kLanguageInfoTable[i];
295
+ if ((info.language_code_639_1_ &&
296
+ !base::strcasecmp(lang_code, info.language_code_639_1_)) ||
297
+ (info.language_code_639_2_ &&
298
+ !base::strcasecmp(lang_code, info.language_code_639_2_)) ||
299
+ (info.language_code_other_ &&
300
+ !base::strcasecmp(lang_code, info.language_code_other_))) {
301
+ *language = static_cast<Language>(i);
302
+ return true;
303
+ }
304
+ }
305
+
306
+ // For convenience, this function can also parse the non-standard
307
+ // five-letter language codes "zh-cn" and "zh-tw" which are used by
308
+ // front-ends such as GWS to distinguish Simplified from Traditional
309
+ // Chinese.
310
+ if (!base::strcasecmp(lang_code, "zh-cn") ||
311
+ !base::strcasecmp(lang_code, "zh_cn")) {
312
+ *language = CHINESE;
313
+ return true;
314
+ }
315
+ if (!base::strcasecmp(lang_code, "zh-tw") ||
316
+ !base::strcasecmp(lang_code, "zh_tw")) {
317
+ *language = CHINESE_T;
318
+ return true;
319
+ }
320
+ if (!base::strcasecmp(lang_code, "sr-me") ||
321
+ !base::strcasecmp(lang_code, "sr_me")) {
322
+ *language = MONTENEGRIN;
323
+ return true;
324
+ }
325
+
326
+ // Process language-code synonyms.
327
+ if (!base::strcasecmp(lang_code, "he")) {
328
+ *language = HEBREW; // Use "iw".
329
+ return true;
330
+ }
331
+ if (!base::strcasecmp(lang_code, "in")) {
332
+ *language = INDONESIAN; // Use "id".
333
+ return true;
334
+ }
335
+ if (!base::strcasecmp(lang_code, "ji")) {
336
+ *language = YIDDISH; // Use "yi".
337
+ return true;
338
+ }
339
+
340
+ // Process language-detection synonyms.
341
+ // These distinct languages cannot be differentiated by our current
342
+ // language-detection algorithms.
343
+ if (!base::strcasecmp(lang_code, "fil")) {
344
+ *language = TAGALOG;
345
+ return true;
346
+ }
347
+
348
+ return false;
349
+ }