cld-fixed 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +27 -0
  6. data/README.md +34 -0
  7. data/Rakefile +5 -0
  8. data/cld.gemspec +22 -0
  9. data/ext/cld/Makefile.am +28 -0
  10. data/ext/cld/Makefile.in +790 -0
  11. data/ext/cld/aclocal.m4 +8895 -0
  12. data/ext/cld/base/basictypes.h +348 -0
  13. data/ext/cld/base/build_config.h +115 -0
  14. data/ext/cld/base/casts.h +156 -0
  15. data/ext/cld/base/commandlineflags.h +443 -0
  16. data/ext/cld/base/crash.h +41 -0
  17. data/ext/cld/base/dynamic_annotations.h +358 -0
  18. data/ext/cld/base/global_strip_options.h +59 -0
  19. data/ext/cld/base/log_severity.h +46 -0
  20. data/ext/cld/base/logging.h +1403 -0
  21. data/ext/cld/base/macros.h +243 -0
  22. data/ext/cld/base/port.h +54 -0
  23. data/ext/cld/base/scoped_ptr.h +428 -0
  24. data/ext/cld/base/stl_decl.h +0 -0
  25. data/ext/cld/base/stl_decl_msvc.h +107 -0
  26. data/ext/cld/base/string_util.h +29 -0
  27. data/ext/cld/base/strtoint.h +93 -0
  28. data/ext/cld/base/template_util.h +96 -0
  29. data/ext/cld/base/type_traits.h +198 -0
  30. data/ext/cld/base/vlog_is_on.h +143 -0
  31. data/ext/cld/build_aux/config.guess +1500 -0
  32. data/ext/cld/build_aux/config.sub +1616 -0
  33. data/ext/cld/build_aux/depcomp +584 -0
  34. data/ext/cld/build_aux/install-sh +507 -0
  35. data/ext/cld/build_aux/ltmain.sh +8745 -0
  36. data/ext/cld/build_aux/missing +367 -0
  37. data/ext/cld/cld_encodings.h +95 -0
  38. data/ext/cld/configure +17362 -0
  39. data/ext/cld/configure.ac +14 -0
  40. data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
  41. data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
  42. data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  43. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  44. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  45. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  46. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  47. data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
  48. data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
  49. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  50. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  51. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  52. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  53. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  54. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  55. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  56. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  57. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  58. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  59. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  60. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  61. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  62. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  63. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  64. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  65. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  66. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  67. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  68. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  69. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  70. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  71. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  72. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  73. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  74. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  75. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  76. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  77. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  78. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  79. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  80. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  81. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  82. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  83. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  84. data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  85. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  86. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  87. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  88. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  89. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  90. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  91. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  92. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  93. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  94. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  95. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  96. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  97. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  98. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  99. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  100. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  101. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  102. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  103. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  104. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  105. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  106. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  107. data/ext/cld/encodings/internal/encodings.cc +12 -0
  108. data/ext/cld/encodings/lang_enc.h +254 -0
  109. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  110. data/ext/cld/encodings/public/encodings.h +301 -0
  111. data/ext/cld/extconf.rb +7 -0
  112. data/ext/cld/languages/internal/#languages.cc# +337 -0
  113. data/ext/cld/languages/internal/languages.cc +336 -0
  114. data/ext/cld/languages/proto/languages.pb.h +179 -0
  115. data/ext/cld/languages/public/languages.h +379 -0
  116. data/ext/cld/thunk.cc +55 -0
  117. data/lib/cld.rb +21 -0
  118. data/lib/cld/version.rb +3 -0
  119. data/spec/cld_spec.rb +67 -0
  120. data/spec/spec_helper.rb +6 -0
  121. metadata +193 -0
@@ -0,0 +1,336 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "languages/public/languages.h"
6
+
7
+ #include "base/string_util.h"
8
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
9
+
10
+
11
+ Language default_language() {return ENGLISH;}
12
+
13
+
14
+ // Language names and codes
15
+
16
// One row of the language table: a display name plus up to three codes.
// Any of the code pointers may be NULL when the language has no such code.
struct LanguageInfo {
  // Name of the language as reported to callers.
  const char* language_name_;
  // The ISO-639-1 code for the language.
  const char* language_code_639_1_;
  // The ISO-639-2 code for the language.
  const char* language_code_639_2_;
  // Some nonstandard code for the language.
  const char* language_code_other_;
};
22
+
23
+ static const LanguageInfo kLanguageInfoTable[] = {
24
+ { "ENGLISH", "en", "eng", NULL},
25
+ { "DANISH", "da", "dan", NULL},
26
+ { "DUTCH", "nl", "dut", NULL},
27
+ { "FINNISH", "fi", "fin", NULL},
28
+ { "FRENCH", "fr", "fre", NULL},
29
+ { "GERMAN", "de", "ger", NULL},
30
+ { "HEBREW", "he", "heb", NULL},
31
+ { "ITALIAN", "it", "ita", NULL},
32
+ { "Japanese", "ja", "jpn", NULL},
33
+ { "Korean", "ko", "kor", NULL},
34
+ { "NORWEGIAN", "nb", "nor", NULL},
35
+ { "POLISH", "pl", "pol", NULL},
36
+ { "PORTUGUESE", "pt", "por", NULL},
37
+ { "RUSSIAN", "ru", "rus", NULL},
38
+ { "SPANISH", "es", "spa", NULL},
39
+ { "SWEDISH", "sv", "swe", NULL},
40
+ { "Chinese", "zh", "chi", "zh-CN"},
41
+ { "CZECH", "cs", "cze", NULL},
42
+ { "GREEK", "el", "gre", NULL},
43
+ { "ICELANDIC", "is", "ice", NULL},
44
+ { "LATVIAN", "lv", "lav", NULL},
45
+ { "LITHUANIAN", "lt", "lit", NULL},
46
+ { "ROMANIAN", "ro", "rum", NULL},
47
+ { "HUNGARIAN", "hu", "hun", NULL},
48
+ { "ESTONIAN", "et", "est", NULL},
49
+ // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
50
+ // and "Unknown", they are essentially the same. Need to unify them.
51
+ // "un" and "ut" are invented by us, not from ISO-639.
52
+ //
53
+ { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
54
+ { "Unknown", NULL, NULL, "un"},
55
+ { "BULGARIAN", "bg", "bul", NULL},
56
+ { "CROATIAN", "hr", "scr", NULL},
57
+ { "SERBIAN", "sr", "scc", NULL},
58
+ { "IRISH", "ga", "gle", NULL},
59
+ { "GALICIAN", "gl", "glg", NULL},
60
+ // Impossible to tell Tagalog from Filipino at the moment.
61
+ { "TAGALOG", "tl", "tgl", NULL},
62
+ { "TURKISH", "tr", "tur", NULL},
63
+ { "UKRAINIAN", "uk", "ukr", NULL},
64
+ { "HINDI", "hi", "hin", NULL},
65
+ { "MACEDONIAN", "mk", "mac", NULL},
66
+ { "BENGALI", "bn", "ben", NULL},
67
+ { "INDONESIAN", "id", "ind", NULL},
68
+ { "LATIN", "la", "lat", NULL},
69
+ { "MALAY", "ms", "may", NULL},
70
+ { "MALAYALAM", "ml", "mal", NULL},
71
+ { "WELSH", "cy", "wel", NULL},
72
+ { "NEPALI", "ne", "nep", NULL},
73
+ { "TELUGU", "te", "tel", NULL},
74
+ { "ALBANIAN", "sq", "alb", NULL},
75
+ { "TAMIL", "ta", "tam", NULL},
76
+ { "BELARUSIAN", "be", "bel", NULL},
77
+ { "JAVANESE", "jw", "jav", NULL},
78
+ { "OCCITAN", "oc", "oci", NULL},
79
+ { "URDU", "ur", "urd", NULL},
80
+ { "BIHARI", "bh", "bih", NULL},
81
+ { "GUJARATI", "gu", "guj", NULL},
82
+ { "THAI", "th", "tha", NULL},
83
+ { "ARABIC", "ar", "ara", NULL},
84
+ { "CATALAN", "ca", "cat", NULL},
85
+ { "ESPERANTO", "eo", "epo", NULL},
86
+ { "BASQUE", "eu", "baq", NULL},
87
+ { "INTERLINGUA", "ia", "ina", NULL},
88
+ { "KANNADA", "kn", "kan", NULL},
89
+ { "PUNJABI", "pa", "pan", NULL},
90
+ { "SCOTS_GAELIC", "gd", "gla", NULL},
91
+ { "SWAHILI", "sw", "swa", NULL},
92
+ { "SLOVENIAN", "sl", "slv", NULL},
93
+ { "MARATHI", "mr", "mar", NULL},
94
+ { "MALTESE", "mt", "mlt", NULL},
95
+ { "VIETNAMESE", "vi", "vie", NULL},
96
+ { "FRISIAN", "fy", "fry", NULL},
97
+ { "SLOVAK", "sk", "slo", NULL},
98
+ { "ChineseT",
99
+ NULL, NULL, // We intentionally set these 2 fields to NULL to avoid
100
+ // confusion between CHINESE_T and CHINESE.
101
+ "zh-TW"},
102
+ { "FAROESE", "fo", "fao", NULL},
103
+ { "SUNDANESE", "su", "sun", NULL},
104
+ { "UZBEK", "uz", "uzb", NULL},
105
+ { "AMHARIC", "am", "amh", NULL},
106
+ { "AZERBAIJANI", "az", "aze", NULL},
107
+ { "GEORGIAN", "ka", "geo", NULL},
108
+ { "TIGRINYA", "ti", "tir", NULL},
109
+ { "PERSIAN", "fa", "per", NULL},
110
+ { "BOSNIAN", "bs", "bos", NULL},
111
+ { "SINHALESE", "si", "sin", NULL},
112
+ { "NORWEGIAN_N", "nn", "nno", NULL},
113
+ { "PORTUGUESE_P", NULL, NULL, "pt-PT"},
114
+ { "PORTUGUESE_B", NULL, NULL, "pt-BR"},
115
+ { "XHOSA", "xh", "xho", NULL},
116
+ { "ZULU", "zu", "zul", NULL},
117
+ { "GUARANI", "gn", "grn", NULL},
118
+ { "SESOTHO", "st", "sot", NULL},
119
+ { "TURKMEN", "tk", "tuk", NULL},
120
+ { "KYRGYZ", "ky", "kir", NULL},
121
+ { "BRETON", "br", "bre", NULL},
122
+ { "TWI", "tw", "twi", NULL},
123
+ { "YIDDISH", "yi", "yid", NULL},
124
+ { "SERBO_CROATIAN", "sh", NULL, NULL},
125
+ { "SOMALI", "so", "som", NULL},
126
+ { "UIGHUR", "ug", "uig", NULL},
127
+ { "KURDISH", "ku", "kur", NULL},
128
+ { "MONGOLIAN", "mn", "mon", NULL},
129
+ { "ARMENIAN", "hy", "arm", NULL},
130
+ { "LAOTHIAN", "lo", "lao", NULL},
131
+ { "SINDHI", "sd", "snd", NULL},
132
+ { "RHAETO_ROMANCE", "rm", "roh", NULL},
133
+ { "AFRIKAANS", "af", "afr", NULL},
134
+ { "LUXEMBOURGISH", "lb", "ltz", NULL},
135
+ { "BURMESE", "my", "bur", NULL},
136
+ // KHMER is known as Cambodian for Google user interfaces.
137
+ { "KHMER", "km", "khm", NULL},
138
+ { "TIBETAN", "bo", "tib", NULL},
139
+ { "DHIVEHI", "dv", "div", NULL},
140
+ { "CHEROKEE", NULL, "chr", NULL},
141
+ { "SYRIAC", NULL, "syr", NULL},
142
+ { "LIMBU", NULL, NULL, "sit-NP"},
143
+ { "ORIYA", "or", "ori", NULL},
144
+ { "ASSAMESE", "as", "asm", NULL},
145
+ { "CORSICAN", "co", "cos", NULL},
146
+ { "INTERLINGUE", "ie", "ine", NULL},
147
+ { "KAZAKH", "kk", "kaz", NULL},
148
+ { "LINGALA", "ln", "lin", NULL},
149
+ { "MOLDAVIAN", "mo", "mol", NULL},
150
+ { "PASHTO", "ps", "pus", NULL},
151
+ { "QUECHUA", "qu", "que", NULL},
152
+ { "SHONA", "sn", "sna", NULL},
153
+ { "TAJIK", "tg", "tgk", NULL},
154
+ { "TATAR", "tt", "tat", NULL},
155
+ { "TONGA", "to", "tog", NULL},
156
+ { "YORUBA", "yo", "yor", NULL},
157
+ { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
158
+ { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL},
159
+ { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
160
+ { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
161
+ { "MAORI", "mi", "mao", NULL},
162
+ { "WOLOF", "wo", "wol", NULL},
163
+ { "ABKHAZIAN", "ab", "abk", NULL},
164
+ { "AFAR", "aa", "aar", NULL},
165
+ { "AYMARA", "ay", "aym", NULL},
166
+ { "BASHKIR", "ba", "bak", NULL},
167
+ { "BISLAMA", "bi", "bis", NULL},
168
+ { "DZONGKHA", "dz", "dzo", NULL},
169
+ { "FIJIAN", "fj", "fij", NULL},
170
+ { "GREENLANDIC", "kl", "kal", NULL},
171
+ { "HAUSA", "ha", "hau", NULL},
172
+ { "HAITIAN_CREOLE", "ht", NULL, NULL},
173
+ { "INUPIAK", "ik", "ipk", NULL},
174
+ { "INUKTITUT", "iu", "iku", NULL},
175
+ { "KASHMIRI", "ks", "kas", NULL},
176
+ { "KINYARWANDA", "rw", "kin", NULL},
177
+ { "MALAGASY", "mg", "mlg", NULL},
178
+ { "NAURU", "na", "nau", NULL},
179
+ { "OROMO", "om", "orm", NULL},
180
+ { "RUNDI", "rn", "run", NULL},
181
+ { "SAMOAN", "sm", "smo", NULL},
182
+ { "SANGO", "sg", "sag", NULL},
183
+ { "SANSKRIT", "sa", "san", NULL},
184
+ { "SISWANT", "ss", "ssw", NULL},
185
+ { "TSONGA", "ts", "tso", NULL},
186
+ { "TSWANA", "tn", "tsn", NULL},
187
+ { "VOLAPUK", "vo", "vol", NULL},
188
+ { "ZHUANG", "za", "zha", NULL},
189
+ { "KHASI", NULL, "kha", NULL},
190
+ { "SCOTS", NULL, "sco", NULL},
191
+ { "GANDA", "lg", "lug", NULL},
192
+ { "MANX", "gv", "glv", NULL},
193
+ { "MONTENEGRIN", NULL, NULL, "sr-ME"},
194
+ { "XX", NULL, NULL, "XX"},
195
+ };
196
+
197
+ COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,
198
+ kLanguageInfoTable_has_incorrect_length);
199
+
200
+
201
+ // LANGUAGE NAMES
202
+
203
+ const char* default_language_name() {
204
+ return kLanguageInfoTable[ENGLISH].language_name_;
205
+ }
206
+
207
// Placeholder name handed out for Language values that are not valid.
static const char* const kInvalidLanguageName = "invalid_language";

// Accessor for the placeholder name above.
const char* invalid_language_name() {
  return kInvalidLanguageName;
}
212
+
213
+ const char* LanguageName(Language lang) {
214
+ return IsValidLanguage(lang)
215
+ ? kLanguageInfoTable[lang].language_name_
216
+ : kInvalidLanguageName;
217
+ }
218
+
219
+
220
+
221
+ // LANGUAGE CODES
222
+
223
+
224
// Placeholder code handed out when no real code is available. The leading
// space is deliberate: it keeps the placeholder from matching any
// two-letter language code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Accessor for the placeholder code above.
const char* invalid_language_code() {
  return kInvalidLanguageCode;
}
232
+
233
+ const char * LanguageCode(Language lang) {
234
+ if (! IsValidLanguage(lang))
235
+ return kInvalidLanguageCode;
236
+ const LanguageInfo& info = kLanguageInfoTable[lang];
237
+ if (info.language_code_639_1_) {
238
+ return info.language_code_639_1_;
239
+ } else if (info.language_code_639_2_) {
240
+ return info.language_code_639_2_;
241
+ } else if (info.language_code_other_) {
242
+ return info.language_code_other_;
243
+ } else {
244
+ return kInvalidLanguageCode;
245
+ }
246
+ }
247
+
248
+ const char* default_language_code() {
249
+ return kLanguageInfoTable[ENGLISH].language_code_639_1_;
250
+ }
251
+
252
+ const char* LanguageCodeISO639_1(Language lang) {
253
+ if (! IsValidLanguage(lang))
254
+ return kInvalidLanguageCode;
255
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_1_)
256
+ return code;
257
+ return kInvalidLanguageCode;
258
+ }
259
+
260
+ const char* LanguageCodeISO639_2(Language lang) {
261
+ if (! IsValidLanguage(lang))
262
+ return kInvalidLanguageCode;
263
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_2_)
264
+ return code;
265
+ return kInvalidLanguageCode;
266
+ }
267
+
268
+ const char* LanguageCodeWithDialects(Language lang) {
269
+ if (lang == CHINESE)
270
+ return "zh-CN";
271
+ return LanguageCode(lang);
272
+ }
273
+
274
+
275
+
276
+ bool LanguageFromCode(const char* lang_code, Language *language) {
277
+ *language = UNKNOWN_LANGUAGE;
278
+ if ( lang_code == NULL ) return false;
279
+
280
+ for ( int i = 0 ; i < kNumLanguages ; i++ ) {
281
+ const LanguageInfo& info = kLanguageInfoTable[i];
282
+ if ((info.language_code_639_1_ &&
283
+ !base::strcasecmp(lang_code, info.language_code_639_1_)) ||
284
+ (info.language_code_639_2_ &&
285
+ !base::strcasecmp(lang_code, info.language_code_639_2_)) ||
286
+ (info.language_code_other_ &&
287
+ !base::strcasecmp(lang_code, info.language_code_other_))) {
288
+ *language = static_cast<Language>(i);
289
+ return true;
290
+ }
291
+ }
292
+
293
+ // For convenience, this function can also parse the non-standard
294
+ // five-letter language codes "zh-cn" and "zh-tw" which are used by
295
+ // front-ends such as GWS to distinguish Simplified from Traditional
296
+ // Chinese.
297
+ if (!base::strcasecmp(lang_code, "zh-cn") ||
298
+ !base::strcasecmp(lang_code, "zh_cn")) {
299
+ *language = CHINESE;
300
+ return true;
301
+ }
302
+ if (!base::strcasecmp(lang_code, "zh-tw") ||
303
+ !base::strcasecmp(lang_code, "zh_tw")) {
304
+ *language = CHINESE_T;
305
+ return true;
306
+ }
307
+ if (!base::strcasecmp(lang_code, "sr-me") ||
308
+ !base::strcasecmp(lang_code, "sr_me")) {
309
+ *language = MONTENEGRIN;
310
+ return true;
311
+ }
312
+
313
+ // Process language-code synonyms.
314
+ if (!base::strcasecmp(lang_code, "he")) {
315
+ *language = HEBREW; // Use "iw".
316
+ return true;
317
+ }
318
+ if (!base::strcasecmp(lang_code, "in")) {
319
+ *language = INDONESIAN; // Use "id".
320
+ return true;
321
+ }
322
+ if (!base::strcasecmp(lang_code, "ji")) {
323
+ *language = YIDDISH; // Use "yi".
324
+ return true;
325
+ }
326
+
327
+ // Process language-detection synonyms.
328
+ // These distinct languages cannot be differentiated by our current
329
+ // language-detection algorithms.
330
+ if (!base::strcasecmp(lang_code, "fil")) {
331
+ *language = TAGALOG;
332
+ return true;
333
+ }
334
+
335
+ return false;
336
+ }
@@ -0,0 +1,179 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef LANGUAGES_PROTO_LANGUAGES_PB_H_
6
+ #define LANGUAGES_PROTO_LANGUAGES_PB_H_
7
+
8
// Language identifiers. Every enumerator has an explicit numeric value and
// the values run consecutively from 0; NUM_LANGUAGES is the count, not a
// language. The per-language name/code table is indexed by these values, so
// the two must be kept in the same order.
enum Language {
  ENGLISH = 0,
  DANISH = 1,
  DUTCH = 2,
  FINNISH = 3,
  FRENCH = 4,
  GERMAN = 5,
  HEBREW = 6,
  ITALIAN = 7,
  JAPANESE = 8,
  KOREAN = 9,
  NORWEGIAN = 10,
  POLISH = 11,
  PORTUGUESE = 12,
  RUSSIAN = 13,
  SPANISH = 14,
  SWEDISH = 15,
  CHINESE = 16,
  CZECH = 17,
  GREEK = 18,
  ICELANDIC = 19,
  LATVIAN = 20,
  LITHUANIAN = 21,
  ROMANIAN = 22,
  HUNGARIAN = 23,
  ESTONIAN = 24,
  TG_UNKNOWN_LANGUAGE = 25,
  UNKNOWN_LANGUAGE = 26,
  BULGARIAN = 27,
  CROATIAN = 28,
  SERBIAN = 29,
  IRISH = 30,  // UI only.
  GALICIAN = 31,
  TAGALOG = 32,  // Tagalog (tl) + Filipino (fil).
  TURKISH = 33,
  UKRAINIAN = 34,
  HINDI = 35,
  MACEDONIAN = 36,
  BENGALI = 37,
  INDONESIAN = 38,
  LATIN = 39,  // UI only.
  MALAY = 40,
  MALAYALAM = 41,
  WELSH = 42,  // UI only.
  NEPALI = 43,
  TELUGU = 44,
  ALBANIAN = 45,
  TAMIL = 46,
  BELARUSIAN = 47,
  JAVANESE = 48,  // UI only.
  OCCITAN = 49,  // UI only.
  URDU = 50,
  BIHARI = 51,
  GUJARATI = 52,
  THAI = 53,
  ARABIC = 54,
  CATALAN = 55,
  ESPERANTO = 56,
  BASQUE = 57,
  INTERLINGUA = 58,  // UI only.
  KANNADA = 59,
  PUNJABI = 60,
  SCOTS_GAELIC = 61,  // UI only.
  SWAHILI = 62,
  SLOVENIAN = 63,
  MARATHI = 64,
  MALTESE = 65,
  VIETNAMESE = 66,
  FRISIAN = 67,  // UI only.
  SLOVAK = 68,
  // Added to solve the problem of distinguishing Traditional and Simplified
  // Chinese when the encoding is UTF8.
  CHINESE_T = 69,
  FAROESE = 70,  // UI only.
  SUNDANESE = 71,  // UI only.
  UZBEK = 72,
  AMHARIC = 73,
  AZERBAIJANI = 74,
  GEORGIAN = 75,
  TIGRINYA = 76,  // UI only.
  PERSIAN = 77,
  BOSNIAN = 78,  // UI only. LangId language: CROATIAN (28)
  SINHALESE = 79,
  NORWEGIAN_N = 80,  // UI only. LangId language: NORWEGIAN (10)
  PORTUGUESE_P = 81,  // UI only. LangId language: PORTUGUESE (12)
  PORTUGUESE_B = 82,  // UI only. LangId language: PORTUGUESE (12)
  XHOSA = 83,  // UI only.
  ZULU = 84,  // UI only.
  GUARANI = 85,
  SESOTHO = 86,  // UI only.
  TURKMEN = 87,  // UI only.
  KYRGYZ = 88,
  BRETON = 89,  // UI only.
  TWI = 90,  // UI only.
  YIDDISH = 91,  // UI only.
  SERBO_CROATIAN = 92,  // UI only. LangId language: SERBIAN (29)
  SOMALI = 93,  // UI only.
  UIGHUR = 94,
  KURDISH = 95,
  MONGOLIAN = 96,
  ARMENIAN = 97,
  LAOTHIAN = 98,
  SINDHI = 99,
  RHAETO_ROMANCE = 100,  // UI only.
  AFRIKAANS = 101,
  LUXEMBOURGISH = 102,  // UI only.
  BURMESE = 103,
  KHMER = 104,
  TIBETAN = 105,
  DHIVEHI = 106,  // Sometimes spelled Divehi; language of the Maldives.
  CHEROKEE = 107,
  SYRIAC = 108,  // UI only.
  LIMBU = 109,  // UI only.
  ORIYA = 110,
  ASSAMESE = 111,  // UI only.
  CORSICAN = 112,  // UI only.
  INTERLINGUE = 113,  // UI only.
  KAZAKH = 114,
  LINGALA = 115,  // UI only.
  MOLDAVIAN = 116,  // UI only. LangId language: ROMANIAN (22)
  PASHTO = 117,
  QUECHUA = 118,  // UI only.
  SHONA = 119,  // UI only.
  TAJIK = 120,
  TATAR = 121,  // UI only.
  TONGA = 122,  // UI only.
  YORUBA = 123,  // UI only.
  CREOLES_AND_PIDGINS_ENGLISH_BASED = 124,  // UI only.
  CREOLES_AND_PIDGINS_FRENCH_BASED = 125,  // UI only.
  CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126,  // UI only.
  CREOLES_AND_PIDGINS_OTHER = 127,  // UI only.
  MAORI = 128,  // UI only.
  WOLOF = 129,  // UI only.
  ABKHAZIAN = 130,  // UI only.
  AFAR = 131,  // UI only.
  AYMARA = 132,  // UI only.
  BASHKIR = 133,  // UI only.
  BISLAMA = 134,  // UI only.
  DZONGKHA = 135,  // UI only.
  FIJIAN = 136,  // UI only.
  GREENLANDIC = 137,  // UI only.
  HAUSA = 138,  // UI only.
  HAITIAN_CREOLE = 139,  // UI only.
  INUPIAK = 140,  // UI only.
  INUKTITUT = 141,
  KASHMIRI = 142,  // UI only.
  KINYARWANDA = 143,  // UI only.
  MALAGASY = 144,  // UI only.
  NAURU = 145,  // UI only.
  OROMO = 146,  // UI only.
  RUNDI = 147,  // UI only.
  SAMOAN = 148,  // UI only.
  SANGO = 149,  // UI only.
  SANSKRIT = 150,
  SISWANT = 151,  // UI only.
  TSONGA = 152,  // UI only.
  TSWANA = 153,  // UI only.
  VOLAPUK = 154,  // UI only.
  ZHUANG = 155,  // UI only.
  KHASI = 156,  // UI only.
  SCOTS = 157,  // UI only.
  GANDA = 158,  // UI only.
  MANX = 159,  // UI only.
  MONTENEGRIN = 160,  // UI only. LangId language: SERBIAN (29)
  // Always keep this at the end. It is not a valid Language enum; it is
  // only used to indicate the total number of Languages.
  // NOTE: If you add a language, you will break a unittest.
  NUM_LANGUAGES = 161,
};
178
+
179
+ #endif // LANGUAGES_PROTO_LANGUAGES_PB_H_