language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,545 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // This file extends lang_enc.cc with additional languages and extended routines
6
+ // It is current with Unicode 5.1 (beta Jan 2008)
7
+ //
8
+
9
+ #include <stdlib.h>
10
+ #include <stdio.h>
11
+ #include <string.h>
12
+
13
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
14
+ #include "encodings/compact_lang_det/win/cld_macros.h"
15
+ #include "encodings/compact_lang_det/win/cld_strtoint.h"
16
+
17
+ // Language names above NUM_LANGUAGES
18
+ // These are also the C enum declared names
19
+ static const char* const kExtLanguageName[] = {
20
+ "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
21
+
22
+ // Pseudo-languages for Unicode scripts that express a single language
23
+ "X_OGHAM", "X_RUNIC", "X_YI", "X_OLD_ITALIC", "X_GOTHIC",
24
+ "X_DESERET", "X_HANUNOO", "X_BUHID", "X_TAGBANWA", "X_TAI_LE",
25
+ "X_LINEAR_B", "X_UGARITIC", "X_SHAVIAN", "X_OSMANYA", "X_CYPRIOT",
26
+ "X_BUGINESE", "X_COPTIC", "X_NEW_TAI_LUE", "X_GLAGOLITIC", "X_TIFINAGH",
27
+ "X_SYLOTI_NAGRI", "X_OLD_PERSIAN", "X_KHAROSHTHI", "X_BALINESE", "X_CUNEIFORM",
28
+ "X_PHOENICIAN", "X_PHAGS_PA", "X_NKO",
29
+
30
+ // Unicode 5.1
31
+ "X_SUDANESE", "X_LEPCHA", "X_OL_CHIKI", "X_VAI", "X_SAURASHTRA",
32
+ "X_KAYAH_LI", "X_REJANG", "X_LYCIAN", "X_CARIAN", "X_LYDIAN",
33
+ "X_CHAM",
34
+ };
35
+
36
+
37
+ // These are the C enum declared names, for programs creating C code
38
+ static const char* const kExtLangDeclaredName[] = {
39
+ "ENGLISH", /* 0 */
40
+ "DANISH", /* 1 */
41
+ "DUTCH", /* 2 */
42
+ "FINNISH", /* 3 */
43
+ "FRENCH", /* 4 */
44
+ "GERMAN", /* 5 */
45
+ "HEBREW", /* 6 */
46
+ "ITALIAN", /* 7 */
47
+ "JAPANESE", /* 8 */
48
+ "KOREAN", /* 9 */
49
+ "NORWEGIAN", /* 10 */
50
+ "POLISH", /* 11 */
51
+ "PORTUGUESE", /* 12 */
52
+ "RUSSIAN", /* 13 */
53
+ "SPANISH", /* 14 */
54
+ "SWEDISH", /* 15 */
55
+ "CHINESE", /* 16 */
56
+ "CZECH", /* 17 */
57
+ "GREEK", /* 18 */
58
+ "ICELANDIC", /* 19 */
59
+ "LATVIAN", /* 20 */
60
+ "LITHUANIAN", /* 21 */
61
+ "ROMANIAN", /* 22 */
62
+ "HUNGARIAN", /* 23 */
63
+ "ESTONIAN", /* 24 */
64
+ "TG_UNKNOWN_LANGUAGE", /* 25 */
65
+ "UNKNOWN_LANGUAGE", /* 26 */
66
+ "BULGARIAN", /* 27 */
67
+ "CROATIAN", /* 28 */
68
+ "SERBIAN", /* 29 */
69
+ "IRISH", /* 30 */
70
+ "GALICIAN", /* 31 */
71
+ "TAGALOG", /* 32 */
72
+ "TURKISH", /* 33 */
73
+ "UKRAINIAN", /* 34 */
74
+ "HINDI", /* 35 */
75
+ "MACEDONIAN", /* 36 */
76
+ "BENGALI", /* 37 */
77
+ "INDONESIAN", /* 38 */
78
+ "LATIN", /* 39 */
79
+ "MALAY", /* 40 */
80
+ "MALAYALAM", /* 41 */
81
+ "WELSH", /* 42 */
82
+ "NEPALI", /* 43 */
83
+ "TELUGU", /* 44 */
84
+ "ALBANIAN", /* 45 */
85
+ "TAMIL", /* 46 */
86
+ "BELARUSIAN", /* 47 */
87
+ "JAVANESE", /* 48 */
88
+ "OCCITAN", /* 49 */
89
+ "URDU", /* 50 */
90
+ "BIHARI", /* 51 */
91
+ "GUJARATI", /* 52 */
92
+ "THAI", /* 53 */
93
+ "ARABIC", /* 54 */
94
+ "CATALAN", /* 55 */
95
+ "ESPERANTO", /* 56 */
96
+ "BASQUE", /* 57 */
97
+ "INTERLINGUA", /* 58 */
98
+ "KANNADA", /* 59 */
99
+ "PUNJABI", /* 60 */
100
+ "SCOTS_GAELIC", /* 61 */
101
+ "SWAHILI", /* 62 */
102
+ "SLOVENIAN", /* 63 */
103
+ "MARATHI", /* 64 */
104
+ "MALTESE", /* 65 */
105
+ "VIETNAMESE", /* 66 */
106
+ "FRISIAN", /* 67 */
107
+ "SLOVAK", /* 68 */
108
+ "CHINESE_T", /* 69 */
109
+ "FAROESE", /* 70 */
110
+ "SUNDANESE", /* 71 */
111
+ "UZBEK", /* 72 */
112
+ "AMHARIC", /* 73 */
113
+ "AZERBAIJANI", /* 74 */
114
+ "GEORGIAN", /* 75 */
115
+ "TIGRINYA", /* 76 */
116
+ "PERSIAN", /* 77 */
117
+ "BOSNIAN", /* 78 */
118
+ "SINHALESE", /* 79 */
119
+ "NORWEGIAN_N", /* 80 */
120
+ "PORTUGUESE_P", /* 81 */
121
+ "PORTUGUESE_B", /* 82 */
122
+ "XHOSA", /* 83 */
123
+ "ZULU", /* 84 */
124
+ "GUARANI", /* 85 */
125
+ "SESOTHO", /* 86 */
126
+ "TURKMEN", /* 87 */
127
+ "KYRGYZ", /* 88 */
128
+ "BRETON", /* 89 */
129
+ "TWI", /* 90 */
130
+ "YIDDISH", /* 91 */
131
+ "SERBO_CROATIAN", /* 92 */
132
+ "SOMALI", /* 93 */
133
+ "UIGHUR", /* 94 */
134
+ "KURDISH", /* 95 */
135
+ "MONGOLIAN", /* 96 */
136
+ "ARMENIAN", /* 97 */
137
+ "LAOTHIAN", /* 98 */
138
+ "SINDHI", /* 99 */
139
+ "RHAETO_ROMANCE", /* 100 */
140
+ "AFRIKAANS", /* 101 */
141
+ "LUXEMBOURGISH", /* 102 */
142
+ "BURMESE", /* 103 */
143
+ "KHMER", /* 104 */
144
+ "TIBETAN", /* 105 */
145
+ "DHIVEHI", /* 106 */ // sometimes spelled Divehi; lang of Maldives
146
+ "CHEROKEE", /* 107 */
147
+ "SYRIAC", /* 108 */
148
+ "LIMBU", /* 109 */
149
+ "ORIYA", /* 110 */
150
+ "ASSAMESE", /* 111 */
151
+ "CORSICAN", /* 112 */
152
+ "INTERLINGUE", /* 113 */
153
+ "KAZAKH", /* 114 */
154
+ "LINGALA", /* 115 */
155
+ "MOLDAVIAN", /* 116 */
156
+ "PASHTO", /* 117 */
157
+ "QUECHUA", /* 118 */
158
+ "SHONA", /* 119 */
159
+ "TAJIK", /* 120 */
160
+ "TATAR", /* 121 */
161
+ "TONGA", /* 122 */
162
+ "YORUBA", /* 123 */
163
+ "CREOLES_AND_PIDGINS_ENGLISH_BASED", /* 124 */
164
+ "CREOLES_AND_PIDGINS_FRENCH_BASED", /* 125 */
165
+ "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", /* 126 */
166
+ "CREOLES_AND_PIDGINS_OTHER", /* 127 */
167
+ "MAORI", /* 128 */
168
+ "WOLOF", /* 129 */
169
+ "ABKHAZIAN", /* 130 */
170
+ "AFAR", /* 131 */
171
+ "AYMARA", /* 132 */
172
+ "BASHKIR", /* 133 */
173
+ "BISLAMA", /* 134 */
174
+ "DZONGKHA", /* 135 */
175
+ "FIJIAN", /* 136 */
176
+ "GREENLANDIC", /* 137 */
177
+ "HAUSA", /* 138 */
178
+ "HAITIAN_CREOLE", /* 139 */
179
+ "INUPIAK", /* 140 */
180
+ "INUKTITUT", /* 141 */
181
+ "KASHMIRI", /* 142 */
182
+ "KINYARWANDA", /* 143 */
183
+ "MALAGASY", /* 144 */
184
+ "NAURU", /* 145 */
185
+ "OROMO", /* 146 */
186
+ "RUNDI", /* 147 */
187
+ "SAMOAN", /* 148 */
188
+ "SANGO", /* 149 */
189
+ "SANSKRIT", /* 150 */
190
+ "SISWANT", /* 151 */
191
+ "TSONGA", /* 152 */
192
+ "TSWANA", /* 153 */
193
+ "VOLAPUK", /* 154 */
194
+ "ZHUANG", /* 155 */
195
+ "KHASI", /* 156 */
196
+ "SCOTS", /* 157 */
197
+ "GANDA", /* 158 */
198
+ "MANX", /* 159 */
199
+ "MONTENEGRIN", /* 160 */
200
+ // Add new language declared names just before here
201
+ };
202
+
203
+ COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
204
+ kExtLangDeclaredName_has_incorrect_length);
205
+
206
+
207
+ // Language codes above NUM_LANGUAGES
208
+ // I made all these up, except Klingon from ISO-639-2 (dsites)
209
+ // NOTE: zza is a standard name
210
+ static const char* const kExtLanguageCode[] = {
211
+ // "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
212
+ // All Latin script
213
+ "zzb", "zzp", "zzh", "tlh", "zze",
214
+
215
+ // Pseudo-languages for Unicode scripts that express a single language
216
+ "xx-Ogam", "xx-Runr", "xx-Yiii", "xx-Ital", "xx-Goth",
217
+ "xx-Dsrt", "xx-Hano", "xx-Buhd", "xx-Tagb", "xx-Tale",
218
+ "xx-Linb", "xx-Ugar", "xx-Shaw", "xx-Osma", "xx-Cprt",
219
+ "xx-Bugi", "xx-Copt", "xx-Talu", "xx-Glag", "xx-Tfng",
220
+ "xx-Sylo", "xx-Xpeo", "xx-Khar", "xx-Bali", "xx-Xsux",
221
+ "xx-Phnx", "xx-Phag", "xx-Nkoo",
222
+
223
+ // Unicode 5.1
224
+ "xx-Sund", "xx-Lepc", "xx-Olck", "xx-Vaii", "xx-Saur",
225
+ "xx-Kali", "xx-Rjng", "xx-Lyci", "xx-Cari", "xx-Lydi",
226
+ "xx-Cham",
227
+ };
228
+
229
+
230
+ // Given the Language, returns its string name used as the output by
231
+ // the lang/enc identifier, e.g. "Korean"
232
+ // "invalid_language" if the input is invalid.
233
+ // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
234
+ // used to subtract out HTML, link farms, DNA strings, and alittle English porn
235
+ const char* ExtLanguageName(const Language lang) {
236
+ if (lang < 0) {
237
+ // No-text-at-all result from a Tote
238
+ return "";
239
+ }
240
+ // CompactLanguageDetect extension
241
+ if (lang == TG_UNKNOWN_LANGUAGE) {
242
+ return "Ignore";
243
+ }
244
+ if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
245
+ return LanguageName(lang);
246
+ }
247
+ if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
248
+ return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
249
+ }
250
+ return invalid_language_name();
251
+ }
252
+
253
+
254
+ // Given the Language, returns its Language enum spelling, for use by
255
+ // programs that create C declarations, e.g. "KOREAN"
256
+ // "UNKNOWN_LANGUAGE" if the input is invalid.
257
+ const char* ExtLanguageDeclaredName(const Language lang) {
258
+ if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
259
+ return kExtLangDeclaredName[lang];
260
+ }
261
+ if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
262
+ return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
263
+ }
264
+ return "UNKNOWN_LANGUAGE";
265
+ }
266
+
267
+ // Given the Language, return the language code, e.g. "ko"
268
+ const char* ExtLanguageCode(const Language lang) {
269
+ // Hack for ignore/porn pseudo-language
270
+ if (lang == TG_UNKNOWN_LANGUAGE) {
271
+ return "xxx";
272
+ }
273
+ if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
274
+ return LanguageCode(lang);
275
+ }
276
+ if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
277
+ return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
278
+ }
279
+ return "??";
280
+ }
281
+
282
+
283
+ // Convert "en-Latn-GB" to ENGLISH
284
+ // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
285
+ // Consider for later: NORWEGIAN, NORWEGIAN_N
286
+ // Consider for later: SCOTS, SCOTS_GAELIC
287
+ // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
288
+ //
289
+ Language GetLanguageFromNumberOrName(const char* src) {
290
+ if (strspn(src, "0123456789") == strlen(src)) {
291
+ // All digits
292
+ return static_cast<Language>(strto32(src, NULL, 10));
293
+ }
294
+
295
+ Language retlang = UNKNOWN_LANGUAGE;
296
+ size_t len = strlen(src);
297
+
298
+ if (true /*FLAGS_mergepairs*/) {
299
+ // Merge sets of langauges pt-xx en-xx fr-xx, NOT bs/hr/sr
300
+ if (memcmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
301
+ if (memcmp(src, "en-", 3) == 0) {return ENGLISH;}
302
+ if (memcmp(src, "fr-", 3) == 0) {return FRENCH;}
303
+ // Use NormalizeLanguage instead
304
+ if (memcmp(src, "bs-", 3) == 0) {return CROATIAN;}
305
+ if (memcmp(src, "hr-", 3) == 0) {return CROATIAN;}
306
+ if (memcmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
307
+ if (memcmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
308
+ if (memcmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
309
+ if (memcmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}
310
+ }
311
+
312
+ // Extensions
313
+ if (len >= 3) {
314
+ // Standin for ignore/porn "language"
315
+ if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
316
+
317
+ if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
318
+ if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
319
+ if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
320
+ if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
321
+ if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
322
+ }
323
+
324
+ // We have a name like en-Latn-GB or pt-BR
325
+ // First, get rid of some special cases
326
+ if (len <= 3) {
327
+ LanguageFromCode(src, &retlang);
328
+ } else if (len == 7) {
329
+ // More Extensions
330
+ if (memcmp(src, "xx-", 3) == 0) {
331
+ if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
332
+ if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
333
+ if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
334
+ if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
335
+ if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
336
+ if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
337
+ if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
338
+ if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
339
+ if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
340
+ if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
341
+ if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
342
+ if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
343
+ if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
344
+ if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
345
+ if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
346
+ if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
347
+ if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
348
+ if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
349
+ if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
350
+ if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
351
+ if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
352
+ if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
353
+ if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
354
+ if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
355
+ if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
356
+ if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
357
+ if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
358
+ if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
359
+
360
+ // Unicode 5.1
361
+ if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
362
+ if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
363
+ if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
364
+ if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
365
+ if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
366
+ if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
367
+ if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
368
+ if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
369
+ if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
370
+ if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
371
+ if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
372
+ }
373
+ }
374
+ // Some other weird ones
375
+ // Could be Latn or Limb; all our current training data is Latn
376
+ if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
377
+ if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}
378
+
379
+ // Multi-country langauges
380
+ if (memcmp(src, "zh", 2) == 0) {
381
+ if (memcmp(&src[len - 2], "TW", 2) == 0) {return CHINESE_T;}
382
+ if (memcmp(&src[len - 2], "HK", 2) == 0) {return CHINESE_T;}
383
+ return CHINESE;
384
+ }
385
+ if (memcmp(src, "pt", 2) == 0) {
386
+ if (memcmp(&src[len - 2], "BR", 2) == 0) {return PORTUGUESE;}
387
+ return PORTUGUESE;
388
+ }
389
+ if (memcmp(src, "fr", 2) == 0) {
390
+ if (memcmp(&src[len -2], "CA", 2) == 0) {return FRENCH;}
391
+ return FRENCH;
392
+ }
393
+
394
+ // None of the special cases matched
395
+ if (src[2] == '-') {
396
+ char temp[4];
397
+ memcpy(temp, src, 4);
398
+ temp[2] = '\0';
399
+ LanguageFromCode(temp, &retlang);
400
+ }
401
+ if (src[3] == '-') {
402
+ char temp[4];
403
+ memcpy(temp, src, 4);
404
+ temp[3] = '\0';
405
+ LanguageFromCode(temp, &retlang);
406
+ }
407
+ if (retlang != UNKNOWN_LANGUAGE) {
408
+ return retlang;
409
+ }
410
+
411
+ return retlang;
412
+ }
413
+
414
+ typedef struct {
415
+ const char* name;
416
+ UnicodeLScript lscript;
417
+ } NameScriptPair;
418
+
419
+ // In alphabetic order for binary search
420
+ static const NameScriptPair kNameScriptPair[] = {
421
+ // Unicode 5.1 additional scripts
422
+ {"Arab", ULScript_Arabic},
423
+ {"Armn", ULScript_Armenian},
424
+ {"Bali", ULScript_Balinese},
425
+ {"Beng", ULScript_Bengali},
426
+ {"Bugi", ULScript_Buginese},
427
+ {"Buhd", ULScript_Buhid},
428
+ {"Cans", ULScript_Canadian_Aboriginal},
429
+ {"Cari", ULScript_Carian}, // Unicode 5.1
430
+ {"Cham", ULScript_Cham}, // Unicode 5.1
431
+ {"Cher", ULScript_Cherokee},
432
+ {"Copt", ULScript_Coptic},
433
+ {"Cprt", ULScript_Cypriot},
434
+ {"Cyrl", ULScript_Cyrillic},
435
+ {"Deva", ULScript_Devanagari},
436
+ {"Dsrt", ULScript_Deseret},
437
+ {"Ethi", ULScript_Ethiopic},
438
+ {"Geor", ULScript_Georgian},
439
+ {"Glag", ULScript_Glagolitic},
440
+ {"Goth", ULScript_Gothic},
441
+ {"Grek", ULScript_Greek},
442
+ {"Gujr", ULScript_Gujarati},
443
+ {"Guru", ULScript_Gurmukhi},
444
+ {"Hani", ULScript_HanCJK},
445
+ {"Hano", ULScript_Hanunoo},
446
+ {"Hebr", ULScript_Hebrew},
447
+ {"Ital", ULScript_Old_Italic},
448
+ {"Kali", ULScript_Kayah_Li}, // Unicode 5.1
449
+ {"Khar", ULScript_Kharoshthi},
450
+ {"Khmr", ULScript_Khmer},
451
+ {"Knda", ULScript_Kannada},
452
+ {"Laoo", ULScript_Lao},
453
+ {"Latn", ULScript_Latin},
454
+ {"Lepc", ULScript_Lepcha}, // Unicode 5.1
455
+ {"Limb", ULScript_Limbu},
456
+ {"Linb", ULScript_Linear_B},
457
+ {"Lyci", ULScript_Lycian}, // Unicode 5.1
458
+ {"Lydi", ULScript_Lydian}, // Unicode 5.1
459
+ {"Mlym", ULScript_Malayalam},
460
+ {"Mong", ULScript_Mongolian},
461
+ {"Mymr", ULScript_Myanmar},
462
+ {"Nkoo", ULScript_Nko},
463
+ {"Ogam", ULScript_Ogham},
464
+ {"Olck", ULScript_Ol_Chiki}, // Unicode 5.1
465
+ {"Orya", ULScript_Oriya},
466
+ {"Osma", ULScript_Osmanya},
467
+ {"Phag", ULScript_Phags_Pa},
468
+ {"Phnx", ULScript_Phoenician},
469
+ {"Rjng", ULScript_Rejang}, // Unicode 5.1
470
+ {"Runr", ULScript_Runic},
471
+ {"Saur", ULScript_Saurashtra}, // Unicode 5.1
472
+ {"Shaw", ULScript_Shavian},
473
+ {"Sinh", ULScript_Sinhala},
474
+ {"Sund", ULScript_Sundanese}, // Unicode 5.1
475
+ {"Sylo", ULScript_Syloti_Nagri},
476
+ {"Syrc", ULScript_Syriac},
477
+ {"Tagb", ULScript_Tagbanwa},
478
+ {"Tale", ULScript_Tai_Le},
479
+ {"Talu", ULScript_New_Tai_Lue},
480
+ {"Taml", ULScript_Tamil},
481
+ {"Telu", ULScript_Telugu},
482
+ {"Tfng", ULScript_Tifinagh},
483
+ {"Tglg", ULScript_Tagalog},
484
+ {"Thaa", ULScript_Thaana},
485
+ {"Thai", ULScript_Thai},
486
+ {"Tibt", ULScript_Tibetan},
487
+ {"Ugar", ULScript_Ugaritic},
488
+ {"Vaii", ULScript_Vai}, // Unicode 5.1 // NOTE: apparently 'Vai '
489
+ {"Xpeo", ULScript_Old_Persian},
490
+ {"Xsux", ULScript_Cuneiform},
491
+ {"Yiii", ULScript_Yi},
492
+ {"Zyyy", ULScript_Common},
493
+ {"Zzzz", ULScript_Inherited},
494
+ };
495
+
496
+ // Convert "en-Latn-GB" to ULScript_Latin
497
+ UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
498
+ if (strspn(src, "0123456789") == strlen(src)) {
499
+ // All digits
500
+ return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
501
+ }
502
+
503
+ if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
504
+ if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
505
+ if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
506
+ if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
507
+ // Could be Latn or Limb; all our current training data is Latn
508
+ if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}
509
+
510
+ // Isolate just the script field
511
+ char temp[5];
512
+ const char* src2 = strchr(src, '-');
513
+ if (src2 == NULL) {return ULScript_Latin;}
514
+ src2 += 1; // over the -
515
+ memcpy(temp, src2, 4);
516
+ temp[4] = '\0';
517
+
518
+ int lo = 0;
519
+ int hi = ULScript_NUM_SCRIPTS;
520
+ while (lo < hi) {
521
+ int mid = (lo + hi) >> 1;
522
+ if (strcmp(temp, kNameScriptPair[mid].name) < 0) {
523
+ hi = mid;
524
+ } else if (strcmp(temp, kNameScriptPair[mid].name) > 0) {
525
+ lo = mid + 1;
526
+ } else {
527
+ return kNameScriptPair[mid].lscript;
528
+ }
529
+ }
530
+ return ULScript_Latin;
531
+ }
532
+
533
+
534
+ // Merge together some languages, such as bo/hr/sr
535
+ // Croatian Latin and Serbian Cyrillic now.
536
+ Language NormalizeLanguage(Language lang) {
537
+ if (lang == BOSNIAN) {return CROATIAN;}
538
+ if (lang == SERBO_CROATIAN) {return SERBIAN;}
539
+
540
+ if (lang == PORTUGUESE_P) {return PORTUGUESE;}
541
+ if (lang == PORTUGUESE_B) {return PORTUGUESE;}
542
+
543
+ return lang;
544
+ }
545
+
@@ -0,0 +1,119 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+ //
5
+ // This file extends lang_enc.h with additional languages and extended routines.
6
+ // It is current with Unicode 5.1 (March 2008)
7
+ //
8
+
9
+ #ifndef ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
10
+ #define ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
11
+
12
+ #include "languages/public/languages.h"
13
+ #include "encodings/compact_lang_det/letterscript_enum.h"
14
+
15
+
16
+ // Leave a small gap after the base languages, so adding one or two is easy.
17
+ // Just reduce the gap here (currently 4 entries)
18
+
19
+ // Montenegrin added, so reducing this from 5 to 4. dsites 2008.10.06
20
// First Language value used for the extended (pseudo-)languages.
#define EXT_LANGUAGE_BASE (NUM_LANGUAGES + 4)

// Each X_* value below is fully parenthesized so that the cast cannot combine
// with operators at the point of use (e.g. sizeof or postfix operators).

// Google UI languages
#define X_BORK_BORK_BORK ((Language)(EXT_LANGUAGE_BASE+0))
#define X_PIG_LATIN ((Language)(EXT_LANGUAGE_BASE+1))
#define X_HACKER ((Language)(EXT_LANGUAGE_BASE+2))
#define X_KLINGON ((Language)(EXT_LANGUAGE_BASE+3))
#define X_ELMER_FUDD ((Language)(EXT_LANGUAGE_BASE+4))

// Pseudo-languages for Unicode scripts that express a single language
#define X_OGHAM ((Language)(EXT_LANGUAGE_BASE+5))
#define X_RUNIC ((Language)(EXT_LANGUAGE_BASE+6))
#define X_YI ((Language)(EXT_LANGUAGE_BASE+7))
#define X_OLD_ITALIC ((Language)(EXT_LANGUAGE_BASE+8))
#define X_GOTHIC ((Language)(EXT_LANGUAGE_BASE+9))
#define X_DESERET ((Language)(EXT_LANGUAGE_BASE+10))
#define X_HANUNOO ((Language)(EXT_LANGUAGE_BASE+11))
#define X_BUHID ((Language)(EXT_LANGUAGE_BASE+12))
#define X_TAGBANWA ((Language)(EXT_LANGUAGE_BASE+13))
#define X_TAI_LE ((Language)(EXT_LANGUAGE_BASE+14))
#define X_LINEAR_B ((Language)(EXT_LANGUAGE_BASE+15))
#define X_UGARITIC ((Language)(EXT_LANGUAGE_BASE+16))
#define X_SHAVIAN ((Language)(EXT_LANGUAGE_BASE+17))
#define X_OSMANYA ((Language)(EXT_LANGUAGE_BASE+18))
#define X_CYPRIOT ((Language)(EXT_LANGUAGE_BASE+19))
#define X_BUGINESE ((Language)(EXT_LANGUAGE_BASE+20))
#define X_COPTIC ((Language)(EXT_LANGUAGE_BASE+21))
#define X_NEW_TAI_LUE ((Language)(EXT_LANGUAGE_BASE+22))
#define X_GLAGOLITIC ((Language)(EXT_LANGUAGE_BASE+23))
#define X_TIFINAGH ((Language)(EXT_LANGUAGE_BASE+24))
#define X_SYLOTI_NAGRI ((Language)(EXT_LANGUAGE_BASE+25))
#define X_OLD_PERSIAN ((Language)(EXT_LANGUAGE_BASE+26))
#define X_KHAROSHTHI ((Language)(EXT_LANGUAGE_BASE+27))
#define X_BALINESE ((Language)(EXT_LANGUAGE_BASE+28))
#define X_CUNEIFORM ((Language)(EXT_LANGUAGE_BASE+29))
#define X_PHOENICIAN ((Language)(EXT_LANGUAGE_BASE+30))
#define X_PHAGS_PA ((Language)(EXT_LANGUAGE_BASE+31))
#define X_NKO ((Language)(EXT_LANGUAGE_BASE+32))

// Unicode 5.1
#define X_SUDANESE ((Language)(EXT_LANGUAGE_BASE+33))
#define X_LEPCHA ((Language)(EXT_LANGUAGE_BASE+34))
#define X_OL_CHIKI ((Language)(EXT_LANGUAGE_BASE+35))
#define X_VAI ((Language)(EXT_LANGUAGE_BASE+36))
#define X_SAURASHTRA ((Language)(EXT_LANGUAGE_BASE+37))
#define X_KAYAH_LI ((Language)(EXT_LANGUAGE_BASE+38))
#define X_REJANG ((Language)(EXT_LANGUAGE_BASE+39))
#define X_LYCIAN ((Language)(EXT_LANGUAGE_BASE+40))
#define X_CARIAN ((Language)(EXT_LANGUAGE_BASE+41))
#define X_LYDIAN ((Language)(EXT_LANGUAGE_BASE+42))
#define X_CHAM ((Language)(EXT_LANGUAGE_BASE+43))

// One past the last extended language.
#define EXT_NUM_LANGUAGES ((Language)(EXT_LANGUAGE_BASE+44))
73
+
74
+
75
+
76
+ // ExtLanguageName
77
+ // ------------
78
+ // Given the Language, returns its string name used as the output by
79
+ // the lang/enc identifier, e.g. "Korean"
80
+ // "invalid_language" if the input is invalid.
81
+ extern const char* ExtLanguageName(const Language lang);
82
+
83
+ // ExtLanguageDeclaredName
84
+ // ------------
85
+ // Given the Language, returns its Language enum spelling, for use by
86
+ // programs that create C declarations, e.g. "KOREAN"
87
+ // "UNKNOWN_LANGUAGE" if the input is invalid.
88
+ extern const char* ExtLanguageDeclaredName(const Language lang);
89
+
90
+ // ExtLanguageCode
91
+ // ------------
92
+ // Given the Language, return the language code, e.g. "ko"
93
+ // This is determined by
94
+ // the following (in order of preference):
95
+ // - ISO-639-1 two-letter language code
96
+ // (all except those mentioned below)
97
+ // - ISO-639-2 three-letter bibliographic language code
98
+ // (Tibetan, Dhivehi, Cherokee, Syriac)
99
+ // - Google-specific language code
100
+ // (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
101
+ // Portuguese-Portugal, Portuguese-Brazil, Limbu)
102
+ extern const char * ExtLanguageCode(const Language lang);
103
+
104
+
105
+ // Convert "en-Latn-GB" to ENGLISH
106
+ // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
107
+ // Consider for later: NORWEGIAN, NORWEGIAN_N
108
+ // Consider for later: SCOTS, SCOTS_GAELIC
109
+ // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
110
+ //
111
+ Language GetLanguageFromNumberOrName(const char* src);
112
+
113
+ // Convert "en-Latn-GB" to ULScript_Latin
114
+ UnicodeLScript GetLScriptFromNumberOrName(const char* src);
115
+
116
+ // Merge together some languages, such as bs/hr/sr
117
+ Language NormalizeLanguage(Language lang);
118
+
119
+ #endif // ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__