language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,545 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // This file extends lang_enc.cc with additional languages and extended routines
6
+ // It is current with Unicode 5.1 (beta Jan 2008)
7
+ //
8
+
9
+ #include <stdlib.h>
10
+ #include <stdio.h>
11
+ #include <string.h>
12
+
13
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
14
+ #include "encodings/compact_lang_det/win/cld_macros.h"
15
+ #include "encodings/compact_lang_det/win/cld_strtoint.h"
16
+
17
+ // Language names above NUM_LANGUAGES
18
+ // These are also the C enum declared names
19
+ static const char* const kExtLanguageName[] = {
20
+ "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
21
+
22
+ // Pseudo-languages for Unicode scripts that express a single language
23
+ "X_OGHAM", "X_RUNIC", "X_YI", "X_OLD_ITALIC", "X_GOTHIC",
24
+ "X_DESERET", "X_HANUNOO", "X_BUHID", "X_TAGBANWA", "X_TAI_LE",
25
+ "X_LINEAR_B", "X_UGARITIC", "X_SHAVIAN", "X_OSMANYA", "X_CYPRIOT",
26
+ "X_BUGINESE", "X_COPTIC", "X_NEW_TAI_LUE", "X_GLAGOLITIC", "X_TIFINAGH",
27
+ "X_SYLOTI_NAGRI", "X_OLD_PERSIAN", "X_KHAROSHTHI", "X_BALINESE", "X_CUNEIFORM",
28
+ "X_PHOENICIAN", "X_PHAGS_PA", "X_NKO",
29
+
30
+ // Unicode 5.1
31
+ "X_SUDANESE", "X_LEPCHA", "X_OL_CHIKI", "X_VAI", "X_SAURASHTRA",
32
+ "X_KAYAH_LI", "X_REJANG", "X_LYCIAN", "X_CARIAN", "X_LYDIAN",
33
+ "X_CHAM",
34
+ };
35
+
36
+
37
+ // These are the C enum declared names, for programs creating C code
38
+ static const char* const kExtLangDeclaredName[] = {
39
+ "ENGLISH", /* 0 */
40
+ "DANISH", /* 1 */
41
+ "DUTCH", /* 2 */
42
+ "FINNISH", /* 3 */
43
+ "FRENCH", /* 4 */
44
+ "GERMAN", /* 5 */
45
+ "HEBREW", /* 6 */
46
+ "ITALIAN", /* 7 */
47
+ "JAPANESE", /* 8 */
48
+ "KOREAN", /* 9 */
49
+ "NORWEGIAN", /* 10 */
50
+ "POLISH", /* 11 */
51
+ "PORTUGUESE", /* 12 */
52
+ "RUSSIAN", /* 13 */
53
+ "SPANISH", /* 14 */
54
+ "SWEDISH", /* 15 */
55
+ "CHINESE", /* 16 */
56
+ "CZECH", /* 17 */
57
+ "GREEK", /* 18 */
58
+ "ICELANDIC", /* 19 */
59
+ "LATVIAN", /* 20 */
60
+ "LITHUANIAN", /* 21 */
61
+ "ROMANIAN", /* 22 */
62
+ "HUNGARIAN", /* 23 */
63
+ "ESTONIAN", /* 24 */
64
+ "TG_UNKNOWN_LANGUAGE", /* 25 */
65
+ "UNKNOWN_LANGUAGE", /* 26 */
66
+ "BULGARIAN", /* 27 */
67
+ "CROATIAN", /* 28 */
68
+ "SERBIAN", /* 29 */
69
+ "IRISH", /* 30 */
70
+ "GALICIAN", /* 31 */
71
+ "TAGALOG", /* 32 */
72
+ "TURKISH", /* 33 */
73
+ "UKRAINIAN", /* 34 */
74
+ "HINDI", /* 35 */
75
+ "MACEDONIAN", /* 36 */
76
+ "BENGALI", /* 37 */
77
+ "INDONESIAN", /* 38 */
78
+ "LATIN", /* 39 */
79
+ "MALAY", /* 40 */
80
+ "MALAYALAM", /* 41 */
81
+ "WELSH", /* 42 */
82
+ "NEPALI", /* 43 */
83
+ "TELUGU", /* 44 */
84
+ "ALBANIAN", /* 45 */
85
+ "TAMIL", /* 46 */
86
+ "BELARUSIAN", /* 47 */
87
+ "JAVANESE", /* 48 */
88
+ "OCCITAN", /* 49 */
89
+ "URDU", /* 50 */
90
+ "BIHARI", /* 51 */
91
+ "GUJARATI", /* 52 */
92
+ "THAI", /* 53 */
93
+ "ARABIC", /* 54 */
94
+ "CATALAN", /* 55 */
95
+ "ESPERANTO", /* 56 */
96
+ "BASQUE", /* 57 */
97
+ "INTERLINGUA", /* 58 */
98
+ "KANNADA", /* 59 */
99
+ "PUNJABI", /* 60 */
100
+ "SCOTS_GAELIC", /* 61 */
101
+ "SWAHILI", /* 62 */
102
+ "SLOVENIAN", /* 63 */
103
+ "MARATHI", /* 64 */
104
+ "MALTESE", /* 65 */
105
+ "VIETNAMESE", /* 66 */
106
+ "FRISIAN", /* 67 */
107
+ "SLOVAK", /* 68 */
108
+ "CHINESE_T", /* 69 */
109
+ "FAROESE", /* 70 */
110
+ "SUNDANESE", /* 71 */
111
+ "UZBEK", /* 72 */
112
+ "AMHARIC", /* 73 */
113
+ "AZERBAIJANI", /* 74 */
114
+ "GEORGIAN", /* 75 */
115
+ "TIGRINYA", /* 76 */
116
+ "PERSIAN", /* 77 */
117
+ "BOSNIAN", /* 78 */
118
+ "SINHALESE", /* 79 */
119
+ "NORWEGIAN_N", /* 80 */
120
+ "PORTUGUESE_P", /* 81 */
121
+ "PORTUGUESE_B", /* 82 */
122
+ "XHOSA", /* 83 */
123
+ "ZULU", /* 84 */
124
+ "GUARANI", /* 85 */
125
+ "SESOTHO", /* 86 */
126
+ "TURKMEN", /* 87 */
127
+ "KYRGYZ", /* 88 */
128
+ "BRETON", /* 89 */
129
+ "TWI", /* 90 */
130
+ "YIDDISH", /* 91 */
131
+ "SERBO_CROATIAN", /* 92 */
132
+ "SOMALI", /* 93 */
133
+ "UIGHUR", /* 94 */
134
+ "KURDISH", /* 95 */
135
+ "MONGOLIAN", /* 96 */
136
+ "ARMENIAN", /* 97 */
137
+ "LAOTHIAN", /* 98 */
138
+ "SINDHI", /* 99 */
139
+ "RHAETO_ROMANCE", /* 100 */
140
+ "AFRIKAANS", /* 101 */
141
+ "LUXEMBOURGISH", /* 102 */
142
+ "BURMESE", /* 103 */
143
+ "KHMER", /* 104 */
144
+ "TIBETAN", /* 105 */
145
+ "DHIVEHI", /* 106 */ // sometimes spelled Divehi; lang of Maldives
146
+ "CHEROKEE", /* 107 */
147
+ "SYRIAC", /* 108 */
148
+ "LIMBU", /* 109 */
149
+ "ORIYA", /* 110 */
150
+ "ASSAMESE", /* 111 */
151
+ "CORSICAN", /* 112 */
152
+ "INTERLINGUE", /* 113 */
153
+ "KAZAKH", /* 114 */
154
+ "LINGALA", /* 115 */
155
+ "MOLDAVIAN", /* 116 */
156
+ "PASHTO", /* 117 */
157
+ "QUECHUA", /* 118 */
158
+ "SHONA", /* 119 */
159
+ "TAJIK", /* 120 */
160
+ "TATAR", /* 121 */
161
+ "TONGA", /* 122 */
162
+ "YORUBA", /* 123 */
163
+ "CREOLES_AND_PIDGINS_ENGLISH_BASED", /* 124 */
164
+ "CREOLES_AND_PIDGINS_FRENCH_BASED", /* 125 */
165
+ "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", /* 126 */
166
+ "CREOLES_AND_PIDGINS_OTHER", /* 127 */
167
+ "MAORI", /* 128 */
168
+ "WOLOF", /* 129 */
169
+ "ABKHAZIAN", /* 130 */
170
+ "AFAR", /* 131 */
171
+ "AYMARA", /* 132 */
172
+ "BASHKIR", /* 133 */
173
+ "BISLAMA", /* 134 */
174
+ "DZONGKHA", /* 135 */
175
+ "FIJIAN", /* 136 */
176
+ "GREENLANDIC", /* 137 */
177
+ "HAUSA", /* 138 */
178
+ "HAITIAN_CREOLE", /* 139 */
179
+ "INUPIAK", /* 140 */
180
+ "INUKTITUT", /* 141 */
181
+ "KASHMIRI", /* 142 */
182
+ "KINYARWANDA", /* 143 */
183
+ "MALAGASY", /* 144 */
184
+ "NAURU", /* 145 */
185
+ "OROMO", /* 146 */
186
+ "RUNDI", /* 147 */
187
+ "SAMOAN", /* 148 */
188
+ "SANGO", /* 149 */
189
+ "SANSKRIT", /* 150 */
190
+ "SISWANT", /* 151 */
191
+ "TSONGA", /* 152 */
192
+ "TSWANA", /* 153 */
193
+ "VOLAPUK", /* 154 */
194
+ "ZHUANG", /* 155 */
195
+ "KHASI", /* 156 */
196
+ "SCOTS", /* 157 */
197
+ "GANDA", /* 158 */
198
+ "MANX", /* 159 */
199
+ "MONTENEGRIN", /* 160 */
200
+ // Add new language declared names just before here
201
+ };
202
+
203
+ COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
204
+ kExtLangDeclaredName_has_incorrect_length);
205
+
206
+
207
+ // Language codes above NUM_LANGUAGES
208
+ // I made all these up, except Klingon from ISO-639-2 (dsites)
209
+ // NOTE: zza is a standard name
210
+ static const char* const kExtLanguageCode[] = {
211
+ // "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
212
+ // All Latin script
213
+ "zzb", "zzp", "zzh", "tlh", "zze",
214
+
215
+ // Pseudo-languages for Unicode scripts that express a single language
216
+ "xx-Ogam", "xx-Runr", "xx-Yiii", "xx-Ital", "xx-Goth",
217
+ "xx-Dsrt", "xx-Hano", "xx-Buhd", "xx-Tagb", "xx-Tale",
218
+ "xx-Linb", "xx-Ugar", "xx-Shaw", "xx-Osma", "xx-Cprt",
219
+ "xx-Bugi", "xx-Copt", "xx-Talu", "xx-Glag", "xx-Tfng",
220
+ "xx-Sylo", "xx-Xpeo", "xx-Khar", "xx-Bali", "xx-Xsux",
221
+ "xx-Phnx", "xx-Phag", "xx-Nkoo",
222
+
223
+ // Unicode 5.1
224
+ "xx-Sund", "xx-Lepc", "xx-Olck", "xx-Vaii", "xx-Saur",
225
+ "xx-Kali", "xx-Rjng", "xx-Lyci", "xx-Cari", "xx-Lydi",
226
+ "xx-Cham",
227
+ };
228
+
229
+
230
+ // Given the Language, returns its string name used as the output by
231
+ // the lang/enc identifier, e.g. "Korean"
232
+ // Returns "" for negative values, "Ignore" for TG_UNKNOWN_LANGUAGE, and invalid_language_name() for any other out-of-range value.
233
+ // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
234
+ // used to subtract out HTML, link farms, DNA strings, and a little English porn
235
+ const char* ExtLanguageName(const Language lang) {
236
+ if (lang < 0) {
237
+ // No-text-at-all result from a Tote
238
+ return "";
239
+ }
240
+ // CompactLanguageDetect extension; checked before the base-language range so it wins over LanguageName()
241
+ if (lang == TG_UNKNOWN_LANGUAGE) {
242
+ return "Ignore";
243
+ }
244
+ if ((0 <= lang) && (lang < NUM_LANGUAGES)) {  // base languages: delegate to LanguageName()
245
+ return LanguageName(lang);
246
+ }
247
+ if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {  // extended languages: local table, rebased to index 0
248
+ return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
249
+ }
250
+ return invalid_language_name();  // values in the gap below EXT_LANGUAGE_BASE or at/after EXT_NUM_LANGUAGES
251
+ }
252
+
253
+
254
+ // Given the Language, returns its Language enum spelling, for use by
255
+ // programs that create C declarations, e.g. "KOREAN"
256
+ // "UNKNOWN_LANGUAGE" if the input is invalid (negative, in the gap below EXT_LANGUAGE_BASE, or >= EXT_NUM_LANGUAGES).
257
+ const char* ExtLanguageDeclaredName(const Language lang) {
258
+ if ((0 <= lang) && (lang < NUM_LANGUAGES)) {  // base languages: indexed directly by enum value
259
+ return kExtLangDeclaredName[lang];
260
+ }
261
+ if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {  // extended names double as the declared names
262
+ return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
263
+ }
264
+ return "UNKNOWN_LANGUAGE";
265
+ }
266
+
267
+ // Given the Language, return the language code, e.g. "ko"; returns "xxx" for TG_UNKNOWN_LANGUAGE and "??" for values outside both the base and extended ranges.
268
+ const char* ExtLanguageCode(const Language lang) {
269
+ // Hack for ignore/porn pseudo-language; tested first so it takes precedence over LanguageCode()
270
+ if (lang == TG_UNKNOWN_LANGUAGE) {
271
+ return "xxx";
272
+ }
273
+ if ((0 <= lang) && (lang < NUM_LANGUAGES)) {  // base languages: delegate to LanguageCode()
274
+ return LanguageCode(lang);
275
+ }
276
+ if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {  // extended languages: local table, rebased to index 0
277
+ return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
278
+ }
279
+ return "??";
280
+ }
281
+
282
+
283
+ // Convert "en-Latn-GB" to ENGLISH
284
+ // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
285
+ // Consider for later: NORWEGIAN, NORWEGIAN_N
286
+ // Consider for later: SCOTS, SCOTS_GAELIC
287
+ // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
288
+ // src must be a NUL-terminated string. An all-digit src (including "") is returned as that numeric Language value; a name that matches nothing below leaves the result at UNKNOWN_LANGUAGE unless LanguageFromCode() sets it.
289
+ Language GetLanguageFromNumberOrName(const char* src) {
290
+ if (strspn(src, "0123456789") == strlen(src)) {
291
+ // All digits
292
+ return static_cast<Language>(strto32(src, NULL, 10));
293
+ }
294
+
295
+ Language retlang = UNKNOWN_LANGUAGE;
296
+ size_t len = strlen(src);
297
+
298
+ if (true /*FLAGS_mergepairs*/) {
299
+ // Merge sets of languages pt-xx en-xx fr-xx, NOT bs/hr/sr. Prefix tests use strncmp so a src shorter than the prefix is never read past its NUL.
300
+ if (strncmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
301
+ if (strncmp(src, "en-", 3) == 0) {return ENGLISH;}
302
+ if (strncmp(src, "fr-", 3) == 0) {return FRENCH;}
303
+ // Use NormalizeLanguage instead
304
+ if (strncmp(src, "bs-", 3) == 0) {return CROATIAN;}
305
+ if (strncmp(src, "hr-", 3) == 0) {return CROATIAN;}
306
+ if (strncmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
307
+ if (strncmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
308
+ if (strncmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
309
+ if (strncmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}
310
+ }
311
+
312
+ // Extensions
313
+ if (len >= 3) {
314
+ // Standin for ignore/porn "language"
315
+ if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
316
+
317
+ if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
318
+ if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
319
+ if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
320
+ if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
321
+ if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
322
+ }
323
+
324
+ // We have a name like en-Latn-GB or pt-BR
325
+ // First, get rid of some special cases
326
+ if (len <= 3) {
327
+ LanguageFromCode(src, &retlang);
328
+ } else if (len == 7) {
329
+ // More Extensions
330
+ if (memcmp(src, "xx-", 3) == 0) {
331
+ if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
332
+ if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
333
+ if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
334
+ if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
335
+ if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
336
+ if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
337
+ if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
338
+ if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
339
+ if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
340
+ if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
341
+ if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
342
+ if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
343
+ if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
344
+ if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
345
+ if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
346
+ if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
347
+ if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
348
+ if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
349
+ if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
350
+ if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
351
+ if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
352
+ if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
353
+ if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
354
+ if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
355
+ if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
356
+ if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
357
+ if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
358
+ if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
359
+
360
+ // Unicode 5.1
361
+ if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
362
+ if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
363
+ if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
364
+ if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
365
+ if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
366
+ if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
367
+ if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
368
+ if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
369
+ if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
370
+ if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
371
+ if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
372
+ }
373
+ }
374
+ // Some other weird ones
375
+ // Could be Latn or Limb; all our current training data is Latn
376
+ if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
377
+ if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}
378
+
379
+ // Multi-country languages (src is non-empty here, so the 2-byte compares stay within the string and its NUL)
380
+ if (memcmp(src, "zh", 2) == 0) {
381
+ if (memcmp(&src[len - 2], "TW", 2) == 0) {return CHINESE_T;}
382
+ if (memcmp(&src[len - 2], "HK", 2) == 0) {return CHINESE_T;}
383
+ return CHINESE;
384
+ }
385
+ if (memcmp(src, "pt", 2) == 0) {
386
+ if (memcmp(&src[len - 2], "BR", 2) == 0) {return PORTUGUESE;}
387
+ return PORTUGUESE;
388
+ }
389
+ if (memcmp(src, "fr", 2) == 0) {
390
+ if (memcmp(&src[len -2], "CA", 2) == 0) {return FRENCH;}
391
+ return FRENCH;
392
+ }
393
+
394
+ // None of the special cases matched; try the leading 2- or 3-letter code before a '-'
395
+ if ((len > 2) && (src[2] == '-')) {  // length guard: never index past the terminator of a 1-character src
396
+ char temp[4];
397
+ memcpy(temp, src, 4);
398
+ temp[2] = '\0';
399
+ LanguageFromCode(temp, &retlang);
400
+ }
401
+ if ((len > 3) && (src[3] == '-')) {  // length guard: src[3] lies beyond the NUL for codes such as "en"
402
+ char temp[4];
403
+ memcpy(temp, src, 4);
404
+ temp[3] = '\0';
405
+ LanguageFromCode(temp, &retlang);
406
+ }
407
+ if (retlang != UNKNOWN_LANGUAGE) {
408
+ return retlang;
409
+ }
410
+
411
+ return retlang;
412
+ }
413
+
414
+ typedef struct {
415
+ const char* name;
416
+ UnicodeLScript lscript;
417
+ } NameScriptPair;
418
+
419
+ // In alphabetic order for binary search
420
+ static const NameScriptPair kNameScriptPair[] = {
421
+ // Unicode 5.1 additional scripts
422
+ {"Arab", ULScript_Arabic},
423
+ {"Armn", ULScript_Armenian},
424
+ {"Bali", ULScript_Balinese},
425
+ {"Beng", ULScript_Bengali},
426
+ {"Bugi", ULScript_Buginese},
427
+ {"Buhd", ULScript_Buhid},
428
+ {"Cans", ULScript_Canadian_Aboriginal},
429
+ {"Cari", ULScript_Carian}, // Unicode 5.1
430
+ {"Cham", ULScript_Cham}, // Unicode 5.1
431
+ {"Cher", ULScript_Cherokee},
432
+ {"Copt", ULScript_Coptic},
433
+ {"Cprt", ULScript_Cypriot},
434
+ {"Cyrl", ULScript_Cyrillic},
435
+ {"Deva", ULScript_Devanagari},
436
+ {"Dsrt", ULScript_Deseret},
437
+ {"Ethi", ULScript_Ethiopic},
438
+ {"Geor", ULScript_Georgian},
439
+ {"Glag", ULScript_Glagolitic},
440
+ {"Goth", ULScript_Gothic},
441
+ {"Grek", ULScript_Greek},
442
+ {"Gujr", ULScript_Gujarati},
443
+ {"Guru", ULScript_Gurmukhi},
444
+ {"Hani", ULScript_HanCJK},
445
+ {"Hano", ULScript_Hanunoo},
446
+ {"Hebr", ULScript_Hebrew},
447
+ {"Ital", ULScript_Old_Italic},
448
+ {"Kali", ULScript_Kayah_Li}, // Unicode 5.1
449
+ {"Khar", ULScript_Kharoshthi},
450
+ {"Khmr", ULScript_Khmer},
451
+ {"Knda", ULScript_Kannada},
452
+ {"Laoo", ULScript_Lao},
453
+ {"Latn", ULScript_Latin},
454
+ {"Lepc", ULScript_Lepcha}, // Unicode 5.1
455
+ {"Limb", ULScript_Limbu},
456
+ {"Linb", ULScript_Linear_B},
457
+ {"Lyci", ULScript_Lycian}, // Unicode 5.1
458
+ {"Lydi", ULScript_Lydian}, // Unicode 5.1
459
+ {"Mlym", ULScript_Malayalam},
460
+ {"Mong", ULScript_Mongolian},
461
+ {"Mymr", ULScript_Myanmar},
462
+ {"Nkoo", ULScript_Nko},
463
+ {"Ogam", ULScript_Ogham},
464
+ {"Olck", ULScript_Ol_Chiki}, // Unicode 5.1
465
+ {"Orya", ULScript_Oriya},
466
+ {"Osma", ULScript_Osmanya},
467
+ {"Phag", ULScript_Phags_Pa},
468
+ {"Phnx", ULScript_Phoenician},
469
+ {"Rjng", ULScript_Rejang}, // Unicode 5.1
470
+ {"Runr", ULScript_Runic},
471
+ {"Saur", ULScript_Saurashtra}, // Unicode 5.1
472
+ {"Shaw", ULScript_Shavian},
473
+ {"Sinh", ULScript_Sinhala},
474
+ {"Sund", ULScript_Sundanese}, // Unicode 5.1
475
+ {"Sylo", ULScript_Syloti_Nagri},
476
+ {"Syrc", ULScript_Syriac},
477
+ {"Tagb", ULScript_Tagbanwa},
478
+ {"Tale", ULScript_Tai_Le},
479
+ {"Talu", ULScript_New_Tai_Lue},
480
+ {"Taml", ULScript_Tamil},
481
+ {"Telu", ULScript_Telugu},
482
+ {"Tfng", ULScript_Tifinagh},
483
+ {"Tglg", ULScript_Tagalog},
484
+ {"Thaa", ULScript_Thaana},
485
+ {"Thai", ULScript_Thai},
486
+ {"Tibt", ULScript_Tibetan},
487
+ {"Ugar", ULScript_Ugaritic},
488
+ {"Vaii", ULScript_Vai}, // Unicode 5.1 // NOTE: apparently 'Vai '
489
+ {"Xpeo", ULScript_Old_Persian},
490
+ {"Xsux", ULScript_Cuneiform},
491
+ {"Yiii", ULScript_Yi},
492
+ {"Zyyy", ULScript_Common},
493
+ {"Zzzz", ULScript_Inherited},
494
+ };
495
+
496
+ // Convert "en-Latn-GB" to ULScript_Latin. src must be NUL-terminated; an all-digit src is returned as that numeric UnicodeLScript value, and a missing or unrecognized script subtag yields ULScript_Latin.
497
+ UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
498
+ if (strspn(src, "0123456789") == strlen(src)) {
499
+ // All digits
500
+ return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
501
+ }
502
+
503
+ if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
504
+ if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
505
+ if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
506
+ if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
507
+ // Could be Latn or Limb; all our current training data is Latn
508
+ if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}
509
+
510
+ // Isolate just the script field: up to four characters following the first '-'
511
+ char temp[5];
512
+ const char* src2 = strchr(src, '-');
513
+ if (src2 == NULL) {return ULScript_Latin;}
514
+ src2 += 1; // over the -
515
+ strncpy(temp, src2, 4);  // bounded copy: stops at src2's NUL (zero-padding the rest) instead of always reading 4 bytes
516
+ temp[4] = '\0';
517
+
518
+ int lo = 0;
519
+ int hi = static_cast<int>(sizeof(kNameScriptPair) / sizeof(kNameScriptPair[0]));  // bound the search by the table's own length
520
+ while (lo < hi) {
521
+ int mid = (lo + hi) >> 1;
522
+ if (strcmp(temp, kNameScriptPair[mid].name) < 0) {
523
+ hi = mid;
524
+ } else if (strcmp(temp, kNameScriptPair[mid].name) > 0) {
525
+ lo = mid + 1;
526
+ } else {
527
+ return kNameScriptPair[mid].lscript;
528
+ }
529
+ }
530
+ return ULScript_Latin;
531
+ }
532
+
533
+
534
+ // Merge together some languages, such as bs/hr/sr
535
+ // Croatian Latin and Serbian Cyrillic now: BOSNIAN folds into CROATIAN and SERBO_CROATIAN into SERBIAN; every other value is returned unchanged.
536
+ Language NormalizeLanguage(Language lang) {
537
+ if (lang == BOSNIAN) {return CROATIAN;}
538
+ if (lang == SERBO_CROATIAN) {return SERBIAN;}
539
+
540
+ if (lang == PORTUGUESE_P) {return PORTUGUESE;}  // both Portuguese variants collapse to plain PORTUGUESE
541
+ if (lang == PORTUGUESE_B) {return PORTUGUESE;}
542
+
543
+ return lang;
544
+ }
545
+
@@ -0,0 +1,119 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+ //
5
+ // This file extends lang_enc.h with additional languages and extended routines.
6
+ // It is current with Unicode 5.1 (March 2008)
7
+ //
8
+
9
+ #ifndef ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
10
+ #define ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
11
+
12
+ #include "languages/public/languages.h"
13
+ #include "encodings/compact_lang_det/letterscript_enum.h"
14
+
15
+
16
+ // Leave a small gap after the base languages, so adding one or two is easy.
17
+ // Just reduce the gap here (currently 4 entries)
18
+
19
+ // Montenegrin added, so reducing this from 5 to 4. dsites 2008.10.06
20
+ #define EXT_LANGUAGE_BASE (NUM_LANGUAGES + 4)
21
+
22
+ // Google UI languages
23
+ #define X_BORK_BORK_BORK (Language)(EXT_LANGUAGE_BASE+0)
24
+ #define X_PIG_LATIN (Language)(EXT_LANGUAGE_BASE+1)
25
+ #define X_HACKER (Language)(EXT_LANGUAGE_BASE+2)
26
+ #define X_KLINGON (Language)(EXT_LANGUAGE_BASE+3)
27
+ #define X_ELMER_FUDD (Language)(EXT_LANGUAGE_BASE+4)
28
+
29
+ // Pseudo-languages for Unicode scripts that express a single language
30
+ #define X_OGHAM (Language)(EXT_LANGUAGE_BASE+5)
31
+ #define X_RUNIC (Language)(EXT_LANGUAGE_BASE+6)
32
+ #define X_YI (Language)(EXT_LANGUAGE_BASE+7)
33
+ #define X_OLD_ITALIC (Language)(EXT_LANGUAGE_BASE+8)
34
+ #define X_GOTHIC (Language)(EXT_LANGUAGE_BASE+9)
35
+ #define X_DESERET (Language)(EXT_LANGUAGE_BASE+10)
36
+ #define X_HANUNOO (Language)(EXT_LANGUAGE_BASE+11)
37
+ #define X_BUHID (Language)(EXT_LANGUAGE_BASE+12)
38
+ #define X_TAGBANWA (Language)(EXT_LANGUAGE_BASE+13)
39
+ #define X_TAI_LE (Language)(EXT_LANGUAGE_BASE+14)
40
+ #define X_LINEAR_B (Language)(EXT_LANGUAGE_BASE+15)
41
+ #define X_UGARITIC (Language)(EXT_LANGUAGE_BASE+16)
42
+ #define X_SHAVIAN (Language)(EXT_LANGUAGE_BASE+17)
43
+ #define X_OSMANYA (Language)(EXT_LANGUAGE_BASE+18)
44
+ #define X_CYPRIOT (Language)(EXT_LANGUAGE_BASE+19)
45
+ #define X_BUGINESE (Language)(EXT_LANGUAGE_BASE+20)
46
+ #define X_COPTIC (Language)(EXT_LANGUAGE_BASE+21)
47
+ #define X_NEW_TAI_LUE (Language)(EXT_LANGUAGE_BASE+22)
48
+ #define X_GLAGOLITIC (Language)(EXT_LANGUAGE_BASE+23)
49
+ #define X_TIFINAGH (Language)(EXT_LANGUAGE_BASE+24)
50
+ #define X_SYLOTI_NAGRI (Language)(EXT_LANGUAGE_BASE+25)
51
+ #define X_OLD_PERSIAN (Language)(EXT_LANGUAGE_BASE+26)
52
+ #define X_KHAROSHTHI (Language)(EXT_LANGUAGE_BASE+27)
53
+ #define X_BALINESE (Language)(EXT_LANGUAGE_BASE+28)
54
+ #define X_CUNEIFORM (Language)(EXT_LANGUAGE_BASE+29)
55
+ #define X_PHOENICIAN (Language)(EXT_LANGUAGE_BASE+30)
56
+ #define X_PHAGS_PA (Language)(EXT_LANGUAGE_BASE+31)
57
+ #define X_NKO (Language)(EXT_LANGUAGE_BASE+32)
58
+
59
+ // Unicode 5.1
60
+ #define X_SUDANESE (Language)(EXT_LANGUAGE_BASE+33)
61
+ #define X_LEPCHA (Language)(EXT_LANGUAGE_BASE+34)
62
+ #define X_OL_CHIKI (Language)(EXT_LANGUAGE_BASE+35)
63
+ #define X_VAI (Language)(EXT_LANGUAGE_BASE+36)
64
+ #define X_SAURASHTRA (Language)(EXT_LANGUAGE_BASE+37)
65
+ #define X_KAYAH_LI (Language)(EXT_LANGUAGE_BASE+38)
66
+ #define X_REJANG (Language)(EXT_LANGUAGE_BASE+39)
67
+ #define X_LYCIAN (Language)(EXT_LANGUAGE_BASE+40)
68
+ #define X_CARIAN (Language)(EXT_LANGUAGE_BASE+41)
69
+ #define X_LYDIAN (Language)(EXT_LANGUAGE_BASE+42)
70
+ #define X_CHAM (Language)(EXT_LANGUAGE_BASE+43)
71
+
72
+ #define EXT_NUM_LANGUAGES (Language)(EXT_LANGUAGE_BASE+44)
73
+
74
+
75
+
76
+ // ExtLanguageName
77
+ // ------------
78
+ // Given the Language, returns its string name used as the output by
79
+ // the lang/enc identifier, e.g. "Korean"
80
+ // "invalid_language" if the input is invalid.
81
+ extern const char* ExtLanguageName(const Language lang);
82
+
83
+ // ExtLanguageDeclaredName
84
+ // ------------
85
+ // Given the Language, returns its Language enum spelling, for use by
86
+ // programs that create C declarations, e.g. "KOREAN"
87
+ // "UNKNOWN_LANGUAGE" if the input is invalid.
88
+ extern const char* ExtLanguageDeclaredName(const Language lang);
89
+
90
+ // ExtLanguageCode
91
+ // ------------
92
+ // Given the Language, return the language code, e.g. "ko"
93
+ // This is determined by
94
+ // the following (in order of preference):
95
+ // - ISO-639-1 two-letter language code
96
+ // (all except those mentioned below)
97
+ // - ISO-639-2 three-letter bibliographic language code
98
+ // (Tibetan, Dhivehi, Cherokee, Syriac)
99
+ // - Google-specific language code
100
+ // (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
101
+ // Portuguese-Portugal, Portuguese-Brazil, Limbu)
102
+ extern const char * ExtLanguageCode(const Language lang);
103
+
104
+
105
+ // Convert "en-Latn-GB" to ENGLISH
106
+ // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
107
+ // Consider for later: NORWEGIAN, NORWEGIAN_N
108
+ // Consider for later: SCOTS, SCOTS_GAELIC
109
+ // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
110
+ //
111
+ Language GetLanguageFromNumberOrName(const char* src);
112
+
113
+ // Convert "en-Latn-GB" to ULScript_Latin
114
+ UnicodeLScript GetLScriptFromNumberOrName(const char* src);
115
+
116
+ // Merge together some languages, such as bs/hr/sr
117
+ Language NormalizeLanguage(Language lang);
118
+
119
+ #endif // ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__