cld 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +27 -0
- data/Manifest +106 -0
- data/README.rdoc +173 -0
- data/Rakefile +15 -0
- data/base/basictypes.h +348 -0
- data/base/build_config.h +115 -0
- data/base/casts.h +156 -0
- data/base/commandlineflags.h +443 -0
- data/base/crash.h +41 -0
- data/base/dynamic_annotations.h +358 -0
- data/base/global_strip_options.h +59 -0
- data/base/log_severity.h +46 -0
- data/base/logging.h +1403 -0
- data/base/macros.h +243 -0
- data/base/port.h +54 -0
- data/base/scoped_ptr.h +428 -0
- data/base/stl_decl.h +0 -0
- data/base/stl_decl_msvc.h +107 -0
- data/base/string_util.h +29 -0
- data/base/strtoint.h +93 -0
- data/base/template_util.h +96 -0
- data/base/type_traits.h +198 -0
- data/base/vlog_is_on.h +143 -0
- data/build.sh +48 -0
- data/build.win.cmd +28 -0
- data/cld.gemspec +30 -0
- data/cld_encodings.h +95 -0
- data/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/encodings/compact_lang_det/#tote.cc# +299 -0
- data/encodings/compact_lang_det/#tote.h# +89 -0
- data/encodings/compact_lang_det/cldutil.cc +905 -0
- data/encodings/compact_lang_det/cldutil.h +1205 -0
- data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/encodings/compact_lang_det/compile.cmd +1 -0
- data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/encodings/compact_lang_det/tote.cc +299 -0
- data/encodings/compact_lang_det/tote.h +89 -0
- data/encodings/compact_lang_det/unittest_data.h +193 -0
- data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/encodings/internal/encodings.cc +12 -0
- data/encodings/lang_enc.h +254 -0
- data/encodings/proto/encodings.pb.h +169 -0
- data/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/languages/internal/#languages.cc# +337 -0
- data/languages/internal/languages.cc +337 -0
- data/languages/proto/languages.pb.h +179 -0
- data/languages/public/languages.h +379 -0
- data/lib/cld.rb +12 -0
- data/test/test.rb +570 -0
- data/thunk.cc +131 -0
- metadata +168 -0
@@ -0,0 +1,379 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef LANGUAGES_PUBLIC_LANGUAGES_H_
|
6
|
+
#define LANGUAGES_PUBLIC_LANGUAGES_H_
|
7
|
+
|
8
|
+
// This interface defines the Language enum and functions that depend
|
9
|
+
// only on Language values.
|
10
|
+
|
11
|
+
// A hash-function for Language, hash<Language>, is defined in
|
12
|
+
// i18n/languages/public/languages-hash.h
|
13
|
+
|
14
|
+
#ifndef SWIG
|
15
|
+
// Language enum defined in languages.proto
|
16
|
+
// Also description on how to add languages.
|
17
|
+
#include "languages/proto/languages.pb.h"
|
18
|
+
|
19
|
+
// We need this for compatibility:
|
20
|
+
// - The Language enum in the default namespace.
|
21
|
+
// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
|
22
|
+
//using namespace i18n::languages;
|
23
|
+
|
24
|
+
#else
|
25
|
+
// And we must have a swig-compatible enum.
|
26
|
+
// This one is a simple cleaned up version of language.proto, making the enum
|
27
|
+
// compatible with C++.
|
28
|
+
#include "i18n/languages/internal/languages_proto_wrapper.h"
|
29
|
+
|
30
|
+
#endif
|
31
|
+
|
32
|
+
// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
|
33
|
+
//#include "util/utf8/proptables/script_enum.h"
|
34
|
+
|
35
|
+
const int kNumLanguages = NUM_LANGUAGES;
|
36
|
+
|
37
|
+
// Return the default language (ENGLISH).
|
38
|
+
Language default_language();
|
39
|
+
|
40
|
+
|
41
|
+
// *******************************************
|
42
|
+
// Language predicates
|
43
|
+
// IsValidLanguage()
|
44
|
+
// IS_LANGUAGE_UNKNOWN()
|
45
|
+
// IsCJKLanguage()
|
46
|
+
// IsChineseLanguage()
|
47
|
+
// IsNorwegianLanguage()
|
48
|
+
// IsPortugueseLanguage()
|
49
|
+
// IsRightToLeftLanguage()
|
50
|
+
// IsMaybeRightToLeftLanguage()
|
51
|
+
// IsSameLanguage()
|
52
|
+
// IsScriptRequiringLongerSnippets()
|
53
|
+
// *******************************************
|
54
|
+
|
55
|
+
// IsValidLanguage
|
56
|
+
// ===============
|
57
|
+
//
|
58
|
+
// Function to check if the input is within range of the Language enum. If
|
59
|
+
// IsValidLanguage(lang) returns true, it is safe to call
|
60
|
+
// static_cast<Language>(lang).
|
61
|
+
//
|
62
|
+
inline bool IsValidLanguage(int lang) {
|
63
|
+
return ((lang >= 0) && (lang < kNumLanguages));
|
64
|
+
}
|
65
|
+
|
66
|
+
// Return true if the language is "unknown". (This function was
|
67
|
+
// previously a macro, hence the spelling in all caps.)
|
68
|
+
//
|
69
|
+
inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
  // Two distinct enum values both count as "unknown":
  // UNKNOWN_LANGUAGE and TG_UNKNOWN_LANGUAGE.
  if (lang == UNKNOWN_LANGUAGE) {
    return true;
  }
  return lang == TG_UNKNOWN_LANGUAGE;
}
|
72
|
+
|
73
|
+
// IsCJKLanguage
|
74
|
+
// -------------
|
75
|
+
//
|
76
|
+
// This function returns true if the language is either Chinese
|
77
|
+
// (simplified or traditional), Japanese, or Korean.
|
78
|
+
bool IsCJKLanguage(Language lang);
|
79
|
+
|
80
|
+
// IsChineseLanguage
|
81
|
+
// -----------------
|
82
|
+
//
|
83
|
+
// This function returns true if the language is either Chinese
|
84
|
+
// (simplified or traditional)
|
85
|
+
bool IsChineseLanguage(Language lang);
|
86
|
+
|
87
|
+
// IsNorwegianLanguage
|
88
|
+
// --------------------
|
89
|
+
//
|
90
|
+
// This function returns true if the language is any of the Norwegian
|
91
|
+
// languages (regular or Nynorsk).
|
92
|
+
bool IsNorwegianLanguage(Language lang);
|
93
|
+
|
94
|
+
// IsPortugueseLanguage
|
95
|
+
// --------------------
|
96
|
+
//
|
97
|
+
// This function returns true if the language is any of the Portuguese
|
98
|
+
// languages (regular, Portugal or Brazil)
|
99
|
+
bool IsPortugueseLanguage(Language lang);
|
100
|
+
|
101
|
+
// IsSameLanguage
|
102
|
+
// --------------
|
103
|
+
//
|
104
|
+
// WARNING: This function provides only a simple test on the values of
|
105
|
+
// the two Language arguments. It returns false if either language is
|
106
|
+
// invalid. It returns true if the language arguments are equal, or
|
107
|
+
// if they are both Chinese languages, both Norwegian languages, or
|
108
|
+
// both Portuguese languages, as defined by IsChineseLanguage,
|
109
|
+
// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
|
110
|
+
// false.
|
111
|
+
bool IsSameLanguage(Language lang1, Language lang2);
|
112
|
+
|
113
|
+
|
114
|
+
// IsRightToLeftLanguage
|
115
|
+
// ---------------------
|
116
|
+
//
|
117
|
+
// This function returns true if the language is only written right-to-left
|
118
|
+
// (E.g., Hebrew, Arabic, Persian etc.)
|
119
|
+
//
|
120
|
+
// IMPORTANT NOTE: Technically we're talking about scripts, not languages.
|
121
|
+
// There are languages that can be written in more than one script.
|
122
|
+
// Examples:
|
123
|
+
// - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
|
124
|
+
// Latin or Cyrillic script, and right-to-left in Arabic script.
|
125
|
+
// - Sindhi and Punjabi are written in different scripts, depending on
|
126
|
+
// region and dialect.
|
127
|
+
// - Turkmen used an Arabic script historically, but not any more.
|
128
|
+
// - Pashto and Uyghur can use Arabic script, but use a Roman script
|
129
|
+
// on the Internet.
|
130
|
+
// - Kashmiri and Urdu are written either with Arabic or Devanagari script.
|
131
|
+
//
|
132
|
+
// This function only returns true for languages that are always, unequivocally
|
133
|
+
// written in right-to-left script.
|
134
|
+
//
|
135
|
+
// TODO(benjy): If we want to do anything special with multi-script languages
|
136
|
+
// we should create new 'languages' for each language+script, as we do for
|
137
|
+
// traditional vs. simplified Chinese. However most such languages are rare in
|
138
|
+
// use and even rarer on the web, so this is unlikely to be something we'll
|
139
|
+
// be concerned with for a while.
|
140
|
+
bool IsRightToLeftLanguage(Language lang);
|
141
|
+
|
142
|
+
// IsMaybeRightToLeftLanguage
|
143
|
+
// --------------------------
|
144
|
+
//
|
145
|
+
// This function returns true if the language may appear on the web in a
|
146
|
+
// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
|
147
|
+
//
|
148
|
+
// NOTE: See important notes under IsRightToLeftLanguage(...).
|
149
|
+
//
|
150
|
+
// This function returns true for languages that *may* appear on the web in a
|
151
|
+
// right-to-left script, even if they may also appear in a left-to-right
|
152
|
+
// script.
|
153
|
+
//
|
154
|
+
// This function should typically be used in cases where doing some work on
|
155
|
+
// left-to-right text would be OK (usually a no-op), and this function is used
|
156
|
+
// just to cut down on unnecessary work on regular, LTR text.
|
157
|
+
bool IsMaybeRightToLeftLanguage(Language lang);
|
158
|
+
|
159
|
+
// IsScriptRequiringLongerSnippets
|
160
|
+
// --------------------
|
161
|
+
//
|
162
|
+
// This function returns true if the script characteristics require longer
|
163
|
+
// snippet length (Devanagari, Bengali, Gurmukhi,
|
164
|
+
// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
|
165
|
+
// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
|
166
|
+
// bool IsScriptRequiringLongerSnippets(UnicodeScript script);
|
167
|
+
|
168
|
+
|
169
|
+
// *******************************************
|
170
|
+
// LANGUAGE NAMES
|
171
|
+
//
|
172
|
+
// This interface defines a standard name for each valid Language,
|
173
|
+
// and a standard name for invalid languages. Some language names use all
|
174
|
+
// uppercase letters, but others use mixed case.
|
175
|
+
// LanguageName() [Language to name]
|
176
|
+
// LanguageEnumName() [language to enum name]
|
177
|
+
// LanguageFromName() [name to Language]
|
178
|
+
// default_language_name()
|
179
|
+
// invalid_language_name()
|
180
|
+
// *******************************************
|
181
|
+
|
182
|
+
// Given a Language, returns its standard name.
|
183
|
+
// Return invalid_language_name() if the language is invalid.
|
184
|
+
const char* LanguageName(Language lang);
|
185
|
+
|
186
|
+
// Given a Language, return the name of the enum constant for that
|
187
|
+
// language. In all but a few cases, this is the same as its standard
|
188
|
+
// name. For example, LanguageName(CHINESE) returns "Chinese", but
|
189
|
+
// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
|
190
|
+
// code that is generating C++ code, where the enum constant is more
|
191
|
+
// useful than its integer value. Return "NUM_LANGUAGES" if
|
192
|
+
// the language is invalid.
|
193
|
+
const char* LanguageEnumName(Language lang);
|
194
|
+
|
195
|
+
// The maximum length of a standard language name.
|
196
|
+
const int kMaxLanguageNameSize = 50;
|
197
|
+
|
198
|
+
// The standard name for the default language.
|
199
|
+
const char* default_language_name();
|
200
|
+
|
201
|
+
// The standard name for all invalid languages.
|
202
|
+
const char* invalid_language_name();
|
203
|
+
|
204
|
+
// If lang_name matches the standard name of a Language, using a
|
205
|
+
// case-insensitive comparison, set *language to that Language and
|
206
|
+
// return true.
|
207
|
+
// Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
|
208
|
+
//
|
209
|
+
// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
|
210
|
+
// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
|
211
|
+
// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
|
212
|
+
// as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
|
213
|
+
// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
|
214
|
+
// CHINESE_T (i.e., a synonym for "ChineseT").
|
215
|
+
//
|
216
|
+
// REQUIRES: language must not be NULL.
|
217
|
+
//
|
218
|
+
bool LanguageFromName(const char* lang_name, Language *language);
|
219
|
+
|
220
|
+
|
221
|
+
|
222
|
+
// *******************************************
|
223
|
+
// LANGUAGE CODES
|
224
|
+
//
|
225
|
+
// This interface defines a standard code for each valid language, and
|
226
|
+
// a standard code for invalid languages. These are derived from ISO codes,
|
227
|
+
// with some Google additions.
|
228
|
+
// LanguageCode()
|
229
|
+
// default_language_code()
|
230
|
+
// invalid_language_code()
|
231
|
+
// LanguageCodeWithDialects()
|
232
|
+
// LanguageCodeISO639_1()
|
233
|
+
// LanguageCodeISO639_2()
|
234
|
+
// *******************************************
|
235
|
+
|
236
|
+
// Given a Language, return its standard code. There are Google-specific codes:
|
237
|
+
// For CHINESE_T, return "zh-TW".
|
238
|
+
// For TG_UNKNOWN_LANGUAGE, return "ut".
|
239
|
+
// For UNKNOWN_LANGUAGE, return "un".
|
240
|
+
// For PORTUGUESE_P, return "pt-PT".
|
241
|
+
// For PORTUGUESE_B, return "pt-BR".
|
242
|
+
// For LIMBU, return "sit-NP".
|
243
|
+
// For CHEROKEE, return "chr".
|
244
|
+
// For SYRIAC, return "syr".
|
245
|
+
// Otherwise return the ISO 639-1 two-letter language code for lang.
|
246
|
+
// If lang is invalid, return invalid_language_code().
|
247
|
+
//
|
248
|
+
// NOTE: See the note below about the codes for Chinese languages.
|
249
|
+
//
|
250
|
+
const char* LanguageCode(Language lang);
|
251
|
+
|
252
|
+
// The maximum length of a language code.
|
253
|
+
const int kMaxLanguageCodeSize = 50;
|
254
|
+
|
255
|
+
// The standard code for the default language.
|
256
|
+
const char* default_language_code();
|
257
|
+
|
258
|
+
// The standard code for all invalid languages.
|
259
|
+
const char* invalid_language_code();
|
260
|
+
|
261
|
+
|
262
|
+
// --------------------------------------------
|
263
|
+
// NOTE: CHINESE LANGUAGE CODES
|
264
|
+
//
|
265
|
+
// There are three functions that return codes for Chinese languages.
|
266
|
+
// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
|
267
|
+
// LanguageCode(lang, encoding) is defined in i18n/encodings/lang_enc.h.
|
268
|
+
// The following list shows the different results.
|
269
|
+
//
|
270
|
+
// LanguageCode(CHINESE) returns "zh"
|
271
|
+
// LanguageCode(CHINESE_T) returns "zh-TW".
|
272
|
+
//
|
273
|
+
// LanguageCodeWithDialects(CHINESE) returns "zh-CN".
|
274
|
+
// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
|
275
|
+
//
|
276
|
+
// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
|
277
|
+
// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
|
278
|
+
// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
|
279
|
+
//
|
280
|
+
// --------------------------------------------
|
281
|
+
|
282
|
+
// LanguageCodeWithDialects
|
283
|
+
// ------------------------
|
284
|
+
//
|
285
|
+
// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
|
286
|
+
const char* LanguageCodeWithDialects(Language lang);
|
287
|
+
|
288
|
+
// LanguageCodeISO639_1
|
289
|
+
// --------------------
|
290
|
+
//
|
291
|
+
// Return the ISO 639-1 two-letter language code for lang.
|
292
|
+
// Return invalid_language_code() if lang is invalid or does not have
|
293
|
+
// an ISO 639-1 two-letter language code.
|
294
|
+
const char* LanguageCodeISO639_1(Language lang);
|
295
|
+
|
296
|
+
// LanguageCodeISO639_2
|
297
|
+
// --------------------
|
298
|
+
//
|
299
|
+
// Return the ISO 639-2 three-letter language code for lang.
|
300
|
+
// Return invalid_language_code() if lang is invalid or does not have
|
301
|
+
// an ISO 639-2 three-letter language code.
|
302
|
+
const char* LanguageCodeISO639_2(Language lang);
|
303
|
+
|
304
|
+
// LanguageFromCode
|
305
|
+
// ----------------
|
306
|
+
//
|
307
|
+
// If lang_code matches the code for a Language, using a case-insensitive
|
308
|
+
// comparison, set *lang to that Language and return true.
|
309
|
+
// Otherwise, set *lang to UNKNOWN_LANGUAGE and return false.
|
310
|
+
//
|
311
|
+
// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
|
312
|
+
// (three-letter) code, or a Google-specific code (see LanguageCode).
|
313
|
+
//
|
314
|
+
// Certain language-code aliases are also allowed:
|
315
|
+
// For "zh-cn" and "zh_cn", set *lang to CHINESE.
|
316
|
+
// For "zh-tw" and "zh_tw", set *lang to CHINESE_T.
|
317
|
+
// For "he", set *lang to HEBREW.
|
318
|
+
// For "in", set *lang to INDONESIAN.
|
319
|
+
// For "ji", set *lang to YIDDISH.
|
320
|
+
// For "fil", set *lang to TAGALOG.
|
321
|
+
//
|
322
|
+
// REQUIRES: 'lang' must not be NULL.
|
323
|
+
bool LanguageFromCode(const char* lang_code, Language *language);
|
324
|
+
|
325
|
+
|
326
|
+
// LanguageFromCodeOrName
|
327
|
+
// ----------------------
|
328
|
+
//
|
329
|
+
// If lang_code_or_name is a language code or a language name,
|
330
|
+
// set *language to the corresponding Language and return true.
|
331
|
+
// Otherwise set *language to UNKNOWN_LANGUAGE and return false.
|
332
|
+
//
|
333
|
+
bool LanguageFromCodeOrName(const char* lang_code_or_name,
|
334
|
+
Language* language);
|
335
|
+
|
336
|
+
// LanguageNameFromCode
|
337
|
+
// --------------------
|
338
|
+
//
|
339
|
+
// If language_code is the code for a Language (see LanguageFromCode),
|
340
|
+
// return the standard name of that language (see LanguageName).
|
341
|
+
// Otherwise return invalid_language_name().
|
342
|
+
//
|
343
|
+
const char* LanguageNameFromCode(const char* language_code);
|
344
|
+
|
345
|
+
|
346
|
+
// Miscellany
|
347
|
+
|
348
|
+
// LanguageCodeToUnderscoreForm
|
349
|
+
// ----------------------------
|
350
|
+
//
|
351
|
+
// Given a language code, convert the dash "-" to underscore "_".
|
352
|
+
//
|
353
|
+
// Specifically, if result_length <= strlen(lang_code), set result[0]
|
354
|
+
// to '\0' and return false. Otherwise, copy lang_code to result,
|
355
|
+
// converting every dash to an underscore, converting every character
|
356
|
+
// before the first dash or underscore to lower case, and converting
|
357
|
+
// every character after the first dash or underscore to upper
|
358
|
+
// case. If there is no dash or underscore, convert the entire string
|
359
|
+
// to lower case.
|
360
|
+
//
|
361
|
+
// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.
|
362
|
+
|
363
|
+
bool LanguageCodeToUnderscoreForm(const char* lang_code,
|
364
|
+
char* result,
|
365
|
+
int result_length);
|
366
|
+
|
367
|
+
//
|
368
|
+
// AlwaysPutInExpectedRestrict
|
369
|
+
// ---------------------------
|
370
|
+
//
|
371
|
+
// For Web pages in certain top-level domains, Web Search always
|
372
|
+
// applies a "country restrict". If 'tld' matches one of those, using
|
373
|
+
// a case-SENSITIVE comparison, set *expected_language to the Language
|
374
|
+
// most commonly found in that top-level domain and return true.
|
375
|
+
// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
|
376
|
+
bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);
|
377
|
+
|
378
|
+
|
379
|
+
#endif // LANGUAGES_PUBLIC_LANGUAGES_H_
|
data/lib/cld.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "ffi"
|
3
|
+
|
4
|
+
module CLD
  extend FFI::Library

  # Locate the compiled extension in ../ext/cld relative to this file and
  # load it as the FFI library backing this module.
  ext_dir = File.expand_path("../ext/cld", File.dirname(__FILE__))
  ffi_lib "#{ext_dir}/cld.so"

  # Bind the native symbol detectLanguageThunkInt as CLD.detect_language:
  # one input buffer in, an int out.
  attach_function "detect_language", "detectLanguageThunkInt", [:buffer_in], :int

  # Returns true when the native detector yields 0 for +text+.
  # NOTE(review): 0 is presumably the ENGLISH value of the Language enum --
  # confirm against languages.pb.h.
  def self.english?(text)
    detect_language(text).zero?
  end
end
|
data/test/test.rb
ADDED
@@ -0,0 +1,570 @@
|
|
1
|
+
# Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
# Use of this source code is governed by a BSD-style license that can be
|
3
|
+
# found in the LICENSE file.
|
4
|
+
|
5
|
+
require "test/unit"
|
6
|
+
require "ccld"
|
7
|
+
|
8
|
+
VERBOSE = False
|
9
|
+
|
10
|
+
# MKM: ported from FullTests in compact_lang_det_unittest_small.cc
|
11
|
+
|
12
|
+
class TestCLD(unittest.TestCase):
|
13
|
+
|
14
|
+
langsSeen = set()
|
15
|
+
|
16
|
+
def runOne(self, expectedLangName, s):
|
17
|
+
if VERBOSE:
|
18
|
+
print
|
19
|
+
print 'Test: %s [%d bytes]' % (expectedLangName, len(s))
|
20
|
+
detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(s, pickSummaryLanguage=True)
|
21
|
+
if VERBOSE:
|
22
|
+
print ' detected: %s' % detectedLangName
|
23
|
+
print ' reliable: %s' % (isReliable != 0)
|
24
|
+
print ' textBytes: %s' % textBytesFound
|
25
|
+
print ' details: %s' % str(details)
|
26
|
+
self.langsSeen.add(expectedLangName)
|
27
|
+
print ' %d langs' % len(self.langsSeen)
|
28
|
+
self.assertEquals(expectedLangName, detectedLangName)
|
29
|
+
self.assertTrue(isReliable)
|
30
|
+
|
31
|
+
def testAFRIKAANS(self):
|
32
|
+
self.runOne('AFRIKAANS', kTeststr_af_Latn)
|
33
|
+
|
34
|
+
# def testAFAR(self):
|
35
|
+
# self.runOne('AFAR', kTeststr_aa_Latn)
|
36
|
+
|
37
|
+
# def testABKHAZIAN(self):
|
38
|
+
# self.runOne('ABKHAZIAN', kTeststr_ab_Cyrl)
|
39
|
+
|
40
|
+
def testAFRIKAANS(self):
|
41
|
+
self.runOne('AFRIKAANS', kTeststr_af_Latn)
|
42
|
+
|
43
|
+
# def testAMHARIC(self):
|
44
|
+
# self.runOne('AMHARIC', kTeststr_am_Ethi)
|
45
|
+
|
46
|
+
def testARABIC(self):
|
47
|
+
self.runOne('ARABIC', kTeststr_ar_Arab)
|
48
|
+
|
49
|
+
# def testASSAMESE(self):
|
50
|
+
# self.runOne('ASSAMESE', kTeststr_as_Beng)
|
51
|
+
|
52
|
+
# def testAYMARA(self):
|
53
|
+
# self.runOne('AYMARA', kTeststr_ay_Latn)
|
54
|
+
|
55
|
+
# AZERBAIJANI Arab & Cyrl removed 2008.05.27. Just AZERBAIJANI Latn left
|
56
|
+
# def testAZERBAIJANI(self):
|
57
|
+
# self.runOne('AZERBAIJANI', kTeststr_az_Arab)
|
58
|
+
|
59
|
+
# Missing data: az-Cyrl
|
60
|
+
# def testAZERBAIJANI(self):
|
61
|
+
# self.runOne('AZERBAIJANI', kTeststr_az_Latn)
|
62
|
+
|
63
|
+
# def testBASHKIR(self):
|
64
|
+
# self.runOne('BASHKIR', kTeststr_ba_Cyrl)
|
65
|
+
|
66
|
+
def testBELARUSIAN(self):
|
67
|
+
self.runOne('BELARUSIAN', kTeststr_be_Cyrl)
|
68
|
+
|
69
|
+
def testBULGARIAN(self):
|
70
|
+
self.runOne('BULGARIAN', kTeststr_bg_Cyrl)
|
71
|
+
|
72
|
+
# def testBIHARI(self):
|
73
|
+
# self.runOne('BIHARI', kTeststr_bh_Deva)
|
74
|
+
|
75
|
+
# def testBISLAMA(self):
|
76
|
+
# self.runOne('BISLAMA', kTeststr_bi_Latn)
|
77
|
+
|
78
|
+
# def testBENGALI(self):
|
79
|
+
# self.runOne('BENGALI', kTeststr_bn_Beng)
|
80
|
+
|
81
|
+
# def testTIBETAN(self):
|
82
|
+
# self.runOne('TIBETAN', kTeststr_bo_Tibt)
|
83
|
+
|
84
|
+
# def testBRETON(self):
|
85
|
+
# self.runOne('BRETON', kTeststr_br_Latn)
|
86
|
+
|
87
|
+
def testSERBIAN(self):
|
88
|
+
self.runOne('SERBIAN', kTeststr_bs_Cyrl) # NOTE: Not BOSNIAN
|
89
|
+
|
90
|
+
# def testCROATIAN(self):
|
91
|
+
# self.runOne('CROATIAN', kTeststr_bs_Latn) # NOTE: Not BOSNIAN
|
92
|
+
|
93
|
+
def testCATALAN(self):
|
94
|
+
self.runOne('CATALAN', kTeststr_ca_Latn)
|
95
|
+
|
96
|
+
def testCHEROKEE(self):
|
97
|
+
self.runOne('CHEROKEE', kTeststr_chr_Cher)
|
98
|
+
|
99
|
+
# def testCORSICAN(self):
|
100
|
+
# self.runOne('CORSICAN', kTeststr_co_Latn)
|
101
|
+
|
102
|
+
# No CREOLES_AND_PIDGINS_ENGLISH_BASED
|
103
|
+
# No CREOLES_AND_PIDGINS_FRENCH_BASED
|
104
|
+
# No CREOLES_AND_PIDGINS_OTHER
|
105
|
+
# No CREOLES_AND_PIDGINS_PORTUGUESE_BASED
|
106
|
+
def testCZECH(self):
|
107
|
+
self.runOne('CZECH', kTeststr_cs_Latn)
|
108
|
+
|
109
|
+
def testWELSH(self):
|
110
|
+
self.runOne('WELSH', kTeststr_cy_Latn)
|
111
|
+
|
112
|
+
def testDANISH(self):
|
113
|
+
self.runOne('DANISH', kTeststr_da_Latn)
|
114
|
+
|
115
|
+
def testGERMAN(self):
|
116
|
+
self.runOne('GERMAN', kTeststr_de_Latn)
|
117
|
+
|
118
|
+
def testDHIVEHI(self):
|
119
|
+
self.runOne('DHIVEHI', kTeststr_dv_Thaa)
|
120
|
+
|
121
|
+
# def testDZONGKHA(self):
|
122
|
+
# self.runOne('DZONGKHA', kTeststr_dz_Tibt)
|
123
|
+
|
124
|
+
def testGREEK(self):
|
125
|
+
self.runOne('GREEK', kTeststr_el_Grek)
|
126
|
+
|
127
|
+
def testENGLISH(self):
|
128
|
+
self.runOne('ENGLISH', kTeststr_en_Latn)
|
129
|
+
|
130
|
+
def testENGLISH2(self):
    # Second English sample. Given its own name (mirroring the
    # testHINDI / testHINDI2 pair) because a second `def testENGLISH` in
    # the same class body replaces the earlier one, so the
    # kTeststr_en_Latn case would never be collected or run.
    self.runOne('ENGLISH', kTeststr_en)
|
132
|
+
|
133
|
+
# def testESPERANTO(self):
|
134
|
+
# self.runOne('ESPERANTO', kTeststr_eo_Latn)
|
135
|
+
|
136
|
+
def testSPANISH(self):
|
137
|
+
self.runOne('SPANISH', kTeststr_es_Latn)
|
138
|
+
|
139
|
+
def testESTONIAN(self):
|
140
|
+
self.runOne('ESTONIAN', kTeststr_et_Latn)
|
141
|
+
|
142
|
+
# def testBASQUE(self):
|
143
|
+
# self.runOne('BASQUE', kTeststr_eu_Latn)
|
144
|
+
|
145
|
+
def testPERSIAN(self):
|
146
|
+
self.runOne('PERSIAN', kTeststr_fa_Arab)
|
147
|
+
|
148
|
+
def testFINNISH(self):
|
149
|
+
self.runOne('FINNISH', kTeststr_fi_Latn)
|
150
|
+
|
151
|
+
# def testFIJIAN(self):
|
152
|
+
# self.runOne('FIJIAN', kTeststr_fj_Latn)
|
153
|
+
|
154
|
+
# def testFAROESE(self):
|
155
|
+
# self.runOne('FAROESE', kTeststr_fo_Latn)
|
156
|
+
|
157
|
+
def testFRENCH(self):
|
158
|
+
self.runOne('FRENCH', kTeststr_fr_Latn)
|
159
|
+
|
160
|
+
# def testFRISIAN(self):
|
161
|
+
# self.runOne('FRISIAN', kTeststr_fy_Latn)
|
162
|
+
|
163
|
+
def testIRISH(self):
|
164
|
+
self.runOne('IRISH', kTeststr_ga_Latn)
|
165
|
+
|
166
|
+
# def testSCOTS_GAELIC(self):
|
167
|
+
# self.runOne('SCOTS_GAELIC', kTeststr_gd_Latn)
|
168
|
+
|
169
|
+
# def testGALICIAN(self):
|
170
|
+
# self.runOne('GALICIAN', kTeststr_gl_Latn)
|
171
|
+
|
172
|
+
# def testGUARANI(self):
|
173
|
+
# self.runOne('GUARANI', kTeststr_gn_Latn)
|
174
|
+
|
175
|
+
def testGUJARATI(self):
|
176
|
+
self.runOne('GUJARATI', kTeststr_gu_Gujr)
|
177
|
+
|
178
|
+
# def testMANX(self):
|
179
|
+
# self.runOne('MANX', kTeststr_gv_Latn)
|
180
|
+
|
181
|
+
# def testHAUSA(self):
|
182
|
+
# self.runOne('HAUSA', kTeststr_ha_Latn)
|
183
|
+
|
184
|
+
def testHINDI(self):
|
185
|
+
self.runOne('HINDI', kTeststr_hi_Deva)
|
186
|
+
|
187
|
+
def testHINDI2(self):
|
188
|
+
self.runOne('HINDI', kTeststr_ks)
|
189
|
+
|
190
|
+
def testCROATIAN(self):
|
191
|
+
self.runOne('CROATIAN', kTeststr_hr_Latn) # NOTE: now CROATIAN
|
192
|
+
|
193
|
+
# def testHAITIAN_CREOLE(self):
|
194
|
+
# self.runOne('HAITIAN_CREOLE', kTeststr_ht_Latn)
|
195
|
+
|
196
|
+
def testHUNGARIAN(self):
|
197
|
+
self.runOne('HUNGARIAN', kTeststr_hu_Latn)
|
198
|
+
|
199
|
+
def testARMENIAN(self):
|
200
|
+
self.runOne('ARMENIAN', kTeststr_hy_Armn)
|
201
|
+
|
202
|
+
# def testINTERLINGUA(self):
|
203
|
+
# self.runOne('INTERLINGUA', kTeststr_ia_Latn)
|
204
|
+
|
205
|
+
def testMALAY(self):
|
206
|
+
self.runOne('MALAY', kTeststr_id_Latn)
|
207
|
+
|
208
|
+
# def testINTERLINGUE(self):
|
209
|
+
# self.runOne('INTERLINGUE', kTeststr_ie_Latn)
|
210
|
+
|
211
|
+
# def testINUPIAK(self):
|
212
|
+
# self.runOne('INUPIAK', kTeststr_ik_Latn)
|
213
|
+
|
214
|
+
def testICELANDIC(self):
|
215
|
+
self.runOne('ICELANDIC', kTeststr_is_Latn)
|
216
|
+
|
217
|
+
def testITALIAN(self):
|
218
|
+
self.runOne('ITALIAN', kTeststr_it_Latn)
|
219
|
+
|
220
|
+
def testINUKTITUT(self):
|
221
|
+
self.runOne('INUKTITUT', kTeststr_iu_Cans)
|
222
|
+
|
223
|
+
def testHEBREW(self):
|
224
|
+
self.runOne('HEBREW', kTeststr_iw_Hebr)
|
225
|
+
|
226
|
+
def testJAPANESE(self):
|
227
|
+
self.runOne('Japanese', kTeststr_ja_Hani)
|
228
|
+
|
229
|
+
# def testJAVANESE(self):
|
230
|
+
# self.runOne('JAVANESE', kTeststr_jw_Latn)
|
231
|
+
|
232
|
+
def testGEORGIAN(self):
|
233
|
+
self.runOne('GEORGIAN', kTeststr_ka_Geor)
|
234
|
+
|
235
|
+
# def testKHASI(self):
|
236
|
+
# self.runOne('KHASI', kTeststr_kha_Latn)
|
237
|
+
|
238
|
+
# def testKAZAKH(self):
|
239
|
+
# self.runOne('KAZAKH', kTeststr_kk_Arab)
|
240
|
+
|
241
|
+
# def testKAZAKH(self):
|
242
|
+
# self.runOne('KAZAKH', kTeststr_kk_Cyrl)
|
243
|
+
|
244
|
+
# def testKAZAKH(self):
|
245
|
+
# self.runOne('KAZAKH', kTeststr_kk_Latn)
|
246
|
+
|
247
|
+
# def testGREENLANDIC(self):
|
248
|
+
# self.runOne('GREENLANDIC', kTeststr_kl_Latn)
|
249
|
+
|
250
|
+
def testKHMER(self):
|
251
|
+
self.runOne('KHMER', kTeststr_km_Khmr)
|
252
|
+
|
253
|
+
def testKANNADA(self):
|
254
|
+
self.runOne('KANNADA', kTeststr_kn_Knda)
|
255
|
+
|
256
|
+
def testKOREAN(self):
    # Hand the ko_Hani sample to the shared runOne helper. The name passed is
    # the mixed-case 'Korean' (most tests here pass upper-case names); it is
    # kept exactly as written.
    language_name = 'Korean'
    self.runOne(language_name, kTeststr_ko_Hani)
|
258
|
+
|
259
|
+
# def testKASHMIRI(self):
|
260
|
+
# self.runOne('KASHMIRI', kTeststr_ks_Deva)
|
261
|
+
|
262
|
+
# KURDISH Latn removed 2008.05.27. Just KURDISH Arab left
|
263
|
+
# def testKURDISH(self):
|
264
|
+
# self.runOne('KURDISH', kTeststr_ku_Arab)
|
265
|
+
|
266
|
+
# def testKURDISH(self):
|
267
|
+
# self.runOne('KURDISH', kTeststr_ku_Latn)
|
268
|
+
|
269
|
+
# def testKYRGYZ(self):
|
270
|
+
# self.runOne('KYRGYZ', kTeststr_ky_Arab)
|
271
|
+
|
272
|
+
# def testKYRGYZ(self):
|
273
|
+
# self.runOne('KYRGYZ', kTeststr_ky_Cyrl)
|
274
|
+
|
275
|
+
|
276
|
+
# def testLATIN(self):
|
277
|
+
# self.runOne('LATIN', kTeststr_la_Latn)
|
278
|
+
|
279
|
+
# def testLUXEMBOURGISH(self):
|
280
|
+
# self.runOne('LUXEMBOURGISH', kTeststr_lb_Latn)
|
281
|
+
|
282
|
+
# def testGANDA(self):
|
283
|
+
# self.runOne('GANDA', kTeststr_lg_Latn)
|
284
|
+
|
285
|
+
# def testLINGALA(self):
|
286
|
+
# self.runOne('LINGALA', kTeststr_ln_Latn)
|
287
|
+
|
288
|
+
def testLAOTHIAN(self):
    # Hand the lo_Laoo sample and the name 'LAOTHIAN' to the shared runOne helper.
    language_name = 'LAOTHIAN'
    self.runOne(language_name, kTeststr_lo_Laoo)
|
290
|
+
|
291
|
+
def testLITHUANIAN(self):
    # Hand the lt_Latn sample and the name 'LITHUANIAN' to the shared runOne helper.
    language_name = 'LITHUANIAN'
    self.runOne(language_name, kTeststr_lt_Latn)
|
293
|
+
|
294
|
+
def testLATVIAN(self):
    # Hand the lv_Latn sample and the name 'LATVIAN' to the shared runOne helper.
    language_name = 'LATVIAN'
    self.runOne(language_name, kTeststr_lv_Latn)
|
296
|
+
|
297
|
+
# def testMALAGASY(self):
|
298
|
+
# self.runOne('MALAGASY', kTeststr_mg_Latn)
|
299
|
+
|
300
|
+
# def testMAORI(self):
|
301
|
+
# self.runOne('MAORI', kTeststr_mi_Latn)
|
302
|
+
|
303
|
+
def testMACEDONIAN(self):
    # Hand the mk_Cyrl sample and the name 'MACEDONIAN' to the shared runOne helper.
    language_name = 'MACEDONIAN'
    self.runOne(language_name, kTeststr_mk_Cyrl)
|
305
|
+
|
306
|
+
def testMALAYALAM(self):
    # Hand the ml_Mlym sample and the name 'MALAYALAM' to the shared runOne helper.
    language_name = 'MALAYALAM'
    self.runOne(language_name, kTeststr_ml_Mlym)
|
308
|
+
|
309
|
+
# def testMONGOLIAN(self):
|
310
|
+
# self.runOne('MONGOLIAN', kTeststr_mn_Cyrl)
|
311
|
+
|
312
|
+
# def testMOLDAVIAN(self):
|
313
|
+
# self.runOne('MOLDAVIAN', kTeststr_mo_Cyrl)
|
314
|
+
|
315
|
+
# def testMARATHI(self):
|
316
|
+
# self.runOne('MARATHI', kTeststr_mr_Deva)
|
317
|
+
|
318
|
+
def testMALAY_ms_Latn(self):
    # Checks the ms_Latn sample against the name 'MALAY' via runOne.
    #
    # This method deliberately does NOT use the bare name testMALAY: that
    # name is defined again further down in this file, and (assuming the
    # same enclosing class) a later definition replaces an earlier one in
    # the class body, so under the shared name this check would never be
    # collected or run by unittest.
    # NOTE(review): this check was previously shadowed and therefore never
    # executed; if it fails, confirm the expected name for the ms_Latn sample.
    self.runOne('MALAY', kTeststr_ms_Latn)
|
320
|
+
|
321
|
+
# def testMALAY(self):
|
322
|
+
# self.runOne('MALAY', kTeststr_ms_Latn2)
|
323
|
+
|
324
|
+
def testMALAY(self):
    # Hand the ms_Latn3 sample and the name 'MALAY' to the shared runOne helper.
    language_name = 'MALAY'
    self.runOne(language_name, kTeststr_ms_Latn3)
|
326
|
+
|
327
|
+
# def testMALTESE(self):
|
328
|
+
# self.runOne('MALTESE', kTeststr_mt_Latn)
|
329
|
+
|
330
|
+
# def testBURMESE(self):
|
331
|
+
# self.runOne('BURMESE', kTeststr_my_Latn)
|
332
|
+
|
333
|
+
# def testBURMESE(self):
|
334
|
+
# self.runOne('BURMESE', kTeststr_my_Mymr)
|
335
|
+
|
336
|
+
# def testNAURU(self):
|
337
|
+
# self.runOne('NAURU', kTeststr_na_Latn)
|
338
|
+
|
339
|
+
# def testNEPALI(self):
|
340
|
+
# self.runOne('NEPALI', kTeststr_ne_Deva)
|
341
|
+
|
342
|
+
def testDUTCH(self):
    # Hand the nl_Latn sample and the name 'DUTCH' to the shared runOne helper.
    language_name = 'DUTCH'
    self.runOne(language_name, kTeststr_nl_Latn)
|
344
|
+
|
345
|
+
# def testNORWEGIAN_N(self):
|
346
|
+
# self.runOne('NORWEGIAN_N', kTeststr_nn_Latn)
|
347
|
+
|
348
|
+
def testNORWEGIAN(self):
    # Hand the no_Latn sample and the name 'NORWEGIAN' to the shared runOne helper.
    language_name = 'NORWEGIAN'
    self.runOne(language_name, kTeststr_no_Latn)
|
350
|
+
|
351
|
+
|
352
|
+
# def testOCCITAN(self):
|
353
|
+
# self.runOne('OCCITAN', kTeststr_oc_Latn)
|
354
|
+
|
355
|
+
# def testOROMO(self):
|
356
|
+
# self.runOne('OROMO', kTeststr_om_Latn)
|
357
|
+
|
358
|
+
def testORIYA(self):
    # Hand the or_Orya sample and the name 'ORIYA' to the shared runOne helper.
    language_name = 'ORIYA'
    self.runOne(language_name, kTeststr_or_Orya)
|
360
|
+
|
361
|
+
def testPUNJABI(self):
    # Hand the pa_Guru sample and the name 'PUNJABI' to the shared runOne helper.
    language_name = 'PUNJABI'
    self.runOne(language_name, kTeststr_pa_Guru)
|
363
|
+
|
364
|
+
def testPOLISH(self):
    # Hand the pl_Latn sample and the name 'POLISH' to the shared runOne helper.
    language_name = 'POLISH'
    self.runOne(language_name, kTeststr_pl_Latn)
|
366
|
+
|
367
|
+
# def testPASHTO(self):
|
368
|
+
# self.runOne('PASHTO', kTeststr_ps_Arab)
|
369
|
+
|
370
|
+
def testPORTUGUESE(self):
    # Hand the pt_BR sample to the shared runOne helper.
    # NOTE: the name passed is plain PORTUGUESE, not PORTUGUESE_B
    # nor PORTUGUESE_P.
    language_name = 'PORTUGUESE'
    self.runOne(language_name, kTeststr_pt_BR)
|
373
|
+
|
374
|
+
# def testQUECHUA(self):
|
375
|
+
# self.runOne('QUECHUA', kTeststr_qu_Latn)
|
376
|
+
|
377
|
+
# def testRHAETO_ROMANCE(self):
|
378
|
+
# self.runOne('RHAETO_ROMANCE', kTeststr_rm_Latn)
|
379
|
+
|
380
|
+
# def testRUNDI(self):
|
381
|
+
# self.runOne('RUNDI', kTeststr_rn_Latn)
|
382
|
+
|
383
|
+
def testROMANIAN(self):
    # Hand the ro_Latn sample and the name 'ROMANIAN' to the shared runOne helper.
    language_name = 'ROMANIAN'
    self.runOne(language_name, kTeststr_ro_Latn)
|
385
|
+
|
386
|
+
def testRUSSIAN(self):
    # Hand the ru_Cyrl sample and the name 'RUSSIAN' to the shared runOne helper.
    language_name = 'RUSSIAN'
    self.runOne(language_name, kTeststr_ru_Cyrl)
|
388
|
+
|
389
|
+
# def testKINYARWANDA(self):
|
390
|
+
# self.runOne('KINYARWANDA', kTeststr_rw_Latn)
|
391
|
+
|
392
|
+
# def testSANSKRIT(self):
|
393
|
+
# self.runOne('SANSKRIT', kTeststr_sa_Deva)
|
394
|
+
|
395
|
+
# def testSANSKRIT(self):
|
396
|
+
# self.runOne('SANSKRIT', kTeststr_sa_Latn)
|
397
|
+
|
398
|
+
# def testSCOTS(self):
|
399
|
+
# self.runOne('SCOTS', kTeststr_sco_Latn)
|
400
|
+
|
401
|
+
# def testSINDHI(self):
|
402
|
+
# self.runOne('SINDHI', kTeststr_sd_Arab)
|
403
|
+
|
404
|
+
# def testSANGO(self):
|
405
|
+
# self.runOne('SANGO', kTeststr_sg_Latn)
|
406
|
+
|
407
|
+
# No SERBO_CROATIAN (sh)
|
408
|
+
def testSINHALESE(self):
    # Hand the si_Sinh sample and the name 'SINHALESE' to the shared runOne helper.
    language_name = 'SINHALESE'
    self.runOne(language_name, kTeststr_si_Sinh)
|
410
|
+
|
411
|
+
# def testLIMBU(self):
|
412
|
+
# self.runOne('LIMBU', kTeststr_sit_NP)
|
413
|
+
|
414
|
+
def testSLOVAK(self):
    # Hand the sk_Latn sample and the name 'SLOVAK' to the shared runOne helper.
    language_name = 'SLOVAK'
    self.runOne(language_name, kTeststr_sk_Latn)
|
416
|
+
|
417
|
+
def testSLOVENIAN(self):
    # Hand the sl_Latn sample and the name 'SLOVENIAN' to the shared runOne helper.
    language_name = 'SLOVENIAN'
    self.runOne(language_name, kTeststr_sl_Latn)
|
419
|
+
|
420
|
+
# def testSAMOAN(self):
|
421
|
+
# self.runOne('SAMOAN', kTeststr_sm_Latn)
|
422
|
+
|
423
|
+
# def testSHONA(self):
|
424
|
+
# self.runOne('SHONA', kTeststr_sn_Latn)
|
425
|
+
|
426
|
+
# def testSOMALI(self):
|
427
|
+
# self.runOne('SOMALI', kTeststr_so_Latn)
|
428
|
+
|
429
|
+
# def testALBANIAN(self):
|
430
|
+
# self.runOne('ALBANIAN', kTeststr_sq_Latn)
|
431
|
+
|
432
|
+
def testSERBIAN(self):
    # Hand the sr_Cyrl sample to the shared runOne helper.
    # NOTE: the name passed is now SERBIAN.
    language_name = 'SERBIAN'
    self.runOne(language_name, kTeststr_sr_Cyrl)
|
434
|
+
|
435
|
+
def testCROATIAN_sr_Latn(self):
    # Checks the sr_Latn sample against the name 'CROATIAN' via runOne.
    # NOTE: the name passed is CROATIAN, not SERBIAN.
    #
    # This method deliberately does NOT use the bare name testCROATIAN: that
    # name is defined again further down in this file, and (assuming the
    # same enclosing class) a later definition replaces an earlier one in
    # the class body, so under the shared name this check would never be
    # collected or run by unittest.
    # NOTE(review): this check was previously shadowed and therefore never
    # executed; if it fails, confirm the expected name for the sr_Latn sample.
    self.runOne('CROATIAN', kTeststr_sr_Latn)
|
437
|
+
|
438
|
+
def testCROATIAN(self):
    # Hand the sr_ME_Latn sample to the shared runOne helper.
    # NOTE: the name passed is CROATIAN, not SERBIAN nor MONTENEGRIN.
    language_name = 'CROATIAN'
    self.runOne(language_name, kTeststr_sr_ME_Latn)
|
440
|
+
|
441
|
+
# def testSISWANT(self):
|
442
|
+
# self.runOne('SISWANT', kTeststr_ss_Latn)
|
443
|
+
|
444
|
+
# def testSESOTHO(self):
|
445
|
+
# self.runOne('SESOTHO', kTeststr_st_Latn)
|
446
|
+
|
447
|
+
# def testSUNDANESE(self):
|
448
|
+
# self.runOne('SUNDANESE', kTeststr_su_Latn)
|
449
|
+
|
450
|
+
def testSWEDISH(self):
    # Hand the sv_Latn sample and the name 'SWEDISH' to the shared runOne helper.
    language_name = 'SWEDISH'
    self.runOne(language_name, kTeststr_sv_Latn)
|
452
|
+
|
453
|
+
def testSWAHILI(self):
    # Hand the sw_Latn sample and the name 'SWAHILI' to the shared runOne helper.
    language_name = 'SWAHILI'
    self.runOne(language_name, kTeststr_sw_Latn)
|
455
|
+
|
456
|
+
def testSYRIAC(self):
    # Hand the syr_Syrc sample and the name 'SYRIAC' to the shared runOne helper.
    language_name = 'SYRIAC'
    self.runOne(language_name, kTeststr_syr_Syrc)
|
458
|
+
|
459
|
+
def testTAMIL(self):
    # Hand the ta_Taml sample and the name 'TAMIL' to the shared runOne helper.
    language_name = 'TAMIL'
    self.runOne(language_name, kTeststr_ta_Taml)
|
461
|
+
|
462
|
+
def testTELUGU(self):
    # Hand the te_Telu sample and the name 'TELUGU' to the shared runOne helper.
    language_name = 'TELUGU'
    self.runOne(language_name, kTeststr_te_Telu)
|
464
|
+
|
465
|
+
# Tajik Arab removed 2008.05.27. Just Tajik Cyrl left
|
466
|
+
# def testTAJIK(self):
|
467
|
+
# self.runOne('TAJIK', kTeststr_tg_Arab)
|
468
|
+
|
469
|
+
# def testTAJIK(self):
|
470
|
+
# self.runOne('TAJIK', kTeststr_tg_Cyrl)
|
471
|
+
|
472
|
+
def testTHAI(self):
    # Hand the th_Thai sample and the name 'THAI' to the shared runOne helper.
    language_name = 'THAI'
    self.runOne(language_name, kTeststr_th_Thai)
|
474
|
+
|
475
|
+
# def testTIGRINYA(self):
|
476
|
+
# self.runOne('TIGRINYA', kTeststr_ti_Ethi)
|
477
|
+
|
478
|
+
# def testTURKMEN(self):
|
479
|
+
# self.runOne('TURKMEN', kTeststr_tk_Cyrl)
|
480
|
+
|
481
|
+
# def testTURKMEN(self):
|
482
|
+
# self.runOne('TURKMEN', kTeststr_tk_Latn)
|
483
|
+
|
484
|
+
def testTAGALOG(self):
    # Hand the tl_Latn sample and the name 'TAGALOG' to the shared runOne helper.
    language_name = 'TAGALOG'
    self.runOne(language_name, kTeststr_tl_Latn)
|
486
|
+
|
487
|
+
# def testTSWANA(self):
|
488
|
+
# self.runOne('TSWANA', kTeststr_tn_Latn)
|
489
|
+
|
490
|
+
# def testTONGA(self):
|
491
|
+
# self.runOne('TONGA', kTeststr_to_Latn)
|
492
|
+
|
493
|
+
def testTURKISH(self):
    # Hand the tr_Latn sample and the name 'TURKISH' to the shared runOne helper.
    language_name = 'TURKISH'
    self.runOne(language_name, kTeststr_tr_Latn)
|
495
|
+
|
496
|
+
# def testTSONGA(self):
|
497
|
+
# self.runOne('TSONGA', kTeststr_ts_Latn)
|
498
|
+
|
499
|
+
# def testTATAR(self):
|
500
|
+
# self.runOne('TATAR', kTeststr_tt_Cyrl)
|
501
|
+
|
502
|
+
# def testTATAR(self):
|
503
|
+
# self.runOne('TATAR', kTeststr_tt_Latn)
|
504
|
+
|
505
|
+
# def testTWI(self):
|
506
|
+
# self.runOne('TWI', kTeststr_tw_Latn)
|
507
|
+
|
508
|
+
# def testUIGHUR(self):
|
509
|
+
# self.runOne('UIGHUR', kTeststr_ug_Arab)
|
510
|
+
|
511
|
+
# def testUIGHUR(self):
|
512
|
+
# self.runOne('UIGHUR', kTeststr_ug_Cyrl)
|
513
|
+
|
514
|
+
# def testUIGHUR(self):
|
515
|
+
# self.runOne('UIGHUR', kTeststr_ug_Latn)
|
516
|
+
|
517
|
+
def testUKRAINIAN(self):
    # Hand the uk_Cyrl sample and the name 'UKRAINIAN' to the shared runOne helper.
    language_name = 'UKRAINIAN'
    self.runOne(language_name, kTeststr_uk_Cyrl)
|
519
|
+
|
520
|
+
# def testURDU(self):
|
521
|
+
# self.runOne('URDU', kTeststr_ur_Arab)
|
522
|
+
|
523
|
+
# def testUZBEK(self):
|
524
|
+
# self.runOne('UZBEK', kTeststr_uz_Arab)
|
525
|
+
|
526
|
+
# def testUZBEK(self):
|
527
|
+
# self.runOne('UZBEK', kTeststr_uz_Cyrl)
|
528
|
+
|
529
|
+
# def testUZBEK(self):
|
530
|
+
# self.runOne('UZBEK', kTeststr_uz_Latn)
|
531
|
+
|
532
|
+
def testVIETNAMESE(self):
    # Hand the vi_Latn sample and the name 'VIETNAMESE' to the shared runOne helper.
    language_name = 'VIETNAMESE'
    self.runOne(language_name, kTeststr_vi_Latn)
|
534
|
+
|
535
|
+
# def testVOLAPUK(self):
|
536
|
+
# self.runOne('VOLAPUK', kTeststr_vo_Latn)
|
537
|
+
|
538
|
+
# def testWOLOF(self):
|
539
|
+
# self.runOne('WOLOF', kTeststr_wo_Latn)
|
540
|
+
|
541
|
+
# def testXHOSA(self):
|
542
|
+
# self.runOne('XHOSA', kTeststr_xh_Latn)
|
543
|
+
|
544
|
+
def testYIDDISH(self):
    # Hand the yi_Hebr sample and the name 'YIDDISH' to the shared runOne helper.
    language_name = 'YIDDISH'
    self.runOne(language_name, kTeststr_yi_Hebr)
|
546
|
+
|
547
|
+
# def testYORUBA(self):
|
548
|
+
# self.runOne('YORUBA', kTeststr_yo_Latn)
|
549
|
+
|
550
|
+
# Zhuang Hani removed 2008.05.13. Just Zhuang Latn left
|
551
|
+
# def testZHUANG(self):
|
552
|
+
# self.runOne('ZHUANG', kTeststr_za_Hani)
|
553
|
+
|
554
|
+
# def testZHUANG(self):
|
555
|
+
# self.runOne('ZHUANG', kTeststr_za_Latn)
|
556
|
+
|
557
|
+
def testCHINESE(self):
    # Hand the zh_Hani sample to the shared runOne helper. The name passed is
    # the mixed-case 'Chinese' (most tests here pass upper-case names); it is
    # kept exactly as written.
    language_name = 'Chinese'
    self.runOne(language_name, kTeststr_zh_Hani)
|
559
|
+
|
560
|
+
def testCHINESE_T(self):
    # Hand the zh_TW sample to the shared runOne helper. The name passed is
    # the mixed-case 'ChineseT' (most tests here pass upper-case names); it is
    # kept exactly as written.
    language_name = 'ChineseT'
    self.runOne(language_name, kTeststr_zh_TW)
|
562
|
+
|
563
|
+
# def testZULU(self):
|
564
|
+
# self.runOne('ZULU', kTeststr_zu_Latn)
|
565
|
+
|
566
|
+
# No TG_UNKNOWN_LANGUAGE
|
567
|
+
# No UNKNOWN_LANGUAGE
|
568
|
+
|
569
|
+
# Script entry point: when this file is executed directly (rather than
# imported), run the tests through unittest's command-line runner.
if __name__ == '__main__':
    unittest.main()
|