cld 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (107) hide show
  1. data/LICENSE +27 -0
  2. data/Manifest +106 -0
  3. data/README.rdoc +173 -0
  4. data/Rakefile +15 -0
  5. data/base/basictypes.h +348 -0
  6. data/base/build_config.h +115 -0
  7. data/base/casts.h +156 -0
  8. data/base/commandlineflags.h +443 -0
  9. data/base/crash.h +41 -0
  10. data/base/dynamic_annotations.h +358 -0
  11. data/base/global_strip_options.h +59 -0
  12. data/base/log_severity.h +46 -0
  13. data/base/logging.h +1403 -0
  14. data/base/macros.h +243 -0
  15. data/base/port.h +54 -0
  16. data/base/scoped_ptr.h +428 -0
  17. data/base/stl_decl.h +0 -0
  18. data/base/stl_decl_msvc.h +107 -0
  19. data/base/string_util.h +29 -0
  20. data/base/strtoint.h +93 -0
  21. data/base/template_util.h +96 -0
  22. data/base/type_traits.h +198 -0
  23. data/base/vlog_is_on.h +143 -0
  24. data/build.sh +48 -0
  25. data/build.win.cmd +28 -0
  26. data/cld.gemspec +30 -0
  27. data/cld_encodings.h +95 -0
  28. data/encodings/compact_lang_det/#cldutil.cc# +905 -0
  29. data/encodings/compact_lang_det/#cldutil.h# +1205 -0
  30. data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  31. data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  32. data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  33. data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  34. data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  35. data/encodings/compact_lang_det/#tote.cc# +299 -0
  36. data/encodings/compact_lang_det/#tote.h# +89 -0
  37. data/encodings/compact_lang_det/cldutil.cc +905 -0
  38. data/encodings/compact_lang_det/cldutil.h +1205 -0
  39. data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  40. data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  41. data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  42. data/encodings/compact_lang_det/compact_lang_det.h +145 -0
  43. data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  44. data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  45. data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  46. data/encodings/compact_lang_det/compile.cmd +1 -0
  47. data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  48. data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  49. data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  50. data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  51. data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  52. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  53. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  54. data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  55. data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  56. data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  57. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  58. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  59. data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  60. data/encodings/compact_lang_det/getonescriptspan.h +131 -0
  61. data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  62. data/encodings/compact_lang_det/letterscript_enum.h +99 -0
  63. data/encodings/compact_lang_det/subsetsequence.cc +259 -0
  64. data/encodings/compact_lang_det/subsetsequence.h +44 -0
  65. data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  66. data/encodings/compact_lang_det/tote.cc +299 -0
  67. data/encodings/compact_lang_det/tote.h +89 -0
  68. data/encodings/compact_lang_det/unittest_data.h +193 -0
  69. data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  70. data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  71. data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  72. data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  73. data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  74. data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  75. data/encodings/compact_lang_det/win/cld_google.h +18 -0
  76. data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  77. data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  78. data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  79. data/encodings/compact_lang_det/win/cld_logging.h +21 -0
  80. data/encodings/compact_lang_det/win/cld_macros.h +19 -0
  81. data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  82. data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  83. data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  84. data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  85. data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  86. data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  87. data/encodings/compact_lang_det/win/cld_utf.h +24 -0
  88. data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  89. data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  90. data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  91. data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  92. data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  93. data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  94. data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  95. data/encodings/internal/encodings.cc +12 -0
  96. data/encodings/lang_enc.h +254 -0
  97. data/encodings/proto/encodings.pb.h +169 -0
  98. data/encodings/public/encodings.h +301 -0
  99. data/ext/cld/extconf.rb +7 -0
  100. data/languages/internal/#languages.cc# +337 -0
  101. data/languages/internal/languages.cc +337 -0
  102. data/languages/proto/languages.pb.h +179 -0
  103. data/languages/public/languages.h +379 -0
  104. data/lib/cld.rb +12 -0
  105. data/test/test.rb +570 -0
  106. data/thunk.cc +131 -0
  107. metadata +168 -0
@@ -0,0 +1,379 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef LANGUAGES_PUBLIC_LANGUAGES_H_
6
+ #define LANGUAGES_PUBLIC_LANGUAGES_H_
7
+
8
+ // This interface defines the Language enum and functions that depend
9
+ // only on Language values.
10
+
11
+ // A hash-function for Language, hash<Language>, is defined in
12
+ // i18n/languages/public/languages-hash.h
13
+
14
+ #ifndef SWIG
15
+ // Language enum defined in languages.proto
16
+ // Also description on how to add languages.
17
+ #include "languages/proto/languages.pb.h"
18
+
19
+ // We need this for compatibility:
20
+ // - The Language enum in the default namespace.
21
+ // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
22
+ //using namespace i18n::languages;
23
+
24
+ #else
25
+ // And we must have a swig-compatible enum.
26
+ // This one is a simple cleaned up version of language.proto, making the enum
27
+ // compatible with C++.
28
+ #include "i18n/languages/internal/languages_proto_wrapper.h"
29
+
30
+ #endif
31
+
32
+ // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
33
+ //#include "util/utf8/proptables/script_enum.h"
34
+
35
+ const int kNumLanguages = NUM_LANGUAGES;
36
+
37
+ // Return the default language (ENGLISH).
38
+ Language default_language();
39
+
40
+
41
+ // *******************************************
42
+ // Language predicates
43
+ // IsValidLanguage()
44
+ // IS_LANGUAGE_UNKNOWN()
45
+ // IsCJKLanguage()
46
+ // IsChineseLanguage()
47
+ // IsNorwegianLanguage()
48
+ // IsPortugueseLanguage()
49
+ // IsRightToLeftLanguage()
50
+ // IsMaybeRightToLeftLanguage()
51
+ // IsSameLanguage()
52
+ // IsScriptRequiringLongerSnippets()
53
+ // *******************************************
54
+
55
+ // IsValidLanguage
56
+ // ===============
57
+ //
58
+ // Function to check if the input is within range of the Language enum. If
59
+ // IsValidLanguage(lang) returns true, it is safe to call
60
+ // static_cast<Language>(lang).
61
+ //
62
+ inline bool IsValidLanguage(int lang) {
63
+ // In range means 0 <= lang < kNumLanguages.
+ return ((lang >= 0) && (lang < kNumLanguages));
64
+ }
65
+
66
+ // Return true if the language is "unknown". (This function was
67
+ // previously a macro, hence the spelling in all caps.)
68
+ //
69
+ inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
70
+ // Either of the two "unknown" enum values counts as unknown.
+ return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE;
71
+ }
72
+
73
+ // IsCJKLanguage
74
+ // -------------
75
+ //
76
+ // This function returns true if the language is either Chinese
77
+ // (simplified or traditional), Japanese, or Korean.
78
+ bool IsCJKLanguage(Language lang);
79
+
80
+ // IsChineseLanguage
81
+ // -----------------
82
+ //
83
+ // This function returns true if the language is either Chinese
84
+ // (simplified or traditional)
85
+ bool IsChineseLanguage(Language lang);
86
+
87
+ // IsNorwegianLanguage
88
+ // --------------------
89
+ //
90
+ // This function returns true if the language is any of the Norwegian
91
+ // languages (regular or Nynorsk).
92
+ bool IsNorwegianLanguage(Language lang);
93
+
94
+ // IsPortugueseLanguage
95
+ // --------------------
96
+ //
97
+ // This function returns true if the language is any of the Portuguese
98
+ // languages (regular, Portugal or Brazil)
99
+ bool IsPortugueseLanguage(Language lang);
100
+
101
+ // IsSameLanguage
102
+ // --------------
103
+ //
104
+ // WARNING: This function provides only a simple test on the values of
105
+ // the two Language arguments. It returns false if either language is
106
+ // invalid. It returns true if the language arguments are equal, or
107
+ // if they are both Chinese languages, both Norwegian languages, or
108
+ // both Portuguese languages, as defined by IsChineseLanguage,
109
+ // IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
110
+ // false.
111
+ bool IsSameLanguage(Language lang1, Language lang2);
112
+
113
+
114
+ // IsRightToLeftLanguage
115
+ // ---------------------
116
+ //
117
+ // This function returns true if the language is only written right-to-left
118
+ // (E.g., Hebrew, Arabic, Persian etc.)
119
+ //
120
+ // IMPORTANT NOTE: Technically we're talking about scripts, not languages.
121
+ // There are languages that can be written in more than one script.
122
+ // Examples:
123
+ // - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
124
+ // Latin or Cyrillic script, and right-to-left in Arabic script.
125
+ // - Sindhi and Punjabi are written in different scripts, depending on
126
+ // region and dialect.
127
+ // - Turkmen used an Arabic script historically, but not any more.
128
+ // - Pashto and Uyghur can use Arabic script, but use a Roman script
129
+ // on the Internet.
130
+ // - Kashmiri and Urdu are written either with Arabic or Devanagari script.
131
+ //
132
+ // This function only returns true for languages that are always, unequivocally
133
+ // written in right-to-left script.
134
+ //
135
+ // TODO(benjy): If we want to do anything special with multi-script languages
136
+ // we should create new 'languages' for each language+script, as we do for
137
+ // traditional vs. simplified Chinese. However most such languages are rare in
138
+ // use and even rarer on the web, so this is unlikely to be something we'll
139
+ // be concerned with for a while.
140
+ bool IsRightToLeftLanguage(Language lang);
141
+
142
+ // IsMaybeRightToLeftLanguage
143
+ // --------------------------
144
+ //
145
+ // This function returns true if the language may appear on the web in a
146
+ // right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
147
+ //
148
+ // NOTE: See important notes under IsRightToLeftLanguage(...).
149
+ //
150
+ // This function returns true for languages that *may* appear on the web in a
151
+ // right-to-left script, even if they may also appear in a left-to-right
152
+ // script.
153
+ //
154
+ // This function should typically be used in cases where doing some work on
155
+ // left-to-right text would be OK (usually a no-op), and this function is used
156
+ // just to cut down on unnecessary work on regular, LTR text.
157
+ bool IsMaybeRightToLeftLanguage(Language lang);
158
+
159
+ // IsScriptRequiringLongerSnippets
160
+ // --------------------
161
+ //
162
+ // This function returns true if the script characteristics require longer
163
+ // snippet length (Devanagari, Bengali, Gurmukhi,
164
+ // Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
165
+ // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
166
+ // bool IsScriptRequiringLongerSnippets(UnicodeScript script);
167
+
168
+
169
+ // *******************************************
170
+ // LANGUAGE NAMES
171
+ //
172
+ // This interface defines a standard name for each valid Language,
173
+ // and a standard name for invalid languages. Some language names use all
174
+ // uppercase letters, but others use mixed case.
175
+ // LanguageName() [Language to name]
176
+ // LanguageEnumName() [language to enum name]
177
+ // LanguageFromName() [name to Language]
178
+ // default_language_name()
179
+ // invalid_language_name()
180
+ // *******************************************
181
+
182
+ // Given a Language, returns its standard name.
183
+ // Return invalid_language_name() if the language is invalid.
184
+ const char* LanguageName(Language lang);
185
+
186
+ // Given a Language, return the name of the enum constant for that
187
+ // language. In all but a few cases, this is the same as its standard
188
+ // name. For example, LanguageName(CHINESE) returns "Chinese", but
189
+ // LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
190
+ // code that is generating C++ code, where the enum constant is more
191
+ // useful than its integer value. Return "NUM_LANGUAGES" if
192
+ // the language is invalid.
193
+ const char* LanguageEnumName(Language lang);
194
+
195
+ // The maximum length of a standard language name.
196
+ const int kMaxLanguageNameSize = 50;
197
+
198
+ // The standard name for the default language.
199
+ const char* default_language_name();
200
+
201
+ // The standard name for all invalid languages.
202
+ const char* invalid_language_name();
203
+
204
+ // If lang_name matches the standard name of a Language, using a
205
+ // case-insensitive comparison, set *language to that Language and
206
+ // return true.
207
+ // Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
208
+ //
209
+ // For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
210
+ // for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
211
+ // For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
212
+ // as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
213
+ // as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
214
+ // CHINESE_T (i.e., a synonym for "ChineseT").
215
+ //
216
+ // REQUIRES: language must not be NULL.
217
+ //
218
+ bool LanguageFromName(const char* lang_name, Language *language);
219
+
220
+
221
+
222
+ // *******************************************
223
+ // LANGUAGE CODES
224
+ //
225
+ // This interface defines a standard code for each valid language, and
226
+ // a standard code for invalid languages. These are derived from ISO codes,
227
+ // with some Google additions.
228
+ // LanguageCode()
229
+ // default_language_code()
230
+ // invalid_language_code()
231
+ // LanguageCodeWithDialects()
232
+ // LanguageCodeISO639_1()
233
+ // LanguageCodeISO639_2()
234
+ // *******************************************
235
+
236
+ // Given a Language, return its standard code. There are Google-specific codes:
237
+ // For CHINESE_T, return "zh-TW".
238
+ // For TG_UNKNOWN_LANGUAGE, return "ut".
239
+ // For UNKNOWN_LANGUAGE, return "un".
240
+ // For PORTUGUESE_P, return "pt-PT".
241
+ // For PORTUGUESE_B, return "pt-BR".
242
+ // For LIMBU, return "sit-NP".
243
+ // For CHEROKEE, return "chr".
244
+ // For SYRIAC, return "syr".
245
+ // Otherwise return the ISO 639-1 two-letter language code for lang.
246
+ // If lang is invalid, return invalid_language_code().
247
+ //
248
+ // NOTE: See the note below about the codes for Chinese languages.
249
+ //
250
+ const char* LanguageCode(Language lang);
251
+
252
+ // The maximum length of a language code.
253
+ const int kMaxLanguageCodeSize = 50;
254
+
255
+ // The standard code for the default language.
256
+ const char* default_language_code();
257
+
258
+ // The standard code for all invalid languages.
259
+ const char* invalid_language_code();
260
+
261
+
262
+ // --------------------------------------------
263
+ // NOTE: CHINESE LANGUAGE CODES
264
+ //
265
+ // There are three functions that return codes for Chinese languages.
266
+ // LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
267
+ // LanguageCode(lang, encoding) is defined in i18n/encodings/lang_enc.h.
268
+ // The following list shows the different results.
269
+ //
270
+ // LanguageCode(CHINESE) returns "zh"
271
+ // LanguageCode(CHINESE_T) returns "zh-TW".
272
+ //
273
+ // LanguageCodeWithDialects(CHINESE) returns "zh-CN".
274
+ // LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
275
+ //
276
+ // LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
277
+ // LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
278
+ // LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
279
+ //
280
+ // --------------------------------------------
281
+
282
+ // LanguageCodeWithDialects
283
+ // ------------------------
284
+ //
285
+ // If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
286
+ const char* LanguageCodeWithDialects(Language lang);
287
+
288
+ // LanguageCodeISO639_1
289
+ // --------------------
290
+ //
291
+ // Return the ISO 639-1 two-letter language code for lang.
292
+ // Return invalid_language_code() if lang is invalid or does not have
293
+ // an ISO 639-1 two-letter language code.
294
+ const char* LanguageCodeISO639_1(Language lang);
295
+
296
+ // LanguageCodeISO639_2
297
+ // --------------------
298
+ //
299
+ // Return the ISO 639-2 three-letter language code for lang.
300
+ // Return invalid_language_code() if lang is invalid or does not have
301
+ // an ISO 639-2 three-letter language code.
302
+ const char* LanguageCodeISO639_2(Language lang);
303
+
304
+ // LanguageFromCode
305
+ // ----------------
306
+ //
307
+ // If lang_code matches the code for a Language, using a case-insensitive
308
+ // comparison, set *language to that Language and return true.
308
+ // Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
309
+ //
310
+ // lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
311
+ // (three-letter) code, or a Google-specific code (see LanguageCode).
312
+ //
313
+ // Certain language-code aliases are also allowed:
314
+ // For "zh-cn" and "zh_cn", set *language to CHINESE.
315
+ // For "zh-tw" and "zh_tw", set *language to CHINESE_T.
316
+ // For "he", set *language to HEBREW.
317
+ // For "in", set *language to INDONESIAN.
318
+ // For "ji", set *language to YIDDISH.
319
+ // For "fil", set *language to TAGALOG.
320
+ //
321
+ // REQUIRES: 'language' must not be NULL.
323
+ bool LanguageFromCode(const char* lang_code, Language *language);
324
+
325
+
326
+ // LanguageFromCodeOrName
327
+ // ----------------------
328
+ //
329
+ // If lang_code_or_name is a language code or a language name,
330
+ // set *language to the corresponding Language and return true.
331
+ // Otherwise set *language to UNKNOWN_LANGUAGE and return false.
332
+ //
333
+ bool LanguageFromCodeOrName(const char* lang_code_or_name,
334
+ Language* language);
335
+
336
+ // LanguageNameFromCode
337
+ // --------------------
338
+ //
339
+ // If language_code is the code for a Language (see LanguageFromCode),
340
+ // return the standard name of that language (see LanguageName).
341
+ // Otherwise return invalid_language_name().
342
+ //
343
+ const char* LanguageNameFromCode(const char* language_code);
344
+
345
+
346
+ // Miscellany
347
+
348
+ // LanguageCodeToUnderscoreForm
349
+ // ----------------------------
350
+ //
351
+ // Given a language code, convert the dash "-" to underscore "_".
352
+ //
353
+ // Specifically, if result_length <= strlen(lang_code), set result[0]
354
+ // to '\0' and return false. Otherwise, copy lang_code to result,
355
+ // converting every dash to an underscore, converting every character
356
+ // before the first dash or underscore to lower case, and converting
357
+ // every character after the first dash or underscore to upper
358
+ // case. If there is no dash or underscore, convert the entire string
359
+ // to lower case.
360
+ //
361
+ // REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.
362
+
363
+ bool LanguageCodeToUnderscoreForm(const char* lang_code,
364
+ char* result,
365
+ int result_length);
366
+
367
+ //
368
+ // AlwaysPutInExpectedRestrict
369
+ // ---------------------------
370
+ //
371
+ // For Web pages in certain top-level domains, Web Search always
372
+ // applies a "country restrict". If 'tld' matches one of those, using
373
+ // a case-SENSITIVE comparison, set *expected_language to the Language
374
+ // most commonly found in that top-level domain and return true.
375
+ // Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
376
+ bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);
377
+
378
+
379
+ #endif // LANGUAGES_PUBLIC_LANGUAGES_H_
@@ -0,0 +1,12 @@
1
+ require "rubygems"
2
+ require "ffi"
3
+
4
+ # FFI binding that loads the native cld library and exposes its
+ # language-detection entry point to Ruby.
+ module CLD
5
+ extend FFI::Library
6
+ # The shared library is looked up in ../ext/cld relative to this file.
+ dir = File.expand_path(File.join(File.dirname(__FILE__), "../ext/cld"))
7
+ ffi_lib "#{dir}/cld.so"
8
+ # Bind the native detectLanguageThunkInt function as CLD.detect_language;
+ # it takes one input buffer and returns an int.
+ attach_function "detect_language","detectLanguageThunkInt", [:buffer_in], :int
9
+ # Returns true when detect_language(text) yields 0.
+ # NOTE(review): presumably 0 is the ENGLISH value of the Language enum --
+ # confirm against languages.pb.h.
+ def self.english?(text)
10
+ detect_language(text) == 0
11
+ end
12
+ end
@@ -0,0 +1,570 @@
1
+ # Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ # Use of this source code is governed by a BSD-style license that can be
3
+ # found in the LICENSE file.
4
+
5
+ require "test/unit"
6
+ require "ccld"
7
+
8
+ VERBOSE = False
9
+
10
+ # MKM: ported from FullTests in compact_lang_det_unittest_small.cc
11
+
12
+ class TestCLD(unittest.TestCase):
13
+
14
+ langsSeen = set()
15
+
16
+ # Run cld.detect on the sample text s (with pickSummaryLanguage=True), then
+ # assert that the detected language name equals expectedLangName and that
+ # the result is flagged reliable. Extra diagnostics are printed under VERBOSE.
+ # NOTE(review): uses Python 2 print statements, and the name cld is not
+ # imported in the visible part of this file -- confirm where it comes from.
+ def runOne(self, expectedLangName, s):
17
+ if VERBOSE:
18
+ print
19
+ print 'Test: %s [%d bytes]' % (expectedLangName, len(s))
20
+ detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(s, pickSummaryLanguage=True)
21
+ if VERBOSE:
22
+ print ' detected: %s' % detectedLangName
23
+ print ' reliable: %s' % (isReliable != 0)
24
+ print ' textBytes: %s' % textBytesFound
25
+ print ' details: %s' % str(details)
26
+ # langsSeen is a class-level set, so it accumulates across all test methods.
+ self.langsSeen.add(expectedLangName)
27
+ print ' %d langs' % len(self.langsSeen)
28
+ self.assertEquals(expectedLangName, detectedLangName)
29
+ self.assertTrue(isReliable)
30
+
31
+ def testAFRIKAANS(self):
32
+ self.runOne('AFRIKAANS', kTeststr_af_Latn)
33
+
34
+ # def testAFAR(self):
35
+ # self.runOne('AFAR', kTeststr_aa_Latn)
36
+
37
+ # def testABKHAZIAN(self):
38
+ # self.runOne('ABKHAZIAN', kTeststr_ab_Cyrl)
39
+
40
+ # NOTE(review): this is an exact repeat of the testAFRIKAANS method defined
+ # earlier in this class; the later definition simply replaces the earlier one.
+ def testAFRIKAANS(self):
41
+ self.runOne('AFRIKAANS', kTeststr_af_Latn)
42
+
43
+ # def testAMHARIC(self):
44
+ # self.runOne('AMHARIC', kTeststr_am_Ethi)
45
+
46
+ def testARABIC(self):
47
+ self.runOne('ARABIC', kTeststr_ar_Arab)
48
+
49
+ # def testASSAMESE(self):
50
+ # self.runOne('ASSAMESE', kTeststr_as_Beng)
51
+
52
+ # def testAYMARA(self):
53
+ # self.runOne('AYMARA', kTeststr_ay_Latn)
54
+
55
+ # AZERBAIJANI Arab & Cyrl removed 2008.05.27. Just AZERBAIJANI Latn left
56
+ # def testAZERBAIJANI(self):
57
+ # self.runOne('AZERBAIJANI', kTeststr_az_Arab)
58
+
59
+ # Missing data: az-Cyrl
60
+ # def testAZERBAIJANI(self):
61
+ # self.runOne('AZERBAIJANI', kTeststr_az_Latn)
62
+
63
+ # def testBASHKIR(self):
64
+ # self.runOne('BASHKIR', kTeststr_ba_Cyrl)
65
+
66
+ def testBELARUSIAN(self):
67
+ self.runOne('BELARUSIAN', kTeststr_be_Cyrl)
68
+
69
+ def testBULGARIAN(self):
70
+ self.runOne('BULGARIAN', kTeststr_bg_Cyrl)
71
+
72
+ # def testBIHARI(self):
73
+ # self.runOne('BIHARI', kTeststr_bh_Deva)
74
+
75
+ # def testBISLAMA(self):
76
+ # self.runOne('BISLAMA', kTeststr_bi_Latn)
77
+
78
+ # def testBENGALI(self):
79
+ # self.runOne('BENGALI', kTeststr_bn_Beng)
80
+
81
+ # def testTIBETAN(self):
82
+ # self.runOne('TIBETAN', kTeststr_bo_Tibt)
83
+
84
+ # def testBRETON(self):
85
+ # self.runOne('BRETON', kTeststr_br_Latn)
86
+
87
+ def testSERBIAN(self):
88
+ self.runOne('SERBIAN', kTeststr_bs_Cyrl) # NOTE: Not BOSNIAN
89
+
90
+ # def testCROATIAN(self):
91
+ # self.runOne('CROATIAN', kTeststr_bs_Latn) # NOTE: Not BOSNIAN
92
+
93
+ def testCATALAN(self):
94
+ self.runOne('CATALAN', kTeststr_ca_Latn)
95
+
96
+ def testCHEROKEE(self):
97
+ self.runOne('CHEROKEE', kTeststr_chr_Cher)
98
+
99
+ # def testCORSICAN(self):
100
+ # self.runOne('CORSICAN', kTeststr_co_Latn)
101
+
102
+ # No CREOLES_AND_PIDGINS_ENGLISH_BASED
103
+ # No CREOLES_AND_PIDGINS_FRENCH_BASED
104
+ # No CREOLES_AND_PIDGINS_OTHER
105
+ # No CREOLES_AND_PIDGINS_PORTUGUESE_BASED
106
+ def testCZECH(self):
107
+ self.runOne('CZECH', kTeststr_cs_Latn)
108
+
109
+ def testWELSH(self):
110
+ self.runOne('WELSH', kTeststr_cy_Latn)
111
+
112
+ def testDANISH(self):
113
+ self.runOne('DANISH', kTeststr_da_Latn)
114
+
115
+ def testGERMAN(self):
116
+ self.runOne('GERMAN', kTeststr_de_Latn)
117
+
118
+ def testDHIVEHI(self):
119
+ self.runOne('DHIVEHI', kTeststr_dv_Thaa)
120
+
121
+ # def testDZONGKHA(self):
122
+ # self.runOne('DZONGKHA', kTeststr_dz_Tibt)
123
+
124
+ def testGREEK(self):
125
+ self.runOne('GREEK', kTeststr_el_Grek)
126
+
127
+ def testENGLISH(self):
128
+ self.runOne('ENGLISH', kTeststr_en_Latn)
129
+
130
+ def testENGLISH(self):
131
+ self.runOne('ENGLISH', kTeststr_en)
132
+
133
+ # def testESPERANTO(self):
134
+ # self.runOne('ESPERANTO', kTeststr_eo_Latn)
135
+
136
+ def testSPANISH(self):
137
+ self.runOne('SPANISH', kTeststr_es_Latn)
138
+
139
+ def testESTONIAN(self):
140
+ self.runOne('ESTONIAN', kTeststr_et_Latn)
141
+
142
+ # def testBASQUE(self):
143
+ # self.runOne('BASQUE', kTeststr_eu_Latn)
144
+
145
+ def testPERSIAN(self):
146
+ self.runOne('PERSIAN', kTeststr_fa_Arab)
147
+
148
+ def testFINNISH(self):
149
+ self.runOne('FINNISH', kTeststr_fi_Latn)
150
+
151
+ # def testFIJIAN(self):
152
+ # self.runOne('FIJIAN', kTeststr_fj_Latn)
153
+
154
+ # def testFAROESE(self):
155
+ # self.runOne('FAROESE', kTeststr_fo_Latn)
156
+
157
+ def testFRENCH(self):
158
+ self.runOne('FRENCH', kTeststr_fr_Latn)
159
+
160
+ # def testFRISIAN(self):
161
+ # self.runOne('FRISIAN', kTeststr_fy_Latn)
162
+
163
+ def testIRISH(self):
164
+ self.runOne('IRISH', kTeststr_ga_Latn)
165
+
166
+ # def testSCOTS_GAELIC(self):
167
+ # self.runOne('SCOTS_GAELIC', kTeststr_gd_Latn)
168
+
169
+ # def testGALICIAN(self):
170
+ # self.runOne('GALICIAN', kTeststr_gl_Latn)
171
+
172
+ # def testGUARANI(self):
173
+ # self.runOne('GUARANI', kTeststr_gn_Latn)
174
+
175
+ def testGUJARATI(self):
176
+ self.runOne('GUJARATI', kTeststr_gu_Gujr)
177
+
178
+ # def testMANX(self):
179
+ # self.runOne('MANX', kTeststr_gv_Latn)
180
+
181
+ # def testHAUSA(self):
182
+ # self.runOne('HAUSA', kTeststr_ha_Latn)
183
+
184
+ def testHINDI(self):
185
+ self.runOne('HINDI', kTeststr_hi_Deva)
186
+
187
+ def testHINDI2(self):
188
+ self.runOne('HINDI', kTeststr_ks)
189
+
190
+ def testCROATIAN(self):
191
+ self.runOne('CROATIAN', kTeststr_hr_Latn) # NOTE: now CROATIAN
192
+
193
+ # def testHAITIAN_CREOLE(self):
194
+ # self.runOne('HAITIAN_CREOLE', kTeststr_ht_Latn)
195
+
196
+ def testHUNGARIAN(self):
197
+ self.runOne('HUNGARIAN', kTeststr_hu_Latn)
198
+
199
+ def testARMENIAN(self):
200
+ self.runOne('ARMENIAN', kTeststr_hy_Armn)
201
+
202
+ # def testINTERLINGUA(self):
203
+ # self.runOne('INTERLINGUA', kTeststr_ia_Latn)
204
+
205
+ def testMALAY(self):
206
+ self.runOne('MALAY', kTeststr_id_Latn)
207
+
208
+ # def testINTERLINGUE(self):
209
+ # self.runOne('INTERLINGUE', kTeststr_ie_Latn)
210
+
211
+ # def testINUPIAK(self):
212
+ # self.runOne('INUPIAK', kTeststr_ik_Latn)
213
+
214
+ def testICELANDIC(self):
215
+ self.runOne('ICELANDIC', kTeststr_is_Latn)
216
+
217
+ def testITALIAN(self):
218
+ self.runOne('ITALIAN', kTeststr_it_Latn)
219
+
220
+ def testINUKTITUT(self):
221
+ self.runOne('INUKTITUT', kTeststr_iu_Cans)
222
+
223
+ def testHEBREW(self):
224
+ self.runOne('HEBREW', kTeststr_iw_Hebr)
225
+
226
+ def testJAPANESE(self):
227
+ self.runOne('Japanese', kTeststr_ja_Hani)
228
+
229
+ # def testJAVANESE(self):
230
+ # self.runOne('JAVANESE', kTeststr_jw_Latn)
231
+
232
+ def testGEORGIAN(self):
233
+ self.runOne('GEORGIAN', kTeststr_ka_Geor)
234
+
235
+ # def testKHASI(self):
236
+ # self.runOne('KHASI', kTeststr_kha_Latn)
237
+
238
+ # def testKAZAKH(self):
239
+ # self.runOne('KAZAKH', kTeststr_kk_Arab)
240
+
241
+ # def testKAZAKH(self):
242
+ # self.runOne('KAZAKH', kTeststr_kk_Cyrl)
243
+
244
+ # def testKAZAKH(self):
245
+ # self.runOne('KAZAKH', kTeststr_kk_Latn)
246
+
247
+ # def testGREENLANDIC(self):
248
+ # self.runOne('GREENLANDIC', kTeststr_kl_Latn)
249
+
250
+ def testKHMER(self):
251
+ self.runOne('KHMER', kTeststr_km_Khmr)
252
+
253
+ def testKANNADA(self):
254
+ self.runOne('KANNADA', kTeststr_kn_Knda)
255
+
256
+ def testKOREAN(self):
257
+ self.runOne('Korean', kTeststr_ko_Hani)
258
+
259
+ # def testKASHMIRI(self):
260
+ # self.runOne('KASHMIRI', kTeststr_ks_Deva)
261
+
262
+ # KURDISH Latn removed 2008.05.27. Just KURDISH Arab left
263
+ # def testKURDISH(self):
264
+ # self.runOne('KURDISH', kTeststr_ku_Arab)
265
+
266
+ # def testKURDISH(self):
267
+ # self.runOne('KURDISH', kTeststr_ku_Latn)
268
+
269
+ # def testKYRGYZ(self):
270
+ # self.runOne('KYRGYZ', kTeststr_ky_Arab)
271
+
272
+ # def testKYRGYZ(self):
273
+ # self.runOne('KYRGYZ', kTeststr_ky_Cyrl)
274
+
275
+
276
+ # def testLATIN(self):
277
+ # self.runOne('LATIN', kTeststr_la_Latn)
278
+
279
+ # def testLUXEMBOURGISH(self):
280
+ # self.runOne('LUXEMBOURGISH', kTeststr_lb_Latn)
281
+
282
+ # def testGANDA(self):
283
+ # self.runOne('GANDA', kTeststr_lg_Latn)
284
+
285
+ # def testLINGALA(self):
286
+ # self.runOne('LINGALA', kTeststr_ln_Latn)
287
+
288
+ def testLAOTHIAN(self):
289
+ self.runOne('LAOTHIAN', kTeststr_lo_Laoo)
290
+
291
+ def testLITHUANIAN(self):
292
+ self.runOne('LITHUANIAN', kTeststr_lt_Latn)
293
+
294
+ def testLATVIAN(self):
295
+ self.runOne('LATVIAN', kTeststr_lv_Latn)
296
+
297
+ # def testMALAGASY(self):
298
+ # self.runOne('MALAGASY', kTeststr_mg_Latn)
299
+
300
+ # def testMAORI(self):
301
+ # self.runOne('MAORI', kTeststr_mi_Latn)
302
+
303
+ def testMACEDONIAN(self):
304
+ self.runOne('MACEDONIAN', kTeststr_mk_Cyrl)
305
+
306
+ def testMALAYALAM(self):
307
+ self.runOne('MALAYALAM', kTeststr_ml_Mlym)
308
+
309
+ # def testMONGOLIAN(self):
310
+ # self.runOne('MONGOLIAN', kTeststr_mn_Cyrl)
311
+
312
+ # def testMOLDAVIAN(self):
313
+ # self.runOne('MOLDAVIAN', kTeststr_mo_Cyrl)
314
+
315
+ # def testMARATHI(self):
316
+ # self.runOne('MARATHI', kTeststr_mr_Deva)
317
+
318
+ def testMALAY(self):
319
+ self.runOne('MALAY', kTeststr_ms_Latn)
320
+
321
+ # def testMALAY(self):
322
+ # self.runOne('MALAY', kTeststr_ms_Latn2)
323
+
324
+ def testMALAY(self):
325
+ self.runOne('MALAY', kTeststr_ms_Latn3)
326
+
327
+ # def testMALTESE(self):
328
+ # self.runOne('MALTESE', kTeststr_mt_Latn)
329
+
330
+ # def testBURMESE(self):
331
+ # self.runOne('BURMESE', kTeststr_my_Latn)
332
+
333
+ # def testBURMESE(self):
334
+ # self.runOne('BURMESE', kTeststr_my_Mymr)
335
+
336
+ # def testNAURU(self):
337
+ # self.runOne('NAURU', kTeststr_na_Latn)
338
+
339
+ # def testNEPALI(self):
340
+ # self.runOne('NEPALI', kTeststr_ne_Deva)
341
+
342
def testDUTCH(self):
    # Hand the 'DUTCH' label and the kTeststr_nl_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('DUTCH', kTeststr_nl_Latn)
344
+
345
+ # def testNORWEGIAN_N(self):
346
+ # self.runOne('NORWEGIAN_N', kTeststr_nn_Latn)
347
+
348
def testNORWEGIAN(self):
    # Hand the 'NORWEGIAN' label and the kTeststr_no_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('NORWEGIAN', kTeststr_no_Latn)
350
+
351
+
352
+ # def testOCCITAN(self):
353
+ # self.runOne('OCCITAN', kTeststr_oc_Latn)
354
+
355
+ # def testOROMO(self):
356
+ # self.runOne('OROMO', kTeststr_om_Latn)
357
+
358
def testORIYA(self):
    # Hand the 'ORIYA' label and the kTeststr_or_Orya sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('ORIYA', kTeststr_or_Orya)
360
+
361
def testPUNJABI(self):
    # Hand the 'PUNJABI' label and the kTeststr_pa_Guru sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('PUNJABI', kTeststr_pa_Guru)
363
+
364
def testPOLISH(self):
    # Hand the 'POLISH' label and the kTeststr_pl_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('POLISH', kTeststr_pl_Latn)
366
+
367
+ # def testPASHTO(self):
368
+ # self.runOne('PASHTO', kTeststr_ps_Arab)
369
+
370
def testPORTUGUESE(self):
    # The kTeststr_pt_BR sample is paired with the plain 'PORTUGUESE' label.
    # NOTE: not PORTUGUESE_B nor PORTUGUESE_P
    check = self.runOne
    check('PORTUGUESE', kTeststr_pt_BR)
373
+
374
+ # def testQUECHUA(self):
375
+ # self.runOne('QUECHUA', kTeststr_qu_Latn)
376
+
377
+ # def testRHAETO_ROMANCE(self):
378
+ # self.runOne('RHAETO_ROMANCE', kTeststr_rm_Latn)
379
+
380
+ # def testRUNDI(self):
381
+ # self.runOne('RUNDI', kTeststr_rn_Latn)
382
+
383
def testROMANIAN(self):
    # Hand the 'ROMANIAN' label and the kTeststr_ro_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('ROMANIAN', kTeststr_ro_Latn)
385
+
386
def testRUSSIAN(self):
    # Hand the 'RUSSIAN' label and the kTeststr_ru_Cyrl sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('RUSSIAN', kTeststr_ru_Cyrl)
388
+
389
+ # def testKINYARWANDA(self):
390
+ # self.runOne('KINYARWANDA', kTeststr_rw_Latn)
391
+
392
+ # def testSANSKRIT(self):
393
+ # self.runOne('SANSKRIT', kTeststr_sa_Deva)
394
+
395
+ # def testSANSKRIT(self):
396
+ # self.runOne('SANSKRIT', kTeststr_sa_Latn)
397
+
398
+ # def testSCOTS(self):
399
+ # self.runOne('SCOTS', kTeststr_sco_Latn)
400
+
401
+ # def testSINDHI(self):
402
+ # self.runOne('SINDHI', kTeststr_sd_Arab)
403
+
404
+ # def testSANGO(self):
405
+ # self.runOne('SANGO', kTeststr_sg_Latn)
406
+
407
+ # No SERBO_CROATIAN (sh)
408
def testSINHALESE(self):
    # Hand the 'SINHALESE' label and the kTeststr_si_Sinh sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('SINHALESE', kTeststr_si_Sinh)
410
+
411
+ # def testLIMBU(self):
412
+ # self.runOne('LIMBU', kTeststr_sit_NP)
413
+
414
def testSLOVAK(self):
    # Hand the 'SLOVAK' label and the kTeststr_sk_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('SLOVAK', kTeststr_sk_Latn)
416
+
417
def testSLOVENIAN(self):
    # Hand the 'SLOVENIAN' label and the kTeststr_sl_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('SLOVENIAN', kTeststr_sl_Latn)
419
+
420
+ # def testSAMOAN(self):
421
+ # self.runOne('SAMOAN', kTeststr_sm_Latn)
422
+
423
+ # def testSHONA(self):
424
+ # self.runOne('SHONA', kTeststr_sn_Latn)
425
+
426
+ # def testSOMALI(self):
427
+ # self.runOne('SOMALI', kTeststr_so_Latn)
428
+
429
+ # def testALBANIAN(self):
430
+ # self.runOne('ALBANIAN', kTeststr_sq_Latn)
431
+
432
def testSERBIAN(self):
    # The kTeststr_sr_Cyrl sample is paired with the 'SERBIAN' label.
    # NOTE: now SERBIAN
    check = self.runOne
    check('SERBIAN', kTeststr_sr_Cyrl)
434
+
435
def testCROATIAN_sr_Latn(self):
    # Renamed from testCROATIAN.  The next definition in this file reuses
    # the name testCROATIAN (for kTeststr_sr_ME_Latn); Python keeps only the
    # last binding of a name, so this check was silently overwritten and
    # never ran.  Giving it a distinct name lets unittest collect both
    # checks, while the name testCROATIAN keeps pointing at the sr_ME_Latn
    # check as before.
    # NOTE(review): this check was effectively dead code until now --
    # confirm that it passes once it actually runs.
    self.runOne('CROATIAN', kTeststr_sr_Latn)  # NOTE: Not SERBIAN
437
+
438
def testCROATIAN(self):
    # The kTeststr_sr_ME_Latn sample is paired with the 'CROATIAN' label.
    # NOTE: not SERBIAN nor MONTENEGRIN
    check = self.runOne
    check('CROATIAN', kTeststr_sr_ME_Latn)
440
+
441
+ # def testSISWANT(self):
442
+ # self.runOne('SISWANT', kTeststr_ss_Latn)
443
+
444
+ # def testSESOTHO(self):
445
+ # self.runOne('SESOTHO', kTeststr_st_Latn)
446
+
447
+ # def testSUNDANESE(self):
448
+ # self.runOne('SUNDANESE', kTeststr_su_Latn)
449
+
450
def testSWEDISH(self):
    # Hand the 'SWEDISH' label and the kTeststr_sv_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('SWEDISH', kTeststr_sv_Latn)
452
+
453
def testSWAHILI(self):
    # Hand the 'SWAHILI' label and the kTeststr_sw_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('SWAHILI', kTeststr_sw_Latn)
455
+
456
def testSYRIAC(self):
    # Hand the 'SYRIAC' label and the kTeststr_syr_Syrc sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('SYRIAC', kTeststr_syr_Syrc)
458
+
459
def testTAMIL(self):
    # Hand the 'TAMIL' label and the kTeststr_ta_Taml sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('TAMIL', kTeststr_ta_Taml)
461
+
462
def testTELUGU(self):
    # Hand the 'TELUGU' label and the kTeststr_te_Telu sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('TELUGU', kTeststr_te_Telu)
464
+
465
+ # Tajik Arab removed 2008.05.27. Just Tajik Cyrl left
466
+ # def testTAJIK(self):
467
+ # self.runOne('TAJIK', kTeststr_tg_Arab)
468
+
469
+ # def testTAJIK(self):
470
+ # self.runOne('TAJIK', kTeststr_tg_Cyrl)
471
+
472
def testTHAI(self):
    # Hand the 'THAI' label and the kTeststr_th_Thai sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('THAI', kTeststr_th_Thai)
474
+
475
+ # def testTIGRINYA(self):
476
+ # self.runOne('TIGRINYA', kTeststr_ti_Ethi)
477
+
478
+ # def testTURKMEN(self):
479
+ # self.runOne('TURKMEN', kTeststr_tk_Cyrl)
480
+
481
+ # def testTURKMEN(self):
482
+ # self.runOne('TURKMEN', kTeststr_tk_Latn)
483
+
484
def testTAGALOG(self):
    # Hand the 'TAGALOG' label and the kTeststr_tl_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('TAGALOG', kTeststr_tl_Latn)
486
+
487
+ # def testTSWANA(self):
488
+ # self.runOne('TSWANA', kTeststr_tn_Latn)
489
+
490
+ # def testTONGA(self):
491
+ # self.runOne('TONGA', kTeststr_to_Latn)
492
+
493
def testTURKISH(self):
    # Hand the 'TURKISH' label and the kTeststr_tr_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('TURKISH', kTeststr_tr_Latn)
495
+
496
+ # def testTSONGA(self):
497
+ # self.runOne('TSONGA', kTeststr_ts_Latn)
498
+
499
+ # def testTATAR(self):
500
+ # self.runOne('TATAR', kTeststr_tt_Cyrl)
501
+
502
+ # def testTATAR(self):
503
+ # self.runOne('TATAR', kTeststr_tt_Latn)
504
+
505
+ # def testTWI(self):
506
+ # self.runOne('TWI', kTeststr_tw_Latn)
507
+
508
+ # def testUIGHUR(self):
509
+ # self.runOne('UIGHUR', kTeststr_ug_Arab)
510
+
511
+ # def testUIGHUR(self):
512
+ # self.runOne('UIGHUR', kTeststr_ug_Cyrl)
513
+
514
+ # def testUIGHUR(self):
515
+ # self.runOne('UIGHUR', kTeststr_ug_Latn)
516
+
517
def testUKRAINIAN(self):
    # Hand the 'UKRAINIAN' label and the kTeststr_uk_Cyrl sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('UKRAINIAN', kTeststr_uk_Cyrl)
519
+
520
+ # def testURDU(self):
521
+ # self.runOne('URDU', kTeststr_ur_Arab)
522
+
523
+ # def testUZBEK(self):
524
+ # self.runOne('UZBEK', kTeststr_uz_Arab)
525
+
526
+ # def testUZBEK(self):
527
+ # self.runOne('UZBEK', kTeststr_uz_Cyrl)
528
+
529
+ # def testUZBEK(self):
530
+ # self.runOne('UZBEK', kTeststr_uz_Latn)
531
+
532
def testVIETNAMESE(self):
    # Hand the 'VIETNAMESE' label and the kTeststr_vi_Latn sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('VIETNAMESE', kTeststr_vi_Latn)
534
+
535
+ # def testVOLAPUK(self):
536
+ # self.runOne('VOLAPUK', kTeststr_vo_Latn)
537
+
538
+ # def testWOLOF(self):
539
+ # self.runOne('WOLOF', kTeststr_wo_Latn)
540
+
541
+ # def testXHOSA(self):
542
+ # self.runOne('XHOSA', kTeststr_xh_Latn)
543
+
544
def testYIDDISH(self):
    # Hand the 'YIDDISH' label and the kTeststr_yi_Hebr sample to runOne
    # (the helper is defined outside this excerpt).
    check = self.runOne
    check('YIDDISH', kTeststr_yi_Hebr)
546
+
547
+ # def testYORUBA(self):
548
+ # self.runOne('YORUBA', kTeststr_yo_Latn)
549
+
550
+ # Zhuang Hani removed 2008.05.13. Just Zhuang Latn left
551
+ # def testZHUANG(self):
552
+ # self.runOne('ZHUANG', kTeststr_za_Hani)
553
+
554
+ # def testZHUANG(self):
555
+ # self.runOne('ZHUANG', kTeststr_za_Latn)
556
+
557
def testCHINESE(self):
    # Hand the 'Chinese' label and the kTeststr_zh_Hani sample to runOne
    # (the helper is defined outside this excerpt).  The label is written in
    # mixed case here, unlike the upper-case labels of the neighbouring
    # tests; it is kept exactly as written.
    check = self.runOne
    check('Chinese', kTeststr_zh_Hani)
559
+
560
def testCHINESE_T(self):
    # Hand the 'ChineseT' label and the kTeststr_zh_TW sample to runOne
    # (the helper is defined outside this excerpt).  The label is written in
    # mixed case here, unlike the upper-case labels of the neighbouring
    # tests; it is kept exactly as written.
    check = self.runOne
    check('ChineseT', kTeststr_zh_TW)
562
+
563
+ # def testZULU(self):
564
+ # self.runOne('ZULU', kTeststr_zu_Latn)
565
+
566
+ # No TG_UNKNOWN_LANGUAGE
567
+ # No UNKNOWN_LANGUAGE
568
+
569
# Script entry point: when this file is executed directly (rather than
# imported), hand control to unittest's command-line runner.
if __name__ == '__main__':
    unittest.main()