language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,173 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
7
+
8
+ #include "encodings/lang_enc.h"
9
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
10
+
11
+
12
+ static const int kCLDFlagFinish = 1; // Do not recurse further; return the result even if unreliable
13
+ static const int kCLDFlagSqueeze = 2; // Cheap-squeeze each text run in place before scoring
14
+ static const int kCLDFlagRepeats = 4; // Skip scoring n-grams whose last character was correctly predicted
15
+ static const int kCLDFlagTop40 = 8; // Restrict the scored languages to the "Top 40*" set
16
+ static const int kCLDFlagShort = 16; // Use trigram scoring instead of quadgrams
17
+ static const int kCLDFlagHint = 32; // Experimental, undebugged; language hint is supplied in plus_one
18
+ static const int kCLDFlagUseWords = 64; // Additionally score complete words
19
+
20
+ /***
21
+
22
+ Flag meanings:
23
+
24
+ Flags are used in the context of a recursive call from Detect to itself,
25
+ trying to deal in a more restrictive way with input that was not reliably
26
+ identified in the top-level call.
27
+
28
+ Finish -- Do not further recurse; return whatever result ensues, even if it is
29
+ unreliable. Typically set in any recursive call to take a second try
30
+ on unreliable text.
31
+
32
+ Squeeze -- For each text run, do an inplace cheapsqueeze to remove chunks of
33
+ highly repetitive text and chunks of text with too many 1- and
34
+ 2-letter words. This avoids scoring repetitive or useless non-text
35
+ crap in large files such as bogus JPEGs within an HTML file.
36
+
37
+ Repeats -- When scoring a text run, do a cheap prediction of each character
38
+ and do not score a unigram/quadgram if the last character of same is
39
+ correctly predicted. This is a slower, finer-grained form of
40
+ cheapsqueeze, typically used when the first pass got unreliable
41
+ results.
42
+
43
+ Top40 -- Restrict the set of scored languages to the Google "Top 40*", which is
44
+ actually 38 languages. This gets rid of about 110 languages that
45
+ represent about 0.7% of the web. Typically used when the first pass
46
+ got unreliable results.
47
+
48
+ Short -- Use trigram (three letter) scoring instead of quadgrams. Restricted to
49
+ the top 40* languages, Latin and Cyrillic scripts only.
50
+ Not as precise as quadgrams, but it gives some plausible result on
51
+ 1- or 2-word text in major languages.
52
+
53
+ Hint -- EXPERIMENTAL flag for compact_lang_det_test.cc to indicate a language
54
+ hint supplied in parameter plus_one.
55
+
56
+ UseWords -- In addition to scoring quad/uni/nil-grams, score complete words
57
+
58
+
59
+ Tentative decision logic:
60
+
61
+ In the middle of first pass -- After 4KB of text, look at the front 256 bytes
62
+ of every full 4KB buffer. If it compresses very well (say 3:1) or has
63
+ lots of spaces (say 1 of every 4 bytes), assume that the input is
64
+ large and contains lots of bogus non-text. Recurse, passing the
65
+ Squeeze flag to strip out chunks of this non-text.
66
+
67
+ At the end of the first pass --
68
+ If the top language is reliable and >= 70% of the document, return.
69
+ Else if the top language is reliable and top+2nd >= say 94%, return.
70
+ Else, either the top language is not reliable or there is a lot of
71
+ other crap.
72
+ ***/
73
+
74
+
75
+ namespace CompactLangDet {
76
+ struct DetectionTables;
77
+ } // namespace CompactLangDet
78
+
79
+
80
+ namespace CompactLangDetImpl {
81
+ // Scan interchange-valid UTF-8 bytes and detect most likely language,
82
+ // or set of languages.
83
+ //
84
+ // Design goals:
85
+ // Skip over big stretches of HTML tags
86
+ // Able to return ranges of different languages
87
+ // Relatively small tables and relatively fast processing
88
+ // Thread safe
89
+ //
90
+
91
+ typedef struct { // Pairs a count with a pointer to read-only Language data
92
+ int perscript_count; // presumably the number of entries reachable via perscript_lang -- TODO confirm
93
+ const Language* perscript_lang; // read-only Language entries; ownership is not established in this header
94
+ } PerScriptPair;
95
+
96
+ typedef struct { // Bundles the quadgram hashing constants with pointers to the lookup tables
97
+ // Constants for hashing 4-7 byte quadgram to 32 bits
98
+ const int kQuadHashB4Shift;
99
+ const int kQuadHashB4bShift;
100
+ const int kQuadHashB5Shift;
101
+ const int kQuadHashB5bShift;
102
+ // Constants for hashing 32 bits to kQuadKeyTable subscript/key
103
+ const int kHashvalToSubShift;
104
+ const uint32 kHashvalToSubMask;
105
+ const int kHashvalToKeyShift;
106
+ const uint32 kHashvalToKeyMask;
107
+ const int kHashvalAssociativity;
108
+ // Pointers to the actual tables
109
+ const PerScriptPair* kPerScriptPair;
110
+ const uint16* kQuadKeyTable;
111
+ const uint32* kQuadValueTable;
112
+ } LangDetObj; // every member is const, so instances must be fully initialized when created
113
+
114
+ // For HTML documents, tags are skipped, along with <script> ... </script>
115
+ // and <style> ... </style> sequences, and entities are expanded.
116
+ //
117
+ // We distinguish between bytes of the raw input buffer and bytes of non-tag
118
+ // text letters. Since tags can be over 50% of the bytes of an HTML Page,
119
+ // and are nearly all seven-bit ASCII English, we prefer to distinguish
120
+ // language mixture fractions based on just the non-tag text.
121
+ //
122
+ // Inputs: text and text_length
123
+ // is_plain_text if true says NOT to parse/skip HTML tags or entities
124
+ // Outputs:
125
+ // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
126
+ // percent3 is an array of the text percentages 0..100 of the top 3 languages
127
+ // normalized_score3 is an array of internal scores, normalized to the
128
+ // average score for each language over a body of training text. A
129
+ // normalized score significantly away from 1.0 indicates very skewed text
130
+ // or gibberish.
131
+ //
132
+ // text_bytes is the amount of non-tag/letters-only text found
133
+ // is_reliable set true if the returned Language is at least 2**30 times more
134
+ // probable than the second-best Language
135
+ //
136
+ // Return value: the most likely Language for the majority of the input text
137
+ // Length 0 input and text with no reliable letter sequences returns
138
+ // UNKNOWN_LANGUAGE
139
+ //
140
+ // Subsetting: For fast detection over large documents, these routines will
141
+ // scan non-tag text of the initial part of a document, then will
142
+ // skip 4-16 bytes and subsample text in the rest of the document, up to a
143
+ // fixed limit (currently 160KB of non-tag letters).
144
+ //
145
+
146
+ Language DetectLanguageSummaryV25( // Returns the most likely Language for the majority of the input text
147
+ const CompactLangDet::DetectionTables* tables, // detection tables; the struct is only forward-declared in this header
148
+ const char* buffer, // input bytes to scan (interchange-valid UTF-8)
149
+ int buffer_length, // length of buffer; length 0 input returns UNKNOWN_LANGUAGE
150
+ bool is_plain_text, // true: do not parse/skip HTML tags or entities
151
+ bool do_pick_summary_language,
152
+ bool do_remove_weak_matches,
153
+ const char* tld_hint, // "id" boosts Indonesian
154
+ int encoding_hint, // SJS boosts Japanese
155
+ Language language_hint, // ITALIAN boosts it
156
+ bool allow_extended_lang,
157
+ int flags, // presumably a combination of the kCLDFlag* constants -- TODO confirm
158
+ Language plus_one, // language hint used with the experimental Hint flag
159
+ Language* language3, // out: top 3 languages or UNKNOWN_LANGUAGE
160
+ int* percent3, // out: text percentages 0..100 of the top 3 languages
161
+ double* normalized_score3, // out: internal scores normalized to each language's average training score
162
+ int* text_bytes, // out: amount of non-tag/letters-only text found
163
+ bool* is_reliable); // out: true if the result is at least 2**30 times more probable than the second-best
164
+
165
+ // For unit testing:
166
+ // Remove portions of text that have a high density of spaces, or that are
167
+ // overly repetitive, squeezing the remaining text in-place to the front
168
+ // of the input buffer.
169
+ // Return the new, possibly-shorter length
170
+ int CheapSqueezeInplace(char* isrc, int srclen, int ichunksize);
171
+ }; // End namespace CompactLangDetImpl
172
+
173
+ #endif // ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
@@ -0,0 +1,406 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+ //
5
+ // Unit test compact language detector
6
+ //
7
+ // Small version, covering these languages only:
8
+ // Arabic Bulgarian Catalan Chinese ChineseT Croatian Czech Danish Dutch
9
+ // English Estonian Finnish French German Greek Hebrew Hindi Hungarian
10
+ // Icelandic Indonesian Italian Japanese Korean Latvian Lithuanian Norwegian
11
+ // Polish Portuguese Romanian Russian Serbian Slovak Slovenian Spanish
12
+ // Swedish Tagalog Thai Turkish Ukrainian Vietnamese
13
+
14
+ // Additional single-language scripts recognized for free:
15
+ // Armenian Cherokee Dhivehi Georgian Gujarati Inuktitut Kannada Khmer
16
+ // Laothian Malayalam Oriya Punjabi Sinhalese Syriac Telugu Tamil
17
+ //
18
+
19
+ #include <string>
20
+ #include "testing/gtest/include/gtest/gtest.h"
21
+ #include "encodings/compact_lang_det/compact_lang_det.h"
22
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
23
+ #include "encodings/compact_lang_det/unittest_data.h"
24
+
25
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
26
+ #include "encodings/compact_lang_det/win/cld_google.h"
27
+
28
+ DEFINE_bool(html, false, "Print language spans in HTML on stderr");
29
+ DEFINE_bool(detail, false, "Print incoming text to stderr");
30
+ DEFINE_bool(skipbig, false, "Skip BigInputTests");
31
+
32
+ // Test strings.
33
+ // These are all included here to make the unit test self-contained.
34
+ const char* kTeststr_en =
35
+ "confiscation of goods is assigned as the penalty part most of the courts "
36
+ "consist of members and when it is necessary to bring public cases before a "
37
+ "jury of members two courts combine for the purpose the most important cases "
38
+ "of all are brought jurors or";
39
+
40
+
41
+ // UTF8 constants. Use a UTF-8 aware editor for this file
42
+ const char* kTeststr_ks =
43
+ "नेपाल एसिया "
44
+ "मंज अख मुलुक"
45
+ " राजधानी काठ"
46
+ "माडौं नेपाल "
47
+ "अधिराज्य पेर"
48
+ "ेग्वाय "
49
+ "दक्षिण अमेरि"
50
+ "का महाद्वीपे"
51
+ " मध् यक्षेत्"
52
+ "रे एक देश अस"
53
+ "् ति फणीश्वर"
54
+ " नाथ रेणु "
55
+ "फिजी छु दक्ष"
56
+ "िण प्रशान् त"
57
+ " महासागर मंज"
58
+ " अख देश बहाम"
59
+ "ास छु केरेबि"
60
+ "यन मंज "
61
+ "अख मुलुख राज"
62
+ "धानी नसौ सम्"
63
+ " बद्घ विषय ब"
64
+ "ुरुंडी अफ्री"
65
+ "का महाद्वीपे"
66
+ " मध् "
67
+ "यक्षेत्रे दे"
68
+ "श अस् ति सम्"
69
+ " बद्घ विषय";
70
+
71
+ // const char* kTeststr_ks =
72
+ // \u0928\u0947\u092A\u093E\u0932\u0020\u090F\u0938\u093F\u092F\u093E\u0020
73
+ // \u092E\u0902\u091C\u0020\u0905\u0916\u0020\u092E\u0941\u0932\u0941\u0915
74
+ // \u0020\u0930\u093E\u091C\u0927\u093E\u0928\u0940\u0020\u0915\u093E\u0920
75
+ // \u092E\u093E\u0921\u094C\u0902\u0020\u0928\u0947\u092A\u093E\u0932\u0020
76
+ // \u0905\u0927\u093F\u0930\u093E\u091C\u094D\u092F\u0020\u092A\u0947\u0930
77
+ // \u0947\u0917\u094D\u0935\u093E\u092F\u0020
78
+ // \u0926\u0915\u094D\u0937\u093F\u0923\u0020\u0905\u092E\u0947\u0930\u093F
79
+ // \u0915\u093E\u0020\u092E\u0939\u093E\u0926\u094D\u0935\u0940\u092A\u0947
80
+ // \u0020\u092E\u0927\u094D\u0020\u092F\u0915\u094D\u0937\u0947\u0924\u094D
81
+ // \u0930\u0947\u0020\u090F\u0915\u0020\u0926\u0947\u0936\u0020\u0905\u0938
82
+ // \u094D\u0020\u0924\u093F\u0020\u092B\u0923\u0940\u0936\u094D\u0935\u0930
83
+ // \u0020\u0928\u093E\u0925\u0020\u0930\u0947\u0923\u0941\u0020
84
+ // \u092B\u093F\u091C\u0940\u0020\u091B\u0941\u0020\u0926\u0915\u094D\u0937
85
+ // \u093F\u0923\u0020\u092A\u094D\u0930\u0936\u093E\u0928\u094D\u0020\u0924
86
+ // \u0020\u092E\u0939\u093E\u0938\u093E\u0917\u0930\u0020\u092E\u0902\u091C
87
+ // \u0020\u0905\u0916\u0020\u0926\u0947\u0936\u0020\u092C\u0939\u093E\u092E
88
+ // \u093E\u0938\u0020\u091B\u0941\u0020\u0915\u0947\u0930\u0947\u092C\u093F
89
+ // \u092F\u0928\u0020\u092E\u0902\u091C\u0020
90
+ // \u0905\u0916\u0020\u092E\u0941\u0932\u0941\u0916\u0020\u0930\u093E\u091C
91
+ // \u0927\u093E\u0928\u0940\u0020\u0928\u0938\u094C\u0020\u0938\u092E\u094D
92
+ // \u0020\u092C\u0926\u094D\u0918\u0020\u0935\u093F\u0937\u092F\u0020\u092C
93
+ // \u0941\u0930\u0941\u0902\u0921\u0940\u0020\u0905\u092B\u094D\u0930\u0940
94
+ // \u0915\u093E\u0020\u092E\u0939\u093E\u0926\u094D\u0935\u0940\u092A\u0947
95
+ // \u0020\u092E\u0927\u094D\u0020
96
+ // \u092F\u0915\u094D\u0937\u0947\u0924\u094D\u0930\u0947\u0020\u0926\u0947
97
+ // \u0936\u0020\u0905\u0938\u094D\u0020\u0924\u093F\u0020\u0938\u092E\u094D
98
+ // \u0020\u092C\u0926\u094D\u0918\u0020\u0935\u093F\u0937\u092F
99
+
100
+
101
+
102
+
103
+ namespace {
104
+
105
+ class CompactLangDetTest : public testing::Test { // gtest fixture supplying plain-text detection helpers
106
+ protected:
107
+ // Helpers declared here can be used by all tests in the CompactLangDetTest test case.
108
+
109
+ // Detect the language of plain-text src via CompactLangDet::DetectLanguage and return it; src must be NUL-terminated because strlen() is applied to it
110
+ Language TestCompactLangDetPlain(const char* src) {
111
+ bool is_plain_text = true;
112
+ bool is_reliable; // passed by address to DetectLanguage; its value is not checked by this helper
113
+
114
+ Language lang = CompactLangDet::DetectLanguage(NULL, src, strlen(src), // NULL first argument: presumably selects default detection tables -- TODO confirm
115
+ is_plain_text,
116
+ &is_reliable);
117
+ return lang;
118
+ }
119
+
120
+
121
+ // Detect the extended language of plain-text src via CompactLangDet::ExtDetectLanguageSummary; only the returned Language is used
122
+ Language TestExtCompactLangDetPlain(const char* src) {
123
+ bool is_plain_text = true;
124
+ Language language3[3]; // passed to ExtDetectLanguageSummary; not inspected afterwards
125
+ int percent3[3]; // passed to ExtDetectLanguageSummary; not inspected afterwards
126
+ int text_bytes; // passed by address; not inspected afterwards
127
+ bool is_reliable; // passed by address; not inspected afterwards
128
+
129
+ Language lang = CompactLangDet::ExtDetectLanguageSummary(NULL,
130
+ src, strlen(src),
131
+ is_plain_text,
132
+ language3,
133
+ percent3,
134
+ &text_bytes,
135
+ &is_reliable);
136
+ return lang;
137
+ }
138
+ }; // end class CompactLangDetTest
139
+
140
+
141
+ TEST_F(CompactLangDetTest, EasyTests) { // Smoke test: one English and one Hindi sample through the plain-text helper
142
+ EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_en)); // kTeststr_en is defined near the top of this file
143
+ EXPECT_EQ(HINDI, TestCompactLangDetPlain(kTeststr_hi_Deva)); // kTeststr_hi_Deva is not defined in this file; presumably from unittest_data.h -- TODO confirm
144
+ }
145
+
146
+
147
+ TEST_F(CompactLangDetTest, FullTests) {
148
+ // Only the tests reflecting the currently used detection tables are enabled.
149
+
150
+ // Do all the languages in all their scripts
151
+ //// EXPECT_EQ(AFAR, TestCompactLangDetPlain(kTeststr_aa_Latn));
152
+ //// EXPECT_EQ(ABKHAZIAN, TestCompactLangDetPlain(kTeststr_ab_Cyrl));
153
+ EXPECT_EQ(AFRIKAANS, TestCompactLangDetPlain(kTeststr_af_Latn));
154
+ //// EXPECT_EQ(AMHARIC, TestCompactLangDetPlain(kTeststr_am_Ethi));
155
+ EXPECT_EQ(ARABIC, TestCompactLangDetPlain(kTeststr_ar_Arab));
156
+ //// EXPECT_EQ(ASSAMESE, TestCompactLangDetPlain(kTeststr_as_Beng));
157
+ //// EXPECT_EQ(AYMARA, TestCompactLangDetPlain(kTeststr_ay_Latn));
158
+ // AZERBAIJANI Arab & Cyrl removed 2008.05.27. Just AZERBAIJANI Latn left
159
+ // EXPECT_EQ(AZERBAIJANI, TestCompactLangDetPlain(kTeststr_az_Arab));
160
+ // Missing data: az-Cyrl
161
+ //// EXPECT_EQ(AZERBAIJANI, TestCompactLangDetPlain(kTeststr_az_Latn));
162
+
163
+ //// EXPECT_EQ(BASHKIR, TestCompactLangDetPlain(kTeststr_ba_Cyrl));
164
+ EXPECT_EQ(BELARUSIAN, TestCompactLangDetPlain(kTeststr_be_Cyrl));
165
+ EXPECT_EQ(BULGARIAN, TestCompactLangDetPlain(kTeststr_bg_Cyrl));
166
+ //// EXPECT_EQ(BIHARI, TestCompactLangDetPlain(kTeststr_bh_Deva));
167
+ //// EXPECT_EQ(BISLAMA, TestCompactLangDetPlain(kTeststr_bi_Latn));
168
+ //// EXPECT_EQ(BENGALI, TestCompactLangDetPlain(kTeststr_bn_Beng));
169
+
170
+ //// EXPECT_EQ(TIBETAN, TestCompactLangDetPlain(kTeststr_bo_Tibt));
171
+ //// EXPECT_EQ(BRETON, TestCompactLangDetPlain(kTeststr_br_Latn));
172
+ EXPECT_EQ(SERBIAN, TestCompactLangDetPlain(kTeststr_bs_Cyrl)); // NOTE: Not BOSNIAN
173
+ //// EXPECT_EQ(CROATIAN, TestCompactLangDetPlain(kTeststr_bs_Latn)); // NOTE: Not BOSNIAN
174
+
175
+ EXPECT_EQ(CATALAN, TestCompactLangDetPlain(kTeststr_ca_Latn));
176
+ EXPECT_EQ(CHEROKEE, TestCompactLangDetPlain(kTeststr_chr_Cher));
177
+ //// EXPECT_EQ(CORSICAN, TestCompactLangDetPlain(kTeststr_co_Latn));
178
+ // No CREOLES_AND_PIDGINS_ENGLISH_BASED
179
+ // No CREOLES_AND_PIDGINS_FRENCH_BASED
180
+ // No CREOLES_AND_PIDGINS_OTHER
181
+ // No CREOLES_AND_PIDGINS_PORTUGUESE_BASED
182
+ EXPECT_EQ(CZECH, TestCompactLangDetPlain(kTeststr_cs_Latn));
183
+ EXPECT_EQ(WELSH, TestCompactLangDetPlain(kTeststr_cy_Latn));
184
+
185
+ EXPECT_EQ(DANISH, TestCompactLangDetPlain(kTeststr_da_Latn));
186
+ EXPECT_EQ(GERMAN, TestCompactLangDetPlain(kTeststr_de_Latn));
187
+ EXPECT_EQ(DHIVEHI, TestCompactLangDetPlain(kTeststr_dv_Thaa));
188
+ //// EXPECT_EQ(DZONGKHA, TestCompactLangDetPlain(kTeststr_dz_Tibt));
189
+
190
+ EXPECT_EQ(GREEK, TestCompactLangDetPlain(kTeststr_el_Grek));
191
+ EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_en_Latn));
192
+ //// EXPECT_EQ(ESPERANTO, TestCompactLangDetPlain(kTeststr_eo_Latn));
193
+ EXPECT_EQ(SPANISH, TestCompactLangDetPlain(kTeststr_es_Latn));
194
+ EXPECT_EQ(ESTONIAN, TestCompactLangDetPlain(kTeststr_et_Latn));
195
+ //// EXPECT_EQ(BASQUE, TestCompactLangDetPlain(kTeststr_eu_Latn));
196
+
197
+ EXPECT_EQ(PERSIAN, TestCompactLangDetPlain(kTeststr_fa_Arab));
198
+ EXPECT_EQ(FINNISH, TestCompactLangDetPlain(kTeststr_fi_Latn));
199
+ //// EXPECT_EQ(FIJIAN, TestCompactLangDetPlain(kTeststr_fj_Latn));
200
+ //// EXPECT_EQ(FAROESE, TestCompactLangDetPlain(kTeststr_fo_Latn));
201
+ EXPECT_EQ(FRENCH, TestCompactLangDetPlain(kTeststr_fr_Latn));
202
+ //// EXPECT_EQ(FRISIAN, TestCompactLangDetPlain(kTeststr_fy_Latn));
203
+
204
+ EXPECT_EQ(IRISH, TestCompactLangDetPlain(kTeststr_ga_Latn));
205
+ //// EXPECT_EQ(SCOTS_GAELIC, TestCompactLangDetPlain(kTeststr_gd_Latn));
206
+ //// EXPECT_EQ(GALICIAN, TestCompactLangDetPlain(kTeststr_gl_Latn));
207
+ //// EXPECT_EQ(GUARANI, TestCompactLangDetPlain(kTeststr_gn_Latn));
208
+ EXPECT_EQ(GUJARATI, TestCompactLangDetPlain(kTeststr_gu_Gujr));
209
+ //// EXPECT_EQ(MANX, TestCompactLangDetPlain(kTeststr_gv_Latn));
210
+
211
+ //// EXPECT_EQ(HAUSA, TestCompactLangDetPlain(kTeststr_ha_Latn));
212
+ EXPECT_EQ(HINDI, TestCompactLangDetPlain(kTeststr_hi_Deva));
213
+ EXPECT_EQ(CROATIAN, TestCompactLangDetPlain(kTeststr_hr_Latn)); // NOTE: now CROATIAN
214
+ //// EXPECT_EQ(HAITIAN_CREOLE, TestCompactLangDetPlain(kTeststr_ht_Latn));
215
+ EXPECT_EQ(HUNGARIAN, TestCompactLangDetPlain(kTeststr_hu_Latn));
216
+ EXPECT_EQ(ARMENIAN, TestCompactLangDetPlain(kTeststr_hy_Armn));
217
+
218
+ //// EXPECT_EQ(INTERLINGUA, TestCompactLangDetPlain(kTeststr_ia_Latn));
219
+ EXPECT_EQ(MALAY, TestCompactLangDetPlain(kTeststr_id_Latn));
220
+ //// EXPECT_EQ(INTERLINGUE, TestCompactLangDetPlain(kTeststr_ie_Latn));
221
+ //// EXPECT_EQ(INUPIAK, TestCompactLangDetPlain(kTeststr_ik_Latn));
222
+ EXPECT_EQ(ICELANDIC, TestCompactLangDetPlain(kTeststr_is_Latn));
223
+ EXPECT_EQ(ITALIAN, TestCompactLangDetPlain(kTeststr_it_Latn));
224
+ EXPECT_EQ(INUKTITUT, TestCompactLangDetPlain(kTeststr_iu_Cans));
225
+ EXPECT_EQ(HEBREW, TestCompactLangDetPlain(kTeststr_iw_Hebr));
226
+
227
+ EXPECT_EQ(JAPANESE, TestCompactLangDetPlain(kTeststr_ja_Hani));
228
+ //// EXPECT_EQ(JAVANESE, TestCompactLangDetPlain(kTeststr_jw_Latn));
229
+
230
+ EXPECT_EQ(GEORGIAN, TestCompactLangDetPlain(kTeststr_ka_Geor));
231
+ //// EXPECT_EQ(KHASI, TestCompactLangDetPlain(kTeststr_kha_Latn));
232
+ //// EXPECT_EQ(KAZAKH, TestCompactLangDetPlain(kTeststr_kk_Arab));
233
+ //// EXPECT_EQ(KAZAKH, TestCompactLangDetPlain(kTeststr_kk_Cyrl));
234
+ //// EXPECT_EQ(KAZAKH, TestCompactLangDetPlain(kTeststr_kk_Latn));
235
+ //// EXPECT_EQ(GREENLANDIC, TestCompactLangDetPlain(kTeststr_kl_Latn));
236
+ EXPECT_EQ(KHMER, TestCompactLangDetPlain(kTeststr_km_Khmr));
237
+ EXPECT_EQ(KANNADA, TestCompactLangDetPlain(kTeststr_kn_Knda));
238
+ EXPECT_EQ(KOREAN, TestCompactLangDetPlain(kTeststr_ko_Hani));
239
+ //// EXPECT_EQ(KASHMIRI, TestCompactLangDetPlain(kTeststr_ks_Deva));
240
+ // KURDISH Latn removed 2008.05.27. Just KURDISH Arab left
241
+ //// EXPECT_EQ(KURDISH, TestCompactLangDetPlain(kTeststr_ku_Arab));
242
+ // EXPECT_EQ(KURDISH, TestCompactLangDetPlain(kTeststr_ku_Latn));
243
+ //// EXPECT_EQ(KYRGYZ, TestCompactLangDetPlain(kTeststr_ky_Arab));
244
+ //// EXPECT_EQ(KYRGYZ, TestCompactLangDetPlain(kTeststr_ky_Cyrl));
245
+
246
+ //// EXPECT_EQ(LATIN, TestCompactLangDetPlain(kTeststr_la_Latn));
247
+ //// EXPECT_EQ(LUXEMBOURGISH, TestCompactLangDetPlain(kTeststr_lb_Latn));
248
+ //// EXPECT_EQ(GANDA, TestCompactLangDetPlain(kTeststr_lg_Latn));
249
+ //// EXPECT_EQ(LINGALA, TestCompactLangDetPlain(kTeststr_ln_Latn));
250
+ EXPECT_EQ(LAOTHIAN, TestCompactLangDetPlain(kTeststr_lo_Laoo));
251
+ EXPECT_EQ(LITHUANIAN, TestCompactLangDetPlain(kTeststr_lt_Latn));
252
+ EXPECT_EQ(LATVIAN, TestCompactLangDetPlain(kTeststr_lv_Latn));
253
+
254
+ //// EXPECT_EQ(MALAGASY, TestCompactLangDetPlain(kTeststr_mg_Latn));
255
+ //// EXPECT_EQ(MAORI, TestCompactLangDetPlain(kTeststr_mi_Latn));
256
+ EXPECT_EQ(MACEDONIAN, TestCompactLangDetPlain(kTeststr_mk_Cyrl));
257
+ EXPECT_EQ(MALAYALAM, TestCompactLangDetPlain(kTeststr_ml_Mlym));
258
+ //// EXPECT_EQ(MONGOLIAN, TestCompactLangDetPlain(kTeststr_mn_Cyrl));
259
+ //// EXPECT_EQ(MOLDAVIAN, TestCompactLangDetPlain(kTeststr_mo_Cyrl));
260
+ //// EXPECT_EQ(MARATHI, TestCompactLangDetPlain(kTeststr_mr_Deva));
261
+ EXPECT_EQ(MALAY, TestCompactLangDetPlain(kTeststr_ms_Latn));
262
+ // EXPECT_EQ(MALAY, TestCompactLangDetPlain(kTeststr_ms_Latn2));
263
+ EXPECT_EQ(MALAY, TestCompactLangDetPlain(kTeststr_ms_Latn3));
264
+ //// EXPECT_EQ(MALTESE, TestCompactLangDetPlain(kTeststr_mt_Latn));
265
+ //// EXPECT_EQ(BURMESE, TestCompactLangDetPlain(kTeststr_my_Latn));
266
+ //// EXPECT_EQ(BURMESE, TestCompactLangDetPlain(kTeststr_my_Mymr));
267
+
268
+ //// EXPECT_EQ(NAURU, TestCompactLangDetPlain(kTeststr_na_Latn));
269
+ //// EXPECT_EQ(NEPALI, TestCompactLangDetPlain(kTeststr_ne_Deva));
270
+ EXPECT_EQ(DUTCH, TestCompactLangDetPlain(kTeststr_nl_Latn));
271
+ //// EXPECT_EQ(NORWEGIAN_N, TestCompactLangDetPlain(kTeststr_nn_Latn));
272
+ EXPECT_EQ(NORWEGIAN, TestCompactLangDetPlain(kTeststr_no_Latn));
273
+
274
+ //// EXPECT_EQ(OCCITAN, TestCompactLangDetPlain(kTeststr_oc_Latn));
275
+ //// EXPECT_EQ(OROMO, TestCompactLangDetPlain(kTeststr_om_Latn));
276
+ EXPECT_EQ(ORIYA, TestCompactLangDetPlain(kTeststr_or_Orya));
277
+
278
+ EXPECT_EQ(PUNJABI, TestCompactLangDetPlain(kTeststr_pa_Guru));
279
+ EXPECT_EQ(POLISH, TestCompactLangDetPlain(kTeststr_pl_Latn));
280
+ //// EXPECT_EQ(PASHTO, TestCompactLangDetPlain(kTeststr_ps_Arab));
281
+ EXPECT_EQ(PORTUGUESE, TestCompactLangDetPlain(kTeststr_pt_BR)); // NOTE: not PORTUGUESE_B
282
+ // nor PORTUGUESE_P
283
+
284
+ //// EXPECT_EQ(QUECHUA, TestCompactLangDetPlain(kTeststr_qu_Latn));
285
+
286
+ //// EXPECT_EQ(RHAETO_ROMANCE, TestCompactLangDetPlain(kTeststr_rm_Latn));
287
+ //// EXPECT_EQ(RUNDI, TestCompactLangDetPlain(kTeststr_rn_Latn));
288
+ EXPECT_EQ(ROMANIAN, TestCompactLangDetPlain(kTeststr_ro_Latn));
289
+ EXPECT_EQ(RUSSIAN, TestCompactLangDetPlain(kTeststr_ru_Cyrl));
290
+ //// EXPECT_EQ(KINYARWANDA, TestCompactLangDetPlain(kTeststr_rw_Latn));
291
+
292
+ //// EXPECT_EQ(SANSKRIT, TestCompactLangDetPlain(kTeststr_sa_Deva));
293
+ //// EXPECT_EQ(SANSKRIT, TestCompactLangDetPlain(kTeststr_sa_Latn));
294
+ //// EXPECT_EQ(SCOTS, TestCompactLangDetPlain(kTeststr_sco_Latn));
295
+ //// EXPECT_EQ(SINDHI, TestCompactLangDetPlain(kTeststr_sd_Arab));
296
+ //// EXPECT_EQ(SANGO, TestCompactLangDetPlain(kTeststr_sg_Latn));
297
+ // No SERBO_CROATIAN (sh)
298
+ EXPECT_EQ(SINHALESE, TestCompactLangDetPlain(kTeststr_si_Sinh));
299
+ //// EXPECT_EQ(LIMBU, TestCompactLangDetPlain(kTeststr_sit_NP));
300
+ EXPECT_EQ(SLOVAK, TestCompactLangDetPlain(kTeststr_sk_Latn));
301
+ EXPECT_EQ(SLOVENIAN, TestCompactLangDetPlain(kTeststr_sl_Latn));
302
+ //// EXPECT_EQ(SAMOAN, TestCompactLangDetPlain(kTeststr_sm_Latn));
303
+ //// EXPECT_EQ(SHONA, TestCompactLangDetPlain(kTeststr_sn_Latn));
304
+ //// EXPECT_EQ(SOMALI, TestCompactLangDetPlain(kTeststr_so_Latn));
305
+ //// EXPECT_EQ(ALBANIAN, TestCompactLangDetPlain(kTeststr_sq_Latn));
306
+ EXPECT_EQ(SERBIAN, TestCompactLangDetPlain(kTeststr_sr_Cyrl)); // NOTE: now SERBIAN
307
+ EXPECT_EQ(CROATIAN, TestCompactLangDetPlain(kTeststr_sr_Latn)); // NOTE: Not SERBIAN
308
+ EXPECT_EQ(CROATIAN, TestCompactLangDetPlain(kTeststr_sr_ME_Latn)); // NOTE: not SERBIAN nor MONTENEGRIN
309
+ //// EXPECT_EQ(SISWANT, TestCompactLangDetPlain(kTeststr_ss_Latn));
310
+ //// EXPECT_EQ(SESOTHO, TestCompactLangDetPlain(kTeststr_st_Latn));
311
+ //// EXPECT_EQ(SUNDANESE, TestCompactLangDetPlain(kTeststr_su_Latn));
312
+ EXPECT_EQ(SWEDISH, TestCompactLangDetPlain(kTeststr_sv_Latn));
313
+ EXPECT_EQ(SWAHILI, TestCompactLangDetPlain(kTeststr_sw_Latn));
314
+ EXPECT_EQ(SYRIAC, TestCompactLangDetPlain(kTeststr_syr_Syrc));
315
+
316
+ EXPECT_EQ(TAMIL, TestCompactLangDetPlain(kTeststr_ta_Taml));
317
+ EXPECT_EQ(TELUGU, TestCompactLangDetPlain(kTeststr_te_Telu));
318
+ // Tajik Arab removed 2008.05.27. Just Tajik Cyrl left
319
+ // EXPECT_EQ(TAJIK, TestCompactLangDetPlain(kTeststr_tg_Arab));
320
+ //// EXPECT_EQ(TAJIK, TestCompactLangDetPlain(kTeststr_tg_Cyrl));
321
+ EXPECT_EQ(THAI, TestCompactLangDetPlain(kTeststr_th_Thai));
322
+ //// EXPECT_EQ(TIGRINYA, TestCompactLangDetPlain(kTeststr_ti_Ethi));
323
+ //// EXPECT_EQ(TURKMEN, TestCompactLangDetPlain(kTeststr_tk_Cyrl));
324
+ //// EXPECT_EQ(TURKMEN, TestCompactLangDetPlain(kTeststr_tk_Latn));
325
+ EXPECT_EQ(TAGALOG, TestCompactLangDetPlain(kTeststr_tl_Latn));
326
+ //// EXPECT_EQ(TSWANA, TestCompactLangDetPlain(kTeststr_tn_Latn));
327
+ //// EXPECT_EQ(TONGA, TestCompactLangDetPlain(kTeststr_to_Latn));
328
+ EXPECT_EQ(TURKISH, TestCompactLangDetPlain(kTeststr_tr_Latn));
329
+ //// EXPECT_EQ(TSONGA, TestCompactLangDetPlain(kTeststr_ts_Latn));
330
+ //// EXPECT_EQ(TATAR, TestCompactLangDetPlain(kTeststr_tt_Cyrl));
331
+ //// EXPECT_EQ(TATAR, TestCompactLangDetPlain(kTeststr_tt_Latn));
332
+ //// EXPECT_EQ(TWI, TestCompactLangDetPlain(kTeststr_tw_Latn));
333
+
334
+ //// EXPECT_EQ(UIGHUR, TestCompactLangDetPlain(kTeststr_ug_Arab));
335
+ //// EXPECT_EQ(UIGHUR, TestCompactLangDetPlain(kTeststr_ug_Cyrl));
336
+ //// EXPECT_EQ(UIGHUR, TestCompactLangDetPlain(kTeststr_ug_Latn));
337
+ EXPECT_EQ(UKRAINIAN, TestCompactLangDetPlain(kTeststr_uk_Cyrl));
338
+ //// EXPECT_EQ(URDU, TestCompactLangDetPlain(kTeststr_ur_Arab));
339
+ //// EXPECT_EQ(UZBEK, TestCompactLangDetPlain(kTeststr_uz_Arab));
340
+ //// EXPECT_EQ(UZBEK, TestCompactLangDetPlain(kTeststr_uz_Cyrl));
341
+ //// EXPECT_EQ(UZBEK, TestCompactLangDetPlain(kTeststr_uz_Latn));
342
+
343
+ EXPECT_EQ(VIETNAMESE, TestCompactLangDetPlain(kTeststr_vi_Latn));
344
+ //// EXPECT_EQ(VOLAPUK, TestCompactLangDetPlain(kTeststr_vo_Latn));
345
+
346
+ //// EXPECT_EQ(WOLOF, TestCompactLangDetPlain(kTeststr_wo_Latn));
347
+
348
+ //// EXPECT_EQ(XHOSA, TestCompactLangDetPlain(kTeststr_xh_Latn));
349
+
350
+ EXPECT_EQ(YIDDISH, TestCompactLangDetPlain(kTeststr_yi_Hebr));
351
+ //// EXPECT_EQ(YORUBA, TestCompactLangDetPlain(kTeststr_yo_Latn));
352
+
353
+ // Zhuang Hani removed 2008.05.13. Just Zhuang Latn left
354
+ // EXPECT_EQ(ZHUANG, TestCompactLangDetPlain(kTeststr_za_Hani));
355
+ //// EXPECT_EQ(ZHUANG, TestCompactLangDetPlain(kTeststr_za_Latn));
356
+ EXPECT_EQ(CHINESE, TestCompactLangDetPlain(kTeststr_zh_Hani));
357
+ EXPECT_EQ(CHINESE_T, TestCompactLangDetPlain(kTeststr_zh_TW));
358
+ //// EXPECT_EQ(ZULU, TestCompactLangDetPlain(kTeststr_zu_Latn));
359
+ // No TG_UNKNOWN_LANGUAGE
360
+ // No UNKNOWN_LANGUAGE
361
+ }
362
+
363
+
364
+ TEST_F(CompactLangDetTest, ExtendedTests) {
365
+ // Do the extended languages, with them not-allowed then allowed
366
+ // These turn out to be extraordinarily sensitive forms of garbage bytes
367
+ //// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_tlh_Latn));
368
+ //// EXPECT_EQ(X_KLINGON, TestExtCompactLangDetPlain(kTeststr_tlh_Latn));
369
+
370
+ //// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_zzp_Latn));
371
+ //// EXPECT_EQ(X_PIG_LATIN, TestExtCompactLangDetPlain(kTeststr_zzp_Latn));
372
+
373
+ //// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_xx_Bugi));
374
+ //// EXPECT_EQ(X_BUGINESE, TestExtCompactLangDetPlain(kTeststr_xx_Bugi));
375
+
376
+ //// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_xx_Goth));
377
+ //// EXPECT_EQ(X_GOTHIC, TestExtCompactLangDetPlain(kTeststr_xx_Goth));
378
+
379
+ // Next three now removed permanently from probability tables (May 2008)
380
+ // (used to be X_BORK_BORK_BORK, X_ELMER_FUDD, X_HACKER).
381
+ //
382
+ // Small changes in probability tables may cause these non-texts to
383
+ // change detection result. If that happens, cross-check that
384
+ // the new result is not because of a bug, then change the expected values.
385
+ EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_zzb_Latn));
386
+ EXPECT_EQ(ENGLISH, TestExtCompactLangDetPlain(kTeststr_zzb_Latn));
387
+
388
+ EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_zze_Latn));
389
+ EXPECT_EQ(ENGLISH, TestExtCompactLangDetPlain(kTeststr_zze_Latn));
390
+
391
+ //// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_zzh_Latn));
392
+ //// EXPECT_EQ(ENGLISH, TestExtCompactLangDetPlain(kTeststr_zzh_Latn));
393
+ }
394
+
395
+
396
+ } // End namespace
397
+
398
+ #if !defined(CLD_WINDOWS) // main() is compiled only when CLD_WINDOWS is not defined
399
+ int main(int argc, char** argv) { // Test-runner entry point; exit status is whatever RUN_ALL_TESTS() returns
400
+ FLAGS_logtostderr = true; // presumably routes log output to stderr -- TODO confirm against the flags/logging headers
401
+ InitGoogle("Unit test for CLD small", &argc, &argv, false); // argc/argv are passed by address, so InitGoogle may modify them
402
+ return RUN_ALL_TESTS();
403
+ }
404
+ #endif // !defined(CLD_WINDOWS)
405
+
406
+
@@ -0,0 +1 @@
1
+ gcc -DCOMPILER_GCC -I../.. *.cc