language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,173 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
|
7
|
+
|
8
|
+
#include "encodings/lang_enc.h"
|
9
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
10
|
+
|
11
|
+
|
12
|
+
static const int kCLDFlagFinish = 1;
|
13
|
+
static const int kCLDFlagSqueeze = 2;
|
14
|
+
static const int kCLDFlagRepeats = 4;
|
15
|
+
static const int kCLDFlagTop40 = 8;
|
16
|
+
static const int kCLDFlagShort = 16;
|
17
|
+
static const int kCLDFlagHint = 32; // Experimental, undebugged
|
18
|
+
static const int kCLDFlagUseWords = 64;
|
19
|
+
|
20
|
+
/***
|
21
|
+
|
22
|
+
Flag meanings:
|
23
|
+
|
24
|
+
Flags are used in the context of a recursive call from Detect to itself,
|
25
|
+
trying to deal in a more restrictive way with input that was not reliably
|
26
|
+
identified in the top-level call.
|
27
|
+
|
28
|
+
Finish -- Do not further recurse; return whatever result ensues, even if it is
|
29
|
+
unreliable. Typically set in any recursive call to take a second try
|
30
|
+
on unreliable text.
|
31
|
+
|
32
|
+
Squeeze -- For each text run, do an inplace cheapsqueeze to remove chunks of
|
33
|
+
highly repetitive text and chunks of text with too many 1- and
|
34
|
+
2-letter words. This avoids scoring repetitive or useless non-text
|
35
|
+
crap in large files such as bogus JPEGs within an HTML file.
|
36
|
+
|
37
|
+
Repeats -- When scoring a text run, do a cheap prediction of each character
|
38
|
+
and do not score a unigram/quadgram if the last character of same is
|
39
|
+
correctly predicted. This is a slower, finer-grained form of
|
40
|
+
cheapsqueeze, typically used when the first pass got unreliable
|
41
|
+
results.
|
42
|
+
|
43
|
+
Top40 -- Restrict the set of scored languages to the Google "Top 40*", which is
|
44
|
+
actually 38 languages. This gets rid of about 110 languages that
|
45
|
+
represent about 0.7% of the web. Typically used when the first pass
|
46
|
+
got unreliable results.
|
47
|
+
|
48
|
+
Short -- Use trigram (three letter) scoring instead of quadgrams. Restricted to
|
49
|
+
the top 40* languages, Latin and Cyrillic scripts only.
|
50
|
+
Not as precise as quadgrams, but it gives some plausible result on
|
51
|
+
1- or 2-word text in major languages.
|
52
|
+
|
53
|
+
Hint -- EXPERIMENTAL flag for compact_lang_det_test.cc to indicate a language
|
54
|
+
hint supplied in parameter plus_one.
|
55
|
+
|
56
|
+
UseWords -- In addition to scoring quad/uni/nil-grams, score complete words
|
57
|
+
|
58
|
+
|
59
|
+
Tentative decision logic:
|
60
|
+
|
61
|
+
In the middle of first pass -- After 4KB of text, look at the front 256 bytes
|
62
|
+
of every full 4KB buffer. If it compresses very well (say 3:1) or has
|
63
|
+
lots of spaces (say 1 of every 4 bytes), assume that the input is
|
64
|
+
large and contains lots of bogus non-text. Recurse, passing the
|
65
|
+
Squeeze flag to strip out chunks of this non-text.
|
66
|
+
|
67
|
+
At the end of the first pass --
|
68
|
+
If the top language is reliable and >= 70% of the document, return.
|
69
|
+
Else if the top language is reliable and top+2nd >= say 94%, return.
|
70
|
+
Else, either the top language is not reliable or there is a lot of
|
71
|
+
other crap.
|
72
|
+
***/
|
73
|
+
|
74
|
+
|
75
|
+
namespace CompactLangDet {
|
76
|
+
struct DetectionTables;
|
77
|
+
} // namespace CompactLangDet
|
78
|
+
|
79
|
+
|
80
|
+
namespace CompactLangDetImpl {
|
81
|
+
// Scan interchange-valid UTF-8 bytes and detect most likely language,
|
82
|
+
// or set of languages.
|
83
|
+
//
|
84
|
+
// Design goals:
|
85
|
+
// Skip over big stretches of HTML tags
|
86
|
+
// Able to return ranges of different languages
|
87
|
+
// Relatively small tables and relatively fast processing
|
88
|
+
// Thread safe
|
89
|
+
//
|
90
|
+
|
91
|
+
typedef struct {
|
92
|
+
int perscript_count;
|
93
|
+
const Language* perscript_lang;
|
94
|
+
} PerScriptPair;
|
95
|
+
|
96
|
+
typedef struct {
|
97
|
+
// Constants for hashing 4-7 byte quadgram to 32 bits
|
98
|
+
const int kQuadHashB4Shift;
|
99
|
+
const int kQuadHashB4bShift;
|
100
|
+
const int kQuadHashB5Shift;
|
101
|
+
const int kQuadHashB5bShift;
|
102
|
+
// Constants for hashing 32 bits to kQuadKeyTable subscript/key
|
103
|
+
const int kHashvalToSubShift;
|
104
|
+
const uint32 kHashvalToSubMask;
|
105
|
+
const int kHashvalToKeyShift;
|
106
|
+
const uint32 kHashvalToKeyMask;
|
107
|
+
const int kHashvalAssociativity;
|
108
|
+
// Pointers to the actual tables
|
109
|
+
const PerScriptPair* kPerScriptPair;
|
110
|
+
const uint16* kQuadKeyTable;
|
111
|
+
const uint32* kQuadValueTable;
|
112
|
+
} LangDetObj;
|
113
|
+
|
114
|
+
// For HTML documents, tags are skipped, along with <script> ... </script>
|
115
|
+
// and <style> ... </style> sequences, and entities are expanded.
|
116
|
+
//
|
117
|
+
// We distinguish between bytes of the raw input buffer and bytes of non-tag
|
118
|
+
// text letters. Since tags can be over 50% of the bytes of an HTML Page,
|
119
|
+
// and are nearly all seven-bit ASCII English, we prefer to distinguish
|
120
|
+
// language mixture fractions based on just the non-tag text.
|
121
|
+
//
|
122
|
+
// Inputs: text and text_length
|
123
|
+
// is_plain_text if true says to NOT parse/skip HTML tags nor entities
|
124
|
+
// Outputs:
|
125
|
+
// language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
|
126
|
+
// percent3 is an array of the text percentages 0..100 of the top 3 languages
|
127
|
+
// normalized_score3 is an array of internal scores, normalized to the
|
128
|
+
// average score for each language over a body of training text. A
|
129
|
+
// normalized score significantly away from 1.0 indicates very skewed text
|
130
|
+
// or gibberish.
|
131
|
+
//
|
132
|
+
// text_bytes is the amount of non-tag/letters-only text found
|
133
|
+
// is_reliable set true if the returned Language is at least 2**30 times more
|
134
|
+
// probable than the second-best Language
|
135
|
+
//
|
136
|
+
// Return value: the most likely Language for the majority of the input text
|
137
|
+
// Length 0 input and text with no reliable letter sequences returns
|
138
|
+
// UNKNOWN_LANGUAGE
|
139
|
+
//
|
140
|
+
// Subsetting: For fast detection over large documents, these routines will
|
141
|
+
// scan non-tag text of the initial part of a document, then will
|
142
|
+
// skip 4-16 bytes and subsample text in the rest of the document, up to a
|
143
|
+
// fixed limit (currently 160KB of non-tag letters).
|
144
|
+
//
|
145
|
+
|
146
|
+
Language DetectLanguageSummaryV25(
|
147
|
+
const CompactLangDet::DetectionTables* tables,
|
148
|
+
const char* buffer,
|
149
|
+
int buffer_length,
|
150
|
+
bool is_plain_text,
|
151
|
+
bool do_pick_summary_language,
|
152
|
+
bool do_remove_weak_matches,
|
153
|
+
const char* tld_hint, // "id" boosts Indonesian
|
154
|
+
int encoding_hint, // SJS boosts Japanese
|
155
|
+
Language language_hint, // ITALIAN boosts it
|
156
|
+
bool allow_extended_lang,
|
157
|
+
int flags,
|
158
|
+
Language plus_one,
|
159
|
+
Language* language3,
|
160
|
+
int* percent3,
|
161
|
+
double* normalized_score3,
|
162
|
+
int* text_bytes,
|
163
|
+
bool* is_reliable);
|
164
|
+
|
165
|
+
// For unit testing:
|
166
|
+
// Remove portions of text that have a high density of spaces, or that are
|
167
|
+
// overly repetitive, squeezing the remaining text in-place to the front
|
168
|
+
// of the input buffer.
|
169
|
+
// Return the new, possibly-shorter length
|
170
|
+
int CheapSqueezeInplace(char* isrc, int srclen, int ichunksize);
|
171
|
+
}; // End namespace CompactLangDetImpl
|
172
|
+
|
173
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
|
@@ -0,0 +1,406 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
//
|
5
|
+
// Unit test compact language detector
|
6
|
+
//
|
7
|
+
// Small version, covering these languages only:
|
8
|
+
// Arabic Bulgarian Catalan Chinese ChineseT Croatian Czech Danish Dutch
|
9
|
+
// English Estonian Finnish French German Greek Hebrew Hindi Hungarian
|
10
|
+
// Icelandic Indonesian Italian Japanese Korean Latvian Lithuanian Norwegian
|
11
|
+
// Polish Portuguese Romanian Russian Serbian Slovak Slovenian Spanish
|
12
|
+
// Swedish Tagalog Thai Turkish Ukrainian Vietnamese
|
13
|
+
|
14
|
+
// Additional single-language scripts recognized for free:
|
15
|
+
// Armenian Cherokee Dhivehi Georgian Gujarati Inuktitut Kannada Khmer
|
16
|
+
// Laothian Malayalam Oriya Punjabi Sinhalese Syriac Telugu Tamil
|
17
|
+
//
|
18
|
+
|
19
|
+
#include <string>
|
20
|
+
#include "testing/gtest/include/gtest/gtest.h"
|
21
|
+
#include "encodings/compact_lang_det/compact_lang_det.h"
|
22
|
+
#include "encodings/compact_lang_det/ext_lang_enc.h"
|
23
|
+
#include "encodings/compact_lang_det/unittest_data.h"
|
24
|
+
|
25
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
26
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
27
|
+
|
28
|
+
DEFINE_bool(html, false, "Print language spans in HTML on stderr");
|
29
|
+
DEFINE_bool(detail, false, "Print incoming text to stderr");
|
30
|
+
DEFINE_bool(skipbig, false, "Skip BigInputTests");
|
31
|
+
|
32
|
+
// Test strings.
|
33
|
+
// These are all included here to make the unit test self-contained.
|
34
|
+
const char* kTeststr_en =
|
35
|
+
"confiscation of goods is assigned as the penalty part most of the courts "
|
36
|
+
"consist of members and when it is necessary to bring public cases before a "
|
37
|
+
"jury of members two courts combine for the purpose the most important cases "
|
38
|
+
"of all are brought jurors or";
|
39
|
+
|
40
|
+
|
41
|
+
// UTF8 constants. Use a UTF-8 aware editor for this file
|
42
|
+
const char* kTeststr_ks =
|
43
|
+
"नेपाल एसिया "
|
44
|
+
"मंज अख मुलुक"
|
45
|
+
" राजधानी काठ"
|
46
|
+
"माडौं नेपाल "
|
47
|
+
"अधिराज्य पेर"
|
48
|
+
"ेग्वाय "
|
49
|
+
"दक्षिण अमेरि"
|
50
|
+
"का महाद्वीपे"
|
51
|
+
" मध् यक्षेत्"
|
52
|
+
"रे एक देश अस"
|
53
|
+
"् ति फणीश्वर"
|
54
|
+
" नाथ रेणु "
|
55
|
+
"फिजी छु दक्ष"
|
56
|
+
"िण प्रशान् त"
|
57
|
+
" महासागर मंज"
|
58
|
+
" अख देश बहाम"
|
59
|
+
"ास छु केरेबि"
|
60
|
+
"यन मंज "
|
61
|
+
"अख मुलुख राज"
|
62
|
+
"धानी नसौ सम्"
|
63
|
+
" बद्घ विषय ब"
|
64
|
+
"ुरुंडी अफ्री"
|
65
|
+
"का महाद्वीपे"
|
66
|
+
" मध् "
|
67
|
+
"यक्षेत्रे दे"
|
68
|
+
"श अस् ति सम्"
|
69
|
+
" बद्घ विषय";
|
70
|
+
|
71
|
+
// const char* kTeststr_ks =
|
72
|
+
// \u0928\u0947\u092A\u093E\u0932\u0020\u090F\u0938\u093F\u092F\u093E\u0020
|
73
|
+
// \u092E\u0902\u091C\u0020\u0905\u0916\u0020\u092E\u0941\u0932\u0941\u0915
|
74
|
+
// \u0020\u0930\u093E\u091C\u0927\u093E\u0928\u0940\u0020\u0915\u093E\u0920
|
75
|
+
// \u092E\u093E\u0921\u094C\u0902\u0020\u0928\u0947\u092A\u093E\u0932\u0020
|
76
|
+
// \u0905\u0927\u093F\u0930\u093E\u091C\u094D\u092F\u0020\u092A\u0947\u0930
|
77
|
+
// \u0947\u0917\u094D\u0935\u093E\u092F\u0020
|
78
|
+
// \u0926\u0915\u094D\u0937\u093F\u0923\u0020\u0905\u092E\u0947\u0930\u093F
|
79
|
+
// \u0915\u093E\u0020\u092E\u0939\u093E\u0926\u094D\u0935\u0940\u092A\u0947
|
80
|
+
// \u0020\u092E\u0927\u094D\u0020\u092F\u0915\u094D\u0937\u0947\u0924\u094D
|
81
|
+
// \u0930\u0947\u0020\u090F\u0915\u0020\u0926\u0947\u0936\u0020\u0905\u0938
|
82
|
+
// \u094D\u0020\u0924\u093F\u0020\u092B\u0923\u0940\u0936\u094D\u0935\u0930
|
83
|
+
// \u0020\u0928\u093E\u0925\u0020\u0930\u0947\u0923\u0941\u0020
|
84
|
+
// \u092B\u093F\u091C\u0940\u0020\u091B\u0941\u0020\u0926\u0915\u094D\u0937
|
85
|
+
// \u093F\u0923\u0020\u092A\u094D\u0930\u0936\u093E\u0928\u094D\u0020\u0924
|
86
|
+
// \u0020\u092E\u0939\u093E\u0938\u093E\u0917\u0930\u0020\u092E\u0902\u091C
|
87
|
+
// \u0020\u0905\u0916\u0020\u0926\u0947\u0936\u0020\u092C\u0939\u093E\u092E
|
88
|
+
// \u093E\u0938\u0020\u091B\u0941\u0020\u0915\u0947\u0930\u0947\u092C\u093F
|
89
|
+
// \u092F\u0928\u0020\u092E\u0902\u091C\u0020
|
90
|
+
// \u0905\u0916\u0020\u092E\u0941\u0932\u0941\u0916\u0020\u0930\u093E\u091C
|
91
|
+
// \u0927\u093E\u0928\u0940\u0020\u0928\u0938\u094C\u0020\u0938\u092E\u094D
|
92
|
+
// \u0020\u092C\u0926\u094D\u0918\u0020\u0935\u093F\u0937\u092F\u0020\u092C
|
93
|
+
// \u0941\u0930\u0941\u0902\u0921\u0940\u0020\u0905\u092B\u094D\u0930\u0940
|
94
|
+
// \u0915\u093E\u0020\u092E\u0939\u093E\u0926\u094D\u0935\u0940\u092A\u0947
|
95
|
+
// \u0020\u092E\u0927\u094D\u0020
|
96
|
+
// \u092F\u0915\u094D\u0937\u0947\u0924\u094D\u0930\u0947\u0020\u0926\u0947
|
97
|
+
// \u0936\u0020\u0905\u0938\u094D\u0020\u0924\u093F\u0020\u0938\u092E\u094D
|
98
|
+
// \u0020\u092C\u0926\u094D\u0918\u0020\u0935\u093F\u0937\u092F
|
99
|
+
|
100
|
+
|
101
|
+
|
102
|
+
|
103
|
+
namespace {
|
104
|
+
|
105
|
+
class CompactLangDetTest : public testing::Test {
|
106
|
+
protected:
|
107
|
+
// Objects declared here can be used by all tests in the CompactLangDetTest fixture.
|
108
|
+
|
109
|
+
// Detect language of plaintext src
|
110
|
+
Language TestCompactLangDetPlain(const char* src) {
|
111
|
+
bool is_plain_text = true;
|
112
|
+
bool is_reliable;
|
113
|
+
|
114
|
+
Language lang = CompactLangDet::DetectLanguage(NULL, src, strlen(src),
|
115
|
+
is_plain_text,
|
116
|
+
&is_reliable);
|
117
|
+
return lang;
|
118
|
+
}
|
119
|
+
|
120
|
+
|
121
|
+
// Detect extended language of plaintext src
|
122
|
+
Language TestExtCompactLangDetPlain(const char* src) {
|
123
|
+
bool is_plain_text = true;
|
124
|
+
Language language3[3];
|
125
|
+
int percent3[3];
|
126
|
+
int text_bytes;
|
127
|
+
bool is_reliable;
|
128
|
+
|
129
|
+
Language lang = CompactLangDet::ExtDetectLanguageSummary(NULL,
|
130
|
+
src, strlen(src),
|
131
|
+
is_plain_text,
|
132
|
+
language3,
|
133
|
+
percent3,
|
134
|
+
&text_bytes,
|
135
|
+
&is_reliable);
|
136
|
+
return lang;
|
137
|
+
}
|
138
|
+
}; // end class CompactLangDetTest
|
139
|
+
|
140
|
+
|
141
|
+
TEST_F(CompactLangDetTest, EasyTests) {
|
142
|
+
EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_en));
|
143
|
+
EXPECT_EQ(HINDI, TestCompactLangDetPlain(kTeststr_hi_Deva));
|
144
|
+
}
|
145
|
+
|
146
|
+
|
147
|
+
TEST_F(CompactLangDetTest, FullTests) {
|
148
|
+
// Only the tests reflecting the currently used detection tables are enabled.
|
149
|
+
|
150
|
+
// Do all the languages in all their scripts
|
151
|
+
//// EXPECT_EQ(AFAR, TestCompactLangDetPlain(kTeststr_aa_Latn));
|
152
|
+
//// EXPECT_EQ(ABKHAZIAN, TestCompactLangDetPlain(kTeststr_ab_Cyrl));
|
153
|
+
EXPECT_EQ(AFRIKAANS, TestCompactLangDetPlain(kTeststr_af_Latn));
|
154
|
+
//// EXPECT_EQ(AMHARIC, TestCompactLangDetPlain(kTeststr_am_Ethi));
|
155
|
+
EXPECT_EQ(ARABIC, TestCompactLangDetPlain(kTeststr_ar_Arab));
|
156
|
+
//// EXPECT_EQ(ASSAMESE, TestCompactLangDetPlain(kTeststr_as_Beng));
|
157
|
+
//// EXPECT_EQ(AYMARA, TestCompactLangDetPlain(kTeststr_ay_Latn));
|
158
|
+
// AZERBAIJANI Arab & Cyrl removed 2008.05.27. Just AZERBAIJANI Latn left
|
159
|
+
// EXPECT_EQ(AZERBAIJANI, TestCompactLangDetPlain(kTeststr_az_Arab));
|
160
|
+
// Missing data: az-Cyrl
|
161
|
+
//// EXPECT_EQ(AZERBAIJANI, TestCompactLangDetPlain(kTeststr_az_Latn));
|
162
|
+
|
163
|
+
//// EXPECT_EQ(BASHKIR, TestCompactLangDetPlain(kTeststr_ba_Cyrl));
|
164
|
+
EXPECT_EQ(BELARUSIAN, TestCompactLangDetPlain(kTeststr_be_Cyrl));
|
165
|
+
EXPECT_EQ(BULGARIAN, TestCompactLangDetPlain(kTeststr_bg_Cyrl));
|
166
|
+
//// EXPECT_EQ(BIHARI, TestCompactLangDetPlain(kTeststr_bh_Deva));
|
167
|
+
//// EXPECT_EQ(BISLAMA, TestCompactLangDetPlain(kTeststr_bi_Latn));
|
168
|
+
//// EXPECT_EQ(BENGALI, TestCompactLangDetPlain(kTeststr_bn_Beng));
|
169
|
+
|
170
|
+
//// EXPECT_EQ(TIBETAN, TestCompactLangDetPlain(kTeststr_bo_Tibt));
|
171
|
+
//// EXPECT_EQ(BRETON, TestCompactLangDetPlain(kTeststr_br_Latn));
|
172
|
+
EXPECT_EQ(SERBIAN, TestCompactLangDetPlain(kTeststr_bs_Cyrl)); // NOTE: Not BOSNIAN
|
173
|
+
//// EXPECT_EQ(CROATIAN, TestCompactLangDetPlain(kTeststr_bs_Latn)); // NOTE: Not BOSNIAN
|
174
|
+
|
175
|
+
EXPECT_EQ(CATALAN, TestCompactLangDetPlain(kTeststr_ca_Latn));
|
176
|
+
EXPECT_EQ(CHEROKEE, TestCompactLangDetPlain(kTeststr_chr_Cher));
|
177
|
+
//// EXPECT_EQ(CORSICAN, TestCompactLangDetPlain(kTeststr_co_Latn));
|
178
|
+
// No CREOLES_AND_PIDGINS_ENGLISH_BASED
|
179
|
+
// No CREOLES_AND_PIDGINS_FRENCH_BASED
|
180
|
+
// No CREOLES_AND_PIDGINS_OTHER
|
181
|
+
// No CREOLES_AND_PIDGINS_PORTUGUESE_BASED
|
182
|
+
EXPECT_EQ(CZECH, TestCompactLangDetPlain(kTeststr_cs_Latn));
|
183
|
+
EXPECT_EQ(WELSH, TestCompactLangDetPlain(kTeststr_cy_Latn));
|
184
|
+
|
185
|
+
EXPECT_EQ(DANISH, TestCompactLangDetPlain(kTeststr_da_Latn));
|
186
|
+
EXPECT_EQ(GERMAN, TestCompactLangDetPlain(kTeststr_de_Latn));
|
187
|
+
EXPECT_EQ(DHIVEHI, TestCompactLangDetPlain(kTeststr_dv_Thaa));
|
188
|
+
//// EXPECT_EQ(DZONGKHA, TestCompactLangDetPlain(kTeststr_dz_Tibt));
|
189
|
+
|
190
|
+
EXPECT_EQ(GREEK, TestCompactLangDetPlain(kTeststr_el_Grek));
|
191
|
+
EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_en_Latn));
|
192
|
+
//// EXPECT_EQ(ESPERANTO, TestCompactLangDetPlain(kTeststr_eo_Latn));
|
193
|
+
EXPECT_EQ(SPANISH, TestCompactLangDetPlain(kTeststr_es_Latn));
|
194
|
+
EXPECT_EQ(ESTONIAN, TestCompactLangDetPlain(kTeststr_et_Latn));
|
195
|
+
//// EXPECT_EQ(BASQUE, TestCompactLangDetPlain(kTeststr_eu_Latn));
|
196
|
+
|
197
|
+
EXPECT_EQ(PERSIAN, TestCompactLangDetPlain(kTeststr_fa_Arab));
|
198
|
+
EXPECT_EQ(FINNISH, TestCompactLangDetPlain(kTeststr_fi_Latn));
|
199
|
+
//// EXPECT_EQ(FIJIAN, TestCompactLangDetPlain(kTeststr_fj_Latn));
|
200
|
+
//// EXPECT_EQ(FAROESE, TestCompactLangDetPlain(kTeststr_fo_Latn));
|
201
|
+
EXPECT_EQ(FRENCH, TestCompactLangDetPlain(kTeststr_fr_Latn));
|
202
|
+
//// EXPECT_EQ(FRISIAN, TestCompactLangDetPlain(kTeststr_fy_Latn));
|
203
|
+
|
204
|
+
EXPECT_EQ(IRISH, TestCompactLangDetPlain(kTeststr_ga_Latn));
|
205
|
+
//// EXPECT_EQ(SCOTS_GAELIC, TestCompactLangDetPlain(kTeststr_gd_Latn));
|
206
|
+
//// EXPECT_EQ(GALICIAN, TestCompactLangDetPlain(kTeststr_gl_Latn));
|
207
|
+
//// EXPECT_EQ(GUARANI, TestCompactLangDetPlain(kTeststr_gn_Latn));
|
208
|
+
EXPECT_EQ(GUJARATI, TestCompactLangDetPlain(kTeststr_gu_Gujr));
|
209
|
+
//// EXPECT_EQ(MANX, TestCompactLangDetPlain(kTeststr_gv_Latn));
|
210
|
+
|
211
|
+
//// EXPECT_EQ(HAUSA, TestCompactLangDetPlain(kTeststr_ha_Latn));
|
212
|
+
EXPECT_EQ(HINDI, TestCompactLangDetPlain(kTeststr_hi_Deva));
|
213
|
+
EXPECT_EQ(CROATIAN, TestCompactLangDetPlain(kTeststr_hr_Latn)); // NOTE: now CROATIAN
|
214
|
+
//// EXPECT_EQ(HAITIAN_CREOLE, TestCompactLangDetPlain(kTeststr_ht_Latn));
|
215
|
+
EXPECT_EQ(HUNGARIAN, TestCompactLangDetPlain(kTeststr_hu_Latn));
|
216
|
+
EXPECT_EQ(ARMENIAN, TestCompactLangDetPlain(kTeststr_hy_Armn));
|
217
|
+
|
218
|
+
//// EXPECT_EQ(INTERLINGUA, TestCompactLangDetPlain(kTeststr_ia_Latn));
|
219
|
+
EXPECT_EQ(MALAY, TestCompactLangDetPlain(kTeststr_id_Latn));
|
220
|
+
//// EXPECT_EQ(INTERLINGUE, TestCompactLangDetPlain(kTeststr_ie_Latn));
|
221
|
+
//// EXPECT_EQ(INUPIAK, TestCompactLangDetPlain(kTeststr_ik_Latn));
|
222
|
+
EXPECT_EQ(ICELANDIC, TestCompactLangDetPlain(kTeststr_is_Latn));
|
223
|
+
EXPECT_EQ(ITALIAN, TestCompactLangDetPlain(kTeststr_it_Latn));
|
224
|
+
EXPECT_EQ(INUKTITUT, TestCompactLangDetPlain(kTeststr_iu_Cans));
|
225
|
+
EXPECT_EQ(HEBREW, TestCompactLangDetPlain(kTeststr_iw_Hebr));
|
226
|
+
|
227
|
+
EXPECT_EQ(JAPANESE, TestCompactLangDetPlain(kTeststr_ja_Hani));
|
228
|
+
//// EXPECT_EQ(JAVANESE, TestCompactLangDetPlain(kTeststr_jw_Latn));
|
229
|
+
|
230
|
+
EXPECT_EQ(GEORGIAN, TestCompactLangDetPlain(kTeststr_ka_Geor));
|
231
|
+
//// EXPECT_EQ(KHASI, TestCompactLangDetPlain(kTeststr_kha_Latn));
|
232
|
+
//// EXPECT_EQ(KAZAKH, TestCompactLangDetPlain(kTeststr_kk_Arab));
|
233
|
+
//// EXPECT_EQ(KAZAKH, TestCompactLangDetPlain(kTeststr_kk_Cyrl));
|
234
|
+
//// EXPECT_EQ(KAZAKH, TestCompactLangDetPlain(kTeststr_kk_Latn));
|
235
|
+
//// EXPECT_EQ(GREENLANDIC, TestCompactLangDetPlain(kTeststr_kl_Latn));
|
236
|
+
EXPECT_EQ(KHMER, TestCompactLangDetPlain(kTeststr_km_Khmr));
|
237
|
+
EXPECT_EQ(KANNADA, TestCompactLangDetPlain(kTeststr_kn_Knda));
|
238
|
+
EXPECT_EQ(KOREAN, TestCompactLangDetPlain(kTeststr_ko_Hani));
|
239
|
+
//// EXPECT_EQ(KASHMIRI, TestCompactLangDetPlain(kTeststr_ks_Deva));
|
240
|
+
// KURDISH Latn removed 2008.05.27. Just KURDISH Arab left
|
241
|
+
//// EXPECT_EQ(KURDISH, TestCompactLangDetPlain(kTeststr_ku_Arab));
|
242
|
+
// EXPECT_EQ(KURDISH, TestCompactLangDetPlain(kTeststr_ku_Latn));
|
243
|
+
//// EXPECT_EQ(KYRGYZ, TestCompactLangDetPlain(kTeststr_ky_Arab));
|
244
|
+
//// EXPECT_EQ(KYRGYZ, TestCompactLangDetPlain(kTeststr_ky_Cyrl));
|
245
|
+
|
246
|
+
//// EXPECT_EQ(LATIN, TestCompactLangDetPlain(kTeststr_la_Latn));
|
247
|
+
//// EXPECT_EQ(LUXEMBOURGISH, TestCompactLangDetPlain(kTeststr_lb_Latn));
|
248
|
+
//// EXPECT_EQ(GANDA, TestCompactLangDetPlain(kTeststr_lg_Latn));
|
249
|
+
//// EXPECT_EQ(LINGALA, TestCompactLangDetPlain(kTeststr_ln_Latn));
|
250
|
+
EXPECT_EQ(LAOTHIAN, TestCompactLangDetPlain(kTeststr_lo_Laoo));
|
251
|
+
EXPECT_EQ(LITHUANIAN, TestCompactLangDetPlain(kTeststr_lt_Latn));
|
252
|
+
EXPECT_EQ(LATVIAN, TestCompactLangDetPlain(kTeststr_lv_Latn));
|
253
|
+
|
254
|
+
//// EXPECT_EQ(MALAGASY, TestCompactLangDetPlain(kTeststr_mg_Latn));
|
255
|
+
//// EXPECT_EQ(MAORI, TestCompactLangDetPlain(kTeststr_mi_Latn));
|
256
|
+
EXPECT_EQ(MACEDONIAN, TestCompactLangDetPlain(kTeststr_mk_Cyrl));
|
257
|
+
EXPECT_EQ(MALAYALAM, TestCompactLangDetPlain(kTeststr_ml_Mlym));
|
258
|
+
//// EXPECT_EQ(MONGOLIAN, TestCompactLangDetPlain(kTeststr_mn_Cyrl));
|
259
|
+
//// EXPECT_EQ(MOLDAVIAN, TestCompactLangDetPlain(kTeststr_mo_Cyrl));
|
260
|
+
//// EXPECT_EQ(MARATHI, TestCompactLangDetPlain(kTeststr_mr_Deva));
|
261
|
+
EXPECT_EQ(MALAY, TestCompactLangDetPlain(kTeststr_ms_Latn));
|
262
|
+
// EXPECT_EQ(MALAY, TestCompactLangDetPlain(kTeststr_ms_Latn2));
|
263
|
+
EXPECT_EQ(MALAY, TestCompactLangDetPlain(kTeststr_ms_Latn3));
|
264
|
+
//// EXPECT_EQ(MALTESE, TestCompactLangDetPlain(kTeststr_mt_Latn));
|
265
|
+
//// EXPECT_EQ(BURMESE, TestCompactLangDetPlain(kTeststr_my_Latn));
|
266
|
+
//// EXPECT_EQ(BURMESE, TestCompactLangDetPlain(kTeststr_my_Mymr));
|
267
|
+
|
268
|
+
//// EXPECT_EQ(NAURU, TestCompactLangDetPlain(kTeststr_na_Latn));
|
269
|
+
//// EXPECT_EQ(NEPALI, TestCompactLangDetPlain(kTeststr_ne_Deva));
|
270
|
+
EXPECT_EQ(DUTCH, TestCompactLangDetPlain(kTeststr_nl_Latn));
|
271
|
+
//// EXPECT_EQ(NORWEGIAN_N, TestCompactLangDetPlain(kTeststr_nn_Latn));
|
272
|
+
EXPECT_EQ(NORWEGIAN, TestCompactLangDetPlain(kTeststr_no_Latn));
|
273
|
+
|
274
|
+
//// EXPECT_EQ(OCCITAN, TestCompactLangDetPlain(kTeststr_oc_Latn));
|
275
|
+
//// EXPECT_EQ(OROMO, TestCompactLangDetPlain(kTeststr_om_Latn));
|
276
|
+
EXPECT_EQ(ORIYA, TestCompactLangDetPlain(kTeststr_or_Orya));
|
277
|
+
|
278
|
+
EXPECT_EQ(PUNJABI, TestCompactLangDetPlain(kTeststr_pa_Guru));
|
279
|
+
EXPECT_EQ(POLISH, TestCompactLangDetPlain(kTeststr_pl_Latn));
|
280
|
+
//// EXPECT_EQ(PASHTO, TestCompactLangDetPlain(kTeststr_ps_Arab));
|
281
|
+
EXPECT_EQ(PORTUGUESE, TestCompactLangDetPlain(kTeststr_pt_BR)); // NOTE: not PORTUGUESE_B
|
282
|
+
// nor PORTUGUESE_P
|
283
|
+
|
284
|
+
//// EXPECT_EQ(QUECHUA, TestCompactLangDetPlain(kTeststr_qu_Latn));
|
285
|
+
|
286
|
+
//// EXPECT_EQ(RHAETO_ROMANCE, TestCompactLangDetPlain(kTeststr_rm_Latn));
|
287
|
+
//// EXPECT_EQ(RUNDI, TestCompactLangDetPlain(kTeststr_rn_Latn));
|
288
|
+
EXPECT_EQ(ROMANIAN, TestCompactLangDetPlain(kTeststr_ro_Latn));
|
289
|
+
EXPECT_EQ(RUSSIAN, TestCompactLangDetPlain(kTeststr_ru_Cyrl));
|
290
|
+
//// EXPECT_EQ(KINYARWANDA, TestCompactLangDetPlain(kTeststr_rw_Latn));
|
291
|
+
|
292
|
+
//// EXPECT_EQ(SANSKRIT, TestCompactLangDetPlain(kTeststr_sa_Deva));
|
293
|
+
//// EXPECT_EQ(SANSKRIT, TestCompactLangDetPlain(kTeststr_sa_Latn));
|
294
|
+
//// EXPECT_EQ(SCOTS, TestCompactLangDetPlain(kTeststr_sco_Latn));
|
295
|
+
//// EXPECT_EQ(SINDHI, TestCompactLangDetPlain(kTeststr_sd_Arab));
|
296
|
+
//// EXPECT_EQ(SANGO, TestCompactLangDetPlain(kTeststr_sg_Latn));
|
297
|
+
// No SERBO_CROATIAN (sh)
|
298
|
+
EXPECT_EQ(SINHALESE, TestCompactLangDetPlain(kTeststr_si_Sinh));
|
299
|
+
//// EXPECT_EQ(LIMBU, TestCompactLangDetPlain(kTeststr_sit_NP));
|
300
|
+
EXPECT_EQ(SLOVAK, TestCompactLangDetPlain(kTeststr_sk_Latn));
|
301
|
+
EXPECT_EQ(SLOVENIAN, TestCompactLangDetPlain(kTeststr_sl_Latn));
|
302
|
+
//// EXPECT_EQ(SAMOAN, TestCompactLangDetPlain(kTeststr_sm_Latn));
|
303
|
+
//// EXPECT_EQ(SHONA, TestCompactLangDetPlain(kTeststr_sn_Latn));
|
304
|
+
//// EXPECT_EQ(SOMALI, TestCompactLangDetPlain(kTeststr_so_Latn));
|
305
|
+
//// EXPECT_EQ(ALBANIAN, TestCompactLangDetPlain(kTeststr_sq_Latn));
|
306
|
+
EXPECT_EQ(SERBIAN, TestCompactLangDetPlain(kTeststr_sr_Cyrl)); // NOTE: now SERBIAN
|
307
|
+
EXPECT_EQ(CROATIAN, TestCompactLangDetPlain(kTeststr_sr_Latn)); // NOTE: Not SERBIAN
|
308
|
+
EXPECT_EQ(CROATIAN, TestCompactLangDetPlain(kTeststr_sr_ME_Latn)); // NOTE: not SERBIAN nor MONTENEGRIN
|
309
|
+
//// EXPECT_EQ(SISWANT, TestCompactLangDetPlain(kTeststr_ss_Latn));
|
310
|
+
//// EXPECT_EQ(SESOTHO, TestCompactLangDetPlain(kTeststr_st_Latn));
|
311
|
+
//// EXPECT_EQ(SUNDANESE, TestCompactLangDetPlain(kTeststr_su_Latn));
|
312
|
+
EXPECT_EQ(SWEDISH, TestCompactLangDetPlain(kTeststr_sv_Latn));
|
313
|
+
EXPECT_EQ(SWAHILI, TestCompactLangDetPlain(kTeststr_sw_Latn));
|
314
|
+
EXPECT_EQ(SYRIAC, TestCompactLangDetPlain(kTeststr_syr_Syrc));
|
315
|
+
|
316
|
+
EXPECT_EQ(TAMIL, TestCompactLangDetPlain(kTeststr_ta_Taml));
|
317
|
+
EXPECT_EQ(TELUGU, TestCompactLangDetPlain(kTeststr_te_Telu));
|
318
|
+
// Tajik Arab removed 2008.05.27. Just Tajik Cyrl left
|
319
|
+
// EXPECT_EQ(TAJIK, TestCompactLangDetPlain(kTeststr_tg_Arab));
|
320
|
+
//// EXPECT_EQ(TAJIK, TestCompactLangDetPlain(kTeststr_tg_Cyrl));
|
321
|
+
EXPECT_EQ(THAI, TestCompactLangDetPlain(kTeststr_th_Thai));
|
322
|
+
//// EXPECT_EQ(TIGRINYA, TestCompactLangDetPlain(kTeststr_ti_Ethi));
|
323
|
+
//// EXPECT_EQ(TURKMEN, TestCompactLangDetPlain(kTeststr_tk_Cyrl));
|
324
|
+
//// EXPECT_EQ(TURKMEN, TestCompactLangDetPlain(kTeststr_tk_Latn));
|
325
|
+
EXPECT_EQ(TAGALOG, TestCompactLangDetPlain(kTeststr_tl_Latn));
|
326
|
+
//// EXPECT_EQ(TSWANA, TestCompactLangDetPlain(kTeststr_tn_Latn));
|
327
|
+
//// EXPECT_EQ(TONGA, TestCompactLangDetPlain(kTeststr_to_Latn));
|
328
|
+
EXPECT_EQ(TURKISH, TestCompactLangDetPlain(kTeststr_tr_Latn));
|
329
|
+
//// EXPECT_EQ(TSONGA, TestCompactLangDetPlain(kTeststr_ts_Latn));
|
330
|
+
//// EXPECT_EQ(TATAR, TestCompactLangDetPlain(kTeststr_tt_Cyrl));
|
331
|
+
//// EXPECT_EQ(TATAR, TestCompactLangDetPlain(kTeststr_tt_Latn));
|
332
|
+
//// EXPECT_EQ(TWI, TestCompactLangDetPlain(kTeststr_tw_Latn));
|
333
|
+
|
334
|
+
//// EXPECT_EQ(UIGHUR, TestCompactLangDetPlain(kTeststr_ug_Arab));
|
335
|
+
//// EXPECT_EQ(UIGHUR, TestCompactLangDetPlain(kTeststr_ug_Cyrl));
|
336
|
+
//// EXPECT_EQ(UIGHUR, TestCompactLangDetPlain(kTeststr_ug_Latn));
|
337
|
+
EXPECT_EQ(UKRAINIAN, TestCompactLangDetPlain(kTeststr_uk_Cyrl));
|
338
|
+
//// EXPECT_EQ(URDU, TestCompactLangDetPlain(kTeststr_ur_Arab));
|
339
|
+
//// EXPECT_EQ(UZBEK, TestCompactLangDetPlain(kTeststr_uz_Arab));
|
340
|
+
//// EXPECT_EQ(UZBEK, TestCompactLangDetPlain(kTeststr_uz_Cyrl));
|
341
|
+
//// EXPECT_EQ(UZBEK, TestCompactLangDetPlain(kTeststr_uz_Latn));
|
342
|
+
|
343
|
+
EXPECT_EQ(VIETNAMESE, TestCompactLangDetPlain(kTeststr_vi_Latn));
|
344
|
+
//// EXPECT_EQ(VOLAPUK, TestCompactLangDetPlain(kTeststr_vo_Latn));
|
345
|
+
|
346
|
+
//// EXPECT_EQ(WOLOF, TestCompactLangDetPlain(kTeststr_wo_Latn));
|
347
|
+
|
348
|
+
//// EXPECT_EQ(XHOSA, TestCompactLangDetPlain(kTeststr_xh_Latn));
|
349
|
+
|
350
|
+
EXPECT_EQ(YIDDISH, TestCompactLangDetPlain(kTeststr_yi_Hebr));
|
351
|
+
//// EXPECT_EQ(YORUBA, TestCompactLangDetPlain(kTeststr_yo_Latn));
|
352
|
+
|
353
|
+
// Zhuang Hani removed 2008.05.13. Just Zhuang Latn left
|
354
|
+
// EXPECT_EQ(ZHUANG, TestCompactLangDetPlain(kTeststr_za_Hani));
|
355
|
+
//// EXPECT_EQ(ZHUANG, TestCompactLangDetPlain(kTeststr_za_Latn));
|
356
|
+
EXPECT_EQ(CHINESE, TestCompactLangDetPlain(kTeststr_zh_Hani));
|
357
|
+
EXPECT_EQ(CHINESE_T, TestCompactLangDetPlain(kTeststr_zh_TW));
|
358
|
+
//// EXPECT_EQ(ZULU, TestCompactLangDetPlain(kTeststr_zu_Latn));
|
359
|
+
// No TG_UNKNOWN_LANGUAGE
|
360
|
+
// No UNKNOWN_LANGUAGE
|
361
|
+
}
|
362
|
+
|
363
|
+
|
364
|
+
// ExtendedTests: runs the "extended language" sample strings through both
// TestCompactLangDetPlain and TestExtCompactLangDetPlain and checks the
// detected language.
// Only the kTeststr_zzb_Latn and kTeststr_zze_Latn expectations are active,
// and both helpers are expected to report ENGLISH for them. Every line that
// starts with "////" is a disabled expectation and is not compiled.
TEST_F(CompactLangDetTest, ExtendedTests) {
|
365
|
+
// Do the extended languages, first with them not allowed, then allowed
|
366
|
+
// These turn out to be extraordinarily sensitive forms of garbage bytes
|
367
|
+
//// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_tlh_Latn));
|
368
|
+
//// EXPECT_EQ(X_KLINGON, TestExtCompactLangDetPlain(kTeststr_tlh_Latn));
|
369
|
+
|
370
|
+
//// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_zzp_Latn));
|
371
|
+
//// EXPECT_EQ(X_PIG_LATIN, TestExtCompactLangDetPlain(kTeststr_zzp_Latn));
|
372
|
+
|
373
|
+
//// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_xx_Bugi));
|
374
|
+
//// EXPECT_EQ(X_BUGINESE, TestExtCompactLangDetPlain(kTeststr_xx_Bugi));
|
375
|
+
|
376
|
+
//// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_xx_Goth));
|
377
|
+
//// EXPECT_EQ(X_GOTHIC, TestExtCompactLangDetPlain(kTeststr_xx_Goth));
|
378
|
+
|
379
|
+
// Next three now removed permanently from probability tables (May 2008)
|
380
|
+
// (used to be X_BORK_BORK_BORK, X_ELMER_FUDD, X_HACKER).
|
381
|
+
//
|
382
|
+
// Small changes in probability tables may cause these non-texts to
|
383
|
+
// change detection result. If that happens, cross-check that
|
384
|
+
// the new result is not because of a bug, then change the expected values.
|
385
|
+
// NOTE(review): zzb/zze/zzh presumably correspond, in order, to the three
// removed languages named above -- confirm against the test-string table.
EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_zzb_Latn));
|
386
|
+
EXPECT_EQ(ENGLISH, TestExtCompactLangDetPlain(kTeststr_zzb_Latn));
|
387
|
+
|
388
|
+
EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_zze_Latn));
|
389
|
+
EXPECT_EQ(ENGLISH, TestExtCompactLangDetPlain(kTeststr_zze_Latn));
|
390
|
+
|
391
|
+
//// EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_zzh_Latn));
|
392
|
+
//// EXPECT_EQ(ENGLISH, TestExtCompactLangDetPlain(kTeststr_zzh_Latn));
|
393
|
+
}
|
394
|
+
|
395
|
+
|
396
|
+
} // End namespace
|
397
|
+
|
398
|
+
// Test-runner entry point; compiled only when CLD_WINDOWS is not defined.
// Sets FLAGS_logtostderr, hands the program description and argc/argv to
// InitGoogle, then returns the value of RUN_ALL_TESTS() as the exit status.
#if !defined(CLD_WINDOWS)
|
399
|
+
int main(int argc, char** argv) {
|
400
|
+
// Presumably routes log output to stderr -- confirm against the logging headers.
FLAGS_logtostderr = true;
|
401
|
+
// argc/argv are passed by address, so InitGoogle may modify them.
InitGoogle("Unit test for CLD small", &argc, &argv, false);
|
402
|
+
return RUN_ALL_TESTS();
|
403
|
+
}
|
404
|
+
#endif  // !defined(CLD_WINDOWS)
|
405
|
+
|
406
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
g++ -DCOMPILER_GCC -I../.. *.cc
|