language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,2574 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include <stdio.h>
|
6
|
+
#include <string.h>
|
7
|
+
//#include <sys/time.h> // for gettimeofday
|
8
|
+
#include <string>
|
9
|
+
|
10
|
+
#include "encodings/lang_enc.h"
|
11
|
+
|
12
|
+
#include "encodings/compact_lang_det/compact_lang_det.h"
|
13
|
+
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
|
14
|
+
#include "encodings/compact_lang_det/getonescriptspan.h"
|
15
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
16
|
+
#include "encodings/compact_lang_det/tote.h"
|
17
|
+
#include "encodings/compact_lang_det/utf8propjustletter.h"
|
18
|
+
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
|
19
|
+
#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
|
20
|
+
|
21
|
+
#include "encodings/compact_lang_det/cldutil_dbg.h"
|
22
|
+
|
23
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
24
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
25
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
26
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
27
|
+
|
28
|
+
// Linker supplies the right tables
|
29
|
+
extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj;
|
30
|
+
extern const cld::CLDTableSummary kCjkBiTable_obj;
|
31
|
+
extern const cld::CLDTableSummary kQuadTable_obj;
|
32
|
+
extern const cld::CLDTableSummary kLongWord8Table_obj;
|
33
|
+
|
34
|
+
DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr");
|
35
|
+
DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads");
|
36
|
+
|
37
|
+
DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text");
|
38
|
+
DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr");
|
39
|
+
DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text");
|
40
|
+
// 20 quadgrams is about 80 bytes or about 12 words in real text
|
41
|
+
DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams");
|
42
|
+
|
43
|
+
|
44
|
+
static const int kLangHintInitial = 12; // Boost language by N initially
|
45
|
+
static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram
|
46
|
+
|
47
|
+
static const int kShortSpanThresh = 32; // Bytes
|
48
|
+
static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans
|
49
|
+
|
50
|
+
static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing
|
51
|
+
// after this many text bytes
|
52
|
+
static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz
|
53
|
+
static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces
|
54
|
+
static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted
|
55
|
+
|
56
|
+
static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks
|
57
|
+
static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces
|
58
|
+
static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted
|
59
|
+
|
60
|
+
static const int kMaxSpaceScan = 32; // Bytes
|
61
|
+
|
62
|
+
static const int kGoodLang1Percent = 70;
|
63
|
+
static const int kGoodLang1and2Percent = 93;
|
64
|
+
static const int kShortTextThresh = 256; // Bytes
|
65
|
+
|
66
|
+
static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads
|
67
|
+
static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads
|
68
|
+
|
69
|
+
static const int kDefaultWordSpan = 256; // Scan at least this many initial
|
70
|
+
// bytes with word scoring
|
71
|
+
static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text
|
72
|
+
|
73
|
+
static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable
|
74
|
+
|
75
|
+
static const int kPredictionTableSize = 4096; // Must be exactly 4096 for
|
76
|
+
// cheap compressor
|
77
|
+
|
78
|
+
//
|
79
|
+
// Generated by dsites 2008.07.07 from 10% of Base
|
80
|
+
//
|
81
|
+
|
82
|
+
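// One uint32 per Encoding, each packing up to three language probs (all-zero entries list no languages); subscripted by Encoding, size checked against NUM_ENCODINGS below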
|
83
|
+
static const uint32 kEncodingHintProbs[] = {
|
84
|
+
0x00000000, // ASCII
|
85
|
+
0x18120cd5, // Latin2 POLISH.11 CZECH.5 HUNGARIAN.3
|
86
|
+
0x1d3a4bc9, // Latin3 AZERBAIJANI.10 BASQUE.3 CROATIAN.1
|
87
|
+
0x030819d4, // Latin4 ESTONIAN.11 ITALIAN.4 DUTCH.2
|
88
|
+
0x00000000, // ISO-8859-5
|
89
|
+
0x00003742, // Arabic ARABIC.12
|
90
|
+
0x00000000, // Greek
|
91
|
+
0x00000742, // Hebrew HEBREW.12
|
92
|
+
0x00002242, // Latin5 TURKISH.12
|
93
|
+
0x060419c9, // Latin6 ESTONIAN.10 FINNISH.3 GERMAN.1
|
94
|
+
0x00000942, // EUC-JP Japanese.12
|
95
|
+
0x00000942, // SJS Japanese.12
|
96
|
+
0x00000942, // JIS Japanese.12
|
97
|
+
0x00004642, // BIG5 ChineseT.12
|
98
|
+
0x00001142, // GB Chinese.12
|
99
|
+
0x46295fcd, // EUC-CN UIGHUR.10 MALAY.6 ChineseT.5
|
100
|
+
0x00000a42, // KSC Korean.12
|
101
|
+
0x00000000, // Unicode
|
102
|
+
0x03104674, // EUC ChineseT.9 SWEDISH.8 DUTCH.3
|
103
|
+
0x00000000, // CNS
|
104
|
+
0x0f1146c3, // BIG5-CP950 ChineseT.9 Chinese.5 SPANISH.4
|
105
|
+
0x00000942, // CP932 Japanese.12
|
106
|
+
0x00000000, // UTF8
|
107
|
+
0x00000000, // Unknown
|
108
|
+
0x00000000, // ASCII-7-bit
|
109
|
+
0x00000000, // KOI8R
|
110
|
+
0x00000000, // CP1251
|
111
|
+
0x00000000, // CP1252
|
112
|
+
0x00000000, // KOI8U
|
113
|
+
0x451d12cd, // CP1250 CZECH.10 CROATIAN.6 SLOVAK.5
|
114
|
+
0x0d06052a, // ISO-8859-15 FRENCH.9 GERMAN.8 PORTUGUESE.7
|
115
|
+
0x00002242, // CP1254 TURKISH.12
|
116
|
+
0x191516be, // CP1257 LITHUANIAN.8 LATVIAN.7 ESTONIAN.7
|
117
|
+
0x08003642, // ISO-8859-11 THAI.12 ITALIAN.1
|
118
|
+
0x00000000, // CP874
|
119
|
+
0x00003742, // CP1256 ARABIC.12
|
120
|
+
0x00000742, // CP1255 HEBREW.12
|
121
|
+
0x00000000, // ISO-8859-8-I
|
122
|
+
0x00000000, // VISUAL
|
123
|
+
0x00000000, // CP852
|
124
|
+
0x39001242, // CSN_369103 CZECH.12 ESPERANTO.1
|
125
|
+
0x00000000, // CP1253
|
126
|
+
0x00000000, // CP866
|
127
|
+
0x2e001944, // ISO-8859-13 ESTONIAN.12 ALBANIAN.3
|
128
|
+
0x08090a74, // ISO-2022-KR Korean.9 Japanese.8 ITALIAN.3
|
129
|
+
0x00001142, // GBK Chinese.12
|
130
|
+
0x4600113d, // GB18030 Chinese.11 ChineseT.7
|
131
|
+
0x00004642, // BIG5_HKSCS ChineseT.12
|
132
|
+
0x00000000, // ISO_2022_CN
|
133
|
+
0x00000000, // TSCII
|
134
|
+
0x00000000, // TAM
|
135
|
+
0x00000000, // TAB
|
136
|
+
0x00000000, // JAGRAN
|
137
|
+
0x00000000, // MACINTOSH
|
138
|
+
0x00000000, // UTF7
|
139
|
+
0x00000000, // BHASKAR
|
140
|
+
0x00000000, // HTCHANAKYA
|
141
|
+
0x090646ca, // UTF-16BE ChineseT.10 GERMAN.4 Japanese.2
|
142
|
+
0x00000000, // UTF-16LE
|
143
|
+
0x00000000, // UTF-32BE
|
144
|
+
0x00000000, // UTF-32LE
|
145
|
+
0x00000000, // X-BINARYENC
|
146
|
+
0x06001142, // HZ-GB-2312 Chinese.12 GERMAN.1
|
147
|
+
0x461109c2, // X-UTF8UTF8 Japanese.9 Chinese.5 ChineseT.3
|
148
|
+
0x00000000, // X-TAM-ELANGO
|
149
|
+
0x00000000, // X-TAM-LTTMBARANI
|
150
|
+
0x00000000, // X-TAM-SHREE
|
151
|
+
0x00000000, // X-TAM-TBOOMIS
|
152
|
+
0x00000000, // X-TAM-TMNEWS
|
153
|
+
0x00000000, // X-TAM-WEBTAMIL
|
154
|
+
0x00000000, // X-KDDI-Shift_JIS
|
155
|
+
0x00000000, // X-DoCoMo-Shift_JIS
|
156
|
+
0x00000000, // X-SoftBank-Shift_JIS
|
157
|
+
0x00000000, // X-KDDI-ISO-2022-JP
|
158
|
+
0x00000000, // X-SoftBank-ISO-2022-JP
|
159
|
+
};
|
160
|
+
|
161
|
+
COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS,
|
162
|
+
kEncodingHintProbs_has_incorrect_size);
|
163
|
+
|
164
|
+
//
|
165
|
+
// Generated by dsites 2008.07.07 from 10% of Base
|
166
|
+
//
|
167
|
+
|
168
|
+
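// One uint32 per (anchor) language, each packing up to three language probs (all-zero entries list no languages); subscripted by language, size checked against NUM_LANGUAGES below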
|
169
|
+
static const uint32 kLanguageHintProbs[] = {
|
170
|
+
0x00000000, // ENGLISH
|
171
|
+
0x00000242, // DANISH DANISH.12
|
172
|
+
0x00000342, // DUTCH DUTCH.12
|
173
|
+
0x00000442, // FINNISH FINNISH.12
|
174
|
+
0x00000542, // FRENCH FRENCH.12
|
175
|
+
0x00000642, // GERMAN GERMAN.12
|
176
|
+
0x00000742, // HEBREW HEBREW.12
|
177
|
+
0x00000842, // ITALIAN ITALIAN.12
|
178
|
+
0x00000942, // Japanese Japanese.12
|
179
|
+
0x00000a42, // Korean Korean.12
|
180
|
+
0x51000b43, // NORWEGIAN NORWEGIAN.12 NORWEGIAN_N.2
|
181
|
+
0x00000c42, // POLISH POLISH.12
|
182
|
+
0x00000d42, // PORTUGUESE PORTUGUESE.12
|
183
|
+
0x00000000, // RUSSIAN
|
184
|
+
0x00000f42, // SPANISH SPANISH.12
|
185
|
+
0x00001042, // SWEDISH SWEDISH.12
|
186
|
+
0x00001142, // Chinese Chinese.12
|
187
|
+
0x00001242, // CZECH CZECH.12
|
188
|
+
0x00000000, // GREEK
|
189
|
+
0x47001442, // ICELANDIC ICELANDIC.12 FAROESE.1
|
190
|
+
0x00001542, // LATVIAN LATVIAN.12
|
191
|
+
0x00001642, // LITHUANIAN LITHUANIAN.12
|
192
|
+
0x00001742, // ROMANIAN ROMANIAN.12
|
193
|
+
0x00001842, // HUNGARIAN HUNGARIAN.12
|
194
|
+
0x00001942, // ESTONIAN ESTONIAN.12
|
195
|
+
0x00000000, // TG_UNKNOWN_LANGUAGE
|
196
|
+
0x00000000, // Unknown
|
197
|
+
0x00001c42, // BULGARIAN BULGARIAN.12
|
198
|
+
0x00001d42, // CROATIAN CROATIAN.12
|
199
|
+
0x1e001d46, // SERBIAN CROATIAN.12 SERBIAN.5
|
200
|
+
0x00000000, // IRISH
|
201
|
+
0x0f00203d, // GALICIAN GALICIAN.11 SPANISH.7
|
202
|
+
0x5e00213a, // TAGALOG TAGALOG.11 SOMALI.4
|
203
|
+
0x00002242, // TURKISH TURKISH.12
|
204
|
+
0x00002342, // UKRAINIAN UKRAINIAN.12
|
205
|
+
0x00000000, // HINDI
|
206
|
+
0x1c1e25d4, // MACEDONIAN MACEDONIAN.11 SERBIAN.4 BULGARIAN.2
|
207
|
+
0x00002642, // BENGALI BENGALI.12
|
208
|
+
0x00002742, // INDONESIAN INDONESIAN.12
|
209
|
+
0x00000000, // LATIN
|
210
|
+
0x2700293c, // MALAY MALAY.11 INDONESIAN.6
|
211
|
+
0x00000000, // MALAYALAM
|
212
|
+
0x00000000, // WELSH
|
213
|
+
0x00000000, // NEPALI
|
214
|
+
0x00000000, // TELUGU
|
215
|
+
0x00002e42, // ALBANIAN ALBANIAN.12
|
216
|
+
0x00000000, // TAMIL
|
217
|
+
0x00003042, // BELARUSIAN BELARUSIAN.12
|
218
|
+
0x00000000, // JAVANESE
|
219
|
+
0x00000000, // OCCITAN
|
220
|
+
0x375f3330, // URDU URDU.10 UIGHUR.7 ARABIC.4
|
221
|
+
0x41003436, // BIHARI BIHARI.10 MARATHI.10
|
222
|
+
0x00000000, // GUJARATI
|
223
|
+
0x0a4636b2, // THAI THAI.7 ChineseT.3 Korean.2
|
224
|
+
0x00003742, // ARABIC ARABIC.12
|
225
|
+
0x00003842, // CATALAN CATALAN.12
|
226
|
+
0x00003942, // ESPERANTO ESPERANTO.12
|
227
|
+
0x00003a42, // BASQUE BASQUE.12
|
228
|
+
0x00000000, // INTERLINGUA
|
229
|
+
0x00000000, // KANNADA
|
230
|
+
0x05060cca, // PUNJABI POLISH.10 GERMAN.4 FRENCH.2
|
231
|
+
0x00000000, // SCOTS_GAELIC
|
232
|
+
0x00003f42, // SWAHILI SWAHILI.12
|
233
|
+
0x00004042, // SLOVENIAN SLOVENIAN.12
|
234
|
+
0x00004142, // MARATHI MARATHI.12
|
235
|
+
0x00004242, // MALTESE MALTESE.12
|
236
|
+
0x00004342, // VIETNAMESE VIETNAMESE.12
|
237
|
+
0x00000000, // FRISIAN
|
238
|
+
0x12004543, // SLOVAK SLOVAK.12 CZECH.2
|
239
|
+
0x00004642, // ChineseT ChineseT.12
|
240
|
+
0x00000000, // FAROESE
|
241
|
+
0x00000000, // SUNDANESE
|
242
|
+
0x79004944, // UZBEK UZBEK.12 TAJIK.3
|
243
|
+
0x4d004a46, // AMHARIC AMHARIC.12 TIGRINYA.5
|
244
|
+
0x00004b42, // AZERBAIJANI AZERBAIJANI.12
|
245
|
+
0x00000000, // GEORGIAN
|
246
|
+
0x00000000, // TIGRINYA
|
247
|
+
0x00004e42, // PERSIAN PERSIAN.12
|
248
|
+
0x00000000, // BOSNIAN
|
249
|
+
0x00000000, // SINHALESE
|
250
|
+
0x00000000, // NORWEGIAN_N
|
251
|
+
0x00000000, // PORTUGUESE_P
|
252
|
+
0x00000000, // PORTUGUESE_B
|
253
|
+
0x00000000, // XHOSA
|
254
|
+
0x00000000, // ZULU
|
255
|
+
0x00000000, // GUARANI
|
256
|
+
0x00000000, // SESOTHO
|
257
|
+
0x00000000, // TURKMEN
|
258
|
+
0x7a005933, // KYRGYZ KYRGYZ.10 TATAR.7
|
259
|
+
0x00000000, // BRETON
|
260
|
+
0x00000000, // TWI
|
261
|
+
0x00000000, // YIDDISH
|
262
|
+
0x00000000, // SERBO_CROATIAN
|
263
|
+
0x00000000, // SOMALI
|
264
|
+
0x00005f42, // UIGHUR UIGHUR.12
|
265
|
+
0x00006042, // KURDISH KURDISH.12
|
266
|
+
0x00006142, // MONGOLIAN MONGOLIAN.12
|
267
|
+
0x051130c9, // ARMENIAN BELARUSIAN.10 Chinese.3 FRENCH.1
|
268
|
+
0x020f0521, // LAOTHIAN FRENCH.8 SPANISH.7 DANISH.6
|
269
|
+
0x64004e35, // SINDHI PERSIAN.10 SINDHI.9
|
270
|
+
0x00000000, // RHAETO_ROMANCE
|
271
|
+
0x00006642, // AFRIKAANS AFRIKAANS.12
|
272
|
+
0x00000000, // LUXEMBOURGISH
|
273
|
+
0x00006842, // BURMESE BURMESE.12
|
274
|
+
0x00002242, // KHMER TURKISH.12
|
275
|
+
0x88006a3c, // TIBETAN TIBETAN.11 DZONGKHA.6
|
276
|
+
0x00000000, // DHIVEHI
|
277
|
+
0x00000000, // CHEROKEE
|
278
|
+
0x00000000, // SYRIAC
|
279
|
+
0x00000000, // LIMBU
|
280
|
+
0x00000000, // ORIYA
|
281
|
+
0x00000000, // ASSAMESE
|
282
|
+
0x00000000, // CORSICAN
|
283
|
+
0x00000000, // INTERLINGUE
|
284
|
+
0x00007342, // KAZAKH KAZAKH.12
|
285
|
+
0x00000000, // LINGALA
|
286
|
+
0x00000000, // MOLDAVIAN
|
287
|
+
0x5f007645, // PASHTO PASHTO.12 UIGHUR.4
|
288
|
+
0x00000000, // QUECHUA
|
289
|
+
0x00000000, // SHONA
|
290
|
+
0x00007942, // TAJIK TAJIK.12
|
291
|
+
0x00000000, // TATAR
|
292
|
+
0x00000000, // TONGA
|
293
|
+
0x00000000, // YORUBA
|
294
|
+
0x00000000, // CREOLES_AND_PIDGINS_ENGLISH_BASED
|
295
|
+
0x00000000, // CREOLES_AND_PIDGINS_FRENCH_BASED
|
296
|
+
0x00000000, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
|
297
|
+
0x00000000, // CREOLES_AND_PIDGINS_OTHER
|
298
|
+
0x00000000, // MAORI
|
299
|
+
0x00000000, // WOLOF
|
300
|
+
0x00000000, // ABKHAZIAN
|
301
|
+
0x00000000, // AFAR
|
302
|
+
0x00000000, // AYMARA
|
303
|
+
0x00000000, // BASHKIR
|
304
|
+
0x00000000, // BISLAMA
|
305
|
+
0x00000000, // DZONGKHA
|
306
|
+
0x00000000, // FIJIAN
|
307
|
+
0x00000000, // GREENLANDIC
|
308
|
+
0x00000000, // HAUSA
|
309
|
+
0x00000000, // HAITIAN_CREOLE
|
310
|
+
0x00000000, // INUPIAK
|
311
|
+
0x00000542, // INUKTITUT FRENCH.12
|
312
|
+
0x00000000, // KASHMIRI
|
313
|
+
0x00000000, // KINYARWANDA
|
314
|
+
0x00000000, // MALAGASY
|
315
|
+
0x00000000, // NAURU
|
316
|
+
0x00000000, // OROMO
|
317
|
+
0x00000000, // RUNDI
|
318
|
+
0x00000000, // SAMOAN
|
319
|
+
0x00000000, // SANGO
|
320
|
+
0x344197d3, // SANSKRIT SANSKRIT.11 MARATHI.4 BIHARI.1
|
321
|
+
0x00000000, // SISWANT
|
322
|
+
0x00000000, // TSONGA
|
323
|
+
0x00000000, // TSWANA
|
324
|
+
0x00000000, // VOLAPUK
|
325
|
+
0x00000000, // ZHUANG
|
326
|
+
0x00000000, // KHASI
|
327
|
+
0x00000000, // SCOTS
|
328
|
+
0x00000000, // GANDA
|
329
|
+
0x00000000, // MANX
|
330
|
+
0x00000000, // MONTENEGRIN
|
331
|
+
// Add new language hints just before here (just use 0x00000000)
|
332
|
+
};
|
333
|
+
|
334
|
+
COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES,
|
335
|
+
kLanguageHintProbs_has_incorrect_size);
|
336
|
+
|
337
|
+
//
|
338
|
+
// Generated by dsites 2008.07.07 from 10% of Base
|
339
|
+
//
|
340
|
+
|
341
|
+
// One record of a hint table: a fixed-width 4-byte key paired with a packed
// 32-bit word of language probabilities (see kTLDHintProbs for its use).
typedef struct {
|
342
|
+
char key[4];  // Exactly four key bytes; table initializers fill all four, so there is no terminating NUL
|
343
|
+
uint32 probs;  // Packed language probs for this key (up to three languages per the table comments)
|
344
|
+
} HintEntry;
|
345
|
+
|
346
|
+
|
347
|
+
// Each entry: TLD key padded with '_' to four bytes (e.g. "ac__", "cat_"), followed by three packed language probs
|
348
|
+
// Hand-removed 4 items dsites 2008.07.15
|
349
|
+
static const int kTLDHintProbsSize = 201;
|
350
|
+
static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = { // MaxRange 12
|
351
|
+
{{0x61,0x63,0x5f,0x5f}, 0x0a000945}, // ac__ Japanese.12 Korean.4
|
352
|
+
{{0x61,0x64,0x5f,0x5f}, 0x00003842}, // ad__ CATALAN.12
|
353
|
+
{{0x61,0x65,0x5f,0x5f}, 0x00003742}, // ae__ ARABIC.12
|
354
|
+
{{0x61,0x66,0x5f,0x5f}, 0x4e00763d}, // af__ PASHTO.11 PERSIAN.7
|
355
|
+
{{0x61,0x67,0x5f,0x5f}, 0x09000643}, // ag__ GERMAN.12 Japanese.2
|
356
|
+
{{0x61,0x69,0x5f,0x5f}, 0x0c180938}, // ai__ Japanese.11 HUNGARIAN.7 POLISH.2
|
357
|
+
{{0x61,0x6c,0x5f,0x5f}, 0x00002e42}, // al__ ALBANIAN.12
|
358
|
+
{{0x61,0x6e,0x5f,0x5f}, 0x6e00033d}, // an__ DUTCH.11 LIMBU.7
|
359
|
+
{{0x61,0x6f,0x5f,0x5f}, 0x05000d42}, // ao__ PORTUGUESE.12 FRENCH.1
|
360
|
+
{{0x61,0x71,0x5f,0x5f}, 0x05000f29}, // aq__ SPANISH.9 FRENCH.6
|
361
|
+
{{0x61,0x72,0x5f,0x5f}, 0x00000f42}, // ar__ SPANISH.12
|
362
|
+
{{0x61,0x73,0x5f,0x5f}, 0x0f120bcd}, // as__ NORWEGIAN.10 CZECH.6 SPANISH.5
|
363
|
+
{{0x61,0x74,0x5f,0x5f}, 0x00000642}, // at__ GERMAN.12
|
364
|
+
{{0x61,0x77,0x5f,0x5f}, 0x0f000345}, // aw__ DUTCH.12 SPANISH.4
|
365
|
+
{{0x61,0x78,0x5f,0x5f}, 0x00001042}, // ax__ SWEDISH.12
|
366
|
+
{{0x61,0x7a,0x5f,0x5f}, 0x00004b42}, // az__ AZERBAIJANI.12
|
367
|
+
{{0x62,0x61,0x5f,0x5f}, 0x00001d42}, // ba__ CROATIAN.12
|
368
|
+
{{0x62,0x62,0x5f,0x5f}, 0x00002842}, // bb__ LATIN.12
|
369
|
+
{{0x62,0x64,0x5f,0x5f}, 0x00002642}, // bd__ BENGALI.12
|
370
|
+
{{0x62,0x65,0x5f,0x5f}, 0x05000335}, // be__ DUTCH.10 FRENCH.9
|
371
|
+
{{0x62,0x66,0x5f,0x5f}, 0x00000542}, // bf__ FRENCH.12
|
372
|
+
{{0x62,0x67,0x5f,0x5f}, 0x00001c42}, // bg__ BULGARIAN.12
|
373
|
+
{{0x62,0x68,0x5f,0x5f}, 0x00003742}, // bh__ ARABIC.12
|
374
|
+
{{0x62,0x69,0x5f,0x5f}, 0x0f00053f}, // bi__ FRENCH.11 SPANISH.9
|
375
|
+
{{0x62,0x6a,0x5f,0x5f}, 0x00000542}, // bj__ FRENCH.12
|
376
|
+
{{0x62,0x6d,0x5f,0x5f}, 0x98043929}, // bm__ ESPERANTO.9 FINNISH.8 SISWANT.6
|
377
|
+
{{0x62,0x6e,0x5f,0x5f}, 0x00002942}, // bn__ MALAY.12
|
378
|
+
{{0x62,0x6f,0x5f,0x5f}, 0x00000f42}, // bo__ SPANISH.12
|
379
|
+
{{0x62,0x72,0x5f,0x5f}, 0x00000d42}, // br__ PORTUGUESE.12
|
380
|
+
{{0x62,0x74,0x5f,0x5f}, 0x00008842}, // bt__ DZONGKHA.12
|
381
|
+
{{0x62,0x77,0x5f,0x5f}, 0x06059ac4}, // bw__ TSWANA.9 FRENCH.6 GERMAN.5
|
382
|
+
{{0x62,0x79,0x5f,0x5f}, 0x00003024}, // by__ BELARUSIAN.9
|
383
|
+
{{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924}, // bz__ Japanese.9 Korean.5 SPANISH.1
|
384
|
+
{{0x63,0x61,0x5f,0x5f}, 0x00000542}, // ca__ FRENCH.12
|
385
|
+
{{0x63,0x61,0x74,0x5f}, 0x00003842}, // cat_ CATALAN.12
|
386
|
+
{{0x63,0x64,0x5f,0x5f}, 0x06051224}, // cd__ CZECH.9 FRENCH.5 GERMAN.1
|
387
|
+
{{0x63,0x66,0x5f,0x5f}, 0x00000542}, // cf__ FRENCH.12
|
388
|
+
{{0x63,0x67,0x5f,0x5f}, 0x00000542}, // cg__ FRENCH.12
|
389
|
+
{{0x63,0x68,0x5f,0x5f}, 0x08050638}, // ch__ GERMAN.11 FRENCH.7 ITALIAN.2
|
390
|
+
{{0x63,0x69,0x5f,0x5f}, 0x00000542}, // ci__ FRENCH.12
|
391
|
+
{{0x63,0x6c,0x5f,0x5f}, 0x00000f42}, // cl__ SPANISH.12
|
392
|
+
{{0x63,0x6d,0x5f,0x5f}, 0x00000542}, // cm__ FRENCH.12
|
393
|
+
{{0x63,0x6e,0x5f,0x5f}, 0x00001142}, // cn__ Chinese.12
|
394
|
+
{{0x63,0x6f,0x5f,0x5f}, 0x00000f42}, // co__ SPANISH.12
|
395
|
+
// {{0x63,0x6f,0x6f,0x70}, 0x0f0509cd}, // coop Japanese.10 FRENCH.6 SPANISH.5
|
396
|
+
{{0x63,0x72,0x5f,0x5f}, 0x00000f42}, // cr__ SPANISH.12
|
397
|
+
{{0x63,0x75,0x5f,0x5f}, 0x00000f42}, // cu__ SPANISH.12
|
398
|
+
{{0x63,0x76,0x5f,0x5f}, 0x00000d42}, // cv__ PORTUGUESE.12
|
399
|
+
{{0x63,0x78,0x5f,0x5f}, 0x223a091f}, // cx__ Japanese.8 BASQUE.6 TURKISH.4
|
400
|
+
{{0x63,0x79,0x5f,0x5f}, 0x150622ba}, // cy__ TURKISH.8 GERMAN.4 LATVIAN.3
|
401
|
+
{{0x63,0x7a,0x5f,0x5f}, 0x00001242}, // cz__ CZECH.12
|
402
|
+
{{0x64,0x65,0x5f,0x5f}, 0x00000642}, // de__ GERMAN.12
|
403
|
+
{{0x64,0x6b,0x5f,0x5f}, 0x00000242}, // dk__ DANISH.12
|
404
|
+
{{0x64,0x6f,0x5f,0x5f}, 0x21000f42}, // do__ SPANISH.12 TAGALOG.1
|
405
|
+
{{0x64,0x7a,0x5f,0x5f}, 0x37000535}, // dz__ FRENCH.10 ARABIC.9
|
406
|
+
{{0x65,0x63,0x5f,0x5f}, 0x00000f42}, // ec__ SPANISH.12
|
407
|
+
// {{0x65,0x64,0x75,0x5f}, 0x2e0f3873}, // edu_ CATALAN.9 SPANISH.7 ALBANIAN.2
|
408
|
+
{{0x65,0x65,0x5f,0x5f}, 0x00001942}, // ee__ ESTONIAN.12
|
409
|
+
{{0x65,0x67,0x5f,0x5f}, 0x05003742}, // eg__ ARABIC.12 FRENCH.1
|
410
|
+
{{0x65,0x72,0x5f,0x5f}, 0x00000b42}, // er__ NORWEGIAN.12
|
411
|
+
{{0x65,0x73,0x5f,0x5f}, 0x38200fd4}, // es__ SPANISH.11 GALICIAN.4 CATALAN.2
|
412
|
+
{{0x65,0x74,0x5f,0x5f}, 0x39004a39}, // et__ AMHARIC.11 ESPERANTO.3
|
413
|
+
{{0x66,0x69,0x5f,0x5f}, 0x10000444}, // fi__ FINNISH.12 SWEDISH.3
|
414
|
+
{{0x66,0x6a,0x5f,0x5f}, 0x050489e0}, // fj__ FIJIAN.12 FINNISH.5 FRENCH.3
|
415
|
+
{{0x66,0x6f,0x5f,0x5f}, 0x00004742}, // fo__ FAROESE.12
|
416
|
+
{{0x66,0x72,0x5f,0x5f}, 0x00000542}, // fr__ FRENCH.12
|
417
|
+
{{0x67,0x61,0x5f,0x5f}, 0x00000542}, // ga__ FRENCH.12
|
418
|
+
{{0x67,0x64,0x5f,0x5f}, 0x061d05d5}, // gd__ FRENCH.11 CROATIAN.5 GERMAN.3
|
419
|
+
{{0x67,0x65,0x5f,0x5f}, 0x00004c2d}, // ge__ GEORGIAN.10
|
420
|
+
{{0x67,0x66,0x5f,0x5f}, 0x00000542}, // gf__ FRENCH.12
|
421
|
+
{{0x67,0x67,0x5f,0x5f}, 0x06002244}, // gg__ TURKISH.12 GERMAN.3
|
422
|
+
{{0x67,0x68,0x5f,0x5f}, 0x05000436}, // gh__ FINNISH.10 FRENCH.10
|
423
|
+
{{0x67,0x69,0x5f,0x5f}, 0x0f0538ce}, // gi__ CATALAN.10 FRENCH.7 SPANISH.6
|
424
|
+
{{0x67,0x6c,0x5f,0x5f}, 0x398a0238}, // gl__ DANISH.11 GREENLANDIC.7 ESPERANTO.2
|
425
|
+
{{0x67,0x6d,0x5f,0x5f}, 0x0600043e}, // gm__ FINNISH.11 GERMAN.8
|
426
|
+
{{0x67,0x6e,0x5f,0x5f}, 0x00000542}, // gn__ FRENCH.12
|
427
|
+
// {{0x67,0x6f,0x76,0x5f}, 0x05000f25}, // gov_ SPANISH.9 FRENCH.2
|
428
|
+
{{0x67,0x70,0x5f,0x5f}, 0x00000542}, // gp__ FRENCH.12
|
429
|
+
{{0x67,0x71,0x5f,0x5f}, 0x0f000547}, // gq__ FRENCH.12 SPANISH.6
|
430
|
+
{{0x67,0x73,0x5f,0x5f}, 0x00000942}, // gs__ Japanese.12
|
431
|
+
{{0x67,0x74,0x5f,0x5f}, 0x00000f42}, // gt__ SPANISH.12
|
432
|
+
{{0x68,0x6b,0x5f,0x5f}, 0x11004643}, // hk__ ChineseT.12 Chinese.2
|
433
|
+
{{0x68,0x6d,0x5f,0x5f}, 0x4606092e}, // hm__ Japanese.10 GERMAN.6 ChineseT.2
|
434
|
+
{{0x68,0x6e,0x5f,0x5f}, 0x00000f42}, // hn__ SPANISH.12
|
435
|
+
{{0x68,0x72,0x5f,0x5f}, 0x00001d42}, // hr__ CROATIAN.12
|
436
|
+
{{0x68,0x74,0x5f,0x5f}, 0x0f000542}, // ht__ FRENCH.12 SPANISH.1
|
437
|
+
{{0x68,0x75,0x5f,0x5f}, 0x00001842}, // hu__ HUNGARIAN.12
|
438
|
+
{{0x69,0x64,0x5f,0x5f}, 0x00002742}, // id__ INDONESIAN.12
|
439
|
+
{{0x69,0x65,0x5f,0x5f}, 0x050c1f24}, // ie__ IRISH.9 POLISH.5 FRENCH.1
|
440
|
+
{{0x69,0x6c,0x5f,0x5f}, 0x00000742}, // il__ HEBREW.12
|
441
|
+
{{0x69,0x6e,0x74,0x5f}, 0x0f060574}, // int_ FRENCH.9 GERMAN.8 SPANISH.3
|
442
|
+
{{0x69,0x6f,0x5f,0x5f}, 0x11090fd5}, // io__ SPANISH.11 Japanese.5 Chinese.3
|
443
|
+
{{0x69,0x71,0x5f,0x5f}, 0x60003744}, // iq__ ARABIC.12 KURDISH.3
|
444
|
+
{{0x69,0x72,0x5f,0x5f}, 0x00004e42}, // ir__ PERSIAN.12
|
445
|
+
{{0x69,0x73,0x5f,0x5f}, 0x00001442}, // is__ ICELANDIC.12
|
446
|
+
{{0x69,0x74,0x5f,0x5f}, 0x00000842}, // it__ ITALIAN.12
|
447
|
+
{{0x6a,0x65,0x5f,0x5f}, 0x29050328}, // je__ DUTCH.9 FRENCH.7 MALAY.5
|
448
|
+
{{0x6a,0x6d,0x5f,0x5f}, 0x040f0576}, // jm__ FRENCH.9 SPANISH.8 FINNISH.5
|
449
|
+
{{0x6a,0x6f,0x5f,0x5f}, 0x00003742}, // jo__ ARABIC.12
|
450
|
+
// {{0x6a,0x6f,0x62,0x73}, 0x0f060329}, // jobs DUTCH.9 GERMAN.8 SPANISH.6
|
451
|
+
{{0x6a,0x70,0x5f,0x5f}, 0x00000942}, // jp__ Japanese.12
|
452
|
+
{{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3}, // ke__ SWAHILI.9 SPANISH.5 FINNISH.4
|
453
|
+
{{0x6b,0x69,0x5f,0x5f}, 0x04000643}, // ki__ GERMAN.12 FINNISH.2
|
454
|
+
{{0x6b,0x6d,0x5f,0x5f}, 0x00000542}, // km__ FRENCH.12
|
455
|
+
{{0x6b,0x70,0x5f,0x5f}, 0x00000a42}, // kp__ Korean.12
|
456
|
+
{{0x6b,0x72,0x5f,0x5f}, 0x00000a42}, // kr__ Korean.12
|
457
|
+
{{0x6b,0x77,0x5f,0x5f}, 0x00003742}, // kw__ ARABIC.12
|
458
|
+
{{0x6b,0x79,0x5f,0x5f}, 0x0500083f}, // ky__ ITALIAN.11 FRENCH.9
|
459
|
+
{{0x6b,0x7a,0x5f,0x5f}, 0x0000732d}, // kz__ KAZAKH.10
|
460
|
+
{{0x6c,0x62,0x5f,0x5f}, 0x05003747}, // lb__ ARABIC.12 FRENCH.6
|
461
|
+
{{0x6c,0x63,0x5f,0x5f}, 0x09000645}, // lc__ GERMAN.12 Japanese.4
|
462
|
+
{{0x6c,0x69,0x5f,0x5f}, 0x1600063d}, // li__ GERMAN.11 LITHUANIAN.7
|
463
|
+
{{0x6c,0x73,0x5f,0x5f}, 0x00005742}, // ls__ SESOTHO.12
|
464
|
+
{{0x6c,0x74,0x5f,0x5f}, 0x00001642}, // lt__ LITHUANIAN.12
|
465
|
+
{{0x6c,0x75,0x5f,0x5f}, 0x0600053d}, // lu__ FRENCH.11 GERMAN.7
|
466
|
+
{{0x6c,0x76,0x5f,0x5f}, 0x00001542}, // lv__ LATVIAN.12
|
467
|
+
{{0x6c,0x79,0x5f,0x5f}, 0x05003744}, // ly__ ARABIC.12 FRENCH.3
|
468
|
+
{{0x6d,0x61,0x5f,0x5f}, 0x3700053d}, // ma__ FRENCH.11 ARABIC.7
|
469
|
+
{{0x6d,0x63,0x5f,0x5f}, 0x00000542}, // mc__ FRENCH.12
|
470
|
+
{{0x6d,0x64,0x5f,0x5f}, 0x00001724}, // md__ ROMANIAN.9
|
471
|
+
{{0x6d,0x65,0x5f,0x5f}, 0x00001d42}, // me__ CROATIAN.12
|
472
|
+
{{0x6d,0x67,0x5f,0x5f}, 0x00000542}, // mg__ FRENCH.12
|
473
|
+
{{0x6d,0x6b,0x5f,0x5f}, 0x1c002543}, // mk__ MACEDONIAN.12 BULGARIAN.2
|
474
|
+
{{0x6d,0x6c,0x5f,0x5f}, 0x00000542}, // ml__ FRENCH.12
|
475
|
+
{{0x6d,0x6e,0x5f,0x5f}, 0x00006142}, // mn__ MONGOLIAN.12
|
476
|
+
{{0x6d,0x6f,0x5f,0x5f}, 0x110d4631}, // mo__ ChineseT.10 PORTUGUESE.8 Chinese.5
|
477
|
+
{{0x6d,0x71,0x5f,0x5f}, 0x00000542}, // mq__ FRENCH.12
|
478
|
+
{{0x6d,0x72,0x5f,0x5f}, 0x37000535}, // mr__ FRENCH.10 ARABIC.9
|
479
|
+
{{0x6d,0x73,0x5f,0x5f}, 0x090f06d5}, // ms__ GERMAN.11 SPANISH.5 Japanese.3
|
480
|
+
{{0x6d,0x74,0x5f,0x5f}, 0x00004242}, // mt__ MALTESE.12
|
481
|
+
{{0x6d,0x75,0x5f,0x5f}, 0x05000934}, // mu__ Japanese.10 FRENCH.8
|
482
|
+
{{0x6d,0x76,0x5f,0x5f}, 0x28000436}, // mv__ FINNISH.10 LATIN.10
|
483
|
+
{{0x6d,0x77,0x5f,0x5f}, 0x0611092a}, // mw__ Japanese.9 Chinese.8 GERMAN.7
|
484
|
+
{{0x6d,0x78,0x5f,0x5f}, 0x00000f42}, // mx__ SPANISH.12
|
485
|
+
{{0x6d,0x79,0x5f,0x5f}, 0x00002942}, // my__ MALAY.12
|
486
|
+
{{0x6d,0x7a,0x5f,0x5f}, 0x00000d42}, // mz__ PORTUGUESE.12
|
487
|
+
{{0x6e,0x61,0x5f,0x5f}, 0x06006644}, // na__ AFRIKAANS.12 GERMAN.3
|
488
|
+
{{0x6e,0x63,0x5f,0x5f}, 0x00000542}, // nc__ FRENCH.12
|
489
|
+
{{0x6e,0x65,0x5f,0x5f}, 0x8b000542}, // ne__ FRENCH.12 HAUSA.1
|
490
|
+
{{0x6e,0x66,0x5f,0x5f}, 0x00000542}, // nf__ FRENCH.12
|
491
|
+
{{0x6e,0x69,0x5f,0x5f}, 0x00000f42}, // ni__ SPANISH.12
|
492
|
+
{{0x6e,0x6c,0x5f,0x5f}, 0x00000342}, // nl__ DUTCH.12
|
493
|
+
{{0x6e,0x6f,0x5f,0x5f}, 0x51000b43}, // no__ NORWEGIAN.12 NORWEGIAN_N.2
|
494
|
+
{{0x6e,0x75,0x5f,0x5f}, 0x0300103b}, // nu__ SWEDISH.11 DUTCH.5
|
495
|
+
{{0x6f,0x6d,0x5f,0x5f}, 0x00003742}, // om__ ARABIC.12
|
496
|
+
{{0x70,0x61,0x5f,0x5f}, 0x00000f42}, // pa__ SPANISH.12
|
497
|
+
{{0x70,0x65,0x5f,0x5f}, 0x00000f42}, // pe__ SPANISH.12
|
498
|
+
{{0x70,0x66,0x5f,0x5f}, 0x00000542}, // pf__ FRENCH.12
|
499
|
+
{{0x70,0x67,0x5f,0x5f}, 0x00000f24}, // pg__ SPANISH.9
|
500
|
+
{{0x70,0x68,0x5f,0x5f}, 0x00002142}, // ph__ TAGALOG.12
|
501
|
+
{{0x70,0x6b,0x5f,0x5f}, 0x00003342}, // pk__ URDU.12
|
502
|
+
{{0x70,0x6c,0x5f,0x5f}, 0x30000c42}, // pl__ POLISH.12 BELARUSIAN.1
|
503
|
+
{{0x70,0x6e,0x5f,0x5f}, 0x04000644}, // pn__ GERMAN.12 FINNISH.3
|
504
|
+
{{0x70,0x72,0x5f,0x5f}, 0x00000f42}, // pr__ SPANISH.12
|
505
|
+
{{0x70,0x72,0x6f,0x5f}, 0x46050fd5}, // pro_ SPANISH.11 FRENCH.5 ChineseT.3
|
506
|
+
{{0x70,0x73,0x5f,0x5f}, 0x00003742}, // ps__ ARABIC.12
|
507
|
+
{{0x70,0x74,0x5f,0x5f}, 0x00000d42}, // pt__ PORTUGUESE.12
|
508
|
+
{{0x70,0x79,0x5f,0x5f}, 0x00000f42}, // py__ SPANISH.12
|
509
|
+
{{0x71,0x61,0x5f,0x5f}, 0x00003742}, // qa__ ARABIC.12
|
510
|
+
{{0x72,0x65,0x5f,0x5f}, 0x00000542}, // re__ FRENCH.12
|
511
|
+
{{0x72,0x6f,0x5f,0x5f}, 0x00001742}, // ro__ ROMANIAN.12
|
512
|
+
{{0x72,0x73,0x5f,0x5f}, 0x00001d42}, // rs__ CROATIAN.12
|
513
|
+
{{0x72,0x77,0x5f,0x5f}, 0x9000053e}, // rw__ FRENCH.11 KINYARWANDA.8
|
514
|
+
{{0x73,0x61,0x5f,0x5f}, 0x00003742}, // sa__ ARABIC.12
|
515
|
+
{{0x73,0x62,0x5f,0x5f}, 0x00000442}, // sb__ FINNISH.12
|
516
|
+
{{0x73,0x63,0x5f,0x5f}, 0x060f092f}, // sc__ Japanese.10 SPANISH.7 GERMAN.3
|
517
|
+
{{0x73,0x64,0x5f,0x5f}, 0x00003742}, // sd__ ARABIC.12
|
518
|
+
{{0x73,0x65,0x5f,0x5f}, 0x00001042}, // se__ SWEDISH.12
|
519
|
+
{{0x73,0x69,0x5f,0x5f}, 0x00004042}, // si__ SLOVENIAN.12
|
520
|
+
{{0x73,0x6b,0x5f,0x5f}, 0x12004543}, // sk__ SLOVAK.12 CZECH.2
|
521
|
+
{{0x73,0x6d,0x5f,0x5f}, 0x00000842}, // sm__ ITALIAN.12
|
522
|
+
{{0x73,0x6e,0x5f,0x5f}, 0x00000542}, // sn__ FRENCH.12
|
523
|
+
{{0x73,0x72,0x5f,0x5f}, 0x03001e44}, // sr__ SERBIAN.12 DUTCH.3
|
524
|
+
{{0x73,0x76,0x5f,0x5f}, 0x00000f42}, // sv__ SPANISH.12
|
525
|
+
{{0x73,0x79,0x5f,0x5f}, 0x00003742}, // sy__ ARABIC.12
|
526
|
+
{{0x74,0x63,0x5f,0x5f}, 0x0a2206cd}, // tc__ GERMAN.10 TURKISH.6 Korean.5
|
527
|
+
{{0x74,0x66,0x5f,0x5f}, 0x00000642}, // tf__ GERMAN.12
|
528
|
+
{{0x74,0x67,0x5f,0x5f}, 0x00000542}, // tg__ FRENCH.12
|
529
|
+
{{0x74,0x68,0x5f,0x5f}, 0x9e0936c9}, // th__ THAI.10 Japanese.3 SCOTS.1
|
530
|
+
{{0x74,0x6a,0x5f,0x5f}, 0x00007924}, // tj__ TAJIK.9
|
531
|
+
{{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd}, // tl__ PORTUGUESE.10 SPANISH.6 GERMAN.5
|
532
|
+
{{0x74,0x6e,0x5f,0x5f}, 0x3700053e}, // tn__ FRENCH.11 ARABIC.8
|
533
|
+
{{0x74,0x6f,0x5f,0x5f}, 0x064609c5}, // to__ Japanese.9 ChineseT.7 GERMAN.6
|
534
|
+
{{0x74,0x70,0x5f,0x5f}, 0x06000944}, // tp__ Japanese.12 GERMAN.3
|
535
|
+
{{0x74,0x72,0x5f,0x5f}, 0x00002242}, // tr__ TURKISH.12
|
536
|
+
{{0x74,0x72,0x61,0x76}, 0x064509c3}, // trav Japanese.9 SLOVAK.5 GERMAN.4
|
537
|
+
{{0x74,0x74,0x5f,0x5f}, 0x0f00063e}, // tt__ GERMAN.11 SPANISH.8
|
538
|
+
{{0x74,0x77,0x5f,0x5f}, 0x00004642}, // tw__ ChineseT.12
|
539
|
+
{{0x74,0x7a,0x5f,0x5f}, 0x00003f42}, // tz__ SWAHILI.12
|
540
|
+
{{0x75,0x61,0x5f,0x5f}, 0x0000232d}, // ua__ UKRAINIAN.10
|
541
|
+
{{0x75,0x79,0x5f,0x5f}, 0x00000f42}, // uy__ SPANISH.12
|
542
|
+
{{0x75,0x7a,0x5f,0x5f}, 0x0000492d}, // uz__ UZBEK.10
|
543
|
+
{{0x76,0x61,0x5f,0x5f}, 0x060f0828}, // va__ ITALIAN.9 SPANISH.7 GERMAN.5
|
544
|
+
{{0x76,0x63,0x5f,0x5f}, 0x0d000939}, // vc__ Japanese.11 PORTUGUESE.3
|
545
|
+
{{0x76,0x65,0x5f,0x5f}, 0x00000f42}, // ve__ SPANISH.12
|
546
|
+
{{0x76,0x67,0x5f,0x5f}, 0x09000f43}, // vg__ SPANISH.12 Japanese.2
|
547
|
+
{{0x76,0x69,0x5f,0x5f}, 0x00002942}, // vi__ MALAY.12
|
548
|
+
{{0x76,0x6e,0x5f,0x5f}, 0x00004342}, // vn__ VIETNAMESE.12
|
549
|
+
{{0x76,0x75,0x5f,0x5f}, 0x00000642}, // vu__ GERMAN.12
|
550
|
+
{{0x77,0x73,0x5f,0x5f}, 0x4b0f0624}, // ws__ GERMAN.9 SPANISH.5 AZERBAIJANI.1
|
551
|
+
{{0x79,0x65,0x5f,0x5f}, 0x00003742}, // ye__ ARABIC.12
|
552
|
+
{{0x79,0x75,0x5f,0x5f}, 0x1e001d3d}, // yu__ CROATIAN.11 SERBIAN.7
|
553
|
+
{{0x7a,0x61,0x5f,0x5f}, 0x00006642}, // za__ AFRIKAANS.12
|
554
|
+
{{0x7a,0x6d,0x5f,0x5f}, 0x0b000435}, // zm__ FINNISH.10 NORWEGIAN.9
|
555
|
+
{{0x7a,0x77,0x5f,0x5f}, 0x3f00783e}, // zw__ SHONA.11 SWAHILI.8
|
556
|
+
};
|
557
|
+
|
558
|
+
|
559
|
+
// Statistically closest language, based on quadgram table
|
560
|
+
// Those that are far from other languages map to UNKNOWN_LANGUAGE
|
561
|
+
// Subscripted by Language
|
562
|
+
//
|
563
|
+
// From lang_correlation.txt and hand-edits
|
564
|
+
// sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
|
565
|
+
// (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
|
566
|
+
// \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
|
567
|
+
//
|
568
|
+
static const int kMinCorrPercent = 24; // Pick off how close you want
|
569
|
+
// 24 catches PERSIAN <== ARABIC
|
570
|
+
// but not SPANISH <== PORTUGUESE
|
571
|
+
static Language Unknown = UNKNOWN_LANGUAGE;
|
572
|
+
|
573
|
+
// Subscripted by Language
|
574
|
+
static const Language kClosestAltLanguage[] = {
|
575
|
+
(28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH
|
576
|
+
(36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH
|
577
|
+
(31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH
|
578
|
+
(15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH
|
579
|
+
(11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH
|
580
|
+
(17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN
|
581
|
+
(27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW
|
582
|
+
(16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN
|
583
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese
|
584
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean
|
585
|
+
(41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN
|
586
|
+
( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH
|
587
|
+
(23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE
|
588
|
+
(33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN
|
589
|
+
(28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH
|
590
|
+
(17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH
|
591
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese
|
592
|
+
(42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH
|
593
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK
|
594
|
+
(35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC
|
595
|
+
( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN
|
596
|
+
( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN
|
597
|
+
( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN
|
598
|
+
( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN
|
599
|
+
(15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN
|
600
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore
|
601
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown
|
602
|
+
(33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN
|
603
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN
|
604
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN
|
605
|
+
(24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH
|
606
|
+
(28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN
|
607
|
+
( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG
|
608
|
+
(29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH
|
609
|
+
(28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN
|
610
|
+
(37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI
|
611
|
+
(29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN
|
612
|
+
(14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI
|
613
|
+
(46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN
|
614
|
+
( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN
|
615
|
+
(46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY
|
616
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM
|
617
|
+
( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH
|
618
|
+
( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI
|
619
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU
|
620
|
+
( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN
|
621
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL
|
622
|
+
(22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN
|
623
|
+
(15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE
|
624
|
+
(19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN
|
625
|
+
(27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU
|
626
|
+
(36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI
|
627
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI
|
628
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI
|
629
|
+
(24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC
|
630
|
+
(19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN
|
631
|
+
( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO
|
632
|
+
( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE
|
633
|
+
( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA
|
634
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA
|
635
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI
|
636
|
+
(24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC
|
637
|
+
( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI
|
638
|
+
(28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE, // SLOVENIAN
|
639
|
+
(37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI
|
640
|
+
( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE
|
641
|
+
( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE
|
642
|
+
(15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN
|
643
|
+
(42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK
|
644
|
+
// Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
|
645
|
+
(24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT
|
646
|
+
(35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE
|
647
|
+
(15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE
|
648
|
+
(17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK
|
649
|
+
( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC
|
650
|
+
(29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI
|
651
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN
|
652
|
+
( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA
|
653
|
+
(27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN
|
654
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN
|
655
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE
|
656
|
+
(41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N
|
657
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P
|
658
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B
|
659
|
+
(37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA
|
660
|
+
(37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU
|
661
|
+
( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI
|
662
|
+
(29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO
|
663
|
+
( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN
|
664
|
+
( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ
|
665
|
+
( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON
|
666
|
+
( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI
|
667
|
+
(27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH
|
668
|
+
(28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN
|
669
|
+
(12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI
|
670
|
+
( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR
|
671
|
+
(15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH
|
672
|
+
( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN
|
673
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN
|
674
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN
|
675
|
+
( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI
|
676
|
+
(10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE
|
677
|
+
(31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS
|
678
|
+
(17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH
|
679
|
+
( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE
|
680
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER
|
681
|
+
(45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN
|
682
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI
|
683
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE
|
684
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC
|
685
|
+
( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU
|
686
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA
|
687
|
+
(14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE
|
688
|
+
(16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN
|
689
|
+
( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE
|
690
|
+
( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH
|
691
|
+
( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA
|
692
|
+
(11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN
|
693
|
+
(19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO
|
694
|
+
( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA
|
695
|
+
( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA
|
696
|
+
(17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK
|
697
|
+
(13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR
|
698
|
+
(11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA
|
699
|
+
( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA
|
700
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED
|
701
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED
|
702
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
|
703
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER
|
704
|
+
( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI
|
705
|
+
( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF
|
706
|
+
( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN
|
707
|
+
( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR
|
708
|
+
( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA
|
709
|
+
(13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR
|
710
|
+
( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA
|
711
|
+
(45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA
|
712
|
+
( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN
|
713
|
+
( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC
|
714
|
+
( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA
|
715
|
+
( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE
|
716
|
+
( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK
|
717
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT
|
718
|
+
( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI
|
719
|
+
(30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA
|
720
|
+
( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY
|
721
|
+
(17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU
|
722
|
+
(12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO
|
723
|
+
(30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI
|
724
|
+
(11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN
|
725
|
+
( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO
|
726
|
+
(32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT
|
727
|
+
(16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT
|
728
|
+
( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA
|
729
|
+
(29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA
|
730
|
+
( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK
|
731
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG
|
732
|
+
( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI
|
733
|
+
(28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS
|
734
|
+
(15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA
|
735
|
+
( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX
|
736
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN
|
737
|
+
};
|
738
|
+
|
739
|
+
COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
|
740
|
+
kClosestAltLanguage_has_incorrect_size);
|
741
|
+
|
742
|
+
|
743
|
+
inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
|
744
|
+
inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
|
745
|
+
inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
|
746
|
+
inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
|
747
|
+
inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
|
748
|
+
inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
|
749
|
+
inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
|
750
|
+
|
751
|
+
|
752
|
+
|
753
|
+
|
754
|
+
//------------------------------------------------------------------------------
|
755
|
+
// For --cld_html debugging output. Not thread safe
|
756
|
+
//------------------------------------------------------------------------------
|
757
|
+
static Language prior_lang = UNKNOWN_LANGUAGE;
|
758
|
+
static bool prior_unreliable = false;
|
759
|
+
|
760
|
+
//------------------------------------------------------------------------------
|
761
|
+
// End For --cld_html debugging output
|
762
|
+
//------------------------------------------------------------------------------
|
763
|
+
|
764
|
+
|
765
|
+
// Backscan to word boundary, returning how many bytes n to go back
|
766
|
+
// so that src - n is non-space ans src - n - 1 is space.
|
767
|
+
// If not found in kMaxSpaceScan bytes, return 0
|
768
|
+
int BackscanToSpace(const char* src, int limit) {
|
769
|
+
int n = 0;
|
770
|
+
limit = cld::minint(limit, kMaxSpaceScan);
|
771
|
+
while (n < limit) {
|
772
|
+
if (src[-n - 1] == ' ') {return n;} // We are at _X
|
773
|
+
++n;
|
774
|
+
}
|
775
|
+
return 0;
|
776
|
+
}
|
777
|
+
|
778
|
+
// Forwardscan to word boundary, returning how many bytes n to go forward
|
779
|
+
// so that src + n is non-space ans src + n - 1 is space.
|
780
|
+
// If not found in kMaxSpaceScan bytes, return 0
|
781
|
+
int ForwardscanToSpace(const char* src, int limit) {
|
782
|
+
int n = 0;
|
783
|
+
limit = cld::minint(limit, kMaxSpaceScan);
|
784
|
+
while (n < limit) {
|
785
|
+
if (src[n] == ' ') {return n + 1;} // We are at _X
|
786
|
+
++n;
|
787
|
+
}
|
788
|
+
return 0;
|
789
|
+
}
|
790
|
+
|
791
|
+
|
792
|
+
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
// all the time when done with a byte-based count. Sigh.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
// On return *hash holds the updated hash and tbl the updated predictions.
//
// Returns the number of *bytes* correctly predicted: each correctly-predicted
// character adds its encoded length (1..4) to the result, so the value is
// directly comparable with byte-based thresholds.
//
// NOTE: Overruns by up to three bytes when the last lead byte announces a
// longer character than srclen allows. Not a problem with valid UTF-8 text
//
int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
  int p_count = 0;
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + srclen;
  int local_hash = *hash;

  while (src < srclimit) {
    int c = src[0];
    int incr = 1;

    // Pick up one char and length
    if (c < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((c & 0xe0) == 0xc0) {
      // Two-byte
      c = (c << 8) | src[1];
      incr = 2;
    } else if ((c & 0xf0) == 0xe0) {
      // Three-byte
      c = (c << 16) | (src[1] << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
      incr = 4;
    }
    src += incr;

    int p = tbl[local_hash];            // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      // Credit every byte of the character, not just one per character;
      // otherwise multi-byte text is under-counted against byte thresholds.
      p_count += incr;
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }

  *hash = local_hash;
  return p_count;
}
|
844
|
+
|
845
|
+
|
846
|
+
|
847
|
+
// Count the ASCII spaces in src, processing four bytes per step.
// Only whole groups of four are examined: the final (src_len % 4) bytes are
// not counted, and a src_len below 4 yields 0.
int CountSpaces4(const char* src, int src_len) {
  const int quad_limit = src_len & ~3;  // Length rounded down to multiple of 4
  int total = 0;
  int pos = 0;
  while (pos < quad_limit) {
    const char* quad = src + pos;
    if (quad[0] == ' ') {++total;}
    if (quad[1] == ' ') {++total;}
    if (quad[2] == ' ') {++total;}
    if (quad[3] == ' ') {++total;}
    pos += 4;
  }
  return total;
}
|
859
|
+
|
860
|
+
// Remove words of text that have more than half their letters predicted
|
861
|
+
// correctly by our cheap predictor, moving the remaining words in-place
|
862
|
+
// to the front of the input buffer.
|
863
|
+
//
|
864
|
+
// To allow running prediction across multiple chunks, caller passes in current
|
865
|
+
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
|
866
|
+
//
|
867
|
+
// Return the new, possibly-shorter length
|
868
|
+
//
|
869
|
+
// Result Buffer ALWAYS has leading space and trailing space space space NUL,
|
870
|
+
// if input does
|
871
|
+
//
|
872
|
+
int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) {
|
873
|
+
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
874
|
+
const uint8* srclimit = src + srclen;
|
875
|
+
char* dst = isrc;
|
876
|
+
int local_hash = *hash;
|
877
|
+
char* word_dst = dst; // Start of next word
|
878
|
+
int good_predict_bytes = 0;
|
879
|
+
int word_length_bytes = 0;
|
880
|
+
|
881
|
+
while (src < srclimit) {
|
882
|
+
int c = src[0];
|
883
|
+
int incr = 1;
|
884
|
+
*dst++ = c;
|
885
|
+
|
886
|
+
if (c == ' ') {
|
887
|
+
if ((good_predict_bytes * 2) > word_length_bytes) {
|
888
|
+
// Word is well-predicted: backup to start of this word
|
889
|
+
dst = word_dst;
|
890
|
+
if (FLAGS_cld_showme) {
|
891
|
+
// Mark the deletion point with period
|
892
|
+
// Don't repeat multiple periods
|
893
|
+
// Cannot mark with more bytes or may overwrite unseen input
|
894
|
+
if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
|
895
|
+
*dst++ = '.';
|
896
|
+
*dst++ = ' ';
|
897
|
+
}
|
898
|
+
}
|
899
|
+
}
|
900
|
+
word_dst = dst; // Start of next word
|
901
|
+
good_predict_bytes = 0;
|
902
|
+
word_length_bytes = 0;
|
903
|
+
}
|
904
|
+
|
905
|
+
// Pick up one char and length
|
906
|
+
if (c < 0xc0) {
|
907
|
+
// One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
|
908
|
+
// Do nothing more
|
909
|
+
} else if ((c & 0xe0) == 0xc0) {
|
910
|
+
// Two-byte
|
911
|
+
*dst++ = src[1];
|
912
|
+
c = (c << 8) | src[1];
|
913
|
+
incr = 2;
|
914
|
+
} else if ((c & 0xf0) == 0xe0) {
|
915
|
+
// Three-byte
|
916
|
+
*dst++ = src[1];
|
917
|
+
*dst++ = src[2];
|
918
|
+
c = (c << 16) | (src[1] << 8) | src[2];
|
919
|
+
incr = 3;
|
920
|
+
} else {
|
921
|
+
// Four-byte
|
922
|
+
*dst++ = src[1];
|
923
|
+
*dst++ = src[2];
|
924
|
+
*dst++ = src[3];
|
925
|
+
c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
|
926
|
+
incr = 4;
|
927
|
+
}
|
928
|
+
src += incr;
|
929
|
+
word_length_bytes += incr;
|
930
|
+
|
931
|
+
int p = tbl[local_hash]; // Prediction
|
932
|
+
tbl[local_hash] = c; // Update prediction
|
933
|
+
if (c == p) {
|
934
|
+
good_predict_bytes += incr; // Count good predictions
|
935
|
+
}
|
936
|
+
|
937
|
+
local_hash = ((local_hash << 4) ^ c) & 0xfff;
|
938
|
+
}
|
939
|
+
|
940
|
+
*hash = local_hash;
|
941
|
+
|
942
|
+
if ((dst - isrc) < (srclen - 3)) {
|
943
|
+
// Pad and make last char clean UTF-8 by putting following spaces
|
944
|
+
dst[0] = ' ';
|
945
|
+
dst[1] = ' ';
|
946
|
+
dst[2] = ' ';
|
947
|
+
dst[3] = '\0';
|
948
|
+
} else if ((dst - isrc) < srclen) {
|
949
|
+
// Make last char clean UTF-8 by putting following space off the end
|
950
|
+
dst[0] = ' ';
|
951
|
+
}
|
952
|
+
|
953
|
+
return static_cast<int>(dst - isrc);
|
954
|
+
}
|
955
|
+
|
956
|
+
|
957
|
+
// Remove portions of text that have a high density of spaces, or that are
|
958
|
+
// overly repetitive, squeezing the remaining text in-place to the front of the
|
959
|
+
// input buffer.
|
960
|
+
//
|
961
|
+
// Squeezing looks at density of space/prediced chars in fixed-size chunks,
|
962
|
+
// specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
|
963
|
+
//
|
964
|
+
// Return the new, possibly-shorter length
|
965
|
+
//
|
966
|
+
// Result Buffer ALWAYS has leading space and trailing space space space NUL,
|
967
|
+
// if input does
|
968
|
+
//
|
969
|
+
int CompactLangDetImpl::CheapSqueezeInplace(char* isrc,
|
970
|
+
int srclen,
|
971
|
+
int ichunksize) {
|
972
|
+
char* src = isrc;
|
973
|
+
char* dst = src;
|
974
|
+
char* srclimit = src + srclen;
|
975
|
+
bool skipping = false;
|
976
|
+
|
977
|
+
int hash = 0;
|
978
|
+
// Allocate local prediction table.
|
979
|
+
int* predict_tbl = new int[kPredictionTableSize];
|
980
|
+
memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
|
981
|
+
|
982
|
+
int chunksize = ichunksize;
|
983
|
+
if (chunksize == 0) {chunksize = kChunksizeDefault;}
|
984
|
+
int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
|
985
|
+
int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
|
986
|
+
|
987
|
+
while (src < srclimit) {
|
988
|
+
int remaining_bytes = srclimit - src;
|
989
|
+
int len = cld::minint(chunksize, remaining_bytes);
|
990
|
+
int space_n = CountSpaces4(src, len);
|
991
|
+
int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
|
992
|
+
if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
|
993
|
+
// Skip the text
|
994
|
+
if (!skipping) {
|
995
|
+
// Keeping-to-skipping transition; do it at a space
|
996
|
+
int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
|
997
|
+
dst -= n;
|
998
|
+
skipping = true;
|
999
|
+
if (FLAGS_cld_showme) {
|
1000
|
+
// Mark the deletion point with black square U+25A0
|
1001
|
+
*dst++ = 0xe2;
|
1002
|
+
*dst++ = 0x96;
|
1003
|
+
*dst++ = 0xa0;
|
1004
|
+
*dst++ = ' ';
|
1005
|
+
}
|
1006
|
+
if (dst == isrc) {
|
1007
|
+
// Force a leading space if the first chunk is deleted
|
1008
|
+
*dst++ = ' ';
|
1009
|
+
}
|
1010
|
+
}
|
1011
|
+
} else {
|
1012
|
+
// Keep the text
|
1013
|
+
if (skipping) {
|
1014
|
+
// Skipping-to-keeping transition; do it at a space
|
1015
|
+
int n = ForwardscanToSpace(src, len);
|
1016
|
+
src += n;
|
1017
|
+
remaining_bytes -= n; // Shrink remaining length
|
1018
|
+
len -= n;
|
1019
|
+
skipping = false;
|
1020
|
+
}
|
1021
|
+
// "len" can be negative in some cases
|
1022
|
+
if (len > 0) {
|
1023
|
+
memcpy(dst, src, len);
|
1024
|
+
dst += len;
|
1025
|
+
}
|
1026
|
+
}
|
1027
|
+
src += len;
|
1028
|
+
}
|
1029
|
+
|
1030
|
+
if ((dst - isrc) < (srclen - 3)) {
|
1031
|
+
// Pad and make last char clean UTF-8 by putting following spaces
|
1032
|
+
dst[0] = ' ';
|
1033
|
+
dst[1] = ' ';
|
1034
|
+
dst[2] = ' ';
|
1035
|
+
dst[3] = '\0';
|
1036
|
+
} else if ((dst - isrc) < srclen) {
|
1037
|
+
// Make last char clean UTF-8 by putting following space off the end
|
1038
|
+
dst[0] = ' ';
|
1039
|
+
}
|
1040
|
+
|
1041
|
+
// Deallocate local prediction table
|
1042
|
+
delete[] predict_tbl;
|
1043
|
+
return static_cast<int>(dst - isrc);
|
1044
|
+
}
|
1045
|
+
|
1046
|
+
// Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
|
1047
|
+
// About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
|
1048
|
+
// Just CountSpaces is about 340 MB/sec
|
1049
|
+
// Byte-only CountPredictedBytes is about 150 MB/sec
|
1050
|
+
// Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
|
1051
|
+
// Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
|
1052
|
+
// Unjammed byte-only both = 170 MB/sec
|
1053
|
+
// Jammed byte-only both = 120 MB/sec
|
1054
|
+
// Back to original w/slight updates, 110 MB/sec
|
1055
|
+
//
|
1056
|
+
bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) {
|
1057
|
+
// Don't trigger at all on short text
|
1058
|
+
if (srclen < testsize) {return false;}
|
1059
|
+
int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
|
1060
|
+
int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
|
1061
|
+
int hash = 0;
|
1062
|
+
// Allocate local prediction table.
|
1063
|
+
int* predict_tbl = new int[kPredictionTableSize];
|
1064
|
+
memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
|
1065
|
+
|
1066
|
+
bool retval = false;
|
1067
|
+
if ((CountSpaces4(src, testsize) >= space_thresh) ||
|
1068
|
+
(CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
|
1069
|
+
predict_thresh)) {
|
1070
|
+
retval = true;
|
1071
|
+
}
|
1072
|
+
// Deallocate local prediction table
|
1073
|
+
delete[] predict_tbl;
|
1074
|
+
return retval;
|
1075
|
+
}
|
1076
|
+
|
1077
|
+
|
1078
|
+
|
1079
|
+
// Close pairs (correlation) language_enum/language_enum
|
1080
|
+
// id/ms (0.47) 38/40 [1]
|
1081
|
+
// bo/dz (0.46) 105/135 [2]
|
1082
|
+
// cz/sk (0.43) 17/68 [3]
|
1083
|
+
// no/nn (0.42) 10/80 [4]
|
1084
|
+
// hi/mr (0.38) 35/64 [5]
|
1085
|
+
// xh/zu (0.37) 83/84 [6]
|
1086
|
+
// Subscripted by packed language, gives 0 or a subscript in closepair
|
1087
|
+
// scoring array inside doc_tote
|
1088
|
+
static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = {
|
1089
|
+
0,
|
1090
|
+
0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
1091
|
+
0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
1092
|
+
5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0,
|
1093
|
+
0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
1094
|
+
0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
1095
|
+
// Add new language close-pair number just before here (just use 0)
|
1096
|
+
};
|
1097
|
+
|
1098
|
+
|
1099
|
+
// Delete any extended languages from doc_tote
|
1100
|
+
void RemoveExtendedLanguages(ToteWithReliability* doc_tote) {
|
1101
|
+
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
|
1102
|
+
if (cld::UnpackLanguage(doc_tote->Key(sub)) >= NUM_LANGUAGES) {
|
1103
|
+
// Effectively remove the extended language by setting key&score to zero
|
1104
|
+
if (FLAGS_dbgscore) {
|
1105
|
+
fprintf(stderr, "{-%s} ",
|
1106
|
+
ExtLanguageCode(cld::UnpackLanguage(doc_tote->Key(sub))));
|
1107
|
+
}
|
1108
|
+
|
1109
|
+
// Delete entry
|
1110
|
+
doc_tote->SetKey(sub, 0);
|
1111
|
+
doc_tote->SetValue(sub, 0);
|
1112
|
+
doc_tote->SetReliability(sub, 0);
|
1113
|
+
}
|
1114
|
+
}
|
1115
|
+
}
|
1116
|
+
|
1117
|
+
static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this
|
1118
|
+
|
1119
|
+
// For Tier3 languages, require a minimum number of bytes to be first-place lang
|
1120
|
+
static const int kGoodFirstT3MinBytes = 24; // <this => no first
|
1121
|
+
|
1122
|
+
// Move bytes for unreliable langs to another lang or UNKNOWN
|
1123
|
+
// doc_tote is sorted, so cannot Add
|
1124
|
+
//
|
1125
|
+
// If both CHINESE and CHINESET are present and unreliable, do not delete both;
|
1126
|
+
// merge both into CHINESE.
|
1127
|
+
//
|
1128
|
+
//dsites 2009.03.19
|
1129
|
+
// we also want to remove Tier3 languages as the first lang if there is very
|
1130
|
+
// little text like ej1 ej2 ej3 ej4
|
1131
|
+
// maybe fold this back in earlier
|
1132
|
+
//
|
1133
|
+
void RemoveUnreliableLanguages(ToteWithReliability* doc_tote, bool do_remove_weak_matches) {
|
1134
|
+
// Prepass to merge some low-reliablility languages
|
1135
|
+
int total_bytes = 0;
|
1136
|
+
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
|
1137
|
+
int plang = doc_tote->Key(sub);
|
1138
|
+
if (plang == 0) {continue;} // Empty slot
|
1139
|
+
|
1140
|
+
Language lang = cld::UnpackLanguage(plang);
|
1141
|
+
int bytes = doc_tote->Value(sub);
|
1142
|
+
int reli = doc_tote->Reliability(sub);
|
1143
|
+
if (bytes == 0) {continue;} // Zero bytes
|
1144
|
+
total_bytes += bytes;
|
1145
|
+
|
1146
|
+
// Reliable percent is stored reliable score over stored bytecount
|
1147
|
+
int reliable_percent = reli / bytes;
|
1148
|
+
if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
|
1149
|
+
|
1150
|
+
// This language is too unreliable to keep, but we might merge it.
|
1151
|
+
Language altlang = UNKNOWN_LANGUAGE;
|
1152
|
+
if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];}
|
1153
|
+
if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative
|
1154
|
+
|
1155
|
+
// Look for alternative in doc_tote
|
1156
|
+
int altsub = doc_tote->Find(cld::PackLanguage(altlang));
|
1157
|
+
if (altsub < 0) {continue;} // No alternative text
|
1158
|
+
|
1159
|
+
int bytes2 = doc_tote->Value(altsub);
|
1160
|
+
int reli2 = doc_tote->Reliability(altsub);
|
1161
|
+
if (bytes2 == 0) {continue;} // Zero bytes
|
1162
|
+
|
1163
|
+
// Reliable percent is stored reliable score over stored bytecount
|
1164
|
+
int reliable_percent2 = reli2 / bytes2;
|
1165
|
+
|
1166
|
+
// Merge one language into the other. Break ties toward lower lang #
|
1167
|
+
int tosub = altsub;
|
1168
|
+
int fromsub = sub;
|
1169
|
+
bool into_lang = false;
|
1170
|
+
if ((reliable_percent2 < reliable_percent) ||
|
1171
|
+
((reliable_percent2 == reliable_percent) && (lang < altlang))) {
|
1172
|
+
tosub = sub;
|
1173
|
+
fromsub = altsub;
|
1174
|
+
into_lang = true;
|
1175
|
+
}
|
1176
|
+
|
1177
|
+
// Make sure reliability doesn't drop and is enough to avoid delete
|
1178
|
+
int newpercent = cld::maxint(reliable_percent, reliable_percent2);
|
1179
|
+
newpercent = cld::maxint(newpercent, kMinReliableKeepPercent);
|
1180
|
+
int newbytes = bytes + bytes2;
|
1181
|
+
int newreli = newpercent * newbytes;
|
1182
|
+
|
1183
|
+
doc_tote->SetKey(fromsub, 0);
|
1184
|
+
doc_tote->SetValue(fromsub, 0);
|
1185
|
+
doc_tote->SetReliability(fromsub, 0);
|
1186
|
+
doc_tote->SetValue(tosub, newbytes);
|
1187
|
+
doc_tote->SetReliability(tosub, newreli);
|
1188
|
+
|
1189
|
+
// Show fate of unreliable languages if at least 10 bytes
|
1190
|
+
if (FLAGS_cld_html /*&& (newpercent >= 10)*/ && (newbytes >= 10)) {
|
1191
|
+
if (into_lang) {
|
1192
|
+
fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
|
1193
|
+
ExtLanguageCode(altlang), reliable_percent2, bytes2,
|
1194
|
+
ExtLanguageCode(lang));
|
1195
|
+
} else {
|
1196
|
+
fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
|
1197
|
+
ExtLanguageCode(lang), reliable_percent, bytes,
|
1198
|
+
ExtLanguageCode(altlang));
|
1199
|
+
}
|
1200
|
+
}
|
1201
|
+
}
|
1202
|
+
|
1203
|
+
|
1204
|
+
if (do_remove_weak_matches) {
|
1205
|
+
// Pass to delete any remaining unreliable languages
|
1206
|
+
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
|
1207
|
+
int plang = doc_tote->Key(sub);
|
1208
|
+
if (plang == 0) {continue;} // Empty slot
|
1209
|
+
|
1210
|
+
Language lang = cld::UnpackLanguage(plang);
|
1211
|
+
int bytes = doc_tote->Value(sub);
|
1212
|
+
int reli = doc_tote->Reliability(sub);
|
1213
|
+
if (bytes == 0) {continue;} // Zero bytes
|
1214
|
+
|
1215
|
+
bool is_tier3 = (cld::kIsPackedTop40[plang] == 0);
|
1216
|
+
if (is_tier3 &&
|
1217
|
+
(bytes < kGoodFirstT3MinBytes) &&
|
1218
|
+
(bytes < total_bytes)) {
|
1219
|
+
reli = 0; // Too-short tier3
|
1220
|
+
}
|
1221
|
+
|
1222
|
+
// Reliable percent is stored as reliable score over stored bytecount
|
1223
|
+
int reliable_percent = reli / bytes;
|
1224
|
+
if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
|
1225
|
+
|
1226
|
+
// Delete unreliable entry
|
1227
|
+
doc_tote->SetKey(sub, 0);
|
1228
|
+
doc_tote->SetValue(sub, 0);
|
1229
|
+
doc_tote->SetReliability(sub, 0);
|
1230
|
+
|
1231
|
+
// Show fate of unreliable languages if at least 10 bytes
|
1232
|
+
if (FLAGS_cld_html /*&& (reliable_percent >= 10)*/ && (bytes >= 10)) {
|
1233
|
+
fprintf(stderr, "{Unreli %s.%d(%dB)} ",
|
1234
|
+
ExtLanguageCode(lang), reliable_percent, bytes);
|
1235
|
+
}
|
1236
|
+
}
|
1237
|
+
}
|
1238
|
+
|
1239
|
+
if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");}
|
1240
|
+
}
|
1241
|
+
|
1242
|
+
|
1243
|
+
// Move less likely byte count to more likely for close pairs of languages
|
1244
|
+
void RefineScoredClosePairs(ToteWithReliability* doc_tote) {
|
1245
|
+
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
|
1246
|
+
int close_packedlang = doc_tote->Key(sub);
|
1247
|
+
int subscr = kClosePair[close_packedlang];
|
1248
|
+
if (subscr == 0) {continue;}
|
1249
|
+
|
1250
|
+
// We have a close pair language -- if the other one is also scored and the
|
1251
|
+
// longword score differs enough, put all our eggs into one basket
|
1252
|
+
|
1253
|
+
// Nonzero longword score: Go look for the other of this pair
|
1254
|
+
for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
|
1255
|
+
if (kClosePair[doc_tote->Key(sub2)] == subscr) {
|
1256
|
+
// We have a matching pair
|
1257
|
+
int close_packedlang2 = doc_tote->Key(sub2);
|
1258
|
+
|
1259
|
+
// Move all the text bytes from lower byte-count to higher one
|
1260
|
+
int from_sub, to_sub;
|
1261
|
+
Language from_lang, to_lang;
|
1262
|
+
if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
|
1263
|
+
from_sub = sub;
|
1264
|
+
to_sub = sub2;
|
1265
|
+
from_lang = cld::UnpackLanguage(close_packedlang);
|
1266
|
+
to_lang = cld::UnpackLanguage(close_packedlang2);
|
1267
|
+
} else {
|
1268
|
+
from_sub = sub2;
|
1269
|
+
to_sub = sub;
|
1270
|
+
from_lang = cld::UnpackLanguage(close_packedlang2);
|
1271
|
+
to_lang = cld::UnpackLanguage(close_packedlang);
|
1272
|
+
}
|
1273
|
+
|
1274
|
+
// Move all the bytes smaller => larger of the pair
|
1275
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
1276
|
+
// Show fate of closepair language
|
1277
|
+
int val = doc_tote->Value(from_sub);
|
1278
|
+
int reli = doc_tote->Reliability(from_sub);
|
1279
|
+
int reliable_percent = reli / (val ? val : 1); // avoid zdiv
|
1280
|
+
fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ",
|
1281
|
+
ExtLanguageCode(from_lang),
|
1282
|
+
reliable_percent,
|
1283
|
+
doc_tote->Value(from_sub),
|
1284
|
+
ExtLanguageCode(to_lang));
|
1285
|
+
}
|
1286
|
+
int sum = doc_tote->Value(to_sub) + doc_tote->Value(from_sub);
|
1287
|
+
doc_tote->SetValue(to_sub, sum);
|
1288
|
+
doc_tote->SetReliability(to_sub, 100 * sum);
|
1289
|
+
|
1290
|
+
// Delete old entry
|
1291
|
+
doc_tote->SetKey(from_sub, 0);
|
1292
|
+
doc_tote->SetValue(from_sub, 0);
|
1293
|
+
doc_tote->SetReliability(from_sub, 0);
|
1294
|
+
|
1295
|
+
break; // Exit inner for sub2 loop
|
1296
|
+
}
|
1297
|
+
} // End for sub2
|
1298
|
+
} // End for sub
|
1299
|
+
}
|
1300
|
+
|
1301
|
+
|
1302
|
+
// Add the per-language hint boost to every slot of chunk_tote.
//
// lang_hint_boost is subscripted by the slot's key (packed language).
// tote_grams is capped at 8: with 8 or more quad/unigrams the full boost is
// added, with fewer the boost is derated linearly (boost * grams / 8).
// With FLAGS_dbgscore set, each nonzero boost is reported on stderr.
void ApplyLanguageHints(Tote* chunk_tote, int tote_grams,
                        uint8* lang_hint_boost) {
  // Need 8 quad/unigrams to give full hint boost, else derate linearly
  const int gram_weight = (tote_grams > 8) ? 8 : tote_grams;

  for (int slot = 0; slot < chunk_tote->MaxSize(); ++slot) {
    // Hint boosts are per packed subscript
    const int packed = chunk_tote->Key(slot);
    const int boost = lang_hint_boost[packed];

    chunk_tote->SetValue(slot,
                         chunk_tote->Value(slot) + ((boost * gram_weight) >> 3));

    if (FLAGS_dbgscore && (boost > 0)) {
      fprintf(stderr, "[%s+=%d*%d/8] ",
              ExtLanguageCode(cld::UnpackLanguage(packed)),
              boost, gram_weight);
    }
  }
}
|
1321
|
+
|
1322
|
+
|
1323
|
+
// Write the first len bytes of txt to f with the five HTML-significant
// characters replaced by entity references, then write "<br>\n".
//
//   <  =>  &lt;      >  =>  &gt;      &  =>  &amp;
//   '  =>  &apos;    "  =>  &quot;
//
// All other bytes are written unchanged. txt need not be NUL-terminated;
// exactly len bytes are read.
void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
  for (int i = 0; i < len; ++i) {
    const char c = txt[i];
    switch (c) {
      case '<':  fputs("&lt;", f);   break;
      case '>':  fputs("&gt;", f);   break;
      case '&':  fputs("&amp;", f);  break;
      case '\'': fputs("&apos;", f); break;
      case '"':  fputs("&quot;", f); break;
      default:   fputc(c, f);        break;
    }
  }
  // Deliberate markup (not escaped): line break after the text
  fputs("<br>\n", f);
}
|
1342
|
+
|
1343
|
+
|
1344
|
+
// Add one chunk's score to running document score
|
1345
|
+
// If the top language is UNKNOWN_LANGUAGE, score nothing. This is used to
|
1346
|
+
// positively identify text to be ignored, such as link farms.
|
1347
|
+
// Sort before scoring and reinit afterward
|
1348
|
+
//
|
1349
|
+
// src and srclen are just for debug output
|
1350
|
+
void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by,
|
1351
|
+
UnicodeLScript lscript,
|
1352
|
+
Tote* chunk_tote,
|
1353
|
+
ToteWithReliability* doc_tote,
|
1354
|
+
int tote_grams,
|
1355
|
+
uint8* lang_hint_boost) {
|
1356
|
+
// Apply hints before sorting
|
1357
|
+
if (lang_hint_boost) {
|
1358
|
+
ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost);
|
1359
|
+
}
|
1360
|
+
|
1361
|
+
// Sort to get top two languages
|
1362
|
+
chunk_tote->Sort(2);
|
1363
|
+
Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0));
|
1364
|
+
|
1365
|
+
// Return if empty
|
1366
|
+
if (cur_lang < 0) {
|
1367
|
+
chunk_tote->Reinit();
|
1368
|
+
return;
|
1369
|
+
}
|
1370
|
+
|
1371
|
+
bool cur_unreliable = false;
|
1372
|
+
|
1373
|
+
// Reliability is a function of mean script score per KB of text
|
1374
|
+
int len = chunk_tote->GetByteCount();
|
1375
|
+
int reliability = cld::GetReliability((len * 2) / advance_by,
|
1376
|
+
lscript,
|
1377
|
+
chunk_tote);
|
1378
|
+
cur_unreliable = (reliability < cld::kMinReliable);
|
1379
|
+
|
1380
|
+
// If tote_grams=0, always reliable
|
1381
|
+
// If tote_grams=1, always unreliable
|
1382
|
+
if (tote_grams == 0) {
|
1383
|
+
reliability = 100;
|
1384
|
+
cur_unreliable = false;
|
1385
|
+
} else if (tote_grams == 1) {
|
1386
|
+
reliability = 0;
|
1387
|
+
cur_unreliable = true;
|
1388
|
+
}
|
1389
|
+
|
1390
|
+
#if 0
|
1391
|
+
// TEMP
|
1392
|
+
if (FLAGS_cld_html) {
|
1393
|
+
if (reliability >= kMinReliableKeepPercent) {
|
1394
|
+
fprintf(stderr, "R%d%% ", reliability);
|
1395
|
+
} else {
|
1396
|
+
fprintf(stderr, "--R%d%% ", reliability);
|
1397
|
+
}
|
1398
|
+
}
|
1399
|
+
#endif
|
1400
|
+
|
1401
|
+
// Track the sequence of language fragments [result currently unused]
|
1402
|
+
////if (reliability >= kMinReliableSeq) {
|
1403
|
+
//// doc_tote->AddSeq(chunk_tote->Key(0));
|
1404
|
+
////}
|
1405
|
+
|
1406
|
+
if (cur_unreliable && (chunk_tote->Key(1) != 0)) {
|
1407
|
+
// Unreliable and two top contenders, split byte count 5/8 - 3/8
|
1408
|
+
int top_len = ((len * 5) + 4) >> 3;
|
1409
|
+
int second_len = len - top_len;
|
1410
|
+
|
1411
|
+
doc_tote->Add(chunk_tote->Key(0),
|
1412
|
+
top_len, chunk_tote->Value(0), reliability);
|
1413
|
+
doc_tote->Add(chunk_tote->Key(1),
|
1414
|
+
second_len, chunk_tote->Value(1), reliability);
|
1415
|
+
if (FLAGS_dbgscore) {
|
1416
|
+
fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ",
|
1417
|
+
ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
|
1418
|
+
chunk_tote->Value(0),
|
1419
|
+
reliability,
|
1420
|
+
top_len,
|
1421
|
+
ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))),
|
1422
|
+
chunk_tote->Value(1),
|
1423
|
+
reliability,
|
1424
|
+
second_len);
|
1425
|
+
}
|
1426
|
+
} else {
|
1427
|
+
// Reliable or single contender
|
1428
|
+
doc_tote->Add(chunk_tote->Key(0),
|
1429
|
+
len, chunk_tote->Value(0), reliability);
|
1430
|
+
if (FLAGS_dbgscore) {
|
1431
|
+
fprintf(stderr, "{+%s.%d.%dR(%dB)} ",
|
1432
|
+
ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
|
1433
|
+
chunk_tote->Value(0),
|
1434
|
+
reliability,
|
1435
|
+
len);
|
1436
|
+
}
|
1437
|
+
}
|
1438
|
+
|
1439
|
+
if (FLAGS_cld_html) {
|
1440
|
+
if (cur_lang < 0) {cur_lang = UNKNOWN_LANGUAGE;}
|
1441
|
+
cld::PrintLang(stderr, chunk_tote,
|
1442
|
+
cur_lang, cur_unreliable,
|
1443
|
+
prior_lang, prior_unreliable);
|
1444
|
+
prior_lang = cur_lang;
|
1445
|
+
prior_unreliable = cur_unreliable;
|
1446
|
+
|
1447
|
+
string temp(src, srclen);
|
1448
|
+
if (temp[0] == '=') {
|
1449
|
+
// Rewrite =ScriptX= or =SwitchX= as =Xxxx= for script code Xxxx
|
1450
|
+
temp = "=Buffered_";
|
1451
|
+
temp.append(UnicodeLScriptCode(lscript));
|
1452
|
+
temp.append("=");
|
1453
|
+
}
|
1454
|
+
cld::PrintText(stderr, cur_lang, temp);
|
1455
|
+
}
|
1456
|
+
|
1457
|
+
chunk_tote->Reinit();
|
1458
|
+
}
|
1459
|
+
|
1460
|
+
|
1461
|
+
// Debug output: write the top language to stderr as "[name] ", or as "[] "
// when it repeats the previously printed language and is not
// UNKNOWN_LANGUAGE. Printing a name also records it in prior_lang.
void PrintTopLang(Language top_lang) {
  const bool repeats_prior =
      (top_lang != UNKNOWN_LANGUAGE) && (top_lang == prior_lang);
  if (repeats_prior) {
    fprintf(stderr, "[] ");
    return;
  }
  fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
  prior_lang = top_lang;
}
|
1469
|
+
|
1470
|
+
// Debug output: same as PrintTopLang, but wrapped in a grey (#A0A0A0)
// <span> so a speculative, not-yet-scored language stands out in the HTML
// trace. prior_lang is updated exactly as PrintTopLang updates it.
void PrintTopLangSpeculative(Language top_lang) {
  fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
  PrintTopLang(top_lang);     // "[] " or "[name] ", plus prior_lang update
  fprintf(stderr, "</span>\n");
}
|
1480
|
+
|
1481
|
+
|
1482
|
+
// Add one chunk's score to running document score
|
1483
|
+
// Convenience function with constant src text
|
1484
|
+
void ScoreChunkIntoDoc2(const char* src, int advance_by,
|
1485
|
+
UnicodeLScript lscript,
|
1486
|
+
Tote* chunk_tote,
|
1487
|
+
ToteWithReliability* doc_tote,
|
1488
|
+
int tote_grams,
|
1489
|
+
uint8* lang_hint_boost) {
|
1490
|
+
int srclen = static_cast<int>(strlen(src));
|
1491
|
+
ScoreChunkIntoDoc(src, srclen, advance_by, lscript, chunk_tote,
|
1492
|
+
doc_tote, tote_grams, lang_hint_boost);
|
1493
|
+
}
|
1494
|
+
|
1495
|
+
|
1496
|
+
// Score one scriptspan using the only language for that script
|
1497
|
+
void ScoreNilgrams(getone::LangSpan* scriptspan, int lang,
|
1498
|
+
ToteWithReliability* doc_tote,
|
1499
|
+
uint8* lang_hint_boost,
|
1500
|
+
int flags, Language plus_one) {
|
1501
|
+
// For debugging only. Not thread-safe
|
1502
|
+
prior_lang = UNKNOWN_LANGUAGE;
|
1503
|
+
prior_unreliable = false;
|
1504
|
+
|
1505
|
+
const char* src = scriptspan->text;
|
1506
|
+
int len = scriptspan->text_bytes;
|
1507
|
+
|
1508
|
+
Tote chunk_tote;
|
1509
|
+
// Score 1000 for 1000 bytes
|
1510
|
+
chunk_tote.AddGram();
|
1511
|
+
chunk_tote.Add(lang, scriptspan->text_bytes);
|
1512
|
+
chunk_tote.AddBytes(scriptspan->text_bytes);
|
1513
|
+
int advance_by = 2;
|
1514
|
+
int tote_grams = 0; // Indicates fully reliable
|
1515
|
+
ScoreChunkIntoDoc(src, len, advance_by,
|
1516
|
+
scriptspan->script, &chunk_tote,
|
1517
|
+
doc_tote, tote_grams, lang_hint_boost);
|
1518
|
+
}
|
1519
|
+
|
1520
|
+
// Score one scriptspan using unigrams
|
1521
|
+
// Updates tote_grams
|
1522
|
+
static void ScoreUnigrams(const UTF8PropObj* unigram_obj,
|
1523
|
+
getone::LangSpan* scriptspan,
|
1524
|
+
int* tote_grams, int gram_limit,
|
1525
|
+
Tote* chunk_tote,
|
1526
|
+
ToteWithReliability* doc_tote,
|
1527
|
+
uint8* lang_hint_boost,
|
1528
|
+
int advance_by, int flags,
|
1529
|
+
int* initial_word_span, Language plus_one) {
|
1530
|
+
// chunk_tote may have partial sum coming in
|
1531
|
+
const char* src = scriptspan->text;
|
1532
|
+
const char* srclimit = src + scriptspan->text_bytes;
|
1533
|
+
|
1534
|
+
// For debugging only. Not thread-safe
|
1535
|
+
prior_lang = UNKNOWN_LANGUAGE;
|
1536
|
+
prior_unreliable = false;
|
1537
|
+
|
1538
|
+
// Break text up into multiple chunks and score each
|
1539
|
+
while (src < srclimit) {
|
1540
|
+
// Updates tote_grams
|
1541
|
+
int len = cld::DoUniScoreV3(unigram_obj,
|
1542
|
+
src, srclimit - src, advance_by,
|
1543
|
+
tote_grams, gram_limit, chunk_tote);
|
1544
|
+
if (FlagUseWords(flags) || (*initial_word_span > 0)) {
|
1545
|
+
// Use bigram scoring in addition to quadgrams
|
1546
|
+
cld::DoBigramScoreV3(&kCjkBiTable_obj,
|
1547
|
+
src, len, chunk_tote);
|
1548
|
+
}
|
1549
|
+
chunk_tote->AddBytes(len);
|
1550
|
+
*initial_word_span -= len;
|
1551
|
+
|
1552
|
+
if (*tote_grams >= gram_limit) {
|
1553
|
+
// Add this chunk to doc totals
|
1554
|
+
// Remove all but top40 if asked
|
1555
|
+
if (FlagTop40(flags)) {
|
1556
|
+
cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
|
1557
|
+
}
|
1558
|
+
|
1559
|
+
// Sort, accumulate into doc total, reinit
|
1560
|
+
ScoreChunkIntoDoc(src, len, advance_by,
|
1561
|
+
scriptspan->script, chunk_tote,
|
1562
|
+
doc_tote, *tote_grams, lang_hint_boost);
|
1563
|
+
*tote_grams = 0;
|
1564
|
+
} else {
|
1565
|
+
if (FLAGS_cld_html) {
|
1566
|
+
string temp(src, len);
|
1567
|
+
Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
|
1568
|
+
PrintTopLangSpeculative(top_lang);
|
1569
|
+
cld::PrintText(stderr, top_lang, temp);
|
1570
|
+
}
|
1571
|
+
}
|
1572
|
+
src += len;
|
1573
|
+
}
|
1574
|
+
// chunk_tote may have partial sum going out
|
1575
|
+
}
|
1576
|
+
|
1577
|
+
// Back up one UTF-8 character
|
1578
|
+
const uint8* BackOneUTF8(const uint8* p) {
|
1579
|
+
const uint8* retval = p - 1;
|
1580
|
+
if ((*retval & 0xc0) == 0x80) {--retval;}
|
1581
|
+
if ((*retval & 0xc0) == 0x80) {--retval;}
|
1582
|
+
if ((*retval & 0xc0) == 0x80) {--retval;}
|
1583
|
+
return retval;
|
1584
|
+
}
|
1585
|
+
|
1586
|
+
|
1587
|
+
// Score one scriptspan using quadgrams
|
1588
|
+
// Incoming chunk_tote may have partial accumulation
|
1589
|
+
static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj,
|
1590
|
+
getone::LangSpan* scriptspan,
|
1591
|
+
int* tote_grams, int gram_limit,
|
1592
|
+
Tote* chunk_tote,
|
1593
|
+
ToteWithReliability* doc_tote,
|
1594
|
+
uint8* lang_hint_boost,
|
1595
|
+
int advance_by, int flags,
|
1596
|
+
int* initial_word_span, Language plus_one) {
|
1597
|
+
// chunk_tote may have partial sum coming in
|
1598
|
+
const char* src = scriptspan->text;
|
1599
|
+
const char* srclimit = src + scriptspan->text_bytes;
|
1600
|
+
const char* lastscored_src = src;
|
1601
|
+
|
1602
|
+
// For debugging only. Not thread-safe
|
1603
|
+
prior_lang = UNKNOWN_LANGUAGE;
|
1604
|
+
prior_unreliable = false;
|
1605
|
+
|
1606
|
+
// Break text up into multiple chunks and score each
|
1607
|
+
while (src < srclimit) {
|
1608
|
+
// Updates tote_grams
|
1609
|
+
int len = cld::DoQuadScoreV3(quadgram_obj,
|
1610
|
+
src, srclimit - src, advance_by,
|
1611
|
+
tote_grams, gram_limit, chunk_tote);
|
1612
|
+
if (FlagUseWords(flags) || (*initial_word_span > 0)) {
|
1613
|
+
// Use word scoring in addition to quadgrams
|
1614
|
+
cld::DoOctaScoreV3(&kLongWord8Table_obj,
|
1615
|
+
src, len, chunk_tote);
|
1616
|
+
}
|
1617
|
+
chunk_tote->AddBytes(len);
|
1618
|
+
*initial_word_span -= len;
|
1619
|
+
|
1620
|
+
if (*tote_grams >= gram_limit) {
|
1621
|
+
// Remove all but top40 if asked
|
1622
|
+
if (FlagTop40(flags)) {
|
1623
|
+
cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
|
1624
|
+
}
|
1625
|
+
|
1626
|
+
// Sort, accumulate into doc total, reinit
|
1627
|
+
ScoreChunkIntoDoc(src, len, advance_by,
|
1628
|
+
scriptspan->script, chunk_tote,
|
1629
|
+
doc_tote, *tote_grams, lang_hint_boost);
|
1630
|
+
lastscored_src = src + len;
|
1631
|
+
*tote_grams = 0;
|
1632
|
+
} else {
|
1633
|
+
if (FLAGS_cld_html) {
|
1634
|
+
string temp(src, len);
|
1635
|
+
Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
|
1636
|
+
PrintTopLangSpeculative(top_lang);
|
1637
|
+
cld::PrintText(stderr, top_lang, temp);
|
1638
|
+
}
|
1639
|
+
}
|
1640
|
+
src += len;
|
1641
|
+
}
|
1642
|
+
}
|
1643
|
+
|
1644
|
+
|
1645
|
+
|
1646
|
+
void PrintLangs(FILE* f, const Language* language3, const int* percent3,
|
1647
|
+
const int* text_bytes, const bool* is_reliable) {
|
1648
|
+
fprintf(f, "<br> Initial_Languages ");
|
1649
|
+
if (language3[0] != UNKNOWN_LANGUAGE) {
|
1650
|
+
fprintf(f, "%s%s(%d%%) ",
|
1651
|
+
ExtLanguageName(language3[0]),
|
1652
|
+
*is_reliable ? "" : "*",
|
1653
|
+
percent3[0]);
|
1654
|
+
}
|
1655
|
+
if (language3[1] != UNKNOWN_LANGUAGE) {
|
1656
|
+
fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[1]), percent3[1]);
|
1657
|
+
}
|
1658
|
+
if (language3[2] != UNKNOWN_LANGUAGE) {
|
1659
|
+
fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[2]), percent3[2]);
|
1660
|
+
}
|
1661
|
+
fprintf(f, "%d bytes \n", *text_bytes);
|
1662
|
+
|
1663
|
+
fprintf(f, "<br>\n");
|
1664
|
+
}
|
1665
|
+
|
1666
|
+
|
1667
|
+
// Start the tote with a count of one for the default language for script
|
1668
|
+
void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) {
|
1669
|
+
Language defaultlang = cld::kDefaultLanguagePerLScript[lscript];
|
1670
|
+
script_tote->Add(cld::PackLanguage(defaultlang), 1);
|
1671
|
+
script_tote->AddBytes(1);
|
1672
|
+
#if 0
|
1673
|
+
if (FLAGS_cld_html) {
|
1674
|
+
cld::PrintLang(stderr, script_tote,
|
1675
|
+
defaultlang, false,
|
1676
|
+
UNKNOWN_LANGUAGE, false);
|
1677
|
+
prior_lang = cur_lang;
|
1678
|
+
string temp("+1");
|
1679
|
+
cld::PrintText(stderr, defaultlang, temp);
|
1680
|
+
}
|
1681
|
+
#endif
|
1682
|
+
}
|
1683
|
+
|
1684
|
+
// Debug labels for the four script totes. The two tables differ only in
// the last two entries ("Script" vs "Switch").
static const char* const kToteName[4] = {
  "=Latn=",
  "=Hani=",
  "=Script2=",
  "=Script3=",
};
static const char* const kToteSwitch[4] = {
  "=Latn=",
  "=Hani=",
  "=Switch2=",
  "=Switch3=",
};
|
1688
|
+
|
1689
|
+
|
1690
|
+
|
1691
|
+
// Upper to lower, keep digits, everything else to minus '-' (2d)
// Subscripted by unsigned byte value.
static const char kCharsetToLowerTbl[256] = {
  // 0x00-0x2f
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  // 0x30-0x3f: digits map to themselves
  0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  // 0x40-0x5f: uppercase letters map to lowercase
  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
  // 0x60-0x7f: lowercase letters map to themselves
  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,

  // 0x80-0xbf
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  // 0xc0-0xff
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
};


// 1 for ASCII letters A-Z and a-z, else 0. Subscripted by unsigned byte.
static const char kIsAlpha[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// 1 for ASCII digits 0-9, else 0. Subscripted by unsigned byte.
static const char kIsDigit[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// Normalize ASCII string to first 4 alphabetic/digit chars
// Letters are forced to lowercase ASCII
// Used to normalize TLD values
//
// str   NUL-terminated input; non-alphanumeric bytes are skipped.
// norm  output buffer of at least 4 bytes. Exactly 4 bytes are written and
//       no NUL terminator is added. Unused trailing positions hold '_'.
//
// The scan stops at the terminating NUL or as soon as four characters have
// been stored, whichever comes first.
void MakeChar4(const char* str, char* norm) {
  memcpy(norm, "____", 4);     // four underscores
  int filled = 0;
  for (const char* p = str; (*p != '\0') && (filled < 4); ++p) {
    const unsigned char uc = static_cast<unsigned char>(*p);
    if (kIsAlpha[uc] | kIsDigit[uc]) {
      norm[filled] = kCharsetToLowerTbl[uc];
      ++filled;
    }
  }
}
|
1755
|
+
|
1756
|
+
// Find subscript of matching key in first 4 bytes of sorted hint array, or -1
|
1757
|
+
static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
|
1758
|
+
const char* norm_key) {
|
1759
|
+
// Key is always in range [lo..hi)
|
1760
|
+
int lo = 0;
|
1761
|
+
int hi = hintprobssize;
|
1762
|
+
while (lo < hi) {
|
1763
|
+
int mid = (lo + hi) >> 1;
|
1764
|
+
int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4);
|
1765
|
+
if (comp < 0) {
|
1766
|
+
lo = mid + 1;
|
1767
|
+
} else if (comp > 0) {
|
1768
|
+
hi = mid;
|
1769
|
+
} else {
|
1770
|
+
return mid;
|
1771
|
+
}
|
1772
|
+
}
|
1773
|
+
return -1;
|
1774
|
+
}
|
1775
|
+
|
1776
|
+
|
1777
|
+
// Increment the initial probabilities based on a per-TLD probs entry
|
1778
|
+
void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) {
|
1779
|
+
if (FLAGS_dbgscore) {
|
1780
|
+
fprintf(stderr, "TLD hint %s\n", tld_hint);
|
1781
|
+
}
|
1782
|
+
char normalized_tld[8];
|
1783
|
+
MakeChar4(tld_hint, normalized_tld);
|
1784
|
+
int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
|
1785
|
+
normalized_tld);
|
1786
|
+
// TLD is four bytes, probability entry is 4 bytes
|
1787
|
+
if (n >= 0) {
|
1788
|
+
uint32 probs = kTLDHintProbs[n].probs;
|
1789
|
+
|
1790
|
+
uint8 prob123 = (probs >> 0) & 0xff;
|
1791
|
+
const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
|
1792
|
+
uint8 top1 = (probs >> 8) & 0xff;
|
1793
|
+
if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
|
1794
|
+
uint8 top2 = (probs >> 16) & 0xff;
|
1795
|
+
if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
|
1796
|
+
uint8 top3 = (probs >> 24) & 0xff;
|
1797
|
+
if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
|
1798
|
+
}
|
1799
|
+
}
|
1800
|
+
|
1801
|
+
|
1802
|
+
// Increment the initial probabilities based on a per-encoding probs entry
|
1803
|
+
void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) {
|
1804
|
+
if (FLAGS_dbgscore) {
|
1805
|
+
Encoding tempenc = static_cast<Encoding>(encoding_hint);
|
1806
|
+
fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc));
|
1807
|
+
}
|
1808
|
+
if (encoding_hint < ISO_8859_1) {return;}
|
1809
|
+
if (encoding_hint >= NUM_ENCODINGS) {return;}
|
1810
|
+
uint32 probs = kEncodingHintProbs[encoding_hint];
|
1811
|
+
|
1812
|
+
uint8 prob123 = (probs >> 0) & 0xff;
|
1813
|
+
const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
|
1814
|
+
uint8 top1 = (probs >> 8) & 0xff;
|
1815
|
+
if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
|
1816
|
+
uint8 top2 = (probs >> 16) & 0xff;
|
1817
|
+
if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
|
1818
|
+
uint8 top3 = (probs >> 24) & 0xff;
|
1819
|
+
if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
|
1820
|
+
}
|
1821
|
+
|
1822
|
+
|
1823
|
+
// Increment the initial probability for given language by fixed amount
|
1824
|
+
// Does not recognize extended languages as hints
|
1825
|
+
void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) {
|
1826
|
+
if (FLAGS_dbgscore) {
|
1827
|
+
fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint));
|
1828
|
+
}
|
1829
|
+
if (language_hint < ENGLISH) {return;}
|
1830
|
+
if (language_hint >= NUM_LANGUAGES) {return;}
|
1831
|
+
uint32 probs = kLanguageHintProbs[language_hint];
|
1832
|
+
|
1833
|
+
uint8 prob123 = (probs >> 0) & 0xff;
|
1834
|
+
const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
|
1835
|
+
uint8 top1 = (probs >> 8) & 0xff;
|
1836
|
+
if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
|
1837
|
+
uint8 top2 = (probs >> 16) & 0xff;
|
1838
|
+
if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
|
1839
|
+
uint8 top3 = (probs >> 24) & 0xff;
|
1840
|
+
if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
|
1841
|
+
}
|
1842
|
+
|
1843
|
+
// Extract return values before fixups
|
1844
|
+
void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes,
|
1845
|
+
int* reliable_percent3, Language* language3, int* percent3,
|
1846
|
+
double* normalized_score3,
|
1847
|
+
int* text_bytes, bool* is_reliable) {
|
1848
|
+
reliable_percent3[0] = 0;
|
1849
|
+
reliable_percent3[1] = 0;
|
1850
|
+
reliable_percent3[2] = 0;
|
1851
|
+
language3[0] = UNKNOWN_LANGUAGE;
|
1852
|
+
language3[1] = UNKNOWN_LANGUAGE;
|
1853
|
+
language3[2] = UNKNOWN_LANGUAGE;
|
1854
|
+
percent3[0] = 100;
|
1855
|
+
percent3[1] = 0;
|
1856
|
+
percent3[2] = 0;
|
1857
|
+
normalized_score3[0] = 0.0;
|
1858
|
+
normalized_score3[1] = 0.0;
|
1859
|
+
normalized_score3[2] = 0.0;
|
1860
|
+
|
1861
|
+
*text_bytes = total_text_bytes;
|
1862
|
+
*is_reliable = false;
|
1863
|
+
|
1864
|
+
int bytecount1 = total_text_bytes;
|
1865
|
+
int bytecount2 = 0;
|
1866
|
+
int bytecount3 = 0;
|
1867
|
+
|
1868
|
+
int lang1 = doc_tote->Key(0);
|
1869
|
+
if (lang1 != 0) {
|
1870
|
+
// We have a top language
|
1871
|
+
language3[0] = cld::UnpackLanguage(lang1);
|
1872
|
+
bytecount1 = doc_tote->Value(0);
|
1873
|
+
int reli1 = doc_tote->Reliability(0);
|
1874
|
+
reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv
|
1875
|
+
normalized_score3[0] = cld::GetNormalizedScore(language3[0],
|
1876
|
+
ULScript_Common,
|
1877
|
+
bytecount1,
|
1878
|
+
doc_tote->Score(0));
|
1879
|
+
}
|
1880
|
+
|
1881
|
+
int lang2 = doc_tote->Key(1);
|
1882
|
+
if (lang2 != 0) {
|
1883
|
+
language3[1] = cld::UnpackLanguage(lang2);
|
1884
|
+
bytecount2 = doc_tote->Value(1);
|
1885
|
+
int reli2 = doc_tote->Reliability(1);
|
1886
|
+
reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv
|
1887
|
+
normalized_score3[1] = cld::GetNormalizedScore(language3[1],
|
1888
|
+
ULScript_Common,
|
1889
|
+
bytecount2,
|
1890
|
+
doc_tote->Score(1));
|
1891
|
+
}
|
1892
|
+
|
1893
|
+
int lang3 = doc_tote->Key(2);
|
1894
|
+
if (lang3 != 0) {
|
1895
|
+
language3[2] = cld::UnpackLanguage(lang3);
|
1896
|
+
bytecount3 = doc_tote->Value(2);
|
1897
|
+
int reli3 = doc_tote->Reliability(2);
|
1898
|
+
reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv
|
1899
|
+
normalized_score3[2] = cld::GetNormalizedScore(language3[2],
|
1900
|
+
ULScript_Common,
|
1901
|
+
bytecount3,
|
1902
|
+
doc_tote->Score(2));
|
1903
|
+
}
|
1904
|
+
|
1905
|
+
// Increase total bytes to sum (top 3) if low for some reason
|
1906
|
+
int total_bytecount12 = bytecount1 + bytecount2;
|
1907
|
+
int total_bytecount123 = total_bytecount12 + bytecount3;
|
1908
|
+
if (total_text_bytes < total_bytecount123) {
|
1909
|
+
total_text_bytes = total_bytecount123;
|
1910
|
+
*text_bytes = total_text_bytes;
|
1911
|
+
}
|
1912
|
+
|
1913
|
+
// Sum minus previous % gives better roundoff behavior than bytecount/total
|
1914
|
+
int total_text_bytes_div = cld::maxint(1, total_text_bytes); // Avoid zdiv
|
1915
|
+
percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
|
1916
|
+
percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
|
1917
|
+
percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
|
1918
|
+
percent3[2] -= percent3[1];
|
1919
|
+
percent3[1] -= percent3[0];
|
1920
|
+
|
1921
|
+
// Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
|
1922
|
+
// Fix this explicitly
|
1923
|
+
if (percent3[1] < percent3[2]) {
|
1924
|
+
++percent3[1];
|
1925
|
+
--percent3[2];
|
1926
|
+
}
|
1927
|
+
if (percent3[0] < percent3[1]) {
|
1928
|
+
++percent3[0];
|
1929
|
+
--percent3[1];
|
1930
|
+
}
|
1931
|
+
|
1932
|
+
*text_bytes = total_text_bytes;
|
1933
|
+
|
1934
|
+
if (lang1 != 0) {
|
1935
|
+
// We have a top language
|
1936
|
+
// Its reliability is overal result reliability
|
1937
|
+
int bytecount = doc_tote->Value(0);
|
1938
|
+
int reli = doc_tote->Reliability(0);
|
1939
|
+
int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv
|
1940
|
+
*is_reliable = reliable_percent >= cld::kMinReliable;
|
1941
|
+
} else {
|
1942
|
+
// No top language at all. This can happen with zero text or 100% Klingon
|
1943
|
+
// if extended=false. Just return all UNKNOWN_LANGUAGE, reliable.
|
1944
|
+
*is_reliable = true;
|
1945
|
+
}
|
1946
|
+
}
|
1947
|
+
|
1948
|
+
// True if lang is one of French, Italian, German, Spanish (FIGS).
bool IsFIGS(Language lang) {
  return (lang == FRENCH) ||
         (lang == ITALIAN) ||
         (lang == GERMAN) ||
         (lang == SPANISH);
}
|
1955
|
+
|
1956
|
+
// True if lang is English or one of French, Italian, German, Spanish (EFIGS).
bool IsEFIGS(Language lang) {
  return (lang == ENGLISH) ||
         (lang == FRENCH) ||
         (lang == ITALIAN) ||
         (lang == GERMAN) ||
         (lang == SPANISH);
}
|
1964
|
+
|
1965
|
+
// Thresholds consulted by CalcSummaryLang when choosing the summary language.

// Minimum percentage the second language needs before first-place English
// (resp. first-place French/Italian/German/Spanish) is treated as boilerplate.
static const int kNonEnBoilerplateMinPercent = 17;      // <this => no second
static const int kNonFIGSBoilerplateMinPercent = 20;    // <this => no second

// Limits on the percentage of the language finally returned
static const int kGoodFirstMinPercent = 26;             // <this => UNK
static const int kGoodFirstReliableMinPercent = 51;     // <this => unreli
static const int kKeepMinPercent = 2;                   // <this => unreli

// Limit on the percentage of text that was logically ignored
static const int kIgnoreMaxPercent = 95;                // >this => unreli

// For Tier3 languages, require more bytes of text to override
// the first-place language
static const int kGoodSecondT1T2MinBytes = 15;          // <this => no second
static const int kGoodSecondT3MinBytes = 128;           // <this => no second
|
1976
|
+
//
|
1977
|
+
|
1978
|
+
// Calculate a single summary language for the document, and its reliability.
|
1979
|
+
// Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
|
1980
|
+
// This is the heart of matching human-rater perception.
|
1981
|
+
// reliable_percent3[] is currently unused
|
1982
|
+
//
|
1983
|
+
// Do not return Tier3 second language unless there are at least 128 bytes
|
1984
|
+
void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes,
|
1985
|
+
const int* reliable_percent3,
|
1986
|
+
const Language* language3,
|
1987
|
+
const int* percent3,
|
1988
|
+
Language* summary_lang, bool* is_reliable) {
|
1989
|
+
// Vector of active languages; changes if we delete some
|
1990
|
+
int slot_count = 3;
|
1991
|
+
int active_slot[3] = {0, 1, 2};
|
1992
|
+
|
1993
|
+
int ignore_percent = 0;
|
1994
|
+
int return_percent = percent3[0]; // Default to top lang
|
1995
|
+
*summary_lang = language3[0];
|
1996
|
+
*is_reliable = true;
|
1997
|
+
if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
|
1998
|
+
|
1999
|
+
// If any of top 3 is IGNORE, remove it and increment ignore_percent
|
2000
|
+
for (int i = 0; i < 3; ++i) {
|
2001
|
+
if (language3[i] == TG_UNKNOWN_LANGUAGE) {
|
2002
|
+
ignore_percent += percent3[i];
|
2003
|
+
// Move the rest up, levaing input vectors unchanged
|
2004
|
+
for (int j=i+1; j < 3; ++j) {
|
2005
|
+
active_slot[j - 1] = active_slot[j];
|
2006
|
+
}
|
2007
|
+
-- slot_count;
|
2008
|
+
// Logically remove Ignore from percentage-text calculation
|
2009
|
+
// (extra 1 in 101 avoids zdiv, biases slightly small)
|
2010
|
+
return_percent = (percent3[0] * 100) / (101 - ignore_percent);
|
2011
|
+
*summary_lang = language3[active_slot[0]];
|
2012
|
+
if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
|
2013
|
+
}
|
2014
|
+
}
|
2015
|
+
|
2016
|
+
|
2017
|
+
// If English and X, where X (not UNK) is big enough,
|
2018
|
+
// assume the English is boilerplate and return X.
|
2019
|
+
// Logically remove English from percentage-text calculation
|
2020
|
+
int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
|
2021
|
+
// Require more bytes of text for Tier3 languages
|
2022
|
+
int minbytesneeded = kGoodSecondT1T2MinBytes;
|
2023
|
+
int plang_second = cld::PackLanguage(language3[active_slot[1]]);
|
2024
|
+
bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0);
|
2025
|
+
if (is_tier3) {
|
2026
|
+
minbytesneeded = kGoodSecondT3MinBytes;
|
2027
|
+
}
|
2028
|
+
|
2029
|
+
if ((language3[active_slot[0]] == ENGLISH) &&
|
2030
|
+
(language3[active_slot[1]] != ENGLISH) &&
|
2031
|
+
(language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
|
2032
|
+
(percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
|
2033
|
+
(second_bytes >= minbytesneeded)) {
|
2034
|
+
ignore_percent += percent3[active_slot[0]];
|
2035
|
+
return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
|
2036
|
+
*summary_lang = language3[active_slot[1]];
|
2037
|
+
if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
|
2038
|
+
|
2039
|
+
// Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
|
2040
|
+
// assume the FIGS is boilerplate and return X.
|
2041
|
+
// Logically remove FIGS from percentage-text calculation
|
2042
|
+
} else if (IsFIGS(language3[active_slot[0]]) &&
|
2043
|
+
!IsEFIGS(language3[active_slot[1]]) &&
|
2044
|
+
(language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
|
2045
|
+
(percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
|
2046
|
+
(second_bytes >= minbytesneeded)) {
|
2047
|
+
ignore_percent += percent3[active_slot[0]];
|
2048
|
+
return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
|
2049
|
+
*summary_lang = language3[active_slot[1]];
|
2050
|
+
if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
|
2051
|
+
|
2052
|
+
// Else we are returning the first language, but want to improve its
|
2053
|
+
// return_percent if the second language should be ignored
|
2054
|
+
} else if ((language3[active_slot[1]] == ENGLISH) &&
|
2055
|
+
(language3[active_slot[0]] != ENGLISH)) {
|
2056
|
+
ignore_percent += percent3[active_slot[1]];
|
2057
|
+
return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
|
2058
|
+
} else if (IsFIGS(language3[active_slot[1]]) &&
|
2059
|
+
!IsEFIGS(language3[active_slot[0]])) {
|
2060
|
+
ignore_percent += percent3[active_slot[1]];
|
2061
|
+
return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
|
2062
|
+
}
|
2063
|
+
|
2064
|
+
// If return percent is too small (too many languages), return UNKNOWN
|
2065
|
+
if ((return_percent < kGoodFirstMinPercent)) {
|
2066
|
+
*summary_lang = UNKNOWN_LANGUAGE;
|
2067
|
+
*is_reliable = false;
|
2068
|
+
}
|
2069
|
+
|
2070
|
+
// If return percent is small, return language but set unreliable.
|
2071
|
+
if ((return_percent < kGoodFirstReliableMinPercent)) {
|
2072
|
+
*is_reliable = false;
|
2073
|
+
}
|
2074
|
+
|
2075
|
+
// If ignore percent is too large, set unreliable.
|
2076
|
+
if ((ignore_percent > kIgnoreMaxPercent)) {
|
2077
|
+
*is_reliable = false;
|
2078
|
+
}
|
2079
|
+
|
2080
|
+
// If we removed all the active languages, return UNKNOWN
|
2081
|
+
if (slot_count == 0) {
|
2082
|
+
*summary_lang = UNKNOWN_LANGUAGE;
|
2083
|
+
*is_reliable = false;
|
2084
|
+
}
|
2085
|
+
}
|
2086
|
+
|
2087
|
+
|
2088
|
+
|
2089
|
+
// Result vector must be exactly three items
|
2090
|
+
Language CompactLangDetImpl::DetectLanguageSummaryV25(
|
2091
|
+
const CompactLangDet::DetectionTables* tables,
|
2092
|
+
const char* buffer,
|
2093
|
+
int buffer_length,
|
2094
|
+
bool is_plain_text,
|
2095
|
+
bool do_pick_summary_language,
|
2096
|
+
bool do_remove_weak_matches,
|
2097
|
+
const char* tld_hint, // "id" boosts Indonesian
|
2098
|
+
int encoding_hint, // SJS boosts Japanese
|
2099
|
+
Language language_hint, // ITALIAN boosts it
|
2100
|
+
bool allow_extended_lang,
|
2101
|
+
int flags,
|
2102
|
+
Language plus_one,
|
2103
|
+
Language* language3,
|
2104
|
+
int* percent3,
|
2105
|
+
double* normalized_score3,
|
2106
|
+
int* text_bytes,
|
2107
|
+
bool* is_reliable) {
|
2108
|
+
if (!tables) {
|
2109
|
+
static const CompactLangDet::DetectionTables default_cld_tables = {
|
2110
|
+
&kQuadTable_obj,
|
2111
|
+
&compact_lang_det_generated_ctjkvz_b1_obj
|
2112
|
+
};
|
2113
|
+
tables = &default_cld_tables;
|
2114
|
+
}
|
2115
|
+
language3[0] = UNKNOWN_LANGUAGE;
|
2116
|
+
language3[1] = UNKNOWN_LANGUAGE;
|
2117
|
+
language3[2] = UNKNOWN_LANGUAGE;
|
2118
|
+
percent3[0] = 100;
|
2119
|
+
percent3[1] = 0;
|
2120
|
+
percent3[2] = 0;
|
2121
|
+
normalized_score3[0] = 0.0;
|
2122
|
+
normalized_score3[1] = 0.0;
|
2123
|
+
normalized_score3[2] = 0.0;
|
2124
|
+
*text_bytes = 0;
|
2125
|
+
*is_reliable = false;
|
2126
|
+
|
2127
|
+
// Document totals
|
2128
|
+
ToteWithReliability doc_tote; // Reliability = 0..100
|
2129
|
+
|
2130
|
+
// Vector of packed per-language boosts (just one filled in from hints)
|
2131
|
+
uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1];
|
2132
|
+
memset(lang_hint_boost, 0, sizeof(lang_hint_boost));
|
2133
|
+
|
2134
|
+
// Apply hints,if any
|
2135
|
+
if ((tld_hint != NULL) && (tld_hint[0] != '\0')) {
|
2136
|
+
ApplyTLDHint(lang_hint_boost, tld_hint);
|
2137
|
+
}
|
2138
|
+
if (encoding_hint != UNKNOWN_ENCODING) {
|
2139
|
+
ApplyEncodingHint(lang_hint_boost, encoding_hint);
|
2140
|
+
}
|
2141
|
+
if (language_hint != UNKNOWN_LANGUAGE) {
|
2142
|
+
ApplyLanguageHint(lang_hint_boost, language_hint);
|
2143
|
+
}
|
2144
|
+
|
2145
|
+
|
2146
|
+
// Four individual script totals, Latin, Han, other2, other3
|
2147
|
+
int next_other_tote = 2;
|
2148
|
+
|
2149
|
+
// Four totes for up to four different scripts pending at once
|
2150
|
+
Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other
|
2151
|
+
bool tote_seen[4] = {false, false, false, false};
|
2152
|
+
int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk
|
2153
|
+
UnicodeLScript tote_script[4] =
|
2154
|
+
{ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common};
|
2155
|
+
|
2156
|
+
// Loop through text spans in a single script
|
2157
|
+
ScriptScanner ss(buffer, buffer_length, is_plain_text);
|
2158
|
+
getone::LangSpan scriptspan;
|
2159
|
+
|
2160
|
+
scriptspan.text = NULL;
|
2161
|
+
scriptspan.text_bytes = 0;
|
2162
|
+
scriptspan.offset = 0;
|
2163
|
+
scriptspan.script = ULScript_Common;
|
2164
|
+
scriptspan.lang = UNKNOWN_LANGUAGE;
|
2165
|
+
|
2166
|
+
int total_text_bytes = 0;
|
2167
|
+
int textlimit = FLAGS_cld_textlimit << 10; // in KB
|
2168
|
+
if (textlimit == 0) {textlimit = 0x7fffffff;}
|
2169
|
+
|
2170
|
+
int advance_by = 2; // Advance 2 bytes
|
2171
|
+
int advance_limit = textlimit >> 3; // For first 1/8 of max document
|
2172
|
+
|
2173
|
+
int initial_word_span = kDefaultWordSpan;
|
2174
|
+
if (FLAGS_cld_forcewords) {
|
2175
|
+
initial_word_span = kReallyBigWordSpan;
|
2176
|
+
}
|
2177
|
+
|
2178
|
+
// Pick up chunk sizes
|
2179
|
+
// Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
|
2180
|
+
// Sanity check -- force into a reasonable range
|
2181
|
+
int chunksizequads = FLAGS_cld_smoothwidth;
|
2182
|
+
chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads),
|
2183
|
+
kMaxChunkSizeQuads);
|
2184
|
+
int chunksizeunis = (chunksizequads * 5) >> 1;
|
2185
|
+
|
2186
|
+
// Varying short-span limit doesn't work well -- skips too much beyond 20KB
|
2187
|
+
// int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
|
2188
|
+
int spantooshortlimit = kShortSpanThresh;
|
2189
|
+
|
2190
|
+
// For debugging only. Not thread-safe
|
2191
|
+
prior_lang = UNKNOWN_LANGUAGE;
|
2192
|
+
prior_unreliable = false;
|
2193
|
+
|
2194
|
+
// Allocate full-document prediction table for finding repeating words
|
2195
|
+
int hash = 0;
|
2196
|
+
int* predict_tbl = new int[kPredictionTableSize];
|
2197
|
+
if (FlagRepeats(flags)) {
|
2198
|
+
memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
|
2199
|
+
}
|
2200
|
+
|
2201
|
+
// Loop through scriptspans accumulating number of text bytes in each language
|
2202
|
+
while (ss.GetOneScriptSpanLower(&scriptspan)) {
|
2203
|
+
UnicodeLScript lscript = scriptspan.script;
|
2204
|
+
|
2205
|
+
// Echo text if asked to
|
2206
|
+
if (FLAGS_cld_echotext) {
|
2207
|
+
PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes);
|
2208
|
+
}
|
2209
|
+
|
2210
|
+
// Squeeze out big chunks of text span if asked to
|
2211
|
+
if (FlagSqueeze(flags)) {
|
2212
|
+
// Remove repetitive or mostly-spaces chunks
|
2213
|
+
int newlen;
|
2214
|
+
int chunksize = 0; // Use the default
|
2215
|
+
newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
|
2216
|
+
chunksize);
|
2217
|
+
scriptspan.text_bytes = newlen;
|
2218
|
+
} else {
|
2219
|
+
// Check now and then to see if we should be squeezing
|
2220
|
+
if ((total_text_bytes >= kCheapSqueezeTestThresh) &&
|
2221
|
+
!FlagFinish(flags) &&
|
2222
|
+
((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) &&
|
2223
|
+
CheapSqueezeTriggerTest(scriptspan.text,
|
2224
|
+
scriptspan.text_bytes,
|
2225
|
+
kCheapSqueezeTestLen)) {
|
2226
|
+
// Recursive call with big-chunk squeezing set
|
2227
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
2228
|
+
fprintf(stderr,
|
2229
|
+
"<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
|
2230
|
+
total_text_bytes);
|
2231
|
+
}
|
2232
|
+
// Deallocate full-document prediction table
|
2233
|
+
delete[] predict_tbl;
|
2234
|
+
|
2235
|
+
return DetectLanguageSummaryV25(
|
2236
|
+
tables,
|
2237
|
+
buffer,
|
2238
|
+
buffer_length,
|
2239
|
+
is_plain_text,
|
2240
|
+
do_pick_summary_language,
|
2241
|
+
do_remove_weak_matches,
|
2242
|
+
tld_hint, // "id" boosts Indonesian
|
2243
|
+
encoding_hint, // SJS boosts Japanese
|
2244
|
+
language_hint, // ITALIAN boosts it
|
2245
|
+
allow_extended_lang,
|
2246
|
+
flags | kCLDFlagSqueeze,
|
2247
|
+
plus_one,
|
2248
|
+
language3,
|
2249
|
+
percent3,
|
2250
|
+
normalized_score3,
|
2251
|
+
text_bytes,
|
2252
|
+
is_reliable);
|
2253
|
+
}
|
2254
|
+
}
|
2255
|
+
|
2256
|
+
// Remove repetitive words if asked to
|
2257
|
+
if (FlagRepeats(flags)) {
|
2258
|
+
// Remove repetitive words
|
2259
|
+
int newlen;
|
2260
|
+
newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
|
2261
|
+
&hash, predict_tbl);
|
2262
|
+
scriptspan.text_bytes = newlen;
|
2263
|
+
}
|
2264
|
+
|
2265
|
+
// The real scoring
|
2266
|
+
// Accumulate directly into the document total, or accmulate in one of four
|
2267
|
+
// chunk totals. The purpose of the multiple chunk totals is to piece
|
2268
|
+
// together short choppy pieces of text in alternating scripts. One total is
|
2269
|
+
// dedicated to Latin text, one to Han text, and the other two are dynamicly
|
2270
|
+
// assigned.
|
2271
|
+
Language onlylang = cld::kOnlyLanguagePerLScript[lscript];
|
2272
|
+
|
2273
|
+
if (onlylang != UNKNOWN_LANGUAGE) {
|
2274
|
+
// This entire script run is in a single language.
|
2275
|
+
ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote,
|
2276
|
+
lang_hint_boost, flags, plus_one);
|
2277
|
+
} else if (cld::kScoreUniPerLScript[lscript] != 0) {
|
2278
|
+
// This entire script run's languages can be distinguished by uni-grams
|
2279
|
+
// Accumulate in hani_tote
|
2280
|
+
int tote_num = 1;
|
2281
|
+
if (!tote_seen[tote_num]) {
|
2282
|
+
tote_seen[tote_num] = true;
|
2283
|
+
// Default language gets 1 byte
|
2284
|
+
total_text_bytes += 1;
|
2285
|
+
InitScriptToteLang(&totes[tote_num], lscript);
|
2286
|
+
}
|
2287
|
+
ScoreUnigrams(tables->unigram_obj,
|
2288
|
+
&scriptspan, &tote_grams[tote_num], chunksizeunis,
|
2289
|
+
&totes[tote_num],
|
2290
|
+
&doc_tote, lang_hint_boost,
|
2291
|
+
advance_by, flags, &initial_word_span, plus_one);
|
2292
|
+
} else {
|
2293
|
+
// This entire script-run's languages can be distinguished by quad-grams
|
2294
|
+
// Accumulate in latn_tote or script0/1_tote
|
2295
|
+
int tote_num = -1;
|
2296
|
+
for (int t = 0; t < 4; ++t) {
|
2297
|
+
if (lscript == tote_script[t]) {
|
2298
|
+
tote_num = t;
|
2299
|
+
break;
|
2300
|
+
}
|
2301
|
+
}
|
2302
|
+
if (tote_num < 0) {
|
2303
|
+
// Need to allocate other0/1
|
2304
|
+
tote_num = next_other_tote;
|
2305
|
+
next_other_tote ^= 1; // Round-robin
|
2306
|
+
if (tote_seen[tote_num]) {
|
2307
|
+
// Flush previous
|
2308
|
+
ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by,
|
2309
|
+
tote_script[tote_num], &totes[tote_num],
|
2310
|
+
&doc_tote, tote_grams[tote_num], lang_hint_boost);
|
2311
|
+
totes[tote_num].Reinit();
|
2312
|
+
}
|
2313
|
+
tote_script[tote_num] = lscript;
|
2314
|
+
}
|
2315
|
+
|
2316
|
+
if (!tote_seen[tote_num]) {
|
2317
|
+
tote_seen[tote_num] = true;
|
2318
|
+
// Default language gets 1 byte
|
2319
|
+
total_text_bytes += 1;
|
2320
|
+
InitScriptToteLang(&totes[tote_num], lscript);
|
2321
|
+
}
|
2322
|
+
|
2323
|
+
// The actual accumulation, possibly with word scoring also
|
2324
|
+
ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num],
|
2325
|
+
chunksizequads,
|
2326
|
+
&totes[tote_num],
|
2327
|
+
&doc_tote, lang_hint_boost,
|
2328
|
+
advance_by, flags, &initial_word_span, plus_one);
|
2329
|
+
}
|
2330
|
+
|
2331
|
+
total_text_bytes += scriptspan.text_bytes;
|
2332
|
+
|
2333
|
+
// For long documents, do less-dense samples the further along we go.
|
2334
|
+
// This is to keep speed sublinear in document size.
|
2335
|
+
if (total_text_bytes > advance_limit) {
|
2336
|
+
if (total_text_bytes > textlimit) {
|
2337
|
+
// Don't look at rest of doc
|
2338
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
2339
|
+
fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>",
|
2340
|
+
total_text_bytes, textlimit);
|
2341
|
+
}
|
2342
|
+
break;
|
2343
|
+
}
|
2344
|
+
advance_by <<= 1; // Double advance bytes
|
2345
|
+
advance_limit <<= 1; // Double limit until next change
|
2346
|
+
spantooshortlimit <<= 1; // Double short-span size
|
2347
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
2348
|
+
fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>",
|
2349
|
+
total_text_bytes, advance_by);
|
2350
|
+
}
|
2351
|
+
}
|
2352
|
+
} // End while (ss.GetOneScriptSpanLower())
|
2353
|
+
|
2354
|
+
// Deallocate full-document prediction table
|
2355
|
+
delete[] predict_tbl;
|
2356
|
+
|
2357
|
+
// Flush pending totals
|
2358
|
+
for (int tote_num = 0; tote_num < 4; ++tote_num) {
|
2359
|
+
if (tote_seen[tote_num]) {
|
2360
|
+
ScoreChunkIntoDoc2(kToteName[tote_num], advance_by,
|
2361
|
+
tote_script[tote_num], &totes[tote_num], &doc_tote,
|
2362
|
+
tote_grams[tote_num], lang_hint_boost);
|
2363
|
+
}
|
2364
|
+
}
|
2365
|
+
|
2366
|
+
// If extended languages are disallowed, remove them here
|
2367
|
+
if (!allow_extended_lang) {
|
2368
|
+
RemoveExtendedLanguages(&doc_tote);
|
2369
|
+
}
|
2370
|
+
|
2371
|
+
// Force close pairs to one or the other
|
2372
|
+
RefineScoredClosePairs(&doc_tote);
|
2373
|
+
|
2374
|
+
|
2375
|
+
// Calculate return results
|
2376
|
+
// Find top three byte counts in tote heap
|
2377
|
+
int reliable_percent3[3];
|
2378
|
+
|
2379
|
+
|
2380
|
+
// Cannot use Add, etc. after sorting
|
2381
|
+
doc_tote.Sort(3);
|
2382
|
+
|
2383
|
+
ExtractLangEtc(&doc_tote, total_text_bytes,
|
2384
|
+
reliable_percent3, language3, percent3, normalized_score3,
|
2385
|
+
text_bytes, is_reliable);
|
2386
|
+
|
2387
|
+
bool have_good_answer = false;
|
2388
|
+
if (FlagFinish(flags)) {
|
2389
|
+
// Force a result
|
2390
|
+
have_good_answer = true;
|
2391
|
+
} else if (total_text_bytes <= kShortTextThresh) {
|
2392
|
+
// Don't recurse on short text -- we already did word scores
|
2393
|
+
have_good_answer = true;
|
2394
|
+
} else if (*is_reliable &&
|
2395
|
+
(percent3[0] >= kGoodLang1Percent)) {
|
2396
|
+
have_good_answer = true;
|
2397
|
+
} else if (*is_reliable &&
|
2398
|
+
((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
|
2399
|
+
have_good_answer = true;
|
2400
|
+
}
|
2401
|
+
|
2402
|
+
|
2403
|
+
if (have_good_answer) {
|
2404
|
+
// This is the real, non-recursive return
|
2405
|
+
|
2406
|
+
// Move bytes for unreliable langs to another lang or
|
2407
|
+
// UNKNOWN
|
2408
|
+
RemoveUnreliableLanguages(&doc_tote, do_remove_weak_matches);
|
2409
|
+
|
2410
|
+
// Redo the result extraction after the removal above
|
2411
|
+
doc_tote.Sort(3);
|
2412
|
+
|
2413
|
+
ExtractLangEtc(&doc_tote, total_text_bytes,
|
2414
|
+
reliable_percent3, language3, percent3, normalized_score3,
|
2415
|
+
text_bytes, is_reliable);
|
2416
|
+
|
2417
|
+
#if 0
|
2418
|
+
// OLD code, replaced by CalcSummaryLang
|
2419
|
+
//
|
2420
|
+
// Suppress ignore-me text, TG_UNKNOWN_LANGUAGE if 2nd or 3rd language
|
2421
|
+
// Force it to English if first language
|
2422
|
+
if (language3[2] == TG_UNKNOWN_LANGUAGE) {
|
2423
|
+
reliable_percent3[2] = 0;
|
2424
|
+
language3[2] = UNKNOWN_LANGUAGE;
|
2425
|
+
percent3[2] = 0;
|
2426
|
+
} else if (language3[1] == TG_UNKNOWN_LANGUAGE) {
|
2427
|
+
// Move up lower language
|
2428
|
+
reliable_percent3[1] = reliable_percent3[2];
|
2429
|
+
language3[1] = language3[2];
|
2430
|
+
percent3[1] = percent3[2];
|
2431
|
+
reliable_percent3[2] = 0;
|
2432
|
+
language3[2] = UNKNOWN_LANGUAGE;
|
2433
|
+
percent3[2] = 0;
|
2434
|
+
} else if (language3[0] == TG_UNKNOWN_LANGUAGE) {
|
2435
|
+
language3[0] = ENGLISH;
|
2436
|
+
}
|
2437
|
+
|
2438
|
+
if (language3[0] == UNKNOWN_LANGUAGE) {
|
2439
|
+
// Last-ditch test for some result, but it is UNKNOWN_LANGUAGE
|
2440
|
+
// Force it to English (should not happen)
|
2441
|
+
language3[0] = ENGLISH;
|
2442
|
+
percent3[0] = 100;
|
2443
|
+
*is_reliable = true;
|
2444
|
+
}
|
2445
|
+
#endif
|
2446
|
+
|
2447
|
+
|
2448
|
+
#if 0
|
2449
|
+
// Scaffolding to reveal subset sequence lang distribution across doc text
|
2450
|
+
// Track the sequence of language fragments [result currently unused]
|
2451
|
+
if (FLAGS_cld_html) {
|
2452
|
+
static const int kMaxSubsetSeq = 12;
|
2453
|
+
uint8 subseq[kMaxSubsetSeq];
|
2454
|
+
doc_tote.ExtractSeq(kMaxSubsetSeq, subseq);
|
2455
|
+
|
2456
|
+
fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq);
|
2457
|
+
for (int i = 0; i < kMaxSubsetSeq; ++i) {
|
2458
|
+
fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i])));
|
2459
|
+
if ((i % 4) == 3) {fprintf(stderr, " ");}
|
2460
|
+
}
|
2461
|
+
fprintf(stderr, " ");
|
2462
|
+
|
2463
|
+
for (int i = 0; i < 3; ++i) {
|
2464
|
+
if (language3[i] != UNKNOWN_LANGUAGE) {
|
2465
|
+
fprintf(stderr, "%s.%d(%d%%) ",
|
2466
|
+
ExtLanguageCode(language3[i]),
|
2467
|
+
reliable_percent3[i],
|
2468
|
+
percent3[i]);
|
2469
|
+
}
|
2470
|
+
}
|
2471
|
+
|
2472
|
+
fprintf(stderr, "%d B ", total_text_bytes);
|
2473
|
+
fprintf(stderr, "<br>\n");
|
2474
|
+
}
|
2475
|
+
// End Scaffolding to reveal subset sequence lang distribution
|
2476
|
+
#endif
|
2477
|
+
|
2478
|
+
Language summary_lang;
|
2479
|
+
if (do_pick_summary_language) {
|
2480
|
+
CalcSummaryLang(&doc_tote, total_text_bytes,
|
2481
|
+
reliable_percent3, language3, percent3,
|
2482
|
+
&summary_lang, is_reliable);
|
2483
|
+
} else {
|
2484
|
+
summary_lang = language3[0];
|
2485
|
+
}
|
2486
|
+
|
2487
|
+
if (FLAGS_cld_html) {
|
2488
|
+
for (int i = 0; i < 3; ++i) {
|
2489
|
+
if (language3[i] != UNKNOWN_LANGUAGE) {
|
2490
|
+
fprintf(stderr, "%s.%d(%d%%) ",
|
2491
|
+
ExtLanguageCode(language3[i]),
|
2492
|
+
reliable_percent3[i],
|
2493
|
+
percent3[i]);
|
2494
|
+
}
|
2495
|
+
}
|
2496
|
+
|
2497
|
+
fprintf(stderr, "%d B ", total_text_bytes);
|
2498
|
+
fprintf(stderr, "= %s%c ",
|
2499
|
+
ExtLanguageName(summary_lang), is_reliable ? ' ' : '*');
|
2500
|
+
fprintf(stderr, "<br>\n");
|
2501
|
+
}
|
2502
|
+
|
2503
|
+
return summary_lang;
|
2504
|
+
}
|
2505
|
+
|
2506
|
+
// Not a good answer -- do recursive call to refine
|
2507
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
2508
|
+
// This is what we hope to improve on in the recursive call, if any
|
2509
|
+
PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
|
2510
|
+
}
|
2511
|
+
|
2512
|
+
// For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
|
2513
|
+
// For this purpose, we treate "Ignore" as top40
|
2514
|
+
Language new_plus_one = UNKNOWN_LANGUAGE;
|
2515
|
+
if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) {
|
2516
|
+
new_plus_one = language3[0];
|
2517
|
+
} else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) {
|
2518
|
+
new_plus_one = language3[1];
|
2519
|
+
}
|
2520
|
+
|
2521
|
+
if (total_text_bytes < kShortTextThresh) {
|
2522
|
+
// Short text: Recursive call with top40 and short set
|
2523
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
2524
|
+
fprintf(stderr, " ---text_bytes[%d] "
|
2525
|
+
"Recursive(Top40/Rep/Short/Words)---<br><br>\n",
|
2526
|
+
total_text_bytes);
|
2527
|
+
}
|
2528
|
+
return DetectLanguageSummaryV25(
|
2529
|
+
tables,
|
2530
|
+
buffer,
|
2531
|
+
buffer_length,
|
2532
|
+
is_plain_text,
|
2533
|
+
do_pick_summary_language,
|
2534
|
+
do_remove_weak_matches,
|
2535
|
+
tld_hint, // "id" boosts Indonesian
|
2536
|
+
encoding_hint, // SJS boosts Japanese
|
2537
|
+
language_hint, // ITALIAN boosts it
|
2538
|
+
allow_extended_lang,
|
2539
|
+
flags | kCLDFlagTop40 | kCLDFlagRepeats |
|
2540
|
+
kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
|
2541
|
+
new_plus_one,
|
2542
|
+
language3,
|
2543
|
+
percent3,
|
2544
|
+
normalized_score3,
|
2545
|
+
text_bytes,
|
2546
|
+
is_reliable);
|
2547
|
+
}
|
2548
|
+
|
2549
|
+
// Longer text: Recursive call with top40 set
|
2550
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
2551
|
+
fprintf(stderr,
|
2552
|
+
" ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
|
2553
|
+
total_text_bytes);
|
2554
|
+
}
|
2555
|
+
return DetectLanguageSummaryV25(
|
2556
|
+
tables,
|
2557
|
+
buffer,
|
2558
|
+
buffer_length,
|
2559
|
+
is_plain_text,
|
2560
|
+
do_pick_summary_language,
|
2561
|
+
do_remove_weak_matches,
|
2562
|
+
tld_hint, // "id" boosts Indonesian
|
2563
|
+
encoding_hint, // SJS boosts Japanese
|
2564
|
+
language_hint, // ITALIAN boosts it
|
2565
|
+
allow_extended_lang,
|
2566
|
+
flags | kCLDFlagTop40 | kCLDFlagRepeats |
|
2567
|
+
kCLDFlagFinish,
|
2568
|
+
new_plus_one,
|
2569
|
+
language3,
|
2570
|
+
percent3,
|
2571
|
+
normalized_score3,
|
2572
|
+
text_bytes,
|
2573
|
+
is_reliable);
|
2574
|
+
} // End CompactLangDetImpl::DetectLanguageSummaryV25
|