language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
|
@@ -0,0 +1,2574 @@
|
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include <stdio.h>
|
|
6
|
+
#include <string.h>
|
|
7
|
+
//#include <sys/time.h> // for gettimeofday
|
|
8
|
+
#include <string>
|
|
9
|
+
|
|
10
|
+
#include "encodings/lang_enc.h"
|
|
11
|
+
|
|
12
|
+
#include "encodings/compact_lang_det/compact_lang_det.h"
|
|
13
|
+
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
|
|
14
|
+
#include "encodings/compact_lang_det/getonescriptspan.h"
|
|
15
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
|
16
|
+
#include "encodings/compact_lang_det/tote.h"
|
|
17
|
+
#include "encodings/compact_lang_det/utf8propjustletter.h"
|
|
18
|
+
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
|
|
19
|
+
#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
|
|
20
|
+
|
|
21
|
+
#include "encodings/compact_lang_det/cldutil_dbg.h"
|
|
22
|
+
|
|
23
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
24
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
|
25
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
|
26
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
27
|
+
|
|
28
|
+
// Linker supplies the right tables
|
|
29
|
+
extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj;
|
|
30
|
+
extern const cld::CLDTableSummary kCjkBiTable_obj;
|
|
31
|
+
extern const cld::CLDTableSummary kQuadTable_obj;
|
|
32
|
+
extern const cld::CLDTableSummary kLongWord8Table_obj;
|
|
33
|
+
|
|
34
|
+
DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr");
|
|
35
|
+
DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads");
|
|
36
|
+
|
|
37
|
+
DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text");
|
|
38
|
+
DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr");
|
|
39
|
+
DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text");
|
|
40
|
+
// 20 quadgrams is about 80 bytes or about 12 words in real text
|
|
41
|
+
DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams");
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
static const int kLangHintInitial = 12; // Boost language by N initially
|
|
45
|
+
static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram
|
|
46
|
+
|
|
47
|
+
static const int kShortSpanThresh = 32; // Bytes
|
|
48
|
+
static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans
|
|
49
|
+
|
|
50
|
+
static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing
|
|
51
|
+
// after this many text bytes
|
|
52
|
+
static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz
|
|
53
|
+
static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces
|
|
54
|
+
static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted
|
|
55
|
+
|
|
56
|
+
static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks
|
|
57
|
+
static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces
|
|
58
|
+
static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted
|
|
59
|
+
|
|
60
|
+
static const int kMaxSpaceScan = 32; // Bytes
|
|
61
|
+
|
|
62
|
+
static const int kGoodLang1Percent = 70;
|
|
63
|
+
static const int kGoodLang1and2Percent = 93;
|
|
64
|
+
static const int kShortTextThresh = 256; // Bytes
|
|
65
|
+
|
|
66
|
+
static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads
|
|
67
|
+
static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads
|
|
68
|
+
|
|
69
|
+
static const int kDefaultWordSpan = 256; // Scan at least this many initial
|
|
70
|
+
// bytes with word scoring
|
|
71
|
+
static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text
|
|
72
|
+
|
|
73
|
+
static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable
|
|
74
|
+
|
|
75
|
+
static const int kPredictionTableSize = 4096; // Must be exactly 4096 for
|
|
76
|
+
// cheap compressor
|
|
77
|
+
|
|
78
|
+
//
|
|
79
|
+
// Generated by dsites 2008.07.07 from 10% of Base
|
|
80
|
+
//
|
|
81
|
+
|
|
82
|
+
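// Up to three packed language probs per entry, subscripted by Encoding.
// As implied by the entries: bytes 1-3 (low to high) each hold a Language value + 1 (0 = unused slot); byte 0 jointly encodes their probabilities.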
|
|
83
|
+
static const uint32 kEncodingHintProbs[] = {
|
|
84
|
+
0x00000000, // ASCII
|
|
85
|
+
0x18120cd5, // Latin2 POLISH.11 CZECH.5 HUNGARIAN.3
|
|
86
|
+
0x1d3a4bc9, // Latin3 AZERBAIJANI.10 BASQUE.3 CROATIAN.1
|
|
87
|
+
0x030819d4, // Latin4 ESTONIAN.11 ITALIAN.4 DUTCH.2
|
|
88
|
+
0x00000000, // ISO-8859-5
|
|
89
|
+
0x00003742, // Arabic ARABIC.12
|
|
90
|
+
0x00000000, // Greek
|
|
91
|
+
0x00000742, // Hebrew HEBREW.12
|
|
92
|
+
0x00002242, // Latin5 TURKISH.12
|
|
93
|
+
0x060419c9, // Latin6 ESTONIAN.10 FINNISH.3 GERMAN.1
|
|
94
|
+
0x00000942, // EUC-JP Japanese.12
|
|
95
|
+
0x00000942, // SJS Japanese.12
|
|
96
|
+
0x00000942, // JIS Japanese.12
|
|
97
|
+
0x00004642, // BIG5 ChineseT.12
|
|
98
|
+
0x00001142, // GB Chinese.12
|
|
99
|
+
0x46295fcd, // EUC-CN UIGHUR.10 MALAY.6 ChineseT.5
|
|
100
|
+
0x00000a42, // KSC Korean.12
|
|
101
|
+
0x00000000, // Unicode
|
|
102
|
+
0x03104674, // EUC ChineseT.9 SWEDISH.8 DUTCH.3
|
|
103
|
+
0x00000000, // CNS
|
|
104
|
+
0x0f1146c3, // BIG5-CP950 ChineseT.9 Chinese.5 SPANISH.4
|
|
105
|
+
0x00000942, // CP932 Japanese.12
|
|
106
|
+
0x00000000, // UTF8
|
|
107
|
+
0x00000000, // Unknown
|
|
108
|
+
0x00000000, // ASCII-7-bit
|
|
109
|
+
0x00000000, // KOI8R
|
|
110
|
+
0x00000000, // CP1251
|
|
111
|
+
0x00000000, // CP1252
|
|
112
|
+
0x00000000, // KOI8U
|
|
113
|
+
0x451d12cd, // CP1250 CZECH.10 CROATIAN.6 SLOVAK.5
|
|
114
|
+
0x0d06052a, // ISO-8859-15 FRENCH.9 GERMAN.8 PORTUGUESE.7
|
|
115
|
+
0x00002242, // CP1254 TURKISH.12
|
|
116
|
+
0x191516be, // CP1257 LITHUANIAN.8 LATVIAN.7 ESTONIAN.7
|
|
117
|
+
0x08003642, // ISO-8859-11 THAI.12 ITALIAN.1
|
|
118
|
+
0x00000000, // CP874
|
|
119
|
+
0x00003742, // CP1256 ARABIC.12
|
|
120
|
+
0x00000742, // CP1255 HEBREW.12
|
|
121
|
+
0x00000000, // ISO-8859-8-I
|
|
122
|
+
0x00000000, // VISUAL
|
|
123
|
+
0x00000000, // CP852
|
|
124
|
+
0x39001242, // CSN_369103 CZECH.12 ESPERANTO.1
|
|
125
|
+
0x00000000, // CP1253
|
|
126
|
+
0x00000000, // CP866
|
|
127
|
+
0x2e001944, // ISO-8859-13 ESTONIAN.12 ALBANIAN.3
|
|
128
|
+
0x08090a74, // ISO-2022-KR Korean.9 Japanese.8 ITALIAN.3
|
|
129
|
+
0x00001142, // GBK Chinese.12
|
|
130
|
+
0x4600113d, // GB18030 Chinese.11 ChineseT.7
|
|
131
|
+
0x00004642, // BIG5_HKSCS ChineseT.12
|
|
132
|
+
0x00000000, // ISO_2022_CN
|
|
133
|
+
0x00000000, // TSCII
|
|
134
|
+
0x00000000, // TAM
|
|
135
|
+
0x00000000, // TAB
|
|
136
|
+
0x00000000, // JAGRAN
|
|
137
|
+
0x00000000, // MACINTOSH
|
|
138
|
+
0x00000000, // UTF7
|
|
139
|
+
0x00000000, // BHASKAR
|
|
140
|
+
0x00000000, // HTCHANAKYA
|
|
141
|
+
0x090646ca, // UTF-16BE ChineseT.10 GERMAN.4 Japanese.2
|
|
142
|
+
0x00000000, // UTF-16LE
|
|
143
|
+
0x00000000, // UTF-32BE
|
|
144
|
+
0x00000000, // UTF-32LE
|
|
145
|
+
0x00000000, // X-BINARYENC
|
|
146
|
+
0x06001142, // HZ-GB-2312 Chinese.12 GERMAN.1
|
|
147
|
+
0x461109c2, // X-UTF8UTF8 Japanese.9 Chinese.5 ChineseT.3
|
|
148
|
+
0x00000000, // X-TAM-ELANGO
|
|
149
|
+
0x00000000, // X-TAM-LTTMBARANI
|
|
150
|
+
0x00000000, // X-TAM-SHREE
|
|
151
|
+
0x00000000, // X-TAM-TBOOMIS
|
|
152
|
+
0x00000000, // X-TAM-TMNEWS
|
|
153
|
+
0x00000000, // X-TAM-WEBTAMIL
|
|
154
|
+
0x00000000, // X-KDDI-Shift_JIS
|
|
155
|
+
0x00000000, // X-DoCoMo-Shift_JIS
|
|
156
|
+
0x00000000, // X-SoftBank-Shift_JIS
|
|
157
|
+
0x00000000, // X-KDDI-ISO-2022-JP
|
|
158
|
+
0x00000000, // X-SoftBank-ISO-2022-JP
|
|
159
|
+
};
|
|
160
|
+
|
|
161
|
+
COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS,  // Compile-time check: the table must have exactly NUM_ENCODINGS entries
|
|
162
|
+
kEncodingHintProbs_has_incorrect_size);
|
|
163
|
+
|
|
164
|
+
//
|
|
165
|
+
// Generated by dsites 2008.07.07 from 10% of Base
|
|
166
|
+
//
|
|
167
|
+
|
|
168
|
+
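// Up to three packed language probs per entry, subscripted by the hinted (anchor) Language.
// As implied by the entries: bytes 1-3 (low to high) each hold a Language value + 1 (0 = unused slot); byte 0 jointly encodes their probabilities.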
|
|
169
|
+
static const uint32 kLanguageHintProbs[] = {
|
|
170
|
+
0x00000000, // ENGLISH
|
|
171
|
+
0x00000242, // DANISH DANISH.12
|
|
172
|
+
0x00000342, // DUTCH DUTCH.12
|
|
173
|
+
0x00000442, // FINNISH FINNISH.12
|
|
174
|
+
0x00000542, // FRENCH FRENCH.12
|
|
175
|
+
0x00000642, // GERMAN GERMAN.12
|
|
176
|
+
0x00000742, // HEBREW HEBREW.12
|
|
177
|
+
0x00000842, // ITALIAN ITALIAN.12
|
|
178
|
+
0x00000942, // Japanese Japanese.12
|
|
179
|
+
0x00000a42, // Korean Korean.12
|
|
180
|
+
0x51000b43, // NORWEGIAN NORWEGIAN.12 NORWEGIAN_N.2
|
|
181
|
+
0x00000c42, // POLISH POLISH.12
|
|
182
|
+
0x00000d42, // PORTUGUESE PORTUGUESE.12
|
|
183
|
+
0x00000000, // RUSSIAN
|
|
184
|
+
0x00000f42, // SPANISH SPANISH.12
|
|
185
|
+
0x00001042, // SWEDISH SWEDISH.12
|
|
186
|
+
0x00001142, // Chinese Chinese.12
|
|
187
|
+
0x00001242, // CZECH CZECH.12
|
|
188
|
+
0x00000000, // GREEK
|
|
189
|
+
0x47001442, // ICELANDIC ICELANDIC.12 FAROESE.1
|
|
190
|
+
0x00001542, // LATVIAN LATVIAN.12
|
|
191
|
+
0x00001642, // LITHUANIAN LITHUANIAN.12
|
|
192
|
+
0x00001742, // ROMANIAN ROMANIAN.12
|
|
193
|
+
0x00001842, // HUNGARIAN HUNGARIAN.12
|
|
194
|
+
0x00001942, // ESTONIAN ESTONIAN.12
|
|
195
|
+
0x00000000, // TG_UNKNOWN_LANGUAGE
|
|
196
|
+
0x00000000, // Unknown
|
|
197
|
+
0x00001c42, // BULGARIAN BULGARIAN.12
|
|
198
|
+
0x00001d42, // CROATIAN CROATIAN.12
|
|
199
|
+
0x1e001d46, // SERBIAN CROATIAN.12 SERBIAN.5
|
|
200
|
+
0x00000000, // IRISH
|
|
201
|
+
0x0f00203d, // GALICIAN GALICIAN.11 SPANISH.7
|
|
202
|
+
0x5e00213a, // TAGALOG TAGALOG.11 SOMALI.4
|
|
203
|
+
0x00002242, // TURKISH TURKISH.12
|
|
204
|
+
0x00002342, // UKRAINIAN UKRAINIAN.12
|
|
205
|
+
0x00000000, // HINDI
|
|
206
|
+
0x1c1e25d4, // MACEDONIAN MACEDONIAN.11 SERBIAN.4 BULGARIAN.2
|
|
207
|
+
0x00002642, // BENGALI BENGALI.12
|
|
208
|
+
0x00002742, // INDONESIAN INDONESIAN.12
|
|
209
|
+
0x00000000, // LATIN
|
|
210
|
+
0x2700293c, // MALAY MALAY.11 INDONESIAN.6
|
|
211
|
+
0x00000000, // MALAYALAM
|
|
212
|
+
0x00000000, // WELSH
|
|
213
|
+
0x00000000, // NEPALI
|
|
214
|
+
0x00000000, // TELUGU
|
|
215
|
+
0x00002e42, // ALBANIAN ALBANIAN.12
|
|
216
|
+
0x00000000, // TAMIL
|
|
217
|
+
0x00003042, // BELARUSIAN BELARUSIAN.12
|
|
218
|
+
0x00000000, // JAVANESE
|
|
219
|
+
0x00000000, // OCCITAN
|
|
220
|
+
0x375f3330, // URDU URDU.10 UIGHUR.7 ARABIC.4
|
|
221
|
+
0x41003436, // BIHARI BIHARI.10 MARATHI.10
|
|
222
|
+
0x00000000, // GUJARATI
|
|
223
|
+
0x0a4636b2, // THAI THAI.7 ChineseT.3 Korean.2
|
|
224
|
+
0x00003742, // ARABIC ARABIC.12
|
|
225
|
+
0x00003842, // CATALAN CATALAN.12
|
|
226
|
+
0x00003942, // ESPERANTO ESPERANTO.12
|
|
227
|
+
0x00003a42, // BASQUE BASQUE.12
|
|
228
|
+
0x00000000, // INTERLINGUA
|
|
229
|
+
0x00000000, // KANNADA
|
|
230
|
+
0x05060cca, // PUNJABI POLISH.10 GERMAN.4 FRENCH.2
|
|
231
|
+
0x00000000, // SCOTS_GAELIC
|
|
232
|
+
0x00003f42, // SWAHILI SWAHILI.12
|
|
233
|
+
0x00004042, // SLOVENIAN SLOVENIAN.12
|
|
234
|
+
0x00004142, // MARATHI MARATHI.12
|
|
235
|
+
0x00004242, // MALTESE MALTESE.12
|
|
236
|
+
0x00004342, // VIETNAMESE VIETNAMESE.12
|
|
237
|
+
0x00000000, // FRISIAN
|
|
238
|
+
0x12004543, // SLOVAK SLOVAK.12 CZECH.2
|
|
239
|
+
0x00004642, // ChineseT ChineseT.12
|
|
240
|
+
0x00000000, // FAROESE
|
|
241
|
+
0x00000000, // SUNDANESE
|
|
242
|
+
0x79004944, // UZBEK UZBEK.12 TAJIK.3
|
|
243
|
+
0x4d004a46, // AMHARIC AMHARIC.12 TIGRINYA.5
|
|
244
|
+
0x00004b42, // AZERBAIJANI AZERBAIJANI.12
|
|
245
|
+
0x00000000, // GEORGIAN
|
|
246
|
+
0x00000000, // TIGRINYA
|
|
247
|
+
0x00004e42, // PERSIAN PERSIAN.12
|
|
248
|
+
0x00000000, // BOSNIAN
|
|
249
|
+
0x00000000, // SINHALESE
|
|
250
|
+
0x00000000, // NORWEGIAN_N
|
|
251
|
+
0x00000000, // PORTUGUESE_P
|
|
252
|
+
0x00000000, // PORTUGUESE_B
|
|
253
|
+
0x00000000, // XHOSA
|
|
254
|
+
0x00000000, // ZULU
|
|
255
|
+
0x00000000, // GUARANI
|
|
256
|
+
0x00000000, // SESOTHO
|
|
257
|
+
0x00000000, // TURKMEN
|
|
258
|
+
0x7a005933, // KYRGYZ KYRGYZ.10 TATAR.7
|
|
259
|
+
0x00000000, // BRETON
|
|
260
|
+
0x00000000, // TWI
|
|
261
|
+
0x00000000, // YIDDISH
|
|
262
|
+
0x00000000, // SERBO_CROATIAN
|
|
263
|
+
0x00000000, // SOMALI
|
|
264
|
+
0x00005f42, // UIGHUR UIGHUR.12
|
|
265
|
+
0x00006042, // KURDISH KURDISH.12
|
|
266
|
+
0x00006142, // MONGOLIAN MONGOLIAN.12
|
|
267
|
+
0x051130c9, // ARMENIAN BELARUSIAN.10 Chinese.3 FRENCH.1
|
|
268
|
+
0x020f0521, // LAOTHIAN FRENCH.8 SPANISH.7 DANISH.6
|
|
269
|
+
0x64004e35, // SINDHI PERSIAN.10 SINDHI.9
|
|
270
|
+
0x00000000, // RHAETO_ROMANCE
|
|
271
|
+
0x00006642, // AFRIKAANS AFRIKAANS.12
|
|
272
|
+
0x00000000, // LUXEMBOURGISH
|
|
273
|
+
0x00006842, // BURMESE BURMESE.12
|
|
274
|
+
0x00002242, // KHMER TURKISH.12
|
|
275
|
+
0x88006a3c, // TIBETAN TIBETAN.11 DZONGKHA.6
|
|
276
|
+
0x00000000, // DHIVEHI
|
|
277
|
+
0x00000000, // CHEROKEE
|
|
278
|
+
0x00000000, // SYRIAC
|
|
279
|
+
0x00000000, // LIMBU
|
|
280
|
+
0x00000000, // ORIYA
|
|
281
|
+
0x00000000, // ASSAMESE
|
|
282
|
+
0x00000000, // CORSICAN
|
|
283
|
+
0x00000000, // INTERLINGUE
|
|
284
|
+
0x00007342, // KAZAKH KAZAKH.12
|
|
285
|
+
0x00000000, // LINGALA
|
|
286
|
+
0x00000000, // MOLDAVIAN
|
|
287
|
+
0x5f007645, // PASHTO PASHTO.12 UIGHUR.4
|
|
288
|
+
0x00000000, // QUECHUA
|
|
289
|
+
0x00000000, // SHONA
|
|
290
|
+
0x00007942, // TAJIK TAJIK.12
|
|
291
|
+
0x00000000, // TATAR
|
|
292
|
+
0x00000000, // TONGA
|
|
293
|
+
0x00000000, // YORUBA
|
|
294
|
+
0x00000000, // CREOLES_AND_PIDGINS_ENGLISH_BASED
|
|
295
|
+
0x00000000, // CREOLES_AND_PIDGINS_FRENCH_BASED
|
|
296
|
+
0x00000000, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
|
|
297
|
+
0x00000000, // CREOLES_AND_PIDGINS_OTHER
|
|
298
|
+
0x00000000, // MAORI
|
|
299
|
+
0x00000000, // WOLOF
|
|
300
|
+
0x00000000, // ABKHAZIAN
|
|
301
|
+
0x00000000, // AFAR
|
|
302
|
+
0x00000000, // AYMARA
|
|
303
|
+
0x00000000, // BASHKIR
|
|
304
|
+
0x00000000, // BISLAMA
|
|
305
|
+
0x00000000, // DZONGKHA
|
|
306
|
+
0x00000000, // FIJIAN
|
|
307
|
+
0x00000000, // GREENLANDIC
|
|
308
|
+
0x00000000, // HAUSA
|
|
309
|
+
0x00000000, // HAITIAN_CREOLE
|
|
310
|
+
0x00000000, // INUPIAK
|
|
311
|
+
0x00000542, // INUKTITUT FRENCH.12
|
|
312
|
+
0x00000000, // KASHMIRI
|
|
313
|
+
0x00000000, // KINYARWANDA
|
|
314
|
+
0x00000000, // MALAGASY
|
|
315
|
+
0x00000000, // NAURU
|
|
316
|
+
0x00000000, // OROMO
|
|
317
|
+
0x00000000, // RUNDI
|
|
318
|
+
0x00000000, // SAMOAN
|
|
319
|
+
0x00000000, // SANGO
|
|
320
|
+
0x344197d3, // SANSKRIT SANSKRIT.11 MARATHI.4 BIHARI.1
|
|
321
|
+
0x00000000, // SISWANT
|
|
322
|
+
0x00000000, // TSONGA
|
|
323
|
+
0x00000000, // TSWANA
|
|
324
|
+
0x00000000, // VOLAPUK
|
|
325
|
+
0x00000000, // ZHUANG
|
|
326
|
+
0x00000000, // KHASI
|
|
327
|
+
0x00000000, // SCOTS
|
|
328
|
+
0x00000000, // GANDA
|
|
329
|
+
0x00000000, // MANX
|
|
330
|
+
0x00000000, // MONTENEGRIN
|
|
331
|
+
// Add new language hints just before here (just use 0x00000000)
|
|
332
|
+
};
|
|
333
|
+
|
|
334
|
+
COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES,  // Compile-time check: the table must have exactly NUM_LANGUAGES entries
|
|
335
|
+
kLanguageHintProbs_has_incorrect_size);
|
|
336
|
+
|
|
337
|
+
//
|
|
338
|
+
// Generated by dsites 2008.07.07 from 10% of Base
|
|
339
|
+
//
|
|
340
|
+
|
|
341
|
+
typedef struct {  // One hint-table row: a fixed-width lookup key plus its packed language probabilities
|
|
342
|
+
char key[4];  // Exactly four bytes with no NUL terminator; kTLDHintProbs pads shorter keys with '_' (0x5f), e.g. "ac__"
|
|
343
|
+
uint32 probs;  // Up to three packed language probs, packed like the uint32 entries of kEncodingHintProbs and kLanguageHintProbs
|
|
344
|
+
} HintEntry;
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
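// Key is the massaged TLD (padded with '_' (0x5f) to exactly four bytes), followed by up to three packed language probs.
// NOTE(review): entries appear to be kept in ascending key order; confirm whether lookup relies on that before inserting.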
|
|
348
|
+
// Hand-removed 4 items dsites 2008.07.15
|
|
349
|
+
static const int kTLDHintProbsSize = 201;
|
|
350
|
+
static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = { // MaxRange 12
|
|
351
|
+
{{0x61,0x63,0x5f,0x5f}, 0x0a000945}, // ac__ Japanese.12 Korean.4
|
|
352
|
+
{{0x61,0x64,0x5f,0x5f}, 0x00003842}, // ad__ CATALAN.12
|
|
353
|
+
{{0x61,0x65,0x5f,0x5f}, 0x00003742}, // ae__ ARABIC.12
|
|
354
|
+
{{0x61,0x66,0x5f,0x5f}, 0x4e00763d}, // af__ PASHTO.11 PERSIAN.7
|
|
355
|
+
{{0x61,0x67,0x5f,0x5f}, 0x09000643}, // ag__ GERMAN.12 Japanese.2
|
|
356
|
+
{{0x61,0x69,0x5f,0x5f}, 0x0c180938}, // ai__ Japanese.11 HUNGARIAN.7 POLISH.2
|
|
357
|
+
{{0x61,0x6c,0x5f,0x5f}, 0x00002e42}, // al__ ALBANIAN.12
|
|
358
|
+
{{0x61,0x6e,0x5f,0x5f}, 0x6e00033d}, // an__ DUTCH.11 LIMBU.7
|
|
359
|
+
{{0x61,0x6f,0x5f,0x5f}, 0x05000d42}, // ao__ PORTUGUESE.12 FRENCH.1
|
|
360
|
+
{{0x61,0x71,0x5f,0x5f}, 0x05000f29}, // aq__ SPANISH.9 FRENCH.6
|
|
361
|
+
{{0x61,0x72,0x5f,0x5f}, 0x00000f42}, // ar__ SPANISH.12
|
|
362
|
+
{{0x61,0x73,0x5f,0x5f}, 0x0f120bcd}, // as__ NORWEGIAN.10 CZECH.6 SPANISH.5
|
|
363
|
+
{{0x61,0x74,0x5f,0x5f}, 0x00000642}, // at__ GERMAN.12
|
|
364
|
+
{{0x61,0x77,0x5f,0x5f}, 0x0f000345}, // aw__ DUTCH.12 SPANISH.4
|
|
365
|
+
{{0x61,0x78,0x5f,0x5f}, 0x00001042}, // ax__ SWEDISH.12
|
|
366
|
+
{{0x61,0x7a,0x5f,0x5f}, 0x00004b42}, // az__ AZERBAIJANI.12
|
|
367
|
+
{{0x62,0x61,0x5f,0x5f}, 0x00001d42}, // ba__ CROATIAN.12
|
|
368
|
+
{{0x62,0x62,0x5f,0x5f}, 0x00002842}, // bb__ LATIN.12
|
|
369
|
+
{{0x62,0x64,0x5f,0x5f}, 0x00002642}, // bd__ BENGALI.12
|
|
370
|
+
{{0x62,0x65,0x5f,0x5f}, 0x05000335}, // be__ DUTCH.10 FRENCH.9
|
|
371
|
+
{{0x62,0x66,0x5f,0x5f}, 0x00000542}, // bf__ FRENCH.12
|
|
372
|
+
{{0x62,0x67,0x5f,0x5f}, 0x00001c42}, // bg__ BULGARIAN.12
|
|
373
|
+
{{0x62,0x68,0x5f,0x5f}, 0x00003742}, // bh__ ARABIC.12
|
|
374
|
+
{{0x62,0x69,0x5f,0x5f}, 0x0f00053f}, // bi__ FRENCH.11 SPANISH.9
|
|
375
|
+
{{0x62,0x6a,0x5f,0x5f}, 0x00000542}, // bj__ FRENCH.12
|
|
376
|
+
{{0x62,0x6d,0x5f,0x5f}, 0x98043929}, // bm__ ESPERANTO.9 FINNISH.8 SISWANT.6
|
|
377
|
+
{{0x62,0x6e,0x5f,0x5f}, 0x00002942}, // bn__ MALAY.12
|
|
378
|
+
{{0x62,0x6f,0x5f,0x5f}, 0x00000f42}, // bo__ SPANISH.12
|
|
379
|
+
{{0x62,0x72,0x5f,0x5f}, 0x00000d42}, // br__ PORTUGUESE.12
|
|
380
|
+
{{0x62,0x74,0x5f,0x5f}, 0x00008842}, // bt__ DZONGKHA.12
|
|
381
|
+
{{0x62,0x77,0x5f,0x5f}, 0x06059ac4}, // bw__ TSWANA.9 FRENCH.6 GERMAN.5
|
|
382
|
+
{{0x62,0x79,0x5f,0x5f}, 0x00003024}, // by__ BELARUSIAN.9
|
|
383
|
+
{{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924}, // bz__ Japanese.9 Korean.5 SPANISH.1
|
|
384
|
+
{{0x63,0x61,0x5f,0x5f}, 0x00000542}, // ca__ FRENCH.12
|
|
385
|
+
{{0x63,0x61,0x74,0x5f}, 0x00003842}, // cat_ CATALAN.12
|
|
386
|
+
{{0x63,0x64,0x5f,0x5f}, 0x06051224}, // cd__ CZECH.9 FRENCH.5 GERMAN.1
|
|
387
|
+
{{0x63,0x66,0x5f,0x5f}, 0x00000542}, // cf__ FRENCH.12
|
|
388
|
+
{{0x63,0x67,0x5f,0x5f}, 0x00000542}, // cg__ FRENCH.12
|
|
389
|
+
{{0x63,0x68,0x5f,0x5f}, 0x08050638}, // ch__ GERMAN.11 FRENCH.7 ITALIAN.2
|
|
390
|
+
{{0x63,0x69,0x5f,0x5f}, 0x00000542}, // ci__ FRENCH.12
|
|
391
|
+
{{0x63,0x6c,0x5f,0x5f}, 0x00000f42}, // cl__ SPANISH.12
|
|
392
|
+
{{0x63,0x6d,0x5f,0x5f}, 0x00000542}, // cm__ FRENCH.12
|
|
393
|
+
{{0x63,0x6e,0x5f,0x5f}, 0x00001142}, // cn__ Chinese.12
|
|
394
|
+
{{0x63,0x6f,0x5f,0x5f}, 0x00000f42}, // co__ SPANISH.12
|
|
395
|
+
// {{0x63,0x6f,0x6f,0x70}, 0x0f0509cd}, // coop Japanese.10 FRENCH.6 SPANISH.5
|
|
396
|
+
{{0x63,0x72,0x5f,0x5f}, 0x00000f42}, // cr__ SPANISH.12
|
|
397
|
+
{{0x63,0x75,0x5f,0x5f}, 0x00000f42}, // cu__ SPANISH.12
|
|
398
|
+
{{0x63,0x76,0x5f,0x5f}, 0x00000d42}, // cv__ PORTUGUESE.12
|
|
399
|
+
{{0x63,0x78,0x5f,0x5f}, 0x223a091f}, // cx__ Japanese.8 BASQUE.6 TURKISH.4
|
|
400
|
+
{{0x63,0x79,0x5f,0x5f}, 0x150622ba}, // cy__ TURKISH.8 GERMAN.4 LATVIAN.3
|
|
401
|
+
{{0x63,0x7a,0x5f,0x5f}, 0x00001242}, // cz__ CZECH.12
|
|
402
|
+
{{0x64,0x65,0x5f,0x5f}, 0x00000642}, // de__ GERMAN.12
|
|
403
|
+
{{0x64,0x6b,0x5f,0x5f}, 0x00000242}, // dk__ DANISH.12
|
|
404
|
+
{{0x64,0x6f,0x5f,0x5f}, 0x21000f42}, // do__ SPANISH.12 TAGALOG.1
|
|
405
|
+
{{0x64,0x7a,0x5f,0x5f}, 0x37000535}, // dz__ FRENCH.10 ARABIC.9
|
|
406
|
+
{{0x65,0x63,0x5f,0x5f}, 0x00000f42}, // ec__ SPANISH.12
|
|
407
|
+
// {{0x65,0x64,0x75,0x5f}, 0x2e0f3873}, // edu_ CATALAN.9 SPANISH.7 ALBANIAN.2
|
|
408
|
+
{{0x65,0x65,0x5f,0x5f}, 0x00001942}, // ee__ ESTONIAN.12
|
|
409
|
+
{{0x65,0x67,0x5f,0x5f}, 0x05003742}, // eg__ ARABIC.12 FRENCH.1
|
|
410
|
+
{{0x65,0x72,0x5f,0x5f}, 0x00000b42}, // er__ NORWEGIAN.12
|
|
411
|
+
{{0x65,0x73,0x5f,0x5f}, 0x38200fd4}, // es__ SPANISH.11 GALICIAN.4 CATALAN.2
|
|
412
|
+
{{0x65,0x74,0x5f,0x5f}, 0x39004a39}, // et__ AMHARIC.11 ESPERANTO.3
|
|
413
|
+
{{0x66,0x69,0x5f,0x5f}, 0x10000444}, // fi__ FINNISH.12 SWEDISH.3
|
|
414
|
+
{{0x66,0x6a,0x5f,0x5f}, 0x050489e0}, // fj__ FIJIAN.12 FINNISH.5 FRENCH.3
|
|
415
|
+
{{0x66,0x6f,0x5f,0x5f}, 0x00004742}, // fo__ FAROESE.12
|
|
416
|
+
{{0x66,0x72,0x5f,0x5f}, 0x00000542}, // fr__ FRENCH.12
|
|
417
|
+
{{0x67,0x61,0x5f,0x5f}, 0x00000542}, // ga__ FRENCH.12
|
|
418
|
+
{{0x67,0x64,0x5f,0x5f}, 0x061d05d5}, // gd__ FRENCH.11 CROATIAN.5 GERMAN.3
|
|
419
|
+
{{0x67,0x65,0x5f,0x5f}, 0x00004c2d}, // ge__ GEORGIAN.10
|
|
420
|
+
{{0x67,0x66,0x5f,0x5f}, 0x00000542}, // gf__ FRENCH.12
|
|
421
|
+
{{0x67,0x67,0x5f,0x5f}, 0x06002244}, // gg__ TURKISH.12 GERMAN.3
|
|
422
|
+
{{0x67,0x68,0x5f,0x5f}, 0x05000436}, // gh__ FINNISH.10 FRENCH.10
|
|
423
|
+
{{0x67,0x69,0x5f,0x5f}, 0x0f0538ce}, // gi__ CATALAN.10 FRENCH.7 SPANISH.6
|
|
424
|
+
{{0x67,0x6c,0x5f,0x5f}, 0x398a0238}, // gl__ DANISH.11 GREENLANDIC.7 ESPERANTO.2
|
|
425
|
+
{{0x67,0x6d,0x5f,0x5f}, 0x0600043e}, // gm__ FINNISH.11 GERMAN.8
|
|
426
|
+
{{0x67,0x6e,0x5f,0x5f}, 0x00000542}, // gn__ FRENCH.12
|
|
427
|
+
// {{0x67,0x6f,0x76,0x5f}, 0x05000f25}, // gov_ SPANISH.9 FRENCH.2
|
|
428
|
+
{{0x67,0x70,0x5f,0x5f}, 0x00000542}, // gp__ FRENCH.12
|
|
429
|
+
{{0x67,0x71,0x5f,0x5f}, 0x0f000547}, // gq__ FRENCH.12 SPANISH.6
|
|
430
|
+
{{0x67,0x73,0x5f,0x5f}, 0x00000942}, // gs__ Japanese.12
|
|
431
|
+
{{0x67,0x74,0x5f,0x5f}, 0x00000f42}, // gt__ SPANISH.12
|
|
432
|
+
{{0x68,0x6b,0x5f,0x5f}, 0x11004643}, // hk__ ChineseT.12 Chinese.2
|
|
433
|
+
{{0x68,0x6d,0x5f,0x5f}, 0x4606092e}, // hm__ Japanese.10 GERMAN.6 ChineseT.2
|
|
434
|
+
{{0x68,0x6e,0x5f,0x5f}, 0x00000f42}, // hn__ SPANISH.12
|
|
435
|
+
{{0x68,0x72,0x5f,0x5f}, 0x00001d42}, // hr__ CROATIAN.12
|
|
436
|
+
{{0x68,0x74,0x5f,0x5f}, 0x0f000542}, // ht__ FRENCH.12 SPANISH.1
|
|
437
|
+
{{0x68,0x75,0x5f,0x5f}, 0x00001842}, // hu__ HUNGARIAN.12
|
|
438
|
+
{{0x69,0x64,0x5f,0x5f}, 0x00002742}, // id__ INDONESIAN.12
|
|
439
|
+
{{0x69,0x65,0x5f,0x5f}, 0x050c1f24}, // ie__ IRISH.9 POLISH.5 FRENCH.1
|
|
440
|
+
{{0x69,0x6c,0x5f,0x5f}, 0x00000742}, // il__ HEBREW.12
|
|
441
|
+
{{0x69,0x6e,0x74,0x5f}, 0x0f060574}, // int_ FRENCH.9 GERMAN.8 SPANISH.3
|
|
442
|
+
{{0x69,0x6f,0x5f,0x5f}, 0x11090fd5}, // io__ SPANISH.11 Japanese.5 Chinese.3
|
|
443
|
+
{{0x69,0x71,0x5f,0x5f}, 0x60003744}, // iq__ ARABIC.12 KURDISH.3
|
|
444
|
+
{{0x69,0x72,0x5f,0x5f}, 0x00004e42}, // ir__ PERSIAN.12
|
|
445
|
+
{{0x69,0x73,0x5f,0x5f}, 0x00001442}, // is__ ICELANDIC.12
|
|
446
|
+
{{0x69,0x74,0x5f,0x5f}, 0x00000842}, // it__ ITALIAN.12
|
|
447
|
+
{{0x6a,0x65,0x5f,0x5f}, 0x29050328}, // je__ DUTCH.9 FRENCH.7 MALAY.5
|
|
448
|
+
{{0x6a,0x6d,0x5f,0x5f}, 0x040f0576}, // jm__ FRENCH.9 SPANISH.8 FINNISH.5
|
|
449
|
+
{{0x6a,0x6f,0x5f,0x5f}, 0x00003742}, // jo__ ARABIC.12
|
|
450
|
+
// {{0x6a,0x6f,0x62,0x73}, 0x0f060329}, // jobs DUTCH.9 GERMAN.8 SPANISH.6
|
|
451
|
+
{{0x6a,0x70,0x5f,0x5f}, 0x00000942}, // jp__ Japanese.12
|
|
452
|
+
{{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3}, // ke__ SWAHILI.9 SPANISH.5 FINNISH.4
|
|
453
|
+
{{0x6b,0x69,0x5f,0x5f}, 0x04000643}, // ki__ GERMAN.12 FINNISH.2
|
|
454
|
+
{{0x6b,0x6d,0x5f,0x5f}, 0x00000542}, // km__ FRENCH.12
|
|
455
|
+
{{0x6b,0x70,0x5f,0x5f}, 0x00000a42}, // kp__ Korean.12
|
|
456
|
+
{{0x6b,0x72,0x5f,0x5f}, 0x00000a42}, // kr__ Korean.12
|
|
457
|
+
{{0x6b,0x77,0x5f,0x5f}, 0x00003742}, // kw__ ARABIC.12
|
|
458
|
+
{{0x6b,0x79,0x5f,0x5f}, 0x0500083f}, // ky__ ITALIAN.11 FRENCH.9
|
|
459
|
+
{{0x6b,0x7a,0x5f,0x5f}, 0x0000732d}, // kz__ KAZAKH.10
|
|
460
|
+
{{0x6c,0x62,0x5f,0x5f}, 0x05003747}, // lb__ ARABIC.12 FRENCH.6
|
|
461
|
+
{{0x6c,0x63,0x5f,0x5f}, 0x09000645}, // lc__ GERMAN.12 Japanese.4
|
|
462
|
+
{{0x6c,0x69,0x5f,0x5f}, 0x1600063d}, // li__ GERMAN.11 LITHUANIAN.7
|
|
463
|
+
{{0x6c,0x73,0x5f,0x5f}, 0x00005742}, // ls__ SESOTHO.12
|
|
464
|
+
{{0x6c,0x74,0x5f,0x5f}, 0x00001642}, // lt__ LITHUANIAN.12
|
|
465
|
+
{{0x6c,0x75,0x5f,0x5f}, 0x0600053d}, // lu__ FRENCH.11 GERMAN.7
|
|
466
|
+
{{0x6c,0x76,0x5f,0x5f}, 0x00001542}, // lv__ LATVIAN.12
|
|
467
|
+
{{0x6c,0x79,0x5f,0x5f}, 0x05003744}, // ly__ ARABIC.12 FRENCH.3
|
|
468
|
+
{{0x6d,0x61,0x5f,0x5f}, 0x3700053d}, // ma__ FRENCH.11 ARABIC.7
|
|
469
|
+
{{0x6d,0x63,0x5f,0x5f}, 0x00000542}, // mc__ FRENCH.12
|
|
470
|
+
{{0x6d,0x64,0x5f,0x5f}, 0x00001724}, // md__ ROMANIAN.9
|
|
471
|
+
{{0x6d,0x65,0x5f,0x5f}, 0x00001d42}, // me__ CROATIAN.12
|
|
472
|
+
{{0x6d,0x67,0x5f,0x5f}, 0x00000542}, // mg__ FRENCH.12
|
|
473
|
+
{{0x6d,0x6b,0x5f,0x5f}, 0x1c002543}, // mk__ MACEDONIAN.12 BULGARIAN.2
|
|
474
|
+
{{0x6d,0x6c,0x5f,0x5f}, 0x00000542}, // ml__ FRENCH.12
|
|
475
|
+
{{0x6d,0x6e,0x5f,0x5f}, 0x00006142}, // mn__ MONGOLIAN.12
|
|
476
|
+
{{0x6d,0x6f,0x5f,0x5f}, 0x110d4631}, // mo__ ChineseT.10 PORTUGUESE.8 Chinese.5
|
|
477
|
+
{{0x6d,0x71,0x5f,0x5f}, 0x00000542}, // mq__ FRENCH.12
|
|
478
|
+
{{0x6d,0x72,0x5f,0x5f}, 0x37000535}, // mr__ FRENCH.10 ARABIC.9
|
|
479
|
+
{{0x6d,0x73,0x5f,0x5f}, 0x090f06d5}, // ms__ GERMAN.11 SPANISH.5 Japanese.3
|
|
480
|
+
{{0x6d,0x74,0x5f,0x5f}, 0x00004242}, // mt__ MALTESE.12
|
|
481
|
+
{{0x6d,0x75,0x5f,0x5f}, 0x05000934}, // mu__ Japanese.10 FRENCH.8
|
|
482
|
+
{{0x6d,0x76,0x5f,0x5f}, 0x28000436}, // mv__ FINNISH.10 LATIN.10
|
|
483
|
+
{{0x6d,0x77,0x5f,0x5f}, 0x0611092a}, // mw__ Japanese.9 Chinese.8 GERMAN.7
|
|
484
|
+
{{0x6d,0x78,0x5f,0x5f}, 0x00000f42}, // mx__ SPANISH.12
|
|
485
|
+
{{0x6d,0x79,0x5f,0x5f}, 0x00002942}, // my__ MALAY.12
|
|
486
|
+
{{0x6d,0x7a,0x5f,0x5f}, 0x00000d42}, // mz__ PORTUGUESE.12
|
|
487
|
+
{{0x6e,0x61,0x5f,0x5f}, 0x06006644}, // na__ AFRIKAANS.12 GERMAN.3
|
|
488
|
+
{{0x6e,0x63,0x5f,0x5f}, 0x00000542}, // nc__ FRENCH.12
|
|
489
|
+
{{0x6e,0x65,0x5f,0x5f}, 0x8b000542}, // ne__ FRENCH.12 HAUSA.1
|
|
490
|
+
{{0x6e,0x66,0x5f,0x5f}, 0x00000542}, // nf__ FRENCH.12
|
|
491
|
+
{{0x6e,0x69,0x5f,0x5f}, 0x00000f42}, // ni__ SPANISH.12
|
|
492
|
+
{{0x6e,0x6c,0x5f,0x5f}, 0x00000342}, // nl__ DUTCH.12
|
|
493
|
+
{{0x6e,0x6f,0x5f,0x5f}, 0x51000b43}, // no__ NORWEGIAN.12 NORWEGIAN_N.2
|
|
494
|
+
{{0x6e,0x75,0x5f,0x5f}, 0x0300103b}, // nu__ SWEDISH.11 DUTCH.5
|
|
495
|
+
{{0x6f,0x6d,0x5f,0x5f}, 0x00003742}, // om__ ARABIC.12
|
|
496
|
+
{{0x70,0x61,0x5f,0x5f}, 0x00000f42}, // pa__ SPANISH.12
|
|
497
|
+
{{0x70,0x65,0x5f,0x5f}, 0x00000f42}, // pe__ SPANISH.12
|
|
498
|
+
{{0x70,0x66,0x5f,0x5f}, 0x00000542}, // pf__ FRENCH.12
|
|
499
|
+
{{0x70,0x67,0x5f,0x5f}, 0x00000f24}, // pg__ SPANISH.9
|
|
500
|
+
{{0x70,0x68,0x5f,0x5f}, 0x00002142}, // ph__ TAGALOG.12
|
|
501
|
+
{{0x70,0x6b,0x5f,0x5f}, 0x00003342}, // pk__ URDU.12
|
|
502
|
+
{{0x70,0x6c,0x5f,0x5f}, 0x30000c42}, // pl__ POLISH.12 BELARUSIAN.1
|
|
503
|
+
{{0x70,0x6e,0x5f,0x5f}, 0x04000644}, // pn__ GERMAN.12 FINNISH.3
|
|
504
|
+
{{0x70,0x72,0x5f,0x5f}, 0x00000f42}, // pr__ SPANISH.12
|
|
505
|
+
{{0x70,0x72,0x6f,0x5f}, 0x46050fd5}, // pro_ SPANISH.11 FRENCH.5 ChineseT.3
|
|
506
|
+
{{0x70,0x73,0x5f,0x5f}, 0x00003742}, // ps__ ARABIC.12
|
|
507
|
+
{{0x70,0x74,0x5f,0x5f}, 0x00000d42}, // pt__ PORTUGUESE.12
|
|
508
|
+
{{0x70,0x79,0x5f,0x5f}, 0x00000f42}, // py__ SPANISH.12
|
|
509
|
+
{{0x71,0x61,0x5f,0x5f}, 0x00003742}, // qa__ ARABIC.12
|
|
510
|
+
{{0x72,0x65,0x5f,0x5f}, 0x00000542}, // re__ FRENCH.12
|
|
511
|
+
{{0x72,0x6f,0x5f,0x5f}, 0x00001742}, // ro__ ROMANIAN.12
|
|
512
|
+
{{0x72,0x73,0x5f,0x5f}, 0x00001d42}, // rs__ CROATIAN.12
|
|
513
|
+
{{0x72,0x77,0x5f,0x5f}, 0x9000053e}, // rw__ FRENCH.11 KINYARWANDA.8
|
|
514
|
+
{{0x73,0x61,0x5f,0x5f}, 0x00003742}, // sa__ ARABIC.12
|
|
515
|
+
{{0x73,0x62,0x5f,0x5f}, 0x00000442}, // sb__ FINNISH.12
|
|
516
|
+
{{0x73,0x63,0x5f,0x5f}, 0x060f092f}, // sc__ Japanese.10 SPANISH.7 GERMAN.3
|
|
517
|
+
{{0x73,0x64,0x5f,0x5f}, 0x00003742}, // sd__ ARABIC.12
|
|
518
|
+
{{0x73,0x65,0x5f,0x5f}, 0x00001042}, // se__ SWEDISH.12
|
|
519
|
+
{{0x73,0x69,0x5f,0x5f}, 0x00004042}, // si__ SLOVENIAN.12
|
|
520
|
+
{{0x73,0x6b,0x5f,0x5f}, 0x12004543}, // sk__ SLOVAK.12 CZECH.2
|
|
521
|
+
{{0x73,0x6d,0x5f,0x5f}, 0x00000842}, // sm__ ITALIAN.12
|
|
522
|
+
{{0x73,0x6e,0x5f,0x5f}, 0x00000542}, // sn__ FRENCH.12
|
|
523
|
+
{{0x73,0x72,0x5f,0x5f}, 0x03001e44}, // sr__ SERBIAN.12 DUTCH.3
|
|
524
|
+
{{0x73,0x76,0x5f,0x5f}, 0x00000f42}, // sv__ SPANISH.12
|
|
525
|
+
{{0x73,0x79,0x5f,0x5f}, 0x00003742}, // sy__ ARABIC.12
|
|
526
|
+
{{0x74,0x63,0x5f,0x5f}, 0x0a2206cd}, // tc__ GERMAN.10 TURKISH.6 Korean.5
|
|
527
|
+
{{0x74,0x66,0x5f,0x5f}, 0x00000642}, // tf__ GERMAN.12
|
|
528
|
+
{{0x74,0x67,0x5f,0x5f}, 0x00000542}, // tg__ FRENCH.12
|
|
529
|
+
{{0x74,0x68,0x5f,0x5f}, 0x9e0936c9}, // th__ THAI.10 Japanese.3 SCOTS.1
|
|
530
|
+
{{0x74,0x6a,0x5f,0x5f}, 0x00007924}, // tj__ TAJIK.9
|
|
531
|
+
{{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd}, // tl__ PORTUGUESE.10 SPANISH.6 GERMAN.5
|
|
532
|
+
{{0x74,0x6e,0x5f,0x5f}, 0x3700053e}, // tn__ FRENCH.11 ARABIC.8
|
|
533
|
+
{{0x74,0x6f,0x5f,0x5f}, 0x064609c5}, // to__ Japanese.9 ChineseT.7 GERMAN.6
|
|
534
|
+
{{0x74,0x70,0x5f,0x5f}, 0x06000944}, // tp__ Japanese.12 GERMAN.3
|
|
535
|
+
{{0x74,0x72,0x5f,0x5f}, 0x00002242}, // tr__ TURKISH.12
|
|
536
|
+
{{0x74,0x72,0x61,0x76}, 0x064509c3}, // trav Japanese.9 SLOVAK.5 GERMAN.4
|
|
537
|
+
{{0x74,0x74,0x5f,0x5f}, 0x0f00063e}, // tt__ GERMAN.11 SPANISH.8
|
|
538
|
+
{{0x74,0x77,0x5f,0x5f}, 0x00004642}, // tw__ ChineseT.12
|
|
539
|
+
{{0x74,0x7a,0x5f,0x5f}, 0x00003f42}, // tz__ SWAHILI.12
|
|
540
|
+
{{0x75,0x61,0x5f,0x5f}, 0x0000232d}, // ua__ UKRAINIAN.10
|
|
541
|
+
{{0x75,0x79,0x5f,0x5f}, 0x00000f42}, // uy__ SPANISH.12
|
|
542
|
+
{{0x75,0x7a,0x5f,0x5f}, 0x0000492d}, // uz__ UZBEK.10
|
|
543
|
+
{{0x76,0x61,0x5f,0x5f}, 0x060f0828}, // va__ ITALIAN.9 SPANISH.7 GERMAN.5
|
|
544
|
+
{{0x76,0x63,0x5f,0x5f}, 0x0d000939}, // vc__ Japanese.11 PORTUGUESE.3
|
|
545
|
+
{{0x76,0x65,0x5f,0x5f}, 0x00000f42}, // ve__ SPANISH.12
|
|
546
|
+
{{0x76,0x67,0x5f,0x5f}, 0x09000f43}, // vg__ SPANISH.12 Japanese.2
|
|
547
|
+
{{0x76,0x69,0x5f,0x5f}, 0x00002942}, // vi__ MALAY.12
|
|
548
|
+
{{0x76,0x6e,0x5f,0x5f}, 0x00004342}, // vn__ VIETNAMESE.12
|
|
549
|
+
{{0x76,0x75,0x5f,0x5f}, 0x00000642}, // vu__ GERMAN.12
|
|
550
|
+
{{0x77,0x73,0x5f,0x5f}, 0x4b0f0624}, // ws__ GERMAN.9 SPANISH.5 AZERBAIJANI.1
|
|
551
|
+
{{0x79,0x65,0x5f,0x5f}, 0x00003742}, // ye__ ARABIC.12
|
|
552
|
+
{{0x79,0x75,0x5f,0x5f}, 0x1e001d3d}, // yu__ CROATIAN.11 SERBIAN.7
|
|
553
|
+
{{0x7a,0x61,0x5f,0x5f}, 0x00006642}, // za__ AFRIKAANS.12
|
|
554
|
+
{{0x7a,0x6d,0x5f,0x5f}, 0x0b000435}, // zm__ FINNISH.10 NORWEGIAN.9
|
|
555
|
+
{{0x7a,0x77,0x5f,0x5f}, 0x3f00783e}, // zw__ SHONA.11 SWAHILI.8
|
|
556
|
+
};
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
// Statistically closest language, based on quadgram table
|
|
560
|
+
// Those that are far from other languges map to UNKNOWN_LANGUAGE
|
|
561
|
+
// Subscripted by Language
|
|
562
|
+
//
|
|
563
|
+
// From lang_correlation.txt and hand-edits
|
|
564
|
+
// sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
|
|
565
|
+
// (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
|
|
566
|
+
// \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
|
|
567
|
+
//
|
|
568
|
+
static const int kMinCorrPercent = 24; // Pick off how close you want
|
|
569
|
+
// 24 catches PERSIAN <== ARABIC
|
|
570
|
+
// but not SPANISH <== PORTUGESE
|
|
571
|
+
static Language Unknown = UNKNOWN_LANGUAGE;
|
|
572
|
+
|
|
573
|
+
// Subscripted by Language
|
|
574
|
+
static const Language kClosestAltLanguage[] = {
|
|
575
|
+
(28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH
|
|
576
|
+
(36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH
|
|
577
|
+
(31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH
|
|
578
|
+
(15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH
|
|
579
|
+
(11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH
|
|
580
|
+
(17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN
|
|
581
|
+
(27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW
|
|
582
|
+
(16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN
|
|
583
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese
|
|
584
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean
|
|
585
|
+
(41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN
|
|
586
|
+
( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH
|
|
587
|
+
(23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE
|
|
588
|
+
(33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN
|
|
589
|
+
(28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH
|
|
590
|
+
(17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH
|
|
591
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese
|
|
592
|
+
(42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH
|
|
593
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK
|
|
594
|
+
(35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC
|
|
595
|
+
( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN
|
|
596
|
+
( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN
|
|
597
|
+
( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN
|
|
598
|
+
( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN
|
|
599
|
+
(15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN
|
|
600
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore
|
|
601
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown
|
|
602
|
+
(33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN
|
|
603
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN
|
|
604
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN
|
|
605
|
+
(24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH
|
|
606
|
+
(28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN
|
|
607
|
+
( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG
|
|
608
|
+
(29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH
|
|
609
|
+
(28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN
|
|
610
|
+
(37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI
|
|
611
|
+
(29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN
|
|
612
|
+
(14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI
|
|
613
|
+
(46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN
|
|
614
|
+
( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN
|
|
615
|
+
(46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY
|
|
616
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM
|
|
617
|
+
( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH
|
|
618
|
+
( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI
|
|
619
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU
|
|
620
|
+
( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN
|
|
621
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL
|
|
622
|
+
(22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN
|
|
623
|
+
(15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE
|
|
624
|
+
(19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN
|
|
625
|
+
(27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU
|
|
626
|
+
(36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI
|
|
627
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI
|
|
628
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI
|
|
629
|
+
(24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC
|
|
630
|
+
(19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN
|
|
631
|
+
( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO
|
|
632
|
+
( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE
|
|
633
|
+
( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA
|
|
634
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA
|
|
635
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI
|
|
636
|
+
(24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC
|
|
637
|
+
( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI
|
|
638
|
+
(28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE, // SLOVENIAN
|
|
639
|
+
(37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI
|
|
640
|
+
( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE
|
|
641
|
+
( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE
|
|
642
|
+
(15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN
|
|
643
|
+
(42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK
|
|
644
|
+
// Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
|
|
645
|
+
(24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT
|
|
646
|
+
(35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE
|
|
647
|
+
(15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE
|
|
648
|
+
(17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK
|
|
649
|
+
( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC
|
|
650
|
+
(29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI
|
|
651
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN
|
|
652
|
+
( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA
|
|
653
|
+
(27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN
|
|
654
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN
|
|
655
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE
|
|
656
|
+
(41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N
|
|
657
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P
|
|
658
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B
|
|
659
|
+
(37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA
|
|
660
|
+
(37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU
|
|
661
|
+
( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI
|
|
662
|
+
(29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO
|
|
663
|
+
( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN
|
|
664
|
+
( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ
|
|
665
|
+
( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON
|
|
666
|
+
( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI
|
|
667
|
+
(27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH
|
|
668
|
+
(28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN
|
|
669
|
+
(12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI
|
|
670
|
+
( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR
|
|
671
|
+
(15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH
|
|
672
|
+
( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN
|
|
673
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN
|
|
674
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN
|
|
675
|
+
( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI
|
|
676
|
+
(10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE
|
|
677
|
+
(31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS
|
|
678
|
+
(17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH
|
|
679
|
+
( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE
|
|
680
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER
|
|
681
|
+
(45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN
|
|
682
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI
|
|
683
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE
|
|
684
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC
|
|
685
|
+
( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU
|
|
686
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA
|
|
687
|
+
(14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE
|
|
688
|
+
(16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN
|
|
689
|
+
( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE
|
|
690
|
+
( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH
|
|
691
|
+
( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA
|
|
692
|
+
(11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN
|
|
693
|
+
(19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO
|
|
694
|
+
( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA
|
|
695
|
+
( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA
|
|
696
|
+
(17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK
|
|
697
|
+
(13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR
|
|
698
|
+
(11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA
|
|
699
|
+
( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA
|
|
700
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED
|
|
701
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED
|
|
702
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
|
|
703
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER
|
|
704
|
+
( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI
|
|
705
|
+
( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF
|
|
706
|
+
( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN
|
|
707
|
+
( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR
|
|
708
|
+
( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA
|
|
709
|
+
(13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR
|
|
710
|
+
( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA
|
|
711
|
+
(45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA
|
|
712
|
+
( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN
|
|
713
|
+
( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC
|
|
714
|
+
( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA
|
|
715
|
+
( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE
|
|
716
|
+
( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK
|
|
717
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT
|
|
718
|
+
( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI
|
|
719
|
+
(30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA
|
|
720
|
+
( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY
|
|
721
|
+
(17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU
|
|
722
|
+
(12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO
|
|
723
|
+
(30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI
|
|
724
|
+
(11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN
|
|
725
|
+
( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO
|
|
726
|
+
(32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT
|
|
727
|
+
(16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT
|
|
728
|
+
( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA
|
|
729
|
+
(29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA
|
|
730
|
+
( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK
|
|
731
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG
|
|
732
|
+
( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI
|
|
733
|
+
(28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS
|
|
734
|
+
(15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA
|
|
735
|
+
( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX
|
|
736
|
+
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN
|
|
737
|
+
};
|
|
738
|
+
|
|
739
|
+
COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
|
|
740
|
+
kClosestAltLanguage_has_incorrect_size);
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
|
|
744
|
+
inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
|
|
745
|
+
inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
|
|
746
|
+
inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
|
|
747
|
+
inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
|
|
748
|
+
inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
|
|
749
|
+
inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
|
|
754
|
+
//------------------------------------------------------------------------------
|
|
755
|
+
// For --cld_html debugging output. Not thread safe
|
|
756
|
+
//------------------------------------------------------------------------------
|
|
757
|
+
static Language prior_lang = UNKNOWN_LANGUAGE;
|
|
758
|
+
static bool prior_unreliable = false;
|
|
759
|
+
|
|
760
|
+
//------------------------------------------------------------------------------
|
|
761
|
+
// End For --cld_html debugging output
|
|
762
|
+
//------------------------------------------------------------------------------
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
// Backscan to word boundary, returning how many bytes n to go back
|
|
766
|
+
// so that src - n is non-space ans src - n - 1 is space.
|
|
767
|
+
// If not found in kMaxSpaceScan bytes, return 0
|
|
768
|
+
int BackscanToSpace(const char* src, int limit) {
|
|
769
|
+
int n = 0;
|
|
770
|
+
limit = cld::minint(limit, kMaxSpaceScan);
|
|
771
|
+
while (n < limit) {
|
|
772
|
+
if (src[-n - 1] == ' ') {return n;} // We are at _X
|
|
773
|
+
++n;
|
|
774
|
+
}
|
|
775
|
+
return 0;
|
|
776
|
+
}
|
|
777
|
+
|
|
778
|
+
// Forwardscan to word boundary, returning how many bytes n to go forward
|
|
779
|
+
// so that src + n is non-space ans src + n - 1 is space.
|
|
780
|
+
// If not found in kMaxSpaceScan bytes, return 0
|
|
781
|
+
int ForwardscanToSpace(const char* src, int limit) {
|
|
782
|
+
int n = 0;
|
|
783
|
+
limit = cld::minint(limit, kMaxSpaceScan);
|
|
784
|
+
while (n < limit) {
|
|
785
|
+
if (src[n] == ' ') {return n + 1;} // We are at _X
|
|
786
|
+
++n;
|
|
787
|
+
}
|
|
788
|
+
return 0;
|
|
789
|
+
}
|
|
790
|
+
|
|
791
|
+
|
|
792
|
+
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
// all the time when done with a byte-based count. Sigh.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
//
//   isrc, srclen  text to scan (srclen in bytes)
//   hash          in/out: running 12-bit hash; updated before returning
//   tbl           in/out: prediction table indexed by the 12-bit hash
//
// Returns the number of *bytes* correctly predicted, increments by 1..4 for
// each correctly-predicted character.
//
// NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
//
int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
  int p_count = 0;
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + srclen;
  int local_hash = *hash;

  while (src < srclimit) {
    // Assemble the character in an unsigned value so that shifting a lead
    // byte >= 0x80 into the top bits cannot overflow a signed int.
    unsigned int uc = src[0];
    int incr = 1;

    // Pick up one char and length
    if (uc < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((uc & 0xe0) == 0xc0) {
      // Two-byte
      uc = (uc << 8) | src[1];
      incr = 2;
    } else if ((uc & 0xf0) == 0xe0) {
      // Three-byte
      uc = (uc << 16) | (src[1] << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      uc = (uc << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
      incr = 4;
    }
    src += incr;

    // The table stores ints; keep the same bit pattern as before.
    const int c = static_cast<int>(uc);

    int p = tbl[local_hash];            // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      p_count += incr;                  // Count *bytes* of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }

  *hash = local_hash;
  return p_count;
}
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
// Counts the ASCII spaces in src, working through it four bytes at a time
// (a little faster than one-at-a-time).
// Only the leading multiple-of-four bytes are examined: the 1..3 odd bytes at
// the end are NOT counted.
int CountSpaces4(const char* src, int src_len) {
  const int whole_quads_len = src_len & ~3;   // Round down to multiple of 4
  int spaces = 0;
  for (int base = 0; base < whole_quads_len; base += 4) {
    const char* quad = src + base;
    spaces += (quad[0] == ' ') + (quad[1] == ' ') +
              (quad[2] == ' ') + (quad[3] == ' ');
  }
  return spaces;
}
|
|
859
|
+
|
|
860
|
+
// Remove words of text that have more than half their letters predicted
|
|
861
|
+
// correctly by our cheap predictor, moving the remaining words in-place
|
|
862
|
+
// to the front of the input buffer.
|
|
863
|
+
//
|
|
864
|
+
// To allow running prediction across multiple chunks, caller passes in current
|
|
865
|
+
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
|
|
866
|
+
//
|
|
867
|
+
// Return the new, possibly-shorter length
|
|
868
|
+
//
|
|
869
|
+
// Result Buffer ALWAYS has leading space and trailing space space space NUL,
|
|
870
|
+
// if input does
|
|
871
|
+
//
|
|
872
|
+
int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) {
|
|
873
|
+
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
|
874
|
+
const uint8* srclimit = src + srclen;
|
|
875
|
+
char* dst = isrc;
|
|
876
|
+
int local_hash = *hash;
|
|
877
|
+
char* word_dst = dst; // Start of next word
|
|
878
|
+
int good_predict_bytes = 0;
|
|
879
|
+
int word_length_bytes = 0;
|
|
880
|
+
|
|
881
|
+
while (src < srclimit) {
|
|
882
|
+
int c = src[0];
|
|
883
|
+
int incr = 1;
|
|
884
|
+
*dst++ = c;
|
|
885
|
+
|
|
886
|
+
if (c == ' ') {
|
|
887
|
+
if ((good_predict_bytes * 2) > word_length_bytes) {
|
|
888
|
+
// Word is well-predicted: backup to start of this word
|
|
889
|
+
dst = word_dst;
|
|
890
|
+
if (FLAGS_cld_showme) {
|
|
891
|
+
// Mark the deletion point with period
|
|
892
|
+
// Don't repeat multiple periods
|
|
893
|
+
// Cannot mark with more bytes or may overwrite unseen input
|
|
894
|
+
if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
|
|
895
|
+
*dst++ = '.';
|
|
896
|
+
*dst++ = ' ';
|
|
897
|
+
}
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
word_dst = dst; // Start of next word
|
|
901
|
+
good_predict_bytes = 0;
|
|
902
|
+
word_length_bytes = 0;
|
|
903
|
+
}
|
|
904
|
+
|
|
905
|
+
// Pick up one char and length
|
|
906
|
+
if (c < 0xc0) {
|
|
907
|
+
// One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
|
|
908
|
+
// Do nothing more
|
|
909
|
+
} else if ((c & 0xe0) == 0xc0) {
|
|
910
|
+
// Two-byte
|
|
911
|
+
*dst++ = src[1];
|
|
912
|
+
c = (c << 8) | src[1];
|
|
913
|
+
incr = 2;
|
|
914
|
+
} else if ((c & 0xf0) == 0xe0) {
|
|
915
|
+
// Three-byte
|
|
916
|
+
*dst++ = src[1];
|
|
917
|
+
*dst++ = src[2];
|
|
918
|
+
c = (c << 16) | (src[1] << 8) | src[2];
|
|
919
|
+
incr = 3;
|
|
920
|
+
} else {
|
|
921
|
+
// Four-byte
|
|
922
|
+
*dst++ = src[1];
|
|
923
|
+
*dst++ = src[2];
|
|
924
|
+
*dst++ = src[3];
|
|
925
|
+
c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
|
|
926
|
+
incr = 4;
|
|
927
|
+
}
|
|
928
|
+
src += incr;
|
|
929
|
+
word_length_bytes += incr;
|
|
930
|
+
|
|
931
|
+
int p = tbl[local_hash]; // Prediction
|
|
932
|
+
tbl[local_hash] = c; // Update prediction
|
|
933
|
+
if (c == p) {
|
|
934
|
+
good_predict_bytes += incr; // Count good predictions
|
|
935
|
+
}
|
|
936
|
+
|
|
937
|
+
local_hash = ((local_hash << 4) ^ c) & 0xfff;
|
|
938
|
+
}
|
|
939
|
+
|
|
940
|
+
*hash = local_hash;
|
|
941
|
+
|
|
942
|
+
if ((dst - isrc) < (srclen - 3)) {
|
|
943
|
+
// Pad and make last char clean UTF-8 by putting following spaces
|
|
944
|
+
dst[0] = ' ';
|
|
945
|
+
dst[1] = ' ';
|
|
946
|
+
dst[2] = ' ';
|
|
947
|
+
dst[3] = '\0';
|
|
948
|
+
} else if ((dst - isrc) < srclen) {
|
|
949
|
+
// Make last char clean UTF-8 by putting following space off the end
|
|
950
|
+
dst[0] = ' ';
|
|
951
|
+
}
|
|
952
|
+
|
|
953
|
+
return static_cast<int>(dst - isrc);
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
|
|
957
|
+
// Remove portions of text that have a high density of spaces, or that are
|
|
958
|
+
// overly repetitive, squeezing the remaining text in-place to the front of the
|
|
959
|
+
// input buffer.
|
|
960
|
+
//
|
|
961
|
+
// Squeezing looks at density of space/prediced chars in fixed-size chunks,
|
|
962
|
+
// specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
|
|
963
|
+
//
|
|
964
|
+
// Return the new, possibly-shorter length
|
|
965
|
+
//
|
|
966
|
+
// Result Buffer ALWAYS has leading space and trailing space space space NUL,
|
|
967
|
+
// if input does
|
|
968
|
+
//
|
|
969
|
+
int CompactLangDetImpl::CheapSqueezeInplace(char* isrc,
|
|
970
|
+
int srclen,
|
|
971
|
+
int ichunksize) {
|
|
972
|
+
char* src = isrc;
|
|
973
|
+
char* dst = src;
|
|
974
|
+
char* srclimit = src + srclen;
|
|
975
|
+
bool skipping = false;
|
|
976
|
+
|
|
977
|
+
int hash = 0;
|
|
978
|
+
// Allocate local prediction table.
|
|
979
|
+
int* predict_tbl = new int[kPredictionTableSize];
|
|
980
|
+
memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
|
|
981
|
+
|
|
982
|
+
int chunksize = ichunksize;
|
|
983
|
+
if (chunksize == 0) {chunksize = kChunksizeDefault;}
|
|
984
|
+
int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
|
|
985
|
+
int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
|
|
986
|
+
|
|
987
|
+
while (src < srclimit) {
|
|
988
|
+
int remaining_bytes = srclimit - src;
|
|
989
|
+
int len = cld::minint(chunksize, remaining_bytes);
|
|
990
|
+
int space_n = CountSpaces4(src, len);
|
|
991
|
+
int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
|
|
992
|
+
if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
|
|
993
|
+
// Skip the text
|
|
994
|
+
if (!skipping) {
|
|
995
|
+
// Keeping-to-skipping transition; do it at a space
|
|
996
|
+
int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
|
|
997
|
+
dst -= n;
|
|
998
|
+
skipping = true;
|
|
999
|
+
if (FLAGS_cld_showme) {
|
|
1000
|
+
// Mark the deletion point with black square U+25A0
|
|
1001
|
+
*dst++ = 0xe2;
|
|
1002
|
+
*dst++ = 0x96;
|
|
1003
|
+
*dst++ = 0xa0;
|
|
1004
|
+
*dst++ = ' ';
|
|
1005
|
+
}
|
|
1006
|
+
if (dst == isrc) {
|
|
1007
|
+
// Force a leading space if the first chunk is deleted
|
|
1008
|
+
*dst++ = ' ';
|
|
1009
|
+
}
|
|
1010
|
+
}
|
|
1011
|
+
} else {
|
|
1012
|
+
// Keep the text
|
|
1013
|
+
if (skipping) {
|
|
1014
|
+
// Skipping-to-keeping transition; do it at a space
|
|
1015
|
+
int n = ForwardscanToSpace(src, len);
|
|
1016
|
+
src += n;
|
|
1017
|
+
remaining_bytes -= n; // Shrink remaining length
|
|
1018
|
+
len -= n;
|
|
1019
|
+
skipping = false;
|
|
1020
|
+
}
|
|
1021
|
+
// "len" can be negative in some cases
|
|
1022
|
+
if (len > 0) {
|
|
1023
|
+
memcpy(dst, src, len);
|
|
1024
|
+
dst += len;
|
|
1025
|
+
}
|
|
1026
|
+
}
|
|
1027
|
+
src += len;
|
|
1028
|
+
}
|
|
1029
|
+
|
|
1030
|
+
if ((dst - isrc) < (srclen - 3)) {
|
|
1031
|
+
// Pad and make last char clean UTF-8 by putting following spaces
|
|
1032
|
+
dst[0] = ' ';
|
|
1033
|
+
dst[1] = ' ';
|
|
1034
|
+
dst[2] = ' ';
|
|
1035
|
+
dst[3] = '\0';
|
|
1036
|
+
} else if ((dst - isrc) < srclen) {
|
|
1037
|
+
// Make last char clean UTF-8 by putting following space off the end
|
|
1038
|
+
dst[0] = ' ';
|
|
1039
|
+
}
|
|
1040
|
+
|
|
1041
|
+
// Deallocate local prediction table
|
|
1042
|
+
delete[] predict_tbl;
|
|
1043
|
+
return static_cast<int>(dst - isrc);
|
|
1044
|
+
}
|
|
1045
|
+
|
|
1046
|
+
// Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
|
|
1047
|
+
// About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
|
|
1048
|
+
// Just CountSpaces is about 340 MB/sec
|
|
1049
|
+
// Byte-only CountPredictedBytes is about 150 MB/sec
|
|
1050
|
+
// Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
|
|
1051
|
+
// Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
|
|
1052
|
+
// Unjammed byte-only both = 170 MB/sec
|
|
1053
|
+
// Jammed byte-only both = 120 MB/sec
|
|
1054
|
+
// Back to original w/slight updates, 110 MB/sec
|
|
1055
|
+
//
|
|
1056
|
+
bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) {
|
|
1057
|
+
// Don't trigger at all on short text
|
|
1058
|
+
if (srclen < testsize) {return false;}
|
|
1059
|
+
int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
|
|
1060
|
+
int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
|
|
1061
|
+
int hash = 0;
|
|
1062
|
+
// Allocate local prediction table.
|
|
1063
|
+
int* predict_tbl = new int[kPredictionTableSize];
|
|
1064
|
+
memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
|
|
1065
|
+
|
|
1066
|
+
bool retval = false;
|
|
1067
|
+
if ((CountSpaces4(src, testsize) >= space_thresh) ||
|
|
1068
|
+
(CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
|
|
1069
|
+
predict_thresh)) {
|
|
1070
|
+
retval = true;
|
|
1071
|
+
}
|
|
1072
|
+
// Deallocate local prediction table
|
|
1073
|
+
delete[] predict_tbl;
|
|
1074
|
+
return retval;
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
|
|
1078
|
+
|
|
1079
|
+
// Close pairs (correlation) language_enum/language_enum
|
|
1080
|
+
// id/ms (0.47) 38/40 [1]
|
|
1081
|
+
// bo/dz (0.46) 105/135 [2]
|
|
1082
|
+
// cz/sk (0.43) 17/68 [3]
|
|
1083
|
+
// no/nn (0.42) 10/80 [4]
|
|
1084
|
+
// hi/mr (0.38) 35/64 [5]
|
|
1085
|
+
// xh/zu (0.37) 83/84 [6]
|
|
1086
|
+
// Subscripted by packed language, gives 0 or a subscript in closepair
|
|
1087
|
+
// scoring array inside doc_tote
|
|
1088
|
+
static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = {
|
|
1089
|
+
0,
|
|
1090
|
+
0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
1091
|
+
0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
1092
|
+
5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0,
|
|
1093
|
+
0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
1094
|
+
0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
1095
|
+
// Add new language close-pair number just before here (just use 0)
|
|
1096
|
+
};
|
|
1097
|
+
|
|
1098
|
+
|
|
1099
|
+
// Delete any extended languages from doc_tote
|
|
1100
|
+
void RemoveExtendedLanguages(ToteWithReliability* doc_tote) {
|
|
1101
|
+
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
|
|
1102
|
+
if (cld::UnpackLanguage(doc_tote->Key(sub)) >= NUM_LANGUAGES) {
|
|
1103
|
+
// Effectively remove the extended language by setting key&score to zero
|
|
1104
|
+
if (FLAGS_dbgscore) {
|
|
1105
|
+
fprintf(stderr, "{-%s} ",
|
|
1106
|
+
ExtLanguageCode(cld::UnpackLanguage(doc_tote->Key(sub))));
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
// Delete entry
|
|
1110
|
+
doc_tote->SetKey(sub, 0);
|
|
1111
|
+
doc_tote->SetValue(sub, 0);
|
|
1112
|
+
doc_tote->SetReliability(sub, 0);
|
|
1113
|
+
}
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
|
|
1117
|
+
// A language whose reliable percent is below this is removed (or merged)
static const int kMinReliableKeepPercent = 41;

// For Tier3 languages, require a minimum number of bytes to be first-place
// lang; fewer bytes than this => not allowed as first
static const int kGoodFirstT3MinBytes = 24;
|
|
1121
|
+
|
|
1122
|
+
// Move bytes for unreliable langs to another lang or UNKNOWN
|
|
1123
|
+
// doc_tote is sorted, so cannot Add
|
|
1124
|
+
//
|
|
1125
|
+
// If both CHINESE and CHINESET are present and unreliable, do not delete both;
|
|
1126
|
+
// merge both into CHINESE.
|
|
1127
|
+
//
|
|
1128
|
+
//dsites 2009.03.19
|
|
1129
|
+
// we also want to remove Tier3 languages as the first lang if there is very
|
|
1130
|
+
// little text like ej1 ej2 ej3 ej4
|
|
1131
|
+
// maybe fold this back in earlier
|
|
1132
|
+
//
|
|
1133
|
+
void RemoveUnreliableLanguages(ToteWithReliability* doc_tote, bool do_remove_weak_matches) {
|
|
1134
|
+
// Prepass to merge some low-reliablility languages
|
|
1135
|
+
int total_bytes = 0;
|
|
1136
|
+
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
|
|
1137
|
+
int plang = doc_tote->Key(sub);
|
|
1138
|
+
if (plang == 0) {continue;} // Empty slot
|
|
1139
|
+
|
|
1140
|
+
Language lang = cld::UnpackLanguage(plang);
|
|
1141
|
+
int bytes = doc_tote->Value(sub);
|
|
1142
|
+
int reli = doc_tote->Reliability(sub);
|
|
1143
|
+
if (bytes == 0) {continue;} // Zero bytes
|
|
1144
|
+
total_bytes += bytes;
|
|
1145
|
+
|
|
1146
|
+
// Reliable percent is stored reliable score over stored bytecount
|
|
1147
|
+
int reliable_percent = reli / bytes;
|
|
1148
|
+
if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
|
|
1149
|
+
|
|
1150
|
+
// This language is too unreliable to keep, but we might merge it.
|
|
1151
|
+
Language altlang = UNKNOWN_LANGUAGE;
|
|
1152
|
+
if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];}
|
|
1153
|
+
if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative
|
|
1154
|
+
|
|
1155
|
+
// Look for alternative in doc_tote
|
|
1156
|
+
int altsub = doc_tote->Find(cld::PackLanguage(altlang));
|
|
1157
|
+
if (altsub < 0) {continue;} // No alternative text
|
|
1158
|
+
|
|
1159
|
+
int bytes2 = doc_tote->Value(altsub);
|
|
1160
|
+
int reli2 = doc_tote->Reliability(altsub);
|
|
1161
|
+
if (bytes2 == 0) {continue;} // Zero bytes
|
|
1162
|
+
|
|
1163
|
+
// Reliable percent is stored reliable score over stored bytecount
|
|
1164
|
+
int reliable_percent2 = reli2 / bytes2;
|
|
1165
|
+
|
|
1166
|
+
// Merge one language into the other. Break ties toward lower lang #
|
|
1167
|
+
int tosub = altsub;
|
|
1168
|
+
int fromsub = sub;
|
|
1169
|
+
bool into_lang = false;
|
|
1170
|
+
if ((reliable_percent2 < reliable_percent) ||
|
|
1171
|
+
((reliable_percent2 == reliable_percent) && (lang < altlang))) {
|
|
1172
|
+
tosub = sub;
|
|
1173
|
+
fromsub = altsub;
|
|
1174
|
+
into_lang = true;
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
// Make sure reliability doesn't drop and is enough to avoid delete
|
|
1178
|
+
int newpercent = cld::maxint(reliable_percent, reliable_percent2);
|
|
1179
|
+
newpercent = cld::maxint(newpercent, kMinReliableKeepPercent);
|
|
1180
|
+
int newbytes = bytes + bytes2;
|
|
1181
|
+
int newreli = newpercent * newbytes;
|
|
1182
|
+
|
|
1183
|
+
doc_tote->SetKey(fromsub, 0);
|
|
1184
|
+
doc_tote->SetValue(fromsub, 0);
|
|
1185
|
+
doc_tote->SetReliability(fromsub, 0);
|
|
1186
|
+
doc_tote->SetValue(tosub, newbytes);
|
|
1187
|
+
doc_tote->SetReliability(tosub, newreli);
|
|
1188
|
+
|
|
1189
|
+
// Show fate of unreliable languages if at least 10 bytes
|
|
1190
|
+
if (FLAGS_cld_html /*&& (newpercent >= 10)*/ && (newbytes >= 10)) {
|
|
1191
|
+
if (into_lang) {
|
|
1192
|
+
fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
|
|
1193
|
+
ExtLanguageCode(altlang), reliable_percent2, bytes2,
|
|
1194
|
+
ExtLanguageCode(lang));
|
|
1195
|
+
} else {
|
|
1196
|
+
fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
|
|
1197
|
+
ExtLanguageCode(lang), reliable_percent, bytes,
|
|
1198
|
+
ExtLanguageCode(altlang));
|
|
1199
|
+
}
|
|
1200
|
+
}
|
|
1201
|
+
}
|
|
1202
|
+
|
|
1203
|
+
|
|
1204
|
+
if (do_remove_weak_matches) {
|
|
1205
|
+
// Pass to delete any remaining unreliable languages
|
|
1206
|
+
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
|
|
1207
|
+
int plang = doc_tote->Key(sub);
|
|
1208
|
+
if (plang == 0) {continue;} // Empty slot
|
|
1209
|
+
|
|
1210
|
+
Language lang = cld::UnpackLanguage(plang);
|
|
1211
|
+
int bytes = doc_tote->Value(sub);
|
|
1212
|
+
int reli = doc_tote->Reliability(sub);
|
|
1213
|
+
if (bytes == 0) {continue;} // Zero bytes
|
|
1214
|
+
|
|
1215
|
+
bool is_tier3 = (cld::kIsPackedTop40[plang] == 0);
|
|
1216
|
+
if (is_tier3 &&
|
|
1217
|
+
(bytes < kGoodFirstT3MinBytes) &&
|
|
1218
|
+
(bytes < total_bytes)) {
|
|
1219
|
+
reli = 0; // Too-short tier3
|
|
1220
|
+
}
|
|
1221
|
+
|
|
1222
|
+
// Reliable percent is stored as reliable score over stored bytecount
|
|
1223
|
+
int reliable_percent = reli / bytes;
|
|
1224
|
+
if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
|
|
1225
|
+
|
|
1226
|
+
// Delete unreliable entry
|
|
1227
|
+
doc_tote->SetKey(sub, 0);
|
|
1228
|
+
doc_tote->SetValue(sub, 0);
|
|
1229
|
+
doc_tote->SetReliability(sub, 0);
|
|
1230
|
+
|
|
1231
|
+
// Show fate of unreliable languages if at least 10 bytes
|
|
1232
|
+
if (FLAGS_cld_html /*&& (reliable_percent >= 10)*/ && (bytes >= 10)) {
|
|
1233
|
+
fprintf(stderr, "{Unreli %s.%d(%dB)} ",
|
|
1234
|
+
ExtLanguageCode(lang), reliable_percent, bytes);
|
|
1235
|
+
}
|
|
1236
|
+
}
|
|
1237
|
+
}
|
|
1238
|
+
|
|
1239
|
+
if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");}
|
|
1240
|
+
}
|
|
1241
|
+
|
|
1242
|
+
|
|
1243
|
+
// Move less likely byte count to more likely for close pairs of languages
|
|
1244
|
+
void RefineScoredClosePairs(ToteWithReliability* doc_tote) {
|
|
1245
|
+
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
|
|
1246
|
+
int close_packedlang = doc_tote->Key(sub);
|
|
1247
|
+
int subscr = kClosePair[close_packedlang];
|
|
1248
|
+
if (subscr == 0) {continue;}
|
|
1249
|
+
|
|
1250
|
+
// We have a close pair language -- if the other one is also scored and the
|
|
1251
|
+
// longword score differs enough, put all our eggs into one basket
|
|
1252
|
+
|
|
1253
|
+
// Nonzero longword score: Go look for the other of this pair
|
|
1254
|
+
for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
|
|
1255
|
+
if (kClosePair[doc_tote->Key(sub2)] == subscr) {
|
|
1256
|
+
// We have a matching pair
|
|
1257
|
+
int close_packedlang2 = doc_tote->Key(sub2);
|
|
1258
|
+
|
|
1259
|
+
// Move all the text bytes from lower byte-count to higher one
|
|
1260
|
+
int from_sub, to_sub;
|
|
1261
|
+
Language from_lang, to_lang;
|
|
1262
|
+
if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
|
|
1263
|
+
from_sub = sub;
|
|
1264
|
+
to_sub = sub2;
|
|
1265
|
+
from_lang = cld::UnpackLanguage(close_packedlang);
|
|
1266
|
+
to_lang = cld::UnpackLanguage(close_packedlang2);
|
|
1267
|
+
} else {
|
|
1268
|
+
from_sub = sub2;
|
|
1269
|
+
to_sub = sub;
|
|
1270
|
+
from_lang = cld::UnpackLanguage(close_packedlang2);
|
|
1271
|
+
to_lang = cld::UnpackLanguage(close_packedlang);
|
|
1272
|
+
}
|
|
1273
|
+
|
|
1274
|
+
// Move all the bytes smaller => larger of the pair
|
|
1275
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
|
1276
|
+
// Show fate of closepair language
|
|
1277
|
+
int val = doc_tote->Value(from_sub);
|
|
1278
|
+
int reli = doc_tote->Reliability(from_sub);
|
|
1279
|
+
int reliable_percent = reli / (val ? val : 1); // avoid zdiv
|
|
1280
|
+
fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ",
|
|
1281
|
+
ExtLanguageCode(from_lang),
|
|
1282
|
+
reliable_percent,
|
|
1283
|
+
doc_tote->Value(from_sub),
|
|
1284
|
+
ExtLanguageCode(to_lang));
|
|
1285
|
+
}
|
|
1286
|
+
int sum = doc_tote->Value(to_sub) + doc_tote->Value(from_sub);
|
|
1287
|
+
doc_tote->SetValue(to_sub, sum);
|
|
1288
|
+
doc_tote->SetReliability(to_sub, 100 * sum);
|
|
1289
|
+
|
|
1290
|
+
// Delete old entry
|
|
1291
|
+
doc_tote->SetKey(from_sub, 0);
|
|
1292
|
+
doc_tote->SetValue(from_sub, 0);
|
|
1293
|
+
doc_tote->SetReliability(from_sub, 0);
|
|
1294
|
+
|
|
1295
|
+
break; // Exit inner for sub2 loop
|
|
1296
|
+
}
|
|
1297
|
+
} // End for sub2
|
|
1298
|
+
} // End for sub
|
|
1299
|
+
}
|
|
1300
|
+
|
|
1301
|
+
|
|
1302
|
+
// Add the per-language hint boost to every slot of chunk_tote.
// lang_hint_boost is indexed by the slot's key (packed language). The boost
// is scaled by tote_grams/8, with tote_grams capped at 8.
void ApplyLanguageHints(Tote* chunk_tote, int tote_grams,
                        uint8* lang_hint_boost) {
  // Need 8 quad/unigrams to give full hint boost, else derate linearly
  const int grams = (tote_grams > 8) ? 8 : tote_grams;

  for (int slot = 0; slot < chunk_tote->MaxSize(); ++slot) {
    // Hint boosts are per packed subscript
    const int packed_lang = chunk_tote->Key(slot);
    const int boost = lang_hint_boost[packed_lang];

    chunk_tote->SetValue(slot,
                         chunk_tote->Value(slot) + ((boost * grams) >> 3));

    if (FLAGS_dbgscore && (boost > 0)) {
      fprintf(stderr, "[%s+=%d*%d/8] ",
              ExtLanguageCode(cld::UnpackLanguage(packed_lang)),
              boost, grams);
    }
  }
}
|
|
1321
|
+
|
|
1322
|
+
|
|
1323
|
+
// Write the first len bytes of txt to f with the five HTML-significant
// characters replaced by character references, followed by "<br>\n".
//   <  =>  &lt;     >  =>  &gt;     &  =>  &amp;
//   '  =>  &#39;    "  =>  &quot;
// The numeric reference is used for the apostrophe because &apos; is not
// defined in HTML 4. All other bytes are written unchanged.
void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
  for (int i = 0; i < len; ++i) {
    const char c = txt[i];
    switch (c) {
      case '<':
        fputs("&lt;", f);
        break;
      case '>':
        fputs("&gt;", f);
        break;
      case '&':
        fputs("&amp;", f);
        break;
      case '\'':
        fputs("&#39;", f);
        break;
      case '"':
        fputs("&quot;", f);
        break;
      default:
        fputc(c, f);
        break;
    }
  }
  fputs("<br>\n", f);
}
|
|
1342
|
+
|
|
1343
|
+
|
|
1344
|
+
// Add one chunk's score to running document score
|
|
1345
|
+
// If the top language is UNKNOWN_LANGUAGE, score nothing. This is used to
|
|
1346
|
+
// positively identify text to be ignored, such as link farms.
|
|
1347
|
+
// Sort before scoring and reinit afterward
|
|
1348
|
+
//
|
|
1349
|
+
// src and srclen are just for debug output
|
|
1350
|
+
void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by,
|
|
1351
|
+
UnicodeLScript lscript,
|
|
1352
|
+
Tote* chunk_tote,
|
|
1353
|
+
ToteWithReliability* doc_tote,
|
|
1354
|
+
int tote_grams,
|
|
1355
|
+
uint8* lang_hint_boost) {
|
|
1356
|
+
// Apply hints before sorting
|
|
1357
|
+
if (lang_hint_boost) {
|
|
1358
|
+
ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost);
|
|
1359
|
+
}
|
|
1360
|
+
|
|
1361
|
+
// Sort to get top two languages
|
|
1362
|
+
chunk_tote->Sort(2);
|
|
1363
|
+
Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0));
|
|
1364
|
+
|
|
1365
|
+
// Return if empty
|
|
1366
|
+
if (cur_lang < 0) {
|
|
1367
|
+
chunk_tote->Reinit();
|
|
1368
|
+
return;
|
|
1369
|
+
}
|
|
1370
|
+
|
|
1371
|
+
bool cur_unreliable = false;
|
|
1372
|
+
|
|
1373
|
+
// Reliability is a function of mean script score per KB of text
|
|
1374
|
+
int len = chunk_tote->GetByteCount();
|
|
1375
|
+
int reliability = cld::GetReliability((len * 2) / advance_by,
|
|
1376
|
+
lscript,
|
|
1377
|
+
chunk_tote);
|
|
1378
|
+
cur_unreliable = (reliability < cld::kMinReliable);
|
|
1379
|
+
|
|
1380
|
+
// If tote_grams=0, always reliable
|
|
1381
|
+
// If tote_grams=1, always unreliable
|
|
1382
|
+
if (tote_grams == 0) {
|
|
1383
|
+
reliability = 100;
|
|
1384
|
+
cur_unreliable = false;
|
|
1385
|
+
} else if (tote_grams == 1) {
|
|
1386
|
+
reliability = 0;
|
|
1387
|
+
cur_unreliable = true;
|
|
1388
|
+
}
|
|
1389
|
+
|
|
1390
|
+
#if 0
|
|
1391
|
+
// TEMP
|
|
1392
|
+
if (FLAGS_cld_html) {
|
|
1393
|
+
if (reliability >= kMinReliableKeepPercent) {
|
|
1394
|
+
fprintf(stderr, "R%d%% ", reliability);
|
|
1395
|
+
} else {
|
|
1396
|
+
fprintf(stderr, "--R%d%% ", reliability);
|
|
1397
|
+
}
|
|
1398
|
+
}
|
|
1399
|
+
#endif
|
|
1400
|
+
|
|
1401
|
+
// Track the sequence of language fragments [result currently unused]
|
|
1402
|
+
////if (reliability >= kMinReliableSeq) {
|
|
1403
|
+
//// doc_tote->AddSeq(chunk_tote->Key(0));
|
|
1404
|
+
////}
|
|
1405
|
+
|
|
1406
|
+
if (cur_unreliable && (chunk_tote->Key(1) != 0)) {
|
|
1407
|
+
// Unreliable and two top contenders, split byte count 5/8 - 3/8
|
|
1408
|
+
int top_len = ((len * 5) + 4) >> 3;
|
|
1409
|
+
int second_len = len - top_len;
|
|
1410
|
+
|
|
1411
|
+
doc_tote->Add(chunk_tote->Key(0),
|
|
1412
|
+
top_len, chunk_tote->Value(0), reliability);
|
|
1413
|
+
doc_tote->Add(chunk_tote->Key(1),
|
|
1414
|
+
second_len, chunk_tote->Value(1), reliability);
|
|
1415
|
+
if (FLAGS_dbgscore) {
|
|
1416
|
+
fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ",
|
|
1417
|
+
ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
|
|
1418
|
+
chunk_tote->Value(0),
|
|
1419
|
+
reliability,
|
|
1420
|
+
top_len,
|
|
1421
|
+
ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))),
|
|
1422
|
+
chunk_tote->Value(1),
|
|
1423
|
+
reliability,
|
|
1424
|
+
second_len);
|
|
1425
|
+
}
|
|
1426
|
+
} else {
|
|
1427
|
+
// Reliable or single contender
|
|
1428
|
+
doc_tote->Add(chunk_tote->Key(0),
|
|
1429
|
+
len, chunk_tote->Value(0), reliability);
|
|
1430
|
+
if (FLAGS_dbgscore) {
|
|
1431
|
+
fprintf(stderr, "{+%s.%d.%dR(%dB)} ",
|
|
1432
|
+
ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
|
|
1433
|
+
chunk_tote->Value(0),
|
|
1434
|
+
reliability,
|
|
1435
|
+
len);
|
|
1436
|
+
}
|
|
1437
|
+
}
|
|
1438
|
+
|
|
1439
|
+
if (FLAGS_cld_html) {
|
|
1440
|
+
if (cur_lang < 0) {cur_lang = UNKNOWN_LANGUAGE;}
|
|
1441
|
+
cld::PrintLang(stderr, chunk_tote,
|
|
1442
|
+
cur_lang, cur_unreliable,
|
|
1443
|
+
prior_lang, prior_unreliable);
|
|
1444
|
+
prior_lang = cur_lang;
|
|
1445
|
+
prior_unreliable = cur_unreliable;
|
|
1446
|
+
|
|
1447
|
+
string temp(src, srclen);
|
|
1448
|
+
if (temp[0] == '=') {
|
|
1449
|
+
// Rewrite =ScriptX= or =SwitchX= as =Xxxx= for script code Xxxx
|
|
1450
|
+
temp = "=Buffered_";
|
|
1451
|
+
temp.append(UnicodeLScriptCode(lscript));
|
|
1452
|
+
temp.append("=");
|
|
1453
|
+
}
|
|
1454
|
+
cld::PrintText(stderr, cur_lang, temp);
|
|
1455
|
+
}
|
|
1456
|
+
|
|
1457
|
+
chunk_tote->Reinit();
|
|
1458
|
+
}
|
|
1459
|
+
|
|
1460
|
+
|
|
1461
|
+
// Debug output: print "[<language name>] " to stderr and remember top_lang
// in prior_lang. When top_lang repeats prior_lang (and is not
// UNKNOWN_LANGUAGE), print the abbreviated "[] " instead and leave
// prior_lang unchanged.
void PrintTopLang(Language top_lang) {
  const bool repeats_prior =
      (top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE);
  if (repeats_prior) {
    fprintf(stderr, "[] ");
    return;
  }
  fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
  prior_lang = top_lang;
}
|
|
1469
|
+
|
|
1470
|
+
// Debug output: same marker as PrintTopLang, wrapped in a grey (#A0A0A0)
// HTML span and followed by a newline. Updates prior_lang the same way.
void PrintTopLangSpeculative(Language top_lang) {
  fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
  // The bracketed marker and the prior_lang bookkeeping are exactly what
  // PrintTopLang does, so delegate to it.
  PrintTopLang(top_lang);
  fprintf(stderr, "</span>\n");
}
|
|
1480
|
+
|
|
1481
|
+
|
|
1482
|
+
// Add one chunk's score to running document score
|
|
1483
|
+
// Convenience function with constant src text
|
|
1484
|
+
void ScoreChunkIntoDoc2(const char* src, int advance_by,
|
|
1485
|
+
UnicodeLScript lscript,
|
|
1486
|
+
Tote* chunk_tote,
|
|
1487
|
+
ToteWithReliability* doc_tote,
|
|
1488
|
+
int tote_grams,
|
|
1489
|
+
uint8* lang_hint_boost) {
|
|
1490
|
+
int srclen = static_cast<int>(strlen(src));
|
|
1491
|
+
ScoreChunkIntoDoc(src, srclen, advance_by, lscript, chunk_tote,
|
|
1492
|
+
doc_tote, tote_grams, lang_hint_boost);
|
|
1493
|
+
}
|
|
1494
|
+
|
|
1495
|
+
|
|
1496
|
+
// Score one scriptspan using the only language for that script
|
|
1497
|
+
void ScoreNilgrams(getone::LangSpan* scriptspan, int lang,
|
|
1498
|
+
ToteWithReliability* doc_tote,
|
|
1499
|
+
uint8* lang_hint_boost,
|
|
1500
|
+
int flags, Language plus_one) {
|
|
1501
|
+
// For debugging only. Not thread-safe
|
|
1502
|
+
prior_lang = UNKNOWN_LANGUAGE;
|
|
1503
|
+
prior_unreliable = false;
|
|
1504
|
+
|
|
1505
|
+
const char* src = scriptspan->text;
|
|
1506
|
+
int len = scriptspan->text_bytes;
|
|
1507
|
+
|
|
1508
|
+
Tote chunk_tote;
|
|
1509
|
+
// Score 1000 for 1000 bytes
|
|
1510
|
+
chunk_tote.AddGram();
|
|
1511
|
+
chunk_tote.Add(lang, scriptspan->text_bytes);
|
|
1512
|
+
chunk_tote.AddBytes(scriptspan->text_bytes);
|
|
1513
|
+
int advance_by = 2;
|
|
1514
|
+
int tote_grams = 0; // Indicates fully reliable
|
|
1515
|
+
ScoreChunkIntoDoc(src, len, advance_by,
|
|
1516
|
+
scriptspan->script, &chunk_tote,
|
|
1517
|
+
doc_tote, tote_grams, lang_hint_boost);
|
|
1518
|
+
}
|
|
1519
|
+
|
|
1520
|
+
// Score one scriptspan using unigrams
|
|
1521
|
+
// Updates tote_grams
|
|
1522
|
+
static void ScoreUnigrams(const UTF8PropObj* unigram_obj,
|
|
1523
|
+
getone::LangSpan* scriptspan,
|
|
1524
|
+
int* tote_grams, int gram_limit,
|
|
1525
|
+
Tote* chunk_tote,
|
|
1526
|
+
ToteWithReliability* doc_tote,
|
|
1527
|
+
uint8* lang_hint_boost,
|
|
1528
|
+
int advance_by, int flags,
|
|
1529
|
+
int* initial_word_span, Language plus_one) {
|
|
1530
|
+
// chunk_tote may have partial sum coming in
|
|
1531
|
+
const char* src = scriptspan->text;
|
|
1532
|
+
const char* srclimit = src + scriptspan->text_bytes;
|
|
1533
|
+
|
|
1534
|
+
// For debugging only. Not thread-safe
|
|
1535
|
+
prior_lang = UNKNOWN_LANGUAGE;
|
|
1536
|
+
prior_unreliable = false;
|
|
1537
|
+
|
|
1538
|
+
// Break text up into multiple chunks and score each
|
|
1539
|
+
while (src < srclimit) {
|
|
1540
|
+
// Updates tote_grams
|
|
1541
|
+
int len = cld::DoUniScoreV3(unigram_obj,
|
|
1542
|
+
src, srclimit - src, advance_by,
|
|
1543
|
+
tote_grams, gram_limit, chunk_tote);
|
|
1544
|
+
if (FlagUseWords(flags) || (*initial_word_span > 0)) {
|
|
1545
|
+
// Use bigram scoring in addition to quadgrams
|
|
1546
|
+
cld::DoBigramScoreV3(&kCjkBiTable_obj,
|
|
1547
|
+
src, len, chunk_tote);
|
|
1548
|
+
}
|
|
1549
|
+
chunk_tote->AddBytes(len);
|
|
1550
|
+
*initial_word_span -= len;
|
|
1551
|
+
|
|
1552
|
+
if (*tote_grams >= gram_limit) {
|
|
1553
|
+
// Add this chunk to doc totals
|
|
1554
|
+
// Remove all but top40 if asked
|
|
1555
|
+
if (FlagTop40(flags)) {
|
|
1556
|
+
cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
|
|
1557
|
+
}
|
|
1558
|
+
|
|
1559
|
+
// Sort, accumulate into doc total, reinit
|
|
1560
|
+
ScoreChunkIntoDoc(src, len, advance_by,
|
|
1561
|
+
scriptspan->script, chunk_tote,
|
|
1562
|
+
doc_tote, *tote_grams, lang_hint_boost);
|
|
1563
|
+
*tote_grams = 0;
|
|
1564
|
+
} else {
|
|
1565
|
+
if (FLAGS_cld_html) {
|
|
1566
|
+
string temp(src, len);
|
|
1567
|
+
Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
|
|
1568
|
+
PrintTopLangSpeculative(top_lang);
|
|
1569
|
+
cld::PrintText(stderr, top_lang, temp);
|
|
1570
|
+
}
|
|
1571
|
+
}
|
|
1572
|
+
src += len;
|
|
1573
|
+
}
|
|
1574
|
+
// chunk_tote may have partial sum going out
|
|
1575
|
+
}
|
|
1576
|
+
|
|
1577
|
+
// Back up one UTF-8 character
|
|
1578
|
+
const uint8* BackOneUTF8(const uint8* p) {
|
|
1579
|
+
const uint8* retval = p - 1;
|
|
1580
|
+
if ((*retval & 0xc0) == 0x80) {--retval;}
|
|
1581
|
+
if ((*retval & 0xc0) == 0x80) {--retval;}
|
|
1582
|
+
if ((*retval & 0xc0) == 0x80) {--retval;}
|
|
1583
|
+
return retval;
|
|
1584
|
+
}
|
|
1585
|
+
|
|
1586
|
+
|
|
1587
|
+
// Score one scriptspan using quadgrams
|
|
1588
|
+
// Incoming chunk_tote may have partial accumulation
|
|
1589
|
+
static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj,
|
|
1590
|
+
getone::LangSpan* scriptspan,
|
|
1591
|
+
int* tote_grams, int gram_limit,
|
|
1592
|
+
Tote* chunk_tote,
|
|
1593
|
+
ToteWithReliability* doc_tote,
|
|
1594
|
+
uint8* lang_hint_boost,
|
|
1595
|
+
int advance_by, int flags,
|
|
1596
|
+
int* initial_word_span, Language plus_one) {
|
|
1597
|
+
// chunk_tote may have partial sum coming in
|
|
1598
|
+
const char* src = scriptspan->text;
|
|
1599
|
+
const char* srclimit = src + scriptspan->text_bytes;
|
|
1600
|
+
const char* lastscored_src = src;
|
|
1601
|
+
|
|
1602
|
+
// For debugging only. Not thread-safe
|
|
1603
|
+
prior_lang = UNKNOWN_LANGUAGE;
|
|
1604
|
+
prior_unreliable = false;
|
|
1605
|
+
|
|
1606
|
+
// Break text up into multiple chunks and score each
|
|
1607
|
+
while (src < srclimit) {
|
|
1608
|
+
// Updates tote_grams
|
|
1609
|
+
int len = cld::DoQuadScoreV3(quadgram_obj,
|
|
1610
|
+
src, srclimit - src, advance_by,
|
|
1611
|
+
tote_grams, gram_limit, chunk_tote);
|
|
1612
|
+
if (FlagUseWords(flags) || (*initial_word_span > 0)) {
|
|
1613
|
+
// Use word scoring in addition to quadgrams
|
|
1614
|
+
cld::DoOctaScoreV3(&kLongWord8Table_obj,
|
|
1615
|
+
src, len, chunk_tote);
|
|
1616
|
+
}
|
|
1617
|
+
chunk_tote->AddBytes(len);
|
|
1618
|
+
*initial_word_span -= len;
|
|
1619
|
+
|
|
1620
|
+
if (*tote_grams >= gram_limit) {
|
|
1621
|
+
// Remove all but top40 if asked
|
|
1622
|
+
if (FlagTop40(flags)) {
|
|
1623
|
+
cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
|
|
1624
|
+
}
|
|
1625
|
+
|
|
1626
|
+
// Sort, accumulate into doc total, reinit
|
|
1627
|
+
ScoreChunkIntoDoc(src, len, advance_by,
|
|
1628
|
+
scriptspan->script, chunk_tote,
|
|
1629
|
+
doc_tote, *tote_grams, lang_hint_boost);
|
|
1630
|
+
lastscored_src = src + len;
|
|
1631
|
+
*tote_grams = 0;
|
|
1632
|
+
} else {
|
|
1633
|
+
if (FLAGS_cld_html) {
|
|
1634
|
+
string temp(src, len);
|
|
1635
|
+
Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
|
|
1636
|
+
PrintTopLangSpeculative(top_lang);
|
|
1637
|
+
cld::PrintText(stderr, top_lang, temp);
|
|
1638
|
+
}
|
|
1639
|
+
}
|
|
1640
|
+
src += len;
|
|
1641
|
+
}
|
|
1642
|
+
}
|
|
1643
|
+
|
|
1644
|
+
|
|
1645
|
+
|
|
1646
|
+
void PrintLangs(FILE* f, const Language* language3, const int* percent3,
|
|
1647
|
+
const int* text_bytes, const bool* is_reliable) {
|
|
1648
|
+
fprintf(f, "<br> Initial_Languages ");
|
|
1649
|
+
if (language3[0] != UNKNOWN_LANGUAGE) {
|
|
1650
|
+
fprintf(f, "%s%s(%d%%) ",
|
|
1651
|
+
ExtLanguageName(language3[0]),
|
|
1652
|
+
*is_reliable ? "" : "*",
|
|
1653
|
+
percent3[0]);
|
|
1654
|
+
}
|
|
1655
|
+
if (language3[1] != UNKNOWN_LANGUAGE) {
|
|
1656
|
+
fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[1]), percent3[1]);
|
|
1657
|
+
}
|
|
1658
|
+
if (language3[2] != UNKNOWN_LANGUAGE) {
|
|
1659
|
+
fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[2]), percent3[2]);
|
|
1660
|
+
}
|
|
1661
|
+
fprintf(f, "%d bytes \n", *text_bytes);
|
|
1662
|
+
|
|
1663
|
+
fprintf(f, "<br>\n");
|
|
1664
|
+
}
|
|
1665
|
+
|
|
1666
|
+
|
|
1667
|
+
// Start the tote with a count of one for the default language for script
|
|
1668
|
+
void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) {
|
|
1669
|
+
Language defaultlang = cld::kDefaultLanguagePerLScript[lscript];
|
|
1670
|
+
script_tote->Add(cld::PackLanguage(defaultlang), 1);
|
|
1671
|
+
script_tote->AddBytes(1);
|
|
1672
|
+
#if 0
|
|
1673
|
+
if (FLAGS_cld_html) {
|
|
1674
|
+
cld::PrintLang(stderr, script_tote,
|
|
1675
|
+
defaultlang, false,
|
|
1676
|
+
UNKNOWN_LANGUAGE, false);
|
|
1677
|
+
prior_lang = cur_lang;
|
|
1678
|
+
string temp("+1");
|
|
1679
|
+
cld::PrintText(stderr, defaultlang, temp);
|
|
1680
|
+
}
|
|
1681
|
+
#endif
|
|
1682
|
+
}
|
|
1683
|
+
|
|
1684
|
+
// Four-entry label tables; the two differ only in entries 2 and 3
// ("Script" vs "Switch").
// NOTE(review): presumably debug-output labels for the script totes; their
// use is not visible in this part of the file.
static const char* const kToteName[4] = {
  "=Latn=",
  "=Hani=",
  "=Script2=",
  "=Script3=",
};
static const char* const kToteSwitch[4] = {
  "=Latn=",
  "=Hani=",
  "=Switch2=",
  "=Switch3=",
};
|
|
1688
|
+
|
|
1689
|
+
|
|
1690
|
+
|
|
1691
|
+
// Upper to lower, keep digits, everything else to minus '-' (2d)
// Indexed by byte value; entries are written as ASCII character literals,
// one row per 16 byte values.
static const char kCharsetToLowerTbl[256] = {
  /* 0x00 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  /* 0x10 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  /* 0x20 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  /* 0x30 */ '0','1','2','3','4','5','6','7', '8','9','-','-','-','-','-','-',

  /* 0x40 */ '-','a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  /* 0x50 */ 'p','q','r','s','t','u','v','w', 'x','y','z','-','-','-','-','-',
  /* 0x60 */ '-','a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  /* 0x70 */ 'p','q','r','s','t','u','v','w', 'x','y','z','-','-','-','-','-',

  /* 0x80 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  /* 0x90 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  /* 0xa0 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  /* 0xb0 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',

  /* 0xc0 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  /* 0xd0 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  /* 0xe0 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  /* 0xf0 */ '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
};
|
|
1713
|
+
|
|
1714
|
+
|
|
1715
|
+
// 1 for the ASCII letters A-Z (0x41-0x5a) and a-z (0x61-0x7a), 0 for every
// other byte value. Indexed by byte value, one row per 16 values.
static const char kIsAlpha[256] = {
  /* 0x00 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x10 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x20 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  /* 0x50 */ 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
  /* 0x60 */ 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  /* 0x70 */ 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,

  /* 0x80 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x90 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xa0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xb0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xc0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xd0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xe0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xf0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
|
|
1726
|
+
|
|
1727
|
+
// 1 for the ASCII digits 0-9 (0x30-0x39), 0 for every other byte value.
// Indexed by byte value, one row per 16 values.
static const char kIsDigit[256] = {
  /* 0x00 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x10 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x20 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x30 */ 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  /* 0x80 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x90 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xa0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xb0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xc0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xd0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xe0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xf0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
|
|
1738
|
+
|
|
1739
|
+
// Normalize ASCII string to first 4 alphabetic/digit chars
|
|
1740
|
+
// Letters are forced to lowercase ASCII
|
|
1741
|
+
// Used to normalize TLD values
|
|
1742
|
+
void MakeChar4(const char* str, char* norm) {
|
|
1743
|
+
memcpy(norm, "____", 4); // four underscores
|
|
1744
|
+
int l_ptr = 0;
|
|
1745
|
+
for (unsigned int i = 0; i < strlen(str); ++i) {
|
|
1746
|
+
uint8 uc = static_cast<uint8>(str[i]);
|
|
1747
|
+
if (kIsAlpha[uc] | kIsDigit[uc]) {
|
|
1748
|
+
if (l_ptr < 4) { // Else ignore
|
|
1749
|
+
norm[l_ptr] = kCharsetToLowerTbl[uc];
|
|
1750
|
+
l_ptr++;
|
|
1751
|
+
}
|
|
1752
|
+
}
|
|
1753
|
+
}
|
|
1754
|
+
}
|
|
1755
|
+
|
|
1756
|
+
// Find subscript of matching key in first 4 bytes of sorted hint array, or -1
|
|
1757
|
+
static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
|
|
1758
|
+
const char* norm_key) {
|
|
1759
|
+
// Key is always in range [lo..hi)
|
|
1760
|
+
int lo = 0;
|
|
1761
|
+
int hi = hintprobssize;
|
|
1762
|
+
while (lo < hi) {
|
|
1763
|
+
int mid = (lo + hi) >> 1;
|
|
1764
|
+
int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4);
|
|
1765
|
+
if (comp < 0) {
|
|
1766
|
+
lo = mid + 1;
|
|
1767
|
+
} else if (comp > 0) {
|
|
1768
|
+
hi = mid;
|
|
1769
|
+
} else {
|
|
1770
|
+
return mid;
|
|
1771
|
+
}
|
|
1772
|
+
}
|
|
1773
|
+
return -1;
|
|
1774
|
+
}
|
|
1775
|
+
|
|
1776
|
+
|
|
1777
|
+
// Increment the initial probabilities based on a per-TLD probs entry
|
|
1778
|
+
void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) {
|
|
1779
|
+
if (FLAGS_dbgscore) {
|
|
1780
|
+
fprintf(stderr, "TLD hint %s\n", tld_hint);
|
|
1781
|
+
}
|
|
1782
|
+
char normalized_tld[8];
|
|
1783
|
+
MakeChar4(tld_hint, normalized_tld);
|
|
1784
|
+
int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
|
|
1785
|
+
normalized_tld);
|
|
1786
|
+
// TLD is four bytes, probability entry is 4 bytes
|
|
1787
|
+
if (n >= 0) {
|
|
1788
|
+
uint32 probs = kTLDHintProbs[n].probs;
|
|
1789
|
+
|
|
1790
|
+
uint8 prob123 = (probs >> 0) & 0xff;
|
|
1791
|
+
const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
|
|
1792
|
+
uint8 top1 = (probs >> 8) & 0xff;
|
|
1793
|
+
if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
|
|
1794
|
+
uint8 top2 = (probs >> 16) & 0xff;
|
|
1795
|
+
if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
|
|
1796
|
+
uint8 top3 = (probs >> 24) & 0xff;
|
|
1797
|
+
if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
|
|
1798
|
+
}
|
|
1799
|
+
}
|
|
1800
|
+
|
|
1801
|
+
|
|
1802
|
+
// Increment the initial probabilities based on a per-encoding probs entry
|
|
1803
|
+
void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) {
|
|
1804
|
+
if (FLAGS_dbgscore) {
|
|
1805
|
+
Encoding tempenc = static_cast<Encoding>(encoding_hint);
|
|
1806
|
+
fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc));
|
|
1807
|
+
}
|
|
1808
|
+
if (encoding_hint < ISO_8859_1) {return;}
|
|
1809
|
+
if (encoding_hint >= NUM_ENCODINGS) {return;}
|
|
1810
|
+
uint32 probs = kEncodingHintProbs[encoding_hint];
|
|
1811
|
+
|
|
1812
|
+
uint8 prob123 = (probs >> 0) & 0xff;
|
|
1813
|
+
const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
|
|
1814
|
+
uint8 top1 = (probs >> 8) & 0xff;
|
|
1815
|
+
if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
|
|
1816
|
+
uint8 top2 = (probs >> 16) & 0xff;
|
|
1817
|
+
if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
|
|
1818
|
+
uint8 top3 = (probs >> 24) & 0xff;
|
|
1819
|
+
if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
|
|
1820
|
+
}
|
|
1821
|
+
|
|
1822
|
+
|
|
1823
|
+
// Increment the initial probability for given language by fixed amount
|
|
1824
|
+
// Does not recognize extended languages as hints
|
|
1825
|
+
void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) {
|
|
1826
|
+
if (FLAGS_dbgscore) {
|
|
1827
|
+
fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint));
|
|
1828
|
+
}
|
|
1829
|
+
if (language_hint < ENGLISH) {return;}
|
|
1830
|
+
if (language_hint >= NUM_LANGUAGES) {return;}
|
|
1831
|
+
uint32 probs = kLanguageHintProbs[language_hint];
|
|
1832
|
+
|
|
1833
|
+
uint8 prob123 = (probs >> 0) & 0xff;
|
|
1834
|
+
const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
|
|
1835
|
+
uint8 top1 = (probs >> 8) & 0xff;
|
|
1836
|
+
if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
|
|
1837
|
+
uint8 top2 = (probs >> 16) & 0xff;
|
|
1838
|
+
if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
|
|
1839
|
+
uint8 top3 = (probs >> 24) & 0xff;
|
|
1840
|
+
if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
|
|
1841
|
+
}
|
|
1842
|
+
|
|
1843
|
+
// Extract return values before fixups
|
|
1844
|
+
void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes,
|
|
1845
|
+
int* reliable_percent3, Language* language3, int* percent3,
|
|
1846
|
+
double* normalized_score3,
|
|
1847
|
+
int* text_bytes, bool* is_reliable) {
|
|
1848
|
+
reliable_percent3[0] = 0;
|
|
1849
|
+
reliable_percent3[1] = 0;
|
|
1850
|
+
reliable_percent3[2] = 0;
|
|
1851
|
+
language3[0] = UNKNOWN_LANGUAGE;
|
|
1852
|
+
language3[1] = UNKNOWN_LANGUAGE;
|
|
1853
|
+
language3[2] = UNKNOWN_LANGUAGE;
|
|
1854
|
+
percent3[0] = 100;
|
|
1855
|
+
percent3[1] = 0;
|
|
1856
|
+
percent3[2] = 0;
|
|
1857
|
+
normalized_score3[0] = 0.0;
|
|
1858
|
+
normalized_score3[1] = 0.0;
|
|
1859
|
+
normalized_score3[2] = 0.0;
|
|
1860
|
+
|
|
1861
|
+
*text_bytes = total_text_bytes;
|
|
1862
|
+
*is_reliable = false;
|
|
1863
|
+
|
|
1864
|
+
int bytecount1 = total_text_bytes;
|
|
1865
|
+
int bytecount2 = 0;
|
|
1866
|
+
int bytecount3 = 0;
|
|
1867
|
+
|
|
1868
|
+
int lang1 = doc_tote->Key(0);
|
|
1869
|
+
if (lang1 != 0) {
|
|
1870
|
+
// We have a top language
|
|
1871
|
+
language3[0] = cld::UnpackLanguage(lang1);
|
|
1872
|
+
bytecount1 = doc_tote->Value(0);
|
|
1873
|
+
int reli1 = doc_tote->Reliability(0);
|
|
1874
|
+
reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv
|
|
1875
|
+
normalized_score3[0] = cld::GetNormalizedScore(language3[0],
|
|
1876
|
+
ULScript_Common,
|
|
1877
|
+
bytecount1,
|
|
1878
|
+
doc_tote->Score(0));
|
|
1879
|
+
}
|
|
1880
|
+
|
|
1881
|
+
int lang2 = doc_tote->Key(1);
|
|
1882
|
+
if (lang2 != 0) {
|
|
1883
|
+
language3[1] = cld::UnpackLanguage(lang2);
|
|
1884
|
+
bytecount2 = doc_tote->Value(1);
|
|
1885
|
+
int reli2 = doc_tote->Reliability(1);
|
|
1886
|
+
reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv
|
|
1887
|
+
normalized_score3[1] = cld::GetNormalizedScore(language3[1],
|
|
1888
|
+
ULScript_Common,
|
|
1889
|
+
bytecount2,
|
|
1890
|
+
doc_tote->Score(1));
|
|
1891
|
+
}
|
|
1892
|
+
|
|
1893
|
+
int lang3 = doc_tote->Key(2);
|
|
1894
|
+
if (lang3 != 0) {
|
|
1895
|
+
language3[2] = cld::UnpackLanguage(lang3);
|
|
1896
|
+
bytecount3 = doc_tote->Value(2);
|
|
1897
|
+
int reli3 = doc_tote->Reliability(2);
|
|
1898
|
+
reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv
|
|
1899
|
+
normalized_score3[2] = cld::GetNormalizedScore(language3[2],
|
|
1900
|
+
ULScript_Common,
|
|
1901
|
+
bytecount3,
|
|
1902
|
+
doc_tote->Score(2));
|
|
1903
|
+
}
|
|
1904
|
+
|
|
1905
|
+
// Increase total bytes to sum (top 3) if low for some reason
|
|
1906
|
+
int total_bytecount12 = bytecount1 + bytecount2;
|
|
1907
|
+
int total_bytecount123 = total_bytecount12 + bytecount3;
|
|
1908
|
+
if (total_text_bytes < total_bytecount123) {
|
|
1909
|
+
total_text_bytes = total_bytecount123;
|
|
1910
|
+
*text_bytes = total_text_bytes;
|
|
1911
|
+
}
|
|
1912
|
+
|
|
1913
|
+
// Sum minus previous % gives better roundoff behavior than bytecount/total
|
|
1914
|
+
int total_text_bytes_div = cld::maxint(1, total_text_bytes); // Avoid zdiv
|
|
1915
|
+
percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
|
|
1916
|
+
percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
|
|
1917
|
+
percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
|
|
1918
|
+
percent3[2] -= percent3[1];
|
|
1919
|
+
percent3[1] -= percent3[0];
|
|
1920
|
+
|
|
1921
|
+
// Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
|
|
1922
|
+
// Fix this explicitly
|
|
1923
|
+
if (percent3[1] < percent3[2]) {
|
|
1924
|
+
++percent3[1];
|
|
1925
|
+
--percent3[2];
|
|
1926
|
+
}
|
|
1927
|
+
if (percent3[0] < percent3[1]) {
|
|
1928
|
+
++percent3[0];
|
|
1929
|
+
--percent3[1];
|
|
1930
|
+
}
|
|
1931
|
+
|
|
1932
|
+
*text_bytes = total_text_bytes;
|
|
1933
|
+
|
|
1934
|
+
if (lang1 != 0) {
|
|
1935
|
+
// We have a top language
|
|
1936
|
+
// Its reliability is overal result reliability
|
|
1937
|
+
int bytecount = doc_tote->Value(0);
|
|
1938
|
+
int reli = doc_tote->Reliability(0);
|
|
1939
|
+
int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv
|
|
1940
|
+
*is_reliable = reliable_percent >= cld::kMinReliable;
|
|
1941
|
+
} else {
|
|
1942
|
+
// No top language at all. This can happen with zero text or 100% Klingon
|
|
1943
|
+
// if extended=false. Just return all UNKNOWN_LANGUAGE, reliable.
|
|
1944
|
+
*is_reliable = true;
|
|
1945
|
+
}
|
|
1946
|
+
}
|
|
1947
|
+
|
|
1948
|
+
// True if lang is one of French, Italian, German, Spanish
bool IsFIGS(Language lang) {
  return (lang == FRENCH) ||
         (lang == ITALIAN) ||
         (lang == GERMAN) ||
         (lang == SPANISH);
}
|
|
1955
|
+
|
|
1956
|
+
// True if lang is one of English, French, Italian, German, Spanish
bool IsEFIGS(Language lang) {
  return (lang == ENGLISH) ||
         (lang == FRENCH) ||
         (lang == ITALIAN) ||
         (lang == GERMAN) ||
         (lang == SPANISH);
}
|
|
1964
|
+
|
|
1965
|
+
// Percentage thresholds used when picking the summary language.
// Second language below this share never replaces a first-place ENGLISH
static const int kNonEnBoilerplateMinPercent = 17;
// Second language below this share never replaces a first-place FIGS language
static const int kNonFIGSBoilerplateMinPercent = 20;
// Returned language below this share => UNKNOWN_LANGUAGE
static const int kGoodFirstMinPercent = 26;
// Returned language below this share => unreliable
static const int kGoodFirstReliableMinPercent = 51;
// Ignored text above this share => unreliable
static const int kIgnoreMaxPercent = 95;
// Chosen language below this share => unreliable
static const int kKeepMinPercent = 2;

// For Tier3 languages, require more bytes of text to override
// the first-place language
// Second language with fewer bytes than this => no second (Tier1/Tier2)
static const int kGoodSecondT1T2MinBytes = 15;
// Second language with fewer bytes than this => no second (Tier3)
static const int kGoodSecondT3MinBytes = 128;
|
|
1976
|
+
//
|
|
1977
|
+
|
|
1978
|
+
// Calculate a single summary language for the document, and its reliability.
|
|
1979
|
+
// Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
|
|
1980
|
+
// This is the heart of matching human-rater perception.
|
|
1981
|
+
// reliable_percent3[] is currently unused
|
|
1982
|
+
//
|
|
1983
|
+
// Do not return Tier3 second language unless there are at least 128 bytes
|
|
1984
|
+
void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes,
|
|
1985
|
+
const int* reliable_percent3,
|
|
1986
|
+
const Language* language3,
|
|
1987
|
+
const int* percent3,
|
|
1988
|
+
Language* summary_lang, bool* is_reliable) {
|
|
1989
|
+
// Vector of active languages; changes if we delete some
|
|
1990
|
+
int slot_count = 3;
|
|
1991
|
+
int active_slot[3] = {0, 1, 2};
|
|
1992
|
+
|
|
1993
|
+
int ignore_percent = 0;
|
|
1994
|
+
int return_percent = percent3[0]; // Default to top lang
|
|
1995
|
+
*summary_lang = language3[0];
|
|
1996
|
+
*is_reliable = true;
|
|
1997
|
+
if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
|
|
1998
|
+
|
|
1999
|
+
// If any of top 3 is IGNORE, remove it and increment ignore_percent
|
|
2000
|
+
for (int i = 0; i < 3; ++i) {
|
|
2001
|
+
if (language3[i] == TG_UNKNOWN_LANGUAGE) {
|
|
2002
|
+
ignore_percent += percent3[i];
|
|
2003
|
+
// Move the rest up, levaing input vectors unchanged
|
|
2004
|
+
for (int j=i+1; j < 3; ++j) {
|
|
2005
|
+
active_slot[j - 1] = active_slot[j];
|
|
2006
|
+
}
|
|
2007
|
+
-- slot_count;
|
|
2008
|
+
// Logically remove Ignore from percentage-text calculation
|
|
2009
|
+
// (extra 1 in 101 avoids zdiv, biases slightly small)
|
|
2010
|
+
return_percent = (percent3[0] * 100) / (101 - ignore_percent);
|
|
2011
|
+
*summary_lang = language3[active_slot[0]];
|
|
2012
|
+
if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
|
|
2013
|
+
}
|
|
2014
|
+
}
|
|
2015
|
+
|
|
2016
|
+
|
|
2017
|
+
// If English and X, where X (not UNK) is big enough,
|
|
2018
|
+
// assume the English is boilerplate and return X.
|
|
2019
|
+
// Logically remove English from percentage-text calculation
|
|
2020
|
+
int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
|
|
2021
|
+
// Require more bytes of text for Tier3 languages
|
|
2022
|
+
int minbytesneeded = kGoodSecondT1T2MinBytes;
|
|
2023
|
+
int plang_second = cld::PackLanguage(language3[active_slot[1]]);
|
|
2024
|
+
bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0);
|
|
2025
|
+
if (is_tier3) {
|
|
2026
|
+
minbytesneeded = kGoodSecondT3MinBytes;
|
|
2027
|
+
}
|
|
2028
|
+
|
|
2029
|
+
if ((language3[active_slot[0]] == ENGLISH) &&
|
|
2030
|
+
(language3[active_slot[1]] != ENGLISH) &&
|
|
2031
|
+
(language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
|
|
2032
|
+
(percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
|
|
2033
|
+
(second_bytes >= minbytesneeded)) {
|
|
2034
|
+
ignore_percent += percent3[active_slot[0]];
|
|
2035
|
+
return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
|
|
2036
|
+
*summary_lang = language3[active_slot[1]];
|
|
2037
|
+
if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
|
|
2038
|
+
|
|
2039
|
+
// Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
|
|
2040
|
+
// assume the FIGS is boilerplate and return X.
|
|
2041
|
+
// Logically remove FIGS from percentage-text calculation
|
|
2042
|
+
} else if (IsFIGS(language3[active_slot[0]]) &&
|
|
2043
|
+
!IsEFIGS(language3[active_slot[1]]) &&
|
|
2044
|
+
(language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
|
|
2045
|
+
(percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
|
|
2046
|
+
(second_bytes >= minbytesneeded)) {
|
|
2047
|
+
ignore_percent += percent3[active_slot[0]];
|
|
2048
|
+
return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
|
|
2049
|
+
*summary_lang = language3[active_slot[1]];
|
|
2050
|
+
if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
|
|
2051
|
+
|
|
2052
|
+
// Else we are returning the first language, but want to improve its
|
|
2053
|
+
// return_percent if the second language should be ignored
|
|
2054
|
+
} else if ((language3[active_slot[1]] == ENGLISH) &&
|
|
2055
|
+
(language3[active_slot[0]] != ENGLISH)) {
|
|
2056
|
+
ignore_percent += percent3[active_slot[1]];
|
|
2057
|
+
return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
|
|
2058
|
+
} else if (IsFIGS(language3[active_slot[1]]) &&
|
|
2059
|
+
!IsEFIGS(language3[active_slot[0]])) {
|
|
2060
|
+
ignore_percent += percent3[active_slot[1]];
|
|
2061
|
+
return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
|
|
2062
|
+
}
|
|
2063
|
+
|
|
2064
|
+
// If return percent is too small (too many languages), return UNKNOWN
|
|
2065
|
+
if ((return_percent < kGoodFirstMinPercent)) {
|
|
2066
|
+
*summary_lang = UNKNOWN_LANGUAGE;
|
|
2067
|
+
*is_reliable = false;
|
|
2068
|
+
}
|
|
2069
|
+
|
|
2070
|
+
// If return percent is small, return language but set unreliable.
|
|
2071
|
+
if ((return_percent < kGoodFirstReliableMinPercent)) {
|
|
2072
|
+
*is_reliable = false;
|
|
2073
|
+
}
|
|
2074
|
+
|
|
2075
|
+
// If ignore percent is too large, set unreliable.
|
|
2076
|
+
if ((ignore_percent > kIgnoreMaxPercent)) {
|
|
2077
|
+
*is_reliable = false;
|
|
2078
|
+
}
|
|
2079
|
+
|
|
2080
|
+
// If we removed all the active languages, return UNKNOWN
|
|
2081
|
+
if (slot_count == 0) {
|
|
2082
|
+
*summary_lang = UNKNOWN_LANGUAGE;
|
|
2083
|
+
*is_reliable = false;
|
|
2084
|
+
}
|
|
2085
|
+
}
|
|
2086
|
+
|
|
2087
|
+
|
|
2088
|
+
|
|
2089
|
+
// Result vector must be exactly three items
|
|
2090
|
+
Language CompactLangDetImpl::DetectLanguageSummaryV25(
|
|
2091
|
+
const CompactLangDet::DetectionTables* tables,
|
|
2092
|
+
const char* buffer,
|
|
2093
|
+
int buffer_length,
|
|
2094
|
+
bool is_plain_text,
|
|
2095
|
+
bool do_pick_summary_language,
|
|
2096
|
+
bool do_remove_weak_matches,
|
|
2097
|
+
const char* tld_hint, // "id" boosts Indonesian
|
|
2098
|
+
int encoding_hint, // SJS boosts Japanese
|
|
2099
|
+
Language language_hint, // ITALIAN boosts it
|
|
2100
|
+
bool allow_extended_lang,
|
|
2101
|
+
int flags,
|
|
2102
|
+
Language plus_one,
|
|
2103
|
+
Language* language3,
|
|
2104
|
+
int* percent3,
|
|
2105
|
+
double* normalized_score3,
|
|
2106
|
+
int* text_bytes,
|
|
2107
|
+
bool* is_reliable) {
|
|
2108
|
+
if (!tables) {
|
|
2109
|
+
static const CompactLangDet::DetectionTables default_cld_tables = {
|
|
2110
|
+
&kQuadTable_obj,
|
|
2111
|
+
&compact_lang_det_generated_ctjkvz_b1_obj
|
|
2112
|
+
};
|
|
2113
|
+
tables = &default_cld_tables;
|
|
2114
|
+
}
|
|
2115
|
+
language3[0] = UNKNOWN_LANGUAGE;
|
|
2116
|
+
language3[1] = UNKNOWN_LANGUAGE;
|
|
2117
|
+
language3[2] = UNKNOWN_LANGUAGE;
|
|
2118
|
+
percent3[0] = 100;
|
|
2119
|
+
percent3[1] = 0;
|
|
2120
|
+
percent3[2] = 0;
|
|
2121
|
+
normalized_score3[0] = 0.0;
|
|
2122
|
+
normalized_score3[1] = 0.0;
|
|
2123
|
+
normalized_score3[2] = 0.0;
|
|
2124
|
+
*text_bytes = 0;
|
|
2125
|
+
*is_reliable = false;
|
|
2126
|
+
|
|
2127
|
+
// Document totals
|
|
2128
|
+
ToteWithReliability doc_tote; // Reliability = 0..100
|
|
2129
|
+
|
|
2130
|
+
// Vector of packed per-language boosts (just one filled in from hints)
|
|
2131
|
+
uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1];
|
|
2132
|
+
memset(lang_hint_boost, 0, sizeof(lang_hint_boost));
|
|
2133
|
+
|
|
2134
|
+
// Apply hints, if any
|
|
2135
|
+
if ((tld_hint != NULL) && (tld_hint[0] != '\0')) {
|
|
2136
|
+
ApplyTLDHint(lang_hint_boost, tld_hint);
|
|
2137
|
+
}
|
|
2138
|
+
if (encoding_hint != UNKNOWN_ENCODING) {
|
|
2139
|
+
ApplyEncodingHint(lang_hint_boost, encoding_hint);
|
|
2140
|
+
}
|
|
2141
|
+
if (language_hint != UNKNOWN_LANGUAGE) {
|
|
2142
|
+
ApplyLanguageHint(lang_hint_boost, language_hint);
|
|
2143
|
+
}
|
|
2144
|
+
|
|
2145
|
+
|
|
2146
|
+
// Four individual script totals, Latin, Han, other2, other3
|
|
2147
|
+
int next_other_tote = 2;
|
|
2148
|
+
|
|
2149
|
+
// Four totes for up to four different scripts pending at once
|
|
2150
|
+
Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other
|
|
2151
|
+
bool tote_seen[4] = {false, false, false, false};
|
|
2152
|
+
int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk
|
|
2153
|
+
UnicodeLScript tote_script[4] =
|
|
2154
|
+
{ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common};
|
|
2155
|
+
|
|
2156
|
+
// Loop through text spans in a single script
|
|
2157
|
+
ScriptScanner ss(buffer, buffer_length, is_plain_text);
|
|
2158
|
+
getone::LangSpan scriptspan;
|
|
2159
|
+
|
|
2160
|
+
scriptspan.text = NULL;
|
|
2161
|
+
scriptspan.text_bytes = 0;
|
|
2162
|
+
scriptspan.offset = 0;
|
|
2163
|
+
scriptspan.script = ULScript_Common;
|
|
2164
|
+
scriptspan.lang = UNKNOWN_LANGUAGE;
|
|
2165
|
+
|
|
2166
|
+
int total_text_bytes = 0;
|
|
2167
|
+
int textlimit = FLAGS_cld_textlimit << 10; // in KB
|
|
2168
|
+
if (textlimit == 0) {textlimit = 0x7fffffff;}
|
|
2169
|
+
|
|
2170
|
+
int advance_by = 2; // Advance 2 bytes
|
|
2171
|
+
int advance_limit = textlimit >> 3; // For first 1/8 of max document
|
|
2172
|
+
|
|
2173
|
+
int initial_word_span = kDefaultWordSpan;
|
|
2174
|
+
if (FLAGS_cld_forcewords) {
|
|
2175
|
+
initial_word_span = kReallyBigWordSpan;
|
|
2176
|
+
}
|
|
2177
|
+
|
|
2178
|
+
// Pick up chunk sizes
|
|
2179
|
+
// Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
|
|
2180
|
+
// Sanity check -- force into a reasonable range
|
|
2181
|
+
int chunksizequads = FLAGS_cld_smoothwidth;
|
|
2182
|
+
chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads),
|
|
2183
|
+
kMaxChunkSizeQuads);
|
|
2184
|
+
int chunksizeunis = (chunksizequads * 5) >> 1;
|
|
2185
|
+
|
|
2186
|
+
// Varying short-span limit doesn't work well -- skips too much beyond 20KB
|
|
2187
|
+
// int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
|
|
2188
|
+
int spantooshortlimit = kShortSpanThresh;
|
|
2189
|
+
|
|
2190
|
+
// For debugging only. Not thread-safe
|
|
2191
|
+
prior_lang = UNKNOWN_LANGUAGE;
|
|
2192
|
+
prior_unreliable = false;
|
|
2193
|
+
|
|
2194
|
+
// Allocate full-document prediction table for finding repeating words
|
|
2195
|
+
int hash = 0;
|
|
2196
|
+
int* predict_tbl = new int[kPredictionTableSize];
|
|
2197
|
+
if (FlagRepeats(flags)) {
|
|
2198
|
+
memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
|
|
2199
|
+
}
|
|
2200
|
+
|
|
2201
|
+
// Loop through scriptspans accumulating number of text bytes in each language
|
|
2202
|
+
while (ss.GetOneScriptSpanLower(&scriptspan)) {
|
|
2203
|
+
UnicodeLScript lscript = scriptspan.script;
|
|
2204
|
+
|
|
2205
|
+
// Echo text if asked to
|
|
2206
|
+
if (FLAGS_cld_echotext) {
|
|
2207
|
+
PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes);
|
|
2208
|
+
}
|
|
2209
|
+
|
|
2210
|
+
// Squeeze out big chunks of text span if asked to
|
|
2211
|
+
if (FlagSqueeze(flags)) {
|
|
2212
|
+
// Remove repetitive or mostly-spaces chunks
|
|
2213
|
+
int newlen;
|
|
2214
|
+
int chunksize = 0; // Use the default
|
|
2215
|
+
newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
|
|
2216
|
+
chunksize);
|
|
2217
|
+
scriptspan.text_bytes = newlen;
|
|
2218
|
+
} else {
|
|
2219
|
+
// Check now and then to see if we should be squeezing
|
|
2220
|
+
if ((total_text_bytes >= kCheapSqueezeTestThresh) &&
|
|
2221
|
+
!FlagFinish(flags) &&
|
|
2222
|
+
((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) &&
|
|
2223
|
+
CheapSqueezeTriggerTest(scriptspan.text,
|
|
2224
|
+
scriptspan.text_bytes,
|
|
2225
|
+
kCheapSqueezeTestLen)) {
|
|
2226
|
+
// Recursive call with big-chunk squeezing set
|
|
2227
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
|
2228
|
+
fprintf(stderr,
|
|
2229
|
+
"<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
|
|
2230
|
+
total_text_bytes);
|
|
2231
|
+
}
|
|
2232
|
+
// Deallocate full-document prediction table
|
|
2233
|
+
delete[] predict_tbl;
|
|
2234
|
+
|
|
2235
|
+
return DetectLanguageSummaryV25(
|
|
2236
|
+
tables,
|
|
2237
|
+
buffer,
|
|
2238
|
+
buffer_length,
|
|
2239
|
+
is_plain_text,
|
|
2240
|
+
do_pick_summary_language,
|
|
2241
|
+
do_remove_weak_matches,
|
|
2242
|
+
tld_hint, // "id" boosts Indonesian
|
|
2243
|
+
encoding_hint, // SJS boosts Japanese
|
|
2244
|
+
language_hint, // ITALIAN boosts it
|
|
2245
|
+
allow_extended_lang,
|
|
2246
|
+
flags | kCLDFlagSqueeze,
|
|
2247
|
+
plus_one,
|
|
2248
|
+
language3,
|
|
2249
|
+
percent3,
|
|
2250
|
+
normalized_score3,
|
|
2251
|
+
text_bytes,
|
|
2252
|
+
is_reliable);
|
|
2253
|
+
}
|
|
2254
|
+
}
|
|
2255
|
+
|
|
2256
|
+
// Remove repetitive words if asked to
|
|
2257
|
+
if (FlagRepeats(flags)) {
|
|
2258
|
+
// Remove repetitive words
|
|
2259
|
+
int newlen;
|
|
2260
|
+
newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
|
|
2261
|
+
&hash, predict_tbl);
|
|
2262
|
+
scriptspan.text_bytes = newlen;
|
|
2263
|
+
}
|
|
2264
|
+
|
|
2265
|
+
// The real scoring
|
|
2266
|
+
// Accumulate directly into the document total, or accumulate in one of four
|
|
2267
|
+
// chunk totals. The purpose of the multiple chunk totals is to piece
|
|
2268
|
+
// together short choppy pieces of text in alternating scripts. One total is
|
|
2269
|
+
// dedicated to Latin text, one to Han text, and the other two are dynamically
|
|
2270
|
+
// assigned.
|
|
2271
|
+
Language onlylang = cld::kOnlyLanguagePerLScript[lscript];
|
|
2272
|
+
|
|
2273
|
+
if (onlylang != UNKNOWN_LANGUAGE) {
|
|
2274
|
+
// This entire script run is in a single language.
|
|
2275
|
+
ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote,
|
|
2276
|
+
lang_hint_boost, flags, plus_one);
|
|
2277
|
+
} else if (cld::kScoreUniPerLScript[lscript] != 0) {
|
|
2278
|
+
// This entire script run's languages can be distinguished by uni-grams
|
|
2279
|
+
// Accumulate in hani_tote
|
|
2280
|
+
int tote_num = 1;
|
|
2281
|
+
if (!tote_seen[tote_num]) {
|
|
2282
|
+
tote_seen[tote_num] = true;
|
|
2283
|
+
// Default language gets 1 byte
|
|
2284
|
+
total_text_bytes += 1;
|
|
2285
|
+
InitScriptToteLang(&totes[tote_num], lscript);
|
|
2286
|
+
}
|
|
2287
|
+
ScoreUnigrams(tables->unigram_obj,
|
|
2288
|
+
&scriptspan, &tote_grams[tote_num], chunksizeunis,
|
|
2289
|
+
&totes[tote_num],
|
|
2290
|
+
&doc_tote, lang_hint_boost,
|
|
2291
|
+
advance_by, flags, &initial_word_span, plus_one);
|
|
2292
|
+
} else {
|
|
2293
|
+
// This entire script-run's languages can be distinguished by quad-grams
|
|
2294
|
+
// Accumulate in latn_tote or script0/1_tote
|
|
2295
|
+
int tote_num = -1;
|
|
2296
|
+
for (int t = 0; t < 4; ++t) {
|
|
2297
|
+
if (lscript == tote_script[t]) {
|
|
2298
|
+
tote_num = t;
|
|
2299
|
+
break;
|
|
2300
|
+
}
|
|
2301
|
+
}
|
|
2302
|
+
if (tote_num < 0) {
|
|
2303
|
+
// Need to allocate other0/1
|
|
2304
|
+
tote_num = next_other_tote;
|
|
2305
|
+
next_other_tote ^= 1; // Round-robin
|
|
2306
|
+
if (tote_seen[tote_num]) {
|
|
2307
|
+
// Flush previous
|
|
2308
|
+
ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by,
|
|
2309
|
+
tote_script[tote_num], &totes[tote_num],
|
|
2310
|
+
&doc_tote, tote_grams[tote_num], lang_hint_boost);
|
|
2311
|
+
totes[tote_num].Reinit();
|
|
2312
|
+
}
|
|
2313
|
+
tote_script[tote_num] = lscript;
|
|
2314
|
+
}
|
|
2315
|
+
|
|
2316
|
+
if (!tote_seen[tote_num]) {
|
|
2317
|
+
tote_seen[tote_num] = true;
|
|
2318
|
+
// Default language gets 1 byte
|
|
2319
|
+
total_text_bytes += 1;
|
|
2320
|
+
InitScriptToteLang(&totes[tote_num], lscript);
|
|
2321
|
+
}
|
|
2322
|
+
|
|
2323
|
+
// The actual accumulation, possibly with word scoring also
|
|
2324
|
+
ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num],
|
|
2325
|
+
chunksizequads,
|
|
2326
|
+
&totes[tote_num],
|
|
2327
|
+
&doc_tote, lang_hint_boost,
|
|
2328
|
+
advance_by, flags, &initial_word_span, plus_one);
|
|
2329
|
+
}
|
|
2330
|
+
|
|
2331
|
+
total_text_bytes += scriptspan.text_bytes;
|
|
2332
|
+
|
|
2333
|
+
// For long documents, do less-dense samples the further along we go.
|
|
2334
|
+
// This is to keep speed sublinear in document size.
|
|
2335
|
+
if (total_text_bytes > advance_limit) {
|
|
2336
|
+
if (total_text_bytes > textlimit) {
|
|
2337
|
+
// Don't look at rest of doc
|
|
2338
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
|
2339
|
+
fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>",
|
|
2340
|
+
total_text_bytes, textlimit);
|
|
2341
|
+
}
|
|
2342
|
+
break;
|
|
2343
|
+
}
|
|
2344
|
+
advance_by <<= 1; // Double advance bytes
|
|
2345
|
+
advance_limit <<= 1; // Double limit until next change
|
|
2346
|
+
spantooshortlimit <<= 1; // Double short-span size
|
|
2347
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
|
2348
|
+
fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>",
|
|
2349
|
+
total_text_bytes, advance_by);
|
|
2350
|
+
}
|
|
2351
|
+
}
|
|
2352
|
+
} // End while (ss.GetOneScriptSpanLower())
|
|
2353
|
+
|
|
2354
|
+
// Deallocate full-document prediction table
|
|
2355
|
+
delete[] predict_tbl;
|
|
2356
|
+
|
|
2357
|
+
// Flush pending totals
|
|
2358
|
+
for (int tote_num = 0; tote_num < 4; ++tote_num) {
|
|
2359
|
+
if (tote_seen[tote_num]) {
|
|
2360
|
+
ScoreChunkIntoDoc2(kToteName[tote_num], advance_by,
|
|
2361
|
+
tote_script[tote_num], &totes[tote_num], &doc_tote,
|
|
2362
|
+
tote_grams[tote_num], lang_hint_boost);
|
|
2363
|
+
}
|
|
2364
|
+
}
|
|
2365
|
+
|
|
2366
|
+
// If extended languages are disallowed, remove them here
|
|
2367
|
+
if (!allow_extended_lang) {
|
|
2368
|
+
RemoveExtendedLanguages(&doc_tote);
|
|
2369
|
+
}
|
|
2370
|
+
|
|
2371
|
+
// Force close pairs to one or the other
|
|
2372
|
+
RefineScoredClosePairs(&doc_tote);
|
|
2373
|
+
|
|
2374
|
+
|
|
2375
|
+
// Calculate return results
|
|
2376
|
+
// Find top three byte counts in tote heap
|
|
2377
|
+
int reliable_percent3[3];
|
|
2378
|
+
|
|
2379
|
+
|
|
2380
|
+
// Cannot use Add, etc. after sorting
|
|
2381
|
+
doc_tote.Sort(3);
|
|
2382
|
+
|
|
2383
|
+
ExtractLangEtc(&doc_tote, total_text_bytes,
|
|
2384
|
+
reliable_percent3, language3, percent3, normalized_score3,
|
|
2385
|
+
text_bytes, is_reliable);
|
|
2386
|
+
|
|
2387
|
+
bool have_good_answer = false;
|
|
2388
|
+
if (FlagFinish(flags)) {
|
|
2389
|
+
// Force a result
|
|
2390
|
+
have_good_answer = true;
|
|
2391
|
+
} else if (total_text_bytes <= kShortTextThresh) {
|
|
2392
|
+
// Don't recurse on short text -- we already did word scores
|
|
2393
|
+
have_good_answer = true;
|
|
2394
|
+
} else if (*is_reliable &&
|
|
2395
|
+
(percent3[0] >= kGoodLang1Percent)) {
|
|
2396
|
+
have_good_answer = true;
|
|
2397
|
+
} else if (*is_reliable &&
|
|
2398
|
+
((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
|
|
2399
|
+
have_good_answer = true;
|
|
2400
|
+
}
|
|
2401
|
+
|
|
2402
|
+
|
|
2403
|
+
if (have_good_answer) {
|
|
2404
|
+
// This is the real, non-recursive return
|
|
2405
|
+
|
|
2406
|
+
// Move bytes for unreliable langs to another lang or
|
|
2407
|
+
// UNKNOWN
|
|
2408
|
+
RemoveUnreliableLanguages(&doc_tote, do_remove_weak_matches);
|
|
2409
|
+
|
|
2410
|
+
// Redo the result extraction after the removal above
|
|
2411
|
+
doc_tote.Sort(3);
|
|
2412
|
+
|
|
2413
|
+
ExtractLangEtc(&doc_tote, total_text_bytes,
|
|
2414
|
+
reliable_percent3, language3, percent3, normalized_score3,
|
|
2415
|
+
text_bytes, is_reliable);
|
|
2416
|
+
|
|
2417
|
+
#if 0
|
|
2418
|
+
// OLD code, replaced by CalcSummaryLang
|
|
2419
|
+
//
|
|
2420
|
+
// Suppress ignore-me text, TG_UNKNOWN_LANGUAGE if 2nd or 3rd language
|
|
2421
|
+
// Force it to English if first language
|
|
2422
|
+
if (language3[2] == TG_UNKNOWN_LANGUAGE) {
|
|
2423
|
+
reliable_percent3[2] = 0;
|
|
2424
|
+
language3[2] = UNKNOWN_LANGUAGE;
|
|
2425
|
+
percent3[2] = 0;
|
|
2426
|
+
} else if (language3[1] == TG_UNKNOWN_LANGUAGE) {
|
|
2427
|
+
// Move up lower language
|
|
2428
|
+
reliable_percent3[1] = reliable_percent3[2];
|
|
2429
|
+
language3[1] = language3[2];
|
|
2430
|
+
percent3[1] = percent3[2];
|
|
2431
|
+
reliable_percent3[2] = 0;
|
|
2432
|
+
language3[2] = UNKNOWN_LANGUAGE;
|
|
2433
|
+
percent3[2] = 0;
|
|
2434
|
+
} else if (language3[0] == TG_UNKNOWN_LANGUAGE) {
|
|
2435
|
+
language3[0] = ENGLISH;
|
|
2436
|
+
}
|
|
2437
|
+
|
|
2438
|
+
if (language3[0] == UNKNOWN_LANGUAGE) {
|
|
2439
|
+
// Last-ditch test for some result, but it is UNKNOWN_LANGUAGE
|
|
2440
|
+
// Force it to English (should not happen)
|
|
2441
|
+
language3[0] = ENGLISH;
|
|
2442
|
+
percent3[0] = 100;
|
|
2443
|
+
*is_reliable = true;
|
|
2444
|
+
}
|
|
2445
|
+
#endif
|
|
2446
|
+
|
|
2447
|
+
|
|
2448
|
+
#if 0
|
|
2449
|
+
// Scaffolding to reveal subset sequence lang distribution across doc text
|
|
2450
|
+
// Track the sequence of language fragments [result currently unused]
|
|
2451
|
+
if (FLAGS_cld_html) {
|
|
2452
|
+
static const int kMaxSubsetSeq = 12;
|
|
2453
|
+
uint8 subseq[kMaxSubsetSeq];
|
|
2454
|
+
doc_tote.ExtractSeq(kMaxSubsetSeq, subseq);
|
|
2455
|
+
|
|
2456
|
+
fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq);
|
|
2457
|
+
for (int i = 0; i < kMaxSubsetSeq; ++i) {
|
|
2458
|
+
fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i])));
|
|
2459
|
+
if ((i % 4) == 3) {fprintf(stderr, " ");}
|
|
2460
|
+
}
|
|
2461
|
+
fprintf(stderr, " ");
|
|
2462
|
+
|
|
2463
|
+
for (int i = 0; i < 3; ++i) {
|
|
2464
|
+
if (language3[i] != UNKNOWN_LANGUAGE) {
|
|
2465
|
+
fprintf(stderr, "%s.%d(%d%%) ",
|
|
2466
|
+
ExtLanguageCode(language3[i]),
|
|
2467
|
+
reliable_percent3[i],
|
|
2468
|
+
percent3[i]);
|
|
2469
|
+
}
|
|
2470
|
+
}
|
|
2471
|
+
|
|
2472
|
+
fprintf(stderr, "%d B ", total_text_bytes);
|
|
2473
|
+
fprintf(stderr, "<br>\n");
|
|
2474
|
+
}
|
|
2475
|
+
// End Scaffolding to reveal subset sequence lang distribution
|
|
2476
|
+
#endif
|
|
2477
|
+
|
|
2478
|
+
Language summary_lang;
|
|
2479
|
+
if (do_pick_summary_language) {
|
|
2480
|
+
CalcSummaryLang(&doc_tote, total_text_bytes,
|
|
2481
|
+
reliable_percent3, language3, percent3,
|
|
2482
|
+
&summary_lang, is_reliable);
|
|
2483
|
+
} else {
|
|
2484
|
+
summary_lang = language3[0];
|
|
2485
|
+
}
|
|
2486
|
+
|
|
2487
|
+
if (FLAGS_cld_html) {
|
|
2488
|
+
for (int i = 0; i < 3; ++i) {
|
|
2489
|
+
if (language3[i] != UNKNOWN_LANGUAGE) {
|
|
2490
|
+
fprintf(stderr, "%s.%d(%d%%) ",
|
|
2491
|
+
ExtLanguageCode(language3[i]),
|
|
2492
|
+
reliable_percent3[i],
|
|
2493
|
+
percent3[i]);
|
|
2494
|
+
}
|
|
2495
|
+
}
|
|
2496
|
+
|
|
2497
|
+
fprintf(stderr, "%d B ", total_text_bytes);
|
|
2498
|
+
fprintf(stderr, "= %s%c ",
|
|
2499
|
+
ExtLanguageName(summary_lang), is_reliable ? ' ' : '*');
|
|
2500
|
+
fprintf(stderr, "<br>\n");
|
|
2501
|
+
}
|
|
2502
|
+
|
|
2503
|
+
return summary_lang;
|
|
2504
|
+
}
|
|
2505
|
+
|
|
2506
|
+
// Not a good answer -- do recursive call to refine
|
|
2507
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
|
2508
|
+
// This is what we hope to improve on in the recursive call, if any
|
|
2509
|
+
PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
|
|
2510
|
+
}
|
|
2511
|
+
|
|
2512
|
+
// For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
|
|
2513
|
+
// For this purpose, we treat "Ignore" as top40
|
|
2514
|
+
Language new_plus_one = UNKNOWN_LANGUAGE;
|
|
2515
|
+
if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) {
|
|
2516
|
+
new_plus_one = language3[0];
|
|
2517
|
+
} else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) {
|
|
2518
|
+
new_plus_one = language3[1];
|
|
2519
|
+
}
|
|
2520
|
+
|
|
2521
|
+
if (total_text_bytes < kShortTextThresh) {
|
|
2522
|
+
// Short text: Recursive call with top40 and short set
|
|
2523
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
|
2524
|
+
fprintf(stderr, " ---text_bytes[%d] "
|
|
2525
|
+
"Recursive(Top40/Rep/Short/Words)---<br><br>\n",
|
|
2526
|
+
total_text_bytes);
|
|
2527
|
+
}
|
|
2528
|
+
return DetectLanguageSummaryV25(
|
|
2529
|
+
tables,
|
|
2530
|
+
buffer,
|
|
2531
|
+
buffer_length,
|
|
2532
|
+
is_plain_text,
|
|
2533
|
+
do_pick_summary_language,
|
|
2534
|
+
do_remove_weak_matches,
|
|
2535
|
+
tld_hint, // "id" boosts Indonesian
|
|
2536
|
+
encoding_hint, // SJS boosts Japanese
|
|
2537
|
+
language_hint, // ITALIAN boosts it
|
|
2538
|
+
allow_extended_lang,
|
|
2539
|
+
flags | kCLDFlagTop40 | kCLDFlagRepeats |
|
|
2540
|
+
kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
|
|
2541
|
+
new_plus_one,
|
|
2542
|
+
language3,
|
|
2543
|
+
percent3,
|
|
2544
|
+
normalized_score3,
|
|
2545
|
+
text_bytes,
|
|
2546
|
+
is_reliable);
|
|
2547
|
+
}
|
|
2548
|
+
|
|
2549
|
+
// Longer text: Recursive call with top40 set
|
|
2550
|
+
if (FLAGS_cld_html || FLAGS_dbgscore) {
|
|
2551
|
+
fprintf(stderr,
|
|
2552
|
+
" ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
|
|
2553
|
+
total_text_bytes);
|
|
2554
|
+
}
|
|
2555
|
+
return DetectLanguageSummaryV25(
|
|
2556
|
+
tables,
|
|
2557
|
+
buffer,
|
|
2558
|
+
buffer_length,
|
|
2559
|
+
is_plain_text,
|
|
2560
|
+
do_pick_summary_language,
|
|
2561
|
+
do_remove_weak_matches,
|
|
2562
|
+
tld_hint, // "id" boosts Indonesian
|
|
2563
|
+
encoding_hint, // SJS boosts Japanese
|
|
2564
|
+
language_hint, // ITALIAN boosts it
|
|
2565
|
+
allow_extended_lang,
|
|
2566
|
+
flags | kCLDFlagTop40 | kCLDFlagRepeats |
|
|
2567
|
+
kCLDFlagFinish,
|
|
2568
|
+
new_plus_one,
|
|
2569
|
+
language3,
|
|
2570
|
+
percent3,
|
|
2571
|
+
normalized_score3,
|
|
2572
|
+
text_bytes,
|
|
2573
|
+
is_reliable);
|
|
2574
|
+
} // End CompactLangDetImpl::DetectLanguageSummaryV25
|