language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,2574 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include <stdio.h>
6
+ #include <string.h>
7
+ //#include <sys/time.h> // for gettimeofday
8
+ #include <string>
9
+
10
+ #include "encodings/lang_enc.h"
11
+
12
+ #include "encodings/compact_lang_det/compact_lang_det.h"
13
+ #include "encodings/compact_lang_det/compact_lang_det_impl.h"
14
+ #include "encodings/compact_lang_det/getonescriptspan.h"
15
+ #include "encodings/compact_lang_det/letterscript_enum.h"
16
+ #include "encodings/compact_lang_det/tote.h"
17
+ #include "encodings/compact_lang_det/utf8propjustletter.h"
18
+ #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
19
+ #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
20
+
21
+ #include "encodings/compact_lang_det/cldutil_dbg.h"
22
+
23
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
24
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
25
+ #include "encodings/compact_lang_det/win/cld_google.h"
26
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
27
+
28
+ // Linker supplies the right tables: the objects below are only declared here (extern), so their definitions must come from other translation units at link time.
29
+ extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj;
30
+ extern const cld::CLDTableSummary kCjkBiTable_obj;
31
+ extern const cld::CLDTableSummary kQuadTable_obj;
32
+ extern const cld::CLDTableSummary kLongWord8Table_obj;
33
+
34
+ DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr");  // Flag definitions; arguments appear to be (name, default, help text) -- the DEFINE_* macros are defined outside this file, confirm there
35
+ DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads");
36
+
37
+ DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text");
38
+ DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr");
39
+ DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text");
40
+ // 20 quadgrams is about 80 bytes or about 12 words in real text; 20 is the value used for the smoothing width flag below
41
+ DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams");
42
+
43
+
44
+ static const int kLangHintInitial = 12; // Boost a hinted language by N initially
45
+ static const int kLangHintBoost = 12; // Boost a hinted language by N/16 per quadgram
46
+
47
+ static const int kShortSpanThresh = 32; // Short-span threshold, in bytes
48
+ static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans
49
+
50
+ static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing
51
+ // after this many text bytes
52
+ static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger squeezing
53
+ static const int kSpacesTriggerPercent = 25; // Trigger squeezing if >=25% spaces
54
+ static const int kPredictTriggerPercent = 67; // Trigger squeezing if >=67% predicted
55
+
56
+ static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks
57
+ static const int kSpacesThreshPercent = 25; // Squeeze a chunk if >=25% spaces
58
+ static const int kPredictThreshPercent = 40; // Squeeze a chunk if >=40% predicted
59
+
60
+ static const int kMaxSpaceScan = 32; // Limit on scanning for a space, in bytes
61
+
62
+ static const int kGoodLang1Percent = 70; // NOTE(review): presumably a percent threshold for the top language alone -- confirm at the use site
63
+ static const int kGoodLang1and2Percent = 93; // NOTE(review): presumably a percent threshold for the top two languages combined -- confirm at the use site
64
+ static const int kShortTextThresh = 256; // Short-text threshold, in bytes
65
+
66
+ static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads
67
+ static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads
68
+
69
+ static const int kDefaultWordSpan = 256; // Scan at least this many initial
70
+ // bytes with word scoring
71
+ static const int kReallyBigWordSpan = 9999999; // Forces word scoring of all text
72
+
73
+ static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable
74
+
75
+ static const int kPredictionTableSize = 4096; // Must be exactly 4096 for
76
+ // the cheap compressor
77
+
78
+ //
79
+ // Generated by dsites 2008.07.07 from 10% of Base
80
+ //
81
+
82
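+ // Hint table subscripted by Encoding: each uint32 packs up to three language probs (the languages and weights spelled out in the entry's trailing comment); 0x00000000 entries carry no hint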
83
+ static const uint32 kEncodingHintProbs[] = {
84
+ 0x00000000, // ASCII
85
+ 0x18120cd5, // Latin2 POLISH.11 CZECH.5 HUNGARIAN.3
86
+ 0x1d3a4bc9, // Latin3 AZERBAIJANI.10 BASQUE.3 CROATIAN.1
87
+ 0x030819d4, // Latin4 ESTONIAN.11 ITALIAN.4 DUTCH.2
88
+ 0x00000000, // ISO-8859-5
89
+ 0x00003742, // Arabic ARABIC.12
90
+ 0x00000000, // Greek
91
+ 0x00000742, // Hebrew HEBREW.12
92
+ 0x00002242, // Latin5 TURKISH.12
93
+ 0x060419c9, // Latin6 ESTONIAN.10 FINNISH.3 GERMAN.1
94
+ 0x00000942, // EUC-JP Japanese.12
95
+ 0x00000942, // SJS Japanese.12
96
+ 0x00000942, // JIS Japanese.12
97
+ 0x00004642, // BIG5 ChineseT.12
98
+ 0x00001142, // GB Chinese.12
99
+ 0x46295fcd, // EUC-CN UIGHUR.10 MALAY.6 ChineseT.5
100
+ 0x00000a42, // KSC Korean.12
101
+ 0x00000000, // Unicode
102
+ 0x03104674, // EUC ChineseT.9 SWEDISH.8 DUTCH.3
103
+ 0x00000000, // CNS
104
+ 0x0f1146c3, // BIG5-CP950 ChineseT.9 Chinese.5 SPANISH.4
105
+ 0x00000942, // CP932 Japanese.12
106
+ 0x00000000, // UTF8
107
+ 0x00000000, // Unknown
108
+ 0x00000000, // ASCII-7-bit
109
+ 0x00000000, // KOI8R
110
+ 0x00000000, // CP1251
111
+ 0x00000000, // CP1252
112
+ 0x00000000, // KOI8U
113
+ 0x451d12cd, // CP1250 CZECH.10 CROATIAN.6 SLOVAK.5
114
+ 0x0d06052a, // ISO-8859-15 FRENCH.9 GERMAN.8 PORTUGUESE.7
115
+ 0x00002242, // CP1254 TURKISH.12
116
+ 0x191516be, // CP1257 LITHUANIAN.8 LATVIAN.7 ESTONIAN.7
117
+ 0x08003642, // ISO-8859-11 THAI.12 ITALIAN.1
118
+ 0x00000000, // CP874
119
+ 0x00003742, // CP1256 ARABIC.12
120
+ 0x00000742, // CP1255 HEBREW.12
121
+ 0x00000000, // ISO-8859-8-I
122
+ 0x00000000, // VISUAL
123
+ 0x00000000, // CP852
124
+ 0x39001242, // CSN_369103 CZECH.12 ESPERANTO.1
125
+ 0x00000000, // CP1253
126
+ 0x00000000, // CP866
127
+ 0x2e001944, // ISO-8859-13 ESTONIAN.12 ALBANIAN.3
128
+ 0x08090a74, // ISO-2022-KR Korean.9 Japanese.8 ITALIAN.3
129
+ 0x00001142, // GBK Chinese.12
130
+ 0x4600113d, // GB18030 Chinese.11 ChineseT.7
131
+ 0x00004642, // BIG5_HKSCS ChineseT.12
132
+ 0x00000000, // ISO_2022_CN
133
+ 0x00000000, // TSCII
134
+ 0x00000000, // TAM
135
+ 0x00000000, // TAB
136
+ 0x00000000, // JAGRAN
137
+ 0x00000000, // MACINTOSH
138
+ 0x00000000, // UTF7
139
+ 0x00000000, // BHASKAR
140
+ 0x00000000, // HTCHANAKYA
141
+ 0x090646ca, // UTF-16BE ChineseT.10 GERMAN.4 Japanese.2
142
+ 0x00000000, // UTF-16LE
143
+ 0x00000000, // UTF-32BE
144
+ 0x00000000, // UTF-32LE
145
+ 0x00000000, // X-BINARYENC
146
+ 0x06001142, // HZ-GB-2312 Chinese.12 GERMAN.1
147
+ 0x461109c2, // X-UTF8UTF8 Japanese.9 Chinese.5 ChineseT.3
148
+ 0x00000000, // X-TAM-ELANGO
149
+ 0x00000000, // X-TAM-LTTMBARANI
150
+ 0x00000000, // X-TAM-SHREE
151
+ 0x00000000, // X-TAM-TBOOMIS
152
+ 0x00000000, // X-TAM-TMNEWS
153
+ 0x00000000, // X-TAM-WEBTAMIL
154
+ 0x00000000, // X-KDDI-Shift_JIS
155
+ 0x00000000, // X-DoCoMo-Shift_JIS
156
+ 0x00000000, // X-SoftBank-Shift_JIS
157
+ 0x00000000, // X-KDDI-ISO-2022-JP
158
+ 0x00000000, // X-SoftBank-ISO-2022-JP
159
+ };
160
+
161
+ COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS,  // Size check: the table must have exactly NUM_ENCODINGS entries, one per Encoding subscript (COMPILE_ASSERT is defined outside this file)
162
+ kEncodingHintProbs_has_incorrect_size);
163
+
164
+ //
165
+ // Generated by dsites 2008.07.07 from 10% of Base
166
+ //
167
+
168
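+ // Hint table subscripted by (anchor) language: each uint32 packs up to three language probs (the languages and weights spelled out in the entry's trailing comment); 0x00000000 entries carry no hint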
169
+ static const uint32 kLanguageHintProbs[] = {
170
+ 0x00000000, // ENGLISH
171
+ 0x00000242, // DANISH DANISH.12
172
+ 0x00000342, // DUTCH DUTCH.12
173
+ 0x00000442, // FINNISH FINNISH.12
174
+ 0x00000542, // FRENCH FRENCH.12
175
+ 0x00000642, // GERMAN GERMAN.12
176
+ 0x00000742, // HEBREW HEBREW.12
177
+ 0x00000842, // ITALIAN ITALIAN.12
178
+ 0x00000942, // Japanese Japanese.12
179
+ 0x00000a42, // Korean Korean.12
180
+ 0x51000b43, // NORWEGIAN NORWEGIAN.12 NORWEGIAN_N.2
181
+ 0x00000c42, // POLISH POLISH.12
182
+ 0x00000d42, // PORTUGUESE PORTUGUESE.12
183
+ 0x00000000, // RUSSIAN
184
+ 0x00000f42, // SPANISH SPANISH.12
185
+ 0x00001042, // SWEDISH SWEDISH.12
186
+ 0x00001142, // Chinese Chinese.12
187
+ 0x00001242, // CZECH CZECH.12
188
+ 0x00000000, // GREEK
189
+ 0x47001442, // ICELANDIC ICELANDIC.12 FAROESE.1
190
+ 0x00001542, // LATVIAN LATVIAN.12
191
+ 0x00001642, // LITHUANIAN LITHUANIAN.12
192
+ 0x00001742, // ROMANIAN ROMANIAN.12
193
+ 0x00001842, // HUNGARIAN HUNGARIAN.12
194
+ 0x00001942, // ESTONIAN ESTONIAN.12
195
+ 0x00000000, // TG_UNKNOWN_LANGUAGE
196
+ 0x00000000, // Unknown
197
+ 0x00001c42, // BULGARIAN BULGARIAN.12
198
+ 0x00001d42, // CROATIAN CROATIAN.12
199
+ 0x1e001d46, // SERBIAN CROATIAN.12 SERBIAN.5
200
+ 0x00000000, // IRISH
201
+ 0x0f00203d, // GALICIAN GALICIAN.11 SPANISH.7
202
+ 0x5e00213a, // TAGALOG TAGALOG.11 SOMALI.4
203
+ 0x00002242, // TURKISH TURKISH.12
204
+ 0x00002342, // UKRAINIAN UKRAINIAN.12
205
+ 0x00000000, // HINDI
206
+ 0x1c1e25d4, // MACEDONIAN MACEDONIAN.11 SERBIAN.4 BULGARIAN.2
207
+ 0x00002642, // BENGALI BENGALI.12
208
+ 0x00002742, // INDONESIAN INDONESIAN.12
209
+ 0x00000000, // LATIN
210
+ 0x2700293c, // MALAY MALAY.11 INDONESIAN.6
211
+ 0x00000000, // MALAYALAM
212
+ 0x00000000, // WELSH
213
+ 0x00000000, // NEPALI
214
+ 0x00000000, // TELUGU
215
+ 0x00002e42, // ALBANIAN ALBANIAN.12
216
+ 0x00000000, // TAMIL
217
+ 0x00003042, // BELARUSIAN BELARUSIAN.12
218
+ 0x00000000, // JAVANESE
219
+ 0x00000000, // OCCITAN
220
+ 0x375f3330, // URDU URDU.10 UIGHUR.7 ARABIC.4
221
+ 0x41003436, // BIHARI BIHARI.10 MARATHI.10
222
+ 0x00000000, // GUJARATI
223
+ 0x0a4636b2, // THAI THAI.7 ChineseT.3 Korean.2
224
+ 0x00003742, // ARABIC ARABIC.12
225
+ 0x00003842, // CATALAN CATALAN.12
226
+ 0x00003942, // ESPERANTO ESPERANTO.12
227
+ 0x00003a42, // BASQUE BASQUE.12
228
+ 0x00000000, // INTERLINGUA
229
+ 0x00000000, // KANNADA
230
+ 0x05060cca, // PUNJABI POLISH.10 GERMAN.4 FRENCH.2
231
+ 0x00000000, // SCOTS_GAELIC
232
+ 0x00003f42, // SWAHILI SWAHILI.12
233
+ 0x00004042, // SLOVENIAN SLOVENIAN.12
234
+ 0x00004142, // MARATHI MARATHI.12
235
+ 0x00004242, // MALTESE MALTESE.12
236
+ 0x00004342, // VIETNAMESE VIETNAMESE.12
237
+ 0x00000000, // FRISIAN
238
+ 0x12004543, // SLOVAK SLOVAK.12 CZECH.2
239
+ 0x00004642, // ChineseT ChineseT.12
240
+ 0x00000000, // FAROESE
241
+ 0x00000000, // SUNDANESE
242
+ 0x79004944, // UZBEK UZBEK.12 TAJIK.3
243
+ 0x4d004a46, // AMHARIC AMHARIC.12 TIGRINYA.5
244
+ 0x00004b42, // AZERBAIJANI AZERBAIJANI.12
245
+ 0x00000000, // GEORGIAN
246
+ 0x00000000, // TIGRINYA
247
+ 0x00004e42, // PERSIAN PERSIAN.12
248
+ 0x00000000, // BOSNIAN
249
+ 0x00000000, // SINHALESE
250
+ 0x00000000, // NORWEGIAN_N
251
+ 0x00000000, // PORTUGUESE_P
252
+ 0x00000000, // PORTUGUESE_B
253
+ 0x00000000, // XHOSA
254
+ 0x00000000, // ZULU
255
+ 0x00000000, // GUARANI
256
+ 0x00000000, // SESOTHO
257
+ 0x00000000, // TURKMEN
258
+ 0x7a005933, // KYRGYZ KYRGYZ.10 TATAR.7
259
+ 0x00000000, // BRETON
260
+ 0x00000000, // TWI
261
+ 0x00000000, // YIDDISH
262
+ 0x00000000, // SERBO_CROATIAN
263
+ 0x00000000, // SOMALI
264
+ 0x00005f42, // UIGHUR UIGHUR.12
265
+ 0x00006042, // KURDISH KURDISH.12
266
+ 0x00006142, // MONGOLIAN MONGOLIAN.12
267
+ 0x051130c9, // ARMENIAN BELARUSIAN.10 Chinese.3 FRENCH.1
268
+ 0x020f0521, // LAOTHIAN FRENCH.8 SPANISH.7 DANISH.6
269
+ 0x64004e35, // SINDHI PERSIAN.10 SINDHI.9
270
+ 0x00000000, // RHAETO_ROMANCE
271
+ 0x00006642, // AFRIKAANS AFRIKAANS.12
272
+ 0x00000000, // LUXEMBOURGISH
273
+ 0x00006842, // BURMESE BURMESE.12
274
+ 0x00002242, // KHMER TURKISH.12
275
+ 0x88006a3c, // TIBETAN TIBETAN.11 DZONGKHA.6
276
+ 0x00000000, // DHIVEHI
277
+ 0x00000000, // CHEROKEE
278
+ 0x00000000, // SYRIAC
279
+ 0x00000000, // LIMBU
280
+ 0x00000000, // ORIYA
281
+ 0x00000000, // ASSAMESE
282
+ 0x00000000, // CORSICAN
283
+ 0x00000000, // INTERLINGUE
284
+ 0x00007342, // KAZAKH KAZAKH.12
285
+ 0x00000000, // LINGALA
286
+ 0x00000000, // MOLDAVIAN
287
+ 0x5f007645, // PASHTO PASHTO.12 UIGHUR.4
288
+ 0x00000000, // QUECHUA
289
+ 0x00000000, // SHONA
290
+ 0x00007942, // TAJIK TAJIK.12
291
+ 0x00000000, // TATAR
292
+ 0x00000000, // TONGA
293
+ 0x00000000, // YORUBA
294
+ 0x00000000, // CREOLES_AND_PIDGINS_ENGLISH_BASED
295
+ 0x00000000, // CREOLES_AND_PIDGINS_FRENCH_BASED
296
+ 0x00000000, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
297
+ 0x00000000, // CREOLES_AND_PIDGINS_OTHER
298
+ 0x00000000, // MAORI
299
+ 0x00000000, // WOLOF
300
+ 0x00000000, // ABKHAZIAN
301
+ 0x00000000, // AFAR
302
+ 0x00000000, // AYMARA
303
+ 0x00000000, // BASHKIR
304
+ 0x00000000, // BISLAMA
305
+ 0x00000000, // DZONGKHA
306
+ 0x00000000, // FIJIAN
307
+ 0x00000000, // GREENLANDIC
308
+ 0x00000000, // HAUSA
309
+ 0x00000000, // HAITIAN_CREOLE
310
+ 0x00000000, // INUPIAK
311
+ 0x00000542, // INUKTITUT FRENCH.12
312
+ 0x00000000, // KASHMIRI
313
+ 0x00000000, // KINYARWANDA
314
+ 0x00000000, // MALAGASY
315
+ 0x00000000, // NAURU
316
+ 0x00000000, // OROMO
317
+ 0x00000000, // RUNDI
318
+ 0x00000000, // SAMOAN
319
+ 0x00000000, // SANGO
320
+ 0x344197d3, // SANSKRIT SANSKRIT.11 MARATHI.4 BIHARI.1
321
+ 0x00000000, // SISWANT
322
+ 0x00000000, // TSONGA
323
+ 0x00000000, // TSWANA
324
+ 0x00000000, // VOLAPUK
325
+ 0x00000000, // ZHUANG
326
+ 0x00000000, // KHASI
327
+ 0x00000000, // SCOTS
328
+ 0x00000000, // GANDA
329
+ 0x00000000, // MANX
330
+ 0x00000000, // MONTENEGRIN
331
+ // Add new language hints just before here (just use 0x00000000)
332
+ };
333
+
334
+ COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES,  // Size check: the table must have exactly NUM_LANGUAGES entries, one per language subscript (COMPILE_ASSERT is defined outside this file)
335
+ kLanguageHintProbs_has_incorrect_size);
336
+
337
+ //
338
+ // Generated by dsites 2008.07.07 from 10% of Base
339
+ //
340
+
341
+ typedef struct {  // One row of a keyed hint table: a fixed-width key plus its packed language probs
342
+ char key[4];  // Exactly four bytes with no terminating NUL; in the TLD table shorter keys are padded with '_' (0x5f), e.g. "ac__"
343
+ uint32 probs;  // Three packed language probs, written the same way as the entries of the hint tables above
344
+ } HintEntry;
345
+
346
+
347
+ // Massaged TLD (padded to four bytes with '_'), followed by three packed language probs
348
+ // Hand-removed 4 items (presumably the rows left commented out in the table below) dsites 2008.07.15
349
+ static const int kTLDHintProbsSize = 201;
350
+ static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = { // MaxRange 12
351
+ {{0x61,0x63,0x5f,0x5f}, 0x0a000945}, // ac__ Japanese.12 Korean.4
352
+ {{0x61,0x64,0x5f,0x5f}, 0x00003842}, // ad__ CATALAN.12
353
+ {{0x61,0x65,0x5f,0x5f}, 0x00003742}, // ae__ ARABIC.12
354
+ {{0x61,0x66,0x5f,0x5f}, 0x4e00763d}, // af__ PASHTO.11 PERSIAN.7
355
+ {{0x61,0x67,0x5f,0x5f}, 0x09000643}, // ag__ GERMAN.12 Japanese.2
356
+ {{0x61,0x69,0x5f,0x5f}, 0x0c180938}, // ai__ Japanese.11 HUNGARIAN.7 POLISH.2
357
+ {{0x61,0x6c,0x5f,0x5f}, 0x00002e42}, // al__ ALBANIAN.12
358
+ {{0x61,0x6e,0x5f,0x5f}, 0x6e00033d}, // an__ DUTCH.11 LIMBU.7
359
+ {{0x61,0x6f,0x5f,0x5f}, 0x05000d42}, // ao__ PORTUGUESE.12 FRENCH.1
360
+ {{0x61,0x71,0x5f,0x5f}, 0x05000f29}, // aq__ SPANISH.9 FRENCH.6
361
+ {{0x61,0x72,0x5f,0x5f}, 0x00000f42}, // ar__ SPANISH.12
362
+ {{0x61,0x73,0x5f,0x5f}, 0x0f120bcd}, // as__ NORWEGIAN.10 CZECH.6 SPANISH.5
363
+ {{0x61,0x74,0x5f,0x5f}, 0x00000642}, // at__ GERMAN.12
364
+ {{0x61,0x77,0x5f,0x5f}, 0x0f000345}, // aw__ DUTCH.12 SPANISH.4
365
+ {{0x61,0x78,0x5f,0x5f}, 0x00001042}, // ax__ SWEDISH.12
366
+ {{0x61,0x7a,0x5f,0x5f}, 0x00004b42}, // az__ AZERBAIJANI.12
367
+ {{0x62,0x61,0x5f,0x5f}, 0x00001d42}, // ba__ CROATIAN.12
368
+ {{0x62,0x62,0x5f,0x5f}, 0x00002842}, // bb__ LATIN.12
369
+ {{0x62,0x64,0x5f,0x5f}, 0x00002642}, // bd__ BENGALI.12
370
+ {{0x62,0x65,0x5f,0x5f}, 0x05000335}, // be__ DUTCH.10 FRENCH.9
371
+ {{0x62,0x66,0x5f,0x5f}, 0x00000542}, // bf__ FRENCH.12
372
+ {{0x62,0x67,0x5f,0x5f}, 0x00001c42}, // bg__ BULGARIAN.12
373
+ {{0x62,0x68,0x5f,0x5f}, 0x00003742}, // bh__ ARABIC.12
374
+ {{0x62,0x69,0x5f,0x5f}, 0x0f00053f}, // bi__ FRENCH.11 SPANISH.9
375
+ {{0x62,0x6a,0x5f,0x5f}, 0x00000542}, // bj__ FRENCH.12
376
+ {{0x62,0x6d,0x5f,0x5f}, 0x98043929}, // bm__ ESPERANTO.9 FINNISH.8 SISWANT.6
377
+ {{0x62,0x6e,0x5f,0x5f}, 0x00002942}, // bn__ MALAY.12
378
+ {{0x62,0x6f,0x5f,0x5f}, 0x00000f42}, // bo__ SPANISH.12
379
+ {{0x62,0x72,0x5f,0x5f}, 0x00000d42}, // br__ PORTUGUESE.12
380
+ {{0x62,0x74,0x5f,0x5f}, 0x00008842}, // bt__ DZONGKHA.12
381
+ {{0x62,0x77,0x5f,0x5f}, 0x06059ac4}, // bw__ TSWANA.9 FRENCH.6 GERMAN.5
382
+ {{0x62,0x79,0x5f,0x5f}, 0x00003024}, // by__ BELARUSIAN.9
383
+ {{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924}, // bz__ Japanese.9 Korean.5 SPANISH.1
384
+ {{0x63,0x61,0x5f,0x5f}, 0x00000542}, // ca__ FRENCH.12
385
+ {{0x63,0x61,0x74,0x5f}, 0x00003842}, // cat_ CATALAN.12
386
+ {{0x63,0x64,0x5f,0x5f}, 0x06051224}, // cd__ CZECH.9 FRENCH.5 GERMAN.1
387
+ {{0x63,0x66,0x5f,0x5f}, 0x00000542}, // cf__ FRENCH.12
388
+ {{0x63,0x67,0x5f,0x5f}, 0x00000542}, // cg__ FRENCH.12
389
+ {{0x63,0x68,0x5f,0x5f}, 0x08050638}, // ch__ GERMAN.11 FRENCH.7 ITALIAN.2
390
+ {{0x63,0x69,0x5f,0x5f}, 0x00000542}, // ci__ FRENCH.12
391
+ {{0x63,0x6c,0x5f,0x5f}, 0x00000f42}, // cl__ SPANISH.12
392
+ {{0x63,0x6d,0x5f,0x5f}, 0x00000542}, // cm__ FRENCH.12
393
+ {{0x63,0x6e,0x5f,0x5f}, 0x00001142}, // cn__ Chinese.12
394
+ {{0x63,0x6f,0x5f,0x5f}, 0x00000f42}, // co__ SPANISH.12
395
+ // {{0x63,0x6f,0x6f,0x70}, 0x0f0509cd}, // coop Japanese.10 FRENCH.6 SPANISH.5
396
+ {{0x63,0x72,0x5f,0x5f}, 0x00000f42}, // cr__ SPANISH.12
397
+ {{0x63,0x75,0x5f,0x5f}, 0x00000f42}, // cu__ SPANISH.12
398
+ {{0x63,0x76,0x5f,0x5f}, 0x00000d42}, // cv__ PORTUGUESE.12
399
+ {{0x63,0x78,0x5f,0x5f}, 0x223a091f}, // cx__ Japanese.8 BASQUE.6 TURKISH.4
400
+ {{0x63,0x79,0x5f,0x5f}, 0x150622ba}, // cy__ TURKISH.8 GERMAN.4 LATVIAN.3
401
+ {{0x63,0x7a,0x5f,0x5f}, 0x00001242}, // cz__ CZECH.12
402
+ {{0x64,0x65,0x5f,0x5f}, 0x00000642}, // de__ GERMAN.12
403
+ {{0x64,0x6b,0x5f,0x5f}, 0x00000242}, // dk__ DANISH.12
404
+ {{0x64,0x6f,0x5f,0x5f}, 0x21000f42}, // do__ SPANISH.12 TAGALOG.1
405
+ {{0x64,0x7a,0x5f,0x5f}, 0x37000535}, // dz__ FRENCH.10 ARABIC.9
406
+ {{0x65,0x63,0x5f,0x5f}, 0x00000f42}, // ec__ SPANISH.12
407
+ // {{0x65,0x64,0x75,0x5f}, 0x2e0f3873}, // edu_ CATALAN.9 SPANISH.7 ALBANIAN.2
408
+ {{0x65,0x65,0x5f,0x5f}, 0x00001942}, // ee__ ESTONIAN.12
409
+ {{0x65,0x67,0x5f,0x5f}, 0x05003742}, // eg__ ARABIC.12 FRENCH.1
410
+ {{0x65,0x72,0x5f,0x5f}, 0x00000b42}, // er__ NORWEGIAN.12
411
+ {{0x65,0x73,0x5f,0x5f}, 0x38200fd4}, // es__ SPANISH.11 GALICIAN.4 CATALAN.2
412
+ {{0x65,0x74,0x5f,0x5f}, 0x39004a39}, // et__ AMHARIC.11 ESPERANTO.3
413
+ {{0x66,0x69,0x5f,0x5f}, 0x10000444}, // fi__ FINNISH.12 SWEDISH.3
414
+ {{0x66,0x6a,0x5f,0x5f}, 0x050489e0}, // fj__ FIJIAN.12 FINNISH.5 FRENCH.3
415
+ {{0x66,0x6f,0x5f,0x5f}, 0x00004742}, // fo__ FAROESE.12
416
+ {{0x66,0x72,0x5f,0x5f}, 0x00000542}, // fr__ FRENCH.12
417
+ {{0x67,0x61,0x5f,0x5f}, 0x00000542}, // ga__ FRENCH.12
418
+ {{0x67,0x64,0x5f,0x5f}, 0x061d05d5}, // gd__ FRENCH.11 CROATIAN.5 GERMAN.3
419
+ {{0x67,0x65,0x5f,0x5f}, 0x00004c2d}, // ge__ GEORGIAN.10
420
+ {{0x67,0x66,0x5f,0x5f}, 0x00000542}, // gf__ FRENCH.12
421
+ {{0x67,0x67,0x5f,0x5f}, 0x06002244}, // gg__ TURKISH.12 GERMAN.3
422
+ {{0x67,0x68,0x5f,0x5f}, 0x05000436}, // gh__ FINNISH.10 FRENCH.10
423
+ {{0x67,0x69,0x5f,0x5f}, 0x0f0538ce}, // gi__ CATALAN.10 FRENCH.7 SPANISH.6
424
+ {{0x67,0x6c,0x5f,0x5f}, 0x398a0238}, // gl__ DANISH.11 GREENLANDIC.7 ESPERANTO.2
425
+ {{0x67,0x6d,0x5f,0x5f}, 0x0600043e}, // gm__ FINNISH.11 GERMAN.8
426
+ {{0x67,0x6e,0x5f,0x5f}, 0x00000542}, // gn__ FRENCH.12
427
+ // {{0x67,0x6f,0x76,0x5f}, 0x05000f25}, // gov_ SPANISH.9 FRENCH.2
428
+ {{0x67,0x70,0x5f,0x5f}, 0x00000542}, // gp__ FRENCH.12
429
+ {{0x67,0x71,0x5f,0x5f}, 0x0f000547}, // gq__ FRENCH.12 SPANISH.6
430
+ {{0x67,0x73,0x5f,0x5f}, 0x00000942}, // gs__ Japanese.12
431
+ {{0x67,0x74,0x5f,0x5f}, 0x00000f42}, // gt__ SPANISH.12
432
+ {{0x68,0x6b,0x5f,0x5f}, 0x11004643}, // hk__ ChineseT.12 Chinese.2
433
+ {{0x68,0x6d,0x5f,0x5f}, 0x4606092e}, // hm__ Japanese.10 GERMAN.6 ChineseT.2
434
+ {{0x68,0x6e,0x5f,0x5f}, 0x00000f42}, // hn__ SPANISH.12
435
+ {{0x68,0x72,0x5f,0x5f}, 0x00001d42}, // hr__ CROATIAN.12
436
+ {{0x68,0x74,0x5f,0x5f}, 0x0f000542}, // ht__ FRENCH.12 SPANISH.1
437
+ {{0x68,0x75,0x5f,0x5f}, 0x00001842}, // hu__ HUNGARIAN.12
438
+ {{0x69,0x64,0x5f,0x5f}, 0x00002742}, // id__ INDONESIAN.12
439
+ {{0x69,0x65,0x5f,0x5f}, 0x050c1f24}, // ie__ IRISH.9 POLISH.5 FRENCH.1
440
+ {{0x69,0x6c,0x5f,0x5f}, 0x00000742}, // il__ HEBREW.12
441
+ {{0x69,0x6e,0x74,0x5f}, 0x0f060574}, // int_ FRENCH.9 GERMAN.8 SPANISH.3
442
+ {{0x69,0x6f,0x5f,0x5f}, 0x11090fd5}, // io__ SPANISH.11 Japanese.5 Chinese.3
443
+ {{0x69,0x71,0x5f,0x5f}, 0x60003744}, // iq__ ARABIC.12 KURDISH.3
444
+ {{0x69,0x72,0x5f,0x5f}, 0x00004e42}, // ir__ PERSIAN.12
445
+ {{0x69,0x73,0x5f,0x5f}, 0x00001442}, // is__ ICELANDIC.12
446
+ {{0x69,0x74,0x5f,0x5f}, 0x00000842}, // it__ ITALIAN.12
447
+ {{0x6a,0x65,0x5f,0x5f}, 0x29050328}, // je__ DUTCH.9 FRENCH.7 MALAY.5
448
+ {{0x6a,0x6d,0x5f,0x5f}, 0x040f0576}, // jm__ FRENCH.9 SPANISH.8 FINNISH.5
449
+ {{0x6a,0x6f,0x5f,0x5f}, 0x00003742}, // jo__ ARABIC.12
450
+ // {{0x6a,0x6f,0x62,0x73}, 0x0f060329}, // jobs DUTCH.9 GERMAN.8 SPANISH.6
451
+ {{0x6a,0x70,0x5f,0x5f}, 0x00000942}, // jp__ Japanese.12
452
+ {{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3}, // ke__ SWAHILI.9 SPANISH.5 FINNISH.4
453
+ {{0x6b,0x69,0x5f,0x5f}, 0x04000643}, // ki__ GERMAN.12 FINNISH.2
454
+ {{0x6b,0x6d,0x5f,0x5f}, 0x00000542}, // km__ FRENCH.12
455
+ {{0x6b,0x70,0x5f,0x5f}, 0x00000a42}, // kp__ Korean.12
456
+ {{0x6b,0x72,0x5f,0x5f}, 0x00000a42}, // kr__ Korean.12
457
+ {{0x6b,0x77,0x5f,0x5f}, 0x00003742}, // kw__ ARABIC.12
458
+ {{0x6b,0x79,0x5f,0x5f}, 0x0500083f}, // ky__ ITALIAN.11 FRENCH.9
459
+ {{0x6b,0x7a,0x5f,0x5f}, 0x0000732d}, // kz__ KAZAKH.10
460
+ {{0x6c,0x62,0x5f,0x5f}, 0x05003747}, // lb__ ARABIC.12 FRENCH.6
461
+ {{0x6c,0x63,0x5f,0x5f}, 0x09000645}, // lc__ GERMAN.12 Japanese.4
462
+ {{0x6c,0x69,0x5f,0x5f}, 0x1600063d}, // li__ GERMAN.11 LITHUANIAN.7
463
+ {{0x6c,0x73,0x5f,0x5f}, 0x00005742}, // ls__ SESOTHO.12
464
+ {{0x6c,0x74,0x5f,0x5f}, 0x00001642}, // lt__ LITHUANIAN.12
465
+ {{0x6c,0x75,0x5f,0x5f}, 0x0600053d}, // lu__ FRENCH.11 GERMAN.7
466
+ {{0x6c,0x76,0x5f,0x5f}, 0x00001542}, // lv__ LATVIAN.12
467
+ {{0x6c,0x79,0x5f,0x5f}, 0x05003744}, // ly__ ARABIC.12 FRENCH.3
468
+ {{0x6d,0x61,0x5f,0x5f}, 0x3700053d}, // ma__ FRENCH.11 ARABIC.7
469
+ {{0x6d,0x63,0x5f,0x5f}, 0x00000542}, // mc__ FRENCH.12
470
+ {{0x6d,0x64,0x5f,0x5f}, 0x00001724}, // md__ ROMANIAN.9
471
+ {{0x6d,0x65,0x5f,0x5f}, 0x00001d42}, // me__ CROATIAN.12
472
+ {{0x6d,0x67,0x5f,0x5f}, 0x00000542}, // mg__ FRENCH.12
473
+ {{0x6d,0x6b,0x5f,0x5f}, 0x1c002543}, // mk__ MACEDONIAN.12 BULGARIAN.2
474
+ {{0x6d,0x6c,0x5f,0x5f}, 0x00000542}, // ml__ FRENCH.12
475
+ {{0x6d,0x6e,0x5f,0x5f}, 0x00006142}, // mn__ MONGOLIAN.12
476
+ {{0x6d,0x6f,0x5f,0x5f}, 0x110d4631}, // mo__ ChineseT.10 PORTUGUESE.8 Chinese.5
477
+ {{0x6d,0x71,0x5f,0x5f}, 0x00000542}, // mq__ FRENCH.12
478
+ {{0x6d,0x72,0x5f,0x5f}, 0x37000535}, // mr__ FRENCH.10 ARABIC.9
479
+ {{0x6d,0x73,0x5f,0x5f}, 0x090f06d5}, // ms__ GERMAN.11 SPANISH.5 Japanese.3
480
+ {{0x6d,0x74,0x5f,0x5f}, 0x00004242}, // mt__ MALTESE.12
481
+ {{0x6d,0x75,0x5f,0x5f}, 0x05000934}, // mu__ Japanese.10 FRENCH.8
482
+ {{0x6d,0x76,0x5f,0x5f}, 0x28000436}, // mv__ FINNISH.10 LATIN.10
483
+ {{0x6d,0x77,0x5f,0x5f}, 0x0611092a}, // mw__ Japanese.9 Chinese.8 GERMAN.7
484
+ {{0x6d,0x78,0x5f,0x5f}, 0x00000f42}, // mx__ SPANISH.12
485
+ {{0x6d,0x79,0x5f,0x5f}, 0x00002942}, // my__ MALAY.12
486
+ {{0x6d,0x7a,0x5f,0x5f}, 0x00000d42}, // mz__ PORTUGUESE.12
487
+ {{0x6e,0x61,0x5f,0x5f}, 0x06006644}, // na__ AFRIKAANS.12 GERMAN.3
488
+ {{0x6e,0x63,0x5f,0x5f}, 0x00000542}, // nc__ FRENCH.12
489
+ {{0x6e,0x65,0x5f,0x5f}, 0x8b000542}, // ne__ FRENCH.12 HAUSA.1
490
+ {{0x6e,0x66,0x5f,0x5f}, 0x00000542}, // nf__ FRENCH.12
491
+ {{0x6e,0x69,0x5f,0x5f}, 0x00000f42}, // ni__ SPANISH.12
492
+ {{0x6e,0x6c,0x5f,0x5f}, 0x00000342}, // nl__ DUTCH.12
493
+ {{0x6e,0x6f,0x5f,0x5f}, 0x51000b43}, // no__ NORWEGIAN.12 NORWEGIAN_N.2
494
+ {{0x6e,0x75,0x5f,0x5f}, 0x0300103b}, // nu__ SWEDISH.11 DUTCH.5
495
+ {{0x6f,0x6d,0x5f,0x5f}, 0x00003742}, // om__ ARABIC.12
496
+ {{0x70,0x61,0x5f,0x5f}, 0x00000f42}, // pa__ SPANISH.12
497
+ {{0x70,0x65,0x5f,0x5f}, 0x00000f42}, // pe__ SPANISH.12
498
+ {{0x70,0x66,0x5f,0x5f}, 0x00000542}, // pf__ FRENCH.12
499
+ {{0x70,0x67,0x5f,0x5f}, 0x00000f24}, // pg__ SPANISH.9
500
+ {{0x70,0x68,0x5f,0x5f}, 0x00002142}, // ph__ TAGALOG.12
501
+ {{0x70,0x6b,0x5f,0x5f}, 0x00003342}, // pk__ URDU.12
502
+ {{0x70,0x6c,0x5f,0x5f}, 0x30000c42}, // pl__ POLISH.12 BELARUSIAN.1
503
+ {{0x70,0x6e,0x5f,0x5f}, 0x04000644}, // pn__ GERMAN.12 FINNISH.3
504
+ {{0x70,0x72,0x5f,0x5f}, 0x00000f42}, // pr__ SPANISH.12
505
+ {{0x70,0x72,0x6f,0x5f}, 0x46050fd5}, // pro_ SPANISH.11 FRENCH.5 ChineseT.3
506
+ {{0x70,0x73,0x5f,0x5f}, 0x00003742}, // ps__ ARABIC.12
507
+ {{0x70,0x74,0x5f,0x5f}, 0x00000d42}, // pt__ PORTUGUESE.12
508
+ {{0x70,0x79,0x5f,0x5f}, 0x00000f42}, // py__ SPANISH.12
509
+ {{0x71,0x61,0x5f,0x5f}, 0x00003742}, // qa__ ARABIC.12
510
+ {{0x72,0x65,0x5f,0x5f}, 0x00000542}, // re__ FRENCH.12
511
+ {{0x72,0x6f,0x5f,0x5f}, 0x00001742}, // ro__ ROMANIAN.12
512
+ {{0x72,0x73,0x5f,0x5f}, 0x00001d42}, // rs__ CROATIAN.12
513
+ {{0x72,0x77,0x5f,0x5f}, 0x9000053e}, // rw__ FRENCH.11 KINYARWANDA.8
514
+ {{0x73,0x61,0x5f,0x5f}, 0x00003742}, // sa__ ARABIC.12
515
+ {{0x73,0x62,0x5f,0x5f}, 0x00000442}, // sb__ FINNISH.12
516
+ {{0x73,0x63,0x5f,0x5f}, 0x060f092f}, // sc__ Japanese.10 SPANISH.7 GERMAN.3
517
+ {{0x73,0x64,0x5f,0x5f}, 0x00003742}, // sd__ ARABIC.12
518
+ {{0x73,0x65,0x5f,0x5f}, 0x00001042}, // se__ SWEDISH.12
519
+ {{0x73,0x69,0x5f,0x5f}, 0x00004042}, // si__ SLOVENIAN.12
520
+ {{0x73,0x6b,0x5f,0x5f}, 0x12004543}, // sk__ SLOVAK.12 CZECH.2
521
+ {{0x73,0x6d,0x5f,0x5f}, 0x00000842}, // sm__ ITALIAN.12
522
+ {{0x73,0x6e,0x5f,0x5f}, 0x00000542}, // sn__ FRENCH.12
523
+ {{0x73,0x72,0x5f,0x5f}, 0x03001e44}, // sr__ SERBIAN.12 DUTCH.3
524
+ {{0x73,0x76,0x5f,0x5f}, 0x00000f42}, // sv__ SPANISH.12
525
+ {{0x73,0x79,0x5f,0x5f}, 0x00003742}, // sy__ ARABIC.12
526
+ {{0x74,0x63,0x5f,0x5f}, 0x0a2206cd}, // tc__ GERMAN.10 TURKISH.6 Korean.5
527
+ {{0x74,0x66,0x5f,0x5f}, 0x00000642}, // tf__ GERMAN.12
528
+ {{0x74,0x67,0x5f,0x5f}, 0x00000542}, // tg__ FRENCH.12
529
+ {{0x74,0x68,0x5f,0x5f}, 0x9e0936c9}, // th__ THAI.10 Japanese.3 SCOTS.1
530
+ {{0x74,0x6a,0x5f,0x5f}, 0x00007924}, // tj__ TAJIK.9
531
+ {{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd}, // tl__ PORTUGUESE.10 SPANISH.6 GERMAN.5
532
+ {{0x74,0x6e,0x5f,0x5f}, 0x3700053e}, // tn__ FRENCH.11 ARABIC.8
533
+ {{0x74,0x6f,0x5f,0x5f}, 0x064609c5}, // to__ Japanese.9 ChineseT.7 GERMAN.6
534
+ {{0x74,0x70,0x5f,0x5f}, 0x06000944}, // tp__ Japanese.12 GERMAN.3
535
+ {{0x74,0x72,0x5f,0x5f}, 0x00002242}, // tr__ TURKISH.12
536
+ {{0x74,0x72,0x61,0x76}, 0x064509c3}, // trav Japanese.9 SLOVAK.5 GERMAN.4
537
+ {{0x74,0x74,0x5f,0x5f}, 0x0f00063e}, // tt__ GERMAN.11 SPANISH.8
538
+ {{0x74,0x77,0x5f,0x5f}, 0x00004642}, // tw__ ChineseT.12
539
+ {{0x74,0x7a,0x5f,0x5f}, 0x00003f42}, // tz__ SWAHILI.12
540
+ {{0x75,0x61,0x5f,0x5f}, 0x0000232d}, // ua__ UKRAINIAN.10
541
+ {{0x75,0x79,0x5f,0x5f}, 0x00000f42}, // uy__ SPANISH.12
542
+ {{0x75,0x7a,0x5f,0x5f}, 0x0000492d}, // uz__ UZBEK.10
543
+ {{0x76,0x61,0x5f,0x5f}, 0x060f0828}, // va__ ITALIAN.9 SPANISH.7 GERMAN.5
544
+ {{0x76,0x63,0x5f,0x5f}, 0x0d000939}, // vc__ Japanese.11 PORTUGUESE.3
545
+ {{0x76,0x65,0x5f,0x5f}, 0x00000f42}, // ve__ SPANISH.12
546
+ {{0x76,0x67,0x5f,0x5f}, 0x09000f43}, // vg__ SPANISH.12 Japanese.2
547
+ {{0x76,0x69,0x5f,0x5f}, 0x00002942}, // vi__ MALAY.12
548
+ {{0x76,0x6e,0x5f,0x5f}, 0x00004342}, // vn__ VIETNAMESE.12
549
+ {{0x76,0x75,0x5f,0x5f}, 0x00000642}, // vu__ GERMAN.12
550
+ {{0x77,0x73,0x5f,0x5f}, 0x4b0f0624}, // ws__ GERMAN.9 SPANISH.5 AZERBAIJANI.1
551
+ {{0x79,0x65,0x5f,0x5f}, 0x00003742}, // ye__ ARABIC.12
552
+ {{0x79,0x75,0x5f,0x5f}, 0x1e001d3d}, // yu__ CROATIAN.11 SERBIAN.7
553
+ {{0x7a,0x61,0x5f,0x5f}, 0x00006642}, // za__ AFRIKAANS.12
554
+ {{0x7a,0x6d,0x5f,0x5f}, 0x0b000435}, // zm__ FINNISH.10 NORWEGIAN.9
555
+ {{0x7a,0x77,0x5f,0x5f}, 0x3f00783e}, // zw__ SHONA.11 SWAHILI.8
556
+ };
557
+
558
+
559
+ // Statistically closest language, based on quadgram table
560
+ // Those that are far from other languges map to UNKNOWN_LANGUAGE
561
+ // Subscripted by Language
562
+ //
563
+ // From lang_correlation.txt and hand-edits
564
+ // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
565
+ // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
566
+ // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
567
+ //
568
+ static const int kMinCorrPercent = 24; // Pick off how close you want
569
+ // 24 catches PERSIAN <== ARABIC
570
+ // but not SPANISH <== PORTUGESE
571
+ static Language Unknown = UNKNOWN_LANGUAGE;
572
+
573
// Subscripted by Language
//
// One entry per Language value, in enum order (the trailing comment on each
// row names the language the row belongs to). Each entry is the closest
// alternative language when the row's correlation percent (the leading
// literal) is at least kMinCorrPercent, and UNKNOWN_LANGUAGE otherwise.
// Rows with a 0 percent use the Unknown placeholder because there is no
// candidate language to name.
static const Language kClosestAltLanguage[] = {
  (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
  (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
  (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
  (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
  (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
  (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
  (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
  (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
  (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
  ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
  (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
  (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
  (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
  (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
  (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
  (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
  ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
  ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
  ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
  (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
  (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
  (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
  (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
  ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
  (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
  (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
  (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
  (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
  (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
  (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
  ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
  (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
  ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
  ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
  ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
  (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
  (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
  (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
  (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
  (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
  (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
  (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
  ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
  ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
  (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
  ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
  (28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
  (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
  ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
  ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
  (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
  (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
  // Hand-edited row: the generated entry for ChineseT is kept below for
  // reference; the active row maps ChineseT to CHINESE instead.
  // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
  (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
  (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
  (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
  (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
  ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
  (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
  ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
  (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
  (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
  (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
  (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
  ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
  (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
  ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
  ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
  ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
  ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
  (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
  (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
  (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
  ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
  (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
  ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
  ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
  (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
  (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
  ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
  (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
  ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
  (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
  (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
  ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
  ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
  ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
  (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
  (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
  ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
  ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
  (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
  (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
  (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
  ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
  ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
  ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
  ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
  ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
  ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
  (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
  ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
  (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
  ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
  ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
  ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
  ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
  ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
  ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
  (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
  ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
  (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
  (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
  (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
  ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
  (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
  (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
  ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
  (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
  ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
  ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
  (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
  (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
  ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN
};

// Compile-time check that the table has exactly one row per Language value,
// so a Language below NUM_LANGUAGES is always a valid subscript.
COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
               kClosestAltLanguage_has_incorrect_size);
741
+
742
+
743
+ inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
744
+ inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
745
+ inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
746
+ inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
747
+ inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
748
+ inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
749
+ inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
750
+
751
+
752
+
753
+
754
//------------------------------------------------------------------------------
// For --cld_html debugging output. Not thread safe
//------------------------------------------------------------------------------
// File-scope mutable state shared by every caller in this translation unit,
// which is why the debugging output is not thread safe.
// NOTE(review): presumably these remember the previously reported language
// and its reliability between debug prints -- their uses are outside this
// section; confirm against the code that reads and writes them.
static Language prior_lang = UNKNOWN_LANGUAGE;
static bool prior_unreliable = false;

//------------------------------------------------------------------------------
// End For --cld_html debugging output
//------------------------------------------------------------------------------
763
+
764
+
765
+ // Backscan to word boundary, returning how many bytes n to go back
766
+ // so that src - n is non-space ans src - n - 1 is space.
767
+ // If not found in kMaxSpaceScan bytes, return 0
768
+ int BackscanToSpace(const char* src, int limit) {
769
+ int n = 0;
770
+ limit = cld::minint(limit, kMaxSpaceScan);
771
+ while (n < limit) {
772
+ if (src[-n - 1] == ' ') {return n;} // We are at _X
773
+ ++n;
774
+ }
775
+ return 0;
776
+ }
777
+
778
+ // Forwardscan to word boundary, returning how many bytes n to go forward
779
+ // so that src + n is non-space ans src + n - 1 is space.
780
+ // If not found in kMaxSpaceScan bytes, return 0
781
+ int ForwardscanToSpace(const char* src, int limit) {
782
+ int n = 0;
783
+ limit = cld::minint(limit, kMaxSpaceScan);
784
+ while (n < limit) {
785
+ if (src[n] == ' ') {return n + 1;} // We are at _X
786
+ ++n;
787
+ }
788
+ return 0;
789
+ }
790
+
791
+
792
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
// all the time when done with a byte-based count. Sigh.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
// The incoming hash is masked to 12 bits before use, so a stray value can
// never index outside the table.
//
// isrc/srclen: text to scan. hash: in/out running hash. tbl: in/out table,
// indexed by hash, holding the character last seen after that hash.
//
// Returns the number of *bytes* correctly predicted: each correctly-predicted
// character adds its UTF-8 length (1..4), so the result is directly comparable
// to byte-based thresholds.
//
// NOTE: Overruns by up to three bytes when the last lead byte at the end of
// the range announces more bytes than remain. Not a problem with valid UTF-8
// text that is followed by padding.
//
int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
  int p_count = 0;
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + srclen;
  int local_hash = *hash & 0xfff;

  while (src < srclimit) {
    // Pick up one char and length. The bytes are packed in unsigned
    // arithmetic so a four-byte lead (>= 0xf0) never shifts into the sign bit.
    unsigned int packed = src[0];
    int incr = 1;

    if (packed < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((packed & 0xe0) == 0xc0) {
      // Two-byte
      packed = (packed << 8) | src[1];
      incr = 2;
    } else if ((packed & 0xf0) == 0xe0) {
      // Three-byte
      packed = (packed << 16) | (src[1] << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      packed = (packed << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
      incr = 4;
    }
    src += incr;

    const int c = static_cast<int>(packed);
    const int p = tbl[local_hash];      // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      p_count += incr;                  // Count bytes of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }

  *hash = local_hash;
  return p_count;
}
844
+
845
+
846
+
847
// Counts the ASCII spaces in src, four bytes per loop iteration (a little
// faster than one-at-a-time).
// Only whole groups of four are examined: the trailing src_len % 4 bytes are
// not counted.
int CountSpaces4(const char* src, int src_len) {
  const int quad_bytes = src_len & ~3;  // Length rounded down to multiple of 4
  int spaces = 0;
  for (int base = 0; base < quad_bytes; base += 4) {
    const char* quad = src + base;
    spaces += (quad[0] == ' ') + (quad[1] == ' ') +
              (quad[2] == ' ') + (quad[3] == ' ');
  }
  return spaces;
}
859
+
860
+ // Remove words of text that have more than half their letters predicted
861
+ // correctly by our cheap predictor, moving the remaining words in-place
862
+ // to the front of the input buffer.
863
+ //
864
+ // To allow running prediction across multiple chunks, caller passes in current
865
+ // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
866
+ //
867
+ // Return the new, possibly-shorter length
868
+ //
869
+ // Result Buffer ALWAYS has leading space and trailing space space space NUL,
870
+ // if input does
871
+ //
872
+ int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) {
873
+ const uint8* src = reinterpret_cast<const uint8*>(isrc);
874
+ const uint8* srclimit = src + srclen;
875
+ char* dst = isrc;
876
+ int local_hash = *hash;
877
+ char* word_dst = dst; // Start of next word
878
+ int good_predict_bytes = 0;
879
+ int word_length_bytes = 0;
880
+
881
+ while (src < srclimit) {
882
+ int c = src[0];
883
+ int incr = 1;
884
+ *dst++ = c;
885
+
886
+ if (c == ' ') {
887
+ if ((good_predict_bytes * 2) > word_length_bytes) {
888
+ // Word is well-predicted: backup to start of this word
889
+ dst = word_dst;
890
+ if (FLAGS_cld_showme) {
891
+ // Mark the deletion point with period
892
+ // Don't repeat multiple periods
893
+ // Cannot mark with more bytes or may overwrite unseen input
894
+ if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
895
+ *dst++ = '.';
896
+ *dst++ = ' ';
897
+ }
898
+ }
899
+ }
900
+ word_dst = dst; // Start of next word
901
+ good_predict_bytes = 0;
902
+ word_length_bytes = 0;
903
+ }
904
+
905
+ // Pick up one char and length
906
+ if (c < 0xc0) {
907
+ // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
908
+ // Do nothing more
909
+ } else if ((c & 0xe0) == 0xc0) {
910
+ // Two-byte
911
+ *dst++ = src[1];
912
+ c = (c << 8) | src[1];
913
+ incr = 2;
914
+ } else if ((c & 0xf0) == 0xe0) {
915
+ // Three-byte
916
+ *dst++ = src[1];
917
+ *dst++ = src[2];
918
+ c = (c << 16) | (src[1] << 8) | src[2];
919
+ incr = 3;
920
+ } else {
921
+ // Four-byte
922
+ *dst++ = src[1];
923
+ *dst++ = src[2];
924
+ *dst++ = src[3];
925
+ c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
926
+ incr = 4;
927
+ }
928
+ src += incr;
929
+ word_length_bytes += incr;
930
+
931
+ int p = tbl[local_hash]; // Prediction
932
+ tbl[local_hash] = c; // Update prediction
933
+ if (c == p) {
934
+ good_predict_bytes += incr; // Count good predictions
935
+ }
936
+
937
+ local_hash = ((local_hash << 4) ^ c) & 0xfff;
938
+ }
939
+
940
+ *hash = local_hash;
941
+
942
+ if ((dst - isrc) < (srclen - 3)) {
943
+ // Pad and make last char clean UTF-8 by putting following spaces
944
+ dst[0] = ' ';
945
+ dst[1] = ' ';
946
+ dst[2] = ' ';
947
+ dst[3] = '\0';
948
+ } else if ((dst - isrc) < srclen) {
949
+ // Make last char clean UTF-8 by putting following space off the end
950
+ dst[0] = ' ';
951
+ }
952
+
953
+ return static_cast<int>(dst - isrc);
954
+ }
955
+
956
+
957
+ // Remove portions of text that have a high density of spaces, or that are
958
+ // overly repetitive, squeezing the remaining text in-place to the front of the
959
+ // input buffer.
960
+ //
961
+ // Squeezing looks at density of space/prediced chars in fixed-size chunks,
962
+ // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
963
+ //
964
+ // Return the new, possibly-shorter length
965
+ //
966
+ // Result Buffer ALWAYS has leading space and trailing space space space NUL,
967
+ // if input does
968
+ //
969
+ int CompactLangDetImpl::CheapSqueezeInplace(char* isrc,
970
+ int srclen,
971
+ int ichunksize) {
972
+ char* src = isrc;
973
+ char* dst = src;
974
+ char* srclimit = src + srclen;
975
+ bool skipping = false;
976
+
977
+ int hash = 0;
978
+ // Allocate local prediction table.
979
+ int* predict_tbl = new int[kPredictionTableSize];
980
+ memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
981
+
982
+ int chunksize = ichunksize;
983
+ if (chunksize == 0) {chunksize = kChunksizeDefault;}
984
+ int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
985
+ int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
986
+
987
+ while (src < srclimit) {
988
+ int remaining_bytes = srclimit - src;
989
+ int len = cld::minint(chunksize, remaining_bytes);
990
+ int space_n = CountSpaces4(src, len);
991
+ int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
992
+ if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
993
+ // Skip the text
994
+ if (!skipping) {
995
+ // Keeping-to-skipping transition; do it at a space
996
+ int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
997
+ dst -= n;
998
+ skipping = true;
999
+ if (FLAGS_cld_showme) {
1000
+ // Mark the deletion point with black square U+25A0
1001
+ *dst++ = 0xe2;
1002
+ *dst++ = 0x96;
1003
+ *dst++ = 0xa0;
1004
+ *dst++ = ' ';
1005
+ }
1006
+ if (dst == isrc) {
1007
+ // Force a leading space if the first chunk is deleted
1008
+ *dst++ = ' ';
1009
+ }
1010
+ }
1011
+ } else {
1012
+ // Keep the text
1013
+ if (skipping) {
1014
+ // Skipping-to-keeping transition; do it at a space
1015
+ int n = ForwardscanToSpace(src, len);
1016
+ src += n;
1017
+ remaining_bytes -= n; // Shrink remaining length
1018
+ len -= n;
1019
+ skipping = false;
1020
+ }
1021
+ // "len" can be negative in some cases
1022
+ if (len > 0) {
1023
+ memcpy(dst, src, len);
1024
+ dst += len;
1025
+ }
1026
+ }
1027
+ src += len;
1028
+ }
1029
+
1030
+ if ((dst - isrc) < (srclen - 3)) {
1031
+ // Pad and make last char clean UTF-8 by putting following spaces
1032
+ dst[0] = ' ';
1033
+ dst[1] = ' ';
1034
+ dst[2] = ' ';
1035
+ dst[3] = '\0';
1036
+ } else if ((dst - isrc) < srclen) {
1037
+ // Make last char clean UTF-8 by putting following space off the end
1038
+ dst[0] = ' ';
1039
+ }
1040
+
1041
+ // Deallocate local prediction table
1042
+ delete[] predict_tbl;
1043
+ return static_cast<int>(dst - isrc);
1044
+ }
1045
+
1046
+ // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
1047
+ // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
1048
+ // Just CountSpaces is about 340 MB/sec
1049
+ // Byte-only CountPredictedBytes is about 150 MB/sec
1050
+ // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
1051
+ // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
1052
+ // Unjammed byte-only both = 170 MB/sec
1053
+ // Jammed byte-only both = 120 MB/sec
1054
+ // Back to original w/slight updates, 110 MB/sec
1055
+ //
1056
+ bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) {
1057
+ // Don't trigger at all on short text
1058
+ if (srclen < testsize) {return false;}
1059
+ int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
1060
+ int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
1061
+ int hash = 0;
1062
+ // Allocate local prediction table.
1063
+ int* predict_tbl = new int[kPredictionTableSize];
1064
+ memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1065
+
1066
+ bool retval = false;
1067
+ if ((CountSpaces4(src, testsize) >= space_thresh) ||
1068
+ (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
1069
+ predict_thresh)) {
1070
+ retval = true;
1071
+ }
1072
+ // Deallocate local prediction table
1073
+ delete[] predict_tbl;
1074
+ return retval;
1075
+ }
1076
+
1077
+
1078
+
1079
// Close pairs (correlation) language_enum/language_enum
// id/ms (0.47) 38/40 [1]
// bo/dz (0.46) 105/135 [2]
// cz/sk (0.43) 17/68 [3]
// no/nn (0.42) 10/80 [4]
// hi/mr (0.38) 35/64 [5]
// xh/zu (0.37) 83/84 [6]
// Subscripted by packed language, gives 0 or a subscript in closepair
// scoring array inside doc_tote
//
// The bracketed number above is the value stored for both members of a pair;
// every other language stores 0. The nonzero entries sit one slot after the
// language_enum values listed above (e.g. 38/40 appear at subscripts 39/41),
// the first element being an extra leading 0.
static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = {
  0,
  0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // Add new language close-pair number just before here (just use 0)
};
1097
+
1098
+
1099
+ // Delete any extended languages from doc_tote
1100
+ void RemoveExtendedLanguages(ToteWithReliability* doc_tote) {
1101
+ for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1102
+ if (cld::UnpackLanguage(doc_tote->Key(sub)) >= NUM_LANGUAGES) {
1103
+ // Effectively remove the extended language by setting key&score to zero
1104
+ if (FLAGS_dbgscore) {
1105
+ fprintf(stderr, "{-%s} ",
1106
+ ExtLanguageCode(cld::UnpackLanguage(doc_tote->Key(sub))));
1107
+ }
1108
+
1109
+ // Delete entry
1110
+ doc_tote->SetKey(sub, 0);
1111
+ doc_tote->SetValue(sub, 0);
1112
+ doc_tote->SetReliability(sub, 0);
1113
+ }
1114
+ }
1115
+ }
1116
+
1117
// Threshold on a language's reliable percent (stored reliability score
// divided by stored byte count): entries below it are merged or removed.
static const int kMinReliableKeepPercent = 41;  // Remove lang if reli < this

// For Tier3 languages, require a minimum number of bytes to be first-place lang
// (Tier3 here means languages not flagged in cld::kIsPackedTop40.)
static const int kGoodFirstT3MinBytes = 24;  // <this => no first
1121
+
1122
+ // Move bytes for unreliable langs to another lang or UNKNOWN
1123
+ // doc_tote is sorted, so cannot Add
1124
+ //
1125
+ // If both CHINESE and CHINESET are present and unreliable, do not delete both;
1126
+ // merge both into CHINESE.
1127
+ //
1128
+ //dsites 2009.03.19
1129
+ // we also want to remove Tier3 languages as the first lang if there is very
1130
+ // little text like ej1 ej2 ej3 ej4
1131
+ // maybe fold this back in earlier
1132
+ //
1133
+ void RemoveUnreliableLanguages(ToteWithReliability* doc_tote, bool do_remove_weak_matches) {
1134
+ // Prepass to merge some low-reliablility languages
1135
+ int total_bytes = 0;
1136
+ for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1137
+ int plang = doc_tote->Key(sub);
1138
+ if (plang == 0) {continue;} // Empty slot
1139
+
1140
+ Language lang = cld::UnpackLanguage(plang);
1141
+ int bytes = doc_tote->Value(sub);
1142
+ int reli = doc_tote->Reliability(sub);
1143
+ if (bytes == 0) {continue;} // Zero bytes
1144
+ total_bytes += bytes;
1145
+
1146
+ // Reliable percent is stored reliable score over stored bytecount
1147
+ int reliable_percent = reli / bytes;
1148
+ if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
1149
+
1150
+ // This language is too unreliable to keep, but we might merge it.
1151
+ Language altlang = UNKNOWN_LANGUAGE;
1152
+ if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];}
1153
+ if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative
1154
+
1155
+ // Look for alternative in doc_tote
1156
+ int altsub = doc_tote->Find(cld::PackLanguage(altlang));
1157
+ if (altsub < 0) {continue;} // No alternative text
1158
+
1159
+ int bytes2 = doc_tote->Value(altsub);
1160
+ int reli2 = doc_tote->Reliability(altsub);
1161
+ if (bytes2 == 0) {continue;} // Zero bytes
1162
+
1163
+ // Reliable percent is stored reliable score over stored bytecount
1164
+ int reliable_percent2 = reli2 / bytes2;
1165
+
1166
+ // Merge one language into the other. Break ties toward lower lang #
1167
+ int tosub = altsub;
1168
+ int fromsub = sub;
1169
+ bool into_lang = false;
1170
+ if ((reliable_percent2 < reliable_percent) ||
1171
+ ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
1172
+ tosub = sub;
1173
+ fromsub = altsub;
1174
+ into_lang = true;
1175
+ }
1176
+
1177
+ // Make sure reliability doesn't drop and is enough to avoid delete
1178
+ int newpercent = cld::maxint(reliable_percent, reliable_percent2);
1179
+ newpercent = cld::maxint(newpercent, kMinReliableKeepPercent);
1180
+ int newbytes = bytes + bytes2;
1181
+ int newreli = newpercent * newbytes;
1182
+
1183
+ doc_tote->SetKey(fromsub, 0);
1184
+ doc_tote->SetValue(fromsub, 0);
1185
+ doc_tote->SetReliability(fromsub, 0);
1186
+ doc_tote->SetValue(tosub, newbytes);
1187
+ doc_tote->SetReliability(tosub, newreli);
1188
+
1189
+ // Show fate of unreliable languages if at least 10 bytes
1190
+ if (FLAGS_cld_html /*&& (newpercent >= 10)*/ && (newbytes >= 10)) {
1191
+ if (into_lang) {
1192
+ fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
1193
+ ExtLanguageCode(altlang), reliable_percent2, bytes2,
1194
+ ExtLanguageCode(lang));
1195
+ } else {
1196
+ fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
1197
+ ExtLanguageCode(lang), reliable_percent, bytes,
1198
+ ExtLanguageCode(altlang));
1199
+ }
1200
+ }
1201
+ }
1202
+
1203
+
1204
+ if (do_remove_weak_matches) {
1205
+ // Pass to delete any remaining unreliable languages
1206
+ for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1207
+ int plang = doc_tote->Key(sub);
1208
+ if (plang == 0) {continue;} // Empty slot
1209
+
1210
+ Language lang = cld::UnpackLanguage(plang);
1211
+ int bytes = doc_tote->Value(sub);
1212
+ int reli = doc_tote->Reliability(sub);
1213
+ if (bytes == 0) {continue;} // Zero bytes
1214
+
1215
+ bool is_tier3 = (cld::kIsPackedTop40[plang] == 0);
1216
+ if (is_tier3 &&
1217
+ (bytes < kGoodFirstT3MinBytes) &&
1218
+ (bytes < total_bytes)) {
1219
+ reli = 0; // Too-short tier3
1220
+ }
1221
+
1222
+ // Reliable percent is stored as reliable score over stored bytecount
1223
+ int reliable_percent = reli / bytes;
1224
+ if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
1225
+
1226
+ // Delete unreliable entry
1227
+ doc_tote->SetKey(sub, 0);
1228
+ doc_tote->SetValue(sub, 0);
1229
+ doc_tote->SetReliability(sub, 0);
1230
+
1231
+ // Show fate of unreliable languages if at least 10 bytes
1232
+ if (FLAGS_cld_html /*&& (reliable_percent >= 10)*/ && (bytes >= 10)) {
1233
+ fprintf(stderr, "{Unreli %s.%d(%dB)} ",
1234
+ ExtLanguageCode(lang), reliable_percent, bytes);
1235
+ }
1236
+ }
1237
+ }
1238
+
1239
+ if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");}
1240
+ }
1241
+
1242
+
1243
+ // Move less likely byte count to more likely for close pairs of languages
1244
+ void RefineScoredClosePairs(ToteWithReliability* doc_tote) {
1245
+ for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1246
+ int close_packedlang = doc_tote->Key(sub);
1247
+ int subscr = kClosePair[close_packedlang];
1248
+ if (subscr == 0) {continue;}
1249
+
1250
+ // We have a close pair language -- if the other one is also scored and the
1251
+ // longword score differs enough, put all our eggs into one basket
1252
+
1253
+ // Nonzero longword score: Go look for the other of this pair
1254
+ for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1255
+ if (kClosePair[doc_tote->Key(sub2)] == subscr) {
1256
+ // We have a matching pair
1257
+ int close_packedlang2 = doc_tote->Key(sub2);
1258
+
1259
+ // Move all the text bytes from lower byte-count to higher one
1260
+ int from_sub, to_sub;
1261
+ Language from_lang, to_lang;
1262
+ if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1263
+ from_sub = sub;
1264
+ to_sub = sub2;
1265
+ from_lang = cld::UnpackLanguage(close_packedlang);
1266
+ to_lang = cld::UnpackLanguage(close_packedlang2);
1267
+ } else {
1268
+ from_sub = sub2;
1269
+ to_sub = sub;
1270
+ from_lang = cld::UnpackLanguage(close_packedlang2);
1271
+ to_lang = cld::UnpackLanguage(close_packedlang);
1272
+ }
1273
+
1274
+ // Move all the bytes smaller => larger of the pair
1275
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
1276
+ // Show fate of closepair language
1277
+ int val = doc_tote->Value(from_sub);
1278
+ int reli = doc_tote->Reliability(from_sub);
1279
+ int reliable_percent = reli / (val ? val : 1); // avoid zdiv
1280
+ fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ",
1281
+ ExtLanguageCode(from_lang),
1282
+ reliable_percent,
1283
+ doc_tote->Value(from_sub),
1284
+ ExtLanguageCode(to_lang));
1285
+ }
1286
+ int sum = doc_tote->Value(to_sub) + doc_tote->Value(from_sub);
1287
+ doc_tote->SetValue(to_sub, sum);
1288
+ doc_tote->SetReliability(to_sub, 100 * sum);
1289
+
1290
+ // Delete old entry
1291
+ doc_tote->SetKey(from_sub, 0);
1292
+ doc_tote->SetValue(from_sub, 0);
1293
+ doc_tote->SetReliability(from_sub, 0);
1294
+
1295
+ break; // Exit inner for sub2 loop
1296
+ }
1297
+ } // End for sub2
1298
+ } // End for sub
1299
+ }
1300
+
1301
+
1302
+ void ApplyLanguageHints(Tote* chunk_tote, int tote_grams,
1303
+ uint8* lang_hint_boost) {
1304
+ // Need 8 quad/unigrams to give full hint boost, else derate linearly
1305
+ if (tote_grams > 8) {
1306
+ tote_grams = 8;
1307
+ }
1308
+ for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
1309
+ // Hint boosts are per packed subscript
1310
+ int lang_sub = chunk_tote->Key(sub);
1311
+ int new_value = chunk_tote->Value(sub) +
1312
+ ((lang_hint_boost[lang_sub] * tote_grams) >> 3);
1313
+ chunk_tote->SetValue(sub, new_value);
1314
+ if (FLAGS_dbgscore && (lang_hint_boost[lang_sub] > 0)) {
1315
+ fprintf(stderr, "[%s+=%d*%d/8] ",
1316
+ ExtLanguageCode(cld::UnpackLanguage(lang_sub)),
1317
+ lang_hint_boost[lang_sub], tote_grams);
1318
+ }
1319
+ }
1320
+ }
1321
+
1322
+
1323
// Write the first len bytes of txt to f with the five HTML-special characters
// (< > & ' ") replaced by their entity names, followed by "<br>" and a newline.
// All other bytes are written through unchanged.
void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
  for (int pos = 0; pos < len; ++pos) {
    const char ch = txt[pos];
    switch (ch) {
      case '<':
        fputs("&lt;", f);
        break;
      case '>':
        fputs("&gt;", f);
        break;
      case '&':
        fputs("&amp;", f);
        break;
      case '\'':
        fputs("&apos;", f);
        break;
      case '"':
        fputs("&quot;", f);
        break;
      default:
        fputc(ch, f);
        break;
    }
  }
  fputs("<br>\n", f);
}
1342
+
1343
+
1344
+ // Add one chunk's score to running document score
1345
+ // If the top language is UNKNOWN_LANGUAGE, score nothing. This is used to
1346
+ // positively identify text to be ignored, such as link farms.
1347
+ // Sort before scoring and reinit afterward
1348
+ //
1349
+ // src and srclen are just for debug output
1350
+ void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by,
1351
+ UnicodeLScript lscript,
1352
+ Tote* chunk_tote,
1353
+ ToteWithReliability* doc_tote,
1354
+ int tote_grams,
1355
+ uint8* lang_hint_boost) {
1356
+ // Apply hints before sorting
1357
+ if (lang_hint_boost) {
1358
+ ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost);
1359
+ }
1360
+
1361
+ // Sort to get top two languages
1362
+ chunk_tote->Sort(2);
1363
+ Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0));
1364
+
1365
+ // Return if empty
1366
+ if (cur_lang < 0) {
1367
+ chunk_tote->Reinit();
1368
+ return;
1369
+ }
1370
+
1371
+ bool cur_unreliable = false;
1372
+
1373
+ // Reliability is a function of mean script score per KB of text
1374
+ int len = chunk_tote->GetByteCount();
1375
+ int reliability = cld::GetReliability((len * 2) / advance_by,
1376
+ lscript,
1377
+ chunk_tote);
1378
+ cur_unreliable = (reliability < cld::kMinReliable);
1379
+
1380
+ // If tote_grams=0, always reliable
1381
+ // If tote_grams=1, always unreliable
1382
+ if (tote_grams == 0) {
1383
+ reliability = 100;
1384
+ cur_unreliable = false;
1385
+ } else if (tote_grams == 1) {
1386
+ reliability = 0;
1387
+ cur_unreliable = true;
1388
+ }
1389
+
1390
+ #if 0
1391
+ // TEMP
1392
+ if (FLAGS_cld_html) {
1393
+ if (reliability >= kMinReliableKeepPercent) {
1394
+ fprintf(stderr, "R%d%% ", reliability);
1395
+ } else {
1396
+ fprintf(stderr, "--R%d%% ", reliability);
1397
+ }
1398
+ }
1399
+ #endif
1400
+
1401
+ // Track the sequence of language fragments [result currently unused]
1402
+ ////if (reliability >= kMinReliableSeq) {
1403
+ //// doc_tote->AddSeq(chunk_tote->Key(0));
1404
+ ////}
1405
+
1406
+ if (cur_unreliable && (chunk_tote->Key(1) != 0)) {
1407
+ // Unreliable and two top contenders, split byte count 5/8 - 3/8
1408
+ int top_len = ((len * 5) + 4) >> 3;
1409
+ int second_len = len - top_len;
1410
+
1411
+ doc_tote->Add(chunk_tote->Key(0),
1412
+ top_len, chunk_tote->Value(0), reliability);
1413
+ doc_tote->Add(chunk_tote->Key(1),
1414
+ second_len, chunk_tote->Value(1), reliability);
1415
+ if (FLAGS_dbgscore) {
1416
+ fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ",
1417
+ ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
1418
+ chunk_tote->Value(0),
1419
+ reliability,
1420
+ top_len,
1421
+ ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))),
1422
+ chunk_tote->Value(1),
1423
+ reliability,
1424
+ second_len);
1425
+ }
1426
+ } else {
1427
+ // Reliable or single contender
1428
+ doc_tote->Add(chunk_tote->Key(0),
1429
+ len, chunk_tote->Value(0), reliability);
1430
+ if (FLAGS_dbgscore) {
1431
+ fprintf(stderr, "{+%s.%d.%dR(%dB)} ",
1432
+ ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
1433
+ chunk_tote->Value(0),
1434
+ reliability,
1435
+ len);
1436
+ }
1437
+ }
1438
+
1439
+ if (FLAGS_cld_html) {
1440
+ if (cur_lang < 0) {cur_lang = UNKNOWN_LANGUAGE;}
1441
+ cld::PrintLang(stderr, chunk_tote,
1442
+ cur_lang, cur_unreliable,
1443
+ prior_lang, prior_unreliable);
1444
+ prior_lang = cur_lang;
1445
+ prior_unreliable = cur_unreliable;
1446
+
1447
+ string temp(src, srclen);
1448
+ if (temp[0] == '=') {
1449
+ // Rewrite =ScriptX= or =SwitchX= as =Xxxx= for script code Xxxx
1450
+ temp = "=Buffered_";
1451
+ temp.append(UnicodeLScriptCode(lscript));
1452
+ temp.append("=");
1453
+ }
1454
+ cld::PrintText(stderr, cur_lang, temp);
1455
+ }
1456
+
1457
+ chunk_tote->Reinit();
1458
+ }
1459
+
1460
+
1461
+ void PrintTopLang(Language top_lang) {
1462
+ if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1463
+ fprintf(stderr, "[] ");
1464
+ } else {
1465
+ fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
1466
+ prior_lang = top_lang;
1467
+ }
1468
+ }
1469
+
1470
+ void PrintTopLangSpeculative(Language top_lang) {
1471
+ fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1472
+ if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1473
+ fprintf(stderr, "[] ");
1474
+ } else {
1475
+ fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
1476
+ prior_lang = top_lang;
1477
+ }
1478
+ fprintf(stderr, "</span>\n");
1479
+ }
1480
+
1481
+
1482
+ // Add one chunk's score to running document score
1483
+ // Convenience function with constant src text
1484
+ void ScoreChunkIntoDoc2(const char* src, int advance_by,
1485
+ UnicodeLScript lscript,
1486
+ Tote* chunk_tote,
1487
+ ToteWithReliability* doc_tote,
1488
+ int tote_grams,
1489
+ uint8* lang_hint_boost) {
1490
+ int srclen = static_cast<int>(strlen(src));
1491
+ ScoreChunkIntoDoc(src, srclen, advance_by, lscript, chunk_tote,
1492
+ doc_tote, tote_grams, lang_hint_boost);
1493
+ }
1494
+
1495
+
1496
+ // Score one scriptspan using the only language for that script
1497
+ void ScoreNilgrams(getone::LangSpan* scriptspan, int lang,
1498
+ ToteWithReliability* doc_tote,
1499
+ uint8* lang_hint_boost,
1500
+ int flags, Language plus_one) {
1501
+ // For debugging only. Not thread-safe
1502
+ prior_lang = UNKNOWN_LANGUAGE;
1503
+ prior_unreliable = false;
1504
+
1505
+ const char* src = scriptspan->text;
1506
+ int len = scriptspan->text_bytes;
1507
+
1508
+ Tote chunk_tote;
1509
+ // Score 1000 for 1000 bytes
1510
+ chunk_tote.AddGram();
1511
+ chunk_tote.Add(lang, scriptspan->text_bytes);
1512
+ chunk_tote.AddBytes(scriptspan->text_bytes);
1513
+ int advance_by = 2;
1514
+ int tote_grams = 0; // Indicates fully reliable
1515
+ ScoreChunkIntoDoc(src, len, advance_by,
1516
+ scriptspan->script, &chunk_tote,
1517
+ doc_tote, tote_grams, lang_hint_boost);
1518
+ }
1519
+
1520
+ // Score one scriptspan using unigrams
1521
+ // Updates tote_grams
1522
+ static void ScoreUnigrams(const UTF8PropObj* unigram_obj,
1523
+ getone::LangSpan* scriptspan,
1524
+ int* tote_grams, int gram_limit,
1525
+ Tote* chunk_tote,
1526
+ ToteWithReliability* doc_tote,
1527
+ uint8* lang_hint_boost,
1528
+ int advance_by, int flags,
1529
+ int* initial_word_span, Language plus_one) {
1530
+ // chunk_tote may have partial sum coming in
1531
+ const char* src = scriptspan->text;
1532
+ const char* srclimit = src + scriptspan->text_bytes;
1533
+
1534
+ // For debugging only. Not thread-safe
1535
+ prior_lang = UNKNOWN_LANGUAGE;
1536
+ prior_unreliable = false;
1537
+
1538
+ // Break text up into multiple chunks and score each
1539
+ while (src < srclimit) {
1540
+ // Updates tote_grams
1541
+ int len = cld::DoUniScoreV3(unigram_obj,
1542
+ src, srclimit - src, advance_by,
1543
+ tote_grams, gram_limit, chunk_tote);
1544
+ if (FlagUseWords(flags) || (*initial_word_span > 0)) {
1545
+ // Use bigram scoring in addition to quadgrams
1546
+ cld::DoBigramScoreV3(&kCjkBiTable_obj,
1547
+ src, len, chunk_tote);
1548
+ }
1549
+ chunk_tote->AddBytes(len);
1550
+ *initial_word_span -= len;
1551
+
1552
+ if (*tote_grams >= gram_limit) {
1553
+ // Add this chunk to doc totals
1554
+ // Remove all but top40 if asked
1555
+ if (FlagTop40(flags)) {
1556
+ cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
1557
+ }
1558
+
1559
+ // Sort, accumulate into doc total, reinit
1560
+ ScoreChunkIntoDoc(src, len, advance_by,
1561
+ scriptspan->script, chunk_tote,
1562
+ doc_tote, *tote_grams, lang_hint_boost);
1563
+ *tote_grams = 0;
1564
+ } else {
1565
+ if (FLAGS_cld_html) {
1566
+ string temp(src, len);
1567
+ Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
1568
+ PrintTopLangSpeculative(top_lang);
1569
+ cld::PrintText(stderr, top_lang, temp);
1570
+ }
1571
+ }
1572
+ src += len;
1573
+ }
1574
+ // chunk_tote may have partial sum going out
1575
+ }
1576
+
1577
+ // Back up one UTF-8 character
1578
+ const uint8* BackOneUTF8(const uint8* p) {
1579
+ const uint8* retval = p - 1;
1580
+ if ((*retval & 0xc0) == 0x80) {--retval;}
1581
+ if ((*retval & 0xc0) == 0x80) {--retval;}
1582
+ if ((*retval & 0xc0) == 0x80) {--retval;}
1583
+ return retval;
1584
+ }
1585
+
1586
+
1587
+ // Score one scriptspan using quadgrams
1588
+ // Incoming chunk_tote may have partial accumulation
1589
+ static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj,
1590
+ getone::LangSpan* scriptspan,
1591
+ int* tote_grams, int gram_limit,
1592
+ Tote* chunk_tote,
1593
+ ToteWithReliability* doc_tote,
1594
+ uint8* lang_hint_boost,
1595
+ int advance_by, int flags,
1596
+ int* initial_word_span, Language plus_one) {
1597
+ // chunk_tote may have partial sum coming in
1598
+ const char* src = scriptspan->text;
1599
+ const char* srclimit = src + scriptspan->text_bytes;
1600
+ const char* lastscored_src = src;
1601
+
1602
+ // For debugging only. Not thread-safe
1603
+ prior_lang = UNKNOWN_LANGUAGE;
1604
+ prior_unreliable = false;
1605
+
1606
+ // Break text up into multiple chunks and score each
1607
+ while (src < srclimit) {
1608
+ // Updates tote_grams
1609
+ int len = cld::DoQuadScoreV3(quadgram_obj,
1610
+ src, srclimit - src, advance_by,
1611
+ tote_grams, gram_limit, chunk_tote);
1612
+ if (FlagUseWords(flags) || (*initial_word_span > 0)) {
1613
+ // Use word scoring in addition to quadgrams
1614
+ cld::DoOctaScoreV3(&kLongWord8Table_obj,
1615
+ src, len, chunk_tote);
1616
+ }
1617
+ chunk_tote->AddBytes(len);
1618
+ *initial_word_span -= len;
1619
+
1620
+ if (*tote_grams >= gram_limit) {
1621
+ // Remove all but top40 if asked
1622
+ if (FlagTop40(flags)) {
1623
+ cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
1624
+ }
1625
+
1626
+ // Sort, accumulate into doc total, reinit
1627
+ ScoreChunkIntoDoc(src, len, advance_by,
1628
+ scriptspan->script, chunk_tote,
1629
+ doc_tote, *tote_grams, lang_hint_boost);
1630
+ lastscored_src = src + len;
1631
+ *tote_grams = 0;
1632
+ } else {
1633
+ if (FLAGS_cld_html) {
1634
+ string temp(src, len);
1635
+ Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
1636
+ PrintTopLangSpeculative(top_lang);
1637
+ cld::PrintText(stderr, top_lang, temp);
1638
+ }
1639
+ }
1640
+ src += len;
1641
+ }
1642
+ }
1643
+
1644
+
1645
+
1646
+ void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1647
+ const int* text_bytes, const bool* is_reliable) {
1648
+ fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
1649
+ if (language3[0] != UNKNOWN_LANGUAGE) {
1650
+ fprintf(f, "%s%s(%d%%) ",
1651
+ ExtLanguageName(language3[0]),
1652
+ *is_reliable ? "" : "*",
1653
+ percent3[0]);
1654
+ }
1655
+ if (language3[1] != UNKNOWN_LANGUAGE) {
1656
+ fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[1]), percent3[1]);
1657
+ }
1658
+ if (language3[2] != UNKNOWN_LANGUAGE) {
1659
+ fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[2]), percent3[2]);
1660
+ }
1661
+ fprintf(f, "%d bytes \n", *text_bytes);
1662
+
1663
+ fprintf(f, "<br>\n");
1664
+ }
1665
+
1666
+
1667
+ // Start the tote with a count of one for the default language for script
1668
+ void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) {
1669
+ Language defaultlang = cld::kDefaultLanguagePerLScript[lscript];
1670
+ script_tote->Add(cld::PackLanguage(defaultlang), 1);
1671
+ script_tote->AddBytes(1);
1672
+ #if 0
1673
+ if (FLAGS_cld_html) {
1674
+ cld::PrintLang(stderr, script_tote,
1675
+ defaultlang, false,
1676
+ UNKNOWN_LANGUAGE, false);
1677
+ prior_lang = cur_lang;
1678
+ string temp("+1");
1679
+ cld::PrintText(stderr, defaultlang, temp);
1680
+ }
1681
+ #endif
1682
+ }
1683
+
1684
// Marker strings, four entries each.
// NOTE(review): presumably the debug-text labels for the per-script totes
// (the "=ScriptX=" / "=SwitchX=" markers that ScoreChunkIntoDoc rewrites);
// confirm against the callers.
static const char* const kToteName[4] = {
  "=Latn=", "=Hani=", "=Script2=", "=Script3=",
};
static const char* const kToteSwitch[4] = {
  "=Latn=", "=Hani=", "=Switch2=", "=Switch3=",
};
1688
+
1689
+
1690
+
1691
// Byte -> normalized byte. ASCII letters map to lowercase, ASCII digits map
// to themselves, and every other byte value maps to minus '-' (0x2d).
static const char kCharsetToLowerTbl[256] = {
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,

  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
};


// 1 for the ASCII letters A-Z and a-z, 0 for every other byte value.
static const char kIsAlpha[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// 1 for the ASCII digits 0-9, 0 for every other byte value.
static const char kIsDigit[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// Normalize a NUL-terminated ASCII string to its first 4 alphabetic/digit
// chars, with letters forced to lowercase ASCII. Used to normalize TLD values.
//   str   input string; bytes that are neither ASCII letters nor ASCII
//         digits are skipped.
//   norm  output buffer of at least 4 bytes. Exactly 4 bytes are written,
//         padded on the right with underscores; no NUL terminator is added.
void MakeChar4(const char* str, char* norm) {
  memcpy(norm, "____", 4);     // four underscores
  int filled = 0;
  // Single pass over the string; stop as soon as all four slots are filled
  // since any further characters would be ignored anyway.
  for (const char* p = str; (*p != '\0') && (filled < 4); ++p) {
    const unsigned char uc = static_cast<unsigned char>(*p);
    if (kIsAlpha[uc] || kIsDigit[uc]) {
      norm[filled] = kCharsetToLowerTbl[uc];
      ++filled;
    }
  }
}
1755
+
1756
+ // Find subscript of matching key in first 4 bytes of sorted hint array, or -1
1757
+ static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
1758
+ const char* norm_key) {
1759
+ // Key is always in range [lo..hi)
1760
+ int lo = 0;
1761
+ int hi = hintprobssize;
1762
+ while (lo < hi) {
1763
+ int mid = (lo + hi) >> 1;
1764
+ int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4);
1765
+ if (comp < 0) {
1766
+ lo = mid + 1;
1767
+ } else if (comp > 0) {
1768
+ hi = mid;
1769
+ } else {
1770
+ return mid;
1771
+ }
1772
+ }
1773
+ return -1;
1774
+ }
1775
+
1776
+
1777
+ // Increment the initial probabilities based on a per-TLD probs entry
1778
+ void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) {
1779
+ if (FLAGS_dbgscore) {
1780
+ fprintf(stderr, "TLD hint %s\n", tld_hint);
1781
+ }
1782
+ char normalized_tld[8];
1783
+ MakeChar4(tld_hint, normalized_tld);
1784
+ int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
1785
+ normalized_tld);
1786
+ // TLD is four bytes, probability entry is 4 bytes
1787
+ if (n >= 0) {
1788
+ uint32 probs = kTLDHintProbs[n].probs;
1789
+
1790
+ uint8 prob123 = (probs >> 0) & 0xff;
1791
+ const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1792
+ uint8 top1 = (probs >> 8) & 0xff;
1793
+ if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1794
+ uint8 top2 = (probs >> 16) & 0xff;
1795
+ if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1796
+ uint8 top3 = (probs >> 24) & 0xff;
1797
+ if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1798
+ }
1799
+ }
1800
+
1801
+
1802
+ // Increment the initial probabilities based on a per-encoding probs entry
1803
+ void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) {
1804
+ if (FLAGS_dbgscore) {
1805
+ Encoding tempenc = static_cast<Encoding>(encoding_hint);
1806
+ fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc));
1807
+ }
1808
+ if (encoding_hint < ISO_8859_1) {return;}
1809
+ if (encoding_hint >= NUM_ENCODINGS) {return;}
1810
+ uint32 probs = kEncodingHintProbs[encoding_hint];
1811
+
1812
+ uint8 prob123 = (probs >> 0) & 0xff;
1813
+ const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1814
+ uint8 top1 = (probs >> 8) & 0xff;
1815
+ if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1816
+ uint8 top2 = (probs >> 16) & 0xff;
1817
+ if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1818
+ uint8 top3 = (probs >> 24) & 0xff;
1819
+ if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1820
+ }
1821
+
1822
+
1823
+ // Increment the initial probability for given language by fixed amount
1824
+ // Does not recognize extended languages as hints
1825
+ void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) {
1826
+ if (FLAGS_dbgscore) {
1827
+ fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint));
1828
+ }
1829
+ if (language_hint < ENGLISH) {return;}
1830
+ if (language_hint >= NUM_LANGUAGES) {return;}
1831
+ uint32 probs = kLanguageHintProbs[language_hint];
1832
+
1833
+ uint8 prob123 = (probs >> 0) & 0xff;
1834
+ const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1835
+ uint8 top1 = (probs >> 8) & 0xff;
1836
+ if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1837
+ uint8 top2 = (probs >> 16) & 0xff;
1838
+ if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1839
+ uint8 top3 = (probs >> 24) & 0xff;
1840
+ if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1841
+ }
1842
+
1843
+ // Extract return values before fixups
1844
+ void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes,
1845
+ int* reliable_percent3, Language* language3, int* percent3,
1846
+ double* normalized_score3,
1847
+ int* text_bytes, bool* is_reliable) {
1848
+ reliable_percent3[0] = 0;
1849
+ reliable_percent3[1] = 0;
1850
+ reliable_percent3[2] = 0;
1851
+ language3[0] = UNKNOWN_LANGUAGE;
1852
+ language3[1] = UNKNOWN_LANGUAGE;
1853
+ language3[2] = UNKNOWN_LANGUAGE;
1854
+ percent3[0] = 100;
1855
+ percent3[1] = 0;
1856
+ percent3[2] = 0;
1857
+ normalized_score3[0] = 0.0;
1858
+ normalized_score3[1] = 0.0;
1859
+ normalized_score3[2] = 0.0;
1860
+
1861
+ *text_bytes = total_text_bytes;
1862
+ *is_reliable = false;
1863
+
1864
+ int bytecount1 = total_text_bytes;
1865
+ int bytecount2 = 0;
1866
+ int bytecount3 = 0;
1867
+
1868
+ int lang1 = doc_tote->Key(0);
1869
+ if (lang1 != 0) {
1870
+ // We have a top language
1871
+ language3[0] = cld::UnpackLanguage(lang1);
1872
+ bytecount1 = doc_tote->Value(0);
1873
+ int reli1 = doc_tote->Reliability(0);
1874
+ reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv
1875
+ normalized_score3[0] = cld::GetNormalizedScore(language3[0],
1876
+ ULScript_Common,
1877
+ bytecount1,
1878
+ doc_tote->Score(0));
1879
+ }
1880
+
1881
+ int lang2 = doc_tote->Key(1);
1882
+ if (lang2 != 0) {
1883
+ language3[1] = cld::UnpackLanguage(lang2);
1884
+ bytecount2 = doc_tote->Value(1);
1885
+ int reli2 = doc_tote->Reliability(1);
1886
+ reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv
1887
+ normalized_score3[1] = cld::GetNormalizedScore(language3[1],
1888
+ ULScript_Common,
1889
+ bytecount2,
1890
+ doc_tote->Score(1));
1891
+ }
1892
+
1893
+ int lang3 = doc_tote->Key(2);
1894
+ if (lang3 != 0) {
1895
+ language3[2] = cld::UnpackLanguage(lang3);
1896
+ bytecount3 = doc_tote->Value(2);
1897
+ int reli3 = doc_tote->Reliability(2);
1898
+ reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv
1899
+ normalized_score3[2] = cld::GetNormalizedScore(language3[2],
1900
+ ULScript_Common,
1901
+ bytecount3,
1902
+ doc_tote->Score(2));
1903
+ }
1904
+
1905
+ // Increase total bytes to sum (top 3) if low for some reason
1906
+ int total_bytecount12 = bytecount1 + bytecount2;
1907
+ int total_bytecount123 = total_bytecount12 + bytecount3;
1908
+ if (total_text_bytes < total_bytecount123) {
1909
+ total_text_bytes = total_bytecount123;
1910
+ *text_bytes = total_text_bytes;
1911
+ }
1912
+
1913
+ // Sum minus previous % gives better roundoff behavior than bytecount/total
1914
+ int total_text_bytes_div = cld::maxint(1, total_text_bytes); // Avoid zdiv
1915
+ percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1916
+ percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1917
+ percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1918
+ percent3[2] -= percent3[1];
1919
+ percent3[1] -= percent3[0];
1920
+
1921
+ // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1922
+ // Fix this explicitly
1923
+ if (percent3[1] < percent3[2]) {
1924
+ ++percent3[1];
1925
+ --percent3[2];
1926
+ }
1927
+ if (percent3[0] < percent3[1]) {
1928
+ ++percent3[0];
1929
+ --percent3[1];
1930
+ }
1931
+
1932
+ *text_bytes = total_text_bytes;
1933
+
1934
+ if (lang1 != 0) {
1935
+ // We have a top language
1936
+ // Its reliability is overal result reliability
1937
+ int bytecount = doc_tote->Value(0);
1938
+ int reli = doc_tote->Reliability(0);
1939
+ int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv
1940
+ *is_reliable = reliable_percent >= cld::kMinReliable;
1941
+ } else {
1942
+ // No top language at all. This can happen with zero text or 100% Klingon
1943
+ // if extended=false. Just return all UNKNOWN_LANGUAGE, reliable.
1944
+ *is_reliable = true;
1945
+ }
1946
+ }
1947
+
1948
+ bool IsFIGS(Language lang) {
1949
+ if (lang == FRENCH) {return true;}
1950
+ if (lang == ITALIAN) {return true;}
1951
+ if (lang == GERMAN) {return true;}
1952
+ if (lang == SPANISH) {return true;}
1953
+ return false;
1954
+ }
1955
+
1956
+ bool IsEFIGS(Language lang) {
1957
+ if (lang == ENGLISH) {return true;}
1958
+ if (lang == FRENCH) {return true;}
1959
+ if (lang == ITALIAN) {return true;}
1960
+ if (lang == GERMAN) {return true;}
1961
+ if (lang == SPANISH) {return true;}
1962
+ return false;
1963
+ }
1964
+
1965
// Percent-of-text thresholds. Each trailing note gives the outcome
// associated with a value on the indicated side of the threshold.
static const int kNonEnBoilerplateMinPercent = 17;    // <this => no second
static const int kNonFIGSBoilerplateMinPercent = 20;  // <this => no second
static const int kGoodFirstMinPercent = 26;           // <this => UNK
static const int kGoodFirstReliableMinPercent = 51;   // <this => unreli
static const int kIgnoreMaxPercent = 95;              // >this => unreli
static const int kKeepMinPercent = 2;                 // <this => unreli

// Minimum bytes of text a second-place language needs in order to override
// the first-place language. Tier3 languages require more bytes.
static const int kGoodSecondT1T2MinBytes = 15;        // <this => no second
static const int kGoodSecondT3MinBytes = 128;         // <this => no second
1976
+ //
1977
+
1978
+ // Calculate a single summary language for the document, and its reliability.
1979
+ // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
1980
+ // This is the heart of matching human-rater perception.
1981
+ // reliable_percent3[] is currently unused
1982
+ //
1983
+ // Do not return Tier3 second language unless there are at least 128 bytes
1984
+ void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes,
1985
+ const int* reliable_percent3,
1986
+ const Language* language3,
1987
+ const int* percent3,
1988
+ Language* summary_lang, bool* is_reliable) {
1989
+ // Vector of active languages; changes if we delete some
1990
+ int slot_count = 3;
1991
+ int active_slot[3] = {0, 1, 2};
1992
+
1993
+ int ignore_percent = 0;
1994
+ int return_percent = percent3[0]; // Default to top lang
1995
+ *summary_lang = language3[0];
1996
+ *is_reliable = true;
1997
+ if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
1998
+
1999
+ // If any of top 3 is IGNORE, remove it and increment ignore_percent
2000
+ for (int i = 0; i < 3; ++i) {
2001
+ if (language3[i] == TG_UNKNOWN_LANGUAGE) {
2002
+ ignore_percent += percent3[i];
2003
+ // Move the rest up, levaing input vectors unchanged
2004
+ for (int j=i+1; j < 3; ++j) {
2005
+ active_slot[j - 1] = active_slot[j];
2006
+ }
2007
+ -- slot_count;
2008
+ // Logically remove Ignore from percentage-text calculation
2009
+ // (extra 1 in 101 avoids zdiv, biases slightly small)
2010
+ return_percent = (percent3[0] * 100) / (101 - ignore_percent);
2011
+ *summary_lang = language3[active_slot[0]];
2012
+ if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
2013
+ }
2014
+ }
2015
+
2016
+
2017
+ // If English and X, where X (not UNK) is big enough,
2018
+ // assume the English is boilerplate and return X.
2019
+ // Logically remove English from percentage-text calculation
2020
+ int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
2021
+ // Require more bytes of text for Tier3 languages
2022
+ int minbytesneeded = kGoodSecondT1T2MinBytes;
2023
+ int plang_second = cld::PackLanguage(language3[active_slot[1]]);
2024
+ bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0);
2025
+ if (is_tier3) {
2026
+ minbytesneeded = kGoodSecondT3MinBytes;
2027
+ }
2028
+
2029
+ if ((language3[active_slot[0]] == ENGLISH) &&
2030
+ (language3[active_slot[1]] != ENGLISH) &&
2031
+ (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
2032
+ (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
2033
+ (second_bytes >= minbytesneeded)) {
2034
+ ignore_percent += percent3[active_slot[0]];
2035
+ return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
2036
+ *summary_lang = language3[active_slot[1]];
2037
+ if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
2038
+
2039
+ // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
2040
+ // assume the FIGS is boilerplate and return X.
2041
+ // Logically remove FIGS from percentage-text calculation
2042
+ } else if (IsFIGS(language3[active_slot[0]]) &&
2043
+ !IsEFIGS(language3[active_slot[1]]) &&
2044
+ (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
2045
+ (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
2046
+ (second_bytes >= minbytesneeded)) {
2047
+ ignore_percent += percent3[active_slot[0]];
2048
+ return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
2049
+ *summary_lang = language3[active_slot[1]];
2050
+ if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
2051
+
2052
+ // Else we are returning the first language, but want to improve its
2053
+ // return_percent if the second language should be ignored
2054
+ } else if ((language3[active_slot[1]] == ENGLISH) &&
2055
+ (language3[active_slot[0]] != ENGLISH)) {
2056
+ ignore_percent += percent3[active_slot[1]];
2057
+ return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
2058
+ } else if (IsFIGS(language3[active_slot[1]]) &&
2059
+ !IsEFIGS(language3[active_slot[0]])) {
2060
+ ignore_percent += percent3[active_slot[1]];
2061
+ return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
2062
+ }
2063
+
2064
+ // If return percent is too small (too many languages), return UNKNOWN
2065
+ if ((return_percent < kGoodFirstMinPercent)) {
2066
+ *summary_lang = UNKNOWN_LANGUAGE;
2067
+ *is_reliable = false;
2068
+ }
2069
+
2070
+ // If return percent is small, return language but set unreliable.
2071
+ if ((return_percent < kGoodFirstReliableMinPercent)) {
2072
+ *is_reliable = false;
2073
+ }
2074
+
2075
+ // If ignore percent is too large, set unreliable.
2076
+ if ((ignore_percent > kIgnoreMaxPercent)) {
2077
+ *is_reliable = false;
2078
+ }
2079
+
2080
+ // If we removed all the active languages, return UNKNOWN
2081
+ if (slot_count == 0) {
2082
+ *summary_lang = UNKNOWN_LANGUAGE;
2083
+ *is_reliable = false;
2084
+ }
2085
+ }
2086
+
2087
+
2088
+
2089
+ // Result vector must be exactly three items
2090
+ Language CompactLangDetImpl::DetectLanguageSummaryV25(
2091
+ const CompactLangDet::DetectionTables* tables,
2092
+ const char* buffer,
2093
+ int buffer_length,
2094
+ bool is_plain_text,
2095
+ bool do_pick_summary_language,
2096
+ bool do_remove_weak_matches,
2097
+ const char* tld_hint, // "id" boosts Indonesian
2098
+ int encoding_hint, // SJS boosts Japanese
2099
+ Language language_hint, // ITALIAN boosts it
2100
+ bool allow_extended_lang,
2101
+ int flags,
2102
+ Language plus_one,
2103
+ Language* language3,
2104
+ int* percent3,
2105
+ double* normalized_score3,
2106
+ int* text_bytes,
2107
+ bool* is_reliable) {
2108
+ if (!tables) {
2109
+ static const CompactLangDet::DetectionTables default_cld_tables = {
2110
+ &kQuadTable_obj,
2111
+ &compact_lang_det_generated_ctjkvz_b1_obj
2112
+ };
2113
+ tables = &default_cld_tables;
2114
+ }
2115
+ language3[0] = UNKNOWN_LANGUAGE;
2116
+ language3[1] = UNKNOWN_LANGUAGE;
2117
+ language3[2] = UNKNOWN_LANGUAGE;
2118
+ percent3[0] = 100;
2119
+ percent3[1] = 0;
2120
+ percent3[2] = 0;
2121
+ normalized_score3[0] = 0.0;
2122
+ normalized_score3[1] = 0.0;
2123
+ normalized_score3[2] = 0.0;
2124
+ *text_bytes = 0;
2125
+ *is_reliable = false;
2126
+
2127
+ // Document totals
2128
+ ToteWithReliability doc_tote; // Reliability = 0..100
2129
+
2130
+ // Vector of packed per-language boosts (just one filled in from hints)
2131
+ uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1];
2132
+ memset(lang_hint_boost, 0, sizeof(lang_hint_boost));
2133
+
2134
+ // Apply hints,if any
2135
+ if ((tld_hint != NULL) && (tld_hint[0] != '\0')) {
2136
+ ApplyTLDHint(lang_hint_boost, tld_hint);
2137
+ }
2138
+ if (encoding_hint != UNKNOWN_ENCODING) {
2139
+ ApplyEncodingHint(lang_hint_boost, encoding_hint);
2140
+ }
2141
+ if (language_hint != UNKNOWN_LANGUAGE) {
2142
+ ApplyLanguageHint(lang_hint_boost, language_hint);
2143
+ }
2144
+
2145
+
2146
+ // Four individual script totals, Latin, Han, other2, other3
2147
+ int next_other_tote = 2;
2148
+
2149
+ // Four totes for up to four different scripts pending at once
2150
+ Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other
2151
+ bool tote_seen[4] = {false, false, false, false};
2152
+ int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk
2153
+ UnicodeLScript tote_script[4] =
2154
+ {ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common};
2155
+
2156
+ // Loop through text spans in a single script
2157
+ ScriptScanner ss(buffer, buffer_length, is_plain_text);
2158
+ getone::LangSpan scriptspan;
2159
+
2160
+ scriptspan.text = NULL;
2161
+ scriptspan.text_bytes = 0;
2162
+ scriptspan.offset = 0;
2163
+ scriptspan.script = ULScript_Common;
2164
+ scriptspan.lang = UNKNOWN_LANGUAGE;
2165
+
2166
+ int total_text_bytes = 0;
2167
+ int textlimit = FLAGS_cld_textlimit << 10; // in KB
2168
+ if (textlimit == 0) {textlimit = 0x7fffffff;}
2169
+
2170
+ int advance_by = 2; // Advance 2 bytes
2171
+ int advance_limit = textlimit >> 3; // For first 1/8 of max document
2172
+
2173
+ int initial_word_span = kDefaultWordSpan;
2174
+ if (FLAGS_cld_forcewords) {
2175
+ initial_word_span = kReallyBigWordSpan;
2176
+ }
2177
+
2178
+ // Pick up chunk sizes
2179
+ // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
2180
+ // Sanity check -- force into a reasonable range
2181
+ int chunksizequads = FLAGS_cld_smoothwidth;
2182
+ chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads),
2183
+ kMaxChunkSizeQuads);
2184
+ int chunksizeunis = (chunksizequads * 5) >> 1;
2185
+
2186
+ // Varying short-span limit doesn't work well -- skips too much beyond 20KB
2187
+ // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
2188
+ int spantooshortlimit = kShortSpanThresh;
2189
+
2190
+ // For debugging only. Not thread-safe
2191
+ prior_lang = UNKNOWN_LANGUAGE;
2192
+ prior_unreliable = false;
2193
+
2194
+ // Allocate full-document prediction table for finding repeating words
2195
+ int hash = 0;
2196
+ int* predict_tbl = new int[kPredictionTableSize];
2197
+ if (FlagRepeats(flags)) {
2198
+ memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
2199
+ }
2200
+
2201
+ // Loop through scriptspans accumulating number of text bytes in each language
2202
+ while (ss.GetOneScriptSpanLower(&scriptspan)) {
2203
+ UnicodeLScript lscript = scriptspan.script;
2204
+
2205
+ // Echo text if asked to
2206
+ if (FLAGS_cld_echotext) {
2207
+ PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes);
2208
+ }
2209
+
2210
+ // Squeeze out big chunks of text span if asked to
2211
+ if (FlagSqueeze(flags)) {
2212
+ // Remove repetitive or mostly-spaces chunks
2213
+ int newlen;
2214
+ int chunksize = 0; // Use the default
2215
+ newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
2216
+ chunksize);
2217
+ scriptspan.text_bytes = newlen;
2218
+ } else {
2219
+ // Check now and then to see if we should be squeezing
2220
+ if ((total_text_bytes >= kCheapSqueezeTestThresh) &&
2221
+ !FlagFinish(flags) &&
2222
+ ((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) &&
2223
+ CheapSqueezeTriggerTest(scriptspan.text,
2224
+ scriptspan.text_bytes,
2225
+ kCheapSqueezeTestLen)) {
2226
+ // Recursive call with big-chunk squeezing set
2227
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2228
+ fprintf(stderr,
2229
+ "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
2230
+ total_text_bytes);
2231
+ }
2232
+ // Deallocate full-document prediction table
2233
+ delete[] predict_tbl;
2234
+
2235
+ return DetectLanguageSummaryV25(
2236
+ tables,
2237
+ buffer,
2238
+ buffer_length,
2239
+ is_plain_text,
2240
+ do_pick_summary_language,
2241
+ do_remove_weak_matches,
2242
+ tld_hint, // "id" boosts Indonesian
2243
+ encoding_hint, // SJS boosts Japanese
2244
+ language_hint, // ITALIAN boosts it
2245
+ allow_extended_lang,
2246
+ flags | kCLDFlagSqueeze,
2247
+ plus_one,
2248
+ language3,
2249
+ percent3,
2250
+ normalized_score3,
2251
+ text_bytes,
2252
+ is_reliable);
2253
+ }
2254
+ }
2255
+
2256
+ // Remove repetitive words if asked to
2257
+ if (FlagRepeats(flags)) {
2258
+ // Remove repetitive words
2259
+ int newlen;
2260
+ newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
2261
+ &hash, predict_tbl);
2262
+ scriptspan.text_bytes = newlen;
2263
+ }
2264
+
2265
+ // The real scoring
2266
+ // Accumulate directly into the document total, or accmulate in one of four
2267
+ // chunk totals. The purpose of the multiple chunk totals is to piece
2268
+ // together short choppy pieces of text in alternating scripts. One total is
2269
+ // dedicated to Latin text, one to Han text, and the other two are dynamicly
2270
+ // assigned.
2271
+ Language onlylang = cld::kOnlyLanguagePerLScript[lscript];
2272
+
2273
+ if (onlylang != UNKNOWN_LANGUAGE) {
2274
+ // This entire script run is in a single language.
2275
+ ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote,
2276
+ lang_hint_boost, flags, plus_one);
2277
+ } else if (cld::kScoreUniPerLScript[lscript] != 0) {
2278
+ // This entire script run's languages can be distinguished by uni-grams
2279
+ // Accumulate in hani_tote
2280
+ int tote_num = 1;
2281
+ if (!tote_seen[tote_num]) {
2282
+ tote_seen[tote_num] = true;
2283
+ // Default language gets 1 byte
2284
+ total_text_bytes += 1;
2285
+ InitScriptToteLang(&totes[tote_num], lscript);
2286
+ }
2287
+ ScoreUnigrams(tables->unigram_obj,
2288
+ &scriptspan, &tote_grams[tote_num], chunksizeunis,
2289
+ &totes[tote_num],
2290
+ &doc_tote, lang_hint_boost,
2291
+ advance_by, flags, &initial_word_span, plus_one);
2292
+ } else {
2293
+ // This entire script-run's languages can be distinguished by quad-grams
2294
+ // Accumulate in latn_tote or script0/1_tote
2295
+ int tote_num = -1;
2296
+ for (int t = 0; t < 4; ++t) {
2297
+ if (lscript == tote_script[t]) {
2298
+ tote_num = t;
2299
+ break;
2300
+ }
2301
+ }
2302
+ if (tote_num < 0) {
2303
+ // Need to allocate other0/1
2304
+ tote_num = next_other_tote;
2305
+ next_other_tote ^= 1; // Round-robin
2306
+ if (tote_seen[tote_num]) {
2307
+ // Flush previous
2308
+ ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by,
2309
+ tote_script[tote_num], &totes[tote_num],
2310
+ &doc_tote, tote_grams[tote_num], lang_hint_boost);
2311
+ totes[tote_num].Reinit();
2312
+ }
2313
+ tote_script[tote_num] = lscript;
2314
+ }
2315
+
2316
+ if (!tote_seen[tote_num]) {
2317
+ tote_seen[tote_num] = true;
2318
+ // Default language gets 1 byte
2319
+ total_text_bytes += 1;
2320
+ InitScriptToteLang(&totes[tote_num], lscript);
2321
+ }
2322
+
2323
+ // The actual accumulation, possibly with word scoring also
2324
+ ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num],
2325
+ chunksizequads,
2326
+ &totes[tote_num],
2327
+ &doc_tote, lang_hint_boost,
2328
+ advance_by, flags, &initial_word_span, plus_one);
2329
+ }
2330
+
2331
+ total_text_bytes += scriptspan.text_bytes;
2332
+
2333
+ // For long documents, do less-dense samples the further along we go.
2334
+ // This is to keep speed sublinear in document size.
2335
+ if (total_text_bytes > advance_limit) {
2336
+ if (total_text_bytes > textlimit) {
2337
+ // Don't look at rest of doc
2338
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2339
+ fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>",
2340
+ total_text_bytes, textlimit);
2341
+ }
2342
+ break;
2343
+ }
2344
+ advance_by <<= 1; // Double advance bytes
2345
+ advance_limit <<= 1; // Double limit until next change
2346
+ spantooshortlimit <<= 1; // Double short-span size
2347
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2348
+ fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>",
2349
+ total_text_bytes, advance_by);
2350
+ }
2351
+ }
2352
+ } // End while (ss.GetOneScriptSpanLower())
2353
+
2354
+ // Deallocate full-document prediction table
2355
+ delete[] predict_tbl;
2356
+
2357
+ // Flush pending totals
2358
+ for (int tote_num = 0; tote_num < 4; ++tote_num) {
2359
+ if (tote_seen[tote_num]) {
2360
+ ScoreChunkIntoDoc2(kToteName[tote_num], advance_by,
2361
+ tote_script[tote_num], &totes[tote_num], &doc_tote,
2362
+ tote_grams[tote_num], lang_hint_boost);
2363
+ }
2364
+ }
2365
+
2366
+ // If extended languages are disallowed, remove them here
2367
+ if (!allow_extended_lang) {
2368
+ RemoveExtendedLanguages(&doc_tote);
2369
+ }
2370
+
2371
+ // Force close pairs to one or the other
2372
+ RefineScoredClosePairs(&doc_tote);
2373
+
2374
+
2375
+ // Calculate return results
2376
+ // Find top three byte counts in tote heap
2377
+ int reliable_percent3[3];
2378
+
2379
+
2380
+ // Cannot use Add, etc. after sorting
2381
+ doc_tote.Sort(3);
2382
+
2383
+ ExtractLangEtc(&doc_tote, total_text_bytes,
2384
+ reliable_percent3, language3, percent3, normalized_score3,
2385
+ text_bytes, is_reliable);
2386
+
2387
+ bool have_good_answer = false;
2388
+ if (FlagFinish(flags)) {
2389
+ // Force a result
2390
+ have_good_answer = true;
2391
+ } else if (total_text_bytes <= kShortTextThresh) {
2392
+ // Don't recurse on short text -- we already did word scores
2393
+ have_good_answer = true;
2394
+ } else if (*is_reliable &&
2395
+ (percent3[0] >= kGoodLang1Percent)) {
2396
+ have_good_answer = true;
2397
+ } else if (*is_reliable &&
2398
+ ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
2399
+ have_good_answer = true;
2400
+ }
2401
+
2402
+
2403
+ if (have_good_answer) {
2404
+ // This is the real, non-recursive return
2405
+
2406
+ // Move bytes for unreliable langs to another lang or
2407
+ // UNKNOWN
2408
+ RemoveUnreliableLanguages(&doc_tote, do_remove_weak_matches);
2409
+
2410
+ // Redo the result extraction after the removal above
2411
+ doc_tote.Sort(3);
2412
+
2413
+ ExtractLangEtc(&doc_tote, total_text_bytes,
2414
+ reliable_percent3, language3, percent3, normalized_score3,
2415
+ text_bytes, is_reliable);
2416
+
2417
+ #if 0
2418
+ // OLD code, replaced by CalcSummaryLang
2419
+ //
2420
+ // Suppress ignore-me text, TG_UNKNOWN_LANGUAGE if 2nd or 3rd language
2421
+ // Force it to English if first language
2422
+ if (language3[2] == TG_UNKNOWN_LANGUAGE) {
2423
+ reliable_percent3[2] = 0;
2424
+ language3[2] = UNKNOWN_LANGUAGE;
2425
+ percent3[2] = 0;
2426
+ } else if (language3[1] == TG_UNKNOWN_LANGUAGE) {
2427
+ // Move up lower language
2428
+ reliable_percent3[1] = reliable_percent3[2];
2429
+ language3[1] = language3[2];
2430
+ percent3[1] = percent3[2];
2431
+ reliable_percent3[2] = 0;
2432
+ language3[2] = UNKNOWN_LANGUAGE;
2433
+ percent3[2] = 0;
2434
+ } else if (language3[0] == TG_UNKNOWN_LANGUAGE) {
2435
+ language3[0] = ENGLISH;
2436
+ }
2437
+
2438
+ if (language3[0] == UNKNOWN_LANGUAGE) {
2439
+ // Last-ditch test for some result, but it is UNKNOWN_LANGUAGE
2440
+ // Force it to English (should not happen)
2441
+ language3[0] = ENGLISH;
2442
+ percent3[0] = 100;
2443
+ *is_reliable = true;
2444
+ }
2445
+ #endif
2446
+
2447
+
2448
+ #if 0
2449
+ // Scaffolding to reveal subset sequence lang distribution across doc text
2450
+ // Track the sequence of language fragments [result currently unused]
2451
+ if (FLAGS_cld_html) {
2452
+ static const int kMaxSubsetSeq = 12;
2453
+ uint8 subseq[kMaxSubsetSeq];
2454
+ doc_tote.ExtractSeq(kMaxSubsetSeq, subseq);
2455
+
2456
+ fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq);
2457
+ for (int i = 0; i < kMaxSubsetSeq; ++i) {
2458
+ fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i])));
2459
+ if ((i % 4) == 3) {fprintf(stderr, "&nbsp; ");}
2460
+ }
2461
+ fprintf(stderr, "&nbsp;&nbsp; ");
2462
+
2463
+ for (int i = 0; i < 3; ++i) {
2464
+ if (language3[i] != UNKNOWN_LANGUAGE) {
2465
+ fprintf(stderr, "%s.%d(%d%%) ",
2466
+ ExtLanguageCode(language3[i]),
2467
+ reliable_percent3[i],
2468
+ percent3[i]);
2469
+ }
2470
+ }
2471
+
2472
+ fprintf(stderr, "%d B ", total_text_bytes);
2473
+ fprintf(stderr, "<br>\n");
2474
+ }
2475
+ // End Scaffolding to reveal subset sequence lang distribution
2476
+ #endif
2477
+
2478
+ Language summary_lang;
2479
+ if (do_pick_summary_language) {
2480
+ CalcSummaryLang(&doc_tote, total_text_bytes,
2481
+ reliable_percent3, language3, percent3,
2482
+ &summary_lang, is_reliable);
2483
+ } else {
2484
+ summary_lang = language3[0];
2485
+ }
2486
+
2487
+ if (FLAGS_cld_html) {
2488
+ for (int i = 0; i < 3; ++i) {
2489
+ if (language3[i] != UNKNOWN_LANGUAGE) {
2490
+ fprintf(stderr, "%s.%d(%d%%) ",
2491
+ ExtLanguageCode(language3[i]),
2492
+ reliable_percent3[i],
2493
+ percent3[i]);
2494
+ }
2495
+ }
2496
+
2497
+ fprintf(stderr, "%d B ", total_text_bytes);
2498
+ fprintf(stderr, "= %s%c ",
2499
+ ExtLanguageName(summary_lang), is_reliable ? ' ' : '*');
2500
+ fprintf(stderr, "<br>\n");
2501
+ }
2502
+
2503
+ return summary_lang;
2504
+ }
2505
+
2506
+ // Not a good answer -- do recursive call to refine
2507
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2508
+ // This is what we hope to improve on in the recursive call, if any
2509
+ PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
2510
+ }
2511
+
2512
+ // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
2513
+ // For this purpose, we treate "Ignore" as top40
2514
+ Language new_plus_one = UNKNOWN_LANGUAGE;
2515
+ if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) {
2516
+ new_plus_one = language3[0];
2517
+ } else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) {
2518
+ new_plus_one = language3[1];
2519
+ }
2520
+
2521
+ if (total_text_bytes < kShortTextThresh) {
2522
+ // Short text: Recursive call with top40 and short set
2523
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2524
+ fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
2525
+ "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
2526
+ total_text_bytes);
2527
+ }
2528
+ return DetectLanguageSummaryV25(
2529
+ tables,
2530
+ buffer,
2531
+ buffer_length,
2532
+ is_plain_text,
2533
+ do_pick_summary_language,
2534
+ do_remove_weak_matches,
2535
+ tld_hint, // "id" boosts Indonesian
2536
+ encoding_hint, // SJS boosts Japanese
2537
+ language_hint, // ITALIAN boosts it
2538
+ allow_extended_lang,
2539
+ flags | kCLDFlagTop40 | kCLDFlagRepeats |
2540
+ kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
2541
+ new_plus_one,
2542
+ language3,
2543
+ percent3,
2544
+ normalized_score3,
2545
+ text_bytes,
2546
+ is_reliable);
2547
+ }
2548
+
2549
+ // Longer text: Recursive call with top40 set
2550
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2551
+ fprintf(stderr,
2552
+ "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2553
+ total_text_bytes);
2554
+ }
2555
+ return DetectLanguageSummaryV25(
2556
+ tables,
2557
+ buffer,
2558
+ buffer_length,
2559
+ is_plain_text,
2560
+ do_pick_summary_language,
2561
+ do_remove_weak_matches,
2562
+ tld_hint, // "id" boosts Indonesian
2563
+ encoding_hint, // SJS boosts Japanese
2564
+ language_hint, // ITALIAN boosts it
2565
+ allow_extended_lang,
2566
+ flags | kCLDFlagTop40 | kCLDFlagRepeats |
2567
+ kCLDFlagFinish,
2568
+ new_plus_one,
2569
+ language3,
2570
+ percent3,
2571
+ normalized_score3,
2572
+ text_bytes,
2573
+ is_reliable);
2574
+ } // End CompactLangDetImpl::DetectLanguageSummaryV25