language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,2574 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include <stdio.h>
6
+ #include <string.h>
7
+ //#include <sys/time.h> // for gettimeofday
8
+ #include <string>
9
+
10
+ #include "encodings/lang_enc.h"
11
+
12
+ #include "encodings/compact_lang_det/compact_lang_det.h"
13
+ #include "encodings/compact_lang_det/compact_lang_det_impl.h"
14
+ #include "encodings/compact_lang_det/getonescriptspan.h"
15
+ #include "encodings/compact_lang_det/letterscript_enum.h"
16
+ #include "encodings/compact_lang_det/tote.h"
17
+ #include "encodings/compact_lang_det/utf8propjustletter.h"
18
+ #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
19
+ #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
20
+
21
+ #include "encodings/compact_lang_det/cldutil_dbg.h"
22
+
23
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
24
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
25
+ #include "encodings/compact_lang_det/win/cld_google.h"
26
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
27
+
28
+ // Linker supplies the right tables: the four objects below are only declared here and must be defined in another translation unit (presumably the generated/*.cc sources -- confirm).
29
+ extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj; // UTF8PropObj table; the type is declared outside this file
30
+ extern const cld::CLDTableSummary kCjkBiTable_obj; // NOTE(review): name suggests the CJK bigram table -- confirm
31
+ extern const cld::CLDTableSummary kQuadTable_obj; // NOTE(review): name suggests the quadgram table -- confirm
32
+ extern const cld::CLDTableSummary kLongWord8Table_obj; // NOTE(review): name suggests the long-word table -- confirm
33
+
34
+ DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr"); // Flag definitions: each DEFINE_* takes (name, default value, help text); the macros are defined outside this file
35
+ DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads"); // Off by default
36
+
37
+ DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text"); // Off by default
38
+ DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr"); // Off by default
39
+ DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text"); // Default n is 160, in KB per the help text
40
+ // 20 quadgrams is about 80 bytes or about 12 words in real text (rationale for the default smoothing width below)
41
+ DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams"); // Default width is 20 quadgrams
42
+
43
+
44
+ static const int kLangHintInitial = 12; // Boost language by N initially
45
+ static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram
46
+
47
+ static const int kShortSpanThresh = 32; // Bytes; NOTE(review): presumably spans below this length count as short -- confirm at the use site
48
+ static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans
49
+
50
+ static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing
51
+ // after this many text bytes
52
+ static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger squeezing
53
+ static const int kSpacesTriggerPercent = 25; // Trigger squeezing if >=25% spaces
54
+ static const int kPredictTriggerPercent = 67; // Trigger squeezing if >=67% predicted
55
+
56
+ static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks
57
+ static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces
58
+ static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted
59
+
60
+ static const int kMaxSpaceScan = 32; // Bytes
61
+
62
+ static const int kGoodLang1Percent = 70; // NOTE(review): presumably the percent of text the top language must cover to be trusted -- confirm at the use site
63
+ static const int kGoodLang1and2Percent = 93; // NOTE(review): presumably the same test for the top two languages combined -- confirm at the use site
64
+ static const int kShortTextThresh = 256; // Bytes
65
+
66
+ static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads
67
+ static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads
68
+
69
+ static const int kDefaultWordSpan = 256; // Scan at least this many initial
70
+ // bytes with word scoring
71
+ static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text
72
+
73
+ static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable
74
+
75
+ static const int kPredictionTableSize = 4096; // Must be exactly 4096 for
76
+ // cheap compressor
77
+
78
+ //
79
+ // Generated by dsites 2008.07.07 from 10% of Base
80
+ //
81
+
82
+ // Three packed language probs, subscripted by Encoding: one uint32 row per encoding, in the order named by the row comments; 0x00000000 means no language hint for that encoding.
83
+ static const uint32 kEncodingHintProbs[] = { // Layout as shown by the row comments: bits 8-15 hold the first listed language, bits 16-23 the second, bits 24-31 the third (two-language rows leave bits 16-23 zero and use bits 24-31); each language byte is that language's 0-based row in kLanguageHintProbs plus one (0x0c = POLISH). The low byte presumably selects the weights printed after the dots -- decoding is not visible here, confirm.
84
+ 0x00000000, // ASCII
85
+ 0x18120cd5, // Latin2 POLISH.11 CZECH.5 HUNGARIAN.3
86
+ 0x1d3a4bc9, // Latin3 AZERBAIJANI.10 BASQUE.3 CROATIAN.1
87
+ 0x030819d4, // Latin4 ESTONIAN.11 ITALIAN.4 DUTCH.2
88
+ 0x00000000, // ISO-8859-5
89
+ 0x00003742, // Arabic ARABIC.12
90
+ 0x00000000, // Greek
91
+ 0x00000742, // Hebrew HEBREW.12
92
+ 0x00002242, // Latin5 TURKISH.12
93
+ 0x060419c9, // Latin6 ESTONIAN.10 FINNISH.3 GERMAN.1
94
+ 0x00000942, // EUC-JP Japanese.12
95
+ 0x00000942, // SJS Japanese.12
96
+ 0x00000942, // JIS Japanese.12
97
+ 0x00004642, // BIG5 ChineseT.12
98
+ 0x00001142, // GB Chinese.12
99
+ 0x46295fcd, // EUC-CN UIGHUR.10 MALAY.6 ChineseT.5
100
+ 0x00000a42, // KSC Korean.12
101
+ 0x00000000, // Unicode
102
+ 0x03104674, // EUC ChineseT.9 SWEDISH.8 DUTCH.3
103
+ 0x00000000, // CNS
104
+ 0x0f1146c3, // BIG5-CP950 ChineseT.9 Chinese.5 SPANISH.4
105
+ 0x00000942, // CP932 Japanese.12
106
+ 0x00000000, // UTF8
107
+ 0x00000000, // Unknown
108
+ 0x00000000, // ASCII-7-bit
109
+ 0x00000000, // KOI8R
110
+ 0x00000000, // CP1251
111
+ 0x00000000, // CP1252
112
+ 0x00000000, // KOI8U
113
+ 0x451d12cd, // CP1250 CZECH.10 CROATIAN.6 SLOVAK.5
114
+ 0x0d06052a, // ISO-8859-15 FRENCH.9 GERMAN.8 PORTUGUESE.7
115
+ 0x00002242, // CP1254 TURKISH.12
116
+ 0x191516be, // CP1257 LITHUANIAN.8 LATVIAN.7 ESTONIAN.7
117
+ 0x08003642, // ISO-8859-11 THAI.12 ITALIAN.1
118
+ 0x00000000, // CP874
119
+ 0x00003742, // CP1256 ARABIC.12
120
+ 0x00000742, // CP1255 HEBREW.12
121
+ 0x00000000, // ISO-8859-8-I
122
+ 0x00000000, // VISUAL
123
+ 0x00000000, // CP852
124
+ 0x39001242, // CSN_369103 CZECH.12 ESPERANTO.1
125
+ 0x00000000, // CP1253
126
+ 0x00000000, // CP866
127
+ 0x2e001944, // ISO-8859-13 ESTONIAN.12 ALBANIAN.3
128
+ 0x08090a74, // ISO-2022-KR Korean.9 Japanese.8 ITALIAN.3
129
+ 0x00001142, // GBK Chinese.12
130
+ 0x4600113d, // GB18030 Chinese.11 ChineseT.7
131
+ 0x00004642, // BIG5_HKSCS ChineseT.12
132
+ 0x00000000, // ISO_2022_CN
133
+ 0x00000000, // TSCII
134
+ 0x00000000, // TAM
135
+ 0x00000000, // TAB
136
+ 0x00000000, // JAGRAN
137
+ 0x00000000, // MACINTOSH
138
+ 0x00000000, // UTF7
139
+ 0x00000000, // BHASKAR
140
+ 0x00000000, // HTCHANAKYA
141
+ 0x090646ca, // UTF-16BE ChineseT.10 GERMAN.4 Japanese.2
142
+ 0x00000000, // UTF-16LE
143
+ 0x00000000, // UTF-32BE
144
+ 0x00000000, // UTF-32LE
145
+ 0x00000000, // X-BINARYENC
146
+ 0x06001142, // HZ-GB-2312 Chinese.12 GERMAN.1
147
+ 0x461109c2, // X-UTF8UTF8 Japanese.9 Chinese.5 ChineseT.3
148
+ 0x00000000, // X-TAM-ELANGO
149
+ 0x00000000, // X-TAM-LTTMBARANI
150
+ 0x00000000, // X-TAM-SHREE
151
+ 0x00000000, // X-TAM-TBOOMIS
152
+ 0x00000000, // X-TAM-TMNEWS
153
+ 0x00000000, // X-TAM-WEBTAMIL
154
+ 0x00000000, // X-KDDI-Shift_JIS
155
+ 0x00000000, // X-DoCoMo-Shift_JIS
156
+ 0x00000000, // X-SoftBank-Shift_JIS
157
+ 0x00000000, // X-KDDI-ISO-2022-JP
158
+ 0x00000000, // X-SoftBank-ISO-2022-JP
159
+ };
160
+
161
+ COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS, // Checks that the table has exactly NUM_ENCODINGS rows (macro and constant are defined outside this file)
162
+ kEncodingHintProbs_has_incorrect_size);
163
+
164
+ //
165
+ // Generated by dsites 2008.07.07 from 10% of Base
166
+ //
167
+
168
+ // Three packed language probs, subscripted by (anchor) language: one uint32 row per language, in the order named by the row comments; 0x00000000 means no hint for that anchor language.
168
+ static const uint32 kLanguageHintProbs[] = { // Layout as shown by the row comments: bits 8-15 hold the first listed language, bits 16-23 the second, bits 24-31 the third (two-language rows leave bits 16-23 zero and use bits 24-31); a language byte is that language's 0-based row in this table plus one (DANISH is row 1 and appears as 0x02). The low byte presumably selects the weights printed after the dots -- decoding is not visible here, confirm.
169
+ 0x00000000, // ENGLISH
170
+ 0x00000242, // DANISH DANISH.12
171
+ 0x00000342, // DUTCH DUTCH.12
172
+ 0x00000442, // FINNISH FINNISH.12
173
+ 0x00000542, // FRENCH FRENCH.12
174
+ 0x00000642, // GERMAN GERMAN.12
175
+ 0x00000742, // HEBREW HEBREW.12
176
+ 0x00000842, // ITALIAN ITALIAN.12
177
+ 0x00000942, // Japanese Japanese.12
178
+ 0x00000a42, // Korean Korean.12
179
+ 0x51000b43, // NORWEGIAN NORWEGIAN.12 NORWEGIAN_N.2
180
+ 0x00000c42, // POLISH POLISH.12
181
+ 0x00000d42, // PORTUGUESE PORTUGUESE.12
182
+ 0x00000000, // RUSSIAN
183
+ 0x00000f42, // SPANISH SPANISH.12
184
+ 0x00001042, // SWEDISH SWEDISH.12
185
+ 0x00001142, // Chinese Chinese.12
186
+ 0x00001242, // CZECH CZECH.12
187
+ 0x00000000, // GREEK
188
+ 0x47001442, // ICELANDIC ICELANDIC.12 FAROESE.1
189
+ 0x00001542, // LATVIAN LATVIAN.12
190
+ 0x00001642, // LITHUANIAN LITHUANIAN.12
191
+ 0x00001742, // ROMANIAN ROMANIAN.12
192
+ 0x00001842, // HUNGARIAN HUNGARIAN.12
193
+ 0x00001942, // ESTONIAN ESTONIAN.12
194
+ 0x00000000, // TG_UNKNOWN_LANGUAGE
195
+ 0x00000000, // Unknown
196
+ 0x00001c42, // BULGARIAN BULGARIAN.12
197
+ 0x00001d42, // CROATIAN CROATIAN.12
198
+ 0x1e001d46, // SERBIAN CROATIAN.12 SERBIAN.5
199
+ 0x00000000, // IRISH
200
+ 0x0f00203d, // GALICIAN GALICIAN.11 SPANISH.7
201
+ 0x5e00213a, // TAGALOG TAGALOG.11 SOMALI.4
202
+ 0x00002242, // TURKISH TURKISH.12
203
+ 0x00002342, // UKRAINIAN UKRAINIAN.12
204
+ 0x00000000, // HINDI
205
+ 0x1c1e25d4, // MACEDONIAN MACEDONIAN.11 SERBIAN.4 BULGARIAN.2
206
+ 0x00002642, // BENGALI BENGALI.12
207
+ 0x00002742, // INDONESIAN INDONESIAN.12
208
+ 0x00000000, // LATIN
209
+ 0x2700293c, // MALAY MALAY.11 INDONESIAN.6
210
+ 0x00000000, // MALAYALAM
211
+ 0x00000000, // WELSH
212
+ 0x00000000, // NEPALI
213
+ 0x00000000, // TELUGU
214
+ 0x00002e42, // ALBANIAN ALBANIAN.12
215
+ 0x00000000, // TAMIL
216
+ 0x00003042, // BELARUSIAN BELARUSIAN.12
217
+ 0x00000000, // JAVANESE
218
+ 0x00000000, // OCCITAN
219
+ 0x375f3330, // URDU URDU.10 UIGHUR.7 ARABIC.4
220
+ 0x41003436, // BIHARI BIHARI.10 MARATHI.10
221
+ 0x00000000, // GUJARATI
222
+ 0x0a4636b2, // THAI THAI.7 ChineseT.3 Korean.2
223
+ 0x00003742, // ARABIC ARABIC.12
224
+ 0x00003842, // CATALAN CATALAN.12
225
+ 0x00003942, // ESPERANTO ESPERANTO.12
226
+ 0x00003a42, // BASQUE BASQUE.12
227
+ 0x00000000, // INTERLINGUA
228
+ 0x00000000, // KANNADA
229
+ 0x05060cca, // PUNJABI POLISH.10 GERMAN.4 FRENCH.2 -- NOTE(review): hint omits the anchor language itself; presumably an artifact of the generated data -- confirm
230
+ 0x00000000, // SCOTS_GAELIC
231
+ 0x00003f42, // SWAHILI SWAHILI.12
232
+ 0x00004042, // SLOVENIAN SLOVENIAN.12
233
+ 0x00004142, // MARATHI MARATHI.12
234
+ 0x00004242, // MALTESE MALTESE.12
235
+ 0x00004342, // VIETNAMESE VIETNAMESE.12
236
+ 0x00000000, // FRISIAN
237
+ 0x12004543, // SLOVAK SLOVAK.12 CZECH.2
238
+ 0x00004642, // ChineseT ChineseT.12
239
+ 0x00000000, // FAROESE
240
+ 0x00000000, // SUNDANESE
241
+ 0x79004944, // UZBEK UZBEK.12 TAJIK.3
242
+ 0x4d004a46, // AMHARIC AMHARIC.12 TIGRINYA.5
243
+ 0x00004b42, // AZERBAIJANI AZERBAIJANI.12
244
+ 0x00000000, // GEORGIAN
245
+ 0x00000000, // TIGRINYA
246
+ 0x00004e42, // PERSIAN PERSIAN.12
247
+ 0x00000000, // BOSNIAN
248
+ 0x00000000, // SINHALESE
249
+ 0x00000000, // NORWEGIAN_N
250
+ 0x00000000, // PORTUGUESE_P
251
+ 0x00000000, // PORTUGUESE_B
252
+ 0x00000000, // XHOSA
253
+ 0x00000000, // ZULU
254
+ 0x00000000, // GUARANI
255
+ 0x00000000, // SESOTHO
256
+ 0x00000000, // TURKMEN
257
+ 0x7a005933, // KYRGYZ KYRGYZ.10 TATAR.7
258
+ 0x00000000, // BRETON
259
+ 0x00000000, // TWI
260
+ 0x00000000, // YIDDISH
261
+ 0x00000000, // SERBO_CROATIAN
262
+ 0x00000000, // SOMALI
263
+ 0x00005f42, // UIGHUR UIGHUR.12
264
+ 0x00006042, // KURDISH KURDISH.12
265
+ 0x00006142, // MONGOLIAN MONGOLIAN.12
266
+ 0x051130c9, // ARMENIAN BELARUSIAN.10 Chinese.3 FRENCH.1 -- NOTE(review): hint omits the anchor language itself; presumably an artifact of the generated data -- confirm
267
+ 0x020f0521, // LAOTHIAN FRENCH.8 SPANISH.7 DANISH.6 -- NOTE(review): hint omits the anchor language itself; presumably an artifact of the generated data -- confirm
268
+ 0x64004e35, // SINDHI PERSIAN.10 SINDHI.9
269
+ 0x00000000, // RHAETO_ROMANCE
270
+ 0x00006642, // AFRIKAANS AFRIKAANS.12
271
+ 0x00000000, // LUXEMBOURGISH
272
+ 0x00006842, // BURMESE BURMESE.12
273
+ 0x00002242, // KHMER TURKISH.12 -- NOTE(review): hint omits the anchor language itself; presumably an artifact of the generated data -- confirm
274
+ 0x88006a3c, // TIBETAN TIBETAN.11 DZONGKHA.6
275
+ 0x00000000, // DHIVEHI
276
+ 0x00000000, // CHEROKEE
277
+ 0x00000000, // SYRIAC
278
+ 0x00000000, // LIMBU
279
+ 0x00000000, // ORIYA
280
+ 0x00000000, // ASSAMESE
281
+ 0x00000000, // CORSICAN
282
+ 0x00000000, // INTERLINGUE
283
+ 0x00007342, // KAZAKH KAZAKH.12
284
+ 0x00000000, // LINGALA
285
+ 0x00000000, // MOLDAVIAN
286
+ 0x5f007645, // PASHTO PASHTO.12 UIGHUR.4
287
+ 0x00000000, // QUECHUA
288
+ 0x00000000, // SHONA
289
+ 0x00007942, // TAJIK TAJIK.12
290
+ 0x00000000, // TATAR
291
+ 0x00000000, // TONGA
292
+ 0x00000000, // YORUBA
293
+ 0x00000000, // CREOLES_AND_PIDGINS_ENGLISH_BASED
294
+ 0x00000000, // CREOLES_AND_PIDGINS_FRENCH_BASED
295
+ 0x00000000, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
296
+ 0x00000000, // CREOLES_AND_PIDGINS_OTHER
297
+ 0x00000000, // MAORI
298
+ 0x00000000, // WOLOF
299
+ 0x00000000, // ABKHAZIAN
300
+ 0x00000000, // AFAR
301
+ 0x00000000, // AYMARA
302
+ 0x00000000, // BASHKIR
303
+ 0x00000000, // BISLAMA
304
+ 0x00000000, // DZONGKHA
305
+ 0x00000000, // FIJIAN
306
+ 0x00000000, // GREENLANDIC
307
+ 0x00000000, // HAUSA
308
+ 0x00000000, // HAITIAN_CREOLE
309
+ 0x00000000, // INUPIAK
310
+ 0x00000542, // INUKTITUT FRENCH.12 -- NOTE(review): hint omits the anchor language itself; presumably an artifact of the generated data -- confirm
311
+ 0x00000000, // KASHMIRI
312
+ 0x00000000, // KINYARWANDA
313
+ 0x00000000, // MALAGASY
314
+ 0x00000000, // NAURU
315
+ 0x00000000, // OROMO
316
+ 0x00000000, // RUNDI
317
+ 0x00000000, // SAMOAN
318
+ 0x00000000, // SANGO
319
+ 0x344197d3, // SANSKRIT SANSKRIT.11 MARATHI.4 BIHARI.1
320
+ 0x00000000, // SISWANT
321
+ 0x00000000, // TSONGA
322
+ 0x00000000, // TSWANA
323
+ 0x00000000, // VOLAPUK
324
+ 0x00000000, // ZHUANG
325
+ 0x00000000, // KHASI
326
+ 0x00000000, // SCOTS
327
+ 0x00000000, // GANDA
328
+ 0x00000000, // MANX
329
+ 0x00000000, // MONTENEGRIN
330
+ // Add new language hints just before here (just use 0x00000000)
331
+ };
332
+
333
+ COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES, // Checks that the table has exactly NUM_LANGUAGES rows (macro and constant are defined outside this file)
334
+ kLanguageHintProbs_has_incorrect_size);
336
+
337
+ //
338
+ // Generated by dsites 2008.07.07 from 10% of Base
339
+ //
340
+
341
+ typedef struct { // One record of a keyed hint table: a fixed 4-byte key paired with a packed probability word
342
+ char key[4]; // Exactly four bytes with no room for a NUL terminator; kTLDHintProbs stores lowercase ASCII TLDs padded with '_' (0x5f), e.g. "ac__"
343
+ uint32 probs; // Three packed language probs, the same kind of word stored in kEncodingHintProbs and kLanguageHintProbs
344
+ } HintEntry;
345
+
346
+
347
+ // Massaged TLD, followed by three packed language probs
348
+ // Hand-removed 4 items dsites 2008.07.15
349
+ static const int kTLDHintProbsSize = 201;
350
+ static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = { // MaxRange 12
351
+ {{0x61,0x63,0x5f,0x5f}, 0x0a000945}, // ac__ Japanese.12 Korean.4
352
+ {{0x61,0x64,0x5f,0x5f}, 0x00003842}, // ad__ CATALAN.12
353
+ {{0x61,0x65,0x5f,0x5f}, 0x00003742}, // ae__ ARABIC.12
354
+ {{0x61,0x66,0x5f,0x5f}, 0x4e00763d}, // af__ PASHTO.11 PERSIAN.7
355
+ {{0x61,0x67,0x5f,0x5f}, 0x09000643}, // ag__ GERMAN.12 Japanese.2
356
+ {{0x61,0x69,0x5f,0x5f}, 0x0c180938}, // ai__ Japanese.11 HUNGARIAN.7 POLISH.2
357
+ {{0x61,0x6c,0x5f,0x5f}, 0x00002e42}, // al__ ALBANIAN.12
358
+ {{0x61,0x6e,0x5f,0x5f}, 0x6e00033d}, // an__ DUTCH.11 LIMBU.7
359
+ {{0x61,0x6f,0x5f,0x5f}, 0x05000d42}, // ao__ PORTUGUESE.12 FRENCH.1
360
+ {{0x61,0x71,0x5f,0x5f}, 0x05000f29}, // aq__ SPANISH.9 FRENCH.6
361
+ {{0x61,0x72,0x5f,0x5f}, 0x00000f42}, // ar__ SPANISH.12
362
+ {{0x61,0x73,0x5f,0x5f}, 0x0f120bcd}, // as__ NORWEGIAN.10 CZECH.6 SPANISH.5
363
+ {{0x61,0x74,0x5f,0x5f}, 0x00000642}, // at__ GERMAN.12
364
+ {{0x61,0x77,0x5f,0x5f}, 0x0f000345}, // aw__ DUTCH.12 SPANISH.4
365
+ {{0x61,0x78,0x5f,0x5f}, 0x00001042}, // ax__ SWEDISH.12
366
+ {{0x61,0x7a,0x5f,0x5f}, 0x00004b42}, // az__ AZERBAIJANI.12
367
+ {{0x62,0x61,0x5f,0x5f}, 0x00001d42}, // ba__ CROATIAN.12
368
+ {{0x62,0x62,0x5f,0x5f}, 0x00002842}, // bb__ LATIN.12
369
+ {{0x62,0x64,0x5f,0x5f}, 0x00002642}, // bd__ BENGALI.12
370
+ {{0x62,0x65,0x5f,0x5f}, 0x05000335}, // be__ DUTCH.10 FRENCH.9
371
+ {{0x62,0x66,0x5f,0x5f}, 0x00000542}, // bf__ FRENCH.12
372
+ {{0x62,0x67,0x5f,0x5f}, 0x00001c42}, // bg__ BULGARIAN.12
373
+ {{0x62,0x68,0x5f,0x5f}, 0x00003742}, // bh__ ARABIC.12
374
+ {{0x62,0x69,0x5f,0x5f}, 0x0f00053f}, // bi__ FRENCH.11 SPANISH.9
375
+ {{0x62,0x6a,0x5f,0x5f}, 0x00000542}, // bj__ FRENCH.12
376
+ {{0x62,0x6d,0x5f,0x5f}, 0x98043929}, // bm__ ESPERANTO.9 FINNISH.8 SISWANT.6
377
+ {{0x62,0x6e,0x5f,0x5f}, 0x00002942}, // bn__ MALAY.12
378
+ {{0x62,0x6f,0x5f,0x5f}, 0x00000f42}, // bo__ SPANISH.12
379
+ {{0x62,0x72,0x5f,0x5f}, 0x00000d42}, // br__ PORTUGUESE.12
380
+ {{0x62,0x74,0x5f,0x5f}, 0x00008842}, // bt__ DZONGKHA.12
381
+ {{0x62,0x77,0x5f,0x5f}, 0x06059ac4}, // bw__ TSWANA.9 FRENCH.6 GERMAN.5
382
+ {{0x62,0x79,0x5f,0x5f}, 0x00003024}, // by__ BELARUSIAN.9
383
+ {{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924}, // bz__ Japanese.9 Korean.5 SPANISH.1
384
+ {{0x63,0x61,0x5f,0x5f}, 0x00000542}, // ca__ FRENCH.12
385
+ {{0x63,0x61,0x74,0x5f}, 0x00003842}, // cat_ CATALAN.12
386
+ {{0x63,0x64,0x5f,0x5f}, 0x06051224}, // cd__ CZECH.9 FRENCH.5 GERMAN.1
387
+ {{0x63,0x66,0x5f,0x5f}, 0x00000542}, // cf__ FRENCH.12
388
+ {{0x63,0x67,0x5f,0x5f}, 0x00000542}, // cg__ FRENCH.12
389
+ {{0x63,0x68,0x5f,0x5f}, 0x08050638}, // ch__ GERMAN.11 FRENCH.7 ITALIAN.2
390
+ {{0x63,0x69,0x5f,0x5f}, 0x00000542}, // ci__ FRENCH.12
391
+ {{0x63,0x6c,0x5f,0x5f}, 0x00000f42}, // cl__ SPANISH.12
392
+ {{0x63,0x6d,0x5f,0x5f}, 0x00000542}, // cm__ FRENCH.12
393
+ {{0x63,0x6e,0x5f,0x5f}, 0x00001142}, // cn__ Chinese.12
394
+ {{0x63,0x6f,0x5f,0x5f}, 0x00000f42}, // co__ SPANISH.12
395
+ // {{0x63,0x6f,0x6f,0x70}, 0x0f0509cd}, // coop Japanese.10 FRENCH.6 SPANISH.5
396
+ {{0x63,0x72,0x5f,0x5f}, 0x00000f42}, // cr__ SPANISH.12
397
+ {{0x63,0x75,0x5f,0x5f}, 0x00000f42}, // cu__ SPANISH.12
398
+ {{0x63,0x76,0x5f,0x5f}, 0x00000d42}, // cv__ PORTUGUESE.12
399
+ {{0x63,0x78,0x5f,0x5f}, 0x223a091f}, // cx__ Japanese.8 BASQUE.6 TURKISH.4
400
+ {{0x63,0x79,0x5f,0x5f}, 0x150622ba}, // cy__ TURKISH.8 GERMAN.4 LATVIAN.3
401
+ {{0x63,0x7a,0x5f,0x5f}, 0x00001242}, // cz__ CZECH.12
402
+ {{0x64,0x65,0x5f,0x5f}, 0x00000642}, // de__ GERMAN.12
403
+ {{0x64,0x6b,0x5f,0x5f}, 0x00000242}, // dk__ DANISH.12
404
+ {{0x64,0x6f,0x5f,0x5f}, 0x21000f42}, // do__ SPANISH.12 TAGALOG.1
405
+ {{0x64,0x7a,0x5f,0x5f}, 0x37000535}, // dz__ FRENCH.10 ARABIC.9
406
+ {{0x65,0x63,0x5f,0x5f}, 0x00000f42}, // ec__ SPANISH.12
407
+ // {{0x65,0x64,0x75,0x5f}, 0x2e0f3873}, // edu_ CATALAN.9 SPANISH.7 ALBANIAN.2
408
+ {{0x65,0x65,0x5f,0x5f}, 0x00001942}, // ee__ ESTONIAN.12
409
+ {{0x65,0x67,0x5f,0x5f}, 0x05003742}, // eg__ ARABIC.12 FRENCH.1
410
+ {{0x65,0x72,0x5f,0x5f}, 0x00000b42}, // er__ NORWEGIAN.12
411
+ {{0x65,0x73,0x5f,0x5f}, 0x38200fd4}, // es__ SPANISH.11 GALICIAN.4 CATALAN.2
412
+ {{0x65,0x74,0x5f,0x5f}, 0x39004a39}, // et__ AMHARIC.11 ESPERANTO.3
413
+ {{0x66,0x69,0x5f,0x5f}, 0x10000444}, // fi__ FINNISH.12 SWEDISH.3
414
+ {{0x66,0x6a,0x5f,0x5f}, 0x050489e0}, // fj__ FIJIAN.12 FINNISH.5 FRENCH.3
415
+ {{0x66,0x6f,0x5f,0x5f}, 0x00004742}, // fo__ FAROESE.12
416
+ {{0x66,0x72,0x5f,0x5f}, 0x00000542}, // fr__ FRENCH.12
417
+ {{0x67,0x61,0x5f,0x5f}, 0x00000542}, // ga__ FRENCH.12
418
+ {{0x67,0x64,0x5f,0x5f}, 0x061d05d5}, // gd__ FRENCH.11 CROATIAN.5 GERMAN.3
419
+ {{0x67,0x65,0x5f,0x5f}, 0x00004c2d}, // ge__ GEORGIAN.10
420
+ {{0x67,0x66,0x5f,0x5f}, 0x00000542}, // gf__ FRENCH.12
421
+ {{0x67,0x67,0x5f,0x5f}, 0x06002244}, // gg__ TURKISH.12 GERMAN.3
422
+ {{0x67,0x68,0x5f,0x5f}, 0x05000436}, // gh__ FINNISH.10 FRENCH.10
423
+ {{0x67,0x69,0x5f,0x5f}, 0x0f0538ce}, // gi__ CATALAN.10 FRENCH.7 SPANISH.6
424
+ {{0x67,0x6c,0x5f,0x5f}, 0x398a0238}, // gl__ DANISH.11 GREENLANDIC.7 ESPERANTO.2
425
+ {{0x67,0x6d,0x5f,0x5f}, 0x0600043e}, // gm__ FINNISH.11 GERMAN.8
426
+ {{0x67,0x6e,0x5f,0x5f}, 0x00000542}, // gn__ FRENCH.12
427
+ // {{0x67,0x6f,0x76,0x5f}, 0x05000f25}, // gov_ SPANISH.9 FRENCH.2
428
+ {{0x67,0x70,0x5f,0x5f}, 0x00000542}, // gp__ FRENCH.12
429
+ {{0x67,0x71,0x5f,0x5f}, 0x0f000547}, // gq__ FRENCH.12 SPANISH.6
430
+ {{0x67,0x73,0x5f,0x5f}, 0x00000942}, // gs__ Japanese.12
431
+ {{0x67,0x74,0x5f,0x5f}, 0x00000f42}, // gt__ SPANISH.12
432
+ {{0x68,0x6b,0x5f,0x5f}, 0x11004643}, // hk__ ChineseT.12 Chinese.2
433
+ {{0x68,0x6d,0x5f,0x5f}, 0x4606092e}, // hm__ Japanese.10 GERMAN.6 ChineseT.2
434
+ {{0x68,0x6e,0x5f,0x5f}, 0x00000f42}, // hn__ SPANISH.12
435
+ {{0x68,0x72,0x5f,0x5f}, 0x00001d42}, // hr__ CROATIAN.12
436
+ {{0x68,0x74,0x5f,0x5f}, 0x0f000542}, // ht__ FRENCH.12 SPANISH.1
437
+ {{0x68,0x75,0x5f,0x5f}, 0x00001842}, // hu__ HUNGARIAN.12
438
+ {{0x69,0x64,0x5f,0x5f}, 0x00002742}, // id__ INDONESIAN.12
439
+ {{0x69,0x65,0x5f,0x5f}, 0x050c1f24}, // ie__ IRISH.9 POLISH.5 FRENCH.1
440
+ {{0x69,0x6c,0x5f,0x5f}, 0x00000742}, // il__ HEBREW.12
441
+ {{0x69,0x6e,0x74,0x5f}, 0x0f060574}, // int_ FRENCH.9 GERMAN.8 SPANISH.3
442
+ {{0x69,0x6f,0x5f,0x5f}, 0x11090fd5}, // io__ SPANISH.11 Japanese.5 Chinese.3
443
+ {{0x69,0x71,0x5f,0x5f}, 0x60003744}, // iq__ ARABIC.12 KURDISH.3
444
+ {{0x69,0x72,0x5f,0x5f}, 0x00004e42}, // ir__ PERSIAN.12
445
+ {{0x69,0x73,0x5f,0x5f}, 0x00001442}, // is__ ICELANDIC.12
446
+ {{0x69,0x74,0x5f,0x5f}, 0x00000842}, // it__ ITALIAN.12
447
+ {{0x6a,0x65,0x5f,0x5f}, 0x29050328}, // je__ DUTCH.9 FRENCH.7 MALAY.5
448
+ {{0x6a,0x6d,0x5f,0x5f}, 0x040f0576}, // jm__ FRENCH.9 SPANISH.8 FINNISH.5
449
+ {{0x6a,0x6f,0x5f,0x5f}, 0x00003742}, // jo__ ARABIC.12
450
+ // {{0x6a,0x6f,0x62,0x73}, 0x0f060329}, // jobs DUTCH.9 GERMAN.8 SPANISH.6
451
+ {{0x6a,0x70,0x5f,0x5f}, 0x00000942}, // jp__ Japanese.12
452
+ {{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3}, // ke__ SWAHILI.9 SPANISH.5 FINNISH.4
453
+ {{0x6b,0x69,0x5f,0x5f}, 0x04000643}, // ki__ GERMAN.12 FINNISH.2
454
+ {{0x6b,0x6d,0x5f,0x5f}, 0x00000542}, // km__ FRENCH.12
455
+ {{0x6b,0x70,0x5f,0x5f}, 0x00000a42}, // kp__ Korean.12
456
+ {{0x6b,0x72,0x5f,0x5f}, 0x00000a42}, // kr__ Korean.12
457
+ {{0x6b,0x77,0x5f,0x5f}, 0x00003742}, // kw__ ARABIC.12
458
+ {{0x6b,0x79,0x5f,0x5f}, 0x0500083f}, // ky__ ITALIAN.11 FRENCH.9
459
+ {{0x6b,0x7a,0x5f,0x5f}, 0x0000732d}, // kz__ KAZAKH.10
460
+ {{0x6c,0x62,0x5f,0x5f}, 0x05003747}, // lb__ ARABIC.12 FRENCH.6
461
+ {{0x6c,0x63,0x5f,0x5f}, 0x09000645}, // lc__ GERMAN.12 Japanese.4
462
+ {{0x6c,0x69,0x5f,0x5f}, 0x1600063d}, // li__ GERMAN.11 LITHUANIAN.7
463
+ {{0x6c,0x73,0x5f,0x5f}, 0x00005742}, // ls__ SESOTHO.12
464
+ {{0x6c,0x74,0x5f,0x5f}, 0x00001642}, // lt__ LITHUANIAN.12
465
+ {{0x6c,0x75,0x5f,0x5f}, 0x0600053d}, // lu__ FRENCH.11 GERMAN.7
466
+ {{0x6c,0x76,0x5f,0x5f}, 0x00001542}, // lv__ LATVIAN.12
467
+ {{0x6c,0x79,0x5f,0x5f}, 0x05003744}, // ly__ ARABIC.12 FRENCH.3
468
+ {{0x6d,0x61,0x5f,0x5f}, 0x3700053d}, // ma__ FRENCH.11 ARABIC.7
469
+ {{0x6d,0x63,0x5f,0x5f}, 0x00000542}, // mc__ FRENCH.12
470
+ {{0x6d,0x64,0x5f,0x5f}, 0x00001724}, // md__ ROMANIAN.9
471
+ {{0x6d,0x65,0x5f,0x5f}, 0x00001d42}, // me__ CROATIAN.12
472
+ {{0x6d,0x67,0x5f,0x5f}, 0x00000542}, // mg__ FRENCH.12
473
+ {{0x6d,0x6b,0x5f,0x5f}, 0x1c002543}, // mk__ MACEDONIAN.12 BULGARIAN.2
474
+ {{0x6d,0x6c,0x5f,0x5f}, 0x00000542}, // ml__ FRENCH.12
475
+ {{0x6d,0x6e,0x5f,0x5f}, 0x00006142}, // mn__ MONGOLIAN.12
476
+ {{0x6d,0x6f,0x5f,0x5f}, 0x110d4631}, // mo__ ChineseT.10 PORTUGUESE.8 Chinese.5
477
+ {{0x6d,0x71,0x5f,0x5f}, 0x00000542}, // mq__ FRENCH.12
478
+ {{0x6d,0x72,0x5f,0x5f}, 0x37000535}, // mr__ FRENCH.10 ARABIC.9
479
+ {{0x6d,0x73,0x5f,0x5f}, 0x090f06d5}, // ms__ GERMAN.11 SPANISH.5 Japanese.3
480
+ {{0x6d,0x74,0x5f,0x5f}, 0x00004242}, // mt__ MALTESE.12
481
+ {{0x6d,0x75,0x5f,0x5f}, 0x05000934}, // mu__ Japanese.10 FRENCH.8
482
+ {{0x6d,0x76,0x5f,0x5f}, 0x28000436}, // mv__ FINNISH.10 LATIN.10
483
+ {{0x6d,0x77,0x5f,0x5f}, 0x0611092a}, // mw__ Japanese.9 Chinese.8 GERMAN.7
484
+ {{0x6d,0x78,0x5f,0x5f}, 0x00000f42}, // mx__ SPANISH.12
485
+ {{0x6d,0x79,0x5f,0x5f}, 0x00002942}, // my__ MALAY.12
486
+ {{0x6d,0x7a,0x5f,0x5f}, 0x00000d42}, // mz__ PORTUGUESE.12
487
+ {{0x6e,0x61,0x5f,0x5f}, 0x06006644}, // na__ AFRIKAANS.12 GERMAN.3
488
+ {{0x6e,0x63,0x5f,0x5f}, 0x00000542}, // nc__ FRENCH.12
489
+ {{0x6e,0x65,0x5f,0x5f}, 0x8b000542}, // ne__ FRENCH.12 HAUSA.1
490
+ {{0x6e,0x66,0x5f,0x5f}, 0x00000542}, // nf__ FRENCH.12
491
+ {{0x6e,0x69,0x5f,0x5f}, 0x00000f42}, // ni__ SPANISH.12
492
+ {{0x6e,0x6c,0x5f,0x5f}, 0x00000342}, // nl__ DUTCH.12
493
+ {{0x6e,0x6f,0x5f,0x5f}, 0x51000b43}, // no__ NORWEGIAN.12 NORWEGIAN_N.2
494
+ {{0x6e,0x75,0x5f,0x5f}, 0x0300103b}, // nu__ SWEDISH.11 DUTCH.5
495
+ {{0x6f,0x6d,0x5f,0x5f}, 0x00003742}, // om__ ARABIC.12
496
+ {{0x70,0x61,0x5f,0x5f}, 0x00000f42}, // pa__ SPANISH.12
497
+ {{0x70,0x65,0x5f,0x5f}, 0x00000f42}, // pe__ SPANISH.12
498
+ {{0x70,0x66,0x5f,0x5f}, 0x00000542}, // pf__ FRENCH.12
499
+ {{0x70,0x67,0x5f,0x5f}, 0x00000f24}, // pg__ SPANISH.9
500
+ {{0x70,0x68,0x5f,0x5f}, 0x00002142}, // ph__ TAGALOG.12
501
+ {{0x70,0x6b,0x5f,0x5f}, 0x00003342}, // pk__ URDU.12
502
+ {{0x70,0x6c,0x5f,0x5f}, 0x30000c42}, // pl__ POLISH.12 BELARUSIAN.1
503
+ {{0x70,0x6e,0x5f,0x5f}, 0x04000644}, // pn__ GERMAN.12 FINNISH.3
504
+ {{0x70,0x72,0x5f,0x5f}, 0x00000f42}, // pr__ SPANISH.12
505
+ {{0x70,0x72,0x6f,0x5f}, 0x46050fd5}, // pro_ SPANISH.11 FRENCH.5 ChineseT.3
506
+ {{0x70,0x73,0x5f,0x5f}, 0x00003742}, // ps__ ARABIC.12
507
+ {{0x70,0x74,0x5f,0x5f}, 0x00000d42}, // pt__ PORTUGUESE.12
508
+ {{0x70,0x79,0x5f,0x5f}, 0x00000f42}, // py__ SPANISH.12
509
+ {{0x71,0x61,0x5f,0x5f}, 0x00003742}, // qa__ ARABIC.12
510
+ {{0x72,0x65,0x5f,0x5f}, 0x00000542}, // re__ FRENCH.12
511
+ {{0x72,0x6f,0x5f,0x5f}, 0x00001742}, // ro__ ROMANIAN.12
512
+ {{0x72,0x73,0x5f,0x5f}, 0x00001d42}, // rs__ CROATIAN.12
513
+ {{0x72,0x77,0x5f,0x5f}, 0x9000053e}, // rw__ FRENCH.11 KINYARWANDA.8
514
+ {{0x73,0x61,0x5f,0x5f}, 0x00003742}, // sa__ ARABIC.12
515
+ {{0x73,0x62,0x5f,0x5f}, 0x00000442}, // sb__ FINNISH.12
516
+ {{0x73,0x63,0x5f,0x5f}, 0x060f092f}, // sc__ Japanese.10 SPANISH.7 GERMAN.3
517
+ {{0x73,0x64,0x5f,0x5f}, 0x00003742}, // sd__ ARABIC.12
518
+ {{0x73,0x65,0x5f,0x5f}, 0x00001042}, // se__ SWEDISH.12
519
+ {{0x73,0x69,0x5f,0x5f}, 0x00004042}, // si__ SLOVENIAN.12
520
+ {{0x73,0x6b,0x5f,0x5f}, 0x12004543}, // sk__ SLOVAK.12 CZECH.2
521
+ {{0x73,0x6d,0x5f,0x5f}, 0x00000842}, // sm__ ITALIAN.12
522
+ {{0x73,0x6e,0x5f,0x5f}, 0x00000542}, // sn__ FRENCH.12
523
+ {{0x73,0x72,0x5f,0x5f}, 0x03001e44}, // sr__ SERBIAN.12 DUTCH.3
524
+ {{0x73,0x76,0x5f,0x5f}, 0x00000f42}, // sv__ SPANISH.12
525
+ {{0x73,0x79,0x5f,0x5f}, 0x00003742}, // sy__ ARABIC.12
526
+ {{0x74,0x63,0x5f,0x5f}, 0x0a2206cd}, // tc__ GERMAN.10 TURKISH.6 Korean.5
527
+ {{0x74,0x66,0x5f,0x5f}, 0x00000642}, // tf__ GERMAN.12
528
+ {{0x74,0x67,0x5f,0x5f}, 0x00000542}, // tg__ FRENCH.12
529
+ {{0x74,0x68,0x5f,0x5f}, 0x9e0936c9}, // th__ THAI.10 Japanese.3 SCOTS.1
530
+ {{0x74,0x6a,0x5f,0x5f}, 0x00007924}, // tj__ TAJIK.9
531
+ {{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd}, // tl__ PORTUGUESE.10 SPANISH.6 GERMAN.5
532
+ {{0x74,0x6e,0x5f,0x5f}, 0x3700053e}, // tn__ FRENCH.11 ARABIC.8
533
+ {{0x74,0x6f,0x5f,0x5f}, 0x064609c5}, // to__ Japanese.9 ChineseT.7 GERMAN.6
534
+ {{0x74,0x70,0x5f,0x5f}, 0x06000944}, // tp__ Japanese.12 GERMAN.3
535
+ {{0x74,0x72,0x5f,0x5f}, 0x00002242}, // tr__ TURKISH.12
536
+ {{0x74,0x72,0x61,0x76}, 0x064509c3}, // trav Japanese.9 SLOVAK.5 GERMAN.4
537
+ {{0x74,0x74,0x5f,0x5f}, 0x0f00063e}, // tt__ GERMAN.11 SPANISH.8
538
+ {{0x74,0x77,0x5f,0x5f}, 0x00004642}, // tw__ ChineseT.12
539
+ {{0x74,0x7a,0x5f,0x5f}, 0x00003f42}, // tz__ SWAHILI.12
540
+ {{0x75,0x61,0x5f,0x5f}, 0x0000232d}, // ua__ UKRAINIAN.10
541
+ {{0x75,0x79,0x5f,0x5f}, 0x00000f42}, // uy__ SPANISH.12
542
+ {{0x75,0x7a,0x5f,0x5f}, 0x0000492d}, // uz__ UZBEK.10
543
+ {{0x76,0x61,0x5f,0x5f}, 0x060f0828}, // va__ ITALIAN.9 SPANISH.7 GERMAN.5
544
+ {{0x76,0x63,0x5f,0x5f}, 0x0d000939}, // vc__ Japanese.11 PORTUGUESE.3
545
+ {{0x76,0x65,0x5f,0x5f}, 0x00000f42}, // ve__ SPANISH.12
546
+ {{0x76,0x67,0x5f,0x5f}, 0x09000f43}, // vg__ SPANISH.12 Japanese.2
547
+ {{0x76,0x69,0x5f,0x5f}, 0x00002942}, // vi__ MALAY.12
548
+ {{0x76,0x6e,0x5f,0x5f}, 0x00004342}, // vn__ VIETNAMESE.12
549
+ {{0x76,0x75,0x5f,0x5f}, 0x00000642}, // vu__ GERMAN.12
550
+ {{0x77,0x73,0x5f,0x5f}, 0x4b0f0624}, // ws__ GERMAN.9 SPANISH.5 AZERBAIJANI.1
551
+ {{0x79,0x65,0x5f,0x5f}, 0x00003742}, // ye__ ARABIC.12
552
+ {{0x79,0x75,0x5f,0x5f}, 0x1e001d3d}, // yu__ CROATIAN.11 SERBIAN.7
553
+ {{0x7a,0x61,0x5f,0x5f}, 0x00006642}, // za__ AFRIKAANS.12
554
+ {{0x7a,0x6d,0x5f,0x5f}, 0x0b000435}, // zm__ FINNISH.10 NORWEGIAN.9
555
+ {{0x7a,0x77,0x5f,0x5f}, 0x3f00783e}, // zw__ SHONA.11 SWAHILI.8
556
+ };
557
+
558
+
559
+ // Statistically closest language, based on quadgram table
560
+ // Those that are far from other languages map to UNKNOWN_LANGUAGE
561
+ // Subscripted by Language
562
+ //
563
+ // From lang_correlation.txt and hand-edits
564
+ // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
565
+ // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
566
+ // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
567
+ //
568
+ static const int kMinCorrPercent = 24; // Pick off how close you want: minimum correlation percent a table row needs in order to name an alternative language
569
+ // 24 catches PERSIAN <== ARABIC (row value 24)
570
+ // but not SPANISH <== PORTUGUESE (row value 23)
571
+ static Language Unknown = UNKNOWN_LANGUAGE; // Placeholder named by the table rows whose correlation is 0
572
+
573
+ // Subscripted by Language: each row yields its named alternative language when the row's correlation percent is >= kMinCorrPercent, otherwise UNKNOWN_LANGUAGE. The trailing comment on each row is the Language that indexes it.
574
+ static const Language kClosestAltLanguage[] = {
575
+ (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH
576
+ (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH
577
+ (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH
578
+ (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH
579
+ (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH
580
+ (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN
581
+ (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW
582
+ (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN
583
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese
584
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean
585
+ (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN
586
+ ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH
587
+ (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE
588
+ (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN
589
+ (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH
590
+ (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH
591
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese
592
+ (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH
593
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK
594
+ (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC
595
+ ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN
596
+ ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN
597
+ ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN
598
+ ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN
599
+ (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN
600
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore
601
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown
602
+ (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN
603
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN
604
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN
605
+ (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH
606
+ (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN
607
+ ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG
608
+ (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH
609
+ (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN
610
+ (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI
611
+ (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN
612
+ (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI
613
+ (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN
614
+ ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN
615
+ (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY
616
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM
617
+ ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH
618
+ ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI
619
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU
620
+ ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN
621
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL
622
+ (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN
623
+ (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE
624
+ (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN
625
+ (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU
626
+ (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI
627
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI
628
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI
629
+ (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC
630
+ (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN
631
+ ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO
632
+ ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE
633
+ ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA
634
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA
635
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI
636
+ (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC
637
+ ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI
638
+ (28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE, // SLOVENIAN
639
+ (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI
640
+ ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE
641
+ ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE
642
+ (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN
643
+ (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK
644
+ // Hand-edited row follows; original was: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
645
+ (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT
646
+ (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE
647
+ (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE
648
+ (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK
649
+ ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC
650
+ (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI
651
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN
652
+ ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA
653
+ (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN
654
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN
655
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE
656
+ (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N
657
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P
658
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B
659
+ (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA
660
+ (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU
661
+ ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI
662
+ (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO
663
+ ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN
664
+ ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ
665
+ ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON
666
+ ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI
667
+ (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH
668
+ (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN
669
+ (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI
670
+ ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR
671
+ (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH
672
+ ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN
673
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN
674
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN
675
+ ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI
676
+ (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE
677
+ (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS
678
+ (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH
679
+ ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE
680
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER
681
+ (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN
682
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI
683
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE
684
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC
685
+ ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU
686
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA
687
+ (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE
688
+ (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN
689
+ ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE
690
+ ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH
691
+ ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA
692
+ (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN
693
+ (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO
694
+ ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA
695
+ ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA
696
+ (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK
697
+ (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR
698
+ (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA
699
+ ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA
700
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED
701
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED
702
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
703
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER
704
+ ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI
705
+ ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF
706
+ ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN
707
+ ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR
708
+ ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA
709
+ (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR
710
+ ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA
711
+ (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA
712
+ ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN
713
+ ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC
714
+ ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA
715
+ ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE
716
+ ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK
717
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT
718
+ ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI
719
+ (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA
720
+ ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY
721
+ (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU
722
+ (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO
723
+ (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI
724
+ (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN
725
+ ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO
726
+ (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT
727
+ (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT
728
+ ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA
729
+ (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA
730
+ ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK
731
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG
732
+ ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI
733
+ (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS
734
+ (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA
735
+ ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX
736
+ ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN
737
+ };
738
+
739
+ COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
740
+ kClosestAltLanguage_has_incorrect_size);
741
+
742
+
743
+ inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
744
+ inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
745
+ inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
746
+ inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
747
+ inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
748
+ inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
749
+ inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
750
+
751
+
752
+
753
+
754
+ //------------------------------------------------------------------------------
755
+ // State for --cld_html debugging output. These are mutable file-level statics, so not thread safe
756
+ //------------------------------------------------------------------------------
757
+ static Language prior_lang = UNKNOWN_LANGUAGE; // Starts out as UNKNOWN_LANGUAGE; presumably the language shown by the previous debug line -- TODO confirm against its users
758
+ static bool prior_unreliable = false; // Starts out false; presumably the reliability flag of the previous debug line -- TODO confirm against its users
759
+
760
+ //------------------------------------------------------------------------------
761
+ // End of state for --cld_html debugging output
762
+ //------------------------------------------------------------------------------
763
+
764
+
765
+ // Backscan to word boundary, returning how many bytes n to go back
766
+ // so that src - n is non-space ans src - n - 1 is space.
767
+ // If not found in kMaxSpaceScan bytes, return 0
768
+ int BackscanToSpace(const char* src, int limit) {
769
+ int n = 0;
770
+ limit = cld::minint(limit, kMaxSpaceScan);
771
+ while (n < limit) {
772
+ if (src[-n - 1] == ' ') {return n;} // We are at _X
773
+ ++n;
774
+ }
775
+ return 0;
776
+ }
777
+
778
+ // Forwardscan to word boundary, returning how many bytes n to go forward
779
+ // so that src + n is non-space ans src + n - 1 is space.
780
+ // If not found in kMaxSpaceScan bytes, return 0
781
+ int ForwardscanToSpace(const char* src, int limit) {
782
+ int n = 0;
783
+ limit = cld::minint(limit, kMaxSpaceScan);
784
+ while (n < limit) {
785
+ if (src[n] == ' ') {return n + 1;} // We are at _X
786
+ ++n;
787
+ }
788
+ return 0;
789
+ }
790
+
791
+
792
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compresses highly
// all the time when done with a byte-based count.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
//
//   isrc, srclen  text to scan
//   hash          in/out: running 12-bit hash, updated on return
//   tbl           in/out: prediction table indexed by the hash
//
// Returns the number of *bytes* correctly predicted: each correctly-predicted
// character adds its own encoded length (1..4) to the result.
//
// NOTE: Overruns by up to three bytes when the last lead byte announces more
// bytes than remain. Not a problem with valid UTF-8 text.
//
int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
  int p_count = 0;
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + srclen;
  int local_hash = *hash;

  while (src < srclimit) {
    // Assemble the character in unsigned arithmetic: shifting a lead byte of
    // 0xF0..0xFF left by 24 overflows a signed int.
    unsigned int bits = src[0];
    int incr = 1;

    // Pick up one char and length
    if (bits < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((bits & 0xe0) == 0xc0) {
      // Two-byte
      bits = (bits << 8) | src[1];
      incr = 2;
    } else if ((bits & 0xf0) == 0xe0) {
      // Three-byte
      bits = (bits << 16) | (src[1] << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      bits = (bits << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
      incr = 4;
    }
    src += incr;

    // Same bit pattern the table has always stored, now as an int
    const int c = static_cast<int>(bits);

    const int p = tbl[local_hash];      // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      p_count += incr;                  // Count bytes of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }

  *hash = local_hash;
  return p_count;
}
844
+
845
+
846
+
847
// Counts the spaces in src, four bytes per step, which is a little faster
// than one at a time. Only whole groups of four are examined: the final
// src_len % 4 bytes are not counted.
int CountSpaces4(const char* src, int src_len) {
  const int whole_len = src_len & ~3;   // src_len with its low two bits cleared
  int spaces = 0;
  int pos = 0;
  while (pos < whole_len) {
    const char* quad = src + pos;
    spaces += (quad[0] == ' ');
    spaces += (quad[1] == ' ');
    spaces += (quad[2] == ' ');
    spaces += (quad[3] == ' ');
    pos += 4;
  }
  return spaces;
}
859
+
860
+ // Remove words of text that have more than half their letters predicted
861
+ // correctly by our cheap predictor, moving the remaining words in-place
862
+ // to the front of the input buffer.
863
+ //
864
+ // To allow running prediction across multiple chunks, caller passes in current
865
+ // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
866
+ //
867
+ // Return the new, possibly-shorter length
868
+ //
869
+ // Result Buffer ALWAYS has leading space and trailing space space space NUL,
870
+ // if input does
871
+ //
872
+ int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) {
873
+ const uint8* src = reinterpret_cast<const uint8*>(isrc);
874
+ const uint8* srclimit = src + srclen;
875
+ char* dst = isrc;
876
+ int local_hash = *hash;
877
+ char* word_dst = dst; // Start of next word
878
+ int good_predict_bytes = 0;
879
+ int word_length_bytes = 0;
880
+
881
+ while (src < srclimit) {
882
+ int c = src[0];
883
+ int incr = 1;
884
+ *dst++ = c;
885
+
886
+ if (c == ' ') {
887
+ if ((good_predict_bytes * 2) > word_length_bytes) {
888
+ // Word is well-predicted: backup to start of this word
889
+ dst = word_dst;
890
+ if (FLAGS_cld_showme) {
891
+ // Mark the deletion point with period
892
+ // Don't repeat multiple periods
893
+ // Cannot mark with more bytes or may overwrite unseen input
894
+ if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
895
+ *dst++ = '.';
896
+ *dst++ = ' ';
897
+ }
898
+ }
899
+ }
900
+ word_dst = dst; // Start of next word
901
+ good_predict_bytes = 0;
902
+ word_length_bytes = 0;
903
+ }
904
+
905
+ // Pick up one char and length
906
+ if (c < 0xc0) {
907
+ // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
908
+ // Do nothing more
909
+ } else if ((c & 0xe0) == 0xc0) {
910
+ // Two-byte
911
+ *dst++ = src[1];
912
+ c = (c << 8) | src[1];
913
+ incr = 2;
914
+ } else if ((c & 0xf0) == 0xe0) {
915
+ // Three-byte
916
+ *dst++ = src[1];
917
+ *dst++ = src[2];
918
+ c = (c << 16) | (src[1] << 8) | src[2];
919
+ incr = 3;
920
+ } else {
921
+ // Four-byte
922
+ *dst++ = src[1];
923
+ *dst++ = src[2];
924
+ *dst++ = src[3];
925
+ c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
926
+ incr = 4;
927
+ }
928
+ src += incr;
929
+ word_length_bytes += incr;
930
+
931
+ int p = tbl[local_hash]; // Prediction
932
+ tbl[local_hash] = c; // Update prediction
933
+ if (c == p) {
934
+ good_predict_bytes += incr; // Count good predictions
935
+ }
936
+
937
+ local_hash = ((local_hash << 4) ^ c) & 0xfff;
938
+ }
939
+
940
+ *hash = local_hash;
941
+
942
+ if ((dst - isrc) < (srclen - 3)) {
943
+ // Pad and make last char clean UTF-8 by putting following spaces
944
+ dst[0] = ' ';
945
+ dst[1] = ' ';
946
+ dst[2] = ' ';
947
+ dst[3] = '\0';
948
+ } else if ((dst - isrc) < srclen) {
949
+ // Make last char clean UTF-8 by putting following space off the end
950
+ dst[0] = ' ';
951
+ }
952
+
953
+ return static_cast<int>(dst - isrc);
954
+ }
955
+
956
+
957
+ // Remove portions of text that have a high density of spaces, or that are
958
+ // overly repetitive, squeezing the remaining text in-place to the front of the
959
+ // input buffer.
960
+ //
961
+ // Squeezing looks at density of space/prediced chars in fixed-size chunks,
962
+ // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
963
+ //
964
+ // Return the new, possibly-shorter length
965
+ //
966
+ // Result Buffer ALWAYS has leading space and trailing space space space NUL,
967
+ // if input does
968
+ //
969
+ int CompactLangDetImpl::CheapSqueezeInplace(char* isrc,
970
+ int srclen,
971
+ int ichunksize) {
972
+ char* src = isrc;
973
+ char* dst = src;
974
+ char* srclimit = src + srclen;
975
+ bool skipping = false;
976
+
977
+ int hash = 0;
978
+ // Allocate local prediction table.
979
+ int* predict_tbl = new int[kPredictionTableSize];
980
+ memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
981
+
982
+ int chunksize = ichunksize;
983
+ if (chunksize == 0) {chunksize = kChunksizeDefault;}
984
+ int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
985
+ int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
986
+
987
+ while (src < srclimit) {
988
+ int remaining_bytes = srclimit - src;
989
+ int len = cld::minint(chunksize, remaining_bytes);
990
+ int space_n = CountSpaces4(src, len);
991
+ int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
992
+ if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
993
+ // Skip the text
994
+ if (!skipping) {
995
+ // Keeping-to-skipping transition; do it at a space
996
+ int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
997
+ dst -= n;
998
+ skipping = true;
999
+ if (FLAGS_cld_showme) {
1000
+ // Mark the deletion point with black square U+25A0
1001
+ *dst++ = 0xe2;
1002
+ *dst++ = 0x96;
1003
+ *dst++ = 0xa0;
1004
+ *dst++ = ' ';
1005
+ }
1006
+ if (dst == isrc) {
1007
+ // Force a leading space if the first chunk is deleted
1008
+ *dst++ = ' ';
1009
+ }
1010
+ }
1011
+ } else {
1012
+ // Keep the text
1013
+ if (skipping) {
1014
+ // Skipping-to-keeping transition; do it at a space
1015
+ int n = ForwardscanToSpace(src, len);
1016
+ src += n;
1017
+ remaining_bytes -= n; // Shrink remaining length
1018
+ len -= n;
1019
+ skipping = false;
1020
+ }
1021
+ // "len" can be negative in some cases
1022
+ if (len > 0) {
1023
+ memcpy(dst, src, len);
1024
+ dst += len;
1025
+ }
1026
+ }
1027
+ src += len;
1028
+ }
1029
+
1030
+ if ((dst - isrc) < (srclen - 3)) {
1031
+ // Pad and make last char clean UTF-8 by putting following spaces
1032
+ dst[0] = ' ';
1033
+ dst[1] = ' ';
1034
+ dst[2] = ' ';
1035
+ dst[3] = '\0';
1036
+ } else if ((dst - isrc) < srclen) {
1037
+ // Make last char clean UTF-8 by putting following space off the end
1038
+ dst[0] = ' ';
1039
+ }
1040
+
1041
+ // Deallocate local prediction table
1042
+ delete[] predict_tbl;
1043
+ return static_cast<int>(dst - isrc);
1044
+ }
1045
+
1046
+ // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
1047
+ // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
1048
+ // Just CountSpaces is about 340 MB/sec
1049
+ // Byte-only CountPredictedBytes is about 150 MB/sec
1050
+ // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
1051
+ // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
1052
+ // Unjammed byte-only both = 170 MB/sec
1053
+ // Jammed byte-only both = 120 MB/sec
1054
+ // Back to original w/slight updates, 110 MB/sec
1055
+ //
1056
+ bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) {
1057
+ // Don't trigger at all on short text
1058
+ if (srclen < testsize) {return false;}
1059
+ int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
1060
+ int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
1061
+ int hash = 0;
1062
+ // Allocate local prediction table.
1063
+ int* predict_tbl = new int[kPredictionTableSize];
1064
+ memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1065
+
1066
+ bool retval = false;
1067
+ if ((CountSpaces4(src, testsize) >= space_thresh) ||
1068
+ (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
1069
+ predict_thresh)) {
1070
+ retval = true;
1071
+ }
1072
+ // Deallocate local prediction table
1073
+ delete[] predict_tbl;
1074
+ return retval;
1075
+ }
1076
+
1077
+
1078
+
1079
+ // Close pairs (correlation) language_enum/language_enum [pair number stored in kClosePair]
1080
+ // id/ms (0.47) 38/40 [1]
1081
+ // bo/dz (0.46) 105/135 [2]
1082
+ // cz/sk (0.43) 17/68 [3]
1083
+ // no/nn (0.42) 10/80 [4]
1084
+ // hi/mr (0.38) 35/64 [5]
1085
+ // xh/zu (0.37) 83/84 [6]
1086
+ // Subscripted by packed language, gives 0 or a subscript in closepair
1087
+ // scoring array inside doc_tote. Both members of a pair carry the same nonzero number; in the rows below each language listed above sits at index language_enum + 1.
1088
+ static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = {
1089
+ 0,
1090
+ 0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1091
+ 0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1092
+ 5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0,
1093
+ 0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1094
+ 0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1095
+ // Add new language close-pair number just before here (just use 0)
1096
+ };
1097
+
1098
+
1099
+ // Delete any extended languages from doc_tote
1100
+ void RemoveExtendedLanguages(ToteWithReliability* doc_tote) {
1101
+ for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1102
+ if (cld::UnpackLanguage(doc_tote->Key(sub)) >= NUM_LANGUAGES) {
1103
+ // Effectively remove the extended language by setting key&score to zero
1104
+ if (FLAGS_dbgscore) {
1105
+ fprintf(stderr, "{-%s} ",
1106
+ ExtLanguageCode(cld::UnpackLanguage(doc_tote->Key(sub))));
1107
+ }
1108
+
1109
+ // Delete entry
1110
+ doc_tote->SetKey(sub, 0);
1111
+ doc_tote->SetValue(sub, 0);
1112
+ doc_tote->SetReliability(sub, 0);
1113
+ }
1114
+ }
1115
+ }
1116
+
1117
+ static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this
1118
+
1119
+ // For Tier3 languages, require a minimum number of bytes to be first-place lang
1120
+ static const int kGoodFirstT3MinBytes = 24; // <this => no first
1121
+
1122
+ // Move bytes for unreliable langs to another lang or UNKNOWN
1123
+ // doc_tote is sorted, so cannot Add
1124
+ //
1125
+ // If both CHINESE and CHINESET are present and unreliable, do not delete both;
1126
+ // merge both into CHINESE.
1127
+ //
1128
+ //dsites 2009.03.19
1129
+ // we also want to remove Tier3 languages as the first lang if there is very
1130
+ // little text like ej1 ej2 ej3 ej4
1131
+ // maybe fold this back in earlier
1132
+ //
1133
+ void RemoveUnreliableLanguages(ToteWithReliability* doc_tote, bool do_remove_weak_matches) {
1134
+ // Prepass to merge some low-reliablility languages
1135
+ int total_bytes = 0;
1136
+ for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1137
+ int plang = doc_tote->Key(sub);
1138
+ if (plang == 0) {continue;} // Empty slot
1139
+
1140
+ Language lang = cld::UnpackLanguage(plang);
1141
+ int bytes = doc_tote->Value(sub);
1142
+ int reli = doc_tote->Reliability(sub);
1143
+ if (bytes == 0) {continue;} // Zero bytes
1144
+ total_bytes += bytes;
1145
+
1146
+ // Reliable percent is stored reliable score over stored bytecount
1147
+ int reliable_percent = reli / bytes;
1148
+ if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
1149
+
1150
+ // This language is too unreliable to keep, but we might merge it.
1151
+ Language altlang = UNKNOWN_LANGUAGE;
1152
+ if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];}
1153
+ if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative
1154
+
1155
+ // Look for alternative in doc_tote
1156
+ int altsub = doc_tote->Find(cld::PackLanguage(altlang));
1157
+ if (altsub < 0) {continue;} // No alternative text
1158
+
1159
+ int bytes2 = doc_tote->Value(altsub);
1160
+ int reli2 = doc_tote->Reliability(altsub);
1161
+ if (bytes2 == 0) {continue;} // Zero bytes
1162
+
1163
+ // Reliable percent is stored reliable score over stored bytecount
1164
+ int reliable_percent2 = reli2 / bytes2;
1165
+
1166
+ // Merge one language into the other. Break ties toward lower lang #
1167
+ int tosub = altsub;
1168
+ int fromsub = sub;
1169
+ bool into_lang = false;
1170
+ if ((reliable_percent2 < reliable_percent) ||
1171
+ ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
1172
+ tosub = sub;
1173
+ fromsub = altsub;
1174
+ into_lang = true;
1175
+ }
1176
+
1177
+ // Make sure reliability doesn't drop and is enough to avoid delete
1178
+ int newpercent = cld::maxint(reliable_percent, reliable_percent2);
1179
+ newpercent = cld::maxint(newpercent, kMinReliableKeepPercent);
1180
+ int newbytes = bytes + bytes2;
1181
+ int newreli = newpercent * newbytes;
1182
+
1183
+ doc_tote->SetKey(fromsub, 0);
1184
+ doc_tote->SetValue(fromsub, 0);
1185
+ doc_tote->SetReliability(fromsub, 0);
1186
+ doc_tote->SetValue(tosub, newbytes);
1187
+ doc_tote->SetReliability(tosub, newreli);
1188
+
1189
+ // Show fate of unreliable languages if at least 10 bytes
1190
+ if (FLAGS_cld_html /*&& (newpercent >= 10)*/ && (newbytes >= 10)) {
1191
+ if (into_lang) {
1192
+ fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
1193
+ ExtLanguageCode(altlang), reliable_percent2, bytes2,
1194
+ ExtLanguageCode(lang));
1195
+ } else {
1196
+ fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
1197
+ ExtLanguageCode(lang), reliable_percent, bytes,
1198
+ ExtLanguageCode(altlang));
1199
+ }
1200
+ }
1201
+ }
1202
+
1203
+
1204
+ if (do_remove_weak_matches) {
1205
+ // Pass to delete any remaining unreliable languages
1206
+ for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1207
+ int plang = doc_tote->Key(sub);
1208
+ if (plang == 0) {continue;} // Empty slot
1209
+
1210
+ Language lang = cld::UnpackLanguage(plang);
1211
+ int bytes = doc_tote->Value(sub);
1212
+ int reli = doc_tote->Reliability(sub);
1213
+ if (bytes == 0) {continue;} // Zero bytes
1214
+
1215
+ bool is_tier3 = (cld::kIsPackedTop40[plang] == 0);
1216
+ if (is_tier3 &&
1217
+ (bytes < kGoodFirstT3MinBytes) &&
1218
+ (bytes < total_bytes)) {
1219
+ reli = 0; // Too-short tier3
1220
+ }
1221
+
1222
+ // Reliable percent is stored as reliable score over stored bytecount
1223
+ int reliable_percent = reli / bytes;
1224
+ if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
1225
+
1226
+ // Delete unreliable entry
1227
+ doc_tote->SetKey(sub, 0);
1228
+ doc_tote->SetValue(sub, 0);
1229
+ doc_tote->SetReliability(sub, 0);
1230
+
1231
+ // Show fate of unreliable languages if at least 10 bytes
1232
+ if (FLAGS_cld_html /*&& (reliable_percent >= 10)*/ && (bytes >= 10)) {
1233
+ fprintf(stderr, "{Unreli %s.%d(%dB)} ",
1234
+ ExtLanguageCode(lang), reliable_percent, bytes);
1235
+ }
1236
+ }
1237
+ }
1238
+
1239
+ if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");}
1240
+ }
1241
+
1242
+
1243
+ // Move less likely byte count to more likely for close pairs of languages
1244
+ void RefineScoredClosePairs(ToteWithReliability* doc_tote) {
1245
+ for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1246
+ int close_packedlang = doc_tote->Key(sub);
1247
+ int subscr = kClosePair[close_packedlang];
1248
+ if (subscr == 0) {continue;}
1249
+
1250
+ // We have a close pair language -- if the other one is also scored and the
1251
+ // longword score differs enough, put all our eggs into one basket
1252
+
1253
+ // Nonzero longword score: Go look for the other of this pair
1254
+ for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1255
+ if (kClosePair[doc_tote->Key(sub2)] == subscr) {
1256
+ // We have a matching pair
1257
+ int close_packedlang2 = doc_tote->Key(sub2);
1258
+
1259
+ // Move all the text bytes from lower byte-count to higher one
1260
+ int from_sub, to_sub;
1261
+ Language from_lang, to_lang;
1262
+ if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1263
+ from_sub = sub;
1264
+ to_sub = sub2;
1265
+ from_lang = cld::UnpackLanguage(close_packedlang);
1266
+ to_lang = cld::UnpackLanguage(close_packedlang2);
1267
+ } else {
1268
+ from_sub = sub2;
1269
+ to_sub = sub;
1270
+ from_lang = cld::UnpackLanguage(close_packedlang2);
1271
+ to_lang = cld::UnpackLanguage(close_packedlang);
1272
+ }
1273
+
1274
+ // Move all the bytes smaller => larger of the pair
1275
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
1276
+ // Show fate of closepair language
1277
+ int val = doc_tote->Value(from_sub);
1278
+ int reli = doc_tote->Reliability(from_sub);
1279
+ int reliable_percent = reli / (val ? val : 1); // avoid zdiv
1280
+ fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ",
1281
+ ExtLanguageCode(from_lang),
1282
+ reliable_percent,
1283
+ doc_tote->Value(from_sub),
1284
+ ExtLanguageCode(to_lang));
1285
+ }
1286
+ int sum = doc_tote->Value(to_sub) + doc_tote->Value(from_sub);
1287
+ doc_tote->SetValue(to_sub, sum);
1288
+ doc_tote->SetReliability(to_sub, 100 * sum);
1289
+
1290
+ // Delete old entry
1291
+ doc_tote->SetKey(from_sub, 0);
1292
+ doc_tote->SetValue(from_sub, 0);
1293
+ doc_tote->SetReliability(from_sub, 0);
1294
+
1295
+ break; // Exit inner for sub2 loop
1296
+ }
1297
+ } // End for sub2
1298
+ } // End for sub
1299
+ }
1300
+
1301
+
1302
+ void ApplyLanguageHints(Tote* chunk_tote, int tote_grams,
1303
+ uint8* lang_hint_boost) {
1304
+ // Need 8 quad/unigrams to give full hint boost, else derate linearly
1305
+ if (tote_grams > 8) {
1306
+ tote_grams = 8;
1307
+ }
1308
+ for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
1309
+ // Hint boosts are per packed subscript
1310
+ int lang_sub = chunk_tote->Key(sub);
1311
+ int new_value = chunk_tote->Value(sub) +
1312
+ ((lang_hint_boost[lang_sub] * tote_grams) >> 3);
1313
+ chunk_tote->SetValue(sub, new_value);
1314
+ if (FLAGS_dbgscore && (lang_hint_boost[lang_sub] > 0)) {
1315
+ fprintf(stderr, "[%s+=%d*%d/8] ",
1316
+ ExtLanguageCode(cld::UnpackLanguage(lang_sub)),
1317
+ lang_hint_boost[lang_sub], tote_grams);
1318
+ }
1319
+ }
1320
+ }
1321
+
1322
+
1323
// Writes the first len bytes of txt to f with the five HTML-special
// characters (< > & ' ") replaced by their entity names, followed by an HTML
// line break and a newline. A non-positive len writes only the line break.
void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
  for (const char* p = txt; p < txt + len; ++p) {
    switch (*p) {
      case '<':
        fputs("&lt;", f);
        break;
      case '>':
        fputs("&gt;", f);
        break;
      case '&':
        fputs("&amp;", f);
        break;
      case '\'':
        fputs("&apos;", f);
        break;
      case '"':
        fputs("&quot;", f);
        break;
      default:
        fputc(*p, f);
        break;
    }
  }
  fputs("<br>\n", f);
}
1342
+
1343
+
1344
+ // Add one chunk's score to running document score
1345
+ // If the top language is UNKNOWN_LANGUAGE, score nothing. This is used to
1346
+ // positively identify text to be ignored, such as link farms.
1347
+ // Sort before scoring and reinit afterward
1348
+ //
1349
+ // src and srclen are just for debug output
1350
+ void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by,
1351
+ UnicodeLScript lscript,
1352
+ Tote* chunk_tote,
1353
+ ToteWithReliability* doc_tote,
1354
+ int tote_grams,
1355
+ uint8* lang_hint_boost) {
1356
+ // Apply hints before sorting
1357
+ if (lang_hint_boost) {
1358
+ ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost);
1359
+ }
1360
+
1361
+ // Sort to get top two languages
1362
+ chunk_tote->Sort(2);
1363
+ Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0));
1364
+
1365
+ // Return if empty
1366
+ if (cur_lang < 0) {
1367
+ chunk_tote->Reinit();
1368
+ return;
1369
+ }
1370
+
1371
+ bool cur_unreliable = false;
1372
+
1373
+ // Reliability is a function of mean script score per KB of text
1374
+ int len = chunk_tote->GetByteCount();
1375
+ int reliability = cld::GetReliability((len * 2) / advance_by,
1376
+ lscript,
1377
+ chunk_tote);
1378
+ cur_unreliable = (reliability < cld::kMinReliable);
1379
+
1380
+ // If tote_grams=0, always reliable
1381
+ // If tote_grams=1, always unreliable
1382
+ if (tote_grams == 0) {
1383
+ reliability = 100;
1384
+ cur_unreliable = false;
1385
+ } else if (tote_grams == 1) {
1386
+ reliability = 0;
1387
+ cur_unreliable = true;
1388
+ }
1389
+
1390
+ #if 0
1391
+ // TEMP
1392
+ if (FLAGS_cld_html) {
1393
+ if (reliability >= kMinReliableKeepPercent) {
1394
+ fprintf(stderr, "R%d%% ", reliability);
1395
+ } else {
1396
+ fprintf(stderr, "--R%d%% ", reliability);
1397
+ }
1398
+ }
1399
+ #endif
1400
+
1401
+ // Track the sequence of language fragments [result currently unused]
1402
+ ////if (reliability >= kMinReliableSeq) {
1403
+ //// doc_tote->AddSeq(chunk_tote->Key(0));
1404
+ ////}
1405
+
1406
+ if (cur_unreliable && (chunk_tote->Key(1) != 0)) {
1407
+ // Unreliable and two top contenders, split byte count 5/8 - 3/8
1408
+ int top_len = ((len * 5) + 4) >> 3;
1409
+ int second_len = len - top_len;
1410
+
1411
+ doc_tote->Add(chunk_tote->Key(0),
1412
+ top_len, chunk_tote->Value(0), reliability);
1413
+ doc_tote->Add(chunk_tote->Key(1),
1414
+ second_len, chunk_tote->Value(1), reliability);
1415
+ if (FLAGS_dbgscore) {
1416
+ fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ",
1417
+ ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
1418
+ chunk_tote->Value(0),
1419
+ reliability,
1420
+ top_len,
1421
+ ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))),
1422
+ chunk_tote->Value(1),
1423
+ reliability,
1424
+ second_len);
1425
+ }
1426
+ } else {
1427
+ // Reliable or single contender
1428
+ doc_tote->Add(chunk_tote->Key(0),
1429
+ len, chunk_tote->Value(0), reliability);
1430
+ if (FLAGS_dbgscore) {
1431
+ fprintf(stderr, "{+%s.%d.%dR(%dB)} ",
1432
+ ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
1433
+ chunk_tote->Value(0),
1434
+ reliability,
1435
+ len);
1436
+ }
1437
+ }
1438
+
1439
+ if (FLAGS_cld_html) {
1440
+ if (cur_lang < 0) {cur_lang = UNKNOWN_LANGUAGE;}
1441
+ cld::PrintLang(stderr, chunk_tote,
1442
+ cur_lang, cur_unreliable,
1443
+ prior_lang, prior_unreliable);
1444
+ prior_lang = cur_lang;
1445
+ prior_unreliable = cur_unreliable;
1446
+
1447
+ string temp(src, srclen);
1448
+ if (temp[0] == '=') {
1449
+ // Rewrite =ScriptX= or =SwitchX= as =Xxxx= for script code Xxxx
1450
+ temp = "=Buffered_";
1451
+ temp.append(UnicodeLScriptCode(lscript));
1452
+ temp.append("=");
1453
+ }
1454
+ cld::PrintText(stderr, cur_lang, temp);
1455
+ }
1456
+
1457
+ chunk_tote->Reinit();
1458
+ }
1459
+
1460
+
1461
+ void PrintTopLang(Language top_lang) {
1462
+ if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1463
+ fprintf(stderr, "[] ");
1464
+ } else {
1465
+ fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
1466
+ prior_lang = top_lang;
1467
+ }
1468
+ }
1469
+
1470
+ void PrintTopLangSpeculative(Language top_lang) {
1471
+ fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1472
+ if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1473
+ fprintf(stderr, "[] ");
1474
+ } else {
1475
+ fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
1476
+ prior_lang = top_lang;
1477
+ }
1478
+ fprintf(stderr, "</span>\n");
1479
+ }
1480
+
1481
+
1482
+ // Add one chunk's score to running document score
1483
+ // Convenience function with constant src text
1484
+ void ScoreChunkIntoDoc2(const char* src, int advance_by,
1485
+ UnicodeLScript lscript,
1486
+ Tote* chunk_tote,
1487
+ ToteWithReliability* doc_tote,
1488
+ int tote_grams,
1489
+ uint8* lang_hint_boost) {
1490
+ int srclen = static_cast<int>(strlen(src));
1491
+ ScoreChunkIntoDoc(src, srclen, advance_by, lscript, chunk_tote,
1492
+ doc_tote, tote_grams, lang_hint_boost);
1493
+ }
1494
+
1495
+
1496
+ // Score one scriptspan using the only language for that script
1497
+ void ScoreNilgrams(getone::LangSpan* scriptspan, int lang,
1498
+ ToteWithReliability* doc_tote,
1499
+ uint8* lang_hint_boost,
1500
+ int flags, Language plus_one) {
1501
+ // For debugging only. Not thread-safe
1502
+ prior_lang = UNKNOWN_LANGUAGE;
1503
+ prior_unreliable = false;
1504
+
1505
+ const char* src = scriptspan->text;
1506
+ int len = scriptspan->text_bytes;
1507
+
1508
+ Tote chunk_tote;
1509
+ // Score 1000 for 1000 bytes
1510
+ chunk_tote.AddGram();
1511
+ chunk_tote.Add(lang, scriptspan->text_bytes);
1512
+ chunk_tote.AddBytes(scriptspan->text_bytes);
1513
+ int advance_by = 2;
1514
+ int tote_grams = 0; // Indicates fully reliable
1515
+ ScoreChunkIntoDoc(src, len, advance_by,
1516
+ scriptspan->script, &chunk_tote,
1517
+ doc_tote, tote_grams, lang_hint_boost);
1518
+ }
1519
+
1520
+ // Score one scriptspan using unigrams
1521
+ // Updates tote_grams
1522
+ static void ScoreUnigrams(const UTF8PropObj* unigram_obj,
1523
+ getone::LangSpan* scriptspan,
1524
+ int* tote_grams, int gram_limit,
1525
+ Tote* chunk_tote,
1526
+ ToteWithReliability* doc_tote,
1527
+ uint8* lang_hint_boost,
1528
+ int advance_by, int flags,
1529
+ int* initial_word_span, Language plus_one) {
1530
+ // chunk_tote may have partial sum coming in
1531
+ const char* src = scriptspan->text;
1532
+ const char* srclimit = src + scriptspan->text_bytes;
1533
+
1534
+ // For debugging only. Not thread-safe
1535
+ prior_lang = UNKNOWN_LANGUAGE;
1536
+ prior_unreliable = false;
1537
+
1538
+ // Break text up into multiple chunks and score each
1539
+ while (src < srclimit) {
1540
+ // Updates tote_grams
1541
+ int len = cld::DoUniScoreV3(unigram_obj,
1542
+ src, srclimit - src, advance_by,
1543
+ tote_grams, gram_limit, chunk_tote);
1544
+ if (FlagUseWords(flags) || (*initial_word_span > 0)) {
1545
+ // Use bigram scoring in addition to quadgrams
1546
+ cld::DoBigramScoreV3(&kCjkBiTable_obj,
1547
+ src, len, chunk_tote);
1548
+ }
1549
+ chunk_tote->AddBytes(len);
1550
+ *initial_word_span -= len;
1551
+
1552
+ if (*tote_grams >= gram_limit) {
1553
+ // Add this chunk to doc totals
1554
+ // Remove all but top40 if asked
1555
+ if (FlagTop40(flags)) {
1556
+ cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
1557
+ }
1558
+
1559
+ // Sort, accumulate into doc total, reinit
1560
+ ScoreChunkIntoDoc(src, len, advance_by,
1561
+ scriptspan->script, chunk_tote,
1562
+ doc_tote, *tote_grams, lang_hint_boost);
1563
+ *tote_grams = 0;
1564
+ } else {
1565
+ if (FLAGS_cld_html) {
1566
+ string temp(src, len);
1567
+ Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
1568
+ PrintTopLangSpeculative(top_lang);
1569
+ cld::PrintText(stderr, top_lang, temp);
1570
+ }
1571
+ }
1572
+ src += len;
1573
+ }
1574
+ // chunk_tote may have partial sum going out
1575
+ }
1576
+
1577
+ // Back up one UTF-8 character
1578
+ const uint8* BackOneUTF8(const uint8* p) {
1579
+ const uint8* retval = p - 1;
1580
+ if ((*retval & 0xc0) == 0x80) {--retval;}
1581
+ if ((*retval & 0xc0) == 0x80) {--retval;}
1582
+ if ((*retval & 0xc0) == 0x80) {--retval;}
1583
+ return retval;
1584
+ }
1585
+
1586
+
1587
+ // Score one scriptspan using quadgrams
1588
+ // Incoming chunk_tote may have partial accumulation
1589
+ static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj,
1590
+ getone::LangSpan* scriptspan,
1591
+ int* tote_grams, int gram_limit,
1592
+ Tote* chunk_tote,
1593
+ ToteWithReliability* doc_tote,
1594
+ uint8* lang_hint_boost,
1595
+ int advance_by, int flags,
1596
+ int* initial_word_span, Language plus_one) {
1597
+ // chunk_tote may have partial sum coming in
1598
+ const char* src = scriptspan->text;
1599
+ const char* srclimit = src + scriptspan->text_bytes;
1600
+ const char* lastscored_src = src;
1601
+
1602
+ // For debugging only. Not thread-safe
1603
+ prior_lang = UNKNOWN_LANGUAGE;
1604
+ prior_unreliable = false;
1605
+
1606
+ // Break text up into multiple chunks and score each
1607
+ while (src < srclimit) {
1608
+ // Updates tote_grams
1609
+ int len = cld::DoQuadScoreV3(quadgram_obj,
1610
+ src, srclimit - src, advance_by,
1611
+ tote_grams, gram_limit, chunk_tote);
1612
+ if (FlagUseWords(flags) || (*initial_word_span > 0)) {
1613
+ // Use word scoring in addition to quadgrams
1614
+ cld::DoOctaScoreV3(&kLongWord8Table_obj,
1615
+ src, len, chunk_tote);
1616
+ }
1617
+ chunk_tote->AddBytes(len);
1618
+ *initial_word_span -= len;
1619
+
1620
+ if (*tote_grams >= gram_limit) {
1621
+ // Remove all but top40 if asked
1622
+ if (FlagTop40(flags)) {
1623
+ cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
1624
+ }
1625
+
1626
+ // Sort, accumulate into doc total, reinit
1627
+ ScoreChunkIntoDoc(src, len, advance_by,
1628
+ scriptspan->script, chunk_tote,
1629
+ doc_tote, *tote_grams, lang_hint_boost);
1630
+ lastscored_src = src + len;
1631
+ *tote_grams = 0;
1632
+ } else {
1633
+ if (FLAGS_cld_html) {
1634
+ string temp(src, len);
1635
+ Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
1636
+ PrintTopLangSpeculative(top_lang);
1637
+ cld::PrintText(stderr, top_lang, temp);
1638
+ }
1639
+ }
1640
+ src += len;
1641
+ }
1642
+ }
1643
+
1644
+
1645
+
1646
+ void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1647
+ const int* text_bytes, const bool* is_reliable) {
1648
+ fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
1649
+ if (language3[0] != UNKNOWN_LANGUAGE) {
1650
+ fprintf(f, "%s%s(%d%%) ",
1651
+ ExtLanguageName(language3[0]),
1652
+ *is_reliable ? "" : "*",
1653
+ percent3[0]);
1654
+ }
1655
+ if (language3[1] != UNKNOWN_LANGUAGE) {
1656
+ fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[1]), percent3[1]);
1657
+ }
1658
+ if (language3[2] != UNKNOWN_LANGUAGE) {
1659
+ fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[2]), percent3[2]);
1660
+ }
1661
+ fprintf(f, "%d bytes \n", *text_bytes);
1662
+
1663
+ fprintf(f, "<br>\n");
1664
+ }
1665
+
1666
+
1667
+ // Start the tote with a count of one for the default language for script
1668
+ void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) {
1669
+ Language defaultlang = cld::kDefaultLanguagePerLScript[lscript];
1670
+ script_tote->Add(cld::PackLanguage(defaultlang), 1);
1671
+ script_tote->AddBytes(1);
1672
+ #if 0
1673
+ if (FLAGS_cld_html) {
1674
+ cld::PrintLang(stderr, script_tote,
1675
+ defaultlang, false,
1676
+ UNKNOWN_LANGUAGE, false);
1677
+ prior_lang = cur_lang;
1678
+ string temp("+1");
1679
+ cld::PrintText(stderr, defaultlang, temp);
1680
+ }
1681
+ #endif
1682
+ }
1683
+
1684
+ static const char* const kToteName[4] = // Four "=...=" marker strings, indexed 0..3
1685
+ {"=Latn=", "=Hani=", "=Script2=", "=Script3="}; // NOTE(review): presumably debug labels per script tote; users are outside this chunk - confirm
1686
+ static const char* const kToteSwitch[4] = // Same first two entries as kToteName; entries 2 and 3 say Switch instead of Script
1687
+ {"=Latn=", "=Hani=", "=Switch2=", "=Switch3="}; // ScoreChunkIntoDoc rewrites debug text starting with = as =Buffered_<script code>=
1688
+
1689
+
1690
+
1691
// Upper to lower, keep digits, everything else to minus '-' (2d)
static const char kCharsetToLowerTbl[256] = {
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,

  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
};


// 1 for ASCII letters A-Z and a-z, 0 for every other byte value
static const char kIsAlpha[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// 1 for ASCII digits 0-9, 0 for every other byte value
static const char kIsDigit[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// Normalize ASCII string to first 4 alphabetic/digit chars
// Letters are forced to lowercase ASCII
// Used to normalize TLD values
//
// norm must have room for at least 4 bytes; exactly 4 bytes are written and
// no terminating NUL is added. Positions not filled by a letter or digit are
// left as '_'. A null str yields "____".
void MakeChar4(const char* str, char* norm) {
  memcpy(norm, "____", 4);     // four underscores
  if (!str) {
    return;
  }
  int filled = 0;
  // Walk the string once and stop as soon as four characters are collected;
  // the string length is never recomputed.
  for (const char* p = str; (*p != '\0') && (filled < 4); ++p) {
    const unsigned char uc = static_cast<unsigned char>(*p);
    if (kIsAlpha[uc] | kIsDigit[uc]) {
      norm[filled] = kCharsetToLowerTbl[uc];
      ++filled;
    }
  }
}
1755
+
1756
+ // Find subscript of matching key in first 4 bytes of sorted hint array, or -1
1757
+ static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
1758
+ const char* norm_key) {
1759
+ // Key is always in range [lo..hi)
1760
+ int lo = 0;
1761
+ int hi = hintprobssize;
1762
+ while (lo < hi) {
1763
+ int mid = (lo + hi) >> 1;
1764
+ int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4);
1765
+ if (comp < 0) {
1766
+ lo = mid + 1;
1767
+ } else if (comp > 0) {
1768
+ hi = mid;
1769
+ } else {
1770
+ return mid;
1771
+ }
1772
+ }
1773
+ return -1;
1774
+ }
1775
+
1776
+
1777
+ // Increment the initial probabilities based on a per-TLD probs entry
1778
+ void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) {
1779
+ if (FLAGS_dbgscore) {
1780
+ fprintf(stderr, "TLD hint %s\n", tld_hint);
1781
+ }
1782
+ char normalized_tld[8];
1783
+ MakeChar4(tld_hint, normalized_tld);
1784
+ int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
1785
+ normalized_tld);
1786
+ // TLD is four bytes, probability entry is 4 bytes
1787
+ if (n >= 0) {
1788
+ uint32 probs = kTLDHintProbs[n].probs;
1789
+
1790
+ uint8 prob123 = (probs >> 0) & 0xff;
1791
+ const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1792
+ uint8 top1 = (probs >> 8) & 0xff;
1793
+ if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1794
+ uint8 top2 = (probs >> 16) & 0xff;
1795
+ if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1796
+ uint8 top3 = (probs >> 24) & 0xff;
1797
+ if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1798
+ }
1799
+ }
1800
+
1801
+
1802
+ // Increment the initial probabilities based on a per-encoding probs entry
1803
+ void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) {
1804
+ if (FLAGS_dbgscore) {
1805
+ Encoding tempenc = static_cast<Encoding>(encoding_hint);
1806
+ fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc));
1807
+ }
1808
+ if (encoding_hint < ISO_8859_1) {return;}
1809
+ if (encoding_hint >= NUM_ENCODINGS) {return;}
1810
+ uint32 probs = kEncodingHintProbs[encoding_hint];
1811
+
1812
+ uint8 prob123 = (probs >> 0) & 0xff;
1813
+ const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1814
+ uint8 top1 = (probs >> 8) & 0xff;
1815
+ if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1816
+ uint8 top2 = (probs >> 16) & 0xff;
1817
+ if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1818
+ uint8 top3 = (probs >> 24) & 0xff;
1819
+ if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1820
+ }
1821
+
1822
+
1823
+ // Increment the initial probability for given language by fixed amount
1824
+ // Does not recognize extended languages as hints
1825
+ void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) {
1826
+ if (FLAGS_dbgscore) {
1827
+ fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint));
1828
+ }
1829
+ if (language_hint < ENGLISH) {return;}
1830
+ if (language_hint >= NUM_LANGUAGES) {return;}
1831
+ uint32 probs = kLanguageHintProbs[language_hint];
1832
+
1833
+ uint8 prob123 = (probs >> 0) & 0xff;
1834
+ const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1835
+ uint8 top1 = (probs >> 8) & 0xff;
1836
+ if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1837
+ uint8 top2 = (probs >> 16) & 0xff;
1838
+ if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1839
+ uint8 top3 = (probs >> 24) & 0xff;
1840
+ if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1841
+ }
1842
+
1843
+ // Extract return values before fixups
1844
+ void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes,
1845
+ int* reliable_percent3, Language* language3, int* percent3,
1846
+ double* normalized_score3,
1847
+ int* text_bytes, bool* is_reliable) {
1848
+ reliable_percent3[0] = 0;
1849
+ reliable_percent3[1] = 0;
1850
+ reliable_percent3[2] = 0;
1851
+ language3[0] = UNKNOWN_LANGUAGE;
1852
+ language3[1] = UNKNOWN_LANGUAGE;
1853
+ language3[2] = UNKNOWN_LANGUAGE;
1854
+ percent3[0] = 100;
1855
+ percent3[1] = 0;
1856
+ percent3[2] = 0;
1857
+ normalized_score3[0] = 0.0;
1858
+ normalized_score3[1] = 0.0;
1859
+ normalized_score3[2] = 0.0;
1860
+
1861
+ *text_bytes = total_text_bytes;
1862
+ *is_reliable = false;
1863
+
1864
+ int bytecount1 = total_text_bytes;
1865
+ int bytecount2 = 0;
1866
+ int bytecount3 = 0;
1867
+
1868
+ int lang1 = doc_tote->Key(0);
1869
+ if (lang1 != 0) {
1870
+ // We have a top language
1871
+ language3[0] = cld::UnpackLanguage(lang1);
1872
+ bytecount1 = doc_tote->Value(0);
1873
+ int reli1 = doc_tote->Reliability(0);
1874
+ reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv
1875
+ normalized_score3[0] = cld::GetNormalizedScore(language3[0],
1876
+ ULScript_Common,
1877
+ bytecount1,
1878
+ doc_tote->Score(0));
1879
+ }
1880
+
1881
+ int lang2 = doc_tote->Key(1);
1882
+ if (lang2 != 0) {
1883
+ language3[1] = cld::UnpackLanguage(lang2);
1884
+ bytecount2 = doc_tote->Value(1);
1885
+ int reli2 = doc_tote->Reliability(1);
1886
+ reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv
1887
+ normalized_score3[1] = cld::GetNormalizedScore(language3[1],
1888
+ ULScript_Common,
1889
+ bytecount2,
1890
+ doc_tote->Score(1));
1891
+ }
1892
+
1893
+ int lang3 = doc_tote->Key(2);
1894
+ if (lang3 != 0) {
1895
+ language3[2] = cld::UnpackLanguage(lang3);
1896
+ bytecount3 = doc_tote->Value(2);
1897
+ int reli3 = doc_tote->Reliability(2);
1898
+ reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv
1899
+ normalized_score3[2] = cld::GetNormalizedScore(language3[2],
1900
+ ULScript_Common,
1901
+ bytecount3,
1902
+ doc_tote->Score(2));
1903
+ }
1904
+
1905
+ // Increase total bytes to sum (top 3) if low for some reason
1906
+ int total_bytecount12 = bytecount1 + bytecount2;
1907
+ int total_bytecount123 = total_bytecount12 + bytecount3;
1908
+ if (total_text_bytes < total_bytecount123) {
1909
+ total_text_bytes = total_bytecount123;
1910
+ *text_bytes = total_text_bytes;
1911
+ }
1912
+
1913
+ // Sum minus previous % gives better roundoff behavior than bytecount/total
1914
+ int total_text_bytes_div = cld::maxint(1, total_text_bytes); // Avoid zdiv
1915
+ percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1916
+ percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1917
+ percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1918
+ percent3[2] -= percent3[1];
1919
+ percent3[1] -= percent3[0];
1920
+
1921
+ // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1922
+ // Fix this explicitly
1923
+ if (percent3[1] < percent3[2]) {
1924
+ ++percent3[1];
1925
+ --percent3[2];
1926
+ }
1927
+ if (percent3[0] < percent3[1]) {
1928
+ ++percent3[0];
1929
+ --percent3[1];
1930
+ }
1931
+
1932
+ *text_bytes = total_text_bytes;
1933
+
1934
+ if (lang1 != 0) {
1935
+ // We have a top language
1936
+ // Its reliability is overal result reliability
1937
+ int bytecount = doc_tote->Value(0);
1938
+ int reli = doc_tote->Reliability(0);
1939
+ int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv
1940
+ *is_reliable = reliable_percent >= cld::kMinReliable;
1941
+ } else {
1942
+ // No top language at all. This can happen with zero text or 100% Klingon
1943
+ // if extended=false. Just return all UNKNOWN_LANGUAGE, reliable.
1944
+ *is_reliable = true;
1945
+ }
1946
+ }
1947
+
1948
+ bool IsFIGS(Language lang) {
1949
+ if (lang == FRENCH) {return true;}
1950
+ if (lang == ITALIAN) {return true;}
1951
+ if (lang == GERMAN) {return true;}
1952
+ if (lang == SPANISH) {return true;}
1953
+ return false;
1954
+ }
1955
+
1956
+ bool IsEFIGS(Language lang) {
1957
+ if (lang == ENGLISH) {return true;}
1958
+ if (lang == FRENCH) {return true;}
1959
+ if (lang == ITALIAN) {return true;}
1960
+ if (lang == GERMAN) {return true;}
1961
+ if (lang == SPANISH) {return true;}
1962
+ return false;
1963
+ }
1964
+
1965
+ static const int kNonEnBoilerplateMinPercent = 17; // Second language below this percent cannot replace a first-place ENGLISH
1966
+ static const int kNonFIGSBoilerplateMinPercent = 20; // Second language below this percent cannot replace a first-place FIGS language
1967
+ static const int kGoodFirstMinPercent = 26; // return_percent below this => summary is UNKNOWN_LANGUAGE
1968
+ static const int kGoodFirstReliableMinPercent = 51; // return_percent below this => summary is marked unreliable
1969
+ static const int kIgnoreMaxPercent = 95; // ignore_percent above this => summary is marked unreliable
1970
+ static const int kKeepMinPercent = 2; // Chosen language below this percent => summary is marked unreliable
1971
+
1972
+ // For Tier3 languages, require more bytes of text to override
1973
+ // the first-place language (Tier3 = kIsPackedTop40 entry is 0 in CalcSummaryLang)
1974
+ static const int kGoodSecondT1T2MinBytes = 15; // Non-Tier3 second language with fewer bytes than this is never returned
1975
+ static const int kGoodSecondT3MinBytes = 128; // Tier3 second language with fewer bytes than this is never returned
1976
+ //
1977
+
1978
+ // Calculate a single summary language for the document, and its reliability.
1979
+ // Writes to *summary_lang either language3[] of the first or second active slot, or UNKNOWN_LANGUAGE.
1980
+ // This is the heart of matching human-rater perception.
1981
+ // doc_tote and reliable_percent3[] are not read by this function.
1982
+ // language3[] and percent3[] each hold three entries and are left unchanged.
1983
+ // Do not return Tier3 second language unless there are at least kGoodSecondT3MinBytes (128) bytes
1984
+ void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes,
1985
+ const int* reliable_percent3,
1986
+ const Language* language3,
1987
+ const int* percent3,
1988
+ Language* summary_lang, bool* is_reliable) {
1989
+ // Vector of active languages (indexes into language3/percent3); changes if we delete some
1990
+ int slot_count = 3;
1991
+ int active_slot[3] = {0, 1, 2};
1992
+
1993
+ int ignore_percent = 0; // Percent of text logically removed from the calculation
1994
+ int return_percent = percent3[0]; // Default to top lang
1995
+ *summary_lang = language3[0];
1996
+ *is_reliable = true;
1997
+ if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
1998
+
1999
+ // If any of top 3 is IGNORE (TG_UNKNOWN_LANGUAGE), remove it and increment ignore_percent
2000
+ for (int i = 0; i < 3; ++i) {
2001
+ if (language3[i] == TG_UNKNOWN_LANGUAGE) {
2002
+ ignore_percent += percent3[i];
2003
+ // Move the rest up, leaving input vectors unchanged
2004
+ for (int j=i+1; j < 3; ++j) {
2005
+ active_slot[j - 1] = active_slot[j];
2006
+ }
2007
+ -- slot_count;
2008
+ // Logically remove Ignore from percentage-text calculation
2009
+ // (extra 1 in 101 avoids zdiv, biases slightly small)
2010
+ return_percent = (percent3[0] * 100) / (101 - ignore_percent); // NOTE(review): reads percent3[0], not percent3[active_slot[0]]; when slot 0 is the ignored one this uses the ignored percent - confirm intended
2011
+ *summary_lang = language3[active_slot[0]];
2012
+ if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
2013
+ }
2014
+ }
2015
+
2016
+
2017
+ // If English and X, where X (not UNK) is big enough,
2018
+ // assume the English is boilerplate and return X.
2019
+ // Logically remove English from percentage-text calculation
2020
+ int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100; // active_slot[1] stays within 0..2 even after a removal
2021
+ // Require more bytes of text for Tier3 languages
2022
+ int minbytesneeded = kGoodSecondT1T2MinBytes;
2023
+ int plang_second = cld::PackLanguage(language3[active_slot[1]]);
2024
+ bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0);
2025
+ if (is_tier3) {
2026
+ minbytesneeded = kGoodSecondT3MinBytes;
2027
+ }
2028
+
2029
+ if ((language3[active_slot[0]] == ENGLISH) &&
2030
+ (language3[active_slot[1]] != ENGLISH) &&
2031
+ (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
2032
+ (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
2033
+ (second_bytes >= minbytesneeded)) {
2034
+ ignore_percent += percent3[active_slot[0]];
2035
+ return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
2036
+ *summary_lang = language3[active_slot[1]];
2037
+ if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
2038
+
2039
+ // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
2040
+ // assume the FIGS is boilerplate and return X.
2041
+ // Logically remove FIGS from percentage-text calculation
2042
+ } else if (IsFIGS(language3[active_slot[0]]) &&
2043
+ !IsEFIGS(language3[active_slot[1]]) &&
2044
+ (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
2045
+ (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
2046
+ (second_bytes >= minbytesneeded)) {
2047
+ ignore_percent += percent3[active_slot[0]];
2048
+ return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
2049
+ *summary_lang = language3[active_slot[1]];
2050
+ if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
2051
+
2052
+ // Else we are returning the first language, but want to improve its
2053
+ // return_percent if the second language (English, or FIGS behind a non-EFIGS first) should be ignored
2054
+ } else if ((language3[active_slot[1]] == ENGLISH) &&
2055
+ (language3[active_slot[0]] != ENGLISH)) {
2056
+ ignore_percent += percent3[active_slot[1]];
2057
+ return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
2058
+ } else if (IsFIGS(language3[active_slot[1]]) &&
2059
+ !IsEFIGS(language3[active_slot[0]])) {
2060
+ ignore_percent += percent3[active_slot[1]];
2061
+ return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
2062
+ }
2063
+
2064
+ // If return percent is too small (too many languages), return UNKNOWN
2065
+ if ((return_percent < kGoodFirstMinPercent)) {
2066
+ *summary_lang = UNKNOWN_LANGUAGE;
2067
+ *is_reliable = false;
2068
+ }
2069
+
2070
+ // If return percent is small, return language but set unreliable.
2071
+ if ((return_percent < kGoodFirstReliableMinPercent)) {
2072
+ *is_reliable = false;
2073
+ }
2074
+
2075
+ // If ignore percent is too large, set unreliable.
2076
+ if ((ignore_percent > kIgnoreMaxPercent)) {
2077
+ *is_reliable = false;
2078
+ }
2079
+
2080
+ // If we removed all the active languages, return UNKNOWN
2081
+ if (slot_count == 0) {
2082
+ *summary_lang = UNKNOWN_LANGUAGE;
2083
+ *is_reliable = false;
2084
+ }
2085
+ }
2086
+
2087
+
2088
+
2089
+ // Result vector must be exactly three items
2090
+ Language CompactLangDetImpl::DetectLanguageSummaryV25(
2091
+ const CompactLangDet::DetectionTables* tables,
2092
+ const char* buffer,
2093
+ int buffer_length,
2094
+ bool is_plain_text,
2095
+ bool do_pick_summary_language,
2096
+ bool do_remove_weak_matches,
2097
+ const char* tld_hint, // "id" boosts Indonesian
2098
+ int encoding_hint, // SJS boosts Japanese
2099
+ Language language_hint, // ITALIAN boosts it
2100
+ bool allow_extended_lang,
2101
+ int flags,
2102
+ Language plus_one,
2103
+ Language* language3,
2104
+ int* percent3,
2105
+ double* normalized_score3,
2106
+ int* text_bytes,
2107
+ bool* is_reliable) {
2108
+ if (!tables) {
2109
+ static const CompactLangDet::DetectionTables default_cld_tables = {
2110
+ &kQuadTable_obj,
2111
+ &compact_lang_det_generated_ctjkvz_b1_obj
2112
+ };
2113
+ tables = &default_cld_tables;
2114
+ }
2115
+ language3[0] = UNKNOWN_LANGUAGE;
2116
+ language3[1] = UNKNOWN_LANGUAGE;
2117
+ language3[2] = UNKNOWN_LANGUAGE;
2118
+ percent3[0] = 100;
2119
+ percent3[1] = 0;
2120
+ percent3[2] = 0;
2121
+ normalized_score3[0] = 0.0;
2122
+ normalized_score3[1] = 0.0;
2123
+ normalized_score3[2] = 0.0;
2124
+ *text_bytes = 0;
2125
+ *is_reliable = false;
2126
+
2127
+ // Document totals
2128
+ ToteWithReliability doc_tote; // Reliability = 0..100
2129
+
2130
+ // Vector of packed per-language boosts (just one filled in from hints)
2131
+ uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1];
2132
+ memset(lang_hint_boost, 0, sizeof(lang_hint_boost));
2133
+
2134
+ // Apply hints,if any
2135
+ if ((tld_hint != NULL) && (tld_hint[0] != '\0')) {
2136
+ ApplyTLDHint(lang_hint_boost, tld_hint);
2137
+ }
2138
+ if (encoding_hint != UNKNOWN_ENCODING) {
2139
+ ApplyEncodingHint(lang_hint_boost, encoding_hint);
2140
+ }
2141
+ if (language_hint != UNKNOWN_LANGUAGE) {
2142
+ ApplyLanguageHint(lang_hint_boost, language_hint);
2143
+ }
2144
+
2145
+
2146
+ // Four individual script totals, Latin, Han, other2, other3
2147
+ int next_other_tote = 2;
2148
+
2149
+ // Four totes for up to four different scripts pending at once
2150
+ Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other
2151
+ bool tote_seen[4] = {false, false, false, false};
2152
+ int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk
2153
+ UnicodeLScript tote_script[4] =
2154
+ {ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common};
2155
+
2156
+ // Loop through text spans in a single script
2157
+ ScriptScanner ss(buffer, buffer_length, is_plain_text);
2158
+ getone::LangSpan scriptspan;
2159
+
2160
+ scriptspan.text = NULL;
2161
+ scriptspan.text_bytes = 0;
2162
+ scriptspan.offset = 0;
2163
+ scriptspan.script = ULScript_Common;
2164
+ scriptspan.lang = UNKNOWN_LANGUAGE;
2165
+
2166
+ int total_text_bytes = 0;
2167
+ int textlimit = FLAGS_cld_textlimit << 10; // in KB
2168
+ if (textlimit == 0) {textlimit = 0x7fffffff;}
2169
+
2170
+ int advance_by = 2; // Advance 2 bytes
2171
+ int advance_limit = textlimit >> 3; // For first 1/8 of max document
2172
+
2173
+ int initial_word_span = kDefaultWordSpan;
2174
+ if (FLAGS_cld_forcewords) {
2175
+ initial_word_span = kReallyBigWordSpan;
2176
+ }
2177
+
2178
+ // Pick up chunk sizes
2179
+ // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
2180
+ // Sanity check -- force into a reasonable range
2181
+ int chunksizequads = FLAGS_cld_smoothwidth;
2182
+ chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads),
2183
+ kMaxChunkSizeQuads);
2184
+ int chunksizeunis = (chunksizequads * 5) >> 1;
2185
+
2186
+ // Varying short-span limit doesn't work well -- skips too much beyond 20KB
2187
+ // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
2188
+ int spantooshortlimit = kShortSpanThresh;
2189
+
2190
+ // For debugging only. Not thread-safe
2191
+ prior_lang = UNKNOWN_LANGUAGE;
2192
+ prior_unreliable = false;
2193
+
2194
+ // Allocate full-document prediction table for finding repeating words
2195
+ int hash = 0;
2196
+ int* predict_tbl = new int[kPredictionTableSize];
2197
+ if (FlagRepeats(flags)) {
2198
+ memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
2199
+ }
2200
+
2201
+ // Loop through scriptspans accumulating number of text bytes in each language
2202
+ while (ss.GetOneScriptSpanLower(&scriptspan)) {
2203
+ UnicodeLScript lscript = scriptspan.script;
2204
+
2205
+ // Echo text if asked to
2206
+ if (FLAGS_cld_echotext) {
2207
+ PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes);
2208
+ }
2209
+
2210
+ // Squeeze out big chunks of text span if asked to
2211
+ if (FlagSqueeze(flags)) {
2212
+ // Remove repetitive or mostly-spaces chunks
2213
+ int newlen;
2214
+ int chunksize = 0; // Use the default
2215
+ newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
2216
+ chunksize);
2217
+ scriptspan.text_bytes = newlen;
2218
+ } else {
2219
+ // Check now and then to see if we should be squeezing
2220
+ if ((total_text_bytes >= kCheapSqueezeTestThresh) &&
2221
+ !FlagFinish(flags) &&
2222
+ ((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) &&
2223
+ CheapSqueezeTriggerTest(scriptspan.text,
2224
+ scriptspan.text_bytes,
2225
+ kCheapSqueezeTestLen)) {
2226
+ // Recursive call with big-chunk squeezing set
2227
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2228
+ fprintf(stderr,
2229
+ "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
2230
+ total_text_bytes);
2231
+ }
2232
+ // Deallocate full-document prediction table
2233
+ delete[] predict_tbl;
2234
+
2235
+ return DetectLanguageSummaryV25(
2236
+ tables,
2237
+ buffer,
2238
+ buffer_length,
2239
+ is_plain_text,
2240
+ do_pick_summary_language,
2241
+ do_remove_weak_matches,
2242
+ tld_hint, // "id" boosts Indonesian
2243
+ encoding_hint, // SJS boosts Japanese
2244
+ language_hint, // ITALIAN boosts it
2245
+ allow_extended_lang,
2246
+ flags | kCLDFlagSqueeze,
2247
+ plus_one,
2248
+ language3,
2249
+ percent3,
2250
+ normalized_score3,
2251
+ text_bytes,
2252
+ is_reliable);
2253
+ }
2254
+ }
2255
+
2256
+ // Remove repetitive words if asked to
2257
+ if (FlagRepeats(flags)) {
2258
+ // Remove repetitive words
2259
+ int newlen;
2260
+ newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
2261
+ &hash, predict_tbl);
2262
+ scriptspan.text_bytes = newlen;
2263
+ }
2264
+
2265
+ // The real scoring
2266
+ // Accumulate directly into the document total, or accmulate in one of four
2267
+ // chunk totals. The purpose of the multiple chunk totals is to piece
2268
+ // together short choppy pieces of text in alternating scripts. One total is
2269
+ // dedicated to Latin text, one to Han text, and the other two are dynamicly
2270
+ // assigned.
2271
+ Language onlylang = cld::kOnlyLanguagePerLScript[lscript];
2272
+
2273
+ if (onlylang != UNKNOWN_LANGUAGE) {
2274
+ // This entire script run is in a single language.
2275
+ ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote,
2276
+ lang_hint_boost, flags, plus_one);
2277
+ } else if (cld::kScoreUniPerLScript[lscript] != 0) {
2278
+ // This entire script run's languages can be distinguished by uni-grams
2279
+ // Accumulate in hani_tote
2280
+ int tote_num = 1;
2281
+ if (!tote_seen[tote_num]) {
2282
+ tote_seen[tote_num] = true;
2283
+ // Default language gets 1 byte
2284
+ total_text_bytes += 1;
2285
+ InitScriptToteLang(&totes[tote_num], lscript);
2286
+ }
2287
+ ScoreUnigrams(tables->unigram_obj,
2288
+ &scriptspan, &tote_grams[tote_num], chunksizeunis,
2289
+ &totes[tote_num],
2290
+ &doc_tote, lang_hint_boost,
2291
+ advance_by, flags, &initial_word_span, plus_one);
2292
+ } else {
2293
+ // This entire script-run's languages can be distinguished by quad-grams
2294
+ // Accumulate in latn_tote or script0/1_tote
2295
+ int tote_num = -1;
2296
+ for (int t = 0; t < 4; ++t) {
2297
+ if (lscript == tote_script[t]) {
2298
+ tote_num = t;
2299
+ break;
2300
+ }
2301
+ }
2302
+ if (tote_num < 0) {
2303
+ // Need to allocate other0/1
2304
+ tote_num = next_other_tote;
2305
+ next_other_tote ^= 1; // Round-robin
2306
+ if (tote_seen[tote_num]) {
2307
+ // Flush previous
2308
+ ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by,
2309
+ tote_script[tote_num], &totes[tote_num],
2310
+ &doc_tote, tote_grams[tote_num], lang_hint_boost);
2311
+ totes[tote_num].Reinit();
2312
+ }
2313
+ tote_script[tote_num] = lscript;
2314
+ }
2315
+
2316
+ if (!tote_seen[tote_num]) {
2317
+ tote_seen[tote_num] = true;
2318
+ // Default language gets 1 byte
2319
+ total_text_bytes += 1;
2320
+ InitScriptToteLang(&totes[tote_num], lscript);
2321
+ }
2322
+
2323
+ // The actual accumulation, possibly with word scoring also
2324
+ ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num],
2325
+ chunksizequads,
2326
+ &totes[tote_num],
2327
+ &doc_tote, lang_hint_boost,
2328
+ advance_by, flags, &initial_word_span, plus_one);
2329
+ }
2330
+
2331
+ total_text_bytes += scriptspan.text_bytes;
2332
+
2333
+ // For long documents, do less-dense samples the further along we go.
2334
+ // This is to keep speed sublinear in document size.
2335
+ if (total_text_bytes > advance_limit) {
2336
+ if (total_text_bytes > textlimit) {
2337
+ // Don't look at rest of doc
2338
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2339
+ fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>",
2340
+ total_text_bytes, textlimit);
2341
+ }
2342
+ break;
2343
+ }
2344
+ advance_by <<= 1; // Double advance bytes
2345
+ advance_limit <<= 1; // Double limit until next change
2346
+ spantooshortlimit <<= 1; // Double short-span size
2347
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2348
+ fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>",
2349
+ total_text_bytes, advance_by);
2350
+ }
2351
+ }
2352
+ } // End while (ss.GetOneScriptSpanLower())
2353
+
2354
+ // Deallocate full-document prediction table
2355
+ delete[] predict_tbl;
2356
+
2357
+ // Flush pending totals
2358
+ for (int tote_num = 0; tote_num < 4; ++tote_num) {
2359
+ if (tote_seen[tote_num]) {
2360
+ ScoreChunkIntoDoc2(kToteName[tote_num], advance_by,
2361
+ tote_script[tote_num], &totes[tote_num], &doc_tote,
2362
+ tote_grams[tote_num], lang_hint_boost);
2363
+ }
2364
+ }
2365
+
2366
+ // If extended languages are disallowed, remove them here
2367
+ if (!allow_extended_lang) {
2368
+ RemoveExtendedLanguages(&doc_tote);
2369
+ }
2370
+
2371
+ // Force close pairs to one or the other
2372
+ RefineScoredClosePairs(&doc_tote);
2373
+
2374
+
2375
+ // Calculate return results
2376
+ // Find top three byte counts in tote heap
2377
+ int reliable_percent3[3];
2378
+
2379
+
2380
+ // Cannot use Add, etc. after sorting
2381
+ doc_tote.Sort(3);
2382
+
2383
+ ExtractLangEtc(&doc_tote, total_text_bytes,
2384
+ reliable_percent3, language3, percent3, normalized_score3,
2385
+ text_bytes, is_reliable);
2386
+
2387
+ bool have_good_answer = false;
2388
+ if (FlagFinish(flags)) {
2389
+ // Force a result
2390
+ have_good_answer = true;
2391
+ } else if (total_text_bytes <= kShortTextThresh) {
2392
+ // Don't recurse on short text -- we already did word scores
2393
+ have_good_answer = true;
2394
+ } else if (*is_reliable &&
2395
+ (percent3[0] >= kGoodLang1Percent)) {
2396
+ have_good_answer = true;
2397
+ } else if (*is_reliable &&
2398
+ ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
2399
+ have_good_answer = true;
2400
+ }
2401
+
2402
+
2403
+ if (have_good_answer) {
2404
+ // This is the real, non-recursive return
2405
+
2406
+ // Move bytes for unreliable langs to another lang or
2407
+ // UNKNOWN
2408
+ RemoveUnreliableLanguages(&doc_tote, do_remove_weak_matches);
2409
+
2410
+ // Redo the result extraction after the removal above
2411
+ doc_tote.Sort(3);
2412
+
2413
+ ExtractLangEtc(&doc_tote, total_text_bytes,
2414
+ reliable_percent3, language3, percent3, normalized_score3,
2415
+ text_bytes, is_reliable);
2416
+
2417
+ #if 0
2418
+ // OLD code, replaced by CalcSummaryLang
2419
+ //
2420
+ // Suppress ignore-me text, TG_UNKNOWN_LANGUAGE if 2nd or 3rd language
2421
+ // Force it to English if first language
2422
+ if (language3[2] == TG_UNKNOWN_LANGUAGE) {
2423
+ reliable_percent3[2] = 0;
2424
+ language3[2] = UNKNOWN_LANGUAGE;
2425
+ percent3[2] = 0;
2426
+ } else if (language3[1] == TG_UNKNOWN_LANGUAGE) {
2427
+ // Move up lower language
2428
+ reliable_percent3[1] = reliable_percent3[2];
2429
+ language3[1] = language3[2];
2430
+ percent3[1] = percent3[2];
2431
+ reliable_percent3[2] = 0;
2432
+ language3[2] = UNKNOWN_LANGUAGE;
2433
+ percent3[2] = 0;
2434
+ } else if (language3[0] == TG_UNKNOWN_LANGUAGE) {
2435
+ language3[0] = ENGLISH;
2436
+ }
2437
+
2438
+ if (language3[0] == UNKNOWN_LANGUAGE) {
2439
+ // Last-ditch test for some result, but it is UNKNOWN_LANGUAGE
2440
+ // Force it to English (should not happen)
2441
+ language3[0] = ENGLISH;
2442
+ percent3[0] = 100;
2443
+ *is_reliable = true;
2444
+ }
2445
+ #endif
2446
+
2447
+
2448
+ #if 0
2449
+ // Scaffolding to reveal subset sequence lang distribution across doc text
2450
+ // Track the sequence of language fragments [result currently unused]
2451
+ if (FLAGS_cld_html) {
2452
+ static const int kMaxSubsetSeq = 12;
2453
+ uint8 subseq[kMaxSubsetSeq];
2454
+ doc_tote.ExtractSeq(kMaxSubsetSeq, subseq);
2455
+
2456
+ fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq);
2457
+ for (int i = 0; i < kMaxSubsetSeq; ++i) {
2458
+ fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i])));
2459
+ if ((i % 4) == 3) {fprintf(stderr, "&nbsp; ");}
2460
+ }
2461
+ fprintf(stderr, "&nbsp;&nbsp; ");
2462
+
2463
+ for (int i = 0; i < 3; ++i) {
2464
+ if (language3[i] != UNKNOWN_LANGUAGE) {
2465
+ fprintf(stderr, "%s.%d(%d%%) ",
2466
+ ExtLanguageCode(language3[i]),
2467
+ reliable_percent3[i],
2468
+ percent3[i]);
2469
+ }
2470
+ }
2471
+
2472
+ fprintf(stderr, "%d B ", total_text_bytes);
2473
+ fprintf(stderr, "<br>\n");
2474
+ }
2475
+ // End Scaffolding to reveal subset sequence lang distribution
2476
+ #endif
2477
+
2478
+ Language summary_lang;
2479
+ if (do_pick_summary_language) {
2480
+ CalcSummaryLang(&doc_tote, total_text_bytes,
2481
+ reliable_percent3, language3, percent3,
2482
+ &summary_lang, is_reliable);
2483
+ } else {
2484
+ summary_lang = language3[0];
2485
+ }
2486
+
2487
+ if (FLAGS_cld_html) {
2488
+ for (int i = 0; i < 3; ++i) {
2489
+ if (language3[i] != UNKNOWN_LANGUAGE) {
2490
+ fprintf(stderr, "%s.%d(%d%%) ",
2491
+ ExtLanguageCode(language3[i]),
2492
+ reliable_percent3[i],
2493
+ percent3[i]);
2494
+ }
2495
+ }
2496
+
2497
+ fprintf(stderr, "%d B ", total_text_bytes);
2498
+ fprintf(stderr, "= %s%c ",
2499
+ ExtLanguageName(summary_lang), is_reliable ? ' ' : '*');
2500
+ fprintf(stderr, "<br>\n");
2501
+ }
2502
+
2503
+ return summary_lang;
2504
+ }
2505
+
2506
+ // Not a good answer -- do recursive call to refine
2507
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2508
+ // This is what we hope to improve on in the recursive call, if any
2509
+ PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
2510
+ }
2511
+
2512
+ // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
2513
+ // For this purpose, we treate "Ignore" as top40
2514
+ Language new_plus_one = UNKNOWN_LANGUAGE;
2515
+ if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) {
2516
+ new_plus_one = language3[0];
2517
+ } else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) {
2518
+ new_plus_one = language3[1];
2519
+ }
2520
+
2521
+ if (total_text_bytes < kShortTextThresh) {
2522
+ // Short text: Recursive call with top40 and short set
2523
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2524
+ fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
2525
+ "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
2526
+ total_text_bytes);
2527
+ }
2528
+ return DetectLanguageSummaryV25(
2529
+ tables,
2530
+ buffer,
2531
+ buffer_length,
2532
+ is_plain_text,
2533
+ do_pick_summary_language,
2534
+ do_remove_weak_matches,
2535
+ tld_hint, // "id" boosts Indonesian
2536
+ encoding_hint, // SJS boosts Japanese
2537
+ language_hint, // ITALIAN boosts it
2538
+ allow_extended_lang,
2539
+ flags | kCLDFlagTop40 | kCLDFlagRepeats |
2540
+ kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
2541
+ new_plus_one,
2542
+ language3,
2543
+ percent3,
2544
+ normalized_score3,
2545
+ text_bytes,
2546
+ is_reliable);
2547
+ }
2548
+
2549
+ // Longer text: Recursive call with top40 set
2550
+ if (FLAGS_cld_html || FLAGS_dbgscore) {
2551
+ fprintf(stderr,
2552
+ "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2553
+ total_text_bytes);
2554
+ }
2555
+ return DetectLanguageSummaryV25(
2556
+ tables,
2557
+ buffer,
2558
+ buffer_length,
2559
+ is_plain_text,
2560
+ do_pick_summary_language,
2561
+ do_remove_weak_matches,
2562
+ tld_hint, // "id" boosts Indonesian
2563
+ encoding_hint, // SJS boosts Japanese
2564
+ language_hint, // ITALIAN boosts it
2565
+ allow_extended_lang,
2566
+ flags | kCLDFlagTop40 | kCLDFlagRepeats |
2567
+ kCLDFlagFinish,
2568
+ new_plus_one,
2569
+ language3,
2570
+ percent3,
2571
+ normalized_score3,
2572
+ text_bytes,
2573
+ is_reliable);
2574
+ } // End CompactLangDetImpl::DetectLanguageSummaryV25