language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,1205 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
7
+
8
+ #include <string>
9
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
10
+ #include "encodings/compact_lang_det/tote.h"
11
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
12
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
13
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
14
+
15
+ namespace cld {
16
+
17
+ // Hash bucket for four-way associative lookup, used with < 64K buckets.
18
+ // 32 bytes per bucket: four 8-byte entries (a 4-byte key plus a 4-byte value).
19
+ typedef struct {
20
+ uint32 key[4]; // hashed word to look up, one key per entry
21
+ uint32 value[4]; // three packed lang numbers and a probability subscript, per entry
22
+ } SmallWordProbBucket4;
23
+
24
+ // Hash bucket for four-way associative lookup, used with >= 64K buckets.
25
+ // 24 bytes per bucket: four 6-byte entries (a 2-byte key plus a 4-byte value).
26
+ typedef struct {
27
+ uint16 key[4]; // Half of the hashed word to look up; the other
28
+ // half is used to pick the bucket
29
+ uint32 value[4]; // three packed lang numbers and a probability subscript, per entry
30
+ } LargeQuadProbBucket4;
31
+
32
+ // Hash bucket for four-way associative lookup with indirect probabilities.
33
+ // 16 bytes per bucket: four 4-byte entries.
34
+ typedef struct {
35
+ uint32 keyvalue[4]; // Upper part of each word is the hash key; lower part is the indirect prob
36
+ } IndirectProbBucket4;
37
+
38
+
39
+ // This describes a complete CLD table, consisting of
40
+ // a main lookup table, an indirect language/probability table, and
41
+ // three constants.
42
+ // The main table key is a quadgram, bigram, or longword hash, with
43
+ // part of the key used to select a bucket modulo kCLDTableSize,
44
+ // and the rest matched against the key portion of four entries in a bucket,
45
+ // defined by kCLDTableKeyMask. The remaining bits of an entry, defined
46
+ // by ~kCLDTableKeyMask, are usually a subscript in the indirect table.
47
+ //
48
+ // By using part of the key to select a bucket, those key bits do not need
49
+ // to be stored in the main table entries, saving space (typically 2 bytes).
50
+ //
51
+ // By using an indirect table for lang/prob triples, only the subscript needs
52
+ // to be stored in the main table entries, saving space (typically 2 bytes).
53
+ //
54
+ // Each entry in the indirect table has three languages and three
55
+ // corresponding probabilities, packed into four bytes.
56
+ //
57
+ // The build date constant is included just for version tracking and is not
58
+ // otherwise used.
59
+ //
60
+ // Different-size tables can be linked in for different production
61
+ // environments. By going indirect through this struct, the runtime code is
62
+ // insensitive to the actual sizes.
63
+ //
64
+ // An empty placeholder table can be described by a table size of 1
65
+ // bucket, a keymask of 0xffffffff, a degenerate bucket of four no-match
66
+ // entries, and a degenerate indirect table of one no-languages entry.
67
+ //
68
+ //
69
+ struct CLDTableSummary {
70
+ const IndirectProbBucket4* kCLDTable; // Main lookup table of buckets.
71
+ // Each bucket has four entries, part
72
+ // key and part indirect subscript
73
+ const uint32* kCLDTableInd; // Indirect table; each entry is three packed lang/prob
74
+ const int kCLDTableSize; // Bucket count (the bucket is selected modulo this value)
75
+ const int kCLDTableIndSize; // Entries count (presumably of kCLDTableInd -- confirm)
76
+ const int kCLDTableKeyMask; // Mask for the key bits of an entry. NOTE(review): declared as signed int although a keymask of 0xffffffff is described for placeholder tables -- confirm
77
+ const int kCLDTableBuildDate; // yyyymmdd; version tracking only
78
+ };
79
+
80
+
81
+ // Per-character language probabilities, each 0-12, for CTJKVZ-- in that order:
82
+ // Chinese ChineseT Japanese Korean Vietnamese Zhuang
83
+ // (the last 2 bytes are unused, kept for alignment padding and future use)
84
+ typedef struct {
85
+ uint8 probs[8]; // [0..5] = C T J K V Z in that order; [6..7] unused
86
+ } UnigramProbArray;
87
+
88
+ // Map 8-bit subscript to CTJKVZ probabilities
89
+ // Target runtime probabilities for CTJK + VZ
90
+ // Hand-generated to cover a reasonable range of choices; 239 rows are listed below, so the last 3 of the 242 entries are zero-filled
91
+ static const int kTargetCTJKVZProbsSize = 242;
92
+ static const UnigramProbArray kTargetCTJKVZProbs[kTargetCTJKVZProbsSize] = {
93
+ {{0,0,0,0,0,0,0,0}},
94
+ {{0,0,0,0,0,12,0,0}},
95
+ {{0,0,0,0,12,0,0,0}},
96
+ {{0,0,0,12,0,0,0,0}},
97
+ {{0,0,12,0,0,0,0,0}},
98
+ {{0,12,0,0,0,0,0,0}},
99
+ {{12,0,0,0,0,0,0,0}},
100
+
101
+ {{8,0,0,0,4,0,0,0}},
102
+ {{8,0,0,4,0,0,0,0}},
103
+ {{8,0,4,0,0,0,0,0}},
104
+ {{8,4,0,0,0,0,0,0}},
105
+ {{8,2,0,2,0,0,0,0}},
106
+ {{0,0,0,0,0,8,0,0}},
107
+ {{0,4,8,0,0,0,0,0}},
108
+ {{4,0,0,0,0,8,0,0}},
109
+ {{0,0,8,0,0,0,0,0}},
110
+ {{8,2,2,0,0,0,0,0}},
111
+ {{0,8,4,0,0,0,0,0}},
112
+ {{8,0,0,0,0,4,0,0}},
113
+ {{0,8,2,0,0,0,0,0}},
114
+ {{4,8,0,0,0,0,0,0}},
115
+ {{2,8,0,2,0,0,0,0}},
116
+ {{2,2,8,0,0,0,0,0}},
117
+ {{0,8,0,0,0,0,0,0}},
118
+ {{0,2,8,0,0,0,0,0}},
119
+ {{2,8,2,0,0,0,0,0}},
120
+ {{8,0,0,0,0,0,0,0}},
121
+ {{2,8,0,0,0,0,0,0}},
122
+ {{8,2,0,0,0,0,0,0}},
123
+
124
+ {{0,6,2,0,2,0,0,0}},
125
+ {{2,0,0,0,6,0,0,0}},
126
+ {{4,0,0,0,6,0,0,0}},
127
+ {{4,6,0,0,4,0,0,0}},
128
+ {{4,6,2,0,2,0,0,0}},
129
+ {{4,6,4,0,2,0,0,0}},
130
+ {{5,4,6,0,0,0,0,0}},
131
+ {{6,0,0,0,4,0,0,0}},
132
+ {{6,0,2,0,4,0,0,0}},
133
+ {{6,0,4,0,4,0,0,0}},
134
+ {{6,2,0,0,4,0,0,0}},
135
+ {{6,2,2,0,4,0,0,0}},
136
+ {{6,2,4,0,2,0,0,0}},
137
+ {{6,4,0,0,2,0,0,0}},
138
+ {{6,4,2,0,2,0,0,0}},
139
+ {{0,0,6,2,0,0,0,0}},
140
+ {{0,6,2,0,0,2,0,0}},
141
+ {{2,2,2,0,0,6,0,0}},
142
+ {{2,2,6,4,0,0,0,0}},
143
+ {{2,4,0,0,0,6,0,0}},
144
+ {{2,6,0,4,0,0,0,0}},
145
+ {{2,6,2,4,0,0,0,0}},
146
+ {{2,6,4,4,0,0,0,0}},
147
+ {{4,0,2,0,0,6,0,0}},
148
+ {{4,2,6,2,0,0,0,0}},
149
+ {{4,4,2,0,0,6,0,0}},
150
+ {{4,6,4,0,0,2,0,0}},
151
+ {{6,0,2,0,0,2,0,0}},
152
+ {{6,2,0,0,0,2,0,0}},
153
+ {{6,2,2,0,0,4,0,0}},
154
+ {{6,2,4,0,0,2,0,0}},
155
+ {{4,6,2,0,0,4,0,0}},
156
+ {{6,4,2,0,0,4,0,0}},
157
+ {{2,0,0,0,0,6,0,0}},
158
+ {{6,2,0,2,0,0,0,0}},
159
+ {{2,2,0,0,0,6,0,0}},
160
+ {{6,2,6,0,0,0,0,0}},
161
+ {{6,4,2,0,0,2,0,0}},
162
+ {{6,4,2,2,0,0,0,0}},
163
+ {{4,6,4,2,0,0,0,0}},
164
+ {{6,0,2,0,0,4,0,0}},
165
+ {{6,0,4,0,0,2,0,0}},
166
+ {{6,0,6,0,0,0,0,0}},
167
+ {{6,2,2,0,0,0,0,0}},
168
+ {{6,4,0,0,0,2,0,0}},
169
+ {{6,4,5,0,0,0,0,0}},
170
+ {{0,6,0,2,0,0,0,0}},
171
+ {{0,6,2,2,0,0,0,0}},
172
+ {{2,6,0,2,0,0,0,0}},
173
+ {{2,6,2,2,0,0,0,0}},
174
+ {{4,2,0,0,0,6,0,0}},
175
+ {{6,4,0,0,0,4,0,0}},
176
+ {{6,4,0,2,0,0,0,0}},
177
+ {{6,6,0,2,0,0,0,0}},
178
+ {{6,0,4,0,0,4,0,0}},
179
+ {{6,2,0,0,0,4,0,0}},
180
+ {{6,6,2,2,0,0,0,0}},
181
+ {{4,6,0,0,0,2,0,0}},
182
+ {{2,6,6,0,0,0,0,0}},
183
+ {{4,5,6,0,0,0,0,0}},
184
+ {{4,6,0,2,0,0,0,0}},
185
+ {{6,2,0,0,0,6,0,0}},
186
+ {{0,6,4,2,0,0,0,0}},
187
+ {{4,0,6,0,0,0,0,0}},
188
+ {{2,6,4,2,0,0,0,0}},
189
+ {{4,6,0,0,0,4,0,0}},
190
+ {{6,2,2,0,0,0,0,0}},
191
+ {{4,6,2,2,0,0,0,0}},
192
+ {{4,6,5,0,0,0,0,0}},
193
+ {{6,0,2,0,0,0,0,0}},
194
+ {{6,4,4,0,0,0,0,0}},
195
+ {{4,2,6,0,0,0,0,0}},
196
+ {{2,0,6,0,0,0,0,0}},
197
+ {{4,4,0,0,0,6,0,0}},
198
+ {{4,4,6,0,0,0,0,0}},
199
+ {{4,6,2,0,0,2,0,0}},
200
+ {{2,2,6,0,0,0,0,0}},
201
+ {{2,4,6,0,0,0,0,0}},
202
+ {{0,6,6,0,0,0,0,0}},
203
+ {{6,2,4,0,0,0,0,0}},
204
+ {{0,4,6,0,0,0,0,0}},
205
+ {{4,0,0,0,0,6,0,0}},
206
+ {{4,6,4,0,0,0,0,0}},
207
+ {{6,0,0,0,0,6,0,0}},
208
+ {{6,0,0,0,0,2,0,0}},
209
+ {{6,0,4,0,0,0,0,0}},
210
+ {{6,5,4,0,0,0,0,0}},
211
+ {{0,2,6,0,0,0,0,0}},
212
+ {{0,0,6,0,0,0,0,0}},
213
+ {{6,6,2,0,0,0,0,0}},
214
+ {{2,6,4,0,0,0,0,0}},
215
+ {{6,4,2,0,0,0,0,0}},
216
+ {{2,6,2,0,0,0,0,0}},
217
+ {{2,6,0,0,0,0,0,0}},
218
+ {{6,0,0,0,0,4,0,0}},
219
+ {{6,4,0,0,0,0,0,0}},
220
+ {{6,6,0,0,0,0,0,0}},
221
+ {{5,6,4,0,0,0,0,0}},
222
+ {{0,6,0,0,0,0,0,0}},
223
+ {{6,2,0,0,0,0,0,0}},
224
+ {{0,6,2,0,0,0,0,0}},
225
+ {{4,6,2,0,0,0,0,0}},
226
+ {{0,6,4,0,0,0,0,0}},
227
+ {{4,6,0,0,0,0,0,0}},
228
+ {{6,0,0,0,0,0,0,0}},
229
+ {{6,6,5,0,0,0,0,0}},
230
+ {{6,5,6,0,0,0,0,0}},
231
+ {{5,6,6,0,0,0,0,0}},
232
+ {{5,5,6,0,0,0,0,0}},
233
+ {{5,6,5,0,0,0,0,0}},
234
+ {{6,5,5,0,0,0,0,0}},
235
+ {{6,6,6,0,0,0,0,0}},
236
+ {{6,5,0,0,0,0,0,0}},
237
+ {{6,0,5,0,0,0,0,0}},
238
+ {{0,6,5,0,0,0,0,0}},
239
+ {{5,6,0,0,0,0,0,0}},
240
+ {{5,0,6,0,0,0,0,0}},
241
+ {{0,5,6,0,0,0,0,0}},
242
+
243
+ {{0,0,0,0,4,0,0,0}},
244
+ {{0,0,0,4,0,0,0,0}},
245
+ {{2,2,0,0,4,0,0,0}},
246
+ {{2,2,2,0,4,0,0,0}},
247
+ {{2,4,0,0,2,0,0,0}},
248
+ {{2,4,2,0,2,0,0,0}},
249
+ {{2,4,4,0,2,0,0,0}},
250
+ {{4,0,2,0,4,0,0,0}},
251
+ {{4,0,4,0,2,0,0,0}},
252
+ {{4,2,0,0,2,0,0,0}},
253
+ {{4,2,2,0,2,0,0,0}},
254
+ {{4,4,0,0,2,0,0,0}},
255
+ {{4,4,2,0,2,0,0,0}},
256
+ {{4,4,4,0,2,0,0,0}},
257
+ {{0,2,2,4,0,0,0,0}},
258
+ {{2,2,4,2,0,0,0,0}},
259
+ {{2,4,4,0,0,2,0,0}},
260
+ {{2,4,4,2,0,0,0,0}},
261
+ {{4,0,4,0,0,2,0,0}},
262
+ {{4,0,4,0,0,4,0,0}},
263
+ {{4,2,2,4,0,0,0,0}},
264
+ {{4,4,0,2,0,0,0,0}},
265
+ {{2,2,0,4,0,0,0,0}},
266
+ {{2,4,2,2,0,0,0,0}},
267
+ {{4,4,2,2,0,0,0,0}},
268
+ {{4,0,4,0,0,0,0,0}},
269
+ {{4,4,4,0,0,4,0,0}},
270
+ {{0,4,0,2,0,0,0,0}},
271
+ {{0,4,2,2,0,0,0,0}},
272
+ {{4,0,2,0,0,2,0,0}},
273
+ {{4,2,0,0,0,4,0,0}},
274
+ {{2,2,2,0,0,4,0,0}},
275
+ {{4,0,0,2,0,0,0,0}},
276
+ {{4,4,4,0,0,2,0,0}},
277
+ {{4,0,0,0,0,4,0,0}},
278
+ {{4,0,2,0,0,4,0,0}},
279
+ {{4,2,0,0,0,2,0,0}},
280
+ {{4,2,2,0,0,2,0,0}},
281
+ {{2,4,0,2,0,0,0,0}},
282
+ {{2,2,0,0,0,4,0,0}},
283
+ {{2,4,0,0,0,4,0,0}},
284
+ {{2,4,2,0,0,4,0,0}},
285
+ {{4,2,4,0,0,0,0,0}},
286
+ {{2,0,4,0,0,0,0,0}},
287
+ {{4,0,2,0,0,0,0,0}},
288
+ {{4,4,0,0,0,4,0,0}},
289
+ {{4,4,2,0,0,4,0,0}},
290
+ {{0,4,4,0,0,0,0,0}},
291
+ {{4,4,0,0,0,2,0,0}},
292
+ {{2,4,0,0,0,2,0,0}},
293
+ {{2,2,4,0,0,0,0,0}},
294
+ {{0,2,4,0,0,0,0,0}},
295
+ {{4,2,2,0,0,0,0,0}},
296
+ {{2,4,2,0,0,2,0,0}},
297
+ {{4,4,4,0,0,0,0,0}},
298
+ {{2,4,4,0,0,0,0,0}},
299
+ {{0,0,4,0,0,0,0,0}},
300
+ {{0,4,2,0,0,0,0,0}},
301
+ {{4,4,2,0,0,2,0,0}},
302
+ {{2,4,2,0,0,0,0,0}},
303
+ {{4,2,0,0,0,0,0,0}},
304
+ {{4,4,0,0,0,0,0,0}},
305
+ {{4,4,2,0,0,0,0,0}},
306
+ {{2,4,0,0,0,0,0,0}},
307
+ {{0,4,0,0,0,0,0,0}},
308
+ {{4,0,0,0,0,0,0,0}},
309
+ {{0,0,0,4,4,0,0,0}},
310
+ {{0,0,4,0,4,0,0,0}},
311
+ {{0,0,4,4,0,0,0,0}},
312
+ {{0,4,0,0,4,0,0,0}},
313
+ {{0,4,0,4,0,0,0,0}},
314
+ {{4,0,0,0,4,0,0,0}},
315
+ {{4,0,0,4,0,0,0,0}},
316
+
317
+ {{2,0,0,0,0,0,0,0}},
318
+ {{0,2,0,0,0,0,0,0}},
319
+ {{0,2,0,2,2,0,0,0}},
320
+ {{0,2,2,0,2,0,0,0}},
321
+ {{2,0,0,2,2,0,0,0}},
322
+ {{2,0,2,0,2,0,0,0}},
323
+ {{2,0,2,2,0,0,0,0}},
324
+ {{2,2,0,0,2,0,0,0}},
325
+ {{2,2,2,2,0,0,0,0}},
326
+ {{2,2,0,2,0,0,0,0}},
327
+ {{2,2,0,0,0,0,0,0}},
328
+ {{0,0,2,0,0,0,0,0}},
329
+ {{0,2,2,0,0,0,0,0}},
330
+ {{2,2,2,0,0,0,0,0}},
331
+ {{0,0,0,2,0,0,0,0}},
332
+ {{2,0,2,0,0,0,0,0}},
333
+ {{0,2,0,2,0,0,0,0}},
334
+ {{0,0,2,2,0,0,0,0}},
335
+ {{0,2,2,2,0,0,0,0}},
336
+ };
337
+
338
+
339
+
340
+
341
+ // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
342
+ static const uint8 kSkipSpaceVowelContinue[256] = {
343
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
344
+ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
345
+ 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
346
+ 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
347
+
348
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
349
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
350
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
351
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
352
+ };
353
+
354
+ // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
355
+ static const uint8 kSkipSpaceContinue[256] = {
356
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
357
+ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
358
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
359
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
360
+
361
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
362
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
363
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
364
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
365
+ };
366
+
367
+
368
+ // Subscripted by ULScript number. If the entry != UNKNOWN_LANGUAGE, use nilgrams to determine language of this script
369
+ static const Language kOnlyLanguagePerLScript[] = {
370
+ ENGLISH, // ULScript_Common, [no words should be in this script]
371
+ UNKNOWN_LANGUAGE, // ULScript_Latin,
372
+ //UNKNOWN_LANGUAGE, // ULScript_Greek, Jan 2009: change so we can score quads
373
+ GREEK, // ULScript_Greek, Mar 2009: change back; do gibberish separately
374
+ UNKNOWN_LANGUAGE, // ULScript_Cyrillic,
375
+ ARMENIAN, // ULScript_Armenian,
376
+ UNKNOWN_LANGUAGE, // ULScript_Hebrew,
377
+ UNKNOWN_LANGUAGE, // ULScript_Arabic,
378
+ SYRIAC, // ULScript_Syriac,
379
+ DHIVEHI, // ULScript_Thaana,
380
+ UNKNOWN_LANGUAGE, // ULScript_Devanagari,
381
+ UNKNOWN_LANGUAGE, // ULScript_Bengali,
382
+ PUNJABI, // ULScript_Gurmukhi,
383
+ GUJARATI, // ULScript_Gujarati,
384
+ ORIYA, // ULScript_Oriya,
385
+ TAMIL, // ULScript_Tamil,
386
+ TELUGU, // ULScript_Telugu,
387
+ KANNADA, // ULScript_Kannada,
388
+ MALAYALAM, // ULScript_Malayalam,
389
+ SINHALESE, // ULScript_Sinhala,
390
+ THAI, // ULScript_Thai,
391
+ LAOTHIAN, // ULScript_Lao,
392
+ UNKNOWN_LANGUAGE, // ULScript_Tibetan,
393
+ BURMESE, // ULScript_Myanmar,
394
+ GEORGIAN, // ULScript_Georgian,
395
+ UNKNOWN_LANGUAGE, // ULScript_HanCJK,
396
+ UNKNOWN_LANGUAGE, // ULScript_Ethiopic,
397
+ CHEROKEE, // ULScript_Cherokee,
398
+ INUKTITUT, // ULScript_Canadian_Aboriginal,
399
+ X_OGHAM, // ULScript_Ogham,
400
+ X_RUNIC, // ULScript_Runic,
401
+ KHMER, // ULScript_Khmer,
402
+ MONGOLIAN, // ULScript_Mongolian,
403
+ X_YI, // ULScript_Yi,
404
+ X_OLD_ITALIC, // ULScript_Old_Italic,
405
+ X_GOTHIC, // ULScript_Gothic,
406
+ X_DESERET, // ULScript_Deseret,
407
+ ENGLISH, // ULScript_Inherited, [no words should be in this script]
408
+ TAGALOG, // ULScript_Tagalog,
409
+ X_HANUNOO, // ULScript_Hanunoo,
410
+ X_BUHID, // ULScript_Buhid,
411
+ X_TAGBANWA, // ULScript_Tagbanwa,
412
+ LIMBU, // ULScript_Limbu,
413
+ X_TAI_LE, // ULScript_Tai_Le,
414
+ X_LINEAR_B, // ULScript_Linear_B,
415
+ X_UGARITIC, // ULScript_Ugaritic,
416
+ X_SHAVIAN, // ULScript_Shavian,
417
+ X_OSMANYA, // ULScript_Osmanya,
418
+ X_CYPRIOT, // ULScript_Cypriot,
419
+ X_BUGINESE, // ULScript_Buginese,
420
+ X_COPTIC, // ULScript_Coptic,
421
+ X_NEW_TAI_LUE, // ULScript_New_Tai_Lue,
422
+ X_GLAGOLITIC, // ULScript_Glagolitic,
423
+ X_TIFINAGH, // ULScript_Tifinagh,
424
+ X_SYLOTI_NAGRI, // ULScript_Syloti_Nagri,
425
+ X_OLD_PERSIAN, // ULScript_Old_Persian,
426
+ X_KHAROSHTHI, // ULScript_Kharoshthi,
427
+ X_BALINESE, // ULScript_Balinese,
428
+ X_CUNEIFORM, // ULScript_Cuneiform,
429
+ X_PHOENICIAN, // ULScript_Phoenician,
430
+ X_PHAGS_PA, // ULScript_Phags_Pa,
431
+ X_NKO, // ULScript_Nko,
432
+
433
+ // Unicode 5.1
434
+ X_SUDANESE, // ULScript_Sundanese,
435
+ X_LEPCHA, // ULScript_Lepcha,
436
+ X_OL_CHIKI, // ULScript_Ol_Chiki,
437
+ X_VAI, // ULScript_Vai,
438
+ X_SAURASHTRA, // ULScript_Saurashtra,
439
+ X_KAYAH_LI, // ULScript_Kayah_Li,
440
+ X_REJANG, // ULScript_Rejang,
441
+ X_LYCIAN, // ULScript_Lycian,
442
+ X_CARIAN, // ULScript_Carian,
443
+ X_LYDIAN, // ULScript_Lydian,
444
+ X_CHAM, // ULScript_Cham,
445
+ };
446
+
447
+ COMPILE_ASSERT(arraysize(kOnlyLanguagePerLScript) == ULScript_NUM_SCRIPTS,
448
+ kOnlyLanguagePerLScript_has_incorrect_length);
449
+
450
+
451
+ // This is, in a sense, the complement of the table above
452
+ // If != UNKNOWN, determines a default language of this script
453
+ static const Language kDefaultLanguagePerLScript[] = {
454
+ UNKNOWN_LANGUAGE, // ULScript_Common, [no words should be in this script]
455
+ ENGLISH, // ULScript_Latin,
456
+ UNKNOWN_LANGUAGE, // ULScript_Greek,
457
+ RUSSIAN, // ULScript_Cyrillic,
458
+ UNKNOWN_LANGUAGE, // ULScript_Armenian,
459
+ HEBREW, // ULScript_Hebrew,
460
+ ARABIC, // ULScript_Arabic,
461
+ UNKNOWN_LANGUAGE, // ULScript_Syriac,
462
+ UNKNOWN_LANGUAGE, // ULScript_Thaana,
463
+ HINDI, // ULScript_Devanagari,
464
+ BENGALI, // ULScript_Bengali,
465
+ UNKNOWN_LANGUAGE, // ULScript_Gurmukhi,
466
+ UNKNOWN_LANGUAGE, // ULScript_Gujarati,
467
+ UNKNOWN_LANGUAGE, // ULScript_Oriya,
468
+ UNKNOWN_LANGUAGE, // ULScript_Tamil,
469
+ UNKNOWN_LANGUAGE, // ULScript_Telugu,
470
+ UNKNOWN_LANGUAGE, // ULScript_Kannada,
471
+ UNKNOWN_LANGUAGE, // ULScript_Malayalam,
472
+ UNKNOWN_LANGUAGE, // ULScript_Sinhala,
473
+ UNKNOWN_LANGUAGE, // ULScript_Thai,
474
+ UNKNOWN_LANGUAGE, // ULScript_Lao,
475
+ TIBETAN, // ULScript_Tibetan,
476
+ UNKNOWN_LANGUAGE, // ULScript_Myanmar,
477
+ UNKNOWN_LANGUAGE, // ULScript_Georgian,
478
+ CHINESE, // ULScript_HanCJK,
479
+ AMHARIC, // ULScript_Ethiopic,
480
+ UNKNOWN_LANGUAGE, // ULScript_Cherokee,
481
+ UNKNOWN_LANGUAGE, // ULScript_Canadian_Aboriginal,
482
+ UNKNOWN_LANGUAGE, // ULScript_Ogham,
483
+ UNKNOWN_LANGUAGE, // ULScript_Runic,
484
+ UNKNOWN_LANGUAGE, // ULScript_Khmer,
485
+ UNKNOWN_LANGUAGE, // ULScript_Mongolian,
486
+ UNKNOWN_LANGUAGE, // ULScript_Yi,
487
+ UNKNOWN_LANGUAGE, // ULScript_Old_Italic,
488
+ UNKNOWN_LANGUAGE, // ULScript_Gothic,
489
+ UNKNOWN_LANGUAGE, // ULScript_Deseret,
490
+ UNKNOWN_LANGUAGE, // ULScript_Inherited, [no words should be in this script]
491
+ UNKNOWN_LANGUAGE, // ULScript_Tagalog,
492
+ UNKNOWN_LANGUAGE, // ULScript_Hanunoo,
493
+ UNKNOWN_LANGUAGE, // ULScript_Buhid,
494
+ UNKNOWN_LANGUAGE, // ULScript_Tagbanwa,
495
+ UNKNOWN_LANGUAGE, // ULScript_Limbu,
496
+ UNKNOWN_LANGUAGE, // ULScript_Tai_Le,
497
+ UNKNOWN_LANGUAGE, // ULScript_Linear_B,
498
+ UNKNOWN_LANGUAGE, // ULScript_Ugaritic,
499
+ UNKNOWN_LANGUAGE, // ULScript_Shavian,
500
+ UNKNOWN_LANGUAGE, // ULScript_Osmanya,
501
+ UNKNOWN_LANGUAGE, // ULScript_Cypriot,
502
+ UNKNOWN_LANGUAGE, // ULScript_Buginese,
503
+ UNKNOWN_LANGUAGE, // ULScript_Coptic,
504
+ UNKNOWN_LANGUAGE, // ULScript_New_Tai_Lue,
505
+ UNKNOWN_LANGUAGE, // ULScript_Glagolitic,
506
+ UNKNOWN_LANGUAGE, // ULScript_Tifinagh,
507
+ UNKNOWN_LANGUAGE, // ULScript_Syloti_Nagri,
508
+ UNKNOWN_LANGUAGE, // ULScript_Old_Persian,
509
+ UNKNOWN_LANGUAGE, // ULScript_Kharoshthi,
510
+ UNKNOWN_LANGUAGE, // ULScript_Balinese,
511
+ UNKNOWN_LANGUAGE, // ULScript_Cuneiform,
512
+ UNKNOWN_LANGUAGE, // ULScript_Phoenician,
513
+ UNKNOWN_LANGUAGE, // ULScript_Phags_Pa,
514
+ UNKNOWN_LANGUAGE, // ULScript_Nko,
515
+
516
+ // Unicode 5.1
517
+ UNKNOWN_LANGUAGE, // ULScript_Sundanese,
518
+ UNKNOWN_LANGUAGE, // ULScript_Lepcha,
519
+ UNKNOWN_LANGUAGE, // ULScript_Ol_Chiki,
520
+ UNKNOWN_LANGUAGE, // ULScript_Vai,
521
+ UNKNOWN_LANGUAGE, // ULScript_Saurashtra,
522
+ UNKNOWN_LANGUAGE, // ULScript_Kayah_Li,
523
+ UNKNOWN_LANGUAGE, // ULScript_Rejang,
524
+ UNKNOWN_LANGUAGE, // ULScript_Lycian,
525
+ UNKNOWN_LANGUAGE, // ULScript_Carian,
526
+ UNKNOWN_LANGUAGE, // ULScript_Lydian,
527
+ UNKNOWN_LANGUAGE, // ULScript_Cham,
528
+ };
529
+
530
+ COMPILE_ASSERT(arraysize(kDefaultLanguagePerLScript) == ULScript_NUM_SCRIPTS,
531
+ kDefaultLanguagePerLScript_has_incorrect_length);
532
+
533
+
534
+ // True for standalone languages (only lang in a script)
535
+ // Subscripted by packed language number
536
+ // If 1, we will use nilgrams to determine language
537
+ static const uint8 kIsStandaloneLang[EXT_NUM_LANGUAGES + 1] = {
538
+ 0,
539
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,0, // GREEK
540
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
541
+ 0,1,0,0,1, 0,1,0,0,0, 0,0,1,1,0, 0,0,0,0,1, // MALAYALAM..KANNADA
542
+ 1,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,1, // PUNJABI..SINHALESE
543
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,1,1,0, // ARMENIAN..LAOTHIAN
544
+
545
+ 0,0,0,0,1, 0,1,1,1,0, 1,0,0,0,0, 0,0,0,0,0, // KHMER..ORIYA
546
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
547
+ 0,1,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, // INUKTITUT
548
+
549
+ 0,0,0,0,0, // [160..164]
550
+ // Add new language standalone bit just before here
551
+ 0,0,0,0,0, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
552
+ 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
553
+
554
+ 1,1,1,1,
555
+ };
556
+
557
+ // True for the CTJKVZ languages that go with ULScript_HanCJK
558
+ // (Vietnamese and Zhuang also have Latin script quadgrams)
559
+ // Subscripted by packed language number
560
+ static const uint8 kIsUnigramLang[EXT_NUM_LANGUAGES + 1] = {
561
+ 0,
562
+ 0,0,0,0,0, 0,0,0,1,1, 0,0,0,0,0, 0,1,0,0,0, // JAPANESE KOREAN CHINESE
563
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
564
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
565
+ 0,0,0,0,0, 0,1,0,0,1, 0,0,0,0,0, 0,0,0,0,0, // VIETNAMESE CHINESE_T
566
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
567
+
568
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
569
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
570
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,0, // ZHUANG
571
+
572
+ 0,0,0,0,0, // [160..164]
573
+ // Add new language unigram bit just before here
574
+
575
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
576
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
577
+
578
+ 0,0,0,0,
579
+ };
580
+
581
+
582
+ // True for ULScript_HanCJK
583
+ // Subscripted by lscript number
584
+ static const uint8 kScoreUniPerLScript[] = {
585
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,0,0,0,0,0,0,0,
586
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
587
+ 0,0,0,0,0,0,0,0,
588
+ };
589
+
590
+ COMPILE_ASSERT(arraysize(kScoreUniPerLScript) == ULScript_NUM_SCRIPTS,
591
+ kScoreUniPerLScript_has_incorrect_length);
592
+
593
+
594
+ // Defines Top40 packed languages
595
+
596
+ // Tier 0/1 Language enum list (16)
597
+ // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS
598
+ // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
599
+ // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
600
+ // ARABIC,
601
+ //
602
+ // Tier 2 Language enum list (22)
603
+ // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
604
+ // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
605
+ // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
606
+ // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
607
+ // UKRAINIAN, HINDI,
608
+ //
609
+ // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
610
+ //
611
+ // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
612
+
613
+ // NOTE: packed, i.e. Language enum + 1
614
+ static const uint8 kIsPackedTop40[EXT_NUM_LANGUAGES + 1] = {
615
+ 0,
616
+ 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,0,
617
+ 1,1,1,1,0, 1,0,1,0,0, 0,0,1,1,1, 1,0,0,1,0,
618
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,1, 1,0,0,0,0,
619
+ 0,0,0,1,0, 0,1,0,1,1, 0,0,0,0,0, 0,0,0,0,0,
620
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,1,0,0, 0,0,0,0,0,
621
+
622
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
623
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
624
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
625
+
626
+ 0,0,0,0,0, // [160..164]
627
+ // Add new language top40 bit just before here
628
+
629
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
630
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
631
+
632
+ 0,0,0,0,
633
+ };
634
+
635
+
636
+
637
+ // Table has 234 eight-byte entries. Each entry has a five-byte array and
638
+ // a three-byte array of log base 2 probabilities, nominally in the range 0..11 (NOTE: the rows visible below hold values 1..12 -- confirm the intended offset).
639
+ // The intended use is to express five or three probabilities in a single-byte
640
+ // subscript, then decode via this table. These probabilities are
641
+ // intended to go with an array of five or three language numbers.
642
+ //
643
+ // The corresponding language numbers will have to be sorted by descending
644
+ // probability, then the actual probability subscript chosen to match the
645
+ // closest available entry in this table.
646
+ //
647
+ // Pattern of probability values:
648
+ // hi 3/4 1/2 1/4 lo hi mid lo
649
+ // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 and
650
+ // mid is one of 3/4 1/2 or 1/4.
651
+ // There are three groups of 78 (=12*13/2) entries, with hi running 0..11 and
652
+ // lo running 0..hi. Only the first group is used for five-entry lookups.
653
+ // The mid value in the first group is 1/2, the second group 3/4, and the
654
+ // third group 1/4. For three-entry lookups, this allows the mid entry to be
655
+ // somewhat higher or lower than the midpoint, to allow a better match to the
656
+ // original probabilities.
657
+ static const int kLgProbV2TblSize = 234;
658
+ static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
659
+ 1,1,1,1,1, 1,1,1, // [0]
660
+ 2,2,2,1,1, 2,2,1, // [1]
661
+ 2,2,2,2,2, 2,2,2,
662
+ 3,3,2,2,1, 3,2,1, // [3]
663
+ 3,3,3,2,2, 3,3,2,
664
+ 3,3,3,3,3, 3,3,3,
665
+ 4,3,3,2,1, 4,3,1, // [6]
666
+ 4,4,3,3,2, 4,3,2,
667
+ 4,4,4,3,3, 4,4,3,
668
+ 4,4,4,4,4, 4,4,4,
669
+ 5,4,3,2,1, 5,3,1, // [10]
670
+ 5,4,4,3,2, 5,4,2,
671
+ 5,5,4,4,3, 5,4,3,
672
+ 5,5,5,4,4, 5,5,4,
673
+ 5,5,5,5,5, 5,5,5,
674
+ 6,5,4,2,1, 6,4,1, // [15]
675
+ 6,5,4,3,2, 6,4,2,
676
+ 6,5,5,4,3, 6,5,3,
677
+ 6,6,5,5,4, 6,5,4,
678
+ 6,6,6,5,5, 6,6,5,
679
+ 6,6,6,6,6, 6,6,6,
680
+ 7,6,4,3,1, 7,4,1, // [21]
681
+ 7,6,5,3,2, 7,5,2,
682
+ 7,6,5,4,3, 7,5,3,
683
+ 7,6,6,5,4, 7,6,4,
684
+ 7,7,6,6,5, 7,6,5,
685
+ 7,7,7,6,6, 7,7,6,
686
+ 7,7,7,7,7, 7,7,7,
687
+ 8,6,5,3,1, 8,5,1, // [28]
688
+ 8,7,5,4,2, 8,5,2,
689
+ 8,7,6,4,3, 8,6,3,
690
+ 8,7,6,5,4, 8,6,4,
691
+ 8,7,7,6,5, 8,7,5,
692
+ 8,8,7,7,6, 8,7,6,
693
+ 8,8,8,7,7, 8,8,7,
694
+ 8,8,8,8,8, 8,8,8,
695
+ 9,7,5,3,1, 9,5,1, // [36]
696
+ 9,7,6,4,2, 9,6,2,
697
+ 9,8,6,5,3, 9,6,3,
698
+ 9,8,7,5,4, 9,7,4,
699
+ 9,8,7,6,5, 9,7,5,
700
+ 9,8,8,7,6, 9,8,6,
701
+ 9,9,8,8,7, 9,8,7,
702
+ 9,9,9,8,8, 9,9,8,
703
+ 9,9,9,9,9, 9,9,9,
704
+ 10,8,6,3,1, 10,6,1, // [45]
705
+ 10,8,6,4,2, 10,6,2,
706
+ 10,8,7,5,3, 10,7,3,
707
+ 10,9,7,6,4, 10,7,4,
708
+ 10,9,8,6,5, 10,8,5,
709
+ 10,9,8,7,6, 10,8,6,
710
+ 10,9,9,8,7, 10,9,7,
711
+ 10,10,9,9,8, 10,9,8,
712
+ 10,10,10,9,9, 10,10,9,
713
+ 10,10,10,10,10, 10,10,10,
714
+ 11,9,6,4,1, 11,6,1, // [55]
715
+ 11,9,7,4,2, 11,7,2,
716
+ 11,9,7,5,3, 11,7,3,
717
+ 11,9,8,6,4, 11,8,4,
718
+ 11,10,8,7,5, 11,8,5,
719
+ 11,10,9,7,6, 11,9,6,
720
+ 11,10,9,8,7, 11,9,7,
721
+ 11,10,10,9,8, 11,10,8,
722
+ 11,11,10,10,9, 11,10,9,
723
+ 11,11,11,10,10, 11,11,10,
724
+ 11,11,11,11,11, 11,11,11,
725
+ 12,9,7,4,1, 12,7,1, // [66]
726
+ 12,10,7,5,2, 12,7,2,
727
+ 12,10,8,5,3, 12,8,3,
728
+ 12,10,8,6,4, 12,8,4,
729
+ 12,10,9,7,5, 12,9,5,
730
+ 12,11,9,8,6, 12,9,6,
731
+ 12,11,10,8,7, 12,10,7,
732
+ 12,11,10,9,8, 12,10,8,
733
+ 12,11,11,10,9, 12,11,9,
734
+ 12,12,11,11,10, 12,11,10,
735
+ 12,12,12,11,11, 12,12,11,
736
+ 12,12,12,12,12, 12,12,12,
737
+
738
+ 1,1,1,1,1, 1,1,1,
739
+ 2,2,2,1,1, 2,2,1,
740
+ 2,2,2,2,2, 2,2,2,
741
+ 3,3,2,2,1, 3,3,1,
742
+ 3,3,3,2,2, 3,3,2,
743
+ 3,3,3,3,3, 3,3,3,
744
+ 4,3,3,2,1, 4,3,1,
745
+ 4,4,3,3,2, 4,4,2,
746
+ 4,4,4,3,3, 4,4,3,
747
+ 4,4,4,4,4, 4,4,4,
748
+ 5,4,3,2,1, 5,4,1,
749
+ 5,4,4,3,2, 5,4,2,
750
+ 5,5,4,4,3, 5,5,3,
751
+ 5,5,5,4,4, 5,5,4,
752
+ 5,5,5,5,5, 5,5,5,
753
+ 6,5,4,2,1, 6,5,1,
754
+ 6,5,4,3,2, 6,5,2,
755
+ 6,5,5,4,3, 6,5,3,
756
+ 6,6,5,5,4, 6,6,4,
757
+ 6,6,6,5,5, 6,6,5,
758
+ 6,6,6,6,6, 6,6,6,
759
+ 7,6,4,3,1, 7,6,1,
760
+ 7,6,5,3,2, 7,6,2,
761
+ 7,6,5,4,3, 7,6,3,
762
+ 7,6,6,5,4, 7,6,4,
763
+ 7,7,6,6,5, 7,7,5,
764
+ 7,7,7,6,6, 7,7,6,
765
+ 7,7,7,7,7, 7,7,7,
766
+ 8,6,5,3,1, 8,6,1,
767
+ 8,7,5,4,2, 8,7,2,
768
+ 8,7,6,4,3, 8,7,3,
769
+ 8,7,6,5,4, 8,7,4,
770
+ 8,7,7,6,5, 8,7,5,
771
+ 8,8,7,7,6, 8,8,6,
772
+ 8,8,8,7,7, 8,8,7,
773
+ 8,8,8,8,8, 8,8,8,
774
+ 9,7,5,3,1, 9,7,1,
775
+ 9,7,6,4,2, 9,7,2,
776
+ 9,8,6,5,3, 9,8,3,
777
+ 9,8,7,5,4, 9,8,4,
778
+ 9,8,7,6,5, 9,8,5,
779
+ 9,8,8,7,6, 9,8,6,
780
+ 9,9,8,8,7, 9,9,7,
781
+ 9,9,9,8,8, 9,9,8,
782
+ 9,9,9,9,9, 9,9,9,
783
+ 10,8,6,3,1, 10,8,1,
784
+ 10,8,6,4,2, 10,8,2,
785
+ 10,8,7,5,3, 10,8,3,
786
+ 10,9,7,6,4, 10,9,4,
787
+ 10,9,8,6,5, 10,9,5,
788
+ 10,9,8,7,6, 10,9,6,
789
+ 10,9,9,8,7, 10,9,7,
790
+ 10,10,9,9,8, 10,10,8,
791
+ 10,10,10,9,9, 10,10,9,
792
+ 10,10,10,10,10, 10,10,10,
793
+ 11,9,6,4,1, 11,9,1,
794
+ 11,9,7,4,2, 11,9,2,
795
+ 11,9,7,5,3, 11,9,3,
796
+ 11,9,8,6,4, 11,9,4,
797
+ 11,10,8,7,5, 11,10,5,
798
+ 11,10,9,7,6, 11,10,6,
799
+ 11,10,9,8,7, 11,10,7,
800
+ 11,10,10,9,8, 11,10,8,
801
+ 11,11,10,10,9, 11,11,9,
802
+ 11,11,11,10,10, 11,11,10,
803
+ 11,11,11,11,11, 11,11,11,
804
+ 12,9,7,4,1, 12,9,1,
805
+ 12,10,7,5,2, 12,10,2,
806
+ 12,10,8,5,3, 12,10,3,
807
+ 12,10,8,6,4, 12,10,4,
808
+ 12,10,9,7,5, 12,10,5,
809
+ 12,11,9,8,6, 12,11,6,
810
+ 12,11,10,8,7, 12,11,7,
811
+ 12,11,10,9,8, 12,11,8,
812
+ 12,11,11,10,9, 12,11,9,
813
+ 12,12,11,11,10, 12,12,10,
814
+ 12,12,12,11,11, 12,12,11,
815
+ 12,12,12,12,12, 12,12,12,
816
+
817
+ 1,1,1,1,1, 1,1,1,
818
+ 2,2,2,1,1, 2,1,1,
819
+ 2,2,2,2,2, 2,2,2,
820
+ 3,3,2,2,1, 3,2,1,
821
+ 3,3,3,2,2, 3,2,2,
822
+ 3,3,3,3,3, 3,3,3,
823
+ 4,3,3,2,1, 4,2,1,
824
+ 4,4,3,3,2, 4,3,2,
825
+ 4,4,4,3,3, 4,3,3,
826
+ 4,4,4,4,4, 4,4,4,
827
+ 5,4,3,2,1, 5,2,1,
828
+ 5,4,4,3,2, 5,3,2,
829
+ 5,5,4,4,3, 5,4,3,
830
+ 5,5,5,4,4, 5,4,4,
831
+ 5,5,5,5,5, 5,5,5,
832
+ 6,5,4,2,1, 6,2,1,
833
+ 6,5,4,3,2, 6,3,2,
834
+ 6,5,5,4,3, 6,4,3,
835
+ 6,6,5,5,4, 6,5,4,
836
+ 6,6,6,5,5, 6,5,5,
837
+ 6,6,6,6,6, 6,6,6,
838
+ 7,6,4,3,1, 7,3,1,
839
+ 7,6,5,3,2, 7,3,2,
840
+ 7,6,5,4,3, 7,4,3,
841
+ 7,6,6,5,4, 7,5,4,
842
+ 7,7,6,6,5, 7,6,5,
843
+ 7,7,7,6,6, 7,6,6,
844
+ 7,7,7,7,7, 7,7,7,
845
+ 8,6,5,3,1, 8,3,1,
846
+ 8,7,5,4,2, 8,4,2,
847
+ 8,7,6,4,3, 8,4,3,
848
+ 8,7,6,5,4, 8,5,4,
849
+ 8,7,7,6,5, 8,6,5,
850
+ 8,8,7,7,6, 8,7,6,
851
+ 8,8,8,7,7, 8,7,7,
852
+ 8,8,8,8,8, 8,8,8,
853
+ 9,7,5,3,1, 9,3,1,
854
+ 9,7,6,4,2, 9,4,2,
855
+ 9,8,6,5,3, 9,5,3,
856
+ 9,8,7,5,4, 9,5,4,
857
+ 9,8,7,6,5, 9,6,5,
858
+ 9,8,8,7,6, 9,7,6,
859
+ 9,9,8,8,7, 9,8,7,
860
+ 9,9,9,8,8, 9,8,8,
861
+ 9,9,9,9,9, 9,9,9,
862
+ 10,8,6,3,1, 10,3,1,
863
+ 10,8,6,4,2, 10,4,2,
864
+ 10,8,7,5,3, 10,5,3,
865
+ 10,9,7,6,4, 10,6,4,
866
+ 10,9,8,6,5, 10,6,5,
867
+ 10,9,8,7,6, 10,7,6,
868
+ 10,9,9,8,7, 10,8,7,
869
+ 10,10,9,9,8, 10,9,8,
870
+ 10,10,10,9,9, 10,9,9,
871
+ 10,10,10,10,10, 10,10,10,
872
+ 11,9,6,4,1, 11,4,1,
873
+ 11,9,7,4,2, 11,4,2,
874
+ 11,9,7,5,3, 11,5,3,
875
+ 11,9,8,6,4, 11,6,4,
876
+ 11,10,8,7,5, 11,7,5,
877
+ 11,10,9,7,6, 11,7,6,
878
+ 11,10,9,8,7, 11,8,7,
879
+ 11,10,10,9,8, 11,9,8,
880
+ 11,11,10,10,9, 11,10,9,
881
+ 11,11,11,10,10, 11,10,10,
882
+ 11,11,11,11,11, 11,11,11,
883
+ 12,9,7,4,1, 12,4,1,
884
+ 12,10,7,5,2, 12,5,2,
885
+ 12,10,8,5,3, 12,5,3,
886
+ 12,10,8,6,4, 12,6,4,
887
+ 12,10,9,7,5, 12,7,5,
888
+ 12,11,9,8,6, 12,8,6,
889
+ 12,11,10,8,7, 12,8,7,
890
+ 12,11,10,9,8, 12,9,8,
891
+ 12,11,11,10,9, 12,10,9,
892
+ 12,12,11,11,10, 12,11,10,
893
+ 12,12,12,11,11, 12,11,11,
894
+ 12,12,12,12,12, 12,12,12,
895
+ };
896
+
897
+ // Backmap a single desired probability p (subscript 0..12) into an entry subscript in kLgProbV2Tbl
898
+ static const uint8 kLgProbV2TblBackmap[13] = {
899
+ 0, // p == 0 shares entry 0 with p == 1
900
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, // p = 1..12: p*(p-1)/2, the first entry whose hi value is p
901
+ };
902
+
903
+
904
+ // Always advances one UTF-8 character: indexed by byte value, yields 1 for 0x00..0xBF, 2 for 0xC0..0xDF, 3 for 0xE0..0xEF, 4 for 0xF0..0xFF
905
+ static const uint8 kAdvanceOneChar[256] = {
906
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x00..0x1F
907
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x20..0x3F
908
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x40..0x5F
909
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x60..0x7F
910
+
911
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x80..0x9F
912
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xA0..0xBF
913
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, // 0xC0..0xDF
914
+ 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, // 0xE0..0xEF, then 0xF0..0xFF
915
+ };
916
+
917
+ // Does not advance past space or cr/lf/nul: every byte 0x00..0x20 yields 0; all other bytes yield the same values as kAdvanceOneChar
918
+ static const uint8 kAdvanceOneCharButSpace[256] = {
919
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x00..0x1F: never advance
920
+ 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x20 (space) is 0, 0x21..0x3F are 1
921
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x40..0x5F
922
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x60..0x7F
923
+
924
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x80..0x9F
925
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xA0..0xBF
926
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, // 0xC0..0xDF
927
+ 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, // 0xE0..0xEF, then 0xF0..0xFF
928
+ };
929
+
930
+ // Advances *only* on space or ASCII vowel (or illegal byte): yields 1 for 0x00..0x20, for AEIOU/aeiou, and for 0x80..0xBF; yields 0 for every other byte
931
+ static const uint8 kAdvanceOneCharSpaceVowel[256] = {
932
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x00..0x1F: advance
933
+ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x20 (space) advances; 0x21..0x3F do not
934
+ 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, // 0x40..0x5F: A E I O U advance
935
+ 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, // 0x60..0x7F: a e i o u advance
936
+
937
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x80..0x9F: advance
938
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xA0..0xBF: advance
939
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xC0..0xDF: do not advance
940
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xE0..0xFF: do not advance
941
+ };
942
+
943
+ // Advances *only* on space (or illegal byte): yields 1 for 0x00..0x20 and for 0x80..0xBF; yields 0 for every other byte
944
+ static const uint8 kAdvanceOneCharSpace[256] = {
945
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x00..0x1F: advance
946
+ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x20 (space) advances; 0x21..0x3F do not
947
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x40..0x5F: do not advance
948
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x60..0x7F: do not advance
949
+
950
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x80..0x9F: advance
951
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xA0..0xBF: advance
952
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xC0..0xDF: do not advance
953
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xE0..0xFF: do not advance
954
+ };
955
+
956
+
957
+ //------------------------------------------------------------------------------
958
+ // General
959
+ //------------------------------------------------------------------------------
960
+ static inline int minint(int a, int b) {return (a < b) ? a: b;} // Smaller of a and b (b when they are equal)
961
+ static inline int maxint(int a, int b) {return (a > b) ? a: b;} // Larger of a and b (b when they are equal)
962
+
963
+ // Here to make available for debugging
964
+ int ReliabilityDelta(int value1, int value2, int count);
965
+ int ReliabilityMainstream(int topscore, int len, int mean_score);
966
+
967
+ // Thin wrapper: forwards lang to ExtLanguageCode() and returns its result unchanged. Original note: returns "0" for too small (NOTE(review): that behavior lives in ExtLanguageCode, which is not visible here -- confirm)
968
+ inline const char* MyExtLanguageCode(Language lang) {
969
+ return ExtLanguageCode(lang);
970
+ }
971
+
972
+ // Map script into Latin, Cyrillic, Arabic, Other (returned as 0, 1, 2, 3 respectively). Used in keeping track of
973
+ // amount of training data for language-script combinations
974
+ inline int LScript4(UnicodeLScript lscript) {
975
+ if (lscript == ULScript_Latin) {return 0;}
976
+ if (lscript == ULScript_Cyrillic) {return 1;}
977
+ if (lscript == ULScript_Arabic) {return 2;}
978
+ return 3; // Every other script falls into the "Other" class
979
+ }
980
+
981
+
982
+ // Routines to access 3 or 5 log probabilities in a single byte.
983
+
984
+ // Return address of 8-byte entry[i] within kLgProbV2Tbl; i is not range-checked here
985
+ inline const uint8* LgProb2TblEntry(int i) {
986
+ return &kLgProbV2Tbl[i * 8]; // Entries are 8 bytes wide, so entry i starts at byte i * 8
987
+ }
988
+
989
+ // Return one of five probabilities in an entry: byte j of the entry, so j is expected to be 0..4 (not checked)
990
+ // CURRENTLY UNUSED
991
+ inline uint8 LgProb5(const uint8* entry, int j) {
992
+ return entry[j];
993
+ }
994
+
995
+ // Return one of three probabilities in an entry: byte j + 5 of the entry, so j is expected to be 0..2 (not checked)
996
+ inline uint8 LgProb3(const uint8* entry, int j) {
997
+ return entry[j + 5]; // The three-value group follows the five-value group in each 8-byte entry
998
+ }
999
+
1000
+
1001
+
1002
+ //------------------------------------------------------------------------------
1003
+ // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
1004
+ //------------------------------------------------------------------------------
1005
+
1006
+ // Pick up 1..12 bytes and hash them via mask/shift/add. NO pre/post
1007
+ // OVERSHOOTS up to 3 bytes
1008
+ uint32 BiHashV25(const char* word_ptr, int bytecount);
1009
+
1010
+ // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
1011
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
1012
+ uint32 QuadHashV25(const char* word_ptr, int bytecount);
1013
+
1014
+ // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
1015
+ // OVERSHOOTS up to 3 bytes
1016
+ uint32 QuadHashV25Underscore(const char* word_ptr, int bytecount);
1017
+
1018
+
1019
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
1020
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
1021
+ // For runtime use of tables V3
1022
+ uint64 OctaHash40(const char* word_ptr, int bytecount);
1023
+
1024
+ uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
1025
+
1026
+
1027
+ // From 32-bit gram FP, return hash table subscript (via *subscr) and remaining key (via *hashkey); both output pointers are written unconditionally
1028
+ inline void QuadFPJustHash(uint32 quadhash,
1029
+ uint32 keymask,
1030
+ int bucketcount,
1031
+ uint32* subscr, uint32* hashkey) {
1032
+ *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1); // Masking with bucketcount - 1 assumes bucketcount is a power of two -- TODO confirm
1033
+ *hashkey = quadhash & keymask; // Keep only the key bits selected by keymask
1034
+ }
1035
+
1036
+ // Look up 32-bit gram FP in caller-passed table; returns the whole matching keyvalue word, or 0 when none of the four slots in the bucket matches
1037
+ // Typical size 256K entries (1.5MB)
1038
+ // Two-byte hashkey
1039
+ inline const uint32 QuadHashV3Lookup4(const cld::CLDTableSummary* gram_obj, // NOTE(review): top-level const on a by-value return type has no effect
1040
+ uint32 quadhash) {
1041
+
1042
+ uint32 subscr, hashkey;
1043
+ const IndirectProbBucket4* quadtable = gram_obj->kCLDTable;
1044
+ uint32 keymask = gram_obj->kCLDTableKeyMask;
1045
+ int bucketcount = gram_obj->kCLDTableSize;
1046
+ QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);
1047
+ const IndirectProbBucket4* bucket_ptr = &quadtable[subscr];
1048
+ // Four-way associative, 4 compares; a slot matches when its keymask bits equal hashkey, slots are tried in order 0..3
1049
+ if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
1050
+ return bucket_ptr->keyvalue[0];
1051
+ }
1052
+ if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
1053
+ return bucket_ptr->keyvalue[1];
1054
+ }
1055
+ if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
1056
+ return bucket_ptr->keyvalue[2];
1057
+ }
1058
+ if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
1059
+ return bucket_ptr->keyvalue[3];
1060
+ }
1061
+ return 0; // No slot matched
1062
+ }
1063
+
1064
+
1065
+ // Map 40 bits to subscript, hashkey, expected 18-22 bit subscript (min 16)
1066
+ // wwwwwwww xxxxxxxx xxxxxxxx yyyyyyyy yyyyyyyy
1067
+ // + ........ ....wwww wwwwxxxx xxxxxxxx xxxxyyyy
1068
+ // 00000000 00000000 00000011 11111111 11111111 (18-bit bucketcount-1)
1069
+ //
1070
+ // hashkey:
1071
+ // wwwwxxxx xxxxxxxx xxxx.... ........ (20-bit keymask)
1072
+ // 12-bit shift in subscript mixes in ~4 letters x 4 bits each
1073
+
1074
+ // From 40-bit gram FP, return hash table subscript (via *subscr) and remaining key (via *hashkey); both output pointers are written unconditionally
1075
+ inline void OctaFPJustHash(uint64 longwordhash,
1076
+ uint32 keymask,
1077
+ int bucketcount,
1078
+ uint32* subscr, uint32* hashkey) {
1079
+ uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1); // Masking with bucketcount - 1 assumes bucketcount is a power of two -- TODO confirm
1080
+ *subscr = temp;
1081
+ temp = longwordhash >> 4; // Narrowed to 32 bits on assignment, dropping the bits above bit 35 of the hash
1082
+ *hashkey = temp & keymask;
1083
+ }
1084
+
1085
+ // Look up 40-bit gram FP in caller-passed table; returns the whole matching keyvalue word, or 0 when none of the four slots in the bucket matches
1086
+ // Typical size 256K-4M entries (1-16MB)
1087
+ // 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
1088
+ // keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
1089
+ inline const uint32 OctaHashV3Lookup4(const cld::CLDTableSummary* gram_obj, // NOTE(review): top-level const on a by-value return type has no effect
1090
+ uint64 longwordhash) {
1091
+ uint32 subscr, hashkey;
1092
+ const IndirectProbBucket4* octatable = gram_obj->kCLDTable;
1093
+ uint32 keymask = gram_obj->kCLDTableKeyMask;
1094
+ int bucketcount = gram_obj->kCLDTableSize;
1095
+ OctaFPJustHash(longwordhash, keymask, bucketcount,
1096
+ &subscr, &hashkey);
1097
+ const IndirectProbBucket4* bucket_ptr = &octatable[subscr];
1098
+ // Four-way associative, 4 compares; a slot matches when its keymask bits equal hashkey, slots are tried in order 0..3
1099
+ if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
1100
+ return bucket_ptr->keyvalue[0];
1101
+ }
1102
+ if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
1103
+ return bucket_ptr->keyvalue[1];
1104
+ }
1105
+ if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
1106
+ return bucket_ptr->keyvalue[2];
1107
+ }
1108
+ if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
1109
+ return bucket_ptr->keyvalue[3];
1110
+ }
1111
+ return 0; // No slot matched
1112
+ }
1113
+
1114
+
1115
+
1116
+ //------------------------------------------------------------------------------
1117
+ // Scoring single groups of letters
1118
+ //------------------------------------------------------------------------------
1119
+
1120
+ // UNIGRAM score one => tote
1121
+ // Input: 1-byte entry of subscript into unigram probs, plus
1122
+ // an accumulator tote.
1123
+ // Output: running sums in tote updated
1124
+ void ProcessProbV25UniTote(int propval, Tote* tote);
1125
+
1126
+ // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
1127
+ // Input: 4-byte entry of 3 language numbers and one probability subscript,
1128
+ // plus an accumulator tote. (language 0 means unused entry)
1129
+ // Output: running sums in tote updated
1130
+ void ProcessProbV25Tote(uint32 probs, Tote* tote);
1131
+
1132
+
1133
+ //------------------------------------------------------------------------------
1134
+ // Routines to accumulate probabilities
1135
+ //------------------------------------------------------------------------------
1136
+
1137
+ // Score up to n=gram_limit unigrams, returning number of bytes consumed
1138
+ // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
1139
+ int DoUniScoreV3(const UTF8PropObj* unigram_obj,
1140
+ const char* isrc, int srclen, int advance_by,
1141
+ int* tote_grams, int gram_limit, Tote* chunk_tote);
1142
+
1143
+
1144
+ // Score all words in isrc, using languages that have bigrams (CJK)
1145
+ // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
1146
+ // Return number of bigrams that hit in the hash table
1147
+ int DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
1148
+ const char* isrc, int srclen, Tote* chunk_tote);
1149
+
1150
+
1151
+ // Score up to n=gram_limit quadgrams, returning number of bytes consumed
1152
+ // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
1153
+ int DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
1154
+ const char* isrc, int srclen, int advance_by,
1155
+ int* tote_grams, int gram_limit, Tote* chunk_tote);
1156
+
1157
+ // Score all octagrams (words) in isrc, using languages that have quadgrams
1158
+ // Caller supplies table, such as &kLongWord8Table_obj
1159
+ // Return number of words that hit in the hash table
1160
+ int DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
1161
+ const char* isrc, int srclen, Tote* chunk_tote);
1162
+
1163
+ //------------------------------------------------------------------------------
1164
+ // Reliability calculations, for single language and between languages
1165
+ //------------------------------------------------------------------------------
1166
+
1167
+ // Reliability = 0..100 (percent-style scale)
1168
+ static const int kMinReliable = 75; // NOTE(review): presumably the minimum reliability treated as trustworthy -- confirm against callers
1169
+
1170
+ // Calculate ratio of score per 1KB vs. expected score per 1KB
1171
+ double GetNormalizedScore(Language lang, UnicodeLScript lscript,
1172
+ int bytes, int score);
1173
+
1174
+ // Calculate reliability of len bytes of script lscript with chunk_tote
1175
+ int GetReliability(int len, UnicodeLScript lscript, const Tote* chunk_tote);
1176
+
1177
+
1178
+ //------------------------------------------------------------------------------
1179
+ // Miscellaneous
1180
+ //------------------------------------------------------------------------------
1181
+
1182
+ // Make languages packed into uint32 values non-zero: the packed value is lang + 1 narrowed to uint8
1183
+ // These routines later could remap so languages not in QuadHash tables are not
1184
+ // represented, and so that any thrashing in accumulation is eliminated
1185
+ uint8 inline PackLanguage(Language lang) {
1186
+ return static_cast<uint8>(lang + 1);} // The cast keeps only the low 8 bits of lang + 1
1187
+
1188
+ Language inline UnpackLanguage(int ilang) { // Inverse of the +1 offset applied by PackLanguage
1189
+ return static_cast<Language>(ilang - 1);} // ilang is not range-checked before the cast
1190
+
1191
+ // Useful single-byte tests; both compare the byte as a signed char so the result does not depend on whether plain char is signed
1192
+ bool inline IsUTF8ContinueByte(char c) {
1193
+ return static_cast<signed char>(c) < -64;} // True for bytes 0x80..0xBF (signed -128..-65)
1194
+ bool inline IsUTF8HighByte(char c) {
1195
+ return static_cast<signed char>(c) < 0;} // True for bytes 0x80..0xFF, i.e. any non-ASCII byte
1196
+
1197
+
1198
+ // Demote all languages except Top40 and plus_one
1199
+ // Do this just before sorting
1200
+ void DemoteNotTop40(Tote* chunk_tote, int packed_plus_one);
1201
+
1202
+ } // End namespace cld
1203
+
1204
+
1205
+ #endif // ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_