language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,905 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include <string>
6
+ #include "encodings/compact_lang_det/cldutil.h"
7
+ #include "encodings/compact_lang_det/cldutil_dbg.h"
8
+ #include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
9
+ #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
10
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
11
+ #include "encodings/compact_lang_det/win/cld_logging.h"
12
+ #include "encodings/compact_lang_det/win/cld_unilib.h"
13
+ #include "encodings/compact_lang_det/win/cld_utf.h"
14
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
15
+
16
+ // Runtime routines for hashing, looking up, and scoring
17
+ // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
18
+ // Unigrams and bigrams are for CJK languages only, including simplified/
19
+ // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
20
+ // Zhuang Han characters. Surrounding spaces are not considered.
21
+ // Quadgrams and octagrams are for non-CJK and include two bits indicating
22
+ // preceding and trailing spaces (word boundaries).
23
+
24
+
25
+ // Indicator bits for leading/trailing space around quad/octagram
26
+ // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
27
+ // 1-, 2-, or 3-bytes each.
28
+ static const uint32 kPreSpaceIndicator = 0x00004444;
29
+ static const uint32 kPostSpaceIndicator = 0x44440000;
30
+
31
+ // Little-endian masks for 0..24 bytes picked up as uint32's
32
+ static const uint32 kWordMask0[4] = {
33
+ 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
34
+ };
35
+
36
+ static const int kMinCJKUTF8CharBytes = 3;
37
+
38
+ static const int kMinGramCount = 3;
39
+ static const int kMaxGramCount = 16;
40
+
41
+
42
+
43
+
44
+ // Routines to access a hash table of <key:wordhash, value:probs> pairs
45
+ // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
46
+ // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
47
+ // bucket subscript.
48
+ // Probs is a packed: three languages plus a subscript for probability table
49
+ // Buckets have all the keys together, then all the values. Key array never
50
+ // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
51
+ // Match case may sometimes take an additional cache miss on value access.
52
+ //
53
+ // Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
54
+ // byte buckets with single cache miss.
55
+ // Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
56
+ //------------------------------------------------------------------------------
57
+
58
+
59
+ //------------------------------------------------------------------------------
60
+ // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
61
+ //------------------------------------------------------------------------------
62
+
63
+ // Design principles for these hash functions
64
+ // - Few operations
65
+ // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
66
+ // Latin script expect 1- and 2-byte mixtures.
67
+ // - Last byte of each character has about 5 bits of information
68
+ // - Spread good bits around so they can interact in at least two ways
69
+ // with other characters
70
+ // - Use add for additional mixing through carries
71
+
72
+ // CJK Three-byte bigram
73
+ // ....dddd..cccccc..bbbbbb....aaaa
74
+ // ..................ffffff..eeeeee
75
+ // make
76
+ // ....dddd..cccccc..bbbbbb....aaaa
77
+ // 000....dddd..cccccc..bbbbbb....a
78
+ // ..................ffffff..eeeeee
79
+ // ffffff..eeeeee000000000000000000
80
+ //
81
+ // CJK Four-byte bigram
82
+ // ..dddddd..cccccc....bbbb....aaaa
83
+ // ..hhhhhh..gggggg....ffff....eeee
84
+ // make
85
+ // ..dddddd..cccccc....bbbb....aaaa
86
+ // 000..dddddd..cccccc....bbbb....a
87
+ // ..hhhhhh..gggggg....ffff....eeee
88
+ // ..ffff....eeee000000000000000000
89
+
90
+ // BIGRAM
91
+ // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
92
+ // OVERSHOOTS up to 3 bytes
93
+ // For runtime use of tables
94
+ uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
95
+ if (bytecount == 0) {
96
+ return 0;
97
+ }
98
+ const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
99
+ uint32 word0, word1;
100
+ if (bytecount <= 4) {
101
+ word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
102
+ word0 = word0 ^ (word0 >> 3);
103
+ return word0;
104
+ }
105
+ // Else do 8 bytes
106
+ word0 = word_ptr32[0];
107
+ word0 = word0 ^ (word0 >> 3);
108
+ word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
109
+ word1 = word1 ^ (word1 << 18);
110
+ return word0 + word1;
111
+ }
112
+
113
+ //
114
+ // Ascii-7 One-byte chars
115
+ // ...ddddd...ccccc...bbbbb...aaaaa
116
+ // make
117
+ // ...ddddd...ccccc...bbbbb...aaaaa
118
+ // 000...ddddd...ccccc...bbbbb...aa
119
+ //
120
+ // Latin 1- and 2-byte chars
121
+ // ...ddddd...ccccc...bbbbb...aaaaa
122
+ // ...................fffff...eeeee
123
+ // make
124
+ // ...ddddd...ccccc...bbbbb...aaaaa
125
+ // 000...ddddd...ccccc...bbbbb...aa
126
+ // ...................fffff...eeeee
127
+ // ...............fffff...eeeee0000
128
+ //
129
+ // Non-CJK Two-byte chars
130
+ // ...ddddd...........bbbbb........
131
+ // ...hhhhh...........fffff........
132
+ // make
133
+ // ...ddddd...........bbbbb........
134
+ // 000...ddddd...........bbbbb.....
135
+ // ...hhhhh...........fffff........
136
+ // hhhh...........fffff........0000
137
+ //
138
+ // Non-CJK Three-byte chars
139
+ // ...........ccccc................
140
+ // ...................fffff........
141
+ // ...lllll...................iiiii
142
+ // make
143
+ // ...........ccccc................
144
+ // 000...........ccccc.............
145
+ // ...................fffff........
146
+ // ...............fffff........0000
147
+ // ...lllll...................iiiii
148
+ // .lllll...................iiiii00
149
+ //
150
+
151
+ // QUADGRAM
152
+ // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
153
+ // OVERSHOOTS up to 3 bytes
154
+ // For runtime use of tables
155
+ uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
156
+ const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
157
+ uint32 word0, word1, word2;
158
+ if (bytecount <= 4) {
159
+ word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
160
+ word0 = word0 ^ (word0 >> 3);
161
+ return word0 ^ prepost;
162
+ } else if (bytecount <= 8) {
163
+ word0 = word_ptr32[0];
164
+ word0 = word0 ^ (word0 >> 3);
165
+ word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
166
+ word1 = word1 ^ (word1 << 4);
167
+ return (word0 ^ prepost) + word1;
168
+ }
169
+ // else do 12 bytes
170
+ word0 = word_ptr32[0];
171
+ word0 = word0 ^ (word0 >> 3);
172
+ word1 = word_ptr32[1];
173
+ word1 = word1 ^ (word1 << 4);
174
+ word2 = word_ptr32[2] & kWordMask0[bytecount & 3];
175
+ word2 = word2 ^ (word2 << 2);
176
+ return (word0 ^ prepost) + word1 + word2;
177
+ }
178
+
179
+
180
+ // QUADGRAM wrapper with surrounding spaces
181
+ // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
182
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
183
+ // For runtime use of tables
184
+ uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
185
+ if (bytecount == 0) {
186
+ return 0;
187
+ }
188
+ uint32 prepost = 0;
189
+ if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
190
+ if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
191
+ return QuadHashV25Mix(word_ptr, bytecount, prepost);
192
+ }
193
+
194
+ // QUADGRAM wrapper with surrounding underscores (offline use)
195
+ // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
196
+ // OVERSHOOTS up to 3 bytes
197
+ // For offline construction of tables
198
+ uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
199
+ if (bytecount == 0) {
200
+ return 0;
201
+ }
202
+ const char* local_word_ptr = word_ptr;
203
+ int local_bytecount = bytecount;
204
+ uint32 prepost = 0;
205
+ if (local_word_ptr[0] == '_') {
206
+ prepost |= kPreSpaceIndicator;
207
+ ++local_word_ptr;
208
+ --local_bytecount;
209
+ }
210
+ if (local_word_ptr[local_bytecount - 1] == '_') {
211
+ prepost |= kPostSpaceIndicator;
212
+ --local_bytecount;
213
+ }
214
+ return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
215
+ }
216
+
217
+
218
+ // OCTAGRAM
219
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
220
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
221
+ //
222
+ // The low 32 bits follow the pattern from above, tuned to different scripts
223
+ // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
224
+ // For runtime use of tables V3
225
+ uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
226
+ const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
227
+ uint64 word0;
228
+ uint64 word1;
229
+ uint64 sum;
230
+
231
+ if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
232
+ if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
233
+ switch ((bytecount - 1) >> 2) {
234
+ case 0: // 1..4 bytes
235
+ word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
236
+ sum = word0;
237
+ word0 = word0 ^ (word0 >> 3);
238
+ break;
239
+ case 1: // 5..8 bytes
240
+ word0 = word_ptr32[0];
241
+ sum = word0;
242
+ word0 = word0 ^ (word0 >> 3);
243
+ word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
244
+ sum += word1;
245
+ word1 = word1 ^ (word1 << 4);
246
+ word0 += word1;
247
+ break;
248
+ case 2: // 9..12 bytes
249
+ word0 = word_ptr32[0];
250
+ sum = word0;
251
+ word0 = word0 ^ (word0 >> 3);
252
+ word1 = word_ptr32[1];
253
+ sum += word1;
254
+ word1 = word1 ^ (word1 << 4);
255
+ word0 += word1;
256
+ word1 = word_ptr32[2] & kWordMask0[bytecount & 3];
257
+ sum += word1;
258
+ word1 = word1 ^ (word1 << 2);
259
+ word0 += word1;
260
+ break;
261
+ case 3: // 13..16 bytes
262
+ word0 = word_ptr32[0];
263
+ sum = word0;
264
+ word0 = word0 ^ (word0 >> 3);
265
+ word1 = word_ptr32[1];
266
+ sum += word1;
267
+ word1 = word1 ^ (word1 << 4);
268
+ word0 += word1;
269
+ word1 = word_ptr32[2];
270
+ sum += word1;
271
+ word1 = word1 ^ (word1 << 2);
272
+ word0 += word1;
273
+ word1 = word_ptr32[3] & kWordMask0[bytecount & 3];
274
+ sum += word1;
275
+ word1 = word1 ^ (word1 >> 8);
276
+ word0 += word1;
277
+ break;
278
+ case 4: // 17..20 bytes
279
+ word0 = word_ptr32[0];
280
+ sum = word0;
281
+ word0 = word0 ^ (word0 >> 3);
282
+ word1 = word_ptr32[1];
283
+ sum += word1;
284
+ word1 = word1 ^ (word1 << 4);
285
+ word0 += word1;
286
+ word1 = word_ptr32[2];
287
+ sum += word1;
288
+ word1 = word1 ^ (word1 << 2);
289
+ word0 += word1;
290
+ word1 = word_ptr32[3];
291
+ sum += word1;
292
+ word1 = word1 ^ (word1 >> 8);
293
+ word0 += word1;
294
+ word1 = word_ptr32[4] & kWordMask0[bytecount & 3];
295
+ sum += word1;
296
+ word1 = word1 ^ (word1 >> 4);
297
+ word0 += word1;
298
+ break;
299
+ default: // 21..24 bytes and higher (ignores beyond 24)
300
+ word0 = word_ptr32[0];
301
+ sum = word0;
302
+ word0 = word0 ^ (word0 >> 3);
303
+ word1 = word_ptr32[1];
304
+ sum += word1;
305
+ word1 = word1 ^ (word1 << 4);
306
+ word0 += word1;
307
+ word1 = word_ptr32[2];
308
+ sum += word1;
309
+ word1 = word1 ^ (word1 << 2);
310
+ word0 += word1;
311
+ word1 = word_ptr32[3];
312
+ sum += word1;
313
+ word1 = word1 ^ (word1 >> 8);
314
+ word0 += word1;
315
+ word1 = word_ptr32[4];
316
+ sum += word1;
317
+ word1 = word1 ^ (word1 >> 4);
318
+ word0 += word1;
319
+ word1 = word_ptr32[5] & kWordMask0[bytecount & 3];
320
+ sum += word1;
321
+ word1 = word1 ^ (word1 >> 6);
322
+ word0 += word1;
323
+ break;
324
+ }
325
+
326
+ sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3
327
+ sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3
328
+ sum = (sum & 0xff) << 32;
329
+ return (word0 ^ prepost) + sum;
330
+ }
331
+
332
+ // OCTAGRAM wrapper with surrounding spaces
333
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
334
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
335
+ //
336
+ // The low 32 bits follow the pattern from above, tuned to different scripts
337
+ // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
338
+ // For runtime use of tables V3
339
+ uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
340
+ if (bytecount == 0) {
341
+ return 0;
342
+ }
343
+ uint64 prepost = 0;
344
+ if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
345
+ if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
346
+ return OctaHash40Mix(word_ptr, bytecount, prepost);
347
+ }
348
+
349
+
350
+ // OCTAGRAM wrapper with surrounding underscores (offline use)
351
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
352
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
353
+ //
354
+ // The low 32 bits follow the pattern from above, tuned to different scripts
355
+ // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
356
+ // For offline construction of tables
357
+ uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
358
+ if (bytecount == 0) {
359
+ return 0;
360
+ }
361
+ const char* local_word_ptr = word_ptr;
362
+ int local_bytecount = bytecount;
363
+ uint64 prepost = 0;
364
+ if (local_word_ptr[0] == '_') {
365
+ prepost |= kPreSpaceIndicator;
366
+ ++local_word_ptr;
367
+ --local_bytecount;
368
+ }
369
+ if (local_word_ptr[local_bytecount - 1] == '_') {
370
+ prepost |= kPostSpaceIndicator;
371
+ --local_bytecount;
372
+ }
373
+ return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
374
+ }
375
+
376
+
377
+
378
+
379
+ //------------------------------------------------------------------------------
380
+ // Scoring single groups of letters
381
+ //------------------------------------------------------------------------------
382
+
383
+ // UNIGRAM score one => tote
384
+ // Input: 1-byte entry of subscript into unigram probs, plus
385
+ // an accumulator tote.
386
+ // Output: running sums in tote updated
387
+ void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
388
+ tote->AddGram();
389
+ const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
390
+ if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
391
+ if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
392
+ if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
393
+ if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
394
+ if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
395
+ if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
396
+ }
397
+
398
+ // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
399
+ // Input: 4-byte entry of 3 language numbers and one probability subscript, plus
400
+ // an accumulator tote. (language 0 means unused entry)
401
+ // Output: running sums in tote updated
402
+ void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
403
+ tote->AddGram();
404
+ uint8 prob123 = (probs >> 0) & 0xff;
405
+ const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
406
+
407
+ uint8 top1 = (probs >> 8) & 0xff;
408
+ if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
409
+ uint8 top2 = (probs >> 16) & 0xff;
410
+ if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
411
+ uint8 top3 = (probs >> 24) & 0xff;
412
+ if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
413
+ }
414
+
415
+
416
+ //------------------------------------------------------------------------------
417
+ // Routines to accumulate probabilities
418
+ //------------------------------------------------------------------------------
419
+
420
+
421
+ // UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
422
+ // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
423
+ // Score up to n unigrams, returning number of bytes consumed
424
+ // Updates tote_grams
425
+ int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
426
+ const char* isrc, int srclen, int advance_by,
427
+ int* tote_grams, int gram_limit, Tote* chunk_tote) {
428
+ const char* src = isrc;
429
+ if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
430
+
431
+ // Property-based CJK unigram lookup
432
+ if (src[0] == ' ') {++src; --srclen;}
433
+
434
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
435
+ int usrclen = srclen;
436
+
437
+ while (usrclen > 0) {
438
+ int len = kAdvanceOneChar[usrc[0]];
439
+ // Look up property of one UTF-8 character and advance over it
440
+ // Return 0 if input length is zero
441
+ // Return 0 and advance one byte if input is ill-formed
442
+
443
+ int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
444
+
445
+ if (FLAGS_dbglookup) {
446
+ DbgUniTermToStderr(propval, usrc, len);
447
+ }
448
+
449
+ if (propval > 0) {
450
+ ProcessProbV25UniTote(propval, chunk_tote);
451
+ ++(*tote_grams);
452
+ if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
453
+ }
454
+
455
+ // Advance by 1/2/4/8 characters (half of quad advance)
456
+ if (advance_by == 2) {
457
+ // Already advanced by 1
458
+ } else if (advance_by == 4) {
459
+ // Advance by 2 chars total, if not at end
460
+ if (UTFmax <= usrclen) {
461
+ int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
462
+ }
463
+ } else if (advance_by == 8) {
464
+ // Advance by 4 chars total, if not at end
465
+ if ((UTFmax * 3) <= usrclen) {
466
+ int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
467
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
468
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
469
+ }
470
+ } else {
471
+ // Advance by 8 chars total, if not at end
472
+ if ((UTFmax * 7) <= usrclen) {
473
+ int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
474
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
475
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
476
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
477
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
478
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
479
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
480
+ }
481
+ }
482
+ DCHECK(usrclen >= 0);
483
+
484
+ if (*tote_grams >= gram_limit) {
485
+ break;
486
+ }
487
+ }
488
+ if (FLAGS_dbgscore) {
489
+ // With advance_by>2, we consume more input to get the same number of quads
490
+ int len = src - isrc;
491
+ DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
492
+ DbgScoreFlush();
493
+ }
494
+
495
+ int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
496
+ return consumed2;
497
+ }
498
+
499
+
500
+ // BIGRAM, using hash table, always advancing by 1 char
501
+ // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
502
+ // Score all bigrams in isrc, using languages that have bigrams (CJK)
503
+ // Return number of bigrams that hit in the hash table
504
+ int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
505
+ const char* isrc, int srclen, Tote* chunk_tote) {
506
+ int hit_count = 0;
507
+ const char* src = isrc;
508
+
509
+ // Hashtable-based CJK bigram lookup
510
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
511
+ const uint8* usrclimit1 = usrc + srclen - UTFmax;
512
+ if (FLAGS_dbgscore) {
513
+ fprintf(stderr, " " );
514
+ }
515
+
516
+ while (usrc < usrclimit1) {
517
+ int len = kAdvanceOneChar[usrc[0]];
518
+ int len2 = kAdvanceOneChar[usrc[len]] + len;
519
+
520
+ if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
521
+ // Lookup and score this bigram
522
+ // Always ignore pre/post spaces
523
+ uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
524
+ uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
525
+ // Now go indirect on the subscript
526
+ probs = bigram_obj->kCLDTableInd[probs &
527
+ ~bigram_obj->kCLDTableKeyMask];
528
+
529
+ // Process the bigram
530
+ if (FLAGS_dbglookup) {
531
+ const char* ssrc = reinterpret_cast<const char*>(usrc);
532
+ DbgBiTermToStderr(bihash, probs, ssrc, len2);
533
+ DbgScoreRecord(NULL, probs, len2);
534
+ } else if (FLAGS_dbgscore && (probs != 0)) {
535
+ const char* ssrc = reinterpret_cast<const char*>(usrc);
536
+ DbgScoreRecord(NULL, probs, len2);
537
+ string temp(ssrc, len2);
538
+ fprintf(stderr, "%s ", temp.c_str());
539
+ }
540
+
541
+ if (probs != 0) {
542
+ ProcessProbV25Tote(probs, chunk_tote);
543
+ ++hit_count;
544
+ }
545
+ }
546
+ usrc += len; // Advance by one char
547
+ }
548
+
549
+ if (FLAGS_dbgscore) {
550
+ fprintf(stderr, "[%d bigrams scored]\n", hit_count);
551
+ DbgScoreState();
552
+ }
553
+ return hit_count;
554
+ }
555
+
556
+
557
+
558
+ // QUADGRAM, using hash table, advancing by 2/4/8/16 chars
559
+ // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
560
+ // Score up to n quadgrams, returning number of bytes consumed
561
+ // Updates tote_grams
562
+ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
563
+ const char* isrc, int srclen, int advance_by,
564
+ int* tote_grams, int gram_limit, Tote* chunk_tote) {
565
+ const char* src = isrc;
566
+ const char* srclimit = src + srclen;
567
+ // Limit is end, which has extra 20 20 20 00 past len
568
+ const char* srclimit7 = src + srclen - (UTFmax * 7);
569
+ const char* srclimit15 = src + srclen - (UTFmax * 15);
570
+
571
+ if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
572
+
573
+ // Run a little cache of last hits to catch overly-repetitive "text"
574
+ int next_prior = 0;
575
+ uint32 prior_quads[2] = {0, 0};
576
+
577
+ // Visit all quadgrams
578
+ if (src[0] == ' ') {++src;}
579
+ while (src < srclimit) {
580
+ // Find one quadgram
581
+ const char* src_end = src;
582
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
583
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
584
+ const char* src_mid = src_end;
585
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
586
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
587
+ int len = src_end - src;
588
+
589
+ // Lookup and score this quadgram
590
+ uint32 quadhash = QuadHashV25(src, len);
591
+ uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
592
+ // Now go indirect on the subscript
593
+ probs = quadgram_obj->kCLDTableInd[probs &
594
+ ~quadgram_obj->kCLDTableKeyMask];
595
+
596
+ // Process the quadgram
597
+ if (FLAGS_dbglookup) {
598
+ DbgQuadTermToStderr(quadhash, probs, src, len);
599
+ }
600
+ if (probs != 0) {
601
+ // Filter out recent repeats. If this works out, use in the other lookups
602
+ if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
603
+ prior_quads[next_prior] = quadhash;
604
+ next_prior = (next_prior + 1) & 1;
605
+ ProcessProbV25Tote(probs, chunk_tote);
606
+ ++(*tote_grams);
607
+ if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
608
+ }
609
+ }
610
+
611
+ // Advance all the way past word if at end-of-word
612
+ if (src_end[0] == ' ') {
613
+ src_mid = src_end;
614
+ }
615
+
616
+ // Advance by 2/4/8/16 characters
617
+ if (advance_by == 2) {
618
+ src = src_mid;
619
+ } else if (advance_by == 4) {
620
+ src = src_end;
621
+ } else if (advance_by == 8) {
622
+ // Advance by 8 chars total (4 more), if not at end
623
+ if (src < srclimit7) {
624
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
625
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
626
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
627
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
628
+ }
629
+ src = src_end;
630
+ } else {
631
+ // Advance by 16 chars total (12 more), if not at end
632
+ if (src < srclimit15) {
633
+ // Advance by ~16 chars by adding 3 * current bytelen
634
+ int fourcharlen = src_end - src;
635
+ src = src_end + (3 * fourcharlen);
636
+ // Advance a bit more if mid-character
637
+ src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
638
+ src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
639
+ } else {
640
+ src = src_end;
641
+ }
642
+ }
643
+ DCHECK(src < srclimit);
644
+ src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
645
+
646
+ if (*tote_grams >= gram_limit) {
647
+ break;
648
+ }
649
+ }
650
+
651
+ if (FLAGS_dbgscore) {
652
+ // With advance_by>2, we consume more input to get the same number of quads
653
+ int len = src - isrc;
654
+ DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
655
+ DbgScoreFlush();
656
+ }
657
+
658
+ int consumed = src - isrc;
659
+
660
+ // If advancing by more than 2, src may have overshot srclimit
661
+ if (consumed > srclen) {
662
+ consumed = srclen;
663
+ }
664
+
665
+ return consumed;
666
+ }
667
+
668
+
669
+ // OCTAGRAM, using hash table, always advancing by 1 word
670
+ // Caller supplies table, such as &kLongWord8Table_obj
671
+ // Score all words in isrc, using languages that have quadgrams
672
+ // We don't normally use this routine except on the first quadgram run,
673
+ // but it can be used to resolve unreliable pages.
674
+ // This routine does not have an optimized advance_by
675
+ // SOON: Uses indirect language/probability longword
676
+ //
677
+ // Return number of words that hit in the hash table
678
+ int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
679
+ const char* isrc, int srclen, Tote* chunk_tote) {
680
+ int hit_count = 0;
681
+ const char* src = isrc;
682
+ const char* srclimit = src + srclen + 1;
683
+ // Limit is end+1, to include extra space char (0x20) off the end
684
+ //
685
+ // Score all words truncated to 8 characters
686
+ int charcount = 0;
687
+ // Skip any initial space
688
+ if (src[0] == ' ') {++src;}
689
+ const char* word_ptr = src;
690
+ const char* word_end = word_ptr;
691
+ if (FLAGS_dbgscore) {
692
+ fprintf(stderr, " " );
693
+ }
694
+ while (src < srclimit) {
695
+ // Terminate previous word or continue current word
696
+ if (src[0] == ' ') {
697
+ int bytecount = word_end - word_ptr;
698
+ if (bytecount == 0)
699
+ break;
700
+ // Lookup and score this word
701
+ uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
702
+ uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
703
+ // Now go indirect on the subscript
704
+ probs = octagram_obj->kCLDTableInd[probs &
705
+ ~octagram_obj->kCLDTableKeyMask];
706
+
707
+ // // Lookup and score this word
708
+ // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
709
+ // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
710
+ // kLongWord8TableSize);
711
+ //
712
+ if (FLAGS_dbglookup) {
713
+ DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
714
+ DbgScoreRecord(NULL, probs, bytecount);
715
+ } else if (FLAGS_dbgscore && (probs != 0)) {
716
+ DbgScoreRecord(NULL, probs, bytecount);
717
+ string temp(word_ptr, bytecount);
718
+ fprintf(stderr, "%s ", temp.c_str());
719
+ }
720
+
721
+ if (probs != 0) {
722
+ ProcessProbV25Tote(probs, chunk_tote);
723
+ ++hit_count;
724
+ }
725
+ charcount = 0;
726
+ word_ptr = src + 1; // Over the space
727
+ word_end = word_ptr;
728
+ } else {
729
+ ++charcount;
730
+ }
731
+
732
+ // Advance to next char
733
+ src += cld_UniLib::OneCharLen(src);
734
+ if (charcount <= 8) {
735
+ word_end = src;
736
+ }
737
+ }
738
+
739
+ if (FLAGS_dbgscore) {
740
+ fprintf(stderr, "[%d words scored]\n", hit_count);
741
+ DbgScoreState();
742
+ }
743
+ return hit_count;
744
+ }
745
+
746
+
747
+
748
+ //------------------------------------------------------------------------------
749
+ // Reliability calculations, for single language and between languages
750
+ //------------------------------------------------------------------------------
751
+
752
+ // Return reliablity of result 0..100 for top two scores
753
+ // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
754
+ // (on a scale where +1 is a factor of 2 ** 1.6 = 3.02)
755
+ // Threshold is uni/quadgram increment count, bounded above and below.
756
+ //
757
+ // Requiring a factor of 3 improvement (e.g. +1 log base 3)
758
+ // for each scored quadgram is too stringent, so I've backed this off to a
759
+ // factor of 2 (e.g. +5/8 log base 3).
760
+ //
761
+ // I also somewhat lowered the Min/MaxGramCount limits above
762
+ //
763
+ // Added: if fewer than 8 quads/unis, max reliability is 12*n percent
764
+ //
765
+ int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
766
+ int max_reliability_percent = 100;
767
+ if (gramcount < 8) {
768
+ max_reliability_percent = 12 * gramcount;
769
+ }
770
+ int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above
771
+ if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16
772
+ fully_reliable_thresh = kMinGramCount;
773
+ } else if (fully_reliable_thresh > kMaxGramCount) {
774
+ fully_reliable_thresh = kMaxGramCount;
775
+ }
776
+
777
+ int delta = value1 - value2;
778
+ if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
779
+ if (delta <= 0) {return 0;}
780
+ return cld::minint(max_reliability_percent,
781
+ (100 * delta) / fully_reliable_thresh);
782
+ }
783
+
784
+ // Return reliablity of result 0..100 for top score vs. mainsteam score
785
+ // Values are score per 1024 bytes of input
786
+ // ratio = max(top/mainstream, mainstream/top)
787
+ // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
788
+ // Change: short-text word scoring can give unusually good results.
789
+ // Let top exceed mainstream by 4x at 50% reliable
790
+ int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
791
+ if (mean_score == 0) {return 100;} // No reliability data available yet
792
+ if (topscore == 0) {return 0;} // zero score = unreliable
793
+ if (len == 0) {return 0;} // zero len = unreliable
794
+ int top_kb = (topscore << 10) / len;
795
+ double ratio;
796
+ double ratio_cutoff;
797
+ if (top_kb > mean_score) {
798
+ ratio = (1.0 * top_kb) / mean_score;
799
+ ratio_cutoff = 5.0; // ramp down from 100% to 0%: 3.0-5.0
800
+ } else {
801
+ ratio = (1.0 * mean_score) / top_kb;
802
+ ratio_cutoff = 4.0; // ramp down from 100% to 0%: 2.0-4.0
803
+ }
804
+ if (ratio <= ratio_cutoff - 2.0) {return 100;}
805
+ if (ratio > ratio_cutoff) {return 0;}
806
+
807
+ int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
808
+ return iratio;
809
+ }
810
+
811
+ // Calculate ratio of score per 1KB vs. expected score per 1KB
812
+ double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
813
+ int bytes, int score) {
814
+ // Average training-data score for this language-script combo, per 1KB
815
+ int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
816
+ if (lscript == ULScript_Common) {
817
+ // We don't know the script (only happens with second-chance score)
818
+ // Look for first non-zero mean value
819
+ for (int i = 0; i < 3; ++i) {
820
+ if (kMeanScore[lang * 4 + i] > 0) {
821
+ expected_score = kMeanScore[lang * 4 + i];
822
+ }
823
+ }
824
+ }
825
+ if (expected_score < 100) {
826
+ expected_score = 1000;
827
+ }
828
+
829
+ // Our score per 1KB
830
+ double our_score = (score << 10) / (bytes ? bytes : 1); // Avoid zdiv
831
+ double ratio = our_score / expected_score;
832
+
833
+ // Just the raw count normalized as though each language has mean=1000;
834
+ ratio = (score * 1000.0) / expected_score;
835
+ return ratio;
836
+ }
837
+
838
+ // Calculate reliablity of len bytes of script lscript with chunk_tote
839
+ int cld::GetReliability(int len, UnicodeLScript lscript,
840
+ const Tote* chunk_tote) {
841
+ Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
842
+ // Average score for this language-script combo
843
+ int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
844
+ if (lscript == ULScript_Common) {
845
+ // We don't know the script (only happens with second-chance score)
846
+ // Look for first non-zero mean value
847
+ for (int i = 0; i < 3; ++i) {
848
+ if (kMeanScore[cur_lang * 4 + i] > 0) {
849
+ mean_score = kMeanScore[cur_lang * 4 + i];
850
+ }
851
+ }
852
+ }
853
+ int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
854
+ chunk_tote->Value(1),
855
+ chunk_tote->GetGramCount());
856
+
857
+ int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
858
+ len,
859
+ mean_score);
860
+
861
+ int reliability_min = minint(reliability_delta, reliability_main);
862
+
863
+
864
+ if (FLAGS_dbgreli) {
865
+ char temp1[4];
866
+ char temp2[4];
867
+ cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
868
+ if (temp1[2] == ' ') {temp1[2] = '\0';}
869
+ cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
870
+ if (temp2[2] == ' ') {temp2[2] = '\0';}
871
+ int srclen = len;
872
+ fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d, %s=%d %s=%d "
873
+ "top/KB=%d mean/KB=%d del=%d%% reli=%d%% "
874
+ "lang/lscript %d %d\n",
875
+ chunk_tote->GetGramCount(),
876
+ chunk_tote->GetIncrCount(),
877
+ srclen,
878
+ temp1, chunk_tote->Value(0),
879
+ temp2, chunk_tote->Value(1),
880
+ (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
881
+ mean_score,
882
+ reliability_delta,
883
+ reliability_main,
884
+ cur_lang, lscript);
885
+ }
886
+
887
+ return reliability_min;
888
+ }
889
+
890
+
891
+ //------------------------------------------------------------------------------
892
+ // Miscellaneous
893
+ //------------------------------------------------------------------------------
894
+
895
+ // Demote all languages except Top40 and plus_one
896
+ // Do this just before sorting chunk_tote results
897
+ void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
898
+ for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
899
+ if (chunk_tote->Key(sub) == 0) continue;
900
+ if (chunk_tote->Key(sub) == packed_plus_one) continue;
901
+ if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
902
+ // Quarter the score of others
903
+ chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
904
+ }
905
+ }