language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,905 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include <string.h>  // memcpy
+ #include <string>
6
+ #include "encodings/compact_lang_det/cldutil.h"
7
+ #include "encodings/compact_lang_det/cldutil_dbg.h"
8
+ #include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
9
+ #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
10
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
11
+ #include "encodings/compact_lang_det/win/cld_logging.h"
12
+ #include "encodings/compact_lang_det/win/cld_unilib.h"
13
+ #include "encodings/compact_lang_det/win/cld_utf.h"
14
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
15
+
16
+ // Runtime routines for hashing, looking up, and scoring
17
+ // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
18
+ // Unigrams and bigrams are for CJK languages only, including simplified/
19
+ // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
20
+ // Zhuang Han characters. Surrounding spaces are not considered.
21
+ // Quadgrams and octagrams are for non-CJK and include two bits indicating
22
+ // preceding and trailing spaces (word boundaries).
23
+
24
+
25
+ // Indicator bits for leading/trailing space around quad/octagram
26
+ // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
27
+ // 1-, 2-, or 3-bytes each.
28
+ static const uint32 kPreSpaceIndicator = 0x00004444;
29
+ static const uint32 kPostSpaceIndicator = 0x44440000;
30
+
31
+ // Little-endian masks for 0..24 bytes picked up as uint32's
32
+ static const uint32 kWordMask0[4] = {
33
+ 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
34
+ };
35
+
36
+ static const int kMinCJKUTF8CharBytes = 3;
37
+
38
+ static const int kMinGramCount = 3;
39
+ static const int kMaxGramCount = 16;
40
+
41
+
42
+
43
+
44
+ // Routines to access a hash table of <key:wordhash, value:probs> pairs
45
+ // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
46
+ // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
47
+ // bucket subscript.
48
+ // Probs is packed: three languages plus a subscript for probability table
49
+ // Buckets have all the keys together, then all the values. Key array never
50
+ // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
51
+ // Match case may sometimes take an additional cache miss on value access.
52
+ //
53
+ // Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
54
+ // byte buckets with single cache miss.
55
+ // Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
56
+ //------------------------------------------------------------------------------
57
+
58
+
59
+ //------------------------------------------------------------------------------
60
+ // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
61
+ //------------------------------------------------------------------------------
62
+
63
+ // Design principles for these hash functions
64
+ // - Few operations
65
+ // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
66
+ // Latin script expect 1- and 2-byte mixtures.
67
+ // - Last byte of each character has about 5 bits of information
68
+ // - Spread good bits around so they can interact in at least two ways
69
+ // with other characters
70
+ // - Use add for additional mixing through carries
71
+
72
+ // CJK Three-byte bigram
73
+ // ....dddd..cccccc..bbbbbb....aaaa
74
+ // ..................ffffff..eeeeee
75
+ // make
76
+ // ....dddd..cccccc..bbbbbb....aaaa
77
+ // 000....dddd..cccccc..bbbbbb....a
78
+ // ..................ffffff..eeeeee
79
+ // ffffff..eeeeee000000000000000000
80
+ //
81
+ // CJK Four-byte bigram
82
+ // ..dddddd..cccccc....bbbb....aaaa
83
+ // ..hhhhhh..gggggg....ffff....eeee
84
+ // make
85
+ // ..dddddd..cccccc....bbbb....aaaa
86
+ // 000..dddddd..cccccc....bbbb....a
87
+ // ..hhhhhh..gggggg....ffff....eeee
88
+ // ..ffff....eeee000000000000000000
89
+
90
+ // BIGRAM
91
+ // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
92
+ // OVERSHOOTS up to 3 bytes
93
+ // For runtime use of tables
94
+ uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
95
+ if (bytecount == 0) {
96
+ return 0;
97
+ }
98
+ const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
99
+ uint32 word0, word1;
100
+ if (bytecount <= 4) {
101
+ word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
102
+ word0 = word0 ^ (word0 >> 3);
103
+ return word0;
104
+ }
105
+ // Else do 8 bytes
106
+ word0 = word_ptr32[0];
107
+ word0 = word0 ^ (word0 >> 3);
108
+ word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
109
+ word1 = word1 ^ (word1 << 18);
110
+ return word0 + word1;
111
+ }
112
+
113
+ //
114
+ // Ascii-7 One-byte chars
115
+ // ...ddddd...ccccc...bbbbb...aaaaa
116
+ // make
117
+ // ...ddddd...ccccc...bbbbb...aaaaa
118
+ // 000...ddddd...ccccc...bbbbb...aa
119
+ //
120
+ // Latin 1- and 2-byte chars
121
+ // ...ddddd...ccccc...bbbbb...aaaaa
122
+ // ...................fffff...eeeee
123
+ // make
124
+ // ...ddddd...ccccc...bbbbb...aaaaa
125
+ // 000...ddddd...ccccc...bbbbb...aa
126
+ // ...................fffff...eeeee
127
+ // ...............fffff...eeeee0000
128
+ //
129
+ // Non-CJK Two-byte chars
130
+ // ...ddddd...........bbbbb........
131
+ // ...hhhhh...........fffff........
132
+ // make
133
+ // ...ddddd...........bbbbb........
134
+ // 000...ddddd...........bbbbb.....
135
+ // ...hhhhh...........fffff........
136
+ // hhhh...........fffff........0000
137
+ //
138
+ // Non-CJK Three-byte chars
139
+ // ...........ccccc................
140
+ // ...................fffff........
141
+ // ...lllll...................iiiii
142
+ // make
143
+ // ...........ccccc................
144
+ // 000...........ccccc.............
145
+ // ...................fffff........
146
+ // ...............fffff........0000
147
+ // ...lllll...................iiiii
148
+ // .lllll...................iiiii00
149
+ //
150
+
151
+ // QUADGRAM
152
+ // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
153
+ // OVERSHOOTS up to 3 bytes
154
+ // For runtime use of tables
155
+ uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
156
+ const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
157
+ uint32 word0, word1, word2;
158
+ if (bytecount <= 4) {
159
+ word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
160
+ word0 = word0 ^ (word0 >> 3);
161
+ return word0 ^ prepost;
162
+ } else if (bytecount <= 8) {
163
+ word0 = word_ptr32[0];
164
+ word0 = word0 ^ (word0 >> 3);
165
+ word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
166
+ word1 = word1 ^ (word1 << 4);
167
+ return (word0 ^ prepost) + word1;
168
+ }
169
+ // else do 12 bytes
170
+ word0 = word_ptr32[0];
171
+ word0 = word0 ^ (word0 >> 3);
172
+ word1 = word_ptr32[1];
173
+ word1 = word1 ^ (word1 << 4);
174
+ word2 = word_ptr32[2] & kWordMask0[bytecount & 3];
175
+ word2 = word2 ^ (word2 << 2);
176
+ return (word0 ^ prepost) + word1 + word2;
177
+ }
178
+
179
+
180
+ // QUADGRAM wrapper with surrounding spaces
181
+ // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
182
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
183
+ // For runtime use of tables
184
+ uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
185
+ if (bytecount == 0) {
186
+ return 0;
187
+ }
188
+ uint32 prepost = 0;
189
+ if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
190
+ if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
191
+ return QuadHashV25Mix(word_ptr, bytecount, prepost);
192
+ }
193
+
194
+ // QUADGRAM wrapper with surrounding underscores (offline use)
195
+ // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
196
+ // OVERSHOOTS up to 3 bytes
197
+ // For offline construction of tables
198
+ uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
199
+ if (bytecount == 0) {
200
+ return 0;
201
+ }
202
+ const char* local_word_ptr = word_ptr;
203
+ int local_bytecount = bytecount;
204
+ uint32 prepost = 0;
205
+ if (local_word_ptr[0] == '_') {
206
+ prepost |= kPreSpaceIndicator;
207
+ ++local_word_ptr;
208
+ --local_bytecount;
209
+ }
210
+ if (local_word_ptr[local_bytecount - 1] == '_') {
211
+ prepost |= kPostSpaceIndicator;
212
+ --local_bytecount;
213
+ }
214
+ return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
215
+ }
216
+
217
+
218
+ // OCTAGRAM
219
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
220
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
221
+ //
222
+ // The low 32 bits follow the pattern from above, tuned to different scripts
223
+ // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
224
+ // For runtime use of tables V3
225
+ uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
226
+ const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
227
+ uint64 word0;
228
+ uint64 word1;
229
+ uint64 sum;
230
+
231
+ if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
232
+ if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
233
+ switch ((bytecount - 1) >> 2) {
234
+ case 0: // 1..4 bytes
235
+ word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
236
+ sum = word0;
237
+ word0 = word0 ^ (word0 >> 3);
238
+ break;
239
+ case 1: // 5..8 bytes
240
+ word0 = word_ptr32[0];
241
+ sum = word0;
242
+ word0 = word0 ^ (word0 >> 3);
243
+ word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
244
+ sum += word1;
245
+ word1 = word1 ^ (word1 << 4);
246
+ word0 += word1;
247
+ break;
248
+ case 2: // 9..12 bytes
249
+ word0 = word_ptr32[0];
250
+ sum = word0;
251
+ word0 = word0 ^ (word0 >> 3);
252
+ word1 = word_ptr32[1];
253
+ sum += word1;
254
+ word1 = word1 ^ (word1 << 4);
255
+ word0 += word1;
256
+ word1 = word_ptr32[2] & kWordMask0[bytecount & 3];
257
+ sum += word1;
258
+ word1 = word1 ^ (word1 << 2);
259
+ word0 += word1;
260
+ break;
261
+ case 3: // 13..16 bytes
262
+ word0 = word_ptr32[0];
263
+ sum = word0;
264
+ word0 = word0 ^ (word0 >> 3);
265
+ word1 = word_ptr32[1];
266
+ sum += word1;
267
+ word1 = word1 ^ (word1 << 4);
268
+ word0 += word1;
269
+ word1 = word_ptr32[2];
270
+ sum += word1;
271
+ word1 = word1 ^ (word1 << 2);
272
+ word0 += word1;
273
+ word1 = word_ptr32[3] & kWordMask0[bytecount & 3];
274
+ sum += word1;
275
+ word1 = word1 ^ (word1 >> 8);
276
+ word0 += word1;
277
+ break;
278
+ case 4: // 17..20 bytes
279
+ word0 = word_ptr32[0];
280
+ sum = word0;
281
+ word0 = word0 ^ (word0 >> 3);
282
+ word1 = word_ptr32[1];
283
+ sum += word1;
284
+ word1 = word1 ^ (word1 << 4);
285
+ word0 += word1;
286
+ word1 = word_ptr32[2];
287
+ sum += word1;
288
+ word1 = word1 ^ (word1 << 2);
289
+ word0 += word1;
290
+ word1 = word_ptr32[3];
291
+ sum += word1;
292
+ word1 = word1 ^ (word1 >> 8);
293
+ word0 += word1;
294
+ word1 = word_ptr32[4] & kWordMask0[bytecount & 3];
295
+ sum += word1;
296
+ word1 = word1 ^ (word1 >> 4);
297
+ word0 += word1;
298
+ break;
299
+ default: // 21..24 bytes and higher (ignores beyond 24)
300
+ word0 = word_ptr32[0];
301
+ sum = word0;
302
+ word0 = word0 ^ (word0 >> 3);
303
+ word1 = word_ptr32[1];
304
+ sum += word1;
305
+ word1 = word1 ^ (word1 << 4);
306
+ word0 += word1;
307
+ word1 = word_ptr32[2];
308
+ sum += word1;
309
+ word1 = word1 ^ (word1 << 2);
310
+ word0 += word1;
311
+ word1 = word_ptr32[3];
312
+ sum += word1;
313
+ word1 = word1 ^ (word1 >> 8);
314
+ word0 += word1;
315
+ word1 = word_ptr32[4];
316
+ sum += word1;
317
+ word1 = word1 ^ (word1 >> 4);
318
+ word0 += word1;
319
+ word1 = word_ptr32[5] & kWordMask0[bytecount & 3];
320
+ sum += word1;
321
+ word1 = word1 ^ (word1 >> 6);
322
+ word0 += word1;
323
+ break;
324
+ }
325
+
326
+ sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3
327
+ sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3
328
+ sum = (sum & 0xff) << 32;
329
+ return (word0 ^ prepost) + sum;
330
+ }
331
+
332
+ // OCTAGRAM wrapper with surrounding spaces
333
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
334
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
335
+ //
336
+ // The low 32 bits follow the pattern from above, tuned to different scripts
337
+ // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
338
+ // For runtime use of tables V3
339
+ uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
340
+ if (bytecount == 0) {
341
+ return 0;
342
+ }
343
+ uint64 prepost = 0;
344
+ if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
345
+ if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
346
+ return OctaHash40Mix(word_ptr, bytecount, prepost);
347
+ }
348
+
349
+
350
+ // OCTAGRAM wrapper with surrounding underscores (offline use)
351
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
352
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
353
+ //
354
+ // The low 32 bits follow the pattern from above, tuned to different scripts
355
+ // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
356
+ // For offline construction of tables
357
+ uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
358
+ if (bytecount == 0) {
359
+ return 0;
360
+ }
361
+ const char* local_word_ptr = word_ptr;
362
+ int local_bytecount = bytecount;
363
+ uint64 prepost = 0;
364
+ if (local_word_ptr[0] == '_') {
365
+ prepost |= kPreSpaceIndicator;
366
+ ++local_word_ptr;
367
+ --local_bytecount;
368
+ }
369
+ if (local_word_ptr[local_bytecount - 1] == '_') {
370
+ prepost |= kPostSpaceIndicator;
371
+ --local_bytecount;
372
+ }
373
+ return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
374
+ }
375
+
376
+
377
+
378
+
379
+ //------------------------------------------------------------------------------
380
+ // Scoring single groups of letters
381
+ //------------------------------------------------------------------------------
382
+
383
+ // UNIGRAM score one => tote
384
+ // Input: 1-byte entry of subscript into unigram probs, plus
385
+ // an accumulator tote.
386
+ // Output: running sums in tote updated
387
+ void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
388
+ tote->AddGram();
389
+ const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
390
+ if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
391
+ if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
392
+ if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
393
+ if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
394
+ if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
395
+ if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
396
+ }
397
+
398
+ // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
399
+ // Input: 4-byte entry of 3 language numbers and one probability subscript, plus
400
+ // an accumulator tote. (language 0 means unused entry)
401
+ // Output: running sums in tote updated
402
+ void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
403
+ tote->AddGram();
404
+ uint8 prob123 = (probs >> 0) & 0xff;
405
+ const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
406
+
407
+ uint8 top1 = (probs >> 8) & 0xff;
408
+ if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
409
+ uint8 top2 = (probs >> 16) & 0xff;
410
+ if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
411
+ uint8 top3 = (probs >> 24) & 0xff;
412
+ if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
413
+ }
414
+
415
+
416
+ //------------------------------------------------------------------------------
417
+ // Routines to accumulate probabilities
418
+ //------------------------------------------------------------------------------
419
+
420
+
421
+ // UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
422
+ // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
423
+ // Score up to n unigrams, returning number of bytes consumed
424
+ // Updates tote_grams
425
+ int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
426
+ const char* isrc, int srclen, int advance_by,
427
+ int* tote_grams, int gram_limit, Tote* chunk_tote) {
428
+ const char* src = isrc;
429
+ if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
430
+
431
+ // Property-based CJK unigram lookup
432
+ if (src[0] == ' ') {++src; --srclen;}
433
+
434
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
435
+ int usrclen = srclen;
436
+
437
+ while (usrclen > 0) {
438
+ int len = kAdvanceOneChar[usrc[0]];
439
+ // Look up property of one UTF-8 character and advance over it
440
+ // Return 0 if input length is zero
441
+ // Return 0 and advance one byte if input is ill-formed
442
+
443
+ int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
444
+
445
+ if (FLAGS_dbglookup) {
446
+ DbgUniTermToStderr(propval, usrc, len);
447
+ }
448
+
449
+ if (propval > 0) {
450
+ ProcessProbV25UniTote(propval, chunk_tote);
451
+ ++(*tote_grams);
452
+ if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
453
+ }
454
+
455
+ // Advance by 1/2/4/8 characters (half of quad advance)
456
+ if (advance_by == 2) {
457
+ // Already advanced by 1
458
+ } else if (advance_by == 4) {
459
+ // Advance by 2 chars total, if not at end
460
+ if (UTFmax <= usrclen) {
461
+ int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
462
+ }
463
+ } else if (advance_by == 8) {
464
+ // Advance by 4 chars total, if not at end
465
+ if ((UTFmax * 3) <= usrclen) {
466
+ int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
467
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
468
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
469
+ }
470
+ } else {
471
+ // Advance by 8 chars total, if not at end
472
+ if ((UTFmax * 7) <= usrclen) {
473
+ int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
474
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
475
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
476
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
477
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
478
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
479
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
480
+ }
481
+ }
482
+ DCHECK(usrclen >= 0);
483
+
484
+ if (*tote_grams >= gram_limit) {
485
+ break;
486
+ }
487
+ }
488
+ if (FLAGS_dbgscore) {
489
+ // With advance_by>2, we consume more input to get the same number of quads
490
+ int len = src - isrc;
491
+ DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
492
+ DbgScoreFlush();
493
+ }
494
+
495
+ int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
496
+ return consumed2;
497
+ }
498
+
499
+
500
+ // BIGRAM, using hash table, always advancing by 1 char
501
+ // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
502
+ // Score all bigrams in isrc, using languages that have bigrams (CJK)
503
+ // Return number of bigrams that hit in the hash table
504
+ int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
505
+ const char* isrc, int srclen, Tote* chunk_tote) {
506
+ int hit_count = 0;
507
+ const char* src = isrc;
508
+
509
+ // Hashtable-based CJK bigram lookup
510
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
511
+ const uint8* usrclimit1 = usrc + srclen - UTFmax;
512
+ if (FLAGS_dbgscore) {
513
+ fprintf(stderr, " " );
514
+ }
515
+
516
+ while (usrc < usrclimit1) {
517
+ int len = kAdvanceOneChar[usrc[0]];
518
+ int len2 = kAdvanceOneChar[usrc[len]] + len;
519
+
520
+ if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
521
+ // Lookup and score this bigram
522
+ // Always ignore pre/post spaces
523
+ uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
524
+ uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
525
+ // Now go indirect on the subscript
526
+ probs = bigram_obj->kCLDTableInd[probs &
527
+ ~bigram_obj->kCLDTableKeyMask];
528
+
529
+ // Process the bigram
530
+ if (FLAGS_dbglookup) {
531
+ const char* ssrc = reinterpret_cast<const char*>(usrc);
532
+ DbgBiTermToStderr(bihash, probs, ssrc, len2);
533
+ DbgScoreRecord(NULL, probs, len2);
534
+ } else if (FLAGS_dbgscore && (probs != 0)) {
535
+ const char* ssrc = reinterpret_cast<const char*>(usrc);
536
+ DbgScoreRecord(NULL, probs, len2);
537
+ string temp(ssrc, len2);
538
+ fprintf(stderr, "%s ", temp.c_str());
539
+ }
540
+
541
+ if (probs != 0) {
542
+ ProcessProbV25Tote(probs, chunk_tote);
543
+ ++hit_count;
544
+ }
545
+ }
546
+ usrc += len; // Advance by one char
547
+ }
548
+
549
+ if (FLAGS_dbgscore) {
550
+ fprintf(stderr, "[%d bigrams scored]\n", hit_count);
551
+ DbgScoreState();
552
+ }
553
+ return hit_count;
554
+ }
555
+
556
+
557
+
558
+ // QUADGRAM, using hash table, advancing by 2/4/8/16 chars
559
+ // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
560
+ // Score up to n quadgrams, returning number of bytes consumed
561
+ // Updates tote_grams
562
+ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
563
+ const char* isrc, int srclen, int advance_by,
564
+ int* tote_grams, int gram_limit, Tote* chunk_tote) {
565
+ const char* src = isrc;
566
+ const char* srclimit = src + srclen;
567
+ // Limit is end, which has extra 20 20 20 00 past len
568
+ const char* srclimit7 = src + srclen - (UTFmax * 7);
569
+ const char* srclimit15 = src + srclen - (UTFmax * 15);
570
+
571
+ if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
572
+
573
+ // Run a little cache of last hits to catch overly-repetitive "text"
574
+ int next_prior = 0;
575
+ uint32 prior_quads[2] = {0, 0};
576
+
577
+ // Visit all quadgrams
578
+ if (src[0] == ' ') {++src;}
579
+ while (src < srclimit) {
580
+ // Find one quadgram
581
+ const char* src_end = src;
582
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
583
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
584
+ const char* src_mid = src_end;
585
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
586
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
587
+ int len = src_end - src;
588
+
589
+ // Lookup and score this quadgram
590
+ uint32 quadhash = QuadHashV25(src, len);
591
+ uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
592
+ // Now go indirect on the subscript
593
+ probs = quadgram_obj->kCLDTableInd[probs &
594
+ ~quadgram_obj->kCLDTableKeyMask];
595
+
596
+ // Process the quadgram
597
+ if (FLAGS_dbglookup) {
598
+ DbgQuadTermToStderr(quadhash, probs, src, len);
599
+ }
600
+ if (probs != 0) {
601
+ // Filter out recent repeats. If this works out, use in the other lookups
602
+ if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
603
+ prior_quads[next_prior] = quadhash;
604
+ next_prior = (next_prior + 1) & 1;
605
+ ProcessProbV25Tote(probs, chunk_tote);
606
+ ++(*tote_grams);
607
+ if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
608
+ }
609
+ }
610
+
611
+ // Advance all the way past word if at end-of-word
612
+ if (src_end[0] == ' ') {
613
+ src_mid = src_end;
614
+ }
615
+
616
+ // Advance by 2/4/8/16 characters
617
+ if (advance_by == 2) {
618
+ src = src_mid;
619
+ } else if (advance_by == 4) {
620
+ src = src_end;
621
+ } else if (advance_by == 8) {
622
+ // Advance by 8 chars total (4 more), if not at end
623
+ if (src < srclimit7) {
624
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
625
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
626
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
627
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
628
+ }
629
+ src = src_end;
630
+ } else {
631
+ // Advance by 16 chars total (12 more), if not at end
632
+ if (src < srclimit15) {
633
+ // Advance by ~16 chars by adding 3 * current bytelen
634
+ int fourcharlen = src_end - src;
635
+ src = src_end + (3 * fourcharlen);
636
+ // Advance a bit more if mid-character
637
+ src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
638
+ src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
639
+ } else {
640
+ src = src_end;
641
+ }
642
+ }
643
+ DCHECK(src < srclimit);
644
+ src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
645
+
646
+ if (*tote_grams >= gram_limit) {
647
+ break;
648
+ }
649
+ }
650
+
651
+ if (FLAGS_dbgscore) {
652
+ // With advance_by>2, we consume more input to get the same number of quads
653
+ int len = src - isrc;
654
+ DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
655
+ DbgScoreFlush();
656
+ }
657
+
658
+ int consumed = src - isrc;
659
+
660
+ // If advancing by more than 2, src may have overshot srclimit
661
+ if (consumed > srclen) {
662
+ consumed = srclen;
663
+ }
664
+
665
+ return consumed;
666
+ }
667
+
668
+
669
+ // OCTAGRAM, using hash table, always advancing by 1 word
670
+ // Caller supplies table, such as &kLongWord8Table_obj
671
+ // Score all words in isrc, using languages that have quadgrams
672
+ // We don't normally use this routine except on the first quadgram run,
673
+ // but it can be used to resolve unreliable pages.
674
+ // This routine does not have an optimized advance_by
675
+ // SOON: Uses indirect language/probability longword
676
+ //
677
+ // Return number of words that hit in the hash table
678
+ int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
679
+ const char* isrc, int srclen, Tote* chunk_tote) {
680
+ int hit_count = 0;
681
+ const char* src = isrc;
682
+ const char* srclimit = src + srclen + 1;
683
+ // Limit is end+1, to include extra space char (0x20) off the end
684
+ //
685
+ // Score all words truncated to 8 characters
686
+ int charcount = 0;
687
+ // Skip any initial space
688
+ if (src[0] == ' ') {++src;}
689
+ const char* word_ptr = src;
690
+ const char* word_end = word_ptr;
691
+ if (FLAGS_dbgscore) {
692
+ fprintf(stderr, " " );
693
+ }
694
+ while (src < srclimit) {
695
+ // Terminate previous word or continue current word
696
+ if (src[0] == ' ') {
697
+ int bytecount = word_end - word_ptr;
698
+ if (bytecount == 0)
699
+ break;
700
+ // Lookup and score this word
701
+ uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
702
+ uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
703
+ // Now go indirect on the subscript
704
+ probs = octagram_obj->kCLDTableInd[probs &
705
+ ~octagram_obj->kCLDTableKeyMask];
706
+
707
+ // // Lookup and score this word
708
+ // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
709
+ // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
710
+ // kLongWord8TableSize);
711
+ //
712
+ if (FLAGS_dbglookup) {
713
+ DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
714
+ DbgScoreRecord(NULL, probs, bytecount);
715
+ } else if (FLAGS_dbgscore && (probs != 0)) {
716
+ DbgScoreRecord(NULL, probs, bytecount);
717
+ string temp(word_ptr, bytecount);
718
+ fprintf(stderr, "%s ", temp.c_str());
719
+ }
720
+
721
+ if (probs != 0) {
722
+ ProcessProbV25Tote(probs, chunk_tote);
723
+ ++hit_count;
724
+ }
725
+ charcount = 0;
726
+ word_ptr = src + 1; // Over the space
727
+ word_end = word_ptr;
728
+ } else {
729
+ ++charcount;
730
+ }
731
+
732
+ // Advance to next char
733
+ src += cld_UniLib::OneCharLen(src);
734
+ if (charcount <= 8) {
735
+ word_end = src;
736
+ }
737
+ }
738
+
739
+ if (FLAGS_dbgscore) {
740
+ fprintf(stderr, "[%d words scored]\n", hit_count);
741
+ DbgScoreState();
742
+ }
743
+ return hit_count;
744
+ }
745
+
746
+
747
+
748
+ //------------------------------------------------------------------------------
749
+ // Reliability calculations, for single language and between languages
750
+ //------------------------------------------------------------------------------
751
+
752
+ // Return reliablity of result 0..100 for top two scores
753
+ // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
754
+ // (on a scale where +1 is a factor of 2 ** 1.6 = 3.02)
755
+ // Threshold is uni/quadgram increment count, bounded above and below.
756
+ //
757
+ // Requiring a factor of 3 improvement (e.g. +1 log base 3)
758
+ // for each scored quadgram is too stringent, so I've backed this off to a
759
+ // factor of 2 (e.g. +5/8 log base 3).
760
+ //
761
+ // I also somewhat lowered the Min/MaxGramCount limits above
762
+ //
763
+ // Added: if fewer than 8 quads/unis, max reliability is 12*n percent
764
+ //
765
+ int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
766
+ int max_reliability_percent = 100;
767
+ if (gramcount < 8) {
768
+ max_reliability_percent = 12 * gramcount;
769
+ }
770
+ int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above
771
+ if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16
772
+ fully_reliable_thresh = kMinGramCount;
773
+ } else if (fully_reliable_thresh > kMaxGramCount) {
774
+ fully_reliable_thresh = kMaxGramCount;
775
+ }
776
+
777
+ int delta = value1 - value2;
778
+ if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
779
+ if (delta <= 0) {return 0;}
780
+ return cld::minint(max_reliability_percent,
781
+ (100 * delta) / fully_reliable_thresh);
782
+ }
783
+
784
+ // Return reliablity of result 0..100 for top score vs. mainsteam score
785
+ // Values are score per 1024 bytes of input
786
+ // ratio = max(top/mainstream, mainstream/top)
787
+ // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
788
+ // Change: short-text word scoring can give unusually good results.
789
+ // Let top exceed mainstream by 4x at 50% reliable
790
+ int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
791
+ if (mean_score == 0) {return 100;} // No reliability data available yet
792
+ if (topscore == 0) {return 0;} // zero score = unreliable
793
+ if (len == 0) {return 0;} // zero len = unreliable
794
+ int top_kb = (topscore << 10) / len;
795
+ double ratio;
796
+ double ratio_cutoff;
797
+ if (top_kb > mean_score) {
798
+ ratio = (1.0 * top_kb) / mean_score;
799
+ ratio_cutoff = 5.0; // ramp down from 100% to 0%: 3.0-5.0
800
+ } else {
801
+ ratio = (1.0 * mean_score) / top_kb;
802
+ ratio_cutoff = 4.0; // ramp down from 100% to 0%: 2.0-4.0
803
+ }
804
+ if (ratio <= ratio_cutoff - 2.0) {return 100;}
805
+ if (ratio > ratio_cutoff) {return 0;}
806
+
807
+ int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
808
+ return iratio;
809
+ }
810
+
811
+ // Calculate ratio of score per 1KB vs. expected score per 1KB
812
+ double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
813
+ int bytes, int score) {
814
+ // Average training-data score for this language-script combo, per 1KB
815
+ int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
816
+ if (lscript == ULScript_Common) {
817
+ // We don't know the script (only happens with second-chance score)
818
+ // Look for first non-zero mean value
819
+ for (int i = 0; i < 3; ++i) {
820
+ if (kMeanScore[lang * 4 + i] > 0) {
821
+ expected_score = kMeanScore[lang * 4 + i];
822
+ }
823
+ }
824
+ }
825
+ if (expected_score < 100) {
826
+ expected_score = 1000;
827
+ }
828
+
829
+ // Our score per 1KB
830
+ double our_score = (score << 10) / (bytes ? bytes : 1); // Avoid zdiv
831
+ double ratio = our_score / expected_score;
832
+
833
+ // Just the raw count normalized as though each language has mean=1000;
834
+ ratio = (score * 1000.0) / expected_score;
835
+ return ratio;
836
+ }
837
+
838
+ // Calculate reliablity of len bytes of script lscript with chunk_tote
839
+ int cld::GetReliability(int len, UnicodeLScript lscript,
840
+ const Tote* chunk_tote) {
841
+ Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
842
+ // Average score for this language-script combo
843
+ int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
844
+ if (lscript == ULScript_Common) {
845
+ // We don't know the script (only happens with second-chance score)
846
+ // Look for first non-zero mean value
847
+ for (int i = 0; i < 3; ++i) {
848
+ if (kMeanScore[cur_lang * 4 + i] > 0) {
849
+ mean_score = kMeanScore[cur_lang * 4 + i];
850
+ }
851
+ }
852
+ }
853
+ int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
854
+ chunk_tote->Value(1),
855
+ chunk_tote->GetGramCount());
856
+
857
+ int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
858
+ len,
859
+ mean_score);
860
+
861
+ int reliability_min = minint(reliability_delta, reliability_main);
862
+
863
+
864
+ if (FLAGS_dbgreli) {
865
+ char temp1[4];
866
+ char temp2[4];
867
+ cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
868
+ if (temp1[2] == ' ') {temp1[2] = '\0';}
869
+ cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
870
+ if (temp2[2] == ' ') {temp2[2] = '\0';}
871
+ int srclen = len;
872
+ fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d, %s=%d %s=%d "
873
+ "top/KB=%d mean/KB=%d del=%d%% reli=%d%% "
874
+ "lang/lscript %d %d\n",
875
+ chunk_tote->GetGramCount(),
876
+ chunk_tote->GetIncrCount(),
877
+ srclen,
878
+ temp1, chunk_tote->Value(0),
879
+ temp2, chunk_tote->Value(1),
880
+ (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
881
+ mean_score,
882
+ reliability_delta,
883
+ reliability_main,
884
+ cur_lang, lscript);
885
+ }
886
+
887
+ return reliability_min;
888
+ }
889
+
890
+
891
+ //------------------------------------------------------------------------------
892
+ // Miscellaneous
893
+ //------------------------------------------------------------------------------
894
+
895
+ // Demote all languages except Top40 and plus_one
896
+ // Do this just before sorting chunk_tote results
897
+ void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
898
+ for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
899
+ if (chunk_tote->Key(sub) == 0) continue;
900
+ if (chunk_tote->Key(sub) == packed_plus_one) continue;
901
+ if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
902
+ // Quarter the score of others
903
+ chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
904
+ }
905
+ }