language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,905 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include <string>
|
6
|
+
#include "encodings/compact_lang_det/cldutil.h"
|
7
|
+
#include "encodings/compact_lang_det/cldutil_dbg.h"
|
8
|
+
#include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
|
9
|
+
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
|
10
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
11
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
12
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
13
|
+
#include "encodings/compact_lang_det/win/cld_utf.h"
|
14
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
15
|
+
|
16
|
+
// Runtime routines for hashing, looking up, and scoring
|
17
|
+
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
|
18
|
+
// Unigrams and bigrams are for CJK languages only, including simplified/
|
19
|
+
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
|
20
|
+
// Zhuang Han characters. Surrounding spaces are not considered.
|
21
|
+
// Quadgrams and octagrams are for non-CJK and include two bits indicating
|
22
|
+
// preceding and trailing spaces (word boundaries).
|
23
|
+
|
24
|
+
|
25
|
+
// Indicator bits for leading/trailing space around quad/octagram
|
26
|
+
// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
|
27
|
+
// 1-, 2-, or 3-bytes each.
|
28
|
+
// Bits XOR-ed into a hash when the gram is preceded by a space.
static const uint32 kPreSpaceIndicator = 0x00004444;
// Bits XOR-ed into a hash when the gram is followed by a space.
static const uint32 kPostSpaceIndicator = 0x44440000;

// Little-endian masks applied to the last uint32 picked up for a gram,
// indexed by (bytecount & 3).
static const uint32 kWordMask0[4] = {
  0xFFFFFFFF,  // remainder 0: all four bytes of the last word are kept
  0x000000FF,  // remainder 1: only the first byte is kept
  0x0000FFFF,  // remainder 2: first two bytes are kept
  0x00FFFFFF   // remainder 3: first three bytes are kept
};

// A bigram is only looked up when its two characters total at least
// twice this many bytes.
static const int kMinCJKUTF8CharBytes = 3;

static const int kMinGramCount = 3;
static const int kMaxGramCount = 16;
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
|
44
|
+
// Routines to access a hash table of <key:wordhash, value:probs> pairs
|
45
|
+
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
|
46
|
+
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
|
47
|
+
// bucket subscript.
|
48
|
+
// Probs is a packed: three languages plus a subscript for probability table
|
49
|
+
// Buckets have all the keys together, then all the values. Key array never
|
50
|
+
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
|
51
|
+
// Match case may sometimes take an additional cache miss on value access.
|
52
|
+
//
|
53
|
+
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
|
54
|
+
// byte buckets with single cache miss.
|
55
|
+
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
|
56
|
+
//------------------------------------------------------------------------------
|
57
|
+
|
58
|
+
|
59
|
+
//------------------------------------------------------------------------------
|
60
|
+
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
|
61
|
+
//------------------------------------------------------------------------------
|
62
|
+
|
63
|
+
// Design principles for these hash functions
|
64
|
+
// - Few operations
|
65
|
+
// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
|
66
|
+
// Latin script expect 1- and 2-byte mixtures.
|
67
|
+
// - Last byte of each character has about 5 bits of information
|
68
|
+
// - Spread good bits around so they can interact in at least two ways
|
69
|
+
// with other characters
|
70
|
+
// - Use add for additional mixing through carries
|
71
|
+
|
72
|
+
// CJK Three-byte bigram
|
73
|
+
// ....dddd..cccccc..bbbbbb....aaaa
|
74
|
+
// ..................ffffff..eeeeee
|
75
|
+
// make
|
76
|
+
// ....dddd..cccccc..bbbbbb....aaaa
|
77
|
+
// 000....dddd..cccccc..bbbbbb....a
|
78
|
+
// ..................ffffff..eeeeee
|
79
|
+
// ffffff..eeeeee000000000000000000
|
80
|
+
//
|
81
|
+
// CJK Four-byte bigram
|
82
|
+
// ..dddddd..cccccc....bbbb....aaaa
|
83
|
+
// ..hhhhhh..gggggg....ffff....eeee
|
84
|
+
// make
|
85
|
+
// ..dddddd..cccccc....bbbb....aaaa
|
86
|
+
// 000..dddddd..cccccc....bbbb....a
|
87
|
+
// ..hhhhhh..gggggg....ffff....eeee
|
88
|
+
// ..ffff....eeee000000000000000000
|
89
|
+
|
90
|
+
// BIGRAM
|
91
|
+
// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
|
92
|
+
// OVERSHOOTS up to 3 bytes
|
93
|
+
// For runtime use of tables
|
94
|
+
uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
  // Hashes the bytes at word_ptr as one or two little-endian uint32 words.
  // Returns 0 for an empty input. Reads whole 32-bit words, so it may read
  // up to 3 bytes past the end of the gram; the excess is masked off.
  if (bytecount == 0) {
    return 0;
  }

  const uint32* words = reinterpret_cast<const uint32*>(word_ptr);
  // Mask selecting the valid bytes of the last word that is read.
  const uint32 tail_mask = kWordMask0[bytecount & 3];

  if (bytecount <= 4) {
    // Single (possibly partial) word.
    const uint32 only = words[0] & tail_mask;
    return only ^ (only >> 3);
  }

  // Two words: the first is taken whole, the second is masked.
  const uint32 first = words[0];
  const uint32 second = words[1] & tail_mask;
  return (first ^ (first >> 3)) + (second ^ (second << 18));
}
|
112
|
+
|
113
|
+
//
|
114
|
+
// Ascii-7 One-byte chars
|
115
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
116
|
+
// make
|
117
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
118
|
+
// 000...ddddd...ccccc...bbbbb...aa
|
119
|
+
//
|
120
|
+
// Latin 1- and 2-byte chars
|
121
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
122
|
+
// ...................fffff...eeeee
|
123
|
+
// make
|
124
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
125
|
+
// 000...ddddd...ccccc...bbbbb...aa
|
126
|
+
// ...................fffff...eeeee
|
127
|
+
// ...............fffff...eeeee0000
|
128
|
+
//
|
129
|
+
// Non-CJK Two-byte chars
|
130
|
+
// ...ddddd...........bbbbb........
|
131
|
+
// ...hhhhh...........fffff........
|
132
|
+
// make
|
133
|
+
// ...ddddd...........bbbbb........
|
134
|
+
// 000...ddddd...........bbbbb.....
|
135
|
+
// ...hhhhh...........fffff........
|
136
|
+
// hhhh...........fffff........0000
|
137
|
+
//
|
138
|
+
// Non-CJK Three-byte chars
|
139
|
+
// ...........ccccc................
|
140
|
+
// ...................fffff........
|
141
|
+
// ...lllll...................iiiii
|
142
|
+
// make
|
143
|
+
// ...........ccccc................
|
144
|
+
// 000...........ccccc.............
|
145
|
+
// ...................fffff........
|
146
|
+
// ...............fffff........0000
|
147
|
+
// ...lllll...................iiiii
|
148
|
+
// .lllll...................iiiii00
|
149
|
+
//
|
150
|
+
|
151
|
+
// QUADGRAM
|
152
|
+
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
153
|
+
// OVERSHOOTS up to 3 bytes
|
154
|
+
// For runtime use of tables
|
155
|
+
uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
  // Hashes one, two or three little-endian uint32 words starting at word_ptr
  // and folds the caller-supplied pre/post indicator bits into the first
  // word. The last word read is masked according to (bytecount & 3), so up
  // to 3 bytes beyond the gram may be read but do not affect the result.
  const uint32* words = reinterpret_cast<const uint32*>(word_ptr);
  const uint32 tail_mask = kWordMask0[bytecount & 3];

  if (bytecount <= 4) {
    const uint32 w0 = words[0] & tail_mask;
    return (w0 ^ (w0 >> 3)) ^ prepost;
  }

  // From here on the first word is always taken whole.
  const uint32 w0 = words[0];
  const uint32 head = (w0 ^ (w0 >> 3)) ^ prepost;

  if (bytecount <= 8) {
    const uint32 w1 = words[1] & tail_mask;
    return head + (w1 ^ (w1 << 4));
  }

  // Three words; anything past the third word is not looked at.
  const uint32 w1 = words[1];
  const uint32 w2 = words[2] & tail_mask;
  return head + (w1 ^ (w1 << 4)) + (w2 ^ (w2 << 2));
}
|
178
|
+
|
179
|
+
|
180
|
+
// QUADGRAM wrapper with surrounding spaces
|
181
|
+
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
182
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
183
|
+
// For runtime use of tables
|
184
|
+
uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
  // Runtime quadgram hash. Looks at the byte just before and just after the
  // gram to decide which space-indicator bits to mix in, then delegates to
  // QuadHashV25Mix. An empty gram hashes to 0.
  if (bytecount == 0) {
    return 0;
  }

  const bool space_before = (word_ptr[-1] == ' ');
  const bool space_after = (word_ptr[bytecount] == ' ');

  uint32 prepost = 0;
  if (space_before) {
    prepost |= kPreSpaceIndicator;
  }
  if (space_after) {
    prepost |= kPostSpaceIndicator;
  }
  return QuadHashV25Mix(word_ptr, bytecount, prepost);
}
|
193
|
+
|
194
|
+
// QUADGRAM wrapper with surrounding underscores (offline use)
|
195
|
+
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
|
196
|
+
// OVERSHOOTS up to 3 bytes
|
197
|
+
// For offline construction of tables
|
198
|
+
uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
  // Offline quadgram hash. The gram is passed with an optional leading and/or
  // trailing '_' standing in for a word-boundary space. The markers are
  // stripped, converted to the pre/post indicator bits, and the remaining
  // bytes are hashed with QuadHashV25Mix.
  //
  // Returns 0 when there is nothing to hash: an empty input, or an input that
  // consists only of boundary markers ("_" or "__").
  if (bytecount <= 0) {
    return 0;
  }

  const char* letters = word_ptr;
  int letter_bytes = bytecount;
  uint32 prepost = 0;

  if (letters[0] == '_') {
    prepost |= kPreSpaceIndicator;
    ++letters;
    --letter_bytes;
  }

  // Only look for a trailing marker if bytes remain; otherwise a lone "_"
  // would be counted as both the leading and the trailing marker and the
  // byte count would go negative.
  if ((letter_bytes > 0) && (letters[letter_bytes - 1] == '_')) {
    prepost |= kPostSpaceIndicator;
    --letter_bytes;
  }

  // Marker-only input: QuadHashV25Mix must not be called with a count <= 0,
  // because its tail mask would then admit bytes that are not part of the
  // gram.
  if (letter_bytes <= 0) {
    return 0;
  }

  return QuadHashV25Mix(letters, letter_bytes, prepost);
}
|
216
|
+
|
217
|
+
|
218
|
+
// OCTAGRAM
|
219
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
220
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
221
|
+
//
|
222
|
+
// The low 32 bits follow the pattern from above, tuned to different scripts
|
223
|
+
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
224
|
+
// For runtime use of tables V3
|
225
|
+
uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
  // Hashes up to six little-endian uint32 words starting at word_ptr into a
  // 40-bit value: the low part is a shift/xor/add mix of the words, and an
  // 8-bit fold of the plain word sum is placed at bits 32..39.
  //
  // Shift applied to each word before it is XOR-ed with itself, in the order
  // the words are read. Negative entries shift right, positive shift left.
  static const int kMixShift[6] = {-3, 4, 2, -8, -4, -6};

  const uint32* words = reinterpret_cast<const uint32*>(word_ptr);

  // The bytes adjacent to the gram are inspected here as well, in addition
  // to whatever indicator bits the caller already passed in.
  if (word_ptr[-1] == ' ') {
    prepost |= kPreSpaceIndicator;
  }
  if (word_ptr[bytecount] == ' ') {
    prepost |= kPostSpaceIndicator;
  }

  // Index of the last word to read. Groups 0..4 cover 1..20 bytes; every
  // other value (21 bytes and up, as well as any out-of-range group) reads
  // six words and ignores what follows.
  const int group = (bytecount - 1) >> 2;
  const int last = ((group >= 0) && (group <= 4)) ? group : 5;
  const uint32 tail_mask = kWordMask0[bytecount & 3];

  uint64 hash = 0;
  uint64 sum = 0;
  for (int i = 0; i <= last; ++i) {
    uint64 w = words[i];
    if (i == last) {
      // Only the final word is masked down to the gram's valid bytes.
      w &= tail_mask;
    }
    sum += w;
    const int shift = kMixShift[i];
    const uint64 shifted = (shift < 0) ? (w >> -shift) : (w << shift);
    hash += (w ^ shifted);
  }

  sum += (sum >> 17);   // extra 1-bit shift for bytes 2 & 3
  sum += (sum >> 9);    // extra 1-bit shift for bytes 1 & 3
  sum = (sum & 0xff) << 32;
  return (hash ^ prepost) + sum;
}
|
331
|
+
|
332
|
+
// OCTAGRAM wrapper with surrounding spaces
|
333
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
334
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
335
|
+
//
|
336
|
+
// The low 32 bits follow the pattern from above, tuned to different scripts
|
337
|
+
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
338
|
+
// For runtime use of tables V3
|
339
|
+
uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
  // Runtime octagram hash. Derives the space-indicator bits from the bytes
  // immediately around the gram and hands off to OctaHash40Mix. An empty
  // gram hashes to 0.
  if (bytecount == 0) {
    return 0;
  }

  const bool space_before = (word_ptr[-1] == ' ');
  const bool space_after = (word_ptr[bytecount] == ' ');

  uint64 prepost = 0;
  if (space_before) {
    prepost |= kPreSpaceIndicator;
  }
  if (space_after) {
    prepost |= kPostSpaceIndicator;
  }
  return OctaHash40Mix(word_ptr, bytecount, prepost);
}
|
348
|
+
|
349
|
+
|
350
|
+
// OCTAGRAM wrapper with surrounding underscores (offline use)
|
351
|
+
// Pick up 1..24 bytes plus pre/post '_' and hash them via mask/shift/add
|
352
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
353
|
+
//
|
354
|
+
// The low 32 bits follow the pattern from above, tuned to different scripts
|
355
|
+
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
356
|
+
// For offline construction of tables
|
357
|
+
uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
  // Offline octagram hash. The gram is passed with an optional leading and/or
  // trailing '_' standing in for a word-boundary space. The markers are
  // stripped, converted to the pre/post indicator bits, and the remaining
  // bytes are hashed with OctaHash40Mix.
  //
  // Returns 0 when there is nothing to hash: an empty input, or an input that
  // consists only of boundary markers ("_" or "__").
  if (bytecount <= 0) {
    return 0;
  }

  const char* letters = word_ptr;
  int letter_bytes = bytecount;
  uint64 prepost = 0;

  if (letters[0] == '_') {
    prepost |= kPreSpaceIndicator;
    ++letters;
    --letter_bytes;
  }

  // Only look for a trailing marker if bytes remain; otherwise a lone "_"
  // would be counted as both the leading and the trailing marker and the
  // byte count would go negative.
  if ((letter_bytes > 0) && (letters[letter_bytes - 1] == '_')) {
    prepost |= kPostSpaceIndicator;
    --letter_bytes;
  }

  // Marker-only input: OctaHash40Mix must not be called with a count <= 0,
  // because it would then read and hash words that are not part of the gram.
  if (letter_bytes <= 0) {
    return 0;
  }

  return OctaHash40Mix(letters, letter_bytes, prepost);
}
|
375
|
+
|
376
|
+
|
377
|
+
|
378
|
+
|
379
|
+
//------------------------------------------------------------------------------
|
380
|
+
// Scoring single groups of letters
|
381
|
+
//------------------------------------------------------------------------------
|
382
|
+
|
383
|
+
// UNIGRAM score one => tote
|
384
|
+
// Input: 1-byte entry of subscript into unigram probs, plus
|
385
|
+
// an accumulator tote.
|
386
|
+
// Output: running sums in tote updated
|
387
|
+
void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
|
388
|
+
tote->AddGram();
|
389
|
+
const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
|
390
|
+
if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
|
391
|
+
if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
|
392
|
+
if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
|
393
|
+
if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
|
394
|
+
if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
|
395
|
+
if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
|
396
|
+
}
|
397
|
+
|
398
|
+
// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
|
399
|
+
// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
|
400
|
+
// an accumulator tote. (language 0 means unused entry)
|
401
|
+
// Output: running sums in tote updated
|
402
|
+
void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
  // probs packs four bytes: the low byte is a subscript used to find a
  // probability table entry, and the next three bytes are language numbers
  // (0 meaning "unused"). Counts one gram, then adds each non-zero language
  // with the probability at its position in the table entry.
  tote->AddGram();

  const uint8 subscript = (probs >> 0) & 0xff;
  const uint8* entry = cld::LgProb2TblEntry(subscript);

  for (int rank = 0; rank < 3; ++rank) {
    // Languages live in bytes 1, 2 and 3 of probs.
    const uint8 lang = (probs >> (8 * (rank + 1))) & 0xff;
    if (lang > 0) {
      tote->Add(lang, cld::LgProb3(entry, rank));
    }
  }
}
|
414
|
+
|
415
|
+
|
416
|
+
//------------------------------------------------------------------------------
|
417
|
+
// Routines to accumulate probabilities
|
418
|
+
//------------------------------------------------------------------------------
|
419
|
+
|
420
|
+
|
421
|
+
// UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
|
422
|
+
// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
|
423
|
+
// Score up to n unigrams, returning number of bytes consumed
|
424
|
+
// Updates tote_grams
|
425
|
+
int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
|
426
|
+
const char* isrc, int srclen, int advance_by,
|
427
|
+
int* tote_grams, int gram_limit, Tote* chunk_tote) {
|
428
|
+
const char* src = isrc;
|
429
|
+
if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
|
430
|
+
|
431
|
+
// Property-based CJK unigram lookup
|
432
|
+
if (src[0] == ' ') {++src; --srclen;}
|
433
|
+
|
434
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
435
|
+
int usrclen = srclen;
|
436
|
+
|
437
|
+
while (usrclen > 0) {
|
438
|
+
int len = kAdvanceOneChar[usrc[0]];
|
439
|
+
// Look up property of one UTF-8 character and advance over it
|
440
|
+
// Return 0 if input length is zero
|
441
|
+
// Return 0 and advance one byte if input is ill-formed
|
442
|
+
|
443
|
+
int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
|
444
|
+
|
445
|
+
if (FLAGS_dbglookup) {
|
446
|
+
DbgUniTermToStderr(propval, usrc, len);
|
447
|
+
}
|
448
|
+
|
449
|
+
if (propval > 0) {
|
450
|
+
ProcessProbV25UniTote(propval, chunk_tote);
|
451
|
+
++(*tote_grams);
|
452
|
+
if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
|
453
|
+
}
|
454
|
+
|
455
|
+
// Advance by 1/2/4/8 characters (half of quad advance)
|
456
|
+
if (advance_by == 2) {
|
457
|
+
// Already advanced by 1
|
458
|
+
} else if (advance_by == 4) {
|
459
|
+
// Advance by 2 chars total, if not at end
|
460
|
+
if (UTFmax <= usrclen) {
|
461
|
+
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
462
|
+
}
|
463
|
+
} else if (advance_by == 8) {
|
464
|
+
// Advance by 4 chars total, if not at end
|
465
|
+
if ((UTFmax * 3) <= usrclen) {
|
466
|
+
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
467
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
468
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
469
|
+
}
|
470
|
+
} else {
|
471
|
+
// Advance by 8 chars total, if not at end
|
472
|
+
if ((UTFmax * 7) <= usrclen) {
|
473
|
+
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
474
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
475
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
476
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
477
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
478
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
479
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
480
|
+
}
|
481
|
+
}
|
482
|
+
DCHECK(usrclen >= 0);
|
483
|
+
|
484
|
+
if (*tote_grams >= gram_limit) {
|
485
|
+
break;
|
486
|
+
}
|
487
|
+
}
|
488
|
+
if (FLAGS_dbgscore) {
|
489
|
+
// With advance_by>2, we consume more input to get the same number of quads
|
490
|
+
int len = src - isrc;
|
491
|
+
DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
|
492
|
+
DbgScoreFlush();
|
493
|
+
}
|
494
|
+
|
495
|
+
int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
|
496
|
+
return consumed2;
|
497
|
+
}
|
498
|
+
|
499
|
+
|
500
|
+
// BIGRAM, using hash table, always advancing by 1 char
|
501
|
+
// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
|
502
|
+
// Score all bigrams in isrc, using languages that have bigrams (CJK)
|
503
|
+
// Return number of bigrams that hit in the hash table
|
504
|
+
int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
                         const char* isrc, int srclen, Tote* chunk_tote) {
  // Walks isrc one character at a time, hashing each pair of adjacent
  // characters that is long enough to be two CJK characters and scoring the
  // pairs found in the table. Returns the number of table hits.
  int hit_count = 0;

  const uint8* cursor = reinterpret_cast<const uint8*>(isrc);
  // Stop UTFmax bytes short of the end of the input.
  const uint8* const limit = cursor + srclen - UTFmax;

  if (FLAGS_dbgscore) {
    fprintf(stderr, " " );
  }

  while (cursor < limit) {
    const int first_len = kAdvanceOneChar[cursor[0]];
    const int pair_len = first_len + kAdvanceOneChar[cursor[first_len]];

    if (pair_len < (kMinCJKUTF8CharBytes * 2)) {
      // Too short to be two CJK characters: just step over one character.
      cursor += first_len;
      continue;
    }

    // Hash the pair (pre/post spaces are never considered) and look it up.
    const char* pair = reinterpret_cast<const char*>(cursor);
    const uint32 bihash = BiHashV25(pair, pair_len);
    uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
    // The non-key bits of the lookup result index the indirect table.
    probs = bigram_obj->kCLDTableInd[probs & ~bigram_obj->kCLDTableKeyMask];

    if (FLAGS_dbglookup) {
      DbgBiTermToStderr(bihash, probs, pair, pair_len);
      DbgScoreRecord(NULL, probs, pair_len);
    } else if (FLAGS_dbgscore && (probs != 0)) {
      DbgScoreRecord(NULL, probs, pair_len);
      string temp(pair, pair_len);
      fprintf(stderr, "%s ", temp.c_str());
    }

    if (probs != 0) {
      ProcessProbV25Tote(probs, chunk_tote);
      ++hit_count;
    }

    cursor += first_len;    // Advance by one char
  }

  if (FLAGS_dbgscore) {
    fprintf(stderr, "[%d bigrams scored]\n", hit_count);
    DbgScoreState();
  }
  return hit_count;
}
|
555
|
+
|
556
|
+
|
557
|
+
|
558
|
+
// QUADGRAM, using hash table, advancing by 2/4/8/16 chars
|
559
|
+
// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
|
560
|
+
// Score up to n quadgrams, returning number of bytes consumed
|
561
|
+
// Updates tote_grams
|
562
|
+
int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
                       const char* isrc, int srclen, int advance_by,
                       int* tote_grams, int gram_limit, Tote* chunk_tote) {
  // Scores quadgrams from isrc via the hash table until the input is
  // exhausted or *tote_grams reaches gram_limit. Returns the number of bytes
  // consumed, capped at srclen.
  const char* src = isrc;
  const char* const srclimit = isrc + srclen;
  // Limit is end, which has extra 20 20 20 00 past len
  const char* const srclimit7 = isrc + srclen - (UTFmax * 7);
  const char* const srclimit15 = isrc + srclen - (UTFmax * 15);

  if (FLAGS_dbgscore) {
    DbgScoreInit(src, srclen);
  }

  // Two-entry ring of the most recently scored hashes, used to drop
  // immediate repeats.
  uint32 recent[2] = {0, 0};
  int recent_slot = 0;

  // Skip a single leading space.
  if (src[0] == ' ') {
    ++src;
  }

  while (src < srclimit) {
    // Delimit one quadgram: four characters, stopping early at a space.
    const char* src_end = src;
    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    const char* src_mid = src_end;      // position after the second char
    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    const int len = src_end - src;

    // Hash, look up, then go indirect on the subscript.
    const uint32 quadhash = QuadHashV25(src, len);
    uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
    probs = quadgram_obj->kCLDTableInd[probs &
                                       ~quadgram_obj->kCLDTableKeyMask];

    if (FLAGS_dbglookup) {
      DbgQuadTermToStderr(quadhash, probs, src, len);
    }

    // Score a hit unless it repeats one of the last two scored hashes.
    if ((probs != 0) &&
        (quadhash != recent[0]) && (quadhash != recent[1])) {
      recent[recent_slot] = quadhash;
      recent_slot ^= 1;
      ProcessProbV25Tote(probs, chunk_tote);
      ++(*tote_grams);
      if (FLAGS_dbgscore) {
        DbgScoreRecord(src, probs, len);
      }
    }

    // At end-of-word the half-step lands on the word end too.
    if (src_end[0] == ' ') {
      src_mid = src_end;
    }

    // Advance by 2/4/8/16 characters
    switch (advance_by) {
      case 2:
        src = src_mid;
        break;
      case 4:
        src = src_end;
        break;
      case 8:
        // Four more characters, if not near the end of the input.
        if (src < srclimit7) {
          for (int k = 0; k < 4; ++k) {
            src_end += kAdvanceOneChar[(uint8)src_end[0]];
          }
        }
        src = src_end;
        break;
      default:
        if (src < srclimit15) {
          // Roughly twelve more characters: three times the byte length of
          // the quadgram just scored, then two corrective steps in case that
          // landed mid-character.
          src = src_end + (3 * len);
          src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
          src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
        } else {
          src = src_end;
        }
        break;
    }
    DCHECK(src < srclimit);
    src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];

    if (*tote_grams >= gram_limit) {
      break;
    }
  }

  if (FLAGS_dbgscore) {
    // With advance_by>2, we consume more input to get the same number of quads
    const int len = src - isrc;
    DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
    DbgScoreFlush();
  }

  // If advancing by more than 2, src may have overshot srclimit
  const int consumed = src - isrc;
  return (consumed > srclen) ? srclen : consumed;
}
|
667
|
+
|
668
|
+
|
669
|
+
// OCTAGRAM, using hash table, always advancing by 1 word
|
670
|
+
// Caller supplies table, such as &kLongWord8Table_obj
|
671
|
+
// Score all words in isrc, using languages that have quadgrams
|
672
|
+
// We don't normally use this routine except on the first quadgram run,
|
673
|
+
// but it can be used to resolve unreliable pages.
|
674
|
+
// This routine does not have an optimized advance_by
|
675
|
+
// SOON: Uses indirect language/probability longword
|
676
|
+
//
|
677
|
+
// Return number of words that hit in the hash table
|
678
|
+
int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
|
679
|
+
const char* isrc, int srclen, Tote* chunk_tote) {
|
680
|
+
int hit_count = 0;
|
681
|
+
const char* src = isrc;
|
682
|
+
const char* srclimit = src + srclen + 1;
|
683
|
+
// Limit is end+1, to include extra space char (0x20) off the end
|
684
|
+
//
|
685
|
+
// Score all words truncated to 8 characters
|
686
|
+
int charcount = 0;
|
687
|
+
// Skip any initial space
|
688
|
+
if (src[0] == ' ') {++src;}
|
689
|
+
const char* word_ptr = src;
|
690
|
+
const char* word_end = word_ptr;
|
691
|
+
if (FLAGS_dbgscore) {
|
692
|
+
fprintf(stderr, " " );
|
693
|
+
}
|
694
|
+
while (src < srclimit) {
|
695
|
+
// Terminate previous word or continue current word
|
696
|
+
if (src[0] == ' ') {
|
697
|
+
int bytecount = word_end - word_ptr;
|
698
|
+
if (bytecount == 0)
|
699
|
+
break;
|
700
|
+
// Lookup and score this word
|
701
|
+
uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
|
702
|
+
uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
|
703
|
+
// Now go indirect on the subscript
|
704
|
+
probs = octagram_obj->kCLDTableInd[probs &
|
705
|
+
~octagram_obj->kCLDTableKeyMask];
|
706
|
+
|
707
|
+
// // Lookup and score this word
|
708
|
+
// uint32 wordhash = QuadHashV25(word_ptr, bytecount);
|
709
|
+
// uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
|
710
|
+
// kLongWord8TableSize);
|
711
|
+
//
|
712
|
+
if (FLAGS_dbglookup) {
|
713
|
+
DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
|
714
|
+
DbgScoreRecord(NULL, probs, bytecount);
|
715
|
+
} else if (FLAGS_dbgscore && (probs != 0)) {
|
716
|
+
DbgScoreRecord(NULL, probs, bytecount);
|
717
|
+
string temp(word_ptr, bytecount);
|
718
|
+
fprintf(stderr, "%s ", temp.c_str());
|
719
|
+
}
|
720
|
+
|
721
|
+
if (probs != 0) {
|
722
|
+
ProcessProbV25Tote(probs, chunk_tote);
|
723
|
+
++hit_count;
|
724
|
+
}
|
725
|
+
charcount = 0;
|
726
|
+
word_ptr = src + 1; // Over the space
|
727
|
+
word_end = word_ptr;
|
728
|
+
} else {
|
729
|
+
++charcount;
|
730
|
+
}
|
731
|
+
|
732
|
+
// Advance to next char
|
733
|
+
src += cld_UniLib::OneCharLen(src);
|
734
|
+
if (charcount <= 8) {
|
735
|
+
word_end = src;
|
736
|
+
}
|
737
|
+
}
|
738
|
+
|
739
|
+
if (FLAGS_dbgscore) {
|
740
|
+
fprintf(stderr, "[%d words scored]\n", hit_count);
|
741
|
+
DbgScoreState();
|
742
|
+
}
|
743
|
+
return hit_count;
|
744
|
+
}
|
745
|
+
|
746
|
+
|
747
|
+
|
748
|
+
//------------------------------------------------------------------------------
|
749
|
+
// Reliability calculations, for single language and between languages
|
750
|
+
//------------------------------------------------------------------------------
|
751
|
+
|
752
|
+
// Return reliablity of result 0..100 for top two scores
|
753
|
+
// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
|
754
|
+
// (on a scale where +1 is a factor of 2 ** 1.6 = 3.02)
|
755
|
+
// Threshold is uni/quadgram increment count, bounded above and below.
|
756
|
+
//
|
757
|
+
// Requiring a factor of 3 improvement (e.g. +1 log base 3)
|
758
|
+
// for each scored quadgram is too stringent, so I've backed this off to a
|
759
|
+
// factor of 2 (e.g. +5/8 log base 3).
|
760
|
+
//
|
761
|
+
// I also somewhat lowered the Min/MaxGramCount limits above
|
762
|
+
//
|
763
|
+
// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
|
764
|
+
//
|
765
|
+
int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
|
766
|
+
int max_reliability_percent = 100;
|
767
|
+
if (gramcount < 8) {
|
768
|
+
max_reliability_percent = 12 * gramcount;
|
769
|
+
}
|
770
|
+
int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above
|
771
|
+
if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16
|
772
|
+
fully_reliable_thresh = kMinGramCount;
|
773
|
+
} else if (fully_reliable_thresh > kMaxGramCount) {
|
774
|
+
fully_reliable_thresh = kMaxGramCount;
|
775
|
+
}
|
776
|
+
|
777
|
+
int delta = value1 - value2;
|
778
|
+
if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
|
779
|
+
if (delta <= 0) {return 0;}
|
780
|
+
return cld::minint(max_reliability_percent,
|
781
|
+
(100 * delta) / fully_reliable_thresh);
|
782
|
+
}
|
783
|
+
|
784
|
+
// Return reliablity of result 0..100 for top score vs. mainsteam score
|
785
|
+
// Values are score per 1024 bytes of input
|
786
|
+
// ratio = max(top/mainstream, mainstream/top)
|
787
|
+
// ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
|
788
|
+
// Change: short-text word scoring can give unusually good results.
|
789
|
+
// Let top exceed mainstream by 4x at 50% reliable
|
790
|
+
int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
|
791
|
+
if (mean_score == 0) {return 100;} // No reliability data available yet
|
792
|
+
if (topscore == 0) {return 0;} // zero score = unreliable
|
793
|
+
if (len == 0) {return 0;} // zero len = unreliable
|
794
|
+
int top_kb = (topscore << 10) / len;
|
795
|
+
double ratio;
|
796
|
+
double ratio_cutoff;
|
797
|
+
if (top_kb > mean_score) {
|
798
|
+
ratio = (1.0 * top_kb) / mean_score;
|
799
|
+
ratio_cutoff = 5.0; // ramp down from 100% to 0%: 3.0-5.0
|
800
|
+
} else {
|
801
|
+
ratio = (1.0 * mean_score) / top_kb;
|
802
|
+
ratio_cutoff = 4.0; // ramp down from 100% to 0%: 2.0-4.0
|
803
|
+
}
|
804
|
+
if (ratio <= ratio_cutoff - 2.0) {return 100;}
|
805
|
+
if (ratio > ratio_cutoff) {return 0;}
|
806
|
+
|
807
|
+
int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
|
808
|
+
return iratio;
|
809
|
+
}
|
810
|
+
|
811
|
+
// Calculate ratio of score per 1KB vs. expected score per 1KB
|
812
|
+
double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
|
813
|
+
int bytes, int score) {
|
814
|
+
// Average training-data score for this language-script combo, per 1KB
|
815
|
+
int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
|
816
|
+
if (lscript == ULScript_Common) {
|
817
|
+
// We don't know the script (only happens with second-chance score)
|
818
|
+
// Look for first non-zero mean value
|
819
|
+
for (int i = 0; i < 3; ++i) {
|
820
|
+
if (kMeanScore[lang * 4 + i] > 0) {
|
821
|
+
expected_score = kMeanScore[lang * 4 + i];
|
822
|
+
}
|
823
|
+
}
|
824
|
+
}
|
825
|
+
if (expected_score < 100) {
|
826
|
+
expected_score = 1000;
|
827
|
+
}
|
828
|
+
|
829
|
+
// Our score per 1KB
|
830
|
+
double our_score = (score << 10) / (bytes ? bytes : 1); // Avoid zdiv
|
831
|
+
double ratio = our_score / expected_score;
|
832
|
+
|
833
|
+
// Just the raw count normalized as though each language has mean=1000;
|
834
|
+
ratio = (score * 1000.0) / expected_score;
|
835
|
+
return ratio;
|
836
|
+
}
|
837
|
+
|
838
|
+
// Calculate reliablity of len bytes of script lscript with chunk_tote
|
839
|
+
int cld::GetReliability(int len, UnicodeLScript lscript,
|
840
|
+
const Tote* chunk_tote) {
|
841
|
+
Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
|
842
|
+
// Average score for this language-script combo
|
843
|
+
int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
|
844
|
+
if (lscript == ULScript_Common) {
|
845
|
+
// We don't know the script (only happens with second-chance score)
|
846
|
+
// Look for first non-zero mean value
|
847
|
+
for (int i = 0; i < 3; ++i) {
|
848
|
+
if (kMeanScore[cur_lang * 4 + i] > 0) {
|
849
|
+
mean_score = kMeanScore[cur_lang * 4 + i];
|
850
|
+
}
|
851
|
+
}
|
852
|
+
}
|
853
|
+
int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
|
854
|
+
chunk_tote->Value(1),
|
855
|
+
chunk_tote->GetGramCount());
|
856
|
+
|
857
|
+
int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
|
858
|
+
len,
|
859
|
+
mean_score);
|
860
|
+
|
861
|
+
int reliability_min = minint(reliability_delta, reliability_main);
|
862
|
+
|
863
|
+
|
864
|
+
if (FLAGS_dbgreli) {
|
865
|
+
char temp1[4];
|
866
|
+
char temp2[4];
|
867
|
+
cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
|
868
|
+
if (temp1[2] == ' ') {temp1[2] = '\0';}
|
869
|
+
cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
|
870
|
+
if (temp2[2] == ' ') {temp2[2] = '\0';}
|
871
|
+
int srclen = len;
|
872
|
+
fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d, %s=%d %s=%d "
|
873
|
+
"top/KB=%d mean/KB=%d del=%d%% reli=%d%% "
|
874
|
+
"lang/lscript %d %d\n",
|
875
|
+
chunk_tote->GetGramCount(),
|
876
|
+
chunk_tote->GetIncrCount(),
|
877
|
+
srclen,
|
878
|
+
temp1, chunk_tote->Value(0),
|
879
|
+
temp2, chunk_tote->Value(1),
|
880
|
+
(chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
|
881
|
+
mean_score,
|
882
|
+
reliability_delta,
|
883
|
+
reliability_main,
|
884
|
+
cur_lang, lscript);
|
885
|
+
}
|
886
|
+
|
887
|
+
return reliability_min;
|
888
|
+
}
|
889
|
+
|
890
|
+
|
891
|
+
//------------------------------------------------------------------------------
|
892
|
+
// Miscellaneous
|
893
|
+
//------------------------------------------------------------------------------
|
894
|
+
|
895
|
+
// Demote all languages except Top40 and plus_one
|
896
|
+
// Do this just before sorting chunk_tote results
|
897
|
+
void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
|
898
|
+
for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
|
899
|
+
if (chunk_tote->Key(sub) == 0) continue;
|
900
|
+
if (chunk_tote->Key(sub) == packed_plus_one) continue;
|
901
|
+
if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
|
902
|
+
// Quarter the score of others
|
903
|
+
chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
|
904
|
+
}
|
905
|
+
}
|