language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
|
@@ -0,0 +1,905 @@
|
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include <string>
|
|
6
|
+
#include "encodings/compact_lang_det/cldutil.h"
|
|
7
|
+
#include "encodings/compact_lang_det/cldutil_dbg.h"
|
|
8
|
+
#include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
|
|
9
|
+
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
|
|
10
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
|
11
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
|
12
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
|
13
|
+
#include "encodings/compact_lang_det/win/cld_utf.h"
|
|
14
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
15
|
+
|
|
16
|
+
// Runtime routines for hashing, looking up, and scoring
|
|
17
|
+
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
|
|
18
|
+
// Unigrams and bigrams are for CJK languages only, including simplified/
|
|
19
|
+
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
|
|
20
|
+
// Zhuang Han characters. Surrounding spaces are not considered.
|
|
21
|
+
// Quadgrams and octagrams are for non-CJK and include two bits indicating
|
|
22
|
+
// preceding and trailing spaces (word boundaries).
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
// Indicator bits for leading/trailing space around quad/octagram
|
|
26
|
+
// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
|
|
27
|
+
// 1-, 2-, or 3-bytes each.
|
|
28
|
+
static const uint32 kPreSpaceIndicator = 0x00004444;
|
|
29
|
+
static const uint32 kPostSpaceIndicator = 0x44440000;
|
|
30
|
+
|
|
31
|
+
// Little-endian masks for 0..24 bytes picked up as uint32's
|
|
32
|
+
static const uint32 kWordMask0[4] = {
|
|
33
|
+
0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
static const int kMinCJKUTF8CharBytes = 3;
|
|
37
|
+
|
|
38
|
+
static const int kMinGramCount = 3;
|
|
39
|
+
static const int kMaxGramCount = 16;
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
// Routines to access a hash table of <key:wordhash, value:probs> pairs
|
|
45
|
+
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
|
|
46
|
+
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
|
|
47
|
+
// bucket subscript.
|
|
48
|
+
// Probs is packed: three languages plus a subscript into a probability table
|
|
49
|
+
// Buckets have all the keys together, then all the values. Key array never
|
|
50
|
+
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
|
|
51
|
+
// Match case may sometimes take an additional cache miss on value access.
|
|
52
|
+
//
|
|
53
|
+
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
|
|
54
|
+
// byte buckets with single cache miss.
|
|
55
|
+
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
|
|
56
|
+
//------------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
//------------------------------------------------------------------------------
|
|
60
|
+
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
|
|
61
|
+
//------------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
// Design principles for these hash functions
|
|
64
|
+
// - Few operations
|
|
65
|
+
// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
|
|
66
|
+
// Latin script expect 1- and 2-byte mixtures.
|
|
67
|
+
// - Last byte of each character has about 5 bits of information
|
|
68
|
+
// - Spread good bits around so they can interact in at least two ways
|
|
69
|
+
// with other characters
|
|
70
|
+
// - Use add for additional mixing through carries
|
|
71
|
+
|
|
72
|
+
// CJK Three-byte bigram
|
|
73
|
+
// ....dddd..cccccc..bbbbbb....aaaa
|
|
74
|
+
// ..................ffffff..eeeeee
|
|
75
|
+
// make
|
|
76
|
+
// ....dddd..cccccc..bbbbbb....aaaa
|
|
77
|
+
// 000....dddd..cccccc..bbbbbb....a
|
|
78
|
+
// ..................ffffff..eeeeee
|
|
79
|
+
// ffffff..eeeeee000000000000000000
|
|
80
|
+
//
|
|
81
|
+
// CJK Four-byte bigram
|
|
82
|
+
// ..dddddd..cccccc....bbbb....aaaa
|
|
83
|
+
// ..hhhhhh..gggggg....ffff....eeee
|
|
84
|
+
// make
|
|
85
|
+
// ..dddddd..cccccc....bbbb....aaaa
|
|
86
|
+
// 000..dddddd..cccccc....bbbb....a
|
|
87
|
+
// ..hhhhhh..gggggg....ffff....eeee
|
|
88
|
+
// ..ffff....eeee000000000000000000
|
|
89
|
+
|
|
90
|
+
// BIGRAM
|
|
91
|
+
// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
|
|
92
|
+
// OVERSHOOTS up to 3 bytes
|
|
93
|
+
// For runtime use of tables
|
|
94
|
+
uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
|
|
95
|
+
if (bytecount == 0) {
|
|
96
|
+
return 0;
|
|
97
|
+
}
|
|
98
|
+
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
|
99
|
+
uint32 word0, word1;
|
|
100
|
+
if (bytecount <= 4) {
|
|
101
|
+
word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
|
|
102
|
+
word0 = word0 ^ (word0 >> 3);
|
|
103
|
+
return word0;
|
|
104
|
+
}
|
|
105
|
+
// Else do 8 bytes
|
|
106
|
+
word0 = word_ptr32[0];
|
|
107
|
+
word0 = word0 ^ (word0 >> 3);
|
|
108
|
+
word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
|
|
109
|
+
word1 = word1 ^ (word1 << 18);
|
|
110
|
+
return word0 + word1;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
//
|
|
114
|
+
// Ascii-7 One-byte chars
|
|
115
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
|
116
|
+
// make
|
|
117
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
|
118
|
+
// 000...ddddd...ccccc...bbbbb...aa
|
|
119
|
+
//
|
|
120
|
+
// Latin 1- and 2-byte chars
|
|
121
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
|
122
|
+
// ...................fffff...eeeee
|
|
123
|
+
// make
|
|
124
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
|
125
|
+
// 000...ddddd...ccccc...bbbbb...aa
|
|
126
|
+
// ...................fffff...eeeee
|
|
127
|
+
// ...............fffff...eeeee0000
|
|
128
|
+
//
|
|
129
|
+
// Non-CJK Two-byte chars
|
|
130
|
+
// ...ddddd...........bbbbb........
|
|
131
|
+
// ...hhhhh...........fffff........
|
|
132
|
+
// make
|
|
133
|
+
// ...ddddd...........bbbbb........
|
|
134
|
+
// 000...ddddd...........bbbbb.....
|
|
135
|
+
// ...hhhhh...........fffff........
|
|
136
|
+
// hhhh...........fffff........0000
|
|
137
|
+
//
|
|
138
|
+
// Non-CJK Three-byte chars
|
|
139
|
+
// ...........ccccc................
|
|
140
|
+
// ...................fffff........
|
|
141
|
+
// ...lllll...................iiiii
|
|
142
|
+
// make
|
|
143
|
+
// ...........ccccc................
|
|
144
|
+
// 000...........ccccc.............
|
|
145
|
+
// ...................fffff........
|
|
146
|
+
// ...............fffff........0000
|
|
147
|
+
// ...lllll...................iiiii
|
|
148
|
+
// .lllll...................iiiii00
|
|
149
|
+
//
|
|
150
|
+
|
|
151
|
+
// QUADGRAM
|
|
152
|
+
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
|
153
|
+
// OVERSHOOTS up to 3 bytes
|
|
154
|
+
// For runtime use of tables
|
|
155
|
+
uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
|
|
156
|
+
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
|
157
|
+
uint32 word0, word1, word2;
|
|
158
|
+
if (bytecount <= 4) {
|
|
159
|
+
word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
|
|
160
|
+
word0 = word0 ^ (word0 >> 3);
|
|
161
|
+
return word0 ^ prepost;
|
|
162
|
+
} else if (bytecount <= 8) {
|
|
163
|
+
word0 = word_ptr32[0];
|
|
164
|
+
word0 = word0 ^ (word0 >> 3);
|
|
165
|
+
word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
|
|
166
|
+
word1 = word1 ^ (word1 << 4);
|
|
167
|
+
return (word0 ^ prepost) + word1;
|
|
168
|
+
}
|
|
169
|
+
// else do 12 bytes
|
|
170
|
+
word0 = word_ptr32[0];
|
|
171
|
+
word0 = word0 ^ (word0 >> 3);
|
|
172
|
+
word1 = word_ptr32[1];
|
|
173
|
+
word1 = word1 ^ (word1 << 4);
|
|
174
|
+
word2 = word_ptr32[2] & kWordMask0[bytecount & 3];
|
|
175
|
+
word2 = word2 ^ (word2 << 2);
|
|
176
|
+
return (word0 ^ prepost) + word1 + word2;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
// QUADGRAM wrapper with surrounding spaces
|
|
181
|
+
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
|
182
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
183
|
+
// For runtime use of tables
|
|
184
|
+
uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
|
|
185
|
+
if (bytecount == 0) {
|
|
186
|
+
return 0;
|
|
187
|
+
}
|
|
188
|
+
uint32 prepost = 0;
|
|
189
|
+
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
|
190
|
+
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
|
191
|
+
return QuadHashV25Mix(word_ptr, bytecount, prepost);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// QUADGRAM wrapper with surrounding underscores (offline use)
|
|
195
|
+
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
|
|
196
|
+
// OVERSHOOTS up to 3 bytes
|
|
197
|
+
// For offline construction of tables
|
|
198
|
+
uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
|
|
199
|
+
if (bytecount == 0) {
|
|
200
|
+
return 0;
|
|
201
|
+
}
|
|
202
|
+
const char* local_word_ptr = word_ptr;
|
|
203
|
+
int local_bytecount = bytecount;
|
|
204
|
+
uint32 prepost = 0;
|
|
205
|
+
if (local_word_ptr[0] == '_') {
|
|
206
|
+
prepost |= kPreSpaceIndicator;
|
|
207
|
+
++local_word_ptr;
|
|
208
|
+
--local_bytecount;
|
|
209
|
+
}
|
|
210
|
+
if (local_word_ptr[local_bytecount - 1] == '_') {
|
|
211
|
+
prepost |= kPostSpaceIndicator;
|
|
212
|
+
--local_bytecount;
|
|
213
|
+
}
|
|
214
|
+
return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
// OCTAGRAM
|
|
219
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
|
220
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
221
|
+
//
|
|
222
|
+
// The low 32 bits follow the pattern from above, tuned to different scripts
|
|
223
|
+
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
|
224
|
+
// For runtime use of tables V3
|
|
225
|
+
uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
|
|
226
|
+
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
|
227
|
+
uint64 word0;
|
|
228
|
+
uint64 word1;
|
|
229
|
+
uint64 sum;
|
|
230
|
+
|
|
231
|
+
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
|
232
|
+
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
|
233
|
+
switch ((bytecount - 1) >> 2) {
|
|
234
|
+
case 0: // 1..4 bytes
|
|
235
|
+
word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
|
|
236
|
+
sum = word0;
|
|
237
|
+
word0 = word0 ^ (word0 >> 3);
|
|
238
|
+
break;
|
|
239
|
+
case 1: // 5..8 bytes
|
|
240
|
+
word0 = word_ptr32[0];
|
|
241
|
+
sum = word0;
|
|
242
|
+
word0 = word0 ^ (word0 >> 3);
|
|
243
|
+
word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
|
|
244
|
+
sum += word1;
|
|
245
|
+
word1 = word1 ^ (word1 << 4);
|
|
246
|
+
word0 += word1;
|
|
247
|
+
break;
|
|
248
|
+
case 2: // 9..12 bytes
|
|
249
|
+
word0 = word_ptr32[0];
|
|
250
|
+
sum = word0;
|
|
251
|
+
word0 = word0 ^ (word0 >> 3);
|
|
252
|
+
word1 = word_ptr32[1];
|
|
253
|
+
sum += word1;
|
|
254
|
+
word1 = word1 ^ (word1 << 4);
|
|
255
|
+
word0 += word1;
|
|
256
|
+
word1 = word_ptr32[2] & kWordMask0[bytecount & 3];
|
|
257
|
+
sum += word1;
|
|
258
|
+
word1 = word1 ^ (word1 << 2);
|
|
259
|
+
word0 += word1;
|
|
260
|
+
break;
|
|
261
|
+
case 3: // 13..16 bytes
|
|
262
|
+
word0 = word_ptr32[0];
|
|
263
|
+
sum = word0;
|
|
264
|
+
word0 = word0 ^ (word0 >> 3);
|
|
265
|
+
word1 = word_ptr32[1];
|
|
266
|
+
sum += word1;
|
|
267
|
+
word1 = word1 ^ (word1 << 4);
|
|
268
|
+
word0 += word1;
|
|
269
|
+
word1 = word_ptr32[2];
|
|
270
|
+
sum += word1;
|
|
271
|
+
word1 = word1 ^ (word1 << 2);
|
|
272
|
+
word0 += word1;
|
|
273
|
+
word1 = word_ptr32[3] & kWordMask0[bytecount & 3];
|
|
274
|
+
sum += word1;
|
|
275
|
+
word1 = word1 ^ (word1 >> 8);
|
|
276
|
+
word0 += word1;
|
|
277
|
+
break;
|
|
278
|
+
case 4: // 17..20 bytes
|
|
279
|
+
word0 = word_ptr32[0];
|
|
280
|
+
sum = word0;
|
|
281
|
+
word0 = word0 ^ (word0 >> 3);
|
|
282
|
+
word1 = word_ptr32[1];
|
|
283
|
+
sum += word1;
|
|
284
|
+
word1 = word1 ^ (word1 << 4);
|
|
285
|
+
word0 += word1;
|
|
286
|
+
word1 = word_ptr32[2];
|
|
287
|
+
sum += word1;
|
|
288
|
+
word1 = word1 ^ (word1 << 2);
|
|
289
|
+
word0 += word1;
|
|
290
|
+
word1 = word_ptr32[3];
|
|
291
|
+
sum += word1;
|
|
292
|
+
word1 = word1 ^ (word1 >> 8);
|
|
293
|
+
word0 += word1;
|
|
294
|
+
word1 = word_ptr32[4] & kWordMask0[bytecount & 3];
|
|
295
|
+
sum += word1;
|
|
296
|
+
word1 = word1 ^ (word1 >> 4);
|
|
297
|
+
word0 += word1;
|
|
298
|
+
break;
|
|
299
|
+
default: // 21..24 bytes and higher (ignores beyond 24)
|
|
300
|
+
word0 = word_ptr32[0];
|
|
301
|
+
sum = word0;
|
|
302
|
+
word0 = word0 ^ (word0 >> 3);
|
|
303
|
+
word1 = word_ptr32[1];
|
|
304
|
+
sum += word1;
|
|
305
|
+
word1 = word1 ^ (word1 << 4);
|
|
306
|
+
word0 += word1;
|
|
307
|
+
word1 = word_ptr32[2];
|
|
308
|
+
sum += word1;
|
|
309
|
+
word1 = word1 ^ (word1 << 2);
|
|
310
|
+
word0 += word1;
|
|
311
|
+
word1 = word_ptr32[3];
|
|
312
|
+
sum += word1;
|
|
313
|
+
word1 = word1 ^ (word1 >> 8);
|
|
314
|
+
word0 += word1;
|
|
315
|
+
word1 = word_ptr32[4];
|
|
316
|
+
sum += word1;
|
|
317
|
+
word1 = word1 ^ (word1 >> 4);
|
|
318
|
+
word0 += word1;
|
|
319
|
+
word1 = word_ptr32[5] & kWordMask0[bytecount & 3];
|
|
320
|
+
sum += word1;
|
|
321
|
+
word1 = word1 ^ (word1 >> 6);
|
|
322
|
+
word0 += word1;
|
|
323
|
+
break;
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3
|
|
327
|
+
sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3
|
|
328
|
+
sum = (sum & 0xff) << 32;
|
|
329
|
+
return (word0 ^ prepost) + sum;
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
// OCTAGRAM wrapper with surrounding spaces
|
|
333
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
|
334
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
335
|
+
//
|
|
336
|
+
// The low 32 bits follow the pattern from above, tuned to different scripts
|
|
337
|
+
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
|
338
|
+
// For runtime use of tables V3
|
|
339
|
+
uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
|
|
340
|
+
if (bytecount == 0) {
|
|
341
|
+
return 0;
|
|
342
|
+
}
|
|
343
|
+
uint64 prepost = 0;
|
|
344
|
+
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
|
345
|
+
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
|
346
|
+
return OctaHash40Mix(word_ptr, bytecount, prepost);
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
// OCTAGRAM wrapper with surrounding underscores (offline use)
|
|
351
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
|
352
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
353
|
+
//
|
|
354
|
+
// The low 32 bits follow the pattern from above, tuned to different scripts
|
|
355
|
+
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
|
356
|
+
// For offline construction of tables
|
|
357
|
+
uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
|
|
358
|
+
if (bytecount == 0) {
|
|
359
|
+
return 0;
|
|
360
|
+
}
|
|
361
|
+
const char* local_word_ptr = word_ptr;
|
|
362
|
+
int local_bytecount = bytecount;
|
|
363
|
+
uint64 prepost = 0;
|
|
364
|
+
if (local_word_ptr[0] == '_') {
|
|
365
|
+
prepost |= kPreSpaceIndicator;
|
|
366
|
+
++local_word_ptr;
|
|
367
|
+
--local_bytecount;
|
|
368
|
+
}
|
|
369
|
+
if (local_word_ptr[local_bytecount - 1] == '_') {
|
|
370
|
+
prepost |= kPostSpaceIndicator;
|
|
371
|
+
--local_bytecount;
|
|
372
|
+
}
|
|
373
|
+
return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
//------------------------------------------------------------------------------
|
|
380
|
+
// Scoring single groups of letters
|
|
381
|
+
//------------------------------------------------------------------------------
|
|
382
|
+
|
|
383
|
+
// UNIGRAM score one => tote
|
|
384
|
+
// Input: 1-byte entry of subscript into unigram probs, plus
|
|
385
|
+
// an accumulator tote.
|
|
386
|
+
// Output: running sums in tote updated
|
|
387
|
+
void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
|
|
388
|
+
tote->AddGram();
|
|
389
|
+
const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
|
|
390
|
+
if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
|
|
391
|
+
if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
|
|
392
|
+
if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
|
|
393
|
+
if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
|
|
394
|
+
if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
|
|
395
|
+
if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
|
|
399
|
+
// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
|
|
400
|
+
// an accumulator tote. (language 0 means unused entry)
|
|
401
|
+
// Output: running sums in tote updated
|
|
402
|
+
void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
|
|
403
|
+
tote->AddGram();
|
|
404
|
+
uint8 prob123 = (probs >> 0) & 0xff;
|
|
405
|
+
const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
|
|
406
|
+
|
|
407
|
+
uint8 top1 = (probs >> 8) & 0xff;
|
|
408
|
+
if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
|
|
409
|
+
uint8 top2 = (probs >> 16) & 0xff;
|
|
410
|
+
if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
|
|
411
|
+
uint8 top3 = (probs >> 24) & 0xff;
|
|
412
|
+
if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
//------------------------------------------------------------------------------
|
|
417
|
+
// Routines to accumulate probabilities
|
|
418
|
+
//------------------------------------------------------------------------------
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
// UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
|
|
422
|
+
// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
|
|
423
|
+
// Score up to n unigrams, returning number of bytes consumed
|
|
424
|
+
// Updates tote_grams
|
|
425
|
+
int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
|
|
426
|
+
const char* isrc, int srclen, int advance_by,
|
|
427
|
+
int* tote_grams, int gram_limit, Tote* chunk_tote) {
|
|
428
|
+
const char* src = isrc;
|
|
429
|
+
if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
|
|
430
|
+
|
|
431
|
+
// Property-based CJK unigram lookup
|
|
432
|
+
if (src[0] == ' ') {++src; --srclen;}
|
|
433
|
+
|
|
434
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
|
435
|
+
int usrclen = srclen;
|
|
436
|
+
|
|
437
|
+
while (usrclen > 0) {
|
|
438
|
+
int len = kAdvanceOneChar[usrc[0]];
|
|
439
|
+
// Look up property of one UTF-8 character and advance over it
|
|
440
|
+
// Return 0 if input length is zero
|
|
441
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
442
|
+
|
|
443
|
+
int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
|
|
444
|
+
|
|
445
|
+
if (FLAGS_dbglookup) {
|
|
446
|
+
DbgUniTermToStderr(propval, usrc, len);
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
if (propval > 0) {
|
|
450
|
+
ProcessProbV25UniTote(propval, chunk_tote);
|
|
451
|
+
++(*tote_grams);
|
|
452
|
+
if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
// Advance by 1/2/4/8 characters (half of quad advance)
|
|
456
|
+
if (advance_by == 2) {
|
|
457
|
+
// Already advanced by 1
|
|
458
|
+
} else if (advance_by == 4) {
|
|
459
|
+
// Advance by 2 chars total, if not at end
|
|
460
|
+
if (UTFmax <= usrclen) {
|
|
461
|
+
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
462
|
+
}
|
|
463
|
+
} else if (advance_by == 8) {
|
|
464
|
+
// Advance by 4 chars total, if not at end
|
|
465
|
+
if ((UTFmax * 3) <= usrclen) {
|
|
466
|
+
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
467
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
468
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
469
|
+
}
|
|
470
|
+
} else {
|
|
471
|
+
// Advance by 8 chars total, if not at end
|
|
472
|
+
if ((UTFmax * 7) <= usrclen) {
|
|
473
|
+
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
474
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
475
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
476
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
477
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
478
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
479
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
DCHECK(usrclen >= 0);
|
|
483
|
+
|
|
484
|
+
if (*tote_grams >= gram_limit) {
|
|
485
|
+
break;
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
if (FLAGS_dbgscore) {
|
|
489
|
+
// With advance_by>2, we consume more input to get the same number of quads
|
|
490
|
+
int len = src - isrc;
|
|
491
|
+
DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
|
|
492
|
+
DbgScoreFlush();
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
|
|
496
|
+
return consumed2;
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
// BIGRAM, using hash table, always advancing by 1 char
|
|
501
|
+
// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
|
|
502
|
+
// Score all bigrams in isrc, using languages that have bigrams (CJK)
|
|
503
|
+
// Return number of bigrams that hit in the hash table
|
|
504
|
+
int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
|
|
505
|
+
const char* isrc, int srclen, Tote* chunk_tote) {
|
|
506
|
+
int hit_count = 0;
|
|
507
|
+
const char* src = isrc;
|
|
508
|
+
|
|
509
|
+
// Hashtable-based CJK bigram lookup
|
|
510
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
|
511
|
+
const uint8* usrclimit1 = usrc + srclen - UTFmax;
|
|
512
|
+
if (FLAGS_dbgscore) {
|
|
513
|
+
fprintf(stderr, " " );
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
while (usrc < usrclimit1) {
|
|
517
|
+
int len = kAdvanceOneChar[usrc[0]];
|
|
518
|
+
int len2 = kAdvanceOneChar[usrc[len]] + len;
|
|
519
|
+
|
|
520
|
+
if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
|
|
521
|
+
// Lookup and score this bigram
|
|
522
|
+
// Always ignore pre/post spaces
|
|
523
|
+
uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
|
|
524
|
+
uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
|
|
525
|
+
// Now go indirect on the subscript
|
|
526
|
+
probs = bigram_obj->kCLDTableInd[probs &
|
|
527
|
+
~bigram_obj->kCLDTableKeyMask];
|
|
528
|
+
|
|
529
|
+
// Process the bigram
|
|
530
|
+
if (FLAGS_dbglookup) {
|
|
531
|
+
const char* ssrc = reinterpret_cast<const char*>(usrc);
|
|
532
|
+
DbgBiTermToStderr(bihash, probs, ssrc, len2);
|
|
533
|
+
DbgScoreRecord(NULL, probs, len2);
|
|
534
|
+
} else if (FLAGS_dbgscore && (probs != 0)) {
|
|
535
|
+
const char* ssrc = reinterpret_cast<const char*>(usrc);
|
|
536
|
+
DbgScoreRecord(NULL, probs, len2);
|
|
537
|
+
string temp(ssrc, len2);
|
|
538
|
+
fprintf(stderr, "%s ", temp.c_str());
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
if (probs != 0) {
|
|
542
|
+
ProcessProbV25Tote(probs, chunk_tote);
|
|
543
|
+
++hit_count;
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
usrc += len; // Advance by one char
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
if (FLAGS_dbgscore) {
|
|
550
|
+
fprintf(stderr, "[%d bigrams scored]\n", hit_count);
|
|
551
|
+
DbgScoreState();
|
|
552
|
+
}
|
|
553
|
+
return hit_count;
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
// QUADGRAM, using hash table, advancing by 2/4/8/16 chars
|
|
559
|
+
// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
|
|
560
|
+
// Score up to n quadgrams, returning number of bytes consumed
|
|
561
|
+
// Updates tote_grams
|
|
562
|
+
int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
|
|
563
|
+
const char* isrc, int srclen, int advance_by,
|
|
564
|
+
int* tote_grams, int gram_limit, Tote* chunk_tote) {
|
|
565
|
+
const char* src = isrc;
|
|
566
|
+
const char* srclimit = src + srclen;
|
|
567
|
+
// Limit is end, which has extra 20 20 20 00 past len
|
|
568
|
+
const char* srclimit7 = src + srclen - (UTFmax * 7);
|
|
569
|
+
const char* srclimit15 = src + srclen - (UTFmax * 15);
|
|
570
|
+
|
|
571
|
+
if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
|
|
572
|
+
|
|
573
|
+
// Run a little cache of last hits to catch overly-repetitive "text"
|
|
574
|
+
int next_prior = 0;
|
|
575
|
+
uint32 prior_quads[2] = {0, 0};
|
|
576
|
+
|
|
577
|
+
// Visit all quadgrams
|
|
578
|
+
if (src[0] == ' ') {++src;}
|
|
579
|
+
while (src < srclimit) {
|
|
580
|
+
// Find one quadgram
|
|
581
|
+
const char* src_end = src;
|
|
582
|
+
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
|
583
|
+
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
|
584
|
+
const char* src_mid = src_end;
|
|
585
|
+
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
|
586
|
+
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
|
587
|
+
int len = src_end - src;
|
|
588
|
+
|
|
589
|
+
// Lookup and score this quadgram
|
|
590
|
+
uint32 quadhash = QuadHashV25(src, len);
|
|
591
|
+
uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
|
|
592
|
+
// Now go indirect on the subscript
|
|
593
|
+
probs = quadgram_obj->kCLDTableInd[probs &
|
|
594
|
+
~quadgram_obj->kCLDTableKeyMask];
|
|
595
|
+
|
|
596
|
+
// Process the quadgram
|
|
597
|
+
if (FLAGS_dbglookup) {
|
|
598
|
+
DbgQuadTermToStderr(quadhash, probs, src, len);
|
|
599
|
+
}
|
|
600
|
+
if (probs != 0) {
|
|
601
|
+
// Filter out recent repeats. If this works out, use in the other lookups
|
|
602
|
+
if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
|
|
603
|
+
prior_quads[next_prior] = quadhash;
|
|
604
|
+
next_prior = (next_prior + 1) & 1;
|
|
605
|
+
ProcessProbV25Tote(probs, chunk_tote);
|
|
606
|
+
++(*tote_grams);
|
|
607
|
+
if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
|
|
608
|
+
}
|
|
609
|
+
}
|
|
610
|
+
|
|
611
|
+
// Advance all the way past word if at end-of-word
|
|
612
|
+
if (src_end[0] == ' ') {
|
|
613
|
+
src_mid = src_end;
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
// Advance by 2/4/8/16 characters
|
|
617
|
+
if (advance_by == 2) {
|
|
618
|
+
src = src_mid;
|
|
619
|
+
} else if (advance_by == 4) {
|
|
620
|
+
src = src_end;
|
|
621
|
+
} else if (advance_by == 8) {
|
|
622
|
+
// Advance by 8 chars total (4 more), if not at end
|
|
623
|
+
if (src < srclimit7) {
|
|
624
|
+
src_end += kAdvanceOneChar[(uint8)src_end[0]];
|
|
625
|
+
src_end += kAdvanceOneChar[(uint8)src_end[0]];
|
|
626
|
+
src_end += kAdvanceOneChar[(uint8)src_end[0]];
|
|
627
|
+
src_end += kAdvanceOneChar[(uint8)src_end[0]];
|
|
628
|
+
}
|
|
629
|
+
src = src_end;
|
|
630
|
+
} else {
|
|
631
|
+
// Advance by 16 chars total (12 more), if not at end
|
|
632
|
+
if (src < srclimit15) {
|
|
633
|
+
// Advance by ~16 chars by adding 3 * current bytelen
|
|
634
|
+
int fourcharlen = src_end - src;
|
|
635
|
+
src = src_end + (3 * fourcharlen);
|
|
636
|
+
// Advance a bit more if mid-character
|
|
637
|
+
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
|
|
638
|
+
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
|
|
639
|
+
} else {
|
|
640
|
+
src = src_end;
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
DCHECK(src < srclimit);
|
|
644
|
+
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
|
|
645
|
+
|
|
646
|
+
if (*tote_grams >= gram_limit) {
|
|
647
|
+
break;
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
if (FLAGS_dbgscore) {
|
|
652
|
+
// With advance_by>2, we consume more input to get the same number of quads
|
|
653
|
+
int len = src - isrc;
|
|
654
|
+
DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
|
|
655
|
+
DbgScoreFlush();
|
|
656
|
+
}
|
|
657
|
+
|
|
658
|
+
int consumed = src - isrc;
|
|
659
|
+
|
|
660
|
+
// If advancing by more than 2, src may have overshot srclimit
|
|
661
|
+
if (consumed > srclen) {
|
|
662
|
+
consumed = srclen;
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
return consumed;
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
// OCTAGRAM, using hash table, always advancing by 1 word
|
|
670
|
+
// Caller supplies table, such as &kLongWord8Table_obj
|
|
671
|
+
// Score all words in isrc, using languages that have quadgrams
|
|
672
|
+
// We don't normally use this routine except on the first quadgram run,
|
|
673
|
+
// but it can be used to resolve unreliable pages.
|
|
674
|
+
// This routine does not have an optimized advance_by
|
|
675
|
+
// SOON: Uses indirect language/probability longword
|
|
676
|
+
//
|
|
677
|
+
// Return number of words that hit in the hash table
|
|
678
|
+
int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
|
|
679
|
+
const char* isrc, int srclen, Tote* chunk_tote) {
|
|
680
|
+
int hit_count = 0;
|
|
681
|
+
const char* src = isrc;
|
|
682
|
+
const char* srclimit = src + srclen + 1;
|
|
683
|
+
// Limit is end+1, to include extra space char (0x20) off the end
|
|
684
|
+
//
|
|
685
|
+
// Score all words truncated to 8 characters
|
|
686
|
+
int charcount = 0;
|
|
687
|
+
// Skip any initial space
|
|
688
|
+
if (src[0] == ' ') {++src;}
|
|
689
|
+
const char* word_ptr = src;
|
|
690
|
+
const char* word_end = word_ptr;
|
|
691
|
+
if (FLAGS_dbgscore) {
|
|
692
|
+
fprintf(stderr, " " );
|
|
693
|
+
}
|
|
694
|
+
while (src < srclimit) {
|
|
695
|
+
// Terminate previous word or continue current word
|
|
696
|
+
if (src[0] == ' ') {
|
|
697
|
+
int bytecount = word_end - word_ptr;
|
|
698
|
+
if (bytecount == 0)
|
|
699
|
+
break;
|
|
700
|
+
// Lookup and score this word
|
|
701
|
+
uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
|
|
702
|
+
uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
|
|
703
|
+
// Now go indirect on the subscript
|
|
704
|
+
probs = octagram_obj->kCLDTableInd[probs &
|
|
705
|
+
~octagram_obj->kCLDTableKeyMask];
|
|
706
|
+
|
|
707
|
+
// // Lookup and score this word
|
|
708
|
+
// uint32 wordhash = QuadHashV25(word_ptr, bytecount);
|
|
709
|
+
// uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
|
|
710
|
+
// kLongWord8TableSize);
|
|
711
|
+
//
|
|
712
|
+
if (FLAGS_dbglookup) {
|
|
713
|
+
DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
|
|
714
|
+
DbgScoreRecord(NULL, probs, bytecount);
|
|
715
|
+
} else if (FLAGS_dbgscore && (probs != 0)) {
|
|
716
|
+
DbgScoreRecord(NULL, probs, bytecount);
|
|
717
|
+
string temp(word_ptr, bytecount);
|
|
718
|
+
fprintf(stderr, "%s ", temp.c_str());
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
if (probs != 0) {
|
|
722
|
+
ProcessProbV25Tote(probs, chunk_tote);
|
|
723
|
+
++hit_count;
|
|
724
|
+
}
|
|
725
|
+
charcount = 0;
|
|
726
|
+
word_ptr = src + 1; // Over the space
|
|
727
|
+
word_end = word_ptr;
|
|
728
|
+
} else {
|
|
729
|
+
++charcount;
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
// Advance to next char
|
|
733
|
+
src += cld_UniLib::OneCharLen(src);
|
|
734
|
+
if (charcount <= 8) {
|
|
735
|
+
word_end = src;
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
|
|
739
|
+
if (FLAGS_dbgscore) {
|
|
740
|
+
fprintf(stderr, "[%d words scored]\n", hit_count);
|
|
741
|
+
DbgScoreState();
|
|
742
|
+
}
|
|
743
|
+
return hit_count;
|
|
744
|
+
}
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
//------------------------------------------------------------------------------
|
|
749
|
+
// Reliability calculations, for single language and between languages
|
|
750
|
+
//------------------------------------------------------------------------------
|
|
751
|
+
|
|
752
|
+
// Return reliablity of result 0..100 for top two scores
|
|
753
|
+
// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
|
|
754
|
+
// (on a scale where +1 is a factor of 2 ** 1.6 = 3.02)
|
|
755
|
+
// Threshold is uni/quadgram increment count, bounded above and below.
|
|
756
|
+
//
|
|
757
|
+
// Requiring a factor of 3 improvement (e.g. +1 log base 3)
|
|
758
|
+
// for each scored quadgram is too stringent, so I've backed this off to a
|
|
759
|
+
// factor of 2 (e.g. +5/8 log base 3).
|
|
760
|
+
//
|
|
761
|
+
// I also somewhat lowered the Min/MaxGramCount limits above
|
|
762
|
+
//
|
|
763
|
+
// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
|
|
764
|
+
//
|
|
765
|
+
int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
|
|
766
|
+
int max_reliability_percent = 100;
|
|
767
|
+
if (gramcount < 8) {
|
|
768
|
+
max_reliability_percent = 12 * gramcount;
|
|
769
|
+
}
|
|
770
|
+
int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above
|
|
771
|
+
if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16
|
|
772
|
+
fully_reliable_thresh = kMinGramCount;
|
|
773
|
+
} else if (fully_reliable_thresh > kMaxGramCount) {
|
|
774
|
+
fully_reliable_thresh = kMaxGramCount;
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
int delta = value1 - value2;
|
|
778
|
+
if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
|
|
779
|
+
if (delta <= 0) {return 0;}
|
|
780
|
+
return cld::minint(max_reliability_percent,
|
|
781
|
+
(100 * delta) / fully_reliable_thresh);
|
|
782
|
+
}
|
|
783
|
+
|
|
784
|
+
// Return reliability of result, 0..100, for top score vs. mainstream score.
//   topscore    raw top score for the chunk
//   len         number of input bytes the score was accumulated over
//   mean_score  mainstream (expected) score per 1024 bytes of input
//
// The top score is first converted to score per 1024 bytes, then
//   ratio = max(top/mainstream, mainstream/top)
// is mapped linearly onto 100%..0%:
//   top below (or equal to) mainstream: ratio <= 2.0 is 100% reliable,
//                                       ratio >  4.0 is 0% reliable
//   top above mainstream:               ratio <= 3.0 is 100% reliable,
//                                       ratio >  5.0 is 0% reliable
// The wider band above mainstream exists because short-text word scoring
// can give unusually good results: top may exceed mainstream by 4x and
// still be 50% reliable.
//
// Special cases: mean_score == 0 returns 100 (no reliability data yet);
// topscore == 0 or len == 0 returns 0.
int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
  if (mean_score == 0) {return 100;}    // No reliability data available yet
  if (topscore == 0) {return 0;}        // zero score = unreliable
  if (len == 0) {return 0;}             // zero len = unreliable
  int top_kb = (topscore << 10) / len;
  double ratio;
  double ratio_cutoff;
  if (top_kb > mean_score) {
    ratio = (1.0 * top_kb) / mean_score;
    ratio_cutoff = 5.0;         // ramp down from 100% to 0%: 3.0-5.0
  } else {
    // A small topscore over a long chunk truncates to 0 per KB. That is
    // infinitely far below mainstream, so report 0 directly instead of
    // relying on a floating-point divide by zero to produce +inf.
    if (top_kb == 0) {return 0;}
    ratio = (1.0 * mean_score) / top_kb;
    ratio_cutoff = 4.0;         // ramp down from 100% to 0%: 2.0-4.0
  }
  if (ratio <= ratio_cutoff - 2.0) {return 100;}
  if (ratio > ratio_cutoff) {return 0;}

  // Inside the 2.0-wide ramp: interpolate linearly
  int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
  return iratio;
}
|
|
810
|
+
|
|
811
|
+
// Calculate ratio of score per 1KB vs. expected score per 1KB
|
|
812
|
+
double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
|
|
813
|
+
int bytes, int score) {
|
|
814
|
+
// Average training-data score for this language-script combo, per 1KB
|
|
815
|
+
int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
|
|
816
|
+
if (lscript == ULScript_Common) {
|
|
817
|
+
// We don't know the script (only happens with second-chance score)
|
|
818
|
+
// Look for first non-zero mean value
|
|
819
|
+
for (int i = 0; i < 3; ++i) {
|
|
820
|
+
if (kMeanScore[lang * 4 + i] > 0) {
|
|
821
|
+
expected_score = kMeanScore[lang * 4 + i];
|
|
822
|
+
}
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
if (expected_score < 100) {
|
|
826
|
+
expected_score = 1000;
|
|
827
|
+
}
|
|
828
|
+
|
|
829
|
+
// Our score per 1KB
|
|
830
|
+
double our_score = (score << 10) / (bytes ? bytes : 1); // Avoid zdiv
|
|
831
|
+
double ratio = our_score / expected_score;
|
|
832
|
+
|
|
833
|
+
// Just the raw count normalized as though each language has mean=1000;
|
|
834
|
+
ratio = (score * 1000.0) / expected_score;
|
|
835
|
+
return ratio;
|
|
836
|
+
}
|
|
837
|
+
|
|
838
|
+
// Calculate reliablity of len bytes of script lscript with chunk_tote
|
|
839
|
+
int cld::GetReliability(int len, UnicodeLScript lscript,
|
|
840
|
+
const Tote* chunk_tote) {
|
|
841
|
+
Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
|
|
842
|
+
// Average score for this language-script combo
|
|
843
|
+
int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
|
|
844
|
+
if (lscript == ULScript_Common) {
|
|
845
|
+
// We don't know the script (only happens with second-chance score)
|
|
846
|
+
// Look for first non-zero mean value
|
|
847
|
+
for (int i = 0; i < 3; ++i) {
|
|
848
|
+
if (kMeanScore[cur_lang * 4 + i] > 0) {
|
|
849
|
+
mean_score = kMeanScore[cur_lang * 4 + i];
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
|
|
854
|
+
chunk_tote->Value(1),
|
|
855
|
+
chunk_tote->GetGramCount());
|
|
856
|
+
|
|
857
|
+
int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
|
|
858
|
+
len,
|
|
859
|
+
mean_score);
|
|
860
|
+
|
|
861
|
+
int reliability_min = minint(reliability_delta, reliability_main);
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
if (FLAGS_dbgreli) {
|
|
865
|
+
char temp1[4];
|
|
866
|
+
char temp2[4];
|
|
867
|
+
cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
|
|
868
|
+
if (temp1[2] == ' ') {temp1[2] = '\0';}
|
|
869
|
+
cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
|
|
870
|
+
if (temp2[2] == ' ') {temp2[2] = '\0';}
|
|
871
|
+
int srclen = len;
|
|
872
|
+
fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d, %s=%d %s=%d "
|
|
873
|
+
"top/KB=%d mean/KB=%d del=%d%% reli=%d%% "
|
|
874
|
+
"lang/lscript %d %d\n",
|
|
875
|
+
chunk_tote->GetGramCount(),
|
|
876
|
+
chunk_tote->GetIncrCount(),
|
|
877
|
+
srclen,
|
|
878
|
+
temp1, chunk_tote->Value(0),
|
|
879
|
+
temp2, chunk_tote->Value(1),
|
|
880
|
+
(chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
|
|
881
|
+
mean_score,
|
|
882
|
+
reliability_delta,
|
|
883
|
+
reliability_main,
|
|
884
|
+
cur_lang, lscript);
|
|
885
|
+
}
|
|
886
|
+
|
|
887
|
+
return reliability_min;
|
|
888
|
+
}
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
//------------------------------------------------------------------------------
|
|
892
|
+
// Miscellaneous
|
|
893
|
+
//------------------------------------------------------------------------------
|
|
894
|
+
|
|
895
|
+
// Demote all languages except Top40 and plus_one
|
|
896
|
+
// Do this just before sorting chunk_tote results
|
|
897
|
+
void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
|
|
898
|
+
for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
|
|
899
|
+
if (chunk_tote->Key(sub) == 0) continue;
|
|
900
|
+
if (chunk_tote->Key(sub) == packed_plus_one) continue;
|
|
901
|
+
if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
|
|
902
|
+
// Quarter the score of others
|
|
903
|
+
chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
|
|
904
|
+
}
|
|
905
|
+
}
|