language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,1205 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
|
7
|
+
|
8
|
+
#include <string>
|
9
|
+
#include "encodings/compact_lang_det/ext_lang_enc.h"
|
10
|
+
#include "encodings/compact_lang_det/tote.h"
|
11
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
12
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
13
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
14
|
+
|
15
|
+
namespace cld {
|
16
|
+
|
17
|
+
// Hash bucket for four-way associative lookup in tables with < 64K buckets.
|
18
|
+
// 32 bytes per bucket: four 8-byte entries (one key word plus one value word).
|
19
|
+
typedef struct {
|
20
|
+
uint32 key[4]; // Hashed word to look up, one key per entry
|
21
|
+
uint32 value[4]; // Packed three language numbers and probability subscript
|
22
|
+
} SmallWordProbBucket4;
|
23
|
+
|
24
|
+
// Hash bucket for four-way associative lookup in tables with >= 64K buckets.
|
25
|
+
// 24 bytes per bucket: four 6-byte entries (half-size key plus one value word).
|
26
|
+
typedef struct {
|
27
|
+
uint16 key[4]; // Half of the hashed word to look up; the other
|
28
|
+
// half is used to pick the bucket, so it is not stored
|
29
|
+
uint32 value[4]; // Packed three language numbers and probability subscript
|
30
|
+
} LargeQuadProbBucket4;
|
31
|
+
|
32
|
+
// Hash bucket for four-way associative lookup using indirect probabilities.
|
33
|
+
// 16 bytes per bucket: four 4-byte entries, key and subscript sharing one word.
|
34
|
+
typedef struct {
|
35
|
+
uint32 keyvalue[4]; // Upper part of each word is the hash key, lower part is the indirect prob subscript
|
36
|
+
} IndirectProbBucket4;
|
37
|
+
|
38
|
+
|
39
|
+
// This describes a complete CLD table, consisting of
|
40
|
+
// a main lookup table, an indirect language/probability table, and
|
41
|
+
// four constants (bucket count, indirect entry count, key mask, build date).
|
42
|
+
// The main table key is a quadgram, bigram, or longword hash, with
|
43
|
+
// part of the key used to select a bucket modulo kCLDTableSize,
|
44
|
+
// and the rest matched against the key portion of four entries in a bucket,
|
45
|
+
// defined by kCLDTableKeyMask. The remaining bits of an entry, defined
|
46
|
+
// by ~kCLDTableKeyMask, are usually a subscript in the indirect table.
|
47
|
+
//
|
48
|
+
// By using part of the key to select a bucket, those key bits do not need
|
49
|
+
// to be stored in the main table entries, saving space (typically 2 bytes).
|
50
|
+
//
|
51
|
+
// By using an indirect table for lang/prob triples, only the subscript needs
|
52
|
+
// to be stored in the main table entries, saving space (typically 2 bytes).
|
53
|
+
//
|
54
|
+
// Each entry in the indirect table has three languages and three
|
55
|
+
// corresponding probabilities, packed into four bytes.
|
56
|
+
//
|
57
|
+
// The build date constant is included just for version tracking and is not
|
58
|
+
// otherwise used.
|
59
|
+
//
|
60
|
+
// Different-size tables can be linked in for different production
|
61
|
+
// environments. By going indirect through this struct, the runtime code is
|
62
|
+
// insensitive to the actual sizes.
|
63
|
+
//
|
64
|
+
// An empty placeholder table can be described by a table size of 1
|
65
|
+
// bucket, a keymask of 0xffffffff, a degenerate bucket of four no-match
|
66
|
+
// entries, and a degenerate indirect table of one no-languages entry.
|
67
|
+
//
|
68
|
+
//
|
69
|
+
struct CLDTableSummary {
|
70
|
+
const IndirectProbBucket4* kCLDTable;
|
71
|
+
// Main lookup table; each bucket has four entries,
|
72
|
+
// each part key and part indirect subscript
|
73
|
+
const uint32* kCLDTableInd; // Indirect table; each entry is three packed lang/prob
|
74
|
+
const int kCLDTableSize; // Bucket count of kCLDTable
|
75
|
+
const int kCLDTableIndSize; // Entry count of kCLDTableInd
|
76
|
+
const int kCLDTableKeyMask; // Mask selecting the hash-key bits of a kCLDTable entry
|
77
|
+
const int kCLDTableBuildDate; // yyyymmdd; for version tracking only
|
78
|
+
};
|
79
|
+
|
80
|
+
|
81
|
+
// Keeps per-character language probabilities (0-12) for CTJKVZ--, in that order:
|
82
|
+
// Chinese ChineseT Japanese Korean Vietnamese Zhuang
|
83
|
+
// (the last 2 bytes are unused: alignment padding and room for future use)
|
84
|
+
typedef struct {
|
85
|
+
uint8 probs[8]; // probs[0..5] = C, T, J, K, V, Z; probs[6..7] unused
|
86
|
+
} UnigramProbArray;
|
87
|
+
|
88
|
+
// Map 8-bit subscript to CTJKVZ probabilities
|
89
|
+
// Target runtime probabilities for CTJK + VZ
|
90
|
+
// Hand-generated to cover a reasonable range of choices
|
91
|
+
static const int kTargetCTJKVZProbsSize = 242;
|
92
|
+
static const UnigramProbArray kTargetCTJKVZProbs[kTargetCTJKVZProbsSize] = {
|
93
|
+
{{0,0,0,0,0,0,0,0}},
|
94
|
+
{{0,0,0,0,0,12,0,0}},
|
95
|
+
{{0,0,0,0,12,0,0,0}},
|
96
|
+
{{0,0,0,12,0,0,0,0}},
|
97
|
+
{{0,0,12,0,0,0,0,0}},
|
98
|
+
{{0,12,0,0,0,0,0,0}},
|
99
|
+
{{12,0,0,0,0,0,0,0}},
|
100
|
+
|
101
|
+
{{8,0,0,0,4,0,0,0}},
|
102
|
+
{{8,0,0,4,0,0,0,0}},
|
103
|
+
{{8,0,4,0,0,0,0,0}},
|
104
|
+
{{8,4,0,0,0,0,0,0}},
|
105
|
+
{{8,2,0,2,0,0,0,0}},
|
106
|
+
{{0,0,0,0,0,8,0,0}},
|
107
|
+
{{0,4,8,0,0,0,0,0}},
|
108
|
+
{{4,0,0,0,0,8,0,0}},
|
109
|
+
{{0,0,8,0,0,0,0,0}},
|
110
|
+
{{8,2,2,0,0,0,0,0}},
|
111
|
+
{{0,8,4,0,0,0,0,0}},
|
112
|
+
{{8,0,0,0,0,4,0,0}},
|
113
|
+
{{0,8,2,0,0,0,0,0}},
|
114
|
+
{{4,8,0,0,0,0,0,0}},
|
115
|
+
{{2,8,0,2,0,0,0,0}},
|
116
|
+
{{2,2,8,0,0,0,0,0}},
|
117
|
+
{{0,8,0,0,0,0,0,0}},
|
118
|
+
{{0,2,8,0,0,0,0,0}},
|
119
|
+
{{2,8,2,0,0,0,0,0}},
|
120
|
+
{{8,0,0,0,0,0,0,0}},
|
121
|
+
{{2,8,0,0,0,0,0,0}},
|
122
|
+
{{8,2,0,0,0,0,0,0}},
|
123
|
+
|
124
|
+
{{0,6,2,0,2,0,0,0}},
|
125
|
+
{{2,0,0,0,6,0,0,0}},
|
126
|
+
{{4,0,0,0,6,0,0,0}},
|
127
|
+
{{4,6,0,0,4,0,0,0}},
|
128
|
+
{{4,6,2,0,2,0,0,0}},
|
129
|
+
{{4,6,4,0,2,0,0,0}},
|
130
|
+
{{5,4,6,0,0,0,0,0}},
|
131
|
+
{{6,0,0,0,4,0,0,0}},
|
132
|
+
{{6,0,2,0,4,0,0,0}},
|
133
|
+
{{6,0,4,0,4,0,0,0}},
|
134
|
+
{{6,2,0,0,4,0,0,0}},
|
135
|
+
{{6,2,2,0,4,0,0,0}},
|
136
|
+
{{6,2,4,0,2,0,0,0}},
|
137
|
+
{{6,4,0,0,2,0,0,0}},
|
138
|
+
{{6,4,2,0,2,0,0,0}},
|
139
|
+
{{0,0,6,2,0,0,0,0}},
|
140
|
+
{{0,6,2,0,0,2,0,0}},
|
141
|
+
{{2,2,2,0,0,6,0,0}},
|
142
|
+
{{2,2,6,4,0,0,0,0}},
|
143
|
+
{{2,4,0,0,0,6,0,0}},
|
144
|
+
{{2,6,0,4,0,0,0,0}},
|
145
|
+
{{2,6,2,4,0,0,0,0}},
|
146
|
+
{{2,6,4,4,0,0,0,0}},
|
147
|
+
{{4,0,2,0,0,6,0,0}},
|
148
|
+
{{4,2,6,2,0,0,0,0}},
|
149
|
+
{{4,4,2,0,0,6,0,0}},
|
150
|
+
{{4,6,4,0,0,2,0,0}},
|
151
|
+
{{6,0,2,0,0,2,0,0}},
|
152
|
+
{{6,2,0,0,0,2,0,0}},
|
153
|
+
{{6,2,2,0,0,4,0,0}},
|
154
|
+
{{6,2,4,0,0,2,0,0}},
|
155
|
+
{{4,6,2,0,0,4,0,0}},
|
156
|
+
{{6,4,2,0,0,4,0,0}},
|
157
|
+
{{2,0,0,0,0,6,0,0}},
|
158
|
+
{{6,2,0,2,0,0,0,0}},
|
159
|
+
{{2,2,0,0,0,6,0,0}},
|
160
|
+
{{6,2,6,0,0,0,0,0}},
|
161
|
+
{{6,4,2,0,0,2,0,0}},
|
162
|
+
{{6,4,2,2,0,0,0,0}},
|
163
|
+
{{4,6,4,2,0,0,0,0}},
|
164
|
+
{{6,0,2,0,0,4,0,0}},
|
165
|
+
{{6,0,4,0,0,2,0,0}},
|
166
|
+
{{6,0,6,0,0,0,0,0}},
|
167
|
+
{{6,2,2,0,0,0,0,0}},
|
168
|
+
{{6,4,0,0,0,2,0,0}},
|
169
|
+
{{6,4,5,0,0,0,0,0}},
|
170
|
+
{{0,6,0,2,0,0,0,0}},
|
171
|
+
{{0,6,2,2,0,0,0,0}},
|
172
|
+
{{2,6,0,2,0,0,0,0}},
|
173
|
+
{{2,6,2,2,0,0,0,0}},
|
174
|
+
{{4,2,0,0,0,6,0,0}},
|
175
|
+
{{6,4,0,0,0,4,0,0}},
|
176
|
+
{{6,4,0,2,0,0,0,0}},
|
177
|
+
{{6,6,0,2,0,0,0,0}},
|
178
|
+
{{6,0,4,0,0,4,0,0}},
|
179
|
+
{{6,2,0,0,0,4,0,0}},
|
180
|
+
{{6,6,2,2,0,0,0,0}},
|
181
|
+
{{4,6,0,0,0,2,0,0}},
|
182
|
+
{{2,6,6,0,0,0,0,0}},
|
183
|
+
{{4,5,6,0,0,0,0,0}},
|
184
|
+
{{4,6,0,2,0,0,0,0}},
|
185
|
+
{{6,2,0,0,0,6,0,0}},
|
186
|
+
{{0,6,4,2,0,0,0,0}},
|
187
|
+
{{4,0,6,0,0,0,0,0}},
|
188
|
+
{{2,6,4,2,0,0,0,0}},
|
189
|
+
{{4,6,0,0,0,4,0,0}},
|
190
|
+
{{6,2,2,0,0,0,0,0}},
|
191
|
+
{{4,6,2,2,0,0,0,0}},
|
192
|
+
{{4,6,5,0,0,0,0,0}},
|
193
|
+
{{6,0,2,0,0,0,0,0}},
|
194
|
+
{{6,4,4,0,0,0,0,0}},
|
195
|
+
{{4,2,6,0,0,0,0,0}},
|
196
|
+
{{2,0,6,0,0,0,0,0}},
|
197
|
+
{{4,4,0,0,0,6,0,0}},
|
198
|
+
{{4,4,6,0,0,0,0,0}},
|
199
|
+
{{4,6,2,0,0,2,0,0}},
|
200
|
+
{{2,2,6,0,0,0,0,0}},
|
201
|
+
{{2,4,6,0,0,0,0,0}},
|
202
|
+
{{0,6,6,0,0,0,0,0}},
|
203
|
+
{{6,2,4,0,0,0,0,0}},
|
204
|
+
{{0,4,6,0,0,0,0,0}},
|
205
|
+
{{4,0,0,0,0,6,0,0}},
|
206
|
+
{{4,6,4,0,0,0,0,0}},
|
207
|
+
{{6,0,0,0,0,6,0,0}},
|
208
|
+
{{6,0,0,0,0,2,0,0}},
|
209
|
+
{{6,0,4,0,0,0,0,0}},
|
210
|
+
{{6,5,4,0,0,0,0,0}},
|
211
|
+
{{0,2,6,0,0,0,0,0}},
|
212
|
+
{{0,0,6,0,0,0,0,0}},
|
213
|
+
{{6,6,2,0,0,0,0,0}},
|
214
|
+
{{2,6,4,0,0,0,0,0}},
|
215
|
+
{{6,4,2,0,0,0,0,0}},
|
216
|
+
{{2,6,2,0,0,0,0,0}},
|
217
|
+
{{2,6,0,0,0,0,0,0}},
|
218
|
+
{{6,0,0,0,0,4,0,0}},
|
219
|
+
{{6,4,0,0,0,0,0,0}},
|
220
|
+
{{6,6,0,0,0,0,0,0}},
|
221
|
+
{{5,6,4,0,0,0,0,0}},
|
222
|
+
{{0,6,0,0,0,0,0,0}},
|
223
|
+
{{6,2,0,0,0,0,0,0}},
|
224
|
+
{{0,6,2,0,0,0,0,0}},
|
225
|
+
{{4,6,2,0,0,0,0,0}},
|
226
|
+
{{0,6,4,0,0,0,0,0}},
|
227
|
+
{{4,6,0,0,0,0,0,0}},
|
228
|
+
{{6,0,0,0,0,0,0,0}},
|
229
|
+
{{6,6,5,0,0,0,0,0}},
|
230
|
+
{{6,5,6,0,0,0,0,0}},
|
231
|
+
{{5,6,6,0,0,0,0,0}},
|
232
|
+
{{5,5,6,0,0,0,0,0}},
|
233
|
+
{{5,6,5,0,0,0,0,0}},
|
234
|
+
{{6,5,5,0,0,0,0,0}},
|
235
|
+
{{6,6,6,0,0,0,0,0}},
|
236
|
+
{{6,5,0,0,0,0,0,0}},
|
237
|
+
{{6,0,5,0,0,0,0,0}},
|
238
|
+
{{0,6,5,0,0,0,0,0}},
|
239
|
+
{{5,6,0,0,0,0,0,0}},
|
240
|
+
{{5,0,6,0,0,0,0,0}},
|
241
|
+
{{0,5,6,0,0,0,0,0}},
|
242
|
+
|
243
|
+
{{0,0,0,0,4,0,0,0}},
|
244
|
+
{{0,0,0,4,0,0,0,0}},
|
245
|
+
{{2,2,0,0,4,0,0,0}},
|
246
|
+
{{2,2,2,0,4,0,0,0}},
|
247
|
+
{{2,4,0,0,2,0,0,0}},
|
248
|
+
{{2,4,2,0,2,0,0,0}},
|
249
|
+
{{2,4,4,0,2,0,0,0}},
|
250
|
+
{{4,0,2,0,4,0,0,0}},
|
251
|
+
{{4,0,4,0,2,0,0,0}},
|
252
|
+
{{4,2,0,0,2,0,0,0}},
|
253
|
+
{{4,2,2,0,2,0,0,0}},
|
254
|
+
{{4,4,0,0,2,0,0,0}},
|
255
|
+
{{4,4,2,0,2,0,0,0}},
|
256
|
+
{{4,4,4,0,2,0,0,0}},
|
257
|
+
{{0,2,2,4,0,0,0,0}},
|
258
|
+
{{2,2,4,2,0,0,0,0}},
|
259
|
+
{{2,4,4,0,0,2,0,0}},
|
260
|
+
{{2,4,4,2,0,0,0,0}},
|
261
|
+
{{4,0,4,0,0,2,0,0}},
|
262
|
+
{{4,0,4,0,0,4,0,0}},
|
263
|
+
{{4,2,2,4,0,0,0,0}},
|
264
|
+
{{4,4,0,2,0,0,0,0}},
|
265
|
+
{{2,2,0,4,0,0,0,0}},
|
266
|
+
{{2,4,2,2,0,0,0,0}},
|
267
|
+
{{4,4,2,2,0,0,0,0}},
|
268
|
+
{{4,0,4,0,0,0,0,0}},
|
269
|
+
{{4,4,4,0,0,4,0,0}},
|
270
|
+
{{0,4,0,2,0,0,0,0}},
|
271
|
+
{{0,4,2,2,0,0,0,0}},
|
272
|
+
{{4,0,2,0,0,2,0,0}},
|
273
|
+
{{4,2,0,0,0,4,0,0}},
|
274
|
+
{{2,2,2,0,0,4,0,0}},
|
275
|
+
{{4,0,0,2,0,0,0,0}},
|
276
|
+
{{4,4,4,0,0,2,0,0}},
|
277
|
+
{{4,0,0,0,0,4,0,0}},
|
278
|
+
{{4,0,2,0,0,4,0,0}},
|
279
|
+
{{4,2,0,0,0,2,0,0}},
|
280
|
+
{{4,2,2,0,0,2,0,0}},
|
281
|
+
{{2,4,0,2,0,0,0,0}},
|
282
|
+
{{2,2,0,0,0,4,0,0}},
|
283
|
+
{{2,4,0,0,0,4,0,0}},
|
284
|
+
{{2,4,2,0,0,4,0,0}},
|
285
|
+
{{4,2,4,0,0,0,0,0}},
|
286
|
+
{{2,0,4,0,0,0,0,0}},
|
287
|
+
{{4,0,2,0,0,0,0,0}},
|
288
|
+
{{4,4,0,0,0,4,0,0}},
|
289
|
+
{{4,4,2,0,0,4,0,0}},
|
290
|
+
{{0,4,4,0,0,0,0,0}},
|
291
|
+
{{4,4,0,0,0,2,0,0}},
|
292
|
+
{{2,4,0,0,0,2,0,0}},
|
293
|
+
{{2,2,4,0,0,0,0,0}},
|
294
|
+
{{0,2,4,0,0,0,0,0}},
|
295
|
+
{{4,2,2,0,0,0,0,0}},
|
296
|
+
{{2,4,2,0,0,2,0,0}},
|
297
|
+
{{4,4,4,0,0,0,0,0}},
|
298
|
+
{{2,4,4,0,0,0,0,0}},
|
299
|
+
{{0,0,4,0,0,0,0,0}},
|
300
|
+
{{0,4,2,0,0,0,0,0}},
|
301
|
+
{{4,4,2,0,0,2,0,0}},
|
302
|
+
{{2,4,2,0,0,0,0,0}},
|
303
|
+
{{4,2,0,0,0,0,0,0}},
|
304
|
+
{{4,4,0,0,0,0,0,0}},
|
305
|
+
{{4,4,2,0,0,0,0,0}},
|
306
|
+
{{2,4,0,0,0,0,0,0}},
|
307
|
+
{{0,4,0,0,0,0,0,0}},
|
308
|
+
{{4,0,0,0,0,0,0,0}},
|
309
|
+
{{0,0,0,4,4,0,0,0}},
|
310
|
+
{{0,0,4,0,4,0,0,0}},
|
311
|
+
{{0,0,4,4,0,0,0,0}},
|
312
|
+
{{0,4,0,0,4,0,0,0}},
|
313
|
+
{{0,4,0,4,0,0,0,0}},
|
314
|
+
{{4,0,0,0,4,0,0,0}},
|
315
|
+
{{4,0,0,4,0,0,0,0}},
|
316
|
+
|
317
|
+
{{2,0,0,0,0,0,0,0}},
|
318
|
+
{{0,2,0,0,0,0,0,0}},
|
319
|
+
{{0,2,0,2,2,0,0,0}},
|
320
|
+
{{0,2,2,0,2,0,0,0}},
|
321
|
+
{{2,0,0,2,2,0,0,0}},
|
322
|
+
{{2,0,2,0,2,0,0,0}},
|
323
|
+
{{2,0,2,2,0,0,0,0}},
|
324
|
+
{{2,2,0,0,2,0,0,0}},
|
325
|
+
{{2,2,2,2,0,0,0,0}},
|
326
|
+
{{2,2,0,2,0,0,0,0}},
|
327
|
+
{{2,2,0,0,0,0,0,0}},
|
328
|
+
{{0,0,2,0,0,0,0,0}},
|
329
|
+
{{0,2,2,0,0,0,0,0}},
|
330
|
+
{{2,2,2,0,0,0,0,0}},
|
331
|
+
{{0,0,0,2,0,0,0,0}},
|
332
|
+
{{2,0,2,0,0,0,0,0}},
|
333
|
+
{{0,2,0,2,0,0,0,0}},
|
334
|
+
{{0,0,2,2,0,0,0,0}},
|
335
|
+
{{0,2,2,2,0,0,0,0}},
|
336
|
+
};
|
337
|
+
|
338
|
+
|
339
|
+
|
340
|
+
|
341
|
+
// 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
|
342
|
+
static const uint8 kSkipSpaceVowelContinue[256] = {
|
343
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
344
|
+
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
345
|
+
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
|
346
|
+
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
|
347
|
+
|
348
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
349
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
350
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
351
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
352
|
+
};
|
353
|
+
|
354
|
+
// 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
|
355
|
+
static const uint8 kSkipSpaceContinue[256] = {
|
356
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
357
|
+
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
358
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
359
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
360
|
+
|
361
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
362
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
363
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
364
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
365
|
+
};
|
366
|
+
|
367
|
+
|
368
|
+
// If != UNKNOWN_LANGUAGE, use nilgrams to determine the language of this script
|
369
|
+
static const Language kOnlyLanguagePerLScript[] = {
|
370
|
+
ENGLISH, // ULScript_Common, [no words should be in this script]
|
371
|
+
UNKNOWN_LANGUAGE, // ULScript_Latin,
|
372
|
+
//UNKNOWN_LANGUAGE, // ULScript_Greek, Jan 2009: change so we can score quads
|
373
|
+
GREEK, // ULScript_Greek, Mar 2009: change back; do gibberish separately
|
374
|
+
UNKNOWN_LANGUAGE, // ULScript_Cyrillic,
|
375
|
+
ARMENIAN, // ULScript_Armenian,
|
376
|
+
UNKNOWN_LANGUAGE, // ULScript_Hebrew,
|
377
|
+
UNKNOWN_LANGUAGE, // ULScript_Arabic,
|
378
|
+
SYRIAC, // ULScript_Syriac,
|
379
|
+
DHIVEHI, // ULScript_Thaana,
|
380
|
+
UNKNOWN_LANGUAGE, // ULScript_Devanagari,
|
381
|
+
UNKNOWN_LANGUAGE, // ULScript_Bengali,
|
382
|
+
PUNJABI, // ULScript_Gurmukhi,
|
383
|
+
GUJARATI, // ULScript_Gujarati,
|
384
|
+
ORIYA, // ULScript_Oriya,
|
385
|
+
TAMIL, // ULScript_Tamil,
|
386
|
+
TELUGU, // ULScript_Telugu,
|
387
|
+
KANNADA, // ULScript_Kannada,
|
388
|
+
MALAYALAM, // ULScript_Malayalam,
|
389
|
+
SINHALESE, // ULScript_Sinhala,
|
390
|
+
THAI, // ULScript_Thai,
|
391
|
+
LAOTHIAN, // ULScript_Lao,
|
392
|
+
UNKNOWN_LANGUAGE, // ULScript_Tibetan,
|
393
|
+
BURMESE, // ULScript_Myanmar,
|
394
|
+
GEORGIAN, // ULScript_Georgian,
|
395
|
+
UNKNOWN_LANGUAGE, // ULScript_HanCJK,
|
396
|
+
UNKNOWN_LANGUAGE, // ULScript_Ethiopic,
|
397
|
+
CHEROKEE, // ULScript_Cherokee,
|
398
|
+
INUKTITUT, // ULScript_Canadian_Aboriginal,
|
399
|
+
X_OGHAM, // ULScript_Ogham,
|
400
|
+
X_RUNIC, // ULScript_Runic,
|
401
|
+
KHMER, // ULScript_Khmer,
|
402
|
+
MONGOLIAN, // ULScript_Mongolian,
|
403
|
+
X_YI, // ULScript_Yi,
|
404
|
+
X_OLD_ITALIC, // ULScript_Old_Italic,
|
405
|
+
X_GOTHIC, // ULScript_Gothic,
|
406
|
+
X_DESERET, // ULScript_Deseret,
|
407
|
+
ENGLISH, // ULScript_Inherited, [no words should be in this script]
|
408
|
+
TAGALOG, // ULScript_Tagalog,
|
409
|
+
X_HANUNOO, // ULScript_Hanunoo,
|
410
|
+
X_BUHID, // ULScript_Buhid,
|
411
|
+
X_TAGBANWA, // ULScript_Tagbanwa,
|
412
|
+
LIMBU, // ULScript_Limbu,
|
413
|
+
X_TAI_LE, // ULScript_Tai_Le,
|
414
|
+
X_LINEAR_B, // ULScript_Linear_B,
|
415
|
+
X_UGARITIC, // ULScript_Ugaritic,
|
416
|
+
X_SHAVIAN, // ULScript_Shavian,
|
417
|
+
X_OSMANYA, // ULScript_Osmanya,
|
418
|
+
X_CYPRIOT, // ULScript_Cypriot,
|
419
|
+
X_BUGINESE, // ULScript_Buginese,
|
420
|
+
X_COPTIC, // ULScript_Coptic,
|
421
|
+
X_NEW_TAI_LUE, // ULScript_New_Tai_Lue,
|
422
|
+
X_GLAGOLITIC, // ULScript_Glagolitic,
|
423
|
+
X_TIFINAGH, // ULScript_Tifinagh,
|
424
|
+
X_SYLOTI_NAGRI, // ULScript_Syloti_Nagri,
|
425
|
+
X_OLD_PERSIAN, // ULScript_Old_Persian,
|
426
|
+
X_KHAROSHTHI, // ULScript_Kharoshthi,
|
427
|
+
X_BALINESE, // ULScript_Balinese,
|
428
|
+
X_CUNEIFORM, // ULScript_Cuneiform,
|
429
|
+
X_PHOENICIAN, // ULScript_Phoenician,
|
430
|
+
X_PHAGS_PA, // ULScript_Phags_Pa,
|
431
|
+
X_NKO, // ULScript_Nko,
|
432
|
+
|
433
|
+
// Unicode 5.1
|
434
|
+
X_SUDANESE, // ULScript_Sundanese,
|
435
|
+
X_LEPCHA, // ULScript_Lepcha,
|
436
|
+
X_OL_CHIKI, // ULScript_Ol_Chiki,
|
437
|
+
X_VAI, // ULScript_Vai,
|
438
|
+
X_SAURASHTRA, // ULScript_Saurashtra,
|
439
|
+
X_KAYAH_LI, // ULScript_Kayah_Li,
|
440
|
+
X_REJANG, // ULScript_Rejang,
|
441
|
+
X_LYCIAN, // ULScript_Lycian,
|
442
|
+
X_CARIAN, // ULScript_Carian,
|
443
|
+
X_LYDIAN, // ULScript_Lydian,
|
444
|
+
X_CHAM, // ULScript_Cham,
|
445
|
+
};
|
446
|
+
|
447
|
+
COMPILE_ASSERT(arraysize(kOnlyLanguagePerLScript) == ULScript_NUM_SCRIPTS,
|
448
|
+
kOnlyLanguagePerLScript_has_incorrect_length);
|
449
|
+
|
450
|
+
|
451
|
+
// This is, in a sense, the complement of the table above
|
452
|
+
// If != UNKNOWN_LANGUAGE, gives the default language for this script
|
453
|
+
static const Language kDefaultLanguagePerLScript[] = {
|
454
|
+
UNKNOWN_LANGUAGE, // ULScript_Common, [no words should be in this script]
|
455
|
+
ENGLISH, // ULScript_Latin,
|
456
|
+
UNKNOWN_LANGUAGE, // ULScript_Greek,
|
457
|
+
RUSSIAN, // ULScript_Cyrillic,
|
458
|
+
UNKNOWN_LANGUAGE, // ULScript_Armenian,
|
459
|
+
HEBREW, // ULScript_Hebrew,
|
460
|
+
ARABIC, // ULScript_Arabic,
|
461
|
+
UNKNOWN_LANGUAGE, // ULScript_Syriac,
|
462
|
+
UNKNOWN_LANGUAGE, // ULScript_Thaana,
|
463
|
+
HINDI, // ULScript_Devanagari,
|
464
|
+
BENGALI, // ULScript_Bengali,
|
465
|
+
UNKNOWN_LANGUAGE, // ULScript_Gurmukhi,
|
466
|
+
UNKNOWN_LANGUAGE, // ULScript_Gujarati,
|
467
|
+
UNKNOWN_LANGUAGE, // ULScript_Oriya,
|
468
|
+
UNKNOWN_LANGUAGE, // ULScript_Tamil,
|
469
|
+
UNKNOWN_LANGUAGE, // ULScript_Telugu,
|
470
|
+
UNKNOWN_LANGUAGE, // ULScript_Kannada,
|
471
|
+
UNKNOWN_LANGUAGE, // ULScript_Malayalam,
|
472
|
+
UNKNOWN_LANGUAGE, // ULScript_Sinhala,
|
473
|
+
UNKNOWN_LANGUAGE, // ULScript_Thai,
|
474
|
+
UNKNOWN_LANGUAGE, // ULScript_Lao,
|
475
|
+
TIBETAN, // ULScript_Tibetan,
|
476
|
+
UNKNOWN_LANGUAGE, // ULScript_Myanmar,
|
477
|
+
UNKNOWN_LANGUAGE, // ULScript_Georgian,
|
478
|
+
CHINESE, // ULScript_HanCJK,
|
479
|
+
AMHARIC, // ULScript_Ethiopic,
|
480
|
+
UNKNOWN_LANGUAGE, // ULScript_Cherokee,
|
481
|
+
UNKNOWN_LANGUAGE, // ULScript_Canadian_Aboriginal,
|
482
|
+
UNKNOWN_LANGUAGE, // ULScript_Ogham,
|
483
|
+
UNKNOWN_LANGUAGE, // ULScript_Runic,
|
484
|
+
UNKNOWN_LANGUAGE, // ULScript_Khmer,
|
485
|
+
UNKNOWN_LANGUAGE, // ULScript_Mongolian,
|
486
|
+
UNKNOWN_LANGUAGE, // ULScript_Yi,
|
487
|
+
UNKNOWN_LANGUAGE, // ULScript_Old_Italic,
|
488
|
+
UNKNOWN_LANGUAGE, // ULScript_Gothic,
|
489
|
+
UNKNOWN_LANGUAGE, // ULScript_Deseret,
|
490
|
+
UNKNOWN_LANGUAGE, // ULScript_Inherited, [no words should be in this script]
|
491
|
+
UNKNOWN_LANGUAGE, // ULScript_Tagalog,
|
492
|
+
UNKNOWN_LANGUAGE, // ULScript_Hanunoo,
|
493
|
+
UNKNOWN_LANGUAGE, // ULScript_Buhid,
|
494
|
+
UNKNOWN_LANGUAGE, // ULScript_Tagbanwa,
|
495
|
+
UNKNOWN_LANGUAGE, // ULScript_Limbu,
|
496
|
+
UNKNOWN_LANGUAGE, // ULScript_Tai_Le,
|
497
|
+
UNKNOWN_LANGUAGE, // ULScript_Linear_B,
|
498
|
+
UNKNOWN_LANGUAGE, // ULScript_Ugaritic,
|
499
|
+
UNKNOWN_LANGUAGE, // ULScript_Shavian,
|
500
|
+
UNKNOWN_LANGUAGE, // ULScript_Osmanya,
|
501
|
+
UNKNOWN_LANGUAGE, // ULScript_Cypriot,
|
502
|
+
UNKNOWN_LANGUAGE, // ULScript_Buginese,
|
503
|
+
UNKNOWN_LANGUAGE, // ULScript_Coptic,
|
504
|
+
UNKNOWN_LANGUAGE, // ULScript_New_Tai_Lue,
|
505
|
+
UNKNOWN_LANGUAGE, // ULScript_Glagolitic,
|
506
|
+
UNKNOWN_LANGUAGE, // ULScript_Tifinagh,
|
507
|
+
UNKNOWN_LANGUAGE, // ULScript_Syloti_Nagri,
|
508
|
+
UNKNOWN_LANGUAGE, // ULScript_Old_Persian,
|
509
|
+
UNKNOWN_LANGUAGE, // ULScript_Kharoshthi,
|
510
|
+
UNKNOWN_LANGUAGE, // ULScript_Balinese,
|
511
|
+
UNKNOWN_LANGUAGE, // ULScript_Cuneiform,
|
512
|
+
UNKNOWN_LANGUAGE, // ULScript_Phoenician,
|
513
|
+
UNKNOWN_LANGUAGE, // ULScript_Phags_Pa,
|
514
|
+
UNKNOWN_LANGUAGE, // ULScript_Nko,
|
515
|
+
|
516
|
+
// Unicode 5.1
|
517
|
+
UNKNOWN_LANGUAGE, // ULScript_Sundanese,
|
518
|
+
UNKNOWN_LANGUAGE, // ULScript_Lepcha,
|
519
|
+
UNKNOWN_LANGUAGE, // ULScript_Ol_Chiki,
|
520
|
+
UNKNOWN_LANGUAGE, // ULScript_Vai,
|
521
|
+
UNKNOWN_LANGUAGE, // ULScript_Saurashtra,
|
522
|
+
UNKNOWN_LANGUAGE, // ULScript_Kayah_Li,
|
523
|
+
UNKNOWN_LANGUAGE, // ULScript_Rejang,
|
524
|
+
UNKNOWN_LANGUAGE, // ULScript_Lycian,
|
525
|
+
UNKNOWN_LANGUAGE, // ULScript_Carian,
|
526
|
+
UNKNOWN_LANGUAGE, // ULScript_Lydian,
|
527
|
+
UNKNOWN_LANGUAGE, // ULScript_Cham,
|
528
|
+
};
|
529
|
+
|
530
|
+
COMPILE_ASSERT(arraysize(kDefaultLanguagePerLScript) == ULScript_NUM_SCRIPTS,
|
531
|
+
kDefaultLanguagePerLScript_has_incorrect_length);
|
532
|
+
|
533
|
+
|
534
|
+
// True for standalone languages (only lang in a script)
|
535
|
+
// Subscripted by packed language number
|
536
|
+
// If 1, we will use nilgrams to determine language
|
537
|
+
static const uint8 kIsStandaloneLang[EXT_NUM_LANGUAGES + 1] = {
|
538
|
+
0,
|
539
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,0, // GREEK
|
540
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
541
|
+
0,1,0,0,1, 0,1,0,0,0, 0,0,1,1,0, 0,0,0,0,1, // MALAYALAM..KANNADA
|
542
|
+
1,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,1, // PUNJABI..SINHALESE
|
543
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,1,1,0, // ARMENIAN..LAOTHIAN
|
544
|
+
|
545
|
+
0,0,0,0,1, 0,1,1,1,0, 1,0,0,0,0, 0,0,0,0,0, // KHMER..ORIYA
|
546
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
547
|
+
0,1,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, // INUKTITUT
|
548
|
+
|
549
|
+
0,0,0,0,0, // [160..164]
|
550
|
+
// Add new language standalone bit just before here
|
551
|
+
0,0,0,0,0, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
|
552
|
+
1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
|
553
|
+
|
554
|
+
1,1,1,1,
|
555
|
+
};
|
556
|
+
|
557
|
+
// True for ULScript_HanCJK
|
558
|
+
// (Vietnamese and Zhuang also have Latin script quadgrams)
|
559
|
+
// Subscripted by packed language number
|
560
|
+
static const uint8 kIsUnigramLang[EXT_NUM_LANGUAGES + 1] = {
|
561
|
+
0,
|
562
|
+
0,0,0,0,0, 0,0,0,1,1, 0,0,0,0,0, 0,1,0,0,0, // JAPANESE KOREAN CHINESE
|
563
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
564
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
565
|
+
0,0,0,0,0, 0,1,0,0,1, 0,0,0,0,0, 0,0,0,0,0, // VIETNAMESE CHINESE_T
|
566
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
567
|
+
|
568
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
569
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
570
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,0, // ZHUANG
|
571
|
+
|
572
|
+
0,0,0,0,0, // [160..164]
|
573
|
+
// Add new language unigram bit just before here
|
574
|
+
|
575
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
576
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
577
|
+
|
578
|
+
0,0,0,0,
|
579
|
+
};
|
580
|
+
|
581
|
+
|
582
|
+
// True for ULScript_HanCJK
|
583
|
+
// Subscripted by lscript number
|
584
|
+
static const uint8 kScoreUniPerLScript[] = {
|
585
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,0,0,0,0,0,0,0,
|
586
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
587
|
+
0,0,0,0,0,0,0,0,
|
588
|
+
};
|
589
|
+
|
590
|
+
COMPILE_ASSERT(arraysize(kScoreUniPerLScript) == ULScript_NUM_SCRIPTS,
|
591
|
+
kScoreUniPerLScript_has_incorrect_length);
|
592
|
+
|
593
|
+
|
594
|
+
// Defines Top40 packed languages
|
595
|
+
|
596
|
+
// Tier 0/1 Language enum list (16)
|
597
|
+
// ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS
|
598
|
+
// DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
|
599
|
+
// PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
|
600
|
+
// ARABIC,
|
601
|
+
//
|
602
|
+
// Tier 2 Language enum list (22)
|
603
|
+
// SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
|
604
|
+
// HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
|
605
|
+
// VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
|
606
|
+
// TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
|
607
|
+
// UKRAINIAN, HINDI,
|
608
|
+
//
|
609
|
+
// use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
|
610
|
+
//
|
611
|
+
// Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
|
612
|
+
|
613
|
+
// NOTE: packed, i.e. Language enum + 1
|
614
|
+
static const uint8 kIsPackedTop40[EXT_NUM_LANGUAGES + 1] = {
|
615
|
+
0,
|
616
|
+
1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,0,
|
617
|
+
1,1,1,1,0, 1,0,1,0,0, 0,0,1,1,1, 1,0,0,1,0,
|
618
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,1, 1,0,0,0,0,
|
619
|
+
0,0,0,1,0, 0,1,0,1,1, 0,0,0,0,0, 0,0,0,0,0,
|
620
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,1,0,0, 0,0,0,0,0,
|
621
|
+
|
622
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
623
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
624
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
625
|
+
|
626
|
+
0,0,0,0,0, // [160..164]
|
627
|
+
// Add new language top40 bit just before here
|
628
|
+
|
629
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
630
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
631
|
+
|
632
|
+
0,0,0,0,
|
633
|
+
};
|
634
|
+
|
635
|
+
|
636
|
+
|
637
|
+
// Table has 234 eight-byte entries. Each entry has a five-byte array and
|
638
|
+
// a three-byte array of log base 2 probabilities in the range 0..11.
|
639
|
+
// The intended use is to express five or three probabilities in a single-byte
|
640
|
+
// subscript, then decode via this table. These probabilities are
|
641
|
+
// intended to go with an array of five or three language numbers.
|
642
|
+
//
|
643
|
+
// The corresponding language numbers will have to be sorted by descending
|
644
|
+
// probability, then the actual probability subscript chosen to match the
|
645
|
+
// closest available entry in this table.
|
646
|
+
//
|
647
|
+
// Pattern of probability values:
|
648
|
+
// hi 3/4 1/2 1/4 lo hi mid lo
|
649
|
+
// where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 and
|
650
|
+
// mid is one of 3/4 1/2 or 1/4.
|
651
|
+
// There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and
|
652
|
+
// lo running 1..hi. Only the first group is used for five-entry lookups.
|
653
|
+
// The mid value in the first group is 1/2, the second group 3/4, and the
|
654
|
+
// third group 1/4. For three-entry lookups, this allows the mid entry to be
|
655
|
+
// somewhat higher or lower than the midpoint, to allow a better match to the
|
656
|
+
// original probabilities.
|
657
|
+
static const int kLgProbV2TblSize = 234;
|
658
|
+
static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
|
659
|
+
1,1,1,1,1, 1,1,1, // [0]
|
660
|
+
2,2,2,1,1, 2,2,1, // [1]
|
661
|
+
2,2,2,2,2, 2,2,2,
|
662
|
+
3,3,2,2,1, 3,2,1, // [3]
|
663
|
+
3,3,3,2,2, 3,3,2,
|
664
|
+
3,3,3,3,3, 3,3,3,
|
665
|
+
4,3,3,2,1, 4,3,1, // [6]
|
666
|
+
4,4,3,3,2, 4,3,2,
|
667
|
+
4,4,4,3,3, 4,4,3,
|
668
|
+
4,4,4,4,4, 4,4,4,
|
669
|
+
5,4,3,2,1, 5,3,1, // [10]
|
670
|
+
5,4,4,3,2, 5,4,2,
|
671
|
+
5,5,4,4,3, 5,4,3,
|
672
|
+
5,5,5,4,4, 5,5,4,
|
673
|
+
5,5,5,5,5, 5,5,5,
|
674
|
+
6,5,4,2,1, 6,4,1, // [15]
|
675
|
+
6,5,4,3,2, 6,4,2,
|
676
|
+
6,5,5,4,3, 6,5,3,
|
677
|
+
6,6,5,5,4, 6,5,4,
|
678
|
+
6,6,6,5,5, 6,6,5,
|
679
|
+
6,6,6,6,6, 6,6,6,
|
680
|
+
7,6,4,3,1, 7,4,1, // [21]
|
681
|
+
7,6,5,3,2, 7,5,2,
|
682
|
+
7,6,5,4,3, 7,5,3,
|
683
|
+
7,6,6,5,4, 7,6,4,
|
684
|
+
7,7,6,6,5, 7,6,5,
|
685
|
+
7,7,7,6,6, 7,7,6,
|
686
|
+
7,7,7,7,7, 7,7,7,
|
687
|
+
8,6,5,3,1, 8,5,1, // [28]
|
688
|
+
8,7,5,4,2, 8,5,2,
|
689
|
+
8,7,6,4,3, 8,6,3,
|
690
|
+
8,7,6,5,4, 8,6,4,
|
691
|
+
8,7,7,6,5, 8,7,5,
|
692
|
+
8,8,7,7,6, 8,7,6,
|
693
|
+
8,8,8,7,7, 8,8,7,
|
694
|
+
8,8,8,8,8, 8,8,8,
|
695
|
+
9,7,5,3,1, 9,5,1, // [36]
|
696
|
+
9,7,6,4,2, 9,6,2,
|
697
|
+
9,8,6,5,3, 9,6,3,
|
698
|
+
9,8,7,5,4, 9,7,4,
|
699
|
+
9,8,7,6,5, 9,7,5,
|
700
|
+
9,8,8,7,6, 9,8,6,
|
701
|
+
9,9,8,8,7, 9,8,7,
|
702
|
+
9,9,9,8,8, 9,9,8,
|
703
|
+
9,9,9,9,9, 9,9,9,
|
704
|
+
10,8,6,3,1, 10,6,1, // [45]
|
705
|
+
10,8,6,4,2, 10,6,2,
|
706
|
+
10,8,7,5,3, 10,7,3,
|
707
|
+
10,9,7,6,4, 10,7,4,
|
708
|
+
10,9,8,6,5, 10,8,5,
|
709
|
+
10,9,8,7,6, 10,8,6,
|
710
|
+
10,9,9,8,7, 10,9,7,
|
711
|
+
10,10,9,9,8, 10,9,8,
|
712
|
+
10,10,10,9,9, 10,10,9,
|
713
|
+
10,10,10,10,10, 10,10,10,
|
714
|
+
11,9,6,4,1, 11,6,1, // [55]
|
715
|
+
11,9,7,4,2, 11,7,2,
|
716
|
+
11,9,7,5,3, 11,7,3,
|
717
|
+
11,9,8,6,4, 11,8,4,
|
718
|
+
11,10,8,7,5, 11,8,5,
|
719
|
+
11,10,9,7,6, 11,9,6,
|
720
|
+
11,10,9,8,7, 11,9,7,
|
721
|
+
11,10,10,9,8, 11,10,8,
|
722
|
+
11,11,10,10,9, 11,10,9,
|
723
|
+
11,11,11,10,10, 11,11,10,
|
724
|
+
11,11,11,11,11, 11,11,11,
|
725
|
+
12,9,7,4,1, 12,7,1, // [66]
|
726
|
+
12,10,7,5,2, 12,7,2,
|
727
|
+
12,10,8,5,3, 12,8,3,
|
728
|
+
12,10,8,6,4, 12,8,4,
|
729
|
+
12,10,9,7,5, 12,9,5,
|
730
|
+
12,11,9,8,6, 12,9,6,
|
731
|
+
12,11,10,8,7, 12,10,7,
|
732
|
+
12,11,10,9,8, 12,10,8,
|
733
|
+
12,11,11,10,9, 12,11,9,
|
734
|
+
12,12,11,11,10, 12,11,10,
|
735
|
+
12,12,12,11,11, 12,12,11,
|
736
|
+
12,12,12,12,12, 12,12,12,
|
737
|
+
|
738
|
+
1,1,1,1,1, 1,1,1,
|
739
|
+
2,2,2,1,1, 2,2,1,
|
740
|
+
2,2,2,2,2, 2,2,2,
|
741
|
+
3,3,2,2,1, 3,3,1,
|
742
|
+
3,3,3,2,2, 3,3,2,
|
743
|
+
3,3,3,3,3, 3,3,3,
|
744
|
+
4,3,3,2,1, 4,3,1,
|
745
|
+
4,4,3,3,2, 4,4,2,
|
746
|
+
4,4,4,3,3, 4,4,3,
|
747
|
+
4,4,4,4,4, 4,4,4,
|
748
|
+
5,4,3,2,1, 5,4,1,
|
749
|
+
5,4,4,3,2, 5,4,2,
|
750
|
+
5,5,4,4,3, 5,5,3,
|
751
|
+
5,5,5,4,4, 5,5,4,
|
752
|
+
5,5,5,5,5, 5,5,5,
|
753
|
+
6,5,4,2,1, 6,5,1,
|
754
|
+
6,5,4,3,2, 6,5,2,
|
755
|
+
6,5,5,4,3, 6,5,3,
|
756
|
+
6,6,5,5,4, 6,6,4,
|
757
|
+
6,6,6,5,5, 6,6,5,
|
758
|
+
6,6,6,6,6, 6,6,6,
|
759
|
+
7,6,4,3,1, 7,6,1,
|
760
|
+
7,6,5,3,2, 7,6,2,
|
761
|
+
7,6,5,4,3, 7,6,3,
|
762
|
+
7,6,6,5,4, 7,6,4,
|
763
|
+
7,7,6,6,5, 7,7,5,
|
764
|
+
7,7,7,6,6, 7,7,6,
|
765
|
+
7,7,7,7,7, 7,7,7,
|
766
|
+
8,6,5,3,1, 8,6,1,
|
767
|
+
8,7,5,4,2, 8,7,2,
|
768
|
+
8,7,6,4,3, 8,7,3,
|
769
|
+
8,7,6,5,4, 8,7,4,
|
770
|
+
8,7,7,6,5, 8,7,5,
|
771
|
+
8,8,7,7,6, 8,8,6,
|
772
|
+
8,8,8,7,7, 8,8,7,
|
773
|
+
8,8,8,8,8, 8,8,8,
|
774
|
+
9,7,5,3,1, 9,7,1,
|
775
|
+
9,7,6,4,2, 9,7,2,
|
776
|
+
9,8,6,5,3, 9,8,3,
|
777
|
+
9,8,7,5,4, 9,8,4,
|
778
|
+
9,8,7,6,5, 9,8,5,
|
779
|
+
9,8,8,7,6, 9,8,6,
|
780
|
+
9,9,8,8,7, 9,9,7,
|
781
|
+
9,9,9,8,8, 9,9,8,
|
782
|
+
9,9,9,9,9, 9,9,9,
|
783
|
+
10,8,6,3,1, 10,8,1,
|
784
|
+
10,8,6,4,2, 10,8,2,
|
785
|
+
10,8,7,5,3, 10,8,3,
|
786
|
+
10,9,7,6,4, 10,9,4,
|
787
|
+
10,9,8,6,5, 10,9,5,
|
788
|
+
10,9,8,7,6, 10,9,6,
|
789
|
+
10,9,9,8,7, 10,9,7,
|
790
|
+
10,10,9,9,8, 10,10,8,
|
791
|
+
10,10,10,9,9, 10,10,9,
|
792
|
+
10,10,10,10,10, 10,10,10,
|
793
|
+
11,9,6,4,1, 11,9,1,
|
794
|
+
11,9,7,4,2, 11,9,2,
|
795
|
+
11,9,7,5,3, 11,9,3,
|
796
|
+
11,9,8,6,4, 11,9,4,
|
797
|
+
11,10,8,7,5, 11,10,5,
|
798
|
+
11,10,9,7,6, 11,10,6,
|
799
|
+
11,10,9,8,7, 11,10,7,
|
800
|
+
11,10,10,9,8, 11,10,8,
|
801
|
+
11,11,10,10,9, 11,11,9,
|
802
|
+
11,11,11,10,10, 11,11,10,
|
803
|
+
11,11,11,11,11, 11,11,11,
|
804
|
+
12,9,7,4,1, 12,9,1,
|
805
|
+
12,10,7,5,2, 12,10,2,
|
806
|
+
12,10,8,5,3, 12,10,3,
|
807
|
+
12,10,8,6,4, 12,10,4,
|
808
|
+
12,10,9,7,5, 12,10,5,
|
809
|
+
12,11,9,8,6, 12,11,6,
|
810
|
+
12,11,10,8,7, 12,11,7,
|
811
|
+
12,11,10,9,8, 12,11,8,
|
812
|
+
12,11,11,10,9, 12,11,9,
|
813
|
+
12,12,11,11,10, 12,12,10,
|
814
|
+
12,12,12,11,11, 12,12,11,
|
815
|
+
12,12,12,12,12, 12,12,12,
|
816
|
+
|
817
|
+
1,1,1,1,1, 1,1,1,
|
818
|
+
2,2,2,1,1, 2,1,1,
|
819
|
+
2,2,2,2,2, 2,2,2,
|
820
|
+
3,3,2,2,1, 3,2,1,
|
821
|
+
3,3,3,2,2, 3,2,2,
|
822
|
+
3,3,3,3,3, 3,3,3,
|
823
|
+
4,3,3,2,1, 4,2,1,
|
824
|
+
4,4,3,3,2, 4,3,2,
|
825
|
+
4,4,4,3,3, 4,3,3,
|
826
|
+
4,4,4,4,4, 4,4,4,
|
827
|
+
5,4,3,2,1, 5,2,1,
|
828
|
+
5,4,4,3,2, 5,3,2,
|
829
|
+
5,5,4,4,3, 5,4,3,
|
830
|
+
5,5,5,4,4, 5,4,4,
|
831
|
+
5,5,5,5,5, 5,5,5,
|
832
|
+
6,5,4,2,1, 6,2,1,
|
833
|
+
6,5,4,3,2, 6,3,2,
|
834
|
+
6,5,5,4,3, 6,4,3,
|
835
|
+
6,6,5,5,4, 6,5,4,
|
836
|
+
6,6,6,5,5, 6,5,5,
|
837
|
+
6,6,6,6,6, 6,6,6,
|
838
|
+
7,6,4,3,1, 7,3,1,
|
839
|
+
7,6,5,3,2, 7,3,2,
|
840
|
+
7,6,5,4,3, 7,4,3,
|
841
|
+
7,6,6,5,4, 7,5,4,
|
842
|
+
7,7,6,6,5, 7,6,5,
|
843
|
+
7,7,7,6,6, 7,6,6,
|
844
|
+
7,7,7,7,7, 7,7,7,
|
845
|
+
8,6,5,3,1, 8,3,1,
|
846
|
+
8,7,5,4,2, 8,4,2,
|
847
|
+
8,7,6,4,3, 8,4,3,
|
848
|
+
8,7,6,5,4, 8,5,4,
|
849
|
+
8,7,7,6,5, 8,6,5,
|
850
|
+
8,8,7,7,6, 8,7,6,
|
851
|
+
8,8,8,7,7, 8,7,7,
|
852
|
+
8,8,8,8,8, 8,8,8,
|
853
|
+
9,7,5,3,1, 9,3,1,
|
854
|
+
9,7,6,4,2, 9,4,2,
|
855
|
+
9,8,6,5,3, 9,5,3,
|
856
|
+
9,8,7,5,4, 9,5,4,
|
857
|
+
9,8,7,6,5, 9,6,5,
|
858
|
+
9,8,8,7,6, 9,7,6,
|
859
|
+
9,9,8,8,7, 9,8,7,
|
860
|
+
9,9,9,8,8, 9,8,8,
|
861
|
+
9,9,9,9,9, 9,9,9,
|
862
|
+
10,8,6,3,1, 10,3,1,
|
863
|
+
10,8,6,4,2, 10,4,2,
|
864
|
+
10,8,7,5,3, 10,5,3,
|
865
|
+
10,9,7,6,4, 10,6,4,
|
866
|
+
10,9,8,6,5, 10,6,5,
|
867
|
+
10,9,8,7,6, 10,7,6,
|
868
|
+
10,9,9,8,7, 10,8,7,
|
869
|
+
10,10,9,9,8, 10,9,8,
|
870
|
+
10,10,10,9,9, 10,9,9,
|
871
|
+
10,10,10,10,10, 10,10,10,
|
872
|
+
11,9,6,4,1, 11,4,1,
|
873
|
+
11,9,7,4,2, 11,4,2,
|
874
|
+
11,9,7,5,3, 11,5,3,
|
875
|
+
11,9,8,6,4, 11,6,4,
|
876
|
+
11,10,8,7,5, 11,7,5,
|
877
|
+
11,10,9,7,6, 11,7,6,
|
878
|
+
11,10,9,8,7, 11,8,7,
|
879
|
+
11,10,10,9,8, 11,9,8,
|
880
|
+
11,11,10,10,9, 11,10,9,
|
881
|
+
11,11,11,10,10, 11,10,10,
|
882
|
+
11,11,11,11,11, 11,11,11,
|
883
|
+
12,9,7,4,1, 12,4,1,
|
884
|
+
12,10,7,5,2, 12,5,2,
|
885
|
+
12,10,8,5,3, 12,5,3,
|
886
|
+
12,10,8,6,4, 12,6,4,
|
887
|
+
12,10,9,7,5, 12,7,5,
|
888
|
+
12,11,9,8,6, 12,8,6,
|
889
|
+
12,11,10,8,7, 12,8,7,
|
890
|
+
12,11,10,9,8, 12,9,8,
|
891
|
+
12,11,11,10,9, 12,10,9,
|
892
|
+
12,12,11,11,10, 12,11,10,
|
893
|
+
12,12,12,11,11, 12,11,11,
|
894
|
+
12,12,12,12,12, 12,12,12,
|
895
|
+
};
|
896
|
+
|
897
|
+
// Backmap a single desired probability into an entry in kLgProbV2Tbl
|
898
|
+
// kLgProbV2TblBackmap[p], for p = 1..12, is the subscript of the first
// kLgProbV2Tbl entry whose leading value is p. Slot 0 also maps to entry 0.
static const uint8 kLgProbV2TblBackmap[13] = {
  0,                  // [0]
  0, 1, 3, 6,         // [1..4]
  10, 15, 21, 28,     // [5..8]
  36, 45, 55, 66,     // [9..12]
};
|
902
|
+
|
903
|
+
|
904
|
+
// Always advances one UTF-8 character
|
905
|
+
// Byte count to step over, indexed by the byte value found at the current
// position. No entry is zero, so a scan using this table always moves forward.
// Each row below covers 32 consecutive byte values.
static const uint8 kAdvanceOneChar[256] = {
  // 0x00..0x1F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x20..0x3F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x40..0x5F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x60..0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

  // 0x80..0x9F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xA0..0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0..0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0..0xEF step 3, 0xF0..0xFF step 4
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
};
|
916
|
+
|
917
|
+
// Does not advance past space or cr/lf/nul
|
918
|
+
// Byte count to step over, indexed by the byte value found at the current
// position. Bytes 0x00..0x20 (control bytes and space) map to 0, so a scan
// using this table stops on them. Each row covers 32 consecutive byte values.
static const uint8 kAdvanceOneCharButSpace[256] = {
  // 0x00..0x1F: never advance
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x20..0x3F: space (0x20) does not advance, the rest do
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x40..0x5F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x60..0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

  // 0x80..0x9F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xA0..0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0..0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0..0xEF step 3, 0xF0..0xFF step 4
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
};
|
929
|
+
|
930
|
+
// Advances *only* on space or ASCII vowel (or illegal byte)
|
931
|
+
// Byte count to step over, indexed by the byte value found at the current
// position. Entries are 1 for bytes 0x00..0x20, for the ASCII vowels
// A E I O U / a e i o u, and for bytes 0x80..0xBF; every other byte maps to 0.
// Each row covers 32 consecutive byte values.
static const uint8 kAdvanceOneCharSpaceVowel[256] = {
  // 0x00..0x1F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x20..0x3F: only space (0x20) advances
  1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x40..0x5F: A E I O U
  0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
  // 0x60..0x7F: a e i o u
  0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,

  // 0x80..0x9F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xA0..0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0..0xDF
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0xE0..0xFF
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
|
942
|
+
|
943
|
+
// Advances *only* on space (or illegal byte)
|
944
|
+
// Byte count to step over, indexed by the byte value found at the current
// position. Entries are 1 for bytes 0x00..0x20 and 0x80..0xBF; every other
// byte maps to 0. Each row covers 32 consecutive byte values.
static const uint8 kAdvanceOneCharSpace[256] = {
  // 0x00..0x1F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x20..0x3F: only space (0x20) advances
  1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x40..0x5F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x60..0x7F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  // 0x80..0x9F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xA0..0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0..0xDF
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0xE0..0xFF
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
|
955
|
+
|
956
|
+
|
957
|
+
//------------------------------------------------------------------------------
|
958
|
+
// General
|
959
|
+
//------------------------------------------------------------------------------
|
960
|
+
// Smaller of two ints; yields b when the two compare equal.
static inline int minint(int a, int b) {
  if (a < b) {return a;}
  return b;
}
|
961
|
+
// Larger of two ints; yields b when the two compare equal.
static inline int maxint(int a, int b) {
  if (a > b) {return a;}
  return b;
}
|
962
|
+
|
963
|
+
// Here to make available for debugging
|
964
|
+
int ReliabilityDelta(int value1, int value2, int count);
|
965
|
+
int ReliabilityMainstream(int topscore, int len, int mean_score);
|
966
|
+
|
967
|
+
// Returns "0" for too small
|
968
|
+
// Thin forwarding wrapper: hands lang to ExtLanguageCode() and returns
// whatever string pointer that call produces.
inline const char* MyExtLanguageCode(Language lang) {
  const char* code = ExtLanguageCode(lang);
  return code;
}
|
971
|
+
|
972
|
+
// Map script into Latin, Cyrillic, Arabic, Other. Used in keeping track of
|
973
|
+
// amount of training data for language-script combinations
|
974
|
+
// Collapse a script value into one of four buckets:
//   0 = Latin, 1 = Cyrillic, 2 = Arabic, 3 = any other script.
inline int LScript4(UnicodeLScript lscript) {
  int bucket = 3;
  if (lscript == ULScript_Latin) {
    bucket = 0;
  } else if (lscript == ULScript_Cyrillic) {
    bucket = 1;
  } else if (lscript == ULScript_Arabic) {
    bucket = 2;
  }
  return bucket;
}
|
980
|
+
|
981
|
+
|
982
|
+
// Routines to access 3 or 5 log probabilities in a single byte.
|
983
|
+
|
984
|
+
// Return address of 8-byte entry[i]
|
985
|
+
// Pointer to the start of entry i in kLgProbV2Tbl. Entries are eight bytes
// apart. No range check is made on i.
inline const uint8* LgProb2TblEntry(int i) {
  const int kBytesPerEntry = 8;
  return kLgProbV2Tbl + i * kBytesPerEntry;
}
|
988
|
+
|
989
|
+
// Return one of five probabilities in an entry
|
990
|
+
// CURRENTLY UNUSED
|
991
|
+
// Fetch byte j of an entry, counting from the entry's first byte.
// No range check is made on j.
inline uint8 LgProb5(const uint8* entry, int j) {
  const uint8* five_group = entry;
  return five_group[j];
}
|
994
|
+
|
995
|
+
// Return one of three probabilities in an entry
|
996
|
+
// Fetch byte j of the group that begins five bytes into an entry.
// No range check is made on j.
inline uint8 LgProb3(const uint8* entry, int j) {
  const uint8* three_group = entry + 5;
  return three_group[j];
}
|
999
|
+
|
1000
|
+
|
1001
|
+
|
1002
|
+
//------------------------------------------------------------------------------
|
1003
|
+
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
|
1004
|
+
//------------------------------------------------------------------------------
|
1005
|
+
|
1006
|
+
// Pick up 1..12 bytes and hash them via mask/shift/add. NO pre/post
|
1007
|
+
// OVERSHOOTS up to 3 bytes
|
1008
|
+
uint32 BiHashV25(const char* word_ptr, int bytecount);
|
1009
|
+
|
1010
|
+
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
1011
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
1012
|
+
uint32 QuadHashV25(const char* word_ptr, int bytecount);
|
1013
|
+
|
1014
|
+
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
|
1015
|
+
// OVERSHOOTS up to 3 bytes
|
1016
|
+
uint32 QuadHashV25Underscore(const char* word_ptr, int bytecount);
|
1017
|
+
|
1018
|
+
|
1019
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
1020
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
1021
|
+
// For runtime use of tables V3
|
1022
|
+
uint64 OctaHash40(const char* word_ptr, int bytecount);
|
1023
|
+
|
1024
|
+
uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
|
1025
|
+
|
1026
|
+
|
1027
|
+
// From 32-bit gram FP, return hash table subscript and remaining key
|
1028
|
+
// Split a 32-bit hash into a bucket subscript and a key.
//   *subscr  receives (quadhash + (quadhash >> 12)) masked by bucketcount - 1
//   *hashkey receives quadhash masked by keymask
// NOTE(review): masking with bucketcount - 1 presumably relies on bucketcount
// being a power of two -- confirm against the table definitions.
inline void QuadFPJustHash(uint32 quadhash,
                           uint32 keymask,
                           int bucketcount,
                           uint32* subscr, uint32* hashkey) {
  const uint32 folded = quadhash + (quadhash >> 12);
  const uint32 bucket_mask = static_cast<uint32>(bucketcount - 1);
  *subscr = folded & bucket_mask;
  *hashkey = quadhash & keymask;
}
|
1035
|
+
|
1036
|
+
// Look up 32-bit gram FP in caller-passed table
|
1037
|
+
// Typical size 256K entries (1.5MB)
|
1038
|
+
// Two-byte hashkey
|
1039
|
+
// Probe the table described by gram_obj for quadhash.
// QuadFPJustHash() picks the bucket and the key; the bucket's four keyvalue
// slots are then checked in order 0..3. The first slot whose keymask bits
// equal the key is returned whole; 0 is returned when none of them match.
inline const uint32 QuadHashV3Lookup4(const cld::CLDTableSummary* gram_obj,
                                      uint32 quadhash) {
  const uint32 keymask = gram_obj->kCLDTableKeyMask;
  const int bucketcount = gram_obj->kCLDTableSize;

  uint32 subscr, hashkey;
  QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);

  const IndirectProbBucket4* bucket_ptr = gram_obj->kCLDTable + subscr;
  for (int way = 0; way < 4; ++way) {
    const uint32 slot = bucket_ptr->keyvalue[way];
    if (((hashkey ^ slot) & keymask) == 0) {
      return slot;
    }
  }
  return 0;
}
|
1063
|
+
|
1064
|
+
|
1065
|
+
// Map 40 bits to subscript, hashkey, expected 18-22 bit subscript (min 16)
|
1066
|
+
// wwwwwwww xxxxxxxx xxxxxxxx yyyyyyyy yyyyyyyy
|
1067
|
+
// + ........ ....wwww wwwwxxxx xxxxxxxx xxxxyyyy
|
1068
|
+
// 00000000 00000000 00000011 11111111 11111111 (18-bit bucketcount-1)
|
1069
|
+
//
|
1070
|
+
// hashkey:
|
1071
|
+
// wwwwxxxx xxxxxxxx xxxx.... ........ (20-bit keymask)
|
1072
|
+
// 12-bit shift in subscript mixes in ~4 letters x 4 bits each
|
1073
|
+
|
1074
|
+
// From 40-bit gram FP, return hash table subscript and remaining key
|
1075
|
+
// Split a 64-bit hash value into a bucket subscript and a key.
//   *subscr  receives the low 32 bits of
//            (longwordhash + (longwordhash >> 12)) masked by bucketcount - 1
//   *hashkey receives the low 32 bits of (longwordhash >> 4), masked by keymask
// NOTE(review): masking with bucketcount - 1 presumably relies on bucketcount
// being a power of two -- confirm against the table definitions.
inline void OctaFPJustHash(uint64 longwordhash,
                           uint32 keymask,
                           int bucketcount,
                           uint32* subscr, uint32* hashkey) {
  const uint64 folded = longwordhash + (longwordhash >> 12);
  *subscr = static_cast<uint32>(folded & (bucketcount - 1));

  const uint32 shifted = static_cast<uint32>(longwordhash >> 4);
  *hashkey = shifted & keymask;
}
|
1084
|
+
|
1085
|
+
// Look up 40-bit gram FP in caller-passed table
|
1086
|
+
// Typical size 256K-4M entries (1-16MB)
|
1087
|
+
// 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
|
1088
|
+
// keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
|
1089
|
+
// Probe the table described by gram_obj for longwordhash.
// OctaFPJustHash() picks the bucket and the key; the bucket's four keyvalue
// slots are then checked in order 0..3. The first slot whose keymask bits
// equal the key is returned whole; 0 is returned when none of them match.
inline const uint32 OctaHashV3Lookup4(const cld::CLDTableSummary* gram_obj,
                                      uint64 longwordhash) {
  const uint32 keymask = gram_obj->kCLDTableKeyMask;
  const int bucketcount = gram_obj->kCLDTableSize;

  uint32 subscr, hashkey;
  OctaFPJustHash(longwordhash, keymask, bucketcount, &subscr, &hashkey);

  const IndirectProbBucket4* bucket_ptr = gram_obj->kCLDTable + subscr;
  for (int way = 0; way < 4; ++way) {
    const uint32 slot = bucket_ptr->keyvalue[way];
    if (((hashkey ^ slot) & keymask) == 0) {
      return slot;
    }
  }
  return 0;
}
|
1113
|
+
|
1114
|
+
|
1115
|
+
|
1116
|
+
//------------------------------------------------------------------------------
|
1117
|
+
// Scoring single groups of letters
|
1118
|
+
//------------------------------------------------------------------------------
|
1119
|
+
|
1120
|
+
// UNIGRAM score one => tote
|
1121
|
+
// Input: 1-byte entry of subscript into unigram probs, plus
|
1122
|
+
// an accumulator tote.
|
1123
|
+
// Output: running sums in tote updated
|
1124
|
+
void ProcessProbV25UniTote(int propval, Tote* tote);
|
1125
|
+
|
1126
|
+
// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
|
1127
|
+
// Input: 4-byte entry of 3 language numbers and one probability subscript,
|
1128
|
+
// plus an accumulator tote. (language 0 means unused entry)
|
1129
|
+
// Output: running sums in tote updated
|
1130
|
+
void ProcessProbV25Tote(uint32 probs, Tote* tote);
|
1131
|
+
|
1132
|
+
|
1133
|
+
//------------------------------------------------------------------------------
|
1134
|
+
// Routines to accumulate probabilities
|
1135
|
+
//------------------------------------------------------------------------------
|
1136
|
+
|
1137
|
+
// Score up to n=gram_limit unigrams, returning number of bytes consumed
|
1138
|
+
// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
|
1139
|
+
int DoUniScoreV3(const UTF8PropObj* unigram_obj,
|
1140
|
+
const char* isrc, int srclen, int advance_by,
|
1141
|
+
int* tote_grams, int gram_limit, Tote* chunk_tote);
|
1142
|
+
|
1143
|
+
|
1144
|
+
// Score all words in isrc, using languages that have bigrams (CJK)
|
1145
|
+
// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
|
1146
|
+
// Return number of bigrams that hit in the hash table
|
1147
|
+
int DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
|
1148
|
+
const char* isrc, int srclen, Tote* chunk_tote);
|
1149
|
+
|
1150
|
+
|
1151
|
+
// Score up to n=gram_limit quadgrams, returning number of bytes consumed
|
1152
|
+
// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
|
1153
|
+
int DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
|
1154
|
+
const char* isrc, int srclen, int advance_by,
|
1155
|
+
int* tote_grams, int gram_limit, Tote* chunk_tote);
|
1156
|
+
|
1157
|
+
// Score all octagrams (words) in isrc, using languages that have quadgrams
|
1158
|
+
// Caller supplies table, such as &kLongWord8Table_obj
|
1159
|
+
// Return number of words that hit in the hash table
|
1160
|
+
int DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
|
1161
|
+
const char* isrc, int srclen, Tote* chunk_tote);
|
1162
|
+
|
1163
|
+
//------------------------------------------------------------------------------
|
1164
|
+
// Reliability calculations, for single language and between languages
|
1165
|
+
//------------------------------------------------------------------------------
|
1166
|
+
|
1167
|
+
// Reliability = 0..100
|
1168
|
+
static const int kMinReliable = 75;
|
1169
|
+
|
1170
|
+
// Calculate ratio of score per 1KB vs. expected score per 1KB
|
1171
|
+
double GetNormalizedScore(Language lang, UnicodeLScript lscript,
|
1172
|
+
int bytes, int score);
|
1173
|
+
|
1174
|
+
// Calculate reliability of len bytes of script lscript with chunk_tote
|
1175
|
+
int GetReliability(int len, UnicodeLScript lscript, const Tote* chunk_tote);
|
1176
|
+
|
1177
|
+
|
1178
|
+
//------------------------------------------------------------------------------
|
1179
|
+
// Miscellaneous
|
1180
|
+
//------------------------------------------------------------------------------
|
1181
|
+
|
1182
|
+
// Make languages packed into uint32 values non-zero
|
1183
|
+
// These routines later could remap so languages not in QuadHash tables are not
|
1184
|
+
// represented, and so that any thrashing in accumulation is eliminated
|
1185
|
+
// Packed form of a language: its numeric value plus one, narrowed to a byte.
inline uint8 PackLanguage(Language lang) {
  const uint8 packed = static_cast<uint8>(lang + 1);
  return packed;
}
|
1187
|
+
|
1188
|
+
// Inverse of the +1 packing: subtract one and convert back to Language.
inline Language UnpackLanguage(int ilang) {
  const int unpacked = ilang - 1;
  return static_cast<Language>(unpacked);
}
|
1190
|
+
|
1191
|
+
// Useful single-byte tests
|
1192
|
+
// True when the byte lies in 0x80..0xBF, i.e. its top two bits are "10"
// (the UTF-8 continuation-byte pattern).
inline bool IsUTF8ContinueByte(char c) {
  const unsigned int bits = static_cast<unsigned char>(c);
  return (bits & 0xC0u) == 0x80u;
}
|
1194
|
+
// True when the byte's top bit is set, i.e. the byte lies in 0x80..0xFF.
inline bool IsUTF8HighByte(char c) {
  const unsigned int bits = static_cast<unsigned char>(c);
  return (bits & 0x80u) != 0;
}
|
1196
|
+
|
1197
|
+
|
1198
|
+
// Demote all languages except Top40 and plus_one
|
1199
|
+
// Do this just before sorting
|
1200
|
+
void DemoteNotTop40(Tote* chunk_tote, int packed_plus_one);
|
1201
|
+
|
1202
|
+
} // End namespace cld
|
1203
|
+
|
1204
|
+
|
1205
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
|