language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
|
@@ -0,0 +1,1205 @@
|
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
|
|
7
|
+
|
|
8
|
+
#include <string>
|
|
9
|
+
#include "encodings/compact_lang_det/ext_lang_enc.h"
|
|
10
|
+
#include "encodings/compact_lang_det/tote.h"
|
|
11
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
12
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
|
13
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
14
|
+
|
|
15
|
+
namespace cld {
|
|
16
|
+
|
|
17
|
+
// Hash bucket for four-way associative lookup with < 64K buckets
|
|
18
|
+
// 32 bytes per bucket, 8-byte entries
|
|
19
|
+
typedef struct {
|
|
20
|
+
uint32 key[4]; // Hashed word to look up; one key per entry, four entries per bucket
|
|
21
|
+
uint32 value[4]; // Per entry: three packed language numbers plus a probability subscript
|
|
22
|
+
} SmallWordProbBucket4;
|
|
23
|
+
|
|
24
|
+
// Hash bucket for four-way associative lookup with >= 64K buckets
|
|
25
|
+
// 24 bytes per bucket, 6-byte entries
|
|
26
|
+
typedef struct {
|
|
27
|
+
uint16 key[4]; // Stored half of each hashed word to look up; the other
|
|
28
|
+
// half is not stored here because it is used to pick the bucket
|
|
29
|
+
uint32 value[4]; // Per entry: three packed language numbers plus a probability subscript
|
|
30
|
+
} LargeQuadProbBucket4;
|
|
31
|
+
|
|
32
|
+
// Hash bucket for four-way associative lookup, indirect probabilities
|
|
33
|
+
// 16 bytes per bucket, 4-byte entries
|
|
34
|
+
typedef struct {
|
|
35
|
+
uint32 keyvalue[4]; // Per entry: upper part is the hash key, lower part is the indirect probability subscript
|
|
36
|
+
} IndirectProbBucket4;
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
// This describes a complete CLD table, consisting of
|
|
40
|
+
// a main lookup table, an indirect language/probability table, and
|
|
41
|
+
// four constants (table size, indirect table size, key mask, build date).
|
|
42
|
+
// The main table key is a quadgram, bigram, or longword hash, with
|
|
43
|
+
// part of the key used to select a bucket modulo kCLDTableSize,
|
|
44
|
+
// and the rest matched against the key portion of four entries in a bucket,
|
|
45
|
+
// defined by kCLDTableKeyMask. The remaining bits of an entry, defined
|
|
46
|
+
// by ~kCLDTableKeyMask, are usually a subscript in the indirect table.
|
|
47
|
+
//
|
|
48
|
+
// By using part of the key to select a bucket, those key bits do not need
|
|
49
|
+
// to be stored in the main table entries, saving space (typically 2 bytes).
|
|
50
|
+
//
|
|
51
|
+
// By using an indirect table for lang/prob triples, only the subscript needs
|
|
52
|
+
// to be stored in the main table entries, saving space (typically 2 bytes).
|
|
53
|
+
//
|
|
54
|
+
// Each entry in the indirect table has three languages and three
|
|
55
|
+
// corresponding probabilities, packed into four bytes.
|
|
56
|
+
//
|
|
57
|
+
// The build date constant is included just for version tracking and is not
|
|
58
|
+
// otherwise used.
|
|
59
|
+
//
|
|
60
|
+
// Different-size tables can be linked in for different production
|
|
61
|
+
// environments. By going indirect through this struct, the runtime code is
|
|
62
|
+
// insensitive to the actual sizes.
|
|
63
|
+
//
|
|
64
|
+
// An empty placeholder table can be described by a table size of 1
|
|
65
|
+
// bucket, a keymask of 0xffffffff, a degenerate bucket of four no-match
|
|
66
|
+
// entries, and a degenerate indirect table of one no-languages entry.
|
|
67
|
+
//
|
|
68
|
+
//
|
|
69
|
+
struct CLDTableSummary {
|
|
70
|
+
const IndirectProbBucket4* kCLDTable;
|
|
71
|
+
// Main lookup table. Each bucket has four entries, each part
|
|
72
|
+
// key and part indirect subscript
|
|
73
|
+
const uint32* kCLDTableInd; // Indirect table; each entry is three packed lang/prob
|
|
74
|
+
const int kCLDTableSize; // Bucket count of kCLDTable; a bucket is selected modulo this
|
|
75
|
+
const int kCLDTableIndSize; // Entries count of kCLDTableInd
|
|
76
|
+
const int kCLDTableKeyMask; // Mask for the key bits of an entry; ~mask is usually the indirect subscript
|
|
77
|
+
const int kCLDTableBuildDate; // yyyymmdd; version tracking only, not otherwise used
|
|
78
|
+
};
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
// Keeps per-character 0-12 language probabilities for CTJKVZ-- in that order.
|
|
82
|
+
// Chinese ChineseT Japanese Korean Vietnamese Zhuang
|
|
83
|
+
// (2 bytes unused, for alignment padding and future)
|
|
84
|
+
typedef struct {
|
|
85
|
+
uint8 probs[8]; // Six 0-12 probabilities in C,T,J,K,V,Z order, then two unused bytes
|
|
86
|
+
} UnigramProbArray;
|
|
87
|
+
|
|
88
|
+
// Map 8-bit subscript to CTJKVZ probabilities
|
|
89
|
+
// Target runtime probabilities for CTJK + VZ
|
|
90
|
+
// Hand-generated to cover a reasonable range of choices
|
|
91
|
+
static const int kTargetCTJKVZProbsSize = 242;
|
|
92
|
+
static const UnigramProbArray kTargetCTJKVZProbs[kTargetCTJKVZProbsSize] = {
|
|
93
|
+
{{0,0,0,0,0,0,0,0}},
|
|
94
|
+
{{0,0,0,0,0,12,0,0}},
|
|
95
|
+
{{0,0,0,0,12,0,0,0}},
|
|
96
|
+
{{0,0,0,12,0,0,0,0}},
|
|
97
|
+
{{0,0,12,0,0,0,0,0}},
|
|
98
|
+
{{0,12,0,0,0,0,0,0}},
|
|
99
|
+
{{12,0,0,0,0,0,0,0}},
|
|
100
|
+
|
|
101
|
+
{{8,0,0,0,4,0,0,0}},
|
|
102
|
+
{{8,0,0,4,0,0,0,0}},
|
|
103
|
+
{{8,0,4,0,0,0,0,0}},
|
|
104
|
+
{{8,4,0,0,0,0,0,0}},
|
|
105
|
+
{{8,2,0,2,0,0,0,0}},
|
|
106
|
+
{{0,0,0,0,0,8,0,0}},
|
|
107
|
+
{{0,4,8,0,0,0,0,0}},
|
|
108
|
+
{{4,0,0,0,0,8,0,0}},
|
|
109
|
+
{{0,0,8,0,0,0,0,0}},
|
|
110
|
+
{{8,2,2,0,0,0,0,0}},
|
|
111
|
+
{{0,8,4,0,0,0,0,0}},
|
|
112
|
+
{{8,0,0,0,0,4,0,0}},
|
|
113
|
+
{{0,8,2,0,0,0,0,0}},
|
|
114
|
+
{{4,8,0,0,0,0,0,0}},
|
|
115
|
+
{{2,8,0,2,0,0,0,0}},
|
|
116
|
+
{{2,2,8,0,0,0,0,0}},
|
|
117
|
+
{{0,8,0,0,0,0,0,0}},
|
|
118
|
+
{{0,2,8,0,0,0,0,0}},
|
|
119
|
+
{{2,8,2,0,0,0,0,0}},
|
|
120
|
+
{{8,0,0,0,0,0,0,0}},
|
|
121
|
+
{{2,8,0,0,0,0,0,0}},
|
|
122
|
+
{{8,2,0,0,0,0,0,0}},
|
|
123
|
+
|
|
124
|
+
{{0,6,2,0,2,0,0,0}},
|
|
125
|
+
{{2,0,0,0,6,0,0,0}},
|
|
126
|
+
{{4,0,0,0,6,0,0,0}},
|
|
127
|
+
{{4,6,0,0,4,0,0,0}},
|
|
128
|
+
{{4,6,2,0,2,0,0,0}},
|
|
129
|
+
{{4,6,4,0,2,0,0,0}},
|
|
130
|
+
{{5,4,6,0,0,0,0,0}},
|
|
131
|
+
{{6,0,0,0,4,0,0,0}},
|
|
132
|
+
{{6,0,2,0,4,0,0,0}},
|
|
133
|
+
{{6,0,4,0,4,0,0,0}},
|
|
134
|
+
{{6,2,0,0,4,0,0,0}},
|
|
135
|
+
{{6,2,2,0,4,0,0,0}},
|
|
136
|
+
{{6,2,4,0,2,0,0,0}},
|
|
137
|
+
{{6,4,0,0,2,0,0,0}},
|
|
138
|
+
{{6,4,2,0,2,0,0,0}},
|
|
139
|
+
{{0,0,6,2,0,0,0,0}},
|
|
140
|
+
{{0,6,2,0,0,2,0,0}},
|
|
141
|
+
{{2,2,2,0,0,6,0,0}},
|
|
142
|
+
{{2,2,6,4,0,0,0,0}},
|
|
143
|
+
{{2,4,0,0,0,6,0,0}},
|
|
144
|
+
{{2,6,0,4,0,0,0,0}},
|
|
145
|
+
{{2,6,2,4,0,0,0,0}},
|
|
146
|
+
{{2,6,4,4,0,0,0,0}},
|
|
147
|
+
{{4,0,2,0,0,6,0,0}},
|
|
148
|
+
{{4,2,6,2,0,0,0,0}},
|
|
149
|
+
{{4,4,2,0,0,6,0,0}},
|
|
150
|
+
{{4,6,4,0,0,2,0,0}},
|
|
151
|
+
{{6,0,2,0,0,2,0,0}},
|
|
152
|
+
{{6,2,0,0,0,2,0,0}},
|
|
153
|
+
{{6,2,2,0,0,4,0,0}},
|
|
154
|
+
{{6,2,4,0,0,2,0,0}},
|
|
155
|
+
{{4,6,2,0,0,4,0,0}},
|
|
156
|
+
{{6,4,2,0,0,4,0,0}},
|
|
157
|
+
{{2,0,0,0,0,6,0,0}},
|
|
158
|
+
{{6,2,0,2,0,0,0,0}},
|
|
159
|
+
{{2,2,0,0,0,6,0,0}},
|
|
160
|
+
{{6,2,6,0,0,0,0,0}},
|
|
161
|
+
{{6,4,2,0,0,2,0,0}},
|
|
162
|
+
{{6,4,2,2,0,0,0,0}},
|
|
163
|
+
{{4,6,4,2,0,0,0,0}},
|
|
164
|
+
{{6,0,2,0,0,4,0,0}},
|
|
165
|
+
{{6,0,4,0,0,2,0,0}},
|
|
166
|
+
{{6,0,6,0,0,0,0,0}},
|
|
167
|
+
{{6,2,2,0,0,0,0,0}},
|
|
168
|
+
{{6,4,0,0,0,2,0,0}},
|
|
169
|
+
{{6,4,5,0,0,0,0,0}},
|
|
170
|
+
{{0,6,0,2,0,0,0,0}},
|
|
171
|
+
{{0,6,2,2,0,0,0,0}},
|
|
172
|
+
{{2,6,0,2,0,0,0,0}},
|
|
173
|
+
{{2,6,2,2,0,0,0,0}},
|
|
174
|
+
{{4,2,0,0,0,6,0,0}},
|
|
175
|
+
{{6,4,0,0,0,4,0,0}},
|
|
176
|
+
{{6,4,0,2,0,0,0,0}},
|
|
177
|
+
{{6,6,0,2,0,0,0,0}},
|
|
178
|
+
{{6,0,4,0,0,4,0,0}},
|
|
179
|
+
{{6,2,0,0,0,4,0,0}},
|
|
180
|
+
{{6,6,2,2,0,0,0,0}},
|
|
181
|
+
{{4,6,0,0,0,2,0,0}},
|
|
182
|
+
{{2,6,6,0,0,0,0,0}},
|
|
183
|
+
{{4,5,6,0,0,0,0,0}},
|
|
184
|
+
{{4,6,0,2,0,0,0,0}},
|
|
185
|
+
{{6,2,0,0,0,6,0,0}},
|
|
186
|
+
{{0,6,4,2,0,0,0,0}},
|
|
187
|
+
{{4,0,6,0,0,0,0,0}},
|
|
188
|
+
{{2,6,4,2,0,0,0,0}},
|
|
189
|
+
{{4,6,0,0,0,4,0,0}},
|
|
190
|
+
{{6,2,2,0,0,0,0,0}},
|
|
191
|
+
{{4,6,2,2,0,0,0,0}},
|
|
192
|
+
{{4,6,5,0,0,0,0,0}},
|
|
193
|
+
{{6,0,2,0,0,0,0,0}},
|
|
194
|
+
{{6,4,4,0,0,0,0,0}},
|
|
195
|
+
{{4,2,6,0,0,0,0,0}},
|
|
196
|
+
{{2,0,6,0,0,0,0,0}},
|
|
197
|
+
{{4,4,0,0,0,6,0,0}},
|
|
198
|
+
{{4,4,6,0,0,0,0,0}},
|
|
199
|
+
{{4,6,2,0,0,2,0,0}},
|
|
200
|
+
{{2,2,6,0,0,0,0,0}},
|
|
201
|
+
{{2,4,6,0,0,0,0,0}},
|
|
202
|
+
{{0,6,6,0,0,0,0,0}},
|
|
203
|
+
{{6,2,4,0,0,0,0,0}},
|
|
204
|
+
{{0,4,6,0,0,0,0,0}},
|
|
205
|
+
{{4,0,0,0,0,6,0,0}},
|
|
206
|
+
{{4,6,4,0,0,0,0,0}},
|
|
207
|
+
{{6,0,0,0,0,6,0,0}},
|
|
208
|
+
{{6,0,0,0,0,2,0,0}},
|
|
209
|
+
{{6,0,4,0,0,0,0,0}},
|
|
210
|
+
{{6,5,4,0,0,0,0,0}},
|
|
211
|
+
{{0,2,6,0,0,0,0,0}},
|
|
212
|
+
{{0,0,6,0,0,0,0,0}},
|
|
213
|
+
{{6,6,2,0,0,0,0,0}},
|
|
214
|
+
{{2,6,4,0,0,0,0,0}},
|
|
215
|
+
{{6,4,2,0,0,0,0,0}},
|
|
216
|
+
{{2,6,2,0,0,0,0,0}},
|
|
217
|
+
{{2,6,0,0,0,0,0,0}},
|
|
218
|
+
{{6,0,0,0,0,4,0,0}},
|
|
219
|
+
{{6,4,0,0,0,0,0,0}},
|
|
220
|
+
{{6,6,0,0,0,0,0,0}},
|
|
221
|
+
{{5,6,4,0,0,0,0,0}},
|
|
222
|
+
{{0,6,0,0,0,0,0,0}},
|
|
223
|
+
{{6,2,0,0,0,0,0,0}},
|
|
224
|
+
{{0,6,2,0,0,0,0,0}},
|
|
225
|
+
{{4,6,2,0,0,0,0,0}},
|
|
226
|
+
{{0,6,4,0,0,0,0,0}},
|
|
227
|
+
{{4,6,0,0,0,0,0,0}},
|
|
228
|
+
{{6,0,0,0,0,0,0,0}},
|
|
229
|
+
{{6,6,5,0,0,0,0,0}},
|
|
230
|
+
{{6,5,6,0,0,0,0,0}},
|
|
231
|
+
{{5,6,6,0,0,0,0,0}},
|
|
232
|
+
{{5,5,6,0,0,0,0,0}},
|
|
233
|
+
{{5,6,5,0,0,0,0,0}},
|
|
234
|
+
{{6,5,5,0,0,0,0,0}},
|
|
235
|
+
{{6,6,6,0,0,0,0,0}},
|
|
236
|
+
{{6,5,0,0,0,0,0,0}},
|
|
237
|
+
{{6,0,5,0,0,0,0,0}},
|
|
238
|
+
{{0,6,5,0,0,0,0,0}},
|
|
239
|
+
{{5,6,0,0,0,0,0,0}},
|
|
240
|
+
{{5,0,6,0,0,0,0,0}},
|
|
241
|
+
{{0,5,6,0,0,0,0,0}},
|
|
242
|
+
|
|
243
|
+
{{0,0,0,0,4,0,0,0}},
|
|
244
|
+
{{0,0,0,4,0,0,0,0}},
|
|
245
|
+
{{2,2,0,0,4,0,0,0}},
|
|
246
|
+
{{2,2,2,0,4,0,0,0}},
|
|
247
|
+
{{2,4,0,0,2,0,0,0}},
|
|
248
|
+
{{2,4,2,0,2,0,0,0}},
|
|
249
|
+
{{2,4,4,0,2,0,0,0}},
|
|
250
|
+
{{4,0,2,0,4,0,0,0}},
|
|
251
|
+
{{4,0,4,0,2,0,0,0}},
|
|
252
|
+
{{4,2,0,0,2,0,0,0}},
|
|
253
|
+
{{4,2,2,0,2,0,0,0}},
|
|
254
|
+
{{4,4,0,0,2,0,0,0}},
|
|
255
|
+
{{4,4,2,0,2,0,0,0}},
|
|
256
|
+
{{4,4,4,0,2,0,0,0}},
|
|
257
|
+
{{0,2,2,4,0,0,0,0}},
|
|
258
|
+
{{2,2,4,2,0,0,0,0}},
|
|
259
|
+
{{2,4,4,0,0,2,0,0}},
|
|
260
|
+
{{2,4,4,2,0,0,0,0}},
|
|
261
|
+
{{4,0,4,0,0,2,0,0}},
|
|
262
|
+
{{4,0,4,0,0,4,0,0}},
|
|
263
|
+
{{4,2,2,4,0,0,0,0}},
|
|
264
|
+
{{4,4,0,2,0,0,0,0}},
|
|
265
|
+
{{2,2,0,4,0,0,0,0}},
|
|
266
|
+
{{2,4,2,2,0,0,0,0}},
|
|
267
|
+
{{4,4,2,2,0,0,0,0}},
|
|
268
|
+
{{4,0,4,0,0,0,0,0}},
|
|
269
|
+
{{4,4,4,0,0,4,0,0}},
|
|
270
|
+
{{0,4,0,2,0,0,0,0}},
|
|
271
|
+
{{0,4,2,2,0,0,0,0}},
|
|
272
|
+
{{4,0,2,0,0,2,0,0}},
|
|
273
|
+
{{4,2,0,0,0,4,0,0}},
|
|
274
|
+
{{2,2,2,0,0,4,0,0}},
|
|
275
|
+
{{4,0,0,2,0,0,0,0}},
|
|
276
|
+
{{4,4,4,0,0,2,0,0}},
|
|
277
|
+
{{4,0,0,0,0,4,0,0}},
|
|
278
|
+
{{4,0,2,0,0,4,0,0}},
|
|
279
|
+
{{4,2,0,0,0,2,0,0}},
|
|
280
|
+
{{4,2,2,0,0,2,0,0}},
|
|
281
|
+
{{2,4,0,2,0,0,0,0}},
|
|
282
|
+
{{2,2,0,0,0,4,0,0}},
|
|
283
|
+
{{2,4,0,0,0,4,0,0}},
|
|
284
|
+
{{2,4,2,0,0,4,0,0}},
|
|
285
|
+
{{4,2,4,0,0,0,0,0}},
|
|
286
|
+
{{2,0,4,0,0,0,0,0}},
|
|
287
|
+
{{4,0,2,0,0,0,0,0}},
|
|
288
|
+
{{4,4,0,0,0,4,0,0}},
|
|
289
|
+
{{4,4,2,0,0,4,0,0}},
|
|
290
|
+
{{0,4,4,0,0,0,0,0}},
|
|
291
|
+
{{4,4,0,0,0,2,0,0}},
|
|
292
|
+
{{2,4,0,0,0,2,0,0}},
|
|
293
|
+
{{2,2,4,0,0,0,0,0}},
|
|
294
|
+
{{0,2,4,0,0,0,0,0}},
|
|
295
|
+
{{4,2,2,0,0,0,0,0}},
|
|
296
|
+
{{2,4,2,0,0,2,0,0}},
|
|
297
|
+
{{4,4,4,0,0,0,0,0}},
|
|
298
|
+
{{2,4,4,0,0,0,0,0}},
|
|
299
|
+
{{0,0,4,0,0,0,0,0}},
|
|
300
|
+
{{0,4,2,0,0,0,0,0}},
|
|
301
|
+
{{4,4,2,0,0,2,0,0}},
|
|
302
|
+
{{2,4,2,0,0,0,0,0}},
|
|
303
|
+
{{4,2,0,0,0,0,0,0}},
|
|
304
|
+
{{4,4,0,0,0,0,0,0}},
|
|
305
|
+
{{4,4,2,0,0,0,0,0}},
|
|
306
|
+
{{2,4,0,0,0,0,0,0}},
|
|
307
|
+
{{0,4,0,0,0,0,0,0}},
|
|
308
|
+
{{4,0,0,0,0,0,0,0}},
|
|
309
|
+
{{0,0,0,4,4,0,0,0}},
|
|
310
|
+
{{0,0,4,0,4,0,0,0}},
|
|
311
|
+
{{0,0,4,4,0,0,0,0}},
|
|
312
|
+
{{0,4,0,0,4,0,0,0}},
|
|
313
|
+
{{0,4,0,4,0,0,0,0}},
|
|
314
|
+
{{4,0,0,0,4,0,0,0}},
|
|
315
|
+
{{4,0,0,4,0,0,0,0}},
|
|
316
|
+
|
|
317
|
+
{{2,0,0,0,0,0,0,0}},
|
|
318
|
+
{{0,2,0,0,0,0,0,0}},
|
|
319
|
+
{{0,2,0,2,2,0,0,0}},
|
|
320
|
+
{{0,2,2,0,2,0,0,0}},
|
|
321
|
+
{{2,0,0,2,2,0,0,0}},
|
|
322
|
+
{{2,0,2,0,2,0,0,0}},
|
|
323
|
+
{{2,0,2,2,0,0,0,0}},
|
|
324
|
+
{{2,2,0,0,2,0,0,0}},
|
|
325
|
+
{{2,2,2,2,0,0,0,0}},
|
|
326
|
+
{{2,2,0,2,0,0,0,0}},
|
|
327
|
+
{{2,2,0,0,0,0,0,0}},
|
|
328
|
+
{{0,0,2,0,0,0,0,0}},
|
|
329
|
+
{{0,2,2,0,0,0,0,0}},
|
|
330
|
+
{{2,2,2,0,0,0,0,0}},
|
|
331
|
+
{{0,0,0,2,0,0,0,0}},
|
|
332
|
+
{{2,0,2,0,0,0,0,0}},
|
|
333
|
+
{{0,2,0,2,0,0,0,0}},
|
|
334
|
+
{{0,0,2,2,0,0,0,0}},
|
|
335
|
+
{{0,2,2,2,0,0,0,0}},
|
|
336
|
+
};
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
// 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
|
|
342
|
+
static const uint8 kSkipSpaceVowelContinue[256] = {
|
|
343
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x00-0x1F
|
|
344
|
+
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x20-0x3F: space (0x20)
|
|
345
|
+
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, // 0x40-0x5F: A E I O U
|
|
346
|
+
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, // 0x60-0x7F: a e i o u
|
|
347
|
+
|
|
348
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x80-0x9F: UTF-8 continuation bytes
|
|
349
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xA0-0xBF: UTF-8 continuation bytes
|
|
350
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xC0-0xDF
|
|
351
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xE0-0xFF
|
|
352
|
+
};
|
|
353
|
+
|
|
354
|
+
// 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
|
|
355
|
+
static const uint8 kSkipSpaceContinue[256] = {
|
|
356
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x00-0x1F
|
|
357
|
+
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x20-0x3F: space (0x20)
|
|
358
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x40-0x5F
|
|
359
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x60-0x7F
|
|
360
|
+
|
|
361
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x80-0x9F: UTF-8 continuation bytes
|
|
362
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xA0-0xBF: UTF-8 continuation bytes
|
|
363
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xC0-0xDF
|
|
364
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xE0-0xFF
|
|
365
|
+
};
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
// If != UNKNOWN, use nilgrams to determine language of this script
|
|
369
|
+
static const Language kOnlyLanguagePerLScript[] = {
|
|
370
|
+
ENGLISH, // ULScript_Common, [no words should be in this script]
|
|
371
|
+
UNKNOWN_LANGUAGE, // ULScript_Latin,
|
|
372
|
+
//UNKNOWN_LANGUAGE, // ULScript_Greek, Jan 2009: change so we can score quads
|
|
373
|
+
GREEK, // ULScript_Greek, Mar 2009: change back; do gibberish separately
|
|
374
|
+
UNKNOWN_LANGUAGE, // ULScript_Cyrillic,
|
|
375
|
+
ARMENIAN, // ULScript_Armenian,
|
|
376
|
+
UNKNOWN_LANGUAGE, // ULScript_Hebrew,
|
|
377
|
+
UNKNOWN_LANGUAGE, // ULScript_Arabic,
|
|
378
|
+
SYRIAC, // ULScript_Syriac,
|
|
379
|
+
DHIVEHI, // ULScript_Thaana,
|
|
380
|
+
UNKNOWN_LANGUAGE, // ULScript_Devanagari,
|
|
381
|
+
UNKNOWN_LANGUAGE, // ULScript_Bengali,
|
|
382
|
+
PUNJABI, // ULScript_Gurmukhi,
|
|
383
|
+
GUJARATI, // ULScript_Gujarati,
|
|
384
|
+
ORIYA, // ULScript_Oriya,
|
|
385
|
+
TAMIL, // ULScript_Tamil,
|
|
386
|
+
TELUGU, // ULScript_Telugu,
|
|
387
|
+
KANNADA, // ULScript_Kannada,
|
|
388
|
+
MALAYALAM, // ULScript_Malayalam,
|
|
389
|
+
SINHALESE, // ULScript_Sinhala,
|
|
390
|
+
THAI, // ULScript_Thai,
|
|
391
|
+
LAOTHIAN, // ULScript_Lao,
|
|
392
|
+
UNKNOWN_LANGUAGE, // ULScript_Tibetan,
|
|
393
|
+
BURMESE, // ULScript_Myanmar,
|
|
394
|
+
GEORGIAN, // ULScript_Georgian,
|
|
395
|
+
UNKNOWN_LANGUAGE, // ULScript_HanCJK,
|
|
396
|
+
UNKNOWN_LANGUAGE, // ULScript_Ethiopic,
|
|
397
|
+
CHEROKEE, // ULScript_Cherokee,
|
|
398
|
+
INUKTITUT, // ULScript_Canadian_Aboriginal,
|
|
399
|
+
X_OGHAM, // ULScript_Ogham,
|
|
400
|
+
X_RUNIC, // ULScript_Runic,
|
|
401
|
+
KHMER, // ULScript_Khmer,
|
|
402
|
+
MONGOLIAN, // ULScript_Mongolian,
|
|
403
|
+
X_YI, // ULScript_Yi,
|
|
404
|
+
X_OLD_ITALIC, // ULScript_Old_Italic,
|
|
405
|
+
X_GOTHIC, // ULScript_Gothic,
|
|
406
|
+
X_DESERET, // ULScript_Deseret,
|
|
407
|
+
ENGLISH, // ULScript_Inherited, [no words should be in this script]
|
|
408
|
+
TAGALOG, // ULScript_Tagalog,
|
|
409
|
+
X_HANUNOO, // ULScript_Hanunoo,
|
|
410
|
+
X_BUHID, // ULScript_Buhid,
|
|
411
|
+
X_TAGBANWA, // ULScript_Tagbanwa,
|
|
412
|
+
LIMBU, // ULScript_Limbu,
|
|
413
|
+
X_TAI_LE, // ULScript_Tai_Le,
|
|
414
|
+
X_LINEAR_B, // ULScript_Linear_B,
|
|
415
|
+
X_UGARITIC, // ULScript_Ugaritic,
|
|
416
|
+
X_SHAVIAN, // ULScript_Shavian,
|
|
417
|
+
X_OSMANYA, // ULScript_Osmanya,
|
|
418
|
+
X_CYPRIOT, // ULScript_Cypriot,
|
|
419
|
+
X_BUGINESE, // ULScript_Buginese,
|
|
420
|
+
X_COPTIC, // ULScript_Coptic,
|
|
421
|
+
X_NEW_TAI_LUE, // ULScript_New_Tai_Lue,
|
|
422
|
+
X_GLAGOLITIC, // ULScript_Glagolitic,
|
|
423
|
+
X_TIFINAGH, // ULScript_Tifinagh,
|
|
424
|
+
X_SYLOTI_NAGRI, // ULScript_Syloti_Nagri,
|
|
425
|
+
X_OLD_PERSIAN, // ULScript_Old_Persian,
|
|
426
|
+
X_KHAROSHTHI, // ULScript_Kharoshthi,
|
|
427
|
+
X_BALINESE, // ULScript_Balinese,
|
|
428
|
+
X_CUNEIFORM, // ULScript_Cuneiform,
|
|
429
|
+
X_PHOENICIAN, // ULScript_Phoenician,
|
|
430
|
+
X_PHAGS_PA, // ULScript_Phags_Pa,
|
|
431
|
+
X_NKO, // ULScript_Nko,
|
|
432
|
+
|
|
433
|
+
// Unicode 5.1
|
|
434
|
+
X_SUDANESE, // ULScript_Sundanese,
|
|
435
|
+
X_LEPCHA, // ULScript_Lepcha,
|
|
436
|
+
X_OL_CHIKI, // ULScript_Ol_Chiki,
|
|
437
|
+
X_VAI, // ULScript_Vai,
|
|
438
|
+
X_SAURASHTRA, // ULScript_Saurashtra,
|
|
439
|
+
X_KAYAH_LI, // ULScript_Kayah_Li,
|
|
440
|
+
X_REJANG, // ULScript_Rejang,
|
|
441
|
+
X_LYCIAN, // ULScript_Lycian,
|
|
442
|
+
X_CARIAN, // ULScript_Carian,
|
|
443
|
+
X_LYDIAN, // ULScript_Lydian,
|
|
444
|
+
X_CHAM, // ULScript_Cham,
|
|
445
|
+
};
|
|
446
|
+
|
|
447
|
+
COMPILE_ASSERT(arraysize(kOnlyLanguagePerLScript) == ULScript_NUM_SCRIPTS,
|
|
448
|
+
kOnlyLanguagePerLScript_has_incorrect_length);
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
// This is, in a sense, the complement of the table above
|
|
452
|
+
// If != UNKNOWN, determines a default language of this script
|
|
453
|
+
static const Language kDefaultLanguagePerLScript[] = {
|
|
454
|
+
UNKNOWN_LANGUAGE, // ULScript_Common, [no words should be in this script]
|
|
455
|
+
ENGLISH, // ULScript_Latin,
|
|
456
|
+
UNKNOWN_LANGUAGE, // ULScript_Greek,
|
|
457
|
+
RUSSIAN, // ULScript_Cyrillic,
|
|
458
|
+
UNKNOWN_LANGUAGE, // ULScript_Armenian,
|
|
459
|
+
HEBREW, // ULScript_Hebrew,
|
|
460
|
+
ARABIC, // ULScript_Arabic,
|
|
461
|
+
UNKNOWN_LANGUAGE, // ULScript_Syriac,
|
|
462
|
+
UNKNOWN_LANGUAGE, // ULScript_Thaana,
|
|
463
|
+
HINDI, // ULScript_Devanagari,
|
|
464
|
+
BENGALI, // ULScript_Bengali,
|
|
465
|
+
UNKNOWN_LANGUAGE, // ULScript_Gurmukhi,
|
|
466
|
+
UNKNOWN_LANGUAGE, // ULScript_Gujarati,
|
|
467
|
+
UNKNOWN_LANGUAGE, // ULScript_Oriya,
|
|
468
|
+
UNKNOWN_LANGUAGE, // ULScript_Tamil,
|
|
469
|
+
UNKNOWN_LANGUAGE, // ULScript_Telugu,
|
|
470
|
+
UNKNOWN_LANGUAGE, // ULScript_Kannada,
|
|
471
|
+
UNKNOWN_LANGUAGE, // ULScript_Malayalam,
|
|
472
|
+
UNKNOWN_LANGUAGE, // ULScript_Sinhala,
|
|
473
|
+
UNKNOWN_LANGUAGE, // ULScript_Thai,
|
|
474
|
+
UNKNOWN_LANGUAGE, // ULScript_Lao,
|
|
475
|
+
TIBETAN, // ULScript_Tibetan,
|
|
476
|
+
UNKNOWN_LANGUAGE, // ULScript_Myanmar,
|
|
477
|
+
UNKNOWN_LANGUAGE, // ULScript_Georgian,
|
|
478
|
+
CHINESE, // ULScript_HanCJK,
|
|
479
|
+
AMHARIC, // ULScript_Ethiopic,
|
|
480
|
+
UNKNOWN_LANGUAGE, // ULScript_Cherokee,
|
|
481
|
+
UNKNOWN_LANGUAGE, // ULScript_Canadian_Aboriginal,
|
|
482
|
+
UNKNOWN_LANGUAGE, // ULScript_Ogham,
|
|
483
|
+
UNKNOWN_LANGUAGE, // ULScript_Runic,
|
|
484
|
+
UNKNOWN_LANGUAGE, // ULScript_Khmer,
|
|
485
|
+
UNKNOWN_LANGUAGE, // ULScript_Mongolian,
|
|
486
|
+
UNKNOWN_LANGUAGE, // ULScript_Yi,
|
|
487
|
+
UNKNOWN_LANGUAGE, // ULScript_Old_Italic,
|
|
488
|
+
UNKNOWN_LANGUAGE, // ULScript_Gothic,
|
|
489
|
+
UNKNOWN_LANGUAGE, // ULScript_Deseret,
|
|
490
|
+
UNKNOWN_LANGUAGE, // ULScript_Inherited, [no words should be in this script]
|
|
491
|
+
UNKNOWN_LANGUAGE, // ULScript_Tagalog,
|
|
492
|
+
UNKNOWN_LANGUAGE, // ULScript_Hanunoo,
|
|
493
|
+
UNKNOWN_LANGUAGE, // ULScript_Buhid,
|
|
494
|
+
UNKNOWN_LANGUAGE, // ULScript_Tagbanwa,
|
|
495
|
+
UNKNOWN_LANGUAGE, // ULScript_Limbu,
|
|
496
|
+
UNKNOWN_LANGUAGE, // ULScript_Tai_Le,
|
|
497
|
+
UNKNOWN_LANGUAGE, // ULScript_Linear_B,
|
|
498
|
+
UNKNOWN_LANGUAGE, // ULScript_Ugaritic,
|
|
499
|
+
UNKNOWN_LANGUAGE, // ULScript_Shavian,
|
|
500
|
+
UNKNOWN_LANGUAGE, // ULScript_Osmanya,
|
|
501
|
+
UNKNOWN_LANGUAGE, // ULScript_Cypriot,
|
|
502
|
+
UNKNOWN_LANGUAGE, // ULScript_Buginese,
|
|
503
|
+
UNKNOWN_LANGUAGE, // ULScript_Coptic,
|
|
504
|
+
UNKNOWN_LANGUAGE, // ULScript_New_Tai_Lue,
|
|
505
|
+
UNKNOWN_LANGUAGE, // ULScript_Glagolitic,
|
|
506
|
+
UNKNOWN_LANGUAGE, // ULScript_Tifinagh,
|
|
507
|
+
UNKNOWN_LANGUAGE, // ULScript_Syloti_Nagri,
|
|
508
|
+
UNKNOWN_LANGUAGE, // ULScript_Old_Persian,
|
|
509
|
+
UNKNOWN_LANGUAGE, // ULScript_Kharoshthi,
|
|
510
|
+
UNKNOWN_LANGUAGE, // ULScript_Balinese,
|
|
511
|
+
UNKNOWN_LANGUAGE, // ULScript_Cuneiform,
|
|
512
|
+
UNKNOWN_LANGUAGE, // ULScript_Phoenician,
|
|
513
|
+
UNKNOWN_LANGUAGE, // ULScript_Phags_Pa,
|
|
514
|
+
UNKNOWN_LANGUAGE, // ULScript_Nko,
|
|
515
|
+
|
|
516
|
+
// Unicode 5.1
|
|
517
|
+
UNKNOWN_LANGUAGE, // ULScript_Sundanese,
|
|
518
|
+
UNKNOWN_LANGUAGE, // ULScript_Lepcha,
|
|
519
|
+
UNKNOWN_LANGUAGE, // ULScript_Ol_Chiki,
|
|
520
|
+
UNKNOWN_LANGUAGE, // ULScript_Vai,
|
|
521
|
+
UNKNOWN_LANGUAGE, // ULScript_Saurashtra,
|
|
522
|
+
UNKNOWN_LANGUAGE, // ULScript_Kayah_Li,
|
|
523
|
+
UNKNOWN_LANGUAGE, // ULScript_Rejang,
|
|
524
|
+
UNKNOWN_LANGUAGE, // ULScript_Lycian,
|
|
525
|
+
UNKNOWN_LANGUAGE, // ULScript_Carian,
|
|
526
|
+
UNKNOWN_LANGUAGE, // ULScript_Lydian,
|
|
527
|
+
UNKNOWN_LANGUAGE, // ULScript_Cham,
|
|
528
|
+
};
|
|
529
|
+
|
|
530
|
+
COMPILE_ASSERT(arraysize(kDefaultLanguagePerLScript) == ULScript_NUM_SCRIPTS,
|
|
531
|
+
kDefaultLanguagePerLScript_has_incorrect_length);
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
// True for standalone languages (only lang in a script)
|
|
535
|
+
// Subscripted by packed language number
|
|
536
|
+
// If 1, we will use nilgrams to determine language
|
|
537
|
+
static const uint8 kIsStandaloneLang[EXT_NUM_LANGUAGES + 1] = {
|
|
538
|
+
0,
|
|
539
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,0, // GREEK
|
|
540
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
|
541
|
+
0,1,0,0,1, 0,1,0,0,0, 0,0,1,1,0, 0,0,0,0,1, // MALAYALAM..KANNADA
|
|
542
|
+
1,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,1, // PUNJABI..SINHALESE
|
|
543
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,1,1,0, // ARMENIAN..LAOTHIAN
|
|
544
|
+
|
|
545
|
+
0,0,0,0,1, 0,1,1,1,0, 1,0,0,0,0, 0,0,0,0,0, // KHMER..ORIYA
|
|
546
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
|
547
|
+
0,1,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, // INUKTITUT
|
|
548
|
+
|
|
549
|
+
0,0,0,0,0, // [160..164]
|
|
550
|
+
// Add new language standalone bit just before here
|
|
551
|
+
0,0,0,0,0, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
|
|
552
|
+
1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
|
|
553
|
+
|
|
554
|
+
1,1,1,1,
|
|
555
|
+
};
|
|
556
|
+
|
|
557
|
+
// True for ULScript_HanCJK
|
|
558
|
+
// (Vietnamese and Zhuang also have Latin script quadgrams)
|
|
559
|
+
// Subscripted by packed language number
|
|
560
|
+
static const uint8 kIsUnigramLang[EXT_NUM_LANGUAGES + 1] = {
|
|
561
|
+
0,
|
|
562
|
+
0,0,0,0,0, 0,0,0,1,1, 0,0,0,0,0, 0,1,0,0,0, // JAPANESE KOREAN CHINESE
|
|
563
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
|
564
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
|
565
|
+
0,0,0,0,0, 0,1,0,0,1, 0,0,0,0,0, 0,0,0,0,0, // VIETNAMESE CHINESE_T
|
|
566
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
|
567
|
+
|
|
568
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
|
569
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
|
570
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,0, // ZHUANG
|
|
571
|
+
|
|
572
|
+
0,0,0,0,0, // [160..164]
|
|
573
|
+
// Add new language unigram bit just before here
|
|
574
|
+
|
|
575
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
|
576
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
|
|
577
|
+
|
|
578
|
+
0,0,0,0,
|
|
579
|
+
};
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
// True for ULScript_HanCJK
|
|
583
|
+
// Subscripted by lscript number
|
|
584
|
+
static const uint8 kScoreUniPerLScript[] = {
|
|
585
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,0,0,0,0,0,0,0, // [0..31]; the only 1 is at subscript 24 (presumably ULScript_HanCJK -- confirm against the enum)
|
|
586
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // [32..63]
|
|
587
|
+
0,0,0,0,0,0,0,0, // [64..71]
|
|
588
|
+
};
|
|
589
|
+
|
|
590
|
+
COMPILE_ASSERT(arraysize(kScoreUniPerLScript) == ULScript_NUM_SCRIPTS,
|
|
591
|
+
kScoreUniPerLScript_has_incorrect_length);
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
// Defines Top40 packed languages
|
|
595
|
+
|
|
596
|
+
// Tier 0/1 Language enum list (16)
|
|
597
|
+
// ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS
|
|
598
|
+
// DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
|
|
599
|
+
// PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
|
|
600
|
+
// ARABIC,
|
|
601
|
+
//
|
|
602
|
+
// Tier 2 Language enum list (22)
|
|
603
|
+
// SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
|
|
604
|
+
// HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
|
|
605
|
+
// VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
|
|
606
|
+
// TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
|
|
607
|
+
// UKRAINIAN, HINDI,
|
|
608
|
+
//
|
|
609
|
+
// use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
|
|
610
|
+
//
|
|
611
|
+
// Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
|
|
612
|
+
|
|
613
|
+
// NOTE: packed, i.e. Language enum + 1
|
|
614
|
+
static const uint8 kIsPackedTop40[EXT_NUM_LANGUAGES + 1] = {
|
|
615
|
+
0,
|
|
616
|
+
1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,0,
|
|
617
|
+
1,1,1,1,0, 1,0,1,0,0, 0,0,1,1,1, 1,0,0,1,0,
|
|
618
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,1, 1,0,0,0,0,
|
|
619
|
+
0,0,0,1,0, 0,1,0,1,1, 0,0,0,0,0, 0,0,0,0,0,
|
|
620
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,1,0,0, 0,0,0,0,0,
|
|
621
|
+
|
|
622
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
|
623
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
|
624
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
|
625
|
+
|
|
626
|
+
0,0,0,0,0, // [160..164]
|
|
627
|
+
// Add new language top40 bit just before here
|
|
628
|
+
|
|
629
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
|
630
|
+
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
|
|
631
|
+
|
|
632
|
+
0,0,0,0,
|
|
633
|
+
};
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
// Table has 234 eight-byte entries. Each entry has a five-byte array and
|
|
638
|
+
// a three-byte array of log base 2 probabilities in the range 1..12.
|
|
639
|
+
// The intended use is to express five or three probabilities in a single-byte
|
|
640
|
+
// subscript, then decode via this table. These probabilities are
|
|
641
|
+
// intended to go with an array of five or three language numbers.
|
|
642
|
+
//
|
|
643
|
+
// The corresponding language numbers will have to be sorted by descending
|
|
644
|
+
// probability, then the actual probability subscript chosen to match the
|
|
645
|
+
// closest available entry in this table.
|
|
646
|
+
//
|
|
647
|
+
// Pattern of probability values:
|
|
648
|
+
// hi 3/4 1/2 1/4 lo hi mid lo
|
|
649
|
+
// where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 and
|
|
650
|
+
// mid is one of 3/4 1/2 or 1/4.
|
|
651
|
+
// There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and
|
|
652
|
+
// lo running 1..hi. Only the first group is used for five-entry lookups.
|
|
653
|
+
// The mid value in the first group is 1/2, the second group 3/4, and the
|
|
654
|
+
// third group 1/4. For three-entry lookups, this allows the mid entry to be
|
|
655
|
+
// somewhat higher or lower than the midpoint, to allow a better match to the
|
|
656
|
+
// original probabilities.
|
|
657
|
+
static const int kLgProbV2TblSize = 234;
|
|
658
|
+
static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
|
|
659
|
+
1,1,1,1,1, 1,1,1, // [0]
|
|
660
|
+
2,2,2,1,1, 2,2,1, // [1]
|
|
661
|
+
2,2,2,2,2, 2,2,2,
|
|
662
|
+
3,3,2,2,1, 3,2,1, // [3]
|
|
663
|
+
3,3,3,2,2, 3,3,2,
|
|
664
|
+
3,3,3,3,3, 3,3,3,
|
|
665
|
+
4,3,3,2,1, 4,3,1, // [6]
|
|
666
|
+
4,4,3,3,2, 4,3,2,
|
|
667
|
+
4,4,4,3,3, 4,4,3,
|
|
668
|
+
4,4,4,4,4, 4,4,4,
|
|
669
|
+
5,4,3,2,1, 5,3,1, // [10]
|
|
670
|
+
5,4,4,3,2, 5,4,2,
|
|
671
|
+
5,5,4,4,3, 5,4,3,
|
|
672
|
+
5,5,5,4,4, 5,5,4,
|
|
673
|
+
5,5,5,5,5, 5,5,5,
|
|
674
|
+
6,5,4,2,1, 6,4,1, // [15]
|
|
675
|
+
6,5,4,3,2, 6,4,2,
|
|
676
|
+
6,5,5,4,3, 6,5,3,
|
|
677
|
+
6,6,5,5,4, 6,5,4,
|
|
678
|
+
6,6,6,5,5, 6,6,5,
|
|
679
|
+
6,6,6,6,6, 6,6,6,
|
|
680
|
+
7,6,4,3,1, 7,4,1, // [21]
|
|
681
|
+
7,6,5,3,2, 7,5,2,
|
|
682
|
+
7,6,5,4,3, 7,5,3,
|
|
683
|
+
7,6,6,5,4, 7,6,4,
|
|
684
|
+
7,7,6,6,5, 7,6,5,
|
|
685
|
+
7,7,7,6,6, 7,7,6,
|
|
686
|
+
7,7,7,7,7, 7,7,7,
|
|
687
|
+
8,6,5,3,1, 8,5,1, // [28]
|
|
688
|
+
8,7,5,4,2, 8,5,2,
|
|
689
|
+
8,7,6,4,3, 8,6,3,
|
|
690
|
+
8,7,6,5,4, 8,6,4,
|
|
691
|
+
8,7,7,6,5, 8,7,5,
|
|
692
|
+
8,8,7,7,6, 8,7,6,
|
|
693
|
+
8,8,8,7,7, 8,8,7,
|
|
694
|
+
8,8,8,8,8, 8,8,8,
|
|
695
|
+
9,7,5,3,1, 9,5,1, // [36]
|
|
696
|
+
9,7,6,4,2, 9,6,2,
|
|
697
|
+
9,8,6,5,3, 9,6,3,
|
|
698
|
+
9,8,7,5,4, 9,7,4,
|
|
699
|
+
9,8,7,6,5, 9,7,5,
|
|
700
|
+
9,8,8,7,6, 9,8,6,
|
|
701
|
+
9,9,8,8,7, 9,8,7,
|
|
702
|
+
9,9,9,8,8, 9,9,8,
|
|
703
|
+
9,9,9,9,9, 9,9,9,
|
|
704
|
+
10,8,6,3,1, 10,6,1, // [45]
|
|
705
|
+
10,8,6,4,2, 10,6,2,
|
|
706
|
+
10,8,7,5,3, 10,7,3,
|
|
707
|
+
10,9,7,6,4, 10,7,4,
|
|
708
|
+
10,9,8,6,5, 10,8,5,
|
|
709
|
+
10,9,8,7,6, 10,8,6,
|
|
710
|
+
10,9,9,8,7, 10,9,7,
|
|
711
|
+
10,10,9,9,8, 10,9,8,
|
|
712
|
+
10,10,10,9,9, 10,10,9,
|
|
713
|
+
10,10,10,10,10, 10,10,10,
|
|
714
|
+
11,9,6,4,1, 11,6,1, // [55]
|
|
715
|
+
11,9,7,4,2, 11,7,2,
|
|
716
|
+
11,9,7,5,3, 11,7,3,
|
|
717
|
+
11,9,8,6,4, 11,8,4,
|
|
718
|
+
11,10,8,7,5, 11,8,5,
|
|
719
|
+
11,10,9,7,6, 11,9,6,
|
|
720
|
+
11,10,9,8,7, 11,9,7,
|
|
721
|
+
11,10,10,9,8, 11,10,8,
|
|
722
|
+
11,11,10,10,9, 11,10,9,
|
|
723
|
+
11,11,11,10,10, 11,11,10,
|
|
724
|
+
11,11,11,11,11, 11,11,11,
|
|
725
|
+
12,9,7,4,1, 12,7,1, // [66]
|
|
726
|
+
12,10,7,5,2, 12,7,2,
|
|
727
|
+
12,10,8,5,3, 12,8,3,
|
|
728
|
+
12,10,8,6,4, 12,8,4,
|
|
729
|
+
12,10,9,7,5, 12,9,5,
|
|
730
|
+
12,11,9,8,6, 12,9,6,
|
|
731
|
+
12,11,10,8,7, 12,10,7,
|
|
732
|
+
12,11,10,9,8, 12,10,8,
|
|
733
|
+
12,11,11,10,9, 12,11,9,
|
|
734
|
+
12,12,11,11,10, 12,11,10,
|
|
735
|
+
12,12,12,11,11, 12,12,11,
|
|
736
|
+
12,12,12,12,12, 12,12,12,
|
|
737
|
+
|
|
738
|
+
1,1,1,1,1, 1,1,1,
|
|
739
|
+
2,2,2,1,1, 2,2,1,
|
|
740
|
+
2,2,2,2,2, 2,2,2,
|
|
741
|
+
3,3,2,2,1, 3,3,1,
|
|
742
|
+
3,3,3,2,2, 3,3,2,
|
|
743
|
+
3,3,3,3,3, 3,3,3,
|
|
744
|
+
4,3,3,2,1, 4,3,1,
|
|
745
|
+
4,4,3,3,2, 4,4,2,
|
|
746
|
+
4,4,4,3,3, 4,4,3,
|
|
747
|
+
4,4,4,4,4, 4,4,4,
|
|
748
|
+
5,4,3,2,1, 5,4,1,
|
|
749
|
+
5,4,4,3,2, 5,4,2,
|
|
750
|
+
5,5,4,4,3, 5,5,3,
|
|
751
|
+
5,5,5,4,4, 5,5,4,
|
|
752
|
+
5,5,5,5,5, 5,5,5,
|
|
753
|
+
6,5,4,2,1, 6,5,1,
|
|
754
|
+
6,5,4,3,2, 6,5,2,
|
|
755
|
+
6,5,5,4,3, 6,5,3,
|
|
756
|
+
6,6,5,5,4, 6,6,4,
|
|
757
|
+
6,6,6,5,5, 6,6,5,
|
|
758
|
+
6,6,6,6,6, 6,6,6,
|
|
759
|
+
7,6,4,3,1, 7,6,1,
|
|
760
|
+
7,6,5,3,2, 7,6,2,
|
|
761
|
+
7,6,5,4,3, 7,6,3,
|
|
762
|
+
7,6,6,5,4, 7,6,4,
|
|
763
|
+
7,7,6,6,5, 7,7,5,
|
|
764
|
+
7,7,7,6,6, 7,7,6,
|
|
765
|
+
7,7,7,7,7, 7,7,7,
|
|
766
|
+
8,6,5,3,1, 8,6,1,
|
|
767
|
+
8,7,5,4,2, 8,7,2,
|
|
768
|
+
8,7,6,4,3, 8,7,3,
|
|
769
|
+
8,7,6,5,4, 8,7,4,
|
|
770
|
+
8,7,7,6,5, 8,7,5,
|
|
771
|
+
8,8,7,7,6, 8,8,6,
|
|
772
|
+
8,8,8,7,7, 8,8,7,
|
|
773
|
+
8,8,8,8,8, 8,8,8,
|
|
774
|
+
9,7,5,3,1, 9,7,1,
|
|
775
|
+
9,7,6,4,2, 9,7,2,
|
|
776
|
+
9,8,6,5,3, 9,8,3,
|
|
777
|
+
9,8,7,5,4, 9,8,4,
|
|
778
|
+
9,8,7,6,5, 9,8,5,
|
|
779
|
+
9,8,8,7,6, 9,8,6,
|
|
780
|
+
9,9,8,8,7, 9,9,7,
|
|
781
|
+
9,9,9,8,8, 9,9,8,
|
|
782
|
+
9,9,9,9,9, 9,9,9,
|
|
783
|
+
10,8,6,3,1, 10,8,1,
|
|
784
|
+
10,8,6,4,2, 10,8,2,
|
|
785
|
+
10,8,7,5,3, 10,8,3,
|
|
786
|
+
10,9,7,6,4, 10,9,4,
|
|
787
|
+
10,9,8,6,5, 10,9,5,
|
|
788
|
+
10,9,8,7,6, 10,9,6,
|
|
789
|
+
10,9,9,8,7, 10,9,7,
|
|
790
|
+
10,10,9,9,8, 10,10,8,
|
|
791
|
+
10,10,10,9,9, 10,10,9,
|
|
792
|
+
10,10,10,10,10, 10,10,10,
|
|
793
|
+
11,9,6,4,1, 11,9,1,
|
|
794
|
+
11,9,7,4,2, 11,9,2,
|
|
795
|
+
11,9,7,5,3, 11,9,3,
|
|
796
|
+
11,9,8,6,4, 11,9,4,
|
|
797
|
+
11,10,8,7,5, 11,10,5,
|
|
798
|
+
11,10,9,7,6, 11,10,6,
|
|
799
|
+
11,10,9,8,7, 11,10,7,
|
|
800
|
+
11,10,10,9,8, 11,10,8,
|
|
801
|
+
11,11,10,10,9, 11,11,9,
|
|
802
|
+
11,11,11,10,10, 11,11,10,
|
|
803
|
+
11,11,11,11,11, 11,11,11,
|
|
804
|
+
12,9,7,4,1, 12,9,1,
|
|
805
|
+
12,10,7,5,2, 12,10,2,
|
|
806
|
+
12,10,8,5,3, 12,10,3,
|
|
807
|
+
12,10,8,6,4, 12,10,4,
|
|
808
|
+
12,10,9,7,5, 12,10,5,
|
|
809
|
+
12,11,9,8,6, 12,11,6,
|
|
810
|
+
12,11,10,8,7, 12,11,7,
|
|
811
|
+
12,11,10,9,8, 12,11,8,
|
|
812
|
+
12,11,11,10,9, 12,11,9,
|
|
813
|
+
12,12,11,11,10, 12,12,10,
|
|
814
|
+
12,12,12,11,11, 12,12,11,
|
|
815
|
+
12,12,12,12,12, 12,12,12,
|
|
816
|
+
|
|
817
|
+
1,1,1,1,1, 1,1,1,
|
|
818
|
+
2,2,2,1,1, 2,1,1,
|
|
819
|
+
2,2,2,2,2, 2,2,2,
|
|
820
|
+
3,3,2,2,1, 3,2,1,
|
|
821
|
+
3,3,3,2,2, 3,2,2,
|
|
822
|
+
3,3,3,3,3, 3,3,3,
|
|
823
|
+
4,3,3,2,1, 4,2,1,
|
|
824
|
+
4,4,3,3,2, 4,3,2,
|
|
825
|
+
4,4,4,3,3, 4,3,3,
|
|
826
|
+
4,4,4,4,4, 4,4,4,
|
|
827
|
+
5,4,3,2,1, 5,2,1,
|
|
828
|
+
5,4,4,3,2, 5,3,2,
|
|
829
|
+
5,5,4,4,3, 5,4,3,
|
|
830
|
+
5,5,5,4,4, 5,4,4,
|
|
831
|
+
5,5,5,5,5, 5,5,5,
|
|
832
|
+
6,5,4,2,1, 6,2,1,
|
|
833
|
+
6,5,4,3,2, 6,3,2,
|
|
834
|
+
6,5,5,4,3, 6,4,3,
|
|
835
|
+
6,6,5,5,4, 6,5,4,
|
|
836
|
+
6,6,6,5,5, 6,5,5,
|
|
837
|
+
6,6,6,6,6, 6,6,6,
|
|
838
|
+
7,6,4,3,1, 7,3,1,
|
|
839
|
+
7,6,5,3,2, 7,3,2,
|
|
840
|
+
7,6,5,4,3, 7,4,3,
|
|
841
|
+
7,6,6,5,4, 7,5,4,
|
|
842
|
+
7,7,6,6,5, 7,6,5,
|
|
843
|
+
7,7,7,6,6, 7,6,6,
|
|
844
|
+
7,7,7,7,7, 7,7,7,
|
|
845
|
+
8,6,5,3,1, 8,3,1,
|
|
846
|
+
8,7,5,4,2, 8,4,2,
|
|
847
|
+
8,7,6,4,3, 8,4,3,
|
|
848
|
+
8,7,6,5,4, 8,5,4,
|
|
849
|
+
8,7,7,6,5, 8,6,5,
|
|
850
|
+
8,8,7,7,6, 8,7,6,
|
|
851
|
+
8,8,8,7,7, 8,7,7,
|
|
852
|
+
8,8,8,8,8, 8,8,8,
|
|
853
|
+
9,7,5,3,1, 9,3,1,
|
|
854
|
+
9,7,6,4,2, 9,4,2,
|
|
855
|
+
9,8,6,5,3, 9,5,3,
|
|
856
|
+
9,8,7,5,4, 9,5,4,
|
|
857
|
+
9,8,7,6,5, 9,6,5,
|
|
858
|
+
9,8,8,7,6, 9,7,6,
|
|
859
|
+
9,9,8,8,7, 9,8,7,
|
|
860
|
+
9,9,9,8,8, 9,8,8,
|
|
861
|
+
9,9,9,9,9, 9,9,9,
|
|
862
|
+
10,8,6,3,1, 10,3,1,
|
|
863
|
+
10,8,6,4,2, 10,4,2,
|
|
864
|
+
10,8,7,5,3, 10,5,3,
|
|
865
|
+
10,9,7,6,4, 10,6,4,
|
|
866
|
+
10,9,8,6,5, 10,6,5,
|
|
867
|
+
10,9,8,7,6, 10,7,6,
|
|
868
|
+
10,9,9,8,7, 10,8,7,
|
|
869
|
+
10,10,9,9,8, 10,9,8,
|
|
870
|
+
10,10,10,9,9, 10,9,9,
|
|
871
|
+
10,10,10,10,10, 10,10,10,
|
|
872
|
+
11,9,6,4,1, 11,4,1,
|
|
873
|
+
11,9,7,4,2, 11,4,2,
|
|
874
|
+
11,9,7,5,3, 11,5,3,
|
|
875
|
+
11,9,8,6,4, 11,6,4,
|
|
876
|
+
11,10,8,7,5, 11,7,5,
|
|
877
|
+
11,10,9,7,6, 11,7,6,
|
|
878
|
+
11,10,9,8,7, 11,8,7,
|
|
879
|
+
11,10,10,9,8, 11,9,8,
|
|
880
|
+
11,11,10,10,9, 11,10,9,
|
|
881
|
+
11,11,11,10,10, 11,10,10,
|
|
882
|
+
11,11,11,11,11, 11,11,11,
|
|
883
|
+
12,9,7,4,1, 12,4,1,
|
|
884
|
+
12,10,7,5,2, 12,5,2,
|
|
885
|
+
12,10,8,5,3, 12,5,3,
|
|
886
|
+
12,10,8,6,4, 12,6,4,
|
|
887
|
+
12,10,9,7,5, 12,7,5,
|
|
888
|
+
12,11,9,8,6, 12,8,6,
|
|
889
|
+
12,11,10,8,7, 12,8,7,
|
|
890
|
+
12,11,10,9,8, 12,9,8,
|
|
891
|
+
12,11,11,10,9, 12,10,9,
|
|
892
|
+
12,12,11,11,10, 12,11,10,
|
|
893
|
+
12,12,12,11,11, 12,11,11,
|
|
894
|
+
12,12,12,12,12, 12,12,12,
|
|
895
|
+
};
|
|
896
|
+
|
|
897
|
+
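// Backmap a single desired probability p (0..12) into an entry in kLgProbV2Tbl:
// the value is the subscript of the first-group entry with hi == p and lo == 1
// (p == 0 shares entry 0 with p == 1).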
|
|
898
|
+
static const uint8 kLgProbV2TblBackmap[13] = {
|
|
899
|
+
0,
|
|
900
|
+
0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66,
|
|
901
|
+
};
|
|
902
|
+
|
|
903
|
+
|
|
904
|
+
// Always advances one UTF-8 character
|
|
905
|
+
static const uint8 kAdvanceOneChar[256] = {
|
|
906
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
907
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
908
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
909
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
910
|
+
|
|
911
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
912
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
913
|
+
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
|
|
914
|
+
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
|
|
915
|
+
};
|
|
916
|
+
|
|
917
|
+
// Does not advance past space or any ASCII control byte 0x00..0x1F (cr/lf/nul included)
|
|
918
|
+
static const uint8 kAdvanceOneCharButSpace[256] = {
|
|
919
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
920
|
+
0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
921
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
922
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
923
|
+
|
|
924
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
925
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
926
|
+
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
|
|
927
|
+
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
|
|
928
|
+
};
|
|
929
|
+
|
|
930
|
+
// Advances *only* on space or ASCII vowel (or illegal byte)
|
|
931
|
+
static const uint8 kAdvanceOneCharSpaceVowel[256] = {
|
|
932
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
933
|
+
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
934
|
+
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
|
|
935
|
+
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
|
|
936
|
+
|
|
937
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
938
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
939
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
940
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
941
|
+
};
|
|
942
|
+
|
|
943
|
+
// Advances *only* on space (or illegal byte)
|
|
944
|
+
static const uint8 kAdvanceOneCharSpace[256] = {
|
|
945
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
946
|
+
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
947
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
948
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
949
|
+
|
|
950
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
951
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
952
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
953
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
|
954
|
+
};
|
|
955
|
+
|
|
956
|
+
|
|
957
|
+
//------------------------------------------------------------------------------
|
|
958
|
+
// General
|
|
959
|
+
//------------------------------------------------------------------------------
|
|
960
|
+
// Smaller of the two arguments; when they compare equal, b is returned.
static inline int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
|
|
961
|
+
// Larger of the two arguments; when they compare equal, b is returned.
static inline int maxint(int a, int b) {
  if (a > b) {
    return a;
  }
  return b;
}
|
|
962
|
+
|
|
963
|
+
// Here to make available for debugging
|
|
964
|
+
int ReliabilityDelta(int value1, int value2, int count);
|
|
965
|
+
int ReliabilityMainstream(int topscore, int len, int mean_score);
|
|
966
|
+
|
|
967
|
+
// Returns "0" for too small
|
|
968
|
+
inline const char* MyExtLanguageCode(Language lang) {
  // Pure forwarding wrapper: the result is whatever ExtLanguageCode returns
  // for lang; no extra range checking happens here.
  return ExtLanguageCode(lang);
}
|
|
971
|
+
|
|
972
|
+
// Map script into Latin, Cyrillic, Arabic, Other. Used in keeping track of
|
|
973
|
+
// amount of training data for language-script combinations
|
|
974
|
+
inline int LScript4(UnicodeLScript lscript) {
  // Bucket numbering: 0 = Latin, 1 = Cyrillic, 2 = Arabic, 3 = any other
  // script value.
  if (lscript == ULScript_Latin) {
    return 0;
  }
  if (lscript == ULScript_Cyrillic) {
    return 1;
  }
  if (lscript == ULScript_Arabic) {
    return 2;
  }
  return 3;
}
|
|
980
|
+
|
|
981
|
+
|
|
982
|
+
// Routines to access 3 or 5 log probabilities in a single byte.
|
|
983
|
+
|
|
984
|
+
// Return address of 8-byte entry[i]
|
|
985
|
+
inline const uint8* LgProb2TblEntry(int i) {
  // Entries are 8 bytes wide, so entry i begins at byte offset i * 8.
  // No bounds check is made: i must lie in 0..kLgProbV2TblSize-1.
  return &kLgProbV2Tbl[i * 8];
}
|
|
988
|
+
|
|
989
|
+
// Return one of five probabilities in an entry
|
|
990
|
+
// CURRENTLY UNUSED
|
|
991
|
+
inline uint8 LgProb5(const uint8* entry, int j) {
  // The five-probability array occupies bytes 0..4 of an 8-byte entry, so j
  // is expected to be 0..4; the index is not range-checked.
  return entry[j];
}
|
|
994
|
+
|
|
995
|
+
// Return one of three probabilities in an entry
|
|
996
|
+
inline uint8 LgProb3(const uint8* entry, int j) {
  // The three-probability array follows the five-byte array, i.e. it lives at
  // bytes 5..7 of an 8-byte entry, so j is expected to be 0..2; the index is
  // not range-checked.
  return entry[j + 5];
}
|
|
999
|
+
|
|
1000
|
+
|
|
1001
|
+
|
|
1002
|
+
//------------------------------------------------------------------------------
|
|
1003
|
+
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
|
|
1004
|
+
//------------------------------------------------------------------------------
|
|
1005
|
+
|
|
1006
|
+
// Pick up 1..12 bytes and hash them via mask/shift/add. NO pre/post
|
|
1007
|
+
// OVERSHOOTS up to 3 bytes
|
|
1008
|
+
uint32 BiHashV25(const char* word_ptr, int bytecount);
|
|
1009
|
+
|
|
1010
|
+
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
|
1011
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
1012
|
+
uint32 QuadHashV25(const char* word_ptr, int bytecount);
|
|
1013
|
+
|
|
1014
|
+
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
|
|
1015
|
+
// OVERSHOOTS up to 3 bytes
|
|
1016
|
+
uint32 QuadHashV25Underscore(const char* word_ptr, int bytecount);
|
|
1017
|
+
|
|
1018
|
+
|
|
1019
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
|
1020
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
1021
|
+
// For runtime use of tables V3
|
|
1022
|
+
uint64 OctaHash40(const char* word_ptr, int bytecount);
|
|
1023
|
+
|
|
1024
|
+
uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
|
|
1025
|
+
|
|
1026
|
+
|
|
1027
|
+
// From 32-bit gram FP, return hash table subscript and remaining key
|
|
1028
|
+
inline void QuadFPJustHash(uint32 quadhash,
                           uint32 keymask,
                           int bucketcount,
                           uint32* subscr, uint32* hashkey) {
  // Mix the bits 12 positions up into the low bits, then reduce with
  // bucketcount - 1. That reduction only covers every bucket when bucketcount
  // is a power of two -- presumably guaranteed by the table generator
  // (TODO confirm).
  *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1);
  // The key compared inside the bucket is the hash bits selected by keymask.
  *hashkey = quadhash & keymask;
}
|
|
1035
|
+
|
|
1036
|
+
// Look up 32-bit gram FP in caller-passed table
|
|
1037
|
+
// Typical size 256K entries (1.5MB)
|
|
1038
|
+
// Two-byte hashkey
|
|
1039
|
+
// Returns the complete keyvalue word of the first slot in the selected bucket
// whose key bits (under the table's key mask) match quadhash, or 0 when none
// of the four slots match.
// The top-level const on the by-value return was dropped: it has no effect on
// callers and only triggers ignored-qualifier warnings.
inline uint32 QuadHashV3Lookup4(const cld::CLDTableSummary* gram_obj,
                                uint32 quadhash) {
  uint32 subscr, hashkey;
  const IndirectProbBucket4* quadtable = gram_obj->kCLDTable;
  uint32 keymask = gram_obj->kCLDTableKeyMask;
  int bucketcount = gram_obj->kCLDTableSize;
  QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);
  const IndirectProbBucket4* bucket_ptr = &quadtable[subscr];
  // Four-way associative, 4 compares, probed in slot order 0..3.
  for (int way = 0; way < 4; ++way) {
    if (((hashkey ^ bucket_ptr->keyvalue[way]) & keymask) == 0) {
      return bucket_ptr->keyvalue[way];
    }
  }
  return 0;
}
|
|
1063
|
+
|
|
1064
|
+
|
|
1065
|
+
// Map 40 bits to subscript, hashkey, expected 18-22 bit subscript (min 16)
|
|
1066
|
+
// wwwwwwww xxxxxxxx xxxxxxxx yyyyyyyy yyyyyyyy
|
|
1067
|
+
// + ........ ....wwww wwwwxxxx xxxxxxxx xxxxyyyy
|
|
1068
|
+
// 00000000 00000000 00000011 11111111 11111111 (18-bit bucketcount-1)
|
|
1069
|
+
//
|
|
1070
|
+
// hashkey:
|
|
1071
|
+
// wwwwxxxx xxxxxxxx xxxx.... ........ (20-bit keymask)
|
|
1072
|
+
// 12-bit shift in subscript mixes in ~4 letters x 4 bits each
|
|
1073
|
+
|
|
1074
|
+
// From 40-bit gram FP, return hash table subscript and remaining key
|
|
1075
|
+
inline void OctaFPJustHash(uint64 longwordhash,
                           uint32 keymask,
                           int bucketcount,
                           uint32* subscr, uint32* hashkey) {
  // Subscript: mix the bits 12 positions up into the low bits and reduce with
  // bucketcount - 1 (a full-coverage reduction only when bucketcount is a
  // power of two -- presumably guaranteed by the table generator, TODO
  // confirm). The narrowing to 32 bits is intentional and now explicit.
  *subscr = static_cast<uint32>(
      (longwordhash + (longwordhash >> 12)) & (bucketcount - 1));
  // Key: drop the low 4 bits, keep the next 32, then select with keymask.
  *hashkey = static_cast<uint32>(longwordhash >> 4) & keymask;
}
|
|
1084
|
+
|
|
1085
|
+
// Look up 40-bit gram FP in caller-passed table
|
|
1086
|
+
// Typical size 256K-4M entries (1-16MB)
|
|
1087
|
+
// 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
|
|
1088
|
+
// keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
|
|
1089
|
+
// Returns the complete keyvalue word of the first slot in the selected bucket
// whose key bits (under the table's key mask) match longwordhash, or 0 when
// none of the four slots match.
// The top-level const on the by-value return was dropped: it has no effect on
// callers and only triggers ignored-qualifier warnings.
inline uint32 OctaHashV3Lookup4(const cld::CLDTableSummary* gram_obj,
                                uint64 longwordhash) {
  uint32 subscr, hashkey;
  const IndirectProbBucket4* octatable = gram_obj->kCLDTable;
  uint32 keymask = gram_obj->kCLDTableKeyMask;
  int bucketcount = gram_obj->kCLDTableSize;
  OctaFPJustHash(longwordhash, keymask, bucketcount,
                 &subscr, &hashkey);
  const IndirectProbBucket4* bucket_ptr = &octatable[subscr];
  // Four-way associative, 4 compares, probed in slot order 0..3.
  for (int way = 0; way < 4; ++way) {
    if (((hashkey ^ bucket_ptr->keyvalue[way]) & keymask) == 0) {
      return bucket_ptr->keyvalue[way];
    }
  }
  return 0;
}
|
|
1113
|
+
|
|
1114
|
+
|
|
1115
|
+
|
|
1116
|
+
//------------------------------------------------------------------------------
|
|
1117
|
+
// Scoring single groups of letters
|
|
1118
|
+
//------------------------------------------------------------------------------
|
|
1119
|
+
|
|
1120
|
+
// UNIGRAM score one => tote
|
|
1121
|
+
// Input: 1-byte entry of subscript into unigram probs, plus
|
|
1122
|
+
// an accumulator tote.
|
|
1123
|
+
// Output: running sums in tote updated
|
|
1124
|
+
void ProcessProbV25UniTote(int propval, Tote* tote);
|
|
1125
|
+
|
|
1126
|
+
// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
|
|
1127
|
+
// Input: 4-byte entry of 3 language numbers and one probability subscript,
|
|
1128
|
+
// plus an accumulator tote. (language 0 means unused entry)
|
|
1129
|
+
// Output: running sums in tote updated
|
|
1130
|
+
void ProcessProbV25Tote(uint32 probs, Tote* tote);
|
|
1131
|
+
|
|
1132
|
+
|
|
1133
|
+
//------------------------------------------------------------------------------
|
|
1134
|
+
// Routines to accumulate probabilities
|
|
1135
|
+
//------------------------------------------------------------------------------
|
|
1136
|
+
|
|
1137
|
+
// Score up to n=gram_limit unigrams, returning number of bytes consumed
|
|
1138
|
+
// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
|
|
1139
|
+
int DoUniScoreV3(const UTF8PropObj* unigram_obj,
|
|
1140
|
+
const char* isrc, int srclen, int advance_by,
|
|
1141
|
+
int* tote_grams, int gram_limit, Tote* chunk_tote);
|
|
1142
|
+
|
|
1143
|
+
|
|
1144
|
+
// Score all words in isrc, using languages that have bigrams (CJK)
|
|
1145
|
+
// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
|
|
1146
|
+
// Return number of bigrams that hit in the hash table
|
|
1147
|
+
int DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
|
|
1148
|
+
const char* isrc, int srclen, Tote* chunk_tote);
|
|
1149
|
+
|
|
1150
|
+
|
|
1151
|
+
// Score up to n=gram_limit quadgrams, returning number of bytes consumed
|
|
1152
|
+
// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
|
|
1153
|
+
int DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
|
|
1154
|
+
const char* isrc, int srclen, int advance_by,
|
|
1155
|
+
int* tote_grams, int gram_limit, Tote* chunk_tote);
|
|
1156
|
+
|
|
1157
|
+
// Score all octagrams (words) in isrc, using languages that have quadgrams
|
|
1158
|
+
// Caller supplies table, such as &kLongWord8Table_obj
|
|
1159
|
+
// Return number of words that hit in the hash table
|
|
1160
|
+
int DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
|
|
1161
|
+
const char* isrc, int srclen, Tote* chunk_tote);
|
|
1162
|
+
|
|
1163
|
+
//------------------------------------------------------------------------------
|
|
1164
|
+
// Reliability calculations, for single language and between languages
|
|
1165
|
+
//------------------------------------------------------------------------------
|
|
1166
|
+
|
|
1167
|
+
// Reliability = 0..100
|
|
1168
|
+
static const int kMinReliable = 75;
|
|
1169
|
+
|
|
1170
|
+
// Calculate ratio of score per 1KB vs. expected score per 1KB
|
|
1171
|
+
double GetNormalizedScore(Language lang, UnicodeLScript lscript,
|
|
1172
|
+
int bytes, int score);
|
|
1173
|
+
|
|
1174
|
+
// Calculate reliability of len bytes of script lscript with chunk_tote
|
|
1175
|
+
int GetReliability(int len, UnicodeLScript lscript, const Tote* chunk_tote);
|
|
1176
|
+
|
|
1177
|
+
|
|
1178
|
+
//------------------------------------------------------------------------------
|
|
1179
|
+
// Miscellaneous
|
|
1180
|
+
//------------------------------------------------------------------------------
|
|
1181
|
+
|
|
1182
|
+
// Make languages packed into uint32 values non-zero
|
|
1183
|
+
// These routines later could remap so languages not in QuadHash tables are not
|
|
1184
|
+
// represented, and so that any thrashing in accumulation is eliminated
|
|
1185
|
+
inline uint8 PackLanguage(Language lang) {
  // Shift the enum value up by one so that a packed language is never 0.
  // The result is truncated to 8 bits by the cast.
  return static_cast<uint8>(lang + 1);
}
|
|
1187
|
+
|
|
1188
|
+
inline Language UnpackLanguage(int ilang) {
  // Inverse of the +1 applied when packing; ilang is not range-checked.
  return static_cast<Language>(ilang - 1);
}
|
|
1190
|
+
|
|
1191
|
+
// Useful single-byte tests
|
|
1192
|
+
inline bool IsUTF8ContinueByte(char c) {
  // Continuation bytes are 0x80..0xBF, which read as -128..-65 when the byte
  // is viewed as a signed char; the cast makes the test independent of
  // whether plain char is signed on the target.
  return static_cast<signed char>(c) < -64;
}
|
|
1194
|
+
inline bool IsUTF8HighByte(char c) {
  // True for any byte with the top bit set (0x80..0xFF), i.e. any non-ASCII
  // byte; the cast makes the test independent of plain char's signedness.
  return static_cast<signed char>(c) < 0;
}
|
|
1196
|
+
|
|
1197
|
+
|
|
1198
|
+
// Demote all languages except Top40 and plus_one
|
|
1199
|
+
// Do this just before sorting
|
|
1200
|
+
void DemoteNotTop40(Tote* chunk_tote, int packed_plus_one);
|
|
1201
|
+
|
|
1202
|
+
} // End namespace cld
|
|
1203
|
+
|
|
1204
|
+
|
|
1205
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
|