cld-fixed 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/LICENSE +27 -0
- data/README.md +34 -0
- data/Rakefile +5 -0
- data/cld.gemspec +22 -0
- data/ext/cld/Makefile.am +28 -0
- data/ext/cld/Makefile.in +790 -0
- data/ext/cld/aclocal.m4 +8895 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +115 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/build_aux/config.guess +1500 -0
- data/ext/cld/build_aux/config.sub +1616 -0
- data/ext/cld/build_aux/depcomp +584 -0
- data/ext/cld/build_aux/install-sh +507 -0
- data/ext/cld/build_aux/ltmain.sh +8745 -0
- data/ext/cld/build_aux/missing +367 -0
- data/ext/cld/cld_encodings.h +95 -0
- data/ext/cld/configure +17362 -0
- data/ext/cld/configure.ac +14 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
- data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/ext/cld/languages/internal/#languages.cc# +337 -0
- data/ext/cld/languages/internal/languages.cc +336 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/ext/cld/thunk.cc +55 -0
- data/lib/cld.rb +21 -0
- data/lib/cld/version.rb +3 -0
- data/spec/cld_spec.rb +67 -0
- data/spec/spec_helper.rb +6 -0
- metadata +193 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
6
|
+
|
|
7
|
+
// Return true if current Tbl pointer is within state0 range
|
|
8
|
+
// Note that unsigned compare checks both ends of range simultaneously
|
|
9
|
+
static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
|
|
10
|
+
const uint8* Tbl0 = &st->state_table[st->state0];
|
|
11
|
+
return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
// Look up property of one UTF-8 character and advance over it
|
|
16
|
+
// Return 0 if input length is zero
|
|
17
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
18
|
+
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
|
19
|
+
const uint8** src,
|
|
20
|
+
int* srclen) {
|
|
21
|
+
if (*srclen <= 0) {
|
|
22
|
+
return 0;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
const uint8* lsrc = *src;
|
|
26
|
+
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
27
|
+
const uint8* Tbl = Tbl_0;
|
|
28
|
+
int e;
|
|
29
|
+
int eshift = st->entry_shift;
|
|
30
|
+
|
|
31
|
+
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
32
|
+
unsigned char c = lsrc[0];
|
|
33
|
+
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
34
|
+
e = Tbl[c];
|
|
35
|
+
*src += 1;
|
|
36
|
+
*srclen -= 1;
|
|
37
|
+
} else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
|
|
38
|
+
e = Tbl[c];
|
|
39
|
+
Tbl = &Tbl_0[e << eshift];
|
|
40
|
+
e = Tbl[lsrc[1]];
|
|
41
|
+
*src += 2;
|
|
42
|
+
*srclen -= 2;
|
|
43
|
+
} else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
|
|
44
|
+
e = Tbl[c];
|
|
45
|
+
Tbl = &Tbl_0[e << eshift];
|
|
46
|
+
e = Tbl[lsrc[1]];
|
|
47
|
+
Tbl = &Tbl_0[e << eshift];
|
|
48
|
+
e = Tbl[lsrc[2]];
|
|
49
|
+
*src += 3;
|
|
50
|
+
*srclen -= 3;
|
|
51
|
+
}else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
|
|
52
|
+
e = Tbl[c];
|
|
53
|
+
Tbl = &Tbl_0[e << eshift];
|
|
54
|
+
e = Tbl[lsrc[1]];
|
|
55
|
+
Tbl = &Tbl_0[e << eshift];
|
|
56
|
+
e = Tbl[lsrc[2]];
|
|
57
|
+
Tbl = &Tbl_0[e << eshift];
|
|
58
|
+
e = Tbl[lsrc[3]];
|
|
59
|
+
*src += 4;
|
|
60
|
+
*srclen -= 4;
|
|
61
|
+
} else { // Ill-formed
|
|
62
|
+
e = 0;
|
|
63
|
+
*src += 1;
|
|
64
|
+
*srclen -= 1;
|
|
65
|
+
}
|
|
66
|
+
return e;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// BigOneByte versions are needed for tables > 240 states, but most
|
|
70
|
+
// won't need the TwoByte versions.
|
|
71
|
+
// Internally, to next-to-last offset is multiplied by 16 and the last
|
|
72
|
+
// offset is relative instead of absolute.
|
|
73
|
+
// Look up property of one UTF-8 character and advance over it
|
|
74
|
+
// Return 0 if input length is zero
|
|
75
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
76
|
+
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
|
77
|
+
const uint8** src,
|
|
78
|
+
int* srclen) {
|
|
79
|
+
if (*srclen <= 0) {
|
|
80
|
+
return 0;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const uint8* lsrc = *src;
|
|
84
|
+
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
85
|
+
const uint8* Tbl = Tbl_0;
|
|
86
|
+
int e;
|
|
87
|
+
int eshift = st->entry_shift;
|
|
88
|
+
|
|
89
|
+
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
90
|
+
unsigned char c = lsrc[0];
|
|
91
|
+
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
92
|
+
e = Tbl[c];
|
|
93
|
+
*src += 1;
|
|
94
|
+
*srclen -= 1;
|
|
95
|
+
} else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
|
|
96
|
+
e = Tbl[c];
|
|
97
|
+
Tbl = &Tbl_0[e << eshift];
|
|
98
|
+
e = Tbl[lsrc[1]];
|
|
99
|
+
*src += 2;
|
|
100
|
+
*srclen -= 2;
|
|
101
|
+
} else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
|
|
102
|
+
e = Tbl[c];
|
|
103
|
+
Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
|
|
104
|
+
e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
|
|
105
|
+
Tbl = &Tbl[e << eshift]; // Relative +/-
|
|
106
|
+
e = Tbl[lsrc[2]];
|
|
107
|
+
*src += 3;
|
|
108
|
+
*srclen -= 3;
|
|
109
|
+
}else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
|
|
110
|
+
e = Tbl[c];
|
|
111
|
+
Tbl = &Tbl_0[e << eshift];
|
|
112
|
+
e = Tbl[lsrc[1]];
|
|
113
|
+
Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
|
|
114
|
+
e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
|
|
115
|
+
Tbl = &Tbl[e << eshift]; // Relative +/-
|
|
116
|
+
e = Tbl[lsrc[3]];
|
|
117
|
+
*src += 4;
|
|
118
|
+
*srclen -= 4;
|
|
119
|
+
} else { // Ill-formed
|
|
120
|
+
e = 0;
|
|
121
|
+
*src += 1;
|
|
122
|
+
*srclen -= 1;
|
|
123
|
+
}
|
|
124
|
+
return e;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
|
128
|
+
// Always scan complete UTF-8 characters
|
|
129
|
+
// Set number of bytes scanned. Return reason for exiting
|
|
130
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
|
131
|
+
const uint8* str,
|
|
132
|
+
const int len,
|
|
133
|
+
int* bytes_consumed) {
|
|
134
|
+
int eshift = st->entry_shift; // 6 (space optimized) or 8
|
|
135
|
+
// int nEntries = (1 << eshift); // 64 or 256 entries per state
|
|
136
|
+
|
|
137
|
+
const uint8* isrc = str;
|
|
138
|
+
//reinterpret_cast<const uint8*>(str.data());
|
|
139
|
+
const uint8* src = isrc;
|
|
140
|
+
//const int len = str.length();
|
|
141
|
+
const uint8* srclimit = isrc + len;
|
|
142
|
+
const uint8* srclimit8 = srclimit - 7;
|
|
143
|
+
*bytes_consumed = 0;
|
|
144
|
+
if (len == 0) return kExitOK;
|
|
145
|
+
|
|
146
|
+
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
147
|
+
|
|
148
|
+
DoAgain:
|
|
149
|
+
// Do state-table scan
|
|
150
|
+
int e = 0;
|
|
151
|
+
uint8 c;
|
|
152
|
+
|
|
153
|
+
// Do fast for groups of 8 identity bytes.
|
|
154
|
+
// This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
|
|
155
|
+
// including slowing slightly on cr/lf/ht
|
|
156
|
+
//----------------------------
|
|
157
|
+
const uint8* Tbl2 = &st->fast_state[0];
|
|
158
|
+
uint32 losub = st->losub;
|
|
159
|
+
uint32 hiadd = st->hiadd;
|
|
160
|
+
while (src < srclimit8) {
|
|
161
|
+
uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
|
|
162
|
+
uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
|
|
163
|
+
src += 8;
|
|
164
|
+
// This is a fast range check for all bytes in [lowsub..0x80-hiadd)
|
|
165
|
+
uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
|
|
166
|
+
(s4567 - losub) | (s4567 + hiadd);
|
|
167
|
+
if ((temp & 0x80808080) != 0) {
|
|
168
|
+
// We typically end up here on cr/lf/ht; src was incremented
|
|
169
|
+
int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
|
|
170
|
+
(Tbl2[src[-6]] | Tbl2[src[-5]]);
|
|
171
|
+
if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange
|
|
172
|
+
e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
|
|
173
|
+
(Tbl2[src[-2]] | Tbl2[src[-1]]);
|
|
174
|
+
if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange
|
|
175
|
+
// Else OK, go around again
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
//----------------------------
|
|
179
|
+
|
|
180
|
+
// Byte-at-a-time scan
|
|
181
|
+
//----------------------------
|
|
182
|
+
const uint8* Tbl = Tbl_0;
|
|
183
|
+
while (src < srclimit) {
|
|
184
|
+
c = *src;
|
|
185
|
+
e = Tbl[c];
|
|
186
|
+
src++;
|
|
187
|
+
if (e >= kExitIllegalStructure) {break;}
|
|
188
|
+
Tbl = &Tbl_0[e << eshift];
|
|
189
|
+
}
|
|
190
|
+
//----------------------------
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
// Exit posibilities:
|
|
194
|
+
// Some exit code, !state0, back up over last char
|
|
195
|
+
// Some exit code, state0, back up one byte exactly
|
|
196
|
+
// source consumed, !state0, back up over partial char
|
|
197
|
+
// source consumed, state0, exit OK
|
|
198
|
+
// For illegal byte in state0, avoid backup up over PREVIOUS char
|
|
199
|
+
// For truncated last char, back up to beginning of it
|
|
200
|
+
|
|
201
|
+
if (e >= kExitIllegalStructure) {
|
|
202
|
+
// Back up over exactly one byte of rejected/illegal UTF-8 character
|
|
203
|
+
src--;
|
|
204
|
+
// Back up more if needed
|
|
205
|
+
if (!InStateZero(st, Tbl)) {
|
|
206
|
+
do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
207
|
+
}
|
|
208
|
+
} else if (!InStateZero(st, Tbl)) {
|
|
209
|
+
// Back up over truncated UTF-8 character
|
|
210
|
+
e = kExitIllegalStructure;
|
|
211
|
+
do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
212
|
+
} else {
|
|
213
|
+
// Normal termination, source fully consumed
|
|
214
|
+
e = kExitOK;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
if (e == kExitDoAgain) {
|
|
218
|
+
// Loop back up to the fast scan
|
|
219
|
+
goto DoAgain;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
*bytes_consumed = src - isrc;
|
|
223
|
+
return e;
|
|
224
|
+
}
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
|
|
7
|
+
|
|
8
|
+
#if !defined(CLD_WINDOWS)
|
|
9
|
+
|
|
10
|
+
#include "util/utf8/utf8statetable.h"
|
|
11
|
+
|
|
12
|
+
#else
|
|
13
|
+
|
|
14
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
15
|
+
|
|
16
|
+
// These four-byte entries compactly encode how many bytes 0..255 to delete
|
|
17
|
+
// in making a string replacement, how many bytes to add 0..255, and the offset
|
|
18
|
+
// 0..64k-1 of the replacement string in remap_string.
|
|
19
|
+
struct RemapEntry {
|
|
20
|
+
uint8 delete_bytes;
|
|
21
|
+
uint8 add_bytes;
|
|
22
|
+
uint16 bytes_offset;
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
// Exit type codes for state tables with one-byte entries. Every code except
// the first is stored in table entries; the first is only generated by
// executable code. So that they can be told apart from next-state entries,
// the codes are contiguous and none exceeds kExitNone.
enum ExitReason {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure = 240,
  kExitOK = 241,
  kExitReject = 242,
  kExitReplace1 = 243,
  kExitReplace2 = 244,
  kExitReplace3 = 245,
  kExitReplace21 = 246,
  kExitReplace31 = 247,
  kExitReplace32 = 248,
  kExitReplaceOffset1 = 249,
  kExitReplaceOffset2 = 250,
  kExitReplace1S0 = 251,
  kExitSpecial = 252,
  kExitDoAgain = 253,
  kExitRejectAlt = 254,
  kExitNone = 255
};
|
|
48
|
+
|
|
49
|
+
// Counterparts of the ExitReason codes with the _2 suffix, numbered upward
// from -32769 so that the last one, kExitNone_2, is -32753.
enum ExitReason_2 {
  kExitDstSpaceFull_2 = -32769,
  kExitIllegalStructure_2 = -32768,
  kExitOK_2 = -32767,
  kExitReject_2 = -32766,
  kExitReplace1_2 = -32765,
  kExitReplace2_2 = -32764,
  kExitReplace3_2 = -32763,
  kExitReplace21_2 = -32762,
  kExitReplace31_2 = -32761,
  kExitReplace32_2 = -32760,
  kExitReplaceOffset1_2 = -32759,
  kExitReplaceOffset2_2 = -32758,
  kExitReplace1S0_2 = -32757,
  kExitSpecial_2 = -32756,
  kExitDoAgain_2 = -32755,
  kExitRejectAlt_2 = -32754,
  kExitNone_2 = -32753
};
|
|
68
|
+
|
|
69
|
+
// This struct represents one entire state table. The three initialized byte
|
|
70
|
+
// areas are state_table, remap_base, and remap_string. state0 and state0_size
|
|
71
|
+
// give the byte offset and length within state_table of the initial state --
|
|
72
|
+
// table lookups are expected to start and end in this state, but for
|
|
73
|
+
// truncated UTF-8 strings, may end in a different state. These allow a quick
|
|
74
|
+
// test for that condition. entry_shift is 8 for tables subscripted by a full
|
|
75
|
+
// byte value and 6 for space-optimized tables subscripted by only six
|
|
76
|
+
// significant bits in UTF-8 continuation bytes.
|
|
77
|
+
typedef struct {
|
|
78
|
+
const uint32 state0;
|
|
79
|
+
const uint32 state0_size;
|
|
80
|
+
const uint32 total_size;
|
|
81
|
+
const int max_expand;
|
|
82
|
+
const int entry_shift;
|
|
83
|
+
const int bytes_per_entry;
|
|
84
|
+
const uint32 losub;
|
|
85
|
+
const uint32 hiadd;
|
|
86
|
+
const uint8* state_table;
|
|
87
|
+
const RemapEntry* remap_base;
|
|
88
|
+
const uint8* remap_string;
|
|
89
|
+
const uint8* fast_state;
|
|
90
|
+
} UTF8StateMachineObj;
|
|
91
|
+
|
|
92
|
+
// Near-duplicate declaration for tables with two-byte entries
|
|
93
|
+
typedef struct {
|
|
94
|
+
const uint32 state0;
|
|
95
|
+
const uint32 state0_size;
|
|
96
|
+
const uint32 total_size;
|
|
97
|
+
const int max_expand;
|
|
98
|
+
const int entry_shift;
|
|
99
|
+
const int bytes_per_entry;
|
|
100
|
+
const uint32 losub;
|
|
101
|
+
const uint32 hiadd;
|
|
102
|
+
const signed short* state_table;
|
|
103
|
+
const RemapEntry* remap_base;
|
|
104
|
+
const uint8* remap_string;
|
|
105
|
+
const uint8* fast_state;
|
|
106
|
+
} UTF8StateMachineObj_2;
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
typedef UTF8StateMachineObj UTF8PropObj;
|
|
110
|
+
typedef UTF8StateMachineObj UTF8ScanObj;
|
|
111
|
+
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
// Look up property of one UTF-8 character and advance over it
|
|
115
|
+
// Return 0 if input length is zero
|
|
116
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
117
|
+
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
|
118
|
+
const uint8** src,
|
|
119
|
+
int* srclen);
|
|
120
|
+
|
|
121
|
+
// BigOneByte versions are needed for tables > 240 states, but most
|
|
122
|
+
// won't need the TwoByte versions.
|
|
123
|
+
|
|
124
|
+
// Look up property of one UTF-8 character and advance over it
|
|
125
|
+
// Return 0 if input length is zero
|
|
126
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
127
|
+
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
|
128
|
+
const uint8** src,
|
|
129
|
+
int* srclen);
|
|
130
|
+
|
|
131
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
|
132
|
+
// Always scan complete UTF-8 characters
|
|
133
|
+
// Set number of bytes scanned. Return reason for exiting
|
|
134
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
|
135
|
+
const uint8* str,
|
|
136
|
+
const int len,
|
|
137
|
+
int* bytes_consumed);
|
|
138
|
+
|
|
139
|
+
#endif
|
|
140
|
+
|
|
141
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
|
|
7
|
+
|
|
8
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
9
|
+
|
|
10
|
+
namespace cld {
|
|
11
|
+
|
|
12
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
|
13
|
+
// Always scan complete UTF-8 characters
|
|
14
|
+
// Set number of bytes scanned. Return reason for exiting
|
|
15
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
|
16
|
+
const char* src,
|
|
17
|
+
int len,
|
|
18
|
+
int* bytes_consumed);
|
|
19
|
+
|
|
20
|
+
} // namespace cld
|
|
21
|
+
|
|
22
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Copyright 2009 Google Inc. All Rights Reserved.
|
|
2
|
+
// Author: alekseys@google.com (Aleksey Shlyapnikov)
|
|
3
|
+
|
|
4
|
+
// This code is not actually used; it was copied here for reference only.
|
|
5
|
+
// See cld_htmlutils_windows.cc for Windows version of this code.
|
|
6
|
+
|
|
7
|
+
#include "cld/encodings/compact_lang_det/win/cld_utf8utils.h"
|
|
8
|
+
|
|
9
|
+
#include "cld/util/utf8/utf8statetable.h"
|
|
10
|
+
|
|
11
|
+
namespace cld {
|
|
12
|
+
|
|
13
|
+
int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
|
|
14
|
+
int* bytes_consumed) {
|
|
15
|
+
return ::UTF8GenericScan(st, StringPiece(src, len), bytes_consumed);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
} // namespace cld
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/win/cld_utf8utils.h"
|
|
6
|
+
|
|
7
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
8
|
+
|
|
9
|
+
namespace cld {
|
|
10
|
+
|
|
11
|
+
int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
|
|
12
|
+
int* bytes_consumed) {
|
|
13
|
+
return ::UTF8GenericScan(st, reinterpret_cast<const uint8*>(src), len,
|
|
14
|
+
bytes_consumed);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
} // namespace cld
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/win/normalizedunicodetext.h"
|
|
6
|
+
|
|
7
|
+
#include <tchar.h>
|
|
8
|
+
#include <windows.h>
|
|
9
|
+
#include <winnls.h>
|
|
10
|
+
|
|
11
|
+
#include "encodings/compact_lang_det/win/cld_scopedptr.h"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
namespace {
|
|
15
|
+
|
|
16
|
+
// Function prototypes copied from MSDN.
|
|
17
|
+
typedef BOOL (WINAPI *IsNormalizedStringFunction)(NORM_FORM NormForm,
|
|
18
|
+
LPCWSTR lpSrcString,
|
|
19
|
+
int cwSrcLength);
|
|
20
|
+
typedef int (WINAPI *NormalizeStringFunction)(NORM_FORM NormForm,
|
|
21
|
+
LPCWSTR lpSrcString,
|
|
22
|
+
int cwSrcLength,
|
|
23
|
+
LPWSTR lpDstString,
|
|
24
|
+
int cwDstLength);
|
|
25
|
+
|
|
26
|
+
// A class to provide an access to Normaliz.dll functions.
|
|
27
|
+
// New normalization API implemented in Normaliz.dll is available starting
|
|
28
|
+
// from Windows XP SP2, that's why we have to bind to it dynamically.
|
|
29
|
+
class NormalizationAPI {
|
|
30
|
+
public:
|
|
31
|
+
// Creates fully initialized NormalizationAPI object.
|
|
32
|
+
// Loads DLL and binds all referenced functions.
|
|
33
|
+
NormalizationAPI()
|
|
34
|
+
: library_(_T("Normaliz.dll")) {
|
|
35
|
+
if (library_.IsValid()) {
|
|
36
|
+
is_normalized_string_.Bind(library_.handle(), "IsNormalizedString");
|
|
37
|
+
normalize_string_.Bind(library_.handle(), "NormalizeString");
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Proxy functions for the ones loaded from DLL.
|
|
42
|
+
BOOL IsNormalizedString(NORM_FORM NormForm, LPCWSTR lpSrcString,
|
|
43
|
+
int cwSrcLength) {
|
|
44
|
+
if (!is_normalized_string_.IsValid())
|
|
45
|
+
return FALSE;
|
|
46
|
+
return is_normalized_string_.function()(NormForm, lpSrcString, cwSrcLength);
|
|
47
|
+
}
|
|
48
|
+
int NormalizeString(NORM_FORM NormForm, LPCWSTR lpSrcString, int cwSrcLength,
|
|
49
|
+
LPWSTR lpDstString, int cwDstLength) {
|
|
50
|
+
if (!normalize_string_.IsValid()) {
|
|
51
|
+
::SetLastError(ERROR_INVALID_FUNCTION);
|
|
52
|
+
return 0;
|
|
53
|
+
}
|
|
54
|
+
return normalize_string_.function()(NormForm, lpSrcString, cwSrcLength,
|
|
55
|
+
lpDstString, cwDstLength);
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
// Returns true if all functions were bound successfully.
|
|
59
|
+
// This implies that library_ itself was loaded successfully.
|
|
60
|
+
bool IsValid() const {
|
|
61
|
+
return is_normalized_string_.IsValid() && normalize_string_.IsValid();
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
private:
|
|
65
|
+
// Holds a handle to loaded Normaliz.dll.
|
|
66
|
+
ScopedLibrary library_;
|
|
67
|
+
// Pointers to the functions loaded from Normaliz.dll.
|
|
68
|
+
FunctionFromDll<IsNormalizedStringFunction> is_normalized_string_;
|
|
69
|
+
FunctionFromDll<NormalizeStringFunction> normalize_string_;
|
|
70
|
+
|
|
71
|
+
DISALLOW_COPY_AND_ASSIGN(NormalizationAPI);
|
|
72
|
+
};
|
|
73
|
+
|
|
74
|
+
static NormalizationAPI normalization_api;
|
|
75
|
+
|
|
76
|
+
} // namespace
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
// NormalizedUnicodeText
|
|
80
|
+
|
|
81
|
+
// Starts out with no normalized text attached.
NormalizedUnicodeText::NormalizedUnicodeText() : normalized_text_(NULL) {}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
// Runs TryToNormalizeText on text and stores the pointer it returns in
// normalized_text_. Returns the error code that call reported, or 0 when it
// reported none.
DWORD NormalizedUnicodeText::Normalize(NORM_FORM normalization_form,
                                       const WCHAR* text) {
  DWORD error_code = 0;
  normalized_text_ =
      TryToNormalizeText(normalization_form, text, &error_code);
  return error_code;
}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
// Attempts to normalize the NUL-terminated string text.
//   normalization_form  form passed to the Normaliz.dll functions; it is not
//                       used on the FoldStringW fallback path
//   text                input string; may be NULL
//   error_code          out: set to a Windows error code when normalization
//                       fails; left untouched otherwise
// Returns text itself when it is NULL, when it is already normalized, or when
// anything goes wrong. Otherwise returns a buffer owned by text_ that holds
// the normalized string.
const WCHAR* NormalizedUnicodeText::TryToNormalizeText(
    NORM_FORM normalization_form, const WCHAR* text, DWORD *error_code) {
  if (!text) {
    text_.reset();
    return text;
  }
  _ASSERT(NULL != error_code);
  if (!error_code)
    return text;

  if (!normalization_api.IsValid()) {
    // Fall back to the previous version of normalization API.
    // The first call only asks for the required buffer size.
    int folded_text_size = ::FoldStringW(MAP_PRECOMPOSED, text, -1, NULL, 0);
    if (!folded_text_size) {
      *error_code = ::GetLastError();
      return text;
    }

    text_.reset(new WCHAR[folded_text_size]);
    if (!text_.get()) {
      *error_code = ERROR_OUTOFMEMORY;
      return text;
    }

    int folding_result =
        ::FoldStringW(MAP_PRECOMPOSED, text, -1, text_.get(), folded_text_size);
    if (!folding_result) {
      *error_code = ::GetLastError();
      text_.reset();
      return text;
    }

    return text_.get();
  }

  // No need to allocate anything when text is already normalized.
  if (normalization_api.IsNormalizedString(normalization_form, text, -1))
    return text;

  // Get the first approximation for the buffer size required to store
  // normalized text.
  int normalized_text_size_guess =
      normalization_api.NormalizeString(normalization_form, text, -1, NULL, 0);
  if (normalized_text_size_guess <= 0) {
    // The size query itself failed, so the loop below will not run. Report
    // the reason instead of silently handing back the original text.
    *error_code = ::GetLastError();
  }

  while (normalized_text_size_guess > 0) {
    text_.reset(new WCHAR[normalized_text_size_guess]);
    if (!text_.get()) {
      *error_code = ERROR_OUTOFMEMORY;
      break;
    }

    int normalized_text_size =
        normalization_api.NormalizeString(normalization_form, text, -1,
                                          text_.get(),
                                          normalized_text_size_guess);

    if (normalized_text_size > 0) {
      // Text was successfully converted.
      return text_.get();
    }

    // Read the error exactly once, right after the failing call.
    const DWORD last_error = ::GetLastError();
    if (ERROR_INSUFFICIENT_BUFFER != last_error) {
      *error_code = last_error;
      // Text cannot be normalized, use the original.
      // By the way, ERROR_SUCCESS is a puzzling case.
      // MSDN says 'The action completed successfully but yielded no results'.
      // Does this mean that output buffer was not changed?
      // Anyway, just in case, also return the original text.
      break;
    }

    // Try again with the corrected buffer size: on ERROR_INSUFFICIENT_BUFFER
    // the return value is the negated size estimate.
    normalized_text_size_guess = -normalized_text_size;
  }

  // Use the original text in case of any problem with normalization.
  text_.reset();
  return text;
}
|