compact_enc_det 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
- data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
- data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
- data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
- data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
- data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
- data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
- data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
- data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
- data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
- data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
- data/ext/compact_enc_det/compact_enc_det.cc +100 -0
- data/ext/compact_enc_det/extconf.rb +20 -0
- data/lib/compact_enc_det/version.rb +3 -0
- data/lib/compact_enc_det.rb +2 -0
- metadata +106 -0
@@ -0,0 +1,299 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#ifndef UTIL_ENCODINGS_ENCODINGS_H_
|
18
|
+
#define UTIL_ENCODINGS_ENCODINGS_H_
|
19
|
+
|
20
|
+
// This interface defines the Encoding enum and various functions that
|
21
|
+
// depend only on Encoding values.
|
22
|
+
|
23
|
+
// A hash-function for Encoding, hash<Encoding>, is defined in
|
24
|
+
// i18n/encodings/public/encodings-hash.h
|
25
|
+
|
26
|
+
// On some Windows projects, UNICODE may be defined, which would prevent the
|
27
|
+
// Encoding enum below from compiling. Note that this is a quick fix that does
|
28
|
+
// not break any existing projects. The UNICODE enum may someday be changed
|
29
|
+
// to something more specific and non-colliding, but this involves careful
|
30
|
+
// testing of changes in many other projects.
|
31
|
+
#undef UNICODE
|
32
|
+
|
33
|
+
// NOTE: The Encoding enum must always start at 0. This assumption has
|
34
|
+
// been made and used.
|
35
|
+
|
36
|
+
#ifndef SWIG
|
37
|
+
|
38
|
+
#include "util/encodings/encodings.pb.h"
|
39
|
+
|
40
|
+
#else
|
41
|
+
|
42
|
+
// TODO: Include a SWIG workaround header file.
|
43
|
+
|
44
|
+
#endif
|
45
|
+
|
46
|
+
const int kNumEncodings = NUM_ENCODINGS;
|
47
|
+
|
48
|
+
// some of the popular encoding aliases
|
49
|
+
// TODO: Make these static const Encoding values instead of macros.
|
50
|
+
#define LATIN1 ISO_8859_1
|
51
|
+
#define LATIN2 ISO_8859_2
|
52
|
+
#define LATIN3 ISO_8859_3
|
53
|
+
#define LATIN4 ISO_8859_4
|
54
|
+
#define CYRILLIC ISO_8859_5
|
55
|
+
#define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
|
56
|
+
#define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
|
57
|
+
#define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
|
58
|
+
#define LATIN5 ISO_8859_9
|
59
|
+
#define LATIN6 ISO_8859_10
|
60
|
+
#define KOREAN_HANGUL KOREAN_EUC_KR
|
61
|
+
|
62
|
+
// The default Encoding (LATIN1).
|
63
|
+
Encoding default_encoding();
|
64
|
+
|
65
|
+
|
66
|
+
|
67
|
+
// *************************************************************
|
68
|
+
// Encoding predicates
|
69
|
+
// IsValidEncoding()
|
70
|
+
// IsEncEncCompatible
|
71
|
+
// IsSupersetOfAscii7Bit
|
72
|
+
// Is8BitEncoding
|
73
|
+
// IsCJKEncoding
|
74
|
+
// IsHebrewEncoding
|
75
|
+
// IsRightToLeftEncoding
|
76
|
+
// IsLogicalRightToLeftEncoding
|
77
|
+
// IsVisualRightToLeftEncoding
|
78
|
+
// IsIso2022Encoding
|
79
|
+
// IsIso2022JpOrVariant
|
80
|
+
// IsShiftJisOrVariant
|
81
|
+
// IsJapaneseCellPhoneCarrierSpecificEncoding
|
82
|
+
// *************************************************************
|
83
|
+
|
84
|
+
// IsValidEncoding
|
85
|
+
// ===================================
|
86
|
+
//
|
87
|
+
// Function to check if the input encoding enum is within range.
|
88
|
+
//
|
89
|
+
|
90
|
+
bool IsValidEncoding(Encoding enc);
|
91
|
+
|
92
|
+
//
|
93
|
+
// IsEncEncCompatible
|
94
|
+
// ------------------
|
95
|
+
//
|
96
|
+
// This function is to determine whether or not converting from the
|
97
|
+
// first encoding to the second requires any changes to the underlying
|
98
|
+
// text (e.g. ASCII_7BIT is a subset of UTF8).
|
99
|
+
//
|
100
|
+
// TODO: the current implementation is likely incomplete. It would be
|
101
|
+
// good to consider the full matrix of all pairs of encodings and to fish out
|
102
|
+
// all compatible pairs.
|
103
|
+
//
|
104
|
+
bool IsEncEncCompatible(const Encoding from, const Encoding to);
|
105
|
+
|
106
|
+
// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
|
107
|
+
// encoding represent the same characters as they do in ISO_8859_1.
|
108
|
+
|
109
|
+
// WARNING: This function does not currently return true for all encodings that
|
110
|
+
// are supersets of Ascii 7-bit.
|
111
|
+
bool IsSupersetOfAscii7Bit(Encoding e);
|
112
|
+
|
113
|
+
// To be an 8-bit encoding means that there are at most 256 symbols.
|
114
|
+
// Each byte determines a new character; there are no multi-byte sequences.
|
115
|
+
|
116
|
+
// WARNING: This function does not currently return true for all encodings that
|
117
|
+
// are 8-bit encodings.
|
118
|
+
bool Is8BitEncoding(Encoding e);
|
119
|
+
|
120
|
+
// IsCJKEncoding
|
121
|
+
// -------------
|
122
|
+
//
|
123
|
+
// This function returns true if the encoding is either Chinese
|
124
|
+
// (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
|
125
|
+
// considered a CJK encoding.
|
126
|
+
bool IsCJKEncoding(Encoding e);
|
127
|
+
|
128
|
+
// IsHebrewEncoding
|
129
|
+
// -------------
|
130
|
+
//
|
131
|
+
// This function returns true if the encoding is a Hebrew specific
|
132
|
+
// encoding (not UTF8, etc).
|
133
|
+
bool IsHebrewEncoding(Encoding e);
|
134
|
+
|
135
|
+
// IsRightToLeftEncoding
|
136
|
+
// ---------------------
|
137
|
+
//
|
138
|
+
// Returns true if the encoding is a right-to-left encoding.
|
139
|
+
//
|
140
|
+
// Note that the name of this function is somewhat misleading. There is nothing
|
141
|
+
// "right to left" about these encodings. They merely contain code points for
|
142
|
+
// characters in RTL languages such as Hebrew and Arabic. But this is also
|
143
|
+
// true for UTF-8.
|
144
|
+
//
|
145
|
+
// TODO: Get rid of this function. The only special-case we
|
146
|
+
// should need to worry about are visual encodings. Anything we
|
147
|
+
// need to do for all 'RTL' encodings we need to do for UTF-8 as well.
|
148
|
+
bool IsRightToLeftEncoding(Encoding enc);
|
149
|
+
|
150
|
+
// IsLogicalRightToLeftEncoding
|
151
|
+
// ----------------------------
|
152
|
+
//
|
153
|
+
// Returns true if the encoding is a logical right-to-left encoding.
|
154
|
+
// Logical right-to-left encodings are those that the browser renders
|
155
|
+
// right-to-left and applies the BiDi algorithm to. Therefore the characters
|
156
|
+
// appear in reading order in the file, and indexing, snippet generation etc.
|
157
|
+
// should all just work with no special processing.
|
158
|
+
//
|
159
|
+
// TODO: Get rid of this function. The only special-case we
|
160
|
+
// should need to worry about are visual encodings.
|
161
|
+
bool IsLogicalRightToLeftEncoding(Encoding enc);
|
162
|
+
|
163
|
+
// IsVisualRightToLeftEncoding
|
164
|
+
// ---------------------------
|
165
|
+
//
|
166
|
+
// Returns true if the encoding is a visual right-to-left encoding.
|
167
|
+
// Visual right-to-left encodings are those that the browser renders
|
168
|
+
// left-to-right and does not apply the BiDi algorithm to. Therefore each
|
169
|
+
// line appears in reverse order in the file, lines are manually wrapped
|
170
|
+
// by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
|
171
|
+
// the prehistoric days when browsers couldn't render right-to-left, but
|
172
|
+
// unfortunately some visual pages persist to this day. These documents require
|
173
|
+
// special processing so that we don't index or snippet them with each line
|
174
|
+
// reversed.
|
175
|
+
bool IsVisualRightToLeftEncoding(Encoding enc);
|
176
|
+
|
177
|
+
// IsIso2022Encoding
|
178
|
+
// -----------------
|
179
|
+
//
|
180
|
+
// Returns true if the encoding is a kind of ISO 2022 such as
|
181
|
+
// ISO-2022-JP.
|
182
|
+
bool IsIso2022Encoding(Encoding enc);
|
183
|
+
|
184
|
+
// IsIso2022JpOrVariant
|
185
|
+
// --------------------
|
186
|
+
//
|
187
|
+
// Returns true if the encoding is ISO-2022-JP or a variant such as
|
188
|
+
// KDDI's ISO-2022-JP.
|
189
|
+
bool IsIso2022JpOrVariant(Encoding enc);
|
190
|
+
|
191
|
+
// IsShiftJisOrVariant
|
192
|
+
// --------------------
|
193
|
+
//
|
194
|
+
// Returns true if the encoding is Shift_JIS or a variant such as
|
195
|
+
// KDDI's Shift_JIS.
|
196
|
+
bool IsShiftJisOrVariant(Encoding enc);
|
197
|
+
|
198
|
+
// IsJapaneseCellPhoneCarrierSpecificEncoding
|
199
|
+
// -----------------------------------------
|
200
|
+
//
|
201
|
+
// Returns true if it's a Japanese cell phone carrier specific encoding
|
202
|
+
// such as KDDI_SHIFT_JIS.
|
203
|
+
bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
|
204
|
+
|
205
|
+
|
206
|
+
|
207
|
+
// *************************************************************
|
208
|
+
// ENCODING NAMES
|
209
|
+
//
|
210
|
+
// This interface defines a standard name for each valid encoding, and
|
211
|
+
// a standard name for invalid encodings. (Some names use all upper
|
212
|
+
// case, but others use mixed case.)
|
213
|
+
//
|
214
|
+
// EncodingName() [Encoding to name]
|
215
|
+
// MimeEncodingName() [Encoding to name]
|
216
|
+
// EncodingFromName() [name to Encoding]
|
217
|
+
// EncodingNameAliasToEncoding() [name to Encoding]
|
218
|
+
// default_encoding_name()
|
219
|
+
// invalid_encoding_name()
|
220
|
+
// *************************************************************
|
221
|
+
|
222
|
+
// EncodingName
|
223
|
+
// ------------
|
224
|
+
//
|
225
|
+
// Given the encoding, returns its standard name.
|
226
|
+
// Return invalid_encoding_name() if the encoding is invalid.
|
227
|
+
//
|
228
|
+
const char* EncodingName(Encoding enc);
|
229
|
+
|
230
|
+
//
|
231
|
+
// MimeEncodingName
|
232
|
+
// ----------------
|
233
|
+
//
|
234
|
+
// Return the "preferred MIME name" of an encoding.
|
235
|
+
//
|
236
|
+
// This name is suitable for using in HTTP headers, HTML tags,
|
237
|
+
// and as the "charset" parameter of a MIME Content-Type.
|
238
|
+
const char* MimeEncodingName(Encoding enc);
|
239
|
+
|
240
|
+
|
241
|
+
// The maximum length of an encoding name
|
242
|
+
const int kMaxEncodingNameSize = 50;
|
243
|
+
|
244
|
+
// The standard name of the default encoding.
|
245
|
+
const char* default_encoding_name();
|
246
|
+
|
247
|
+
// The name used for an invalid encoding.
|
248
|
+
const char* invalid_encoding_name();
|
249
|
+
|
250
|
+
// EncodingFromName
|
251
|
+
// ----------------
|
252
|
+
//
|
253
|
+
// If enc_name matches the standard name of an Encoding, using a
|
254
|
+
// case-insensitive comparison, set *encoding to that Encoding and
|
255
|
+
// return true. Otherwise set *encoding to UNKNOWN_ENCODING and
|
256
|
+
// return false.
|
257
|
+
//
|
258
|
+
// REQUIRES: encoding must not be NULL.
|
259
|
+
//
|
260
|
+
bool EncodingFromName(const char* enc_name, Encoding *encoding);
|
261
|
+
|
262
|
+
//
|
263
|
+
// EncodingNameAliasToEncoding
|
264
|
+
// ---------------------------
|
265
|
+
//
|
266
|
+
// If enc_name matches the standard name or an alias of an Encoding,
|
267
|
+
// using a case-insensitive comparison, return that
|
268
|
+
// Encoding. Otherwise, return UNKNOWN_ENCODING.
|
269
|
+
//
|
270
|
+
// Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
|
271
|
+
// GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
|
272
|
+
// common variations with hyphens and underscores (e.g., "koi8-u" and
|
273
|
+
// "koi8u" for RUSSIAN_KOI8_RU).
|
274
|
+
|
275
|
+
Encoding EncodingNameAliasToEncoding(const char *enc_name);
|
276
|
+
|
277
|
+
// *************************************************************
|
278
|
+
// Miscellany
|
279
|
+
// *************************************************************
|
280
|
+
|
281
|
+
// PreferredWebOutputEncoding
|
282
|
+
// --------------------------
|
283
|
+
//
|
284
|
+
// Some multi-byte encodings use byte values that coincide with the
|
285
|
+
// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
|
286
|
+
// can misinterpret these, as indicated in an external XSS report from
|
287
|
+
// 2007-02-15. Here, we map these dangerous encodings to safer ones. We
|
288
|
+
// also use UTF8 instead of encodings that we don't support in our
|
289
|
+
// output, and we generally try to be conservative in what we send out.
|
290
|
+
// Where the client asks for single- or double-byte encodings that are
|
291
|
+
// not as common, we substitute a more common single- or double-byte
|
292
|
+
// encoding, if there is one, thereby preserving the client's intent
|
293
|
+
// to use less space than UTF-8. This also means that characters
|
294
|
+
// outside the destination set will be converted to HTML NCRs (&#NNN;)
|
295
|
+
// if requested.
|
296
|
+
Encoding PreferredWebOutputEncoding(Encoding enc);
|
297
|
+
|
298
|
+
|
299
|
+
#endif // UTIL_ENCODINGS_ENCODINGS_H_
|
@@ -0,0 +1,181 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#ifndef UTIL_ENCODINGS_ENCODINGS_PB_H_
|
18
|
+
#define UTIL_ENCODINGS_ENCODINGS_PB_H_
|
19
|
+
|
20
|
+
enum Encoding {
|
21
|
+
ISO_8859_1 = 0, // Teragram ASCII
|
22
|
+
ISO_8859_2 = 1, // Teragram Latin2
|
23
|
+
ISO_8859_3 = 2, // in BasisTech but not in Teragram
|
24
|
+
ISO_8859_4 = 3, // Teragram Latin4
|
25
|
+
ISO_8859_5 = 4, // Teragram ISO-8859-5
|
26
|
+
ISO_8859_6 = 5, // Teragram Arabic
|
27
|
+
ISO_8859_7 = 6, // Teragram Greek
|
28
|
+
ISO_8859_8 = 7, // Teragram Hebrew
|
29
|
+
ISO_8859_9 = 8, // in BasisTech but not in Teragram
|
30
|
+
ISO_8859_10 = 9, // in BasisTech but not in Teragram
|
31
|
+
JAPANESE_EUC_JP = 10, // Teragram EUC_JP
|
32
|
+
JAPANESE_SHIFT_JIS = 11, // Teragram SJS
|
33
|
+
JAPANESE_JIS = 12, // Teragram JIS
|
34
|
+
CHINESE_BIG5 = 13, // Teragram BIG5
|
35
|
+
CHINESE_GB = 14, // Teragram GB
|
36
|
+
CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
|
37
|
+
// CNS11643EUC, before that Teragram EUC-CN(!)
|
38
|
+
// See //i18n/basistech/basistech_encodings.h
|
39
|
+
KOREAN_EUC_KR = 16, // Teragram KSC
|
40
|
+
UNICODE = 17, // Teragram Unicode
|
41
|
+
CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech
|
42
|
+
// CNS11643EUC, before that Teragram EUC.
|
43
|
+
CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech
|
44
|
+
// CNS11643EUC, before that Teragram CNS.
|
45
|
+
CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950
|
46
|
+
JAPANESE_CP932 = 21, // Teragram CP932
|
47
|
+
UTF8 = 22,
|
48
|
+
UNKNOWN_ENCODING = 23,
|
49
|
+
ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
|
50
|
+
// Should be present only in the crawler
|
51
|
+
// and in the repository,
|
52
|
+
// *never* as a result of Document::encoding().
|
53
|
+
RUSSIAN_KOI8_R = 25, // Teragram KOI8R
|
54
|
+
RUSSIAN_CP1251 = 26, // Teragram CP1251
|
55
|
+
|
56
|
+
//----------------------------------------------------------
|
57
|
+
// These are _not_ output from teragram. Instead, they are as
|
58
|
+
// detected in the headers of usenet articles.
|
59
|
+
MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii
|
60
|
+
RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
|
61
|
+
// Misnamed, this is _not_ KOI8-RU but KOI8-U.
|
62
|
+
// KOI8-U is used much more often than KOI8-RU.
|
63
|
+
MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
|
64
|
+
ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
|
65
|
+
//----------------------------------------------------------
|
66
|
+
|
67
|
+
//----------------------------------------------------------
|
68
|
+
// These are in BasisTech but not in Teragram. They are
|
69
|
+
// needed for new interface languages. Now detected by
|
70
|
+
// research langid
|
71
|
+
MSFT_CP1254 = 31, // used for Turkish
|
72
|
+
MSFT_CP1257 = 32, // used in Baltic countries
|
73
|
+
//----------------------------------------------------------
|
74
|
+
|
75
|
+
//----------------------------------------------------------
|
76
|
+
//----------------------------------------------------------
|
77
|
+
// New encodings detected by Teragram
|
78
|
+
ISO_8859_11 = 33, // aka TIS-620, used for Thai
|
79
|
+
MSFT_CP874 = 34, // used for Thai
|
80
|
+
MSFT_CP1256 = 35, // used for Arabic
|
81
|
+
|
82
|
+
//----------------------------------------------------------
|
83
|
+
// Detected as ISO_8859_8 by Teragram, but can be found in META tags
|
84
|
+
MSFT_CP1255 = 36, // Logical Hebrew Microsoft
|
85
|
+
ISO_8859_8_I = 37, // Iso Hebrew Logical
|
86
|
+
HEBREW_VISUAL = 38, // Iso Hebrew Visual
|
87
|
+
//----------------------------------------------------------
|
88
|
+
|
89
|
+
//----------------------------------------------------------
|
90
|
+
// Detected by research langid
|
91
|
+
CZECH_CP852 = 39,
|
92
|
+
CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
|
93
|
+
MSFT_CP1253 = 41, // used for Greek
|
94
|
+
RUSSIAN_CP866 = 42,
|
95
|
+
//----------------------------------------------------------
|
96
|
+
|
97
|
+
//----------------------------------------------------------
|
98
|
+
// Handled by iconv in glibc
|
99
|
+
ISO_8859_13 = 43,
|
100
|
+
ISO_2022_KR = 44,
|
101
|
+
GBK = 45,
|
102
|
+
GB18030 = 46,
|
103
|
+
BIG5_HKSCS = 47,
|
104
|
+
ISO_2022_CN = 48,
|
105
|
+
|
106
|
+
//-----------------------------------------------------------
|
107
|
+
// Detected by xin liu's detector
|
108
|
+
// Handled by transcoder
|
109
|
+
// (Indic encodings)
|
110
|
+
|
111
|
+
TSCII = 49,
|
112
|
+
TAMIL_MONO = 50,
|
113
|
+
TAMIL_BI = 51,
|
114
|
+
JAGRAN = 52,
|
115
|
+
|
116
|
+
|
117
|
+
MACINTOSH_ROMAN = 53,
|
118
|
+
UTF7 = 54,
|
119
|
+
BHASKAR = 55, // Indic encoding - Devanagari
|
120
|
+
HTCHANAKYA = 56, // 56 Indic encoding - Devanagari
|
121
|
+
|
122
|
+
//-----------------------------------------------------------
|
123
|
+
// These allow a single place (inputconverter and outputconverter)
|
124
|
+
// to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
|
125
|
+
// bulk conversions, with interchange-valid checking on input and
|
126
|
+
// fallback if needed on output.
|
127
|
+
UTF16BE = 57, // big-endian UTF-16
|
128
|
+
UTF16LE = 58, // little-endian UTF-16
|
129
|
+
UTF32BE = 59, // big-endian UTF-32
|
130
|
+
UTF32LE = 60, // little-endian UTF-32
|
131
|
+
//-----------------------------------------------------------
|
132
|
+
|
133
|
+
//-----------------------------------------------------------
|
134
|
+
// An encoding that means "This is not text, but it may have some
|
135
|
+
// simple ASCII text embedded". Intended input conversion (not yet
|
136
|
+
// implemented) is to keep strings of >=4 seven-bit ASCII characters
|
137
|
+
// (follow each kept string with an ASCII space), delete the rest of
|
138
|
+
// the bytes. This will pick up and allow indexing of e.g. captions
|
139
|
+
// in JPEGs. No output conversion needed.
|
140
|
+
BINARYENC = 61,
|
141
|
+
//-----------------------------------------------------------
|
142
|
+
|
143
|
+
//-----------------------------------------------------------
|
144
|
+
// Some Web pages allow a mixture of HZ-GB and GB-2312 by using
|
145
|
+
// ~{ ... ~} for 2-byte pairs, and the browsers support this.
|
146
|
+
HZ_GB_2312 = 62,
|
147
|
+
//-----------------------------------------------------------
|
148
|
+
|
149
|
+
//-----------------------------------------------------------
|
150
|
+
// Some external vendors make the common input error of
|
151
|
+
// converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
|
152
|
+
UTF8UTF8 = 63,
|
153
|
+
//-----------------------------------------------------------
|
154
|
+
|
155
|
+
//-----------------------------------------------------------
|
156
|
+
// Handled by transcoder for tamil language specific font
|
157
|
+
// encodings without the support for detection at present.
|
158
|
+
TAM_ELANGO = 64, // Elango - Tamil
|
159
|
+
TAM_LTTMBARANI = 65, // Barani - Tamil
|
160
|
+
TAM_SHREE = 66, // Shree - Tamil
|
161
|
+
TAM_TBOOMIS = 67, // TBoomis - Tamil
|
162
|
+
TAM_TMNEWS = 68, // TMNews - Tamil
|
163
|
+
TAM_WEBTAMIL = 69, // Webtamil - Tamil
|
164
|
+
//-----------------------------------------------------------
|
165
|
+
|
166
|
+
//-----------------------------------------------------------
|
167
|
+
// Shift_JIS variants used by Japanese cell phone carriers.
|
168
|
+
KDDI_SHIFT_JIS = 70,
|
169
|
+
DOCOMO_SHIFT_JIS = 71,
|
170
|
+
SOFTBANK_SHIFT_JIS = 72,
|
171
|
+
// ISO-2022-JP variants used by KDDI and SoftBank.
|
172
|
+
KDDI_ISO_2022_JP = 73,
|
173
|
+
SOFTBANK_ISO_2022_JP = 74,
|
174
|
+
//-----------------------------------------------------------
|
175
|
+
|
176
|
+
NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
|
177
|
+
// valid Encoding enum, it is only used to
|
178
|
+
// indicate the total number of Encodings.
|
179
|
+
};
|
180
|
+
|
181
|
+
#endif // UTIL_ENCODINGS_ENCODINGS_PB_H_
|
@@ -0,0 +1,34 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#include "util/encodings/encodings.h"
|
18
|
+
|
19
|
+
#include "gtest/gtest.h"
|
20
|
+
|
21
|
+
TEST(EncodingsTest, EncodingNameAliasToEncoding) {
|
22
|
+
// Test that case and non-alphanumeric chars are ignored.
|
23
|
+
EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso_8859_1"));
|
24
|
+
EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso-8859-1"));
|
25
|
+
|
26
|
+
// Test that spaces are ignored.
|
27
|
+
EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF8"));
|
28
|
+
EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF 8"));
|
29
|
+
EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF-8"));
|
30
|
+
|
31
|
+
// Test alphanumeric differences are counted.
|
32
|
+
EXPECT_NE(UTF8, EncodingNameAliasToEncoding("UTF-7"));
|
33
|
+
EXPECT_NE(KOREAN_EUC_KR, EncodingNameAliasToEncoding("euc-jp"));
|
34
|
+
}
|