compact_enc_det 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
- data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
- data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
- data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
- data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
- data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
- data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
- data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
- data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
- data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
- data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
- data/ext/compact_enc_det/compact_enc_det.cc +100 -0
- data/ext/compact_enc_det/extconf.rb +20 -0
- data/lib/compact_enc_det/version.rb +3 -0
- data/lib/compact_enc_det.rb +2 -0
- metadata +106 -0
@@ -0,0 +1,891 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#include "util/encodings/encodings.h"
|
18
|
+
|
19
|
+
#include <string.h> // for strcasecmp
|
20
|
+
#include <unordered_map>
|
21
|
+
#include <utility> // for pair
|
22
|
+
|
23
|
+
#include "util/basictypes.h"
|
24
|
+
#include "util/string_util.h"
|
25
|
+
#include "util/case_insensitive_hash.h"
|
26
|
+
|
27
|
+
struct EncodingInfo {
|
28
|
+
// The standard name for this encoding.
|
29
|
+
//
|
30
|
+
const char* encoding_name_;
|
31
|
+
|
32
|
+
// The "preferred MIME name" of an encoding as specified by the IANA at:
|
33
|
+
// http://www.iana.org/assignments/character-sets
|
34
|
+
//
|
35
|
+
// Note that the preferred MIME name may differ slightly from the
|
36
|
+
// official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987
|
37
|
+
//
|
38
|
+
const char* mime_encoding_name_;
|
39
|
+
|
40
|
+
// It is an internal policy that if an encoding has an IANA name,
|
41
|
+
// then encoding_name_ and mime_encoding_name_ must be the same string.
|
42
|
+
//
|
43
|
+
// However, there can be exceptions if there are compelling reasons.
|
44
|
+
// For example, Japanese mobile handsets require the name
|
45
|
+
// "Shift_JIS" in charset=... parameter in Content-Type headers to
|
46
|
+
// process emoji (emoticons) in their private encodings. In that
|
47
|
+
// case, mime_encoding_name_ should be "Shift_JIS", despite
|
48
|
+
// encoding_name_ actually is "X-KDDI-Shift_JIS".
|
49
|
+
|
50
|
+
// Some multi-byte encodings use byte values that coincide with the
|
51
|
+
// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
|
52
|
+
// can misinterpret these, as indicated in an external XSS report from
|
53
|
+
// 2007-02-15. Here, we map these dangerous encodings to safer ones. We
|
54
|
+
// also use UTF8 instead of encodings that we don't support in our
|
55
|
+
// output, and we generally try to be conservative in what we send out.
|
56
|
+
// Where the client asks for single- or double-byte encodings that are
|
57
|
+
// not as common, we substitute a more common single- or double-byte
|
58
|
+
// encoding, if there is one, thereby preserving the client's intent
|
59
|
+
// to use less space than UTF-8. This also means that characters
|
60
|
+
// outside the destination set will be converted to HTML NCRs (&#NNN;)
|
61
|
+
// if requested.
|
62
|
+
|
63
|
+
Encoding preferred_web_output_encoding_;
|
64
|
+
};
|
65
|
+
|
66
|
+
// Name table with one row per encoding. EncodingName() and
// MimeEncodingName() subscript it directly with the Encoding value, so the
// row order is significant.
//
// Row layout:
//   {encoding_name_, mime_encoding_name_, preferred_web_output_encoding_}
static const EncodingInfo kEncodingInfoTable[] = {
  {"ASCII", "ISO-8859-1", ISO_8859_1},
  {"Latin2", "ISO-8859-2", ISO_8859_2},
  {"Latin3", "ISO-8859-3",
   UTF8},  // MSIE 6 does not support ISO-8859-3 (XSS issue)
  {"Latin4", "ISO-8859-4", ISO_8859_4},
  {"ISO-8859-5", "ISO-8859-5", ISO_8859_5},
  {"Arabic", "ISO-8859-6", ISO_8859_6},
  {"Greek", "ISO-8859-7", ISO_8859_7},
  {"Hebrew", "ISO-8859-8",
   MSFT_CP1255},  // we do not endorse the visual order
  {"Latin5", "ISO-8859-9", ISO_8859_9},
  {"Latin6", "ISO-8859-10",
   UTF8},  // MSIE does not support ISO-8859-10 (XSS issue)
  {"EUC-JP", "EUC-JP", JAPANESE_EUC_JP},
  {"SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},
  {"JIS", "ISO-2022-JP",
   JAPANESE_SHIFT_JIS},  // due to potential confusion with HTML syntax chars
  {"BIG5", "Big5", CHINESE_BIG5},
  {"GB", "GB2312", CHINESE_GB},
  // Misnamed. Should be EUC-TW.
  // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,
  // and EUC-TW is rare, so we prefer Big5 for output.
  {"EUC-CN", "EUC-CN", CHINESE_BIG5},
  {"KSC", "EUC-KR", KOREAN_EUC_KR},
  {"Unicode",
   "UTF-16LE",  // Internet Explorer doesn't recognize "ISO-10646-UCS-2"
   UTF8},       // due to potential confusion with HTML syntax chars
  // Misnamed. Should be EUC-TW.
  // MSIE does not recognize "EUC" (XSS issue),
  // and EUC-TW is rare, so we prefer Big5 for output.
  {"EUC", "EUC", CHINESE_BIG5},
  // Misnamed. Should be EUC-TW.
  // MSIE does not recognize "CNS" (XSS issue),
  // and EUC-TW is rare, so we prefer Big5 for output.
  {"CNS", "CNS", CHINESE_BIG5},
  {"BIG5-CP950", "BIG5-CP950",  // Not an IANA name
   CHINESE_BIG5},  // MSIE does not recognize "BIG5-CP950" (XSS issue)
  {"CP932", "CP932",      // Not an IANA name
   JAPANESE_SHIFT_JIS},   // MSIE does not recognize "CP932" (XSS issue)
  {"UTF8", "UTF-8", UTF8},
  {"Unknown", "x-unknown",  // Not an IANA name
   UTF8},                   // UTF-8 is our default output encoding
  {"ASCII-7-bit", "US-ASCII", ASCII_7BIT},
  {"KOI8R", "KOI8-R", RUSSIAN_KOI8_R},
  {"CP1251", "windows-1251", RUSSIAN_CP1251},
  {"CP1252", "windows-1252", MSFT_CP1252},
  {"KOI8U", "KOI8-U", ISO_8859_5},  // because koi8-u is not as common
  {"CP1250", "windows-1250", MSFT_CP1250},
  {"ISO-8859-15", "ISO-8859-15", ISO_8859_15},
  {"CP1254", "windows-1254", MSFT_CP1254},
  {"CP1257", "windows-1257", MSFT_CP1257},
  {"ISO-8859-11", "ISO-8859-11", ISO_8859_11},
  {"CP874", "windows-874", MSFT_CP874},
  {"CP1256", "windows-1256", MSFT_CP1256},
  {"CP1255", "windows-1255", MSFT_CP1255},
  {"ISO-8859-8-I", "ISO-8859-8-I",
   MSFT_CP1255},  // Java does not support iso-8859-8-i
  {"VISUAL", "ISO-8859-8",
   MSFT_CP1255},  // we do not endorse the visual order
  {"CP852", "cp852", MSFT_CP1250},  // because cp852 is not as common
  {"CSN_369103", "csn_369103",
   MSFT_CP1250},  // MSIE does not recognize "csn_369103" (XSS issue)
  {"CP1253", "windows-1253", MSFT_CP1253},
  {"CP866", "IBM866", RUSSIAN_CP1251},  // because cp866 is not as common
  {"ISO-8859-13", "ISO-8859-13",
   UTF8},  // because iso-8859-13 is not widely supported
  {"ISO-2022-KR", "ISO-2022-KR",
   KOREAN_EUC_KR},  // due to potential confusion with HTML syntax chars
  {"GBK", "GBK", GBK},
  {"GB18030", "GB18030", GBK},  // because gb18030 is not widely supported
  {"BIG5_HKSCS", "BIG5-HKSCS",
   CHINESE_BIG5},  // because Big5-HKSCS is not widely supported
  {"ISO_2022_CN", "ISO-2022-CN",
   CHINESE_GB},  // due to potential confusion with HTML syntax chars
  {"TSCII", "tscii",
   UTF8},  // we do not have an output converter for this font encoding
  {"TAM", "tam",
   UTF8},  // we do not have an output converter for this font encoding
  {"TAB", "tab",
   UTF8},  // we do not have an output converter for this font encoding
  {"JAGRAN", "jagran",
   UTF8},  // we do not have an output converter for this font encoding
  {"MACINTOSH", "MACINTOSH",
   ISO_8859_1},  // because macintosh is relatively uncommon
  {"UTF7", "UTF-7",
   UTF8},  // UTF-7 has been the subject of XSS attacks and is deprecated
  {"BHASKAR", "bhaskar",
   UTF8},  // we do not have an output converter for this font encoding
  {"HTCHANAKYA", "htchanakya",  // not an IANA charset name.
   UTF8},  // we do not have an output converter for this font encoding
  {"UTF-16BE", "UTF-16BE",
   UTF8},  // due to potential confusion with HTML syntax chars
  {"UTF-16LE", "UTF-16LE",
   UTF8},  // due to potential confusion with HTML syntax chars
  {"UTF-32BE", "UTF-32BE",
   UTF8},  // unlikely to cause XSS bugs, but very uncommon on Web
  {"UTF-32LE", "UTF-32LE",
   UTF8},  // unlikely to cause XSS bugs, but very uncommon on Web
  {"X-BINARYENC", "x-binaryenc",  // Not an IANA name
   UTF8},  // because this one is not intended for output (just input)
  {"HZ-GB-2312", "HZ-GB-2312",
   CHINESE_GB},  // due to potential confusion with HTML syntax chars
  {"X-UTF8UTF8", "x-utf8utf8",  // Not an IANA name
   UTF8},  // because this one is not intended for output (just input)
  {"X-TAM-ELANGO", "x-tam-elango",
   UTF8},  // we do not have an output converter for this font encoding
  {"X-TAM-LTTMBARANI", "x-tam-lttmbarani",
   UTF8},  // we do not have an output converter for this font encoding
  {"X-TAM-SHREE", "x-tam-shree",
   UTF8},  // we do not have an output converter for this font encoding
  {"X-TAM-TBOOMIS", "x-tam-tboomis",
   UTF8},  // we do not have an output converter for this font encoding
  {"X-TAM-TMNEWS", "x-tam-tmnews",
   UTF8},  // we do not have an output converter for this font encoding
  {"X-TAM-WEBTAMIL", "x-tam-webtamil",
   UTF8},  // we do not have an output converter for this font encoding

  // KDDI version of Shift_JIS with Google Emoji PUA mappings.
  // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses
  // "Shift_JIS" in HTTP headers and email messages.
  {"X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},

  // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.
  // See the comment at KDDI_SHIFT_JIS for other issues.
  {"X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},

  // SoftBank version of Shift_JIS with Google Emoji PUA mappings.
  // See the comment at KDDI_SHIFT_JIS for other issues.
  {"X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},

  // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.
  // See the comment at KDDI_SHIFT_JIS for other issues.
  // The preferred Web encoding is due to potential confusion with
  // HTML syntax chars.
  {"X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},

  // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.
  // See the comment at KDDI_SHIFT_JIS for other issues.
  // The preferred Web encoding is due to potential confusion with
  // HTML syntax chars.
  {"X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},

  // Please refer to NOTE: section in the comments in the definition
  // of "struct I18NInfoByEncoding", before adding new encodings.
};
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,
|
234
|
+
kEncodingInfoTable_has_incorrect_size);
|
235
|
+
|
236
|
+
// Returns the default encoding, which is LATIN1.
Encoding default_encoding() {
  return LATIN1;
}
|
237
|
+
|
238
|
+
// *************************************************************
|
239
|
+
// Encoding predicates
|
240
|
+
// IsValidEncoding()
|
241
|
+
// IsEncEncCompatible
|
242
|
+
// IsEncodingWithSupportedLanguage
|
243
|
+
// IsSupersetOfAscii7Bit
|
244
|
+
// Is8BitEncoding
|
245
|
+
// IsCJKEncoding
|
246
|
+
// IsHebrewEncoding
|
247
|
+
// IsRightToLeftEncoding
|
248
|
+
// IsLogicalRightToLeftEncoding
|
249
|
+
// IsVisualRightToLeftEncoding
|
250
|
+
// IsIso2022Encoding
|
251
|
+
// IsIso2022JpOrVariant
|
252
|
+
// IsShiftJisOrVariant
|
253
|
+
// IsJapaneseCellPhoneCarrierSpecificEncoding
|
254
|
+
// *************************************************************
|
255
|
+
|
256
|
+
// Reports whether enc lies in the half-open range [0, kNumEncodings).
bool IsValidEncoding(Encoding enc) {
  if (enc < 0) {
    return false;
  }
  return enc < kNumEncodings;
}
|
259
|
+
|
260
|
+
bool IsEncEncCompatible(const Encoding from, const Encoding to) {
|
261
|
+
// Tests compatibility between the "from" and "to" encodings; in
|
262
|
+
// the typical case -- when both are valid known encodings -- this
|
263
|
+
// returns true iff converting from first to second is a no-op.
|
264
|
+
if (!IsValidEncoding(from) || !IsValidEncoding(to)) {
|
265
|
+
return false; // we only work with valid encodings...
|
266
|
+
} else if (to == from) {
|
267
|
+
return true; // the trivial common case
|
268
|
+
}
|
269
|
+
|
270
|
+
if (to == UNKNOWN_ENCODING) {
|
271
|
+
return true; // all valid encodings are compatible with the unknown
|
272
|
+
}
|
273
|
+
|
274
|
+
if (from == UNKNOWN_ENCODING) {
|
275
|
+
return false; // no unknown encoding is compatible with one that is
|
276
|
+
}
|
277
|
+
|
278
|
+
if (from == ASCII_7BIT) {
|
279
|
+
return IsSupersetOfAscii7Bit(to);
|
280
|
+
}
|
281
|
+
|
282
|
+
return (from == ISO_8859_1 && to == MSFT_CP1252) ||
|
283
|
+
(from == ISO_8859_8 && to == HEBREW_VISUAL) ||
|
284
|
+
(from == HEBREW_VISUAL && to == ISO_8859_8) ||
|
285
|
+
(from == ISO_8859_9 && to == MSFT_CP1254) ||
|
286
|
+
(from == ISO_8859_11 && to == MSFT_CP874) ||
|
287
|
+
(from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) ||
|
288
|
+
(from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) ||
|
289
|
+
(from == CHINESE_GB && to == GBK) ||
|
290
|
+
(from == CHINESE_GB && to == GB18030) ||
|
291
|
+
(from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) ||
|
292
|
+
(from == CHINESE_EUC_CN && to == CHINESE_CNS) ||
|
293
|
+
(from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) ||
|
294
|
+
(from == CHINESE_EUC_DEC && to == CHINESE_CNS) ||
|
295
|
+
(from == CHINESE_CNS && to == CHINESE_EUC_CN) ||
|
296
|
+
(from == CHINESE_CNS && to == CHINESE_EUC_DEC);
|
297
|
+
}
|
298
|
+
|
299
|
+
// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
|
300
|
+
// encoding represent the same characters as they do in ISO_8859_1.
|
301
|
+
|
302
|
+
// TODO: This list could be expanded. Many other encodings are supersets
|
303
|
+
// of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two
|
304
|
+
// encodings that I know for a fact should *not* be in this list.
|
305
|
+
bool IsSupersetOfAscii7Bit(Encoding e) {
|
306
|
+
switch (e) {
|
307
|
+
case ISO_8859_1:
|
308
|
+
case ISO_8859_2:
|
309
|
+
case ISO_8859_3:
|
310
|
+
case ISO_8859_4:
|
311
|
+
case ISO_8859_5:
|
312
|
+
case ISO_8859_6:
|
313
|
+
case ISO_8859_7:
|
314
|
+
case ISO_8859_8:
|
315
|
+
case ISO_8859_9:
|
316
|
+
case ISO_8859_10:
|
317
|
+
case JAPANESE_EUC_JP:
|
318
|
+
case JAPANESE_SHIFT_JIS:
|
319
|
+
case CHINESE_BIG5:
|
320
|
+
case CHINESE_GB:
|
321
|
+
case CHINESE_EUC_CN:
|
322
|
+
case KOREAN_EUC_KR:
|
323
|
+
case CHINESE_EUC_DEC:
|
324
|
+
case CHINESE_CNS:
|
325
|
+
case CHINESE_BIG5_CP950:
|
326
|
+
case JAPANESE_CP932:
|
327
|
+
case UTF8:
|
328
|
+
case UNKNOWN_ENCODING:
|
329
|
+
case ASCII_7BIT:
|
330
|
+
case RUSSIAN_KOI8_R:
|
331
|
+
case RUSSIAN_CP1251:
|
332
|
+
case MSFT_CP1252:
|
333
|
+
case RUSSIAN_KOI8_RU:
|
334
|
+
case MSFT_CP1250:
|
335
|
+
case ISO_8859_15:
|
336
|
+
case MSFT_CP1254:
|
337
|
+
case MSFT_CP1257:
|
338
|
+
case ISO_8859_11:
|
339
|
+
case MSFT_CP874:
|
340
|
+
case MSFT_CP1256:
|
341
|
+
case MSFT_CP1255:
|
342
|
+
case ISO_8859_8_I:
|
343
|
+
case HEBREW_VISUAL:
|
344
|
+
case CZECH_CP852:
|
345
|
+
case MSFT_CP1253:
|
346
|
+
case RUSSIAN_CP866:
|
347
|
+
case ISO_8859_13:
|
348
|
+
case GBK:
|
349
|
+
case GB18030:
|
350
|
+
case BIG5_HKSCS:
|
351
|
+
case MACINTOSH_ROMAN:
|
352
|
+
return true;
|
353
|
+
default:
|
354
|
+
return false;
|
355
|
+
}
|
356
|
+
}
|
357
|
+
|
358
|
+
// To be an 8-bit encoding means that there are fewer than 256 symbols.
|
359
|
+
// Each byte determines a new character; there are no multi-byte sequences.
|
360
|
+
|
361
|
+
// TODO: This list could maybe be expanded. Other encodings may be 8-bit.
|
362
|
+
bool Is8BitEncoding(Encoding e) {
|
363
|
+
switch (e) {
|
364
|
+
case ASCII_7BIT:
|
365
|
+
case ISO_8859_1:
|
366
|
+
case ISO_8859_2:
|
367
|
+
case ISO_8859_3:
|
368
|
+
case ISO_8859_4:
|
369
|
+
case ISO_8859_5:
|
370
|
+
case ISO_8859_6:
|
371
|
+
case ISO_8859_7:
|
372
|
+
case ISO_8859_8:
|
373
|
+
case ISO_8859_8_I:
|
374
|
+
case ISO_8859_9:
|
375
|
+
case ISO_8859_10:
|
376
|
+
case ISO_8859_11:
|
377
|
+
case ISO_8859_13:
|
378
|
+
case ISO_8859_15:
|
379
|
+
case MSFT_CP1252:
|
380
|
+
case MSFT_CP1253:
|
381
|
+
case MSFT_CP1254:
|
382
|
+
case MSFT_CP1255:
|
383
|
+
case MSFT_CP1256:
|
384
|
+
case MSFT_CP1257:
|
385
|
+
case RUSSIAN_KOI8_R:
|
386
|
+
case RUSSIAN_KOI8_RU:
|
387
|
+
case RUSSIAN_CP866:
|
388
|
+
return true;
|
389
|
+
default:
|
390
|
+
return false;
|
391
|
+
}
|
392
|
+
}
|
393
|
+
|
394
|
+
// Reports whether e is one of the Chinese, Japanese or Korean encodings
// listed below.
//
// NOTE(review): the carrier-specific Japanese variants accepted by
// IsShiftJisOrVariant() / IsIso2022JpOrVariant() (KDDI, DoCoMo, SoftBank)
// are not listed here -- confirm whether that is intentional.
bool IsCJKEncoding(Encoding e) {
  switch (e) {
    // Chinese.
    case CHINESE_BIG5:
    case CHINESE_BIG5_CP950:
    case BIG5_HKSCS:
    case CHINESE_GB:
    case GBK:
    case GB18030:
    case CHINESE_EUC_CN:
    case CHINESE_EUC_DEC:
    case CHINESE_CNS:
    case ISO_2022_CN:
    case HZ_GB_2312:

    // Japanese.
    case JAPANESE_EUC_JP:
    case JAPANESE_SHIFT_JIS:
    case JAPANESE_JIS:
    case JAPANESE_CP932:

    // Korean.
    case KOREAN_EUC_KR:
    case ISO_2022_KR:
      return true;

    default:
      return false;
  }
}
|
418
|
+
|
419
|
+
// Reports whether e is one of the Hebrew encodings: ISO_8859_8,
// ISO_8859_8_I, MSFT_CP1255 or HEBREW_VISUAL.
bool IsHebrewEncoding(Encoding e) {
  switch (e) {
    case ISO_8859_8:
    case ISO_8859_8_I:
    case MSFT_CP1255:
    case HEBREW_VISUAL:
      return true;
    default:
      return false;
  }
}
|
425
|
+
|
426
|
+
|
427
|
+
|
428
|
+
// Reports whether enc is one of the right-to-left (Hebrew or Arabic)
// encodings listed below, in either logical or visual order.
bool IsRightToLeftEncoding(Encoding enc) {
  return enc == MSFT_CP1255 ||
         enc == MSFT_CP1256 ||
         enc == ARABIC_ENCODING ||
         enc == HEBREW_ENCODING ||
         enc == ISO_8859_8_I ||
         enc == HEBREW_VISUAL;
}
|
441
|
+
|
442
|
+
// Reports whether enc is a right-to-left encoding that is not a visual-order
// one, i.e. IsRightToLeftEncoding(enc) holds and
// IsVisualRightToLeftEncoding(enc) does not.
bool IsLogicalRightToLeftEncoding(Encoding enc) {
  if (!IsRightToLeftEncoding(enc)) {
    return false;
  }
  return !IsVisualRightToLeftEncoding(enc);
}
|
445
|
+
|
446
|
+
// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
|
447
|
+
// is NOT visual.
|
448
|
+
bool IsVisualRightToLeftEncoding(Encoding enc) {
|
449
|
+
switch (enc) {
|
450
|
+
case HEBREW_ENCODING:
|
451
|
+
case HEBREW_VISUAL:
|
452
|
+
return true;
|
453
|
+
default:
|
454
|
+
return false;
|
455
|
+
}
|
456
|
+
}
|
457
|
+
|
458
|
+
|
459
|
+
|
460
|
+
|
461
|
+
|
462
|
+
// Reports whether enc is an ISO-2022 encoding: any encoding accepted by
// IsIso2022JpOrVariant(), or ISO_2022_KR, or ISO_2022_CN.
bool IsIso2022Encoding(Encoding enc) {
  if (IsIso2022JpOrVariant(enc)) {
    return true;
  }
  return enc == ISO_2022_KR || enc == ISO_2022_CN;
}
|
467
|
+
|
468
|
+
// Reports whether enc is JAPANESE_JIS or one of its carrier-specific
// variants (KDDI_ISO_2022_JP, SOFTBANK_ISO_2022_JP).
bool IsIso2022JpOrVariant(Encoding enc) {
  switch (enc) {
    case JAPANESE_JIS:
    case KDDI_ISO_2022_JP:
    case SOFTBANK_ISO_2022_JP:
      return true;
    default:
      return false;
  }
}
|
473
|
+
|
474
|
+
// Reports whether enc is JAPANESE_SHIFT_JIS, JAPANESE_CP932, or one of the
// carrier-specific Shift_JIS variants (KDDI, DoCoMo, SoftBank).
bool IsShiftJisOrVariant(Encoding enc) {
  switch (enc) {
    case JAPANESE_SHIFT_JIS:
    case JAPANESE_CP932:
    case KDDI_SHIFT_JIS:
    case DOCOMO_SHIFT_JIS:
    case SOFTBANK_SHIFT_JIS:
      return true;
    default:
      return false;
  }
}
|
481
|
+
|
482
|
+
// Reports whether enc is one of the Japanese cell-phone carrier-specific
// encodings: the KDDI, DoCoMo and SoftBank Shift_JIS variants and the KDDI
// and SoftBank ISO-2022-JP variants.
bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {
  switch (enc) {
    case KDDI_ISO_2022_JP:
    case KDDI_SHIFT_JIS:
    case DOCOMO_SHIFT_JIS:
    case SOFTBANK_SHIFT_JIS:
    case SOFTBANK_ISO_2022_JP:
      return true;
    default:
      return false;
  }
}
|
489
|
+
|
490
|
+
|
491
|
+
// *************************************************************
|
492
|
+
// ENCODING NAMES
|
493
|
+
// EncodingName() [Encoding to name]
|
494
|
+
// MimeEncodingName() [Encoding to name]
|
495
|
+
// EncodingFromName() [name to Encoding]
|
496
|
+
// EncodingNameAliasToEncoding() [name to Encoding]
|
497
|
+
// default_encoding_name()
|
498
|
+
// invalid_encoding_name()
|
499
|
+
// *************************************************************
|
500
|
+
|
501
|
+
const char * EncodingName(const Encoding enc) {
|
502
|
+
if ( (enc < 0) || (enc >= kNumEncodings) )
|
503
|
+
return invalid_encoding_name();
|
504
|
+
return kEncodingInfoTable[enc].encoding_name_;
|
505
|
+
}
|
506
|
+
|
507
|
+
// TODO: Unify MimeEncodingName and EncodingName, or determine why
|
508
|
+
// such a unification is not possible.
|
509
|
+
|
510
|
+
const char * MimeEncodingName(Encoding enc) {
|
511
|
+
if ( (enc < 0) || (enc >= kNumEncodings) )
|
512
|
+
return ""; // TODO: Should this be invalid_encoding_name()?
|
513
|
+
return kEncodingInfoTable[enc].mime_encoding_name_;
|
514
|
+
}
|
515
|
+
|
516
|
+
bool EncodingFromName(const char* enc_name, Encoding *encoding) {
|
517
|
+
*encoding = UNKNOWN_ENCODING;
|
518
|
+
if ( enc_name == NULL ) return false;
|
519
|
+
|
520
|
+
for ( int i = 0; i < kNumEncodings; i++ ) {
|
521
|
+
if (!base::strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) {
|
522
|
+
*encoding = static_cast<Encoding>(i);
|
523
|
+
return true;
|
524
|
+
}
|
525
|
+
}
|
526
|
+
return false;
|
527
|
+
}
|
528
|
+
|
529
|
+
// The encoding_map maps standard and non-standard encoding-names
|
530
|
+
// (strings) to Encoding enums. It is used only by
|
531
|
+
// EncodingNameAliasToEncoding. Note that the map uses
|
532
|
+
// case-insensitive hash and comparison functions.
|
533
|
+
|
534
|
+
typedef std::unordered_map<const char *, Encoding,
|
535
|
+
CStringAlnumCaseHash,
|
536
|
+
CStringAlnumCaseEqual> EncodingMap;
|
537
|
+
|
538
|
+
static const EncodingMap& GetEncodingMap() {
|
539
|
+
static EncodingMap encoding_map;
|
540
|
+
if (!encoding_map.empty()) {
|
541
|
+
// Already initialized
|
542
|
+
return encoding_map;
|
543
|
+
}
|
544
|
+
|
545
|
+
// Initialize the map with all the "standard" encoding names,
|
546
|
+
// i.e., the ones returned by EncodingName and MimeEncodingName.
|
547
|
+
//
|
548
|
+
// First, add internal encoding names returned by EncodingName().
|
549
|
+
for (int i = 0; i < NUM_ENCODINGS; ++i) {
|
550
|
+
Encoding e = static_cast<Encoding>(i);
|
551
|
+
// Internal encoding names must be unique.
|
552
|
+
// The internal names are guaranteed to be unique by the CHECK_EQ.
|
553
|
+
const char *encoding_name = EncodingName(e);
|
554
|
+
// CHECK_EQ(0, encoding_map.count(encoding_name))
|
555
|
+
// << "Duplicate found for " << encoding_name;
|
556
|
+
encoding_map[encoding_name] = e;
|
557
|
+
}
|
558
|
+
// Then, add mime encoding names returned by MimeEncodingName().
|
559
|
+
// We don't override existing entries, to give precedence to entries
|
560
|
+
// added earlier.
|
561
|
+
for (int i = 0; i < NUM_ENCODINGS; ++i) {
|
562
|
+
Encoding e = static_cast<Encoding>(i);
|
563
|
+
// Note that MimeEncodingName() can return the same mime encoding
|
564
|
+
// name for different encoding enums like JAPANESE_SHIFT_JIS and
|
565
|
+
// KDDI_SHIFT_JIS. In that case, the encoding enum first seen
|
566
|
+
// will be the value for the encoding name in the map.
|
567
|
+
const char *mime_encoding_name = MimeEncodingName(e);
|
568
|
+
if (encoding_map.count(mime_encoding_name) == 0) {
|
569
|
+
encoding_map[mime_encoding_name] = e;
|
570
|
+
}
|
571
|
+
}
|
572
|
+
|
573
|
+
// Add some non-standard names: alternate spellings, common typos,
|
574
|
+
// etc. (It does no harm to add names already in the map.) Note
|
575
|
+
// that although the map is case-insensitive, by convention the
|
576
|
+
// keys are written here in lower case. For ease of maintenance,
|
577
|
+
// they are listed in alphabetical order.
|
578
|
+
encoding_map["5601"] = KOREAN_EUC_KR;
|
579
|
+
encoding_map["646"] = ASCII_7BIT;
|
580
|
+
encoding_map["852"] = CZECH_CP852;
|
581
|
+
encoding_map["866"] = RUSSIAN_CP866;
|
582
|
+
encoding_map["8859-1"] = ISO_8859_1;
|
583
|
+
encoding_map["ansi-1251"] = RUSSIAN_CP1251;
|
584
|
+
encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;
|
585
|
+
encoding_map["arabic"] = ISO_8859_6;
|
586
|
+
encoding_map["ascii"] = ISO_8859_1;
|
587
|
+
encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard
|
588
|
+
encoding_map["asmo-708"] = ISO_8859_6;
|
589
|
+
encoding_map["bhaskar"] = BHASKAR;
|
590
|
+
encoding_map["big5"] = CHINESE_BIG5;
|
591
|
+
encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard
|
592
|
+
encoding_map["big5-hkscs"] = BIG5_HKSCS;
|
593
|
+
encoding_map["chinese"] = CHINESE_GB;
|
594
|
+
encoding_map["cns"] = CHINESE_CNS; // not iana standard
|
595
|
+
encoding_map["cns11643"] = CHINESE_CNS;
|
596
|
+
encoding_map["cp1250"] = MSFT_CP1250; // not iana standard
|
597
|
+
encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard
|
598
|
+
encoding_map["cp1252"] = MSFT_CP1252; // not iana standard
|
599
|
+
encoding_map["cp1253"] = MSFT_CP1253; // not iana standard
|
600
|
+
encoding_map["cp1254"] = MSFT_CP1254; // not iana standard
|
601
|
+
encoding_map["cp1255"] = MSFT_CP1255;
|
602
|
+
encoding_map["cp1256"] = MSFT_CP1256;
|
603
|
+
encoding_map["cp1257"] = MSFT_CP1257; // not iana standard
|
604
|
+
encoding_map["cp819"] = ISO_8859_1;
|
605
|
+
encoding_map["cp852"] = CZECH_CP852;
|
606
|
+
encoding_map["cp866"] = RUSSIAN_CP866;
|
607
|
+
encoding_map["cp-866"] = RUSSIAN_CP866;
|
608
|
+
encoding_map["cp874"] = MSFT_CP874;
|
609
|
+
encoding_map["cp932"] = JAPANESE_CP932; // not iana standard
|
610
|
+
encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard
|
611
|
+
encoding_map["csbig5"] = CHINESE_BIG5;
|
612
|
+
encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;
|
613
|
+
encoding_map["cseuckr"] = KOREAN_EUC_KR;
|
614
|
+
encoding_map["csgb2312"] = CHINESE_GB;
|
615
|
+
encoding_map["csibm852"] = CZECH_CP852;
|
616
|
+
encoding_map["csibm866"] = RUSSIAN_CP866;
|
617
|
+
encoding_map["csiso2022jp"] = JAPANESE_JIS;
|
618
|
+
encoding_map["csiso2022kr"] = ISO_2022_KR;
|
619
|
+
encoding_map["csiso58gb231280"] = CHINESE_GB;
|
620
|
+
encoding_map["csiso88598i"] = ISO_8859_8_I;
|
621
|
+
encoding_map["csisolatin1"] = ISO_8859_1;
|
622
|
+
encoding_map["csisolatin2"] = ISO_8859_2;
|
623
|
+
encoding_map["csisolatin3"] = ISO_8859_3;
|
624
|
+
encoding_map["csisolatin4"] = ISO_8859_4;
|
625
|
+
encoding_map["csisolatin5"] = ISO_8859_9;
|
626
|
+
encoding_map["csisolatin6"] = ISO_8859_10;
|
627
|
+
encoding_map["csisolatinarabic"] = ISO_8859_6;
|
628
|
+
encoding_map["csisolatincyrillic"] = ISO_8859_5;
|
629
|
+
encoding_map["csisolatingreek"] = ISO_8859_7;
|
630
|
+
encoding_map["csisolatinhebrew"] = ISO_8859_8;
|
631
|
+
encoding_map["csksc56011987"] = KOREAN_EUC_KR;
|
632
|
+
encoding_map["csmacintosh"] = MACINTOSH_ROMAN;
|
633
|
+
encoding_map["csn-369103"] = CZECH_CSN_369103;
|
634
|
+
encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;
|
635
|
+
encoding_map["csunicode"] = UTF16BE;
|
636
|
+
encoding_map["csunicode11"] = UTF16BE;
|
637
|
+
encoding_map["csunicode11utf7"] = UTF7;
|
638
|
+
encoding_map["csunicodeascii"] = UTF16BE;
|
639
|
+
encoding_map["csunicodelatin1"] = UTF16BE;
|
640
|
+
encoding_map["cyrillic"] = ISO_8859_5;
|
641
|
+
encoding_map["ecma-114"] = ISO_8859_6;
|
642
|
+
encoding_map["ecma-118"] = ISO_8859_7;
|
643
|
+
encoding_map["elot_928"] = ISO_8859_7;
|
644
|
+
encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard
|
645
|
+
encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard
|
646
|
+
encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard
|
647
|
+
encoding_map["euc-jp"] = JAPANESE_EUC_JP;
|
648
|
+
encoding_map["euc-kr"] = KOREAN_EUC_KR;
|
649
|
+
encoding_map["eucgb2312_cn"] = CHINESE_GB;
|
650
|
+
encoding_map["gb"] = CHINESE_GB; // not iana standard
|
651
|
+
encoding_map["gb18030"] = GB18030;
|
652
|
+
encoding_map["gb2132"] = CHINESE_GB; // common typo
|
653
|
+
encoding_map["gb2312"] = CHINESE_GB;
|
654
|
+
encoding_map["gb_2312-80"] = CHINESE_GB;
|
655
|
+
encoding_map["gbk"] = GBK;
|
656
|
+
encoding_map["greek"] = ISO_8859_7;
|
657
|
+
encoding_map["greek8"] = ISO_8859_7;
|
658
|
+
encoding_map["hebrew"] = ISO_8859_8;
|
659
|
+
encoding_map["htchanakya"] = HTCHANAKYA;
|
660
|
+
encoding_map["hz-gb-2312"] = HZ_GB_2312;
|
661
|
+
encoding_map["ibm819"] = ISO_8859_1;
|
662
|
+
encoding_map["ibm852"] = CZECH_CP852;
|
663
|
+
encoding_map["ibm874"] = MSFT_CP874;
|
664
|
+
encoding_map["iso-10646"] = UTF16BE;
|
665
|
+
encoding_map["iso-10646-j-1"] = UTF16BE;
|
666
|
+
encoding_map["iso-10646-ucs-2"] = UNICODE;
|
667
|
+
encoding_map["iso-10646-ucs-4"] = UTF32BE;
|
668
|
+
encoding_map["iso-10646-ucs-basic"] = UTF16BE;
|
669
|
+
encoding_map["iso-10646-unicode-latin1"] = UTF16BE;
|
670
|
+
encoding_map["iso-2022-cn"] = ISO_2022_CN;
|
671
|
+
encoding_map["iso-2022-jp"] = JAPANESE_JIS;
|
672
|
+
encoding_map["iso-2022-kr"] = ISO_2022_KR;
|
673
|
+
encoding_map["iso-8559-1"] = ISO_8859_1; // common typo
|
674
|
+
encoding_map["iso-874"] = MSFT_CP874;
|
675
|
+
encoding_map["iso-8858-1"] = ISO_8859_1; // common typo
|
676
|
+
// iso-8859-0 was a temporary name, eventually renamed iso-8859-15
|
677
|
+
encoding_map["iso-8859-0"] = ISO_8859_15;
|
678
|
+
encoding_map["iso-8859-1"] = ISO_8859_1;
|
679
|
+
encoding_map["iso-8859-10"] = ISO_8859_10;
|
680
|
+
encoding_map["iso-8859-11"] = ISO_8859_11;
|
681
|
+
encoding_map["iso-8859-13"] = ISO_8859_13;
|
682
|
+
encoding_map["iso-8859-15"] = ISO_8859_15;
|
683
|
+
encoding_map["iso-8859-2"] = ISO_8859_2;
|
684
|
+
encoding_map["iso-8859-3"] = ISO_8859_3;
|
685
|
+
encoding_map["iso-8859-4"] = ISO_8859_4;
|
686
|
+
encoding_map["iso-8859-5"] = ISO_8859_5;
|
687
|
+
encoding_map["iso-8859-6"] = ISO_8859_6;
|
688
|
+
encoding_map["iso-8859-7"] = ISO_8859_7;
|
689
|
+
encoding_map["iso-8859-8"] = ISO_8859_8;
|
690
|
+
encoding_map["iso-8859-8-i"] = ISO_8859_8_I;
|
691
|
+
encoding_map["iso-8859-9"] = ISO_8859_9;
|
692
|
+
encoding_map["iso-9959-1"] = ISO_8859_1; // common typo
|
693
|
+
encoding_map["iso-ir-100"] = ISO_8859_1;
|
694
|
+
encoding_map["iso-ir-101"] = ISO_8859_2;
|
695
|
+
encoding_map["iso-ir-109"] = ISO_8859_3;
|
696
|
+
encoding_map["iso-ir-110"] = ISO_8859_4;
|
697
|
+
encoding_map["iso-ir-126"] = ISO_8859_7;
|
698
|
+
encoding_map["iso-ir-127"] = ISO_8859_6;
|
699
|
+
encoding_map["iso-ir-138"] = ISO_8859_8;
|
700
|
+
encoding_map["iso-ir-144"] = ISO_8859_5;
|
701
|
+
encoding_map["iso-ir-148"] = ISO_8859_9;
|
702
|
+
encoding_map["iso-ir-149"] = KOREAN_EUC_KR;
|
703
|
+
encoding_map["iso-ir-157"] = ISO_8859_10;
|
704
|
+
encoding_map["iso-ir-58"] = CHINESE_GB;
|
705
|
+
encoding_map["iso-latin-1"] = ISO_8859_1;
|
706
|
+
encoding_map["iso_2022-cn"] = ISO_2022_CN;
|
707
|
+
encoding_map["iso_2022-kr"] = ISO_2022_KR;
|
708
|
+
encoding_map["iso_8859-1"] = ISO_8859_1;
|
709
|
+
encoding_map["iso_8859-10:1992"] = ISO_8859_10;
|
710
|
+
encoding_map["iso_8859-11"] = ISO_8859_11;
|
711
|
+
encoding_map["iso_8859-13"] = ISO_8859_13;
|
712
|
+
encoding_map["iso_8859-15"] = ISO_8859_15;
|
713
|
+
encoding_map["iso_8859-1:1987"] = ISO_8859_1;
|
714
|
+
encoding_map["iso_8859-2"] = ISO_8859_2;
|
715
|
+
encoding_map["iso_8859-2:1987"] = ISO_8859_2;
|
716
|
+
encoding_map["iso_8859-3"] = ISO_8859_3;
|
717
|
+
encoding_map["iso_8859-3:1988"] = ISO_8859_3;
|
718
|
+
encoding_map["iso_8859-4"] = ISO_8859_4;
|
719
|
+
encoding_map["iso_8859-4:1988"] = ISO_8859_4;
|
720
|
+
encoding_map["iso_8859-5"] = ISO_8859_5;
|
721
|
+
encoding_map["iso_8859-5:1988"] = ISO_8859_5;
|
722
|
+
encoding_map["iso_8859-6"] = ISO_8859_6;
|
723
|
+
encoding_map["iso_8859-6:1987"] = ISO_8859_6;
|
724
|
+
encoding_map["iso_8859-7"] = ISO_8859_7;
|
725
|
+
encoding_map["iso_8859-7:1987"] = ISO_8859_7;
|
726
|
+
encoding_map["iso_8859-8"] = ISO_8859_8;
|
727
|
+
encoding_map["iso_8859-8:1988:"] = ISO_8859_8;
|
728
|
+
encoding_map["iso_8859-9"] = ISO_8859_9;
|
729
|
+
encoding_map["iso_8859-9:1989"] = ISO_8859_9;
|
730
|
+
encoding_map["jagran"] = JAGRAN;
|
731
|
+
encoding_map["jis"] = JAPANESE_JIS; // not iana standard
|
732
|
+
encoding_map["koi8-cs"] = CZECH_CSN_369103;
|
733
|
+
encoding_map["koi8-r"] = RUSSIAN_KOI8_R;
|
734
|
+
encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard
|
735
|
+
encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;
|
736
|
+
encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard
|
737
|
+
encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard
|
738
|
+
encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant
|
739
|
+
encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard
|
740
|
+
encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard
|
741
|
+
encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;
|
742
|
+
encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard
|
743
|
+
encoding_map["l1"] = ISO_8859_1;
|
744
|
+
encoding_map["l2"] = ISO_8859_2;
|
745
|
+
encoding_map["l3"] = ISO_8859_3;
|
746
|
+
encoding_map["l4"] = ISO_8859_4;
|
747
|
+
encoding_map["l5"] = ISO_8859_9;
|
748
|
+
encoding_map["l6"] = ISO_8859_10;
|
749
|
+
encoding_map["latin-1"] = ISO_8859_1; // not iana standard
|
750
|
+
encoding_map["latin1"] = ISO_8859_1;
|
751
|
+
encoding_map["latin2"] = ISO_8859_2;
|
752
|
+
encoding_map["latin3"] = ISO_8859_3;
|
753
|
+
encoding_map["latin4"] = ISO_8859_4;
|
754
|
+
encoding_map["latin5"] = ISO_8859_9;
|
755
|
+
encoding_map["latin6"] = ISO_8859_10;
|
756
|
+
encoding_map["mac"] = MACINTOSH_ROMAN;
|
757
|
+
encoding_map["macintosh"] = MACINTOSH_ROMAN;
|
758
|
+
encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;
|
759
|
+
encoding_map["ms932"] = JAPANESE_CP932; // not iana standard
|
760
|
+
encoding_map["ms_kanji"] = JAPANESE_CP932;
|
761
|
+
encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;
|
762
|
+
encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;
|
763
|
+
encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard
|
764
|
+
encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard
|
765
|
+
encoding_map["sun_eu_greek"] = ISO_8859_7;
|
766
|
+
encoding_map["tab"] = TAMIL_BI;
|
767
|
+
encoding_map["tam"] = TAMIL_MONO;
|
768
|
+
encoding_map["tis-620"] = ISO_8859_11;
|
769
|
+
encoding_map["tscii"] = TSCII;
|
770
|
+
encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard
|
771
|
+
encoding_map["unicode"] = UNICODE; // not iana standard
|
772
|
+
encoding_map["unicode-1-1-utf-7"] = UTF7;
|
773
|
+
encoding_map["unicode-1-1-utf-8"] = UTF8;
|
774
|
+
encoding_map["unicode-2-0-utf-7"] = UTF7;
|
775
|
+
encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard
|
776
|
+
encoding_map["us"] = ISO_8859_1;
|
777
|
+
encoding_map["us-ascii"] = ISO_8859_1;
|
778
|
+
encoding_map["utf-16be"] = UTF16BE;
|
779
|
+
encoding_map["utf-16le"] = UTF16LE;
|
780
|
+
encoding_map["utf-32be"] = UTF32BE;
|
781
|
+
encoding_map["utf-32le"] = UTF32LE;
|
782
|
+
encoding_map["utf-7"] = UTF7;
|
783
|
+
encoding_map["utf-8"] = UTF8;
|
784
|
+
encoding_map["utf7"] = UTF7;
|
785
|
+
encoding_map["utf8"] = UTF8; // not iana standard
|
786
|
+
encoding_map["visual"] = HEBREW_VISUAL;
|
787
|
+
encoding_map["win-1250"] = MSFT_CP1250; // not iana standard
|
788
|
+
encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard
|
789
|
+
encoding_map["window-874"] = MSFT_CP874;
|
790
|
+
encoding_map["windows-1250"] = MSFT_CP1250;
|
791
|
+
encoding_map["windows-1251"] = RUSSIAN_CP1251;
|
792
|
+
encoding_map["windows-1252"] = MSFT_CP1252;
|
793
|
+
encoding_map["windows-1253"] = MSFT_CP1253;
|
794
|
+
encoding_map["windows-1254"] = MSFT_CP1254;
|
795
|
+
encoding_map["windows-1255"] = MSFT_CP1255;
|
796
|
+
encoding_map["windows-1256"] = MSFT_CP1256;
|
797
|
+
encoding_map["windows-1257"] = MSFT_CP1257;
|
798
|
+
encoding_map["windows-31j"] = JAPANESE_CP932;
|
799
|
+
encoding_map["windows-874"] = MSFT_CP874;
|
800
|
+
encoding_map["windows-936"] = GBK;
|
801
|
+
encoding_map["x-big5"] = CHINESE_BIG5;
|
802
|
+
encoding_map["x-binaryenc"] = BINARYENC; // not iana standard
|
803
|
+
encoding_map["x-cp1250"] = MSFT_CP1250;
|
804
|
+
encoding_map["x-cp1251"] = RUSSIAN_CP1251;
|
805
|
+
encoding_map["x-cp1252"] = MSFT_CP1252;
|
806
|
+
encoding_map["x-cp1253"] = MSFT_CP1253;
|
807
|
+
encoding_map["x-cp1254"] = MSFT_CP1254;
|
808
|
+
encoding_map["x-cp1255"] = MSFT_CP1255;
|
809
|
+
encoding_map["x-cp1256"] = MSFT_CP1256;
|
810
|
+
encoding_map["x-cp1257"] = MSFT_CP1257;
|
811
|
+
encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;
|
812
|
+
encoding_map["x-euc-tw"] = CHINESE_CNS;
|
813
|
+
encoding_map["x-gbk"] = GBK;
|
814
|
+
encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;
|
815
|
+
encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;
|
816
|
+
encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;
|
817
|
+
encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;
|
818
|
+
encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard
|
819
|
+
encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;
|
820
|
+
encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard
|
821
|
+
encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;
|
822
|
+
encoding_map["x-unicode-2-0-utf-7"] = UTF7;
|
823
|
+
encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard
|
824
|
+
encoding_map["x-x-big5"] = CHINESE_BIG5;
|
825
|
+
encoding_map["zh_cn.euc"] = CHINESE_GB;
|
826
|
+
encoding_map["zh_tw-big5"] = CHINESE_BIG5;
|
827
|
+
encoding_map["zh_tw-euc"] = CHINESE_CNS;
|
828
|
+
|
829
|
+
// Remove the entry for the empty string, if any.
|
830
|
+
encoding_map.erase("");
|
831
|
+
|
832
|
+
return encoding_map;
|
833
|
+
}
|
834
|
+
|
835
|
+
// ----------------------------------------------------------------------
|
836
|
+
// EncodingNameAliasToEncoding()
|
837
|
+
//
|
838
|
+
// This function takes an encoding name/alias and returns the Encoding
|
839
|
+
// enum. The input is case insensitive. It is the union of the common
|
840
|
+
// IANA standard names, the charset names used in Netscape Navigator,
|
841
|
+
// and some common names we have been using.
|
842
|
+
// See: http://www.iana.org/assignments/character-sets
|
843
|
+
// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
|
844
|
+
//
|
845
|
+
// UNKNOWN_ENCODING is returned if none matches.
|
846
|
+
//
|
847
|
+
// TODO: Check if it is possible to remove the non-standard,
// non-netscape-use names. This routine is used for encoding
// detection from html meta info, and non-standard names may
// introduce noise into that detection.
|
851
|
+
//
|
852
|
+
// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
|
853
|
+
// or determine why such a unification is not possible.
|
854
|
+
// ----------------------------------------------------------------------
|
855
|
+
Encoding EncodingNameAliasToEncoding(const char *encoding_name) {
  // A null name cannot be looked up; report it as unknown.
  if (!encoding_name) return UNKNOWN_ENCODING;

  // Consult the alias table returned by GetEncodingMap(); a name without an
  // entry yields UNKNOWN_ENCODING, otherwise the mapped Encoding is returned.
  const EncodingMap& aliases = GetEncodingMap();
  const EncodingMap::const_iterator match = aliases.find(encoding_name);
  return (match == aliases.end()) ? UNKNOWN_ENCODING : match->second;
}
|
869
|
+
|
870
|
+
const char* default_encoding_name() {
|
871
|
+
return kEncodingInfoTable[LATIN1].encoding_name_;
|
872
|
+
}
|
873
|
+
|
874
|
+
// Placeholder name string exposed through invalid_encoding_name().
static const char* const kInvalidEncodingName = "invalid_encoding";

// Returns the placeholder name string "invalid_encoding". The same pointer
// is returned on every call.
const char *invalid_encoding_name() { return kInvalidEncodingName; }
|
879
|
+
|
880
|
+
|
881
|
+
|
882
|
+
// *************************************************************
|
883
|
+
// Miscellany
|
884
|
+
// *************************************************************
|
885
|
+
|
886
|
+
|
887
|
+
// Returns the preferred_web_output_encoding_ recorded for `enc` in
// kEncodingInfoTable, or UTF8 when IsValidEncoding(enc) is false.
Encoding PreferredWebOutputEncoding(Encoding enc) {
  // Only index the table once the encoding has been validated.
  if (!IsValidEncoding(enc)) {
    return UTF8;
  }
  return kEncodingInfoTable[enc].preferred_web_output_encoding_;
}
|