compact_enc_det 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
- data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
- data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
- data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
- data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
- data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
- data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
- data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
- data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
- data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
- data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
- data/ext/compact_enc_det/compact_enc_det.cc +100 -0
- data/ext/compact_enc_det/extconf.rb +20 -0
- data/lib/compact_enc_det/version.rb +3 -0
- data/lib/compact_enc_det.rb +2 -0
- metadata +106 -0
@@ -0,0 +1,891 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#include "util/encodings/encodings.h"
|
18
|
+
|
19
|
+
#include <string.h> // for strcasecmp
|
20
|
+
#include <unordered_map>
|
21
|
+
#include <utility> // for pair
|
22
|
+
|
23
|
+
#include "util/basictypes.h"
|
24
|
+
#include "util/string_util.h"
|
25
|
+
#include "util/case_insensitive_hash.h"
|
26
|
+
|
27
|
+
struct EncodingInfo {
|
28
|
+
// The standard name for this encoding.
|
29
|
+
//
|
30
|
+
const char* encoding_name_;
|
31
|
+
|
32
|
+
// The "preferred MIME name" of an encoding as specified by the IANA at:
|
33
|
+
// http://www.iana.org/assignments/character-sets
|
34
|
+
//
|
35
|
+
// Note that the preferred MIME name may differ slightly from the
|
36
|
+
// official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987
|
37
|
+
//
|
38
|
+
const char* mime_encoding_name_;
|
39
|
+
|
40
|
+
// It is an internal policy that if an encoding has an IANA name,
|
41
|
+
// then encoding_name_ and mime_encoding_name_ must be the same string.
|
42
|
+
//
|
43
|
+
// However, there can be exceptions if there are compelling reasons.
|
44
|
+
// For example, Japanese mobile handsets require the name
|
45
|
+
// "Shift_JIS" in charset=... parameter in Content-Type headers to
|
46
|
+
// process emoji (emoticons) in their private encodings. In that
|
47
|
+
// case, mime_encoding_name_ should be "Shift_JIS", despite
|
48
|
+
// encoding_name_ actually is "X-KDDI-Shift_JIS".
|
49
|
+
|
50
|
+
// Some multi-byte encodings use byte values that coincide with the
|
51
|
+
// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
|
52
|
+
// can misinterpret these, as indicated in an external XSS report from
|
53
|
+
// 2007-02-15. Here, we map these dangerous encodings to safer ones. We
|
54
|
+
// also use UTF8 instead of encodings that we don't support in our
|
55
|
+
// output, and we generally try to be conservative in what we send out.
|
56
|
+
// Where the client asks for single- or double-byte encodings that are
|
57
|
+
// not as common, we substitute a more common single- or double-byte
|
58
|
+
// encoding, if there is one, thereby preserving the client's intent
|
59
|
+
// to use less space than UTF-8. This also means that characters
|
60
|
+
// outside the destination set will be converted to HTML NCRs (&#NNN;)
|
61
|
+
// if requested.
|
62
|
+
|
63
|
+
Encoding preferred_web_output_encoding_;
|
64
|
+
};
|
65
|
+
|
66
|
+
static const EncodingInfo kEncodingInfoTable[] = {
|
67
|
+
{ "ASCII", "ISO-8859-1", ISO_8859_1},
|
68
|
+
{ "Latin2", "ISO-8859-2", ISO_8859_2},
|
69
|
+
{ "Latin3", "ISO-8859-3", UTF8},
|
70
|
+
// MSIE 6 does not support ISO-8859-3 (XSS issue)
|
71
|
+
{ "Latin4", "ISO-8859-4", ISO_8859_4},
|
72
|
+
{ "ISO-8859-5", "ISO-8859-5", ISO_8859_5},
|
73
|
+
{ "Arabic", "ISO-8859-6", ISO_8859_6},
|
74
|
+
{ "Greek", "ISO-8859-7", ISO_8859_7},
|
75
|
+
{ "Hebrew", "ISO-8859-8", MSFT_CP1255},
|
76
|
+
// we do not endorse the visual order
|
77
|
+
{ "Latin5", "ISO-8859-9", ISO_8859_9},
|
78
|
+
{ "Latin6", "ISO-8859-10", UTF8},
|
79
|
+
// MSIE does not support ISO-8859-10 (XSS issue)
|
80
|
+
{ "EUC-JP", "EUC-JP", JAPANESE_EUC_JP},
|
81
|
+
{ "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},
|
82
|
+
{ "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
|
83
|
+
// due to potential confusion with HTML syntax chars
|
84
|
+
{ "BIG5", "Big5", CHINESE_BIG5},
|
85
|
+
{ "GB", "GB2312", CHINESE_GB},
|
86
|
+
{ "EUC-CN",
|
87
|
+
"EUC-CN",
|
88
|
+
// Misnamed. Should be EUC-TW.
|
89
|
+
CHINESE_BIG5},
|
90
|
+
// MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,
|
91
|
+
// and EUC-TW is rare, so we prefer Big5 for output.
|
92
|
+
{ "KSC", "EUC-KR", KOREAN_EUC_KR},
|
93
|
+
{ "Unicode",
|
94
|
+
"UTF-16LE",
|
95
|
+
// Internet Explorer doesn't recognize "ISO-10646-UCS-2"
|
96
|
+
UTF8
|
97
|
+
// due to potential confusion with HTML syntax chars
|
98
|
+
},
|
99
|
+
{ "EUC",
|
100
|
+
"EUC", // Misnamed. Should be EUC-TW.
|
101
|
+
CHINESE_BIG5
|
102
|
+
// MSIE does not recognize "EUC" (XSS issue),
|
103
|
+
// and EUC-TW is rare, so we prefer Big5 for output.
|
104
|
+
},
|
105
|
+
{ "CNS",
|
106
|
+
"CNS", // Misnamed. Should be EUC-TW.
|
107
|
+
CHINESE_BIG5},
|
108
|
+
// MSIE does not recognize "CNS" (XSS issue),
|
109
|
+
// and EUC-TW is rare, so we prefer Big5 for output.
|
110
|
+
{ "BIG5-CP950",
|
111
|
+
"BIG5-CP950", // Not an IANA name
|
112
|
+
CHINESE_BIG5
|
113
|
+
// MSIE does not recognize "BIG5-CP950" (XSS issue)
|
114
|
+
},
|
115
|
+
{ "CP932", "CP932", // Not an IANA name
|
116
|
+
JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue)
|
117
|
+
{ "UTF8", "UTF-8", UTF8},
|
118
|
+
{ "Unknown",
|
119
|
+
"x-unknown", // Not an IANA name
|
120
|
+
UTF8}, // UTF-8 is our default output encoding
|
121
|
+
{ "ASCII-7-bit", "US-ASCII", ASCII_7BIT},
|
122
|
+
{ "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},
|
123
|
+
{ "CP1251", "windows-1251", RUSSIAN_CP1251},
|
124
|
+
{ "CP1252", "windows-1252", MSFT_CP1252},
|
125
|
+
{ "KOI8U",
|
126
|
+
"KOI8-U",
|
127
|
+
ISO_8859_5}, // because koi8-u is not as common
|
128
|
+
{ "CP1250", "windows-1250", MSFT_CP1250},
|
129
|
+
{ "ISO-8859-15", "ISO-8859-15", ISO_8859_15},
|
130
|
+
{ "CP1254", "windows-1254", MSFT_CP1254},
|
131
|
+
{ "CP1257", "windows-1257", MSFT_CP1257},
|
132
|
+
{ "ISO-8859-11", "ISO-8859-11", ISO_8859_11},
|
133
|
+
{ "CP874", "windows-874", MSFT_CP874},
|
134
|
+
{ "CP1256", "windows-1256", MSFT_CP1256},
|
135
|
+
{ "CP1255", "windows-1255", MSFT_CP1255},
|
136
|
+
{ "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},
|
137
|
+
// Java does not support iso-8859-8-i
|
138
|
+
{ "VISUAL", "ISO-8859-8", MSFT_CP1255},
|
139
|
+
// we do not endorse the visual order
|
140
|
+
{ "CP852", "cp852", MSFT_CP1250},
|
141
|
+
// because cp852 is not as common
|
142
|
+
{ "CSN_369103", "csn_369103", MSFT_CP1250},
|
143
|
+
// MSIE does not recognize "csn_369103" (XSS issue)
|
144
|
+
{ "CP1253", "windows-1253", MSFT_CP1253},
|
145
|
+
{ "CP866", "IBM866", RUSSIAN_CP1251},
|
146
|
+
// because cp866 is not as common
|
147
|
+
{ "ISO-8859-13", "ISO-8859-13", UTF8},
|
148
|
+
// because iso-8859-13 is not widely supported
|
149
|
+
{ "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},
|
150
|
+
// due to potential confusion with HTML syntax chars
|
151
|
+
{ "GBK", "GBK", GBK},
|
152
|
+
{ "GB18030", "GB18030", GBK},
|
153
|
+
// because gb18030 is not widely supported
|
154
|
+
{ "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},
|
155
|
+
// because Big5-HKSCS is not widely supported
|
156
|
+
{ "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},
|
157
|
+
// due to potential confusion with HTML syntax chars
|
158
|
+
{ "TSCII", "tscii", UTF8},
|
159
|
+
// we do not have an output converter for this font encoding
|
160
|
+
{ "TAM", "tam", UTF8},
|
161
|
+
// we do not have an output converter for this font encoding
|
162
|
+
{ "TAB", "tab", UTF8},
|
163
|
+
// we do not have an output converter for this font encoding
|
164
|
+
{ "JAGRAN", "jagran", UTF8},
|
165
|
+
// we do not have an output converter for this font encoding
|
166
|
+
{ "MACINTOSH", "MACINTOSH", ISO_8859_1},
|
167
|
+
// because macintosh is relatively uncommon
|
168
|
+
{ "UTF7", "UTF-7",
|
169
|
+
UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated
|
170
|
+
{ "BHASKAR", "bhaskar",
|
171
|
+
UTF8}, // we do not have an output converter for this font encoding
|
172
|
+
{ "HTCHANAKYA", "htchanakya", // not an IANA charset name.
|
173
|
+
UTF8}, // we do not have an output converter for this font encoding
|
174
|
+
{ "UTF-16BE", "UTF-16BE",
|
175
|
+
UTF8}, // due to potential confusion with HTML syntax chars
|
176
|
+
{ "UTF-16LE", "UTF-16LE",
|
177
|
+
UTF8}, // due to potential confusion with HTML syntax chars
|
178
|
+
{ "UTF-32BE", "UTF-32BE",
|
179
|
+
UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
|
180
|
+
{ "UTF-32LE", "UTF-32LE",
|
181
|
+
UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
|
182
|
+
{ "X-BINARYENC", "x-binaryenc", // Not an IANA name
|
183
|
+
UTF8}, // because this one is not intended for output (just input)
|
184
|
+
{ "HZ-GB-2312", "HZ-GB-2312",
|
185
|
+
CHINESE_GB}, // due to potential confusion with HTML syntax chars
|
186
|
+
{ "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name
|
187
|
+
UTF8}, // because this one is not intended for output (just input)
|
188
|
+
{ "X-TAM-ELANGO", "x-tam-elango",
|
189
|
+
UTF8}, // we do not have an output converter for this font encoding
|
190
|
+
{ "X-TAM-LTTMBARANI", "x-tam-lttmbarani",
|
191
|
+
UTF8}, // we do not have an output converter for this font encoding
|
192
|
+
{ "X-TAM-SHREE", "x-tam-shree",
|
193
|
+
UTF8}, // we do not have an output converter for this font encoding
|
194
|
+
{ "X-TAM-TBOOMIS", "x-tam-tboomis",
|
195
|
+
UTF8}, // we do not have an output converter for this font encoding
|
196
|
+
{ "X-TAM-TMNEWS", "x-tam-tmnews",
|
197
|
+
UTF8}, // we do not have an output converter for this font encoding
|
198
|
+
{ "X-TAM-WEBTAMIL", "x-tam-webtamil",
|
199
|
+
UTF8}, // we do not have an output converter for this font encoding
|
200
|
+
|
201
|
+
{ "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
|
202
|
+
// KDDI version of Shift_JIS with Google Emoji PUA mappings.
|
203
|
+
// Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses
|
204
|
+
// "Shift_JIS" in HTTP headers and email messages.
|
205
|
+
|
206
|
+
{ "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
|
207
|
+
// DoCoMo version of Shift_JIS with Google Emoji PUA mappings.
|
208
|
+
// See the comment at KDDI_SHIFT_JIS for other issues.
|
209
|
+
|
210
|
+
{ "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
|
211
|
+
// SoftBank version of Shift_JIS with Google Emoji PUA mappings.
|
212
|
+
// See the comment at KDDI_SHIFT_JIS for other issues.
|
213
|
+
|
214
|
+
{ "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
|
215
|
+
// KDDI version of ISO-2022-JP with Google Emoji PUA mappings.
|
216
|
+
// See the comment at KDDI_SHIFT_JIS for other issues.
|
217
|
+
// The preferred Web encoding is due to potential confusion with
|
218
|
+
// HTML syntax chars.
|
219
|
+
|
220
|
+
{ "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
|
221
|
+
// SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.
|
222
|
+
// See the comment at KDDI_SHIFT_JIS for other issues.
|
223
|
+
// The preferred Web encoding is due to potential confusion with
|
224
|
+
// HTML syntax chars.
|
225
|
+
|
226
|
+
// Please refer to NOTE: section in the comments in the definition
|
227
|
+
// of "struct I18NInfoByEncoding", before adding new encodings.
|
228
|
+
|
229
|
+
};
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,
|
234
|
+
kEncodingInfoTable_has_incorrect_size);
|
235
|
+
|
236
|
+
// Returns the library's default encoding, LATIN1.
Encoding default_encoding() {
  return LATIN1;
}
|
237
|
+
|
238
|
+
// *************************************************************
|
239
|
+
// Encoding predicates
|
240
|
+
// IsValidEncoding()
|
241
|
+
// IsEncEncCompatible
|
242
|
+
// IsEncodingWithSupportedLanguage
|
243
|
+
// IsSupersetOfAscii7Bit
|
244
|
+
// Is8BitEncoding
|
245
|
+
// IsCJKEncoding
|
246
|
+
// IsHebrewEncoding
|
247
|
+
// IsRightToLeftEncoding
|
248
|
+
// IsLogicalRightToLeftEncoding
|
249
|
+
// IsVisualRightToLeftEncoding
|
250
|
+
// IsIso2022Encoding
|
251
|
+
// IsIso2022JpOrVariant
|
252
|
+
// IsShiftJisOrVariant
|
253
|
+
// IsJapaneseCellPhoneCarrierSpecificEncoding
|
254
|
+
// *************************************************************
|
255
|
+
|
256
|
+
// Reports whether |enc| lies in the half-open range [0, kNumEncodings).
bool IsValidEncoding(Encoding enc) {
  if (enc < 0) {
    return false;
  }
  return enc < kNumEncodings;
}
|
259
|
+
|
260
|
+
bool IsEncEncCompatible(const Encoding from, const Encoding to) {
|
261
|
+
// Tests compatibility between the "from" and "to" encodings; in
|
262
|
+
// the typical case -- when both are valid known encodings -- this
|
263
|
+
// returns true iff converting from first to second is a no-op.
|
264
|
+
if (!IsValidEncoding(from) || !IsValidEncoding(to)) {
|
265
|
+
return false; // we only work with valid encodings...
|
266
|
+
} else if (to == from) {
|
267
|
+
return true; // the trivial common case
|
268
|
+
}
|
269
|
+
|
270
|
+
if (to == UNKNOWN_ENCODING) {
|
271
|
+
return true; // all valid encodings are compatible with the unknown
|
272
|
+
}
|
273
|
+
|
274
|
+
if (from == UNKNOWN_ENCODING) {
|
275
|
+
return false; // no unknown encoding is compatible with one that is
|
276
|
+
}
|
277
|
+
|
278
|
+
if (from == ASCII_7BIT) {
|
279
|
+
return IsSupersetOfAscii7Bit(to);
|
280
|
+
}
|
281
|
+
|
282
|
+
return (from == ISO_8859_1 && to == MSFT_CP1252) ||
|
283
|
+
(from == ISO_8859_8 && to == HEBREW_VISUAL) ||
|
284
|
+
(from == HEBREW_VISUAL && to == ISO_8859_8) ||
|
285
|
+
(from == ISO_8859_9 && to == MSFT_CP1254) ||
|
286
|
+
(from == ISO_8859_11 && to == MSFT_CP874) ||
|
287
|
+
(from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) ||
|
288
|
+
(from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) ||
|
289
|
+
(from == CHINESE_GB && to == GBK) ||
|
290
|
+
(from == CHINESE_GB && to == GB18030) ||
|
291
|
+
(from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) ||
|
292
|
+
(from == CHINESE_EUC_CN && to == CHINESE_CNS) ||
|
293
|
+
(from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) ||
|
294
|
+
(from == CHINESE_EUC_DEC && to == CHINESE_CNS) ||
|
295
|
+
(from == CHINESE_CNS && to == CHINESE_EUC_CN) ||
|
296
|
+
(from == CHINESE_CNS && to == CHINESE_EUC_DEC);
|
297
|
+
}
|
298
|
+
|
299
|
+
// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
|
300
|
+
// encoding represent the same characters as they do in ISO_8859_1.
|
301
|
+
|
302
|
+
// TODO: This list could be expanded. Many other encodings are supersets
|
303
|
+
// of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two
|
304
|
+
// encodings that I know for a fact should *not* be in this list.
|
305
|
+
bool IsSupersetOfAscii7Bit(Encoding e) {
|
306
|
+
switch (e) {
|
307
|
+
case ISO_8859_1:
|
308
|
+
case ISO_8859_2:
|
309
|
+
case ISO_8859_3:
|
310
|
+
case ISO_8859_4:
|
311
|
+
case ISO_8859_5:
|
312
|
+
case ISO_8859_6:
|
313
|
+
case ISO_8859_7:
|
314
|
+
case ISO_8859_8:
|
315
|
+
case ISO_8859_9:
|
316
|
+
case ISO_8859_10:
|
317
|
+
case JAPANESE_EUC_JP:
|
318
|
+
case JAPANESE_SHIFT_JIS:
|
319
|
+
case CHINESE_BIG5:
|
320
|
+
case CHINESE_GB:
|
321
|
+
case CHINESE_EUC_CN:
|
322
|
+
case KOREAN_EUC_KR:
|
323
|
+
case CHINESE_EUC_DEC:
|
324
|
+
case CHINESE_CNS:
|
325
|
+
case CHINESE_BIG5_CP950:
|
326
|
+
case JAPANESE_CP932:
|
327
|
+
case UTF8:
|
328
|
+
case UNKNOWN_ENCODING:
|
329
|
+
case ASCII_7BIT:
|
330
|
+
case RUSSIAN_KOI8_R:
|
331
|
+
case RUSSIAN_CP1251:
|
332
|
+
case MSFT_CP1252:
|
333
|
+
case RUSSIAN_KOI8_RU:
|
334
|
+
case MSFT_CP1250:
|
335
|
+
case ISO_8859_15:
|
336
|
+
case MSFT_CP1254:
|
337
|
+
case MSFT_CP1257:
|
338
|
+
case ISO_8859_11:
|
339
|
+
case MSFT_CP874:
|
340
|
+
case MSFT_CP1256:
|
341
|
+
case MSFT_CP1255:
|
342
|
+
case ISO_8859_8_I:
|
343
|
+
case HEBREW_VISUAL:
|
344
|
+
case CZECH_CP852:
|
345
|
+
case MSFT_CP1253:
|
346
|
+
case RUSSIAN_CP866:
|
347
|
+
case ISO_8859_13:
|
348
|
+
case GBK:
|
349
|
+
case GB18030:
|
350
|
+
case BIG5_HKSCS:
|
351
|
+
case MACINTOSH_ROMAN:
|
352
|
+
return true;
|
353
|
+
default:
|
354
|
+
return false;
|
355
|
+
}
|
356
|
+
}
|
357
|
+
|
358
|
+
// To be an 8-bit encoding means that there are fewer than 256 symbols.
|
359
|
+
// Each byte determines a new character; there are no multi-byte sequences.
|
360
|
+
|
361
|
+
// TODO: This list could maybe be expanded. Other encodings may be 8-bit.
|
362
|
+
bool Is8BitEncoding(Encoding e) {
|
363
|
+
switch (e) {
|
364
|
+
case ASCII_7BIT:
|
365
|
+
case ISO_8859_1:
|
366
|
+
case ISO_8859_2:
|
367
|
+
case ISO_8859_3:
|
368
|
+
case ISO_8859_4:
|
369
|
+
case ISO_8859_5:
|
370
|
+
case ISO_8859_6:
|
371
|
+
case ISO_8859_7:
|
372
|
+
case ISO_8859_8:
|
373
|
+
case ISO_8859_8_I:
|
374
|
+
case ISO_8859_9:
|
375
|
+
case ISO_8859_10:
|
376
|
+
case ISO_8859_11:
|
377
|
+
case ISO_8859_13:
|
378
|
+
case ISO_8859_15:
|
379
|
+
case MSFT_CP1252:
|
380
|
+
case MSFT_CP1253:
|
381
|
+
case MSFT_CP1254:
|
382
|
+
case MSFT_CP1255:
|
383
|
+
case MSFT_CP1256:
|
384
|
+
case MSFT_CP1257:
|
385
|
+
case RUSSIAN_KOI8_R:
|
386
|
+
case RUSSIAN_KOI8_RU:
|
387
|
+
case RUSSIAN_CP866:
|
388
|
+
return true;
|
389
|
+
default:
|
390
|
+
return false;
|
391
|
+
}
|
392
|
+
}
|
393
|
+
|
394
|
+
// Reports whether |e| is one of the Chinese, Japanese or Korean encodings
// listed in the table below.
bool IsCJKEncoding(Encoding e) {
  static const Encoding kCjkEncodings[] = {
    JAPANESE_EUC_JP,
    JAPANESE_SHIFT_JIS,
    JAPANESE_JIS,
    CHINESE_BIG5,
    CHINESE_GB,
    CHINESE_EUC_CN,
    KOREAN_EUC_KR,
    CHINESE_EUC_DEC,
    CHINESE_CNS,
    CHINESE_BIG5_CP950,
    JAPANESE_CP932,
    ISO_2022_KR,
    GBK,
    GB18030,
    BIG5_HKSCS,
    ISO_2022_CN,
    HZ_GB_2312,
  };
  for (const Encoding candidate : kCjkEncodings) {
    if (candidate == e) {
      return true;
    }
  }
  return false;
}
|
418
|
+
|
419
|
+
// Reports whether |e| is one of the four Hebrew encodings: ISO_8859_8,
// ISO_8859_8_I, MSFT_CP1255 or HEBREW_VISUAL.
bool IsHebrewEncoding(Encoding e) {
  switch (e) {
    case ISO_8859_8:
    case ISO_8859_8_I:
    case MSFT_CP1255:
    case HEBREW_VISUAL:
      return true;
    default:
      return false;
  }
}
|
425
|
+
|
426
|
+
|
427
|
+
|
428
|
+
// Reports whether |enc| is one of the listed right-to-left (Hebrew or
// Arabic) encodings.
bool IsRightToLeftEncoding(Encoding enc) {
  const bool is_arabic = (enc == MSFT_CP1256 || enc == ARABIC_ENCODING);
  const bool is_hebrew = (enc == MSFT_CP1255 ||
                          enc == HEBREW_ENCODING ||
                          enc == ISO_8859_8_I ||
                          enc == HEBREW_VISUAL);
  return is_arabic || is_hebrew;
}
|
441
|
+
|
442
|
+
// A right-to-left encoding is "logical" exactly when it is not "visual".
bool IsLogicalRightToLeftEncoding(Encoding enc) {
  if (!IsRightToLeftEncoding(enc)) {
    return false;
  }
  return !IsVisualRightToLeftEncoding(enc);
}
|
445
|
+
|
446
|
+
// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
|
447
|
+
// is NOT visual.
|
448
|
+
bool IsVisualRightToLeftEncoding(Encoding enc) {
|
449
|
+
switch (enc) {
|
450
|
+
case HEBREW_ENCODING:
|
451
|
+
case HEBREW_VISUAL:
|
452
|
+
return true;
|
453
|
+
default:
|
454
|
+
return false;
|
455
|
+
}
|
456
|
+
}
|
457
|
+
|
458
|
+
|
459
|
+
|
460
|
+
|
461
|
+
|
462
|
+
// Reports whether |enc| is ISO_2022_KR, ISO_2022_CN, or ISO-2022-JP or one
// of its variants.
bool IsIso2022Encoding(Encoding enc) {
  if (enc == ISO_2022_KR || enc == ISO_2022_CN) {
    return true;
  }
  return IsIso2022JpOrVariant(enc);
}
|
467
|
+
|
468
|
+
// Reports whether |enc| is JAPANESE_JIS or one of the carrier-specific
// ISO-2022-JP variants (KDDI, SoftBank).
bool IsIso2022JpOrVariant(Encoding enc) {
  if (enc == JAPANESE_JIS) {
    return true;
  }
  if (enc == KDDI_ISO_2022_JP) {
    return true;
  }
  return enc == SOFTBANK_ISO_2022_JP;
}
|
473
|
+
|
474
|
+
// Reports whether |enc| is Shift_JIS, CP932, or one of the carrier-specific
// Shift_JIS variants (KDDI, DoCoMo, SoftBank).
bool IsShiftJisOrVariant(Encoding enc) {
  static const Encoding kShiftJisFamily[] = {
    JAPANESE_SHIFT_JIS,
    JAPANESE_CP932,
    KDDI_SHIFT_JIS,
    DOCOMO_SHIFT_JIS,
    SOFTBANK_SHIFT_JIS,
  };
  for (const Encoding member : kShiftJisFamily) {
    if (member == enc) {
      return true;
    }
  }
  return false;
}
|
481
|
+
|
482
|
+
// Reports whether |enc| is one of the KDDI, DoCoMo or SoftBank variants of
// Shift_JIS or ISO-2022-JP.
bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {
  const bool carrier_shift_jis = (enc == KDDI_SHIFT_JIS ||
                                  enc == DOCOMO_SHIFT_JIS ||
                                  enc == SOFTBANK_SHIFT_JIS);
  const bool carrier_iso_2022_jp = (enc == KDDI_ISO_2022_JP ||
                                    enc == SOFTBANK_ISO_2022_JP);
  return carrier_shift_jis || carrier_iso_2022_jp;
}
|
489
|
+
|
490
|
+
|
491
|
+
// *************************************************************
|
492
|
+
// ENCODING NAMES
|
493
|
+
// EncodingName() [Encoding to name]
|
494
|
+
// MimeEncodingName() [Encoding to name]
|
495
|
+
// EncodingFromName() [name to Encoding]
|
496
|
+
// EncodingNameAliasToEncoding() [name to Encoding]
|
497
|
+
// default_encoding_name()
|
498
|
+
// invalid_encoding_name()
|
499
|
+
// *************************************************************
|
500
|
+
|
501
|
+
const char * EncodingName(const Encoding enc) {
|
502
|
+
if ( (enc < 0) || (enc >= kNumEncodings) )
|
503
|
+
return invalid_encoding_name();
|
504
|
+
return kEncodingInfoTable[enc].encoding_name_;
|
505
|
+
}
|
506
|
+
|
507
|
+
// TODO: Unify MimeEncodingName and EncodingName, or determine why
|
508
|
+
// such a unification is not possible.
|
509
|
+
|
510
|
+
const char * MimeEncodingName(Encoding enc) {
|
511
|
+
if ( (enc < 0) || (enc >= kNumEncodings) )
|
512
|
+
return ""; // TODO: Should this be invalid_encoding_name()?
|
513
|
+
return kEncodingInfoTable[enc].mime_encoding_name_;
|
514
|
+
}
|
515
|
+
|
516
|
+
bool EncodingFromName(const char* enc_name, Encoding *encoding) {
|
517
|
+
*encoding = UNKNOWN_ENCODING;
|
518
|
+
if ( enc_name == NULL ) return false;
|
519
|
+
|
520
|
+
for ( int i = 0; i < kNumEncodings; i++ ) {
|
521
|
+
if (!base::strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) {
|
522
|
+
*encoding = static_cast<Encoding>(i);
|
523
|
+
return true;
|
524
|
+
}
|
525
|
+
}
|
526
|
+
return false;
|
527
|
+
}
|
528
|
+
|
529
|
+
// The encoding_map maps standard and non-standard encoding-names
|
530
|
+
// (strings) to Encoding enums. It is used only by
|
531
|
+
// EncodingNameAliasToEncoding. Note that the map uses
|
532
|
+
// case-insensitive hash and comparison functions.
|
533
|
+
|
534
|
+
typedef std::unordered_map<const char *, Encoding,
|
535
|
+
CStringAlnumCaseHash,
|
536
|
+
CStringAlnumCaseEqual> EncodingMap;
|
537
|
+
|
538
|
+
static const EncodingMap& GetEncodingMap() {
|
539
|
+
static EncodingMap encoding_map;
|
540
|
+
if (!encoding_map.empty()) {
|
541
|
+
// Already initialized
|
542
|
+
return encoding_map;
|
543
|
+
}
|
544
|
+
|
545
|
+
// Initialize the map with all the "standard" encoding names,
|
546
|
+
// i.e., the ones returned by EncodingName and MimeEncodingName.
|
547
|
+
//
|
548
|
+
// First, add internal encoding names returned by EncodingName().
|
549
|
+
for (int i = 0; i < NUM_ENCODINGS; ++i) {
|
550
|
+
Encoding e = static_cast<Encoding>(i);
|
551
|
+
// Internal encoding names must be unique.
|
552
|
+
// The internal names are guaranteed to be unique by the CHECK_EQ.
|
553
|
+
const char *encoding_name = EncodingName(e);
|
554
|
+
// CHECK_EQ(0, encoding_map.count(encoding_name))
|
555
|
+
// << "Duplicate found for " << encoding_name;
|
556
|
+
encoding_map[encoding_name] = e;
|
557
|
+
}
|
558
|
+
// Then, add mime encoding names returned by MimeEncodingName().
|
559
|
+
// We don't override existing entries, to give precedence to entries
|
560
|
+
// added earlier.
|
561
|
+
for (int i = 0; i < NUM_ENCODINGS; ++i) {
|
562
|
+
Encoding e = static_cast<Encoding>(i);
|
563
|
+
// Note that MimeEncodingName() can return the same mime encoding
|
564
|
+
// name for different encoding enums like JAPANESE_SHIFT_JIS and
|
565
|
+
// KDDI_SHIFT_JIS. In that case, the encoding enum first seen
|
566
|
+
// will be the value for the encoding name in the map.
|
567
|
+
const char *mime_encoding_name = MimeEncodingName(e);
|
568
|
+
if (encoding_map.count(mime_encoding_name) == 0) {
|
569
|
+
encoding_map[mime_encoding_name] = e;
|
570
|
+
}
|
571
|
+
}
|
572
|
+
|
573
|
+
// Add some non-standard names: alternate spellings, common typos,
|
574
|
+
// etc. (It does no harm to add names already in the map.) Note
|
575
|
+
// that although the map is case-insensitive, by convention the
|
576
|
+
// keys are written here in lower case. For ease of maintenance,
|
577
|
+
// they are listed in alphabetical order.
|
578
|
+
encoding_map["5601"] = KOREAN_EUC_KR;
|
579
|
+
encoding_map["646"] = ASCII_7BIT;
|
580
|
+
encoding_map["852"] = CZECH_CP852;
|
581
|
+
encoding_map["866"] = RUSSIAN_CP866;
|
582
|
+
encoding_map["8859-1"] = ISO_8859_1;
|
583
|
+
encoding_map["ansi-1251"] = RUSSIAN_CP1251;
|
584
|
+
encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;
|
585
|
+
encoding_map["arabic"] = ISO_8859_6;
|
586
|
+
encoding_map["ascii"] = ISO_8859_1;
|
587
|
+
encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard
|
588
|
+
encoding_map["asmo-708"] = ISO_8859_6;
|
589
|
+
encoding_map["bhaskar"] = BHASKAR;
|
590
|
+
encoding_map["big5"] = CHINESE_BIG5;
|
591
|
+
encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard
|
592
|
+
encoding_map["big5-hkscs"] = BIG5_HKSCS;
|
593
|
+
encoding_map["chinese"] = CHINESE_GB;
|
594
|
+
encoding_map["cns"] = CHINESE_CNS; // not iana standard
|
595
|
+
encoding_map["cns11643"] = CHINESE_CNS;
|
596
|
+
encoding_map["cp1250"] = MSFT_CP1250; // not iana standard
|
597
|
+
encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard
|
598
|
+
encoding_map["cp1252"] = MSFT_CP1252; // not iana standard
|
599
|
+
encoding_map["cp1253"] = MSFT_CP1253; // not iana standard
|
600
|
+
encoding_map["cp1254"] = MSFT_CP1254; // not iana standard
|
601
|
+
encoding_map["cp1255"] = MSFT_CP1255;
|
602
|
+
encoding_map["cp1256"] = MSFT_CP1256;
|
603
|
+
encoding_map["cp1257"] = MSFT_CP1257; // not iana standard
|
604
|
+
encoding_map["cp819"] = ISO_8859_1;
|
605
|
+
encoding_map["cp852"] = CZECH_CP852;
|
606
|
+
encoding_map["cp866"] = RUSSIAN_CP866;
|
607
|
+
encoding_map["cp-866"] = RUSSIAN_CP866;
|
608
|
+
encoding_map["cp874"] = MSFT_CP874;
|
609
|
+
encoding_map["cp932"] = JAPANESE_CP932; // not iana standard
|
610
|
+
encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard
|
611
|
+
encoding_map["csbig5"] = CHINESE_BIG5;
|
612
|
+
encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;
|
613
|
+
encoding_map["cseuckr"] = KOREAN_EUC_KR;
|
614
|
+
encoding_map["csgb2312"] = CHINESE_GB;
|
615
|
+
encoding_map["csibm852"] = CZECH_CP852;
|
616
|
+
encoding_map["csibm866"] = RUSSIAN_CP866;
|
617
|
+
encoding_map["csiso2022jp"] = JAPANESE_JIS;
|
618
|
+
encoding_map["csiso2022kr"] = ISO_2022_KR;
|
619
|
+
encoding_map["csiso58gb231280"] = CHINESE_GB;
|
620
|
+
encoding_map["csiso88598i"] = ISO_8859_8_I;
|
621
|
+
encoding_map["csisolatin1"] = ISO_8859_1;
|
622
|
+
encoding_map["csisolatin2"] = ISO_8859_2;
|
623
|
+
encoding_map["csisolatin3"] = ISO_8859_3;
|
624
|
+
encoding_map["csisolatin4"] = ISO_8859_4;
|
625
|
+
encoding_map["csisolatin5"] = ISO_8859_9;
|
626
|
+
encoding_map["csisolatin6"] = ISO_8859_10;
|
627
|
+
encoding_map["csisolatinarabic"] = ISO_8859_6;
|
628
|
+
encoding_map["csisolatincyrillic"] = ISO_8859_5;
|
629
|
+
encoding_map["csisolatingreek"] = ISO_8859_7;
|
630
|
+
encoding_map["csisolatinhebrew"] = ISO_8859_8;
|
631
|
+
encoding_map["csksc56011987"] = KOREAN_EUC_KR;
|
632
|
+
encoding_map["csmacintosh"] = MACINTOSH_ROMAN;
|
633
|
+
encoding_map["csn-369103"] = CZECH_CSN_369103;
|
634
|
+
encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;
|
635
|
+
encoding_map["csunicode"] = UTF16BE;
|
636
|
+
encoding_map["csunicode11"] = UTF16BE;
|
637
|
+
encoding_map["csunicode11utf7"] = UTF7;
|
638
|
+
encoding_map["csunicodeascii"] = UTF16BE;
|
639
|
+
encoding_map["csunicodelatin1"] = UTF16BE;
|
640
|
+
encoding_map["cyrillic"] = ISO_8859_5;
|
641
|
+
encoding_map["ecma-114"] = ISO_8859_6;
|
642
|
+
encoding_map["ecma-118"] = ISO_8859_7;
|
643
|
+
encoding_map["elot_928"] = ISO_8859_7;
|
644
|
+
encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard
|
645
|
+
encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard
|
646
|
+
encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard
|
647
|
+
encoding_map["euc-jp"] = JAPANESE_EUC_JP;
|
648
|
+
encoding_map["euc-kr"] = KOREAN_EUC_KR;
|
649
|
+
encoding_map["eucgb2312_cn"] = CHINESE_GB;
|
650
|
+
encoding_map["gb"] = CHINESE_GB; // not iana standard
|
651
|
+
encoding_map["gb18030"] = GB18030;
|
652
|
+
encoding_map["gb2132"] = CHINESE_GB; // common typo
|
653
|
+
encoding_map["gb2312"] = CHINESE_GB;
|
654
|
+
encoding_map["gb_2312-80"] = CHINESE_GB;
|
655
|
+
encoding_map["gbk"] = GBK;
|
656
|
+
encoding_map["greek"] = ISO_8859_7;
|
657
|
+
encoding_map["greek8"] = ISO_8859_7;
|
658
|
+
encoding_map["hebrew"] = ISO_8859_8;
|
659
|
+
encoding_map["htchanakya"] = HTCHANAKYA;
|
660
|
+
encoding_map["hz-gb-2312"] = HZ_GB_2312;
|
661
|
+
encoding_map["ibm819"] = ISO_8859_1;
|
662
|
+
encoding_map["ibm852"] = CZECH_CP852;
|
663
|
+
encoding_map["ibm874"] = MSFT_CP874;
|
664
|
+
encoding_map["iso-10646"] = UTF16BE;
|
665
|
+
encoding_map["iso-10646-j-1"] = UTF16BE;
|
666
|
+
encoding_map["iso-10646-ucs-2"] = UNICODE;
|
667
|
+
encoding_map["iso-10646-ucs-4"] = UTF32BE;
|
668
|
+
encoding_map["iso-10646-ucs-basic"] = UTF16BE;
|
669
|
+
encoding_map["iso-10646-unicode-latin1"] = UTF16BE;
|
670
|
+
encoding_map["iso-2022-cn"] = ISO_2022_CN;
|
671
|
+
encoding_map["iso-2022-jp"] = JAPANESE_JIS;
|
672
|
+
encoding_map["iso-2022-kr"] = ISO_2022_KR;
|
673
|
+
encoding_map["iso-8559-1"] = ISO_8859_1; // common typo
|
674
|
+
encoding_map["iso-874"] = MSFT_CP874;
|
675
|
+
encoding_map["iso-8858-1"] = ISO_8859_1; // common typo
|
676
|
+
// iso-8859-0 was a temporary name, eventually renamed iso-8859-15
|
677
|
+
encoding_map["iso-8859-0"] = ISO_8859_15;
|
678
|
+
encoding_map["iso-8859-1"] = ISO_8859_1;
|
679
|
+
encoding_map["iso-8859-10"] = ISO_8859_10;
|
680
|
+
encoding_map["iso-8859-11"] = ISO_8859_11;
|
681
|
+
encoding_map["iso-8859-13"] = ISO_8859_13;
|
682
|
+
encoding_map["iso-8859-15"] = ISO_8859_15;
|
683
|
+
encoding_map["iso-8859-2"] = ISO_8859_2;
|
684
|
+
encoding_map["iso-8859-3"] = ISO_8859_3;
|
685
|
+
encoding_map["iso-8859-4"] = ISO_8859_4;
|
686
|
+
encoding_map["iso-8859-5"] = ISO_8859_5;
|
687
|
+
encoding_map["iso-8859-6"] = ISO_8859_6;
|
688
|
+
encoding_map["iso-8859-7"] = ISO_8859_7;
|
689
|
+
encoding_map["iso-8859-8"] = ISO_8859_8;
|
690
|
+
encoding_map["iso-8859-8-i"] = ISO_8859_8_I;
|
691
|
+
encoding_map["iso-8859-9"] = ISO_8859_9;
|
692
|
+
encoding_map["iso-9959-1"] = ISO_8859_1; // common typo
|
693
|
+
encoding_map["iso-ir-100"] = ISO_8859_1;
|
694
|
+
encoding_map["iso-ir-101"] = ISO_8859_2;
|
695
|
+
encoding_map["iso-ir-109"] = ISO_8859_3;
|
696
|
+
encoding_map["iso-ir-110"] = ISO_8859_4;
|
697
|
+
encoding_map["iso-ir-126"] = ISO_8859_7;
|
698
|
+
encoding_map["iso-ir-127"] = ISO_8859_6;
|
699
|
+
encoding_map["iso-ir-138"] = ISO_8859_8;
|
700
|
+
encoding_map["iso-ir-144"] = ISO_8859_5;
|
701
|
+
encoding_map["iso-ir-148"] = ISO_8859_9;
|
702
|
+
encoding_map["iso-ir-149"] = KOREAN_EUC_KR;
|
703
|
+
encoding_map["iso-ir-157"] = ISO_8859_10;
|
704
|
+
encoding_map["iso-ir-58"] = CHINESE_GB;
|
705
|
+
encoding_map["iso-latin-1"] = ISO_8859_1;
|
706
|
+
encoding_map["iso_2022-cn"] = ISO_2022_CN;
|
707
|
+
encoding_map["iso_2022-kr"] = ISO_2022_KR;
|
708
|
+
encoding_map["iso_8859-1"] = ISO_8859_1;
|
709
|
+
encoding_map["iso_8859-10:1992"] = ISO_8859_10;
|
710
|
+
encoding_map["iso_8859-11"] = ISO_8859_11;
|
711
|
+
encoding_map["iso_8859-13"] = ISO_8859_13;
|
712
|
+
encoding_map["iso_8859-15"] = ISO_8859_15;
|
713
|
+
encoding_map["iso_8859-1:1987"] = ISO_8859_1;
|
714
|
+
encoding_map["iso_8859-2"] = ISO_8859_2;
|
715
|
+
encoding_map["iso_8859-2:1987"] = ISO_8859_2;
|
716
|
+
encoding_map["iso_8859-3"] = ISO_8859_3;
|
717
|
+
encoding_map["iso_8859-3:1988"] = ISO_8859_3;
|
718
|
+
encoding_map["iso_8859-4"] = ISO_8859_4;
|
719
|
+
encoding_map["iso_8859-4:1988"] = ISO_8859_4;
|
720
|
+
encoding_map["iso_8859-5"] = ISO_8859_5;
|
721
|
+
encoding_map["iso_8859-5:1988"] = ISO_8859_5;
|
722
|
+
encoding_map["iso_8859-6"] = ISO_8859_6;
|
723
|
+
encoding_map["iso_8859-6:1987"] = ISO_8859_6;
|
724
|
+
encoding_map["iso_8859-7"] = ISO_8859_7;
|
725
|
+
encoding_map["iso_8859-7:1987"] = ISO_8859_7;
|
726
|
+
encoding_map["iso_8859-8"] = ISO_8859_8;
|
727
|
+
encoding_map["iso_8859-8:1988:"] = ISO_8859_8;
|
728
|
+
encoding_map["iso_8859-9"] = ISO_8859_9;
|
729
|
+
encoding_map["iso_8859-9:1989"] = ISO_8859_9;
|
730
|
+
encoding_map["jagran"] = JAGRAN;
|
731
|
+
encoding_map["jis"] = JAPANESE_JIS; // not iana standard
|
732
|
+
encoding_map["koi8-cs"] = CZECH_CSN_369103;
|
733
|
+
encoding_map["koi8-r"] = RUSSIAN_KOI8_R;
|
734
|
+
encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard
|
735
|
+
encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;
|
736
|
+
encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard
|
737
|
+
encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard
|
738
|
+
encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant
|
739
|
+
encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard
|
740
|
+
encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard
|
741
|
+
encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;
|
742
|
+
encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard
|
743
|
+
encoding_map["l1"] = ISO_8859_1;
|
744
|
+
encoding_map["l2"] = ISO_8859_2;
|
745
|
+
encoding_map["l3"] = ISO_8859_3;
|
746
|
+
encoding_map["l4"] = ISO_8859_4;
|
747
|
+
encoding_map["l5"] = ISO_8859_9;
|
748
|
+
encoding_map["l6"] = ISO_8859_10;
|
749
|
+
encoding_map["latin-1"] = ISO_8859_1; // not iana standard
|
750
|
+
encoding_map["latin1"] = ISO_8859_1;
|
751
|
+
encoding_map["latin2"] = ISO_8859_2;
|
752
|
+
encoding_map["latin3"] = ISO_8859_3;
|
753
|
+
encoding_map["latin4"] = ISO_8859_4;
|
754
|
+
encoding_map["latin5"] = ISO_8859_9;
|
755
|
+
encoding_map["latin6"] = ISO_8859_10;
|
756
|
+
encoding_map["mac"] = MACINTOSH_ROMAN;
|
757
|
+
encoding_map["macintosh"] = MACINTOSH_ROMAN;
|
758
|
+
encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;
|
759
|
+
encoding_map["ms932"] = JAPANESE_CP932; // not iana standard
|
760
|
+
encoding_map["ms_kanji"] = JAPANESE_CP932;
|
761
|
+
encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;
|
762
|
+
encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;
|
763
|
+
encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard
|
764
|
+
encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard
|
765
|
+
encoding_map["sun_eu_greek"] = ISO_8859_7;
|
766
|
+
encoding_map["tab"] = TAMIL_BI;
|
767
|
+
encoding_map["tam"] = TAMIL_MONO;
|
768
|
+
encoding_map["tis-620"] = ISO_8859_11;
|
769
|
+
encoding_map["tscii"] = TSCII;
|
770
|
+
encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard
|
771
|
+
encoding_map["unicode"] = UNICODE; // not iana standard
|
772
|
+
encoding_map["unicode-1-1-utf-7"] = UTF7;
|
773
|
+
encoding_map["unicode-1-1-utf-8"] = UTF8;
|
774
|
+
encoding_map["unicode-2-0-utf-7"] = UTF7;
|
775
|
+
encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard
|
776
|
+
encoding_map["us"] = ISO_8859_1;
|
777
|
+
encoding_map["us-ascii"] = ISO_8859_1;
|
778
|
+
encoding_map["utf-16be"] = UTF16BE;
|
779
|
+
encoding_map["utf-16le"] = UTF16LE;
|
780
|
+
encoding_map["utf-32be"] = UTF32BE;
|
781
|
+
encoding_map["utf-32le"] = UTF32LE;
|
782
|
+
encoding_map["utf-7"] = UTF7;
|
783
|
+
encoding_map["utf-8"] = UTF8;
|
784
|
+
encoding_map["utf7"] = UTF7;
|
785
|
+
encoding_map["utf8"] = UTF8; // not iana standard
|
786
|
+
encoding_map["visual"] = HEBREW_VISUAL;
|
787
|
+
encoding_map["win-1250"] = MSFT_CP1250; // not iana standard
|
788
|
+
encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard
|
789
|
+
encoding_map["window-874"] = MSFT_CP874;
|
790
|
+
encoding_map["windows-1250"] = MSFT_CP1250;
|
791
|
+
encoding_map["windows-1251"] = RUSSIAN_CP1251;
|
792
|
+
encoding_map["windows-1252"] = MSFT_CP1252;
|
793
|
+
encoding_map["windows-1253"] = MSFT_CP1253;
|
794
|
+
encoding_map["windows-1254"] = MSFT_CP1254;
|
795
|
+
encoding_map["windows-1255"] = MSFT_CP1255;
|
796
|
+
encoding_map["windows-1256"] = MSFT_CP1256;
|
797
|
+
encoding_map["windows-1257"] = MSFT_CP1257;
|
798
|
+
encoding_map["windows-31j"] = JAPANESE_CP932;
|
799
|
+
encoding_map["windows-874"] = MSFT_CP874;
|
800
|
+
encoding_map["windows-936"] = GBK;
|
801
|
+
encoding_map["x-big5"] = CHINESE_BIG5;
|
802
|
+
encoding_map["x-binaryenc"] = BINARYENC; // not iana standard
|
803
|
+
encoding_map["x-cp1250"] = MSFT_CP1250;
|
804
|
+
encoding_map["x-cp1251"] = RUSSIAN_CP1251;
|
805
|
+
encoding_map["x-cp1252"] = MSFT_CP1252;
|
806
|
+
encoding_map["x-cp1253"] = MSFT_CP1253;
|
807
|
+
encoding_map["x-cp1254"] = MSFT_CP1254;
|
808
|
+
encoding_map["x-cp1255"] = MSFT_CP1255;
|
809
|
+
encoding_map["x-cp1256"] = MSFT_CP1256;
|
810
|
+
encoding_map["x-cp1257"] = MSFT_CP1257;
|
811
|
+
encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;
|
812
|
+
encoding_map["x-euc-tw"] = CHINESE_CNS;
|
813
|
+
encoding_map["x-gbk"] = GBK;
|
814
|
+
encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;
|
815
|
+
encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;
|
816
|
+
encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;
|
817
|
+
encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;
|
818
|
+
encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard
|
819
|
+
encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;
|
820
|
+
encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard
|
821
|
+
encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;
|
822
|
+
encoding_map["x-unicode-2-0-utf-7"] = UTF7;
|
823
|
+
encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard
|
824
|
+
encoding_map["x-x-big5"] = CHINESE_BIG5;
|
825
|
+
encoding_map["zh_cn.euc"] = CHINESE_GB;
|
826
|
+
encoding_map["zh_tw-big5"] = CHINESE_BIG5;
|
827
|
+
encoding_map["zh_tw-euc"] = CHINESE_CNS;
|
828
|
+
|
829
|
+
// Remove the entry for the empty string, if any.
|
830
|
+
encoding_map.erase("");
|
831
|
+
|
832
|
+
return encoding_map;
|
833
|
+
}
|
834
|
+
|
835
|
+
// ----------------------------------------------------------------------
|
836
|
+
// EncodingNameAliasToEncoding()
|
837
|
+
//
|
838
|
+
// This function takes an encoding name/alias and returns the Encoding
|
839
|
+
// enum. The input is case insensitive. It is the union of the common
|
840
|
+
// IANA standard names, the charset names used in Netscape Navigator,
|
841
|
+
// and some common names we have been using.
|
842
|
+
// See: http://www.iana.org/assignments/character-sets
|
843
|
+
// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
|
844
|
+
//
|
845
|
+
// UNKNOWN_ENCODING is returned if none matches.
|
846
|
+
//
|
847
|
+
// TODO: Check if it is possible to remove the non-standard,
|
848
|
+
// non-netscape-use names. It is because this routine is used for
|
849
|
+
// encoding detections from html meta info. Non-standard names may
|
850
|
+
// introduce noise on encoding detection.
|
851
|
+
//
|
852
|
+
// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
|
853
|
+
// or determine why such a unification is not possible.
|
854
|
+
// ----------------------------------------------------------------------
|
855
|
+
Encoding EncodingNameAliasToEncoding(const char *encoding_name) {
  // A null name can never match an alias, so bail out before touching the map.
  if (!encoding_name) return UNKNOWN_ENCODING;

  // Look the name up in the alias table returned by GetEncodingMap().
  const EncodingMap& alias_table = GetEncodingMap();
  const EncodingMap::const_iterator match = alias_table.find(encoding_name);

  // No entry for this name means the alias is not recognized.
  return (match == alias_table.end()) ? UNKNOWN_ENCODING : match->second;
}
|
869
|
+
|
870
|
+
const char* default_encoding_name() {
|
871
|
+
return kEncodingInfoTable[LATIN1].encoding_name_;
|
872
|
+
}
|
873
|
+
|
874
|
+
// Fixed name handed back by invalid_encoding_name().
static const char* const kInvalidEncodingName = "invalid_encoding";

// Returns the constant string "invalid_encoding". Every call yields the
// same pointer, since it is read from a file-local constant.
const char *invalid_encoding_name() {
  const char* const invalid_name = kInvalidEncodingName;
  return invalid_name;
}
|
879
|
+
|
880
|
+
|
881
|
+
|
882
|
+
// *************************************************************
|
883
|
+
// Miscellany
|
884
|
+
// *************************************************************
|
885
|
+
|
886
|
+
|
887
|
+
// Maps an encoding to the preferred_web_output_encoding_ recorded for it in
// kEncodingInfoTable. Any value rejected by IsValidEncoding() maps to UTF8.
Encoding PreferredWebOutputEncoding(Encoding enc) {
  // Reject invalid values first so the table is only indexed with an
  // encoding that IsValidEncoding() has accepted.
  if (!IsValidEncoding(enc)) {
    return UTF8;
  }
  return kEncodingInfoTable[enc].preferred_web_output_encoding_;
}
|