compact_enc_det 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,891 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include "util/encodings/encodings.h"
18
+
19
+ #include <string.h> // for strcasecmp
20
+ #include <unordered_map>
21
+ #include <utility> // for pair
22
+
23
+ #include "util/basictypes.h"
24
+ #include "util/string_util.h"
25
+ #include "util/case_insensitive_hash.h"
26
+
27
+ struct EncodingInfo {
28
+ // The standard name for this encoding.
29
+ //
30
+ const char* encoding_name_;
31
+
32
+ // The "preferred MIME name" of an encoding as specified by the IANA at:
33
+ // http://www.iana.org/assignments/character-sets
34
+ //
35
+ // Note that the preferred MIME name may differ slightly from the
36
+ // official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987
37
+ //
38
+ const char* mime_encoding_name_;
39
+
40
+ // It is an internal policy that if an encoding has an IANA name,
41
+ // then encoding_name_ and mime_encoding_name_ must be the same string.
42
+ //
43
+ // However, there can be exceptions if there are compelling reasons.
44
+ // For example, Japanese mobile handsets require the name
45
+ // "Shift_JIS" in charset=... parameter in Content-Type headers to
46
+ // process emoji (emoticons) in their private encodings. In that
47
+ // case, mime_encoding_name_ should be "Shift_JIS", despite
48
+ // encoding_name_ actually is "X-KDDI-Shift_JIS".
49
+
50
+ // Some multi-byte encodings use byte values that coincide with the
51
+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
52
+ // can misinterpret these, as indicated in an external XSS report from
53
+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
54
+ // also use UTF8 instead of encodings that we don't support in our
55
+ // output, and we generally try to be conservative in what we send out.
56
+ // Where the client asks for single- or double-byte encodings that are
57
+ // not as common, we substitute a more common single- or double-byte
58
+ // encoding, if there is one, thereby preserving the client's intent
59
+ // to use less space than UTF-8. This also means that characters
60
+ // outside the destination set will be converted to HTML NCRs (&#NNN;)
61
+ // if requested.
62
+
63
+ Encoding preferred_web_output_encoding_;
64
+ };
65
+
66
+ static const EncodingInfo kEncodingInfoTable[] = {
67
+ { "ASCII", "ISO-8859-1", ISO_8859_1},
68
+ { "Latin2", "ISO-8859-2", ISO_8859_2},
69
+ { "Latin3", "ISO-8859-3", UTF8},
70
+ // MSIE 6 does not support ISO-8859-3 (XSS issue)
71
+ { "Latin4", "ISO-8859-4", ISO_8859_4},
72
+ { "ISO-8859-5", "ISO-8859-5", ISO_8859_5},
73
+ { "Arabic", "ISO-8859-6", ISO_8859_6},
74
+ { "Greek", "ISO-8859-7", ISO_8859_7},
75
+ { "Hebrew", "ISO-8859-8", MSFT_CP1255},
76
+ // we do not endorse the visual order
77
+ { "Latin5", "ISO-8859-9", ISO_8859_9},
78
+ { "Latin6", "ISO-8859-10", UTF8},
79
+ // MSIE does not support ISO-8859-10 (XSS issue)
80
+ { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP},
81
+ { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},
82
+ { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
83
+ // due to potential confusion with HTML syntax chars
84
+ { "BIG5", "Big5", CHINESE_BIG5},
85
+ { "GB", "GB2312", CHINESE_GB},
86
+ { "EUC-CN",
87
+ "EUC-CN",
88
+ // Misnamed. Should be EUC-TW.
89
+ CHINESE_BIG5},
90
+ // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,
91
+ // and EUC-TW is rare, so we prefer Big5 for output.
92
+ { "KSC", "EUC-KR", KOREAN_EUC_KR},
93
+ { "Unicode",
94
+ "UTF-16LE",
95
+ // Internet Explorer doesn't recognize "ISO-10646-UCS-2"
96
+ UTF8
97
+ // due to potential confusion with HTML syntax chars
98
+ },
99
+ { "EUC",
100
+ "EUC", // Misnamed. Should be EUC-TW.
101
+ CHINESE_BIG5
102
+ // MSIE does not recognize "EUC" (XSS issue),
103
+ // and EUC-TW is rare, so we prefer Big5 for output.
104
+ },
105
+ { "CNS",
106
+ "CNS", // Misnamed. Should be EUC-TW.
107
+ CHINESE_BIG5},
108
+ // MSIE does not recognize "CNS" (XSS issue),
109
+ // and EUC-TW is rare, so we prefer Big5 for output.
110
+ { "BIG5-CP950",
111
+ "BIG5-CP950", // Not an IANA name
112
+ CHINESE_BIG5
113
+ // MSIE does not recognize "BIG5-CP950" (XSS issue)
114
+ },
115
+ { "CP932", "CP932", // Not an IANA name
116
+ JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue)
117
+ { "UTF8", "UTF-8", UTF8},
118
+ { "Unknown",
119
+ "x-unknown", // Not an IANA name
120
+ UTF8}, // UTF-8 is our default output encoding
121
+ { "ASCII-7-bit", "US-ASCII", ASCII_7BIT},
122
+ { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},
123
+ { "CP1251", "windows-1251", RUSSIAN_CP1251},
124
+ { "CP1252", "windows-1252", MSFT_CP1252},
125
+ { "KOI8U",
126
+ "KOI8-U",
127
+ ISO_8859_5}, // because koi8-u is not as common
128
+ { "CP1250", "windows-1250", MSFT_CP1250},
129
+ { "ISO-8859-15", "ISO-8859-15", ISO_8859_15},
130
+ { "CP1254", "windows-1254", MSFT_CP1254},
131
+ { "CP1257", "windows-1257", MSFT_CP1257},
132
+ { "ISO-8859-11", "ISO-8859-11", ISO_8859_11},
133
+ { "CP874", "windows-874", MSFT_CP874},
134
+ { "CP1256", "windows-1256", MSFT_CP1256},
135
+ { "CP1255", "windows-1255", MSFT_CP1255},
136
+ { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},
137
+ // Java does not support iso-8859-8-i
138
+ { "VISUAL", "ISO-8859-8", MSFT_CP1255},
139
+ // we do not endorse the visual order
140
+ { "CP852", "cp852", MSFT_CP1250},
141
+ // because cp852 is not as common
142
+ { "CSN_369103", "csn_369103", MSFT_CP1250},
143
+ // MSIE does not recognize "csn_369103" (XSS issue)
144
+ { "CP1253", "windows-1253", MSFT_CP1253},
145
+ { "CP866", "IBM866", RUSSIAN_CP1251},
146
+ // because cp866 is not as common
147
+ { "ISO-8859-13", "ISO-8859-13", UTF8},
148
+ // because iso-8859-13 is not widely supported
149
+ { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},
150
+ // due to potential confusion with HTML syntax chars
151
+ { "GBK", "GBK", GBK},
152
+ { "GB18030", "GB18030", GBK},
153
+ // because gb18030 is not widely supported
154
+ { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},
155
+ // because Big5-HKSCS is not widely supported
156
+ { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},
157
+ // due to potential confusion with HTML syntax chars
158
+ { "TSCII", "tscii", UTF8},
159
+ // we do not have an output converter for this font encoding
160
+ { "TAM", "tam", UTF8},
161
+ // we do not have an output converter for this font encoding
162
+ { "TAB", "tab", UTF8},
163
+ // we do not have an output converter for this font encoding
164
+ { "JAGRAN", "jagran", UTF8},
165
+ // we do not have an output converter for this font encoding
166
+ { "MACINTOSH", "MACINTOSH", ISO_8859_1},
167
+ // because macintosh is relatively uncommon
168
+ { "UTF7", "UTF-7",
169
+ UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated
170
+ { "BHASKAR", "bhaskar",
171
+ UTF8}, // we do not have an output converter for this font encoding
172
+ { "HTCHANAKYA", "htchanakya", // not an IANA charset name.
173
+ UTF8}, // we do not have an output converter for this font encoding
174
+ { "UTF-16BE", "UTF-16BE",
175
+ UTF8}, // due to potential confusion with HTML syntax chars
176
+ { "UTF-16LE", "UTF-16LE",
177
+ UTF8}, // due to potential confusion with HTML syntax chars
178
+ { "UTF-32BE", "UTF-32BE",
179
+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
180
+ { "UTF-32LE", "UTF-32LE",
181
+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
182
+ { "X-BINARYENC", "x-binaryenc", // Not an IANA name
183
+ UTF8}, // because this one is not intended for output (just input)
184
+ { "HZ-GB-2312", "HZ-GB-2312",
185
+ CHINESE_GB}, // due to potential confusion with HTML syntax chars
186
+ { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name
187
+ UTF8}, // because this one is not intended for output (just input)
188
+ { "X-TAM-ELANGO", "x-tam-elango",
189
+ UTF8}, // we do not have an output converter for this font encoding
190
+ { "X-TAM-LTTMBARANI", "x-tam-lttmbarani",
191
+ UTF8}, // we do not have an output converter for this font encoding
192
+ { "X-TAM-SHREE", "x-tam-shree",
193
+ UTF8}, // we do not have an output converter for this font encoding
194
+ { "X-TAM-TBOOMIS", "x-tam-tboomis",
195
+ UTF8}, // we do not have an output converter for this font encoding
196
+ { "X-TAM-TMNEWS", "x-tam-tmnews",
197
+ UTF8}, // we do not have an output converter for this font encoding
198
+ { "X-TAM-WEBTAMIL", "x-tam-webtamil",
199
+ UTF8}, // we do not have an output converter for this font encoding
200
+
201
+ { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
202
+ // KDDI version of Shift_JIS with Google Emoji PUA mappings.
203
+ // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses
204
+ // "Shift_JIS" in HTTP headers and email messages.
205
+
206
+ { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
207
+ // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.
208
+ // See the comment at KDDI_SHIFT_JIS for other issues.
209
+
210
+ { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
211
+ // SoftBank version of Shift_JIS with Google Emoji PUA mappings.
212
+ // See the comment at KDDI_SHIFT_JIS for other issues.
213
+
214
+ { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
215
+ // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.
216
+ // See the comment at KDDI_SHIFT_JIS for other issues.
217
+ // The preferred Web encoding is due to potential confusion with
218
+ // HTML syntax chars.
219
+
220
+ { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
221
+ // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.
222
+ // See the comment at KDDI_SHIFT_JIS for other issues.
223
+ // The preferred Web encoding is due to potential confusion with
224
+ // HTML syntax chars.
225
+
226
+ // Please refer to NOTE: section in the comments in the definition
227
+ // of "struct I18NInfoByEncoding", before adding new encodings.
228
+
229
+ };
230
+
231
+
232
+
233
+ COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,
234
+ kEncodingInfoTable_has_incorrect_size);
235
+
236
+ Encoding default_encoding() {return LATIN1;}
237
+
238
+ // *************************************************************
239
+ // Encoding predicates
240
+ // IsValidEncoding()
241
+ // IsEncEncCompatible
242
+ // IsEncodingWithSupportedLanguage
243
+ // IsSupersetOfAscii7Bit
244
+ // Is8BitEncoding
245
+ // IsCJKEncoding
246
+ // IsHebrewEncoding
247
+ // IsRightToLeftEncoding
248
+ // IsLogicalRightToLeftEncoding
249
+ // IsVisualRightToLeftEncoding
250
+ // IsIso2022Encoding
251
+ // IsIso2022JpOrVariant
252
+ // IsShiftJisOrVariant
253
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
254
+ // *************************************************************
255
+
256
+ bool IsValidEncoding(Encoding enc) {
257
+ return ((enc >= 0) && (enc < kNumEncodings));
258
+ }
259
+
260
+ bool IsEncEncCompatible(const Encoding from, const Encoding to) {
261
+ // Tests compatibility between the "from" and "to" encodings; in
262
+ // the typical case -- when both are valid known encodings -- this
263
+ // returns true iff converting from first to second is a no-op.
264
+ if (!IsValidEncoding(from) || !IsValidEncoding(to)) {
265
+ return false; // we only work with valid encodings...
266
+ } else if (to == from) {
267
+ return true; // the trivial common case
268
+ }
269
+
270
+ if (to == UNKNOWN_ENCODING) {
271
+ return true; // all valid encodings are compatible with the unknown
272
+ }
273
+
274
+ if (from == UNKNOWN_ENCODING) {
275
+ return false; // no unknown encoding is compatible with one that is
276
+ }
277
+
278
+ if (from == ASCII_7BIT) {
279
+ return IsSupersetOfAscii7Bit(to);
280
+ }
281
+
282
+ return (from == ISO_8859_1 && to == MSFT_CP1252) ||
283
+ (from == ISO_8859_8 && to == HEBREW_VISUAL) ||
284
+ (from == HEBREW_VISUAL && to == ISO_8859_8) ||
285
+ (from == ISO_8859_9 && to == MSFT_CP1254) ||
286
+ (from == ISO_8859_11 && to == MSFT_CP874) ||
287
+ (from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) ||
288
+ (from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) ||
289
+ (from == CHINESE_GB && to == GBK) ||
290
+ (from == CHINESE_GB && to == GB18030) ||
291
+ (from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) ||
292
+ (from == CHINESE_EUC_CN && to == CHINESE_CNS) ||
293
+ (from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) ||
294
+ (from == CHINESE_EUC_DEC && to == CHINESE_CNS) ||
295
+ (from == CHINESE_CNS && to == CHINESE_EUC_CN) ||
296
+ (from == CHINESE_CNS && to == CHINESE_EUC_DEC);
297
+ }
298
+
299
+ // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
300
+ // encoding represent the same characters as they do in ISO_8859_1.
301
+
302
+ // TODO: This list could be expanded. Many other encodings are supersets
303
+ // of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two
304
+ // encodings that I know for a fact should *not* be in this list.
305
+ bool IsSupersetOfAscii7Bit(Encoding e) {
306
+ switch (e) {
307
+ case ISO_8859_1:
308
+ case ISO_8859_2:
309
+ case ISO_8859_3:
310
+ case ISO_8859_4:
311
+ case ISO_8859_5:
312
+ case ISO_8859_6:
313
+ case ISO_8859_7:
314
+ case ISO_8859_8:
315
+ case ISO_8859_9:
316
+ case ISO_8859_10:
317
+ case JAPANESE_EUC_JP:
318
+ case JAPANESE_SHIFT_JIS:
319
+ case CHINESE_BIG5:
320
+ case CHINESE_GB:
321
+ case CHINESE_EUC_CN:
322
+ case KOREAN_EUC_KR:
323
+ case CHINESE_EUC_DEC:
324
+ case CHINESE_CNS:
325
+ case CHINESE_BIG5_CP950:
326
+ case JAPANESE_CP932:
327
+ case UTF8:
328
+ case UNKNOWN_ENCODING:
329
+ case ASCII_7BIT:
330
+ case RUSSIAN_KOI8_R:
331
+ case RUSSIAN_CP1251:
332
+ case MSFT_CP1252:
333
+ case RUSSIAN_KOI8_RU:
334
+ case MSFT_CP1250:
335
+ case ISO_8859_15:
336
+ case MSFT_CP1254:
337
+ case MSFT_CP1257:
338
+ case ISO_8859_11:
339
+ case MSFT_CP874:
340
+ case MSFT_CP1256:
341
+ case MSFT_CP1255:
342
+ case ISO_8859_8_I:
343
+ case HEBREW_VISUAL:
344
+ case CZECH_CP852:
345
+ case MSFT_CP1253:
346
+ case RUSSIAN_CP866:
347
+ case ISO_8859_13:
348
+ case GBK:
349
+ case GB18030:
350
+ case BIG5_HKSCS:
351
+ case MACINTOSH_ROMAN:
352
+ return true;
353
+ default:
354
+ return false;
355
+ }
356
+ }
357
+
358
+ // To be an 8-bit encoding means that there are fewer than 256 symbols.
359
+ // Each byte determines a new character; there are no multi-byte sequences.
360
+
361
+ // TODO: This list could maybe be expanded. Other encodings may be 8-bit.
362
+ bool Is8BitEncoding(Encoding e) {
363
+ switch (e) {
364
+ case ASCII_7BIT:
365
+ case ISO_8859_1:
366
+ case ISO_8859_2:
367
+ case ISO_8859_3:
368
+ case ISO_8859_4:
369
+ case ISO_8859_5:
370
+ case ISO_8859_6:
371
+ case ISO_8859_7:
372
+ case ISO_8859_8:
373
+ case ISO_8859_8_I:
374
+ case ISO_8859_9:
375
+ case ISO_8859_10:
376
+ case ISO_8859_11:
377
+ case ISO_8859_13:
378
+ case ISO_8859_15:
379
+ case MSFT_CP1252:
380
+ case MSFT_CP1253:
381
+ case MSFT_CP1254:
382
+ case MSFT_CP1255:
383
+ case MSFT_CP1256:
384
+ case MSFT_CP1257:
385
+ case RUSSIAN_KOI8_R:
386
+ case RUSSIAN_KOI8_RU:
387
+ case RUSSIAN_CP866:
388
+ return true;
389
+ default:
390
+ return false;
391
+ }
392
+ }
393
+
394
+ bool IsCJKEncoding(Encoding e) {
395
+ switch (e) {
396
+ case JAPANESE_EUC_JP:
397
+ case JAPANESE_SHIFT_JIS:
398
+ case JAPANESE_JIS:
399
+ case CHINESE_BIG5:
400
+ case CHINESE_GB:
401
+ case CHINESE_EUC_CN:
402
+ case KOREAN_EUC_KR:
403
+ case CHINESE_EUC_DEC:
404
+ case CHINESE_CNS:
405
+ case CHINESE_BIG5_CP950:
406
+ case JAPANESE_CP932:
407
+ case ISO_2022_KR:
408
+ case GBK:
409
+ case GB18030:
410
+ case BIG5_HKSCS:
411
+ case ISO_2022_CN:
412
+ case HZ_GB_2312:
413
+ return true;
414
+ default:
415
+ return false;
416
+ }
417
+ }
418
+
419
+ bool IsHebrewEncoding(Encoding e) {
420
+ return (e == ISO_8859_8 ||
421
+ e == ISO_8859_8_I ||
422
+ e == MSFT_CP1255 ||
423
+ e == HEBREW_VISUAL);
424
+ }
425
+
426
+
427
+
428
+ bool IsRightToLeftEncoding(Encoding enc) {
429
+ switch (enc) {
430
+ case MSFT_CP1255:
431
+ case MSFT_CP1256:
432
+ case ARABIC_ENCODING:
433
+ case HEBREW_ENCODING:
434
+ case ISO_8859_8_I:
435
+ case HEBREW_VISUAL:
436
+ return true;
437
+ default:
438
+ return false;
439
+ }
440
+ }
441
+
442
+ bool IsLogicalRightToLeftEncoding(Encoding enc) {
443
+ return IsRightToLeftEncoding(enc) && !IsVisualRightToLeftEncoding(enc);
444
+ }
445
+
446
+ // Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
447
+ // is NOT visual.
448
+ bool IsVisualRightToLeftEncoding(Encoding enc) {
449
+ switch (enc) {
450
+ case HEBREW_ENCODING:
451
+ case HEBREW_VISUAL:
452
+ return true;
453
+ default:
454
+ return false;
455
+ }
456
+ }
457
+
458
+
459
+
460
+
461
+
462
+ bool IsIso2022Encoding(Encoding enc) {
463
+ return (IsIso2022JpOrVariant(enc) ||
464
+ enc == ISO_2022_KR ||
465
+ enc == ISO_2022_CN);
466
+ }
467
+
468
+ bool IsIso2022JpOrVariant(Encoding enc) {
469
+ return (enc == JAPANESE_JIS ||
470
+ enc == KDDI_ISO_2022_JP ||
471
+ enc == SOFTBANK_ISO_2022_JP);
472
+ }
473
+
474
+ bool IsShiftJisOrVariant(Encoding enc) {
475
+ return (enc == JAPANESE_SHIFT_JIS ||
476
+ enc == JAPANESE_CP932 ||
477
+ enc == KDDI_SHIFT_JIS ||
478
+ enc == DOCOMO_SHIFT_JIS ||
479
+ enc == SOFTBANK_SHIFT_JIS);
480
+ }
481
+
482
+ bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {
483
+ return (enc == KDDI_ISO_2022_JP ||
484
+ enc == KDDI_SHIFT_JIS ||
485
+ enc == DOCOMO_SHIFT_JIS ||
486
+ enc == SOFTBANK_SHIFT_JIS ||
487
+ enc == SOFTBANK_ISO_2022_JP);
488
+ }
489
+
490
+
491
+ // *************************************************************
492
+ // ENCODING NAMES
493
+ // EncodingName() [Encoding to name]
494
+ // MimeEncodingName() [Encoding to name]
495
+ // EncodingFromName() [name to Encoding]
496
+ // EncodingNameAliasToEncoding() [name to Encoding]
497
+ // default_encoding_name()
498
+ // invalid_encoding_name()
499
+ // *************************************************************
500
+
501
+ const char * EncodingName(const Encoding enc) {
502
+ if ( (enc < 0) || (enc >= kNumEncodings) )
503
+ return invalid_encoding_name();
504
+ return kEncodingInfoTable[enc].encoding_name_;
505
+ }
506
+
507
+ // TODO: Unify MimeEncodingName and EncodingName, or determine why
508
+ // such a unification is not possible.
509
+
510
+ const char * MimeEncodingName(Encoding enc) {
511
+ if ( (enc < 0) || (enc >= kNumEncodings) )
512
+ return ""; // TODO: Should this be invalid_encoding_name()?
513
+ return kEncodingInfoTable[enc].mime_encoding_name_;
514
+ }
515
+
516
+ bool EncodingFromName(const char* enc_name, Encoding *encoding) {
517
+ *encoding = UNKNOWN_ENCODING;
518
+ if ( enc_name == NULL ) return false;
519
+
520
+ for ( int i = 0; i < kNumEncodings; i++ ) {
521
+ if (!base::strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) {
522
+ *encoding = static_cast<Encoding>(i);
523
+ return true;
524
+ }
525
+ }
526
+ return false;
527
+ }
528
+
529
+ // The encoding_map maps standard and non-standard encoding-names
530
+ // (strings) to Encoding enums. It is used only by
531
+ // EncodingNameAliasToEncoding. Note that the map uses
532
+ // case-insensitive hash and comparison functions.
533
+
534
+ typedef std::unordered_map<const char *, Encoding,
535
+ CStringAlnumCaseHash,
536
+ CStringAlnumCaseEqual> EncodingMap;
537
+
538
+ static const EncodingMap& GetEncodingMap() {
539
+ static EncodingMap encoding_map;
540
+ if (!encoding_map.empty()) {
541
+ // Already initialized
542
+ return encoding_map;
543
+ }
544
+
545
+ // Initialize the map with all the "standard" encoding names,
546
+ // i.e., the ones returned by EncodingName and MimeEncodingName.
547
+ //
548
+ // First, add internal encoding names returned by EncodingName().
549
+ for (int i = 0; i < NUM_ENCODINGS; ++i) {
550
+ Encoding e = static_cast<Encoding>(i);
551
+ // Internal encoding names must be unique.
552
+ // The internal names are guaranteed to be unique by the CHECK_EQ.
553
+ const char *encoding_name = EncodingName(e);
554
+ // CHECK_EQ(0, encoding_map.count(encoding_name))
555
+ // << "Duplicate found for " << encoding_name;
556
+ encoding_map[encoding_name] = e;
557
+ }
558
+ // Then, add mime encoding names returned by MimeEncodingName().
559
+ // We don't override existing entries, to give precedence to entries
560
+ // added earlier.
561
+ for (int i = 0; i < NUM_ENCODINGS; ++i) {
562
+ Encoding e = static_cast<Encoding>(i);
563
+ // Note that MimeEncodingName() can return the same mime encoding
564
+ // name for different encoding enums like JAPANESE_SHIFT_JIS and
565
+ // KDDI_SHIFT_JIS. In that case, the encoding enum first seen
566
+ // will be the value for the encoding name in the map.
567
+ const char *mime_encoding_name = MimeEncodingName(e);
568
+ if (encoding_map.count(mime_encoding_name) == 0) {
569
+ encoding_map[mime_encoding_name] = e;
570
+ }
571
+ }
572
+
573
+ // Add some non-standard names: alternate spellings, common typos,
574
+ // etc. (It does no harm to add names already in the map.) Note
575
+ // that although the map is case-insensitive, by convention the
576
+ // keys are written here in lower case. For ease of maintenance,
577
+ // they are listed in alphabetical order.
578
+ encoding_map["5601"] = KOREAN_EUC_KR;
579
+ encoding_map["646"] = ASCII_7BIT;
580
+ encoding_map["852"] = CZECH_CP852;
581
+ encoding_map["866"] = RUSSIAN_CP866;
582
+ encoding_map["8859-1"] = ISO_8859_1;
583
+ encoding_map["ansi-1251"] = RUSSIAN_CP1251;
584
+ encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;
585
+ encoding_map["arabic"] = ISO_8859_6;
586
+ encoding_map["ascii"] = ISO_8859_1;
587
+ encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard
588
+ encoding_map["asmo-708"] = ISO_8859_6;
589
+ encoding_map["bhaskar"] = BHASKAR;
590
+ encoding_map["big5"] = CHINESE_BIG5;
591
+ encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard
592
+ encoding_map["big5-hkscs"] = BIG5_HKSCS;
593
+ encoding_map["chinese"] = CHINESE_GB;
594
+ encoding_map["cns"] = CHINESE_CNS; // not iana standard
595
+ encoding_map["cns11643"] = CHINESE_CNS;
596
+ encoding_map["cp1250"] = MSFT_CP1250; // not iana standard
597
+ encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard
598
+ encoding_map["cp1252"] = MSFT_CP1252; // not iana standard
599
+ encoding_map["cp1253"] = MSFT_CP1253; // not iana standard
600
+ encoding_map["cp1254"] = MSFT_CP1254; // not iana standard
601
+ encoding_map["cp1255"] = MSFT_CP1255;
602
+ encoding_map["cp1256"] = MSFT_CP1256;
603
+ encoding_map["cp1257"] = MSFT_CP1257; // not iana standard
604
+ encoding_map["cp819"] = ISO_8859_1;
605
+ encoding_map["cp852"] = CZECH_CP852;
606
+ encoding_map["cp866"] = RUSSIAN_CP866;
607
+ encoding_map["cp-866"] = RUSSIAN_CP866;
608
+ encoding_map["cp874"] = MSFT_CP874;
609
+ encoding_map["cp932"] = JAPANESE_CP932; // not iana standard
610
+ encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard
611
+ encoding_map["csbig5"] = CHINESE_BIG5;
612
+ encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;
613
+ encoding_map["cseuckr"] = KOREAN_EUC_KR;
614
+ encoding_map["csgb2312"] = CHINESE_GB;
615
+ encoding_map["csibm852"] = CZECH_CP852;
616
+ encoding_map["csibm866"] = RUSSIAN_CP866;
617
+ encoding_map["csiso2022jp"] = JAPANESE_JIS;
618
+ encoding_map["csiso2022kr"] = ISO_2022_KR;
619
+ encoding_map["csiso58gb231280"] = CHINESE_GB;
620
+ encoding_map["csiso88598i"] = ISO_8859_8_I;
621
+ encoding_map["csisolatin1"] = ISO_8859_1;
622
+ encoding_map["csisolatin2"] = ISO_8859_2;
623
+ encoding_map["csisolatin3"] = ISO_8859_3;
624
+ encoding_map["csisolatin4"] = ISO_8859_4;
625
+ encoding_map["csisolatin5"] = ISO_8859_9;
626
+ encoding_map["csisolatin6"] = ISO_8859_10;
627
+ encoding_map["csisolatinarabic"] = ISO_8859_6;
628
+ encoding_map["csisolatincyrillic"] = ISO_8859_5;
629
+ encoding_map["csisolatingreek"] = ISO_8859_7;
630
+ encoding_map["csisolatinhebrew"] = ISO_8859_8;
631
+ encoding_map["csksc56011987"] = KOREAN_EUC_KR;
632
+ encoding_map["csmacintosh"] = MACINTOSH_ROMAN;
633
+ encoding_map["csn-369103"] = CZECH_CSN_369103;
634
+ encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;
635
+ encoding_map["csunicode"] = UTF16BE;
636
+ encoding_map["csunicode11"] = UTF16BE;
637
+ encoding_map["csunicode11utf7"] = UTF7;
638
+ encoding_map["csunicodeascii"] = UTF16BE;
639
+ encoding_map["csunicodelatin1"] = UTF16BE;
640
+ encoding_map["cyrillic"] = ISO_8859_5;
641
+ encoding_map["ecma-114"] = ISO_8859_6;
642
+ encoding_map["ecma-118"] = ISO_8859_7;
643
+ encoding_map["elot_928"] = ISO_8859_7;
644
+ encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard
645
+ encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard
646
+ encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard
647
+ encoding_map["euc-jp"] = JAPANESE_EUC_JP;
648
+ encoding_map["euc-kr"] = KOREAN_EUC_KR;
649
+ encoding_map["eucgb2312_cn"] = CHINESE_GB;
650
+ encoding_map["gb"] = CHINESE_GB; // not iana standard
651
+ encoding_map["gb18030"] = GB18030;
652
+ encoding_map["gb2132"] = CHINESE_GB; // common typo
653
+ encoding_map["gb2312"] = CHINESE_GB;
654
+ encoding_map["gb_2312-80"] = CHINESE_GB;
655
+ encoding_map["gbk"] = GBK;
656
+ encoding_map["greek"] = ISO_8859_7;
657
+ encoding_map["greek8"] = ISO_8859_7;
658
+ encoding_map["hebrew"] = ISO_8859_8;
659
+ encoding_map["htchanakya"] = HTCHANAKYA;
660
+ encoding_map["hz-gb-2312"] = HZ_GB_2312;
661
+ encoding_map["ibm819"] = ISO_8859_1;
662
+ encoding_map["ibm852"] = CZECH_CP852;
663
+ encoding_map["ibm874"] = MSFT_CP874;
664
+ encoding_map["iso-10646"] = UTF16BE;
665
+ encoding_map["iso-10646-j-1"] = UTF16BE;
666
+ encoding_map["iso-10646-ucs-2"] = UNICODE;
667
+ encoding_map["iso-10646-ucs-4"] = UTF32BE;
668
+ encoding_map["iso-10646-ucs-basic"] = UTF16BE;
669
+ encoding_map["iso-10646-unicode-latin1"] = UTF16BE;
670
+ encoding_map["iso-2022-cn"] = ISO_2022_CN;
671
+ encoding_map["iso-2022-jp"] = JAPANESE_JIS;
672
+ encoding_map["iso-2022-kr"] = ISO_2022_KR;
673
+ encoding_map["iso-8559-1"] = ISO_8859_1; // common typo
674
+ encoding_map["iso-874"] = MSFT_CP874;
675
+ encoding_map["iso-8858-1"] = ISO_8859_1; // common typo
676
+ // iso-8859-0 was a temporary name, eventually renamed iso-8859-15
677
+ encoding_map["iso-8859-0"] = ISO_8859_15;
678
+ encoding_map["iso-8859-1"] = ISO_8859_1;
679
+ encoding_map["iso-8859-10"] = ISO_8859_10;
680
+ encoding_map["iso-8859-11"] = ISO_8859_11;
681
+ encoding_map["iso-8859-13"] = ISO_8859_13;
682
+ encoding_map["iso-8859-15"] = ISO_8859_15;
683
+ encoding_map["iso-8859-2"] = ISO_8859_2;
684
+ encoding_map["iso-8859-3"] = ISO_8859_3;
685
+ encoding_map["iso-8859-4"] = ISO_8859_4;
686
+ encoding_map["iso-8859-5"] = ISO_8859_5;
687
+ encoding_map["iso-8859-6"] = ISO_8859_6;
688
+ encoding_map["iso-8859-7"] = ISO_8859_7;
689
+ encoding_map["iso-8859-8"] = ISO_8859_8;
690
+ encoding_map["iso-8859-8-i"] = ISO_8859_8_I;
691
+ encoding_map["iso-8859-9"] = ISO_8859_9;
692
+ encoding_map["iso-9959-1"] = ISO_8859_1; // common typo
693
+ encoding_map["iso-ir-100"] = ISO_8859_1;
694
+ encoding_map["iso-ir-101"] = ISO_8859_2;
695
+ encoding_map["iso-ir-109"] = ISO_8859_3;
696
+ encoding_map["iso-ir-110"] = ISO_8859_4;
697
+ encoding_map["iso-ir-126"] = ISO_8859_7;
698
+ encoding_map["iso-ir-127"] = ISO_8859_6;
699
+ encoding_map["iso-ir-138"] = ISO_8859_8;
700
+ encoding_map["iso-ir-144"] = ISO_8859_5;
701
+ encoding_map["iso-ir-148"] = ISO_8859_9;
702
+ encoding_map["iso-ir-149"] = KOREAN_EUC_KR;
703
+ encoding_map["iso-ir-157"] = ISO_8859_10;
704
+ encoding_map["iso-ir-58"] = CHINESE_GB;
705
+ encoding_map["iso-latin-1"] = ISO_8859_1;
706
+ encoding_map["iso_2022-cn"] = ISO_2022_CN;
707
+ encoding_map["iso_2022-kr"] = ISO_2022_KR;
708
+ encoding_map["iso_8859-1"] = ISO_8859_1;
709
+ encoding_map["iso_8859-10:1992"] = ISO_8859_10;
710
+ encoding_map["iso_8859-11"] = ISO_8859_11;
711
+ encoding_map["iso_8859-13"] = ISO_8859_13;
712
+ encoding_map["iso_8859-15"] = ISO_8859_15;
713
+ encoding_map["iso_8859-1:1987"] = ISO_8859_1;
714
+ encoding_map["iso_8859-2"] = ISO_8859_2;
715
+ encoding_map["iso_8859-2:1987"] = ISO_8859_2;
716
+ encoding_map["iso_8859-3"] = ISO_8859_3;
717
+ encoding_map["iso_8859-3:1988"] = ISO_8859_3;
718
+ encoding_map["iso_8859-4"] = ISO_8859_4;
719
+ encoding_map["iso_8859-4:1988"] = ISO_8859_4;
720
+ encoding_map["iso_8859-5"] = ISO_8859_5;
721
+ encoding_map["iso_8859-5:1988"] = ISO_8859_5;
722
+ encoding_map["iso_8859-6"] = ISO_8859_6;
723
+ encoding_map["iso_8859-6:1987"] = ISO_8859_6;
724
+ encoding_map["iso_8859-7"] = ISO_8859_7;
725
+ encoding_map["iso_8859-7:1987"] = ISO_8859_7;
726
+ encoding_map["iso_8859-8"] = ISO_8859_8;
727
+ encoding_map["iso_8859-8:1988:"] = ISO_8859_8;
728
+ encoding_map["iso_8859-9"] = ISO_8859_9;
729
+ encoding_map["iso_8859-9:1989"] = ISO_8859_9;
730
+ encoding_map["jagran"] = JAGRAN;
731
+ encoding_map["jis"] = JAPANESE_JIS; // not iana standard
732
+ encoding_map["koi8-cs"] = CZECH_CSN_369103;
733
+ encoding_map["koi8-r"] = RUSSIAN_KOI8_R;
734
+ encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard
735
+ encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;
736
+ encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard
737
+ encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard
738
+ encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant
739
+ encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard
740
+ encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard
741
+ encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;
742
+ encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard
743
+ encoding_map["l1"] = ISO_8859_1;
744
+ encoding_map["l2"] = ISO_8859_2;
745
+ encoding_map["l3"] = ISO_8859_3;
746
+ encoding_map["l4"] = ISO_8859_4;
747
+ encoding_map["l5"] = ISO_8859_9;
748
+ encoding_map["l6"] = ISO_8859_10;
749
+ encoding_map["latin-1"] = ISO_8859_1; // not iana standard
750
+ encoding_map["latin1"] = ISO_8859_1;
751
+ encoding_map["latin2"] = ISO_8859_2;
752
+ encoding_map["latin3"] = ISO_8859_3;
753
+ encoding_map["latin4"] = ISO_8859_4;
754
+ encoding_map["latin5"] = ISO_8859_9;
755
+ encoding_map["latin6"] = ISO_8859_10;
756
+ encoding_map["mac"] = MACINTOSH_ROMAN;
757
+ encoding_map["macintosh"] = MACINTOSH_ROMAN;
758
+ encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;
759
+ encoding_map["ms932"] = JAPANESE_CP932; // not iana standard
760
+ encoding_map["ms_kanji"] = JAPANESE_CP932;
761
+ encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;
762
+ encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;
763
+ encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard
764
+ encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard
765
+ encoding_map["sun_eu_greek"] = ISO_8859_7;
766
+ encoding_map["tab"] = TAMIL_BI;
767
+ encoding_map["tam"] = TAMIL_MONO;
768
+ encoding_map["tis-620"] = ISO_8859_11;
769
+ encoding_map["tscii"] = TSCII;
770
+ encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard
771
+ encoding_map["unicode"] = UNICODE; // not iana standard
772
+ encoding_map["unicode-1-1-utf-7"] = UTF7;
773
+ encoding_map["unicode-1-1-utf-8"] = UTF8;
774
+ encoding_map["unicode-2-0-utf-7"] = UTF7;
775
+ encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard
776
+ encoding_map["us"] = ISO_8859_1;
777
+ encoding_map["us-ascii"] = ISO_8859_1;
778
+ encoding_map["utf-16be"] = UTF16BE;
779
+ encoding_map["utf-16le"] = UTF16LE;
780
+ encoding_map["utf-32be"] = UTF32BE;
781
+ encoding_map["utf-32le"] = UTF32LE;
782
+ encoding_map["utf-7"] = UTF7;
783
+ encoding_map["utf-8"] = UTF8;
784
+ encoding_map["utf7"] = UTF7;
785
+ encoding_map["utf8"] = UTF8; // not iana standard
786
+ encoding_map["visual"] = HEBREW_VISUAL;
787
+ encoding_map["win-1250"] = MSFT_CP1250; // not iana standard
788
+ encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard
789
+ encoding_map["window-874"] = MSFT_CP874;
790
+ encoding_map["windows-1250"] = MSFT_CP1250;
791
+ encoding_map["windows-1251"] = RUSSIAN_CP1251;
792
+ encoding_map["windows-1252"] = MSFT_CP1252;
793
+ encoding_map["windows-1253"] = MSFT_CP1253;
794
+ encoding_map["windows-1254"] = MSFT_CP1254;
795
+ encoding_map["windows-1255"] = MSFT_CP1255;
796
+ encoding_map["windows-1256"] = MSFT_CP1256;
797
+ encoding_map["windows-1257"] = MSFT_CP1257;
798
+ encoding_map["windows-31j"] = JAPANESE_CP932;
799
+ encoding_map["windows-874"] = MSFT_CP874;
800
+ encoding_map["windows-936"] = GBK;
801
+ encoding_map["x-big5"] = CHINESE_BIG5;
802
+ encoding_map["x-binaryenc"] = BINARYENC; // not iana standard
803
+ encoding_map["x-cp1250"] = MSFT_CP1250;
804
+ encoding_map["x-cp1251"] = RUSSIAN_CP1251;
805
+ encoding_map["x-cp1252"] = MSFT_CP1252;
806
+ encoding_map["x-cp1253"] = MSFT_CP1253;
807
+ encoding_map["x-cp1254"] = MSFT_CP1254;
808
+ encoding_map["x-cp1255"] = MSFT_CP1255;
809
+ encoding_map["x-cp1256"] = MSFT_CP1256;
810
+ encoding_map["x-cp1257"] = MSFT_CP1257;
811
+ encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;
812
+ encoding_map["x-euc-tw"] = CHINESE_CNS;
813
+ encoding_map["x-gbk"] = GBK;
814
+ encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;
815
+ encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;
816
+ encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;
817
+ encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;
818
+ encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard
819
+ encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;
820
+ encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard
821
+ encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;
822
+ encoding_map["x-unicode-2-0-utf-7"] = UTF7;
823
+ encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard
824
+ encoding_map["x-x-big5"] = CHINESE_BIG5;
825
+ encoding_map["zh_cn.euc"] = CHINESE_GB;
826
+ encoding_map["zh_tw-big5"] = CHINESE_BIG5;
827
+ encoding_map["zh_tw-euc"] = CHINESE_CNS;
828
+
829
+ // Remove the entry for the empty string, if any.
830
+ encoding_map.erase("");
831
+
832
+ return encoding_map;
833
+ }
834
+
835
+ // ----------------------------------------------------------------------
836
+ // EncodingNameAliasToEncoding()
837
+ //
838
+ // This function takes an encoding name/alias and returns the Encoding
839
+ // enum. The input is case insensitive. It is the union of the common
840
+ // IANA standard names, the charset names used in Netscape Navigator,
841
+ // and some common names we have been using.
842
+ // See: http://www.iana.org/assignments/character-sets
843
+ // http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
844
+ //
845
+ // UNKNOWN_ENCODING is returned if none matches.
846
+ //
847
+ // TODO: Check if it is possible to remove the non-standard,
848
+ // non-netscape-use names. It is because this routine is used for
849
+ // encoding detections from html meta info. Non-standard names may
850
+ // introduce noise on encoding detection.
851
+ //
852
+ // TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
853
+ // or determine why such a unification is not possible.
854
+ // ----------------------------------------------------------------------
855
+ Encoding EncodingNameAliasToEncoding(const char *encoding_name) {
856
+ if (!encoding_name) {
857
+ return UNKNOWN_ENCODING;
858
+ }
859
+
860
+ const EncodingMap& encoding_map = GetEncodingMap();
861
+
862
+ EncodingMap::const_iterator emi = encoding_map.find(encoding_name);
863
+ if (emi != encoding_map.end()) {
864
+ return emi->second;
865
+ } else {
866
+ return UNKNOWN_ENCODING;
867
+ }
868
+ }
869
+
870
+ const char* default_encoding_name() {
871
+ return kEncodingInfoTable[LATIN1].encoding_name_;
872
+ }
873
+
874
// Placeholder name handed out for encodings that are not valid.
static const char* const kInvalidEncodingName = "invalid_encoding";

// Returns the placeholder name used for an invalid encoding. The same
// pointer is returned on every call.
const char* invalid_encoding_name() { return kInvalidEncodingName; }
879
+
880
+
881
+
882
+ // *************************************************************
883
+ // Miscellany
884
+ // *************************************************************
885
+
886
+
887
+ Encoding PreferredWebOutputEncoding(Encoding enc) {
888
+ return IsValidEncoding(enc)
889
+ ? kEncodingInfoTable[enc].preferred_web_output_encoding_
890
+ : UTF8;
891
+ }