compact_enc_det 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,891 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include "util/encodings/encodings.h"
18
+
19
+ #include <string.h> // for strcasecmp
20
+ #include <unordered_map>
21
+ #include <utility> // for pair
22
+
23
+ #include "util/basictypes.h"
24
+ #include "util/string_util.h"
25
+ #include "util/case_insensitive_hash.h"
26
+
27
+ struct EncodingInfo {
28
+ // The standard name for this encoding.
29
+ //
30
+ const char* encoding_name_;
31
+
32
+ // The "preferred MIME name" of an encoding as specified by the IANA at:
33
+ // http://www.iana.org/assignments/character-sets
34
+ //
35
+ // Note that the preferred MIME name may differ slightly from the
36
+ // official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987
37
+ //
38
+ const char* mime_encoding_name_;
39
+
40
+ // It is an internal policy that if an encoding has an IANA name,
41
+ // then encoding_name_ and mime_encoding_name_ must be the same string.
42
+ //
43
+ // However, there can be exceptions if there are compelling reasons.
44
+ // For example, Japanese mobile handsets require the name
45
+ // "Shift_JIS" in charset=... parameter in Content-Type headers to
46
+ // process emoji (emoticons) in their private encodings. In that
47
+ // case, mime_encoding_name_ should be "Shift_JIS", despite
48
+ // encoding_name_ actually is "X-KDDI-Shift_JIS".
49
+
50
+ // Some multi-byte encodings use byte values that coincide with the
51
+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
52
+ // can misinterpret these, as indicated in an external XSS report from
53
+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
54
+ // also use UTF8 instead of encodings that we don't support in our
55
+ // output, and we generally try to be conservative in what we send out.
56
+ // Where the client asks for single- or double-byte encodings that are
57
+ // not as common, we substitute a more common single- or double-byte
58
+ // encoding, if there is one, thereby preserving the client's intent
59
+ // to use less space than UTF-8. This also means that characters
60
+ // outside the destination set will be converted to HTML NCRs (&#NNN;)
61
+ // if requested.
62
+
63
+ Encoding preferred_web_output_encoding_;
64
+ };
65
+
66
+ static const EncodingInfo kEncodingInfoTable[] = {
67
+ { "ASCII", "ISO-8859-1", ISO_8859_1},
68
+ { "Latin2", "ISO-8859-2", ISO_8859_2},
69
+ { "Latin3", "ISO-8859-3", UTF8},
70
+ // MSIE 6 does not support ISO-8859-3 (XSS issue)
71
+ { "Latin4", "ISO-8859-4", ISO_8859_4},
72
+ { "ISO-8859-5", "ISO-8859-5", ISO_8859_5},
73
+ { "Arabic", "ISO-8859-6", ISO_8859_6},
74
+ { "Greek", "ISO-8859-7", ISO_8859_7},
75
+ { "Hebrew", "ISO-8859-8", MSFT_CP1255},
76
+ // we do not endorse the visual order
77
+ { "Latin5", "ISO-8859-9", ISO_8859_9},
78
+ { "Latin6", "ISO-8859-10", UTF8},
79
+ // MSIE does not support ISO-8859-10 (XSS issue)
80
+ { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP},
81
+ { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},
82
+ { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
83
+ // due to potential confusion with HTML syntax chars
84
+ { "BIG5", "Big5", CHINESE_BIG5},
85
+ { "GB", "GB2312", CHINESE_GB},
86
+ { "EUC-CN",
87
+ "EUC-CN",
88
+ // Misnamed. Should be EUC-TW.
89
+ CHINESE_BIG5},
90
+ // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,
91
+ // and EUC-TW is rare, so we prefer Big5 for output.
92
+ { "KSC", "EUC-KR", KOREAN_EUC_KR},
93
+ { "Unicode",
94
+ "UTF-16LE",
95
+ // Internet Explorer doesn't recognize "ISO-10646-UCS-2"
96
+ UTF8
97
+ // due to potential confusion with HTML syntax chars
98
+ },
99
+ { "EUC",
100
+ "EUC", // Misnamed. Should be EUC-TW.
101
+ CHINESE_BIG5
102
+ // MSIE does not recognize "EUC" (XSS issue),
103
+ // and EUC-TW is rare, so we prefer Big5 for output.
104
+ },
105
+ { "CNS",
106
+ "CNS", // Misnamed. Should be EUC-TW.
107
+ CHINESE_BIG5},
108
+ // MSIE does not recognize "CNS" (XSS issue),
109
+ // and EUC-TW is rare, so we prefer Big5 for output.
110
+ { "BIG5-CP950",
111
+ "BIG5-CP950", // Not an IANA name
112
+ CHINESE_BIG5
113
+ // MSIE does not recognize "BIG5-CP950" (XSS issue)
114
+ },
115
+ { "CP932", "CP932", // Not an IANA name
116
+ JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue)
117
+ { "UTF8", "UTF-8", UTF8},
118
+ { "Unknown",
119
+ "x-unknown", // Not an IANA name
120
+ UTF8}, // UTF-8 is our default output encoding
121
+ { "ASCII-7-bit", "US-ASCII", ASCII_7BIT},
122
+ { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},
123
+ { "CP1251", "windows-1251", RUSSIAN_CP1251},
124
+ { "CP1252", "windows-1252", MSFT_CP1252},
125
+ { "KOI8U",
126
+ "KOI8-U",
127
+ ISO_8859_5}, // because koi8-u is not as common
128
+ { "CP1250", "windows-1250", MSFT_CP1250},
129
+ { "ISO-8859-15", "ISO-8859-15", ISO_8859_15},
130
+ { "CP1254", "windows-1254", MSFT_CP1254},
131
+ { "CP1257", "windows-1257", MSFT_CP1257},
132
+ { "ISO-8859-11", "ISO-8859-11", ISO_8859_11},
133
+ { "CP874", "windows-874", MSFT_CP874},
134
+ { "CP1256", "windows-1256", MSFT_CP1256},
135
+ { "CP1255", "windows-1255", MSFT_CP1255},
136
+ { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},
137
+ // Java does not support iso-8859-8-i
138
+ { "VISUAL", "ISO-8859-8", MSFT_CP1255},
139
+ // we do not endorse the visual order
140
+ { "CP852", "cp852", MSFT_CP1250},
141
+ // because cp852 is not as common
142
+ { "CSN_369103", "csn_369103", MSFT_CP1250},
143
+ // MSIE does not recognize "csn_369103" (XSS issue)
144
+ { "CP1253", "windows-1253", MSFT_CP1253},
145
+ { "CP866", "IBM866", RUSSIAN_CP1251},
146
+ // because cp866 is not as common
147
+ { "ISO-8859-13", "ISO-8859-13", UTF8},
148
+ // because iso-8859-13 is not widely supported
149
+ { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},
150
+ // due to potential confusion with HTML syntax chars
151
+ { "GBK", "GBK", GBK},
152
+ { "GB18030", "GB18030", GBK},
153
+ // because gb18030 is not widely supported
154
+ { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},
155
+ // because Big5-HKSCS is not widely supported
156
+ { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},
157
+ // due to potential confusion with HTML syntax chars
158
+ { "TSCII", "tscii", UTF8},
159
+ // we do not have an output converter for this font encoding
160
+ { "TAM", "tam", UTF8},
161
+ // we do not have an output converter for this font encoding
162
+ { "TAB", "tab", UTF8},
163
+ // we do not have an output converter for this font encoding
164
+ { "JAGRAN", "jagran", UTF8},
165
+ // we do not have an output converter for this font encoding
166
+ { "MACINTOSH", "MACINTOSH", ISO_8859_1},
167
+ // because macintosh is relatively uncommon
168
+ { "UTF7", "UTF-7",
169
+ UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated
170
+ { "BHASKAR", "bhaskar",
171
+ UTF8}, // we do not have an output converter for this font encoding
172
+ { "HTCHANAKYA", "htchanakya", // not an IANA charset name.
173
+ UTF8}, // we do not have an output converter for this font encoding
174
+ { "UTF-16BE", "UTF-16BE",
175
+ UTF8}, // due to potential confusion with HTML syntax chars
176
+ { "UTF-16LE", "UTF-16LE",
177
+ UTF8}, // due to potential confusion with HTML syntax chars
178
+ { "UTF-32BE", "UTF-32BE",
179
+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
180
+ { "UTF-32LE", "UTF-32LE",
181
+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
182
+ { "X-BINARYENC", "x-binaryenc", // Not an IANA name
183
+ UTF8}, // because this one is not intended for output (just input)
184
+ { "HZ-GB-2312", "HZ-GB-2312",
185
+ CHINESE_GB}, // due to potential confusion with HTML syntax chars
186
+ { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name
187
+ UTF8}, // because this one is not intended for output (just input)
188
+ { "X-TAM-ELANGO", "x-tam-elango",
189
+ UTF8}, // we do not have an output converter for this font encoding
190
+ { "X-TAM-LTTMBARANI", "x-tam-lttmbarani",
191
+ UTF8}, // we do not have an output converter for this font encoding
192
+ { "X-TAM-SHREE", "x-tam-shree",
193
+ UTF8}, // we do not have an output converter for this font encoding
194
+ { "X-TAM-TBOOMIS", "x-tam-tboomis",
195
+ UTF8}, // we do not have an output converter for this font encoding
196
+ { "X-TAM-TMNEWS", "x-tam-tmnews",
197
+ UTF8}, // we do not have an output converter for this font encoding
198
+ { "X-TAM-WEBTAMIL", "x-tam-webtamil",
199
+ UTF8}, // we do not have an output converter for this font encoding
200
+
201
+ { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
202
+ // KDDI version of Shift_JIS with Google Emoji PUA mappings.
203
+ // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses
204
+ // "Shift_JIS" in HTTP headers and email messages.
205
+
206
+ { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
207
+ // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.
208
+ // See the comment at KDDI_SHIFT_JIS for other issues.
209
+
210
+ { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
211
+ // SoftBank version of Shift_JIS with Google Emoji PUA mappings.
212
+ // See the comment at KDDI_SHIFT_JIS for other issues.
213
+
214
+ { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
215
+ // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.
216
+ // See the comment at KDDI_SHIFT_JIS for other issues.
217
+ // The preferred Web encoding is due to potential confusion with
218
+ // HTML syntax chars.
219
+
220
+ { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
221
+ // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.
222
+ // See the comment at KDDI_SHIFT_JIS for other issues.
223
+ // The preferred Web encoding is due to potential confusion with
224
+ // HTML syntax chars.
225
+
226
+ // Please refer to NOTE: section in the comments in the definition
227
+ // of "struct I18NInfoByEncoding", before adding new encodings.
228
+
229
+ };
230
+
231
+
232
+
233
+ COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,
234
+ kEncodingInfoTable_has_incorrect_size);
235
+
236
+ Encoding default_encoding() {return LATIN1;}
237
+
238
+ // *************************************************************
239
+ // Encoding predicates
240
+ // IsValidEncoding()
241
+ // IsEncEncCompatible
242
+ // IsEncodingWithSupportedLanguage
243
+ // IsSupersetOfAscii7Bit
244
+ // Is8BitEncoding
245
+ // IsCJKEncoding
246
+ // IsHebrewEncoding
247
+ // IsRightToLeftEncoding
248
+ // IsLogicalRightToLeftEncoding
249
+ // IsVisualRightToLeftEncoding
250
+ // IsIso2022Encoding
251
+ // IsIso2022JpOrVariant
252
+ // IsShiftJisOrVariant
253
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
254
+ // *************************************************************
255
+
256
+ bool IsValidEncoding(Encoding enc) {
257
+ return ((enc >= 0) && (enc < kNumEncodings));
258
+ }
259
+
260
+ bool IsEncEncCompatible(const Encoding from, const Encoding to) {
261
+ // Tests compatibility between the "from" and "to" encodings; in
262
+ // the typical case -- when both are valid known encodings -- this
263
+ // returns true iff converting from first to second is a no-op.
264
+ if (!IsValidEncoding(from) || !IsValidEncoding(to)) {
265
+ return false; // we only work with valid encodings...
266
+ } else if (to == from) {
267
+ return true; // the trivial common case
268
+ }
269
+
270
+ if (to == UNKNOWN_ENCODING) {
271
+ return true; // all valid encodings are compatible with the unknown
272
+ }
273
+
274
+ if (from == UNKNOWN_ENCODING) {
275
+ return false; // no unknown encoding is compatible with one that is
276
+ }
277
+
278
+ if (from == ASCII_7BIT) {
279
+ return IsSupersetOfAscii7Bit(to);
280
+ }
281
+
282
+ return (from == ISO_8859_1 && to == MSFT_CP1252) ||
283
+ (from == ISO_8859_8 && to == HEBREW_VISUAL) ||
284
+ (from == HEBREW_VISUAL && to == ISO_8859_8) ||
285
+ (from == ISO_8859_9 && to == MSFT_CP1254) ||
286
+ (from == ISO_8859_11 && to == MSFT_CP874) ||
287
+ (from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) ||
288
+ (from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) ||
289
+ (from == CHINESE_GB && to == GBK) ||
290
+ (from == CHINESE_GB && to == GB18030) ||
291
+ (from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) ||
292
+ (from == CHINESE_EUC_CN && to == CHINESE_CNS) ||
293
+ (from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) ||
294
+ (from == CHINESE_EUC_DEC && to == CHINESE_CNS) ||
295
+ (from == CHINESE_CNS && to == CHINESE_EUC_CN) ||
296
+ (from == CHINESE_CNS && to == CHINESE_EUC_DEC);
297
+ }
298
+
299
+ // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
300
+ // encoding represent the same characters as they do in ISO_8859_1.
301
+
302
+ // TODO: This list could be expanded. Many other encodings are supersets
303
+ // of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two
304
+ // encodings that I know for a fact should *not* be in this list.
305
+ bool IsSupersetOfAscii7Bit(Encoding e) {
306
+ switch (e) {
307
+ case ISO_8859_1:
308
+ case ISO_8859_2:
309
+ case ISO_8859_3:
310
+ case ISO_8859_4:
311
+ case ISO_8859_5:
312
+ case ISO_8859_6:
313
+ case ISO_8859_7:
314
+ case ISO_8859_8:
315
+ case ISO_8859_9:
316
+ case ISO_8859_10:
317
+ case JAPANESE_EUC_JP:
318
+ case JAPANESE_SHIFT_JIS:
319
+ case CHINESE_BIG5:
320
+ case CHINESE_GB:
321
+ case CHINESE_EUC_CN:
322
+ case KOREAN_EUC_KR:
323
+ case CHINESE_EUC_DEC:
324
+ case CHINESE_CNS:
325
+ case CHINESE_BIG5_CP950:
326
+ case JAPANESE_CP932:
327
+ case UTF8:
328
+ case UNKNOWN_ENCODING:
329
+ case ASCII_7BIT:
330
+ case RUSSIAN_KOI8_R:
331
+ case RUSSIAN_CP1251:
332
+ case MSFT_CP1252:
333
+ case RUSSIAN_KOI8_RU:
334
+ case MSFT_CP1250:
335
+ case ISO_8859_15:
336
+ case MSFT_CP1254:
337
+ case MSFT_CP1257:
338
+ case ISO_8859_11:
339
+ case MSFT_CP874:
340
+ case MSFT_CP1256:
341
+ case MSFT_CP1255:
342
+ case ISO_8859_8_I:
343
+ case HEBREW_VISUAL:
344
+ case CZECH_CP852:
345
+ case MSFT_CP1253:
346
+ case RUSSIAN_CP866:
347
+ case ISO_8859_13:
348
+ case GBK:
349
+ case GB18030:
350
+ case BIG5_HKSCS:
351
+ case MACINTOSH_ROMAN:
352
+ return true;
353
+ default:
354
+ return false;
355
+ }
356
+ }
357
+
358
+ // To be an 8-bit encoding means that there are fewer than 256 symbols.
359
+ // Each byte determines a new character; there are no multi-byte sequences.
360
+
361
+ // TODO: This list could maybe be expanded. Other encodings may be 8-bit.
362
+ bool Is8BitEncoding(Encoding e) {
363
+ switch (e) {
364
+ case ASCII_7BIT:
365
+ case ISO_8859_1:
366
+ case ISO_8859_2:
367
+ case ISO_8859_3:
368
+ case ISO_8859_4:
369
+ case ISO_8859_5:
370
+ case ISO_8859_6:
371
+ case ISO_8859_7:
372
+ case ISO_8859_8:
373
+ case ISO_8859_8_I:
374
+ case ISO_8859_9:
375
+ case ISO_8859_10:
376
+ case ISO_8859_11:
377
+ case ISO_8859_13:
378
+ case ISO_8859_15:
379
+ case MSFT_CP1252:
380
+ case MSFT_CP1253:
381
+ case MSFT_CP1254:
382
+ case MSFT_CP1255:
383
+ case MSFT_CP1256:
384
+ case MSFT_CP1257:
385
+ case RUSSIAN_KOI8_R:
386
+ case RUSSIAN_KOI8_RU:
387
+ case RUSSIAN_CP866:
388
+ return true;
389
+ default:
390
+ return false;
391
+ }
392
+ }
393
+
394
+ bool IsCJKEncoding(Encoding e) {
395
+ switch (e) {
396
+ case JAPANESE_EUC_JP:
397
+ case JAPANESE_SHIFT_JIS:
398
+ case JAPANESE_JIS:
399
+ case CHINESE_BIG5:
400
+ case CHINESE_GB:
401
+ case CHINESE_EUC_CN:
402
+ case KOREAN_EUC_KR:
403
+ case CHINESE_EUC_DEC:
404
+ case CHINESE_CNS:
405
+ case CHINESE_BIG5_CP950:
406
+ case JAPANESE_CP932:
407
+ case ISO_2022_KR:
408
+ case GBK:
409
+ case GB18030:
410
+ case BIG5_HKSCS:
411
+ case ISO_2022_CN:
412
+ case HZ_GB_2312:
413
+ return true;
414
+ default:
415
+ return false;
416
+ }
417
+ }
418
+
419
+ bool IsHebrewEncoding(Encoding e) {
420
+ return (e == ISO_8859_8 ||
421
+ e == ISO_8859_8_I ||
422
+ e == MSFT_CP1255 ||
423
+ e == HEBREW_VISUAL);
424
+ }
425
+
426
+
427
+
428
+ bool IsRightToLeftEncoding(Encoding enc) {
429
+ switch (enc) {
430
+ case MSFT_CP1255:
431
+ case MSFT_CP1256:
432
+ case ARABIC_ENCODING:
433
+ case HEBREW_ENCODING:
434
+ case ISO_8859_8_I:
435
+ case HEBREW_VISUAL:
436
+ return true;
437
+ default:
438
+ return false;
439
+ }
440
+ }
441
+
442
+ bool IsLogicalRightToLeftEncoding(Encoding enc) {
443
+ return IsRightToLeftEncoding(enc) && !IsVisualRightToLeftEncoding(enc);
444
+ }
445
+
446
+ // Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
447
+ // is NOT visual.
448
+ bool IsVisualRightToLeftEncoding(Encoding enc) {
449
+ switch (enc) {
450
+ case HEBREW_ENCODING:
451
+ case HEBREW_VISUAL:
452
+ return true;
453
+ default:
454
+ return false;
455
+ }
456
+ }
457
+
458
+
459
+
460
+
461
+
462
+ bool IsIso2022Encoding(Encoding enc) {
463
+ return (IsIso2022JpOrVariant(enc) ||
464
+ enc == ISO_2022_KR ||
465
+ enc == ISO_2022_CN);
466
+ }
467
+
468
+ bool IsIso2022JpOrVariant(Encoding enc) {
469
+ return (enc == JAPANESE_JIS ||
470
+ enc == KDDI_ISO_2022_JP ||
471
+ enc == SOFTBANK_ISO_2022_JP);
472
+ }
473
+
474
+ bool IsShiftJisOrVariant(Encoding enc) {
475
+ return (enc == JAPANESE_SHIFT_JIS ||
476
+ enc == JAPANESE_CP932 ||
477
+ enc == KDDI_SHIFT_JIS ||
478
+ enc == DOCOMO_SHIFT_JIS ||
479
+ enc == SOFTBANK_SHIFT_JIS);
480
+ }
481
+
482
+ bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {
483
+ return (enc == KDDI_ISO_2022_JP ||
484
+ enc == KDDI_SHIFT_JIS ||
485
+ enc == DOCOMO_SHIFT_JIS ||
486
+ enc == SOFTBANK_SHIFT_JIS ||
487
+ enc == SOFTBANK_ISO_2022_JP);
488
+ }
489
+
490
+
491
+ // *************************************************************
492
+ // ENCODING NAMES
493
+ // EncodingName() [Encoding to name]
494
+ // MimeEncodingName() [Encoding to name]
495
+ // EncodingFromName() [name to Encoding]
496
+ // EncodingNameAliasToEncoding() [name to Encoding]
497
+ // default_encoding_name()
498
+ // invalid_encoding_name()
499
+ // *************************************************************
500
+
501
+ const char * EncodingName(const Encoding enc) {
502
+ if ( (enc < 0) || (enc >= kNumEncodings) )
503
+ return invalid_encoding_name();
504
+ return kEncodingInfoTable[enc].encoding_name_;
505
+ }
506
+
507
+ // TODO: Unify MimeEncodingName and EncodingName, or determine why
508
+ // such a unification is not possible.
509
+
510
+ const char * MimeEncodingName(Encoding enc) {
511
+ if ( (enc < 0) || (enc >= kNumEncodings) )
512
+ return ""; // TODO: Should this be invalid_encoding_name()?
513
+ return kEncodingInfoTable[enc].mime_encoding_name_;
514
+ }
515
+
516
+ bool EncodingFromName(const char* enc_name, Encoding *encoding) {
517
+ *encoding = UNKNOWN_ENCODING;
518
+ if ( enc_name == NULL ) return false;
519
+
520
+ for ( int i = 0; i < kNumEncodings; i++ ) {
521
+ if (!base::strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) {
522
+ *encoding = static_cast<Encoding>(i);
523
+ return true;
524
+ }
525
+ }
526
+ return false;
527
+ }
528
+
529
+ // The encoding_map maps standard and non-standard encoding-names
530
+ // (strings) to Encoding enums. It is used only by
531
+ // EncodingNameAliasToEncoding. Note that the map uses
532
+ // case-insensitive hash and comparison functions.
533
+
534
+ typedef std::unordered_map<const char *, Encoding,
535
+ CStringAlnumCaseHash,
536
+ CStringAlnumCaseEqual> EncodingMap;
537
+
538
+ static const EncodingMap& GetEncodingMap() {
539
+ static EncodingMap encoding_map;
540
+ if (!encoding_map.empty()) {
541
+ // Already initialized
542
+ return encoding_map;
543
+ }
544
+
545
+ // Initialize the map with all the "standard" encoding names,
546
+ // i.e., the ones returned by EncodingName and MimeEncodingName.
547
+ //
548
+ // First, add internal encoding names returned by EncodingName().
549
+ for (int i = 0; i < NUM_ENCODINGS; ++i) {
550
+ Encoding e = static_cast<Encoding>(i);
551
+ // Internal encoding names must be unique.
552
+ // The internal names are guaranteed to be unique by the CHECK_EQ.
553
+ const char *encoding_name = EncodingName(e);
554
+ // CHECK_EQ(0, encoding_map.count(encoding_name))
555
+ // << "Duplicate found for " << encoding_name;
556
+ encoding_map[encoding_name] = e;
557
+ }
558
+ // Then, add mime encoding names returned by MimeEncodingName().
559
+ // We don't override existing entries, to give precedence to entries
560
+ // added earlier.
561
+ for (int i = 0; i < NUM_ENCODINGS; ++i) {
562
+ Encoding e = static_cast<Encoding>(i);
563
+ // Note that MimeEncodingName() can return the same mime encoding
564
+ // name for different encoding enums like JAPANESE_SHIFT_JIS and
565
+ // KDDI_SHIFT_JIS. In that case, the encoding enum first seen
566
+ // will be the value for the encoding name in the map.
567
+ const char *mime_encoding_name = MimeEncodingName(e);
568
+ if (encoding_map.count(mime_encoding_name) == 0) {
569
+ encoding_map[mime_encoding_name] = e;
570
+ }
571
+ }
572
+
573
+ // Add some non-standard names: alternate spellings, common typos,
574
+ // etc. (It does no harm to add names already in the map.) Note
575
+ // that although the map is case-insensitive, by convention the
576
+ // keys are written here in lower case. For ease of maintenance,
577
+ // they are listed in alphabetical order.
578
+ encoding_map["5601"] = KOREAN_EUC_KR;
579
+ encoding_map["646"] = ASCII_7BIT;
580
+ encoding_map["852"] = CZECH_CP852;
581
+ encoding_map["866"] = RUSSIAN_CP866;
582
+ encoding_map["8859-1"] = ISO_8859_1;
583
+ encoding_map["ansi-1251"] = RUSSIAN_CP1251;
584
+ encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;
585
+ encoding_map["arabic"] = ISO_8859_6;
586
+ encoding_map["ascii"] = ISO_8859_1;
587
+ encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard
588
+ encoding_map["asmo-708"] = ISO_8859_6;
589
+ encoding_map["bhaskar"] = BHASKAR;
590
+ encoding_map["big5"] = CHINESE_BIG5;
591
+ encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard
592
+ encoding_map["big5-hkscs"] = BIG5_HKSCS;
593
+ encoding_map["chinese"] = CHINESE_GB;
594
+ encoding_map["cns"] = CHINESE_CNS; // not iana standard
595
+ encoding_map["cns11643"] = CHINESE_CNS;
596
+ encoding_map["cp1250"] = MSFT_CP1250; // not iana standard
597
+ encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard
598
+ encoding_map["cp1252"] = MSFT_CP1252; // not iana standard
599
+ encoding_map["cp1253"] = MSFT_CP1253; // not iana standard
600
+ encoding_map["cp1254"] = MSFT_CP1254; // not iana standard
601
+ encoding_map["cp1255"] = MSFT_CP1255;
602
+ encoding_map["cp1256"] = MSFT_CP1256;
603
+ encoding_map["cp1257"] = MSFT_CP1257; // not iana standard
604
+ encoding_map["cp819"] = ISO_8859_1;
605
+ encoding_map["cp852"] = CZECH_CP852;
606
+ encoding_map["cp866"] = RUSSIAN_CP866;
607
+ encoding_map["cp-866"] = RUSSIAN_CP866;
608
+ encoding_map["cp874"] = MSFT_CP874;
609
+ encoding_map["cp932"] = JAPANESE_CP932; // not iana standard
610
+ encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard
611
+ encoding_map["csbig5"] = CHINESE_BIG5;
612
+ encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;
613
+ encoding_map["cseuckr"] = KOREAN_EUC_KR;
614
+ encoding_map["csgb2312"] = CHINESE_GB;
615
+ encoding_map["csibm852"] = CZECH_CP852;
616
+ encoding_map["csibm866"] = RUSSIAN_CP866;
617
+ encoding_map["csiso2022jp"] = JAPANESE_JIS;
618
+ encoding_map["csiso2022kr"] = ISO_2022_KR;
619
+ encoding_map["csiso58gb231280"] = CHINESE_GB;
620
+ encoding_map["csiso88598i"] = ISO_8859_8_I;
621
+ encoding_map["csisolatin1"] = ISO_8859_1;
622
+ encoding_map["csisolatin2"] = ISO_8859_2;
623
+ encoding_map["csisolatin3"] = ISO_8859_3;
624
+ encoding_map["csisolatin4"] = ISO_8859_4;
625
+ encoding_map["csisolatin5"] = ISO_8859_9;
626
+ encoding_map["csisolatin6"] = ISO_8859_10;
627
+ encoding_map["csisolatinarabic"] = ISO_8859_6;
628
+ encoding_map["csisolatincyrillic"] = ISO_8859_5;
629
+ encoding_map["csisolatingreek"] = ISO_8859_7;
630
+ encoding_map["csisolatinhebrew"] = ISO_8859_8;
631
+ encoding_map["csksc56011987"] = KOREAN_EUC_KR;
632
+ encoding_map["csmacintosh"] = MACINTOSH_ROMAN;
633
+ encoding_map["csn-369103"] = CZECH_CSN_369103;
634
+ encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;
635
+ encoding_map["csunicode"] = UTF16BE;
636
+ encoding_map["csunicode11"] = UTF16BE;
637
+ encoding_map["csunicode11utf7"] = UTF7;
638
+ encoding_map["csunicodeascii"] = UTF16BE;
639
+ encoding_map["csunicodelatin1"] = UTF16BE;
640
+ encoding_map["cyrillic"] = ISO_8859_5;
641
+ encoding_map["ecma-114"] = ISO_8859_6;
642
+ encoding_map["ecma-118"] = ISO_8859_7;
643
+ encoding_map["elot_928"] = ISO_8859_7;
644
+ encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard
645
+ encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard
646
+ encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard
647
+ encoding_map["euc-jp"] = JAPANESE_EUC_JP;
648
+ encoding_map["euc-kr"] = KOREAN_EUC_KR;
649
+ encoding_map["eucgb2312_cn"] = CHINESE_GB;
650
+ encoding_map["gb"] = CHINESE_GB; // not iana standard
651
+ encoding_map["gb18030"] = GB18030;
652
+ encoding_map["gb2132"] = CHINESE_GB; // common typo
653
+ encoding_map["gb2312"] = CHINESE_GB;
654
+ encoding_map["gb_2312-80"] = CHINESE_GB;
655
+ encoding_map["gbk"] = GBK;
656
+ encoding_map["greek"] = ISO_8859_7;
657
+ encoding_map["greek8"] = ISO_8859_7;
658
+ encoding_map["hebrew"] = ISO_8859_8;
659
+ encoding_map["htchanakya"] = HTCHANAKYA;
660
+ encoding_map["hz-gb-2312"] = HZ_GB_2312;
661
+ encoding_map["ibm819"] = ISO_8859_1;
662
+ encoding_map["ibm852"] = CZECH_CP852;
663
+ encoding_map["ibm874"] = MSFT_CP874;
664
+ encoding_map["iso-10646"] = UTF16BE;
665
+ encoding_map["iso-10646-j-1"] = UTF16BE;
666
+ encoding_map["iso-10646-ucs-2"] = UNICODE;
667
+ encoding_map["iso-10646-ucs-4"] = UTF32BE;
668
+ encoding_map["iso-10646-ucs-basic"] = UTF16BE;
669
+ encoding_map["iso-10646-unicode-latin1"] = UTF16BE;
670
+ encoding_map["iso-2022-cn"] = ISO_2022_CN;
671
+ encoding_map["iso-2022-jp"] = JAPANESE_JIS;
672
+ encoding_map["iso-2022-kr"] = ISO_2022_KR;
673
+ encoding_map["iso-8559-1"] = ISO_8859_1; // common typo
674
+ encoding_map["iso-874"] = MSFT_CP874;
675
+ encoding_map["iso-8858-1"] = ISO_8859_1; // common typo
676
+ // iso-8859-0 was a temporary name, eventually renamed iso-8859-15
677
+ encoding_map["iso-8859-0"] = ISO_8859_15;
678
+ encoding_map["iso-8859-1"] = ISO_8859_1;
679
+ encoding_map["iso-8859-10"] = ISO_8859_10;
680
+ encoding_map["iso-8859-11"] = ISO_8859_11;
681
+ encoding_map["iso-8859-13"] = ISO_8859_13;
682
+ encoding_map["iso-8859-15"] = ISO_8859_15;
683
+ encoding_map["iso-8859-2"] = ISO_8859_2;
684
+ encoding_map["iso-8859-3"] = ISO_8859_3;
685
+ encoding_map["iso-8859-4"] = ISO_8859_4;
686
+ encoding_map["iso-8859-5"] = ISO_8859_5;
687
+ encoding_map["iso-8859-6"] = ISO_8859_6;
688
+ encoding_map["iso-8859-7"] = ISO_8859_7;
689
+ encoding_map["iso-8859-8"] = ISO_8859_8;
690
+ encoding_map["iso-8859-8-i"] = ISO_8859_8_I;
691
+ encoding_map["iso-8859-9"] = ISO_8859_9;
692
+ encoding_map["iso-9959-1"] = ISO_8859_1; // common typo
693
+ encoding_map["iso-ir-100"] = ISO_8859_1;
694
+ encoding_map["iso-ir-101"] = ISO_8859_2;
695
+ encoding_map["iso-ir-109"] = ISO_8859_3;
696
+ encoding_map["iso-ir-110"] = ISO_8859_4;
697
+ encoding_map["iso-ir-126"] = ISO_8859_7;
698
+ encoding_map["iso-ir-127"] = ISO_8859_6;
699
+ encoding_map["iso-ir-138"] = ISO_8859_8;
700
+ encoding_map["iso-ir-144"] = ISO_8859_5;
701
+ encoding_map["iso-ir-148"] = ISO_8859_9;
702
+ encoding_map["iso-ir-149"] = KOREAN_EUC_KR;
703
+ encoding_map["iso-ir-157"] = ISO_8859_10;
704
+ encoding_map["iso-ir-58"] = CHINESE_GB;
705
+ encoding_map["iso-latin-1"] = ISO_8859_1;
706
+ encoding_map["iso_2022-cn"] = ISO_2022_CN;
707
+ encoding_map["iso_2022-kr"] = ISO_2022_KR;
708
+ encoding_map["iso_8859-1"] = ISO_8859_1;
709
+ encoding_map["iso_8859-10:1992"] = ISO_8859_10;
710
+ encoding_map["iso_8859-11"] = ISO_8859_11;
711
+ encoding_map["iso_8859-13"] = ISO_8859_13;
712
+ encoding_map["iso_8859-15"] = ISO_8859_15;
713
+ encoding_map["iso_8859-1:1987"] = ISO_8859_1;
714
+ encoding_map["iso_8859-2"] = ISO_8859_2;
715
+ encoding_map["iso_8859-2:1987"] = ISO_8859_2;
716
+ encoding_map["iso_8859-3"] = ISO_8859_3;
717
+ encoding_map["iso_8859-3:1988"] = ISO_8859_3;
718
+ encoding_map["iso_8859-4"] = ISO_8859_4;
719
+ encoding_map["iso_8859-4:1988"] = ISO_8859_4;
720
+ encoding_map["iso_8859-5"] = ISO_8859_5;
721
+ encoding_map["iso_8859-5:1988"] = ISO_8859_5;
722
+ encoding_map["iso_8859-6"] = ISO_8859_6;
723
+ encoding_map["iso_8859-6:1987"] = ISO_8859_6;
724
+ encoding_map["iso_8859-7"] = ISO_8859_7;
725
+ encoding_map["iso_8859-7:1987"] = ISO_8859_7;
726
+ encoding_map["iso_8859-8"] = ISO_8859_8;
727
+ encoding_map["iso_8859-8:1988:"] = ISO_8859_8;
728
+ encoding_map["iso_8859-9"] = ISO_8859_9;
729
+ encoding_map["iso_8859-9:1989"] = ISO_8859_9;
730
+ encoding_map["jagran"] = JAGRAN;
731
+ encoding_map["jis"] = JAPANESE_JIS; // not iana standard
732
+ encoding_map["koi8-cs"] = CZECH_CSN_369103;
733
+ encoding_map["koi8-r"] = RUSSIAN_KOI8_R;
734
+ encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard
735
+ encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;
736
+ encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard
737
+ encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard
738
+ encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant
739
+ encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard
740
+ encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard
741
+ encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;
742
+ encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard
743
+ encoding_map["l1"] = ISO_8859_1;
744
+ encoding_map["l2"] = ISO_8859_2;
745
+ encoding_map["l3"] = ISO_8859_3;
746
+ encoding_map["l4"] = ISO_8859_4;
747
+ encoding_map["l5"] = ISO_8859_9;
748
+ encoding_map["l6"] = ISO_8859_10;
749
+ encoding_map["latin-1"] = ISO_8859_1; // not iana standard
750
+ encoding_map["latin1"] = ISO_8859_1;
751
+ encoding_map["latin2"] = ISO_8859_2;
752
+ encoding_map["latin3"] = ISO_8859_3;
753
+ encoding_map["latin4"] = ISO_8859_4;
754
+ encoding_map["latin5"] = ISO_8859_9;
755
+ encoding_map["latin6"] = ISO_8859_10;
756
+ encoding_map["mac"] = MACINTOSH_ROMAN;
757
+ encoding_map["macintosh"] = MACINTOSH_ROMAN;
758
+ encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;
759
+ encoding_map["ms932"] = JAPANESE_CP932; // not iana standard
760
+ encoding_map["ms_kanji"] = JAPANESE_CP932;
761
+ encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;
762
+ encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;
763
+ encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard
764
+ encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard
765
+ encoding_map["sun_eu_greek"] = ISO_8859_7;
766
+ encoding_map["tab"] = TAMIL_BI;
767
+ encoding_map["tam"] = TAMIL_MONO;
768
+ encoding_map["tis-620"] = ISO_8859_11;
769
+ encoding_map["tscii"] = TSCII;
770
+ encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard
771
+ encoding_map["unicode"] = UNICODE; // not iana standard
772
+ encoding_map["unicode-1-1-utf-7"] = UTF7;
773
+ encoding_map["unicode-1-1-utf-8"] = UTF8;
774
+ encoding_map["unicode-2-0-utf-7"] = UTF7;
775
+ encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard
776
+ encoding_map["us"] = ISO_8859_1;
777
+ encoding_map["us-ascii"] = ISO_8859_1;
778
+ encoding_map["utf-16be"] = UTF16BE;
779
+ encoding_map["utf-16le"] = UTF16LE;
780
+ encoding_map["utf-32be"] = UTF32BE;
781
+ encoding_map["utf-32le"] = UTF32LE;
782
+ encoding_map["utf-7"] = UTF7;
783
+ encoding_map["utf-8"] = UTF8;
784
+ encoding_map["utf7"] = UTF7;
785
+ encoding_map["utf8"] = UTF8; // not iana standard
786
+ encoding_map["visual"] = HEBREW_VISUAL;
787
+ encoding_map["win-1250"] = MSFT_CP1250; // not iana standard
788
+ encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard
789
+ encoding_map["window-874"] = MSFT_CP874;
790
+ encoding_map["windows-1250"] = MSFT_CP1250;
791
+ encoding_map["windows-1251"] = RUSSIAN_CP1251;
792
+ encoding_map["windows-1252"] = MSFT_CP1252;
793
+ encoding_map["windows-1253"] = MSFT_CP1253;
794
+ encoding_map["windows-1254"] = MSFT_CP1254;
795
+ encoding_map["windows-1255"] = MSFT_CP1255;
796
+ encoding_map["windows-1256"] = MSFT_CP1256;
797
+ encoding_map["windows-1257"] = MSFT_CP1257;
798
+ encoding_map["windows-31j"] = JAPANESE_CP932;
799
+ encoding_map["windows-874"] = MSFT_CP874;
800
+ encoding_map["windows-936"] = GBK;
801
+ encoding_map["x-big5"] = CHINESE_BIG5;
802
+ encoding_map["x-binaryenc"] = BINARYENC; // not iana standard
803
+ encoding_map["x-cp1250"] = MSFT_CP1250;
804
+ encoding_map["x-cp1251"] = RUSSIAN_CP1251;
805
+ encoding_map["x-cp1252"] = MSFT_CP1252;
806
+ encoding_map["x-cp1253"] = MSFT_CP1253;
807
+ encoding_map["x-cp1254"] = MSFT_CP1254;
808
+ encoding_map["x-cp1255"] = MSFT_CP1255;
809
+ encoding_map["x-cp1256"] = MSFT_CP1256;
810
+ encoding_map["x-cp1257"] = MSFT_CP1257;
811
+ encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;
812
+ encoding_map["x-euc-tw"] = CHINESE_CNS;
813
+ encoding_map["x-gbk"] = GBK;
814
+ encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;
815
+ encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;
816
+ encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;
817
+ encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;
818
+ encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard
819
+ encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;
820
+ encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard
821
+ encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;
822
+ encoding_map["x-unicode-2-0-utf-7"] = UTF7;
823
+ encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard
824
+ encoding_map["x-x-big5"] = CHINESE_BIG5;
825
+ encoding_map["zh_cn.euc"] = CHINESE_GB;
826
+ encoding_map["zh_tw-big5"] = CHINESE_BIG5;
827
+ encoding_map["zh_tw-euc"] = CHINESE_CNS;
828
+
829
+ // Remove the entry for the empty string, if any.
830
+ encoding_map.erase("");
831
+
832
+ return encoding_map;
833
+ }
834
+
835
+ // ----------------------------------------------------------------------
836
+ // EncodingNameAliasToEncoding()
837
+ //
838
+ // This function takes an encoding name/alias and returns the Encoding
839
+ // enum. The input is case insensitive. It is the union of the common
840
+ // IANA standard names, the charset names used in Netscape Navigator,
841
+ // and some common names we have been using.
842
+ // See: http://www.iana.org/assignments/character-sets
843
+ // http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
844
+ //
845
+ // UNKNOWN_ENCODING is returned if none matches.
846
+ //
847
+ // TODO: Check if it is possible to remove the non-standard,
848
+ // non-netscape-use names. It is because this routine is used for
849
+ // encoding detections from html meta info. Non-standard names may
850
+ // introduce noise on encoding detection.
851
+ //
852
+ // TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
853
+ // or determine why such a unification is not possible.
854
+ // ----------------------------------------------------------------------
855
+ Encoding EncodingNameAliasToEncoding(const char *encoding_name) {
856
+ if (!encoding_name) {
857
+ return UNKNOWN_ENCODING;
858
+ }
859
+
860
+ const EncodingMap& encoding_map = GetEncodingMap();
861
+
862
+ EncodingMap::const_iterator emi = encoding_map.find(encoding_name);
863
+ if (emi != encoding_map.end()) {
864
+ return emi->second;
865
+ } else {
866
+ return UNKNOWN_ENCODING;
867
+ }
868
+ }
869
+
870
+ const char* default_encoding_name() {
871
+ return kEncodingInfoTable[LATIN1].encoding_name_;
872
+ }
873
+
874
// Placeholder name handed out for encodings that are not valid.
static const char* const kInvalidEncodingName = "invalid_encoding";

// Returns the fixed placeholder string "invalid_encoding". The pointer
// refers to a string literal, so callers must not modify or free it.
const char* invalid_encoding_name() {
  const char* const placeholder = kInvalidEncodingName;
  return placeholder;
}
879
+
880
+
881
+
882
+ // *************************************************************
883
+ // Miscellany
884
+ // *************************************************************
885
+
886
+
887
+ Encoding PreferredWebOutputEncoding(Encoding enc) {
888
+ return IsValidEncoding(enc)
889
+ ? kEncodingInfoTable[enc].preferred_web_output_encoding_
890
+ : UTF8;
891
+ }