compact_enc_det 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,299 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef UTIL_ENCODINGS_ENCODINGS_H_
18
+ #define UTIL_ENCODINGS_ENCODINGS_H_
19
+
20
+ // This interface defines the Encoding enum and various functions that
21
+ // depend only on Encoding values.
22
+
23
+ // A hash-function for Encoding, hash<Encoding>, is defined in
24
+ // i18n/encodings/public/encodings-hash.h
25
+
26
+ // On some Windows projects, UNICODE may be defined, which would prevent the
27
+ // Encoding enum below from compiling. Note that this is a quick fix that does
28
+ // not break any existing projects. The UNICODE enum may someday be changed
29
+ // to something more specific and non-colliding, but this involves careful
30
+ // testing of changes in many other projects.
31
+ #undef UNICODE
32
+
33
+ // NOTE: The Encoding enum must always start at 0. This assumption has
34
+ // been made and used.
35
+
36
+ #ifndef SWIG
37
+
38
+ #include "util/encodings/encodings.pb.h"
39
+
40
+ #else
41
+
42
+ // TODO: Include a SWIG workaround header file.
43
+
44
+ #endif
45
+
46
+ const int kNumEncodings = NUM_ENCODINGS;
47
+
48
+ // some of the popular encoding aliases
49
+ // TODO: Make these static const Encoding values instead of macros.
50
+ #define LATIN1 ISO_8859_1
51
+ #define LATIN2 ISO_8859_2
52
+ #define LATIN3 ISO_8859_3
53
+ #define LATIN4 ISO_8859_4
54
+ #define CYRILLIC ISO_8859_5
55
+ #define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
56
+ #define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
57
+ #define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
58
+ #define LATIN5 ISO_8859_9
59
+ #define LATIN6 ISO_8859_10
60
+ #define KOREAN_HANGUL KOREAN_EUC_KR
61
+
62
+ // The default Encoding (LATIN1).
63
+ Encoding default_encoding();
64
+
65
+
66
+
67
+ // *************************************************************
68
+ // Encoding predicates
69
+ // IsValidEncoding()
70
+ // IsEncEncCompatible
71
+ // IsSupersetOfAscii7Bit
72
+ // Is8BitEncoding
73
+ // IsCJKEncoding
74
+ // IsHebrewEncoding
75
+ // IsRightToLeftEncoding
76
+ // IsLogicalRightToLeftEncoding
77
+ // IsVisualRightToLeftEncoding
78
+ // IsIso2022Encoding
79
+ // IsIso2022JpOrVariant
80
+ // IsShiftJisOrVariant
81
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
82
+ // *************************************************************
83
+
84
+ // IsValidEncoding
85
+ // ===================================
86
+ //
87
+ // Function to check if the input encoding enum is within range.
88
+ //
89
+
90
+ bool IsValidEncoding(Encoding enc);
91
+
92
+ //
93
+ // IsEncEncCompatible
94
+ // ------------------
95
+ //
96
+ // This function is to determine whether or not converting from the
97
+ // first encoding to the second requires any changes to the underlying
98
+ // text (e.g. ASCII_7BIT is a subset of UTF8).
99
+ //
100
+ // TODO: the current implementation is likely incomplete. It would be
101
+ // good to consider the full matrix of all pairs of encodings and to fish out
102
+ // all compatible pairs.
103
+ //
104
+ bool IsEncEncCompatible(const Encoding from, const Encoding to);
105
+
106
+ // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
107
+ // encoding represent the same characters as they do in ISO_8859_1.
108
+
109
+ // WARNING: This function does not currently return true for all encodings that
110
+ // are supersets of Ascii 7-bit.
111
+ bool IsSupersetOfAscii7Bit(Encoding e);
112
+
113
+ // To be an 8-bit encoding means that there are at most 256 symbols.
114
+ // Each byte determines a new character; there are no multi-byte sequences.
115
+
116
+ // WARNING: This function does not currently return true for all encodings that
117
+ // are 8-bit encodings.
118
+ bool Is8BitEncoding(Encoding e);
119
+
120
+ // IsCJKEncoding
121
+ // -------------
122
+ //
123
+ // This function returns true if the encoding is either Chinese
124
+ // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
125
+ // considered a CJK encoding.
126
+ bool IsCJKEncoding(Encoding e);
127
+
128
+ // IsHebrewEncoding
129
+ // -------------
130
+ //
131
+ // This function returns true if the encoding is a Hebrew specific
132
+ // encoding (not UTF8, etc).
133
+ bool IsHebrewEncoding(Encoding e);
134
+
135
+ // IsRightToLeftEncoding
136
+ // ---------------------
137
+ //
138
+ // Returns true if the encoding is a right-to-left encoding.
139
+ //
140
+ // Note that the name of this function is somewhat misleading. There is nothing
141
+ // "right to left" about these encodings. They merely contain code points for
142
+ // characters in RTL languages such as Hebrew and Arabic. But this is also
143
+ // true for UTF-8.
144
+ //
145
+ // TODO: Get rid of this function. The only special cases we
146
+ // should need to worry about are visual encodings. Anything we
147
+ // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
148
+ bool IsRightToLeftEncoding(Encoding enc);
149
+
150
+ // IsLogicalRightToLeftEncoding
151
+ // ----------------------------
152
+ //
153
+ // Returns true if the encoding is a logical right-to-left encoding.
154
+ // Logical right-to-left encodings are those that the browser renders
155
+ // right-to-left and applies the BiDi algorithm to. Therefore the characters
156
+ // appear in reading order in the file, and indexing, snippet generation etc.
157
+ // should all just work with no special processing.
158
+ //
159
+ // TODO: Get rid of this function. The only special cases we
160
+ // should need to worry about are visual encodings.
161
+ bool IsLogicalRightToLeftEncoding(Encoding enc);
162
+
163
+ // IsVisualRightToLeftEncoding
164
+ // ---------------------------
165
+ //
166
+ // Returns true if the encoding is a visual right-to-left encoding.
167
+ // Visual right-to-left encodings are those that the browser renders
168
+ // left-to-right and does not apply the BiDi algorithm to. Therefore each
169
+ // line appears in reverse order in the file, lines are manually wrapped
170
+ // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
171
+ // the prehistoric days when browsers couldn't render right-to-left, but
172
+ // unfortunately some visual pages persist to this day. These documents require
173
+ // special processing so that we don't index or snippet them with each line
174
+ // reversed.
175
+ bool IsVisualRightToLeftEncoding(Encoding enc);
176
+
177
+ // IsIso2022Encoding
178
+ // -----------------
179
+ //
180
+ // Returns true if the encoding is a kind of ISO 2022 such as
181
+ // ISO-2022-JP.
182
+ bool IsIso2022Encoding(Encoding enc);
183
+
184
+ // IsIso2022JpOrVariant
185
+ // --------------------
186
+ //
187
+ // Returns true if the encoding is ISO-2022-JP or a variant such as
188
+ // KDDI's ISO-2022-JP.
189
+ bool IsIso2022JpOrVariant(Encoding enc);
190
+
191
+ // IsShiftJisOrVariant
192
+ // --------------------
193
+ //
194
+ // Returns true if the encoding is Shift_JIS or a variant such as
195
+ // KDDI's Shift_JIS.
196
+ bool IsShiftJisOrVariant(Encoding enc);
197
+
198
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
199
+ // -----------------------------------------
200
+ //
201
+ // Returns true if it's a Japanese cell-phone-carrier-specific encoding
202
+ // such as KDDI_SHIFT_JIS.
203
+ bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
204
+
205
+
206
+
207
+ // *************************************************************
208
+ // ENCODING NAMES
209
+ //
210
+ // This interface defines a standard name for each valid encoding, and
211
+ // a standard name for invalid encodings. (Some names use all upper
212
+ // case, but others use mixed case.)
213
+ //
214
+ // EncodingName() [Encoding to name]
215
+ // MimeEncodingName() [Encoding to name]
216
+ // EncodingFromName() [name to Encoding]
217
+ // EncodingNameAliasToEncoding() [name to Encoding]
218
+ // default_encoding_name()
219
+ // invalid_encoding_name()
220
+ // *************************************************************
221
+
222
+ // EncodingName
223
+ // ------------
224
+ //
225
+ // Given the encoding, returns its standard name.
226
+ // Return invalid_encoding_name() if the encoding is invalid.
227
+ //
228
+ const char* EncodingName(Encoding enc);
229
+
230
+ //
231
+ // MimeEncodingName
232
+ // ----------------
233
+ //
234
+ // Return the "preferred MIME name" of an encoding.
235
+ //
236
+ // This name is suitable for using in HTTP headers, HTML tags,
237
+ // and as the "charset" parameter of a MIME Content-Type.
238
+ const char* MimeEncodingName(Encoding enc);
239
+
240
+
241
+ // The maximum length of an encoding name
242
+ const int kMaxEncodingNameSize = 50;
243
+
244
+ // The standard name of the default encoding.
245
+ const char* default_encoding_name();
246
+
247
+ // The name used for an invalid encoding.
248
+ const char* invalid_encoding_name();
249
+
250
+ // EncodingFromName
251
+ // ----------------
252
+ //
253
+ // If enc_name matches the standard name of an Encoding, using a
254
+ // case-insensitive comparison, set *encoding to that Encoding and
255
+ // return true. Otherwise set *encoding to UNKNOWN_ENCODING and
256
+ // return false.
257
+ //
258
+ // REQUIRES: encoding must not be NULL.
259
+ //
260
+ bool EncodingFromName(const char* enc_name, Encoding *encoding);
261
+
262
+ //
263
+ // EncodingNameAliasToEncoding
264
+ // ---------------------------
265
+ //
266
+ // If enc_name matches the standard name or an alias of an Encoding,
267
+ // using a case-insensitive comparison, return that
268
+ // Encoding. Otherwise, return UNKNOWN_ENCODING.
269
+ //
270
+ // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
271
+ // GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
272
+ // common variations with hyphens and underscores (e.g., "koi8-u" and
273
+ // "koi8u" for RUSSIAN_KOI8_RU).
274
+
275
+ Encoding EncodingNameAliasToEncoding(const char *enc_name);
276
+
277
+ // *************************************************************
278
+ // Miscellany
279
+ // *************************************************************
280
+
281
+ // PreferredWebOutputEncoding
282
+ // --------------------------
283
+ //
284
+ // Some multi-byte encodings use byte values that coincide with the
285
+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
286
+ // can misinterpret these, as indicated in an external XSS report from
287
+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
288
+ // also use UTF8 instead of encodings that we don't support in our
289
+ // output, and we generally try to be conservative in what we send out.
290
+ // Where the client asks for single- or double-byte encodings that are
291
+ // not as common, we substitute a more common single- or double-byte
292
+ // encoding, if there is one, thereby preserving the client's intent
293
+ // to use less space than UTF-8. This also means that characters
294
+ // outside the destination set will be converted to HTML NCRs (&#NNN;)
295
+ // if requested.
296
+ Encoding PreferredWebOutputEncoding(Encoding enc);
297
+
298
+
299
+ #endif // UTIL_ENCODINGS_ENCODINGS_H_
@@ -0,0 +1,181 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef UTIL_ENCODINGS_ENCODINGS_PB_H_
18
+ #define UTIL_ENCODINGS_ENCODINGS_PB_H_
19
+
20
+ enum Encoding {
21
+ ISO_8859_1 = 0, // Teragram ASCII
22
+ ISO_8859_2 = 1, // Teragram Latin2
23
+ ISO_8859_3 = 2, // in BasisTech but not in Teragram
24
+ ISO_8859_4 = 3, // Teragram Latin4
25
+ ISO_8859_5 = 4, // Teragram ISO-8859-5
26
+ ISO_8859_6 = 5, // Teragram Arabic
27
+ ISO_8859_7 = 6, // Teragram Greek
28
+ ISO_8859_8 = 7, // Teragram Hebrew
29
+ ISO_8859_9 = 8, // in BasisTech but not in Teragram
30
+ ISO_8859_10 = 9, // in BasisTech but not in Teragram
31
+ JAPANESE_EUC_JP = 10, // Teragram EUC_JP
32
+ JAPANESE_SHIFT_JIS = 11, // Teragram SJS
33
+ JAPANESE_JIS = 12, // Teragram JIS
34
+ CHINESE_BIG5 = 13, // Teragram BIG5
35
+ CHINESE_GB = 14, // Teragram GB
36
+ CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
37
+ // CNS11643EUC, before that Teragram EUC-CN(!)
38
+ // See //i18n/basistech/basistech_encodings.h
39
+ KOREAN_EUC_KR = 16, // Teragram KSC
40
+ UNICODE = 17, // Teragram Unicode
41
+ CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech
42
+ // CNS11643EUC, before that Teragram EUC.
43
+ CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech
44
+ // CNS11643EUC, before that Teragram CNS.
45
+ CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950
46
+ JAPANESE_CP932 = 21, // Teragram CP932
47
+ UTF8 = 22,
48
+ UNKNOWN_ENCODING = 23,
49
+ ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
50
+ // Should be present only in the crawler
51
+ // and in the repository,
52
+ // *never* as a result of Document::encoding().
53
+ RUSSIAN_KOI8_R = 25, // Teragram KOI8R
54
+ RUSSIAN_CP1251 = 26, // Teragram CP1251
55
+
56
+ //----------------------------------------------------------
57
+ // These are _not_ output from teragram. Instead, they are as
58
+ // detected in the headers of usenet articles.
59
+ MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii
60
+ RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
61
+ // Misnamed, this is _not_ KOI8-RU but KOI8-U.
62
+ // KOI8-U is used much more often than KOI8-RU.
63
+ MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
64
+ ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
65
+ //----------------------------------------------------------
66
+
67
+ //----------------------------------------------------------
68
+ // These are in BasisTech but not in Teragram. They are
69
+ // needed for new interface languages. Now detected by
70
+ // research langid
71
+ MSFT_CP1254 = 31, // used for Turkish
72
+ MSFT_CP1257 = 32, // used in Baltic countries
73
+ //----------------------------------------------------------
74
+
75
+ //----------------------------------------------------------
76
+ //----------------------------------------------------------
77
+ // New encodings detected by Teragram
78
+ ISO_8859_11 = 33, // aka TIS-620, used for Thai
79
+ MSFT_CP874 = 34, // used for Thai
80
+ MSFT_CP1256 = 35, // used for Arabic
81
+
82
+ //----------------------------------------------------------
83
+ // Detected as ISO_8859_8 by Teragram, but can be found in META tags
84
+ MSFT_CP1255 = 36, // Logical Hebrew Microsoft
85
+ ISO_8859_8_I = 37, // Iso Hebrew Logical
86
+ HEBREW_VISUAL = 38, // Iso Hebrew Visual
87
+ //----------------------------------------------------------
88
+
89
+ //----------------------------------------------------------
90
+ // Detected by research langid
91
+ CZECH_CP852 = 39,
92
+ CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
93
+ MSFT_CP1253 = 41, // used for Greek
94
+ RUSSIAN_CP866 = 42,
95
+ //----------------------------------------------------------
96
+
97
+ //----------------------------------------------------------
98
+ // Handled by iconv in glibc
99
+ ISO_8859_13 = 43,
100
+ ISO_2022_KR = 44,
101
+ GBK = 45,
102
+ GB18030 = 46,
103
+ BIG5_HKSCS = 47,
104
+ ISO_2022_CN = 48,
105
+
106
+ //-----------------------------------------------------------
107
+ // Detected by xin liu's detector
108
+ // Handled by transcoder
109
+ // (Indic encodings)
110
+
111
+ TSCII = 49,
112
+ TAMIL_MONO = 50,
113
+ TAMIL_BI = 51,
114
+ JAGRAN = 52,
115
+
116
+
117
+ MACINTOSH_ROMAN = 53,
118
+ UTF7 = 54,
119
+ BHASKAR = 55, // Indic encoding - Devanagari
120
+ HTCHANAKYA = 56, // 56 Indic encoding - Devanagari
121
+
122
+ //-----------------------------------------------------------
123
+ // These allow a single place (inputconverter and outputconverter)
124
+ // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
125
+ // bulk conversions, with interchange-valid checking on input and
126
+ // fallback if needed on output.
127
+ UTF16BE = 57, // big-endian UTF-16
128
+ UTF16LE = 58, // little-endian UTF-16
129
+ UTF32BE = 59, // big-endian UTF-32
130
+ UTF32LE = 60, // little-endian UTF-32
131
+ //-----------------------------------------------------------
132
+
133
+ //-----------------------------------------------------------
134
+ // An encoding that means "This is not text, but it may have some
135
+ // simple ASCII text embedded". Intended input conversion (not yet
136
+ // implemented) is to keep strings of >=4 seven-bit ASCII characters
137
+ // (follow each kept string with an ASCII space), delete the rest of
138
+ // the bytes. This will pick up and allow indexing of e.g. captions
139
+ // in JPEGs. No output conversion needed.
140
+ BINARYENC = 61,
141
+ //-----------------------------------------------------------
142
+
143
+ //-----------------------------------------------------------
144
+ // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
145
+ // ~{ ... ~} for 2-byte pairs, and the browsers support this.
146
+ HZ_GB_2312 = 62,
147
+ //-----------------------------------------------------------
148
+
149
+ //-----------------------------------------------------------
150
+ // Some external vendors make the common input error of
151
+ // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
152
+ UTF8UTF8 = 63,
153
+ //-----------------------------------------------------------
154
+
155
+ //-----------------------------------------------------------
156
+ // Handled by transcoder for Tamil language-specific font
157
+ // encodings without the support for detection at present.
158
+ TAM_ELANGO = 64, // Elango - Tamil
159
+ TAM_LTTMBARANI = 65, // Barani - Tamil
160
+ TAM_SHREE = 66, // Shree - Tamil
161
+ TAM_TBOOMIS = 67, // TBoomis - Tamil
162
+ TAM_TMNEWS = 68, // TMNews - Tamil
163
+ TAM_WEBTAMIL = 69, // Webtamil - Tamil
164
+ //-----------------------------------------------------------
165
+
166
+ //-----------------------------------------------------------
167
+ // Shift_JIS variants used by Japanese cell phone carriers.
168
+ KDDI_SHIFT_JIS = 70,
169
+ DOCOMO_SHIFT_JIS = 71,
170
+ SOFTBANK_SHIFT_JIS = 72,
171
+ // ISO-2022-JP variants used by KDDI and SoftBank.
172
+ KDDI_ISO_2022_JP = 73,
173
+ SOFTBANK_ISO_2022_JP = 74,
174
+ //-----------------------------------------------------------
175
+
176
+ NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
177
+ // valid Encoding enum, it is only used to
178
+ // indicate the total number of Encodings.
179
+ };
180
+
181
+ #endif // UTIL_ENCODINGS_ENCODINGS_PB_H_
@@ -0,0 +1,34 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include "util/encodings/encodings.h"
18
+
19
+ #include "gtest/gtest.h"
20
+
21
+ TEST(EncodingsTest, EncodingNameAliasToEncoding) {
22
+ // Alias lookup must ignore letter case and non-alphanumeric characters.
23
+ EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso_8859_1"));
24
+ EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso-8859-1"));
25
+
26
+ // Embedded spaces and hyphens must be ignored as well.
27
+ EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF8"));
28
+ EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF 8"));
29
+ EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF-8"));
30
+
31
+ // Differences in letters or digits must still yield a different encoding.
32
+ EXPECT_NE(UTF8, EncodingNameAliasToEncoding("UTF-7"));
33
+ EXPECT_NE(KOREAN_EUC_KR, EncodingNameAliasToEncoding("euc-jp"));
34
+ }