compact_enc_det 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,299 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef UTIL_ENCODINGS_ENCODINGS_H_
18
+ #define UTIL_ENCODINGS_ENCODINGS_H_
19
+
20
+ // This interface defines the Encoding enum and various functions that
21
+ // depend only on Encoding values.
22
+
23
+ // A hash-function for Encoding, hash<Encoding>, is defined in
24
+ // i18n/encodings/public/encodings-hash.h
25
+
26
+ // On some Windows projects, UNICODE may be defined, which would prevent the
27
+ // Encoding enum below from compiling. Note that this is a quick fix that does
28
+ // not break any existing projects. The UNICODE enum may someday be changed
29
+ // to something more specific and non-colliding, but this involves careful
30
+ // testing of changes in many other projects.
31
+ #undef UNICODE
32
+
33
+ // NOTE: The Encoding enum must always start at 0. This assumption has
34
+ // been made and used.
35
+
36
+ #ifndef SWIG
37
+
38
+ #include "util/encodings/encodings.pb.h"
39
+
40
+ #else
41
+
42
+ // TODO: Include a SWIG workaround header file.
43
+
44
+ #endif
45
+
46
+ const int kNumEncodings = NUM_ENCODINGS;
47
+
48
+ // some of the popular encoding aliases
49
+ // TODO: Make these static const Encoding values instead of macros.
50
+ #define LATIN1 ISO_8859_1
51
+ #define LATIN2 ISO_8859_2
52
+ #define LATIN3 ISO_8859_3
53
+ #define LATIN4 ISO_8859_4
54
+ #define CYRILLIC ISO_8859_5
55
+ #define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
56
+ #define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
57
+ #define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
58
+ #define LATIN5 ISO_8859_9
59
+ #define LATIN6 ISO_8859_10
60
+ #define KOREAN_HANGUL KOREAN_EUC_KR
61
+
62
+ // The default Encoding (LATIN1).
63
+ Encoding default_encoding();
64
+
65
+
66
+
67
+ // *************************************************************
68
+ // Encoding predicates
69
+ // IsValidEncoding()
70
+ // IsEncEncCompatible
71
+ // IsSupersetOfAscii7Bit
72
+ // Is8BitEncoding
73
+ // IsCJKEncoding
74
+ // IsHebrewEncoding
75
+ // IsRightToLeftEncoding
76
+ // IsLogicalRightToLeftEncoding
77
+ // IsVisualRightToLeftEncoding
78
+ // IsIso2022Encoding
79
+ // IsIso2022JpOrVariant
80
+ // IsShiftJisOrVariant
81
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
82
+ // *************************************************************
83
+
84
+ // IsValidEncoding
85
+ // ===================================
86
+ //
87
+ // Function to check if the input encoding enum is within range.
88
+ //
89
+
90
+ bool IsValidEncoding(Encoding enc);
91
+
92
+ //
93
+ // IsEncEncCompatible
94
+ // ------------------
95
+ //
96
+ // This function is to determine whether or not converting from the
97
+ // first encoding to the second requires any changes to the underlying
98
+ // text (e.g. ASCII_7BIT is a subset of UTF8).
99
+ //
100
+ // TODO: the current implementation is likely incomplete. It would be
101
+ // good to consider the full matrix of all pairs of encodings and to fish out
102
+ // all compatible pairs.
103
+ //
104
+ bool IsEncEncCompatible(const Encoding from, const Encoding to);
105
+
106
+ // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
107
+ // encoding represent the same characters as they do in ISO_8859_1.
108
+
109
+ // WARNING: This function does not currently return true for all encodings that
110
+ // are supersets of Ascii 7-bit.
111
+ bool IsSupersetOfAscii7Bit(Encoding e);
112
+
113
+ // To be an 8-bit encoding means that there are at most 256 symbols.
114
+ // Each byte determines a new character; there are no multi-byte sequences.
115
+
116
+ // WARNING: This function does not currently return true for all encodings that
117
+ // are 8-bit encodings.
118
+ bool Is8BitEncoding(Encoding e);
119
+
120
+ // IsCJKEncoding
121
+ // -------------
122
+ //
123
+ // This function returns true if the encoding is either Chinese
124
+ // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
125
+ // considered a CJK encoding.
126
+ bool IsCJKEncoding(Encoding e);
127
+
128
+ // IsHebrewEncoding
129
+ // -------------
130
+ //
131
+ // This function returns true if the encoding is a Hebrew specific
132
+ // encoding (not UTF8, etc).
133
+ bool IsHebrewEncoding(Encoding e);
134
+
135
+ // IsRightToLeftEncoding
136
+ // ---------------------
137
+ //
138
+ // Returns true if the encoding is a right-to-left encoding.
139
+ //
140
+ // Note that the name of this function is somewhat misleading. There is nothing
141
+ // "right to left" about these encodings. They merely contain code points for
142
+ // characters in RTL languages such as Hebrew and Arabic. But this is also
143
+ // true for UTF-8.
144
+ //
145
+ // TODO: Get rid of this function. The only special-case we
146
+ // should need to worry about are visual encodings. Anything we
147
+ // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
148
+ bool IsRightToLeftEncoding(Encoding enc);
149
+
150
+ // IsLogicalRightToLeftEncoding
151
+ // ----------------------------
152
+ //
153
+ // Returns true if the encoding is a logical right-to-left encoding.
154
+ // Logical right-to-left encodings are those that the browser renders
155
+ // right-to-left and applies the BiDi algorithm to. Therefore the characters
156
+ // appear in reading order in the file, and indexing, snippet generation etc.
157
+ // should all just work with no special processing.
158
+ //
159
+ // TODO: Get rid of this function. The only special-case we
160
+ // should need to worry about are visual encodings.
161
+ bool IsLogicalRightToLeftEncoding(Encoding enc);
162
+
163
+ // IsVisualRightToLeftEncoding
164
+ // ---------------------------
165
+ //
166
+ // Returns true if the encoding is a visual right-to-left encoding.
167
+ // Visual right-to-left encodings are those that the browser renders
168
+ // left-to-right and does not apply the BiDi algorithm to. Therefore each
169
+ // line appears in reverse order in the file, lines are manually wrapped
170
+ // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
171
+ // the prehistoric days when browsers couldn't render right-to-left, but
172
+ // unfortunately some visual pages persist to this day. These documents require
173
+ // special processing so that we don't index or snippet them with each line
174
+ // reversed.
175
+ bool IsVisualRightToLeftEncoding(Encoding enc);
176
+
177
+ // IsIso2022Encoding
178
+ // -----------------
179
+ //
180
+ // Returns true if the encoding is a kind of ISO 2022 such as
181
+ // ISO-2022-JP.
182
+ bool IsIso2022Encoding(Encoding enc);
183
+
184
+ // IsIso2022JpOrVariant
185
+ // --------------------
186
+ //
187
+ // Returns true if the encoding is ISO-2022-JP or a variant such as
188
+ // KDDI's ISO-2022-JP.
189
+ bool IsIso2022JpOrVariant(Encoding enc);
190
+
191
+ // IsShiftJisOrVariant
192
+ // --------------------
193
+ //
194
+ // Returns true if the encoding is Shift_JIS or a variant such as
195
+ // KDDI's Shift_JIS.
196
+ bool IsShiftJisOrVariant(Encoding enc);
197
+
198
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
199
+ // -----------------------------------------
200
+ //
201
+ // Returns true if it's a Japanese cell phone carrier-specific encoding
202
+ // such as KDDI_SHIFT_JIS.
203
+ bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
204
+
205
+
206
+
207
+ // *************************************************************
208
+ // ENCODING NAMES
209
+ //
210
+ // This interface defines a standard name for each valid encoding, and
211
+ // a standard name for invalid encodings. (Some names use all upper
212
+ // case, but others use mixed case.)
213
+ //
214
+ // EncodingName() [Encoding to name]
215
+ // MimeEncodingName() [Encoding to name]
216
+ // EncodingFromName() [name to Encoding]
217
+ // EncodingNameAliasToEncoding() [name to Encoding]
218
+ // default_encoding_name()
219
+ // invalid_encoding_name()
220
+ // *************************************************************
221
+
222
+ // EncodingName
223
+ // ------------
224
+ //
225
+ // Given the encoding, returns its standard name.
226
+ // Return invalid_encoding_name() if the encoding is invalid.
227
+ //
228
+ const char* EncodingName(Encoding enc);
229
+
230
+ //
231
+ // MimeEncodingName
232
+ // ----------------
233
+ //
234
+ // Return the "preferred MIME name" of an encoding.
235
+ //
236
+ // This name is suitable for using in HTTP headers, HTML tags,
237
+ // and as the "charset" parameter of a MIME Content-Type.
238
+ const char* MimeEncodingName(Encoding enc);
239
+
240
+
241
+ // The maximum length of an encoding name
242
+ const int kMaxEncodingNameSize = 50;
243
+
244
+ // The standard name of the default encoding.
245
+ const char* default_encoding_name();
246
+
247
+ // The name used for an invalid encoding.
248
+ const char* invalid_encoding_name();
249
+
250
+ // EncodingFromName
251
+ // ----------------
252
+ //
253
+ // If enc_name matches the standard name of an Encoding, using a
254
+ // case-insensitive comparison, set *encoding to that Encoding and
255
+ // return true. Otherwise set *encoding to UNKNOWN_ENCODING and
256
+ // return false.
257
+ //
258
+ // REQUIRES: encoding must not be NULL.
259
+ //
260
+ bool EncodingFromName(const char* enc_name, Encoding *encoding);
261
+
262
+ //
263
+ // EncodingNameAliasToEncoding
264
+ // ---------------------------
265
+ //
266
+ // If enc_name matches the standard name or an alias of an Encoding,
267
+ // using a case-insensitive comparison, return that
268
+ // Encoding. Otherwise, return UNKNOWN_ENCODING.
269
+ //
270
+ // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
271
+ // GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
272
+ // common variations with hyphens and underscores (e.g., "koi8-u" and
273
+ // "koi8u" for RUSSIAN_KOI8_RU).
274
+
275
+ Encoding EncodingNameAliasToEncoding(const char *enc_name);
276
+
277
+ // *************************************************************
278
+ // Miscellany
279
+ // *************************************************************
280
+
281
+ // PreferredWebOutputEncoding
282
+ // --------------------------
283
+ //
284
+ // Some multi-byte encodings use byte values that coincide with the
285
+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
286
+ // can misinterpret these, as indicated in an external XSS report from
287
+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
288
+ // also use UTF8 instead of encodings that we don't support in our
289
+ // output, and we generally try to be conservative in what we send out.
290
+ // Where the client asks for single- or double-byte encodings that are
291
+ // not as common, we substitute a more common single- or double-byte
292
+ // encoding, if there is one, thereby preserving the client's intent
293
+ // to use less space than UTF-8. This also means that characters
294
+ // outside the destination set will be converted to HTML NCRs (&#NNN;)
295
+ // if requested.
296
+ Encoding PreferredWebOutputEncoding(Encoding enc);
297
+
298
+
299
+ #endif // UTIL_ENCODINGS_ENCODINGS_H_
@@ -0,0 +1,181 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef UTIL_ENCODINGS_ENCODINGS_PB_H_
18
+ #define UTIL_ENCODINGS_ENCODINGS_PB_H_
19
+
20
+ enum Encoding {
21
+ ISO_8859_1 = 0, // Teragram ASCII
22
+ ISO_8859_2 = 1, // Teragram Latin2
23
+ ISO_8859_3 = 2, // in BasisTech but not in Teragram
24
+ ISO_8859_4 = 3, // Teragram Latin4
25
+ ISO_8859_5 = 4, // Teragram ISO-8859-5
26
+ ISO_8859_6 = 5, // Teragram Arabic
27
+ ISO_8859_7 = 6, // Teragram Greek
28
+ ISO_8859_8 = 7, // Teragram Hebrew
29
+ ISO_8859_9 = 8, // in BasisTech but not in Teragram
30
+ ISO_8859_10 = 9, // in BasisTech but not in Teragram
31
+ JAPANESE_EUC_JP = 10, // Teragram EUC_JP
32
+ JAPANESE_SHIFT_JIS = 11, // Teragram SJS
33
+ JAPANESE_JIS = 12, // Teragram JIS
34
+ CHINESE_BIG5 = 13, // Teragram BIG5
35
+ CHINESE_GB = 14, // Teragram GB
36
+ CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
37
+ // CNS11643EUC, before that Teragram EUC-CN(!)
38
+ // See //i18n/basistech/basistech_encodings.h
39
+ KOREAN_EUC_KR = 16, // Teragram KSC
40
+ UNICODE = 17, // Teragram Unicode
41
+ CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech
42
+ // CNS11643EUC, before that Teragram EUC.
43
+ CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech
44
+ // CNS11643EUC, before that Teragram CNS.
45
+ CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950
46
+ JAPANESE_CP932 = 21, // Teragram CP932
47
+ UTF8 = 22,
48
+ UNKNOWN_ENCODING = 23,
49
+ ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
50
+ // Should be present only in the crawler
51
+ // and in the repository,
52
+ // *never* as a result of Document::encoding().
53
+ RUSSIAN_KOI8_R = 25, // Teragram KOI8R
54
+ RUSSIAN_CP1251 = 26, // Teragram CP1251
55
+
56
+ //----------------------------------------------------------
57
+ // These are _not_ output from teragram. Instead, they are as
58
+ // detected in the headers of usenet articles.
59
+ MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii
60
+ RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
61
+ // Misnamed, this is _not_ KOI8-RU but KOI8-U.
62
+ // KOI8-U is used much more often than KOI8-RU.
63
+ MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
64
+ ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
65
+ //----------------------------------------------------------
66
+
67
+ //----------------------------------------------------------
68
+ // These are in BasisTech but not in Teragram. They are
69
+ // needed for new interface languages. Now detected by
70
+ // research langid
71
+ MSFT_CP1254 = 31, // used for Turkish
72
+ MSFT_CP1257 = 32, // used in Baltic countries
73
+ //----------------------------------------------------------
74
+
75
+ //----------------------------------------------------------
76
+ //----------------------------------------------------------
77
+ // New encodings detected by Teragram
78
+ ISO_8859_11 = 33, // aka TIS-620, used for Thai
79
+ MSFT_CP874 = 34, // used for Thai
80
+ MSFT_CP1256 = 35, // used for Arabic
81
+
82
+ //----------------------------------------------------------
83
+ // Detected as ISO_8859_8 by Teragram, but can be found in META tags
84
+ MSFT_CP1255 = 36, // Logical Hebrew Microsoft
85
+ ISO_8859_8_I = 37, // Iso Hebrew Logical
86
+ HEBREW_VISUAL = 38, // Iso Hebrew Visual
87
+ //----------------------------------------------------------
88
+
89
+ //----------------------------------------------------------
90
+ // Detected by research langid
91
+ CZECH_CP852 = 39,
92
+ CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
93
+ MSFT_CP1253 = 41, // used for Greek
94
+ RUSSIAN_CP866 = 42,
95
+ //----------------------------------------------------------
96
+
97
+ //----------------------------------------------------------
98
+ // Handled by iconv in glibc
99
+ ISO_8859_13 = 43,
100
+ ISO_2022_KR = 44,
101
+ GBK = 45,
102
+ GB18030 = 46,
103
+ BIG5_HKSCS = 47,
104
+ ISO_2022_CN = 48,
105
+
106
+ //-----------------------------------------------------------
107
+ // Detected by xin liu's detector
108
+ // Handled by transcoder
109
+ // (Indic encodings)
110
+
111
+ TSCII = 49,
112
+ TAMIL_MONO = 50,
113
+ TAMIL_BI = 51,
114
+ JAGRAN = 52,
115
+
116
+
117
+ MACINTOSH_ROMAN = 53,
118
+ UTF7 = 54,
119
+ BHASKAR = 55, // Indic encoding - Devanagari
120
+ HTCHANAKYA = 56, // 56 Indic encoding - Devanagari
121
+
122
+ //-----------------------------------------------------------
123
+ // These allow a single place (inputconverter and outputconverter)
124
+ // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
125
+ // bulk conversions, with interchange-valid checking on input and
126
+ // fallback if needed on output.
127
+ UTF16BE = 57, // big-endian UTF-16
128
+ UTF16LE = 58, // little-endian UTF-16
129
+ UTF32BE = 59, // big-endian UTF-32
130
+ UTF32LE = 60, // little-endian UTF-32
131
+ //-----------------------------------------------------------
132
+
133
+ //-----------------------------------------------------------
134
+ // An encoding that means "This is not text, but it may have some
135
+ // simple ASCII text embedded". Intended input conversion (not yet
136
+ // implemented) is to keep strings of >=4 seven-bit ASCII characters
137
+ // (follow each kept string with an ASCII space), delete the rest of
138
+ // the bytes. This will pick up and allow indexing of e.g. captions
139
+ // in JPEGs. No output conversion needed.
140
+ BINARYENC = 61,
141
+ //-----------------------------------------------------------
142
+
143
+ //-----------------------------------------------------------
144
+ // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
145
+ // ~{ ... ~} for 2-byte pairs, and the browsers support this.
146
+ HZ_GB_2312 = 62,
147
+ //-----------------------------------------------------------
148
+
149
+ //-----------------------------------------------------------
150
+ // Some external vendors make the common input error of
151
+ // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
152
+ UTF8UTF8 = 63,
153
+ //-----------------------------------------------------------
154
+
155
+ //-----------------------------------------------------------
156
+ // Handled by transcoder for tamil language specific font
157
+ // encodings without the support for detection at present.
158
+ TAM_ELANGO = 64, // Elango - Tamil
159
+ TAM_LTTMBARANI = 65, // Barani - Tamil
160
+ TAM_SHREE = 66, // Shree - Tamil
161
+ TAM_TBOOMIS = 67, // TBoomis - Tamil
162
+ TAM_TMNEWS = 68, // TMNews - Tamil
163
+ TAM_WEBTAMIL = 69, // Webtamil - Tamil
164
+ //-----------------------------------------------------------
165
+
166
+ //-----------------------------------------------------------
167
+ // Shift_JIS variants used by Japanese cell phone carriers.
168
+ KDDI_SHIFT_JIS = 70,
169
+ DOCOMO_SHIFT_JIS = 71,
170
+ SOFTBANK_SHIFT_JIS = 72,
171
+ // ISO-2022-JP variants used by KDDI and SoftBank.
172
+ KDDI_ISO_2022_JP = 73,
173
+ SOFTBANK_ISO_2022_JP = 74,
174
+ //-----------------------------------------------------------
175
+
176
+ NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
177
+ // valid Encoding enum, it is only used to
178
+ // indicate the total number of Encodings.
179
+ };
180
+
181
+ #endif // UTIL_ENCODINGS_ENCODINGS_PB_H_
@@ -0,0 +1,34 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include "util/encodings/encodings.h"
18
+
19
+ #include "gtest/gtest.h"
20
+
21
+ TEST(EncodingsTest, EncodingNameAliasToEncoding) {
22
+ // Test that case and non-alphanumeric chars are ignored.
23
+ EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso_8859_1"));
24
+ EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso-8859-1"));
25
+
26
+ // Test that spaces are ignored.
27
+ EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF8"));
28
+ EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF 8"));
29
+ EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF-8"));
30
+
31
+ // Test alphanumeric differences are counted.
32
+ EXPECT_NE(UTF8, EncodingNameAliasToEncoding("UTF-7"));
33
+ EXPECT_NE(KOREAN_EUC_KR, EncodingNameAliasToEncoding("euc-jp"));
34
+ }