language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,254 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // This file is for i18n. It contains two enums, namely Language and
6
+ // Encoding, where Language is the linguistic convention, and Encoding
7
+ // contains information on both language encoding and character set.
8
+ //
9
+ // The language and encoding are both based on Teragram's conventions,
10
+ // except for some common ISO-8859 encodings that are not detected by
11
+ // Teragram but might be in the future.
12
+ //
13
+ // This file also includes functions that do mappings among
14
+ // Language/Encoding enums, language/encoding string names (typically
15
+ // the output from Language Encoding identifier), and language codes
16
+ // (iso 639), and two-letter country codes (iso 3166)
17
+ //
18
+ // NOTE: Both Language and Encoding enums should always start from
19
+ // zero value. This assumption has been made and used.
20
+ //
21
+
22
+ #ifndef ENCODINGS_LANG_ENC_H__
23
+ #define ENCODINGS_LANG_ENC_H__
24
+
25
+ #include "languages/public/languages.h"
26
+ #include "encodings/public/encodings.h"
27
+
28
+
29
+ // EncodingsForLanguage
30
+ // --------------------
31
+ //
32
+ // Given the language, returns a pointer to an array of encodings this
33
+ // language supports. Typically, the encs array has at least one
34
+ // element: UNKNOWN_ENCODING, which is always the last element of the
35
+ // array. The first encoding is the default encoding of the language.
36
+ // Return NULL if the input is invalid.
37
+ //
38
+ // Note: The output encoding array does not include ASCII_7BIT, UTF8
39
+ // or UNICODE which are good for all languages. TODO: Find out whether
40
+ // it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
41
+ // as special cases.
42
+ //
43
+ const Encoding* EncodingsForLanguage(Language lang);
44
+
45
+
46
+ // DefaultEncodingForLanguage
47
+ // --------------------------
48
+ //
49
+ // Given the language, returns the default encoding for the language
50
+ // via the argument encoding.
51
+ //
52
+ // The function returns true if the input lang is valid. Otherwise,
53
+ // false is returned, and encoding is set to UNKNOWN_ENCODING.
54
+ //
55
+ bool DefaultEncodingForLanguage(Language lang,
56
+ Encoding *encoding);
57
+
58
+ // LanguagesForEncoding
59
+ // --------------------
60
+ //
61
+ // Given the encoding, returns a pointer to an array of languages this
62
+ // encoding supports. Typically, the langs array has at least one
63
+ // element: UNKNOWN_LANGUAGE, which is always the last element of the
64
+ // array. The first language in the array is the most popular
65
+ // language for that encoding. NULL is returned if the input is
66
+ // invalid.
67
+ //
68
+ // Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
69
+ // UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
70
+ // the languages or to treat these three encodings as special cases.
71
+ //
72
+ // For other known encodings, ENGLISH is always included. This is
73
+ // because English (Latin) characters are included in each encoding.
74
+ //
75
+ const Language* LanguagesForEncoding(Encoding enc);
76
+
77
+ // DefaultLanguageForEncoding
78
+ // --------------------------
79
+ //
80
+ // Given the encoding, returns the default language for that encoding
81
+ // via the argument language.
82
+ //
83
+ // The function returns true if the input enc is valid. Otherwise,
84
+ // false is returned, and language is set to UNKNOWN_LANGUAGE.
85
+ //
86
+ // Note, this function is more useful for the encodings that have only
87
+ // one corresponding language i.e. shift_jis => Japanese. There are
88
+ // cases where multiple languages have the same encoding, for which the
89
+ // default language is an arbitrary choice from them.
90
+ //
91
+ bool DefaultLanguageForEncoding(Encoding enc, Language* language);
92
+
93
+ //
94
+ // IsLangEncCompatible
95
+ // -------------------
96
+ //
97
+ // This function is to determine whether the input language and
98
+ // encoding are compatible. For example, FRENCH and LATIN1 are
99
+ // compatible, but FRENCH and GB are not.
100
+ //
101
+ // If either lang or enc is invalid return false.
102
+ // If lang is unknown, return true.
103
+ // (e.g. we can detect a page's encoding as latin1 from metatag info, but
104
+ // cannot derive its language since there is more than one
105
+ // language encoded in Latin1)
106
+ // If language is known, but encoding is unknown, return false.
107
+ // (return true will do us no good since we cannot convert to UTF8 anyway)
108
+ // If enc is unicode or utf8, return true.
109
+ // Otherwise check if lang is supported by enc and enc supported by
110
+ // lang.
111
+ //
112
+ bool IsLangEncCompatible(Language lang, Encoding enc);
113
+
114
+ //
115
+ // DominantLanguageFromEncoding
116
+ // ----------------------------
117
+ //
118
+ // This function determines if there exists a dominant language for the
119
+ // input encoding. For example, the encoding GB has a dominant
120
+ // language (Chinese), but Latin1 does not.
121
+ //
122
+ // The word "dominant" is used here because English characters are
123
+ // included in each encoding.
124
+ //
125
+ // If there is no dominant language for the encoding, such as Latin1,
126
+ // UNKNOWN_LANGUAGE is returned.
127
+ //
128
+ Language DominantLanguageFromEncoding(Encoding enc);
129
+
130
+ // LanguageCode
131
+ // ------------------------
132
+ // Given the Language and Encoding, return language code with dialects
133
+ // (>= 2 letters). Encoding is necessary to disambiguate between
134
+ // Simplified and Traditional Chinese.
135
+ //
136
+ // See the note on Chinese Language Codes in
137
+ // i18n/languages/public/languages.h
138
+ // for the details.
139
+
140
+ const char* LanguageCode(Language lang, Encoding enc);
141
+
142
+ //
143
+ // IsEncodingWithSupportedLanguage()
144
+ // ---------------------------------
145
+ //
146
+ // There are some encodings listed here just because they are commonly
147
+ // used. There is no interface language for them yet. They are not
148
+ // detected by Teragram, but can be detected from the meta info of the
149
+ // HTML page.
150
+ //
151
+ // For example, we have listed ARABIC_ENCODING but there is no Arabic in
152
+ // the Language enum. If the user inputs an Arabic query from Google
153
+ // main page, Netscape will just send the raw bytes to GWS, and GWS
154
+ // will treat them as Latin1. Therefore, there is no use to detect
155
+ // ARABIC_ENCODING for indexing, since they will never match the
156
+ // queries which are treated as Latin1 by GWS. On the contrary, if we
157
+ // treat page with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
158
+ // fall them through as Latin1 in indexing time. And there might be a
159
+ // match for some ARABIC queries which are also treated as Latin1 by
160
+ // GWS. In fact, some people are relying on this feature to do Arabic
161
+ // searches.
162
+ //
163
+ // Thus for these types of encoding, before we have the UI support for
164
+ // their language and have a pretty comprehensive language/encoding
165
+ // identification quality, it is better to revert them as
166
+ // UNKNOWN_ENCODING.
167
+ //
168
+ // This function checks whether the input encoding is one with
169
+ // an interface language.
170
+ bool IsEncodingWithSupportedLanguage(Encoding enc);
171
+
172
+
173
+ //
174
+ // LangsFromCountryCode and EncFromCountryCode
175
+ // -------------------------------------------
176
+ //
177
+ // These two functions return the possible languages and encodings,
178
+ // respectively, according to the input country code, which is a
179
+ // 2-letter string. The country code is usually specified in the url
180
+ // of a document.
181
+ //
182
+ //
183
+
184
+ // LangsFromCountryCode
185
+ // --------------------
186
+ //
187
+ // This function takes a string of arbitrary length. It treats the
188
+ // first 2 bytes of the string as the country code, as defined in iso
189
+ // 3166-1993 (E). It returns, via arguments, an array of the
190
+ // languages that are popular in that country, roughly in order of
191
+ // popularity, together with the size of the array.
192
+ //
193
+ // This function returns true if we have language information for
194
+ // country_code. Otherwise, it returns false.
195
+ //
196
+ bool LangsFromCountryCode(const char* country_code,
197
+ const Language** lang_arry,
198
+ int* num_langs);
199
+
200
+
201
+ //
202
+ // EncFromCountryCode
203
+ // ------------------
204
+ //
205
+ // This function takes a string of arbitrary length. It treats the
206
+ // first 2 bytes of that string as the country code, as defined in iso
207
+ // 3166-1993 (E). It sets *enc to the encoding that is
208
+ // most often used for the languages spoken in that country.
209
+ //
210
+ // This function returns true if we have encoding information for
211
+ // country_code. Otherwise, it returns false, and *enc is set to
212
+ // UNKNOWN_ENCODING.
213
+ //
214
+ bool EncFromCountryCode(const char* country_code, Encoding* enc);
215
+
216
+
217
+
218
+ // VisualType
219
+ // ----------
220
+ //
221
+ // Right-to-left documents may be in logical or visual order. When they
222
+ // are in visual order we convert them to logical order before processing.
223
+ // This enum lists the types of visual document we can encounter.
224
+ // Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
225
+ // The other documents in those languages, and all documents in non-RTL
226
+ // languages, will be NOT_VISUAL_DOCUMENT.
227
+ enum VisualType {
228
+ NOT_VISUAL_DOCUMENT = 0, // Logical-order RTL documents and all non-RTL documents.
229
+ VISUAL_HEBREW_HTML, // HTML documents in the legacy visual order.
230
+ CONVERTED_RTL_PDF, // Converted RTL PDFs, which are always visual.
231
+ };
232
+
233
+ VisualType default_visualtype();
234
+
235
+ // VisualTypeName
236
+ // --------------
237
+ //
238
+ // Given the visual type, returns a string name useful for debug output.
239
+ const char* VisualTypeName(VisualType visualtype);
240
+
241
+
242
+
243
+ // InitLangEnc
244
+ // -----------
245
+ //
246
+ // Ensures the LangEnc module has been initialized. Normally this
247
+ // happens during InitGoogle, but this allows access for scripts that
248
+ // don't support InitGoogle. InitLangEnc calls InitEncodings (see
249
+ // i18n/encodings/public/encodings.h) and also initializes data
250
+ // structures used in lang_enc.cc.
251
+ //
252
+ void InitLangEnc();
253
+
254
+ #endif // ENCODINGS_LANG_ENC_H__
@@ -0,0 +1,169 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_PROTO_ENCODINGS_PB_H_
6
+ #define ENCODINGS_PROTO_ENCODINGS_PB_H_
7
+
8
+ enum Encoding {
9
+ ISO_8859_1 = 0, // Teragram ASCII
10
+ ISO_8859_2 = 1, // Teragram Latin2
11
+ ISO_8859_3 = 2, // in BasisTech but not in Teragram
12
+ ISO_8859_4 = 3, // Teragram Latin4
13
+ ISO_8859_5 = 4, // Teragram ISO-8859-5
14
+ ISO_8859_6 = 5, // Teragram Arabic
15
+ ISO_8859_7 = 6, // Teragram Greek
16
+ ISO_8859_8 = 7, // Teragram Hebrew
17
+ ISO_8859_9 = 8, // in BasisTech but not in Teragram
18
+ ISO_8859_10 = 9, // in BasisTech but not in Teragram
19
+ JAPANESE_EUC_JP = 10, // Teragram EUC_JP
20
+ JAPANESE_SHIFT_JIS = 11, // Teragram SJS
21
+ JAPANESE_JIS = 12, // Teragram JIS
22
+ CHINESE_BIG5 = 13, // Teragram BIG5
23
+ CHINESE_GB = 14, // Teragram GB
24
+ CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
25
+ // CNS11643EUC, before that Teragram EUC-CN(!)
26
+ // See //i18n/basistech/basistech_encodings.h
27
+ KOREAN_EUC_KR = 16, // Teragram KSC
28
+ UNICODE = 17, // Teragram Unicode
29
+ CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech
30
+ // CNS11643EUC, before that Teragram EUC.
31
+ CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech
32
+ // CNS11643EUC, before that Teragram CNS.
33
+ CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950
34
+ JAPANESE_CP932 = 21, // Teragram CP932
35
+ UTF8 = 22,
36
+ UNKNOWN_ENCODING = 23,
37
+ ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
38
+ // Should be present only in the crawler
39
+ // and in the repository,
40
+ // *never* as a result of Document::encoding().
41
+ RUSSIAN_KOI8_R = 25, // Teragram KOI8R
42
+ RUSSIAN_CP1251 = 26, // Teragram CP1251
43
+
44
+ //----------------------------------------------------------
45
+ // These are _not_ output from teragram. Instead, they are as
46
+ // detected in the headers of usenet articles.
47
+ MSFT_CP1252 = 27, // CP1252 aka MSFT euro ascii
48
+ RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
49
+ // Misnamed, this is _not_ KOI8-RU but KOI8-U.
50
+ // KOI8-U is used much more often than KOI8-RU.
51
+ MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
52
+ ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
53
+ //----------------------------------------------------------
54
+
55
+ //----------------------------------------------------------
56
+ // These are in BasisTech but not in Teragram. They are
57
+ // needed for new interface languages. Now detected by
58
+ // research langid
59
+ MSFT_CP1254 = 31, // used for Turkish
60
+ MSFT_CP1257 = 32, // used in Baltic countries
61
+ //----------------------------------------------------------
62
+
63
+ //----------------------------------------------------------
64
+ //----------------------------------------------------------
65
+ // New encodings detected by Teragram
66
+ ISO_8859_11 = 33, // aka TIS-620, used for Thai
67
+ MSFT_CP874 = 34, // used for Thai
68
+ MSFT_CP1256 = 35, // used for Arabic
69
+
70
+ //----------------------------------------------------------
71
+ // Detected as ISO_8859_8 by Teragram, but can be found in META tags
72
+ MSFT_CP1255 = 36, // Logical Hebrew Microsoft
73
+ ISO_8859_8_I = 37, // Iso Hebrew Logical
74
+ HEBREW_VISUAL = 38, // Iso Hebrew Visual
75
+ //----------------------------------------------------------
76
+
77
+ //----------------------------------------------------------
78
+ // Detected by research langid
79
+ CZECH_CP852 = 39,
80
+ CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
81
+ MSFT_CP1253 = 41, // used for Greek
82
+ RUSSIAN_CP866 = 42,
83
+ //----------------------------------------------------------
84
+
85
+ //----------------------------------------------------------
86
+ // Handled by iconv in glibc
87
+ ISO_8859_13 = 43,
88
+ ISO_2022_KR = 44,
89
+ GBK = 45,
90
+ GB18030 = 46,
91
+ BIG5_HKSCS = 47,
92
+ ISO_2022_CN = 48,
93
+
94
+ //-----------------------------------------------------------
95
+ // Detected by xin liu's detector
96
+ // Handled by transcoder
97
+ // (Indic encodings)
98
+
99
+ TSCII = 49,
100
+ TAMIL_MONO = 50,
101
+ TAMIL_BI = 51,
102
+ JAGRAN = 52,
103
+
104
+
105
+ MACINTOSH_ROMAN = 53,
106
+ UTF7 = 54,
107
+ BHASKAR = 55, // Indic encoding - Devanagari
108
+ HTCHANAKYA = 56, // Indic encoding - Devanagari
109
+
110
+ //-----------------------------------------------------------
111
+ // These allow a single place (inputconverter and outputconverter)
112
+ // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
113
+ // bulk conversions, with interchange-valid checking on input and
114
+ // fallback if needed on output.
115
+ UTF16BE = 57, // big-endian UTF-16
116
+ UTF16LE = 58, // little-endian UTF-16
117
+ UTF32BE = 59, // big-endian UTF-32
118
+ UTF32LE = 60, // little-endian UTF-32
119
+ //-----------------------------------------------------------
120
+
121
+ //-----------------------------------------------------------
122
+ // An encoding that means "This is not text, but it may have some
123
+ // simple ASCII text embedded". Intended input conversion (not yet
124
+ // implemented) is to keep strings of >=4 seven-bit ASCII characters
125
+ // (follow each kept string with an ASCII space), delete the rest of
126
+ // the bytes. This will pick up and allow indexing of e.g. captions
127
+ // in JPEGs. No output conversion needed.
128
+ BINARYENC = 61,
129
+ //-----------------------------------------------------------
130
+
131
+ //-----------------------------------------------------------
132
+ // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
133
+ // ~{ ... ~} for 2-byte pairs, and the browsers support this.
134
+ HZ_GB_2312 = 62,
135
+ //-----------------------------------------------------------
136
+
137
+ //-----------------------------------------------------------
138
+ // Some external vendors make the common input error of
139
+ // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
140
+ UTF8UTF8 = 63,
141
+ //-----------------------------------------------------------
142
+
143
+ //-----------------------------------------------------------
144
+ // Handled by transcoder for tamil language specific font
145
+ // encodings without the support for detection at present.
146
+ TAM_ELANGO = 64, // Elango - Tamil
147
+ TAM_LTTMBARANI = 65, // Barani - Tamil
148
+ TAM_SHREE = 66, // Shree - Tamil
149
+ TAM_TBOOMIS = 67, // TBoomis - Tamil
150
+ TAM_TMNEWS = 68, // TMNews - Tamil
151
+ TAM_WEBTAMIL = 69, // Webtamil - Tamil
152
+ //-----------------------------------------------------------
153
+
154
+ //-----------------------------------------------------------
155
+ // Shift_JIS variants used by Japanese cell phone carriers.
156
+ KDDI_SHIFT_JIS = 70,
157
+ DOCOMO_SHIFT_JIS = 71,
158
+ SOFTBANK_SHIFT_JIS = 72,
159
+ // ISO-2022-JP variants used by KDDI and SoftBank.
160
+ KDDI_ISO_2022_JP = 73,
161
+ SOFTBANK_ISO_2022_JP = 74,
162
+ //-----------------------------------------------------------
163
+
164
+ NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
165
+ // valid Encoding enum, it is only used to
166
+ // indicate the total number of Encodings.
167
+ };
168
+
169
+ #endif // ENCODINGS_PROTO_ENCODINGS_PB_H_