language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,254 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // This file is for i18n. It contains two enums, namely Language and
6
+ // Encoding, where Language is the linguistic convention, and Encoding
7
+ // contains information on both language encoding and character set.
8
+ //
9
+ // The language and encoding are both based on Teragram's conventions,
10
+ // except for some common ISO-8859 encodings that are not detected by
11
+ // Teragram but might be in the future.
12
+ //
13
+ // This file also includes functions that do mappings among
14
+ // Language/Encoding enums, language/encoding string names (typically
15
+ // the output from Language Encoding identifier), and language codes
16
+ // (iso 639), and two-letter country codes (iso 3166)
17
+ //
18
+ // NOTE: Both Language and Encoding enums should always start from
19
+ // zero value. This assumption has been made and used.
20
+ //
21
+
22
+ #ifndef ENCODINGS_LANG_ENC_H__
23
+ #define ENCODINGS_LANG_ENC_H__
24
+
25
+ #include "languages/public/languages.h"
26
+ #include "encodings/public/encodings.h"
27
+
28
+
29
+ // EncodingsForLanguage
30
+ // --------------------
31
+ //
32
+ // Given the language, returns a pointer to an array of encodings this
33
+ // language supports. Typically, the encs array has at least one
34
+ // element: UNKNOWN_ENCODING, which is always the last element of the
35
+ // array. The first encoding is the default encoding of the language.
36
+ // Return NULL if the input is invalid.
37
+ //
38
+ // Note: The output encoding array does not include ASCII_7BIT, UTF8
39
+ // or UNICODE which are good for all languages. TODO: Find out whether
40
+ // it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
41
+ // as special cases.
42
+ //
43
+ const Encoding* EncodingsForLanguage(Language lang);
44
+
45
+
46
+ // DefaultEncodingForLanguage
47
+ // --------------------------
48
+ //
49
+ // Given the language, returns the default encoding for the language
50
+ // via the argument encoding.
51
+ //
52
+ // The function returns true if the input lang is valid. Otherwise,
53
+ // false is returned, and encoding is set to UNKNOWN_ENCODING.
54
+ //
55
+ bool DefaultEncodingForLanguage(Language lang,
56
+ Encoding *encoding);
57
+
58
+ // LanguagesForEncoding
59
+ // --------------------
60
+ //
61
+ // Given the encoding, returns a pointer to an array of languages this
62
+ // encoding supports. Typically, the langs array has at least one
63
+ // element: UNKNOWN_LANGUAGE, which is always the last element of the
64
+ // array. The first language in the array is the most popular
65
+ // language for that encoding. NULL is returned if the input is
66
+ // invalid.
67
+ //
68
+ // Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
69
+ // UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
70
+ // the languages or to treat these encodings as special cases.
71
+ //
72
+ // For other known encodings, ENGLISH is always included. This is
73
+ // because English (Latin) characters are included in each encoding.
74
+ //
75
+ const Language* LanguagesForEncoding(Encoding enc);
76
+
77
+ // DefaultLanguageForEncoding
78
+ // --------------------------
79
+ //
80
+ // Given the encoding, returns the default language for that encoding
81
+ // via the argument language.
82
+ //
83
+ // The function returns true if the input enc is valid. Otherwise,
84
+ // false is returned, and language is set to UNKNOWN_LANGUAGE.
85
+ //
86
+ // Note, this function is more useful for the encodings that have only
87
+ // one corresponding language i.e. shift_jis => Japanese. There are
88
+ // cases that multiple languages have the same encoding, for which the
89
+ // default language is an arbitrary choice from them.
90
+ //
91
+ bool DefaultLanguageForEncoding(Encoding enc, Language* language);
92
+
93
+ //
94
+ // IsLangEncCompatible
95
+ // -------------------
96
+ //
97
+ // This function is to determine whether the input language and
98
+ // encoding are compatible. For example, FRENCH and LATIN1 are
99
+ // compatible, but FRENCH and GB are not.
100
+ //
101
+ // If either lang or enc is invalid return false.
102
+ // If lang is unknown, return true.
103
+ // (e.g. we can detect a page's encoding as latin1 from metatag info, but
104
+ // cannot derive its language since there is more than one
105
+ // language encoding in Latin1 )
106
+ // If language is known, but encoding is unknown, return false.
107
+ // (return true will do us no good since we cannot convert to UTF8 anyway)
108
+ // If enc is unicode or utf8, return true.
109
+ // Otherwise check if lang is supported by enc and enc supported by
110
+ // lang.
111
+ //
112
+ bool IsLangEncCompatible(Language lang, Encoding enc);
113
+
114
+ //
115
+ // DominantLanguageFromEncoding
116
+ // ----------------------------
117
+ //
118
+ // This function determines whether there exists a dominant language for the
119
+ // input encoding. For example, the encoding GB has a dominant
120
+ // language (Chinese), but Latin1 does not.
121
+ //
122
+ // The word "dominant" is used here because English characters are
123
+ // included in each encoding.
124
+ //
125
+ // If there is no dominant language for the encoding, such as Latin1,
126
+ // UNKNOWN_LANGUAGE is returned.
127
+ //
128
+ Language DominantLanguageFromEncoding(Encoding enc);
129
+
130
+ // LanguageCode
131
+ // ------------------------
132
+ // Given the Language and Encoding, return language code with dialects
133
+ // (>= 2 letters). Encoding is necessary to disambiguate between
134
+ // Simplified and Traditional Chinese.
135
+ //
136
+ // See the note on Chinese Language Codes in
137
+ // i18n/languages/public/languages.h
138
+ // for the details.
139
+
140
+ const char* LanguageCode(Language lang, Encoding enc);
141
+
142
+ //
143
+ // IsEncodingWithSupportedLanguage()
144
+ // ---------------------------------
145
+ //
146
+ // There are some encodings listed here just because they are commonly
147
+ // used. There is no interface language for them yet. They are not
148
+ // detected by Teragram, but can be detected from the meta info of the
149
+ // HTML page.
150
+ //
151
+ // For example, we have listed ARABIC_ENCODING but there is no Arabic in
152
+ // the Language enum. If the user inputs an Arabic query from Google
153
+ // main page, Netscape will just send the raw bytes to GWS, and GWS
154
+ // will treat them as Latin1. Therefore, there is no use to detect
155
+ // ARABIC_ENCODING for indexing, since they will never match the
156
+ // queries which are treated as Latin1 by GWS. On the contrary, if we
157
+ // treat page with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
158
+ // fall them through as Latin1 in indexing time. And there might be a
159
+ // match for some ARABIC queries which are also treated as Latin1 by
160
+ // GWS. In fact, some people are relying on this feature to do Arabic
161
+ // searches.
162
+ //
163
+ // Thus for these types of encoding, before we have the UI support for
164
+ // their language and have a pretty comprehensive language/encoding
165
+ // identification quality, it is better to revert them as
166
+ // UNKNOWN_ENCODING.
167
+ //
168
+ // This function checks whether the input encoding is one with
169
+ // an interface language.
170
+ bool IsEncodingWithSupportedLanguage(Encoding enc);
171
+
172
+
173
+ //
174
+ // LangsFromCountryCode and EncFromCountryCode
175
+ // -------------------------------------------
176
+ //
177
+ // These two functions return the possible languages and encodings,
178
+ // respectively, according to the input country code, which is a
179
+ // 2-letter string. The country code is usually specified in the url
180
+ // of a document.
181
+ //
182
+ //
183
+
184
+ // LangsFromCountryCode
185
+ // --------------------
186
+ //
187
+ // This function takes a string of arbitrary length. It treats the
188
+ // first 2 bytes of the string as the country code, as defined in iso
189
+ // 3166-1993 (E). It returns, via arguments, an array of the
190
+ // languages that are popular in that country, roughly in order of
191
+ // popularity, together with the size of the array.
192
+ //
193
+ // This function returns true if we have language information for
194
+ // country_code. Otherwise, it returns false.
195
+ //
196
+ bool LangsFromCountryCode(const char* country_code,
197
+ const Language** lang_arry,
198
+ int* num_langs);
199
+
200
+
201
+ //
202
+ // EncFromCountryCode
203
+ // ------------------
204
+ //
205
+ // This function takes a string of arbitrary length. It treats the
206
+ // first 2 bytes of that string as the country code, as defined in iso
207
+ // 3166-1993 (E). It sets *enc to the encoding that is
208
+ // most often used for the languages spoken in that country.
209
+ //
210
+ // This function returns true if we have encoding information for
211
+ // country_code. Otherwise, it returns false, and *enc is set to
212
+ // UNKNOWN_ENCODING.
213
+ //
214
+ bool EncFromCountryCode(const char* country_code, Encoding* enc);
215
+
216
+
217
+
218
+ // VisualType
219
+ // ----------
220
+ //
221
+ // Right-to-left documents may be in logical or visual order. When they
222
+ // are in visual order we convert them to logical order before processing.
223
+ // This enum lists the types of visual document we can encounter.
224
+ // Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
225
+ // The other documents in those languages, and all documents in non-RTL
226
+ // languages, will be NOT_VISUAL_DOCUMENT.
227
+ enum VisualType {
228
+ NOT_VISUAL_DOCUMENT = 0,
229
+ VISUAL_HEBREW_HTML, // HTML documents in the legacy visual order.
230
+ CONVERTED_RTL_PDF, // Converted RTL PDFs, which are always visual.
231
+ };
232
+
233
+ VisualType default_visualtype();
234
+
235
+ // VisualTypeName
236
+ // --------------
237
+ //
238
+ // Given the visual type, returns a string name useful for debug output.
239
+ const char* VisualTypeName(VisualType visualtype);
240
+
241
+
242
+
243
+ // InitLangEnc
244
+ // -----------
245
+ //
246
+ // Ensures the LangEnc module has been initialized. Normally this
247
+ // happens during InitGoogle, but this allows access for scripts that
248
+ // don't support InitGoogle. InitLangEnc calls InitEncodings (see
249
+ // i18n/encodings/public/encodings.h) and also initializes data
250
+ // structures used in lang_enc.cc.
251
+ //
252
+ void InitLangEnc();
253
+
254
+ #endif // ENCODINGS_LANG_ENC_H__
@@ -0,0 +1,169 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_PROTO_ENCODINGS_PB_H_
6
+ #define ENCODINGS_PROTO_ENCODINGS_PB_H_
7
+
8
+ enum Encoding {
9
+ ISO_8859_1 = 0, // Teragram ASCII
10
+ ISO_8859_2 = 1, // Teragram Latin2
11
+ ISO_8859_3 = 2, // in BasisTech but not in Teragram
12
+ ISO_8859_4 = 3, // Teragram Latin4
13
+ ISO_8859_5 = 4, // Teragram ISO-8859-5
14
+ ISO_8859_6 = 5, // Teragram Arabic
15
+ ISO_8859_7 = 6, // Teragram Greek
16
+ ISO_8859_8 = 7, // Teragram Hebrew
17
+ ISO_8859_9 = 8, // in BasisTech but not in Teragram
18
+ ISO_8859_10 = 9, // in BasisTech but not in Teragram
19
+ JAPANESE_EUC_JP = 10, // Teragram EUC_JP
20
+ JAPANESE_SHIFT_JIS = 11, // Teragram SJS
21
+ JAPANESE_JIS = 12, // Teragram JIS
22
+ CHINESE_BIG5 = 13, // Teragram BIG5
23
+ CHINESE_GB = 14, // Teragram GB
24
+ CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
25
+ // CNS11643EUC, before that Teragram EUC-CN(!)
26
+ // See //i18n/basistech/basistech_encodings.h
27
+ KOREAN_EUC_KR = 16, // Teragram KSC
28
+ UNICODE = 17, // Teragram Unicode
29
+ CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech
30
+ // CNS11643EUC, before that Teragram EUC.
31
+ CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech
32
+ // CNS11643EUC, before that Teragram CNS.
33
+ CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950
34
+ JAPANESE_CP932 = 21, // Teragram CP932
35
+ UTF8 = 22,
36
+ UNKNOWN_ENCODING = 23,
37
+ ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
38
+ // Should be present only in the crawler
39
+ // and in the repository,
40
+ // *never* as a result of Document::encoding().
41
+ RUSSIAN_KOI8_R = 25, // Teragram KOI8R
42
+ RUSSIAN_CP1251 = 26, // Teragram CP1251
43
+
44
+ //----------------------------------------------------------
45
+ // These are _not_ output from teragram. Instead, they are as
46
+ // detected in the headers of usenet articles.
47
+ MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii
48
+ RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
49
+ // Misnamed, this is _not_ KOI8-RU but KOI8-U.
50
+ // KOI8-U is used much more often than KOI8-RU.
51
+ MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
52
+ ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
53
+ //----------------------------------------------------------
54
+
55
+ //----------------------------------------------------------
56
+ // These are in BasisTech but not in Teragram. They are
57
+ // needed for new interface languages. Now detected by
58
+ // research langid
59
+ MSFT_CP1254 = 31, // used for Turkish
60
+ MSFT_CP1257 = 32, // used in Baltic countries
61
+ //----------------------------------------------------------
62
+
63
+ //----------------------------------------------------------
64
+ //----------------------------------------------------------
65
+ // New encodings detected by Teragram
66
+ ISO_8859_11 = 33, // aka TIS-620, used for Thai
67
+ MSFT_CP874 = 34, // used for Thai
68
+ MSFT_CP1256 = 35, // used for Arabic
69
+
70
+ //----------------------------------------------------------
71
+ // Detected as ISO_8859_8 by Teragram, but can be found in META tags
72
+ MSFT_CP1255 = 36, // Logical Hebrew Microsoft
73
+ ISO_8859_8_I = 37, // Iso Hebrew Logical
74
+ HEBREW_VISUAL = 38, // Iso Hebrew Visual
75
+ //----------------------------------------------------------
76
+
77
+ //----------------------------------------------------------
78
+ // Detected by research langid
79
+ CZECH_CP852 = 39,
80
+ CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
81
+ MSFT_CP1253 = 41, // used for Greek
82
+ RUSSIAN_CP866 = 42,
83
+ //----------------------------------------------------------
84
+
85
+ //----------------------------------------------------------
86
+ // Handled by iconv in glibc
87
+ ISO_8859_13 = 43,
88
+ ISO_2022_KR = 44,
89
+ GBK = 45,
90
+ GB18030 = 46,
91
+ BIG5_HKSCS = 47,
92
+ ISO_2022_CN = 48,
93
+
94
+ //-----------------------------------------------------------
95
+ // Detected by xin liu's detector
96
+ // Handled by transcoder
97
+ // (Indic encodings)
98
+
99
+ TSCII = 49,
100
+ TAMIL_MONO = 50,
101
+ TAMIL_BI = 51,
102
+ JAGRAN = 52,
103
+
104
+
105
+ MACINTOSH_ROMAN = 53,
106
+ UTF7 = 54,
107
+ BHASKAR = 55, // Indic encoding - Devanagari
108
+ HTCHANAKYA = 56, // 56 Indic encoding - Devanagari
109
+
110
+ //-----------------------------------------------------------
111
+ // These allow a single place (inputconverter and outputconverter)
112
+ // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
113
+ // bulk conversions, with interchange-valid checking on input and
114
+ // fallback if needed on output.
115
+ UTF16BE = 57, // big-endian UTF-16
116
+ UTF16LE = 58, // little-endian UTF-16
117
+ UTF32BE = 59, // big-endian UTF-32
118
+ UTF32LE = 60, // little-endian UTF-32
119
+ //-----------------------------------------------------------
120
+
121
+ //-----------------------------------------------------------
122
+ // An encoding that means "This is not text, but it may have some
123
+ // simple ASCII text embedded". Intended input conversion (not yet
124
+ // implemented) is to keep strings of >=4 seven-bit ASCII characters
125
+ // (follow each kept string with an ASCII space), delete the rest of
126
+ // the bytes. This will pick up and allow indexing of e.g. captions
127
+ // in JPEGs. No output conversion needed.
128
+ BINARYENC = 61,
129
+ //-----------------------------------------------------------
130
+
131
+ //-----------------------------------------------------------
132
+ // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
133
+ // ~{ ... ~} for 2-byte pairs, and the browsers support this.
134
+ HZ_GB_2312 = 62,
135
+ //-----------------------------------------------------------
136
+
137
+ //-----------------------------------------------------------
138
+ // Some external vendors make the common input error of
139
+ // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
140
+ UTF8UTF8 = 63,
141
+ //-----------------------------------------------------------
142
+
143
+ //-----------------------------------------------------------
144
+ // Handled by transcoder for tamil language specific font
145
+ // encodings without the support for detection at present.
146
+ TAM_ELANGO = 64, // Elango - Tamil
147
+ TAM_LTTMBARANI = 65, // Barani - Tamil
148
+ TAM_SHREE = 66, // Shree - Tamil
149
+ TAM_TBOOMIS = 67, // TBoomis - Tamil
150
+ TAM_TMNEWS = 68, // TMNews - Tamil
151
+ TAM_WEBTAMIL = 69, // Webtamil - Tamil
152
+ //-----------------------------------------------------------
153
+
154
+ //-----------------------------------------------------------
155
+ // Shift_JIS variants used by Japanese cell phone carriers.
156
+ KDDI_SHIFT_JIS = 70,
157
+ DOCOMO_SHIFT_JIS = 71,
158
+ SOFTBANK_SHIFT_JIS = 72,
159
+ // ISO-2022-JP variants used by KDDI and SoftBank.
160
+ KDDI_ISO_2022_JP = 73,
161
+ SOFTBANK_ISO_2022_JP = 74,
162
+ //-----------------------------------------------------------
163
+
164
+ NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
165
+ // valid Encoding enum, it is only used to
166
+ // indicate the total number of Encodings.
167
+ };
168
+
169
+ #endif // ENCODINGS_PROTO_ENCODINGS_PB_H_