language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,254 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
// This file is for i18n. It contains two enums, namely Language and
|
6
|
+
// Encoding, where Language is the linguistic convention, and Encoding
|
7
|
+
// contains information on both language encoding and character set.
|
8
|
+
//
|
9
|
+
// The language and encoding are both based on Teragram's conventions,
|
10
|
+
// except for some common ISO-8859 encodings that are not detected by
|
11
|
+
// Teragram but might be in the future.
|
12
|
+
//
|
13
|
+
// This file also includes functions that do mappings among
|
14
|
+
// Language/Encoding enums, language/encoding string names (typically
|
15
|
+
// the output from Language Encoding identifier), and language codes
|
16
|
+
// (iso 639), and two-letter country codes (iso 3166)
|
17
|
+
//
|
18
|
+
// NOTE: Both Language and Encoding enums should always start from
|
19
|
+
// zero value. This assumption has been made and used.
|
20
|
+
//
|
21
|
+
|
22
|
+
#ifndef ENCODINGS_LANG_ENC_H__
|
23
|
+
#define ENCODINGS_LANG_ENC_H__
|
24
|
+
|
25
|
+
#include "languages/public/languages.h"
|
26
|
+
#include "encodings/public/encodings.h"
|
27
|
+
|
28
|
+
|
29
|
+
// EncodingsForLanguage
|
30
|
+
// --------------------
|
31
|
+
//
|
32
|
+
// Given the language, returns a pointer to an array of encodings this
|
33
|
+
// language supports. Typically, the encs array has at least one
|
34
|
+
// element: UNKNOWN_ENCODING, which is always the last element of the
|
35
|
+
// array. The first encoding is the default encoding of the language.
|
36
|
+
// Return NULL if the input is invalid.
|
37
|
+
//
|
38
|
+
// Note: The output encoding array does not include ASCII_7BIT, UTF8
|
39
|
+
// or UNICODE which are good for all languages. TODO: Find out whether
|
40
|
+
// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
|
41
|
+
// as special cases.
|
42
|
+
//
|
43
|
+
const Encoding* EncodingsForLanguage(Language lang);
|
44
|
+
|
45
|
+
|
46
|
+
// DefaultEncodingForLanguage
|
47
|
+
// --------------------------
|
48
|
+
//
|
49
|
+
// Given the language, returns the default encoding for the language
|
50
|
+
// via the argument encoding.
|
51
|
+
//
|
52
|
+
// The function returns true if the input lang is valid. Otherwise,
|
53
|
+
// false is returned, and encoding is set to UNKNOWN_ENCODING.
|
54
|
+
//
|
55
|
+
bool DefaultEncodingForLanguage(Language lang,
|
56
|
+
Encoding *encoding);
|
57
|
+
|
58
|
+
// LanguagesForEncoding
|
59
|
+
// --------------------
|
60
|
+
//
|
61
|
+
// Given the encoding, returns a pointer to an array of languages this
|
62
|
+
// encoding supports. Typically, the langs array has at least one
|
63
|
+
// element: UNKNOWN_LANGUAGE, which is always the last element of the
|
64
|
+
// array. The first language in the array is the most popular
|
65
|
+
// language for that encoding. NULL is returned if the input is
|
66
|
+
// invalid.
|
67
|
+
//
|
68
|
+
// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
|
69
|
+
// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
|
70
|
+
// the languages or to treat these encodings as special cases.
|
71
|
+
//
|
72
|
+
// For other known encodings, ENGLISH is always included. This is
|
73
|
+
// because English (Latin) characters are included in each encoding.
|
74
|
+
//
|
75
|
+
const Language* LanguagesForEncoding(Encoding enc);
|
76
|
+
|
77
|
+
// DefaultLanguageForEncoding
|
78
|
+
// --------------------------
|
79
|
+
//
|
80
|
+
// Given the encoding, returns the default language for that encoding
|
81
|
+
// via the argument language.
|
82
|
+
//
|
83
|
+
// The function returns true if the input enc is valid. Otherwise,
|
84
|
+
// false is returned, and language is set to UNKNOWN_LANGUAGE.
|
85
|
+
//
|
86
|
+
// Note, this function is more useful for the encodings that have only
|
87
|
+
// one corresponding language i.e. shift_jis => Japanese. There are
|
88
|
+
// cases where multiple languages have the same encoding, for which the
|
89
|
+
// default language is an arbitrary choice from them.
|
90
|
+
//
|
91
|
+
bool DefaultLanguageForEncoding(Encoding enc, Language* language);
|
92
|
+
|
93
|
+
//
|
94
|
+
// IsLangEncCompatible
|
95
|
+
// -------------------
|
96
|
+
//
|
97
|
+
// This function is to determine whether the input language and
|
98
|
+
// encoding are compatible. For example, FRENCH and LATIN1 are
|
99
|
+
// compatible, but FRENCH and GB are not.
|
100
|
+
//
|
101
|
+
// If either lang or enc is invalid return false.
|
102
|
+
// If lang is unknown, return true.
|
103
|
+
// (e.g. we can detect a page's encoding as latin1 from metatag info, but
|
104
|
+
// cannot derive its language since there is more than one
|
105
|
+
// language encoded in Latin1)
|
106
|
+
// If language is known, but encoding is unknown, return false.
|
107
|
+
// (return true will do us no good since we cannot convert to UTF8 anyway)
|
108
|
+
// If enc is unicode or utf8, return true.
|
109
|
+
// Otherwise check if lang is supported by enc and enc supported by
|
110
|
+
// lang.
|
111
|
+
//
|
112
|
+
bool IsLangEncCompatible(Language lang, Encoding enc);
|
113
|
+
|
114
|
+
//
|
115
|
+
// DominantLanguageFromEncoding
|
116
|
+
// ----------------------------
|
117
|
+
//
|
118
|
+
// This function determines whether there exists a dominant language for the
|
119
|
+
// input encoding. For example, the encoding GB has a dominant
|
120
|
+
// language (Chinese), but Latin1 does not.
|
121
|
+
//
|
122
|
+
// The word "dominant" is used here because English characters are
|
123
|
+
// included in each encoding.
|
124
|
+
//
|
125
|
+
// If there is no dominant language for the encoding, such as Latin1,
|
126
|
+
// UNKNOWN_LANGUAGE is returned.
|
127
|
+
//
|
128
|
+
Language DominantLanguageFromEncoding(Encoding enc);
|
129
|
+
|
130
|
+
// LanguageCode
|
131
|
+
// ------------------------
|
132
|
+
// Given the Language and Encoding, return language code with dialects
|
133
|
+
// (>= 2 letters). Encoding is necessary to disambiguate between
|
134
|
+
// Simplified and Traditional Chinese.
|
135
|
+
//
|
136
|
+
// See the note on Chinese Language Codes in
|
137
|
+
// i18n/languages/public/languages.h
|
138
|
+
// for the details.
|
139
|
+
|
140
|
+
const char* LanguageCode(Language lang, Encoding enc);
|
141
|
+
|
142
|
+
//
|
143
|
+
// IsEncodingWithSupportedLanguage()
|
144
|
+
// ---------------------------------
|
145
|
+
//
|
146
|
+
// There are some encodings listed here just because they are commonly
|
147
|
+
// used. There is no interface language for them yet. They are not
|
148
|
+
// detected by Teragram, but can be detected from the meta info of the
|
149
|
+
// HTML page.
|
150
|
+
//
|
151
|
+
// For example, we have listed ARABIC_ENCODING but there is no Arabic in
|
152
|
+
// the Language enum. If the user inputs an Arabic query from Google
|
153
|
+
// main page, Netscape will just send the raw bytes to GWS, and GWS
|
154
|
+
// will treat them as Latin1. Therefore, there is no use to detect
|
155
|
+
// ARABIC_ENCODING for indexing, since they will never match the
|
156
|
+
// queries which are treated as Latin1 by GWS. On the contrary, if we
|
157
|
+
// treat page with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
|
158
|
+
// fall them through as Latin1 in indexing time. And there might be a
|
159
|
+
// match for some ARABIC queries which are also treated as Latin1 by
|
160
|
+
// GWS. In fact, some people are relying on this feature to do Arabic
|
161
|
+
// searches.
|
162
|
+
//
|
163
|
+
// Thus for these types of encoding, before we have the UI support for
|
164
|
+
// their language and have a pretty comprehensive language/encoding
|
165
|
+
// identification quality, it is better to revert them as
|
166
|
+
// UNKNOWN_ENCODING.
|
167
|
+
//
|
168
|
+
// This function checks whether the input encoding is one with
|
169
|
+
// an interface language.
|
170
|
+
bool IsEncodingWithSupportedLanguage(Encoding enc);
|
171
|
+
|
172
|
+
|
173
|
+
//
|
174
|
+
// LangsFromCountryCode and EncFromCountryCode
|
175
|
+
// -------------------------------------------
|
176
|
+
//
|
177
|
+
// These two functions return the possible languages and encodings,
|
178
|
+
// respectively, according to the input country code, which is a
|
179
|
+
// 2-letter string. The country code is usually specified in the url
|
180
|
+
// of a document.
|
181
|
+
//
|
182
|
+
//
|
183
|
+
|
184
|
+
// LangsFromCountryCode
|
185
|
+
// --------------------
|
186
|
+
//
|
187
|
+
// This function takes a string of arbitrary length. It treats the
|
188
|
+
// first 2 bytes of the string as the country code, as defined in iso
|
189
|
+
// 3166-1993 (E). It returns, via arguments, an array of the
|
190
|
+
// languages that are popular in that country, roughly in order of
|
191
|
+
// popularity, together with the size of the array.
|
192
|
+
//
|
193
|
+
// This function returns true if we have language information for
|
194
|
+
// country_code. Otherwise, it returns false.
|
195
|
+
//
|
196
|
+
bool LangsFromCountryCode(const char* country_code,
|
197
|
+
const Language** lang_arry,
|
198
|
+
int* num_langs);
|
199
|
+
|
200
|
+
|
201
|
+
//
|
202
|
+
// EncFromCountryCode
|
203
|
+
// ------------------
|
204
|
+
//
|
205
|
+
// This function takes a string of arbitrary length. It treats the
|
206
|
+
// first 2 bytes of that string as the country code, as defined in iso
|
207
|
+
// 3166-1993 (E). It sets *enc to the encoding that is
|
208
|
+
// most often used for the languages spoken in that country.
|
209
|
+
//
|
210
|
+
// This function returns true if we have encoding information for
|
211
|
+
// country_code. Otherwise, it returns false, and *enc is set to
|
212
|
+
// UNKNOWN_ENCODING.
|
213
|
+
//
|
214
|
+
bool EncFromCountryCode(const char* country_code, Encoding* enc);
|
215
|
+
|
216
|
+
|
217
|
+
|
218
|
+
// VisualType
|
219
|
+
// ----------
|
220
|
+
//
|
221
|
+
// Right-to-left documents may be in logical or visual order. When they
|
222
|
+
// are in visual order we convert them to logical order before processing.
|
223
|
+
// This enum lists the types of visual document we can encounter.
|
224
|
+
// Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
|
225
|
+
// The other documents in those languages, and all documents in non-RTL
|
226
|
+
// languages, will be NOT_VISUAL_DOCUMENT.
|
227
|
+
enum VisualType { // Kinds of visual-order document; visual-order input is converted to logical order before processing.
|
228
|
+
NOT_VISUAL_DOCUMENT = 0, // Logical-order RTL documents and all documents in non-RTL languages.
|
229
|
+
VISUAL_HEBREW_HTML, // HTML documents in the legacy visual order.
|
230
|
+
CONVERTED_RTL_PDF, // Converted RTL PDFs, which are always visual.
|
231
|
+
};
|
232
|
+
|
233
|
+
VisualType default_visualtype();
|
234
|
+
|
235
|
+
// VisualTypeName
|
236
|
+
// --------------
|
237
|
+
//
|
238
|
+
// Given the visual type, returns a string name useful for debug output.
|
239
|
+
const char* VisualTypeName(VisualType visualtype);
|
240
|
+
|
241
|
+
|
242
|
+
|
243
|
+
// InitLangEnc
|
244
|
+
// -----------
|
245
|
+
//
|
246
|
+
// Ensures the LangEnc module has been initialized. Normally this
|
247
|
+
// happens during InitGoogle, but this allows access for scripts that
|
248
|
+
// don't support InitGoogle. InitLangEnc calls InitEncodings (see
|
249
|
+
// i18n/encodings/public/encodings.h) and also initializes data
|
250
|
+
// structures used in lang_enc.cc.
|
251
|
+
//
|
252
|
+
void InitLangEnc();
|
253
|
+
|
254
|
+
#endif // ENCODINGS_LANG_ENC_H__
|
@@ -0,0 +1,169 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_PROTO_ENCODINGS_PB_H_
|
6
|
+
#define ENCODINGS_PROTO_ENCODINGS_PB_H_
|
7
|
+
|
8
|
+
enum Encoding { // Character-encoding identifiers; numeric values are explicit and run contiguously from 0 up to NUM_ENCODINGS.
|
9
|
+
ISO_8859_1 = 0, // Teragram ASCII
|
10
|
+
ISO_8859_2 = 1, // Teragram Latin2
|
11
|
+
ISO_8859_3 = 2, // in BasisTech but not in Teragram
|
12
|
+
ISO_8859_4 = 3, // Teragram Latin4
|
13
|
+
ISO_8859_5 = 4, // Teragram ISO-8859-5
|
14
|
+
ISO_8859_6 = 5, // Teragram Arabic
|
15
|
+
ISO_8859_7 = 6, // Teragram Greek
|
16
|
+
ISO_8859_8 = 7, // Teragram Hebrew
|
17
|
+
ISO_8859_9 = 8, // in BasisTech but not in Teragram
|
18
|
+
ISO_8859_10 = 9, // in BasisTech but not in Teragram
|
19
|
+
JAPANESE_EUC_JP = 10, // Teragram EUC_JP
|
20
|
+
JAPANESE_SHIFT_JIS = 11, // Teragram SJS
|
21
|
+
JAPANESE_JIS = 12, // Teragram JIS
|
22
|
+
CHINESE_BIG5 = 13, // Teragram BIG5
|
23
|
+
CHINESE_GB = 14, // Teragram GB
|
24
|
+
CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
|
25
|
+
// CNS11643EUC, before that Teragram EUC-CN(!)
|
26
|
+
// See //i18n/basistech/basistech_encodings.h
|
27
|
+
KOREAN_EUC_KR = 16, // Teragram KSC
|
28
|
+
UNICODE = 17, // Teragram Unicode
|
29
|
+
CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech
|
30
|
+
// CNS11643EUC, before that Teragram EUC.
|
31
|
+
CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech
|
32
|
+
// CNS11643EUC, before that Teragram CNS.
|
33
|
+
CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950
|
34
|
+
JAPANESE_CP932 = 21, // Teragram CP932
|
35
|
+
UTF8 = 22,
|
36
|
+
UNKNOWN_ENCODING = 23,
|
37
|
+
ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
|
38
|
+
// Should be present only in the crawler
|
39
|
+
// and in the repository,
|
40
|
+
// *never* as a result of Document::encoding().
|
41
|
+
RUSSIAN_KOI8_R = 25, // Teragram KOI8R
|
42
|
+
RUSSIAN_CP1251 = 26, // Teragram CP1251
|
43
|
+
|
44
|
+
//----------------------------------------------------------
|
45
|
+
// These are _not_ output from teragram. Instead, they are as
|
46
|
+
// detected in the headers of usenet articles.
|
47
|
+
MSFT_CP1252 = 27, // CP1252 aka MSFT euro ascii
|
48
|
+
RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
|
49
|
+
// Misnamed, this is _not_ KOI8-RU but KOI8-U.
|
50
|
+
// KOI8-U is used much more often than KOI8-RU.
|
51
|
+
MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
|
52
|
+
ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
|
53
|
+
//----------------------------------------------------------
|
54
|
+
|
55
|
+
//----------------------------------------------------------
|
56
|
+
// These are in BasisTech but not in Teragram. They are
|
57
|
+
// needed for new interface languages. Now detected by
|
58
|
+
// research langid
|
59
|
+
MSFT_CP1254 = 31, // used for Turkish
|
60
|
+
MSFT_CP1257 = 32, // used in Baltic countries
|
61
|
+
//----------------------------------------------------------
|
62
|
+
|
63
|
+
//----------------------------------------------------------
|
64
|
+
//----------------------------------------------------------
|
65
|
+
// New encodings detected by Teragram
|
66
|
+
ISO_8859_11 = 33, // aka TIS-620, used for Thai
|
67
|
+
MSFT_CP874 = 34, // used for Thai
|
68
|
+
MSFT_CP1256 = 35, // used for Arabic
|
69
|
+
|
70
|
+
//----------------------------------------------------------
|
71
|
+
// Detected as ISO_8859_8 by Teragram, but can be found in META tags
|
72
|
+
MSFT_CP1255 = 36, // Logical Hebrew Microsoft
|
73
|
+
ISO_8859_8_I = 37, // Iso Hebrew Logical
|
74
|
+
HEBREW_VISUAL = 38, // Iso Hebrew Visual
|
75
|
+
//----------------------------------------------------------
|
76
|
+
|
77
|
+
//----------------------------------------------------------
|
78
|
+
// Detected by research langid
|
79
|
+
CZECH_CP852 = 39,
|
80
|
+
CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
|
81
|
+
MSFT_CP1253 = 41, // used for Greek
|
82
|
+
RUSSIAN_CP866 = 42,
|
83
|
+
//----------------------------------------------------------
|
84
|
+
|
85
|
+
//----------------------------------------------------------
|
86
|
+
// Handled by iconv in glibc
|
87
|
+
ISO_8859_13 = 43,
|
88
|
+
ISO_2022_KR = 44,
|
89
|
+
GBK = 45,
|
90
|
+
GB18030 = 46,
|
91
|
+
BIG5_HKSCS = 47,
|
92
|
+
ISO_2022_CN = 48,
|
93
|
+
|
94
|
+
//-----------------------------------------------------------
|
95
|
+
// Detected by xin liu's detector
|
96
|
+
// Handled by transcoder
|
97
|
+
// (Indic encodings)
|
98
|
+
|
99
|
+
TSCII = 49,
|
100
|
+
TAMIL_MONO = 50,
|
101
|
+
TAMIL_BI = 51,
|
102
|
+
JAGRAN = 52,
|
103
|
+
|
104
|
+
|
105
|
+
MACINTOSH_ROMAN = 53,
|
106
|
+
UTF7 = 54,
|
107
|
+
BHASKAR = 55, // Indic encoding - Devanagari
|
108
|
+
HTCHANAKYA = 56, // Indic encoding - Devanagari
|
109
|
+
|
110
|
+
//-----------------------------------------------------------
|
111
|
+
// These allow a single place (inputconverter and outputconverter)
|
112
|
+
// to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
|
113
|
+
// bulk conversions, with interchange-valid checking on input and
|
114
|
+
// fallback if needed on output.
|
115
|
+
UTF16BE = 57, // big-endian UTF-16
|
116
|
+
UTF16LE = 58, // little-endian UTF-16
|
117
|
+
UTF32BE = 59, // big-endian UTF-32
|
118
|
+
UTF32LE = 60, // little-endian UTF-32
|
119
|
+
//-----------------------------------------------------------
|
120
|
+
|
121
|
+
//-----------------------------------------------------------
|
122
|
+
// An encoding that means "This is not text, but it may have some
|
123
|
+
// simple ASCII text embedded". Intended input conversion (not yet
|
124
|
+
// implemented) is to keep strings of >=4 seven-bit ASCII characters
|
125
|
+
// (follow each kept string with an ASCII space), delete the rest of
|
126
|
+
// the bytes. This will pick up and allow indexing of e.g. captions
|
127
|
+
// in JPEGs. No output conversion needed.
|
128
|
+
BINARYENC = 61,
|
129
|
+
//-----------------------------------------------------------
|
130
|
+
|
131
|
+
//-----------------------------------------------------------
|
132
|
+
// Some Web pages allow a mixture of HZ-GB and GB-2312 by using
|
133
|
+
// ~{ ... ~} for 2-byte pairs, and the browsers support this.
|
134
|
+
HZ_GB_2312 = 62,
|
135
|
+
//-----------------------------------------------------------
|
136
|
+
|
137
|
+
//-----------------------------------------------------------
|
138
|
+
// Some external vendors make the common input error of
|
139
|
+
// converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
|
140
|
+
UTF8UTF8 = 63,
|
141
|
+
//-----------------------------------------------------------
|
142
|
+
|
143
|
+
//-----------------------------------------------------------
|
144
|
+
// Handled by transcoder for tamil language specific font
|
145
|
+
// encodings without the support for detection at present.
|
146
|
+
TAM_ELANGO = 64, // Elango - Tamil
|
147
|
+
TAM_LTTMBARANI = 65, // Barani - Tamil
|
148
|
+
TAM_SHREE = 66, // Shree - Tamil
|
149
|
+
TAM_TBOOMIS = 67, // TBoomis - Tamil
|
150
|
+
TAM_TMNEWS = 68, // TMNews - Tamil
|
151
|
+
TAM_WEBTAMIL = 69, // Webtamil - Tamil
|
152
|
+
//-----------------------------------------------------------
|
153
|
+
|
154
|
+
//-----------------------------------------------------------
|
155
|
+
// Shift_JIS variants used by Japanese cell phone carriers.
|
156
|
+
KDDI_SHIFT_JIS = 70,
|
157
|
+
DOCOMO_SHIFT_JIS = 71,
|
158
|
+
SOFTBANK_SHIFT_JIS = 72,
|
159
|
+
// ISO-2022-JP variants used by KDDI and SoftBank.
|
160
|
+
KDDI_ISO_2022_JP = 73,
|
161
|
+
SOFTBANK_ISO_2022_JP = 74,
|
162
|
+
//-----------------------------------------------------------
|
163
|
+
|
164
|
+
NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
|
165
|
+
// valid Encoding enum, it is only used to
|
166
|
+
// indicate the total number of Encodings.
|
167
|
+
};
|
168
|
+
|
169
|
+
#endif // ENCODINGS_PROTO_ENCODINGS_PB_H_
|