language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// This file is for i18n. It contains two enums, namely Language and
|
|
6
|
+
// Encoding, where Language is the linguistic convention, and Encoding
|
|
7
|
+
// contains information on both language encoding and character set.
|
|
8
|
+
//
|
|
9
|
+
// The language and encoding are both based on Teragram's conventions,
|
|
10
|
+
// except for some common ISO-8859 encodings that are not detected by
|
|
11
|
+
// Teragram but might be in the future.
|
|
12
|
+
//
|
|
13
|
+
// This file also includes functions that do mappings among
|
|
14
|
+
// Language/Encoding enums, language/encoding string names (typically
|
|
15
|
+
// the output from Language Encoding identifier), and language codes
|
|
16
|
+
// (iso 639), and two-letter country codes (iso 3166)
|
|
17
|
+
//
|
|
18
|
+
// NOTE: Both Language and Encoding enums should always start from
|
|
19
|
+
// zero value. This assumption has been made and used.
|
|
20
|
+
//
|
|
21
|
+
|
|
22
|
+
#ifndef ENCODINGS_LANG_ENC_H__
|
|
23
|
+
#define ENCODINGS_LANG_ENC_H__
|
|
24
|
+
|
|
25
|
+
#include "languages/public/languages.h"
|
|
26
|
+
#include "encodings/public/encodings.h"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
// EncodingsForLanguage
|
|
30
|
+
// --------------------
|
|
31
|
+
//
|
|
32
|
+
// Given the language, returns a pointer to an array of encodings this
|
|
33
|
+
// language supports. Typically, the encs array has at least one
|
|
34
|
+
// element: UNKNOWN_ENCODING, which is always the last element of the
|
|
35
|
+
// array. The first encoding is the default encoding of the language.
|
|
36
|
+
// Return NULL if the input is invalid.
|
|
37
|
+
//
|
|
38
|
+
// Note: The output encoding array does not include ASCII_7BIT, UTF8
|
|
39
|
+
// or UNICODE which are good for all languages. TODO: Find out whether
|
|
40
|
+
// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
|
|
41
|
+
// as special cases.
|
|
42
|
+
//
|
|
43
|
+
const Encoding* EncodingsForLanguage(Language lang);
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
// DefaultEncodingForLanguage
|
|
47
|
+
// --------------------------
|
|
48
|
+
//
|
|
49
|
+
// Given the language, returns the default encoding for the language
|
|
50
|
+
// via the argument encoding.
|
|
51
|
+
//
|
|
52
|
+
// The function returns true if the input lang is valid. Otherwise,
|
|
53
|
+
// false is returned, and encoding is set to UNKNOWN_ENCODING.
|
|
54
|
+
//
|
|
55
|
+
bool DefaultEncodingForLanguage(Language lang,
|
|
56
|
+
Encoding *encoding);
|
|
57
|
+
|
|
58
|
+
// LanguagesForEncoding
|
|
59
|
+
// --------------------
|
|
60
|
+
//
|
|
61
|
+
// Given the encoding, returns a pointer to an array of languages this
|
|
62
|
+
// encoding supports. Typically, the langs array has at least one
|
|
63
|
+
// element: UNKNOWN_LANGUAGE, which is always the last element of the
|
|
64
|
+
// array. The first language in the array is the most popular
|
|
65
|
+
// language for that encoding. NULL is returned if the input is
|
|
66
|
+
// invalid.
|
|
67
|
+
//
|
|
68
|
+
// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
|
|
69
|
+
// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
|
|
70
|
+
// the languages or to treat these encodings as special cases.
|
|
71
|
+
//
|
|
72
|
+
// For other known encodings, ENGLISH is always included. This is
|
|
73
|
+
// because English (Latin) characters are included in each encoding.
|
|
74
|
+
//
|
|
75
|
+
const Language* LanguagesForEncoding(Encoding enc);
|
|
76
|
+
|
|
77
|
+
// DefaultLanguageForEncoding
|
|
78
|
+
// --------------------------
|
|
79
|
+
//
|
|
80
|
+
// Given the encoding, returns the default language for that encoding
|
|
81
|
+
// via the argument language.
|
|
82
|
+
//
|
|
83
|
+
// The function returns true if the input enc is valid. Otherwise,
|
|
84
|
+
// false is returned, and language is set to UNKNOWN_LANGUAGE.
|
|
85
|
+
//
|
|
86
|
+
// Note, this function is more useful for the encodings that have only
|
|
87
|
+
// one corresponding language i.e. shift_jis => Japanese. There are
|
|
88
|
+
// cases where multiple languages have the same encoding, for which the
|
|
89
|
+
// default language is an arbitrary choice from them.
|
|
90
|
+
//
|
|
91
|
+
bool DefaultLanguageForEncoding(Encoding enc, Language* language);
|
|
92
|
+
|
|
93
|
+
//
|
|
94
|
+
// IsLangEncCompatible
|
|
95
|
+
// -------------------
|
|
96
|
+
//
|
|
97
|
+
// This function is to determine whether the input language and
|
|
98
|
+
// encoding are compatible. For example, FRENCH and LATIN1 are
|
|
99
|
+
// compatible, but FRENCH and GB are not.
|
|
100
|
+
//
|
|
101
|
+
// If either lang or enc is invalid return false.
|
|
102
|
+
// If lang is unknown, return true.
|
|
103
|
+
// (e.g. we can detect a page's encoding as latin1 from metatag info, but
|
|
104
|
+
// cannot derive its language since there is more than one
|
|
105
|
+
// language encoding in Latin1 )
|
|
106
|
+
// If language is known, but encoding is unknown, return false.
|
|
107
|
+
// (return true will do us no good since we cannot convert to UTF8 anyway)
|
|
108
|
+
// If enc is unicode or utf8, return true.
|
|
109
|
+
// Otherwise check if lang is supported by enc and enc supported by
|
|
110
|
+
// lang.
|
|
111
|
+
//
|
|
112
|
+
bool IsLangEncCompatible(Language lang, Encoding enc);
|
|
113
|
+
|
|
114
|
+
//
|
|
115
|
+
// DominantLanguageFromEncoding
|
|
116
|
+
// ----------------------------
|
|
117
|
+
//
|
|
118
|
+
// This function determines whether there exists a dominant language for the
|
|
119
|
+
// input encoding. For example, the encoding GB has a dominant
|
|
120
|
+
// language (Chinese), but Latin1 does not.
|
|
121
|
+
//
|
|
122
|
+
// The word "dominant" is used here because English characters are
|
|
123
|
+
// included in each encoding.
|
|
124
|
+
//
|
|
125
|
+
// If there is no dominant language for the encoding, such as Latin1,
|
|
126
|
+
// UNKNOWN_LANGUAGE is returned.
|
|
127
|
+
//
|
|
128
|
+
Language DominantLanguageFromEncoding(Encoding enc);
|
|
129
|
+
|
|
130
|
+
// LanguageCode
|
|
131
|
+
// ------------------------
|
|
132
|
+
// Given the Language and Encoding, return language code with dialects
|
|
133
|
+
// (>= 2 letters). Encoding is necessary to disambiguate between
|
|
134
|
+
// Simplified and Traditional Chinese.
|
|
135
|
+
//
|
|
136
|
+
// See the note on Chinese Language Codes in
|
|
137
|
+
// i18n/languages/public/languages.h
|
|
138
|
+
// for the details.
|
|
139
|
+
|
|
140
|
+
const char* LanguageCode(Language lang, Encoding enc);
|
|
141
|
+
|
|
142
|
+
//
|
|
143
|
+
// IsEncodingWithSupportedLanguage()
|
|
144
|
+
// ---------------------------------
|
|
145
|
+
//
|
|
146
|
+
// There are some encodings listed here just because they are commonly
|
|
147
|
+
// used. There is no interface language for them yet. They are not
|
|
148
|
+
// detected by Teragram, but can be detected from the meta info of the
|
|
149
|
+
// HTML page.
|
|
150
|
+
//
|
|
151
|
+
// For example, we have listed ARABIC_ENCODING but there is no Arabic in
|
|
152
|
+
// the Language enum. If the user inputs an Arabic query from Google
|
|
153
|
+
// main page, Netscape will just send the raw bytes to GWS, and GWS
|
|
154
|
+
// will treat them as Latin1. Therefore, there is no use to detect
|
|
155
|
+
// ARABIC_ENCODING for indexing, since they will never match the
|
|
156
|
+
// queries which are treated as Latin1 by GWS. On the contrary, if we
|
|
157
|
+
// treat page with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
|
|
158
|
+
// fall them through as Latin1 in indexing time. And there might be a
|
|
159
|
+
// match for some ARABIC queries which are also treated as Latin1 by
|
|
160
|
+
// GWS. In fact, some people are relying on this feature to do Arabic
|
|
161
|
+
// searches.
|
|
162
|
+
//
|
|
163
|
+
// Thus for these types of encoding, before we have the UI support for
|
|
164
|
+
// their language and have a pretty comprehensive language/encoding
|
|
165
|
+
// identification quality, it is better to revert them as
|
|
166
|
+
// UNKNOWN_ENCODING.
|
|
167
|
+
//
|
|
168
|
+
// This function checks whether the input encoding is one with
|
|
169
|
+
// an interface language.
|
|
170
|
+
bool IsEncodingWithSupportedLanguage(Encoding enc);
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
//
|
|
174
|
+
// LangsFromCountryCode and EncFromCountryCode
|
|
175
|
+
// -------------------------------------------
|
|
176
|
+
//
|
|
177
|
+
// These two functions return the possible languages and encodings,
|
|
178
|
+
// respectively, according to the input country code, which is a
|
|
179
|
+
// 2-letter string. The country code is usually specified in the url
|
|
180
|
+
// of a document.
|
|
181
|
+
//
|
|
182
|
+
//
|
|
183
|
+
|
|
184
|
+
// LangsFromCountryCode
|
|
185
|
+
// --------------------
|
|
186
|
+
//
|
|
187
|
+
// This function takes a string of arbitrary length. It treats the
|
|
188
|
+
// first 2 bytes of the string as the country code, as defined in iso
|
|
189
|
+
// 3166-1993 (E). It returns, via arguments, an array of the
|
|
190
|
+
// languages that are popular in that country, roughly in order of
|
|
191
|
+
// popularity, together with the size of the array.
|
|
192
|
+
//
|
|
193
|
+
// This function returns true if we have language information for
|
|
194
|
+
// country_code. Otherwise, it returns false.
|
|
195
|
+
//
|
|
196
|
+
bool LangsFromCountryCode(const char* country_code,
|
|
197
|
+
const Language** lang_arry,
|
|
198
|
+
int* num_langs);
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
//
|
|
202
|
+
// EncFromCountryCode
|
|
203
|
+
// ------------------
|
|
204
|
+
//
|
|
205
|
+
// This function takes a string of arbitrary length. It treats the
|
|
206
|
+
// first 2 bytes of that string as the country code, as defined in iso
|
|
207
|
+
// 3166-1993 (E). It sets *enc to the encoding that is
|
|
208
|
+
// most often used for the languages spoken in that country.
|
|
209
|
+
//
|
|
210
|
+
// This function returns true if we have encoding information for
|
|
211
|
+
// country_code. Otherwise, it returns false, and *enc is set to
|
|
212
|
+
// UNKNOWN_ENCODING.
|
|
213
|
+
//
|
|
214
|
+
bool EncFromCountryCode(const char* country_code, Encoding* enc);
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
// VisualType
|
|
219
|
+
// ----------
|
|
220
|
+
//
|
|
221
|
+
// Right-to-left documents may be in logical or visual order. When they
|
|
222
|
+
// are in visual order we convert them to logical order before processing.
|
|
223
|
+
// This enum lists the types of visual document we can encounter.
|
|
224
|
+
// Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
|
|
225
|
+
// The other documents in those languages, and all documents in non-RTL
|
|
226
|
+
// languages, will be NOT_VISUAL_DOCUMENT.
|
|
227
|
+
enum VisualType {
|
|
228
|
+
NOT_VISUAL_DOCUMENT = 0,
|
|
229
|
+
VISUAL_HEBREW_HTML, // HTML documents in the legacy visual order.
|
|
230
|
+
CONVERTED_RTL_PDF, // Converted RTL PDFs, which are always visual.
|
|
231
|
+
};
|
|
232
|
+
|
|
233
|
+
VisualType default_visualtype();
|
|
234
|
+
|
|
235
|
+
// VisualTypeName
|
|
236
|
+
// --------------
|
|
237
|
+
//
|
|
238
|
+
// Given the visual type, returns a string name useful for debug output.
|
|
239
|
+
const char* VisualTypeName(VisualType visualtype);
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
// InitLangEnc
|
|
244
|
+
// -----------
|
|
245
|
+
//
|
|
246
|
+
// Ensures the LangEnc module has been initialized. Normally this
|
|
247
|
+
// happens during InitGoogle, but this allows access for scripts that
|
|
248
|
+
// don't support InitGoogle. InitLangEnc calls InitEncodings (see
|
|
249
|
+
// i18n/encodings/public/encodings.h) and also initializes data
|
|
250
|
+
// structures used in lang_enc.cc.
|
|
251
|
+
//
|
|
252
|
+
void InitLangEnc();
|
|
253
|
+
|
|
254
|
+
#endif // ENCODINGS_LANG_ENC_H__
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_PROTO_ENCODINGS_PB_H_
|
|
6
|
+
#define ENCODINGS_PROTO_ENCODINGS_PB_H_
|
|
7
|
+
|
|
8
|
+
enum Encoding {
|
|
9
|
+
ISO_8859_1 = 0, // Teragram ASCII
|
|
10
|
+
ISO_8859_2 = 1, // Teragram Latin2
|
|
11
|
+
ISO_8859_3 = 2, // in BasisTech but not in Teragram
|
|
12
|
+
ISO_8859_4 = 3, // Teragram Latin4
|
|
13
|
+
ISO_8859_5 = 4, // Teragram ISO-8859-5
|
|
14
|
+
ISO_8859_6 = 5, // Teragram Arabic
|
|
15
|
+
ISO_8859_7 = 6, // Teragram Greek
|
|
16
|
+
ISO_8859_8 = 7, // Teragram Hebrew
|
|
17
|
+
ISO_8859_9 = 8, // in BasisTech but not in Teragram
|
|
18
|
+
ISO_8859_10 = 9, // in BasisTech but not in Teragram
|
|
19
|
+
JAPANESE_EUC_JP = 10, // Teragram EUC_JP
|
|
20
|
+
JAPANESE_SHIFT_JIS = 11, // Teragram SJS
|
|
21
|
+
JAPANESE_JIS = 12, // Teragram JIS
|
|
22
|
+
CHINESE_BIG5 = 13, // Teragram BIG5
|
|
23
|
+
CHINESE_GB = 14, // Teragram GB
|
|
24
|
+
CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
|
|
25
|
+
// CNS11643EUC, before that Teragram EUC-CN(!)
|
|
26
|
+
// See //i18n/basistech/basistech_encodings.h
|
|
27
|
+
KOREAN_EUC_KR = 16, // Teragram KSC
|
|
28
|
+
UNICODE = 17, // Teragram Unicode
|
|
29
|
+
CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech
|
|
30
|
+
// CNS11643EUC, before that Teragram EUC.
|
|
31
|
+
CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech
|
|
32
|
+
// CNS11643EUC, before that Teragram CNS.
|
|
33
|
+
CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950
|
|
34
|
+
JAPANESE_CP932 = 21, // Teragram CP932
|
|
35
|
+
UTF8 = 22,
|
|
36
|
+
UNKNOWN_ENCODING = 23,
|
|
37
|
+
ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
|
|
38
|
+
// Should be present only in the crawler
|
|
39
|
+
// and in the repository,
|
|
40
|
+
// *never* as a result of Document::encoding().
|
|
41
|
+
RUSSIAN_KOI8_R = 25, // Teragram KOI8R
|
|
42
|
+
RUSSIAN_CP1251 = 26, // Teragram CP1251
|
|
43
|
+
|
|
44
|
+
//----------------------------------------------------------
|
|
45
|
+
// These are _not_ output from teragram. Instead, they are as
|
|
46
|
+
// detected in the headers of usenet articles.
|
|
47
|
+
MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii
|
|
48
|
+
RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
|
|
49
|
+
// Misnamed, this is _not_ KOI8-RU but KOI8-U.
|
|
50
|
+
// KOI8-U is used much more often than KOI8-RU.
|
|
51
|
+
MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
|
|
52
|
+
ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
|
|
53
|
+
//----------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
//----------------------------------------------------------
|
|
56
|
+
// These are in BasisTech but not in Teragram. They are
|
|
57
|
+
// needed for new interface languages. Now detected by
|
|
58
|
+
// research langid
|
|
59
|
+
MSFT_CP1254 = 31, // used for Turkish
|
|
60
|
+
MSFT_CP1257 = 32, // used in Baltic countries
|
|
61
|
+
//----------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
//----------------------------------------------------------
|
|
64
|
+
//----------------------------------------------------------
|
|
65
|
+
// New encodings detected by Teragram
|
|
66
|
+
ISO_8859_11 = 33, // aka TIS-620, used for Thai
|
|
67
|
+
MSFT_CP874 = 34, // used for Thai
|
|
68
|
+
MSFT_CP1256 = 35, // used for Arabic
|
|
69
|
+
|
|
70
|
+
//----------------------------------------------------------
|
|
71
|
+
// Detected as ISO_8859_8 by Teragram, but can be found in META tags
|
|
72
|
+
MSFT_CP1255 = 36, // Logical Hebrew Microsoft
|
|
73
|
+
ISO_8859_8_I = 37, // Iso Hebrew Logical
|
|
74
|
+
HEBREW_VISUAL = 38, // Iso Hebrew Visual
|
|
75
|
+
//----------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
//----------------------------------------------------------
|
|
78
|
+
// Detected by research langid
|
|
79
|
+
CZECH_CP852 = 39,
|
|
80
|
+
CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
|
|
81
|
+
MSFT_CP1253 = 41, // used for Greek
|
|
82
|
+
RUSSIAN_CP866 = 42,
|
|
83
|
+
//----------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
//----------------------------------------------------------
|
|
86
|
+
// Handled by iconv in glibc
|
|
87
|
+
ISO_8859_13 = 43,
|
|
88
|
+
ISO_2022_KR = 44,
|
|
89
|
+
GBK = 45,
|
|
90
|
+
GB18030 = 46,
|
|
91
|
+
BIG5_HKSCS = 47,
|
|
92
|
+
ISO_2022_CN = 48,
|
|
93
|
+
|
|
94
|
+
//-----------------------------------------------------------
|
|
95
|
+
// Detected by xin liu's detector
|
|
96
|
+
// Handled by transcoder
|
|
97
|
+
// (Indic encodings)
|
|
98
|
+
|
|
99
|
+
TSCII = 49,
|
|
100
|
+
TAMIL_MONO = 50,
|
|
101
|
+
TAMIL_BI = 51,
|
|
102
|
+
JAGRAN = 52,
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
MACINTOSH_ROMAN = 53,
|
|
106
|
+
UTF7 = 54,
|
|
107
|
+
BHASKAR = 55, // Indic encoding - Devanagari
|
|
108
|
+
HTCHANAKYA = 56, // 56 Indic encoding - Devanagari
|
|
109
|
+
|
|
110
|
+
//-----------------------------------------------------------
|
|
111
|
+
// These allow a single place (inputconverter and outputconverter)
|
|
112
|
+
// to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
|
|
113
|
+
// bulk conversions, with interchange-valid checking on input and
|
|
114
|
+
// fallback if needed on output.
|
|
115
|
+
UTF16BE = 57, // big-endian UTF-16
|
|
116
|
+
UTF16LE = 58, // little-endian UTF-16
|
|
117
|
+
UTF32BE = 59, // big-endian UTF-32
|
|
118
|
+
UTF32LE = 60, // little-endian UTF-32
|
|
119
|
+
//-----------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
//-----------------------------------------------------------
|
|
122
|
+
// An encoding that means "This is not text, but it may have some
|
|
123
|
+
// simple ASCII text embedded". Intended input conversion (not yet
|
|
124
|
+
// implemented) is to keep strings of >=4 seven-bit ASCII characters
|
|
125
|
+
// (follow each kept string with an ASCII space), delete the rest of
|
|
126
|
+
// the bytes. This will pick up and allow indexing of e.g. captions
|
|
127
|
+
// in JPEGs. No output conversion needed.
|
|
128
|
+
BINARYENC = 61,
|
|
129
|
+
//-----------------------------------------------------------
|
|
130
|
+
|
|
131
|
+
//-----------------------------------------------------------
|
|
132
|
+
// Some Web pages allow a mixture of HZ-GB and GB-2312 by using
|
|
133
|
+
// ~{ ... ~} for 2-byte pairs, and the browsers support this.
|
|
134
|
+
HZ_GB_2312 = 62,
|
|
135
|
+
//-----------------------------------------------------------
|
|
136
|
+
|
|
137
|
+
//-----------------------------------------------------------
|
|
138
|
+
// Some external vendors make the common input error of
|
|
139
|
+
// converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
|
|
140
|
+
UTF8UTF8 = 63,
|
|
141
|
+
//-----------------------------------------------------------
|
|
142
|
+
|
|
143
|
+
//-----------------------------------------------------------
|
|
144
|
+
// Handled by transcoder for tamil language specific font
|
|
145
|
+
// encodings without the support for detection at present.
|
|
146
|
+
TAM_ELANGO = 64, // Elango - Tamil
|
|
147
|
+
TAM_LTTMBARANI = 65, // Barani - Tamil
|
|
148
|
+
TAM_SHREE = 66, // Shree - Tamil
|
|
149
|
+
TAM_TBOOMIS = 67, // TBoomis - Tamil
|
|
150
|
+
TAM_TMNEWS = 68, // TMNews - Tamil
|
|
151
|
+
TAM_WEBTAMIL = 69, // Webtamil - Tamil
|
|
152
|
+
//-----------------------------------------------------------
|
|
153
|
+
|
|
154
|
+
//-----------------------------------------------------------
|
|
155
|
+
// Shift_JIS variants used by Japanese cell phone carriers.
|
|
156
|
+
KDDI_SHIFT_JIS = 70,
|
|
157
|
+
DOCOMO_SHIFT_JIS = 71,
|
|
158
|
+
SOFTBANK_SHIFT_JIS = 72,
|
|
159
|
+
// ISO-2022-JP variants used by KDDI and SoftBank.
|
|
160
|
+
KDDI_ISO_2022_JP = 73,
|
|
161
|
+
SOFTBANK_ISO_2022_JP = 74,
|
|
162
|
+
//-----------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
|
|
165
|
+
// valid Encoding enum, it is only used to
|
|
166
|
+
// indicate the total number of Encodings.
|
|
167
|
+
};
|
|
168
|
+
|
|
169
|
+
#endif // ENCODINGS_PROTO_ENCODINGS_PB_H_
|