cld 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (107) hide show
  1. data/LICENSE +27 -0
  2. data/Manifest +106 -0
  3. data/README.rdoc +173 -0
  4. data/Rakefile +15 -0
  5. data/base/basictypes.h +348 -0
  6. data/base/build_config.h +115 -0
  7. data/base/casts.h +156 -0
  8. data/base/commandlineflags.h +443 -0
  9. data/base/crash.h +41 -0
  10. data/base/dynamic_annotations.h +358 -0
  11. data/base/global_strip_options.h +59 -0
  12. data/base/log_severity.h +46 -0
  13. data/base/logging.h +1403 -0
  14. data/base/macros.h +243 -0
  15. data/base/port.h +54 -0
  16. data/base/scoped_ptr.h +428 -0
  17. data/base/stl_decl.h +0 -0
  18. data/base/stl_decl_msvc.h +107 -0
  19. data/base/string_util.h +29 -0
  20. data/base/strtoint.h +93 -0
  21. data/base/template_util.h +96 -0
  22. data/base/type_traits.h +198 -0
  23. data/base/vlog_is_on.h +143 -0
  24. data/build.sh +48 -0
  25. data/build.win.cmd +28 -0
  26. data/cld.gemspec +30 -0
  27. data/cld_encodings.h +95 -0
  28. data/encodings/compact_lang_det/#cldutil.cc# +905 -0
  29. data/encodings/compact_lang_det/#cldutil.h# +1205 -0
  30. data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  31. data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  32. data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  33. data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  34. data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  35. data/encodings/compact_lang_det/#tote.cc# +299 -0
  36. data/encodings/compact_lang_det/#tote.h# +89 -0
  37. data/encodings/compact_lang_det/cldutil.cc +905 -0
  38. data/encodings/compact_lang_det/cldutil.h +1205 -0
  39. data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  40. data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  41. data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  42. data/encodings/compact_lang_det/compact_lang_det.h +145 -0
  43. data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  44. data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  45. data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  46. data/encodings/compact_lang_det/compile.cmd +1 -0
  47. data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  48. data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  49. data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  50. data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  51. data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  52. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  53. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  54. data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  55. data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  56. data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  57. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  58. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  59. data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  60. data/encodings/compact_lang_det/getonescriptspan.h +131 -0
  61. data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  62. data/encodings/compact_lang_det/letterscript_enum.h +99 -0
  63. data/encodings/compact_lang_det/subsetsequence.cc +259 -0
  64. data/encodings/compact_lang_det/subsetsequence.h +44 -0
  65. data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  66. data/encodings/compact_lang_det/tote.cc +299 -0
  67. data/encodings/compact_lang_det/tote.h +89 -0
  68. data/encodings/compact_lang_det/unittest_data.h +193 -0
  69. data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  70. data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  71. data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  72. data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  73. data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  74. data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  75. data/encodings/compact_lang_det/win/cld_google.h +18 -0
  76. data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  77. data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  78. data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  79. data/encodings/compact_lang_det/win/cld_logging.h +21 -0
  80. data/encodings/compact_lang_det/win/cld_macros.h +19 -0
  81. data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  82. data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  83. data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  84. data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  85. data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  86. data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  87. data/encodings/compact_lang_det/win/cld_utf.h +24 -0
  88. data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  89. data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  90. data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  91. data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  92. data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  93. data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  94. data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  95. data/encodings/internal/encodings.cc +12 -0
  96. data/encodings/lang_enc.h +254 -0
  97. data/encodings/proto/encodings.pb.h +169 -0
  98. data/encodings/public/encodings.h +301 -0
  99. data/ext/cld/extconf.rb +7 -0
  100. data/languages/internal/#languages.cc# +337 -0
  101. data/languages/internal/languages.cc +337 -0
  102. data/languages/proto/languages.pb.h +179 -0
  103. data/languages/public/languages.h +379 -0
  104. data/lib/cld.rb +12 -0
  105. data/test/test.rb +570 -0
  106. data/thunk.cc +131 -0
  107. metadata +168 -0
@@ -0,0 +1,301 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_PUBLIC_ENCODINGS_H_
6
+ #define ENCODINGS_PUBLIC_ENCODINGS_H_
7
+
8
+ // This interface defines the Encoding enum and various functions that
9
+ // depend only on Encoding values.
10
+
11
+ // A hash-function for Encoding, hash<Encoding>, is defined in
12
+ // i18n/encodings/public/encodings-hash.h
13
+
14
+ // On some Windows projects, UNICODE may be defined, which would prevent the
15
+ // Encoding enum below from compiling. Note that this is a quick fix that does
16
+ // not break any existing projects. The UNICODE enum may someday be changed
17
+ // to something more specific and non-colliding, but this involves careful
18
+ // testing of changes in many other projects.
19
+ #undef UNICODE
20
+
21
+ // NOTE: The Encoding enum must always start at 0. This assumption has
22
+ // been made and used.
23
+
24
+ #ifndef SWIG
25
+
26
+ #include "encodings/proto/encodings.pb.h"
27
+
28
+ // We must have this for compatibility.
29
+ // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
30
+ //using namespace i18n::encodings;
31
+
32
+ #else
33
+
34
+ // Special proto SWIG workaround header file.
35
+ #include "i18n/encodings/internal/encodings_proto_wrapper.h"
36
+
37
+ #endif
38
+
39
+ const int kNumEncodings = NUM_ENCODINGS;
40
+
41
+ // some of the popular encoding aliases
42
+ // TODO(jrm) Make these static const Encoding values instead of macros.
43
+ #define LATIN1 ISO_8859_1
44
+ #define LATIN2 ISO_8859_2
45
+ #define LATIN3 ISO_8859_3
46
+ #define LATIN4 ISO_8859_4
47
+ #define CYRILLIC ISO_8859_5
48
+ #define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
49
+ #define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
50
+ #define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
51
+ #define LATIN5 ISO_8859_9
52
+ #define LATIN6 ISO_8859_10
53
+ #define KOREAN_HANGUL KOREAN_EUC_KR
54
+
55
+ // The default Encoding (LATIN1).
56
+ Encoding default_encoding();
57
+
58
+
59
+
60
+ // *************************************************************
61
+ // Encoding predicates
62
+ // IsValidEncoding()
63
+ // IsEncEncCompatible
64
+ // IsSupersetOfAscii7Bit
65
+ // Is8BitEncoding
66
+ // IsCJKEncoding
67
+ // IsHebrewEncoding
68
+ // IsRightToLeftEncoding
69
+ // IsLogicalRightToLeftEncoding
70
+ // IsVisualRightToLeftEncoding
71
+ // IsIso2022Encoding
72
+ // IsIso2022JpOrVariant
73
+ // IsShiftJisOrVariant
74
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
75
+ // *************************************************************
76
+
77
+ // IsValidEncoding
78
+ // ===================================
79
+ //
80
+ // Function to check if the input encoding enum is within range.
81
+ //
82
+
83
+ bool IsValidEncoding(Encoding enc);
84
+
85
+ //
86
+ // IsEncEncCompatible
87
+ // ------------------
88
+ //
89
+ // This function is to determine whether or not converting from the
90
+ // first encoding to the second requires any changes to the underlying
91
+ // text (e.g. ASCII_7BIT is a subset of UTF8).
92
+ //
93
+ // TODO(someone more familiar with i18n): the current implementation
94
+ // is likely incomplete. It would be good to consider the full matrix
95
+ // of all pairs of encodings and to fish out all compatible pairs.
96
+ //
97
+ bool IsEncEncCompatible(const Encoding from, const Encoding to);
98
+
99
+ // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
100
+ // encoding represent the same characters as they do in ISO_8859_1.
101
+
102
+ // WARNING: This function does not currently return true for all encodings that
103
+ // are supersets of Ascii 7-bit.
104
+ bool IsSupersetOfAscii7Bit(Encoding e);
105
+
106
+ // To be an 8-bit encoding means that there are at most 256 symbols.
107
+ // Each byte determines a new character; there are no multi-byte sequences.
108
+
109
+ // WARNING: This function does not currently return true for all encodings that
110
+ // are 8-bit encodings.
111
+ bool Is8BitEncoding(Encoding e);
112
+
113
+ // IsCJKEncoding
114
+ // -------------
115
+ //
116
+ // This function returns true if the encoding is either Chinese
117
+ // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
118
+ // considered a CJK encoding.
119
+ bool IsCJKEncoding(Encoding e);
120
+
121
+ // IsHebrewEncoding
122
+ // -------------
123
+ //
124
+ // This function returns true if the encoding is a Hebrew specific
125
+ // encoding (not UTF8, etc).
126
+ bool IsHebrewEncoding(Encoding e);
127
+
128
+ // IsRightToLeftEncoding
129
+ // ---------------------
130
+ //
131
+ // Returns true if the encoding is a right-to-left encoding.
132
+ //
133
+ // Note that the name of this function is somewhat misleading. There is nothing
134
+ // "right to left" about these encodings. They merely contain code points for
135
+ // characters in RTL languages such as Hebrew and Arabic. But this is also
136
+ // true for UTF-8.
137
+ //
138
+ // TODO(benjy): Get rid of this function. The only special-case we
139
+ // should need to worry about are visual encodings. Anything we
140
+ // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
141
+ bool IsRightToLeftEncoding(Encoding enc);
142
+
143
+ // IsLogicalRightToLeftEncoding
144
+ // ----------------------------
145
+ //
146
+ // Returns true if the encoding is a logical right-to-left encoding.
147
+ // Logical right-to-left encodings are those that the browser renders
148
+ // right-to-left and applies the BiDi algorithm to. Therefore the characters
149
+ // appear in reading order in the file, and indexing, snippet generation etc.
150
+ // should all just work with no special processing.
151
+ //
152
+ // TODO(benjy): Get rid of this function. The only special-case we
153
+ // should need to worry about are visual encodings.
154
+ bool IsLogicalRightToLeftEncoding(Encoding enc);
155
+
156
+ // IsVisualRightToLeftEncoding
157
+ // ---------------------------
158
+ //
159
+ // Returns true if the encoding is a visual right-to-left encoding.
160
+ // Visual right-to-left encodings are those that the browser renders
161
+ // left-to-right and does not apply the BiDi algorithm to. Therefore each
162
+ // line appears in reverse order in the file, lines are manually wrapped
163
+ // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
164
+ // the prehistoric days when browsers couldn't render right-to-left, but
165
+ // unfortunately some visual pages persist to this day. These documents require
166
+ // special processing so that we don't index or snippet them with each line
167
+ // reversed.
168
+ bool IsVisualRightToLeftEncoding(Encoding enc);
169
+
170
+ // IsIso2022Encoding
171
+ // -----------------
172
+ //
173
+ // Returns true if the encoding is a kind of ISO 2022 such as
174
+ // ISO-2022-JP.
175
+ bool IsIso2022Encoding(Encoding enc);
176
+
177
+ // IsIso2022JpOrVariant
178
+ // --------------------
179
+ //
180
+ // Returns true if the encoding is ISO-2022-JP or a variant such as
181
+ // KDDI's ISO-2022-JP.
182
+ bool IsIso2022JpOrVariant(Encoding enc);
183
+
184
+ // IsShiftJisOrVariant
185
+ // --------------------
186
+ //
187
+ // Returns true if the encoding is Shift_JIS or a variant such as
188
+ // KDDI's Shift_JIS.
189
+ bool IsShiftJisOrVariant(Encoding enc);
190
+
191
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
192
+ // -----------------------------------------
193
+ //
194
+ // Returns true if enc is a Japanese cell-phone-carrier-specific encoding
195
+ // such as KDDI_SHIFT_JIS.
196
+ bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
197
+
198
+
199
+
200
+ // *************************************************************
201
+ // ENCODING NAMES
202
+ //
203
+ // This interface defines a standard name for each valid encoding, and
204
+ // a standard name for invalid encodings. (Some names use all upper
205
+ // case, but others use mixed case.)
206
+ //
207
+ // EncodingName() [Encoding to name]
208
+ // MimeEncodingName() [Encoding to name]
209
+ // EncodingFromName() [name to Encoding]
210
+ // EncodingNameAliasToEncoding() [name to Encoding]
211
+ // default_encoding_name()
212
+ // invalid_encoding_name()
213
+ // *************************************************************
214
+
215
+ // EncodingName
216
+ // ------------
217
+ //
218
+ // Given the encoding, returns its standard name.
219
+ // Return invalid_encoding_name() if the encoding is invalid.
220
+ //
221
+ const char* EncodingName(Encoding enc);
222
+
223
+ //
224
+ // MimeEncodingName
225
+ // ----------------
226
+ //
227
+ // Return the "preferred MIME name" of an encoding.
228
+ //
229
+ // This name is suitable for using in HTTP headers, HTML tags,
230
+ // and as the "charset" parameter of a MIME Content-Type.
231
+ const char* MimeEncodingName(Encoding enc);
232
+
233
+
234
+ // The maximum length of an encoding name
235
+ const int kMaxEncodingNameSize = 50;
236
+
237
+ // The standard name of the default encoding.
238
+ const char* default_encoding_name();
239
+
240
+ // The name used for an invalid encoding.
241
+ const char* invalid_encoding_name();
242
+
243
+ // EncodingFromName
244
+ // ----------------
245
+ //
246
+ // If enc_name matches the standard name of an Encoding, using a
247
+ // case-insensitive comparison, set *encoding to that Encoding and
248
+ // return true. Otherwise set *encoding to UNKNOWN_ENCODING and
249
+ // return false.
250
+ //
251
+ // REQUIRES: encoding must not be NULL.
252
+ //
253
+ bool EncodingFromName(const char* enc_name, Encoding *encoding);
254
+
255
+ //
256
+ // EncodingNameAliasToEncoding
257
+ // ---------------------------
258
+ //
259
+ // If enc_name matches the standard name or an alias of an Encoding,
260
+ // using a case-insensitive comparison, return that
261
+ // Encoding. Otherwise, return UNKNOWN_ENCODING.
262
+ //
263
+ // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
264
+ // GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
265
+ // common variations with hyphens and underscores (e.g., "koi8-u" and
266
+ // "koi8u" for RUSSIAN_KOI8_R).
267
+
268
+ Encoding EncodingNameAliasToEncoding(const char *enc_name);
269
+
270
+
271
+ // *************************************************************
272
+ // Miscellany
273
+ // *************************************************************
274
+
275
+ // PreferredWebOutputEncoding
276
+ // --------------------------
277
+ //
278
+ // Some multi-byte encodings use byte values that coincide with the
279
+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
280
+ // can misinterpret these, as indicated in an external XSS report from
281
+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
282
+ // also use UTF8 instead of encodings that we don't support in our
283
+ // output, and we generally try to be conservative in what we send out.
284
+ // Where the client asks for single- or double-byte encodings that are
285
+ // not as common, we substitute a more common single- or double-byte
286
+ // encoding, if there is one, thereby preserving the client's intent
287
+ // to use less space than UTF-8. This also means that characters
288
+ // outside the destination set will be converted to HTML NCRs (&#NNN;)
289
+ // if requested.
290
+ Encoding PreferredWebOutputEncoding(Encoding enc);
291
+
292
+
293
+ // InitEncodings
294
+ // -------------
295
+ //
296
+ // Ensures the encodings module has been initialized. Normally this happens
297
+ // during InitGoogle, but this allows access for scripts that don't
298
+ // support InitGoogle.
299
+ void InitEncodings();
300
+
301
+ #endif // ENCODINGS_PUBLIC_ENCODINGS_H_
@@ -0,0 +1,7 @@
1
# Builds the native cld extension by running the project's build.sh, then
# puts the resulting shared object next to this file and writes a stub
# Makefile whose only target is "install".
require "rake"
require "fileutils"

# Project root: two directories above the directory holding this script.
home_dir = File.expand_path(File.join(File.dirname(__FILE__), "../../"))
puts home_dir

ext_dir = File.join(home_dir, "ext", "cld")

# Run the build from the project root. Dir.chdir raises if the directory
# cannot be entered, so build.sh is never run from the wrong place, and no
# path is interpolated into a shell command line.
Dir.chdir(home_dir) { sh "./build.sh" }

# Move the freshly built library into the extension directory.
FileUtils.mv(File.join(home_dir, "cld.so"), ext_dir)

# Stub Makefile with a single harmless "install" target.
File.open(File.join(ext_dir, "Makefile"), "w") do |makefile|
  makefile.write("install:\n\tdate\n")
end
@@ -0,0 +1,337 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "languages/public/languages.h"
6
+
7
+ #include "base/string_util.h"
8
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
9
+
10
+
11
+ Language default_language() {return ENGLISH;}
12
+
13
+
14
+ // Language names and codes
15
+
16
// One row of the language table: a language's name plus the codes it is
// known by.  A code that a language does not have is stored as NULL.
struct LanguageInfo {
  // Name of the language.
  const char* language_name_;
  // The ISO-639-1 code for the language.
  const char* language_code_639_1_;
  // The ISO-639-2 code for the language.
  const char* language_code_639_2_;
  // Some nonstandard code for the language.
  const char* language_code_other_;
};
22
+
23
+ static const LanguageInfo kLanguageInfoTable[] = {
24
+ { "ENGLISH", "en", "eng", NULL},
25
+ { "DANISH", "da", "dan", NULL},
26
+ { "DUTCH", "nl", "dut", NULL},
27
+ { "FINNISH", "fi", "fin", NULL},
28
+ { "FRENCH", "fr", "fre", NULL},
29
+ { "GERMAN", "de", "ger", NULL},
30
+ { "HEBREW", "he", "heb", NULL},
31
+ { "ITALIAN", "it", "ita", NULL},
32
+ { "Japanese", "ja", "jpn", NULL},
33
+ { "Korean", "ko", "kor", NULL},
34
+ { "NORWEGIAN", "nb", "nor", NULL},
35
+ { "POLISH", "pl", "pol", NULL},
36
+ { "PORTUGUESE", "pt", "por", NULL},
37
+ { "RUSSIAN", "ru", "rus", NULL},
38
+ { "SPANISH", "es", "spa", NULL},
39
+ { "SWEDISH", "sv", "swe", NULL},
40
+ { "Chinese", "zh", "chi", "zh-CN"},
41
+ { "CZECH", "cs", "cze", NULL},
42
+ { "GREEK", "el", "gre", NULL},
43
+ { "ICELANDIC", "is", "ice", NULL},
44
+ { "LATVIAN", "lv", "lav", NULL},
45
+ { "LITHUANIAN", "lt", "lit", NULL},
46
+ { "ROMANIAN", "ro", "rum", NULL},
47
+ { "HUNGARIAN", "hu", "hun", NULL},
48
+ { "ESTONIAN", "et", "est", NULL},
49
+ // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
50
+ // and "Unknown", they are essentially the same. Need to unify them.
51
+ // "un" and "ut" are invented by us, not from ISO-639.
52
+ //
53
+ { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
54
+ { "Unknown", NULL, NULL, "un"},
55
+ { "BULGARIAN", "bg", "bul", NULL},
56
+ { "CROATIAN", "hr", "scr", NULL},
57
+ { "SERBIAN", "sr", "scc", NULL},
58
+ { "IRISH", "ga", "gle", NULL},
59
+ { "GALICIAN", "gl", "glg", NULL},
60
+ // Impossible to tell Tagalog from Filipino at the moment.
61
+ // Use ISO 639-2 code for Filipino here.
62
+ { "TAGALOG", NULL, "fil", NULL},
63
+ { "TURKISH", "tr", "tur", NULL},
64
+ { "UKRAINIAN", "uk", "ukr", NULL},
65
+ { "HINDI", "hi", "hin", NULL},
66
+ { "MACEDONIAN", "mk", "mac", NULL},
67
+ { "BENGALI", "bn", "ben", NULL},
68
+ { "INDONESIAN", "id", "ind", NULL},
69
+ { "LATIN", "la", "lat", NULL},
70
+ { "MALAY", "ms", "may", NULL},
71
+ { "MALAYALAM", "ml", "mal", NULL},
72
+ { "WELSH", "cy", "wel", NULL},
73
+ { "NEPALI", "ne", "nep", NULL},
74
+ { "TELUGU", "te", "tel", NULL},
75
+ { "ALBANIAN", "sq", "alb", NULL},
76
+ { "TAMIL", "ta", "tam", NULL},
77
+ { "BELARUSIAN", "be", "bel", NULL},
78
+ { "JAVANESE", "jw", "jav", NULL},
79
+ { "OCCITAN", "oc", "oci", NULL},
80
+ { "URDU", "ur", "urd", NULL},
81
+ { "BIHARI", "bh", "bih", NULL},
82
+ { "GUJARATI", "gu", "guj", NULL},
83
+ { "THAI", "th", "tha", NULL},
84
+ { "ARABIC", "ar", "ara", NULL},
85
+ { "CATALAN", "ca", "cat", NULL},
86
+ { "ESPERANTO", "eo", "epo", NULL},
87
+ { "BASQUE", "eu", "baq", NULL},
88
+ { "INTERLINGUA", "ia", "ina", NULL},
89
+ { "KANNADA", "kn", "kan", NULL},
90
+ { "PUNJABI", "pa", "pan", NULL},
91
+ { "SCOTS_GAELIC", "gd", "gla", NULL},
92
+ { "SWAHILI", "sw", "swa", NULL},
93
+ { "SLOVENIAN", "sl", "slv", NULL},
94
+ { "MARATHI", "mr", "mar", NULL},
95
+ { "MALTESE", "mt", "mlt", NULL},
96
+ { "VIETNAMESE", "vi", "vie", NULL},
97
+ { "FRISIAN", "fy", "fry", NULL},
98
+ { "SLOVAK", "sk", "slo", NULL},
99
+ { "ChineseT",
100
+ NULL, NULL, // We intentionally set these 2 fields to NULL to avoid
101
+ // confusion between CHINESE_T and CHINESE.
102
+ "zh-TW"},
103
+ { "FAROESE", "fo", "fao", NULL},
104
+ { "SUNDANESE", "su", "sun", NULL},
105
+ { "UZBEK", "uz", "uzb", NULL},
106
+ { "AMHARIC", "am", "amh", NULL},
107
+ { "AZERBAIJANI", "az", "aze", NULL},
108
+ { "GEORGIAN", "ka", "geo", NULL},
109
+ { "TIGRINYA", "ti", "tir", NULL},
110
+ { "PERSIAN", "fa", "per", NULL},
111
+ { "BOSNIAN", "bs", "bos", NULL},
112
+ { "SINHALESE", "si", "sin", NULL},
113
+ { "NORWEGIAN_N", "nn", "nno", NULL},
114
+ { "PORTUGUESE_P", NULL, NULL, "pt-PT"},
115
+ { "PORTUGUESE_B", NULL, NULL, "pt-BR"},
116
+ { "XHOSA", "xh", "xho", NULL},
117
+ { "ZULU", "zu", "zul", NULL},
118
+ { "GUARANI", "gn", "grn", NULL},
119
+ { "SESOTHO", "st", "sot", NULL},
120
+ { "TURKMEN", "tk", "tuk", NULL},
121
+ { "KYRGYZ", "ky", "kir", NULL},
122
+ { "BRETON", "br", "bre", NULL},
123
+ { "TWI", "tw", "twi", NULL},
124
+ { "YIDDISH", "yi", "yid", NULL},
125
+ { "SERBO_CROATIAN", "sh", NULL, NULL},
126
+ { "SOMALI", "so", "som", NULL},
127
+ { "UIGHUR", "ug", "uig", NULL},
128
+ { "KURDISH", "ku", "kur", NULL},
129
+ { "MONGOLIAN", "mn", "mon", NULL},
130
+ { "ARMENIAN", "hy", "arm", NULL},
131
+ { "LAOTHIAN", "lo", "lao", NULL},
132
+ { "SINDHI", "sd", "snd", NULL},
133
+ { "RHAETO_ROMANCE", "rm", "roh", NULL},
134
+ { "AFRIKAANS", "af", "afr", NULL},
135
+ { "LUXEMBOURGISH", "lb", "ltz", NULL},
136
+ { "BURMESE", "my", "bur", NULL},
137
+ // KHMER is known as Cambodian for Google user interfaces.
138
+ { "KHMER", "km", "khm", NULL},
139
+ { "TIBETAN", "bo", "tib", NULL},
140
+ { "DHIVEHI", "dv", "div", NULL},
141
+ { "CHEROKEE", NULL, "chr", NULL},
142
+ { "SYRIAC", NULL, "syr", NULL},
143
+ { "LIMBU", NULL, NULL, "sit-NP"},
144
+ { "ORIYA", "or", "ori", NULL},
145
+ { "ASSAMESE", "as", "asm", NULL},
146
+ { "CORSICAN", "co", "cos", NULL},
147
+ { "INTERLINGUE", "ie", "ine", NULL},
148
+ { "KAZAKH", "kk", "kaz", NULL},
149
+ { "LINGALA", "ln", "lin", NULL},
150
+ { "MOLDAVIAN", "mo", "mol", NULL},
151
+ { "PASHTO", "ps", "pus", NULL},
152
+ { "QUECHUA", "qu", "que", NULL},
153
+ { "SHONA", "sn", "sna", NULL},
154
+ { "TAJIK", "tg", "tgk", NULL},
155
+ { "TATAR", "tt", "tat", NULL},
156
+ { "TONGA", "to", "tog", NULL},
157
+ { "YORUBA", "yo", "yor", NULL},
158
+ { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
159
+ { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL},
160
+ { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
161
+ { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
162
+ { "MAORI", "mi", "mao", NULL},
163
+ { "WOLOF", "wo", "wol", NULL},
164
+ { "ABKHAZIAN", "ab", "abk", NULL},
165
+ { "AFAR", "aa", "aar", NULL},
166
+ { "AYMARA", "ay", "aym", NULL},
167
+ { "BASHKIR", "ba", "bak", NULL},
168
+ { "BISLAMA", "bi", "bis", NULL},
169
+ { "DZONGKHA", "dz", "dzo", NULL},
170
+ { "FIJIAN", "fj", "fij", NULL},
171
+ { "GREENLANDIC", "kl", "kal", NULL},
172
+ { "HAUSA", "ha", "hau", NULL},
173
+ { "HAITIAN_CREOLE", "ht", NULL, NULL},
174
+ { "INUPIAK", "ik", "ipk", NULL},
175
+ { "INUKTITUT", "iu", "iku", NULL},
176
+ { "KASHMIRI", "ks", "kas", NULL},
177
+ { "KINYARWANDA", "rw", "kin", NULL},
178
+ { "MALAGASY", "mg", "mlg", NULL},
179
+ { "NAURU", "na", "nau", NULL},
180
+ { "OROMO", "om", "orm", NULL},
181
+ { "RUNDI", "rn", "run", NULL},
182
+ { "SAMOAN", "sm", "smo", NULL},
183
+ { "SANGO", "sg", "sag", NULL},
184
+ { "SANSKRIT", "sa", "san", NULL},
185
+ { "SISWANT", "ss", "ssw", NULL},
186
+ { "TSONGA", "ts", "tso", NULL},
187
+ { "TSWANA", "tn", "tsn", NULL},
188
+ { "VOLAPUK", "vo", "vol", NULL},
189
+ { "ZHUANG", "za", "zha", NULL},
190
+ { "KHASI", NULL, "kha", NULL},
191
+ { "SCOTS", NULL, "sco", NULL},
192
+ { "GANDA", "lg", "lug", NULL},
193
+ { "MANX", "gv", "glv", NULL},
194
+ { "MONTENEGRIN", NULL, NULL, "sr-ME"},
195
+ { "XX", NULL, NULL, "XX"},
196
+ };
197
+
198
+ COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,
199
+ kLanguageInfoTable_has_incorrect_length);
200
+
201
+
202
+ // LANGUAGE NAMES
203
+
204
+ const char* default_language_name() {
205
+ return kLanguageInfoTable[ENGLISH].language_name_;
206
+ }
207
+
208
// Name reported for a Language value that is not valid.
static const char* const kInvalidLanguageName = "invalid_language";

// Returns the name used for an invalid language.
const char* invalid_language_name() { return kInvalidLanguageName; }
213
+
214
+ const char* LanguageName(Language lang) {
215
+ return IsValidLanguage(lang)
216
+ ? kLanguageInfoTable[lang].language_name_
217
+ : kInvalidLanguageName;
218
+ }
219
+
220
+
221
+
222
+ // LANGUAGE CODES
223
+
224
+
225
// Code reported when a language has no usable code.  The leading space is
// deliberate: it keeps this string from ever matching a two-letter
// language code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Returns the code used for an invalid language.
const char* invalid_language_code() { return kInvalidLanguageCode; }
233
+
234
+ const char * LanguageCode(Language lang) {
235
+ if (! IsValidLanguage(lang))
236
+ return kInvalidLanguageCode;
237
+ const LanguageInfo& info = kLanguageInfoTable[lang];
238
+ if (info.language_code_639_1_) {
239
+ return info.language_code_639_1_;
240
+ } else if (info.language_code_639_2_) {
241
+ return info.language_code_639_2_;
242
+ } else if (info.language_code_other_) {
243
+ return info.language_code_other_;
244
+ } else {
245
+ return kInvalidLanguageCode;
246
+ }
247
+ }
248
+
249
+ const char* default_language_code() {
250
+ return kLanguageInfoTable[ENGLISH].language_code_639_1_;
251
+ }
252
+
253
+ const char* LanguageCodeISO639_1(Language lang) {
254
+ if (! IsValidLanguage(lang))
255
+ return kInvalidLanguageCode;
256
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_1_)
257
+ return code;
258
+ return kInvalidLanguageCode;
259
+ }
260
+
261
+ const char* LanguageCodeISO639_2(Language lang) {
262
+ if (! IsValidLanguage(lang))
263
+ return kInvalidLanguageCode;
264
+ if (const char* code = kLanguageInfoTable[lang].language_code_639_2_)
265
+ return code;
266
+ return kInvalidLanguageCode;
267
+ }
268
+
269
+ const char* LanguageCodeWithDialects(Language lang) {
270
+ if (lang == CHINESE)
271
+ return "zh-CN";
272
+ return LanguageCode(lang);
273
+ }
274
+
275
+
276
+
277
+ bool LanguageFromCode(const char* lang_code, Language *language) {
278
+ *language = UNKNOWN_LANGUAGE;
279
+ if ( lang_code == NULL ) return false;
280
+
281
+ for ( int i = 0 ; i < kNumLanguages ; i++ ) {
282
+ const LanguageInfo& info = kLanguageInfoTable[i];
283
+ if ((info.language_code_639_1_ &&
284
+ !base::strcasecmp(lang_code, info.language_code_639_1_)) ||
285
+ (info.language_code_639_2_ &&
286
+ !base::strcasecmp(lang_code, info.language_code_639_2_)) ||
287
+ (info.language_code_other_ &&
288
+ !base::strcasecmp(lang_code, info.language_code_other_))) {
289
+ *language = static_cast<Language>(i);
290
+ return true;
291
+ }
292
+ }
293
+
294
+ // For convenience, this function can also parse the non-standard
295
+ // five-letter language codes "zh-cn" and "zh-tw" which are used by
296
+ // front-ends such as GWS to distinguish Simplified from Traditional
297
+ // Chinese.
298
+ if (!base::strcasecmp(lang_code, "zh-cn") ||
299
+ !base::strcasecmp(lang_code, "zh_cn")) {
300
+ *language = CHINESE;
301
+ return true;
302
+ }
303
+ if (!base::strcasecmp(lang_code, "zh-tw") ||
304
+ !base::strcasecmp(lang_code, "zh_tw")) {
305
+ *language = CHINESE_T;
306
+ return true;
307
+ }
308
+ if (!base::strcasecmp(lang_code, "sr-me") ||
309
+ !base::strcasecmp(lang_code, "sr_me")) {
310
+ *language = MONTENEGRIN;
311
+ return true;
312
+ }
313
+
314
+ // Process language-code synonyms.
315
+ if (!base::strcasecmp(lang_code, "he")) {
316
+ *language = HEBREW; // Use "iw".
317
+ return true;
318
+ }
319
+ if (!base::strcasecmp(lang_code, "in")) {
320
+ *language = INDONESIAN; // Use "id".
321
+ return true;
322
+ }
323
+ if (!base::strcasecmp(lang_code, "ji")) {
324
+ *language = YIDDISH; // Use "yi".
325
+ return true;
326
+ }
327
+
328
+ // Process language-detection synonyms.
329
+ // These distinct languages cannot be differentiated by our current
330
+ // language-detection algorithms.
331
+ if (!base::strcasecmp(lang_code, "fil")) {
332
+ *language = TAGALOG;
333
+ return true;
334
+ }
335
+
336
+ return false;
337
+ }