RubyGems - language_detection - Versions diffs - 0.0.1 - Mend

language_detection 0.0.1

Files changed (100) hide show

data/.gitignore +19 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +85 -0
data/Rakefile +11 -0
data/ext/cld/Makefile +34 -0
data/ext/cld/base/basictypes.h +348 -0
data/ext/cld/base/build_config.h +124 -0
data/ext/cld/base/casts.h +156 -0
data/ext/cld/base/commandlineflags.h +443 -0
data/ext/cld/base/crash.h +41 -0
data/ext/cld/base/dynamic_annotations.h +358 -0
data/ext/cld/base/global_strip_options.h +59 -0
data/ext/cld/base/log_severity.h +46 -0
data/ext/cld/base/logging.h +1403 -0
data/ext/cld/base/macros.h +243 -0
data/ext/cld/base/port.h +54 -0
data/ext/cld/base/scoped_ptr.h +428 -0
data/ext/cld/base/stl_decl.h +0 -0
data/ext/cld/base/stl_decl_msvc.h +107 -0
data/ext/cld/base/string_util.h +29 -0
data/ext/cld/base/strtoint.h +93 -0
data/ext/cld/base/template_util.h +96 -0
data/ext/cld/base/type_traits.h +198 -0
data/ext/cld/base/vlog_is_on.h +143 -0
data/ext/cld/cld.so +0 -0
data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
data/ext/cld/encodings/internal/encodings.cc +12 -0
data/ext/cld/encodings/lang_enc.h +254 -0
data/ext/cld/encodings/proto/encodings.pb.h +169 -0
data/ext/cld/encodings/public/encodings.h +301 -0
data/ext/cld/extconf.rb +1 -0
data/ext/cld/language_detection.cc +88 -0
data/ext/cld/languages/internal/languages.cc +337 -0
data/ext/cld/languages/proto/languages.pb.h +179 -0
data/ext/cld/languages/public/languages.h +379 -0
data/language_detection.gemspec +28 -0
data/lib/language_detection/string.rb +1 -0
data/lib/language_detection/version.rb +3 -0
data/lib/language_detection.rb +54 -0
data/test/_helper.rb +15 -0
data/test/fixtures/languages.csv +80 -0
data/test/language_detection_test.rb +88 -0
metadata +250 -0

data/ext/cld/encodings/lang_enc.h ADDED Viewed

@@ -0,0 +1,254 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// This file is for i18n. It contains two enums, namely Language and
+// Encoding, where Language is the linguistic convention, and Encoding
+// contains information on both language encoding and character set.
+//
+// The language and encoding are both based on Teragram's conventions,
+// except for some common ISO-8859 encodings that are not detected by
+// Teragram but might be in the future.
+//
+// This file also includes functions that do mappings among
+// Language/Encoding enums, language/encoding string names (typically
+// the output from Language Encoding identifier), and language codes
+// (iso 639), and two-letter country codes (iso 3166)
+//
+// NOTE: Both Language and Encoding enums should always start from
+// zero value. This assumption has been made and used.
+//
+#ifndef ENCODINGS_LANG_ENC_H__
+#define ENCODINGS_LANG_ENC_H__
+#include "languages/public/languages.h"
+#include "encodings/public/encodings.h"
+// EncodingsForLanguage
+// --------------------
+//
+// Given the language, returns a pointer to an array of encodings this
+// language supports. Typically, the encs array has at least one
+// element: UNKNOWN_ENCODING, which is always the last element of the
+// array. The first encoding is the default encoding of the language.
+// Return NULL if the input is invalid.
+//
+// Note: The output encoding array does not include ASCII_7BIT, UTF8
+// or UNICODE which are good for all languages. TODO: Find out whether
+// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
+// as special cases.
+//
+const Encoding* EncodingsForLanguage(Language lang);
+// DefaultEncodingForLanguage
+// --------------------------
+//
+// Given the language, returns the default encoding for the language
+// via the argument encoding.
+//
+// The function returns true if the input lang is valid. Otherwise,
+// false is returned, and encoding is set to UNKNOWN_ENCODING.
+//
+bool DefaultEncodingForLanguage(Language lang,
+                                Encoding *encoding);
+// LanguagesForEncoding
+// --------------------
+//
+// Given the encoding, returns a pointer to an array of languages this
+// encoding supports. Typically, the langs array has at least one
+// element: UNKNOWN_LANGUAGE, which is always the last element of the
+// array. The first language in the array is the most popular
+// language for that encoding. NULL is returned if the input is
+// invalid.
+//
+// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
+// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
+// the languages or to treat these encodings as special cases.
+//
+// For other known encodings, ENGLISH is always included. This is
+// because English (Latin) characters are included in each encoding.
+//
+const Language* LanguagesForEncoding(Encoding enc);
+// DefaultLanguageForEncoding
+// --------------------------
+//
+// Given the encoding, returns the default language for that encoding
+// via the argument language.
+//
+// The function returns true if the input enc is valid. Otherwise,
+// false is returned, and language is set to UNKNOWN_LANGUAGE.
+//
+// Note, this function is more useful for the encodings that have only
+// one corresponding language i.e. shift_jis => Japanese. There are
+// cases where multiple languages have the same encoding, for which the
+// default language is an arbitrary choice from them.
+//
+bool DefaultLanguageForEncoding(Encoding enc, Language* language);
+//
+// IsLangEncCompatible
+// -------------------
+//
+// This function is to determine whether the input language and
+// encoding are compatible. For example, FRENCH and LATIN1 are
+// compatible, but FRENCH and GB are not.
+//
+// If either lang or enc is invalid return false.
+// If lang is unknown, return true.
+//    (e.g. we can detect a page's encoding as latin1 from metatag info, but
+//     cannot derive its language since more than one language is
+//     encoded in Latin1)
+// If language is known, but encoding is unknown, return false.
+//    (return true will do us no good since we cannot convert to UTF8 anyway)
+// If enc is unicode or utf8, return true.
+// Otherwise check if lang is supported by enc and enc supported by
+// lang.
+//
+bool IsLangEncCompatible(Language lang, Encoding enc);
+//
+// DominantLanguageFromEncoding
+// ----------------------------
+//
+// This function determines whether there exists a dominant language for the
+// input encoding. For example, the encoding GB has a dominant
+// language (Chinese), but Latin1 does not.
+//
+// The word "dominant" is used here because English characters are
+// included in each encoding.
+//
+// If there is no dominant language for the encoding, such as Latin1,
+// UNKNOWN_LANGUAGE is returned.
+//
+Language DominantLanguageFromEncoding(Encoding enc);
+// LanguageCode
+// ------------------------
+// Given the Language and Encoding, return language code with dialects
+// (>= 2 letters).  Encoding is necessary to disambiguate between
+// Simplified and Traditional Chinese.
+//
+// See the note on Chinese Language Codes in
+// i18n/languages/public/languages.h
+// for the details.
+const char* LanguageCode(Language lang, Encoding enc);
+//
+// IsEncodingWithSupportedLanguage()
+// ---------------------------------
+//
+// There are some encodings listed here just because they are commonly
+// used.  There is no interface language for them yet. They are not
+// detected by Teragram, but can be detected from the meta info of the
+// HTML page.
+//
+// For example, we have listed ARABIC_ENCODING but there is no Arabic in
+// the Language enum. If the user inputs an Arabic query from Google
+// main page, Netscape will just send the raw bytes to GWS, and GWS
+// will treat them as Latin1.  Therefore, there is no use to detect
+// ARABIC_ENCODING for indexing, since they will never match the
+// queries which are treated as Latin1 by GWS. On the contrary, if we
+// treat a page with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
+// let it fall through as Latin1 at indexing time. And there might be a
+// match for some ARABIC queries which are also treated as Latin1 by
+// GWS. In fact, some people are relying on this feature to do Arabic
+// searches.
+//
+// Thus for these types of encodings, before we have the UI support for
+// their language and have a pretty comprehensive language/encoding
+// identification quality, it is better to revert them to
+// UNKNOWN_ENCODING.
+//
+// This function checks whether the input encoding is one with
+// an interface language.
+bool IsEncodingWithSupportedLanguage(Encoding enc);
+//
+// LangsFromCountryCode and EncFromCountryCode
+// -------------------------------------------
+//
+// These two functions return the possible languages and encodings,
+// respectively, according to the input country code, which is a
+// 2-letter string. The country code is usually specified in the url
+// of a document.
+//
+//
+// LangsFromCountryCode
+// --------------------
+//
+// This function takes a string of arbitrary length. It treats the
+// first 2 bytes of the string as the country code, as defined in iso
+// 3166-1993 (E).  It returns, via arguments, an array of the
+// languages that are popular in that country, roughly in order of
+// popularity, together with the size of the array.
+//
+// This function returns true if we have language information for
+// country_code.  Otherwise, it returns false.
+//
+bool LangsFromCountryCode(const char* country_code,
+                          const Language** lang_arry,
+                          int* num_langs);
+//
+// EncFromCountryCode
+// ------------------
+//
+// This function takes a string of arbitrary length. It treats the
+// first 2 bytes of that string as the country code, as defined in iso
+// 3166-1993 (E). It sets *enc to the encoding that is
+// most often used for the languages spoken in that country.
+//
+// This function returns true if we have encoding information for
+// country_code.  Otherwise, it returns false, and *enc is set to
+// UNKNOWN_ENCODING.
+//
+bool EncFromCountryCode(const char* country_code, Encoding* enc);
+// VisualType
+// ----------
+//
+// Right-to-left documents may be in logical or visual order. When they
+// are in visual order we convert them to logical order before processing.
+// This enum lists the types of visual document we can encounter.
+// Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
+// The other documents in those languages, and all documents in non-RTL
+// languages, will be NOT_VISUAL_DOCUMENT.
+enum VisualType {  // Kinds of visual-order RTL document; 0 means not visual.
+  NOT_VISUAL_DOCUMENT = 0,  // Logical-order RTL and all non-RTL documents.
+  VISUAL_HEBREW_HTML,  // HTML documents in the legacy visual order.
+  CONVERTED_RTL_PDF,   // Converted RTL PDFs, which are always visual.
+};
+VisualType default_visualtype();
+// VisualTypeName
+// --------------
+//
+// Given the visual type, returns a string name useful for debug output.
+const char* VisualTypeName(VisualType visualtype);
+// InitLangEnc
+// -----------
+//
+// Ensures the LangEnc module has been initialized.  Normally this
+// happens during InitGoogle, but this allows access for scripts that
+// don't support InitGoogle. InitLangEnc calls InitEncodings (see
+// i18n/encodings/public/encodings.h) and also initializes data
+// structures used in lang_enc.cc.
+//
+void InitLangEnc();
+#endif  // ENCODINGS_LANG_ENC_H__

data/ext/cld/encodings/proto/encodings.pb.h ADDED Viewed

@@ -0,0 +1,169 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_PROTO_ENCODINGS_PB_H_
+#define ENCODINGS_PROTO_ENCODINGS_PB_H_
+enum Encoding {  // Explicitly numbered from 0; NUM_ENCODINGS is the count.
+  ISO_8859_1           =  0,  // Teragram ASCII
+  ISO_8859_2           =  1,  // Teragram Latin2
+  ISO_8859_3           =  2,  // in BasisTech but not in Teragram
+  ISO_8859_4           =  3,  // Teragram Latin4
+  ISO_8859_5           =  4,  // Teragram ISO-8859-5
+  ISO_8859_6           =  5,  // Teragram Arabic
+  ISO_8859_7           =  6,  // Teragram Greek
+  ISO_8859_8           =  7,  // Teragram Hebrew
+  ISO_8859_9           =  8,  // in BasisTech but not in Teragram
+  ISO_8859_10          =  9,  // in BasisTech but not in Teragram
+  JAPANESE_EUC_JP      = 10,  // Teragram EUC_JP
+  JAPANESE_SHIFT_JIS   = 11,  // Teragram SJS
+  JAPANESE_JIS         = 12,  // Teragram JIS
+  CHINESE_BIG5         = 13,  // Teragram BIG5
+  CHINESE_GB           = 14,  // Teragram GB
+  CHINESE_EUC_CN       = 15,  // Misnamed. Should be EUC_TW. Was Basis Tech
+                              // CNS11643EUC, before that Teragram EUC-CN(!)
+                              // See //i18n/basistech/basistech_encodings.h
+  KOREAN_EUC_KR        = 16,  // Teragram KSC
+  UNICODE              = 17,  // Teragram Unicode
+  CHINESE_EUC_DEC      = 18,  // Misnamed. Should be EUC_TW. Was Basis Tech
+                              // CNS11643EUC, before that Teragram EUC.
+  CHINESE_CNS          = 19,  // Misnamed. Should be EUC_TW. Was Basis Tech
+                              // CNS11643EUC, before that Teragram CNS.
+  CHINESE_BIG5_CP950   = 20,  // Teragram BIG5_CP950
+  JAPANESE_CP932       = 21,  // Teragram CP932
+  UTF8                 = 22,
+  UNKNOWN_ENCODING     = 23,
+  ASCII_7BIT           = 24,  // ISO_8859_1 with all characters <= 127.
+                              // Should be present only in the crawler
+                              // and in the repository,
+                              // *never* as a result of Document::encoding().
+  RUSSIAN_KOI8_R       = 25,  // Teragram KOI8R
+  RUSSIAN_CP1251       = 26,  // Teragram CP1251
+  //----------------------------------------------------------
+  // These are _not_ output from teragram. Instead, they are as
+  // detected in the headers of usenet articles.
+  MSFT_CP1252          = 27,  // CP1252 aka MSFT euro ascii
+  RUSSIAN_KOI8_RU      = 28,  // CP21866 aka KOI8-U, used for Ukrainian.
+                              // Misnamed, this is _not_ KOI8-RU but KOI8-U.
+                              // KOI8-U is used much more often than KOI8-RU.
+  MSFT_CP1250          = 29,  // CP1250 aka MSFT eastern european
+  ISO_8859_15          = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized
+  //----------------------------------------------------------
+  //----------------------------------------------------------
+  // These are in BasisTech but not in Teragram. They are
+  // needed for new interface languages. Now detected by
+  // research langid
+  MSFT_CP1254          = 31,  // used for Turkish
+  MSFT_CP1257          = 32,  // used in Baltic countries
+  //----------------------------------------------------------
+  //----------------------------------------------------------
+  //----------------------------------------------------------
+  // New encodings detected by Teragram
+  ISO_8859_11          = 33,  // aka TIS-620, used for Thai
+  MSFT_CP874           = 34,  // used for Thai
+  MSFT_CP1256          = 35,  // used for Arabic
+  //----------------------------------------------------------
+  // Detected as ISO_8859_8 by Teragram, but can be found in META tags
+  MSFT_CP1255          = 36,  // Logical Hebrew Microsoft
+  ISO_8859_8_I         = 37,  // Iso Hebrew Logical
+  HEBREW_VISUAL        = 38,  // Iso Hebrew Visual
+  //----------------------------------------------------------
+  //----------------------------------------------------------
+  // Detected by research langid
+  CZECH_CP852          = 39,
+  CZECH_CSN_369103     = 40,  // aka ISO_IR_139 aka KOI8_CS
+  MSFT_CP1253          = 41,  // used for Greek
+  RUSSIAN_CP866        = 42,
+  //----------------------------------------------------------
+  //----------------------------------------------------------
+  // Handled by iconv in glibc
+  ISO_8859_13          = 43,
+  ISO_2022_KR          = 44,
+  GBK                  = 45,
+  GB18030              = 46,
+  BIG5_HKSCS           = 47,
+  ISO_2022_CN          = 48,
+  //-----------------------------------------------------------
+  // Detected by xin liu's detector
+  // Handled by transcoder
+  // (Indic encodings)
+  TSCII                = 49,
+  TAMIL_MONO           = 50,
+  TAMIL_BI             = 51,
+  JAGRAN               = 52,
+  MACINTOSH_ROMAN      = 53,
+  UTF7                 = 54,
+  BHASKAR              = 55,  // Indic encoding - Devanagari
+  HTCHANAKYA           = 56,  // Indic encoding - Devanagari
+  //-----------------------------------------------------------
+  // These allow a single place (inputconverter and outputconverter)
+  // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
+  // bulk conversions, with interchange-valid checking on input and
+  // fallback if needed on output.
+  UTF16BE              = 57,  // big-endian UTF-16
+  UTF16LE              = 58,  // little-endian UTF-16
+  UTF32BE              = 59,  // big-endian UTF-32
+  UTF32LE              = 60,  // little-endian UTF-32
+  //-----------------------------------------------------------
+  //-----------------------------------------------------------
+  // An encoding that means "This is not text, but it may have some
+  // simple ASCII text embedded". Intended input conversion (not yet
+  // implemented) is to keep strings of >=4 seven-bit ASCII characters
+  // (follow each kept string with an ASCII space), delete the rest of
+  // the bytes. This will pick up and allow indexing of e.g. captions
+  // in JPEGs. No output conversion needed.
+  BINARYENC            = 61,
+  //-----------------------------------------------------------
+  //-----------------------------------------------------------
+  // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
+  // ~{ ... ~} for 2-byte pairs, and the browsers support this.
+  HZ_GB_2312           = 62,
+  //-----------------------------------------------------------
+  //-----------------------------------------------------------
+  // Some external vendors make the common input error of
+  // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
+  UTF8UTF8             = 63,
+  //-----------------------------------------------------------
+  //-----------------------------------------------------------
+  // Handled by transcoder for tamil language specific font
+  // encodings without the support for detection at present.
+  TAM_ELANGO           = 64,  // Elango - Tamil
+  TAM_LTTMBARANI       = 65,  // Barani - Tamil
+  TAM_SHREE            = 66,  // Shree - Tamil
+  TAM_TBOOMIS          = 67,  // TBoomis - Tamil
+  TAM_TMNEWS           = 68,  // TMNews - Tamil
+  TAM_WEBTAMIL         = 69,  // Webtamil - Tamil
+  //-----------------------------------------------------------
+  //-----------------------------------------------------------
+  // Shift_JIS variants used by Japanese cell phone carriers.
+  KDDI_SHIFT_JIS       = 70,
+  DOCOMO_SHIFT_JIS     = 71,
+  SOFTBANK_SHIFT_JIS   = 72,
+  // ISO-2022-JP variants used by KDDI and SoftBank.
+  KDDI_ISO_2022_JP     = 73,
+  SOFTBANK_ISO_2022_JP = 74,
+  //-----------------------------------------------------------
+  NUM_ENCODINGS        = 75,  // Always keep this at the end. It is not a
+                              // valid Encoding enum, it is only used to
+                              // indicate the total number of Encodings.
+};
+#endif  // ENCODINGS_PROTO_ENCODINGS_PB_H_