RubyGems - language_detection - Versions diffs - 0.0.1 - Mend

language_detection 0.0.1

Files changed (100) hide show

data/.gitignore +19 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +85 -0
data/Rakefile +11 -0
data/ext/cld/Makefile +34 -0
data/ext/cld/base/basictypes.h +348 -0
data/ext/cld/base/build_config.h +124 -0
data/ext/cld/base/casts.h +156 -0
data/ext/cld/base/commandlineflags.h +443 -0
data/ext/cld/base/crash.h +41 -0
data/ext/cld/base/dynamic_annotations.h +358 -0
data/ext/cld/base/global_strip_options.h +59 -0
data/ext/cld/base/log_severity.h +46 -0
data/ext/cld/base/logging.h +1403 -0
data/ext/cld/base/macros.h +243 -0
data/ext/cld/base/port.h +54 -0
data/ext/cld/base/scoped_ptr.h +428 -0
data/ext/cld/base/stl_decl.h +0 -0
data/ext/cld/base/stl_decl_msvc.h +107 -0
data/ext/cld/base/string_util.h +29 -0
data/ext/cld/base/strtoint.h +93 -0
data/ext/cld/base/template_util.h +96 -0
data/ext/cld/base/type_traits.h +198 -0
data/ext/cld/base/vlog_is_on.h +143 -0
data/ext/cld/cld.so +0 -0
data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
data/ext/cld/encodings/internal/encodings.cc +12 -0
data/ext/cld/encodings/lang_enc.h +254 -0
data/ext/cld/encodings/proto/encodings.pb.h +169 -0
data/ext/cld/encodings/public/encodings.h +301 -0
data/ext/cld/extconf.rb +1 -0
data/ext/cld/language_detection.cc +88 -0
data/ext/cld/languages/internal/languages.cc +337 -0
data/ext/cld/languages/proto/languages.pb.h +179 -0
data/ext/cld/languages/public/languages.h +379 -0
data/language_detection.gemspec +28 -0
data/lib/language_detection/string.rb +1 -0
data/lib/language_detection/version.rb +3 -0
data/lib/language_detection.rb +54 -0
data/test/_helper.rb +15 -0
data/test/fixtures/languages.csv +80 -0
data/test/language_detection_test.rb +88 -0
metadata +250 -0

data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc ADDED Viewed

@@ -0,0 +1,2574 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include <stdio.h>
+#include <string.h>
+//#include <sys/time.h>                               // for gettimeofday
+#include <string>
+#include "encodings/lang_enc.h"
+#include "encodings/compact_lang_det/compact_lang_det.h"
+#include "encodings/compact_lang_det/compact_lang_det_impl.h"
+#include "encodings/compact_lang_det/getonescriptspan.h"
+#include "encodings/compact_lang_det/letterscript_enum.h"
+#include "encodings/compact_lang_det/tote.h"
+#include "encodings/compact_lang_det/utf8propjustletter.h"
+#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
+#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
+#include "encodings/compact_lang_det/cldutil_dbg.h"
+#include "encodings/compact_lang_det/win/cld_basictypes.h"
+#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
+#include "encodings/compact_lang_det/win/cld_google.h"
+#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
+// Declarations only: the linker supplies the right table definitions.
+extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj;
+extern const cld::CLDTableSummary kCjkBiTable_obj;      // presumably CJK bigrams -- confirm
+extern const cld::CLDTableSummary kQuadTable_obj;       // presumably quadgrams -- confirm
+extern const cld::CLDTableSummary kLongWord8Table_obj;  // presumably long words -- confirm
+DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr");
+DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads");
+DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text");
+DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr");
+DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text");
+// Default width: 20 quadgrams is about 80 bytes, or about 12 words in real text
+DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams");
+static const int kLangHintInitial = 12;  // Boost hinted language by N initially
+static const int kLangHintBoost = 12;    // Boost language by N/16 per quadgram
+static const int kShortSpanThresh = 32;       // Bytes; presumably "short span" cutoff
+static const int kMaxSecondChanceLen = 1024;  // Look at first 1K of short spans
+static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing
+                                                  // after this many text bytes
+static const int kCheapSqueezeTestLen = 256;   // Bytes to test to trigger squeeze
+static const int kSpacesTriggerPercent = 25;   // Trigger squeeze if >=25% spaces
+static const int kPredictTriggerPercent = 67;  // Trigger squeeze if >=67% predicted
+static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
+static const int kSpacesThreshPercent = 25;   // Squeeze if >=25% spaces
+static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted
+static const int kMaxSpaceScan = 32;          // Bytes; presumably a scan limit -- confirm
+static const int kGoodLang1Percent = 70;      // Percent; presumably top-language share -- confirm
+static const int kGoodLang1and2Percent = 93;  // Percent; presumably top-two share -- confirm
+static const int kShortTextThresh = 256;      // Bytes; presumably "short text" cutoff
+static const int kMinChunkSizeQuads = 4;      // Chunk is at least four quads
+static const int kMaxChunkSizeQuads = 1024;   // Chunk is at most 1K quads
+static const int kDefaultWordSpan = 256;      // Scan at least this many initial
+                                              // bytes with word scoring
+static const int kReallyBigWordSpan = 9999999;  // Forces word scoring of all text
+static const int kMinReliableSeq = 50;      // Record in seq if >= 50% reliable
+static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for
+                                                // the cheap compressor
+// Language hints keyed by Encoding; the COMPILE_ASSERT below pins the entry count.
+// Generated by dsites 2008.07.07 from 10% of Base
+// NOTE(review): upper bytes look like (Language + 1) codes, low byte a prob selector -- confirm.
+// Three packed language probs, subscripted by Encoding
+static const uint32 kEncodingHintProbs[] = {
+  0x00000000,  // ASCII
+  0x18120cd5,  // Latin2   POLISH.11 CZECH.5 HUNGARIAN.3
+  0x1d3a4bc9,  // Latin3   AZERBAIJANI.10 BASQUE.3 CROATIAN.1
+  0x030819d4,  // Latin4   ESTONIAN.11 ITALIAN.4 DUTCH.2
+  0x00000000,  // ISO-8859-5
+  0x00003742,  // Arabic   ARABIC.12
+  0x00000000,  // Greek
+  0x00000742,  // Hebrew   HEBREW.12
+  0x00002242,  // Latin5   TURKISH.12
+  0x060419c9,  // Latin6   ESTONIAN.10 FINNISH.3 GERMAN.1
+  0x00000942,  // EUC-JP   Japanese.12
+  0x00000942,  // SJS   Japanese.12
+  0x00000942,  // JIS   Japanese.12
+  0x00004642,  // BIG5   ChineseT.12
+  0x00001142,  // GB   Chinese.12
+  0x46295fcd,  // EUC-CN   UIGHUR.10 MALAY.6 ChineseT.5
+  0x00000a42,  // KSC   Korean.12
+  0x00000000,  // Unicode
+  0x03104674,  // EUC   ChineseT.9 SWEDISH.8 DUTCH.3
+  0x00000000,  // CNS
+  0x0f1146c3,  // BIG5-CP950   ChineseT.9 Chinese.5 SPANISH.4
+  0x00000942,  // CP932   Japanese.12
+  0x00000000,  // UTF8
+  0x00000000,  // Unknown
+  0x00000000,  // ASCII-7-bit
+  0x00000000,  // KOI8R
+  0x00000000,  // CP1251
+  0x00000000,  // CP1252
+  0x00000000,  // KOI8U
+  0x451d12cd,  // CP1250   CZECH.10 CROATIAN.6 SLOVAK.5
+  0x0d06052a,  // ISO-8859-15   FRENCH.9 GERMAN.8 PORTUGUESE.7
+  0x00002242,  // CP1254   TURKISH.12
+  0x191516be,  // CP1257   LITHUANIAN.8 LATVIAN.7 ESTONIAN.7
+  0x08003642,  // ISO-8859-11   THAI.12 ITALIAN.1
+  0x00000000,  // CP874
+  0x00003742,  // CP1256   ARABIC.12
+  0x00000742,  // CP1255   HEBREW.12
+  0x00000000,  // ISO-8859-8-I
+  0x00000000,  // VISUAL
+  0x00000000,  // CP852
+  0x39001242,  // CSN_369103   CZECH.12 ESPERANTO.1
+  0x00000000,  // CP1253
+  0x00000000,  // CP866
+  0x2e001944,  // ISO-8859-13   ESTONIAN.12 ALBANIAN.3
+  0x08090a74,  // ISO-2022-KR   Korean.9 Japanese.8 ITALIAN.3
+  0x00001142,  // GBK   Chinese.12
+  0x4600113d,  // GB18030   Chinese.11 ChineseT.7
+  0x00004642,  // BIG5_HKSCS   ChineseT.12
+  0x00000000,  // ISO_2022_CN
+  0x00000000,  // TSCII
+  0x00000000,  // TAM
+  0x00000000,  // TAB
+  0x00000000,  // JAGRAN
+  0x00000000,  // MACINTOSH
+  0x00000000,  // UTF7
+  0x00000000,  // BHASKAR
+  0x00000000,  // HTCHANAKYA
+  0x090646ca,  // UTF-16BE   ChineseT.10 GERMAN.4 Japanese.2
+  0x00000000,  // UTF-16LE
+  0x00000000,  // UTF-32BE
+  0x00000000,  // UTF-32LE
+  0x00000000,  // X-BINARYENC
+  0x06001142,  // HZ-GB-2312   Chinese.12 GERMAN.1
+  0x461109c2,  // X-UTF8UTF8   Japanese.9 Chinese.5 ChineseT.3
+  0x00000000,  // X-TAM-ELANGO
+  0x00000000,  // X-TAM-LTTMBARANI
+  0x00000000,  // X-TAM-SHREE
+  0x00000000,  // X-TAM-TBOOMIS
+  0x00000000,  // X-TAM-TMNEWS
+  0x00000000,  // X-TAM-WEBTAMIL
+  0x00000000,  // X-KDDI-Shift_JIS
+  0x00000000,  // X-DoCoMo-Shift_JIS
+  0x00000000,  // X-SoftBank-Shift_JIS
+  0x00000000,  // X-KDDI-ISO-2022-JP
+  0x00000000,  // X-SoftBank-ISO-2022-JP
+};
+COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS,
+               kEncodingHintProbs_has_incorrect_size);
+// Language hints keyed by (anchor) Language; the COMPILE_ASSERT below pins the entry count.
+// Generated by dsites 2008.07.07 from 10% of Base
+// NOTE(review): upper bytes look like (Language + 1) codes, e.g. 0x02 for DANISH -- confirm.
+// Three packed language probs, subscripted by (anchor) language
+static const uint32 kLanguageHintProbs[] = {
+  0x00000000,  // ENGLISH
+  0x00000242,  // DANISH   DANISH.12
+  0x00000342,  // DUTCH   DUTCH.12
+  0x00000442,  // FINNISH   FINNISH.12
+  0x00000542,  // FRENCH   FRENCH.12
+  0x00000642,  // GERMAN   GERMAN.12
+  0x00000742,  // HEBREW   HEBREW.12
+  0x00000842,  // ITALIAN   ITALIAN.12
+  0x00000942,  // Japanese   Japanese.12
+  0x00000a42,  // Korean   Korean.12
+  0x51000b43,  // NORWEGIAN   NORWEGIAN.12 NORWEGIAN_N.2
+  0x00000c42,  // POLISH   POLISH.12
+  0x00000d42,  // PORTUGUESE   PORTUGUESE.12
+  0x00000000,  // RUSSIAN
+  0x00000f42,  // SPANISH   SPANISH.12
+  0x00001042,  // SWEDISH   SWEDISH.12
+  0x00001142,  // Chinese   Chinese.12
+  0x00001242,  // CZECH   CZECH.12
+  0x00000000,  // GREEK
+  0x47001442,  // ICELANDIC   ICELANDIC.12 FAROESE.1
+  0x00001542,  // LATVIAN   LATVIAN.12
+  0x00001642,  // LITHUANIAN   LITHUANIAN.12
+  0x00001742,  // ROMANIAN   ROMANIAN.12
+  0x00001842,  // HUNGARIAN   HUNGARIAN.12
+  0x00001942,  // ESTONIAN   ESTONIAN.12
+  0x00000000,  // TG_UNKNOWN_LANGUAGE
+  0x00000000,  // Unknown
+  0x00001c42,  // BULGARIAN   BULGARIAN.12
+  0x00001d42,  // CROATIAN   CROATIAN.12
+  0x1e001d46,  // SERBIAN   CROATIAN.12 SERBIAN.5
+  0x00000000,  // IRISH
+  0x0f00203d,  // GALICIAN   GALICIAN.11 SPANISH.7
+  0x5e00213a,  // TAGALOG   TAGALOG.11 SOMALI.4
+  0x00002242,  // TURKISH   TURKISH.12
+  0x00002342,  // UKRAINIAN   UKRAINIAN.12
+  0x00000000,  // HINDI
+  0x1c1e25d4,  // MACEDONIAN   MACEDONIAN.11 SERBIAN.4 BULGARIAN.2
+  0x00002642,  // BENGALI   BENGALI.12
+  0x00002742,  // INDONESIAN   INDONESIAN.12
+  0x00000000,  // LATIN
+  0x2700293c,  // MALAY   MALAY.11 INDONESIAN.6
+  0x00000000,  // MALAYALAM
+  0x00000000,  // WELSH
+  0x00000000,  // NEPALI
+  0x00000000,  // TELUGU
+  0x00002e42,  // ALBANIAN   ALBANIAN.12
+  0x00000000,  // TAMIL
+  0x00003042,  // BELARUSIAN   BELARUSIAN.12
+  0x00000000,  // JAVANESE
+  0x00000000,  // OCCITAN
+  0x375f3330,  // URDU   URDU.10 UIGHUR.7 ARABIC.4
+  0x41003436,  // BIHARI   BIHARI.10 MARATHI.10
+  0x00000000,  // GUJARATI
+  0x0a4636b2,  // THAI   THAI.7 ChineseT.3 Korean.2
+  0x00003742,  // ARABIC   ARABIC.12
+  0x00003842,  // CATALAN   CATALAN.12
+  0x00003942,  // ESPERANTO   ESPERANTO.12
+  0x00003a42,  // BASQUE   BASQUE.12
+  0x00000000,  // INTERLINGUA
+  0x00000000,  // KANNADA
+  0x05060cca,  // PUNJABI   POLISH.10 GERMAN.4 FRENCH.2
+  0x00000000,  // SCOTS_GAELIC
+  0x00003f42,  // SWAHILI   SWAHILI.12
+  0x00004042,  // SLOVENIAN   SLOVENIAN.12
+  0x00004142,  // MARATHI   MARATHI.12
+  0x00004242,  // MALTESE   MALTESE.12
+  0x00004342,  // VIETNAMESE   VIETNAMESE.12
+  0x00000000,  // FRISIAN
+  0x12004543,  // SLOVAK   SLOVAK.12 CZECH.2
+  0x00004642,  // ChineseT   ChineseT.12
+  0x00000000,  // FAROESE
+  0x00000000,  // SUNDANESE
+  0x79004944,  // UZBEK   UZBEK.12 TAJIK.3
+  0x4d004a46,  // AMHARIC   AMHARIC.12 TIGRINYA.5
+  0x00004b42,  // AZERBAIJANI   AZERBAIJANI.12
+  0x00000000,  // GEORGIAN
+  0x00000000,  // TIGRINYA
+  0x00004e42,  // PERSIAN   PERSIAN.12
+  0x00000000,  // BOSNIAN
+  0x00000000,  // SINHALESE
+  0x00000000,  // NORWEGIAN_N
+  0x00000000,  // PORTUGUESE_P
+  0x00000000,  // PORTUGUESE_B
+  0x00000000,  // XHOSA
+  0x00000000,  // ZULU
+  0x00000000,  // GUARANI
+  0x00000000,  // SESOTHO
+  0x00000000,  // TURKMEN
+  0x7a005933,  // KYRGYZ   KYRGYZ.10 TATAR.7
+  0x00000000,  // BRETON
+  0x00000000,  // TWI
+  0x00000000,  // YIDDISH
+  0x00000000,  // SERBO_CROATIAN
+  0x00000000,  // SOMALI
+  0x00005f42,  // UIGHUR   UIGHUR.12
+  0x00006042,  // KURDISH   KURDISH.12
+  0x00006142,  // MONGOLIAN   MONGOLIAN.12
+  0x051130c9,  // ARMENIAN   BELARUSIAN.10 Chinese.3 FRENCH.1
+  0x020f0521,  // LAOTHIAN   FRENCH.8 SPANISH.7 DANISH.6
+  0x64004e35,  // SINDHI   PERSIAN.10 SINDHI.9
+  0x00000000,  // RHAETO_ROMANCE
+  0x00006642,  // AFRIKAANS   AFRIKAANS.12
+  0x00000000,  // LUXEMBOURGISH
+  0x00006842,  // BURMESE   BURMESE.12
+  0x00002242,  // KHMER   TURKISH.12
+  0x88006a3c,  // TIBETAN   TIBETAN.11 DZONGKHA.6
+  0x00000000,  // DHIVEHI
+  0x00000000,  // CHEROKEE
+  0x00000000,  // SYRIAC
+  0x00000000,  // LIMBU
+  0x00000000,  // ORIYA
+  0x00000000,  // ASSAMESE
+  0x00000000,  // CORSICAN
+  0x00000000,  // INTERLINGUE
+  0x00007342,  // KAZAKH   KAZAKH.12
+  0x00000000,  // LINGALA
+  0x00000000,  // MOLDAVIAN
+  0x5f007645,  // PASHTO   PASHTO.12 UIGHUR.4
+  0x00000000,  // QUECHUA
+  0x00000000,  // SHONA
+  0x00007942,  // TAJIK   TAJIK.12
+  0x00000000,  // TATAR
+  0x00000000,  // TONGA
+  0x00000000,  // YORUBA
+  0x00000000,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
+  0x00000000,  // CREOLES_AND_PIDGINS_FRENCH_BASED
+  0x00000000,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
+  0x00000000,  // CREOLES_AND_PIDGINS_OTHER
+  0x00000000,  // MAORI
+  0x00000000,  // WOLOF
+  0x00000000,  // ABKHAZIAN
+  0x00000000,  // AFAR
+  0x00000000,  // AYMARA
+  0x00000000,  // BASHKIR
+  0x00000000,  // BISLAMA
+  0x00000000,  // DZONGKHA
+  0x00000000,  // FIJIAN
+  0x00000000,  // GREENLANDIC
+  0x00000000,  // HAUSA
+  0x00000000,  // HAITIAN_CREOLE
+  0x00000000,  // INUPIAK
+  0x00000542,  // INUKTITUT   FRENCH.12
+  0x00000000,  // KASHMIRI
+  0x00000000,  // KINYARWANDA
+  0x00000000,  // MALAGASY
+  0x00000000,  // NAURU
+  0x00000000,  // OROMO
+  0x00000000,  // RUNDI
+  0x00000000,  // SAMOAN
+  0x00000000,  // SANGO
+  0x344197d3,  // SANSKRIT   SANSKRIT.11 MARATHI.4 BIHARI.1
+  0x00000000,  // SISWANT
+  0x00000000,  // TSONGA
+  0x00000000,  // TSWANA
+  0x00000000,  // VOLAPUK
+  0x00000000,  // ZHUANG
+  0x00000000,  // KHASI
+  0x00000000,  // SCOTS
+  0x00000000,  // GANDA
+  0x00000000,  // MANX
+  0x00000000,  // MONTENEGRIN
+  // Add new language hints just before here (just use 0x00000000)
+};
+COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES,
+               kLanguageHintProbs_has_incorrect_size);
+// One hint-table row: a fixed 4-byte key plus a packed language-prob word.
+// Generated by dsites 2008.07.07 from 10% of Base
+// Keys carry no NUL terminator; the TLD table pads short keys with '_' (0x5f).
+typedef struct {
+  char key[4];    // exactly four bytes, e.g. "ac__"
+  uint32 probs;   // three packed language probs
+} HintEntry;
+// Massaged TLD ('_'-padded to 4 bytes, rows in ascending key order), followed by three packed language probs
+// Hand-removed 4 items dsites 2008.07.15 (the commented-out rows below; 201 active rows remain)
+static const int kTLDHintProbsSize = 201;
+static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = {   // MaxRange 12
+  {{0x61,0x63,0x5f,0x5f}, 0x0a000945},   // ac__ Japanese.12 Korean.4
+  {{0x61,0x64,0x5f,0x5f}, 0x00003842},   // ad__ CATALAN.12
+  {{0x61,0x65,0x5f,0x5f}, 0x00003742},   // ae__ ARABIC.12
+  {{0x61,0x66,0x5f,0x5f}, 0x4e00763d},   // af__ PASHTO.11 PERSIAN.7
+  {{0x61,0x67,0x5f,0x5f}, 0x09000643},   // ag__ GERMAN.12 Japanese.2
+  {{0x61,0x69,0x5f,0x5f}, 0x0c180938},   // ai__ Japanese.11 HUNGARIAN.7 POLISH.2
+  {{0x61,0x6c,0x5f,0x5f}, 0x00002e42},   // al__ ALBANIAN.12
+  {{0x61,0x6e,0x5f,0x5f}, 0x6e00033d},   // an__ DUTCH.11 LIMBU.7
+  {{0x61,0x6f,0x5f,0x5f}, 0x05000d42},   // ao__ PORTUGUESE.12 FRENCH.1
+  {{0x61,0x71,0x5f,0x5f}, 0x05000f29},   // aq__ SPANISH.9 FRENCH.6
+  {{0x61,0x72,0x5f,0x5f}, 0x00000f42},   // ar__ SPANISH.12
+  {{0x61,0x73,0x5f,0x5f}, 0x0f120bcd},   // as__ NORWEGIAN.10 CZECH.6 SPANISH.5
+  {{0x61,0x74,0x5f,0x5f}, 0x00000642},   // at__ GERMAN.12
+  {{0x61,0x77,0x5f,0x5f}, 0x0f000345},   // aw__ DUTCH.12 SPANISH.4
+  {{0x61,0x78,0x5f,0x5f}, 0x00001042},   // ax__ SWEDISH.12
+  {{0x61,0x7a,0x5f,0x5f}, 0x00004b42},   // az__ AZERBAIJANI.12
+  {{0x62,0x61,0x5f,0x5f}, 0x00001d42},   // ba__ CROATIAN.12
+  {{0x62,0x62,0x5f,0x5f}, 0x00002842},   // bb__ LATIN.12
+  {{0x62,0x64,0x5f,0x5f}, 0x00002642},   // bd__ BENGALI.12
+  {{0x62,0x65,0x5f,0x5f}, 0x05000335},   // be__ DUTCH.10 FRENCH.9
+  {{0x62,0x66,0x5f,0x5f}, 0x00000542},   // bf__ FRENCH.12
+  {{0x62,0x67,0x5f,0x5f}, 0x00001c42},   // bg__ BULGARIAN.12
+  {{0x62,0x68,0x5f,0x5f}, 0x00003742},   // bh__ ARABIC.12
+  {{0x62,0x69,0x5f,0x5f}, 0x0f00053f},   // bi__ FRENCH.11 SPANISH.9
+  {{0x62,0x6a,0x5f,0x5f}, 0x00000542},   // bj__ FRENCH.12
+  {{0x62,0x6d,0x5f,0x5f}, 0x98043929},   // bm__ ESPERANTO.9 FINNISH.8 SISWANT.6
+  {{0x62,0x6e,0x5f,0x5f}, 0x00002942},   // bn__ MALAY.12
+  {{0x62,0x6f,0x5f,0x5f}, 0x00000f42},   // bo__ SPANISH.12
+  {{0x62,0x72,0x5f,0x5f}, 0x00000d42},   // br__ PORTUGUESE.12
+  {{0x62,0x74,0x5f,0x5f}, 0x00008842},   // bt__ DZONGKHA.12
+  {{0x62,0x77,0x5f,0x5f}, 0x06059ac4},   // bw__ TSWANA.9 FRENCH.6 GERMAN.5
+  {{0x62,0x79,0x5f,0x5f}, 0x00003024},   // by__ BELARUSIAN.9
+  {{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924},   // bz__ Japanese.9 Korean.5 SPANISH.1
+  {{0x63,0x61,0x5f,0x5f}, 0x00000542},   // ca__ FRENCH.12
+  {{0x63,0x61,0x74,0x5f}, 0x00003842},   // cat_ CATALAN.12
+  {{0x63,0x64,0x5f,0x5f}, 0x06051224},   // cd__ CZECH.9 FRENCH.5 GERMAN.1
+  {{0x63,0x66,0x5f,0x5f}, 0x00000542},   // cf__ FRENCH.12
+  {{0x63,0x67,0x5f,0x5f}, 0x00000542},   // cg__ FRENCH.12
+  {{0x63,0x68,0x5f,0x5f}, 0x08050638},   // ch__ GERMAN.11 FRENCH.7 ITALIAN.2
+  {{0x63,0x69,0x5f,0x5f}, 0x00000542},   // ci__ FRENCH.12
+  {{0x63,0x6c,0x5f,0x5f}, 0x00000f42},   // cl__ SPANISH.12
+  {{0x63,0x6d,0x5f,0x5f}, 0x00000542},   // cm__ FRENCH.12
+  {{0x63,0x6e,0x5f,0x5f}, 0x00001142},   // cn__ Chinese.12
+  {{0x63,0x6f,0x5f,0x5f}, 0x00000f42},   // co__ SPANISH.12
+// {{0x63,0x6f,0x6f,0x70}, 0x0f0509cd},   // coop Japanese.10 FRENCH.6 SPANISH.5
+  {{0x63,0x72,0x5f,0x5f}, 0x00000f42},   // cr__ SPANISH.12
+  {{0x63,0x75,0x5f,0x5f}, 0x00000f42},   // cu__ SPANISH.12
+  {{0x63,0x76,0x5f,0x5f}, 0x00000d42},   // cv__ PORTUGUESE.12
+  {{0x63,0x78,0x5f,0x5f}, 0x223a091f},   // cx__ Japanese.8 BASQUE.6 TURKISH.4
+  {{0x63,0x79,0x5f,0x5f}, 0x150622ba},   // cy__ TURKISH.8 GERMAN.4 LATVIAN.3
+  {{0x63,0x7a,0x5f,0x5f}, 0x00001242},   // cz__ CZECH.12
+  {{0x64,0x65,0x5f,0x5f}, 0x00000642},   // de__ GERMAN.12
+  {{0x64,0x6b,0x5f,0x5f}, 0x00000242},   // dk__ DANISH.12
+  {{0x64,0x6f,0x5f,0x5f}, 0x21000f42},   // do__ SPANISH.12 TAGALOG.1
+  {{0x64,0x7a,0x5f,0x5f}, 0x37000535},   // dz__ FRENCH.10 ARABIC.9
+  {{0x65,0x63,0x5f,0x5f}, 0x00000f42},   // ec__ SPANISH.12
+// {{0x65,0x64,0x75,0x5f}, 0x2e0f3873},   // edu_ CATALAN.9 SPANISH.7 ALBANIAN.2
+  {{0x65,0x65,0x5f,0x5f}, 0x00001942},   // ee__ ESTONIAN.12
+  {{0x65,0x67,0x5f,0x5f}, 0x05003742},   // eg__ ARABIC.12 FRENCH.1
+  {{0x65,0x72,0x5f,0x5f}, 0x00000b42},   // er__ NORWEGIAN.12
+  {{0x65,0x73,0x5f,0x5f}, 0x38200fd4},   // es__ SPANISH.11 GALICIAN.4 CATALAN.2
+  {{0x65,0x74,0x5f,0x5f}, 0x39004a39},   // et__ AMHARIC.11 ESPERANTO.3
+  {{0x66,0x69,0x5f,0x5f}, 0x10000444},   // fi__ FINNISH.12 SWEDISH.3
+  {{0x66,0x6a,0x5f,0x5f}, 0x050489e0},   // fj__ FIJIAN.12 FINNISH.5 FRENCH.3
+  {{0x66,0x6f,0x5f,0x5f}, 0x00004742},   // fo__ FAROESE.12
+  {{0x66,0x72,0x5f,0x5f}, 0x00000542},   // fr__ FRENCH.12
+  {{0x67,0x61,0x5f,0x5f}, 0x00000542},   // ga__ FRENCH.12
+  {{0x67,0x64,0x5f,0x5f}, 0x061d05d5},   // gd__ FRENCH.11 CROATIAN.5 GERMAN.3
+  {{0x67,0x65,0x5f,0x5f}, 0x00004c2d},   // ge__ GEORGIAN.10
+  {{0x67,0x66,0x5f,0x5f}, 0x00000542},   // gf__ FRENCH.12
+  {{0x67,0x67,0x5f,0x5f}, 0x06002244},   // gg__ TURKISH.12 GERMAN.3
+  {{0x67,0x68,0x5f,0x5f}, 0x05000436},   // gh__ FINNISH.10 FRENCH.10
+  {{0x67,0x69,0x5f,0x5f}, 0x0f0538ce},   // gi__ CATALAN.10 FRENCH.7 SPANISH.6
+  {{0x67,0x6c,0x5f,0x5f}, 0x398a0238},   // gl__ DANISH.11 GREENLANDIC.7 ESPERANTO.2
+  {{0x67,0x6d,0x5f,0x5f}, 0x0600043e},   // gm__ FINNISH.11 GERMAN.8
+  {{0x67,0x6e,0x5f,0x5f}, 0x00000542},   // gn__ FRENCH.12
+// {{0x67,0x6f,0x76,0x5f}, 0x05000f25},   // gov_ SPANISH.9 FRENCH.2
+  {{0x67,0x70,0x5f,0x5f}, 0x00000542},   // gp__ FRENCH.12
+  {{0x67,0x71,0x5f,0x5f}, 0x0f000547},   // gq__ FRENCH.12 SPANISH.6
+  {{0x67,0x73,0x5f,0x5f}, 0x00000942},   // gs__ Japanese.12
+  {{0x67,0x74,0x5f,0x5f}, 0x00000f42},   // gt__ SPANISH.12
+  {{0x68,0x6b,0x5f,0x5f}, 0x11004643},   // hk__ ChineseT.12 Chinese.2
+  {{0x68,0x6d,0x5f,0x5f}, 0x4606092e},   // hm__ Japanese.10 GERMAN.6 ChineseT.2
+  {{0x68,0x6e,0x5f,0x5f}, 0x00000f42},   // hn__ SPANISH.12
+  {{0x68,0x72,0x5f,0x5f}, 0x00001d42},   // hr__ CROATIAN.12
+  {{0x68,0x74,0x5f,0x5f}, 0x0f000542},   // ht__ FRENCH.12 SPANISH.1
+  {{0x68,0x75,0x5f,0x5f}, 0x00001842},   // hu__ HUNGARIAN.12
+  {{0x69,0x64,0x5f,0x5f}, 0x00002742},   // id__ INDONESIAN.12
+  {{0x69,0x65,0x5f,0x5f}, 0x050c1f24},   // ie__ IRISH.9 POLISH.5 FRENCH.1
+  {{0x69,0x6c,0x5f,0x5f}, 0x00000742},   // il__ HEBREW.12
+  {{0x69,0x6e,0x74,0x5f}, 0x0f060574},   // int_ FRENCH.9 GERMAN.8 SPANISH.3
+  {{0x69,0x6f,0x5f,0x5f}, 0x11090fd5},   // io__ SPANISH.11 Japanese.5 Chinese.3
+  {{0x69,0x71,0x5f,0x5f}, 0x60003744},   // iq__ ARABIC.12 KURDISH.3
+  {{0x69,0x72,0x5f,0x5f}, 0x00004e42},   // ir__ PERSIAN.12
+  {{0x69,0x73,0x5f,0x5f}, 0x00001442},   // is__ ICELANDIC.12
+  {{0x69,0x74,0x5f,0x5f}, 0x00000842},   // it__ ITALIAN.12
+  {{0x6a,0x65,0x5f,0x5f}, 0x29050328},   // je__ DUTCH.9 FRENCH.7 MALAY.5
+  {{0x6a,0x6d,0x5f,0x5f}, 0x040f0576},   // jm__ FRENCH.9 SPANISH.8 FINNISH.5
+  {{0x6a,0x6f,0x5f,0x5f}, 0x00003742},   // jo__ ARABIC.12
+// {{0x6a,0x6f,0x62,0x73}, 0x0f060329},   // jobs DUTCH.9 GERMAN.8 SPANISH.6
+  {{0x6a,0x70,0x5f,0x5f}, 0x00000942},   // jp__ Japanese.12
+  {{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3},   // ke__ SWAHILI.9 SPANISH.5 FINNISH.4
+  {{0x6b,0x69,0x5f,0x5f}, 0x04000643},   // ki__ GERMAN.12 FINNISH.2
+  {{0x6b,0x6d,0x5f,0x5f}, 0x00000542},   // km__ FRENCH.12
+  {{0x6b,0x70,0x5f,0x5f}, 0x00000a42},   // kp__ Korean.12
+  {{0x6b,0x72,0x5f,0x5f}, 0x00000a42},   // kr__ Korean.12
+  {{0x6b,0x77,0x5f,0x5f}, 0x00003742},   // kw__ ARABIC.12
+  {{0x6b,0x79,0x5f,0x5f}, 0x0500083f},   // ky__ ITALIAN.11 FRENCH.9
+  {{0x6b,0x7a,0x5f,0x5f}, 0x0000732d},   // kz__ KAZAKH.10
+  {{0x6c,0x62,0x5f,0x5f}, 0x05003747},   // lb__ ARABIC.12 FRENCH.6
+  {{0x6c,0x63,0x5f,0x5f}, 0x09000645},   // lc__ GERMAN.12 Japanese.4
+  {{0x6c,0x69,0x5f,0x5f}, 0x1600063d},   // li__ GERMAN.11 LITHUANIAN.7
+  {{0x6c,0x73,0x5f,0x5f}, 0x00005742},   // ls__ SESOTHO.12
+  {{0x6c,0x74,0x5f,0x5f}, 0x00001642},   // lt__ LITHUANIAN.12
+  {{0x6c,0x75,0x5f,0x5f}, 0x0600053d},   // lu__ FRENCH.11 GERMAN.7
+  {{0x6c,0x76,0x5f,0x5f}, 0x00001542},   // lv__ LATVIAN.12
+  {{0x6c,0x79,0x5f,0x5f}, 0x05003744},   // ly__ ARABIC.12 FRENCH.3
+  {{0x6d,0x61,0x5f,0x5f}, 0x3700053d},   // ma__ FRENCH.11 ARABIC.7
+  {{0x6d,0x63,0x5f,0x5f}, 0x00000542},   // mc__ FRENCH.12
+  {{0x6d,0x64,0x5f,0x5f}, 0x00001724},   // md__ ROMANIAN.9
+  {{0x6d,0x65,0x5f,0x5f}, 0x00001d42},   // me__ CROATIAN.12
+  {{0x6d,0x67,0x5f,0x5f}, 0x00000542},   // mg__ FRENCH.12
+  {{0x6d,0x6b,0x5f,0x5f}, 0x1c002543},   // mk__ MACEDONIAN.12 BULGARIAN.2
+  {{0x6d,0x6c,0x5f,0x5f}, 0x00000542},   // ml__ FRENCH.12
+  {{0x6d,0x6e,0x5f,0x5f}, 0x00006142},   // mn__ MONGOLIAN.12
+  {{0x6d,0x6f,0x5f,0x5f}, 0x110d4631},   // mo__ ChineseT.10 PORTUGUESE.8 Chinese.5
+  {{0x6d,0x71,0x5f,0x5f}, 0x00000542},   // mq__ FRENCH.12
+  {{0x6d,0x72,0x5f,0x5f}, 0x37000535},   // mr__ FRENCH.10 ARABIC.9
+  {{0x6d,0x73,0x5f,0x5f}, 0x090f06d5},   // ms__ GERMAN.11 SPANISH.5 Japanese.3
+  {{0x6d,0x74,0x5f,0x5f}, 0x00004242},   // mt__ MALTESE.12
+  {{0x6d,0x75,0x5f,0x5f}, 0x05000934},   // mu__ Japanese.10 FRENCH.8
+  {{0x6d,0x76,0x5f,0x5f}, 0x28000436},   // mv__ FINNISH.10 LATIN.10
+  {{0x6d,0x77,0x5f,0x5f}, 0x0611092a},   // mw__ Japanese.9 Chinese.8 GERMAN.7
+  {{0x6d,0x78,0x5f,0x5f}, 0x00000f42},   // mx__ SPANISH.12
+  {{0x6d,0x79,0x5f,0x5f}, 0x00002942},   // my__ MALAY.12
+  {{0x6d,0x7a,0x5f,0x5f}, 0x00000d42},   // mz__ PORTUGUESE.12
+  {{0x6e,0x61,0x5f,0x5f}, 0x06006644},   // na__ AFRIKAANS.12 GERMAN.3
+  {{0x6e,0x63,0x5f,0x5f}, 0x00000542},   // nc__ FRENCH.12
+  {{0x6e,0x65,0x5f,0x5f}, 0x8b000542},   // ne__ FRENCH.12 HAUSA.1
+  {{0x6e,0x66,0x5f,0x5f}, 0x00000542},   // nf__ FRENCH.12
+  {{0x6e,0x69,0x5f,0x5f}, 0x00000f42},   // ni__ SPANISH.12
+  {{0x6e,0x6c,0x5f,0x5f}, 0x00000342},   // nl__ DUTCH.12
+  {{0x6e,0x6f,0x5f,0x5f}, 0x51000b43},   // no__ NORWEGIAN.12 NORWEGIAN_N.2
+  {{0x6e,0x75,0x5f,0x5f}, 0x0300103b},   // nu__ SWEDISH.11 DUTCH.5
+  {{0x6f,0x6d,0x5f,0x5f}, 0x00003742},   // om__ ARABIC.12
+  {{0x70,0x61,0x5f,0x5f}, 0x00000f42},   // pa__ SPANISH.12
+  {{0x70,0x65,0x5f,0x5f}, 0x00000f42},   // pe__ SPANISH.12
+  {{0x70,0x66,0x5f,0x5f}, 0x00000542},   // pf__ FRENCH.12
+  {{0x70,0x67,0x5f,0x5f}, 0x00000f24},   // pg__ SPANISH.9
+  {{0x70,0x68,0x5f,0x5f}, 0x00002142},   // ph__ TAGALOG.12
+  {{0x70,0x6b,0x5f,0x5f}, 0x00003342},   // pk__ URDU.12
+  {{0x70,0x6c,0x5f,0x5f}, 0x30000c42},   // pl__ POLISH.12 BELARUSIAN.1
+  {{0x70,0x6e,0x5f,0x5f}, 0x04000644},   // pn__ GERMAN.12 FINNISH.3
+  {{0x70,0x72,0x5f,0x5f}, 0x00000f42},   // pr__ SPANISH.12
+  {{0x70,0x72,0x6f,0x5f}, 0x46050fd5},   // pro_ SPANISH.11 FRENCH.5 ChineseT.3
+  {{0x70,0x73,0x5f,0x5f}, 0x00003742},   // ps__ ARABIC.12
+  {{0x70,0x74,0x5f,0x5f}, 0x00000d42},   // pt__ PORTUGUESE.12
+  {{0x70,0x79,0x5f,0x5f}, 0x00000f42},   // py__ SPANISH.12
+  {{0x71,0x61,0x5f,0x5f}, 0x00003742},   // qa__ ARABIC.12
+  {{0x72,0x65,0x5f,0x5f}, 0x00000542},   // re__ FRENCH.12
+  {{0x72,0x6f,0x5f,0x5f}, 0x00001742},   // ro__ ROMANIAN.12
+  {{0x72,0x73,0x5f,0x5f}, 0x00001d42},   // rs__ CROATIAN.12
+  {{0x72,0x77,0x5f,0x5f}, 0x9000053e},   // rw__ FRENCH.11 KINYARWANDA.8
+  {{0x73,0x61,0x5f,0x5f}, 0x00003742},   // sa__ ARABIC.12
+  {{0x73,0x62,0x5f,0x5f}, 0x00000442},   // sb__ FINNISH.12
+  {{0x73,0x63,0x5f,0x5f}, 0x060f092f},   // sc__ Japanese.10 SPANISH.7 GERMAN.3
+  {{0x73,0x64,0x5f,0x5f}, 0x00003742},   // sd__ ARABIC.12
+  {{0x73,0x65,0x5f,0x5f}, 0x00001042},   // se__ SWEDISH.12
+  {{0x73,0x69,0x5f,0x5f}, 0x00004042},   // si__ SLOVENIAN.12
+  {{0x73,0x6b,0x5f,0x5f}, 0x12004543},   // sk__ SLOVAK.12 CZECH.2
+  {{0x73,0x6d,0x5f,0x5f}, 0x00000842},   // sm__ ITALIAN.12
+  {{0x73,0x6e,0x5f,0x5f}, 0x00000542},   // sn__ FRENCH.12
+  {{0x73,0x72,0x5f,0x5f}, 0x03001e44},   // sr__ SERBIAN.12 DUTCH.3
+  {{0x73,0x76,0x5f,0x5f}, 0x00000f42},   // sv__ SPANISH.12
+  {{0x73,0x79,0x5f,0x5f}, 0x00003742},   // sy__ ARABIC.12
+  {{0x74,0x63,0x5f,0x5f}, 0x0a2206cd},   // tc__ GERMAN.10 TURKISH.6 Korean.5
+  {{0x74,0x66,0x5f,0x5f}, 0x00000642},   // tf__ GERMAN.12
+  {{0x74,0x67,0x5f,0x5f}, 0x00000542},   // tg__ FRENCH.12
+  {{0x74,0x68,0x5f,0x5f}, 0x9e0936c9},   // th__ THAI.10 Japanese.3 SCOTS.1
+  {{0x74,0x6a,0x5f,0x5f}, 0x00007924},   // tj__ TAJIK.9
+  {{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd},   // tl__ PORTUGUESE.10 SPANISH.6 GERMAN.5
+  {{0x74,0x6e,0x5f,0x5f}, 0x3700053e},   // tn__ FRENCH.11 ARABIC.8
+  {{0x74,0x6f,0x5f,0x5f}, 0x064609c5},   // to__ Japanese.9 ChineseT.7 GERMAN.6
+  {{0x74,0x70,0x5f,0x5f}, 0x06000944},   // tp__ Japanese.12 GERMAN.3
+  {{0x74,0x72,0x5f,0x5f}, 0x00002242},   // tr__ TURKISH.12
+  {{0x74,0x72,0x61,0x76}, 0x064509c3},   // trav Japanese.9 SLOVAK.5 GERMAN.4
+  {{0x74,0x74,0x5f,0x5f}, 0x0f00063e},   // tt__ GERMAN.11 SPANISH.8
+  {{0x74,0x77,0x5f,0x5f}, 0x00004642},   // tw__ ChineseT.12
+  {{0x74,0x7a,0x5f,0x5f}, 0x00003f42},   // tz__ SWAHILI.12
+  {{0x75,0x61,0x5f,0x5f}, 0x0000232d},   // ua__ UKRAINIAN.10
+  {{0x75,0x79,0x5f,0x5f}, 0x00000f42},   // uy__ SPANISH.12
+  {{0x75,0x7a,0x5f,0x5f}, 0x0000492d},   // uz__ UZBEK.10
+  {{0x76,0x61,0x5f,0x5f}, 0x060f0828},   // va__ ITALIAN.9 SPANISH.7 GERMAN.5
+  {{0x76,0x63,0x5f,0x5f}, 0x0d000939},   // vc__ Japanese.11 PORTUGUESE.3
+  {{0x76,0x65,0x5f,0x5f}, 0x00000f42},   // ve__ SPANISH.12
+  {{0x76,0x67,0x5f,0x5f}, 0x09000f43},   // vg__ SPANISH.12 Japanese.2
+  {{0x76,0x69,0x5f,0x5f}, 0x00002942},   // vi__ MALAY.12
+  {{0x76,0x6e,0x5f,0x5f}, 0x00004342},   // vn__ VIETNAMESE.12
+  {{0x76,0x75,0x5f,0x5f}, 0x00000642},   // vu__ GERMAN.12
+  {{0x77,0x73,0x5f,0x5f}, 0x4b0f0624},   // ws__ GERMAN.9 SPANISH.5 AZERBAIJANI.1
+  {{0x79,0x65,0x5f,0x5f}, 0x00003742},   // ye__ ARABIC.12
+  {{0x79,0x75,0x5f,0x5f}, 0x1e001d3d},   // yu__ CROATIAN.11 SERBIAN.7
+  {{0x7a,0x61,0x5f,0x5f}, 0x00006642},   // za__ AFRIKAANS.12
+  {{0x7a,0x6d,0x5f,0x5f}, 0x0b000435},   // zm__ FINNISH.10 NORWEGIAN.9
+  {{0x7a,0x77,0x5f,0x5f}, 0x3f00783e},   // zw__ SHONA.11 SWAHILI.8
+};
+// Statistically closest language, based on quadgram table
+// Those that are far from other languages map to UNKNOWN_LANGUAGE
+// Subscripted by Language
+//
+// From lang_correlation.txt and hand-edits
+// sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
+//   (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
+//   \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
+//
+static const int kMinCorrPercent = 24;        // Pick off how close you want
+                                              // 24 catches PERSIAN <== ARABIC
+                                              // but not SPANISH <== PORTUGUESE
+static Language Unknown = UNKNOWN_LANGUAGE;   // Stand-in used by table rows whose correlation is 0
+// Subscripted by Language
+static const Language kClosestAltLanguage[] = {
+  (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
+  (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
+  (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
+  (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
+  (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
+  (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
+  (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
+  (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
+  (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
+  ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
+  (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
+  (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
+  (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
+  (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
+  (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
+  (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
+  ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
+  ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
+  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
+  ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
+  (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
+  (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
+  (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
+  (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
+  ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
+  (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
+  (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
+  (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
+  (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
+  (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
+  (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
+  ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
+  (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
+  ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
+  ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
+  ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
+  (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
+  (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
+  (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
+  (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
+  (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
+  (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
+  (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
+  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
+  ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
+  ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
+  (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
+  ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
+  (28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
+  (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
+  ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
+  ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
+  (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
+  (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
+  // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ChineseT
+  (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
+  (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
+  (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
+  (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
+  ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
+  (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
+  ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
+  (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
+  (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
+  (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
+  (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
+  ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
+  (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
+  ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
+  ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
+  ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
+  ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
+  (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
+  (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
+  (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
+  ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
+  (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
+  ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
+  ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
+  (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
+  (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
+  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
+  ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
+  (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
+  ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
+  (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
+  (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
+  ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
+  ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
+  ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
+  (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
+  (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
+  ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
+  ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
+  (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
+  (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
+  (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
+  ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
+  ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
+  ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
+  ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
+  ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
+  ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
+  (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
+  ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
+  (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
+  ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
+  ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
+  ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
+  ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
+  ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
+  ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
+  (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
+  ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
+  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
+  (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
+  (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
+  (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
+  ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
+  (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
+  (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
+  ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
+  (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
+  ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
+  ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
+  (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
+  (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
+  ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
+  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN
+};
+COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
+               kClosestAltLanguage_has_incorrect_size);
+// Flag accessors. Each one returns true when the corresponding kCLDFlag* bit
+// (or any bit of that mask) is set in the caller's flags word.
+inline bool FlagFinish(int flags) {
+  return 0 != (flags & kCLDFlagFinish);
+}
+inline bool FlagSqueeze(int flags) {
+  return 0 != (flags & kCLDFlagSqueeze);
+}
+inline bool FlagRepeats(int flags) {
+  return 0 != (flags & kCLDFlagRepeats);
+}
+inline bool FlagTop40(int flags) {
+  return 0 != (flags & kCLDFlagTop40);
+}
+inline bool FlagShort(int flags) {
+  return 0 != (flags & kCLDFlagShort);
+}
+inline bool FlagHint(int flags) {
+  return 0 != (flags & kCLDFlagHint);
+}
+inline bool FlagUseWords(int flags) {
+  return 0 != (flags & kCLDFlagUseWords);
+}
+//------------------------------------------------------------------------------
+// For --cld_html debugging output. Not thread safe: this is mutable file-scope state
+//------------------------------------------------------------------------------
+static Language prior_lang = UNKNOWN_LANGUAGE;   // NOTE(review): presumably the language last printed by the debug output -- confirm at its use sites
+static bool prior_unreliable = false;            // NOTE(review): presumably whether that last result was unreliable -- confirm at its use sites
+//------------------------------------------------------------------------------
+// End For --cld_html debugging output
+//------------------------------------------------------------------------------
+// Look backward from src for a word boundary.
+// Returns the number of bytes n to back up so that src - n - 1 is a space,
+// i.e. src - n is the first byte following that space.
+// At most min(limit, kMaxSpaceScan) bytes before src are examined; if none of
+// them is a space the result is 0.
+int BackscanToSpace(const char* src, int limit) {
+  const int max_scan = cld::minint(limit, kMaxSpaceScan);
+  for (int back = 0; back < max_scan; ++back) {
+    if (' ' == src[-back - 1]) {
+      return back;                        // We are at _X
+    }
+  }
+  return 0;
+}
+// Look forward from src for a word boundary.
+// Returns the number of bytes n to advance so that src + n - 1 is a space,
+// i.e. src + n is the first byte following that space.
+// At most min(limit, kMaxSpaceScan) bytes starting at src are examined; if
+// none of them is a space the result is 0.
+int ForwardscanToSpace(const char* src, int limit) {
+  const int max_scan = cld::minint(limit, kMaxSpaceScan);
+  for (int ahead = 0; ahead < max_scan; ++ahead) {
+    if (' ' == src[ahead]) {
+      return ahead + 1;                   // We are at _X
+    }
+  }
+  return 0;
+}
+// This uses a cheap predictor to get a measure of compression, and
+// hence a measure of repetitiveness. It works on complete UTF-8 characters
+// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
+// all the time when done with a byte-based count. Sigh.
+//
+// To allow running prediction across multiple chunks, caller passes in current
+// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
+//
+// Arguments:
+//   isrc, srclen  text to scan and its length in bytes
+//   hash          in/out: running 12-bit hash, updated on return
+//   tbl           in/out: prediction table indexed by the 12-bit hash
+//
+// Returns the number of *bytes* correctly predicted, increments by 1..4 for
+// each correctly-predicted character, so the result is directly comparable
+// with byte-based thresholds derived from the chunk size.
+//
+// NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
+//
+int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
+  int p_count = 0;
+  const uint8* src = reinterpret_cast<const uint8*>(isrc);
+  const uint8* srclimit = src + srclen;
+  int local_hash = *hash;
+  while (src < srclimit) {
+    int c = src[0];
+    int incr = 1;
+    // Pick up one char and length
+    if (c < 0xc0) {
+      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
+      // Do nothing more
+    } else if ((c & 0xe0) == 0xc0) {
+      // Two-byte
+      c = (c << 8) | src[1];
+      incr = 2;
+    } else if ((c & 0xf0) == 0xe0) {
+      // Three-byte
+      c = (c << 16) | (src[1] << 8) | src[2];
+      incr = 3;
+    } else {
+      // Four-byte
+      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
+      incr = 4;
+    }
+    src += incr;
+    int p = tbl[local_hash];            // Prediction
+    tbl[local_hash] = c;                // Update prediction
+    if (c == p) {
+      // Count the bytes of the character, not just one per character;
+      // otherwise multi-byte text is under-counted against byte thresholds
+      p_count += incr;
+    }
+    local_hash = ((local_hash << 4) ^ c) & 0xfff;
+  }
+  *hash = local_hash;
+  return p_count;
+}
+// Counts ASCII spaces, examining four bytes per loop iteration.
+// Only the first (src_len & ~3) bytes are looked at, so up to three odd
+// bytes at the end are not counted.
+int CountSpaces4(const char* src, int src_len) {
+  const int whole_len = src_len & ~3;     // Round down to a multiple of four
+  int spaces = 0;
+  for (int base = 0; base < whole_len; base += 4) {
+    const char* quad = src + base;
+    spaces += (quad[0] == ' ') + (quad[1] == ' ') +
+              (quad[2] == ' ') + (quad[3] == ' ');
+  }
+  return spaces;
+}
+// Remove words of text that have more than half their bytes predicted
+// correctly by our cheap predictor, moving the remaining words in-place
+// to the front of the input buffer. A word ends at each ASCII space.
+//
+// To allow running prediction across multiple chunks, caller passes in current
+// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
+//
+// Return the new, possibly-shorter length; *hash and tbl are updated
+//
+// Result Buffer ALWAYS has leading space and trailing space space space NUL,
+// if input does
+//
+int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) {
+  const uint8* src = reinterpret_cast<const uint8*>(isrc);
+  const uint8* srclimit = src + srclen;
+  char* dst = isrc;               // Write cursor; shares the buffer with src
+  int local_hash = *hash;
+  char* word_dst = dst;           // Start of next word
+  int good_predict_bytes = 0;     // Correctly predicted bytes in current word
+  int word_length_bytes = 0;      // Total bytes in current word
+  while (src < srclimit) {
+    int c = src[0];
+    int incr = 1;
+    *dst++ = c;                   // Copy the lead byte (or the space itself)
+    if (c == ' ') {
+      if ((good_predict_bytes * 2) > word_length_bytes) {
+        // Word is well-predicted: backup to start of this word
+        dst = word_dst;
+        if (FLAGS_cld_showme) {
+          // Mark the deletion point with period
+          // Don't repeat multiple periods
+          // Cannot mark with more bytes or may overwrite unseen input
+          if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
+            *dst++ = '.';
+            *dst++ = ' ';
+          }
+        }
+      }
+      word_dst = dst;              // Start of next word
+      good_predict_bytes = 0;      // The space itself is counted below, as part of the next word
+      word_length_bytes = 0;
+    }
+    // Pick up one char and length, copying any trailing bytes as we go
+    if (c < 0xc0) {
+      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
+      // Do nothing more
+    } else if ((c & 0xe0) == 0xc0) {
+      // Two-byte
+      *dst++ = src[1];
+      c = (c << 8) | src[1];
+      incr = 2;
+    } else if ((c & 0xf0) == 0xe0) {
+      // Three-byte
+      *dst++ = src[1];
+      *dst++ = src[2];
+      c = (c << 16) | (src[1] << 8) | src[2];
+      incr = 3;
+    } else {
+      // Four-byte
+      *dst++ = src[1];
+      *dst++ = src[2];
+      *dst++ = src[3];
+      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
+      incr = 4;
+    }
+    src += incr;
+    word_length_bytes += incr;
+    int p = tbl[local_hash];            // Prediction
+    tbl[local_hash] = c;                // Update prediction
+    if (c == p) {
+      good_predict_bytes += incr;       // Count good predictions
+    }
+    local_hash = ((local_hash << 4) ^ c) & 0xfff;   // Keep the hash to 12 bits
+  }
+  *hash = local_hash;
+  if ((dst - isrc) < (srclen - 3)) {
+    // Pad and make last char clean UTF-8 by putting following spaces
+    dst[0] = ' ';
+    dst[1] = ' ';
+    dst[2] = ' ';
+    dst[3] = '\0';
+  } else   if ((dst - isrc) < srclen) {
+    // Make last char clean UTF-8 by putting following space off the end
+    dst[0] = ' ';
+  }
+  return static_cast<int>(dst - isrc);   // Padding bytes are not included in the length
+}
+// Remove portions of text that have a high density of spaces, or that are
+// overly repetitive, squeezing the remaining text in-place to the front of the
+// input buffer.
+//
+// Squeezing looks at density of space/predicted chars in fixed-size chunks,
+// specified by chunksize. A chunksize <= 0 uses the default size
+// (kChunksizeDefault).
+//
+// Arguments:
+//   isrc        buffer to squeeze; rewritten in place
+//   srclen      number of bytes of text in isrc
+//   ichunksize  chunk size in bytes, or <= 0 for the default
+//
+// Return the new, possibly-shorter length
+//
+// Result Buffer ALWAYS has leading space and trailing space space space NUL,
+// if input does
+//
+int CompactLangDetImpl::CheapSqueezeInplace(char* isrc,
+                                            int srclen,
+                                            int ichunksize) {
+  char* src = isrc;
+  char* dst = src;
+  char* srclimit = src + srclen;
+  bool skipping = false;
+  int hash = 0;
+  // Allocate local prediction table.
+  int* predict_tbl = new int[kPredictionTableSize];
+  memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
+  int chunksize = ichunksize;
+  // A non-positive chunk size would make "src += len" stand still or move
+  // backward below, so treat every such value as a request for the default
+  if (chunksize <= 0) {chunksize = kChunksizeDefault;}
+  int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
+  int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
+  while (src < srclimit) {
+    int remaining_bytes = srclimit - src;
+    int len = cld::minint(chunksize, remaining_bytes);
+    int space_n = CountSpaces4(src, len);
+    int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
+    if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
+      // Skip the text
+      if (!skipping) {
+        // Keeping-to-skipping transition; do it at a space
+        int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
+        dst -= n;
+        skipping = true;
+        if (FLAGS_cld_showme) {
+          // Mark the deletion point with black square U+25A0
+          *dst++ = 0xe2;
+          *dst++ = 0x96;
+          *dst++ = 0xa0;
+          *dst++ = ' ';
+        }
+        if (dst == isrc) {
+          // Force a leading space if the first chunk is deleted
+          *dst++ = ' ';
+        }
+      }
+    } else {
+      // Keep the text
+      if (skipping) {
+        // Skipping-to-keeping transition; do it at a space
+        int n = ForwardscanToSpace(src, len);
+        src += n;
+        remaining_bytes -= n;   // Shrink remaining length
+        len -= n;
+        skipping = false;
+      }
+      // "len" can be negative in some cases
+      if (len > 0) {
+        // Source and destination live in the same buffer and usually overlap
+        // (or coincide), which memcpy does not permit; memmove does
+        memmove(dst, src, len);
+        dst += len;
+      }
+    }
+    src += len;
+  }
+  if ((dst - isrc) < (srclen - 3)) {
+    // Pad and make last char clean UTF-8 by putting following spaces
+    dst[0] = ' ';
+    dst[1] = ' ';
+    dst[2] = ' ';
+    dst[3] = '\0';
+  } else   if ((dst - isrc) < srclen) {
+    // Make last char clean UTF-8 by putting following space off the end
+    dst[0] = ' ';
+  }
+  // Deallocate local prediction table
+  delete[] predict_tbl;
+  return static_cast<int>(dst - isrc);
+}
+// Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
+//  About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
+//  Just CountSpaces is about 340 MB/sec
+//  Byte-only CountPredictedBytes is about 150 MB/sec
+//  Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
+//  Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
+//  Unjammed byte-only both = 170 MB/sec
+//  Jammed byte-only both = 120 MB/sec
+//  Back to original w/slight updates, 110 MB/sec
+//
+// Decide from the first testsize bytes of src whether the text looks
+// space-heavy or repetitive. Returns true when the CountSpaces4 result reaches
+// kSpacesTriggerPercent of testsize, or the CountPredictedBytes result reaches
+// kPredictTriggerPercent of testsize. Text shorter than testsize never
+// triggers.
+bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) {
+  if (testsize > srclen) {
+    return false;                         // Don't trigger at all on short text
+  }
+  const int min_spaces = (testsize * kSpacesTriggerPercent) / 100;
+  const int min_predicted = (testsize * kPredictTriggerPercent) / 100;
+  // Private, zeroed predictor state for this one call
+  int running_hash = 0;
+  int* table = new int[kPredictionTableSize];
+  memset(table, 0, kPredictionTableSize * sizeof(table[0]));
+  // Spaces are checked first; the predictor only runs when that test fails
+  bool triggered = (CountSpaces4(src, testsize) >= min_spaces);
+  if (!triggered) {
+    const int predicted =
+        CountPredictedBytes(src, testsize, &running_hash, table);
+    triggered = (predicted >= min_predicted);
+  }
+  delete[] table;
+  return triggered;
+}
+// Close pairs (correlation) language_enum/language_enum    [pair number used in kClosePair]
+//  id/ms (0.47)    38/40    [1]
+//  bo/dz (0.46)    105/135  [2]
+//  cz/sk (0.43)    17/68    [3]
+//  no/nn (0.42)    10/80    [4]
+//  hi/mr (0.38)    35/64    [5]
+//  xh/zu (0.37)    83/84    [6]
+// Subscripted by packed language, gives 0 (not in any close pair) or a subscript in closepair
+// scoring array inside doc_tote; both members of a pair carry the same nonzero number
+static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = {
+  0,   // Subscript 0
+  0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  // Add new language close-pair number just before here (just use 0)
+};
+// Strip every extended language (unpacked value >= NUM_LANGUAGES) out of
+// doc_tote by zeroing that slot's key, value and reliability. With
+// FLAGS_dbgscore set, each removed language code is logged to stderr.
+void RemoveExtendedLanguages(ToteWithReliability* doc_tote) {
+  for (int slot = 0; slot < doc_tote->MaxSize(); ++slot) {
+    if (cld::UnpackLanguage(doc_tote->Key(slot)) < NUM_LANGUAGES) {
+      continue;                           // Not an extended language; keep it
+    }
+    if (FLAGS_dbgscore) {
+      fprintf(stderr, "{-%s} ",
+              ExtLanguageCode(cld::UnpackLanguage(doc_tote->Key(slot))));
+    }
+    // Effectively remove the entry by clearing all three fields
+    doc_tote->SetKey(slot, 0);
+    doc_tote->SetValue(slot, 0);
+    doc_tote->SetReliability(slot, 0);
+  }
+}
+static const int kMinReliableKeepPercent = 41;  // Remove lang if reli < this
+// For Tier3 languages, require a minimum number of bytes to be first-place lang
+static const int kGoodFirstT3MinBytes = 24;         // <this => no first
+// Move bytes for unreliable langs to another lang or UNKNOWN
+// doc_tote is sorted, so cannot Add; entries are only rewritten or zeroed in place
+//
+// If both CHINESE and CHINESET are present and unreliable, do not delete both;
+// merge both into CHINESE.
+//
+// dsites 2009.03.19
+// we also want to remove Tier3 languages as the first lang if there is very
+// little text like ej1 ej2 ej3 ej4
+// maybe fold this back in earlier
+// The deletion pass only runs when do_remove_weak_matches is true
+void RemoveUnreliableLanguages(ToteWithReliability* doc_tote, bool do_remove_weak_matches) {
+  // Prepass to merge some low-reliability languages
+  int total_bytes = 0;
+  for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
+    int plang = doc_tote->Key(sub);
+    if (plang == 0) {continue;}                     // Empty slot
+    Language lang = cld::UnpackLanguage(plang);
+    int bytes = doc_tote->Value(sub);
+    int reli = doc_tote->Reliability(sub);
+    if (bytes == 0) {continue;}                     // Zero bytes
+    total_bytes += bytes;
+    // Reliable percent is stored reliable score over stored bytecount
+    int reliable_percent = reli / bytes;
+    if (reliable_percent >= kMinReliableKeepPercent) {continue;}   // Keeper
+    // This language is too unreliable to keep, but we might merge it.
+    Language altlang = UNKNOWN_LANGUAGE;
+    if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];}   // Table only covers non-extended languages
+    if (altlang == UNKNOWN_LANGUAGE) {continue;}    // No alternative
+    // Look for alternative in doc_tote
+    int altsub = doc_tote->Find(cld::PackLanguage(altlang));
+    if (altsub < 0) {continue;}                     // No alternative text
+    int bytes2 = doc_tote->Value(altsub);
+    int reli2 = doc_tote->Reliability(altsub);
+    if (bytes2 == 0) {continue;}                    // Zero bytes
+    // Reliable percent is stored reliable score over stored bytecount
+    int reliable_percent2 = reli2 / bytes2;
+    // Merge one language into the other. Break ties toward lower lang #
+    int tosub = altsub;                             // Default: fold lang into altlang
+    int fromsub = sub;
+    bool into_lang = false;
+    if ((reliable_percent2 < reliable_percent) ||
+        ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
+      tosub = sub;                                  // Fold altlang into lang instead
+      fromsub = altsub;
+      into_lang = true;
+    }
+    // Make sure reliability doesn't drop and is enough to avoid delete
+    int newpercent = cld::maxint(reliable_percent, reliable_percent2);
+    newpercent = cld::maxint(newpercent, kMinReliableKeepPercent);
+    int newbytes = bytes + bytes2;
+    int newreli = newpercent * newbytes;            // Stored as percent times bytes
+    doc_tote->SetKey(fromsub, 0);
+    doc_tote->SetValue(fromsub, 0);
+    doc_tote->SetReliability(fromsub, 0);
+    doc_tote->SetValue(tosub, newbytes);
+    doc_tote->SetReliability(tosub, newreli);
+    // Show fate of unreliable languages if at least 10 bytes
+    if (FLAGS_cld_html /*&& (newpercent >= 10)*/ && (newbytes >= 10)) {
+      if (into_lang) {
+        fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
+                ExtLanguageCode(altlang), reliable_percent2, bytes2,
+                ExtLanguageCode(lang));
+      } else {
+        fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
+                ExtLanguageCode(lang), reliable_percent, bytes,
+                ExtLanguageCode(altlang));
+      }
+    }
+  }
+  if (do_remove_weak_matches) {
+    // Pass to delete any remaining unreliable languages
+    for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
+      int plang = doc_tote->Key(sub);
+      if (plang == 0) {continue;}                     // Empty slot
+      Language lang = cld::UnpackLanguage(plang);
+      int bytes = doc_tote->Value(sub);
+      int reli = doc_tote->Reliability(sub);
+      if (bytes == 0) {continue;}                     // Zero bytes
+      bool is_tier3 = (cld::kIsPackedTop40[plang] == 0);
+      if (is_tier3 &&
+          (bytes < kGoodFirstT3MinBytes) &&
+          (bytes < total_bytes)) {                    // i.e. other text was also tallied in the prepass
+        reli = 0;                                     // Too-short tier3
+      }
+      // Reliable percent is stored as reliable score over stored bytecount
+      int reliable_percent = reli / bytes;
+      if (reliable_percent >= kMinReliableKeepPercent) {continue;}  // Keeper
+      // Delete unreliable entry
+      doc_tote->SetKey(sub, 0);
+      doc_tote->SetValue(sub, 0);
+      doc_tote->SetReliability(sub, 0);
+      // Show fate of unreliable languages if at least 10 bytes
+      if (FLAGS_cld_html /*&& (reliable_percent >= 10)*/ && (bytes >= 10)) {
+        fprintf(stderr, "{Unreli %s.%d(%dB)} ",
+                ExtLanguageCode(lang), reliable_percent, bytes);
+      }
+    }
+  }
+  if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");}
+}
+// For each close pair of languages (see kClosePair) that both appear in
+// doc_tote, fold the byte count of the member with fewer bytes into the other
+// one and delete the emptied entry. On equal byte counts the later slot is
+// folded into the earlier one. The surviving entry gets reliability
+// 100 * (combined bytes). With FLAGS_cld_html or FLAGS_dbgscore set, each move
+// is logged to stderr.
+void RefineScoredClosePairs(ToteWithReliability* doc_tote) {
+  for (int first = 0; first < doc_tote->MaxSize(); ++first) {
+    const int first_key = doc_tote->Key(first);
+    const int pair_id = kClosePair[first_key];
+    if (pair_id == 0) {
+      continue;                           // Not a member of any close pair
+    }
+    // Look through the later slots for the other member of this pair
+    for (int second = first + 1; second < doc_tote->MaxSize(); ++second) {
+      if (kClosePair[doc_tote->Key(second)] != pair_id) {
+        continue;
+      }
+      // We have a matching pair
+      const int second_key = doc_tote->Key(second);
+      // The slot with the lower byte count gives up its bytes
+      const bool first_is_smaller =
+          doc_tote->Value(first) < doc_tote->Value(second);
+      const int from_sub = first_is_smaller ? first : second;
+      const int to_sub = first_is_smaller ? second : first;
+      const Language from_lang =
+          cld::UnpackLanguage(first_is_smaller ? first_key : second_key);
+      const Language to_lang =
+          cld::UnpackLanguage(first_is_smaller ? second_key : first_key);
+      if (FLAGS_cld_html || FLAGS_dbgscore) {
+        // Show fate of closepair language
+        const int from_bytes = doc_tote->Value(from_sub);
+        const int from_reli = doc_tote->Reliability(from_sub);
+        const int divisor = from_bytes ? from_bytes : 1;   // avoid zdiv
+        const int reliable_percent = from_reli / divisor;
+        fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ",
+                ExtLanguageCode(from_lang),
+                reliable_percent,
+                doc_tote->Value(from_sub),
+                ExtLanguageCode(to_lang));
+      }
+      // Move all the bytes smaller => larger of the pair
+      const int combined = doc_tote->Value(to_sub) + doc_tote->Value(from_sub);
+      doc_tote->SetValue(to_sub, combined);
+      doc_tote->SetReliability(to_sub, 100 * combined);
+      // Delete old entry
+      doc_tote->SetKey(from_sub, 0);
+      doc_tote->SetValue(from_sub, 0);
+      doc_tote->SetReliability(from_sub, 0);
+      break;                              // Only one partner per pair
+    }
+  }
+}
+// Add the per-language hint boost to every entry of chunk_tote.
+// The boost for an entry is lang_hint_boost[key] scaled by tote_grams / 8,
+// with tote_grams capped at 8, so a chunk with fewer than 8 grams gets a
+// linearly reduced boost.
+void ApplyLanguageHints(Tote* chunk_tote, int tote_grams,
+                        uint8* lang_hint_boost) {
+  // Need 8 quad/unigrams to give full hint boost, else derate linearly
+  const int eighths = (tote_grams > 8) ? 8 : tote_grams;
+  for (int slot = 0; slot < chunk_tote->MaxSize(); ++slot) {
+    // Hint boosts are indexed by packed language subscript
+    const int packed_lang = chunk_tote->Key(slot);
+    const int boost = lang_hint_boost[packed_lang];
+    chunk_tote->SetValue(slot, chunk_tote->Value(slot) + ((boost * eighths) >> 3));
+    if (!FLAGS_dbgscore || (boost <= 0)) {continue;}
+    fprintf(stderr, "[%s+=%d*%d/8] ",
+            ExtLanguageCode(cld::UnpackLanguage(packed_lang)),
+            boost, eighths);
+  }
+}
+// Write len bytes of txt to f, replacing the five HTML-special characters
+// < > & ' " with their entity names, then finish with "<br>" and a newline.
+void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
+  for (int pos = 0; pos < len; ++pos) {
+    const char* entity = NULL;
+    switch (txt[pos]) {
+      case '<':  entity = "&lt;";   break;
+      case '>':  entity = "&gt;";   break;
+      case '&':  entity = "&amp;";  break;
+      case '\'': entity = "&apos;"; break;
+      case '"':  entity = "&quot;"; break;
+      default:   break;
+    }
+    if (entity != NULL) {
+      fputs(entity, f);
+    } else {
+      fputc(txt[pos], f);       // Ordinary byte, copied through unchanged
+    }
+  }
+  fputs("<br>\n", f);
+}
+// Add one chunk's score to the running document score.
+//
+// Steps: apply optional hint boosts, sort chunk_tote so that slots 0 and 1
+// hold the top two languages, compute a reliability figure, add the chunk's
+// byte count to doc_tote, then reinitialize chunk_tote for the next chunk.
+// If the top key unpacks to a negative Language (empty tote), nothing is
+// scored and chunk_tote is simply reinitialized.
+//
+//   src, srclen      chunk text; used only for FLAGS_cld_html debug output
+//   advance_by       divisor applied to the doubled byte count passed to
+//                    cld::GetReliability; must be nonzero
+//   lscript          script of the chunk
+//   chunk_tote       per-chunk tallies; sorted here and Reinit()ed on return
+//   doc_tote         document totals receiving the chunk
+//   tote_grams       0 forces reliability 100, 1 forces reliability 0
+//   lang_hint_boost  per-packed-language boosts, or NULL for no hints
+//
+// With FLAGS_cld_html set this also updates the file-level debug state
+// prior_lang / prior_unreliable, which is not thread-safe.
+void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by,
+                       UnicodeLScript lscript,
+                       Tote* chunk_tote,
+                       ToteWithReliability* doc_tote,
+                       int tote_grams,
+                       uint8* lang_hint_boost) {
+  // Apply hints before sorting
+  if (lang_hint_boost) {
+    ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost);
+  }
+  // Sort to get top two languages
+  chunk_tote->Sort(2);
+  Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0));
+  // Return if empty. From here on cur_lang is known to be >= 0
+  if (cur_lang < 0) {
+    chunk_tote->Reinit();
+    return;
+  }
+  bool cur_unreliable = false;
+  // Reliability is a function of mean script score per KB of text
+  int len = chunk_tote->GetByteCount();
+  int reliability = cld::GetReliability((len * 2) / advance_by,
+                                        lscript,
+                                        chunk_tote);
+  cur_unreliable = (reliability < cld::kMinReliable);
+  // If tote_grams=0, always reliable
+  // If tote_grams=1, always unreliable
+  if (tote_grams == 0) {
+    reliability = 100;
+    cur_unreliable = false;
+  } else if (tote_grams == 1) {
+    reliability = 0;
+    cur_unreliable = true;
+  }
+#if 0
+  // TEMP
+  if (FLAGS_cld_html) {
+    if (reliability >= kMinReliableKeepPercent) {
+      fprintf(stderr, "R%d%% ", reliability);
+    } else {
+      fprintf(stderr, "--R%d%% ", reliability);
+    }
+  }
+#endif
+  // Track the sequence of language fragments [result currently unused]
+  ////if (reliability >= kMinReliableSeq) {
+  ////  doc_tote->AddSeq(chunk_tote->Key(0));
+  ////}
+  if (cur_unreliable && (chunk_tote->Key(1) != 0)) {
+    // Unreliable and two top contenders, split byte count 5/8 - 3/8
+    int top_len = ((len * 5) + 4) >> 3;
+    int second_len = len - top_len;
+    doc_tote->Add(chunk_tote->Key(0),
+                  top_len, chunk_tote->Value(0), reliability);
+    doc_tote->Add(chunk_tote->Key(1),
+                  second_len, chunk_tote->Value(1), reliability);
+    if (FLAGS_dbgscore) {
+      fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ",
+              ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
+              chunk_tote->Value(0),
+              reliability,
+              top_len,
+              ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))),
+              chunk_tote->Value(1),
+              reliability,
+              second_len);
+    }
+  } else {
+    // Reliable or single contender: all bytes go to the top language
+    doc_tote->Add(chunk_tote->Key(0),
+                  len, chunk_tote->Value(0), reliability);
+    if (FLAGS_dbgscore) {
+      fprintf(stderr, "{+%s.%d.%dR(%dB)} ",
+              ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
+              chunk_tote->Value(0),
+              reliability,
+              len);
+    }
+  }
+  if (FLAGS_cld_html) {
+    // Debug trace of this chunk relative to the previous one
+    cld::PrintLang(stderr, chunk_tote,
+              cur_lang, cur_unreliable,
+              prior_lang, prior_unreliable);
+    prior_lang = cur_lang;
+    prior_unreliable = cur_unreliable;
+    string temp(src, srclen);
+    // Check for empty first: indexing [0] of an empty string is not
+    // guaranteed to be valid on pre-C++11 library implementations
+    if (!temp.empty() && (temp[0] == '=')) {
+      // Rewrite =ScriptX= or =SwitchX= as =Xxxx= for script code Xxxx
+      temp = "=Buffered_";
+      temp.append(UnicodeLScriptCode(lscript));
+      temp.append("=");
+    }
+    cld::PrintText(stderr, cur_lang, temp);
+  }
+  chunk_tote->Reinit();
+}
+// Debug output: print the name of top_lang in brackets on stderr, or just "[]"
+// when it repeats the previously printed language (UNKNOWN_LANGUAGE is always
+// printed by name). Remembers the printed language in prior_lang.
+void PrintTopLang(Language top_lang) {
+  const bool repeats_prior =
+    (top_lang != UNKNOWN_LANGUAGE) && (top_lang == prior_lang);
+  if (repeats_prior) {
+    fprintf(stderr, "[] ");
+    return;
+  }
+  fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
+  prior_lang = top_lang;
+}
+// Same output as PrintTopLang, wrapped in a gray (#A0A0A0) HTML span and
+// followed by a newline, to mark the language as a not-yet-scored guess.
+void PrintTopLangSpeculative(Language top_lang) {
+  fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
+  PrintTopLang(top_lang);     // Prints "[] " or "[name] " and updates prior_lang
+  fprintf(stderr, "</span>\n");
+}
+// Convenience wrapper around ScoreChunkIntoDoc for NUL-terminated src text:
+// the text length is taken with strlen and all other arguments pass through.
+void ScoreChunkIntoDoc2(const char* src, int advance_by,
+                       UnicodeLScript lscript,
+                       Tote* chunk_tote,
+                       ToteWithReliability* doc_tote,
+                       int tote_grams,
+                       uint8* lang_hint_boost) {
+  ScoreChunkIntoDoc(src, static_cast<int>(strlen(src)), advance_by, lscript,
+                    chunk_tote, doc_tote, tote_grams, lang_hint_boost);
+}
+// Score one scriptspan by attributing all of its bytes to a single language
+// (lang, already packed), for scripts that have only one language.
+// flags and plus_one are accepted but not used.
+void ScoreNilgrams(getone::LangSpan* scriptspan, int lang,
+                  ToteWithReliability* doc_tote,
+                  uint8* lang_hint_boost,
+                  int flags, Language plus_one) {
+  // Reset the debug-only trace state. Not thread-safe
+  prior_unreliable = false;
+  prior_lang = UNKNOWN_LANGUAGE;
+  const int span_bytes = scriptspan->text_bytes;
+  // One gram whose score equals the byte count (score 1000 for 1000 bytes)
+  Tote single_lang_tote;
+  single_lang_tote.AddGram();
+  single_lang_tote.Add(lang, span_bytes);
+  single_lang_tote.AddBytes(span_bytes);
+  const int kAdvanceBy = 2;
+  const int kFullyReliableGrams = 0;    // tote_grams == 0 => fully reliable
+  ScoreChunkIntoDoc(scriptspan->text, span_bytes, kAdvanceBy,
+                    scriptspan->script, &single_lang_tote,
+                    doc_tote, kFullyReliableGrams, lang_hint_boost);
+}
+// Score one scriptspan using unigrams (plus CJK bigrams when word scoring is on), chunk by chunk
+// Updates *tote_grams and *initial_word_span; full chunks are folded into doc_tote
+static void ScoreUnigrams(const UTF8PropObj* unigram_obj,
+                      getone::LangSpan* scriptspan,
+                      int* tote_grams, int gram_limit,
+                      Tote* chunk_tote,
+                      ToteWithReliability* doc_tote,
+                      uint8* lang_hint_boost,
+                      int advance_by, int flags,
+                   int* initial_word_span, Language plus_one) {
+  // chunk_tote may already hold a partial sum from a previous span
+  const char* src = scriptspan->text;
+  const char* srclimit = src + scriptspan->text_bytes;
+  // Reset debug trace state. For debugging only. Not thread-safe
+  prior_lang = UNKNOWN_LANGUAGE;
+  prior_unreliable = false;
+  // Break text up into multiple chunks and score each
+  while (src < srclimit) {
+    // Scores into chunk_tote and updates *tote_grams; len is used below as the bytes consumed
+    int len = cld::DoUniScoreV3(unigram_obj,
+                                 src, srclimit - src, advance_by,
+                                 tote_grams, gram_limit, chunk_tote);
+    if (FlagUseWords(flags) || (*initial_word_span > 0)) {
+      // Use CJK bigram scoring in addition to unigrams
+      cld::DoBigramScoreV3(&kCjkBiTable_obj,
+                           src, len, chunk_tote);
+    }
+    chunk_tote->AddBytes(len);
+    *initial_word_span -= len;  // May go negative; only the > 0 test above reads it
+    if (*tote_grams >= gram_limit) {
+      // Chunk is full: add this chunk to doc totals
+      // Remove all but top40 (and plus_one) if asked
+      if (FlagTop40(flags)) {
+        cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
+      }
+      // Sort, accumulate into doc total, reinit chunk_tote
+      ScoreChunkIntoDoc(src, len, advance_by,
+                        scriptspan->script, chunk_tote,
+                        doc_tote, *tote_grams, lang_hint_boost);
+      *tote_grams = 0;
+    } else {
+      if (FLAGS_cld_html) {  // Partial chunk: debug-print the current guess only
+        string temp(src, len);
+        Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
+        PrintTopLangSpeculative(top_lang);
+        cld::PrintText(stderr, top_lang, temp);
+      }
+    }
+    src += len;  // NOTE(review): loop only terminates if DoUniScoreV3 returns len > 0 -- confirm
+  }
+  // chunk_tote may hold a partial sum going out; the caller is left to deal with it
+}
+// Back up one UTF-8 character: step to the byte before p, then skip backward
+// over at most three continuation bytes (10xxxxxx) to reach the lead byte.
+const uint8* BackOneUTF8(const uint8* p) {
+  const uint8* cursor = p - 1;
+  for (int skipped = 0; skipped < 3; ++skipped) {
+    if ((*cursor & 0xc0) != 0x80) {break;}    // Not a continuation byte
+    --cursor;
+  }
+  return cursor;
+}
+// Score one scriptspan using quadgrams (plus long-word scoring when enabled),
+// one chunk at a time.
+//
+// Each pass scores a stretch of text into chunk_tote. When *tote_grams reaches
+// gram_limit the chunk is folded into doc_tote via ScoreChunkIntoDoc and
+// *tote_grams is reset to 0. chunk_tote may hold a partial accumulation both
+// on entry and on return.
+//
+//   initial_word_span  remaining bytes for which word scoring stays enabled
+//                      regardless of flags; decremented by the bytes consumed
+//   plus_one           extra language kept by cld::DemoteNotTop40
+static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj,
+                       getone::LangSpan* scriptspan,
+                       int* tote_grams, int gram_limit,
+                       Tote* chunk_tote,
+                       ToteWithReliability* doc_tote,
+                       uint8* lang_hint_boost,
+                       int advance_by, int flags,
+                       int* initial_word_span, Language plus_one) {
+  // chunk_tote may have partial sum coming in
+  const char* src = scriptspan->text;
+  const char* srclimit = src + scriptspan->text_bytes;
+  // For debugging only. Not thread-safe
+  prior_lang = UNKNOWN_LANGUAGE;
+  prior_unreliable = false;
+  // Break text up into multiple chunks and score each
+  while (src < srclimit) {
+    // Updates tote_grams; len is used below as the number of bytes consumed
+    int len = cld::DoQuadScoreV3(quadgram_obj,
+                                 src, srclimit - src, advance_by,
+                                 tote_grams, gram_limit, chunk_tote);
+    if (FlagUseWords(flags) || (*initial_word_span > 0)) {
+      // Use word scoring in addition to quadgrams
+      cld::DoOctaScoreV3(&kLongWord8Table_obj,
+                         src, len, chunk_tote);
+    }
+    chunk_tote->AddBytes(len);
+    *initial_word_span -= len;
+    if (*tote_grams >= gram_limit) {
+      // Remove all but top40 if asked
+      if (FlagTop40(flags)) {
+        cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
+      }
+      // Sort, accumulate into doc total, reinit
+      ScoreChunkIntoDoc(src, len, advance_by,
+                        scriptspan->script, chunk_tote,
+                        doc_tote, *tote_grams, lang_hint_boost);
+      *tote_grams = 0;
+    } else {
+      if (FLAGS_cld_html) {
+        // Partial chunk: only debug-print the current best guess
+        string temp(src, len);
+        Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
+        PrintTopLangSpeculative(top_lang);
+        cld::PrintText(stderr, top_lang, temp);
+      }
+    }
+    src += len;
+  }
+  // chunk_tote may have partial sum going out
+}
+// Debug output: write the up-to-three detected languages with their
+// percentages, followed by the text byte count, as an HTML fragment on f.
+// Slots holding UNKNOWN_LANGUAGE are skipped. The first language gets a
+// trailing "*" when *is_reliable is false.
+void PrintLangs(FILE* f, const Language* language3, const int* percent3,
+                const int* text_bytes, const bool* is_reliable) {
+  fputs("<br>&nbsp;&nbsp;Initial_Languages ", f);
+  if (language3[0] != UNKNOWN_LANGUAGE) {
+    const char* unreliable_mark = *is_reliable ? "" : "*";
+    fprintf(f, "%s%s(%d%%)  ",
+            ExtLanguageName(language3[0]), unreliable_mark, percent3[0]);
+  }
+  // Second and third languages carry no reliability marker
+  for (int slot = 1; slot < 3; ++slot) {
+    if (language3[slot] == UNKNOWN_LANGUAGE) {continue;}
+    fprintf(f, "%s(%d%%)  ", ExtLanguageName(language3[slot]), percent3[slot]);
+  }
+  fprintf(f, "%d bytes \n", *text_bytes);
+  fputs("<br>\n", f);
+}
+// Start the tote with a count of one for the default language for script:
+// adds score 1 for cld::kDefaultLanguagePerLScript[lscript] and one byte.
+// lscript is used directly as a table index, so it must be a valid script.
+void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) {
+  Language defaultlang = cld::kDefaultLanguagePerLScript[lscript];
+  script_tote->Add(cld::PackLanguage(defaultlang), 1);
+  script_tote->AddBytes(1);
+}
+static const char* const kToteName[4] =  // Marker strings of the form =Name=, one per tote slot
+  {"=Latn=", "=Hani=", "=Script2=", "=Script3="};  // presumably ordered like totes[]: Latn, Hani, other, other -- usage not visible here
+static const char* const kToteSwitch[4] =  // Same slots; entries 2 and 3 say Switch instead of Script
+  {"=Latn=", "=Hani=", "=Switch2=", "=Switch3="};  // ScoreChunkIntoDoc rewrites debug text that starts with '='
+// Byte -> normalized ASCII: A-Z and a-z map to a-z, digits map to themselves, every other byte maps to minus '-' (0x2d)
+static const char kCharsetToLowerTbl[256] = {  // 16 rows of 16 entries, indexed by unsigned byte value
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
+  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
+  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
+};
+static const char kIsAlpha[256] = {  // 1 for ASCII letters A-Z (0x41-0x5a) and a-z (0x61-0x7a), 0 for every other byte; 8 rows of 32
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
+  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+};
+static const char kIsDigit[256] = {  // 1 for ASCII digits 0-9 (0x30-0x39), 0 for every other byte; 8 rows of 32
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+};
+// Normalize ASCII string to its first 4 alphabetic/digit chars.
+// Letters are forced to lowercase ASCII; all other characters are skipped.
+// Used to normalize TLD values.
+//
+//   str   NUL-terminated input; must not be NULL
+//   norm  output buffer of at least 4 bytes. Exactly 4 bytes are written:
+//         the normalized characters, padded on the right with '_'.
+//         No terminating NUL is stored.
+void MakeChar4(const char* str, char* norm) {
+  memcpy(norm, "____", 4);     // four underscores
+  int filled = 0;
+  // Single pass over str; stop at the terminator or once norm is full
+  for (const char* p = str; (*p != '\0') && (filled < 4); ++p) {
+    const uint8 uc = static_cast<uint8>(*p);
+    if (kIsAlpha[uc] | kIsDigit[uc]) {
+      norm[filled] = kCharsetToLowerTbl[uc];
+      ++filled;
+    }
+  }
+}
+// Find subscript of matching key in first 4 bytes of sorted hint array, or -1.
+//
+//   hintprobs      array sorted ascending by the first 4 bytes of key
+//                  (memcmp order)
+//   hintprobssize  number of entries in hintprobs
+//   norm_key       4-byte key to look for; need not be NUL-terminated
+static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
+                     const char* norm_key) {
+  // Key, if present, is always in range [lo..hi)
+  int lo = 0;
+  int hi = hintprobssize;
+  while (lo < hi) {
+    // Midpoint computed from the width so that lo + hi cannot overflow int
+    int mid = lo + ((hi - lo) >> 1);
+    int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4);
+    if (comp < 0) {
+      lo = mid + 1;
+    } else if (comp > 0) {
+      hi = mid;
+    } else {
+      return mid;
+    }
+  }
+  return -1;
+}
+// Shared tail of the three hint functions below.
+// probs packs four bytes: bits 0-7 select a table entry via
+// cld::LgProb2TblEntry, and bits 8-15, 16-23, 24-31 hold up to three packed
+// language subscripts (0 = none). Each nonzero language gets its boost
+// incremented by cld::LgProb3(entry, rank) for rank 0, 1, 2 respectively.
+// The boosts are uint8, so the additions wrap modulo 256.
+static void AddPackedHintProbs(uint8* lang_hint_boost, uint32 probs) {
+  const uint8 prob123 = (probs >> 0) & 0xff;
+  const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
+  for (int rank = 0; rank < 3; ++rank) {
+    const uint8 packed_lang = (probs >> (8 * (rank + 1))) & 0xff;
+    if (packed_lang > 0) {
+      lang_hint_boost[packed_lang] += cld::LgProb3(prob123_entry, rank);
+    }
+  }
+}
+// Increment the initial probabilities based on a per-TLD probs entry.
+// tld_hint is normalized with MakeChar4 and looked up in kTLDHintProbs;
+// an unrecognized TLD leaves lang_hint_boost unchanged.
+void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) {
+  if (FLAGS_dbgscore) {
+    fprintf(stderr, "TLD hint %s\n", tld_hint);
+  }
+  char normalized_tld[8];
+  MakeChar4(tld_hint, normalized_tld);
+  int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
+                           normalized_tld);
+  // TLD is four bytes, probability entry is 4 bytes
+  if (n >= 0) {
+    AddPackedHintProbs(lang_hint_boost, kTLDHintProbs[n].probs);
+  }
+}
+// Increment the initial probabilities based on a per-encoding probs entry.
+// Hints outside [ISO_8859_1, NUM_ENCODINGS) are ignored.
+// NOTE(review): the debug print runs before the range check, so with
+// FLAGS_dbgscore set an out-of-range hint still reaches EncodingName --
+// confirm that EncodingName tolerates that.
+void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) {
+  if (FLAGS_dbgscore) {
+    Encoding tempenc = static_cast<Encoding>(encoding_hint);
+    fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc));
+  }
+  if (encoding_hint < ISO_8859_1) {return;}
+  if (encoding_hint >= NUM_ENCODINGS) {return;}
+  AddPackedHintProbs(lang_hint_boost, kEncodingHintProbs[encoding_hint]);
+}
+// Increment the initial probabilities based on a per-language probs entry.
+// Does not recognize extended languages as hints: values outside
+// [ENGLISH, NUM_LANGUAGES) are ignored.
+void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) {
+  if (FLAGS_dbgscore) {
+    fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint));
+  }
+  if (language_hint < ENGLISH) {return;}
+  if (language_hint >= NUM_LANGUAGES) {return;}
+  AddPackedHintProbs(lang_hint_boost, kLanguageHintProbs[language_hint]);
+}
+// Extract return values from the top three slots of doc_tote, before fixups.
+//
+// Outputs (each *3 array must have exactly three elements):
+//   language3          unpacked language per slot, UNKNOWN_LANGUAGE if empty
+//   reliable_percent3  reliability / byte count per slot, 0 if empty
+//   normalized_score3  cld::GetNormalizedScore per slot, 0.0 if empty
+//   percent3           share of total text per slot, in percent
+//   text_bytes         total_text_bytes, raised to the top-3 byte sum if
+//                      that sum is larger
+//   is_reliable        top slot's reliable percent >= cld::kMinReliable;
+//                      true when there is no top language at all
+//
+// If slot 0 is empty, all of total_text_bytes is attributed to it.
+void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes,
+                    int* reliable_percent3, Language* language3, int* percent3,
+                    double*  normalized_score3,
+                    int* text_bytes, bool* is_reliable) {
+  int bytecount[3] = {total_text_bytes, 0, 0};
+  for (int i = 0; i < 3; ++i) {
+    reliable_percent3[i] = 0;
+    language3[i] = UNKNOWN_LANGUAGE;
+    normalized_score3[i] = 0.0;
+  }
+  // Pick up each nonempty slot in rank order
+  for (int i = 0; i < 3; ++i) {
+    const int packed_lang = doc_tote->Key(i);
+    if (packed_lang == 0) {continue;}
+    language3[i] = cld::UnpackLanguage(packed_lang);
+    bytecount[i] = doc_tote->Value(i);
+    const int reli = doc_tote->Reliability(i);
+    reliable_percent3[i] = reli / (bytecount[i] ? bytecount[i] : 1);  // avoid zdiv
+    normalized_score3[i] = cld::GetNormalizedScore(language3[i],
+                                                  ULScript_Common,
+                                                  bytecount[i],
+                                                  doc_tote->Score(i));
+  }
+  const bool have_top_lang = (doc_tote->Key(0) != 0);
+  // Increase total bytes to sum (top 3) if low for some reason
+  const int total_bytecount12 = bytecount[0] + bytecount[1];
+  const int total_bytecount123 = total_bytecount12 + bytecount[2];
+  if (total_text_bytes < total_bytecount123) {
+    total_text_bytes = total_bytecount123;
+  }
+  *text_bytes = total_text_bytes;
+  // Sum minus previous % gives better roundoff behavior than bytecount/total.
+  // Multiply in 64 bits: bytecount * 100 overflows int above ~21MB of text
+  const long long total_div = cld::maxint(1, total_text_bytes);   // Avoid zdiv
+  percent3[0] = static_cast<int>(
+      (static_cast<long long>(bytecount[0]) * 100) / total_div);
+  percent3[1] = static_cast<int>(
+      (static_cast<long long>(total_bytecount12) * 100) / total_div);
+  percent3[2] = static_cast<int>(
+      (static_cast<long long>(total_bytecount123) * 100) / total_div);
+  percent3[2] -= percent3[1];
+  percent3[1] -= percent3[0];
+  // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
+  // Fix this explicitly
+  if (percent3[1] < percent3[2]) {
+    ++percent3[1];
+    --percent3[2];
+  }
+  if (percent3[0] < percent3[1]) {
+    ++percent3[0];
+    --percent3[1];
+  }
+  if (have_top_lang) {
+    // The top language's reliability is the overall result reliability
+    *is_reliable = reliable_percent3[0] >= cld::kMinReliable;
+  } else {
+    // No top language at all. This can happen with zero text or 100% Klingon
+    // if extended=false. Just return all UNKNOWN_LANGUAGE, reliable.
+    *is_reliable = true;
+  }
+}
+// True for French, Italian, German or Spanish.
+bool IsFIGS(Language lang) {
+  return (lang == FRENCH) || (lang == ITALIAN) ||
+         (lang == GERMAN) || (lang == SPANISH);
+}
+// True for English, French, Italian, German or Spanish.
+bool IsEFIGS(Language lang) {
+  switch (lang) {
+    case ENGLISH:
+    case FRENCH:
+    case ITALIAN:
+    case GERMAN:
+    case SPANISH:
+      return true;
+    default:
+      return false;
+  }
+}
+static const int kNonEnBoilerplateMinPercent = 17;    // second lang % <this => keep top ENGLISH, no second
+static const int kNonFIGSBoilerplateMinPercent = 20;  // second lang % <this => keep top FIGS, no second
+static const int kGoodFirstMinPercent = 26;           // return_percent <this => summary is UNKNOWN_LANGUAGE
+static const int kGoodFirstReliableMinPercent = 51;   // return_percent <this => unreliable
+static const int kIgnoreMaxPercent = 95;              // ignore_percent >this => unreliable
+static const int kKeepMinPercent = 2;                 // chosen language % <this => unreliable
+// For Tier3 languages (not in cld::kIsPackedTop40), require more bytes of
+// text in the second language before it may override the first-place language
+static const int kGoodSecondT1T2MinBytes = 15;        // second_bytes <this => no second
+static const int kGoodSecondT3MinBytes = 128;         // second_bytes <this => no second (Tier3)
+                                                      // All of the above are used by CalcSummaryLang
+// Calculate a single summary language for the document, and its reliability.
+// Returns language3[0] or language3[1] or UNKNOWN_LANGUAGE in *summary_lang.
+// This is the heart of matching human-rater perception.
+//
+//   doc_tote, reliable_percent3   currently unused
+//   total_text_bytes              used to estimate the second language's bytes
+//   language3, percent3           top three languages and their percentages;
+//                                 left unchanged
+//   summary_lang, is_reliable     outputs
+//
+// Rules, in order:
+//   1. A TG_UNKNOWN_LANGUAGE slot is ignored text: it is dropped from the
+//      active slots and its percentage is excluded from the denominator.
+//   2. Top ENGLISH, or top FIGS with a non-EFIGS second, is treated as
+//      boilerplate if the second language is big enough; the second language
+//      is returned instead.
+//   3. Otherwise the top language is returned, with its percentage improved
+//      if the second language is ENGLISH/FIGS boilerplate.
+//   4. Thresholds on the resulting percentages force UNKNOWN and/or unreliable.
+//
+// Do not return Tier3 second language unless there are at least 128 bytes
+void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes,
+                     const int* reliable_percent3,
+                     const Language* language3,
+                     const int* percent3,
+                     Language* summary_lang, bool* is_reliable) {
+  // Vector of active languages; changes if we delete some
+  int slot_count = 3;
+  int active_slot[3] = {0, 1, 2};
+  int ignore_percent = 0;
+  int return_percent = percent3[0];   // Default to top lang
+  *summary_lang = language3[0];
+  *is_reliable = true;
+  if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
+  // If any of top 3 is IGNORE, remove it and increment ignore_percent
+  for (int i = 0; i < 3; ++i) {
+    if (language3[i] == TG_UNKNOWN_LANGUAGE) {
+      ignore_percent += percent3[i];
+      // Move the rest up, leaving input vectors unchanged
+      for (int j=i+1; j < 3; ++j) {
+        active_slot[j - 1] = active_slot[j];
+      }
+      -- slot_count;
+      // Logically remove Ignore from percentage-text calculation
+      // (extra 1 in 101 avoids zdiv, biases slightly small).
+      // Use the percentage of the language now in first place: when the
+      // ignored entry itself was slot 0, percent3[0] is the ignored share,
+      // not the share of the language being returned.
+      return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
+      *summary_lang = language3[active_slot[0]];
+      if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
+    }
+  }
+  // If English and X, where X (not UNK) is big enough,
+  // assume the English is boilerplate and return X.
+  // Logically remove English from percentage-text calculation
+  int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
+  // Require more bytes of text for Tier3 languages
+  int minbytesneeded = kGoodSecondT1T2MinBytes;
+  int plang_second =  cld::PackLanguage(language3[active_slot[1]]);
+  bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0);
+  if (is_tier3) {
+    minbytesneeded = kGoodSecondT3MinBytes;
+  }
+  if ((language3[active_slot[0]] == ENGLISH) &&
+      (language3[active_slot[1]] != ENGLISH) &&
+      (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
+      (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
+      (second_bytes >= minbytesneeded)) {
+    ignore_percent += percent3[active_slot[0]];
+    return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
+    *summary_lang = language3[active_slot[1]];
+    if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
+  // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
+  // assume the FIGS is boilerplate and return X.
+  // Logically remove FIGS from percentage-text calculation
+  } else if (IsFIGS(language3[active_slot[0]]) &&
+             !IsEFIGS(language3[active_slot[1]]) &&
+             (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
+             (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
+             (second_bytes >= minbytesneeded)) {
+    ignore_percent += percent3[active_slot[0]];
+    return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
+    *summary_lang = language3[active_slot[1]];
+    if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
+  // Else we are returning the first language, but want to improve its
+  // return_percent if the second language should be ignored
+  } else  if ((language3[active_slot[1]] == ENGLISH) &&
+              (language3[active_slot[0]] != ENGLISH)) {
+    ignore_percent += percent3[active_slot[1]];
+    return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
+  } else  if (IsFIGS(language3[active_slot[1]]) &&
+              !IsEFIGS(language3[active_slot[0]])) {
+    ignore_percent += percent3[active_slot[1]];
+    return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
+  }
+  // If return percent is too small (too many languages), return UNKNOWN
+  if ((return_percent < kGoodFirstMinPercent)) {
+    *summary_lang = UNKNOWN_LANGUAGE;
+    *is_reliable = false;
+  }
+  // If return percent is small, return language but set unreliable.
+  if ((return_percent < kGoodFirstReliableMinPercent)) {
+    *is_reliable = false;
+  }
+  // If ignore percent is too large, set unreliable.
+  if ((ignore_percent > kIgnoreMaxPercent)) {
+    *is_reliable = false;
+  }
+  // If we removed all the active languages, return UNKNOWN
+  if (slot_count == 0) {
+    *summary_lang = UNKNOWN_LANGUAGE;
+    *is_reliable = false;
+  }
+}
+// Scores the text in buffer and reports the top three languages found.
+//
+// Processing outline, as implemented below:
+//   1. Reset all output parameters, then apply the optional TLD / encoding /
+//      language hints to a per-language boost vector.
+//   2. Walk the buffer one single-script span at a time, optionally squeezing
+//      out repetitive chunks and repeated words, and score each span with the
+//      nilgram, unigram or quadgram scorer selected by the span's script.
+//   3. Flush pending chunk totals into the document total, refine, sort and
+//      extract the top three languages.
+//   4. If the answer is good enough (or kCLDFlagFinish is set), drop
+//      unreliable languages, pick the summary language and return it.
+//      Otherwise call this function again with stricter flags.
+//
+// Parameters:
+//   tables        Detection tables; when NULL a built-in default pair
+//                 (kQuadTable_obj, compact_lang_det_generated_ctjkvz_b1_obj)
+//                 is used.
+//   buffer, buffer_length, is_plain_text
+//                 Input text, handed to ScriptScanner.
+//   do_pick_summary_language
+//                 When true the return value comes from CalcSummaryLang;
+//                 otherwise it is language3[0].
+//   do_remove_weak_matches
+//                 Forwarded to RemoveUnreliableLanguages.
+//   tld_hint, encoding_hint, language_hint
+//                 Optional hints; ignored when NULL/empty, UNKNOWN_ENCODING or
+//                 UNKNOWN_LANGUAGE respectively.
+//   allow_extended_lang
+//                 When false, RemoveExtendedLanguages is applied to the totals.
+//   flags         kCLDFlag* bits; recursive calls OR in additional bits.
+//   plus_one      Forwarded to the scoring routines; recursive refinement
+//                 passes the first of the top two languages that is not Top40.
+//   language3, percent3, normalized_score3
+//                 Output arrays. Result vector must be exactly three items.
+//   text_bytes, is_reliable
+//                 Output scalars; both must be non-NULL (written
+//                 unconditionally).
+//
+// Returns the summary language for the document.
+Language CompactLangDetImpl::DetectLanguageSummaryV25(
+                        const CompactLangDet::DetectionTables* tables,
+                        const char* buffer,
+                        int buffer_length,
+                        bool is_plain_text,
+                        bool do_pick_summary_language,
+                        bool do_remove_weak_matches,
+                        const char* tld_hint,       // "id" boosts Indonesian
+                        int encoding_hint,          // SJS boosts Japanese
+                        Language language_hint,     // ITALIAN boosts it
+                        bool allow_extended_lang,
+                        int flags,
+                        Language plus_one,
+                        Language* language3,
+                        int* percent3,
+                        double* normalized_score3,
+                        int* text_bytes,
+                        bool* is_reliable) {
+  // Fall back to the built-in tables when the caller supplies none
+  if (!tables) {
+    static const CompactLangDet::DetectionTables default_cld_tables = {
+      &kQuadTable_obj,
+      &compact_lang_det_generated_ctjkvz_b1_obj
+    };
+    tables = &default_cld_tables;
+  }
+  // Reset every output so early exits and recursion start from a known state
+  language3[0] = UNKNOWN_LANGUAGE;
+  language3[1] = UNKNOWN_LANGUAGE;
+  language3[2] = UNKNOWN_LANGUAGE;
+  percent3[0] = 100;
+  percent3[1] = 0;
+  percent3[2] = 0;
+  normalized_score3[0] = 0.0;
+  normalized_score3[1] = 0.0;
+  normalized_score3[2] = 0.0;
+  *text_bytes = 0;
+  *is_reliable = false;
+  // Document totals
+  ToteWithReliability doc_tote;   // Reliability = 0..100
+  // Vector of packed per-language boosts (just one filled in from hints)
+  uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1];
+  memset(lang_hint_boost, 0, sizeof(lang_hint_boost));
+  // Apply hints, if any
+  if ((tld_hint != NULL) && (tld_hint[0] != '\0')) {
+    ApplyTLDHint(lang_hint_boost, tld_hint);
+  }
+  if (encoding_hint != UNKNOWN_ENCODING) {
+    ApplyEncodingHint(lang_hint_boost, encoding_hint);
+  }
+  if (language_hint != UNKNOWN_LANGUAGE) {
+    ApplyLanguageHint(lang_hint_boost, language_hint);
+  }
+  // Four individual script totals, Latin, Han, other2, other3
+  // next_other_tote alternates between slots 2 and 3 (see the XOR below)
+  int next_other_tote = 2;
+  // Four totes for up to four different scripts pending at once
+  Tote totes[4];                  // [0] Latn  [1] Hani  [2] other  [3] other
+  bool tote_seen[4] = {false, false, false, false};
+  int tote_grams[4] = {0, 0, 0, 0};     // Number in partial chunk
+  UnicodeLScript tote_script[4] =
+    {ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common};
+  // Loop through text spans in a single script
+  ScriptScanner ss(buffer, buffer_length, is_plain_text);
+  getone::LangSpan scriptspan;
+  scriptspan.text = NULL;
+  scriptspan.text_bytes = 0;
+  scriptspan.offset = 0;
+  scriptspan.script = ULScript_Common;
+  scriptspan.lang = UNKNOWN_LANGUAGE;
+  int total_text_bytes = 0;
+  int textlimit = FLAGS_cld_textlimit << 10;    // in KB
+  // A zero flag value means "no limit"
+  if (textlimit == 0) {textlimit = 0x7fffffff;}
+  int advance_by = 2;                   // Advance 2 bytes
+  int advance_limit = textlimit >> 3;   // For first 1/8 of max document
+  int initial_word_span = kDefaultWordSpan;
+  if (FLAGS_cld_forcewords) {
+    initial_word_span = kReallyBigWordSpan;
+  }
+  // Pick up chunk sizes
+  // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
+  // Sanity check -- force into a reasonable range
+  int chunksizequads = FLAGS_cld_smoothwidth;
+  chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads),
+                               kMaxChunkSizeQuads);
+  int chunksizeunis = (chunksizequads * 5) >> 1;
+  // Varying short-span limit doesn't work well -- skips too much beyond 20KB
+  // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
+  // NOTE(review): spantooshortlimit is assigned and doubled below but never
+  // read anywhere in this function.
+  int spantooshortlimit = kShortSpanThresh;
+  // For debugging only. Not thread-safe
+  prior_lang = UNKNOWN_LANGUAGE;
+  prior_unreliable = false;
+  // Allocate full-document prediction table for finding repeating words.
+  // It is only cleared (and only used) when repeat removal is requested.
+  int hash = 0;
+  int* predict_tbl = new int[kPredictionTableSize];
+  if (FlagRepeats(flags)) {
+    memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
+  }
+  // Loop through scriptspans accumulating number of text bytes in each language
+  while (ss.GetOneScriptSpanLower(&scriptspan)) {
+    UnicodeLScript lscript = scriptspan.script;
+    // Echo text if asked to
+    if (FLAGS_cld_echotext) {
+      PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes);
+    }
+    // Squeeze out big chunks of text span if asked to
+    if (FlagSqueeze(flags)) {
+      // Remove repetitive or mostly-spaces chunks
+      int newlen;
+      int chunksize = 0;    // Use the default
+      newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
+                                   chunksize);
+      scriptspan.text_bytes = newlen;
+    } else {
+      // Check now and then to see if we should be squeezing
+      if ((total_text_bytes >= kCheapSqueezeTestThresh) &&
+          !FlagFinish(flags) &&
+          ((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) &&
+          CheapSqueezeTriggerTest(scriptspan.text,
+                                    scriptspan.text_bytes,
+                                    kCheapSqueezeTestLen)) {
+        // Recursive call with big-chunk squeezing set
+        if (FLAGS_cld_html || FLAGS_dbgscore) {
+          fprintf(stderr,
+                  "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
+                  total_text_bytes);
+        }
+        // Deallocate full-document prediction table before restarting;
+        // the recursive call allocates its own.
+        delete[] predict_tbl;
+        return DetectLanguageSummaryV25(
+                          tables,
+                          buffer,
+                          buffer_length,
+                          is_plain_text,
+                          do_pick_summary_language,
+                          do_remove_weak_matches,
+                          tld_hint,               // "id" boosts Indonesian
+                          encoding_hint,          // SJS boosts Japanese
+                          language_hint,          // ITALIAN boosts it
+                          allow_extended_lang,
+                          flags | kCLDFlagSqueeze,
+                          plus_one,
+                          language3,
+                          percent3,
+                          normalized_score3,
+                          text_bytes,
+                          is_reliable);
+      }
+    }
+    // Remove repetitive words if asked to
+    if (FlagRepeats(flags)) {
+      // Remove repetitive words
+      int newlen;
+      newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
+                                    &hash, predict_tbl);
+      scriptspan.text_bytes = newlen;
+    }
+    // The real scoring
+    // Accumulate directly into the document total, or accumulate in one of four
+    // chunk totals. The purpose of the multiple chunk totals is to piece
+    // together short choppy pieces of text in alternating scripts. One total is
+    // dedicated to Latin text, one to Han text, and the other two are
+    // dynamically assigned.
+    Language onlylang = cld::kOnlyLanguagePerLScript[lscript];
+    if (onlylang != UNKNOWN_LANGUAGE) {
+      // This entire script run is in a single language.
+      ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote,
+                   lang_hint_boost, flags, plus_one);
+    } else if (cld::kScoreUniPerLScript[lscript] != 0) {
+      // This entire script run's languages can be distinguished by uni-grams
+      // Accumulate in hani_tote
+      int tote_num = 1;
+      if (!tote_seen[tote_num]) {
+        tote_seen[tote_num] = true;
+        // Default language gets 1 byte
+        total_text_bytes += 1;
+        InitScriptToteLang(&totes[tote_num], lscript);
+      }
+      ScoreUnigrams(tables->unigram_obj,
+                       &scriptspan, &tote_grams[tote_num], chunksizeunis,
+                       &totes[tote_num],
+                       &doc_tote, lang_hint_boost,
+                       advance_by, flags, &initial_word_span, plus_one);
+    } else {
+      // This entire script-run's languages can be distinguished by quad-grams
+      // Accumulate in latn_tote or script0/1_tote
+      int tote_num = -1;
+      for (int t = 0; t < 4; ++t) {
+        if (lscript == tote_script[t]) {
+          tote_num = t;
+          break;
+        }
+      }
+      if (tote_num < 0) {
+        // Need to allocate other0/1
+        tote_num = next_other_tote;
+        next_other_tote ^= 1;     // Round-robin between slots 2 and 3
+        if (tote_seen[tote_num]) {
+          // Flush previous
+          ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by,
+                             tote_script[tote_num], &totes[tote_num],
+                             &doc_tote, tote_grams[tote_num], lang_hint_boost);
+          totes[tote_num].Reinit();
+        }
+        tote_script[tote_num] = lscript;
+      }
+      if (!tote_seen[tote_num]) {
+        tote_seen[tote_num] = true;
+        // Default language gets 1 byte
+        total_text_bytes += 1;
+        InitScriptToteLang(&totes[tote_num], lscript);
+      }
+      // The actual accumulation, possibly with word scoring also
+      ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num],
+                        chunksizequads,
+                        &totes[tote_num],
+                        &doc_tote, lang_hint_boost,
+                        advance_by, flags, &initial_word_span, plus_one);
+    }
+    total_text_bytes += scriptspan.text_bytes;
+    // For long documents, do less-dense samples the further along we go.
+    // This is to keep speed sublinear in document size.
+    if (total_text_bytes > advance_limit) {
+      if (total_text_bytes > textlimit) {
+        // Don't look at rest of doc
+        if (FLAGS_cld_html || FLAGS_dbgscore) {
+          fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>",
+                  total_text_bytes, textlimit);
+        }
+        break;
+      }
+      advance_by <<= 1;         // Double advance bytes
+      advance_limit <<= 1;      // Double limit until next change
+      spantooshortlimit <<= 1;  // Double short-span size
+      if (FLAGS_cld_html || FLAGS_dbgscore) {
+        fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>",
+                total_text_bytes, advance_by);
+      }
+    }
+  }     // End while (ss.GetOneScriptSpanLower())
+  // Deallocate full-document prediction table
+  delete[] predict_tbl;
+  // Flush pending totals
+  for (int tote_num = 0; tote_num < 4; ++tote_num) {
+    if (tote_seen[tote_num]) {
+      ScoreChunkIntoDoc2(kToteName[tote_num], advance_by,
+                         tote_script[tote_num], &totes[tote_num], &doc_tote,
+                         tote_grams[tote_num], lang_hint_boost);
+    }
+  }
+  // If extended languages are disallowed, remove them here
+  if (!allow_extended_lang) {
+    RemoveExtendedLanguages(&doc_tote);
+  }
+  // Force close pairs to one or the other
+  RefineScoredClosePairs(&doc_tote);
+  // Calculate return results
+  // Find top three byte counts in tote heap
+  int reliable_percent3[3];
+  // Cannot use Add, etc. after sorting
+  doc_tote.Sort(3);
+  ExtractLangEtc(&doc_tote, total_text_bytes,
+                 reliable_percent3, language3, percent3, normalized_score3,
+                 text_bytes, is_reliable);
+  // Decide whether this pass's answer is final or needs a refining pass
+  bool have_good_answer = false;
+  if (FlagFinish(flags)) {
+    // Force a result
+    have_good_answer = true;
+  } else if (total_text_bytes <= kShortTextThresh) {
+    // Don't recurse on short text -- we already did word scores
+    have_good_answer = true;
+  } else if (*is_reliable &&
+             (percent3[0] >= kGoodLang1Percent)) {
+    have_good_answer = true;
+  } else if (*is_reliable &&
+             ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
+    have_good_answer = true;
+  }
+  if (have_good_answer) {
+    // This is the real, non-recursive return
+    // Move bytes for unreliable langs to another lang or
+    // UNKNOWN
+    RemoveUnreliableLanguages(&doc_tote, do_remove_weak_matches);
+    // Redo the result extraction after the removal above
+    doc_tote.Sort(3);
+    ExtractLangEtc(&doc_tote, total_text_bytes,
+                   reliable_percent3, language3, percent3, normalized_score3,
+                   text_bytes, is_reliable);
+#if 0
+    // OLD code, replaced by CalcSummaryLang
+    //
+    // Suppress ignore-me text, TG_UNKNOWN_LANGUAGE if 2nd or 3rd language
+    // Force it to English if first language
+    if (language3[2] == TG_UNKNOWN_LANGUAGE) {
+      reliable_percent3[2] = 0;
+      language3[2] = UNKNOWN_LANGUAGE;
+      percent3[2] = 0;
+    } else if (language3[1] == TG_UNKNOWN_LANGUAGE) {
+      // Move up lower language
+      reliable_percent3[1] = reliable_percent3[2];
+      language3[1] = language3[2];
+      percent3[1] = percent3[2];
+      reliable_percent3[2] = 0;
+      language3[2] = UNKNOWN_LANGUAGE;
+      percent3[2] = 0;
+    } else if (language3[0] == TG_UNKNOWN_LANGUAGE) {
+      language3[0] = ENGLISH;
+    }
+    if (language3[0] == UNKNOWN_LANGUAGE) {
+      // Last-ditch test for some result, but it is UNKNOWN_LANGUAGE
+      // Force it to English (should not happen)
+      language3[0] = ENGLISH;
+      percent3[0] = 100;
+      *is_reliable = true;
+    }
+#endif
+#if 0
+    // Scaffolding to reveal subset sequence lang distribution across doc text
+    // Track the sequence of language fragments [result currently unused]
+    if (FLAGS_cld_html) {
+      static const int kMaxSubsetSeq = 12;
+      uint8 subseq[kMaxSubsetSeq];
+      doc_tote.ExtractSeq(kMaxSubsetSeq, subseq);
+      fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq);
+      for (int i = 0; i < kMaxSubsetSeq; ++i) {
+        fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i])));
+        if ((i % 4) == 3) {fprintf(stderr, "&nbsp; ");}
+      }
+      fprintf(stderr, "&nbsp;&nbsp; ");
+      for (int i = 0; i < 3; ++i) {
+        if (language3[i] != UNKNOWN_LANGUAGE) {
+          fprintf(stderr, "%s.%d(%d%%) ",
+                  ExtLanguageCode(language3[i]),
+                  reliable_percent3[i],
+                  percent3[i]);
+        }
+      }
+      fprintf(stderr, "%d B ", total_text_bytes);
+      fprintf(stderr, "<br>\n");
+    }
+    // End Scaffolding to reveal subset sequence lang distribution
+#endif
+    Language summary_lang;
+    if (do_pick_summary_language) {
+      CalcSummaryLang(&doc_tote, total_text_bytes,
+                      reliable_percent3, language3, percent3,
+                      &summary_lang, is_reliable);
+    } else {
+      summary_lang = language3[0];
+    }
+    if (FLAGS_cld_html) {
+      for (int i = 0; i < 3; ++i) {
+        if (language3[i] != UNKNOWN_LANGUAGE) {
+          fprintf(stderr, "%s.%d(%d%%) ",
+                  ExtLanguageCode(language3[i]),
+                  reliable_percent3[i],
+                  percent3[i]);
+        }
+      }
+      fprintf(stderr, "%d B ", total_text_bytes);
+      // Mark unreliable results with '*'. Test the pointed-to flag: the
+      // pointer itself is never NULL here, so testing it always printed ' '.
+      fprintf(stderr, "= %s%c ",
+              ExtLanguageName(summary_lang), *is_reliable ? ' ' : '*');
+      fprintf(stderr, "<br>\n");
+    }
+    return summary_lang;
+  }
+  // Not a good answer -- do recursive call to refine
+  if (FLAGS_cld_html || FLAGS_dbgscore) {
+    // This is what we hope to improve on in the recursive call, if any
+    PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
+  }
+  // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
+  // For this purpose, we treat "Ignore" as top40
+  Language new_plus_one = UNKNOWN_LANGUAGE;
+  if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) {
+    new_plus_one = language3[0];
+  } else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) {
+    new_plus_one = language3[1];
+  }
+  // NOTE(review): this branch looks unreachable -- any total_text_bytes that
+  // is <= kShortTextThresh already set have_good_answer and returned above.
+  if (total_text_bytes < kShortTextThresh) {
+      // Short text: Recursive call with top40 and short set
+      if (FLAGS_cld_html || FLAGS_dbgscore) {
+        fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
+                "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
+                total_text_bytes);
+      }
+      return DetectLanguageSummaryV25(
+                        tables,
+                        buffer,
+                        buffer_length,
+                        is_plain_text,
+                        do_pick_summary_language,
+                        do_remove_weak_matches,
+                        tld_hint,               // "id" boosts Indonesian
+                        encoding_hint,          // SJS boosts Japanese
+                        language_hint,          // ITALIAN boosts it
+                        allow_extended_lang,
+                        flags | kCLDFlagTop40 | kCLDFlagRepeats |
+                          kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
+                        new_plus_one,
+                        language3,
+                        percent3,
+                        normalized_score3,
+                        text_bytes,
+                        is_reliable);
+  }
+  // Longer text: Recursive call with top40 set.
+  // kCLDFlagFinish makes the nested call return a result without recursing
+  // again (see the FlagFinish checks above).
+  if (FLAGS_cld_html || FLAGS_dbgscore) {
+    fprintf(stderr,
+            "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
+            total_text_bytes);
+  }
+  return DetectLanguageSummaryV25(
+                        tables,
+                        buffer,
+                        buffer_length,
+                        is_plain_text,
+                        do_pick_summary_language,
+                        do_remove_weak_matches,
+                        tld_hint,               // "id" boosts Indonesian
+                        encoding_hint,          // SJS boosts Japanese
+                        language_hint,          // ITALIAN boosts it
+                        allow_extended_lang,
+                        flags | kCLDFlagTop40 | kCLDFlagRepeats |
+                          kCLDFlagFinish,
+                        new_plus_one,
+                        language3,
+                        percent3,
+                        normalized_score3,
+                        text_bytes,
+                        is_reliable);
+}   // End CompactLangDetImpl::DetectLanguageSummaryV25