RubyGems - language_detection - Versions diffs - 0.0.1 - Mend

language_detection 0.0.1

Files changed (100) hide show

data/.gitignore +19 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +85 -0
data/Rakefile +11 -0
data/ext/cld/Makefile +34 -0
data/ext/cld/base/basictypes.h +348 -0
data/ext/cld/base/build_config.h +124 -0
data/ext/cld/base/casts.h +156 -0
data/ext/cld/base/commandlineflags.h +443 -0
data/ext/cld/base/crash.h +41 -0
data/ext/cld/base/dynamic_annotations.h +358 -0
data/ext/cld/base/global_strip_options.h +59 -0
data/ext/cld/base/log_severity.h +46 -0
data/ext/cld/base/logging.h +1403 -0
data/ext/cld/base/macros.h +243 -0
data/ext/cld/base/port.h +54 -0
data/ext/cld/base/scoped_ptr.h +428 -0
data/ext/cld/base/stl_decl.h +0 -0
data/ext/cld/base/stl_decl_msvc.h +107 -0
data/ext/cld/base/string_util.h +29 -0
data/ext/cld/base/strtoint.h +93 -0
data/ext/cld/base/template_util.h +96 -0
data/ext/cld/base/type_traits.h +198 -0
data/ext/cld/base/vlog_is_on.h +143 -0
data/ext/cld/cld.so +0 -0
data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
data/ext/cld/encodings/internal/encodings.cc +12 -0
data/ext/cld/encodings/lang_enc.h +254 -0
data/ext/cld/encodings/proto/encodings.pb.h +169 -0
data/ext/cld/encodings/public/encodings.h +301 -0
data/ext/cld/extconf.rb +1 -0
data/ext/cld/language_detection.cc +88 -0
data/ext/cld/languages/internal/languages.cc +337 -0
data/ext/cld/languages/proto/languages.pb.h +179 -0
data/ext/cld/languages/public/languages.h +379 -0
data/language_detection.gemspec +28 -0
data/lib/language_detection/string.rb +1 -0
data/lib/language_detection/version.rb +3 -0
data/lib/language_detection.rb +54 -0
data/test/_helper.rb +15 -0
data/test/fixtures/languages.csv +80 -0
data/test/language_detection_test.rb +88 -0
metadata +250 -0

data/ext/cld/encodings/compact_lang_det/cldutil.cc ADDED Viewed

@@ -0,0 +1,905 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include <string>
+#include "encodings/compact_lang_det/cldutil.h"
+#include "encodings/compact_lang_det/cldutil_dbg.h"
+#include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
+#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
+#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
+#include "encodings/compact_lang_det/win/cld_logging.h"
+#include "encodings/compact_lang_det/win/cld_unilib.h"
+#include "encodings/compact_lang_det/win/cld_utf.h"
+#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
+// Runtime routines for hashing, looking up, and scoring
+// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
+// Unigrams and bigrams are for CJK languages only, including simplified/
+// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
+// Zhuang Han characters. Surrounding spaces are not considered.
+// Quadgrams and octagrams for for non-CJK and include two bits indicating
+// preceding and trailing spaces (word boundaries).
+// Indicator bits for leading/trailing space around quad/octagram
+// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
+// 1-, 2-, or 3-bytes each.
+static const uint32 kPreSpaceIndicator =  0x00004444;
+static const uint32 kPostSpaceIndicator = 0x44440000;
+// Little-endian masks for 0..24 bytes picked up as uint32's
+static const uint32 kWordMask0[4] = {
+  0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
+};
+static const int kMinCJKUTF8CharBytes = 3;
+static const int kMinGramCount = 3;
+static const int kMaxGramCount = 16;
+// Routines to access a hash table of <key:wordhash, value:probs> pairs
+// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
+// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
+// bucket subscript.
+// Probs is a packed: three languages plus a subscript for probability table
+// Buckets have all the keys together, then all the values.Key array never
+// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
+// Match case may sometimes take an additional cache miss on value access.
+//
+// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
+// byte buckets with single cache miss.
+// Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
+//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
+//------------------------------------------------------------------------------
+// Design principles for these hash functions
+// - Few operations
+// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
+//   Latin script expect 1- and 2-byte mixtures.
+// - Last byte of each character has about 5 bits of information
+// - Spread good bits around so they can interact in at least two ways
+//   with other characters
+// - Use add for additional mixing thorugh carries
+// CJK Three-byte bigram
+//   ....dddd..cccccc..bbbbbb....aaaa
+//   ..................ffffff..eeeeee
+// make
+//   ....dddd..cccccc..bbbbbb....aaaa
+//   000....dddd..cccccc..bbbbbb....a
+//   ..................ffffff..eeeeee
+//   ffffff..eeeeee000000000000000000
+//
+// CJK Four-byte bigram
+//   ..dddddd..cccccc....bbbb....aaaa
+//   ..hhhhhh..gggggg....ffff....eeee
+// make
+//   ..dddddd..cccccc....bbbb....aaaa
+//   000..dddddd..cccccc....bbbb....a
+//   ..hhhhhh..gggggg....ffff....eeee
+//   ..ffff....eeee000000000000000000
+// BIGRAM
+// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
+// OVERSHOOTS up to 3 bytes
+// For runtime use of tables
+uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {
+    return 0;
+  }
+  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
+  uint32 word0, word1;
+  if (bytecount <= 4) {
+    word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
+    word0 = word0 ^ (word0 >> 3);
+    return word0;
+  }
+  // Else do 8 bytes
+  word0 = word_ptr32[0];
+  word0 = word0 ^ (word0 >> 3);
+  word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
+  word1 = word1 ^ (word1 << 18);
+  return word0 + word1;
+}
+//
+// Ascii-7 One-byte chars
+//   ...ddddd...ccccc...bbbbb...aaaaa
+// make
+//   ...ddddd...ccccc...bbbbb...aaaaa
+//   000...ddddd...ccccc...bbbbb...aa
+//
+// Latin 1- and 2-byte chars
+//   ...ddddd...ccccc...bbbbb...aaaaa
+//   ...................fffff...eeeee
+// make
+//   ...ddddd...ccccc...bbbbb...aaaaa
+//   000...ddddd...ccccc...bbbbb...aa
+//   ...................fffff...eeeee
+//   ...............fffff...eeeee0000
+//
+// Non-CJK Two-byte chars
+//   ...ddddd...........bbbbb........
+//   ...hhhhh...........fffff........
+// make
+//   ...ddddd...........bbbbb........
+//   000...ddddd...........bbbbb.....
+//   ...hhhhh...........fffff........
+//   hhhh...........fffff........0000
+//
+// Non-CJK Three-byte chars
+//   ...........ccccc................
+//   ...................fffff........
+//   ...lllll...................iiiii
+// make
+//   ...........ccccc................
+//   000...........ccccc.............
+//   ...................fffff........
+//   ...............fffff........0000
+//   ...lllll...................iiiii
+//   .lllll...................iiiii00
+//
+// QUADGRAM
+// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
+// OVERSHOOTS up to 3 bytes
+// For runtime use of tables
+uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
+  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
+  uint32 word0, word1, word2;
+  if (bytecount <= 4) {
+    word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
+    word0 = word0 ^ (word0 >> 3);
+    return word0 ^ prepost;
+  } else if (bytecount <= 8) {
+    word0 = word_ptr32[0];
+    word0 = word0 ^ (word0 >> 3);
+    word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
+    word1 = word1 ^ (word1 << 4);
+    return (word0 ^ prepost) + word1;
+  }
+  // else do 12 bytes
+  word0 = word_ptr32[0];
+  word0 = word0 ^ (word0 >> 3);
+  word1 = word_ptr32[1];
+  word1 = word1 ^ (word1 << 4);
+  word2 = word_ptr32[2] & kWordMask0[bytecount & 3];
+  word2 = word2 ^ (word2 << 2);
+  return (word0 ^ prepost) + word1 + word2;
+}
+// QUADGRAM wrapper with surrounding spaces
+// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
+// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
+// For runtime use of tables
+uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {
+    return 0;
+  }
+  uint32 prepost = 0;
+  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
+  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
+  return QuadHashV25Mix(word_ptr, bytecount, prepost);
+}
+// QUADGRAM wrapper with surrounding underscores (offline use)
+// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
+// OVERSHOOTS up to 3 bytes
+// For offline construction of tables
+uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {
+    return 0;
+  }
+  const char* local_word_ptr = word_ptr;
+  int local_bytecount = bytecount;
+  uint32 prepost = 0;
+  if (local_word_ptr[0] == '_') {
+    prepost |= kPreSpaceIndicator;
+    ++local_word_ptr;
+    --local_bytecount;
+  }
+  if (local_word_ptr[local_bytecount - 1] == '_') {
+    prepost |= kPostSpaceIndicator;
+    --local_bytecount;
+  }
+  return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
+}
+// OCTAGRAM
+// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
+// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
+//
+// The low 32 bits follow the pattern from above, tuned to different scripts
+// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
+// For runtime use of tables V3
+uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
+  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
+  uint64 word0;
+  uint64 word1;
+  uint64 sum;
+  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
+  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
+  switch ((bytecount - 1) >> 2) {
+  case 0:       // 1..4 bytes
+    word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    break;
+  case 1:       // 5..8 bytes
+    word0 = word_ptr32[0];
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    break;
+  case 2:       // 9..12 bytes
+    word0 = word_ptr32[0];
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = word_ptr32[1];
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    word1 = word_ptr32[2] & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 << 2);
+    word0 += word1;
+    break;
+  case 3:       // 13..16 bytes
+    word0 = word_ptr32[0];
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = word_ptr32[1];
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    word1 = word_ptr32[2];
+    sum += word1;
+    word1 = word1 ^ (word1 << 2);
+    word0 += word1;
+    word1 = word_ptr32[3] & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 >> 8);
+    word0 += word1;
+    break;
+  case 4:       // 17..20 bytes
+    word0 = word_ptr32[0];
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = word_ptr32[1];
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    word1 = word_ptr32[2];
+    sum += word1;
+    word1 = word1 ^ (word1 << 2);
+    word0 += word1;
+    word1 = word_ptr32[3];
+    sum += word1;
+    word1 = word1 ^ (word1 >> 8);
+    word0 += word1;
+    word1 = word_ptr32[4] & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 >> 4);
+    word0 += word1;
+    break;
+  default:      // 21..24 bytes and higher (ignores beyond 24)
+    word0 = word_ptr32[0];
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = word_ptr32[1];
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    word1 = word_ptr32[2];
+    sum += word1;
+    word1 = word1 ^ (word1 << 2);
+    word0 += word1;
+    word1 = word_ptr32[3];
+    sum += word1;
+    word1 = word1 ^ (word1 >> 8);
+    word0 += word1;
+    word1 = word_ptr32[4];
+    sum += word1;
+    word1 = word1 ^ (word1 >> 4);
+    word0 += word1;
+    word1 = word_ptr32[5] & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 >> 6);
+    word0 += word1;
+    break;
+  }
+  sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
+  sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
+  sum = (sum & 0xff) << 32;
+  return (word0 ^ prepost) + sum;
+}
+// OCTAGRAM wrapper with surrounding spaces
+// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
+// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
+//
+// The low 32 bits follow the pattern from above, tuned to different scripts
+// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
+// For runtime use of tables V3
+uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {
+    return 0;
+  }
+  uint64 prepost = 0;
+  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
+  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
+  return OctaHash40Mix(word_ptr, bytecount, prepost);
+}
+// OCTAGRAM wrapper with surrounding underscores (offline use)
+// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
+// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
+//
+// The low 32 bits follow the pattern from above, tuned to different scripts
+// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
+// For offline construction of tables
+uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {
+    return 0;
+  }
+  const char* local_word_ptr = word_ptr;
+  int local_bytecount = bytecount;
+  uint64 prepost = 0;
+  if (local_word_ptr[0] == '_') {
+    prepost |= kPreSpaceIndicator;
+    ++local_word_ptr;
+    --local_bytecount;
+  }
+  if (local_word_ptr[local_bytecount - 1] == '_') {
+    prepost |= kPostSpaceIndicator;
+    --local_bytecount;
+  }
+  return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
+}
+//------------------------------------------------------------------------------
+// Scoring single groups of letters
+//------------------------------------------------------------------------------
+// UNIGRAM score one => tote
+// Input: 1-byte entry of subscript into unigram probs, plus
+//  an accumulator tote.
+// Output: running sums in tote updated
+void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
+  tote->AddGram();
+  const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
+  if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
+  if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
+  if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
+  if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
+  if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
+  if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
+}
+// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
+// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
+//  an accumulator tote. (language 0 means unused entry)
+// Output: running sums in tote updated
+void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
+  tote->AddGram();
+  uint8 prob123 = (probs >> 0) & 0xff;
+  const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
+  uint8 top1 = (probs >> 8) & 0xff;
+  if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
+  uint8 top2 = (probs >> 16) & 0xff;
+  if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
+  uint8 top3 = (probs >> 24) & 0xff;
+  if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
+}
+//------------------------------------------------------------------------------
+// Routines to accumulate probabilities
+//------------------------------------------------------------------------------
+// UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
+// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
+// Score up to n unigrams, returning number of bytes consumed
+// Updates tote_grams
+int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
+                      const char* isrc, int srclen, int advance_by,
+                      int* tote_grams, int gram_limit, Tote* chunk_tote) {
+  const char* src = isrc;
+  if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
+  // Property-based CJK unigram lookup
+  if (src[0] == ' ') {++src; --srclen;}
+  const uint8* usrc = reinterpret_cast<const uint8*>(src);
+  int usrclen = srclen;
+  while (usrclen > 0) {
+    int len = kAdvanceOneChar[usrc[0]];
+    // Look up property of one UTF-8 character and advance over it
+    // Return 0 if input length is zero
+    // Return 0 and advance one byte if input is ill-formed
+    int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
+    if (FLAGS_dbglookup) {
+      DbgUniTermToStderr(propval, usrc, len);
+    }
+    if (propval > 0) {
+      ProcessProbV25UniTote(propval, chunk_tote);
+      ++(*tote_grams);
+      if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
+    }
+    // Advance by 1/2/4/8 characters (half of quad advance)
+    if (advance_by == 2) {
+      // Already advanced by 1
+    } else if (advance_by == 4) {
+      // Advance by 2 chars total, if not at end
+      if (UTFmax <= usrclen) {
+        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+      }
+    } else if (advance_by == 8) {
+      // Advance by 4 chars total, if not at end
+      if ((UTFmax * 3) <= usrclen) {
+        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+      }
+    } else {
+      // Advance by 8 chars total, if not at end
+      if ((UTFmax * 7) <= usrclen) {
+        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
+      }
+    }
+    DCHECK(usrclen >= 0);
+    if (*tote_grams >= gram_limit) {
+      break;
+    }
+  }
+  if (FLAGS_dbgscore) {
+    // With advance_by>2, we consume more input to get the same number of quads
+    int len = src - isrc;
+    DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
+    DbgScoreFlush();
+  }
+  int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
+  return consumed2;
+}
+// BIGRAM, using hash table, always advancing by 1 char
+// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
+// Score all bigrams in isrc, using languages that have bigrams (CJK)
+// Return number of bigrams that hit in the hash table
+int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
+                         const char* isrc, int srclen, Tote* chunk_tote) {
+  int hit_count = 0;
+  const char* src = isrc;
+  // Hashtable-based CJK bigram lookup
+  const uint8* usrc = reinterpret_cast<const uint8*>(src);
+  const uint8* usrclimit1 = usrc + srclen - UTFmax;
+  if (FLAGS_dbgscore) {
+    fprintf(stderr, "  " );
+  }
+  while (usrc < usrclimit1) {
+    int len = kAdvanceOneChar[usrc[0]];
+    int len2 = kAdvanceOneChar[usrc[len]] + len;
+    if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
+      // Lookup and score this bigram
+      // Always ignore pre/post spaces
+      uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
+      uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
+      // Now go indirect on the subscript
+      probs = bigram_obj->kCLDTableInd[probs &
+        ~bigram_obj->kCLDTableKeyMask];
+      // Process the bigram
+      if (FLAGS_dbglookup) {
+        const char* ssrc = reinterpret_cast<const char*>(usrc);
+        DbgBiTermToStderr(bihash, probs, ssrc, len2);
+        DbgScoreRecord(NULL, probs, len2);
+      } else if (FLAGS_dbgscore && (probs != 0)) {
+        const char* ssrc = reinterpret_cast<const char*>(usrc);
+        DbgScoreRecord(NULL, probs, len2);
+        string temp(ssrc, len2);
+        fprintf(stderr, "%s ", temp.c_str());
+      }
+      if (probs != 0) {
+        ProcessProbV25Tote(probs, chunk_tote);
+        ++hit_count;
+      }
+    }
+    usrc += len;  // Advance by one char
+  }
+  if (FLAGS_dbgscore) {
+    fprintf(stderr, "[%d bigrams scored]\n", hit_count);
+    DbgScoreState();
+  }
+  return hit_count;
+}
+// QUADGRAM, using hash table, advancing by 2/4/8/16 chars
+// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
+// Score up to n quadgrams, returning number of bytes consumed
+// Updates tote_grams
+int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
+                       const char* isrc, int srclen, int advance_by,
+                       int* tote_grams, int gram_limit, Tote* chunk_tote) {
+  const char* src = isrc;
+  const char* srclimit = src + srclen;
+  // Limit is end, which has extra 20 20 20 00 past len
+  const char* srclimit7 = src + srclen - (UTFmax * 7);
+  const char* srclimit15 = src + srclen - (UTFmax * 15);
+  if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
+  // Run a little cache of last hits to catch overly-repetitive "text"
+  int next_prior = 0;
+  uint32 prior_quads[2] = {0, 0};
+  // Visit all quadgrams
+  if (src[0] == ' ') {++src;}
+  while (src < srclimit) {
+    // Find one quadgram
+    const char* src_end = src;
+    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+    const char* src_mid = src_end;
+    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+    int len = src_end - src;
+    // Lookup and score this quadgram
+    uint32 quadhash = QuadHashV25(src, len);
+    uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
+    // Now go indirect on the subscript
+    probs = quadgram_obj->kCLDTableInd[probs &
+      ~quadgram_obj->kCLDTableKeyMask];
+    // Process the quadgram
+    if (FLAGS_dbglookup) {
+      DbgQuadTermToStderr(quadhash, probs, src, len);
+    }
+    if (probs != 0) {
+      // Filter out recent repeats. If this works out, use in the other lookups
+      if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
+        prior_quads[next_prior] = quadhash;
+        next_prior = (next_prior + 1) & 1;
+        ProcessProbV25Tote(probs, chunk_tote);
+        ++(*tote_grams);
+        if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
+      }
+    }
+    // Advance all the way past word if at end-of-word
+    if (src_end[0] == ' ') {
+      src_mid = src_end;
+    }
+    // Advance by 2/4/8/16 characters
+    if (advance_by == 2) {
+      src = src_mid;
+    } else if (advance_by == 4) {
+      src = src_end;
+    } else if (advance_by == 8) {
+      // Advance by 8 chars total (4 more), if not at end
+      if (src < srclimit7) {
+        src_end += kAdvanceOneChar[(uint8)src_end[0]];
+        src_end += kAdvanceOneChar[(uint8)src_end[0]];
+        src_end += kAdvanceOneChar[(uint8)src_end[0]];
+        src_end += kAdvanceOneChar[(uint8)src_end[0]];
+      }
+      src = src_end;
+    } else {
+      // Advance by 16 chars total (12 more), if not at end
+      if (src < srclimit15) {
+        // Advance by ~16 chars by adding 3 * current bytelen
+        int fourcharlen = src_end - src;
+        src = src_end + (3 * fourcharlen);
+        // Advance a bit more if mid-character
+        src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
+        src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
+      } else {
+        src = src_end;
+      }
+    }
+    DCHECK(src < srclimit);
+    src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
+    if (*tote_grams >= gram_limit) {
+      break;
+    }
+  }
+  if (FLAGS_dbgscore) {
+    // With advance_by>2, we consume more input to get the same number of quads
+    int len = src - isrc;
+    DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
+    DbgScoreFlush();
+  }
+  int consumed = src - isrc;
+  // If advancing by more than 2, src may have overshot srclimit
+  if (consumed > srclen) {
+    consumed = srclen;
+  }
+  return consumed;
+}
+// OCTAGRAM, using hash table, always advancing by 1 word
+// Caller supplies table, such as &kLongWord8Table_obj
+// Score all words in isrc, using languages that have quadgrams
+// We don't normally use this routine except on the first quadgram run,
+// but it can be used to resolve unreliable pages.
+// This routine does not have an optimized advance_by
+// SOON: Uses indirect language/probability longword
+//
+// Return number of words that hit in the hash table
+int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
+                       const char* isrc, int srclen, Tote* chunk_tote) {
+  int hit_count = 0;
+  const char* src = isrc;
+  const char* srclimit = src + srclen + 1;
+  // Limit is end+1, to include extra space char (0x20) off the end
+  //
+  // Score all words truncated to 8 characters
+  int charcount = 0;
+  // Skip any initial space
+  if (src[0] == ' ') {++src;}
+  const char* word_ptr = src;
+  const char* word_end = word_ptr;
+  if (FLAGS_dbgscore) {
+    fprintf(stderr, "  " );
+  }
+  while (src < srclimit) {
+    // Terminate previous word or continue current word
+    if (src[0] == ' ') {
+      int bytecount = word_end - word_ptr;
+      if (bytecount == 0)
+        break;
+      // Lookup and score this word
+      uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
+      uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
+      // Now go indirect on the subscript
+      probs = octagram_obj->kCLDTableInd[probs &
+        ~octagram_obj->kCLDTableKeyMask];
+      // // Lookup and score this word
+      // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
+      // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
+      //                                kLongWord8TableSize);
+      //
+      if (FLAGS_dbglookup) {
+        DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
+        DbgScoreRecord(NULL, probs, bytecount);
+      } else if (FLAGS_dbgscore && (probs != 0)) {
+        DbgScoreRecord(NULL, probs, bytecount);
+        string temp(word_ptr, bytecount);
+        fprintf(stderr, "%s ", temp.c_str());
+      }
+      if (probs != 0) {
+        ProcessProbV25Tote(probs, chunk_tote);
+        ++hit_count;
+      }
+      charcount = 0;
+      word_ptr = src + 1;   // Over the space
+      word_end = word_ptr;
+    } else {
+      ++charcount;
+    }
+    // Advance to next char
+    src += cld_UniLib::OneCharLen(src);
+    if (charcount <= 8) {
+      word_end = src;
+    }
+  }
+  if (FLAGS_dbgscore) {
+    fprintf(stderr, "[%d words scored]\n", hit_count);
+    DbgScoreState();
+  }
+  return hit_count;
+}
+//------------------------------------------------------------------------------
+// Reliability calculations, for single language and between languages
+//------------------------------------------------------------------------------
+// Return reliablity of result 0..100 for top two scores
+// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
+// (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)
+// Threshold is uni/quadgram increment count, bounded above and below.
+//
+// Requiring a factor of 3 improvement (e.g. +1 log base 3)
+// for each scored quadgram is too stringent, so I've backed this off to a
+// factor of 2 (e.g. +5/8 log base 3).
+//
+// I also somewhat lowered the Min/MaxGramCount limits above
+//
+// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
+//
+int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
+  int max_reliability_percent = 100;
+  if (gramcount < 8) {
+    max_reliability_percent = 12 * gramcount;
+  }
+  int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above
+  if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16
+    fully_reliable_thresh = kMinGramCount;
+  } else if (fully_reliable_thresh > kMaxGramCount) {
+    fully_reliable_thresh = kMaxGramCount;
+  }
+  int delta = value1 - value2;
+  if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
+  if (delta <= 0) {return 0;}
+  return cld::minint(max_reliability_percent,
+                     (100 * delta) / fully_reliable_thresh);
+}
+// Return reliablity of result 0..100 for top score vs. mainsteam score
+// Values are score per 1024 bytes of input
+// ratio = max(top/mainstream, mainstream/top)
+// ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
+// Change: short-text word scoring can give unusually good results.
+//  Let top exceed mainstream by 4x at 50% reliable
+int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
+  if (mean_score == 0) {return 100;}    // No reliability data available yet
+  if (topscore == 0) {return 0;}        // zero score = unreliable
+  if (len == 0) {return 0;}             // zero len = unreliable
+  int top_kb = (topscore << 10) / len;
+  double ratio;
+  double ratio_cutoff;
+  if (top_kb > mean_score) {
+    ratio = (1.0 * top_kb) / mean_score;
+    ratio_cutoff = 5.0;                 // ramp down from 100% to 0%: 3.0-5.0
+  } else {
+    ratio = (1.0 * mean_score) / top_kb;
+    ratio_cutoff = 4.0;                 // ramp down from 100% to 0%: 2.0-4.0
+  }
+  if (ratio <= ratio_cutoff - 2.0) {return 100;}
+  if (ratio > ratio_cutoff) {return 0;}
+  int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
+  return iratio;
+}
+// Calculate ratio of score per 1KB vs. expected score per 1KB
+double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
+                          int bytes, int score) {
+  // Average training-data score for this language-script combo, per 1KB
+  int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
+  if (lscript == ULScript_Common) {
+    // We don't know the script (only happens with second-chance score)
+    // Look for first non-zero mean value
+    for (int i = 0; i < 3; ++i) {
+      if (kMeanScore[lang * 4 + i] > 0) {
+        expected_score = kMeanScore[lang * 4 + i];
+      }
+    }
+  }
+  if (expected_score < 100) {
+      expected_score = 1000;
+  }
+  // Our score per 1KB
+  double our_score = (score << 10) / (bytes ? bytes : 1);  // Avoid zdiv
+  double ratio = our_score / expected_score;
+  // Just the raw count normalized as though each language has mean=1000;
+  ratio = (score * 1000.0) /  expected_score;
+  return ratio;
+}
+// Calculate reliablity of len bytes of script lscript with chunk_tote
+int cld::GetReliability(int len, UnicodeLScript lscript,
+                   const Tote* chunk_tote) {
+  Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
+  // Average score for this language-script combo
+  int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
+  if (lscript == ULScript_Common) {
+    // We don't know the script (only happens with second-chance score)
+    // Look for first non-zero mean value
+    for (int i = 0; i < 3; ++i) {
+      if (kMeanScore[cur_lang * 4 + i] > 0) {
+        mean_score = kMeanScore[cur_lang * 4 + i];
+      }
+    }
+  }
+  int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
+                                           chunk_tote->Value(1),
+                                           chunk_tote->GetGramCount());
+  int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
+                                               len,
+                                               mean_score);
+  int reliability_min = minint(reliability_delta, reliability_main);
+  if (FLAGS_dbgreli) {
+    char temp1[4];
+    char temp2[4];
+    cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
+    if (temp1[2] == ' ') {temp1[2] = '\0';}
+    cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
+    if (temp2[2] == ' ') {temp2[2] = '\0';}
+    int srclen = len;
+    fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d,  %s=%d %s=%d "
+                   "top/KB=%d mean/KB=%d del=%d%% reli=%d%%   "
+                   "lang/lscript %d %d\n",
+           chunk_tote->GetGramCount(),
+           chunk_tote->GetIncrCount(),
+           srclen,
+           temp1, chunk_tote->Value(0),
+           temp2, chunk_tote->Value(1),
+           (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
+           mean_score,
+           reliability_delta,
+           reliability_main,
+           cur_lang, lscript);
+  }
+  return reliability_min;
+}
+//------------------------------------------------------------------------------
+// Miscellaneous
+//------------------------------------------------------------------------------
+// Demote all languages except Top40 and plus_one
+// Do this just before sorting chunk_tote results
+void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
+  for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
+    if (chunk_tote->Key(sub) == 0) continue;
+    if (chunk_tote->Key(sub) == packed_plus_one) continue;
+    if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
+    // Quarter the score of others
+    chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
+  }
+}