RubyGems - language_detection - Versions diffs - 0.0.1 - Mend

language_detection 0.0.1

Files changed (100) hide show

data/.gitignore +19 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +85 -0
data/Rakefile +11 -0
data/ext/cld/Makefile +34 -0
data/ext/cld/base/basictypes.h +348 -0
data/ext/cld/base/build_config.h +124 -0
data/ext/cld/base/casts.h +156 -0
data/ext/cld/base/commandlineflags.h +443 -0
data/ext/cld/base/crash.h +41 -0
data/ext/cld/base/dynamic_annotations.h +358 -0
data/ext/cld/base/global_strip_options.h +59 -0
data/ext/cld/base/log_severity.h +46 -0
data/ext/cld/base/logging.h +1403 -0
data/ext/cld/base/macros.h +243 -0
data/ext/cld/base/port.h +54 -0
data/ext/cld/base/scoped_ptr.h +428 -0
data/ext/cld/base/stl_decl.h +0 -0
data/ext/cld/base/stl_decl_msvc.h +107 -0
data/ext/cld/base/string_util.h +29 -0
data/ext/cld/base/strtoint.h +93 -0
data/ext/cld/base/template_util.h +96 -0
data/ext/cld/base/type_traits.h +198 -0
data/ext/cld/base/vlog_is_on.h +143 -0
data/ext/cld/cld.so +0 -0
data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
data/ext/cld/encodings/internal/encodings.cc +12 -0
data/ext/cld/encodings/lang_enc.h +254 -0
data/ext/cld/encodings/proto/encodings.pb.h +169 -0
data/ext/cld/encodings/public/encodings.h +301 -0
data/ext/cld/extconf.rb +1 -0
data/ext/cld/language_detection.cc +88 -0
data/ext/cld/languages/internal/languages.cc +337 -0
data/ext/cld/languages/proto/languages.pb.h +179 -0
data/ext/cld/languages/public/languages.h +379 -0
data/language_detection.gemspec +28 -0
data/lib/language_detection/string.rb +1 -0
data/lib/language_detection/version.rb +3 -0
data/lib/language_detection.rb +54 -0
data/test/_helper.rb +15 -0
data/test/fixtures/languages.csv +80 -0
data/test/language_detection_test.rb +88 -0
metadata +250 -0

data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc ADDED Viewed

@@ -0,0 +1,570 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include "encodings/compact_lang_det/getonescriptspan.h"
+#include <stdio.h>
+#include <string.h>
+#include "encodings/lang_enc.h"
+#include "encodings/compact_lang_det/utf8propjustletter.h"
+#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
+#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
+#include "encodings/compact_lang_det/win/cld_basictypes.h"
+#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
+#include "encodings/compact_lang_det/win/cld_google.h"
+#include "encodings/compact_lang_det/win/cld_htmlutils.h"
+#include "encodings/compact_lang_det/win/cld_unilib.h"
+#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
+#include "encodings/compact_lang_det/win/cld_utf8utils.h"
+// Language code 254. NOTE(review): not referenced by the code in this file;
+// presumably a sentinel used by callers -- confirm before removing.
+static const Language GRAY_LANG = static_cast<Language>(254);
+// Word-boundary rounding limits (not referenced by the code in this file).
+// A span shorter than kMaxUpToWordBoundary bytes is rounded by lengthening
+// it; any other span is rounded by shortening it.
+static const int kMaxUpToWordBoundary = 50;
+// Rounding to a word boundary moves at most this many bytes, in the
+// direction chosen by the rule above.
+static const int kMaxAdvanceToWordBoundary = 10;
+static const char kSpecialSymbol[256] = {       // byte-indexed; 1 only for '&' (0x26), '<' (0x3c), '>' (0x3e)
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00-0x1f
+  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,  // 0x20-0x3f: & < >
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x40-0x5f
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x60-0x7f
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x80-0x9f
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xa0-0xbf
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xc0-0xdf
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xe0-0xff
+};
+#define LT 0      // <   (these macros are column numbers of kTagParseTbl_0)
+#define GT 1      // >
+#define EX 2      // !
+#define HY 3      // -
+#define QU 4      // "
+#define AP 5      // '
+#define SL 6      // /
+#define S_ 7      // S or s
+#define C_ 8      // C or c
+#define R_ 9      // R or r
+#define I_ 10     // I or i
+#define P_ 11     // P or p
+#define T_ 12     // T or t
+#define Y_ 13     // Y or y
+#define L_ 14     // L or l
+#define E_ 15     // E or e
+#define CR 16     // <cr> or <lf>
+#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
+#define PL 18     // possible letter, incl. &
+#define xx 19     // <unused>; no byte below maps to it
+// Map each byte value to one of the 20 categories above for cheap tag parsing
+static const uint8 kCharToSub[256] = {            // 16 byte values per row
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,  // 0x00-0x0f: <lf>=0x0a, <cr>=0x0d
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0x10-0x1f
+  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,  // 0x20-0x2f: ! " & ' - /
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,  // 0x30-0x3f: digits, < >
+  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,  // 0x40-0x4f: @ A-O
+  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,  // 0x50-0x5f: P-Z [ \ ] ^ _
+  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,  // 0x60-0x6f: ` a-o
+  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,  // 0x70-0x7f: p-z { | } ~ DEL
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0x80-0x8f
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0x90-0x9f
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0xa0-0xaf
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0xb0-0xbf
+  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xc0-0xcf
+  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xd0-0xdf
+  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xe0-0xef
+  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xf0-0xff
+};
+#undef LT
+#undef GT
+#undef EX
+#undef HY
+#undef QU
+#undef AP
+#undef SL
+#undef S_
+#undef C_
+#undef R_
+#undef I_
+#undef P_
+#undef T_
+#undef Y_
+#undef L_
+#undef E_
+#undef CR
+#undef NL
+#undef PL
+#undef xx
+#define OK 0      // exit state: ScanToPossibleLetter stops without consuming the byte
+#define X_ 1      // exit state: error; appears in the '<' column of in-tag states and in the unused xx column
+// State machine to do cheap parse of non-letter strings incl. tags; one row of 20 columns per state, indexed by kCharToSub category
+// advances <tag>
+//          |    |
+// advances <tag> ... </tag>  for <script> <style>
+//          |               |
+// advances <!-- ... <tag> ... -->
+//          |                     |
+// advances <tag
+//          ||  (0)
+// advances <tag <tag2>
+//          ||  (0)
+static const uint8 kTagParseTbl_0[] = {
+// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
+   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
+  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
+   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
+  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
+  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
+  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
+   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
+   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
+   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
+  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
+  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
+  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
+// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
+  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
+  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
+  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
+  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
+  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
+  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
+  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
+  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
+  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
+  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
+  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
+  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
+// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
+  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
+  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
+  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
+  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
+  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
+  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
+  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
+  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
+  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
+};
+#undef OK
+#undef X_
+/*
+// Convert GetTimeOfDay output to 64-bit usec
+static inline uint64 Microseconds(const struct timeval& t) {
+  // The SumReducer uses uint64, so convert to (uint64) microseconds,
+  // not (double) seconds.
+  return t.tv_sec * 1000000ULL + t.tv_usec;
+}
+*/
+// True exactly when c is one of the three markup bytes '<', '>' or '&'.
+// Only bytes 0x20..0x3f can be special, so everything else is rejected by a
+// single mask test before the table is consulted.
+bool inline IsSpecial(char c) {
+  const uint8 byte = static_cast<uint8>(c);
+  const bool in_punctuation_range = ((byte & 0xe0) == 0x20);
+  return in_punctuation_range && kSpecialSymbol[byte];
+}
+// Fast skip over src[0..len) using the utf8scannotjustletterspecial state
+// table. Returns the number of bytes the scanner consumed, i.e. the offset
+// of the next letter or of '<', '>' or '&'; end of input is handled like a
+// letter, so the scan simply stops there.
+int ScanToLetterOrSpecial(const char* src, int len) {
+  int consumed;
+  cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len, &consumed);
+  return consumed;
+}
+// Cheap tag skipper driven by kTagParseTbl_0 / kCharToSub.
+// isrc points at a non-letter, such as a tag-opening '<'. Returns how many
+// bytes to advance to reach the next possible letter:
+//   <tag>                     returns the offset just past '>'
+//   <script> ... </script>    returns the offset just past the closing tag
+//   <style> ... </style>      (same)
+//   <!-- ... <tag> ... -->    returns the offset just past '-->'
+//   <tag <tag2>               error: returns the offset just past the '<'
+//                             found by scanning backwards from the second one
+//   input ends inside a tag   returns len (treated as if '>' had been seen)
+int ScanToPossibleLetter(const char* isrc, int len) {
+  const uint8* const first = reinterpret_cast<const uint8*>(isrc);
+  const uint8* const limit = first + len;
+  const uint8* cursor = first;
+  const uint8* row = kTagParseTbl_0;      // Transition row of current state
+  int state = 0;
+  // Run the state machine. States 0 (done) and 1 (error) end the scan, and
+  // the byte that produced them is deliberately left unconsumed.
+  for (; cursor < limit; ++cursor) {
+    state = row[kCharToSub[*cursor]];
+    if ((state == 0) || (state == 1)) {
+      break;
+    }
+    row = &kTagParseTbl_0[state * 20];    // 20 columns per state
+  }
+  if (cursor >= limit) {
+    // Ran off the end of the text. The usual cause is a truncated file
+    // rather than mismatched angle brackets, so act as if '>' was seen.
+    return len;
+  }
+  if ((state == 0) || (state == 2)) {
+    // Clean exit: cursor sits on the first byte that may be a letter
+    return cursor - first;
+  }
+  // Error exit: a '<' arrived while an earlier '<' was still open. Step back
+  // over the byte that stopped us, keep backing up to the nearest '<' (or to
+  // offset 0), and report just enough length to get past that '<'.
+  int back = (cursor - first) - 1;
+  while ((back > 0) && (isrc[back] != '<')) {
+    --back;
+  }
+  return back + 1;
+}
+// Starts a scan at the front of buffer[0..buffer_length). is_plain_text
+// turns off the special handling of '<', '>' and '&'. Two working buffers
+// are allocated here and released by the destructor.
+ScriptScanner::ScriptScanner(const char* buffer,
+                             int buffer_length,
+                             bool is_plain_text)
+    : start_byte_(buffer),
+      next_byte_(buffer),
+      next_byte_limit_(buffer + buffer_length),
+      byte_length_(buffer_length),
+      is_plain_text_(is_plain_text),
+      script_buffer_(new char[getone::kMaxScriptBuffer]),
+      script_buffer_lower_(new char[getone::kMaxScriptLowerBuffer]) {
+}
+// Releases the two working buffers allocated by the constructor.
+ScriptScanner::~ScriptScanner() {
+  delete[] script_buffer_lower_;
+  delete[] script_buffer_;
+}
+// Skips tags, non-letters and entities that do not expand to a letter, to
+// reach the first real letter (or entity that is a letter) in src[0..len).
+//   src, len  text still to be scanned
+//   script    out: script number of the letter found. It is now written on
+//             every path; when no letter is found it holds the last value
+//             computed (0 after any skipped item, UNKNOWN_LSCRIPT if nothing
+//             was examined).
+// Returns the byte offset of that letter, or len when the fast scan reaches
+// the end of the text without finding one.
+int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
+  int sc = UNKNOWN_LSCRIPT;
+  int skip = 0;
+  int tlen = 0;               // Input bytes covered by the current item
+  int plen = 0;               // Output bytes of an expanded entity (unused)
+  // Do run of non-letters (tag | &NL | NL)*
+  while (skip < len) {
+    // Do fast scan to next interesting byte
+    skip += ScanToLetterOrSpecial(src + skip, len - skip);
+    // Check for no more letters/specials
+    if (skip >= len) {
+      // All done. Still publish the script so the caller never sees an
+      // unwritten out-parameter.
+      *script = sc;
+      return len;
+    }
+    // We are at a letter, nonletter, tag, or entity
+    if (IsSpecial(src[skip]) && !is_plain_text_) {
+      if (src[skip] == '<') {
+        // Beginning of tag; skip to end and go around again
+        tlen = ScanToPossibleLetter(src + skip, len - skip);
+        sc = 0;
+      } else if (src[skip] == '>') {
+        // Unexpected end of tag; skip it and go around again
+        tlen = 1;         // Over the >
+        sc = 0;
+      } else if (src[skip] == '&') {
+        // Expand entity into a scratch buffer only to learn its script
+        char temp[4];
+        EntityToBuffer(src + skip, len - skip,
+                       temp, &tlen, &plen);
+        sc = getone::GetUTF8LetterScriptNum(temp);
+      }
+    } else {
+      // Ordinary character of 1..4 bytes
+      tlen = cld_UniLib::OneCharLen(src + skip);
+      sc = getone::GetUTF8LetterScriptNum(src + skip);
+    }
+    if (sc != 0) {break;}           // Letter found
+    skip += tlen;                   // Advance
+  }
+  *script = sc;
+  return skip;
+}
+// Copy next run of same-script non-tag letters to buffer [NUL terminated].
+// The output in script_buffer_ starts with one space, carries one space for
+// each run of non-letters, and is followed by three spaces and a NUL that
+// are not counted in span->text_bytes. Entities are expanded on the way.
+//   span  out: text/text_bytes describe the copied run, offset is the
+//         position in the original input where this call started, script is
+//         the script of the run, truncated is set when the buffer filled up
+//         before the run ended. lang is reset to UNKNOWN_LANGUAGE.
+// Returns false when no more letters are to be found, true otherwise.
+// Advances the scanner's input position past everything consumed.
+bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
+  span->text = script_buffer_;
+  span->text_bytes = 0;
+  span->offset = next_byte_ - start_byte_;
+  span->script = UNKNOWN_LSCRIPT;
+  span->lang = UNKNOWN_LANGUAGE;
+  span->truncated = false;
+  int spanscript = UNKNOWN_LSCRIPT;   // The script of this span
+  int sc = UNKNOWN_LSCRIPT;           // The script of next character
+  int tlen = 0;                       // Input bytes taken by current item
+  int plen = 0;                       // Output bytes put by current item
+  script_buffer_[0] = ' ';  // Always a space at front of output
+  script_buffer_[1] = '\0';
+  int take = 0;
+  int put = 1;              // Start after the initial space
+  // Get to the first real non-tag letter or entity that is a letter
+  int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
+  next_byte_ += skip;
+  byte_length_ -= skip;
+  if (byte_length_ <= 0) {
+    return false;               // No more letters to be found
+  }
+  // There is at least one letter, so we know the script for this span
+  span->script = (UnicodeLScript)spanscript;
+  // Go over alternating spans of same-script letters and non-letters,
+  // copying letters to buffer with single spaces for each run of non-letters
+  while (take < byte_length_) {
+    // Copy run of letters in same script (&LS | LS)*
+    bool need_break = false;
+    while (take < byte_length_) {
+      // We are at a letter, nonletter, tag, or entity
+      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
+        if (next_byte_[take] == '<') {
+          // Beginning of tag
+          sc = 0;
+          break;
+        } else if (next_byte_[take] == '>') {
+          // Unexpected end of tag
+          sc = 0;
+          break;
+        } else if (next_byte_[take] == '&') {
+          // Copy entity, no advance
+          EntityToBuffer(next_byte_ + take, byte_length_ - take,
+                         script_buffer_ + put, &tlen, &plen);
+          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
+        }
+      } else {
+        // Real letter, safely copy up to 4 bytes, increment by 1..4
+        // Will update by 1..4 bytes at Advance, below
+        tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
+        if (take < (byte_length_ - 3)) {
+          // Fast case: at least 4 input bytes remain, so always copy 4.
+          // memcpy rather than a uint32 load/store through cast pointers:
+          // neither pointer is guaranteed to be 4-byte aligned, and the
+          // cast form also breaks the aliasing rules.
+          memcpy(script_buffer_ + put, next_byte_ + take, 4);
+        } else {
+          // Slow case, happens 1-3 times per input document
+          memcpy(script_buffer_ + put, next_byte_ + take, plen);
+        }
+        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
+      }
+      // Allow continue across a single letter in a different script:
+      // A B D = three scripts, c = common script, i = inherited script,
+      // - = don't care, ( = take position before the += below
+      //  AAA(A-    continue
+      //
+      //  AAA(BA    continue
+      //  AAA(BB    break
+      //  AAA(Bc    continue (breaks after B)
+      //  AAA(BD    break
+      //  AAA(Bi    break
+      //
+      //  AAA(c-    break
+      //
+      //  AAA(i-    continue
+      //
+      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
+        // Might need to break this script span
+        if (sc == ULScript_Common) {
+          need_break = true;
+        } else {
+          // Look at next following character, ignoring entity as Common.
+          // At the very end of the input there is no following character;
+          // do not read past the buffer, treat it as Common instead.
+          // NOTE(review): presumably what a terminating NUL would have
+          // produced -- confirm ULScript_Common is script 0.
+          int sc2 = ULScript_Common;
+          if ((take + tlen) < byte_length_) {
+            sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
+          }
+          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
+            need_break = true;
+          }
+        }
+      }
+      if (need_break) {break;}  // Non-letter or letter in wrong script
+      take += tlen;                   // Advance
+      put += plen;                    // Advance
+      if (put >= getone::kMaxScriptBytes) {
+        // Buffer is full
+        span->truncated = true;
+        break;
+      }
+    }     // End while letters
+    // Do run of non-letters (tag | &NL | NL)*
+    while (take < byte_length_) {
+      // Do fast scan to next interesting byte
+      take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
+      // Check for no more letters/specials
+      if (take >= byte_length_) {
+        take = byte_length_;
+        break;
+      }
+      // We are at a letter, nonletter, tag, or entity
+      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
+        if (next_byte_[take] == '<') {
+          // Beginning of tag; skip to end and go around again
+          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
+          sc = 0;
+        } else if (next_byte_[take] == '>') {
+          // Unexpected end of tag; skip it and go around again
+          tlen = 1;         // Over the >
+          sc = 0;
+        } else if (next_byte_[take] == '&') {
+          // Expand entity, no advance of put: only its script is needed
+          EntityToBuffer(next_byte_ + take, byte_length_ - take,
+                         script_buffer_ + put, &tlen, &plen);
+          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
+        }
+      } else {
+        // Update 1..4
+        tlen = cld_UniLib::OneCharLen(next_byte_ + take);
+        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
+      }
+      if (sc != 0) {break;}           // Letter found
+      take += tlen;                   // Advance
+    }     // End while not-letters
+    script_buffer_[put++] = ' ';
+    // We are at a letter again (or eos), after letter* not-letter*
+    if (sc != spanscript) {break;}            // Letter in wrong script
+    if (put >= getone::kMaxScriptBytes - 8) {
+      // Buffer is almost full
+      span->truncated = true;
+      break;
+    }
+  }
+  // Update input position
+  next_byte_ += take;
+  byte_length_ -= take;
+  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
+  //                          kMaxScriptBytes |   | put
+  script_buffer_[put + 0] = ' ';
+  script_buffer_[put + 1] = ' ';
+  script_buffer_[put + 2] = ' ';
+  script_buffer_[put + 3] = '\0';
+  span->text_bytes = put;       // Does not include the last four chars above
+  return true;
+}
+// Lowercases the text of a Latin, Cyrillic or Greek span; spans in any other
+// script are left exactly as they are.
+void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
+  // Windows builds lowercase the text beforehand, so nothing is done there.
+#if !defined(CLD_WINDOWS)
+  // Lowercasing waits until entities have been expanded; done any sooner it
+  // could miss an entity such as &Aacute;
+  const bool is_cased_script = (span->script == ULScript_Latin) ||
+                               (span->script == ULScript_Cyrillic) ||
+                               (span->script == ULScript_Greek);
+  if (!is_cased_script) {
+    return;
+  }
+  // Full Unicode lowercase of the whole buffer, the four pad bytes after the
+  // text included. The span is then pointed at the lowercased copy.
+  int consumed, filled;
+  UniLib::ToLower(span->text, span->text_bytes + 4,
+                  script_buffer_lower_, getone::kMaxScriptLowerBuffer,
+                  &consumed, &filled);
+  span->text = script_buffer_lower_;
+  span->text_bytes = filled - 4;      // Pad bytes are not part of the text
+#endif
+}
+// GetOneScriptSpan followed by LowerScriptSpan: fetches the next same-script
+// run [NUL terminated] and forces Latin, Cyrillic and Greek text to
+// lowercase. Returns what GetOneScriptSpan returned.
+bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
+  const bool found_span = GetOneScriptSpan(span);
+  LowerScriptSpan(span);
+  return found_span;
+}
+// Looks up the lscript number of the single UTF-8 character at src in the
+// utf8propletterscriptnum table. Letters yield their script number;
+// non-letters always yield 0 (common script).
+int getone::GetUTF8LetterScriptNum(const char* src) {
+  const uint8* bytes = reinterpret_cast<const uint8*>(src);
+  int char_len = cld_UniLib::OneCharLen(src);
+  return UTF8GenericProperty(&utf8propletterscriptnum_obj, &bytes, &char_len);
+}

data/ext/cld/encodings/compact_lang_det/getonescriptspan.h ADDED Viewed

@@ -0,0 +1,131 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
+#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
+#include "encodings/compact_lang_det/letterscript_enum.h"
+#include "encodings/compact_lang_det/compact_lang_det_impl.h"
+namespace getone {
+  // Bytes allocated for ScriptScanner's span buffer
+  static const int kMaxScriptBuffer = 4096;
+  // Bytes allocated for the lowercased copy: 1.5x the span buffer
+  static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
+  // Fill level at which a span is cut short; keeps 8 bytes of headroom
+  static const int kMaxScriptBytes = kMaxScriptBuffer - 8;
+  // Bytes in each of LangScanner's answer buffers
+  static const int kMaxAnswerBuffer = 256;
+  typedef enum UnicodeLScript ULScript;
+  // One run of same-script text handed out by the scanners
+  typedef struct {
+    char* text;             // Start of the span's text
+    int text_bytes;         // Length of that text in bytes
+    int offset;             // Where the span starts in the original input
+    ULScript script;        // The one script shared by the span's letters
+    Language lang;          // Language identified for the span
+    bool truncated;         // Set when the buffer filled up before a
+                            // different script or EOF was reached
+  } LangSpan;
+  // True for UTF-8 continuation bytes, i.e. 0x80..0xBF (bit pattern 10xxxxxx)
+  static inline bool IsContinuationByte(char c) {
+    return (static_cast<unsigned char>(c) & 0xC0) == 0x80;
+  }
+  // Lscript number of the character at src; non-letters always give
+  //   0 (common script)
+  int GetUTF8LetterScriptNum(const char* src);
+  // Returns src moved forward to the next quadgram, +2..+5
+  // Looks at src[0..4]
+  const char* AdvanceQuad(const char* src);
+}     // end namespace getone
+// Walks an input buffer and hands out runs of same-script letters, one span
+// per call, with tags skipped and entities expanded unless the input was
+// declared plain text. Owns two heap buffers that the spans point into, so a
+// span's text is only valid while the scanner is alive.
+class ScriptScanner {
+ public:
+  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
+  ~ScriptScanner();
+  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
+  // Returns false when there are no more letters
+  bool GetOneScriptSpan(getone::LangSpan* span);
+  // Force Latin, Cyrillic and Greek scripts to be lowercase
+  void LowerScriptSpan(getone::LangSpan* span);
+  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
+  // Force Latin, Cyrillic and Greek scripts to be lowercase
+  bool GetOneScriptSpanLower(getone::LangSpan* span);
+ private:
+  // Not copyable: the destructor delete[]s both buffers, so an implicit
+  // member-wise copy would free them twice. Declared, never defined.
+  ScriptScanner(const ScriptScanner&);
+  ScriptScanner& operator=(const ScriptScanner&);
+  // Skips to the first letter in src[0..len); sets *script, returns offset
+  int SkipToFrontOfSpan(const char* src, int len, int* script);
+  const char* start_byte_;        // Start of the original input
+  const char* next_byte_;         // Next input byte to scan
+  const char* next_byte_limit_;   // One past the end of the original input
+  int byte_length_;               // Input bytes remaining from next_byte_
+  bool is_plain_text_;            // If true, < > & get no special handling
+  char* script_buffer_;           // Holds text with expanded entities
+  char* script_buffer_lower_;     // Holds lowercased text
+};
+class LangScanner {    // Declaration only; apart from script(), members are defined outside this header
+ public:
+  LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
+              getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
+              int maxlangs, int minlangspan);
+  ~LangScanner();
+  int script() {return script_;}    // Accessor for script_
+  // Use new text
+  // Keep smoothing state if same script, otherwise reinit smoothing
+  void NewText(getone::LangSpan* spn);
+  bool GetOneShortLangSpanBoot(getone::LangSpan* span);  // Just for bootstrapping
+  bool GetOneLangSpanBoot(getone::LangSpan* span);       // Just for bootstrapping
+  // The real ones
+  bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
+                           getone::LangSpan* span);
+  bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
+                      getone::LangSpan* span);
+  // Increases language bias by delta
+  void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
+                       Language key, int delta);
+  // For debugging output (note: these data members are public)
+  int next_answer_;
+  char answer_buffer_[getone::kMaxAnswerBuffer];
+  char answer_buffer2_[getone::kMaxAnswerBuffer];
+  char answer_buffer3_[getone::kMaxAnswerBuffer];
+  char answer_buffer4_[getone::kMaxAnswerBuffer];
+ private:
+  const char* start_byte_;
+  const char* next_byte_limit_;
+  const char* next_byte_;
+  const char* onelangspan_begin_;
+  int byte_length_;
+  int script_;              // Value returned by script()
+  Language spanlang_;
+  int smoothwidth_;         // NOTE(review): presumably the constructor's smoothwidth -- confirm
+  int smoothwidth_2_;
+  int smoothcandidates_;    // NOTE(review): presumably the constructor's smoothcandidates -- confirm
+  int maxlangs_;            // NOTE(review): presumably the constructor's maxlangs -- confirm
+  int minlangspan_;         // NOTE(review): presumably the constructor's minlangspan -- confirm
+  int rb_size_;             // NOTE(review): rb_* look like ring-buffer bookkeeping -- confirm
+  int next_rb_;
+  int rb_mask_;
+  uint32* rb_;              // NOTE(review): ownership not visible here -- check ~LangScanner
+  int* offset_rb_;          // NOTE(review): ownership not visible here -- check ~LangScanner
+};
+#endif  // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_