RubyGems - language_detection - Versions diffs - 0.0.1 - Mend

language_detection 0.0.1

Files changed (100) hide show

data/.gitignore +19 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +85 -0
data/Rakefile +11 -0
data/ext/cld/Makefile +34 -0
data/ext/cld/base/basictypes.h +348 -0
data/ext/cld/base/build_config.h +124 -0
data/ext/cld/base/casts.h +156 -0
data/ext/cld/base/commandlineflags.h +443 -0
data/ext/cld/base/crash.h +41 -0
data/ext/cld/base/dynamic_annotations.h +358 -0
data/ext/cld/base/global_strip_options.h +59 -0
data/ext/cld/base/log_severity.h +46 -0
data/ext/cld/base/logging.h +1403 -0
data/ext/cld/base/macros.h +243 -0
data/ext/cld/base/port.h +54 -0
data/ext/cld/base/scoped_ptr.h +428 -0
data/ext/cld/base/stl_decl.h +0 -0
data/ext/cld/base/stl_decl_msvc.h +107 -0
data/ext/cld/base/string_util.h +29 -0
data/ext/cld/base/strtoint.h +93 -0
data/ext/cld/base/template_util.h +96 -0
data/ext/cld/base/type_traits.h +198 -0
data/ext/cld/base/vlog_is_on.h +143 -0
data/ext/cld/cld.so +0 -0
data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
data/ext/cld/encodings/internal/encodings.cc +12 -0
data/ext/cld/encodings/lang_enc.h +254 -0
data/ext/cld/encodings/proto/encodings.pb.h +169 -0
data/ext/cld/encodings/public/encodings.h +301 -0
data/ext/cld/extconf.rb +1 -0
data/ext/cld/language_detection.cc +88 -0
data/ext/cld/languages/internal/languages.cc +337 -0
data/ext/cld/languages/proto/languages.pb.h +179 -0
data/ext/cld/languages/public/languages.h +379 -0
data/language_detection.gemspec +28 -0
data/lib/language_detection/string.rb +1 -0
data/lib/language_detection/version.rb +3 -0
data/lib/language_detection.rb +54 -0
data/test/_helper.rb +15 -0
data/test/fixtures/languages.csv +80 -0
data/test/language_detection_test.rb +88 -0
metadata +250 -0

data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h ADDED Viewed

@@ -0,0 +1,10 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
+#include "base/basictypes.h"
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_

data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h ADDED Viewed

@@ -0,0 +1,28 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
+#if !defined(CLD_WINDOWS)
+#include "base/commandlineflags.h"
+#else
+// CLD_WINDOWS builds: flags are plain constants rather than runtime flags.
+// The comment argument is accepted for source compatibility and discarded.
+#undef DEFINE_bool
+#define DEFINE_bool(name, default_value, comment) \
+    const bool FLAGS_##name = default_value;
+#undef DEFINE_int32
+#define DEFINE_int32(name, default_value, comment) \
+    const int32 FLAGS_##name = default_value;
+#undef DECLARE_bool
+#define DECLARE_bool(name) extern const bool FLAGS_##name;
+#undef DECLARE_int32
+// Must be const-qualified like DECLARE_bool: DEFINE_int32 defines a const
+// object, and a non-const extern declaration of the same name conflicts
+// with that definition.
+#define DECLARE_int32(name) extern const int32 FLAGS_##name;
+#endif
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_

data/ext/cld/encodings/compact_lang_det/win/cld_google.h ADDED Viewed

@@ -0,0 +1,18 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
+#if !defined(CLD_WINDOWS)
+#include "base/google.h"
+#else
+// Include nothing
+#endif
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_

data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h ADDED Viewed

@@ -0,0 +1,13 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
+// Converts the HTML entity that starts at src into output bytes.
+// Src points to '&'
+// Writes entity value to dst. Returns take(src), put(dst) byte counts
+//   src  - start of the entity text.
+//   len  - presumably the number of bytes available at src (TODO confirm
+//          against callers).
+//   dst  - buffer that receives the entity value.
+//   tlen - out: number of bytes taken (consumed) from src.
+//   plen - out: number of bytes put (written) into dst.
+void EntityToBuffer(const char* src, int len, char* dst,
+                    int* tlen, int* plen);
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_

data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc ADDED Viewed

@@ -0,0 +1,32 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: alekseys@google.com (Aleksey Shlyapnikov)
+// This code is not actually used, it was copied here for the reference only.
+// See cld_htmlutils_windows.cc for Windows version of this code.
+#include "cld/encodings/compact_lang_det/win/cld_htmlutils.h"
+#include "cld/third_party/utf/utf.h"        // for runetochar
+#include "cld/webutil/html/htmlutils.h"     // for ReadEntity
+// Copied from getonescriptspan.cc
+// Src points to '&'
+// Writes entity value to dst. Returns take(src), put(dst) byte counts
+//
+// HtmlUtils::ReadEntity decodes the entity and stores the consumed byte
+// count in *tlen; a positive result is encoded into dst with runetochar.
+void EntityToBuffer(const char* src, int len, char* dst,
+                    int* tlen, int* plen) {
+  // ReadEntity already applies FixUnicodeValue to its result.
+  char32 rune = HtmlUtils::ReadEntity(src, len, tlen);
+  if (rune <= 0) {
+    // Illegal entity; ignore the '&'
+    *tlen = 1;
+    *plen = 0;
+    return;
+  }
+  *plen = runetochar(dst, &rune);
+}

data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc ADDED Viewed

@@ -0,0 +1,29 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include "encodings/compact_lang_det/win/cld_htmlutils.h"
+// Src points to '&'
+// Writes entity value to dst. Returns take(src), put(dst) byte counts
+//
+//   src  - start of the entity; src[0] is the '&'.
+//   len  - number of bytes readable at src; the scan never looks past it.
+//   dst  - receives a single placeholder byte (a space).
+//   tlen - out: bytes consumed from src: the '&', the entity body and the
+//          closing ';' when one is found.  Always at least 1.
+//   plen - out: bytes written to dst (always 1).
+void EntityToBuffer(const char* src, int len, char* dst,
+                    int* tlen, int* plen) {
+  // On Windows we do not have to do anything, browser expands HTML entities
+  // for us, so text we're retrieving from it is ready for translation as it is.
+  // But:
+  // This is a temporary solution to let us continue the development without
+  // having a real DOM text scraping in place.  For now the full HTML is fed
+  // to CLD for language detection and just ignoring entities is good enough
+  // for testing.  Later entities will be expanded by browser itself.
+  //
+  // Skip entity in the source.  The scan stops after the terminating ';'
+  // (which is consumed), at a NUL (which is NOT consumed, so the caller never
+  // steps past the end of the string), or once len bytes have been examined.
+  int taken = 1;  // The '&' itself.
+  while (taken < len && src[taken] != '\0') {
+    if (src[taken++] == ';')
+      break;
+  }
+  *tlen = taken;
+  // Report a bogus entity (space).
+  *dst = ' ';
+  *plen = 1;
+}

data/ext/cld/encodings/compact_lang_det/win/cld_logging.h ADDED Viewed

@@ -0,0 +1,21 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
+#if !defined(CLD_WINDOWS)
+#include "base/logging.h"
+#else
+// CLD_WINDOWS builds compile assertions out: both macros expand to nothing,
+// so expr is never evaluated and any side effects inside it are dropped.
+#undef CHECK
+#define CHECK(expr)
+#undef DCHECK
+#define DCHECK(expr)
+#endif
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_

data/ext/cld/encodings/compact_lang_det/win/cld_macros.h ADDED Viewed

@@ -0,0 +1,19 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
+#include "base/macros.h"
+// Checks for Win32 result and if it indicates failure, returns it.
+// cmd is evaluated exactly once; any nonzero DWORD is treated as failure and
+// returned from the enclosing function.  The expansion intentionally has no
+// trailing semicolon, so `RETURN_IF_ERROR(x);` is exactly one statement and
+// can be used as the body of an if that has an else branch.
+#define RETURN_IF_ERROR(cmd) \
+  do { \
+    DWORD result_ = (cmd); \
+    if (0 != result_) \
+      return result_; \
+  } \
+  while (0)
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_

data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h ADDED Viewed

@@ -0,0 +1,26 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
+#if !defined(CLD_WINDOWS)
+//#include "cld/base/strtoint.h"
+#else
+#include <stdlib.h>
+#include "encodings/compact_lang_det/win/cld_basictypes.h"
+// Minimal stand-in for the strto32 in base/strtoint.h; adequate for CLD.
+// Parses with strtol (same nptr/endptr/base contract) and narrows the long
+// result to int32 without any range check.
+inline int32 strto32(const char *nptr, char **endptr, int base) {
+  const long parsed = strtol(nptr, endptr, base);
+  return static_cast<int32>(parsed);
+}
+#endif
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_

data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc ADDED Viewed

@@ -0,0 +1,84 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include "encodings/compact_lang_det/win/cld_unicodetext.h"
+#include <string>
+#include <vector>  // to compile bar/common/component.h
+#include "encodings/compact_lang_det/compact_lang_det.h"
+#include "base/string_util.h"
+#include "unicode/normlzr.h"
+#include "unicode/unistr.h"
+#include "unicode/ustring.h"
+/*
+std::string NormalizeText(const UChar* text) {
+  // To avoid a copy, use the read-only aliasing ctor.
+  icu::UnicodeString source(1, text, -1);
+  icu::UnicodeString normalized;
+  UErrorCode status = U_ZERO_ERROR;
+  icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
+  if (U_FAILURE(status))
+    return std::string();
+  normalized.toLower();
+  std::string utf8;
+  // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough
+  // for most web pages) and does pre-flighting followed by malloc for larger
+  // strings. We have to switch to obtaining the buffer with the maximum size
+  // (UTF-16 length * 3) without pre-flighting if necessary.
+  return normalized.toUTF8String(utf8);
+}
+*/
+// Detects a language of the UTF-16 encoded zero-terminated text.
+// Returns: Language enum (the top entry reported by CLD), or NUM_LANGUAGES
+// when text or num_languages is NULL or the normalized text is empty.
+//
+// Outputs:
+//   is_reliable   - forwarded to CompactLangDet::DetectLanguageSummary; set
+//                   to false on the early-return paths when non-NULL.
+//   num_languages - count of detected languages covering at least 20% of
+//                   the text (required).
+//   error_code    - optional; always set to 0.  This function has no
+//                   platform error code to report, so failure is signalled
+//                   only through the NUM_LANGUAGES return value.
+//   text_bytes    - optional; byte count reported by DetectLanguageSummary,
+//                   0 on the early-return paths.
+//
+// NOTE(review): the only NormalizeText in this file sits inside a
+// commented-out block; confirm a live definition is provided elsewhere
+// before building this file.
+Language DetectLanguageOfUnicodeText(
+    const CompactLangDet::DetectionTables* detection_tables,
+    const UChar* text, bool is_plain_text,
+    bool* is_reliable, int* num_languages,
+    int* error_code, int* text_bytes) {
+  // Give every optional output a defined value before any early return so
+  // callers never read an uninitialized result.
+  if (error_code)
+    *error_code = 0;
+  if (text_bytes)
+    *text_bytes = 0;
+  if (is_reliable)
+    *is_reliable = false;
+  if (num_languages)
+    *num_languages = 0;
+  if (!text || !num_languages)
+    return NUM_LANGUAGES;
+  // Normalize text to NFC, lowercase and convert to UTF-8.
+  std::string utf8_encoded = NormalizeText(text);
+  if (utf8_encoded.empty())
+    return NUM_LANGUAGES;
+  // Engage core CLD library language detection.
+  Language language3[3] = {
+    UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
+  };
+  int percent3[3] = { 0, 0, 0 };
+  int text_bytes_tmp = 0;
+  // We ignore return value here due to the problem described in bug 1800161.
+  // For example, translate.google.com was detected as Indonesian.  It happened
+  // due to the heuristic in CLD, which ignores English as a top language
+  // in the presence of another reliably detected language.
+  // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
+  // language3 array is always set according to the detection results and
+  // is not affected by this heuristic.
+  CompactLangDet::DetectLanguageSummary(detection_tables,
+                                        utf8_encoded.c_str(),
+                                        utf8_encoded.length(),
+                                        is_plain_text, language3, percent3,
+                                        &text_bytes_tmp, is_reliable);
+  // Calculate the number of languages detected in at least 20% of the text.
+  const int kMinTextPercentToCountLanguage = 20;
+  if (text_bytes)
+    *text_bytes = text_bytes_tmp;
+  COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
+                 language3_and_percent3_should_be_of_the_same_size);
+  // arraysize yields an unsigned count; convert once to keep the loop
+  // comparison signed-vs-signed.
+  const int num_candidates = static_cast<int>(arraysize(language3));
+  for (int i = 0; i < num_candidates; ++i) {
+    if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
+        percent3[i] >= kMinTextPercentToCountLanguage) {
+      ++*num_languages;
+    }
+  }
+  return language3[0];
+}

data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h ADDED Viewed

@@ -0,0 +1,40 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
+#include "languages/public/languages.h"
+#include "unicode/utypes.h"
+namespace CompactLangDet {
+  struct DetectionTables;
+}  // namespace CompactLangDet
+// Detects a language of the UTF-16 encoded zero-terminated text.
+// [in] detection_tables - internal CLD data tables (see compact_lang_det.h).
+//     Can be NULL, in this case CLD will fall back to builtin static tables.
+// [in] text - UTF-16 encoded text to detect a language of.
+// [in] is_plain_text - true if plain text, false otherwise (e.g. HTML).
+// [out] is_reliable - true, if returned language was detected reliably.
+//     See compact_lang_det.h for details.
+// [out] num_languages - set to the number of languages detected on the page.
+//     Language counts only if it's detected in more than 20% of the text.
+// [out, optional] error_code - set to 0 in case of success, Windows
+//     GetLastError() code otherwise.  Pass NULL, if not interested in errors.
+// [out, optional] text_bytes - receives the text byte count reported by
+//     CompactLangDet::DetectLanguageSummary().  Pass NULL, if not interested.
+// See encodings/compact_lang_det/compact_lang_det.h,
+//     CompactLangDet::DetectLanguage() description for other input parameters
+//     description.
+// Returns: Language enum.
+//     Returns NUM_LANGUAGES in case of any error.
+//     See googleclient/languages/internal/languages.cc
+//     for details.
+Language DetectLanguageOfUnicodeText(
+    const CompactLangDet::DetectionTables* detection_tables,
+    const UChar* text, bool is_plain_text,
+    bool* is_reliable, int* num_languages,
+    int* error_code, int* text_bytes);
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_

data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h ADDED Viewed

@@ -0,0 +1,15 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
+// Thin UTF-8 helper layer used by CLD.
+namespace cld_UniLib {
+// Return length of a single UTF-8 source character
+// src must point at a readable byte, the first byte of the character.
+int OneCharLen(const char* src);
+}  // namespace cld_UniLib
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_

data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc ADDED Viewed

@@ -0,0 +1,18 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// This code is not actually used, it was copied here for the reference only.
+// See cld_unilib_windows.cc for Windows version of this code.
+#include "i18n/encodings/compact_lang_det/cld_unilib.h"
+#include "util/utf8/unilib.h"
+namespace cld_UniLib {
+// Return length of a single UTF-8 source character.
+// Pure forwarder to UniLib::OneCharLen.
+int OneCharLen(const char* src) {
+  return UniLib::OneCharLen(src);
+}
+}  // namespace cld_UniLib

data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc ADDED Viewed

@@ -0,0 +1,29 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include "encodings/compact_lang_det/win/cld_unilib.h"
+#include "encodings/compact_lang_det/win/cld_basictypes.h"
+namespace cld_UniLib {
+// Byte length of a UTF-8 sequence, indexed by its first byte.
+// 0x00-0xBF map to 1 (ASCII, and continuation bytes met out of place),
+// 0xC0-0xDF map to 2, 0xE0-0xEF map to 3 and 0xF0-0xFF map to 4.
+static const unsigned char kUTF8LenTbl[256] = {
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
+};
+// Return length of a single UTF-8 source character
+// Only the first byte at src is read; it is treated as unsigned so bytes
+// >= 0x80 index the upper half of the table.
+int OneCharLen(const char* src) {
+  const unsigned char lead = static_cast<unsigned char>(*src);
+  return kUTF8LenTbl[lead];
+}
+}  // namespace cld_UniLib

data/ext/cld/encodings/compact_lang_det/win/cld_utf.h ADDED Viewed

@@ -0,0 +1,24 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
+#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
+#if !defined(CLD_WINDOWS)
+//#include "third_party/utf/utf.h"
+#else
+// Rune/UTF-8 limits defined locally for CLD_WINDOWS builds, where no utf.h
+// header is included by this file.
+enum {
+  UTFmax        = 4,            // maximum bytes per rune
+  Runesync      = 0x80,         // cannot represent part of a UTF sequence (<)
+  Runeself      = 0x80,         // rune and UTF sequences are the same (<)
+  Runeerror     = 0xFFFD,       // decoding error in UTF
+  Runemax       = 0x10FFFF,     // maximum rune value
+};
+#endif
+#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_

data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc ADDED Viewed

@@ -0,0 +1,224 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
+// Reports whether Tbl lies inside the state0 region of st's state table.
+// The pointer difference is converted to unsigned, so one comparison rejects
+// both positions before the region and positions at or past state0_size.
+static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
+  const uint8* state0_base = st->state_table + st->state0;
+  const uint32 offset = static_cast<uint32>(Tbl - state0_base);
+  return offset < st->state0_size;
+}
+// Look up property of one UTF-8 character and advance over it
+// Return 0 if input length is zero
+// Return 0 and advance one byte if input is ill-formed
+//
+//   st     - property table; state_table, state0 and entry_shift are read.
+//   src    - in/out: pointer to the current byte; moved past the bytes
+//            consumed.
+//   srclen - in/out: bytes remaining; reduced by the bytes consumed.
+// The lead byte selects a 1-4 byte path, and a multi-byte path is taken only
+// when *srclen covers the whole sequence.  Continuation bytes are used as
+// table indexes as-is; they are not checked for the 10xxxxxx form here.
+uint8 UTF8GenericProperty(const UTF8PropObj* st,
+                          const uint8** src,
+                          int* srclen) {
+  if (*srclen <= 0) {
+    return 0;
+  }
+  const uint8* lsrc = *src;
+  const uint8* Tbl_0 = &st->state_table[st->state0];
+  const uint8* Tbl = Tbl_0;
+  int e;
+  int eshift = st->entry_shift;
+  // Short series of tests faster than switch, optimizes 7-bit ASCII
+  unsigned char c = lsrc[0];
+  if (static_cast<signed char>(c) >= 0) {           // one byte
+    e = Tbl[c];
+    *src += 1;
+    *srclen -= 1;
+  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
+    // Each non-final lookup yields the next state; e << eshift is the offset
+    // of that state's row from Tbl_0.  The final lookup is the property.
+    e = Tbl[c];
+    Tbl = &Tbl_0[e << eshift];
+    e = Tbl[lsrc[1]];
+    *src += 2;
+    *srclen -= 2;
+  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
+    e = Tbl[c];
+    Tbl = &Tbl_0[e << eshift];
+    e = Tbl[lsrc[1]];
+    Tbl = &Tbl_0[e << eshift];
+    e = Tbl[lsrc[2]];
+    *src += 3;
+    *srclen -= 3;
+  }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
+    e = Tbl[c];
+    Tbl = &Tbl_0[e << eshift];
+    e = Tbl[lsrc[1]];
+    Tbl = &Tbl_0[e << eshift];
+    e = Tbl[lsrc[2]];
+    Tbl = &Tbl_0[e << eshift];
+    e = Tbl[lsrc[3]];
+    *src += 4;
+    *srclen -= 4;
+  } else {                                                // Ill-formed
+    // Also reached when a valid lead byte is truncated by *srclen.
+    e = 0;
+    *src += 1;
+    *srclen -= 1;
+  }
+  return e;
+}
+// BigOneByte versions are needed for tables > 240 states, but most
+// won't need the TwoByte versions.
+// Internally, the next-to-last offset is multiplied by 16 and the last
+// offset is relative instead of absolute.
+// Look up property of one UTF-8 character and advance over it
+// Return 0 if input length is zero
+// Return 0 and advance one byte if input is ill-formed
+//
+//   st     - property table; state_table, state0 and entry_shift are read.
+//   src    - in/out: pointer to the current byte; moved past the bytes
+//            consumed.
+//   srclen - in/out: bytes remaining; reduced by the bytes consumed.
+// One- and two-byte characters are handled exactly as in
+// UTF8GenericProperty; only the three- and four-byte paths use the widened
+// next-to-last lookup and the signed relative last step.
+uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
+                          const uint8** src,
+                          int* srclen) {
+  if (*srclen <= 0) {
+    return 0;
+  }
+  const uint8* lsrc = *src;
+  const uint8* Tbl_0 = &st->state_table[st->state0];
+  const uint8* Tbl = Tbl_0;
+  int e;
+  int eshift = st->entry_shift;
+  // Short series of tests faster than switch, optimizes 7-bit ASCII
+  unsigned char c = lsrc[0];
+  if (static_cast<signed char>(c) >= 0) {           // one byte
+    e = Tbl[c];
+    *src += 1;
+    *srclen -= 1;
+  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
+    e = Tbl[c];
+    Tbl = &Tbl_0[e << eshift];
+    e = Tbl[lsrc[1]];
+    *src += 2;
+    *srclen -= 2;
+  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
+    e = Tbl[c];
+    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
+    // The entry is read as a signed byte: it is a row delta from the
+    // current Tbl rather than an absolute state number.
+    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
+    Tbl = &Tbl[e << eshift];          // Relative +/-
+    e = Tbl[lsrc[2]];
+    *src += 3;
+    *srclen -= 3;
+  }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
+    e = Tbl[c];
+    Tbl = &Tbl_0[e << eshift];
+    e = Tbl[lsrc[1]];
+    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
+    // Signed relative step, as in the three-byte case.
+    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
+    Tbl = &Tbl[e << eshift];          // Relative +/-
+    e = Tbl[lsrc[3]];
+    *src += 4;
+    *srclen -= 4;
+  } else {                                                // Ill-formed
+    // Also reached when a valid lead byte is truncated by *srclen.
+    e = 0;
+    *src += 1;
+    *srclen -= 1;
+  }
+  return e;
+}
+// Scan a UTF-8 stringpiece based on a state table.
+// Always scan complete UTF-8 characters
+// Set number of bytes scanned. Return reason for exiting
+//
+//   st             - scan tables; entry_shift, state_table, state0,
+//                    state0_size, fast_state, losub and hiadd are read.
+//   str, len       - input bytes and their count.
+//   bytes_consumed - out: offset from str at which the scan stopped; 0 when
+//                    len is 0.
+// Returns kExitOK when the input was fully consumed in state0, otherwise the
+// exit code taken from the table, or kExitIllegalStructure when the input
+// ends inside a character.
+int UTF8GenericScan(const UTF8ScanObj* st,
+                    const uint8* str,
+                    const int len,
+                    int* bytes_consumed) {
+  int eshift = st->entry_shift;        // 6 (space optimized) or 8
+  // int nEntries = (1 << eshift);       // 64 or 256 entries per state
+  const uint8* isrc = str;
+    //reinterpret_cast<const uint8*>(str.data());
+  const uint8* src = isrc;
+  //const int len = str.length();
+  const uint8* srclimit = isrc + len;
+  // NOTE(review): for len < 7 this forms a pointer before str; it is only
+  // compared, never dereferenced, but confirm that is acceptable here.
+  const uint8* srclimit8 = srclimit - 7;
+  *bytes_consumed = 0;
+  if (len == 0) return kExitOK;
+  const uint8* Tbl_0 = &st->state_table[st->state0];
+DoAgain:
+  // Do state-table scan
+  int e = 0;
+  uint8 c;
+  // Do fast for groups of 8 identity bytes.
+  // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
+  // including slowing slightly on cr/lf/ht
+  //----------------------------
+  const uint8* Tbl2 = &st->fast_state[0];
+  uint32 losub = st->losub;
+  uint32 hiadd = st->hiadd;
+  while (src < srclimit8) {
+    // NOTE(review): these 32-bit loads go through a cast of a uint8 pointer;
+    // nothing visible here guarantees src is 4-byte aligned.
+    uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
+    uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
+    src += 8;
+    // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
+    uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
+                  (s4567 - losub) | (s4567 + hiadd);
+    if ((temp & 0x80808080) != 0) {
+      // We typically end up here on cr/lf/ht; src was incremented
+      int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
+                  (Tbl2[src[-6]] | Tbl2[src[-5]]);
+      if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange
+      e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
+              (Tbl2[src[-2]] | Tbl2[src[-1]]);
+      if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange
+      // Else OK, go around again
+    }
+  }
+  //----------------------------
+  // Byte-at-a-time scan
+  //----------------------------
+  // Always restarts from state0; each entry below kExitIllegalStructure is
+  // the next state, and e << eshift is that state's row offset from Tbl_0.
+  const uint8* Tbl = Tbl_0;
+  while (src < srclimit) {
+    c = *src;
+    e = Tbl[c];
+    src++;
+    if (e >= kExitIllegalStructure) {break;}
+    Tbl = &Tbl_0[e << eshift];
+  }
+  //----------------------------
+  // Exit possibilities:
+  //  Some exit code, !state0, back up over last char
+  //  Some exit code, state0, back up one byte exactly
+  //  source consumed, !state0, back up over partial char
+  //  source consumed, state0, exit OK
+  // For illegal byte in state0, avoid backup up over PREVIOUS char
+  // For truncated last char, back up to beginning of it
+  if (e >= kExitIllegalStructure) {
+    // Back up over exactly one byte of rejected/illegal UTF-8 character
+    src--;
+    // Back up more if needed
+    if (!InStateZero(st, Tbl)) {
+      // Step back over continuation bytes (10xxxxxx), never before isrc.
+      do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
+    }
+  } else if (!InStateZero(st, Tbl)) {
+    // Back up over truncated UTF-8 character
+    e = kExitIllegalStructure;
+    do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
+  } else {
+    // Normal termination, source fully consumed
+    e = kExitOK;
+  }
+  if (e == kExitDoAgain) {
+    // Loop back up to the fast scan
+    goto DoAgain;
+  }
+  *bytes_consumed = src - isrc;
+  return e;
+}