RubyGems - language_detection - Versions diffs - 0.0.1 - Mend

language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (100) hide show

data/.gitignore +19 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +85 -0
data/Rakefile +11 -0
data/ext/cld/Makefile +34 -0
data/ext/cld/base/basictypes.h +348 -0
data/ext/cld/base/build_config.h +124 -0
data/ext/cld/base/casts.h +156 -0
data/ext/cld/base/commandlineflags.h +443 -0
data/ext/cld/base/crash.h +41 -0
data/ext/cld/base/dynamic_annotations.h +358 -0
data/ext/cld/base/global_strip_options.h +59 -0
data/ext/cld/base/log_severity.h +46 -0
data/ext/cld/base/logging.h +1403 -0
data/ext/cld/base/macros.h +243 -0
data/ext/cld/base/port.h +54 -0
data/ext/cld/base/scoped_ptr.h +428 -0
data/ext/cld/base/stl_decl.h +0 -0
data/ext/cld/base/stl_decl_msvc.h +107 -0
data/ext/cld/base/string_util.h +29 -0
data/ext/cld/base/strtoint.h +93 -0
data/ext/cld/base/template_util.h +96 -0
data/ext/cld/base/type_traits.h +198 -0
data/ext/cld/base/vlog_is_on.h +143 -0
data/ext/cld/cld.so +0 -0
data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
data/ext/cld/encodings/internal/encodings.cc +12 -0
data/ext/cld/encodings/lang_enc.h +254 -0
data/ext/cld/encodings/proto/encodings.pb.h +169 -0
data/ext/cld/encodings/public/encodings.h +301 -0
data/ext/cld/extconf.rb +1 -0
data/ext/cld/language_detection.cc +88 -0
data/ext/cld/languages/internal/languages.cc +337 -0
data/ext/cld/languages/proto/languages.pb.h +179 -0
data/ext/cld/languages/public/languages.h +379 -0
data/language_detection.gemspec +28 -0
data/lib/language_detection/string.rb +1 -0
data/lib/language_detection/version.rb +3 -0
data/lib/language_detection.rb +54 -0
data/test/_helper.rb +15 -0
data/test/fixtures/languages.csv +80 -0
data/test/language_detection_test.rb +88 -0
metadata +250 -0

data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h ADDED Viewed

@@ -0,0 +1,76 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_COMPACT_LANG_DET_CLDUTIL_DBG_H_
+#define ENCODINGS_COMPACT_LANG_DET_CLDUTIL_DBG_H_
+#include "encodings/compact_lang_det/cldutil.h"
+#include <string>
+#include "encodings/compact_lang_det/ext_lang_enc.h"
+#include "encodings/compact_lang_det/tote.h"
+#include "encodings/compact_lang_det/win/cld_basictypes.h"
+#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
+DECLARE_bool(dbgscore);   // "Print picture of score calculation" (defined in cldutil_dbg_empty.cc)
+DECLARE_bool(dbglookup);  // "Print every quad/uni lookup in score calc"
+DECLARE_bool(dbgreli);    // "Print reliability in score calc"
+using std::string;  // NOTE(review): a using-declaration in a header exposes `string` to every includer
+namespace cld {
+//------------------------------------------------------------------------------
+// Debugging hooks. Not thread safe. cldutil_dbg_empty.cc defines each routine below as a no-op.
+//------------------------------------------------------------------------------
+  void DbgScoreInit(const char* src, int len);
+  // Return a 3-byte + NUL code for language; the function is void, so presumably via temp (>= 4 bytes) -- TODO confirm
+  void DbgLangName3(Language lang, char* temp);
+  // Show all per-language totals
+  void DbgScoreState();
+  void DbgScoreTop(const char* src, int srclen, Tote* chunk_tote);
+  void DbgScoreFlush();
+  // Allow additional scoring debug output
+  void DbgScoreRecord(const char* src, uint32 probs, int len);
+  void DbgScoreRecordUni(const char* src, int propval, int len);
+  // Debug print language name(s) to f. NOTE(review): FILE needs <stdio.h>, which this header does not include directly -- confirm it arrives via cldutil.h
+  void PrintLang(FILE* f, const Tote* chunk_tote,
+                 const Language cur_lang, const bool cur_unreliable,
+                 Language prior_lang, bool prior_unreliable);
+  // Debug print language name(s) to f; two-language variant (lang1, lang2)
+  void PrintLang2(FILE* f,
+                  const Language lang1, const Language lang2, bool diff_prior);
+  // Debug print text span
+  void PrintText(FILE* f, Language cur_lang, const string& str);
+  // Debug print text span with speculative language
+  void PrintTextSpeculative(FILE* f, Language cur_lang, const string& str);
+  // Debug print ignored text span
+  void PrintSkippedText(FILE* f, const string& str);
+  void DbgProbsToStderr(uint32 probs);
+  void DbgUniTermToStderr(int propval, const uint8* usrc, int len);
+  // No pre/post space
+  void DbgBiTermToStderr(uint32 bihash, uint32 probs,
+                          const char* src, int len);
+  void DbgQuadTermToStderr(uint32 quadhash, uint32 probs,
+                          const char* src, int len);
+  void DbgWordTermToStderr(uint64 wordhash, uint32 probs,
+                          const char* src, int len);
+}       // End namespace cld
+#endif  // ENCODINGS_COMPACT_LANG_DET_CLDUTIL_DBG_H_

data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc ADDED Viewed

@@ -0,0 +1,76 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include "encodings/compact_lang_det/cldutil_dbg.h"
+//#include <string>
+//#include "base/logging.h"
+//#include "i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
+//#include "i18n/encodings/compact_lang_det/utf8propletterscriptnum.h"
+//#include "third_party/utf/utf.h"            // for UTFmax
+//#include "util/utf8/unicodeprops.h"
+//#include "util/utf8/unilib.h"
+//#include "util/utf8/utf8statetable.h"
+#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
+// Debug flags declared in cldutil_dbg.h; all default to false.
+DEFINE_bool(dbgscore, false, "Print picture of score calculation");
+DEFINE_bool(dbglookup, false, "Print every quad/uni lookup in score calc");
+DEFINE_bool(dbgreli, false, "Print reliability in score calc");
+namespace cld {
+//------------------------------------------------------------------------------
+// Debugging. Not thread safe
+// This is the empty version -- routines return immediately
+//
+// Every function below has an empty body: it reads none of its arguments,
+// writes nothing through its pointer arguments, and produces no output.
+// Parameter names are commented out because nothing uses them (this keeps
+// unused-parameter warnings quiet), and no ';' follows a function body (a
+// stray ';' at namespace scope is an empty declaration, which pedantic
+// pre-C++11 builds reject).
+//------------------------------------------------------------------------------
+  void DbgScoreInit(const char* /*src*/, int /*len*/) {}
+  // Return a 3-byte + NUL code for language
+  //  (empty version: temp is left untouched, so callers must not read it)
+  void DbgLangName3(Language /*lang*/, char* /*temp*/) {}
+  // Show all per-language totals
+  void DbgScoreState() {}
+  void DbgScoreTop(const char* /*src*/, int /*srclen*/, Tote* /*chunk_tote*/) {}
+  void DbgScoreFlush() {}
+  // Allow additional scoring debug output
+  void DbgScoreRecord(const char* /*src*/, uint32 /*probs*/, int /*len*/) {}
+  void DbgScoreRecordUni(const char* /*src*/, int /*propval*/, int /*len*/) {}
+  // Debug print language name(s)
+  void PrintLang(FILE* /*f*/, const Tote* /*chunk_tote*/,
+                 const Language /*cur_lang*/, const bool /*cur_unreliable*/,
+                 Language /*prior_lang*/, bool /*prior_unreliable*/) {}
+  // Debug print language name(s)
+  void PrintLang2(FILE* /*f*/,
+                  const Language /*lang1*/, const Language /*lang2*/,
+                  bool /*diff_prior*/) {}
+  // Debug print text span
+  void PrintText(FILE* /*f*/, Language /*cur_lang*/, const string& /*str*/) {}
+  // Debug print text span with speculative language
+  void PrintTextSpeculative(FILE* /*f*/, Language /*cur_lang*/,
+                            const string& /*str*/) {}
+  // Debug print ignored text span
+  void PrintSkippedText(FILE* /*f*/, const string& /*str*/) {}
+  void DbgProbsToStderr(uint32 /*probs*/) {}
+  void DbgUniTermToStderr(int /*propval*/, const uint8* /*usrc*/,
+                          int /*len*/) {}
+  // No pre/post space
+  void DbgBiTermToStderr(uint32 /*bihash*/, uint32 /*probs*/,
+                          const char* /*src*/, int /*len*/) {}
+  void DbgQuadTermToStderr(uint32 /*quadhash*/, uint32 /*probs*/,
+                          const char* /*src*/, int /*len*/) {}
+  void DbgWordTermToStderr(uint64 /*wordhash*/, uint32 /*probs*/,
+                          const char* /*src*/, int /*len*/) {}
+}       // End namespace cld

data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc ADDED Viewed

@@ -0,0 +1,62 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include "encodings/compact_lang_det/compact_lang_det.h"
+#include "encodings/compact_lang_det/compact_lang_det_impl.h"
+#include "encodings/compact_lang_det/win/cld_basictypes.h"
+// String is "code_version - data_scrape_date"
+// Held as a const array rather than a non-const pointer to const text, so the
+// symbol itself cannot be re-pointed; it decays to const char* where
+// DetectLanguageVersion() returns it.
+static const char kDetectLanguageVersion[] = "V1.6 - 20081121";
+// Large-table version for all ~160 languages (all Tiers)
+//
+// Thin wrapper: hands every argument to
+// CompactLangDetImpl::DetectLanguageSummaryV25, supplying 0 for the flags
+// argument and UNKNOWN_LANGUAGE for the plus_one argument, and returns that
+// call's result unchanged.
+Language CompactLangDet::DetectLanguage(
+    const DetectionTables* tables,
+    const char* buffer,
+    int buffer_length,
+    bool is_plain_text,
+    bool do_allow_extended_languages,
+    bool do_pick_summary_language,
+    bool do_remove_weak_matches,
+    const char* tld_hint,       // "id" boosts Indonesian
+    int encoding_hint,          // SJS boosts Japanese
+    Language language_hint,     // ITALIAN boosts it
+    Language* language3,
+    int* percent3,
+    double* normalized_score3,
+    int* text_bytes,
+    bool* is_reliable) {
+  // Fixed values for the two implementation arguments this entry point
+  // does not expose.
+  int no_flags = 0;
+  Language no_plus_one = UNKNOWN_LANGUAGE;
+
+  // Argument order differs from this function's own parameter list:
+  // do_allow_extended_languages is passed after the three hints.
+  // The detected language is returned as-is; do not default to English.
+  return CompactLangDetImpl::DetectLanguageSummaryV25(
+      tables, buffer, buffer_length, is_plain_text,
+      do_pick_summary_language, do_remove_weak_matches,
+      tld_hint, encoding_hint, language_hint,
+      do_allow_extended_languages,
+      no_flags, no_plus_one,
+      language3, percent3, normalized_score3,
+      text_bytes, is_reliable);
+}
+// Return version text string (the file-scope kDetectLanguageVersion; static storage, so callers must not free it)
+// String is "code_version - data_scrape_date"
+const char* CompactLangDet::DetectLanguageVersion() {
+  return kDetectLanguageVersion;
+}

data/ext/cld/encodings/compact_lang_det/compact_lang_det.h ADDED Viewed

@@ -0,0 +1,145 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
+// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
+// HAITIAN_CREOLE is detected as such.
+// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
+// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
+// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as MOLDAVIAN.
+// SERBO_CROATIAN, BOSNIAN, CROATIAN, SERBIAN, MONTENEGRIN in the Latin script
+//  are all detected as CROATIAN; in the Cyrillic script as SERBIAN.
+// Zhuang is detected in the Latin script only.
+//
+// The Google interface languages X_PIG_LATIN and X_KLINGON are detected in the
+//  extended calls ExtDetectLanguageSummary(). BorkBorkBork, ElmerFudd, and
+//  Hacker are not detected (too little training data).
+//
+// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
+//  is high enough. This happens with non-text input such as the bytes of a
+//  JPEG, and also with some text in languages outside the Google Language
+//  enum, such as Ilonggo.
+//
+// The following languages are detected in multiple scripts:
+//  AZERBAIJANI (Latin, Cyrillic*, Arabic*)
+//  BURMESE (Latin, Myanmar)
+//  HAUSA (Latin, Arabic)
+//  KASHMIRI (Arabic, Devanagari)
+//  KAZAKH (Latin, Cyrillic, Arabic)
+//  KURDISH (Latin*, Arabic)
+//  KYRGYZ (Cyrillic, Arabic)
+//  LIMBU (Devanagari, Limbu)
+//  MONGOLIAN (Cyrillic, Mongolian)
+//  SANSKRIT (Latin, Devanagari)
+//  SINDHI (Arabic, Devanagari)
+//  TAGALOG (Latin, Tagalog)
+//  TAJIK (Cyrillic, Arabic*)
+//  TATAR (Latin, Cyrillic, Arabic)
+//  TURKMEN (Latin, Cyrillic, Arabic)
+//  UIGHUR (Latin, Cyrillic, Arabic)
+//  UZBEK (Latin, Cyrillic, Arabic)
+//
+// * Due to a shortage of training text, AZERBAIJANI is not currently detected
+//   in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
+//   Arabic script.
+//
+#ifndef ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_
+#define ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_
+#include "languages/public/languages.h"
+#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
+namespace cld {
+  struct CLDTableSummary;  // Forward declaration: used in this header only through a pointer (DetectionTables::quadgram_obj)
+}  // namespace cld
+namespace CompactLangDet {
+  // Scan interchange-valid UTF-8 bytes and detect most likely language,
+  // or set of languages.
+  //
+  // Design goals:
+  //   Skip over big stretches of HTML tags
+  //   Able to return ranges of different languages
+  //   Relatively small tables and relatively fast processing
+  //   Thread safe
+  //
+  // For HTML documents, tags are skipped, along with <script> ... </script>
+  // and <style> ... </style> sequences, and entities are expanded.
+  //
+  // We distinguish between bytes of the raw input buffer and bytes of non-tag
+  // text letters. Since tags can be over 50% of the bytes of an HTML Page,
+  // and are nearly all seven-bit ASCII English, we prefer to distinguish
+  // language mixture fractions based on just the non-tag text.
+  //
+  // Inputs: buffer and buffer_length
+  //  Code skips HTML tags and expands HTML entities, unless
+  //  is_plain_text is true
+  // Outputs:
+  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
+  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
+  //  text_bytes is the amount of non-tag/letters-only text found
+  //  is_reliable set true if the returned Language is some amount more
+  //  probable than the second-best Language. Calculation is a complex function
+  //  of the length of the text and the different-script runs of text.
+  // Return value: the most likely Language for the majority of the input text
+  //  Length 0 input returns UNKNOWN_LANGUAGE.
+  //
+  // Subsetting: For fast detection over large documents, these routines will
+  // scan non-tag text of the initial part of a document, then will
+  // skip 4-16 bytes and subsample text in the rest of the document, up to a
+  // fixed limit (currently 160KB of non-tag letters).
+  //
+
+  // Pair of table pointers handed to DetectLanguage() as its `tables`
+  // argument (presumably the quadgram and unigram scoring tables, going by
+  // the member names -- confirm against compact_lang_det_impl.cc).
+  struct DetectionTables {
+    const cld::CLDTableSummary* quadgram_obj;
+    const UTF8PropObj* unigram_obj;
+  };
+
+  // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
+  //
+  // Accepts hints (tld_hint, encoding_hint, language_hint) to bias language
+  // priors.
+  //
+  // Extended languages are additional Google interface languages and Unicode
+  // single-language scripts, from ext_lang_enc.h. They are experimental and
+  // this call may be removed.
+  //
+  // Returns internal language scores as a ratio to
+  // normal score for real text in that language. Scores close to 1.0 indicate
+  // normal text, while scores far away from 1.0 indicate badly-skewed text or
+  // gibberish. The scores go to normalized_score3 (presumably one entry per
+  // language3 entry -- confirm against compact_lang_det_impl.cc).
+  //
+  // If do_pick_summary_language is true then CLD will sometimes
+  // not pick the top-scoring language; see CalcSummaryLang
+  // in compact_lang_det_impl.cc.  If it's false then the
+  // top language is always returned.
+  //
+  // If do_remove_weak_matches is true then CLD will delete
+  // poor scoring languages from the results, so that if a
+  // language is returned there is some confidence it is
+  // correct.
+  //
+  Language DetectLanguage(
+                          const DetectionTables* tables,
+                          const char* buffer,
+                          int buffer_length,
+                          bool is_plain_text,
+                          bool do_allow_extended_languages,
+                          bool do_pick_summary_language,
+                          bool do_remove_weak_matches,
+                          const char* tld_hint,       // "id" boosts Indonesian
+                          int encoding_hint,          // SJS boosts Japanese
+                          Language language_hint,     // ITALIAN boosts it
+                          Language* language3,
+                          int* percent3,
+                          double* normalized_score3,
+                          int* text_bytes,
+                          bool* is_reliable);
+
+  // Return version text string
+  // String is "code_version - data_scrape_date"
+  const char* DetectLanguageVersion();
+}       // End namespace CompactLangDet
+#endif  // ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_