RubyGems - language_detection - Versions diffs - 0.0.1 - Mend

language_detection 0.0.1

Files changed (100) hide show

data/.gitignore +19 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +85 -0
data/Rakefile +11 -0
data/ext/cld/Makefile +34 -0
data/ext/cld/base/basictypes.h +348 -0
data/ext/cld/base/build_config.h +124 -0
data/ext/cld/base/casts.h +156 -0
data/ext/cld/base/commandlineflags.h +443 -0
data/ext/cld/base/crash.h +41 -0
data/ext/cld/base/dynamic_annotations.h +358 -0
data/ext/cld/base/global_strip_options.h +59 -0
data/ext/cld/base/log_severity.h +46 -0
data/ext/cld/base/logging.h +1403 -0
data/ext/cld/base/macros.h +243 -0
data/ext/cld/base/port.h +54 -0
data/ext/cld/base/scoped_ptr.h +428 -0
data/ext/cld/base/stl_decl.h +0 -0
data/ext/cld/base/stl_decl_msvc.h +107 -0
data/ext/cld/base/string_util.h +29 -0
data/ext/cld/base/strtoint.h +93 -0
data/ext/cld/base/template_util.h +96 -0
data/ext/cld/base/type_traits.h +198 -0
data/ext/cld/base/vlog_is_on.h +143 -0
data/ext/cld/cld.so +0 -0
data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
data/ext/cld/encodings/internal/encodings.cc +12 -0
data/ext/cld/encodings/lang_enc.h +254 -0
data/ext/cld/encodings/proto/encodings.pb.h +169 -0
data/ext/cld/encodings/public/encodings.h +301 -0
data/ext/cld/extconf.rb +1 -0
data/ext/cld/language_detection.cc +88 -0
data/ext/cld/languages/internal/languages.cc +337 -0
data/ext/cld/languages/proto/languages.pb.h +179 -0
data/ext/cld/languages/public/languages.h +379 -0
data/language_detection.gemspec +28 -0
data/lib/language_detection/string.rb +1 -0
data/lib/language_detection/version.rb +3 -0
data/lib/language_detection.rb +54 -0
data/test/_helper.rb +15 -0
data/test/fixtures/languages.csv +80 -0
data/test/language_detection_test.rb +88 -0
metadata +250 -0

data/ext/cld/encodings/public/encodings.h ADDED Viewed

@@ -0,0 +1,301 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef ENCODINGS_PUBLIC_ENCODINGS_H_
+#define ENCODINGS_PUBLIC_ENCODINGS_H_
+// This interface defines the Encoding enum and various functions that
+// depend only on Encoding values.
+// A hash-function for Encoding, hash<Encoding>, is defined in
+// i18n/encodings/public/encodings-hash.h
+// On some Windows projects, UNICODE may be defined, which would prevent the
+// Encoding enum below from compiling. Note that this is a quick fix that does
+// not break any existing projects. The UNICODE enum may someday be changed
+// to something more specific and non-colliding, but this involves careful
+// testing of changes in many other projects.
+#undef UNICODE
+// NOTE: The Encoding enum must always start at 0. This assumption has
+// been made and used.
+#ifndef SWIG
+#include "encodings/proto/encodings.pb.h"
+// We must have this for compatibility.
+// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
+//using namespace i18n::encodings;
+#else
+// Special proto SWIG workaround header file.
+#include "i18n/encodings/internal/encodings_proto_wrapper.h"
+#endif
+const int kNumEncodings = NUM_ENCODINGS;
+// some of the popular encoding aliases
+// TODO(jrm) Make these static const Encoding values instead of macros.
+#define LATIN1           ISO_8859_1
+#define LATIN2           ISO_8859_2
+#define LATIN3           ISO_8859_3
+#define LATIN4           ISO_8859_4
+#define CYRILLIC         ISO_8859_5
+#define ARABIC_ENCODING  ISO_8859_6     // avoiding the same name as language
+#define GREEK_ENCODING   ISO_8859_7     // avoiding the same name as language
+#define HEBREW_ENCODING  ISO_8859_8     // avoiding the same name as language
+#define LATIN5           ISO_8859_9
+#define LATIN6           ISO_8859_10
+#define KOREAN_HANGUL    KOREAN_EUC_KR
+// The default Encoding (LATIN1).
+Encoding default_encoding();
+// *************************************************************
+// Encoding predicates
+//   IsValidEncoding()
+//   IsEncEncCompatible
+//   IsSupersetOfAscii7Bit
+//   Is8BitEncoding
+//   IsCJKEncoding
+//   IsHebrewEncoding
+//   IsRightToLeftEncoding
+//   IsLogicalRightToLeftEncoding
+//   IsVisualRightToLeftEncoding
+//   IsIso2022Encoding
+//   IsIso2022JpOrVariant
+//   IsShiftJisOrVariant
+//   IsJapaneseCellPhoneCarrierSpecificEncoding
+// *************************************************************
+// IsValidEncoding
+// ===================================
+//
+// Function to check if the input encoding enum is within range.
+//
+bool IsValidEncoding(Encoding enc);
+//
+// IsEncEncCompatible
+// ------------------
+//
+// This function is to determine whether or not converting from the
+// first encoding to the second requires any changes to the underlying
+// text (e.g.  ASCII_7BIT is a subset of UTF8).
+//
+// TODO(someone more familiar with i18n): the current implementation
+// is likely incomplete.  It would be good to consider the full matrix
+// of all pairs of encodings and to fish out all compatible pairs.
+//
+bool IsEncEncCompatible(const Encoding from, const Encoding to);
+// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
+// encoding represent the same characters as they do in ISO_8859_1.
+// WARNING: This function does not currently return true for all encodings that
+// are supersets of Ascii 7-bit.
+bool IsSupersetOfAscii7Bit(Encoding e);
+// To be an 8-bit encoding means that there are at most 256 symbols.
+// Each byte determines a new character; there are no multi-byte sequences.
+// WARNING: This function does not currently return true for all encodings that
+// are 8-bit encodings.
+bool Is8BitEncoding(Encoding e);
+// IsCJKEncoding
+// -------------
+//
+// This function returns true if the encoding is either Chinese
+// (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
+// considered a CJK encoding.
+bool IsCJKEncoding(Encoding e);
+// IsHebrewEncoding
+// -------------
+//
+// This function returns true if the encoding is a Hebrew specific
+// encoding (not UTF8, etc).
+bool IsHebrewEncoding(Encoding e);
+// IsRightToLeftEncoding
+// ---------------------
+//
+// Returns true if the encoding is a right-to-left encoding.
+//
+// Note that the name of this function is somewhat misleading. There is nothing
+// "right to left" about these encodings. They merely contain code points for
+// characters in RTL languages such as Hebrew and Arabic. But this is also
+// true for UTF-8.
+//
+// TODO(benjy): Get rid of this function. The only special-case we
+// should need to worry about are visual encodings. Anything we
+// need to do for all 'RTL' encodings we need to do for UTF-8 as well.
+bool IsRightToLeftEncoding(Encoding enc);
+// IsLogicalRightToLeftEncoding
+// ----------------------------
+//
+// Returns true if the encoding is a logical right-to-left encoding.
+// Logical right-to-left encodings are those that the browser renders
+// right-to-left and applies the BiDi algorithm to. Therefore the characters
+// appear in reading order in the file, and indexing, snippet generation etc.
+// should all just work with no special processing.
+//
+// TODO(benjy): Get rid of this function. The only special-case we
+// should need to worry about are visual encodings.
+bool IsLogicalRightToLeftEncoding(Encoding enc);
+// IsVisualRightToLeftEncoding
+// ---------------------------
+//
+// Returns true if the encoding is a visual right-to-left encoding.
+// Visual right-to-left encodings are those that the browser renders
+// left-to-right and does not apply the BiDi algorithm to. Therefore each
+// line appears in reverse order in the file, lines are manually wrapped
+// by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
+// the prehistoric days when browsers couldn't render right-to-left, but
+// unfortunately some visual pages persist to this day. These documents require
+// special processing so that we don't index or snippet them with each line
+// reversed.
+bool IsVisualRightToLeftEncoding(Encoding enc);
+// IsIso2022Encoding
+// -----------------
+//
+// Returns true if the encoding is a kind of ISO 2022 such as
+// ISO-2022-JP.
+bool IsIso2022Encoding(Encoding enc);
+// IsIso2022JpOrVariant
+// --------------------
+//
+// Returns true if the encoding is ISO-2022-JP or a variant such as
+// KDDI's ISO-2022-JP.
+bool IsIso2022JpOrVariant(Encoding enc);
+// IsShiftJisOrVariant
+// --------------------
+//
+// Returns true if the encoding is Shift_JIS or a variant such as
+// KDDI's Shift_JIS.
+bool IsShiftJisOrVariant(Encoding enc);
+// IsJapaneseCellPhoneCarrierSpecificEncoding
+// ------------------------------------------
+//
+// Returns true if it's Japanese cell phone carrier specific encoding
+// such as KDDI_SHIFT_JIS.
+bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
+// *************************************************************
+// ENCODING NAMES
+//
+// This interface defines a standard name for each valid encoding, and
+// a standard name for invalid encodings. (Some names use all upper
+// case, but others use mixed case.)
+//
+//   EncodingName() [Encoding to name]
+//   MimeEncodingName() [Encoding to name]
+//   EncodingFromName() [name to Encoding]
+//   EncodingNameAliasToEncoding() [name to Encoding]
+//   default_encoding_name()
+//   invalid_encoding_name()
+// *************************************************************
+// EncodingName
+// ------------
+//
+// Given the encoding, returns its standard name.
+// Return invalid_encoding_name() if the encoding is invalid.
+//
+const char* EncodingName(Encoding enc);
+//
+// MimeEncodingName
+// ----------------
+//
+// Return the "preferred MIME name" of an encoding.
+//
+// This name is suitable for using in HTTP headers, HTML tags,
+// and as the "charset" parameter of a MIME Content-Type.
+const char* MimeEncodingName(Encoding enc);
+// The maximum length of an encoding name
+const int kMaxEncodingNameSize = 50;
+// The standard name of the default encoding.
+const char* default_encoding_name();
+// The name used for an invalid encoding.
+const char* invalid_encoding_name();
+// EncodingFromName
+// ----------------
+//
+// If enc_name matches the standard name of an Encoding, using a
+// case-insensitive comparison, set *encoding to that Encoding and
+// return true.  Otherwise set *encoding to UNKNOWN_ENCODING and
+// return false.
+//
+// REQUIRES: encoding must not be NULL.
+//
+bool EncodingFromName(const char* enc_name, Encoding *encoding);
+//
+// EncodingNameAliasToEncoding
+// ---------------------------
+//
+// If enc_name matches the standard name or an alias of an Encoding,
+// using a case-insensitive comparison, return that
+// Encoding. Otherwise, return UNKNOWN_ENCODING.
+//
+// Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
+// GREEK), alternate names (e.g., "cyrillic" for ISO_8859_5) and
+// common variations with hyphens and underscores (e.g., "koi8-u" and
+// "koi8u" for RUSSIAN_KOI8_R).
+Encoding EncodingNameAliasToEncoding(const char *enc_name);
+// *************************************************************
+// Miscellany
+// *************************************************************
+// PreferredWebOutputEncoding
+// --------------------------
+//
+// Some multi-byte encodings use byte values that coincide with the
+// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
+// can misinterpret these, as indicated in an external XSS report from
+// 2007-02-15. Here, we map these dangerous encodings to safer ones. We
+// also use UTF8 instead of encodings that we don't support in our
+// output, and we generally try to be conservative in what we send out.
+// Where the client asks for single- or double-byte encodings that are
+// not as common, we substitute a more common single- or double-byte
+// encoding, if there is one, thereby preserving the client's intent
+// to use less space than UTF-8. This also means that characters
+// outside the destination set will be converted to HTML NCRs (&#NNN;)
+// if requested.
+Encoding PreferredWebOutputEncoding(Encoding enc);
+// InitEncodings
+// -------------
+//
+// Ensures the encodings module has been initialized.  Normally this happens
+// during InitGoogle, but this allows access for scripts that don't
+// support InitGoogle.
+void InitEncodings();
+#endif  // ENCODINGS_PUBLIC_ENCODINGS_H_

data/ext/cld/extconf.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ # TODO: Generate Makefile

data/ext/cld/language_detection.cc ADDED Viewed

@@ -0,0 +1,88 @@
+#include <stdio.h>
+#include <string.h>
+#include "encodings/compact_lang_det/compact_lang_det.h"
+#include "encodings/compact_lang_det/ext_lang_enc.h"
+#include "encodings/proto/encodings.pb.h"
+typedef struct {  // One candidate language reported next to the summary result.
+  const  char *name;  // LanguageName() of the candidate language
+  const  char *code;  // ExtLanguageCode() of the candidate language
+  int    percent;  // copied from the matching percent3[] slot of DetectLanguage
+  double score;  // copied from the matching normalized_score3[] slot
+} LanguageDetail;
+typedef struct {  // Result returned by value from language_detection().
+  const char *name;  // LanguageName() of the language DetectLanguage returned
+  const char *code;  // ExtLanguageCode() of the language DetectLanguage returned
+  bool reliable;  // is_reliable flag written by DetectLanguage
+  int  text_bytes;  // text_bytes count written by DetectLanguage
+  LanguageDetail *details;  // new[]-allocated array of 3 entries, one per language3[] slot
+} DetectedLanguage;
+extern "C" {
+  // Detects the language of a NUL-terminated text buffer with CLD.
+  //
+  // src           - text to analyse. A NULL pointer is treated as an empty
+  //                 string instead of being passed to strlen().
+  // is_plain_text - forwarded unchanged to CompactLangDet::DetectLanguage.
+  //
+  // Returns a DetectedLanguage by value. `name`/`code` are the
+  // LanguageName()/ExtLanguageCode() strings of the language DetectLanguage
+  // returned; `reliable` and `text_bytes` are its out-parameters. `details`
+  // points at a new[]-allocated array with one LanguageDetail per
+  // language3/percent3/normalized_score3 slot. The caller owns that array
+  // and should pass the struct to language_detection_free() when finished.
+  DetectedLanguage language_detection(const char * src, bool is_plain_text) {
+    // Size of the language3/percent3/normalized_score3 output arrays.
+    const int kCandidateCount = 3;
+    // strlen(NULL) is undefined behaviour; fall back to an empty buffer.
+    const char* text = (src != NULL) ? src : "";
+    bool do_allow_extended_languages = true;
+    bool do_pick_summary_language    = false;
+    bool do_remove_weak_matches      = false;
+    // "id" boosts Indonesian
+    //
+    const char* tld_hint = NULL;
+    // SJS boosts Japanese
+    //
+    int encoding_hint = UNKNOWN_ENCODING;
+    // ITALIAN boosts it
+    //
+    Language language_hint = UNKNOWN_LANGUAGE;
+    // Out-parameters start from known values so the returned struct never
+    // carries indeterminate data.
+    bool     is_reliable = false;
+    int      text_bytes  = 0;
+    double   normalized_score3[kCandidateCount];
+    Language language3[kCandidateCount];
+    int      percent3[kCandidateCount];
+    // The buffer length is passed as an int, so make the narrowing from
+    // size_t explicit.
+    Language lang = CompactLangDet::DetectLanguage(0,
+                                                   text,
+                                                   static_cast<int>(strlen(text)),
+                                                   is_plain_text,
+                                                   do_allow_extended_languages,
+                                                   do_pick_summary_language,
+                                                   do_remove_weak_matches,
+                                                   tld_hint,
+                                                   encoding_hint,
+                                                   language_hint,
+                                                   language3,
+                                                   percent3,
+                                                   normalized_score3,
+                                                   &text_bytes,
+                                                   &is_reliable);
+    DetectedLanguage detected_language;
+    detected_language.name       = LanguageName(lang);
+    detected_language.code       = ExtLanguageCode(lang);
+    detected_language.reliable   = is_reliable;
+    detected_language.text_bytes = text_bytes;
+    // Heap-allocated because the array must outlive this call; it is released
+    // by language_detection_free().
+    LanguageDetail * details = new LanguageDetail [kCandidateCount];
+    for (int i = 0; i < kCandidateCount; i++) {
+      details[i].name    = LanguageName(language3[i]);
+      details[i].code    = ExtLanguageCode(language3[i]);
+      details[i].percent = percent3[i];
+      details[i].score   = normalized_score3[i];
+    }
+    detected_language.details = details;
+    return detected_language;
+  }
+
+  // Releases the details array allocated by language_detection() and clears
+  // the pointer so that a repeated call is harmless. NULL is accepted.
+  void language_detection_free(DetectedLanguage *detected) {
+    if (detected == NULL) {
+      return;
+    }
+    delete [] detected->details;
+    detected->details = NULL;
+  }
+}