feedbackmine-language_detector 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +7 -4
- data/lib/language_detector.rb +37 -32
- data/lib/model.yml +73838 -80934
- data/test/language_detector_test.rb +18 -27
- metadata +2 -2
data/README
CHANGED
@@ -1,8 +1,8 @@
-feedbackmine
-http://twitter.com/feedbackmine

 == Welcome
-This is a n-gram based language detector, written in ruby
+This is a n-gram based language detector, written in ruby.
+
+For technical details about language detection, I recommend this paper: http://www.sfs.uni-tuebingen.de/iscl/Theses/kranig.pdf

 == Installation
 sudo gem sources -a http://gems.github.com (you only have to do this once)
@@ -14,5 +14,8 @@ d = LanguageDetector.new
 p d.detect('this text is in English')

 == Thanks
-Kevin Burton (http://feedblog.org/2005/08/19/ngram-language-categorization-source/). I used all the training data in his ngramcat project.
+Kevin Burton (http://feedblog.org/2005/08/19/ngram-language-categorization-source/ ). I used all the training data in his ngramcat project.

+== Follow us on twitter
+
+http://twitter.com/feedbackmine
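For reference, the README's own usage boils down to the sketch below. The require name 'language_detector' is an assumption (the README only gives the gem name feedbackmine-language_detector); the detector lines are taken from the README hunk above.

  require 'rubygems'
  require 'language_detector'   # assumed require name for the feedbackmine-language_detector gem

  d = LanguageDetector.new
  p d.detect('this text is in English')   # expected to print a language code such as "en"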
data/lib/language_detector.rb
CHANGED
@@ -1,3 +1,7 @@
+require 'yaml'
+require 'jcode'
+$KCODE = 'u' if RUBY_VERSION < '1.9'
+
 class LanguageDetector
   def detect text
     @profiles ||= load_model
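These new requires are the main functional change in this file: 'yaml' backs YAML.dump/YAML.load_file below, and 'jcode' plus $KCODE = 'u' make Ruby 1.8 treat strings as UTF-8 so that jlength (used in count_ngram further down) counts characters rather than bytes. A small Ruby 1.8-only illustration of the difference; on 1.9+ jcode does not exist and String#length is already character-aware, which is why the $KCODE line is guarded:

  # Ruby 1.8 only: without jcode/$KCODE, multibyte text is counted in bytes.
  $KCODE = 'u'
  require 'jcode'

  s = "voilà"      # 5 characters, 6 bytes in UTF-8
  puts s.length    # => 6  (bytes, on Ruby 1.8)
  puts s.jlength   # => 5  (characters, provided by jcode)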
@@ -8,7 +12,7 @@ class LanguageDetector
     best_distance = nil
     @profiles.each {|profile|
       distance = profile.compute_distance(p)
-
+
       if !best_distance || distance < best_distance
         best_distance = distance
         best_profile = profile
@@ -16,11 +20,11 @@ class LanguageDetector
     }
     return best_profile.name
   end
-
+
   def self.train
-
+
     # For a full list of ISO 639 language tags visit:
-
+
     # http:#www.loc.gov/standards/iso639-2/englangn.html

     #LARGE profiles follow:
@@ -31,7 +35,7 @@ class LanguageDetector
     #always a good source of data.
     #
     # http:#en.wikipedia.org/wiki/World_War_II
-
+
     training_data = [
       # af (afrikaans)
       [ "ar", "ar-utf8.txt", "utf8", "arabic" ],
@@ -92,7 +96,7 @@ class LanguageDetector
     ]

     profiles = []
-    training_data.each {|data|
+    training_data.each {|data|
       p = Profile.new data[0]
       p.init_with_file data[1]
       profiles << p
@@ -103,7 +107,7 @@ class LanguageDetector
       YAML.dump(profiles, f)
     }
   end
-
+
   def load_model
     filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
     @profiles = YAML.load_file(filename)
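The two hunks above show the model round trip: self.train builds one Profile per training file and serializes the array with YAML.dump, and load_model reads it back with YAML.load_file from model.yml next to the library file. A hedged sketch of that workflow, assuming it is run from a checkout of the gem source so the training_data files and the lib/model.yml path resolve; the require name is an assumption as before:

  require 'language_detector'   # assumed require name

  LanguageDetector.train                        # reads the training files, rebuilds the profiles, dumps YAML
  profiles = YAML.load_file('lib/model.yml')    # path assumed relative to the gem checkout
  puts profiles.map {|profile| profile.name }.sort.inspect   # language codes such as "ar", ...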
@@ -112,12 +116,12 @@ end

 class Profile

-  PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
-    ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
+  PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
+    ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
     ?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
-
+
   LIMIT = 2000
-
+
   def compute_distance other_profile
     distance = 0
     other_profile.ngrams.each {|k, v|
@@ -130,48 +134,48 @@ class Profile
     }
     return distance
   end
-
+
   attr_reader :ngrams, :name
-
+
   def initialize(name)
     @name = name
     @puctuations = {}
     PUNCTUATIONS.each {|p| @puctuations[p] = 1}
     @ngrams = {}
   end
-
+
   def init_with_file filename
     ngram_count = {}
-
+
     path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
     puts "training with " + path
-    File.open(path).each_line{ |line|
-      _init_with_string line, ngram_count
+    File.open(path).each_line{ |line|
+      _init_with_string line, ngram_count
     }
-
+
     a = ngram_count.sort {|a,b| b[1] <=> a[1]}
     i = 1
-    a.each {|t|
+    a.each {|t|
       @ngrams[t[0]] = i
       i += 1
       break if i > LIMIT
     }
   end
-
+
   def init_with_string str
     ngram_count = {}
-
+
     _init_with_string str, ngram_count
-
+
     a = ngram_count.sort {|a,b| b[1] <=> a[1]}
     i = 1
-    a.each {|t|
+    a.each {|t|
       @ngrams[t[0]] = i
       i += 1
       break if i > LIMIT
     }
   end
-
+
   def _init_with_string str, ngram_count
     tokens = tokenize(str)
     tokens.each {|token|
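The init_with_* methods shown above build a rank profile: n-gram counts are sorted by frequency (b[1] <=> a[1]) and the top LIMIT (2000) n-grams are stored in @ngrams with ranks 1, 2, 3, and so on. The body of compute_distance is not part of the changed lines in this diff, so the following is only a sketch of the standard "out-of-place" measure described in the n-gram categorization literature the README links to, not necessarily this gem's exact formula:

  # Sketch: rank-based profile comparison (out-of-place measure).
  # For each n-gram in the text profile, add the rank difference against the
  # reference profile, or a fixed penalty when the n-gram is unseen there.
  def out_of_place_distance(text_ngrams, reference_ngrams, penalty = 2000)
    distance = 0
    text_ngrams.each do |ngram, rank|
      ref_rank = reference_ngrams[ngram]
      distance += ref_rank ? (ref_rank - rank).abs : penalty
    end
    distance
  end

  # text_ngrams / reference_ngrams are hashes shaped like Profile#ngrams,
  # e.g. {"the" => 1, "he_" => 2, ...}; the smallest total distance wins in detect.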
@@ -179,13 +183,13 @@ class Profile
       count_ngram token, 3, ngram_count
       count_ngram token, 4, ngram_count
       count_ngram token, 5, ngram_count
-    }
+    }
   end
-
+
   def tokenize str
     tokens = []
     s = ''
-    str.each_byte {|b|
+    str.each_byte {|b|
       if is_puctuation?(b)
         tokens << s unless s.empty?
         s = ''
@@ -196,12 +200,13 @@ class Profile
     tokens << s unless s.empty?
     return tokens
   end
-
+
   def is_puctuation? b
     @puctuations[b]
   end
-
+
   def count_ngram token, n, counts
+    token = "_#{token}#{'_' * (n-1)}" if n > 1 && token.jlength >= n
     i = 0
     while i + n <= token.length
       s = ''
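The one functional addition in this hunk is the underscore padding: for n > 1, a token that is at least n characters long (measured with jlength from the new jcode require) is wrapped with a leading "_" and n-1 trailing underscores before the sliding window runs, so word-boundary n-grams such as "_wa" and "r__" get counted as well. A small illustration of the padded windows; the helper name is made up for the example:

  # Illustration only: the n-grams produced for one padded token.
  def padded_ngrams(token, n)
    padded = "_#{token}#{'_' * (n - 1)}"
    (0..padded.length - n).map {|i| padded[i, n] }
  end

  p padded_ngrams("war", 3)   # => ["_wa", "war", "ar_", "r__"]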
@@ -210,14 +215,14 @@ class Profile
         s << token[i+j]
         j += 1
       end
-      if counts[s]
+      if counts[s]
         counts[s] = counts[s] + 1
-      else
+      else
         counts[s] = 1
       end
       i += 1
     end
-
+
     return counts
   end

|