langusta 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. data/.document +5 -0
  2. data/Gemfile +11 -0
  3. data/Gemfile.lock +32 -0
  4. data/LICENSE.txt +13 -0
  5. data/README.rdoc +34 -0
  6. data/Rakefile +55 -0
  7. data/VERSION +1 -0
  8. data/bin/langusta +5 -0
  9. data/data/messages.properties +128 -0
  10. data/data/uppercase.bin +0 -0
  11. data/langusta.gemspec +210 -0
  12. data/lib/langusta.rb +36 -0
  13. data/lib/langusta/command.rb +78 -0
  14. data/lib/langusta/detector.rb +197 -0
  15. data/lib/langusta/detector_factory.rb +46 -0
  16. data/lib/langusta/java_property_reader.rb +35 -0
  17. data/lib/langusta/lang_profile.rb +80 -0
  18. data/lib/langusta/language.rb +14 -0
  19. data/lib/langusta/language_detection_facade.rb +24 -0
  20. data/lib/langusta/n_gram.rb +116 -0
  21. data/lib/langusta/regex_helper.rb +15 -0
  22. data/lib/langusta/tag_extractor.rb +39 -0
  23. data/lib/langusta/ucs2_string.rb +70 -0
  24. data/lib/langusta/unicode_block.rb +56 -0
  25. data/profiles/af +1 -0
  26. data/profiles/ar +1 -0
  27. data/profiles/bg +1 -0
  28. data/profiles/bn +1 -0
  29. data/profiles/cs +1 -0
  30. data/profiles/da +1 -0
  31. data/profiles/de +1 -0
  32. data/profiles/el +1 -0
  33. data/profiles/en +1 -0
  34. data/profiles/es +1 -0
  35. data/profiles/fa +1 -0
  36. data/profiles/fi +1 -0
  37. data/profiles/fr +1 -0
  38. data/profiles/gu +1 -0
  39. data/profiles/he +1 -0
  40. data/profiles/hi +1 -0
  41. data/profiles/hr +1 -0
  42. data/profiles/hu +1 -0
  43. data/profiles/id +1 -0
  44. data/profiles/it +1 -0
  45. data/profiles/ja +1 -0
  46. data/profiles/kn +1 -0
  47. data/profiles/ko +1 -0
  48. data/profiles/mk +1 -0
  49. data/profiles/ml +1 -0
  50. data/profiles/mr +1 -0
  51. data/profiles/ne +1 -0
  52. data/profiles/nl +1 -0
  53. data/profiles/no +1 -0
  54. data/profiles/pa +1 -0
  55. data/profiles/pl +1 -0
  56. data/profiles/pt +1 -0
  57. data/profiles/ro +1 -0
  58. data/profiles/ru +1 -0
  59. data/profiles/sk +1 -0
  60. data/profiles/so +1 -0
  61. data/profiles/sq +1 -0
  62. data/profiles/sv +1 -0
  63. data/profiles/sw +1 -0
  64. data/profiles/ta +1 -0
  65. data/profiles/te +1 -0
  66. data/profiles/th +1 -0
  67. data/profiles/tl +1 -0
  68. data/profiles/tr +1 -0
  69. data/profiles/uk +1 -0
  70. data/profiles/ur +1 -0
  71. data/profiles/vi +1 -0
  72. data/profiles/zh-cn +1 -0
  73. data/profiles/zh-tw +1 -0
  74. data/test/helper.rb +20 -0
  75. data/test/quality/test_falsified.rb +33 -0
  76. data/test/test_command.rb +34 -0
  77. data/test/test_data/af +1 -0
  78. data/test/test_data/ar +1 -0
  79. data/test/test_data/bg +32 -0
  80. data/test/test_data/bn +9 -0
  81. data/test/test_data/cs +9 -0
  82. data/test/test_data/da +14 -0
  83. data/test/test_data/de +4 -0
  84. data/test/test_data/el +7 -0
  85. data/test/test_data/en +26 -0
  86. data/test/test_data/es +4 -0
  87. data/test/test_data/fa +21 -0
  88. data/test/test_data/fi +8 -0
  89. data/test/test_data/fr +13 -0
  90. data/test/test_data/gu +3 -0
  91. data/test/test_data/he +20 -0
  92. data/test/test_data/hi +1 -0
  93. data/test/test_data/hr +16 -0
  94. data/test/test_data/hu +6 -0
  95. data/test/test_data/id +2 -0
  96. data/test/test_data/it +3 -0
  97. data/test/test_data/ja +34 -0
  98. data/test/test_data/kn +14 -0
  99. data/test/test_data/ko +2 -0
  100. data/test/test_data/mk +3 -0
  101. data/test/test_data/ml +1 -0
  102. data/test/test_data/mr +3 -0
  103. data/test/test_data/ne +2 -0
  104. data/test/test_data/nl +1 -0
  105. data/test/test_data/no +3 -0
  106. data/test/test_data/pa +1 -0
  107. data/test/test_data/pl +23 -0
  108. data/test/test_data/pt +2 -0
  109. data/test/test_data/ro +2 -0
  110. data/test/test_data/ru +1 -0
  111. data/test/test_data/sk +2 -0
  112. data/test/test_data/so +4 -0
  113. data/test/test_data/sq +4 -0
  114. data/test/test_data/sv +3 -0
  115. data/test/test_data/sw +6 -0
  116. data/test/test_data/ta +1 -0
  117. data/test/test_data/te +2 -0
  118. data/test/test_data/th +3 -0
  119. data/test/test_data/tl +1 -0
  120. data/test/test_data/tr +2 -0
  121. data/test/test_data/uk +3 -0
  122. data/test/test_data/ur +1 -0
  123. data/test/test_data/vi +2 -0
  124. data/test/test_data/zh-tw +3 -0
  125. data/test/test_detector.rb +52 -0
  126. data/test/test_detector_factory.rb +16 -0
  127. data/test/test_java_property_reader.rb +8 -0
  128. data/test/test_lang_profile.rb +79 -0
  129. data/test/test_language.rb +15 -0
  130. data/test/test_language_detection_facade.rb +9 -0
  131. data/test/test_langusta.rb +25 -0
  132. data/test/test_n_gram.rb +103 -0
  133. data/test/test_tag_extractor.rb +71 -0
  134. data/test/test_ucs2_string.rb +9 -0
  135. data/test/test_unicode_block.rb +9 -0
  136. metadata +320 -0
@@ -0,0 +1,36 @@
# Library entry point: adjusts the load path, pulls in runtime
# dependencies and lazily registers every Langusta component.
$: << File.expand_path(File.dirname(__FILE__))

require 'rubygems'
require 'bundler'
Bundler.setup

require 'optparse'
require 'iconv'

# Required gems
require 'oniguruma'
require 'yajl'

module Langusta
  VERSION = '0.1.0'

  # Components are autoloaded so requiring 'langusta' stays cheap.
  # NOTE: the original registered :Detector twice; the duplicate is removed.
  autoload :RegexHelper, 'langusta/regex_helper'
  autoload :UCS2String, 'langusta/ucs2_string'
  autoload :Language, 'langusta/language'
  autoload :LangProfile, 'langusta/lang_profile'
  autoload :Detector, 'langusta/detector'
  autoload :JavaPropertyReader, 'langusta/java_property_reader'
  autoload :UnicodeBlock, 'langusta/unicode_block'
  autoload :NGram, 'langusta/n_gram'
  autoload :DetectorFactory, 'langusta/detector_factory'
  autoload :TagExtractor, 'langusta/tag_extractor'
  autoload :Command, 'langusta/command'
  autoload :LanguageDetectionFacade, 'langusta/language_detection_facade'

  # Filesystem layout of the bundled data files, relative to the gem root.
  ABSOLUTE_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  PROFILES_PATH = File.join(ABSOLUTE_PATH, 'profiles')
  UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
  MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
end
@@ -0,0 +1,78 @@
1
module Langusta
  # Command-line front-end: parses ARGV and dispatches to the requested
  # language-detection operation.
  class Command
    # Parses command-line arguments and runs the requested operation.
    # @param argv [Array<String>] command-line arguments (consumed by the parser)
    # @return [Fixnum] process exit status (always 0)
    def self.run(argv)
      options = {}
      # Renamed block param (was `opts`, shadowing an outer local whose
      # assigned parse! result was never used).
      OptionParser.new do |parser|
        parser.on("--detectlang", "Detect the language from the given text") do |d|
          options[:operation] = :detectlang if d
        end

        parser.on("--batchtest", "Batch test of language detection") do |b|
          options[:operation] = :batchtest if b
        end

        parser.on("-d [profile directory]") do |pd|
          options[:profile_directory] = pd
        end

        parser.on("-a [alpha]", Float) do |alpha|
          options[:alpha] = alpha
        end
      end.parse!(argv)

      # After parse!, argv holds only the positional test-file arguments.
      arguments = [options[:profile_directory]] + [argv]
      arguments << options[:alpha] if options[:alpha]

      case options[:operation]
      when :detectlang
        self.new.send(:detect_lang, *arguments)
      when :batchtest
        self.new.send(:batch_test, *arguments)
      else
        $stderr.puts <<EOF
Usage:

langusta --detectlang -d [profile directory] -a [alpha] [test file(s)]
langusta --batchtest -d [profile directory] -a [alpha] [test file(s)]
EOF
      end
      0
    end

    def initialize
      @detector_factory = DetectorFactory.new
    end

    # Detects and prints the language of each test file.
    # @param profile_directory [String] directory of language profiles
    # @param test_files [Array<String>] files to classify
    # @param alpha [Float, nil] optional smoothing parameter
    def detect_lang(profile_directory, test_files, alpha=nil)
      initialize_factory(profile_directory)
      test_files.each do |filename|
        language = detect_single_lang(filename, alpha)
        puts "%s: %s" % [filename, language]
      end
    end

    # Not implemented yet.
    def batch_test(profile_directory, test_files, alpha=nil)
    end

    # Detects the language of a single UTF-8 encoded file.
    # @return [String] detected language code
    def detect_single_lang(filename, alpha)
      # File.read closes the handle; File.open(...).read leaked it.
      ucs2_content = UCS2String.from_utf8(File.read(filename))
      detector = @detector_factory.create(alpha)
      detector.append(ucs2_content)

      detector.detect()
    end

    # Loads every profile from the directory into the detector factory.
    def initialize_factory(profile_directory)
      profiles = load_profiles(profile_directory)
      profiles.each_with_index do |profile, index|
        @detector_factory.add_profile(profile, index, profiles.length)
      end
    end

    # Reads all profile files found directly under +directory+.
    def load_profiles(directory)
      @profiles = Dir[File.join(directory, '/*')].map do |filename|
        LangProfile.load_from_file(filename)
      end
    end
  end
end
@@ -0,0 +1,197 @@
1
module Langusta
  # Core language detector (port of the Java language-detection library).
  # Text is appended, normalized into n-grams, then per-language
  # probabilities are estimated with @n_trial randomized trials.
  class Detector
    attr_accessor :verbose, :alpha, :max_text_length

    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05
    ITERATION_LIMIT = 1000
    PROB_THRESHOLD = 0.1
    CONV_THRESHOLD = 0.99999
    BASE_FREQ = 10000
    UNKNOWN_LANG = "unknown"

    # @param factory [DetectorFactory] supplies the n-gram probability map
    #   and the ordered language list.
    def initialize(factory)
      @word_lang_prob_map = factory.word_lang_prob_map
      @lang_list = factory.lang_list
      @text = UCS2String.new('')
      @langprob = nil
      @alpha = ALPHA_DEFAULT
      @n_trial = 7
      @max_text_length = 10000
      @prior_map = nil
      @verbose = false
    end

    # Append more text to be recognized.
    # @param text [UCS2String] text to be recognized
    def append(text)
      raise TypeError.new("Expected: UCS2String, got: #{text.class}") unless text.is_a?(UCS2String)
      text.gsub!(RegexHelper::URL_REGEX, "\x00\x20")
      text.gsub!(RegexHelper::MAIL_REGEX, "\x00\x20")
      text = text.map do |c|
        NGram.normalize(c)
      end
      # Keep the normalized string itself, not gsub!'s return value:
      # a bang gsub conventionally returns nil when nothing matched,
      # which would have left @text nil.
      text.gsub!(RegexHelper::SPACE_REGEX, "\x00\x20")
      @text = text
    end

    # Detect the language.
    # @return [String] (usually) two-letter code describing the language.
    def detect
      probabilities = get_probabilities()
      (probabilities.length > 0) ? probabilities.first.lang : UNKNOWN_LANG
    end

    private
    # Runs the randomized trials and accumulates averaged language
    # probabilities into @langprob.
    # @raise [RuntimeError] when no known n-grams are present in the text.
    def detect_block
      cleaning_text()
      ngrams = extract_ngrams()
      raise "no features in text" if ngrams.empty?
      @langprob = Array.new(@lang_list.length, 0.0)

      @n_trial.times do
        prob = init_probability()
        # Jitter alpha per trial (Box-Muller gaussian).
        alpha = @alpha + Detector.next_gaussian() * ALPHA_WIDTH

        i = 0
        Kernel.loop do
          r = Kernel.rand(ngrams.length)
          update_lang_prob(prob, ngrams[r], alpha)
          # Test convergence only every 5th iteration. The original
          # `if i % 5` was always truthy (0 is truthy in Ruby) and `i`
          # was never incremented, so ITERATION_LIMIT never triggered.
          if i % 5 == 0
            break if Detector.normalize_prob(prob) > CONV_THRESHOLD || i >= ITERATION_LIMIT
            # verbose
          end
          i += 1
        end
        @langprob.length.times do |j|
          @langprob[j] += prob[j] / @n_trial
        end
        # verbose
      end
    end

    # Installs normalized language priors from a language => weight hash.
    # @param prior_map [Hash{String => Numeric}]
    # @raise [RuntimeError] on negative weights or an all-zero map.
    def set_prior_map(prior_map)
      # Original used `Array.new[...]` (indexing a fresh empty array -> nil).
      @prior_map = Array.new(@lang_list.length, 0.0)
      sump = 0.0
      @prior_map.length.times do |i|
        lang = @lang_list[i]
        # Look up in the supplied hash, not in the array being built.
        if prior_map.has_key?(lang)
          p = prior_map[lang]
          raise "probability must be non-negative" if p < 0
          @prior_map[i] = p
          sump += p
        end
      end
      raise "more one of prob must be non-zero" if sump <= 0
      @prior_map.map! do |p|
        p / sump
      end
    end

    # Normalizes +prob+ in place so it sums to 1.
    # @return [Float] the maximum normalized probability.
    def self.normalize_prob(prob)
      maxp = 0.0; sump = 0.0
      prob.each do |p|
        sump += p
      end
      prob.map! do |p|
        q = p / sump
        maxp = q if q > maxp
        q
      end
      maxp
    end

    private
    # Strips Latin characters when the text is predominantly non-Latin
    # (comparisons are on big-endian UCS-2 two-byte characters).
    def cleaning_text
      non_latin_count = latin_count = 0
      @text.each_char do |c|
        if c < "\x00z" && c >= "\x00A"
          latin_count += 1
        elsif c >= "\x03\x00" && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
          non_latin_count += 1
        end
      end
      if latin_count * 2 < non_latin_count
        text_without_latin = UCS2String.new('')
        @text.each_char do |c|
          text_without_latin << c if c > "\x00z" || c < "\x00A"
        end
        @text = text_without_latin
      end
    end

    # Collects every 1..N-gram of @text that appears in the probability map.
    def extract_ngrams
      list = []
      ngram = NGram.new
      @text.each_char do |char|
        ngram.add_char(char)
        (1..NGram::N_GRAM).each do |n|
          w = ngram.get(n)
          list << w if w && @word_lang_prob_map.has_key?(w)
        end
      end
      list
    end

    # Lazily runs detection, then returns candidates above threshold.
    def get_probabilities
      if @langprob.nil?
        detect_block()
      end
      sort_probability(@langprob)
    end

    # Initial probability vector: the prior map if set, else uniform.
    def init_probability
      prob = Array.new(@lang_list.length)
      if @prior_map
        prob = @prior_map.clone
      else
        prob.length.times do |i|
          prob[i] = 1.0 / @lang_list.length
        end
      end
      prob
    end

    # Maps the probability vector onto Language objects, keeps those above
    # PROB_THRESHOLD and orders them most-probable FIRST — detect() takes
    # .first, so the original ascending sort returned the least likely
    # language.
    def sort_probability(prob)
      prob.each_with_index.map do |p, index|
        Language.new(@lang_list[index], p)
      end.select do |x|
        x.prob > PROB_THRESHOLD
      end.sort_by do |x|
        -x.prob
      end
    end

    # Multiplies per-language probabilities by the smoothed likelihood of
    # +word+. Returns false when the word is unknown.
    def update_lang_prob(prob, word, alpha)
      return false if word.nil? || ! @word_lang_prob_map.has_key?(word)

      lang_prob_map = @word_lang_prob_map[word]
      # verbose
      weight = alpha / BASE_FREQ
      prob.length.times do |i|
        prob[i] *= weight + lang_prob_map[i]
      end
      true
    end

    # Debug rendering of a probability vector, omitting near-zero entries.
    def word_prob_to_string(prob)
      prob.zip(@lang_list).select do |p, lang|
        p > 0.00001
      end.map do |p, lang|
        "%s:%.5f" % [p, lang]
      end.join(' ')
    end

    # Box-Muller transform.
    # @return [Float] a sample from the standard normal distribution.
    def self.next_gaussian
      s = 0
      while s >= 1 || s == 0
        v1 = 2 * Kernel.rand - 1
        v2 = 2 * Kernel.rand - 1
        s = v1 * v1 + v2 * v2
      end
      multiplier = Math.sqrt(-2 * Math.log(s)/s)
      return v1 * multiplier
    end
  end
end
@@ -0,0 +1,46 @@
1
module Langusta
  # Raised on language-detection setup/usage errors.
  class LangDetectException < StandardError; end

  # Accumulates language profiles and builds configured Detector instances.
  class DetectorFactory
    attr_reader :word_lang_prob_map, :lang_list

    def initialize
      # n-gram => per-language probability vector (indexed like @lang_list).
      @word_lang_prob_map = {}
      @lang_list = []
    end

    # Adds a new language profile to this factory.
    # @param profile [LangProfile] language profile to be added.
    # @param index [Fixnum] index at which the language profile is to be added.
    # @param langsize [Fixnum] how many language profiles are to be added to this factory in total.
    # @raise [LangDetectException] if a profile with the same name was already added.
    def add_profile(profile, index, langsize)
      raise LangDetectException.new("duplicate the same language profile") if @lang_list.include?(profile.name)
      @lang_list << profile.name
      profile.freq.keys.each do |word|
        @word_lang_prob_map[word] ||= Array.new(langsize, 0.0)
        # Relative frequency of this n-gram within the profile,
        # normalized by the total count of same-length n-grams.
        prob = 1.0 * profile.freq[word] / profile.n_words[word.length - 1]
        @word_lang_prob_map[word][index] = prob
      end
    end

    # Creates a new detector object, based on a preconfigured set of language profiles.
    # @param alpha [Float, nil] optional additive-smoothing parameter.
    # @return [Detector]
    def create(alpha=nil)
      detector = create_detector()
      detector.alpha = alpha if alpha
      detector
    end

    private
    # @raise [LangDetectException] when no profiles have been loaded yet.
    def create_detector
      raise LangDetectException.new("need to load profiles") if @lang_list.length == 0
      Detector.new(self)
    end
  end
end
@@ -0,0 +1,35 @@
1
module Langusta
  # Reads Java .properties files whose values are runs of 4-digit
  # uppercase-hex codepoints, decoding them to big-endian UCS-2 strings.
  # This is a minimal implementation, don't expect this to actually work.
  class JavaPropertyReader
    # @param filename [String] path to the .properties file
    def initialize(filename)
      # File.read closes the handle; File.open(...).read leaked it.
      @lines = File.read(filename)
      parse()
    end

    # Fetches the decoded value of a property.
    def [](property)
      @properties[property]
    end

    # @return [Hash] the full property-name => decoded-value hash.
    def underlying_hash
      @properties
    end

    private
    # Splits the file into "name=value" lines and decodes each value.
    def parse
      @properties = {}
      # String#each was removed in Ruby 1.9; each_line works on 1.8 too.
      @lines.each_line do |line|
        prop_name, value = line.split(/\=/)
        # Skip blank lines and lines without an '=' separator, which
        # previously crashed on a nil value.
        next if prop_name.nil? || value.nil?
        @properties[prop_name] = parse_value(value)
      end
    end

    # Decodes a run of 4-digit hex codepoints into a big-endian
    # UCS-2 byte string.
    def parse_value(value)
      codepoints = value.scan(/([0-9A-F]{4})/)
      codepoints.map do |cp|
        int_cp = cp.first.to_i(16)
        [int_cp / 256, int_cp % 256].pack("c*")
      end.join
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require 'set'
2
+
3
+ module Langusta
4
+ class LangProfile
5
+ MINIMUM_FREQ = 2
6
+ LESS_FREQ_RATIO = 100_000
7
+ attr_reader :name, :freq, :n_words
8
+
9
+ # Constructs a language profile from a file. Converts all NGrams from UTF-8 to Unicode codepoints.
10
+ # @param [String] file name of the language profile.
11
+ # @return [LangProfile]
12
+ def self.load_from_file(filename)
13
+ json = Yajl::Parser.parse(File.new(filename))
14
+ profile = self.new
15
+
16
+ name = json['name']
17
+ n_words = json['n_words']
18
+ freq = json['freq'].inject({}) do |acc, kv|
19
+ key, value = kv
20
+ acc[UCS2String.from_utf8(key)] = value
21
+ acc
22
+ end
23
+ profile.populate_json(name, freq, n_words)
24
+ profile
25
+ end
26
+
27
+ def initialize(name=nil)
28
+ @name = name
29
+ @freq = {}
30
+ @n_words = Array.new(NGram::N_GRAM, 0)
31
+ end
32
+
33
+ def populate_json(name, freq, n_words)
34
+ @name, @freq, @n_words = name, freq, n_words
35
+ end
36
+
37
+ # Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.
38
+ # @param gram [UCS2String]
39
+ def add(gram)
40
+ raise TypeError.new("UCS2String or NilClass expected, got: #{gram.class}") unless gram.is_a?(UCS2String) or gram.is_a?(NilClass)
41
+ return if @name.nil? or gram.nil?
42
+ length = gram.size
43
+ return if length < 1 or length > NGram::N_GRAM
44
+ @n_words[length - 1] += 1
45
+ @freq[gram] ||= 0
46
+ @freq[gram] += 1
47
+ end
48
+
49
+ def omit_less_freq
50
+ return if @name.nil?
51
+ threshold = @n_words[0] / LESS_FREQ_RATIO
52
+ threshold = MINIMUM_FREQ if threshold < MINIMUM_FREQ
53
+ keys = Set.new(@freq.keys)
54
+ roman = 0
55
+ keys.each do |key|
56
+ count = @freq[key]
57
+ if count <= threshold
58
+ @n_words[key.size - 1] -= count
59
+ @freq.delete(key)
60
+ else
61
+ # temp workaround
62
+ if RegexHelper::ROMAN_REGEX.match(key.underlying)
63
+ roman += count
64
+ end
65
+ end
66
+ end
67
+
68
+ if roman < @n_words[0] / 3
69
+ keys2 = Set.new(@freq.keys)
70
+ keys2.each do |key|
71
+ # temp workaround
72
+ if RegexHelper::INCL_ROMAN_REGEX.match(key.underlying)
73
+ @n_words[key.size - 1] -= @freq[key]
74
+ @freq.delete(key)
75
+ end
76
+ end
77
+ end
78
+ end
79
+ end
80
+ end