langusta 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +32 -0
- data/LICENSE.txt +13 -0
- data/README.rdoc +34 -0
- data/Rakefile +55 -0
- data/VERSION +1 -0
- data/bin/langusta +5 -0
- data/data/messages.properties +128 -0
- data/data/uppercase.bin +0 -0
- data/langusta.gemspec +210 -0
- data/lib/langusta.rb +36 -0
- data/lib/langusta/command.rb +78 -0
- data/lib/langusta/detector.rb +197 -0
- data/lib/langusta/detector_factory.rb +46 -0
- data/lib/langusta/java_property_reader.rb +35 -0
- data/lib/langusta/lang_profile.rb +80 -0
- data/lib/langusta/language.rb +14 -0
- data/lib/langusta/language_detection_facade.rb +24 -0
- data/lib/langusta/n_gram.rb +116 -0
- data/lib/langusta/regex_helper.rb +15 -0
- data/lib/langusta/tag_extractor.rb +39 -0
- data/lib/langusta/ucs2_string.rb +70 -0
- data/lib/langusta/unicode_block.rb +56 -0
- data/profiles/af +1 -0
- data/profiles/ar +1 -0
- data/profiles/bg +1 -0
- data/profiles/bn +1 -0
- data/profiles/cs +1 -0
- data/profiles/da +1 -0
- data/profiles/de +1 -0
- data/profiles/el +1 -0
- data/profiles/en +1 -0
- data/profiles/es +1 -0
- data/profiles/fa +1 -0
- data/profiles/fi +1 -0
- data/profiles/fr +1 -0
- data/profiles/gu +1 -0
- data/profiles/he +1 -0
- data/profiles/hi +1 -0
- data/profiles/hr +1 -0
- data/profiles/hu +1 -0
- data/profiles/id +1 -0
- data/profiles/it +1 -0
- data/profiles/ja +1 -0
- data/profiles/kn +1 -0
- data/profiles/ko +1 -0
- data/profiles/mk +1 -0
- data/profiles/ml +1 -0
- data/profiles/mr +1 -0
- data/profiles/ne +1 -0
- data/profiles/nl +1 -0
- data/profiles/no +1 -0
- data/profiles/pa +1 -0
- data/profiles/pl +1 -0
- data/profiles/pt +1 -0
- data/profiles/ro +1 -0
- data/profiles/ru +1 -0
- data/profiles/sk +1 -0
- data/profiles/so +1 -0
- data/profiles/sq +1 -0
- data/profiles/sv +1 -0
- data/profiles/sw +1 -0
- data/profiles/ta +1 -0
- data/profiles/te +1 -0
- data/profiles/th +1 -0
- data/profiles/tl +1 -0
- data/profiles/tr +1 -0
- data/profiles/uk +1 -0
- data/profiles/ur +1 -0
- data/profiles/vi +1 -0
- data/profiles/zh-cn +1 -0
- data/profiles/zh-tw +1 -0
- data/test/helper.rb +20 -0
- data/test/quality/test_falsified.rb +33 -0
- data/test/test_command.rb +34 -0
- data/test/test_data/af +1 -0
- data/test/test_data/ar +1 -0
- data/test/test_data/bg +32 -0
- data/test/test_data/bn +9 -0
- data/test/test_data/cs +9 -0
- data/test/test_data/da +14 -0
- data/test/test_data/de +4 -0
- data/test/test_data/el +7 -0
- data/test/test_data/en +26 -0
- data/test/test_data/es +4 -0
- data/test/test_data/fa +21 -0
- data/test/test_data/fi +8 -0
- data/test/test_data/fr +13 -0
- data/test/test_data/gu +3 -0
- data/test/test_data/he +20 -0
- data/test/test_data/hi +1 -0
- data/test/test_data/hr +16 -0
- data/test/test_data/hu +6 -0
- data/test/test_data/id +2 -0
- data/test/test_data/it +3 -0
- data/test/test_data/ja +34 -0
- data/test/test_data/kn +14 -0
- data/test/test_data/ko +2 -0
- data/test/test_data/mk +3 -0
- data/test/test_data/ml +1 -0
- data/test/test_data/mr +3 -0
- data/test/test_data/ne +2 -0
- data/test/test_data/nl +1 -0
- data/test/test_data/no +3 -0
- data/test/test_data/pa +1 -0
- data/test/test_data/pl +23 -0
- data/test/test_data/pt +2 -0
- data/test/test_data/ro +2 -0
- data/test/test_data/ru +1 -0
- data/test/test_data/sk +2 -0
- data/test/test_data/so +4 -0
- data/test/test_data/sq +4 -0
- data/test/test_data/sv +3 -0
- data/test/test_data/sw +6 -0
- data/test/test_data/ta +1 -0
- data/test/test_data/te +2 -0
- data/test/test_data/th +3 -0
- data/test/test_data/tl +1 -0
- data/test/test_data/tr +2 -0
- data/test/test_data/uk +3 -0
- data/test/test_data/ur +1 -0
- data/test/test_data/vi +2 -0
- data/test/test_data/zh-tw +3 -0
- data/test/test_detector.rb +52 -0
- data/test/test_detector_factory.rb +16 -0
- data/test/test_java_property_reader.rb +8 -0
- data/test/test_lang_profile.rb +79 -0
- data/test/test_language.rb +15 -0
- data/test/test_language_detection_facade.rb +9 -0
- data/test/test_langusta.rb +25 -0
- data/test/test_n_gram.rb +103 -0
- data/test/test_tag_extractor.rb +71 -0
- data/test/test_ucs2_string.rb +9 -0
- data/test/test_unicode_block.rb +9 -0
- metadata +320 -0
$: << File.expand_path(File.dirname(__FILE__))

require 'rubygems'
require 'bundler'
Bundler.setup

require 'optparse'
require 'iconv'

# Required gems
require 'oniguruma'
require 'yajl'

module Langusta
  VERSION = '0.1.0'

  # Lazily load each component the first time its constant is referenced.
  # BUGFIX: Detector was registered twice; the duplicate entry was removed.
  autoload :RegexHelper, 'langusta/regex_helper'
  autoload :UCS2String, 'langusta/ucs2_string'
  autoload :Language, 'langusta/language'
  autoload :LangProfile, 'langusta/lang_profile'
  autoload :Detector, 'langusta/detector'
  autoload :JavaPropertyReader, 'langusta/java_property_reader'
  autoload :UnicodeBlock, 'langusta/unicode_block'
  autoload :NGram, 'langusta/n_gram'
  autoload :DetectorFactory, 'langusta/detector_factory'
  autoload :TagExtractor, 'langusta/tag_extractor'
  autoload :Command, 'langusta/command'
  autoload :LanguageDetectionFacade, 'langusta/language_detection_facade'

  # Paths to the bundled data files, resolved relative to the gem root.
  ABSOLUTE_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  PROFILES_PATH = File.join(ABSOLUTE_PATH, 'profiles')
  UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
  MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
end
module Langusta
  # Command-line entry point: parses switches and runs language detection
  # over the files named on the command line.
  class Command
    # Parses +argv+ and dispatches to the requested operation.
    # Prints usage to stderr when no operation is given. Always returns 0.
    def self.run(argv)
      options = {}
      # NOTE: parse! strips recognized switches out of argv in place, leaving
      # only the file names behind. The parser object itself is not reused,
      # so it is not kept in a local (the old `opts =` shadowed the block
      # parameter and was never read).
      OptionParser.new do |opts|
        opts.on("--detectlang", "Detect the language from the given text") do |d|
          options[:operation] = :detectlang if d
        end

        opts.on("--batchtest", "Batch test of language detection") do |b|
          options[:operation] = :batchtest if b
        end

        opts.on("-d [profile directory]") do |pd|
          options[:profile_directory] = pd
        end

        opts.on("-a [alpha]", Float) do |alpha|
          options[:alpha] = alpha
        end
      end.parse!(argv)

      # Positional call shape expected by detect_lang/batch_test:
      # (profile_directory, test_files, alpha=nil).
      arguments = [options[:profile_directory]] + [argv]
      arguments << options[:alpha] if options[:alpha]

      case options[:operation]
      when :detectlang
        self.new.send(:detect_lang, *arguments)
      when :batchtest
        self.new.send(:batch_test, *arguments)
      else
        $stderr.puts <<EOF
Usage:

langusta --detectlang -d [profile directory] -a [alpha] [test file(s)]
langusta --batchtest -d [profile directory] -a [alpha] [test file(s)]
EOF
      end
      0
    end

    def initialize
      @detector_factory = DetectorFactory.new
    end

    # Detects and prints the language of each file in +test_files+,
    # one "filename: language" line per file.
    def detect_lang(profile_directory, test_files, alpha=nil)
      initialize_factory(profile_directory)
      test_files.each do |filename|
        language = detect_single_lang(filename, alpha)
        puts "%s: %s" % [filename, language]
      end
    end

    # Not implemented yet (accepted so --batchtest does not crash).
    def batch_test(profile_directory, test_files, alpha=nil)
    end

    # Runs detection on a single file and returns the language code.
    def detect_single_lang(filename, alpha)
      # BUGFIX: File.read instead of File.open(...).read — the latter leaked
      # the file handle until GC.
      ucs2_content = UCS2String.from_utf8(File.read(filename))
      detector = @detector_factory.create(alpha)
      detector.append(ucs2_content)
      detector.detect()
    end

    # Loads every profile found in +profile_directory+ into the factory.
    def initialize_factory(profile_directory)
      profiles = load_profiles(profile_directory)
      profiles.each_with_index do |profile, index|
        @detector_factory.add_profile(profile, index, profiles.length)
      end
    end

    # Reads every file in +directory+ as a LangProfile.
    def load_profiles(directory)
      @profiles = Dir[File.join(directory, '/*')].map do |filename|
        LangProfile.load_from_file(filename)
      end
    end
  end
end
|
module Langusta
  # Language detector: estimates the most likely language of appended text
  # using naive Bayes over character n-gram frequencies with randomized
  # sampling (a port of the langdetect Java library).
  class Detector
    attr_accessor :verbose, :alpha, :max_text_length

    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05
    ITERATION_LIMIT = 1000
    PROB_THRESHOLD = 0.1
    CONV_THRESHOLD = 0.99999
    BASE_FREQ = 10000
    UNKNOWN_LANG = "unknown"

    def initialize(factory)
      @word_lang_prob_map = factory.word_lang_prob_map
      @lang_list = factory.lang_list
      @text = UCS2String.new('')
      @langprob = nil
      @alpha = ALPHA_DEFAULT
      @n_trial = 7
      @max_text_length = 10000
      @prior_map = nil
      @verbose = false
    end

    # Append more text to be recognized.
    # @param text [UCS2String] text to be recognized
    def append(text)
      raise TypeError.new("Expected: UCS2String, got: #{text.class}") unless text.is_a?(UCS2String)
      text.gsub!(RegexHelper::URL_REGEX, "\x00\x20")
      text.gsub!(RegexHelper::MAIL_REGEX, "\x00\x20")
      text = text.map do |c|
        NGram.normalize(c)
      end
      # BUGFIX: gsub!-style methods conventionally return nil when no
      # substitution occurs, so `@text = text.gsub!(...)` could leave @text
      # nil; mutate first, then assign the string itself.
      text.gsub!(RegexHelper::SPACE_REGEX, "\x00\x20")
      @text = text
    end

    # Detect the language.
    # @return [String] (usually) two-letter code describing the language.
    def detect
      probabilities = get_probabilities()
      (probabilities.length > 0) ? probabilities.first.lang : UNKNOWN_LANG
    end

    # Normalizes +prob+ in place so its entries sum to 1.0.
    # @return [Float] the largest normalized probability.
    def self.normalize_prob(prob)
      maxp = 0.0; sump = 0.0
      prob.each do |p|
        sump += p
      end
      prob.map! do |p|
        q = p / sump
        maxp = q if q > maxp
        q
      end
      maxp
    end

    # Draws a standard-normal random number via the Box-Muller
    # (Marsaglia polar) transform.
    def self.next_gaussian
      s = 0
      while s >= 1 || s == 0
        v1 = 2 * Kernel.rand - 1
        v2 = 2 * Kernel.rand - 1
        s = v1 * v1 + v2 * v2
      end
      multiplier = Math.sqrt(-2 * Math.log(s)/s)
      return v1 * multiplier
    end

    private

    # Runs @n_trial rounds of random n-gram sampling, accumulating the
    # averaged per-language probabilities into @langprob.
    def detect_block
      cleaning_text()
      ngrams = extract_ngrams()
      raise "no features in text" if ngrams.empty?
      @langprob = Array.new(@lang_list.length, 0.0)

      @n_trial.times do
        prob = init_probability()
        alpha = @alpha + Detector.next_gaussian() * ALPHA_WIDTH

        i = 0
        Kernel.loop do
          r = Kernel.rand(ngrams.length)
          update_lang_prob(prob, ngrams[r], alpha)
          # BUGFIX: `if i % 5` was always truthy (0 is truthy in Ruby) and
          # `i` was never incremented, so ITERATION_LIMIT could never fire
          # and non-converging input looped forever. Check convergence every
          # fifth iteration and bound the loop.
          if i % 5 == 0
            break if Detector.normalize_prob(prob) > CONV_THRESHOLD || i >= ITERATION_LIMIT
            # verbose
          end
          i += 1
        end
        @langprob.length.times do |j|
          @langprob[j] += prob[j] / @n_trial
        end
        # verbose
      end
    end

    # Installs per-language priors from a Hash of language name =>
    # non-negative weight; weights are normalized to sum to 1.
    def set_prior_map(prior_map)
      # BUGFIX: the original read and wrote @prior_map (the Array being
      # built) where the +prior_map+ Hash argument was intended, and
      # `Array.new[n]` indexes a fresh empty array (=> nil) instead of
      # constructing one of the right size.
      @prior_map = Array.new(@lang_list.length, 0.0)
      sump = 0.0
      @prior_map.length.times do |i|
        lang = @lang_list[i]
        if prior_map.has_key?(lang)
          p = prior_map[lang]
          raise "probability must be non-negative" if p < 0
          @prior_map[i] = p
          sump += p
        end
      end
      raise "more one of prob must be non-zero" if sump <= 0
      @prior_map.map! do |p|
        p / sump
      end
    end

    # Strips Latin characters when the text is predominantly non-Latin, so
    # stray ASCII (quoted words, URLs remnants) does not skew detection.
    def cleaning_text
      non_latin_count = latin_count = 0
      @text.each_char do |c|
        if c < "\x00z" && c >= "\x00A"
          latin_count += 1
        elsif c >= "\x03\x00" && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
          non_latin_count += 1
        end
      end
      if latin_count * 2 < non_latin_count
        text_without_latin = UCS2String.new('')
        @text.each_char do |c|
          text_without_latin << c if c > "\x00z" || c < "\x00A"
        end
        @text = text_without_latin
      end
    end

    # Collects every 1..N_GRAM-length n-gram of @text that appears in the
    # training data.
    def extract_ngrams
      list = []
      ngram = NGram.new
      @text.each_char do |char|
        ngram.add_char(char)
        (1..NGram::N_GRAM).each do |n|
          w = ngram.get(n)
          list << w if w && @word_lang_prob_map.has_key?(w)
        end
      end
      list
    end

    # Lazily runs detection, then returns the sorted, filtered list.
    def get_probabilities
      if @langprob.nil?
        detect_block()
      end
      sort_probability(@langprob)
    end

    # Initial per-language probabilities: the prior map when set,
    # uniform otherwise.
    def init_probability
      prob = Array.new(@lang_list.length)
      if @prior_map
        prob = @prior_map.clone
      else
        prob.length.times do |i|
          prob[i] = 1.0 / @lang_list.length
        end
      end
      prob
    end

    # Wraps probabilities in Language objects, most probable first, keeping
    # only those above PROB_THRESHOLD.
    def sort_probability(prob)
      list = []
      prob.each_with_index do |p, index|
        list[index] = Language.new(@lang_list[index], p)
      end
      # BUGFIX: sort in *descending* order — detect() returns the first
      # entry, which must be the most probable language, not the least.
      list.sort_by do |x|
        -x.prob
      end.select do |x|
        x.prob > PROB_THRESHOLD
      end
    end

    # Multiplies the running probabilities by each language's smoothed
    # frequency for +word+. Returns false when the word is unknown.
    def update_lang_prob(prob, word, alpha)
      return false if word.nil? || ! @word_lang_prob_map.has_key?(word)

      lang_prob_map = @word_lang_prob_map[word]
      # verbose
      weight = alpha / BASE_FREQ
      prob.length.times do |i|
        prob[i] *= weight + lang_prob_map[i]
      end
      true
    end

    # Debug helper: renders each language whose probability exceeds 1e-5.
    def word_prob_to_string(prob)
      prob.zip(@lang_list).select do |p, lang|
        p > 0.00001
      end.map do |p, lang|
        # BUGFIX: the format arguments were swapped ([p, lang]), which raises
        # ArgumentError because %.5f cannot format a String.
        "%s:%.5f" % [lang, p]
      end.join(' ')
    end
  end
end
module Langusta
  # Raised for profile-configuration errors (duplicate or missing profiles).
  class LangDetectException < StandardError; end

  # Builds Detector instances from a set of loaded language profiles.
  class DetectorFactory
    attr_reader :word_lang_prob_map, :lang_list

    def initialize
      @word_lang_prob_map = {}
      @lang_list = []
    end

    # Adds a new language profile to this factory.
    # @param [LangProfile] profile language profile to be added.
    # @param [Fixnum] index slot at which this profile's probabilities live.
    # @param [Fixnum] langsize total number of profiles to be added.
    # @raise [LangDetectException] when the language was already added.
    def add_profile(profile, index, langsize)
      if @lang_list.include?(profile.name)
        raise LangDetectException, "duplicate the same language profile"
      end
      @lang_list << profile.name
      profile.freq.each_key do |word|
        # One probability slot per language; lazily sized to langsize.
        slot = (@word_lang_prob_map[word] ||= Array.new(langsize, 0.0))
        # Relative frequency of this n-gram among same-length n-grams.
        slot[index] = 1.0 * profile.freq[word] / profile.n_words[word.length - 1]
      end
    end

    # Creates a new detector object, based on a preconfigured set of language profiles.
    # @return [Detector]
    def create(alpha=nil)
      detector = create_detector()
      detector.alpha = alpha if alpha
      detector
    end

    private
    # @raise [LangDetectException] when no profiles have been loaded yet.
    def create_detector
      raise LangDetectException, "need to load profiles" if @lang_list.empty?
      Detector.new(self)
    end
  end
end
module Langusta
  class JavaPropertyReader
    # This is a minimal implementation, don't expect this to actually work.
    # Reads a Java .properties file whose values are runs of 4-hex-digit
    # UTF-16 code units (e.g. "00410042") and exposes them as big-endian
    # UCS-2 byte strings.

    def initialize(filename)
      # BUGFIX: File.read avoids leaking the handle (File.open(...).read
      # kept it open until GC), and String content is iterated with
      # each_line below — String#each was removed in Ruby 1.9.
      @lines = File.read(filename)
      parse()
    end

    # Fetches the decoded value for +property+ (nil when absent).
    def [](property)
      @properties[property]
    end

    # @return [Hash] the raw property name => decoded value mapping.
    def underlying_hash
      @properties
    end

    private
    def parse
      @properties = {}
      @lines.each_line do |line|
        prop_name, value = line.split(/\=/)
        # BUGFIX: lines without '=' (blank lines, bare keys) produce a nil
        # value; skip them instead of crashing in parse_value.
        next unless value
        @properties[prop_name] = parse_value(value)
      end
    end

    # Decodes a run of 4-hex-digit UTF-16 code units into a big-endian
    # UCS-2 byte string ("0041" => "\x00A").
    def parse_value(value)
      codepoints = value.scan(/([0-9A-F]{4})/)
      codepoints.map do |cp|
        int_cp = cp.first.to_i(16)
        [int_cp / 256, int_cp % 256].pack("c*")
      end.join
    end
  end
end
require 'set'

module Langusta
  # Per-language n-gram frequency profile — the training data consumed by
  # DetectorFactory/Detector.
  class LangProfile
    MINIMUM_FREQ = 2
    LESS_FREQ_RATIO = 100_000
    attr_reader :name, :freq, :n_words

    # Constructs a language profile from a file. Converts all NGrams from UTF-8 to Unicode codepoints.
    # @param [String] file name of the language profile.
    # @return [LangProfile]
    def self.load_from_file(filename)
      # BUGFIX: the block form of File.open closes the handle when parsing
      # finishes; File.new leaked it until GC.
      json = File.open(filename) do |io|
        Yajl::Parser.parse(io)
      end
      profile = self.new

      name = json['name']
      n_words = json['n_words']
      freq = json['freq'].inject({}) do |acc, kv|
        key, value = kv
        acc[UCS2String.from_utf8(key)] = value
        acc
      end
      profile.populate_json(name, freq, n_words)
      profile
    end

    def initialize(name=nil)
      @name = name
      @freq = {}
      @n_words = Array.new(NGram::N_GRAM, 0)
    end

    # Bulk-installs pre-parsed profile data (used by load_from_file).
    def populate_json(name, freq, n_words)
      @name, @freq, @n_words = name, freq, n_words
    end

    # Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.
    # @param gram [UCS2String]
    def add(gram)
      raise TypeError.new("UCS2String or NilClass expected, got: #{gram.class}") unless gram.is_a?(UCS2String) or gram.is_a?(NilClass)
      return if @name.nil? or gram.nil?
      length = gram.size
      return if length < 1 or length > NGram::N_GRAM
      @n_words[length - 1] += 1
      @freq[gram] ||= 0
      @freq[gram] += 1
    end

    # Drops n-grams whose count falls below a frequency threshold, then
    # drops all Latin n-grams when Latin text makes up less than a third of
    # the profile (it is likely noise, e.g. quoted foreign words).
    def omit_less_freq
      return if @name.nil?
      threshold = @n_words[0] / LESS_FREQ_RATIO
      threshold = MINIMUM_FREQ if threshold < MINIMUM_FREQ
      keys = Set.new(@freq.keys)
      roman = 0
      keys.each do |key|
        count = @freq[key]
        if count <= threshold
          @n_words[key.size - 1] -= count
          @freq.delete(key)
        else
          # temp workaround
          if RegexHelper::ROMAN_REGEX.match(key.underlying)
            roman += count
          end
        end
      end

      if roman < @n_words[0] / 3
        keys2 = Set.new(@freq.keys)
        keys2.each do |key|
          # temp workaround
          if RegexHelper::INCL_ROMAN_REGEX.match(key.underlying)
            @n_words[key.size - 1] -= @freq[key]
            @freq.delete(key)
          end
        end
      end
    end
  end
end