RubyGems - myaso - Versions diffs - 0.4.0 - Mend

myaso 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

checksums.yaml +7 -0
data/.gitignore +25 -0
data/.travis.yml +10 -0
data/Gemfile +14 -0
data/LICENSE.txt +22 -0
data/README.md +213 -0
data/Rakefile +21 -0
data/bin/myaso +73 -0
data/lib/myaso.rb +35 -0
data/lib/myaso/lexicon.rb +70 -0
data/lib/myaso/mystem.rb +187 -0
data/lib/myaso/mystem/library.rb +59 -0
data/lib/myaso/ngrams.rb +67 -0
data/lib/myaso/pi_table.rb +36 -0
data/lib/myaso/tagger.rb +94 -0
data/lib/myaso/tagger/model.rb +68 -0
data/lib/myaso/tagger/tnt.rb +183 -0
data/lib/myaso/version.rb +9 -0
data/myaso.gemspec +26 -0
data/myaso.jpg +0 -0
data/spec/bin_spec.rb +48 -0
data/spec/data/test.123 +77 -0
data/spec/data/test.lex +10 -0
data/spec/fixtures/interpolations.yml +4 -0
data/spec/fixtures/lexicon.yml +32 -0
data/spec/fixtures/ngrams.yml +106 -0
data/spec/lexicon_spec.rb +84 -0
data/spec/mystem_spec.rb +81 -0
data/spec/ngrams_spec.rb +97 -0
data/spec/pi_table_spec.rb +53 -0
data/spec/spec_helper.rb +12 -0
data/spec/support/fixtures.rb +34 -0
data/spec/support/invoker.rb +29 -0
data/spec/tagger_spec.rb +27 -0
data/spec/tagger_tnt_spec.rb +73 -0
metadata +137 -0

data/lib/myaso/mystem.rb ADDED

@@ -0,0 +1,187 @@
+# Mystem is a popular morphological analyzer for Russian that is written
+# in Yandex by Ilya Segalovich and Vitaly Titov. The analyzer can
+# efficiently deal with non-dictionary word and produce hypotheses
+# for such words. It is available on <https://tech.yandex.ru/mystem/>.
+module Myaso::Mystem extend self
+  # Lemma is a canonical form of the word.
+  class Lemma < Struct.new(:lemma, :form, :quality, :msd, :stem_grammemes, :flex_grammemes, :flex_length, :rule_id)
+    ##
+    # :attr_accessor: lemma
+    # A lemma of the word.
+    ##
+    # :attr_accessor: form
+    # A normalized word form.
+    ##
+    # :attr_accessor: quality
+    # Quality as according to +Myaso::Mystem::Library::QUALITY+.
+    ##
+    # :attr_accessor: msd
+    # A morphosyntactic descriptor.
+    ##
+    # :attr_accessor: rule_id
+    # An inflection rule identifier.
+    # A shortcut to +Myaso::Mystem.forms+.
+    def forms
+      Myaso::Mystem.forms(lemma, rule_id)
+    end
+    # A shortcut to +Myaso::Mystem.inflect+.
+    #
+    # :call-seq:
+    #   inflect(Hash)
+    def inflect(grammemes)
+      Myaso::Mystem.inflect(forms, grammemes)
+    end
+    def inspect #:nodoc:
+      '#<%s lemma=%s msd="%s">' % [self.class.name, to_s.inspect, msd]
+    end
+    def to_s #:nodoc:
+      lemma
+    end
+  end
+  # A word form generated by mystem.
+  class Form < Struct.new(:form, :msd, :stem_grammemes, :flex_grammemes)
+    ##
+    # :attr_accessor: form
+    # A normalized word form.
+    ##
+    # :attr_accessor: msd
+    # A morphosyntactic descriptor.
+    def inspect #:nodoc:
+      '#<%s form=%s msd="%s">' % [self.class.name, to_s.inspect, msd]
+    end
+    def to_s #:nodoc:
+      form
+    end
+  end
+  # Analyzes a +word+ and returns an array of lemmas, each of which
+  # represent a particular ambiguous morphological interpretation.
+  #
+  # :call-seq:
+  #   analyze(String)
+  def analyze(word)
+    Array.new.tap do |lemmas|
+      invoke_analyze(as_symbols(word), word.length) do |lemma|
+        lemma_text = MystemLemmaText(lemma)
+        lemma_text_len = MystemLemmaTextLen(lemma)
+        form_text = MystemLemmaForm(lemma)
+        form_text_len = MystemLemmaFormLen(lemma)
+        stem_grammemes = MystemLemmaStemGram(lemma).bytes
+        flex_grammemes_raw = MystemLemmaFlexGram(lemma)
+        flex_grammemes_len = MystemLemmaFlexGramNum(lemma)
+        flex_grammemes = as_strings(flex_grammemes_raw, flex_grammemes_len)
+        grammemes = stem_grammemes | flex_grammemes
+        lemmas << Lemma.new(
+          as_string(lemma_text, lemma_text_len),        # lemma
+          as_string(form_text, form_text_len),          # form
+          QUALITY[MystemLemmaQuality(lemma)],           # quality
+          Myasorubka::Mystem::Binary.to_msd(grammemes), # msd
+          stem_grammemes,                               # stem_grammemes
+          flex_grammemes,                               # flex_grammemes
+          MystemLemmaFlexLen(lemma),                    # flex_length
+          MystemLemmaRuleId(lemma)                      # rule_id
+        )
+      end
+    end
+  end
+  # Analyzes a +word+ and returns an array of its forms as according
+  # to the given +rule_id+.
+  #
+  # :call-seq:
+  #   forms(String, Fixnum)
+  def forms(word, rule_id)
+    Array.new.tap do |forms|
+      invoke_analyze(as_symbols(word), word.length) do |lemma|
+        next unless rule_id == MystemLemmaRuleId(lemma)
+        invoke_generate(lemma) do |form|
+          form_text = MystemFormText(form)
+          form_text_len = MystemFormTextLen(form)
+          stem_grammemes = MystemFormStemGram(form).bytes
+          flex_grammemes_raw = MystemFormFlexGram(form)
+          flex_grammemes_len = MystemFormFlexGramNum(form)
+          flex_grammemes = as_strings(flex_grammemes_raw, flex_grammemes_len)
+          grammemes = stem_grammemes | flex_grammemes
+          forms << Form.new(
+            as_string(form_text, form_text_len),          # form
+            Myasorubka::Mystem::Binary.to_msd(grammemes), # msd
+            stem_grammemes,                               # stem_grammemes
+            flex_grammemes,                               # flex_grammemes
+          )
+        end
+      end
+    end
+  end
+  # Finds exact matches of +grammemes+ for the provided +forms+ of a word.
+  # It is necessary to be careful because computational linguistics is a
+  # hard field.
+  #
+  # :call-seq:
+  #   inflect([Form], Hash)
+  def inflect(forms, grammemes)
+    forms.select do |form|
+      grammemes.inject(true) { |r, (k, v)| r && form.msd.grammemes[k] == v }
+    end
+  end
+  protected
+  def invoke_analyze(symbols, length, &block) #:nodoc:
+    analyzes = MystemAnalyze(symbols, length)
+    MystemAnalysesCount(analyzes).times do |i|
+      block.call(MystemLemma(analyzes, i))
+    end
+  ensure
+    MystemDeleteAnalyses(analyzes)
+  end
+  def invoke_generate(lemma, &block) #:nodoc:
+    forms = MystemGenerate(lemma)
+    MystemFormsCount(forms).times do |i|
+      block.call(MystemForm(forms, i))
+    end
+  ensure
+    MystemDeleteForms(forms)
+  end
+  def as_symbols(string) #:nodoc:
+    FFI::MemoryPointer.
+      new(:ushort, string.length).
+      write_array_of_short(string.chars.map!(&:ord))
+  end
+  def as_string(symbols, length) #:nodoc:
+    symbols.read_array_of_ushort(length).
+      map! { |c| c.chr(Encoding::UTF_8) }.
+      join
+  end
+  def as_strings(grammemes, grammemes_length) #:nodoc:
+    Array.new.tap do |bytes|
+      grammemes.get_array_of_string(0, grammemes_length).each do |ids|
+        bytes << ids.bytes
+      end
+      bytes.flatten!
+      bytes.uniq!
+    end
+  end
+end

data/lib/myaso/mystem/library.rb ADDED

@@ -0,0 +1,59 @@
+# Myaso uses foreign function interface to interact with the mystem
+# shared library.
+#
+# The library is located through the MYSTEM_LIBRARY environment variable;
+# when it is not set, libmystem_c_binding.{dylib,so} is searched for in
+# /opt and /usr (lib, lib64, local/lib, local/lib64) and in the current
+# directory.
+module Myaso::Mystem::Library
+  extend FFI::Library
+  begin
+    ffi_lib ENV.fetch('MYSTEM_LIBRARY', Dir["{/{opt,usr}/{,local/}lib{,64},.}/libmystem_c_binding.{dylib,so}"])
+  rescue LoadError
+    # Replace the loader's error with an actionable hint. Only the ffi_lib
+    # call is covered by this rescue; the attach_function calls are not.
+    fail 'The mystem library could not be loaded. '      \
+         'Please install it and set the MYSTEM_LIBRARY ' \
+         'environment variable to its path.'
+  end
+  # Analysis of a word: the result handle, its size and its disposal.
+  attach_function :MystemAnalyze,          [:pointer, :int], :pointer
+  attach_function :MystemAnalysesCount,    [:pointer],       :int
+  attach_function :MystemDeleteAnalyses,   [:pointer],       :void
+  # Accessors of a single lemma taken from an analysis result.
+  attach_function :MystemLemma,            [:pointer, :int], :pointer
+  attach_function :MystemLemmaText,        [:pointer],       :pointer
+  attach_function :MystemLemmaTextLen,     [:pointer],       :int
+  attach_function :MystemLemmaForm,        [:pointer],       :pointer
+  attach_function :MystemLemmaFormLen,     [:pointer],       :int
+  attach_function :MystemLemmaQuality,     [:pointer],       :int
+  attach_function :MystemLemmaStemGram,    [:pointer],       :string
+  attach_function :MystemLemmaFlexGram,    [:pointer],       :pointer
+  attach_function :MystemLemmaFlexGramNum, [:pointer],       :int
+  attach_function :MystemLemmaFlexLen,     [:pointer],       :int
+  attach_function :MystemLemmaRuleId,      [:pointer],       :int
+  # Generation of the forms of a lemma: the result handle, its disposal,
+  # its size and the accessors of a single form.
+  attach_function :MystemGenerate,         [:pointer],       :pointer
+  attach_function :MystemDeleteForms,      [:pointer],       :void
+  attach_function :MystemFormsCount,       [:pointer],       :int
+  attach_function :MystemForm,             [:pointer, :int], :pointer
+  attach_function :MystemFormText,         [:pointer],       :pointer
+  attach_function :MystemFormTextLen,      [:pointer],       :int
+  attach_function :MystemFormStemGram,     [:pointer],       :string
+  attach_function :MystemFormFlexGram,     [:pointer],       :pointer
+  attach_function :MystemFormFlexGramNum,  [:pointer],       :int
+  # A meaningful mapping between mystem's internal word quality
+  # descriptors and the Ruby symbols.
+  #
+  # NOTE(review): apart from zero, the keys look like single-bit flags; a
+  # value combining several of them is not a key of this hash and would
+  # be looked up as nil -- confirm whether mystem can return such values.
+  QUALITY = {
+    0x00000000 => :dictionary,
+    0x00000001 => :bastard,
+    0x00000002 => :sob,
+    0x00000004 => :prefixoid,
+    0x00000008 => :foundling,
+    0x00000010 => :bad_request,
+    0x00010000 => :from_english,
+    0x00020000 => :to_english,
+    0x00040000 => :untranslit,
+    0x00100000 => :overrode,
+    0x01000000 => :fix
+  }.freeze
+end
+# Mix the bindings into Myaso::Mystem both as singleton methods (extend)
+# and as instance methods and constants (include), so that its methods can
+# call the Mystem* functions and refer to QUALITY without qualification.
+Myaso::Mystem.send(:extend,  Myaso::Mystem::Library)
+Myaso::Mystem.send(:include, Myaso::Mystem::Library)

data/lib/myaso/ngrams.rb ADDED

@@ -0,0 +1,67 @@
+# encoding: utf-8
+# A simple yet handy implementation of a n-gram storage.
+#
+class Myaso::Ngrams
+  extend Forwardable
+  include Enumerable
+  attr_reader :table
+  protected :table
+  def_delegator :@table, :each, :each
+  # An instance of a n-gram storage is initialized by zero counts.
+  #
+  def initialize
+    @table = Hash.new do |h, k|
+      h[k] = Hash.new { |h_local, k_local| h_local[k_local] = Hash.new(0) }
+    end
+  end
+  # Obtain the count of the specified unigram, bigram, or trigram.
+  #
+  def [] unigram, bigram = nil, trigram = nil
+    return 0 unless table.include? unigram
+    return 0 unless table[unigram].include? bigram
+    table[unigram][bigram][trigram]
+  end
+  # Assign the count to the specified unigram, bigram, or trigram.
+  #
+  def []= unigram, bigram = nil, trigram = nil, count
+    @unigrams_count = nil
+    table[unigram][bigram][trigram] = count
+  end
+  # Two storages are equal iff they tables are equal.
+  #
+  def == other
+    self.table == other.table
+  end
+  # Trigrams enumerator. Yes, this method should return an Enumerator
+  # instance, but it is too slow.
+  #
+  def each_trigram
+    table.each do |unigram, bigrams|
+      bigrams.each do |bigram, trigrams|
+        next unless bigram
+        trigrams.each do |trigram, count|
+          next unless trigram
+          yield [[unigram, bigram, trigram], count]
+        end
+      end
+    end
+  end
+  # Unigrams count.
+  #
+  def unigrams_count
+    @unigrams_count ||= table.keys.inject(0) do |count, unigram|
+      count + table[unigram][nil][nil]
+    end
+  end
+end

data/lib/myaso/pi_table.rb ADDED

@@ -0,0 +1,36 @@
+# encoding: utf-8
+# A simple implementation of a dynamic programming table in the following
+# form: $\pi(i, u, v)$. where $i$ is an index and $u, v$ are elements of
+# a finite set of tags.
+#
+class Myaso::PiTable
+  extend Forwardable
+  include Enumerable
+  attr_reader :default, :table
+  def_delegator :@table, :each, :each
+  # An instance of a dynamic programming table can consider the specified
+  # default value.
+  #
+  def initialize(default = nil)
+    @default = default
+    @table = Hash.new do |h, k|
+      h[k] = Hash.new { |h_local, k_local| h_local[k_local] = Hash.new(default) }
+    end
+  end
+  # Obtain the value of $\pi(i, u, v)$ or return the default value if it
+  # is nil.
+  #
+  def [] i, u, v
+    table[i][u][v]
+  end
+  # Set a value of $\pi(i, u, v)$.
+  #
+  def []= i, u, v, value
+    table[i][u][v] = value
+  end
+end

data/lib/myaso/tagger.rb ADDED

@@ -0,0 +1,94 @@
+# encoding: utf-8
+# This class is an implementation of the Viterbi algorithm.
+#
+# Scores are sums of base-2 logarithms of the model's q() and e() values
+# (see #probability), so larger is better and an impossible path scores
+# minus infinity.
+#
+class Myaso::Tagger
+  attr_reader :model
+  # An instance of Tagger should be initialized with an instance of
+  # trained HMM.
+  #
+  def initialize(model)
+    @model = model
+  end
+  # Viterbi algorithm itself. Return tags that input sentence
+  # should be annotated.
+  #
+  # An empty sentence yields an empty array. Otherwise every word is
+  # passed through +model.classify+ and two start symbols are prepended
+  # before the forward and backward passes are run.
+  #
+  def annotate(sentence)
+    return [] if sentence.size == 0
+    sentence = sentence.map { |w| model.classify(w) }
+    sentence.unshift(model.start_symbol, model.start_symbol)
+    backward(sentence, *forward(sentence))
+  end
+  protected
+  # Emit probabilities into the dynamic programming tables.
+  #
+  # Returns the pair <tt>[pi, bp]</tt>: +pi+ holds the best score of a tag
+  # pair (u, v) ending at position k, +bp+ holds the tag w preceding that
+  # pair on the best path.
+  #
+  def forward(sentence)
+    pi, bp = Myaso::PiTable.new, Myaso::PiTable.new
+    # Base case: the two start symbols at position 1 score log2(1).
+    pi[1, model.start_symbol, model.start_symbol] = 0.0
+    # Slide a window of three (word, index) pairs over the sentence; k is
+    # the index of the last word of the window.
+    sentence.each_with_index.each_cons(3) do |(w1, i1), (w2, i2), (word, k)|
+      # The first two positions are the start symbols, whose only
+      # candidate tag is the start symbol itself.
+      w_tags = (i1 < 2) ? [model.start_symbol] : model.lexicon.tags(w1)
+      u_tags = (i2 < 2) ? [model.start_symbol] : model.lexicon.tags(w2)
+      v_tags = model.lexicon.tags(word)
+      u_tags.product(v_tags).each do |u, v|
+        # forward_iteration returns nil when no predecessor is usable; the
+        # multiple assignment then stores nil into both tables.
+        pi[k, u, v], bp[k, u, v] = forward_iteration(pi, k, u, v, w_tags, word)
+      end
+    end
+    [pi, bp]
+  end
+  # Essential of forward part of Viterbi algorithm.
+  #
+  # Among the +tags+ w whose score at position k - 1 is set and finite,
+  # returns <tt>[score, w]</tt> with the highest score of the path
+  # extended by the tag v emitting +word+; returns nil when there is no
+  # such tag.
+  #
+  def forward_iteration(pi, k, u, v, tags, word)
+    tags.select { |w| (value = pi[k - 1, w, u]) && value.finite? }.
+      map! { |w| [pi[k - 1, w, u] + probability(w, u, v, word), w] }.
+      max_by(&:first)
+  end
+  # Use backpoints to retrieve the computed tags from the previous stage.
+  #
+  # +sentence+ still carries the two leading start symbols; the returned
+  # array does not.
+  #
+  def backward(sentence, pi, bp)
+    # Index of the last word.
+    size = sentence.size - 1
+    # A single real word: choose its tag directly, the predecessor being
+    # the start symbol. The result is a one-element array.
+    if (size - 2).zero?
+      return model.lexicon.tags(sentence[-1]).map { |v| [v] }.
+        max_by { |v| pi[size, model.start_symbol, *v] +
+                     probability(model.start_symbol, *v, model.stop_symbol) }
+    end
+    tags = prepare_backward(sentence, pi)
+    # The last two tags are known; walk the back pointers down to the
+    # first real word, which lives at index 2.
+    size.downto(4) do |k|
+      tags[k - 2] = bp[k, tags[k - 1], tags[k]]
+    end
+    # slice! returns the removed part, i.e. the tags without the two
+    # slots reserved for the start symbols.
+    tags.slice! 2..-1
+  end
+  # Preparations to tags computing.
+  #
+  # Returns an array as long as +sentence+ whose last two elements are the
+  # tag pair with the best score once the transition to the stop symbol is
+  # taken into account; the other elements are left nil.
+  #
+  def prepare_backward(sentence, pi)
+    size = sentence.size - 1
+    tags = Array.new(sentence.size)
+    u_tags, v_tags = model.lexicon.tags(sentence[-2]), model.lexicon.tags(sentence[-1])
+    # Only pairs with a set and finite score compete; when none is left,
+    # max_by returns nil and both slots stay nil.
+    tags[size - 1], tags[size] = u_tags.product(v_tags).
+      select { |u, v| (value = pi[size, u, v]) && value.finite? }.
+      max_by { |u, v| pi[size, u, v] + probability(u, v, model.stop_symbol) }
+    tags
+  end
+  # Compute the probability of q(v|w, u) * e(word|v).
+  #
+  # The result is a base-2 logarithm. Without a +word+ only the transition
+  # term q(v|w, u) is used.
+  #
+  def probability(w, u, v, word = nil)
+    return Math.log2(model.q(w, u, v)) unless word
+    Math.log2(model.q(w, u, v) * model.e(word, v))
+  end
+end

data/lib/myaso/tagger/model.rb ADDED

@@ -0,0 +1,68 @@
+# encoding: utf-8
+# Any HMM tagger requires a trained model that can perform such tasks as
+# producing smoothed q() and e() values, replace unknown words with special
+# symbols.
+#
+class Myaso::Tagger::Model
+  attr_reader :ngrams, :lexicon, :interpolations
+  # Tagging model requires n-grams and lexicon.
+  #
+  # It is possible to the the interpolations vector when its values are
+  # known. If there are necessity to recompute the interpolations then
+  # nil shall be given (default behavior). If there should be no
+  # interpolations then false shall be given. In other cases it is possible
+  # to set them explicitly.
+  #
+  def initialize(interpolations = nil)
+    @ngrams, @lexicon = Myaso::Ngrams.new, Myaso::Lexicon.new
+    @interpolations = if interpolations == false
+      [0.33, 0.33, 0.33]
+    elsif interpolations.nil?
+      nil
+    else
+      interpolations
+    end
+    learn!
+  end
+  # Linear interpolation model of processing probability of
+  # occurence of the trigram (first, second, third). It
+  # consider three summands: the first one has the next sense:
+  # probability that current tag is (third) if last two are
+  # (first, second), the second one -- that last one is (second),
+  # and the last summand consider independent probability that
+  # current tag is (third).
+  #
+  def q(first, second, third)
+    q1 = conditional(ngrams[third], ngrams.unigrams_count)
+    q2 = conditional(ngrams[second, third], ngrams[second])
+    q3 = conditional(ngrams[first, second, third], ngrams[first, second])
+    q1 * interpolations[0] + q2 * interpolations[1] + q3 * interpolations[2]
+  end
+  # Function e in the Viterbi algorithm. It process probability of
+  # generation word with this tag relatively to all words with
+  # this tag.
+  #
+  def e(word, tag)
+    conditional(lexicon[word, tag], ngrams[tag])
+  end
+  # If word is rare, than it should be replaced in preparation of the
+  # training set. So, it can't be in the training set.
+  #
+  def rare?(word)
+    lexicon[word] <= 1
+  end
+  # Conditional probability p(A|B) = p(A, B) / p(B). Returns zero when
+  # denominator is zero.
+  #
+  def conditional(ab, b)
+    return 0.0 if b.zero?
+    ab / b.to_f
+  end
+end