myaso 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,187 @@
1
+ # Mystem is a popular morphological analyzer for Russian that is written
2
+ # at Yandex by Ilya Segalovich and Vitaly Titov. The analyzer can
3
+ # efficiently deal with non-dictionary words and produce hypotheses
4
+ # for such words. It is available on <https://tech.yandex.ru/mystem/>.
5
+ module Myaso::Mystem extend self
6
+ # Lemma is a canonical form of the word.
7
class Lemma < Struct.new(
  :lemma, :form, :quality, :msd,
  :stem_grammemes, :flex_grammemes, :flex_length, :rule_id
)
  ##
  # :attr_accessor: lemma
  # The lemma text of the word.

  ##
  # :attr_accessor: form
  # The normalized word form.

  ##
  # :attr_accessor: quality
  # Quality as according to +Myaso::Mystem::Library::QUALITY+.

  ##
  # :attr_accessor: msd
  # The morphosyntactic descriptor.

  ##
  # :attr_accessor: rule_id
  # The identifier of the inflection rule.

  # Delegates to +Myaso::Mystem.forms+ using this lemma and its rule.
  def forms
    Myaso::Mystem.forms(lemma, rule_id)
  end

  # Delegates to +Myaso::Mystem.inflect+ over the forms of this lemma.
  #
  # :call-seq:
  # inflect(Hash)
  def inflect(grammemes)
    Myaso::Mystem.inflect(forms, grammemes)
  end

  def inspect #:nodoc:
    format('#<%s lemma=%s msd="%s">', self.class.name, to_s.inspect, msd)
  end

  # The textual representation of a lemma is the lemma text itself.
  def to_s #:nodoc:
    lemma
  end
end
49
+
50
+ # A word form generated by mystem.
51
class Form < Struct.new(
  :form, :msd,
  :stem_grammemes, :flex_grammemes
)
  ##
  # :attr_accessor: form
  # The normalized word form.

  ##
  # :attr_accessor: msd
  # The morphosyntactic descriptor.

  def inspect #:nodoc:
    format('#<%s form=%s msd="%s">', self.class.name, to_s.inspect, msd)
  end

  # The textual representation of a form is the form text itself.
  def to_s #:nodoc:
    form
  end
end
68
+
69
+ # Analyzes a +word+ and returns an array of lemmas, each of which
70
+ # represent a particular ambiguous morphological interpretation.
71
+ #
72
+ # :call-seq:
73
+ # analyze(String)
74
def analyze(word)
  lemmas = []

  invoke_analyze(as_symbols(word), word.length) do |entry|
    # Texts come back as (pointer, length) pairs and are decoded here.
    lemma_text = as_string(MystemLemmaText(entry), MystemLemmaTextLen(entry))
    form_text = as_string(MystemLemmaForm(entry), MystemLemmaFormLen(entry))

    # Grammemes are handled as arrays of bytes: the stem ones arrive as a
    # single string, the flex ones as an array of strings.
    stem = MystemLemmaStemGram(entry).bytes
    flex = as_strings(MystemLemmaFlexGram(entry), MystemLemmaFlexGramNum(entry))

    quality = QUALITY[MystemLemmaQuality(entry)]
    msd = Myasorubka::Mystem::Binary.to_msd(stem | flex)

    lemmas << Lemma.new(
      lemma_text, form_text, quality, msd,
      stem, flex,
      MystemLemmaFlexLen(entry),
      MystemLemmaRuleId(entry)
    )
  end

  lemmas
end
102
+
103
+ # Analyzes a +word+ and returns an array of its forms as according
104
+ # to the given +rule_id+.
105
+ #
106
+ # :call-seq:
107
+ # forms(String, Fixnum)
108
def forms(word, rule_id)
  generated = []

  invoke_analyze(as_symbols(word), word.length) do |entry|
    # Only the interpretation produced by the requested rule is expanded.
    next if rule_id != MystemLemmaRuleId(entry)

    invoke_generate(entry) do |item|
      text = as_string(MystemFormText(item), MystemFormTextLen(item))

      # Grammemes are handled as arrays of bytes: the stem ones arrive as a
      # single string, the flex ones as an array of strings.
      stem = MystemFormStemGram(item).bytes
      flex = as_strings(MystemFormFlexGram(item), MystemFormFlexGramNum(item))

      msd = Myasorubka::Mystem::Binary.to_msd(stem | flex)
      generated << Form.new(text, msd, stem, flex)
    end
  end

  generated
end
133
+
134
+ # Finds exact matches of +grammemes+ for the provided +forms+ of a word.
135
+ # It is necessary to be careful because computational linguistics is a
136
+ # hard field.
137
+ #
138
+ # :call-seq:
139
+ # inflect([Form], Hash)
140
def inflect(forms, grammemes)
  # Keep the forms whose MSD agrees with every requested grammeme; an
  # empty request therefore keeps all of them.
  forms.select do |candidate|
    grammemes.all? { |name, value| candidate.msd.grammemes[name] == value }
  end
end
145
+
146
+ protected
147
+
148
def invoke_analyze(symbols, length, &block) #:nodoc:
  # Runs the analysis, hands every lemma handle to the block and releases
  # the native analyses afterwards, even when the block raises.
  analyses = MystemAnalyze(symbols, length)
  MystemAnalysesCount(analyses).times do |index|
    block.call(MystemLemma(analyses, index))
  end
ensure
  # When MystemAnalyze itself raised there is nothing to release; do not
  # hand nil (a NULL pointer) to the native destructor.
  MystemDeleteAnalyses(analyses) if analyses
end
156
+
157
def invoke_generate(lemma, &block) #:nodoc:
  # Generates the forms of the lemma, hands every form handle to the block
  # and releases the native forms afterwards, even when the block raises.
  generated = MystemGenerate(lemma)
  MystemFormsCount(generated).times do |index|
    block.call(MystemForm(generated, index))
  end
ensure
  # When MystemGenerate itself raised there is nothing to release; do not
  # hand nil (a NULL pointer) to the native destructor.
  MystemDeleteForms(generated) if generated
end
165
+
166
def as_symbols(string) #:nodoc:
  # One :ushort cell per character, filled with the characters' code points.
  # NOTE(review): the buffer is typed :ushort but is written through the
  # signed write_array_of_short call -- confirm this is intended.
  buffer = FFI::MemoryPointer.new(:ushort, string.length)
  buffer.write_array_of_short(string.codepoints)
end
171
+
172
def as_string(symbols, length) #:nodoc:
  # Reads +length+ 16-bit code points and glues their UTF-8 characters.
  code_points = symbols.read_array_of_ushort(length)
  characters = code_points.map { |code| code.chr(Encoding::UTF_8) }
  characters.join
end
177
+
178
def as_strings(grammemes, grammemes_length) #:nodoc:
  # Every string contributes its bytes; the result keeps each byte once,
  # in order of first appearance.
  grammemes.
    get_array_of_string(0, grammemes_length).
    flat_map(&:bytes).
    uniq
end
187
+ end
@@ -0,0 +1,59 @@
1
+ # Myaso uses foreign function interface to interact with the mystem
2
+ # shared library.
3
module Myaso::Mystem::Library
  extend FFI::Library

  # The library path is taken from the MYSTEM_LIBRARY environment variable.
  # Only when the variable is absent are the conventional locations (and
  # the current directory) searched, so the glob is given as a block and
  # is not evaluated when an explicit path is configured.
  begin
    ffi_lib(ENV.fetch('MYSTEM_LIBRARY') do
      Dir["{/{opt,usr}/{,local/}lib{,64},.}/libmystem_c_binding.{dylib,so}"]
    end)
  rescue LoadError
    fail 'The mystem library could not be loaded. ' \
         'Please install it and set the MYSTEM_LIBRARY ' \
         'environment variable to its path.'
  end

  # Analyses of a word: creation, size and disposal.
  attach_function :MystemAnalyze, [:pointer, :int], :pointer
  attach_function :MystemAnalysesCount, [:pointer], :int
  attach_function :MystemDeleteAnalyses, [:pointer], :void

  # A lemma taken from the analyses by its index, and its attributes.
  attach_function :MystemLemma, [:pointer, :int], :pointer
  attach_function :MystemLemmaText, [:pointer], :pointer
  attach_function :MystemLemmaTextLen, [:pointer], :int
  attach_function :MystemLemmaForm, [:pointer], :pointer
  attach_function :MystemLemmaFormLen, [:pointer], :int
  attach_function :MystemLemmaQuality, [:pointer], :int
  attach_function :MystemLemmaStemGram, [:pointer], :string
  attach_function :MystemLemmaFlexGram, [:pointer], :pointer
  attach_function :MystemLemmaFlexGramNum, [:pointer], :int
  attach_function :MystemLemmaFlexLen, [:pointer], :int
  attach_function :MystemLemmaRuleId, [:pointer], :int

  # Forms generated for a lemma: creation, disposal and size.
  attach_function :MystemGenerate, [:pointer], :pointer
  attach_function :MystemDeleteForms, [:pointer], :void
  attach_function :MystemFormsCount, [:pointer], :int

  # A form taken from the generated forms by its index, and its attributes.
  attach_function :MystemForm, [:pointer, :int], :pointer
  attach_function :MystemFormText, [:pointer], :pointer
  attach_function :MystemFormTextLen, [:pointer], :int
  attach_function :MystemFormStemGram, [:pointer], :string
  attach_function :MystemFormFlexGram, [:pointer], :pointer
  attach_function :MystemFormFlexGramNum, [:pointer], :int

  # A meaningful mapping between mystem's internal word quality
  # descriptors and the Ruby symbols.
  QUALITY = {
    0x00000000 => :dictionary,
    0x00000001 => :bastard,
    0x00000002 => :sob,
    0x00000004 => :prefixoid,
    0x00000008 => :foundling,
    0x00000010 => :bad_request,
    0x00010000 => :from_english,
    0x00020000 => :to_english,
    0x00040000 => :untranslit,
    0x00100000 => :overrode,
    0x01000000 => :fix
  }.freeze
end
57
+
58
# Make the attached functions callable both on Myaso::Mystem itself
# (extend) and on whatever includes it (include).
[:extend, :include].each do |mixin|
  Myaso::Mystem.send(mixin, Myaso::Mystem::Library)
end
@@ -0,0 +1,67 @@
1
+ # encoding: utf-8
2
+
3
+ # A simple yet handy implementation of a n-gram storage.
4
+ #
5
+ class Myaso::Ngrams
6
+ extend Forwardable
7
+ include Enumerable
8
+
9
+ attr_reader :table
10
+ protected :table
11
+
12
+ def_delegator :@table, :each, :each
13
+
14
+ # An instance of a n-gram storage is initialized by zero counts.
15
+ #
16
+ def initialize
17
+ @table = Hash.new do |h, k|
18
+ h[k] = Hash.new { |h_local, k_local| h_local[k_local] = Hash.new(0) }
19
+ end
20
+ end
21
+
22
+ # Obtain the count of the specified unigram, bigram, or trigram.
23
+ #
24
+ def [] unigram, bigram = nil, trigram = nil
25
+ return 0 unless table.include? unigram
26
+ return 0 unless table[unigram].include? bigram
27
+ table[unigram][bigram][trigram]
28
+ end
29
+
30
+ # Assign the count to the specified unigram, bigram, or trigram.
31
+ #
32
+ def []= unigram, bigram = nil, trigram = nil, count
33
+ @unigrams_count = nil
34
+ table[unigram][bigram][trigram] = count
35
+ end
36
+
37
+ # Two storages are equal iff they tables are equal.
38
+ #
39
+ def == other
40
+ self.table == other.table
41
+ end
42
+
43
+ # Trigrams enumerator. Yes, this method should return an Enumerator
44
+ # instance, but it is too slow.
45
+ #
46
+ def each_trigram
47
+ table.each do |unigram, bigrams|
48
+ bigrams.each do |bigram, trigrams|
49
+ next unless bigram
50
+
51
+ trigrams.each do |trigram, count|
52
+ next unless trigram
53
+
54
+ yield [[unigram, bigram, trigram], count]
55
+ end
56
+ end
57
+ end
58
+ end
59
+
60
+ # Unigrams count.
61
+ #
62
+ def unigrams_count
63
+ @unigrams_count ||= table.keys.inject(0) do |count, unigram|
64
+ count + table[unigram][nil][nil]
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+
3
+ # A simple implementation of a dynamic programming table in the following
4
+ # form: $\pi(i, u, v)$. where $i$ is an index and $u, v$ are elements of
5
+ # a finite set of tags.
6
+ #
7
class Myaso::PiTable
  extend Forwardable
  include Enumerable

  attr_reader :default, :table
  def_delegator :@table, :each, :each

  # An instance of a dynamic programming table can consider the specified
  # default value.
  #
  # The table is a three-level hash: i => u => v => value. Assignments
  # create the nested hashes on demand.
  #
  def initialize(default = nil)
    @default = default
    @table = Hash.new do |h, k|
      h[k] = Hash.new { |h_local, k_local| h_local[k_local] = Hash.new(default) }
    end
  end

  # Obtain the value of $\pi(i, u, v)$ or return the default value if it
  # has not been set. Reading a missing cell does not add any entries to
  # the table.
  #
  def [](i, u, v)
    return default unless table.key?(i) && table[i].key?(u)
    table[i][u][v]
  end

  # Set a value of $\pi(i, u, v)$.
  #
  def []=(i, u, v, value)
    table[i][u][v] = value
  end
end
@@ -0,0 +1,94 @@
1
+ # encoding: utf-8
2
+
3
+ # This class is an implementation of the Viterbi algorithm.
4
+ #
5
class Myaso::Tagger
  attr_reader :model

  # The tagger keeps the given +model+ and asks it for everything it
  # needs: word classification, start and stop symbols, the lexicon and
  # the q() and e() values.
  #
  def initialize(model)
    @model = model
  end

  # Returns the tags for the words of the +sentence+; an empty sentence
  # gets an empty array. Every word is first passed through the model's
  # +classify+ and the sentence is padded with two start symbols.
  #
  def annotate(sentence)
    return [] if sentence.size.zero?

    padded = sentence.map { |token| model.classify(token) }
    padded.unshift(model.start_symbol, model.start_symbol)

    pi, bp = forward(padded)
    backward(padded, pi, bp)
  end

  protected

  # Fills two dynamic programming tables: +pi+ with the best scores and
  # +bp+ with the tags these scores were reached from.
  #
  def forward(sentence)
    pi = Myaso::PiTable.new
    bp = Myaso::PiTable.new
    pi[1, model.start_symbol, model.start_symbol] = 0.0

    # The two padding positions can only carry the start symbol.
    tags_at = lambda do |token, position|
      position < 2 ? [model.start_symbol] : model.lexicon.tags(token)
    end

    sentence.each_with_index.each_cons(3) do |(before, before_at), (previous, previous_at), (word, k)|
      w_tags = tags_at.call(before, before_at)
      u_tags = tags_at.call(previous, previous_at)
      v_tags = model.lexicon.tags(word)

      u_tags.each do |u|
        v_tags.each do |v|
          pi[k, u, v], bp[k, u, v] = forward_iteration(pi, k, u, v, w_tags, word)
        end
      end
    end

    [pi, bp]
  end

  # One cell of the forward pass: among the +tags+ that have a finite
  # score at the previous position, picks the one giving the highest
  # score and returns the <tt>[score, tag]</tt> pair, or nil when there
  # is no such tag.
  #
  def forward_iteration(pi, k, u, v, tags, word)
    scored = tags.each_with_object([]) do |w, pairs|
      previous = pi[k - 1, w, u]
      next unless previous && previous.finite?

      pairs << [previous + probability(w, u, v, word), w]
    end

    scored.max_by(&:first)
  end

  # Follows the backpointers from the end of the sentence and returns
  # the tags without the two padding positions.
  #
  def backward(sentence, pi, bp)
    last = sentence.size - 1

    # A single word has no backpointers to follow: choose its tag directly.
    if last == 2
      singles = model.lexicon.tags(sentence[-1]).map { |tag| [tag] }
      return singles.max_by { |(tag)|
        pi[last, model.start_symbol, tag] +
          probability(model.start_symbol, tag, model.stop_symbol)
      }
    end

    tags = prepare_backward(sentence, pi)

    last.downto(4) do |k|
      tags[k - 2] = bp[k, tags[k - 1], tags[k]]
    end

    tags.slice!(2..-1)
  end

  # Allocates the tags array and fills its last two cells with the pair
  # of tags that has a finite score and is the best one to precede the
  # stop symbol.
  #
  def prepare_backward(sentence, pi)
    last = sentence.size - 1
    tags = Array.new(sentence.size)

    u_tags = model.lexicon.tags(sentence[-2])
    v_tags = model.lexicon.tags(sentence[-1])

    reachable = u_tags.product(v_tags).select do |u, v|
      score = pi[last, u, v]
      score && score.finite?
    end

    best = reachable.max_by do |u, v|
      pi[last, u, v] + probability(u, v, model.stop_symbol)
    end

    tags[last - 1], tags[last] = best
    tags
  end

  # The binary logarithm of q(v|w, u) * e(word|v), or of q(v|w, u) alone
  # when no +word+ is given.
  #
  def probability(w, u, v, word = nil)
    transition = model.q(w, u, v)
    return Math.log2(transition) unless word

    Math.log2(transition * model.e(word, v))
  end
end
@@ -0,0 +1,68 @@
1
+ # encoding: utf-8
2
+
3
+ # Any HMM tagger requires a trained model that can perform such tasks as
4
+ # producing smoothed q() and e() values, and replacing unknown words with special
5
+ # symbols.
6
+ #
7
class Myaso::Tagger::Model
  attr_reader :ngrams, :lexicon, :interpolations

  # Creates empty n-gram and lexicon storages, records the interpolations
  # vector and then calls +learn!+, which is not defined in this class.
  #
  # The +interpolations+ argument is treated as follows:
  # * +nil+ (the default) is stored as is;
  # * +false+ is replaced with the fixed vector [0.33, 0.33, 0.33];
  # * any other value is stored as given.
  #
  def initialize(interpolations = nil)
    @ngrams = Myaso::Ngrams.new
    @lexicon = Myaso::Lexicon.new
    @interpolations = interpolations == false ? [0.33, 0.33, 0.33] : interpolations
    learn!
  end

  # Linear interpolation of three estimates of the tag +third+: its
  # unigram estimate, its estimate given +second+, and its estimate given
  # the pair (+first+, +second+). The estimates are weighted by the
  # elements 0, 1 and 2 of the interpolations vector respectively.
  #
  def q(first, second, third)
    unigram = conditional(ngrams[third], ngrams.unigrams_count)
    bigram = conditional(ngrams[second, third], ngrams[second])
    trigram = conditional(ngrams[first, second, third], ngrams[first, second])

    unigram * interpolations[0] + bigram * interpolations[1] + trigram * interpolations[2]
  end

  # Function e of the Viterbi algorithm: the lexicon count of the +word+
  # with the +tag+ relative to the n-gram count of the +tag+.
  #
  def e(word, tag)
    conditional(lexicon[word, tag], ngrams[tag])
  end

  # A word is rare when the lexicon counts it at most once.
  #
  def rare?(word)
    lexicon[word] <= 1
  end

  # Conditional probability p(A|B) = p(A, B) / p(B), computed in floating
  # point. Returns zero when the denominator is zero.
  #
  def conditional(ab, b)
    b.zero? ? 0.0 : ab / b.to_f
  end
end
+ end