myaso 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,187 @@
1
+ # Mystem is a popular morphological analyzer for Russian that is written
2
+ # in Yandex by Ilya Segalovich and Vitaly Titov. The analyzer can
3
+ # efficiently deal with non-dictionary word and produce hypotheses
4
+ # for such words. It is available on <https://tech.yandex.ru/mystem/>.
5
+ module Myaso::Mystem extend self
6
# Lemma is a canonical form of the word, bundled with the details that
# Myaso::Mystem.analyze collects for one morphological interpretation.
class Lemma < Struct.new(:lemma, :form, :quality, :msd, :stem_grammemes, :flex_grammemes, :flex_length, :rule_id)
  ##
  # :attr_accessor: lemma
  # A lemma of the word.

  ##
  # :attr_accessor: form
  # A normalized word form.

  ##
  # :attr_accessor: quality
  # Quality as according to +Myaso::Mystem::Library::QUALITY+.

  ##
  # :attr_accessor: msd
  # A morphosyntactic descriptor.

  ##
  # :attr_accessor: stem_grammemes
  # Bytes of the stem grammeme string reported by mystem.

  ##
  # :attr_accessor: flex_grammemes
  # Unique bytes gathered from the flexion grammeme strings.

  ##
  # :attr_accessor: flex_length
  # The flexion length as reported by mystem.

  ##
  # :attr_accessor: rule_id
  # An inflection rule identifier.

  # A shortcut to +Myaso::Mystem.forms+ for this lemma and its rule.
  def forms
    Myaso::Mystem.forms(lemma, rule_id)
  end

  # A shortcut to +Myaso::Mystem.inflect+ applied to #forms.
  #
  # :call-seq:
  #   inflect(Hash)
  def inflect(grammemes)
    Myaso::Mystem.inflect(forms, grammemes)
  end

  def inspect #:nodoc:
    format('#<%s lemma=%s msd="%s">', self.class.name, to_s.inspect, msd)
  end

  def to_s #:nodoc:
    lemma
  end
end
49
+
50
# A word form generated by mystem.
class Form < Struct.new(:form, :msd, :stem_grammemes, :flex_grammemes)
  ##
  # :attr_accessor: form
  # A normalized word form.

  ##
  # :attr_accessor: msd
  # A morphosyntactic descriptor.

  ##
  # :attr_accessor: stem_grammemes
  # Bytes of the stem grammeme string reported by mystem.

  ##
  # :attr_accessor: flex_grammemes
  # Unique bytes gathered from the flexion grammeme strings.

  def inspect #:nodoc:
    format('#<%s form=%s msd="%s">', self.class.name, to_s.inspect, msd)
  end

  def to_s #:nodoc:
    form
  end
end
68
+
69
# Analyzes a +word+ and returns an array of lemmas, each of which
# represents a particular ambiguous morphological interpretation.
#
# A lemma whose stem grammeme string is missing (nil) gets an empty list
# of stem grammemes instead of raising.
#
# :call-seq:
#   analyze(String)
def analyze(word)
  lemmas = []

  invoke_analyze(as_symbols(word), word.length) do |lemma|
    # +to_s+ turns a nil stem string into an empty one.
    stem_grammemes = MystemLemmaStemGram(lemma).to_s.bytes
    flex_grammemes = as_strings(MystemLemmaFlexGram(lemma),
                                MystemLemmaFlexGramNum(lemma))

    lemmas << Lemma.new(
      as_string(MystemLemmaText(lemma), MystemLemmaTextLen(lemma)),       # lemma
      as_string(MystemLemmaForm(lemma), MystemLemmaFormLen(lemma)),       # form
      QUALITY[MystemLemmaQuality(lemma)],                                 # quality
      Myasorubka::Mystem::Binary.to_msd(stem_grammemes | flex_grammemes), # msd
      stem_grammemes,                                                     # stem_grammemes
      flex_grammemes,                                                     # flex_grammemes
      MystemLemmaFlexLen(lemma),                                          # flex_length
      MystemLemmaRuleId(lemma)                                            # rule_id
    )
  end

  lemmas
end
102
+
103
# Analyzes a +word+ and returns an array of its forms as according
# to the given +rule_id+. Interpretations whose rule identifier differs
# from +rule_id+ are skipped.
#
# A form whose stem grammeme string is missing (nil) gets an empty list
# of stem grammemes instead of raising.
#
# :call-seq:
#   forms(String, Fixnum)
def forms(word, rule_id)
  generated = []

  invoke_analyze(as_symbols(word), word.length) do |lemma|
    next unless rule_id == MystemLemmaRuleId(lemma)

    invoke_generate(lemma) do |form|
      # +to_s+ turns a nil stem string into an empty one.
      stem_grammemes = MystemFormStemGram(form).to_s.bytes
      flex_grammemes = as_strings(MystemFormFlexGram(form),
                                  MystemFormFlexGramNum(form))

      generated << Form.new(
        as_string(MystemFormText(form), MystemFormTextLen(form)),           # form
        Myasorubka::Mystem::Binary.to_msd(stem_grammemes | flex_grammemes), # msd
        stem_grammemes,                                                     # stem_grammemes
        flex_grammemes                                                      # flex_grammemes
      )
    end
  end

  generated
end
133
+
134
# Finds exact matches of +grammemes+ for the provided +forms+ of a word.
# A form is kept when, for every key of +grammemes+, the value stored in
# <tt>form.msd.grammemes</tt> equals the requested one; an empty
# +grammemes+ hash therefore keeps every form.
#
# It is necessary to be careful because computational linguistics is a
# hard field.
#
# :call-seq:
#   inflect([Form], Hash)
def inflect(forms, grammemes)
  forms.select do |form|
    grammemes.all? { |name, value| form.msd.grammemes[name] == value }
  end
end
145
+
146
+ protected
147
+
148
# Runs MystemAnalyze on +symbols+, passes every resulting lemma handle to
# the block and finally hands the analyses back to MystemDeleteAnalyses,
# even if the block raises.
def invoke_analyze(symbols, length, &block) #:nodoc:
  analyses = MystemAnalyze(symbols, length)
  MystemAnalysesCount(analyses).times do |index|
    block.call(MystemLemma(analyses, index))
  end
ensure
  # When MystemAnalyze itself raised there is nothing to release, and nil
  # must not be passed to the deleter.
  MystemDeleteAnalyses(analyses) if analyses
end
156
+
157
# Runs MystemGenerate on +lemma+, passes every resulting form handle to
# the block and finally hands the forms back to MystemDeleteForms, even
# if the block raises.
def invoke_generate(lemma, &block) #:nodoc:
  generated = MystemGenerate(lemma)
  MystemFormsCount(generated).times do |index|
    block.call(MystemForm(generated, index))
  end
ensure
  # When MystemGenerate itself raised there is nothing to release, and nil
  # must not be passed to the deleter.
  MystemDeleteForms(generated) if generated
end
165
+
166
# Copies the code points of +string+ into a freshly allocated buffer of
# unsigned shorts, one element per character, and returns that buffer.
def as_symbols(string) #:nodoc:
  codes = string.chars.map(&:ord)
  # The buffer is typed :ushort and read back with read_array_of_ushort
  # in as_string, so it is written with the unsigned writer as well.
  FFI::MemoryPointer.
    new(:ushort, codes.length).
    write_array_of_ushort(codes)
end
171
+
172
# Reads +length+ unsigned shorts from +symbols+ and joins the matching
# characters into one UTF-8 string. A non-positive +length+ yields an
# empty UTF-8 string without touching +symbols+ at all.
def as_string(symbols, length) #:nodoc:
  return ''.encode(Encoding::UTF_8) if length <= 0

  symbols.read_array_of_ushort(length).
    map { |code| code.chr(Encoding::UTF_8) }.
    join
end
177
+
178
# Reads +grammemes_length+ strings from +grammemes+ and returns the
# unique bytes found in them, in order of first appearance. A
# non-positive length yields an empty array without touching
# +grammemes+; nil entries contribute no bytes.
def as_strings(grammemes, grammemes_length) #:nodoc:
  return [] if grammemes_length <= 0

  grammemes.get_array_of_string(0, grammemes_length).
    flat_map { |ids| ids.to_s.bytes }.
    uniq
end
187
+ end
@@ -0,0 +1,59 @@
1
# Myaso uses foreign function interface to interact with the mystem
# shared library.
#
# The library path is taken from the MYSTEM_LIBRARY environment variable;
# when it is not set, a few conventional directories and the current one
# are scanned for libmystem_c_binding.
module Myaso::Mystem::Library
  extend FFI::Library

  begin
    # The block form keeps the directory scan from running when the
    # environment variable is already set.
    candidates = ENV.fetch('MYSTEM_LIBRARY') do
      Dir["{/{opt,usr}/{,local/}lib{,64},.}/libmystem_c_binding.{dylib,so}"]
    end
    ffi_lib candidates
  rescue LoadError
    fail 'The mystem library could not be loaded. ' \
         'Please install it and set the MYSTEM_LIBRARY ' \
         'environment variable to its path.'
  end

  # Analysis of a word and disposal of its results.
  attach_function :MystemAnalyze, [:pointer, :int], :pointer
  attach_function :MystemAnalysesCount, [:pointer], :int
  attach_function :MystemDeleteAnalyses, [:pointer], :void

  # Accessors of a single lemma of an analysis.
  attach_function :MystemLemma, [:pointer, :int], :pointer
  attach_function :MystemLemmaText, [:pointer], :pointer
  attach_function :MystemLemmaTextLen, [:pointer], :int
  attach_function :MystemLemmaForm, [:pointer], :pointer
  attach_function :MystemLemmaFormLen, [:pointer], :int
  attach_function :MystemLemmaQuality, [:pointer], :int
  attach_function :MystemLemmaStemGram, [:pointer], :string
  attach_function :MystemLemmaFlexGram, [:pointer], :pointer
  attach_function :MystemLemmaFlexGramNum, [:pointer], :int
  attach_function :MystemLemmaFlexLen, [:pointer], :int
  attach_function :MystemLemmaRuleId, [:pointer], :int

  # Generation of the forms of a lemma and disposal of the results.
  attach_function :MystemGenerate, [:pointer], :pointer
  attach_function :MystemDeleteForms, [:pointer], :void
  attach_function :MystemFormsCount, [:pointer], :int

  # Accessors of a single generated form.
  attach_function :MystemForm, [:pointer, :int], :pointer
  attach_function :MystemFormText, [:pointer], :pointer
  attach_function :MystemFormTextLen, [:pointer], :int
  attach_function :MystemFormStemGram, [:pointer], :string
  attach_function :MystemFormFlexGram, [:pointer], :pointer
  attach_function :MystemFormFlexGramNum, [:pointer], :int

  # A meaningful mapping between mystem's internal word quality
  # descriptors and the Ruby symbols.
  #
  # NOTE(review): the keys look like individual bit flags; a value that
  # combines several of them has no entry here and would look up as nil.
  # Confirm against the mystem headers whether combinations can occur.
  QUALITY = {
    0x00000000 => :dictionary,
    0x00000001 => :bastard,
    0x00000002 => :sob,
    0x00000004 => :prefixoid,
    0x00000008 => :foundling,
    0x00000010 => :bad_request,
    0x00010000 => :from_english,
    0x00020000 => :to_english,
    0x00040000 => :untranslit,
    0x00100000 => :overrode,
    0x01000000 => :fix
  }.freeze
end

# Make the attached functions callable both on Myaso::Mystem itself and
# from within its instance-level methods.
Myaso::Mystem.send(:extend, Myaso::Mystem::Library)
Myaso::Mystem.send(:include, Myaso::Mystem::Library)
@@ -0,0 +1,67 @@
1
+ # encoding: utf-8
2
+
3
# A simple yet handy implementation of a n-gram storage.
#
# Counts live in a three-level hash: unigram -> bigram -> trigram -> count.
# A +nil+ key on the second or third level stands for "no further
# element", so the count of a plain unigram is kept under
# <tt>[unigram][nil][nil]</tt>.
#
class Myaso::Ngrams
  extend Forwardable
  include Enumerable

  attr_reader :table
  protected :table

  def_delegator :@table, :each, :each

  # An instance of a n-gram storage is initialized by zero counts.
  #
  def initialize
    @table = Hash.new do |unigrams, unigram|
      unigrams[unigram] = Hash.new do |bigrams, bigram|
        bigrams[bigram] = Hash.new(0)
      end
    end
  end

  # Obtain the count of the specified unigram, bigram, or trigram.
  # Unknown n-grams count as zero and are not added to the table.
  #
  def [](unigram, bigram = nil, trigram = nil)
    return 0 unless table.include?(unigram)
    return 0 unless table[unigram].include?(bigram)
    table[unigram][bigram][trigram]
  end

  # Assign the count to the specified unigram, bigram, or trigram.
  #
  def []=(unigram, bigram = nil, trigram = nil, count)
    # Any write may change the total, so drop the memoized value.
    @unigrams_count = nil
    table[unigram][bigram][trigram] = count
  end

  # Two storages are equal iff their tables are equal. Anything that is
  # not a n-gram storage is never equal to one.
  #
  def ==(other)
    other.is_a?(Myaso::Ngrams) && table == other.table
  end

  # Trigrams enumerator. Yields <tt>[[unigram, bigram, trigram], count]</tt>
  # for every stored trigram, skipping the +nil+ placeholders of unigram
  # and bigram counts. Without a block an Enumerator is returned.
  #
  def each_trigram
    return enum_for(:each_trigram) unless block_given?

    table.each do |unigram, bigrams|
      bigrams.each do |bigram, trigrams|
        next unless bigram

        trigrams.each do |trigram, count|
          next unless trigram

          yield [[unigram, bigram, trigram], count]
        end
      end
    end
  end

  # Unigrams count: the sum of the counts of all stored unigrams. The
  # value is memoized until the next assignment.
  #
  def unigrams_count
    # Going through #[] keeps the lookup from creating empty entries for
    # unigrams that only appear as the head of a bigram or trigram.
    @unigrams_count ||= table.keys.inject(0) do |count, unigram|
      count + self[unigram]
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+
3
# A simple implementation of a dynamic programming table in the following
# form: $\pi(i, u, v)$, where $i$ is an index and $u, v$ are elements of
# a finite set of tags.
#
class Myaso::PiTable
  extend Forwardable
  include Enumerable

  attr_reader :default, :table
  def_delegator :@table, :each, :each

  # An instance of a dynamic programming table can consider the specified
  # default value, which is what lookups of unset cells return.
  #
  def initialize(default = nil)
    @default = default
    @table = Hash.new do |by_index, i|
      by_index[i] = Hash.new { |by_u, u| by_u[u] = Hash.new(default) }
    end
  end

  # Obtain the value of $\pi(i, u, v)$ or return the default value if the
  # cell has never been set. Looking a cell up does not add anything to
  # the table.
  #
  def [](i, u, v)
    by_u = table.fetch(i) { return default }
    by_v = by_u.fetch(u) { return default }
    by_v[v]
  end

  # Set a value of $\pi(i, u, v)$.
  #
  def []=(i, u, v, value)
    table[i][u][v] = value
  end
end
@@ -0,0 +1,94 @@
1
+ # encoding: utf-8
2
+
3
# This class is an implementation of the Viterbi algorithm.
#
class Myaso::Tagger
  attr_reader :model

  # An instance of Tagger should be initialized with an instance of
  # trained HMM.
  #
  def initialize(model)
    @model = model
  end

  # Viterbi algorithm itself. Return tags that input sentence
  # should be annotated. An empty sentence yields an empty array.
  #
  def annotate(sentence)
    return [] if sentence.size.zero?

    padded = sentence.map { |word| model.classify(word) }
    # Two start symbols give the first word a complete trigram context.
    padded.unshift(model.start_symbol, model.start_symbol)

    pi, bp = forward(padded)
    backward(padded, pi, bp)
  end

  protected

  # Emit probabilities into the dynamic programming tables. Returns the
  # pair of the score table and the backpointer table.
  #
  def forward(sentence)
    pi = Myaso::PiTable.new
    bp = Myaso::PiTable.new
    pi[1, model.start_symbol, model.start_symbol] = 0.0

    sentence.each_with_index.each_cons(3) do |(w1, i1), (w2, i2), (word, k)|
      # Positions 0 and 1 hold the padding, whose only tag is the start
      # symbol itself.
      w_tags = i1 < 2 ? [model.start_symbol] : model.lexicon.tags(w1)
      u_tags = i2 < 2 ? [model.start_symbol] : model.lexicon.tags(w2)
      v_tags = model.lexicon.tags(word)

      u_tags.product(v_tags).each do |u, v|
        pi[k, u, v], bp[k, u, v] = forward_iteration(pi, k, u, v, w_tags, word)
      end
    end

    [pi, bp]
  end

  # Essential of forward part of Viterbi algorithm. Returns the pair
  # <tt>[score, tag]</tt> of the best predecessor tag among +tags+, or
  # nil when none of them has a finite score.
  #
  def forward_iteration(pi, k, u, v, tags, word)
    candidates = tags.each_with_object([]) do |w, scored|
      previous = pi[k - 1, w, u]
      next unless previous && previous.finite?

      scored << [previous + probability(w, u, v, word), w]
    end

    candidates.max_by(&:first)
  end

  # Use backpoints to retrieve the computed tags from the previous stage.
  #
  def backward(sentence, pi, bp)
    size = sentence.size - 1

    # A single word after the padding: pick its best tag directly.
    if (size - 2).zero?
      singles = model.lexicon.tags(sentence[-1]).map { |tag| [tag] }
      return singles.max_by do |(tag)|
        pi[size, model.start_symbol, tag] +
          probability(model.start_symbol, tag, model.stop_symbol)
      end
    end

    tags = prepare_backward(sentence, pi)

    size.downto(4) do |k|
      tags[k - 2] = bp[k, tags[k - 1], tags[k]]
    end

    # Drop the two padding positions and return the rest.
    tags.slice!(2..-1)
  end

  # Preparations to tags computing: allocates the tag array and fills in
  # the best pair of tags for the last two words.
  #
  def prepare_backward(sentence, pi)
    size = sentence.size - 1
    tags = Array.new(sentence.size)

    u_tags = model.lexicon.tags(sentence[-2])
    v_tags = model.lexicon.tags(sentence[-1])

    reachable = u_tags.product(v_tags).select do |u, v|
      score = pi[size, u, v]
      score && score.finite?
    end
    best = reachable.max_by do |u, v|
      pi[size, u, v] + probability(u, v, model.stop_symbol)
    end

    # +best+ is nil when no pair is reachable; both cells then stay nil.
    tags[size - 1], tags[size] = best

    tags
  end

  # Compute the probability of q(v|w, u) * e(word|v) as a base-2
  # logarithm. Without a +word+ only q(v|w, u) is considered.
  #
  def probability(w, u, v, word = nil)
    likelihood = model.q(w, u, v)
    likelihood *= model.e(word, v) if word
    Math.log2(likelihood)
  end
end
@@ -0,0 +1,68 @@
1
+ # encoding: utf-8
2
+
3
# Any HMM tagger requires a trained model that can perform such tasks as
# producing smoothed q() and e() values, replace unknown words with special
# symbols.
#
class Myaso::Tagger::Model
  attr_reader :ngrams, :lexicon, :interpolations

  # Tagging model requires n-grams and lexicon.
  #
  # It is possible to pass the interpolations vector when its values are
  # known. If it is necessary to recompute the interpolations then
  # nil shall be given (default behavior). If there should be no
  # interpolations then false shall be given, which makes all three
  # weights equal to 0.33. In other cases it is possible to set them
  # explicitly.
  #
  # NOTE(review): +learn!+ is not defined in this class as shown here;
  # presumably concrete models provide it.
  #
  def initialize(interpolations = nil)
    @ngrams, @lexicon = Myaso::Ngrams.new, Myaso::Lexicon.new
    # Only +false+ is translated; nil and explicit vectors are kept as is.
    @interpolations = interpolations == false ? [0.33, 0.33, 0.33] : interpolations
    learn!
  end

  # Linear interpolation model of processing probability of
  # occurrence of the trigram (first, second, third). It
  # considers three summands, weighted by the interpolations vector in
  # this order: the independent probability that the current tag is
  # (third); the probability that the current tag is (third) if the last
  # one is (second); and the probability that the current tag is (third)
  # if the last two are (first, second).
  #
  def q(first, second, third)
    unigram = conditional(ngrams[third], ngrams.unigrams_count)
    bigram  = conditional(ngrams[second, third], ngrams[second])
    trigram = conditional(ngrams[first, second, third], ngrams[first, second])

    unigram * interpolations[0] + bigram * interpolations[1] + trigram * interpolations[2]
  end

  # Function e in the Viterbi algorithm. It processes the probability of
  # generation of the word with this tag relatively to all words with
  # this tag.
  #
  def e(word, tag)
    conditional(lexicon[word, tag], ngrams[tag])
  end

  # If word is rare, then it should be replaced in preparation of the
  # training set. So, it can't be in the training set. A word is rare
  # when the lexicon counts it at most once.
  #
  def rare?(word)
    lexicon[word] <= 1
  end

  # Conditional probability p(A|B) = p(A, B) / p(B). Returns zero when
  # denominator is zero.
  #
  def conditional(ab, b)
    b.zero? ? 0.0 : ab / b.to_f
  end
end
+ end