myaso 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/.travis.yml +10 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/Rakefile +21 -0
- data/bin/myaso +73 -0
- data/lib/myaso.rb +35 -0
- data/lib/myaso/lexicon.rb +70 -0
- data/lib/myaso/mystem.rb +187 -0
- data/lib/myaso/mystem/library.rb +59 -0
- data/lib/myaso/ngrams.rb +67 -0
- data/lib/myaso/pi_table.rb +36 -0
- data/lib/myaso/tagger.rb +94 -0
- data/lib/myaso/tagger/model.rb +68 -0
- data/lib/myaso/tagger/tnt.rb +183 -0
- data/lib/myaso/version.rb +9 -0
- data/myaso.gemspec +26 -0
- data/myaso.jpg +0 -0
- data/spec/bin_spec.rb +48 -0
- data/spec/data/test.123 +77 -0
- data/spec/data/test.lex +10 -0
- data/spec/fixtures/interpolations.yml +4 -0
- data/spec/fixtures/lexicon.yml +32 -0
- data/spec/fixtures/ngrams.yml +106 -0
- data/spec/lexicon_spec.rb +84 -0
- data/spec/mystem_spec.rb +81 -0
- data/spec/ngrams_spec.rb +97 -0
- data/spec/pi_table_spec.rb +53 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/support/fixtures.rb +34 -0
- data/spec/support/invoker.rb +29 -0
- data/spec/tagger_spec.rb +27 -0
- data/spec/tagger_tnt_spec.rb +73 -0
- metadata +137 -0
data/lib/myaso/mystem.rb
ADDED
@@ -0,0 +1,187 @@
|
|
1
|
+
# Mystem is a popular morphological analyzer for Russian that was written
# at Yandex by Ilya Segalovich and Vitaly Titov. The analyzer can
# efficiently deal with non-dictionary words and produce hypotheses
# for such words. It is available on <https://tech.yandex.ru/mystem/>.
module Myaso::Mystem
  extend self

  # Lemma is a canonical form of the word.
  class Lemma < Struct.new(:lemma, :form, :quality, :msd, :stem_grammemes, :flex_grammemes, :flex_length, :rule_id)
    ##
    # :attr_accessor: lemma
    # A lemma of the word.

    ##
    # :attr_accessor: form
    # A normalized word form.

    ##
    # :attr_accessor: quality
    # Quality as according to +Myaso::Mystem::Library::QUALITY+.

    ##
    # :attr_accessor: msd
    # A morphosyntactic descriptor.

    ##
    # :attr_accessor: stem_grammemes
    # Bytes of the stem grammemes string reported by mystem.

    ##
    # :attr_accessor: flex_grammemes
    # Unique bytes of the flexion grammemes strings reported by mystem.

    ##
    # :attr_accessor: flex_length
    # The flexion length reported by mystem.

    ##
    # :attr_accessor: rule_id
    # An inflection rule identifier.

    # A shortcut to +Myaso::Mystem.forms+.
    def forms
      Myaso::Mystem.forms(lemma, rule_id)
    end

    # A shortcut to +Myaso::Mystem.inflect+.
    #
    # :call-seq:
    #   inflect(Hash)
    def inflect(grammemes)
      Myaso::Mystem.inflect(forms, grammemes)
    end

    def inspect #:nodoc:
      '#<%s lemma=%s msd="%s">' % [self.class.name, to_s.inspect, msd]
    end

    def to_s #:nodoc:
      lemma
    end
  end

  # A word form generated by mystem.
  class Form < Struct.new(:form, :msd, :stem_grammemes, :flex_grammemes)
    ##
    # :attr_accessor: form
    # A normalized word form.

    ##
    # :attr_accessor: msd
    # A morphosyntactic descriptor.

    def inspect #:nodoc:
      '#<%s form=%s msd="%s">' % [self.class.name, to_s.inspect, msd]
    end

    def to_s #:nodoc:
      form
    end
  end

  # Analyzes a +word+ and returns an array of lemmas, each of which
  # represents a particular ambiguous morphological interpretation.
  #
  # :call-seq:
  #   analyze(String)
  def analyze(word)
    Array.new.tap do |lemmas|
      invoke_analyze(as_symbols(word), word.length) do |lemma|
        lemma_text = MystemLemmaText(lemma)
        lemma_text_len = MystemLemmaTextLen(lemma)

        form_text = MystemLemmaForm(lemma)
        form_text_len = MystemLemmaFormLen(lemma)

        stem_grammemes = MystemLemmaStemGram(lemma).bytes
        flex_grammemes_raw = MystemLemmaFlexGram(lemma)
        flex_grammemes_len = MystemLemmaFlexGramNum(lemma)
        flex_grammemes = as_strings(flex_grammemes_raw, flex_grammemes_len)
        grammemes = stem_grammemes | flex_grammemes

        # NOTE(review): QUALITY is keyed by single flag values; a combined
        # bit mask would be mapped to nil here -- confirm what the library
        # actually returns.
        lemmas << Lemma.new(
          as_string(lemma_text, lemma_text_len),        # lemma
          as_string(form_text, form_text_len),          # form
          QUALITY[MystemLemmaQuality(lemma)],           # quality
          Myasorubka::Mystem::Binary.to_msd(grammemes), # msd
          stem_grammemes,                               # stem_grammemes
          flex_grammemes,                               # flex_grammemes
          MystemLemmaFlexLen(lemma),                    # flex_length
          MystemLemmaRuleId(lemma)                      # rule_id
        )
      end
    end
  end

  # Analyzes a +word+ and returns an array of its forms as according
  # to the given +rule_id+. Analyses with a different rule identifier
  # are skipped.
  #
  # :call-seq:
  #   forms(String, Fixnum)
  def forms(word, rule_id)
    Array.new.tap do |forms|
      invoke_analyze(as_symbols(word), word.length) do |lemma|
        next unless rule_id == MystemLemmaRuleId(lemma)

        invoke_generate(lemma) do |form|
          form_text = MystemFormText(form)
          form_text_len = MystemFormTextLen(form)

          stem_grammemes = MystemFormStemGram(form).bytes
          flex_grammemes_raw = MystemFormFlexGram(form)
          flex_grammemes_len = MystemFormFlexGramNum(form)
          flex_grammemes = as_strings(flex_grammemes_raw, flex_grammemes_len)
          grammemes = stem_grammemes | flex_grammemes

          forms << Form.new(
            as_string(form_text, form_text_len),          # form
            Myasorubka::Mystem::Binary.to_msd(grammemes), # msd
            stem_grammemes,                               # stem_grammemes
            flex_grammemes                                # flex_grammemes
          )
        end
      end
    end
  end

  # Finds exact matches of +grammemes+ for the provided +forms+ of a word:
  # a form is kept only when every given grammeme equals the corresponding
  # grammeme of the form's descriptor. It is necessary to be careful
  # because computational linguistics is a hard field.
  #
  # :call-seq:
  #   inflect([Form], Hash)
  def inflect(forms, grammemes)
    forms.select do |form|
      grammemes.all? { |name, value| form.msd.grammemes[name] == value }
    end
  end

  protected

  # Runs the native analysis and passes each resulting lemma handle to the
  # block. The native result is released afterwards, even on errors.
  def invoke_analyze(symbols, length, &block) #:nodoc:
    analyses = MystemAnalyze(symbols, length)
    MystemAnalysesCount(analyses).times do |i|
      block.call(MystemLemma(analyses, i))
    end
  ensure
    # +analyses+ is still nil when MystemAnalyze itself has raised; nil
    # must not be handed to the native destructor in that case.
    MystemDeleteAnalyses(analyses) if analyses
  end

  # Runs the native form generation for a lemma handle and passes each
  # resulting form handle to the block. The native result is released
  # afterwards, even on errors.
  def invoke_generate(lemma, &block) #:nodoc:
    forms = MystemGenerate(lemma)
    MystemFormsCount(forms).times do |i|
      block.call(MystemForm(forms, i))
    end
  ensure
    # Same reasoning as in invoke_analyze: skip the destructor when
    # nothing has been allocated.
    MystemDeleteForms(forms) if forms
  end

  # Copies the code points of +string+ into a freshly allocated buffer of
  # unsigned shorts and returns the pointer to it.
  def as_symbols(string) #:nodoc:
    # The buffer is declared as :ushort and is read back with
    # read_array_of_ushort, so it is written with the unsigned writer too.
    FFI::MemoryPointer.
      new(:ushort, string.length).
      write_array_of_ushort(string.chars.map(&:ord))
  end

  # Reads +length+ unsigned shorts from +symbols+ and joins them into a
  # UTF-8 string.
  def as_string(symbols, length) #:nodoc:
    symbols.read_array_of_ushort(length).
      map { |code| code.chr(Encoding::UTF_8) }.
      join
  end

  # Reads +grammemes_length+ strings from +grammemes+ and returns the
  # unique bytes they consist of.
  def as_strings(grammemes, grammemes_length) #:nodoc:
    grammemes.get_array_of_string(0, grammemes_length).
      flat_map(&:bytes).
      uniq
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# Myaso uses the foreign function interface to interact with the mystem
# shared library.
#
# The library path is taken from the MYSTEM_LIBRARY environment variable.
# When the variable is not set, a few conventional directories and the
# current one are searched for +libmystem_c_binding+.
module Myaso::Mystem::Library
  extend FFI::Library

  begin
    # The block form of ENV.fetch keeps the file system from being globbed
    # when the path is provided explicitly.
    ffi_lib(ENV.fetch('MYSTEM_LIBRARY') {
      Dir["{/{opt,usr}/{,local/}lib{,64},.}/libmystem_c_binding.{dylib,so}"]
    })
  rescue LoadError
    fail 'The mystem library could not be loaded. ' \
         'Please install it and set the MYSTEM_LIBRARY ' \
         'environment variable to its path.'
  end

  # Analyses of a word.
  attach_function :MystemAnalyze, [:pointer, :int], :pointer
  attach_function :MystemAnalysesCount, [:pointer], :int
  attach_function :MystemDeleteAnalyses, [:pointer], :void

  # Accessors of a single lemma in the analyses.
  attach_function :MystemLemma, [:pointer, :int], :pointer
  attach_function :MystemLemmaText, [:pointer], :pointer
  attach_function :MystemLemmaTextLen, [:pointer], :int
  attach_function :MystemLemmaForm, [:pointer], :pointer
  attach_function :MystemLemmaFormLen, [:pointer], :int
  attach_function :MystemLemmaQuality, [:pointer], :int
  attach_function :MystemLemmaStemGram, [:pointer], :string
  attach_function :MystemLemmaFlexGram, [:pointer], :pointer
  attach_function :MystemLemmaFlexGramNum, [:pointer], :int
  attach_function :MystemLemmaFlexLen, [:pointer], :int
  attach_function :MystemLemmaRuleId, [:pointer], :int

  # Forms generated for a lemma.
  attach_function :MystemGenerate, [:pointer], :pointer
  attach_function :MystemDeleteForms, [:pointer], :void
  attach_function :MystemFormsCount, [:pointer], :int

  # Accessors of a single generated form.
  attach_function :MystemForm, [:pointer, :int], :pointer
  attach_function :MystemFormText, [:pointer], :pointer
  attach_function :MystemFormTextLen, [:pointer], :int
  attach_function :MystemFormStemGram, [:pointer], :string
  attach_function :MystemFormFlexGram, [:pointer], :pointer
  attach_function :MystemFormFlexGramNum, [:pointer], :int

  # A meaningful mapping between mystem's internal word quality
  # descriptors and the Ruby symbols.
  QUALITY = {
    0x00000000 => :dictionary,
    0x00000001 => :bastard,
    0x00000002 => :sob,
    0x00000004 => :prefixoid,
    0x00000008 => :foundling,
    0x00000010 => :bad_request,
    0x00010000 => :from_english,
    0x00020000 => :to_english,
    0x00040000 => :untranslit,
    0x00100000 => :overrode,
    0x01000000 => :fix
  }.freeze
end

# Make the attached functions callable both on the Myaso::Mystem module
# itself and from within its instance-level methods.
Myaso::Mystem.send(:extend, Myaso::Mystem::Library)
Myaso::Mystem.send(:include, Myaso::Mystem::Library)
|
data/lib/myaso/ngrams.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# A simple yet handy implementation of an n-gram storage.
#
# Counts are kept in a three-level hash: unigram -> bigram -> trigram.
# A missing trailing element is represented by the +nil+ key, so the count
# of a unigram +a+ lives under <tt>[a][nil][nil]</tt>.
#
class Myaso::Ngrams
  extend Forwardable
  include Enumerable

  attr_reader :table
  protected :table

  def_delegator :@table, :each, :each

  # An instance of an n-gram storage is initialized by zero counts.
  #
  def initialize
    @unigrams_count = nil
    @table = Hash.new do |h, k|
      h[k] = Hash.new { |h_local, k_local| h_local[k_local] = Hash.new(0) }
    end
  end

  # Obtain the count of the specified unigram, bigram, or trigram.
  # Unknown n-grams yield zero. The lookup never creates entries in
  # the underlying table.
  #
  def [](unigram, bigram = nil, trigram = nil)
    return 0 unless table.include?(unigram)
    return 0 unless table[unigram].include?(bigram)
    table[unigram][bigram][trigram]
  end

  # Assign the count to the specified unigram, bigram, or trigram.
  # The memoized unigrams count is dropped on every assignment.
  #
  def []=(unigram, bigram = nil, trigram = nil, count)
    @unigrams_count = nil
    table[unigram][bigram][trigram] = count
  end

  # Two storages are equal iff their tables are equal. Anything that is
  # not an n-gram storage is never equal to one.
  #
  def ==(other)
    other.is_a?(Myaso::Ngrams) && table == other.table
  end

  # Trigrams enumerator. Every complete trigram is yielded as
  # <tt>[[unigram, bigram, trigram], count]</tt>; entries stored under
  # the +nil+ key are skipped. The block form yields directly because
  # going through an Enumerator is too slow; an Enumerator is returned
  # only when no block is given.
  #
  def each_trigram
    return enum_for(:each_trigram) unless block_given?

    table.each do |unigram, bigrams|
      bigrams.each do |bigram, trigrams|
        next unless bigram

        trigrams.each do |trigram, count|
          next unless trigram

          yield [[unigram, bigram, trigram], count]
        end
      end
    end
  end

  # Unigrams count, i.e. the sum of the counts of all the stored unigrams.
  # The value is memoized until the next assignment.
  #
  def unigrams_count
    @unigrams_count ||= table.keys.inject(0) do |count, unigram|
      # Go through #[] instead of the raw table: indexing the table
      # directly would create empty nil-keyed entries as a side effect.
      count + self[unigram]
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# A simple implementation of a dynamic programming table in the following
# form: $\pi(i, u, v)$, where $i$ is an index and $u, v$ are elements of
# a finite set of tags.
#
class Myaso::PiTable
  extend Forwardable
  include Enumerable

  attr_reader :default, :table
  def_delegator :@table, :each, :each

  # An instance of a dynamic programming table can consider the specified
  # default value, which is returned for the cells that have not been set.
  #
  def initialize(default = nil)
    @default = default
    @table = Hash.new do |h, k|
      h[k] = Hash.new { |h_local, k_local| h_local[k_local] = Hash.new(default) }
    end
  end

  # Obtain the value of $\pi(i, u, v)$ or return the default value if it
  # has not been set. Reading a cell does not create any entries in the
  # underlying table.
  #
  def [](i, u, v)
    # Indexing the nested hashes directly would trigger their default
    # blocks and leave empty rows behind, so check the keys first.
    return default unless table.key?(i) && table[i].key?(u)
    table[i][u][v]
  end

  # Set a value of $\pi(i, u, v)$.
  #
  def []=(i, u, v, value)
    table[i][u][v] = value
  end
end
|
data/lib/myaso/tagger.rb
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# An implementation of the Viterbi algorithm over tag trigrams.
#
class Myaso::Tagger
  attr_reader :model

  # A tagger is built around a trained model. The model is asked for
  # +classify+, +start_symbol+, +stop_symbol+, +lexicon+, +q+ and +e+.
  #
  def initialize(model)
    @model = model
  end

  # Run the Viterbi algorithm and return the tags the given sentence
  # should be annotated with. An empty sentence yields an empty array.
  #
  def annotate(sentence)
    return [] if sentence.size.zero?

    # Classify every word and pad the sentence with two start symbols.
    padded = sentence.map { |word| model.classify(word) }
    padded.unshift(model.start_symbol, model.start_symbol)

    pi, bp = forward(padded)
    backward(padded, pi, bp)
  end

  protected

  # Forward pass: fill the table of the best log-scores and the table of
  # the back pointers, then return them as a pair.
  #
  def forward(sentence)
    pi = Myaso::PiTable.new
    bp = Myaso::PiTable.new
    pi[1, model.start_symbol, model.start_symbol] = 0.0

    indexed = sentence.each_with_index
    indexed.each_cons(3) do |(first, first_at), (second, second_at), (word, k)|
      # The first two positions hold the start symbols, not real words.
      w_tags = first_at < 2 ? [model.start_symbol] : model.lexicon.tags(first)
      u_tags = second_at < 2 ? [model.start_symbol] : model.lexicon.tags(second)
      v_tags = model.lexicon.tags(word)

      u_tags.product(v_tags).each do |(u, v)|
        pi[k, u, v], bp[k, u, v] = forward_iteration(pi, k, u, v, w_tags, word)
      end
    end

    [pi, bp]
  end

  # A single step of the forward pass. Among the given +tags+ that have
  # a finite score at the position +k - 1+, pick the one that maximizes
  # the score of (u, v) at the position +k+. Returns the pair of the score
  # and the tag, or nil when there is no suitable tag.
  #
  def forward_iteration(pi, k, u, v, tags, word)
    reachable = tags.select do |w|
      previous = pi[k - 1, w, u]
      previous && previous.finite?
    end

    scored = reachable.map do |w|
      [pi[k - 1, w, u] + probability(w, u, v, word), w]
    end

    scored.max_by(&:first)
  end

  # Backward pass: follow the back pointers to recover the tags computed
  # by the forward pass.
  #
  def backward(sentence, pi, bp)
    last = sentence.size - 1

    # A single word preceded by two start symbols has no back pointers
    # to follow, so its best tag is chosen directly.
    if last == 2
      singletons = model.lexicon.tags(sentence[-1]).map { |tag| [tag] }
      return singletons.max_by { |(tag)|
        pi[last, model.start_symbol, tag] +
          probability(model.start_symbol, tag, model.stop_symbol)
      }
    end

    tags = prepare_backward(sentence, pi)

    last.downto(4) do |k|
      tags[k - 2] = bp[k, tags[k - 1], tags[k]]
    end

    # Drop the two leading start positions and return the rest.
    tags.slice!(2..-1)
  end

  # Choose the tags of the two last words: the pair with a finite score
  # that is the best one when followed by the stop symbol. The other
  # positions of the returned array are left empty.
  #
  def prepare_backward(sentence, pi)
    last = sentence.size - 1
    tags = Array.new(sentence.size)

    u_tags = model.lexicon.tags(sentence[-2])
    v_tags = model.lexicon.tags(sentence[-1])

    reachable = u_tags.product(v_tags).select do |u, v|
      score = pi[last, u, v]
      score && score.finite?
    end

    best = reachable.max_by do |u, v|
      pi[last, u, v] + probability(u, v, model.stop_symbol)
    end

    tags[last - 1], tags[last] = best
    tags
  end

  # Compute the binary logarithm of q(v|w, u) * e(word|v). The emission
  # part is left out when no word is given.
  #
  def probability(w, u, v, word = nil)
    likelihood = model.q(w, u, v)
    likelihood *= model.e(word, v) if word
    Math.log2(likelihood)
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# Any HMM tagger requires a trained model that can perform such tasks as
# producing smoothed q() and e() values and replacing unknown words with
# special symbols.
#
# NOTE(review): +learn!+ is invoked by the constructor but is not defined
# in this class; presumably it is supplied by the subclasses -- confirm.
#
class Myaso::Tagger::Model
  attr_reader :ngrams, :lexicon, :interpolations

  # A tagging model requires n-grams and a lexicon; both are created empty.
  #
  # It is possible to set the interpolations vector when its values are
  # known. If it is necessary to recompute the interpolations then nil
  # shall be given (default behavior). If there should be no
  # interpolations then false shall be given, which results in equal
  # weights. In other cases the given value is used as is.
  #
  def initialize(interpolations = nil)
    @ngrams = Myaso::Ngrams.new
    @lexicon = Myaso::Lexicon.new
    @interpolations = interpolations == false ? [0.33, 0.33, 0.33] : interpolations
    learn!
  end

  # Linear interpolation model of the probability of occurrence of
  # the trigram (first, second, third). It considers three summands:
  # the independent probability that the current tag is (third),
  # the probability that the current tag is (third) if the last one is
  # (second), and the probability that the current tag is (third) if
  # the last two are (first, second). Each summand is weighted by
  # the corresponding element of the interpolations vector.
  #
  def q(first, second, third)
    unigram_part = conditional(ngrams[third], ngrams.unigrams_count)
    bigram_part = conditional(ngrams[second, third], ngrams[second])
    trigram_part = conditional(ngrams[first, second, third], ngrams[first, second])

    unigram_part * interpolations[0] +
      bigram_part * interpolations[1] +
      trigram_part * interpolations[2]
  end

  # Function e in the Viterbi algorithm. It computes the probability of
  # generating the word with this tag relative to all the words with
  # this tag.
  #
  def e(word, tag)
    conditional(lexicon[word, tag], ngrams[tag])
  end

  # If a word is rare, then it should be replaced during the preparation
  # of the training set. So, it can't be in the training set. A word is
  # rare when its lexicon count does not exceed one.
  #
  def rare?(word)
    lexicon[word] <= 1
  end

  # Conditional probability p(A|B) = p(A, B) / p(B). Returns zero when
  # the denominator is zero.
  #
  def conditional(ab, b)
    b.zero? ? 0.0 : ab / b.to_f
  end
end
|