myaso 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/.travis.yml +10 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/Rakefile +21 -0
- data/bin/myaso +73 -0
- data/lib/myaso.rb +35 -0
- data/lib/myaso/lexicon.rb +70 -0
- data/lib/myaso/mystem.rb +187 -0
- data/lib/myaso/mystem/library.rb +59 -0
- data/lib/myaso/ngrams.rb +67 -0
- data/lib/myaso/pi_table.rb +36 -0
- data/lib/myaso/tagger.rb +94 -0
- data/lib/myaso/tagger/model.rb +68 -0
- data/lib/myaso/tagger/tnt.rb +183 -0
- data/lib/myaso/version.rb +9 -0
- data/myaso.gemspec +26 -0
- data/myaso.jpg +0 -0
- data/spec/bin_spec.rb +48 -0
- data/spec/data/test.123 +77 -0
- data/spec/data/test.lex +10 -0
- data/spec/fixtures/interpolations.yml +4 -0
- data/spec/fixtures/lexicon.yml +32 -0
- data/spec/fixtures/ngrams.yml +106 -0
- data/spec/lexicon_spec.rb +84 -0
- data/spec/mystem_spec.rb +81 -0
- data/spec/ngrams_spec.rb +97 -0
- data/spec/pi_table_spec.rb +53 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/support/fixtures.rb +34 -0
- data/spec/support/invoker.rb +29 -0
- data/spec/tagger_spec.rb +27 -0
- data/spec/tagger_tnt_spec.rb +73 -0
- metadata +137 -0
@@ -0,0 +1,183 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# A Tagger model that can work with TnT data files.
|
4
|
+
#
|
5
|
+
class Myaso::Tagger::TnT < Myaso::Tagger::Model
|
6
|
+
# A start tag for a sentence.
|
7
|
+
#
|
8
|
+
START = 'SENT'
|
9
|
+
|
10
|
+
# A stop tag for a sentence.
|
11
|
+
#
|
12
|
+
STOP = 'SENT'
|
13
|
+
|
14
|
+
# Unknown tag for token.
|
15
|
+
#
|
16
|
+
MISSING = '-'
|
17
|
+
|
18
|
+
# Tokens consisting of a sequence of decimal digits.
|
19
|
+
#
|
20
|
+
CARD = '@CARD'
|
21
|
+
|
22
|
+
# Decimal digits followed by punctuation.
|
23
|
+
#
|
24
|
+
CARDPUNCT = '@CARDPUNCT'
|
25
|
+
|
26
|
+
# Decimal digits followed by any suffix.
|
27
|
+
#
|
28
|
+
CARDSUFFIX = '@CARDSUFFIX'
|
29
|
+
|
30
|
+
# Decimal digits separated by dots, dashes, etc.
|
31
|
+
#
|
32
|
+
CARDSEPS = '@CARDSEPS'
|
33
|
+
|
34
|
+
# Tag frequencies to handle unknown words.
|
35
|
+
#
|
36
|
+
UNKNOWN = '@UNKNOWN'
|
37
|
+
|
38
|
+
attr_reader :ngrams_path, :lexicon_path
|
39
|
+
|
40
|
+
# The tagging model is initialized by two data files. The first one is an
|
41
|
+
# n-grams file that stores statistics for unigrams, bigrams, trigrams.
|
42
|
+
# The second one is a lexicon file that stores words and their
|
43
|
+
# frequencies in the source corpus.
|
44
|
+
#
|
45
|
+
# Please note that the learning stage is not so optimized, so the
|
46
|
+
# initialization procedure may take about 120 seconds.
|
47
|
+
#
|
48
|
+
# Build a model backed by two TnT data files.
#
# Both paths are expanded to absolute form and kept for the later
# parsing steps; +interpolations+ is passed to the superclass untouched.
def initialize(ngrams_path, lexicon_path, interpolations = nil)
  @ngrams_path, @lexicon_path =
    File.expand_path(ngrams_path), File.expand_path(lexicon_path)
  super(interpolations)
end
|
53
|
+
|
54
|
+
# If word is rare, it can be one of the following categories:
|
55
|
+
# includes numbers, numbers and punctuation symbols, non-numbers
|
56
|
+
# following numbers and unknown. Otherwise, word has its own category.
|
57
|
+
#
|
58
|
+
# Map a word to its lexicon category.
#
# Words that are not rare are returned unchanged. A rare word is checked
# against the numeric patterns below in order and the first match wins;
# a rare word that matches none of them becomes UNKNOWN.
def classify(word)
  return word unless rare?(word)

  # Order matters: earlier rules take precedence over later ones.
  rules = [
    [/^\d+$/, CARD],
    [/^\d+[.,;:]+$/, CARDPUNCT],
    [/^\d+\D+$/, CARDSUFFIX],
    [/^\d+[.,;:\-]+(\d+[.,;:\-]+)*\d+$/, CARDSEPS]
  ]

  # `===` keeps the exact matching semantics of a `case`/`when` on a Regexp.
  _, category = rules.find { |pattern, _| pattern === word }
  category || UNKNOWN
end
|
68
|
+
|
69
|
+
# Tagger requires the sentence start symbol to be defined.
|
70
|
+
#
|
71
|
+
# Tag that marks the beginning of a sentence (the START constant).
def start_symbol
  START
end
|
74
|
+
|
75
|
+
# Tagger requires the sentence stop symbol to be defined.
|
76
|
+
#
|
77
|
+
# Tag that marks the end of a sentence (the STOP constant).
def stop_symbol
  STOP
end
|
80
|
+
|
81
|
+
# Parse n-grams and lexicon files, and compute statistics over them.
|
82
|
+
#
|
83
|
+
# Load both data files, then derive the interpolation coefficients
# unless they were already supplied (i.e. +interpolations+ is not nil).
def learn!
  parse_ngrams!
  parse_lexicon!
  return unless interpolations.nil?

  compute_interpolations!
end
|
88
|
+
|
89
|
+
# Parse the n-grams file.
|
90
|
+
#
|
91
|
+
# Load the n-grams file into +ngrams+.
#
# Rows leave leading columns empty (nil) when they repeat the row above,
# so the first two columns of every row are remembered and used to fill
# the gaps of the next one. After filling, a row is one of:
#   first, count                 -> unigram
#   first, second, count         -> bigram
#   first, second, third, count  -> trigram
# Anything else raises a RuntimeError.
def parse_ngrams!
  last_first = last_second = nil

  read(ngrams_path) do |values|
    values[0] ||= last_first
    values[1] ||= last_second

    first, second, third, fourth = values

    # The first two columns are mandatory, and a fourth column only makes
    # sense when the third one is present as well.
    well_formed = first && second && !(fourth && !third)
    raise 'dafuq i just read: %s' % values.inspect unless well_formed

    if third && fourth
      ngrams[first, second, third] = fourth.to_i
    elsif third
      ngrams[first, second] = third.to_i
    else
      ngrams[first] = second.to_i
    end

    last_first, last_second = first, second
  end
end
|
111
|
+
|
112
|
+
# Parse the lexicon file.
|
113
|
+
#
|
114
|
+
# Load the lexicon file into +lexicon+.
#
# Each row is: word, total count, then alternating tag / count pairs
# (empty columns are dropped first). A word whose total count is exactly 1
# is replaced by its category from +classify+ before being stored, so its
# counts accumulate under that category.
def parse_lexicon!
  read(lexicon_path) do |values|
    word, word_count, *tag_counts = values.compact
    word_count = word_count.to_i # a missing count (nil) becomes 0

    word = classify(word) if word_count == 1

    lexicon[word] += word_count

    tag_counts.each_slice(2) do |tag, count|
      lexicon[word, tag] += count.to_i
    end
  end
end
|
128
|
+
|
129
|
+
# Count coefficients for linear interpolation for evaluating
|
130
|
+
# q(first, second, third).
|
131
|
+
#
|
132
|
+
# Derive the three interpolation weights from the n-grams file.
#
# Only trigram rows (those with a third column and a count) contribute.
# For each one, three estimates are obtained from +conditional+ (unigram,
# bigram and trigram based, in that order) and the trigram's count is
# added to the weight of the largest estimate; on a tie the earliest
# position wins. The weights are then normalized by their sum and stored
# in @interpolations.
#
# NOTE(review): if no trigram row is read the sum is 0.0 and the division
# produces NaN weights -- confirm the data files always contain trigrams.
def compute_interpolations!
  weights = [0.0, 0.0, 0.0]
  prev_first = prev_second = nil

  read(ngrams_path) do |first, second, third, count|
    # Empty leading columns repeat the values of the previous row.
    first ||= prev_first
    second ||= prev_second

    if third && count
      occurrences = count.to_i

      estimates = [
        conditional(ngrams[third] - 1, ngrams.unigrams_count - 1),
        conditional(ngrams[second, third] - 1, ngrams[second] - 1),
        conditional(occurrences - 1, ngrams[first, second] - 1)
      ]

      best = estimates.index(estimates.max)
      weights[best] += occurrences if best
    else
      # Unigram and bigram rows only establish context for later rows.
      prev_first, prev_second = first, second
    end
  end

  sum = weights.inject(&:+)
  @interpolations = weights.map! { |weight| weight / sum }
end
|
162
|
+
|
163
|
+
# @private
|
164
|
+
#
|
165
|
+
def inspect
|
166
|
+
sprintf('#<%s @ngrams_path=%s @lexicon_path=%s @interpolations=%s>',
|
167
|
+
self.class.name, ngrams_path.inspect, lexicon_path.inspect,
|
168
|
+
interpolations.inspect)
|
169
|
+
end
|
170
|
+
|
171
|
+
private
|
172
|
+
# Read the TnT data file.
|
173
|
+
#
|
174
|
+
# Walk a TnT data file row by row.
#
# Blank lines and lines starting with "%%" are skipped. Every other line
# is split on tabs and yielded as an array in which empty columns are
# replaced by nil.
def read(path)
  File.foreach(path) do |raw|
    row = raw.chomp
    next if row.empty? || row =~ /^%%/

    columns = row.split(/\t/).map { |column| column unless column.empty? }
    yield columns
  end
end
|
183
|
+
end
|
data/myaso.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'myaso/version'
|
6
|
+
|
7
|
+
Gem::Specification.new do |spec|
|
8
|
+
spec.name = 'myaso'
|
9
|
+
spec.version = Myaso::VERSION
|
10
|
+
spec.authors = ['Dmitry Ustalov', 'Sergey Smagin']
|
11
|
+
spec.email = ['dmitry.ustalov@gmail.com']
|
12
|
+
spec.description = 'Myaso is a morphological analysis library in Ruby.'
|
13
|
+
spec.summary = 'Myaso is a morphological analysis and synthesis ' \
|
14
|
+
'library in Ruby.'
|
15
|
+
spec.homepage = 'https://github.com/dustalov/myaso'
|
16
|
+
spec.license = 'MIT'
|
17
|
+
|
18
|
+
spec.files = `git ls-files`.split($/)
|
19
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
20
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
21
|
+
spec.require_paths = ['lib']
|
22
|
+
|
23
|
+
spec.add_dependency 'ffi', '~> 1.9.0'
|
24
|
+
spec.add_dependency 'myasorubka', '~> 0.2.0'
|
25
|
+
spec.add_development_dependency 'minitest', '~> 5.0'
|
26
|
+
end
|
data/myaso.jpg
ADDED
Binary file
|
data/spec/bin_spec.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
describe 'CLI' do
|
6
|
+
it 'should print help when ran without arguments' do
|
7
|
+
invoke('').must_include 'Commands:'
|
8
|
+
invoke('').must_include 'Options:'
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should print version information' do
|
12
|
+
invoke('-v').must_include 'Myaso v%s' % Myaso::VERSION
|
13
|
+
invoke('--version').must_equal invoke('-v')
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'should print help' do
|
17
|
+
invoke('-h').must_include 'Commands:'
|
18
|
+
invoke('-h').must_include 'Options:'
|
19
|
+
invoke('--help').must_equal invoke('-h')
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'should evaluate parameters' do
|
23
|
+
invoke('-e', 'puts :hi; exit').must_include 'hi'
|
24
|
+
invoke('--eval', 'puts :hi; exit').must_equal(
|
25
|
+
invoke('-e', 'puts :hi; exit'))
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should consider the ngrams parameter' do
|
29
|
+
invoke('-n', 'test', '-e', 'puts options.ngrams; exit').
|
30
|
+
must_include 'test'
|
31
|
+
invoke('--ngrams', 'test', '-e', 'puts options.ngrams; exit').
|
32
|
+
must_equal(invoke('-n', 'test', '-e', 'puts options.ngrams; exit'))
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should consider the lexicon parameter' do
|
36
|
+
invoke('-l', 'test', '-e', 'puts options.lexicon; exit').
|
37
|
+
must_include 'test'
|
38
|
+
invoke('--lexicon', 'test', '-e', 'puts options.lexicon; exit').
|
39
|
+
must_equal(invoke('-l', 'test', '-e', 'puts options.lexicon; exit'))
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should annotate sentences' do
|
43
|
+
stdout = invoke('-n', 'spec/data/test.123', '-l', 'spec/data/test.lex',
|
44
|
+
'tagger', stdin: "братишка\nя\nтебе\nпокушать\nпринес")
|
45
|
+
stdout.must_equal ["братишка\ta", "я\tb", "тебе\tb", "покушать\td",
|
46
|
+
"принес\te"]
|
47
|
+
end
|
48
|
+
end
|
data/spec/data/test.123
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
%% Test set of ngrams.
|
2
|
+
%%
|
3
|
+
%% Possible tags are: a, b, c, d, e.
|
4
|
+
a 26
|
5
|
+
a 6
|
6
|
+
a 1
|
7
|
+
b 2
|
8
|
+
c 1
|
9
|
+
SENT 1
|
10
|
+
e 1
|
11
|
+
b 8
|
12
|
+
c 3
|
13
|
+
d 3
|
14
|
+
e 2
|
15
|
+
c 4
|
16
|
+
a 2
|
17
|
+
e 1
|
18
|
+
SENT 1
|
19
|
+
d 4
|
20
|
+
a 3
|
21
|
+
e 1
|
22
|
+
e 3
|
23
|
+
b 1
|
24
|
+
d 1
|
25
|
+
SENT 1
|
26
|
+
SENT 3
|
27
|
+
a 2
|
28
|
+
e 1
|
29
|
+
b 12
|
30
|
+
a 6
|
31
|
+
a 1
|
32
|
+
b 2
|
33
|
+
c 1
|
34
|
+
SENT 1
|
35
|
+
e 1
|
36
|
+
c 4
|
37
|
+
a 2
|
38
|
+
e 2
|
39
|
+
d 2
|
40
|
+
a 1
|
41
|
+
e 1
|
42
|
+
c 10
|
43
|
+
a 8
|
44
|
+
a 1
|
45
|
+
b 3
|
46
|
+
c 2
|
47
|
+
d 1
|
48
|
+
e 1
|
49
|
+
e 2
|
50
|
+
b 1
|
51
|
+
d 1
|
52
|
+
d 16
|
53
|
+
a 12
|
54
|
+
a 1
|
55
|
+
b 4
|
56
|
+
c 3
|
57
|
+
d 2
|
58
|
+
e 2
|
59
|
+
e 4
|
60
|
+
SENT 2
|
61
|
+
d 2
|
62
|
+
e 6
|
63
|
+
d 2
|
64
|
+
a 1
|
65
|
+
e 1
|
66
|
+
SENT 4
|
67
|
+
a 1
|
68
|
+
c 2
|
69
|
+
d 1
|
70
|
+
SENT 10
|
71
|
+
SENT 2
|
72
|
+
a 2
|
73
|
+
a 4
|
74
|
+
b 3
|
75
|
+
d 1
|
76
|
+
e 1
|
77
|
+
SENT 1
|
data/spec/data/test.lex
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
--- !ruby/object:Myaso::Lexicon
|
2
|
+
table:
|
3
|
+
бра:
|
4
|
+
братишка:
|
5
|
+
! '': 2
|
6
|
+
a: 1
|
7
|
+
e: 1
|
8
|
+
я:
|
9
|
+
я:
|
10
|
+
! '': 6
|
11
|
+
b: 4
|
12
|
+
c: 2
|
13
|
+
теб:
|
14
|
+
тебе:
|
15
|
+
! '': 11
|
16
|
+
b: 8
|
17
|
+
c: 2
|
18
|
+
a: 1
|
19
|
+
пок:
|
20
|
+
покушать:
|
21
|
+
! '': 5
|
22
|
+
d: 5
|
23
|
+
при:
|
24
|
+
принес:
|
25
|
+
! '': 7
|
26
|
+
d: 5
|
27
|
+
e: 2
|
28
|
+
'@UN':
|
29
|
+
'@UNKNOWN':
|
30
|
+
! '': 1
|
31
|
+
d: 1
|
32
|
+
tags:
|
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Myaso::Ngrams
|
2
|
+
table:
|
3
|
+
a:
|
4
|
+
! '':
|
5
|
+
! '': 26
|
6
|
+
a:
|
7
|
+
! '': 6
|
8
|
+
a: 1
|
9
|
+
b: 2
|
10
|
+
c: 1
|
11
|
+
SENT: 1
|
12
|
+
e: 1
|
13
|
+
b:
|
14
|
+
! '': 8
|
15
|
+
c: 3
|
16
|
+
d: 3
|
17
|
+
e: 2
|
18
|
+
c:
|
19
|
+
! '': 4
|
20
|
+
a: 2
|
21
|
+
e: 1
|
22
|
+
SENT: 1
|
23
|
+
d:
|
24
|
+
! '': 4
|
25
|
+
a: 3
|
26
|
+
e: 1
|
27
|
+
e:
|
28
|
+
! '': 3
|
29
|
+
b: 1
|
30
|
+
d: 1
|
31
|
+
SENT: 1
|
32
|
+
SENT:
|
33
|
+
! '': 3
|
34
|
+
a: 2
|
35
|
+
e: 1
|
36
|
+
b:
|
37
|
+
! '':
|
38
|
+
! '': 12
|
39
|
+
a:
|
40
|
+
! '': 6
|
41
|
+
a: 1
|
42
|
+
b: 2
|
43
|
+
c: 1
|
44
|
+
SENT: 1
|
45
|
+
e: 1
|
46
|
+
c:
|
47
|
+
! '': 4
|
48
|
+
a: 2
|
49
|
+
e: 2
|
50
|
+
d:
|
51
|
+
! '': 2
|
52
|
+
a: 1
|
53
|
+
e: 1
|
54
|
+
c:
|
55
|
+
! '':
|
56
|
+
! '': 10
|
57
|
+
a:
|
58
|
+
! '': 8
|
59
|
+
a: 1
|
60
|
+
b: 3
|
61
|
+
c: 2
|
62
|
+
d: 1
|
63
|
+
e: 1
|
64
|
+
e:
|
65
|
+
! '': 2
|
66
|
+
b: 1
|
67
|
+
d: 1
|
68
|
+
d:
|
69
|
+
! '':
|
70
|
+
! '': 16
|
71
|
+
a:
|
72
|
+
! '': 12
|
73
|
+
a: 1
|
74
|
+
b: 4
|
75
|
+
c: 3
|
76
|
+
d: 2
|
77
|
+
e: 2
|
78
|
+
e:
|
79
|
+
! '': 4
|
80
|
+
SENT: 2
|
81
|
+
d: 2
|
82
|
+
e:
|
83
|
+
! '':
|
84
|
+
! '': 6
|
85
|
+
d:
|
86
|
+
! '': 2
|
87
|
+
a: 1
|
88
|
+
e: 1
|
89
|
+
SENT:
|
90
|
+
! '': 4
|
91
|
+
a: 1
|
92
|
+
c: 2
|
93
|
+
d: 1
|
94
|
+
SENT:
|
95
|
+
! '':
|
96
|
+
! '': 10
|
97
|
+
SENT:
|
98
|
+
! '': 2
|
99
|
+
a: 2
|
100
|
+
a:
|
101
|
+
! '': 4
|
102
|
+
b: 3
|
103
|
+
d: 1
|
104
|
+
e:
|
105
|
+
! '': 1
|
106
|
+
SENT: 1
|