myaso 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/.travis.yml +10 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/Rakefile +21 -0
- data/bin/myaso +73 -0
- data/lib/myaso.rb +35 -0
- data/lib/myaso/lexicon.rb +70 -0
- data/lib/myaso/mystem.rb +187 -0
- data/lib/myaso/mystem/library.rb +59 -0
- data/lib/myaso/ngrams.rb +67 -0
- data/lib/myaso/pi_table.rb +36 -0
- data/lib/myaso/tagger.rb +94 -0
- data/lib/myaso/tagger/model.rb +68 -0
- data/lib/myaso/tagger/tnt.rb +183 -0
- data/lib/myaso/version.rb +9 -0
- data/myaso.gemspec +26 -0
- data/myaso.jpg +0 -0
- data/spec/bin_spec.rb +48 -0
- data/spec/data/test.123 +77 -0
- data/spec/data/test.lex +10 -0
- data/spec/fixtures/interpolations.yml +4 -0
- data/spec/fixtures/lexicon.yml +32 -0
- data/spec/fixtures/ngrams.yml +106 -0
- data/spec/lexicon_spec.rb +84 -0
- data/spec/mystem_spec.rb +81 -0
- data/spec/ngrams_spec.rb +97 -0
- data/spec/pi_table_spec.rb +53 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/support/fixtures.rb +34 -0
- data/spec/support/invoker.rb +29 -0
- data/spec/tagger_spec.rb +27 -0
- data/spec/tagger_tnt_spec.rb +73 -0
- metadata +137 -0
@@ -0,0 +1,183 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# A Tagger model that can work with TnT data files.
|
4
|
+
#
|
5
|
+
class Myaso::Tagger::TnT < Myaso::Tagger::Model
|
6
|
+
# A start tag for a sentence.
|
7
|
+
#
|
8
|
+
START = 'SENT'
|
9
|
+
|
10
|
+
# A stop tag for a sentence.
|
11
|
+
#
|
12
|
+
STOP = 'SENT'
|
13
|
+
|
14
|
+
# Unknown tag for token.
|
15
|
+
#
|
16
|
+
MISSING = '-'
|
17
|
+
|
18
|
+
# Tokens consisting of a sequence of decimal digits.
|
19
|
+
#
|
20
|
+
CARD = '@CARD'
|
21
|
+
|
22
|
+
# Decimal digits followed by punctuation.
|
23
|
+
#
|
24
|
+
CARDPUNCT = '@CARDPUNCT'
|
25
|
+
|
26
|
+
# Decimal digits followed by any suffix.
|
27
|
+
#
|
28
|
+
CARDSUFFIX = '@CARDSUFFIX'
|
29
|
+
|
30
|
+
# Decimal digits separated by dots, dashes, etc.
|
31
|
+
#
|
32
|
+
CARDSEPS = '@CARDSEPS'
|
33
|
+
|
34
|
+
# Tag frequencies to handle unknown words.
|
35
|
+
#
|
36
|
+
UNKNOWN = '@UNKNOWN'
|
37
|
+
|
38
|
+
attr_reader :ngrams_path, :lexicon_path
|
39
|
+
|
40
|
+
# Build a tagging model from two TnT data files. The first is an n-grams
# file with statistics for unigrams, bigrams and trigrams; the second is
# a lexicon file with words and their frequencies in the source corpus.
# Both paths are stored in their expanded (absolute) form.
#
# Please note that the learning stage is not so optimized, so the
# initialization procedure may take about 120 seconds.
#
def initialize(ngrams_path, lexicon_path, interpolations = nil)
  @ngrams_path, @lexicon_path =
    [ngrams_path, lexicon_path].map { |path| File.expand_path(path) }

  super(interpolations)
end
|
53
|
+
|
54
|
+
# Map a rare word onto one of the pseudo-word categories: digits only,
# digits followed by punctuation, digits followed by a non-digit suffix,
# digits separated by dots, dashes, etc., or unknown. A word that is not
# rare keeps its own category, i.e. it is returned unchanged.
#
# @param word [String] the token to classify.
# @return [String] the word itself, or one of CARD, CARDPUNCT,
#   CARDSUFFIX, CARDSEPS, UNKNOWN.
#
def classify(word)
  return word unless rare? word

  # \A and \z anchor the whole token. The line anchors ^ and $ would let
  # a multi-line string match on any one of its lines.
  case word
  when /\A\d+\z/ then CARD
  when /\A\d+[.,;:]+\z/ then CARDPUNCT
  when /\A\d+\D+\z/ then CARDSUFFIX
  when /\A\d+[.,;:\-]+(\d+[.,;:\-]+)*\d+\z/ then CARDSEPS
  else UNKNOWN
  end
end
|
68
|
+
|
69
|
+
# The tag that opens a sentence; the tagger needs it to be defined.
#
# @return [String] the START constant.
#
def start_symbol
  START
end
|
74
|
+
|
75
|
+
# The tag that closes a sentence; the tagger needs it to be defined.
#
# @return [String] the STOP constant.
#
def stop_symbol
  STOP
end
|
80
|
+
|
81
|
+
# Load the n-grams file, then the lexicon file. The interpolation
# coefficients are computed afterwards only when none were supplied
# (i.e. interpolations is nil).
#
def learn!
  parse_ngrams!
  parse_lexicon!
  return unless interpolations.nil?

  compute_interpolations!
end
|
88
|
+
|
89
|
+
# Parse the n-grams file and store every count in ngrams.
#
# A record may leave its first and/or second field empty; such a field
# is inherited from the previous record. After that, two filled fields
# plus a count form a trigram, one more field plus a count forms a
# bigram, and a tag followed by a count is a unigram. Any other shape
# raises a RuntimeError.
#
def parse_ngrams!
  last_first, last_second = nil, nil

  read(ngrams_path) do |values|
    values[0] ||= last_first
    values[1] ||= last_second
    first, second, third, fourth = values

    # The two leading fields are mandatory, and a fourth field without a
    # third one does not correspond to any n-gram shape.
    malformed = !(first && second) || (!third && fourth)
    raise 'dafuq i just read: %s' % values.inspect if malformed

    if third && fourth # a trigram
      ngrams[first, second, third] = fourth.to_i
    elsif third # a bigram
      ngrams[first, second] = third.to_i
    else # a unigram
      ngrams[first] = second.to_i
    end

    last_first, last_second = first, second
  end
end
|
111
|
+
|
112
|
+
# Parse the lexicon file.
#
# Every record is read as a word, its total count, and then alternating
# tag/count pairs (empty fields are dropped first). A word whose total
# count is exactly 1 is passed through classify before it is counted.
# Both the per-word counter and the per-word-per-tag counters in lexicon
# are incremented.
#
def parse_lexicon!
  read(lexicon_path) do |values|
    values.compact!

    word = values.shift
    word_count = values.shift.to_i
    word = classify(word) if word_count == 1

    lexicon[word] += word_count

    values.each_slice(2) do |tag, count|
      lexicon[word, tag] += count.to_i
    end
  end
end
|
128
|
+
|
129
|
+
# Estimate the linear interpolation coefficients used for evaluating
# q(first, second, third).
#
# The n-grams file is read once more. Unigram and bigram records only
# update the inherited context. For every trigram record three ratios
# are obtained from conditional (defined outside this block), and the
# trigram count is added to the weight whose ratio is the largest; on
# a tie the earliest position wins. The weights are finally divided by
# their sum and stored in @interpolations.
#
# NOTE(review): when no trigram record contributes, the sum is 0.0 and
# every coefficient becomes NaN — confirm whether callers can hit this.
#
def compute_interpolations!
  weights = [0.0, 0.0, 0.0]

  last_first, last_second = nil, nil

  read(ngrams_path) do |first, second, third, count|
    first ||= last_first
    second ||= last_second

    # Not a trigram record: remember the context and move on.
    if !third || !count
      last_first, last_second = first, second
      next
    end

    count = count.to_i

    estimates = [
      conditional(ngrams[third] - 1, ngrams.unigrams_count - 1),
      conditional(ngrams[second, third] - 1, ngrams[second] - 1),
      conditional(count - 1, ngrams[first, second] - 1)
    ]

    winner = estimates.index(estimates.max)
    weights[winner] += count if winner
  end

  sum = weights.inject(&:+)
  @interpolations = weights.map! { |weight| weight / sum }
end
|
162
|
+
|
163
|
+
# @private
#
# A short representation showing the class name, both data file paths
# and the interpolation coefficients.
#
def inspect
  fields = [self.class.name, ngrams_path.inspect, lexicon_path.inspect,
            interpolations.inspect]

  '#<%s @ngrams_path=%s @lexicon_path=%s @interpolations=%s>' % fields
end
|
170
|
+
|
171
|
+
private
|
172
|
+
# Read the TnT data file.
#
# Empty lines and lines starting with `%%` are skipped. Every other line
# is split on tabs, empty fields are replaced with nil, and the resulting
# array is yielded to the block.
#
# @param path [String] path to the data file.
# @yield [Array<String, nil>] the fields of one record.
# @return [nil, Enumerator] nil after iterating with a block, or an
#   Enumerator over the records when no block is given.
#
def read(path)
  return enum_for(:read, path) unless block_given?

  # The file is opened as UTF-8 explicitly so that parsing does not
  # depend on the locale of the running process.
  File.open(path, 'r:UTF-8') do |file|
    file.each_line do |raw|
      line = raw.chomp
      next if line.empty? || line.start_with?('%%')

      yield line.split("\t").map { |field| field.empty? ? nil : field }
    end
  end

  nil
end
|
183
|
+
end
|
data/myaso.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'myaso/version'
|
6
|
+
|
7
|
+
# Gem specification for myaso: metadata, packaged files (taken from the
# git index) and dependencies.
Gem::Specification.new do |gem|
  gem.name          = 'myaso'
  gem.version       = Myaso::VERSION
  gem.authors       = ['Dmitry Ustalov', 'Sergey Smagin']
  gem.email         = ['dmitry.ustalov@gmail.com']
  gem.description   = 'Myaso is a morphological analysis library in Ruby.'
  gem.summary       =
    'Myaso is a morphological analysis and synthesis library in Ruby.'
  gem.homepage      = 'https://github.com/dustalov/myaso'
  gem.license       = 'MIT'

  # Everything tracked by git is packaged; executables and test files
  # are picked out of that list by their directory prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  gem.add_dependency 'ffi', '~> 1.9.0'
  gem.add_dependency 'myasorubka', '~> 0.2.0'
  gem.add_development_dependency 'minitest', '~> 5.0'
end
|
data/myaso.jpg
ADDED
Binary file
|
data/spec/bin_spec.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
# Specs for the command-line interface; every example runs the
# executable through the invoke helper and inspects what it returns.
describe 'CLI' do
  it 'should print help when ran without arguments' do
    invoke('').must_include('Commands:')
    invoke('').must_include('Options:')
  end

  it 'should print version information' do
    invoke('-v').must_include(format('Myaso v%s', Myaso::VERSION))

    long_form = invoke('--version')
    short_form = invoke('-v')
    long_form.must_equal(short_form)
  end

  it 'should print help' do
    invoke('-h').must_include('Commands:')
    invoke('-h').must_include('Options:')

    long_form = invoke('--help')
    short_form = invoke('-h')
    long_form.must_equal(short_form)
  end

  it 'should evaluate parameters' do
    script = 'puts :hi; exit'

    invoke('-e', script).must_include('hi')

    long_form = invoke('--eval', script)
    short_form = invoke('-e', script)
    long_form.must_equal(short_form)
  end

  it 'should consider the ngrams parameter' do
    script = 'puts options.ngrams; exit'

    invoke('-n', 'test', '-e', script).must_include('test')

    long_form = invoke('--ngrams', 'test', '-e', script)
    short_form = invoke('-n', 'test', '-e', script)
    long_form.must_equal(short_form)
  end

  it 'should consider the lexicon parameter' do
    script = 'puts options.lexicon; exit'

    invoke('-l', 'test', '-e', script).must_include('test')

    long_form = invoke('--lexicon', 'test', '-e', script)
    short_form = invoke('-l', 'test', '-e', script)
    long_form.must_equal(short_form)
  end

  it 'should annotate sentences' do
    sentence = "братишка\nя\nтебе\nпокушать\nпринес"

    stdout = invoke('-n', 'spec/data/test.123', '-l', 'spec/data/test.lex',
                    'tagger', stdin: sentence)

    stdout.must_equal(["братишка\ta", "я\tb", "тебе\tb", "покушать\td",
                       "принес\te"])
  end
end
|
data/spec/data/test.123
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
%% Test set of ngrams.
|
2
|
+
%%
|
3
|
+
%% Possible tags are: a, b, c, d, e.
|
4
|
+
a 26
|
5
|
+
a 6
|
6
|
+
a 1
|
7
|
+
b 2
|
8
|
+
c 1
|
9
|
+
SENT 1
|
10
|
+
e 1
|
11
|
+
b 8
|
12
|
+
c 3
|
13
|
+
d 3
|
14
|
+
e 2
|
15
|
+
c 4
|
16
|
+
a 2
|
17
|
+
e 1
|
18
|
+
SENT 1
|
19
|
+
d 4
|
20
|
+
a 3
|
21
|
+
e 1
|
22
|
+
e 3
|
23
|
+
b 1
|
24
|
+
d 1
|
25
|
+
SENT 1
|
26
|
+
SENT 3
|
27
|
+
a 2
|
28
|
+
e 1
|
29
|
+
b 12
|
30
|
+
a 6
|
31
|
+
a 1
|
32
|
+
b 2
|
33
|
+
c 1
|
34
|
+
SENT 1
|
35
|
+
e 1
|
36
|
+
c 4
|
37
|
+
a 2
|
38
|
+
e 2
|
39
|
+
d 2
|
40
|
+
a 1
|
41
|
+
e 1
|
42
|
+
c 10
|
43
|
+
a 8
|
44
|
+
a 1
|
45
|
+
b 3
|
46
|
+
c 2
|
47
|
+
d 1
|
48
|
+
e 1
|
49
|
+
e 2
|
50
|
+
b 1
|
51
|
+
d 1
|
52
|
+
d 16
|
53
|
+
a 12
|
54
|
+
a 1
|
55
|
+
b 4
|
56
|
+
c 3
|
57
|
+
d 2
|
58
|
+
e 2
|
59
|
+
e 4
|
60
|
+
SENT 2
|
61
|
+
d 2
|
62
|
+
e 6
|
63
|
+
d 2
|
64
|
+
a 1
|
65
|
+
e 1
|
66
|
+
SENT 4
|
67
|
+
a 1
|
68
|
+
c 2
|
69
|
+
d 1
|
70
|
+
SENT 10
|
71
|
+
SENT 2
|
72
|
+
a 2
|
73
|
+
a 4
|
74
|
+
b 3
|
75
|
+
d 1
|
76
|
+
e 1
|
77
|
+
SENT 1
|
data/spec/data/test.lex
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
--- !ruby/object:Myaso::Lexicon
|
2
|
+
table:
|
3
|
+
бра:
|
4
|
+
братишка:
|
5
|
+
! '': 2
|
6
|
+
a: 1
|
7
|
+
e: 1
|
8
|
+
я:
|
9
|
+
я:
|
10
|
+
! '': 6
|
11
|
+
b: 4
|
12
|
+
c: 2
|
13
|
+
теб:
|
14
|
+
тебе:
|
15
|
+
! '': 11
|
16
|
+
b: 8
|
17
|
+
c: 2
|
18
|
+
a: 1
|
19
|
+
пок:
|
20
|
+
покушать:
|
21
|
+
! '': 5
|
22
|
+
d: 5
|
23
|
+
при:
|
24
|
+
принес:
|
25
|
+
! '': 7
|
26
|
+
d: 5
|
27
|
+
e: 2
|
28
|
+
'@UN':
|
29
|
+
'@UNKNOWN':
|
30
|
+
! '': 1
|
31
|
+
d: 1
|
32
|
+
tags:
|
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Myaso::Ngrams
|
2
|
+
table:
|
3
|
+
a:
|
4
|
+
! '':
|
5
|
+
! '': 26
|
6
|
+
a:
|
7
|
+
! '': 6
|
8
|
+
a: 1
|
9
|
+
b: 2
|
10
|
+
c: 1
|
11
|
+
SENT: 1
|
12
|
+
e: 1
|
13
|
+
b:
|
14
|
+
! '': 8
|
15
|
+
c: 3
|
16
|
+
d: 3
|
17
|
+
e: 2
|
18
|
+
c:
|
19
|
+
! '': 4
|
20
|
+
a: 2
|
21
|
+
e: 1
|
22
|
+
SENT: 1
|
23
|
+
d:
|
24
|
+
! '': 4
|
25
|
+
a: 3
|
26
|
+
e: 1
|
27
|
+
e:
|
28
|
+
! '': 3
|
29
|
+
b: 1
|
30
|
+
d: 1
|
31
|
+
SENT: 1
|
32
|
+
SENT:
|
33
|
+
! '': 3
|
34
|
+
a: 2
|
35
|
+
e: 1
|
36
|
+
b:
|
37
|
+
! '':
|
38
|
+
! '': 12
|
39
|
+
a:
|
40
|
+
! '': 6
|
41
|
+
a: 1
|
42
|
+
b: 2
|
43
|
+
c: 1
|
44
|
+
SENT: 1
|
45
|
+
e: 1
|
46
|
+
c:
|
47
|
+
! '': 4
|
48
|
+
a: 2
|
49
|
+
e: 2
|
50
|
+
d:
|
51
|
+
! '': 2
|
52
|
+
a: 1
|
53
|
+
e: 1
|
54
|
+
c:
|
55
|
+
! '':
|
56
|
+
! '': 10
|
57
|
+
a:
|
58
|
+
! '': 8
|
59
|
+
a: 1
|
60
|
+
b: 3
|
61
|
+
c: 2
|
62
|
+
d: 1
|
63
|
+
e: 1
|
64
|
+
e:
|
65
|
+
! '': 2
|
66
|
+
b: 1
|
67
|
+
d: 1
|
68
|
+
d:
|
69
|
+
! '':
|
70
|
+
! '': 16
|
71
|
+
a:
|
72
|
+
! '': 12
|
73
|
+
a: 1
|
74
|
+
b: 4
|
75
|
+
c: 3
|
76
|
+
d: 2
|
77
|
+
e: 2
|
78
|
+
e:
|
79
|
+
! '': 4
|
80
|
+
SENT: 2
|
81
|
+
d: 2
|
82
|
+
e:
|
83
|
+
! '':
|
84
|
+
! '': 6
|
85
|
+
d:
|
86
|
+
! '': 2
|
87
|
+
a: 1
|
88
|
+
e: 1
|
89
|
+
SENT:
|
90
|
+
! '': 4
|
91
|
+
a: 1
|
92
|
+
c: 2
|
93
|
+
d: 1
|
94
|
+
SENT:
|
95
|
+
! '':
|
96
|
+
! '': 10
|
97
|
+
SENT:
|
98
|
+
! '': 2
|
99
|
+
a: 2
|
100
|
+
a:
|
101
|
+
! '': 4
|
102
|
+
b: 3
|
103
|
+
d: 1
|
104
|
+
e:
|
105
|
+
! '': 1
|
106
|
+
SENT: 1
|