myaso 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,183 @@
1
+ # encoding: utf-8
2
+
3
+ # A Tagger model that can work with TnT data files.
4
+ #
5
+ class Myaso::Tagger::TnT < Myaso::Tagger::Model
6
# A start tag for a sentence. It is the same literal as STOP.
#
START = 'SENT'.freeze

# A stop tag for a sentence. It is the same literal as START.
#
STOP = 'SENT'.freeze

# Unknown tag for token.
#
MISSING = '-'.freeze

# Tokens consisting of a sequence of decimal digits.
#
CARD = '@CARD'.freeze

# Decimal digits followed by punctuation.
#
CARDPUNCT = '@CARDPUNCT'.freeze

# Decimal digits followed by any suffix.
#
CARDSUFFIX = '@CARDSUFFIX'.freeze

# Decimal digits separated by dots, dashes, etc.
#
CARDSEPS = '@CARDSEPS'.freeze

# Tag frequencies to handle unknown words.
#
UNKNOWN = '@UNKNOWN'.freeze
37
+
38
+ attr_reader :ngrams_path, :lexicon_path
39
+
40
# Set up the tagger from two TnT data files: an n-grams file holding
# unigram, bigram and trigram statistics, and a lexicon file holding
# words with their frequencies in the source corpus.
#
# Both paths are expanded to absolute paths before being stored; the
# optional interpolations are handed to the parent constructor as is.
#
# The original authors note that the learning stage is not optimized
# and that initialization may take about 120 seconds.
#
def initialize(ngrams_path, lexicon_path, interpolations = nil)
  @ngrams_path, @lexicon_path =
    [ngrams_path, lexicon_path].map { |path| File.expand_path(path) }

  super(interpolations)
end
53
+
54
# Map a rare word onto a pseudo-word class. A word that `rare?` does not
# report as rare is returned unchanged and keeps its own category.
#
# Rare words fall into one of these categories: digits only (CARD),
# digits followed by punctuation (CARDPUNCT), digits followed by any
# non-digit suffix (CARDSUFFIX), digit groups joined by separators
# (CARDSEPS), or anything else (UNKNOWN).
#
# The patterns are anchored with \A and \z so that the whole token must
# match; ^ and $ would also accept a token that merely contains a
# matching line.
#
def classify(word)
  return word unless rare?(word)

  case word
  when /\A\d+\z/ then CARD
  when /\A\d+[.,;:]+\z/ then CARDPUNCT
  when /\A\d+\D+\z/ then CARDSUFFIX
  when /\A\d+[.,;:\-]+(\d+[.,;:\-]+)*\d+\z/ then CARDSEPS
  else UNKNOWN
  end
end
68
+
69
# The tag that marks the beginning of a sentence; the tagger requires
# it to be defined.
#
def start_symbol
  START
end
74
+
75
# The tag that marks the end of a sentence; the tagger requires it to
# be defined.
#
def stop_symbol
  STOP
end
80
+
81
# Load the n-grams file, then the lexicon file. The interpolation
# weights are computed from the data only when none are set yet.
#
def learn!
  parse_ngrams!
  parse_lexicon!

  return unless interpolations.nil?

  compute_interpolations!
end
88
+
89
# Parse the n-grams file and store every count in `ngrams`.
#
# Each row is up to four tab-separated fields. Leading empty fields
# mean "same as the previous row": an omitted first tag is taken from
# the last unigram context and an omitted second tag from the last
# bigram context. After that filling step a row is either
#
#   tag, count                 (an unigram)
#   tag, tag, count            (a bigram)
#   tag, tag, tag, count       (a trigram)
#
# Anything else raises a RuntimeError.
#
def parse_ngrams!
  unigram = bigram = nil

  read(ngrams_path) do |values|
    values[0] ||= unigram
    values[1] ||= bigram

    first, second, third, fourth = values

    if first && second && third && fourth # a trigram
      ngrams[first, second, third] = fourth.to_i
    elsif first && second && third # a bigram
      ngrams[first, second] = third.to_i
    elsif first && second && !fourth # an unigram
      ngrams[first] = second.to_i
    else
      raise 'dafuq i just read: %s' % values.inspect
    end

    # On an unigram row the second field is the count, not a tag, so it
    # must not become the bigram context of the following rows.
    unigram = first
    bigram = third ? second : nil
  end
end
111
+
112
# Parse the lexicon file and accumulate its counts in `lexicon`.
#
# After empty fields are dropped, a row is a word, its total count and
# then any number of tag/count pairs. A word whose total count is
# exactly 1 is replaced by the result of `classify` before its counts
# are recorded.
#
def parse_lexicon!
  read(lexicon_path) do |values|
    word, word_count, *tag_counts = values.compact

    word_count = word_count.to_i
    word = classify(word) if word_count == 1

    lexicon[word] += word_count

    tag_counts.each_slice(2) do |tag, count|
      lexicon[word, tag] += count.to_i
    end
  end
end
128
+
129
# Compute the coefficients of the linear interpolation used for
# evaluating q(first, second, third) and store them in @interpolations.
#
# The n-grams file is read once more. For every trigram row three
# ratios are computed with `conditional` (unigram, bigram and trigram
# based, in that order) and the trigram count is credited to the
# position of the largest ratio. The three sums are finally divided by
# their total, and the resulting array is returned.
#
# Omitted leading fields are inherited from the preceding rows in the
# same way as in parse_ngrams!, and the context is remembered on every
# row, so a trigram row that spells out its context also sets the
# context for the rows after it.
#
# NOTE(review): when the file has no trigram rows the total is 0.0 and
# every weight becomes NaN; confirm what the desired fallback is.
#
def compute_interpolations!
  lambdas = [0.0, 0.0, 0.0]
  unigram = bigram = nil

  read(ngrams_path) do |first, second, third, count|
    first ||= unigram
    second ||= bigram

    # On an unigram row the second field is the count, not a tag, so it
    # must not become the bigram context of the following rows.
    unigram = first
    bigram = third ? second : nil

    # Only complete trigram rows contribute to the weights.
    next unless first && second && third && count

    count = count.to_i

    ratios = [
      conditional(ngrams[third] - 1, ngrams.unigrams_count - 1),
      conditional(ngrams[second, third] - 1, ngrams[second] - 1),
      conditional(count - 1, ngrams[first, second] - 1)
    ]

    # index is nil when the maximum cannot be found again (NaN never
    # equals itself), in which case the row is skipped.
    index = ratios.index(ratios.max)
    lambdas[index] += count if index
  end

  total = lambdas.inject(:+)
  @interpolations = lambdas.map! { |weight| weight / total }
end
162
+
163
# @private
#
# Short representation showing the class name, both data file paths
# and the interpolations.
#
def inspect
  "#<#{self.class.name} @ngrams_path=#{ngrams_path.inspect} " \
    "@lexicon_path=#{lexicon_path.inspect} " \
    "@interpolations=#{interpolations.inspect}>"
end
170
+
171
+ private
172
# Read a TnT data file and yield every data row as an array of its
# tab-separated fields, with empty fields replaced by nil.
#
# Empty lines and lines starting with '%%' (comments) are skipped.
# The file is always decoded as UTF-8, so the result does not depend
# on the default external encoding of the running process.
#
def read(path)
  File.foreach(path, encoding: 'UTF-8') do |raw|
    line = raw.chomp
    next if line.empty? || line.start_with?('%%')

    # String#split drops trailing empty fields, so only leading and
    # inner empty fields show up as nil.
    yield line.split(/\t/).map { |field| field.empty? ? nil : field }
  end
end
183
+ end
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
# Myaso is a morphological analysis and synthesis library in Ruby.
#
module Myaso
  # Myaso version string. Frozen so that it cannot be modified in place.
  #
  VERSION = '0.4.0'.freeze
end
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+
3
# Make the gem's own lib directory loadable so that the version
# constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'myaso/version'

Gem::Specification.new do |gem|
  gem.name = 'myaso'
  gem.version = Myaso::VERSION
  gem.authors = ['Dmitry Ustalov', 'Sergey Smagin']
  gem.email = ['dmitry.ustalov@gmail.com']
  gem.description = 'Myaso is a morphological analysis library in Ruby.'
  gem.summary = 'Myaso is a morphological analysis and synthesis ' \
                'library in Ruby.'
  gem.homepage = 'https://github.com/dustalov/myaso'
  gem.license = 'MIT'

  # The packaged files are whatever git tracks; executables and test
  # files are picked out of that list by their directory.
  gem.files = `git ls-files`.split($/)
  gem.executables = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  gem.add_dependency 'ffi', '~> 1.9.0'
  gem.add_dependency 'myasorubka', '~> 0.2.0'
  gem.add_development_dependency 'minitest', '~> 5.0'
end
Binary file
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
5
# Specification of the command-line interface. Every example runs the
# CLI through the `invoke` helper and checks what it printed.
describe 'CLI' do
  it 'should print help when ran without arguments' do
    invoke('').must_include('Commands:')
    invoke('').must_include('Options:')
  end

  it 'should print version information' do
    invoke('-v').must_include(format('Myaso v%s', Myaso::VERSION))
    invoke('--version').must_equal(invoke('-v'))
  end

  it 'should print help' do
    invoke('-h').must_include('Commands:')
    invoke('-h').must_include('Options:')
    invoke('--help').must_equal(invoke('-h'))
  end

  it 'should evaluate parameters' do
    invoke('-e', 'puts :hi; exit').must_include('hi')
    invoke('--eval', 'puts :hi; exit').must_equal(
      invoke('-e', 'puts :hi; exit')
    )
  end

  # The long and the short form of an option must behave the same.
  it 'should consider the ngrams parameter' do
    invoke('-n', 'test', '-e', 'puts options.ngrams; exit').
      must_include('test')
    invoke('--ngrams', 'test', '-e', 'puts options.ngrams; exit').must_equal(
      invoke('-n', 'test', '-e', 'puts options.ngrams; exit')
    )
  end

  it 'should consider the lexicon parameter' do
    invoke('-l', 'test', '-e', 'puts options.lexicon; exit').
      must_include('test')
    invoke('--lexicon', 'test', '-e', 'puts options.lexicon; exit').must_equal(
      invoke('-l', 'test', '-e', 'puts options.lexicon; exit')
    )
  end

  # One token per input line; the expected output pairs each token with
  # its tag, separated by a tab.
  it 'should annotate sentences' do
    tagged = invoke(
      '-n', 'spec/data/test.123',
      '-l', 'spec/data/test.lex',
      'tagger',
      stdin: "братишка\nя\nтебе\nпокушать\nпринес"
    )

    tagged.must_equal(
      ["братишка\ta", "я\tb", "тебе\tb", "покушать\td", "принес\te"]
    )
  end
end
@@ -0,0 +1,77 @@
1
+ %% Test set of ngrams.
2
+ %%
3
+ %% Possible tags are: a, b, c, d, e.
4
+ a 26
5
+ a 6
6
+ a 1
7
+ b 2
8
+ c 1
9
+ SENT 1
10
+ e 1
11
+ b 8
12
+ c 3
13
+ d 3
14
+ e 2
15
+ c 4
16
+ a 2
17
+ e 1
18
+ SENT 1
19
+ d 4
20
+ a 3
21
+ e 1
22
+ e 3
23
+ b 1
24
+ d 1
25
+ SENT 1
26
+ SENT 3
27
+ a 2
28
+ e 1
29
+ b 12
30
+ a 6
31
+ a 1
32
+ b 2
33
+ c 1
34
+ SENT 1
35
+ e 1
36
+ c 4
37
+ a 2
38
+ e 2
39
+ d 2
40
+ a 1
41
+ e 1
42
+ c 10
43
+ a 8
44
+ a 1
45
+ b 3
46
+ c 2
47
+ d 1
48
+ e 1
49
+ e 2
50
+ b 1
51
+ d 1
52
+ d 16
53
+ a 12
54
+ a 1
55
+ b 4
56
+ c 3
57
+ d 2
58
+ e 2
59
+ e 4
60
+ SENT 2
61
+ d 2
62
+ e 6
63
+ d 2
64
+ a 1
65
+ e 1
66
+ SENT 4
67
+ a 1
68
+ c 2
69
+ d 1
70
+ SENT 10
71
+ SENT 2
72
+ a 2
73
+ a 4
74
+ b 3
75
+ d 1
76
+ e 1
77
+ SENT 1
@@ -0,0 +1,10 @@
1
+ %% Test lexicon. It should be crafty.
2
+ %%
3
+ %% Tags are: a, b, c, d, e.
4
+ %%
5
+ братишка 2 a 1 e 1
6
+ я 6 b 4 c 2
7
+ тебе 11 b 8 c 2 a 1
8
+ покушать 5 d 5
9
+ принес 7 d 5 e 2
10
+ проголодался 1 d 1
@@ -0,0 +1,4 @@
1
+ ---
2
+ - 0.189873417721519
3
+ - 0.4177215189873418
4
+ - 0.3924050632911392
@@ -0,0 +1,32 @@
1
+ --- !ruby/object:Myaso::Lexicon
2
+ table:
3
+ бра:
4
+ братишка:
5
+ ! '': 2
6
+ a: 1
7
+ e: 1
8
+ я:
9
+ я:
10
+ ! '': 6
11
+ b: 4
12
+ c: 2
13
+ теб:
14
+ тебе:
15
+ ! '': 11
16
+ b: 8
17
+ c: 2
18
+ a: 1
19
+ пок:
20
+ покушать:
21
+ ! '': 5
22
+ d: 5
23
+ при:
24
+ принес:
25
+ ! '': 7
26
+ d: 5
27
+ e: 2
28
+ '@UN':
29
+ '@UNKNOWN':
30
+ ! '': 1
31
+ d: 1
32
+ tags:
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Myaso::Ngrams
2
+ table:
3
+ a:
4
+ ! '':
5
+ ! '': 26
6
+ a:
7
+ ! '': 6
8
+ a: 1
9
+ b: 2
10
+ c: 1
11
+ SENT: 1
12
+ e: 1
13
+ b:
14
+ ! '': 8
15
+ c: 3
16
+ d: 3
17
+ e: 2
18
+ c:
19
+ ! '': 4
20
+ a: 2
21
+ e: 1
22
+ SENT: 1
23
+ d:
24
+ ! '': 4
25
+ a: 3
26
+ e: 1
27
+ e:
28
+ ! '': 3
29
+ b: 1
30
+ d: 1
31
+ SENT: 1
32
+ SENT:
33
+ ! '': 3
34
+ a: 2
35
+ e: 1
36
+ b:
37
+ ! '':
38
+ ! '': 12
39
+ a:
40
+ ! '': 6
41
+ a: 1
42
+ b: 2
43
+ c: 1
44
+ SENT: 1
45
+ e: 1
46
+ c:
47
+ ! '': 4
48
+ a: 2
49
+ e: 2
50
+ d:
51
+ ! '': 2
52
+ a: 1
53
+ e: 1
54
+ c:
55
+ ! '':
56
+ ! '': 10
57
+ a:
58
+ ! '': 8
59
+ a: 1
60
+ b: 3
61
+ c: 2
62
+ d: 1
63
+ e: 1
64
+ e:
65
+ ! '': 2
66
+ b: 1
67
+ d: 1
68
+ d:
69
+ ! '':
70
+ ! '': 16
71
+ a:
72
+ ! '': 12
73
+ a: 1
74
+ b: 4
75
+ c: 3
76
+ d: 2
77
+ e: 2
78
+ e:
79
+ ! '': 4
80
+ SENT: 2
81
+ d: 2
82
+ e:
83
+ ! '':
84
+ ! '': 6
85
+ d:
86
+ ! '': 2
87
+ a: 1
88
+ e: 1
89
+ SENT:
90
+ ! '': 4
91
+ a: 1
92
+ c: 2
93
+ d: 1
94
+ SENT:
95
+ ! '':
96
+ ! '': 10
97
+ SENT:
98
+ ! '': 2
99
+ a: 2
100
+ a:
101
+ ! '': 4
102
+ b: 3
103
+ d: 1
104
+ e:
105
+ ! '': 1
106
+ SENT: 1