myaso 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,183 @@
1
+ # encoding: utf-8
2
+
3
+ # A Tagger model that can work with TnT data files.
4
+ #
5
+ class Myaso::Tagger::TnT < Myaso::Tagger::Model
6
+ # A start tag for a sentence.
7
+ #
8
+ START = 'SENT'
9
+
10
+ # A stop tag for a sentence.
11
+ #
12
+ STOP = 'SENT'
13
+
14
+ # Unknown tag for token.
15
+ #
16
+ MISSING = '-'
17
+
18
+ # Tokens consisting of a sequence of decimal digits.
19
+ #
20
+ CARD = '@CARD'
21
+
22
+ # Decimal digits followed by punctuation.
23
+ #
24
+ CARDPUNCT = '@CARDPUNCT'
25
+
26
+ # Decimal digits followed by any suffix.
27
+ #
28
+ CARDSUFFIX = '@CARDSUFFIX'
29
+
30
+ # Decimal digits separated by dots, dashes, etc.
31
+ #
32
+ CARDSEPS = '@CARDSEPS'
33
+
34
+ # Tag frequencies to handle unknown words.
35
+ #
36
+ UNKNOWN = '@UNKNOWN'
37
+
38
+ attr_reader :ngrams_path, :lexicon_path
39
+
40
+ # The tagging model is initialized by two data files. The first one is a
41
+ # n-grams file that stores statistics for unigrams, bigrams, trigrams.
42
+ # The second one is a lexicon file that stores words and their
43
+ # frequencies in the source corpus.
44
+ #
45
+ # Please note that the learning stage is not so optimized, so the
46
+ # initialization procedure may take about 120 seconds.
47
+ #
48
+ def initialize(ngrams_path, lexicon_path, interpolations = nil)
49
+ @ngrams_path = File.expand_path(ngrams_path)
50
+ @lexicon_path = File.expand_path(lexicon_path)
51
+ super(interpolations)
52
+ end
53
+
54
+ # If word is rare, it can be one of the following categories:
55
+ # includes numbers, numbers and punctuation symbols, non-numbers
56
+ # following numbers and unknown. Otherwise, word has it's own category.
57
+ #
58
+ def classify(word)
59
+ return word unless rare? word
60
+ case word
61
+ when /^\d+$/ then CARD
62
+ when /^\d+[.,;:]+$/ then CARDPUNCT
63
+ when /^\d+\D+$/ then CARDSUFFIX
64
+ when /^\d+[.,;:\-]+(\d+[.,;:\-]+)*\d+$/ then CARDSEPS
65
+ else UNKNOWN
66
+ end
67
+ end
68
+
69
+ # Tagger requires the sentence start symbol to be defined.
70
+ #
71
+ def start_symbol
72
+ START
73
+ end
74
+
75
+ # Tagger requires the sentence stop symbol to be defined.
76
+ #
77
+ def stop_symbol
78
+ STOP
79
+ end
80
+
81
+ # Parse n-grams and lexicon files, and compute statistics over them.
82
+ #
83
+ def learn!
84
+ parse_ngrams!
85
+ parse_lexicon!
86
+ compute_interpolations! if interpolations.nil?
87
+ end
88
+
89
+ # Parse the n-grams file.
90
+ #
91
+ def parse_ngrams!
92
+ unigram, bigram = nil, nil
93
+
94
+ read(ngrams_path) do |values|
95
+ values[0] = unigram unless values[0]
96
+ values[1] = bigram unless values[1]
97
+
98
+ if values[0] && values[1] && values[2] && values[3] # a trigram
99
+ ngrams[*values[0..2]] = values[3].to_i
100
+ elsif values[0] && values[1] && values[2] && !values[3] # a bigram
101
+ ngrams[*values[0..1]] = values[2].to_i
102
+ elsif values[0] && values[1] && !values[2] && !values[3] # an unigram
103
+ ngrams[values[0]] = values[1].to_i
104
+ else
105
+ raise 'dafuq i just read: %s' % values.inspect
106
+ end
107
+
108
+ unigram, bigram = values[0], values[1]
109
+ end
110
+ end
111
+
112
+ # Parse the lexicon file.
113
+ #
114
+ def parse_lexicon!
115
+ read(lexicon_path) do |values|
116
+ values.compact!
117
+
118
+ word, word_count, rare = values.shift, values.shift.to_i, false
119
+ word = classify(word) if rare = (word_count == 1)
120
+
121
+ lexicon[word] += word_count
122
+
123
+ values.each_slice(2) do |tag, count|
124
+ lexicon[word, tag] += count.to_i
125
+ end
126
+ end
127
+ end
128
+
129
+ # Count coefficients for linear interpolation for evaluating
130
+ # q(first, second, third).
131
+ #
132
+ def compute_interpolations!
133
+ lambdas = [0.0, 0.0, 0.0]
134
+
135
+ unigram, bigram = nil, nil
136
+
137
+ read(ngrams_path) do |first, second, third, count|
138
+ first = unigram unless first
139
+ second = bigram unless second
140
+
141
+ unless third && count
142
+ unigram, bigram = first, second
143
+ next
144
+ end
145
+
146
+ count = count.to_i
147
+
148
+ f = Array.new
149
+
150
+ f << conditional(ngrams[third] - 1, ngrams.unigrams_count - 1)
151
+ f << conditional(ngrams[second, third] - 1, ngrams[second] - 1)
152
+ f << conditional(count - 1, ngrams[first, second] - 1)
153
+
154
+ index = f.index(f.max)
155
+
156
+ lambdas[index] += count if index
157
+ end
158
+
159
+ total = lambdas.inject(&:+)
160
+ @interpolations = lambdas.map! { |l| l / total }
161
+ end
162
+
163
+ # @private
164
+ #
165
+ def inspect
166
+ sprintf('#<%s @ngrams_path=%s @lexicon_path=%s @interpolations=%s>',
167
+ self.class.name, ngrams_path.inspect, lexicon_path.inspect,
168
+ interpolations.inspect)
169
+ end
170
+
171
+ private
172
+ # Read the TnT data file.
173
+ #
174
+ def read(path)
175
+ File.open(path) do |f|
176
+ until f.eof?
177
+ line = f.gets.chomp
178
+ next if line.empty? || line =~ /^%%/
179
+ yield line.split(/\t/).map! { |s| s.empty? ? nil : s }
180
+ end
181
+ end
182
+ end
183
+ end
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
# Myaso is a morphological analysis and synthesis library in Ruby.
#
module Myaso
  # Myaso version string. It is frozen so that it cannot be modified by
  # accident at runtime.
  #
  VERSION = '0.4.0'.freeze
end
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+
3
# Make the bundled lib directory loadable so that the version constant
# can be required before the gem is installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'myaso/version'

Gem::Specification.new do |gem|
  # Identity and authorship.
  gem.name        = 'myaso'
  gem.version     = Myaso::VERSION
  gem.authors     = ['Dmitry Ustalov', 'Sergey Smagin']
  gem.email       = ['dmitry.ustalov@gmail.com']
  gem.description = 'Myaso is a morphological analysis library in Ruby.'
  gem.summary     = 'Myaso is a morphological analysis and synthesis library in Ruby.'
  gem.homepage    = 'https://github.com/dustalov/myaso'
  gem.license     = 'MIT'

  # Packaged files are whatever git tracks; executables and tests are
  # picked out of that list by their directory.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies, then development-only ones.
  gem.add_dependency 'ffi', '~> 1.9.0'
  gem.add_dependency 'myasorubka', '~> 0.2.0'
  gem.add_development_dependency 'minitest', '~> 5.0'
end
Binary file
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
5
# Specs for the command line interface. Each example runs the CLI through
# the invoke helper and checks what it printed.
describe 'CLI' do
  it 'should print help when ran without arguments' do
    ['Commands:', 'Options:'].each do |section|
      invoke('').must_include section
    end
  end

  it 'should print version information' do
    banner = 'Myaso v%s' % Myaso::VERSION

    invoke('-v').must_include banner
    invoke('--version').must_equal invoke('-v')
  end

  it 'should print help' do
    ['Commands:', 'Options:'].each do |section|
      invoke('-h').must_include section
    end

    invoke('--help').must_equal invoke('-h')
  end

  it 'should evaluate parameters' do
    code = 'puts :hi; exit'

    invoke('-e', code).must_include 'hi'
    invoke('--eval', code).must_equal(invoke('-e', code))
  end

  it 'should consider the ngrams parameter' do
    code = 'puts options.ngrams; exit'

    invoke('-n', 'test', '-e', code).must_include 'test'
    invoke('--ngrams', 'test', '-e', code).
      must_equal(invoke('-n', 'test', '-e', code))
  end

  it 'should consider the lexicon parameter' do
    code = 'puts options.lexicon; exit'

    invoke('-l', 'test', '-e', code).must_include 'test'
    invoke('--lexicon', 'test', '-e', code).
      must_equal(invoke('-l', 'test', '-e', code))
  end

  it 'should annotate sentences' do
    # One token per input line; the expected output pairs each token with
    # its tag, separated by a tab.
    expected = ["братишка\ta", "я\tb", "тебе\tb", "покушать\td",
                "принес\te"]

    stdout = invoke('-n', 'spec/data/test.123', '-l', 'spec/data/test.lex',
                    'tagger', stdin: "братишка\nя\nтебе\nпокушать\nпринес")

    stdout.must_equal expected
  end
end
@@ -0,0 +1,77 @@
1
+ %% Test set of ngrams.
2
+ %%
3
+ %% Possible tags are: a, b, c, d, e.
4
+ a 26
5
+ a 6
6
+ a 1
7
+ b 2
8
+ c 1
9
+ SENT 1
10
+ e 1
11
+ b 8
12
+ c 3
13
+ d 3
14
+ e 2
15
+ c 4
16
+ a 2
17
+ e 1
18
+ SENT 1
19
+ d 4
20
+ a 3
21
+ e 1
22
+ e 3
23
+ b 1
24
+ d 1
25
+ SENT 1
26
+ SENT 3
27
+ a 2
28
+ e 1
29
+ b 12
30
+ a 6
31
+ a 1
32
+ b 2
33
+ c 1
34
+ SENT 1
35
+ e 1
36
+ c 4
37
+ a 2
38
+ e 2
39
+ d 2
40
+ a 1
41
+ e 1
42
+ c 10
43
+ a 8
44
+ a 1
45
+ b 3
46
+ c 2
47
+ d 1
48
+ e 1
49
+ e 2
50
+ b 1
51
+ d 1
52
+ d 16
53
+ a 12
54
+ a 1
55
+ b 4
56
+ c 3
57
+ d 2
58
+ e 2
59
+ e 4
60
+ SENT 2
61
+ d 2
62
+ e 6
63
+ d 2
64
+ a 1
65
+ e 1
66
+ SENT 4
67
+ a 1
68
+ c 2
69
+ d 1
70
+ SENT 10
71
+ SENT 2
72
+ a 2
73
+ a 4
74
+ b 3
75
+ d 1
76
+ e 1
77
+ SENT 1
@@ -0,0 +1,10 @@
1
+ %% Test lexicon. It should be crafty.
2
+ %%
3
+ %% Tags are: a, b, c, d, e.
4
+ %%
5
+ братишка 2 a 1 e 1
6
+ я 6 b 4 c 2
7
+ тебе 11 b 8 c 2 a 1
8
+ покушать 5 d 5
9
+ принес 7 d 5 e 2
10
+ проголодался 1 d 1
@@ -0,0 +1,4 @@
1
+ ---
2
+ - 0.189873417721519
3
+ - 0.4177215189873418
4
+ - 0.3924050632911392
@@ -0,0 +1,32 @@
1
+ --- !ruby/object:Myaso::Lexicon
2
+ table:
3
+ бра:
4
+ братишка:
5
+ ! '': 2
6
+ a: 1
7
+ e: 1
8
+ я:
9
+ я:
10
+ ! '': 6
11
+ b: 4
12
+ c: 2
13
+ теб:
14
+ тебе:
15
+ ! '': 11
16
+ b: 8
17
+ c: 2
18
+ a: 1
19
+ пок:
20
+ покушать:
21
+ ! '': 5
22
+ d: 5
23
+ при:
24
+ принес:
25
+ ! '': 7
26
+ d: 5
27
+ e: 2
28
+ '@UN':
29
+ '@UNKNOWN':
30
+ ! '': 1
31
+ d: 1
32
+ tags:
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Myaso::Ngrams
2
+ table:
3
+ a:
4
+ ! '':
5
+ ! '': 26
6
+ a:
7
+ ! '': 6
8
+ a: 1
9
+ b: 2
10
+ c: 1
11
+ SENT: 1
12
+ e: 1
13
+ b:
14
+ ! '': 8
15
+ c: 3
16
+ d: 3
17
+ e: 2
18
+ c:
19
+ ! '': 4
20
+ a: 2
21
+ e: 1
22
+ SENT: 1
23
+ d:
24
+ ! '': 4
25
+ a: 3
26
+ e: 1
27
+ e:
28
+ ! '': 3
29
+ b: 1
30
+ d: 1
31
+ SENT: 1
32
+ SENT:
33
+ ! '': 3
34
+ a: 2
35
+ e: 1
36
+ b:
37
+ ! '':
38
+ ! '': 12
39
+ a:
40
+ ! '': 6
41
+ a: 1
42
+ b: 2
43
+ c: 1
44
+ SENT: 1
45
+ e: 1
46
+ c:
47
+ ! '': 4
48
+ a: 2
49
+ e: 2
50
+ d:
51
+ ! '': 2
52
+ a: 1
53
+ e: 1
54
+ c:
55
+ ! '':
56
+ ! '': 10
57
+ a:
58
+ ! '': 8
59
+ a: 1
60
+ b: 3
61
+ c: 2
62
+ d: 1
63
+ e: 1
64
+ e:
65
+ ! '': 2
66
+ b: 1
67
+ d: 1
68
+ d:
69
+ ! '':
70
+ ! '': 16
71
+ a:
72
+ ! '': 12
73
+ a: 1
74
+ b: 4
75
+ c: 3
76
+ d: 2
77
+ e: 2
78
+ e:
79
+ ! '': 4
80
+ SENT: 2
81
+ d: 2
82
+ e:
83
+ ! '':
84
+ ! '': 6
85
+ d:
86
+ ! '': 2
87
+ a: 1
88
+ e: 1
89
+ SENT:
90
+ ! '': 4
91
+ a: 1
92
+ c: 2
93
+ d: 1
94
+ SENT:
95
+ ! '':
96
+ ! '': 10
97
+ SENT:
98
+ ! '': 2
99
+ a: 2
100
+ a:
101
+ ! '': 4
102
+ b: 3
103
+ d: 1
104
+ e:
105
+ ! '': 1
106
+ SENT: 1