myaso 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/.travis.yml +10 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/Rakefile +21 -0
- data/bin/myaso +73 -0
- data/lib/myaso.rb +35 -0
- data/lib/myaso/lexicon.rb +70 -0
- data/lib/myaso/mystem.rb +187 -0
- data/lib/myaso/mystem/library.rb +59 -0
- data/lib/myaso/ngrams.rb +67 -0
- data/lib/myaso/pi_table.rb +36 -0
- data/lib/myaso/tagger.rb +94 -0
- data/lib/myaso/tagger/model.rb +68 -0
- data/lib/myaso/tagger/tnt.rb +183 -0
- data/lib/myaso/version.rb +9 -0
- data/myaso.gemspec +26 -0
- data/myaso.jpg +0 -0
- data/spec/bin_spec.rb +48 -0
- data/spec/data/test.123 +77 -0
- data/spec/data/test.lex +10 -0
- data/spec/fixtures/interpolations.yml +4 -0
- data/spec/fixtures/lexicon.yml +32 -0
- data/spec/fixtures/ngrams.yml +106 -0
- data/spec/lexicon_spec.rb +84 -0
- data/spec/mystem_spec.rb +81 -0
- data/spec/ngrams_spec.rb +97 -0
- data/spec/pi_table_spec.rb +53 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/support/fixtures.rb +34 -0
- data/spec/support/invoker.rb +29 -0
- data/spec/tagger_spec.rb +27 -0
- data/spec/tagger_tnt_spec.rb +73 -0
- metadata +137 -0
@@ -0,0 +1,84 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
# Specification of Myaso::Lexicon: a word (and word/tag) counter whose
# unknown entries read as zero and which exposes tag statistics via #tags.
#
# Plain assertions are used instead of global expectations
# (<tt>obj.must_equal</tt>), which are deprecated in recent Minitest 5
# releases and removed in Minitest 6. The argument order of every assertion
# mirrors what the expectation would have passed (expected first).
#
describe Myaso::Lexicon do
  subject { Myaso::Lexicon.new }

  describe '#new' do
    it 'should be empty' do
      assert_empty subject.table
    end

    it 'should not initialize @tags' do
      assert_equal false, subject.instance_variable_defined?(:@tags)
    end
  end

  describe '#[]' do
    it 'should treat unknown words as zeroes' do
      assert_equal 0, subject['lopata']
    end

    it 'should treat unknown words and tags as zeroes' do
      assert_equal 0, subject['lopata', 'dno']
    end

    it 'should modify a word' do
      subject['lopata'] = 1
      assert_equal 1, subject['lopata']
    end

    it 'should modify a word with tag' do
      subject['lopata', 'dno'] = 2
      assert_equal 2, subject['lopata', 'dno']
    end
  end

  describe '#tags' do
    it 'should perform lazy initialization' do
      # @tags must not exist until #tags is called for the first time.
      assert_equal false, subject.instance_variable_defined?(:@tags)
      subject.tags
      refute_nil subject.instance_variable_get(:@tags)
    end

    it 'should collect global tag counts' do
      subject['lopata', 'dno'] = 1
      subject['lopata', 'bydlow'] = 2
      assert_equal({ 'dno' => 1, 'bydlow' => 2 }, subject.tags)
    end

    it 'should be invalidated after the value assignment' do
      # Reading #tags first populates the memo; the assignment below must
      # still be reflected by the next read.
      assert_empty subject.tags
      subject['lopata', 'dno'] = 1
      assert_equal({ 'dno' => 1 }, subject.tags)
    end

    it 'should return tags of the given word' do
      assert_empty subject.tags('lopata')
      subject['lopata', 'dno'] = 1
      assert_equal %w(dno), subject.tags('lopata')
    end
  end

  describe '#each' do
    it 'should iterate over the internal table' do
      assert_equal subject.table.to_a, subject.each.to_a
    end
  end

  describe '#==' do
    let(:other) { Myaso::Lexicon.new }

    it 'should be equal to a new instance when not modified' do
      assert_equal other, subject
    end

    it 'should check equality by internal tables' do
      subject['lopata', 'dno'] = 1
      refute_equal other, subject
      other['lopata', 'dno'] = 1
      assert_equal other, subject
    end
  end
end
|
data/spec/mystem_spec.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
# Specification of Myaso::Mystem: analysis of dictionary and out-of-dictionary
# ("bastard") words, enumeration of word forms and inflection.
#
# Plain assertions are used instead of global expectations
# (<tt>obj.must_equal</tt>), which are deprecated in recent Minitest 5
# releases and removed in Minitest 6.
#
describe Myaso::Mystem do
  describe 'analysis of dictionary words' do
    subject { Myaso::Mystem.analyze('СТАЛИ') }

    it 'is ambiguous' do
      assert_equal 2, subject.length
    end

    it 'is a dictionary word' do
      subject.each { |s| assert_equal :dictionary, s.quality }
    end

    it 'lemmatizes' do
      assert_equal %w(сталь становиться), subject.map(&:lemma).sort
    end

    it 'normalizes' do
      subject.each { |s| assert_equal 'стали', s.form }
    end

    it 'analyzes' do
      assert_equal %w(noun verb), subject.map { |s| s.msd.pos.to_s }.sort
    end
  end

  describe 'analysis of bastard words' do
    subject { Myaso::Mystem.analyze('дОлБоЯщЕрА') }

    it 'is unambiguous' do
      assert_equal 1, subject.length
    end

    # The description used to read 'is really a dictionary word', which
    # contradicted the :bastard quality asserted here.
    it 'is a bastard word' do
      assert_equal :bastard, subject.first.quality
    end

    it 'lemmatizes' do
      assert_equal 'долбоящер', subject.first.lemma
    end

    it 'normalizes' do
      assert_equal 'долбоящера', subject.first.form
    end

    it 'analyzes' do
      assert_equal :noun, subject.first.msd.pos
    end
  end

  describe 'form enumeration' do
    let(:lemma) { Myaso::Mystem.analyze('человеком').first }

    # NOTE(review): 3890 is an opaque second argument to Mystem.forms; it
    # presumably identifies the analysis that `lemma` refers to -- confirm
    # against Myaso::Mystem.forms.
    subject { Myaso::Mystem.forms('человеком', 3890) }

    it 'enumerates forms' do
      assert_equal 14, subject.length
    end

    it 'works for lemmas' do
      assert_equal lemma.forms, subject
    end
  end

  describe 'inflection' do
    let(:lemma) { Myaso::Mystem.analyze('людьми').first }

    subject { lemma.inflect(:number => :plural, :case => :dative) }

    it 'is ambiguous' do
      assert_equal 2, subject.length
    end

    it 'inflects' do
      # Non-destructive map: the memoized subject is left untouched.
      assert_equal %w(людям человекам), subject.map(&:form).sort
    end
  end
end
|
data/spec/ngrams_spec.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
# Specification of Myaso::Ngrams: a counter of unigrams, bigrams and
# trigrams addressed as <tt>ngrams[a]</tt>, <tt>ngrams[a, b]</tt> and
# <tt>ngrams[a, b, c]</tt>, where unset entries read as zero.
#
# Plain assertions are used instead of global expectations
# (<tt>obj.must_equal</tt>), which are deprecated in recent Minitest 5
# releases and removed in Minitest 6.
#
describe Myaso::Ngrams do
  let(:tags) { %w(D V N) }
  let(:unigrams) { tags }
  let(:bigrams) { tags + [nil] }
  let(:trigrams) { tags + [nil] }

  subject { Myaso::Ngrams.new }

  describe '#new' do
    it 'should be full of zeroes' do
      unigrams.each do |u|
        bigrams.each do |b|
          trigrams.each do |t|
            assert_equal 0, subject[u, b, t]
          end
        end
      end
    end

    it 'should be empty' do
      assert_equal 0, subject.unigrams_count
    end
  end

  describe '#[]' do
    it 'should treat unset unigrams as zeroes' do
      assert_equal 0, subject['D']
    end

    it 'should treat unset bigrams as zeroes' do
      assert_equal 0, subject['D', 'V']
    end

    it 'should treat unset trigrams as zeroes' do
      assert_equal 0, subject['D', 'V', 'N']
    end

    it 'should modify an unigram' do
      subject['D'] = 1
      assert_equal 1, subject['D']
      assert_equal 1, subject.unigrams_count
    end

    it 'should modify a bigram' do
      subject['D', 'N'] = 2
      assert_equal 2, subject['D', 'N']
    end

    it 'should modify a trigram' do
      subject['D', 'N', 'V'] = 3
      assert_equal 3, subject['D', 'N', 'V']
    end
  end

  describe '#==' do
    let(:other) { Myaso::Ngrams.new }

    it 'should be equal to a new instance when not modified' do
      assert_equal other, subject
    end

    it 'should check equality by internal tables' do
      subject['D', 'N', 'V'] = 1
      refute_equal other, subject
      other['D', 'N', 'V'] = 1
      assert_equal other, subject
    end
  end

  describe '#each' do
    before do
      subject['D'] = 1
      subject['N'] = 2
      subject['D', 'N'] = 3
      subject['V', 'D'] = 4
      subject['D', 'N', 'V'] = 5
      subject['N', 'V', 'D'] = 6
    end

    it 'should iterate over the internal table' do
      # Unigram and bigram counts are expected under nil keys of the
      # nested table.
      expected = [
        ['D', { nil => { nil => 1 }, 'N' => { nil => 3, 'V' => 5 } }],
        ['N', { nil => { nil => 2 }, 'V' => { 'D' => 6 } }],
        ['V', { 'D' => { nil => 4 } }]
      ]

      assert_equal expected, subject.each.to_a
    end

    it 'should enumerate over trigrams' do
      # A dedicated local avoids shadowing the let(:trigrams) helper.
      collected = []
      subject.each_trigram { |trigram| collected << trigram }

      assert_equal [[%w(D N V), 5], [%w(N V D), 6]], collected
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
# Specification of Myaso::PiTable: a table addressed by an index and two
# tags (<tt>table[i, u, v]</tt>) whose unset cells yield the default value
# given to the constructor (nil when none is given).
#
# Plain assertions are used instead of global expectations
# (<tt>obj.must_equal</tt>), which are deprecated in recent Minitest 5
# releases and removed in Minitest 6.
#
describe Myaso::PiTable do
  let(:length) { 10 }
  let(:tags) { %w(D V N) }

  subject { Myaso::PiTable.new }

  describe '#new' do
    it 'should be full of nils' do
      length.times do |i|
        tags.each do |u|
          tags.each do |v|
            assert_nil subject[i, u, v]
          end
        end
      end
    end

    describe 'with default value' do
      let(:default) { 0 }

      subject { Myaso::PiTable.new(default) }

      it 'should be full of zeros' do
        length.times do |i|
          tags.each do |u|
            tags.each do |v|
              assert_equal 0, subject[i, u, v]
            end
          end
        end
      end
    end
  end

  describe '#[]' do
    it 'should get and set a tuple' do
      subject[0, 'D', 'N'] = 1
      subject[0, 'D', 'V'] = 2
      assert_equal 1, subject[0, 'D', 'N']
      assert_equal 2, subject[0, 'D', 'V']
    end
  end

  describe '#each' do
    it 'iterates over an internal table' do
      assert_equal subject.table.to_a, subject.each.to_a
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
gem 'minitest'
|
6
|
+
require 'minitest/autorun'
|
7
|
+
require 'minitest/hell'
|
8
|
+
|
9
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
10
|
+
require 'myaso'
|
11
|
+
|
12
|
+
# Require every file under spec/support in a deterministic order: Dir[]
# returns entries in filesystem order on Ruby versions before 3.0, so the
# list is sorted explicitly.
Dir[File.expand_path('../support/**/*.rb', __FILE__)].sort.each { |f| require f }
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
# When this file is loaded, for each fixture file, a constant is defined
|
6
|
+
# within the Myaso::Fixtures module, named after the upcased basename of the
|
7
|
+
# fixture file and holding the parsed contents of that file.
|
8
|
+
#
|
9
|
+
# For example, file <tt>prefixes.yml</tt>:
|
10
|
+
#
|
11
|
+
# - id: 1
|
12
|
+
# prefix: sub
|
13
|
+
# - id: 2
|
14
|
+
# prefix: bi
|
15
|
+
#
|
16
|
+
# These fixtures would be made available like so:
|
17
|
+
#
|
18
|
+
# Myaso::Fixtures::PREFIXES
|
19
|
+
# => [{"id"=>1, "prefix"=>"sub"}, {"id"=>2, "prefix"=>"bi"}]
|
20
|
+
#
|
21
|
+
# You can find out all available fixtures by calling
|
22
|
+
#
|
23
|
+
# Myaso::Fixtures.constants
|
24
|
+
# => [ :PREFIXES ]
|
25
|
+
#
|
26
|
+
# Namespace that receives one constant per YAML fixture file.
module Myaso::Fixtures
end

# Each *.yml file in the fixtures directory (a sibling of the directory this
# file lives in) becomes a constant named after its upcased basename, e.g.
# ngrams.yml => Myaso::Fixtures::NGRAMS, holding the parsed YAML document.
fixture_files = Dir.glob(File.join(File.expand_path('../../fixtures', __FILE__), '*.yml'))

fixture_files.each do |path|
  name = File.basename(path, '.*').upcase
  Myaso::Fixtures.const_set(name, YAML.load_file(path))
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
# http://dota2.ru/guides/880-invokirkhakha-sanstrajk-ni-azhydal-da/
|
6
|
+
#
|
7
|
+
# Test helpers for running the bin/myaso executable from specs.
#
# The class is reopened as Minitest::Test: the legacy MiniTest alias is no
# longer defined by default in recent Minitest 5 releases, while the
# Minitest constant is available in every Minitest 5 version.
#
class Minitest::Test
  # Memo of previous invocations for the current test instance, keyed by the
  # full argument list given to #invoke.
  #
  def invoke_cache
    @invoke_cache ||= {}
  end

  # Runs bin/myaso with the given command-line arguments and returns its
  # standard output as an array of lines without trailing line separators.
  #
  # When the last argument is a Hash it is treated as options rather than
  # passed to the executable; its :stdin value, if present, is written to the
  # process's standard input with +puts+ before the input is closed.
  # Standard error of the process is not read.
  #
  # Results are memoized in #invoke_cache, so repeating the same call within
  # a test does not spawn the process again.
  #
  def invoke(*argv)
    return invoke_cache[argv] if invoke_cache.key?(argv)

    arguments = argv.dup
    options = arguments.last.is_a?(Hash) ? arguments.pop : {}
    executable = File.expand_path('../../../bin/myaso', __FILE__)

    Open3.popen3(executable, *arguments) do |stdin, stdout, *_|
      stdin.puts options[:stdin] if options[:stdin]
      stdin.close

      # String#chomp (not chomp!) is required here: chomp! returns nil when
      # there is nothing to strip, which turned a final line without a
      # trailing newline into nil.
      invoke_cache[argv] = stdout.readlines.map(&:chomp)
    end
  end
end
|
data/spec/tagger_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
# Specification of Myaso::Tagger#annotate, driven by a TnT model built from
# the data/test.123 (n-grams) and data/test.lex (lexicon) files that live
# next to this spec.
#
# Plain assertions are used instead of global expectations
# (<tt>obj.must_equal</tt>), which are deprecated in recent Minitest 5
# releases and removed in Minitest 6.
#
describe Myaso::Tagger do
  let(:ngrams) { File.expand_path('../data/test.123', __FILE__) }
  let(:lexicon) { File.expand_path('../data/test.lex', __FILE__) }
  let(:model) { Myaso::Tagger::TnT.new(ngrams, lexicon) }

  subject { Myaso::Tagger.new(model) }

  describe 'annotate(sentence)' do
    it 'should annotate one word sentences' do
      assert_equal %w(e), subject.annotate(%w(братишка))
    end

    it 'should annotate sentences with tags' do
      tags = subject.annotate(%w(братишка я тебе покушать принес))
      assert_equal %w(a b b d e), tags
    end

    it 'should handle unknown words' do
      tags = subject.annotate(%w(мир прекрасен , как никогда))
      assert_equal %w(d d d d d), tags
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
# Specification of the tagger model interface, exercised through
# Myaso::Tagger::TnT built from the data/test.123 (n-grams) and
# data/test.lex (lexicon) files that live next to this spec.
#
# Plain assertions are used instead of global expectations
# (<tt>obj.must_equal</tt>), which are deprecated in recent Minitest 5
# releases and removed in Minitest 6. Test descriptions are kept verbatim so
# that test names do not change.
#
describe Myaso::Tagger::Model do
  let(:ngrams) { File.expand_path('../data/test.123', __FILE__) }
  let(:lexicon) { File.expand_path('../data/test.lex', __FILE__) }

  subject { Myaso::Tagger::TnT.new(ngrams, lexicon) }

  describe '#q(t1,t2,t3)' do
    it 'counts the quotient between trigram and bigram counts othewise' do
      # Compared with a tolerance of 0.001.
      assert_in_delta 0.224, subject.q('a', 'a', 'a'), 0.001
      assert_in_delta 0.287, subject.q('b', 'a', 'b'), 0.001
    end
  end

  describe '#e(w|t)' do
    it 'returns 0 if there is no such bunch word => tag' do
      assert_equal 0, subject.e('братишка', 'b')
      assert_equal 0, subject.e('проголодался', 'c')
    end

    it 'counts the quotient between count(word, tag) and ngrams(tag)' do
      assert_equal 1 / 26.0, subject.e('братишка', 'a')
      assert_equal 2 / 6.0, subject.e('принес', 'e')
    end
  end

  describe '#learn!' do
    # The gold standard values come from the YAML fixtures exposed as
    # constants of Myaso::Fixtures.
    it 'should has the same ngrams as in the gold standard' do
      assert_equal Myaso::Fixtures::NGRAMS, subject.ngrams
    end

    it 'should has the same lexicon as in the gold standard' do
      assert_equal Myaso::Fixtures::LEXICON, subject.lexicon
    end

    it 'should has the same interpolations as in the gold standard' do
      assert_equal Myaso::Fixtures::INTERPOLATIONS, subject.interpolations
    end
  end

  describe '#start_symbol' do
    it 'should be SENT' do
      assert_equal 'SENT', subject.start_symbol
    end
  end

  describe '#stop_symbol' do
    it 'should be SENT' do
      assert_equal 'SENT', subject.stop_symbol
    end
  end

  describe '#conditional' do
    it 'should compute p(0|0) as 0' do
      assert_equal 0.0, subject.conditional(0, 0)
    end

    it 'should compute p(1|0) as 0' do
      assert_equal 0.0, subject.conditional(1, 0)
    end

    it 'should compute p(0|1) as 0' do
      assert_equal 0.0, subject.conditional(0, 1)
    end

    it 'should compute p(3|2) as 1.5' do
      assert_equal 1.5, subject.conditional(3, 2)
    end
  end
end
|