myaso 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/.travis.yml +10 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/Rakefile +21 -0
- data/bin/myaso +73 -0
- data/lib/myaso.rb +35 -0
- data/lib/myaso/lexicon.rb +70 -0
- data/lib/myaso/mystem.rb +187 -0
- data/lib/myaso/mystem/library.rb +59 -0
- data/lib/myaso/ngrams.rb +67 -0
- data/lib/myaso/pi_table.rb +36 -0
- data/lib/myaso/tagger.rb +94 -0
- data/lib/myaso/tagger/model.rb +68 -0
- data/lib/myaso/tagger/tnt.rb +183 -0
- data/lib/myaso/version.rb +9 -0
- data/myaso.gemspec +26 -0
- data/myaso.jpg +0 -0
- data/spec/bin_spec.rb +48 -0
- data/spec/data/test.123 +77 -0
- data/spec/data/test.lex +10 -0
- data/spec/fixtures/interpolations.yml +4 -0
- data/spec/fixtures/lexicon.yml +32 -0
- data/spec/fixtures/ngrams.yml +106 -0
- data/spec/lexicon_spec.rb +84 -0
- data/spec/mystem_spec.rb +81 -0
- data/spec/ngrams_spec.rb +97 -0
- data/spec/pi_table_spec.rb +53 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/support/fixtures.rb +34 -0
- data/spec/support/invoker.rb +29 -0
- data/spec/tagger_spec.rb +27 -0
- data/spec/tagger_tnt_spec.rb +73 -0
- metadata +137 -0
@@ -0,0 +1,84 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
describe Myaso::Lexicon do
|
6
|
+
subject { Myaso::Lexicon.new }
|
7
|
+
|
8
|
+
describe '#new' do
|
9
|
+
it 'should be empty' do
|
10
|
+
subject.table.must_be_empty
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'should not initialize @tags' do
|
14
|
+
subject.instance_variable_defined?(:@tags).must_equal false
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
describe '#[]' do
|
19
|
+
it 'should treat unknown words as zeroes' do
|
20
|
+
subject['lopata'].must_equal 0
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should treat unknown words and tags as zeroes' do
|
24
|
+
subject['lopata', 'dno'].must_equal 0
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should modify a word' do
|
28
|
+
subject['lopata'] = 1
|
29
|
+
subject['lopata'].must_equal 1
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'should modify a word with tag' do
|
33
|
+
subject['lopata', 'dno'] = 2
|
34
|
+
subject['lopata', 'dno'].must_equal 2
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
describe '#tags' do
|
39
|
+
it 'should perform lazy initialization' do
|
40
|
+
subject.instance_variable_defined?(:@tags).must_equal false
|
41
|
+
subject.tags
|
42
|
+
subject.instance_variable_get(:@tags).wont_be_nil
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'should collect global tag counts' do
|
46
|
+
subject['lopata', 'dno'] = 1
|
47
|
+
subject['lopata', 'bydlow'] = 2
|
48
|
+
subject.tags.must_equal 'dno' => 1, 'bydlow' => 2
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'should be invalidated after the value assignment' do
|
52
|
+
subject.tags.must_be_empty
|
53
|
+
subject['lopata', 'dno'] = 1
|
54
|
+
subject.tags.must_equal 'dno' => 1
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should return tags of the given word' do
|
58
|
+
subject.tags('lopata').must_be_empty
|
59
|
+
subject['lopata', 'dno'] = 1
|
60
|
+
subject.tags('lopata').must_equal %w(dno)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
describe '#each' do
|
65
|
+
it 'should iterate over the internal table' do
|
66
|
+
subject.each.to_a.must_equal subject.table.to_a
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
describe '#==' do
|
71
|
+
let(:other) { Myaso::Lexicon.new }
|
72
|
+
|
73
|
+
it 'should be equal to a new instance when not modified' do
|
74
|
+
subject.must_equal other
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'should check equality by internal tables' do
|
78
|
+
subject['lopata', 'dno'] = 1
|
79
|
+
subject.wont_equal other
|
80
|
+
other['lopata', 'dno'] = 1
|
81
|
+
subject.must_equal other
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
data/spec/mystem_spec.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
describe Myaso::Mystem do
|
6
|
+
describe 'analysis of dictionary words' do
|
7
|
+
subject { Myaso::Mystem.analyze('СТАЛИ') }
|
8
|
+
|
9
|
+
it 'is ambiguous' do
|
10
|
+
subject.length.must_equal 2
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'is a dictionary word' do
|
14
|
+
subject.each { |s| s.quality.must_equal :dictionary }
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'lemmatizes' do
|
18
|
+
subject.map(&:lemma).sort!.must_equal %w(сталь становиться)
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'normalizes' do
|
22
|
+
subject.each { |s| s.form.must_equal 'стали' }
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'analyzes' do
|
26
|
+
subject.map { |s| s.msd.pos.to_s }.sort!.must_equal %w(noun verb)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe 'analysis of bastard words' do
|
31
|
+
subject { Myaso::Mystem.analyze('дОлБоЯщЕрА') }
|
32
|
+
|
33
|
+
it 'is unambiguous' do
|
34
|
+
subject.length.must_equal 1
|
35
|
+
end
|
36
|
+
|
37
|
+
# A made-up word is expected to be reported with the :bastard quality
# rather than :dictionary, so the description says exactly that.
it 'is not a dictionary word' do
  subject.first.quality.must_equal :bastard
end
|
40
|
+
|
41
|
+
it 'lemmatizes' do
|
42
|
+
subject.first.lemma.must_equal 'долбоящер'
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'normalizes' do
|
46
|
+
subject.first.form.must_equal 'долбоящера'
|
47
|
+
end
|
48
|
+
|
49
|
+
it 'analyzes' do
|
50
|
+
subject.first.msd.pos.must_equal :noun
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
describe 'form enumeration' do
|
55
|
+
let(:lemma) { Myaso::Mystem.analyze('человеком').first }
|
56
|
+
|
57
|
+
subject { Myaso::Mystem.forms('человеком', 3890) }
|
58
|
+
|
59
|
+
it 'enumerates forms' do
|
60
|
+
subject.length.must_equal 14
|
61
|
+
end
|
62
|
+
|
63
|
+
it 'works for lemmas' do
|
64
|
+
subject.must_equal lemma.forms
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
describe 'inflection' do
|
69
|
+
let(:lemma) { Myaso::Mystem.analyze('людьми').first }
|
70
|
+
|
71
|
+
subject { lemma.inflect(:number => :plural, :case => :dative) }
|
72
|
+
|
73
|
+
it 'is ambiguous' do
|
74
|
+
subject.length.must_equal 2
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'inflects' do
|
78
|
+
subject.map!(&:form).sort!.must_equal %w(людям человекам)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
data/spec/ngrams_spec.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
describe Myaso::Ngrams do
|
6
|
+
let(:tags) { %w(D V N) }
|
7
|
+
let(:unigrams) { tags }
|
8
|
+
let(:bigrams) { tags + [nil] }
|
9
|
+
let(:trigrams) { tags + [nil] }
|
10
|
+
|
11
|
+
subject { Myaso::Ngrams.new }
|
12
|
+
|
13
|
+
describe '#new' do
|
14
|
+
it 'should be full of zeroes' do
|
15
|
+
unigrams.each do |u|
|
16
|
+
bigrams.each do |b|
|
17
|
+
trigrams.each do |t|
|
18
|
+
subject[u, b, t].must_equal 0
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'should be empty' do
|
25
|
+
subject.unigrams_count.must_equal 0
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
describe '#[]' do
|
30
|
+
it 'should treat unset unigrams as zeroes' do
|
31
|
+
subject['D'].must_equal 0
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should treat unset bigrams as zeroes' do
|
35
|
+
subject['D', 'V'].must_equal 0
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should treat unset trigrams as zeroes' do
|
39
|
+
subject['D', 'V', 'N'].must_equal 0
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should modify an unigram' do
|
43
|
+
subject['D'] = 1
|
44
|
+
subject['D'].must_equal 1
|
45
|
+
subject.unigrams_count.must_equal 1
|
46
|
+
end
|
47
|
+
|
48
|
+
it 'should modify a bigram' do
|
49
|
+
subject['D', 'N'] = 2
|
50
|
+
subject['D', 'N'].must_equal 2
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'should modify a trigram' do
|
54
|
+
subject['D', 'N', 'V'] = 3
|
55
|
+
subject['D', 'N', 'V'].must_equal 3
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
describe '#==' do
|
60
|
+
let(:other) { Myaso::Ngrams.new }
|
61
|
+
|
62
|
+
it 'should be equal to a new instance when not modified' do
|
63
|
+
subject.must_equal other
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'should check equality by internal tables' do
|
67
|
+
subject['D', 'N', 'V'] = 1
|
68
|
+
subject.wont_equal other
|
69
|
+
other['D', 'N', 'V'] = 1
|
70
|
+
subject.must_equal other
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
describe '#each' do
|
75
|
+
before do
|
76
|
+
subject['D'] = 1
|
77
|
+
subject['N'] = 2
|
78
|
+
subject['D', 'N'] = 3
|
79
|
+
subject['V', 'D'] = 4
|
80
|
+
subject['D', 'N', 'V'] = 5
|
81
|
+
subject['N', 'V', 'D'] = 6
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'should iterate over the internal table' do
|
85
|
+
subject.each.to_a.must_equal([["D", {nil=>{nil=>1},
|
86
|
+
"N"=>{nil=>3, "V"=>5}}], ["N", {nil=>{nil=>2}, "V"=>{"D"=>6}}],
|
87
|
+
["V", {"D"=>{nil=>4}}]])
|
88
|
+
end
|
89
|
+
|
90
|
+
it 'should enumerate over trigrams' do
|
91
|
+
Array.new.tap do |trigrams|
|
92
|
+
subject.each_trigram { |trigram| trigrams << trigram }
|
93
|
+
trigrams.must_equal [[%w(D N V), 5], [%w(N V D), 6]]
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
describe Myaso::PiTable do
|
6
|
+
let(:length) { 10 }
|
7
|
+
let(:tags) { %w(D V N) }
|
8
|
+
|
9
|
+
subject { Myaso::PiTable.new }
|
10
|
+
|
11
|
+
describe '#new' do
|
12
|
+
it 'should be full of nils' do
|
13
|
+
length.times do |i|
|
14
|
+
tags.each do |u|
|
15
|
+
tags.each do |v|
|
16
|
+
subject[i, u, v].must_be_nil
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe 'with default value' do
|
23
|
+
let(:default) { 0 }
|
24
|
+
|
25
|
+
subject { Myaso::PiTable.new(default) }
|
26
|
+
|
27
|
+
it 'should be full of zeros' do
|
28
|
+
length.times do |i|
|
29
|
+
tags.each do |u|
|
30
|
+
tags.each do |v|
|
31
|
+
subject[i, u, v].must_equal 0
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
describe '#[]' do
|
40
|
+
it 'should get and set a tuple' do
|
41
|
+
subject[0, 'D', 'N'] = 1
|
42
|
+
subject[0, 'D', 'V'] = 2
|
43
|
+
subject[0, 'D', 'N'].must_equal 1
|
44
|
+
subject[0, 'D', 'V'].must_equal 2
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe '#each' do
|
49
|
+
it 'iterates over an internal table' do
|
50
|
+
subject.each.to_a.must_equal subject.table.to_a
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
gem 'minitest'
|
6
|
+
require 'minitest/autorun'
|
7
|
+
require 'minitest/hell'
|
8
|
+
|
9
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
10
|
+
require 'myaso'
|
11
|
+
|
12
|
+
Dir[File.expand_path('../support/**/*.rb', __FILE__)].each { |f| require f }
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
# When this file is loaded, for each fixture file, a constant is
|
6
|
+
# defined within the Myaso::Fixtures module, named after the upcased base name
|
7
|
+
# of the fixture file and holding the parsed contents of that file.
|
8
|
+
#
|
9
|
+
# For example, file <tt>prefixes.yml</tt>:
|
10
|
+
#
|
11
|
+
# - id: 1
|
12
|
+
# prefix: sub
|
13
|
+
# - id: 2
|
14
|
+
# prefix: bi
|
15
|
+
#
|
16
|
+
# These fixtures would be made available like so:
|
17
|
+
#
|
18
|
+
# Myaso::Fixtures::PREFIXES
|
19
|
+
# => [{"id"=>1, "prefix"=>"sub"}, {"id"=>2, "prefix"=>"bi"}]
|
20
|
+
#
|
21
|
+
# You can find out all available fixtures by calling
|
22
|
+
#
|
23
|
+
# Myaso::Fixtures.constants
|
24
|
+
# => [ :PREFIXES ]
|
25
|
+
#
|
26
|
+
module Myaso::Fixtures
|
27
|
+
end
|
28
|
+
|
29
|
+
fixtures_path = File.expand_path('../../fixtures', __FILE__)
|
30
|
+
|
31
|
+
Dir[File.expand_path('*.yml', fixtures_path)].each do |filename|
|
32
|
+
const_name = File.basename(filename, '.*').upcase
|
33
|
+
Myaso::Fixtures.const_set(const_name, YAML.load_file(filename))
|
34
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
# http://dota2.ru/guides/880-invokirkhakha-sanstrajk-ni-azhydal-da/
|
6
|
+
#
|
7
|
+
class MiniTest::Test
|
8
|
+
# Quas Wex Exort.
|
9
|
+
#
|
10
|
+
# Returns the per-instance Hash used by #invoke to memoize command output.
# The Hash is created lazily on first access and reused afterwards.
def invoke_cache
|
11
|
+
@invoke_cache ||= {}
|
12
|
+
end
|
13
|
+
|
14
|
+
# So begins a new age of knowledge.
|
15
|
+
#
|
16
|
+
# Runs bin/myaso with the given command-line arguments and returns its
# standard output as an array of lines without trailing line terminators.
#
# A trailing Hash in +argv+ is treated as options instead of an argument;
# its :stdin value, when present, is written to the standard input of the
# process. Results are memoized in #invoke_cache, keyed by the whole +argv+.
#
def invoke(*argv)
  return invoke_cache[argv] if invoke_cache.key? argv

  arguments = argv.dup
  options = arguments.last.is_a?(Hash) ? arguments.pop : {}
  executable = File.expand_path('../../../bin/myaso', __FILE__)

  Open3.popen3(executable, *arguments) do |i, o, *_|
    i.puts options[:stdin] if options[:stdin]
    i.close
    # String#chomp! returns nil when there is nothing to strip (for example,
    # a final line without a newline), which used to leak nils into the
    # result. The non-destructive #chomp always returns the String.
    invoke_cache[argv] = o.readlines.map(&:chomp)
  end
end
|
29
|
+
end
|
data/spec/tagger_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
describe Myaso::Tagger do
|
6
|
+
let(:ngrams) { File.expand_path('../data/test.123', __FILE__) }
|
7
|
+
let(:lexicon) { File.expand_path('../data/test.lex', __FILE__) }
|
8
|
+
let(:model) { Myaso::Tagger::TnT.new(ngrams, lexicon) }
|
9
|
+
|
10
|
+
subject { Myaso::Tagger.new(model) }
|
11
|
+
|
12
|
+
describe 'annotate(sentence)' do
|
13
|
+
it 'should annotate one word sentences' do
|
14
|
+
subject.annotate(%w(братишка)).must_equal %w(e)
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'should annotate sentences with tags' do
|
18
|
+
subject.annotate(%w(братишка я тебе покушать принес)).
|
19
|
+
must_equal(%w(a b b d e))
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'should handle unknown words' do
|
23
|
+
subject.annotate(%w(мир прекрасен , как никогда)).
|
24
|
+
must_equal(%w(d d d d d))
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
describe Myaso::Tagger::Model do
|
6
|
+
let(:ngrams) { File.expand_path('../data/test.123', __FILE__) }
|
7
|
+
let(:lexicon) { File.expand_path('../data/test.lex', __FILE__) }
|
8
|
+
|
9
|
+
subject { Myaso::Tagger::TnT.new(ngrams, lexicon) }
|
10
|
+
|
11
|
+
describe '#q(t1,t2,t3)' do
|
12
|
+
# Pins q for two tag triples of the model built from the test fixtures,
# allowing an absolute tolerance of 0.001.
it 'counts the quotient between trigram and bigram counts otherwise' do
  subject.q('a', 'a', 'a').must_be_close_to 0.224, 0.001
  subject.q('b', 'a', 'b').must_be_close_to 0.287, 0.001
end
|
16
|
+
end
|
17
|
+
|
18
|
+
describe '#e(w|t)' do
|
19
|
+
it 'returns 0 if there is no such bunch word => tag' do
|
20
|
+
subject.e('братишка', 'b').must_equal(0)
|
21
|
+
subject.e('проголодался', 'c').must_equal(0)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'counts the quotient between count(word, tag) and ngrams(tag)' do
|
25
|
+
subject.e('братишка', 'a').must_equal(1 / 26.0)
|
26
|
+
subject.e('принес', 'e').must_equal(2 / 6.0)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe '#learn!' do
|
31
|
+
it 'should has the same ngrams as in the gold standard' do
|
32
|
+
subject.ngrams.must_equal Myaso::Fixtures::NGRAMS
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should has the same lexicon as in the gold standard' do
|
36
|
+
subject.lexicon.must_equal Myaso::Fixtures::LEXICON
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'should has the same interpolations as in the gold standard' do
|
40
|
+
subject.interpolations.must_equal Myaso::Fixtures::INTERPOLATIONS
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe '#start_symbol' do
|
45
|
+
it 'should be SENT' do
|
46
|
+
subject.start_symbol.must_equal 'SENT'
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe '#stop_symbol' do
|
51
|
+
it 'should be SENT' do
|
52
|
+
subject.stop_symbol.must_equal 'SENT'
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
describe '#conditional' do
|
57
|
+
it 'should compute p(0|0) as 0' do
|
58
|
+
subject.conditional(0, 0).must_equal 0.0
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'should compute p(1|0) as 0' do
|
62
|
+
subject.conditional(1, 0).must_equal 0.0
|
63
|
+
end
|
64
|
+
|
65
|
+
it 'should compute p(0|1) as 0' do
|
66
|
+
subject.conditional(0, 1).must_equal 0.0
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'should compute p(3|2) as 1.5' do
|
70
|
+
subject.conditional(3, 2).must_equal 1.5
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|