myaso 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,84 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
# Specification of Myaso::Lexicon: construction, word and word/tag counters,
# tag collection, enumeration and equality.
describe Myaso::Lexicon do
  subject { Myaso::Lexicon.new }

  describe '#new' do
    it 'should be empty' do
      subject.table.must_be_empty
    end

    it 'should not initialize @tags' do
      tags_defined = subject.instance_variable_defined?(:@tags)
      tags_defined.must_equal(false)
    end
  end

  describe '#[]' do
    it 'should treat unknown words as zeroes' do
      subject['lopata'].must_equal(0)
    end

    it 'should treat unknown words and tags as zeroes' do
      subject['lopata', 'dno'].must_equal(0)
    end

    it 'should modify a word' do
      subject['lopata'] = 1
      subject['lopata'].must_equal(1)
    end

    it 'should modify a word with tag' do
      subject['lopata', 'dno'] = 2
      subject['lopata', 'dno'].must_equal(2)
    end
  end

  describe '#tags' do
    it 'should perform lazy initialization' do
      # @tags must not exist until #tags is called for the first time.
      subject.instance_variable_defined?(:@tags).must_equal(false)
      subject.tags
      subject.instance_variable_get(:@tags).wont_be_nil
    end

    it 'should collect global tag counts' do
      subject['lopata', 'dno'] = 1
      subject['lopata', 'bydlow'] = 2
      subject.tags.must_equal({ 'dno' => 1, 'bydlow' => 2 })
    end

    it 'should be invalidated after the value assignment' do
      # Read the tags first so that the assignment below has to refresh them.
      subject.tags.must_be_empty
      subject['lopata', 'dno'] = 1
      subject.tags.must_equal({ 'dno' => 1 })
    end

    it 'should return tags of the given word' do
      subject.tags('lopata').must_be_empty
      subject['lopata', 'dno'] = 1
      subject.tags('lopata').must_equal(['dno'])
    end
  end

  describe '#each' do
    it 'should iterate over the internal table' do
      expected = subject.table.to_a
      subject.each.to_a.must_equal(expected)
    end
  end

  describe '#==' do
    let(:other) { Myaso::Lexicon.new }

    it 'should be equal to a new instance when not modified' do
      subject.must_equal(other)
    end

    it 'should check equality by internal tables' do
      subject['lopata', 'dno'] = 1
      subject.wont_equal(other)

      other['lopata', 'dno'] = 1
      subject.must_equal(other)
    end
  end
end
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
# Specification of Myaso::Mystem: analysis of dictionary and
# out-of-dictionary ("bastard") words, form enumeration and inflection.
describe Myaso::Mystem do
  describe 'analysis of dictionary words' do
    subject { Myaso::Mystem.analyze('СТАЛИ') }

    it 'is ambiguous' do
      subject.length.must_equal 2
    end

    it 'is a dictionary word' do
      subject.each { |s| s.quality.must_equal :dictionary }
    end

    it 'lemmatizes' do
      # Sort a copy: the order of the analyses is not part of the contract.
      subject.map(&:lemma).sort.must_equal %w(сталь становиться)
    end

    it 'normalizes' do
      subject.each { |s| s.form.must_equal 'стали' }
    end

    it 'analyzes' do
      subject.map { |s| s.msd.pos.to_s }.sort.must_equal %w(noun verb)
    end
  end

  describe 'analysis of bastard words' do
    subject { Myaso::Mystem.analyze('дОлБоЯщЕрА') }

    it 'is unambiguous' do
      subject.length.must_equal 1
    end

    # The example used to be named 'is really a dictionary word', which
    # contradicted the assertion below.
    it 'is not a dictionary word' do
      subject.first.quality.must_equal :bastard
    end

    it 'lemmatizes' do
      subject.first.lemma.must_equal 'долбоящер'
    end

    it 'normalizes' do
      subject.first.form.must_equal 'долбоящера'
    end

    it 'analyzes' do
      subject.first.msd.pos.must_equal :noun
    end
  end

  describe 'form enumeration' do
    let(:lemma) { Myaso::Mystem.analyze('человеком').first }

    # NOTE(review): 3890 is presumably the identifier Mystem assigns to this
    # lemma/paradigm; confirm against Myaso::Mystem.forms.
    subject { Myaso::Mystem.forms('человеком', 3890) }

    it 'enumerates forms' do
      subject.length.must_equal 14
    end

    it 'works for lemmas' do
      subject.must_equal lemma.forms
    end
  end

  describe 'inflection' do
    let(:lemma) { Myaso::Mystem.analyze('людьми').first }

    subject { lemma.inflect(:number => :plural, :case => :dative) }

    it 'is ambiguous' do
      subject.length.must_equal 2
    end

    it 'inflects' do
      # Non-destructive map/sort: the memoized subject is left untouched.
      subject.map(&:form).sort.must_equal %w(людям человекам)
    end
  end
end
@@ -0,0 +1,97 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
# Specification of Myaso::Ngrams: default counts, unigram/bigram/trigram
# accessors, equality and enumeration.
describe Myaso::Ngrams do
  let(:tags) { %w(D V N) }
  let(:unigrams) { tags }
  let(:bigrams) { tags + [nil] }
  let(:trigrams) { tags + [nil] }

  subject { Myaso::Ngrams.new }

  describe '#new' do
    it 'should be full of zeroes' do
      # Every (unigram, bigram, trigram) combination, in nested-loop order.
      unigrams.product(bigrams, trigrams).each do |u, b, t|
        subject[u, b, t].must_equal(0)
      end
    end

    it 'should be empty' do
      subject.unigrams_count.must_equal(0)
    end
  end

  describe '#[]' do
    it 'should treat unset unigrams as zeroes' do
      subject['D'].must_equal(0)
    end

    it 'should treat unset bigrams as zeroes' do
      subject['D', 'V'].must_equal(0)
    end

    it 'should treat unset trigrams as zeroes' do
      subject['D', 'V', 'N'].must_equal(0)
    end

    it 'should modify an unigram' do
      subject['D'] = 1
      subject['D'].must_equal(1)
      subject.unigrams_count.must_equal(1)
    end

    it 'should modify a bigram' do
      subject['D', 'N'] = 2
      subject['D', 'N'].must_equal(2)
    end

    it 'should modify a trigram' do
      subject['D', 'N', 'V'] = 3
      subject['D', 'N', 'V'].must_equal(3)
    end
  end

  describe '#==' do
    let(:other) { Myaso::Ngrams.new }

    it 'should be equal to a new instance when not modified' do
      subject.must_equal(other)
    end

    it 'should check equality by internal tables' do
      subject['D', 'N', 'V'] = 1
      subject.wont_equal(other)

      other['D', 'N', 'V'] = 1
      subject.must_equal(other)
    end
  end

  describe '#each' do
    before do
      subject['D'] = 1
      subject['N'] = 2
      subject['D', 'N'] = 3
      subject['V', 'D'] = 4
      subject['D', 'N', 'V'] = 5
      subject['N', 'V', 'D'] = 6
    end

    it 'should iterate over the internal table' do
      expected = [
        ['D', { nil => { nil => 1 }, 'N' => { nil => 3, 'V' => 5 } }],
        ['N', { nil => { nil => 2 }, 'V' => { 'D' => 6 } }],
        ['V', { 'D' => { nil => 4 } }]
      ]

      subject.each.to_a.must_equal(expected)
    end

    it 'should enumerate over trigrams' do
      collected = []
      subject.each_trigram { |trigram| collected << trigram }
      collected.must_equal [[%w(D N V), 5], [%w(N V D), 6]]
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
# Specification of Myaso::PiTable: default cell values, tuple access and
# enumeration.
describe Myaso::PiTable do
  let(:length) { 10 }
  let(:tags) { %w(D V N) }

  subject { Myaso::PiTable.new }

  describe '#new' do
    it 'should be full of nils' do
      (0...length).each do |position|
        tags.product(tags).each do |left, right|
          subject[position, left, right].must_be_nil
        end
      end
    end

    describe 'with default value' do
      let(:default) { 0 }

      subject { Myaso::PiTable.new(default) }

      it 'should be full of zeros' do
        (0...length).each do |position|
          tags.product(tags).each do |left, right|
            subject[position, left, right].must_equal(0)
          end
        end
      end
    end
  end

  describe '#[]' do
    it 'should get and set a tuple' do
      # Two cells that differ only in the last tag must not overwrite
      # each other.
      subject[0, 'D', 'N'] = 1
      subject[0, 'D', 'V'] = 2

      subject[0, 'D', 'N'].must_equal(1)
      subject[0, 'D', 'V'].must_equal(2)
    end
  end

  describe '#each' do
    it 'iterates over an internal table' do
      expected = subject.table.to_a
      subject.each.to_a.must_equal(expected)
    end
  end
end
@@ -0,0 +1,12 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+
5
+ gem 'minitest'
6
+ require 'minitest/autorun'
7
+ require 'minitest/hell'
8
+
# Make the library under test loadable from the working tree.
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
require 'myaso'

# Load every support file. The list is sorted explicitly because Dir[] only
# guarantees a sorted result since Ruby 3.0; on older interpreters the order
# depends on the file system.
Dir[File.expand_path('../support/**/*.rb', __FILE__)].sort.each { |f| require f }
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+
3
+ require 'yaml'
4
+
# When this file is loaded, a constant is defined within the Myaso::Fixtures
# module for each fixture file. The constant is named after the fixture file
# (upper-cased, without the extension) and holds the parsed contents of the
# file.
#
# For example, file <tt>prefixes.yml</tt>:
#
#   - id: 1
#     prefix: sub
#   - id: 2
#     prefix: bi
#
# These fixtures would be made available like so:
#
#   Myaso::Fixtures::PREFIXES
#   => [{"id"=>1, "prefix"=>"sub"}, {"id"=>2, "prefix"=>"bi"}]
#
# You can find out all available fixtures by calling
#
#   Myaso::Fixtures.constants
#   => [ :PREFIXES ]
#
module Myaso::Fixtures
end

fixtures_path = File.expand_path('../../fixtures', __FILE__)

Dir[File.expand_path('*.yml', fixtures_path)].each do |filename|
  # A hyphen is not valid in a constant name, so <tt>foo-bar.yml</tt> is
  # exposed as FOO_BAR instead of raising NameError in const_set.
  const_name = File.basename(filename, '.*').upcase.tr('-', '_')

  # Fixtures are trusted files that ship with the test suite.
  Myaso::Fixtures.const_set(const_name, YAML.load_file(filename))
end
@@ -0,0 +1,29 @@
1
+ # encoding: utf-8
2
+
3
+ require 'open3'
4
+
# http://dota2.ru/guides/880-invokirkhakha-sanstrajk-ni-azhydal-da/
#
# Helpers that let the specs run the +myaso+ executable and inspect its
# standard output.
class MiniTest::Test
  # Memo of executable output, stored in the <tt>@invoke_cache</tt> instance
  # variable and keyed by the exact argument list given to #invoke.
  #
  def invoke_cache
    @invoke_cache ||= {}
  end

  # Runs <tt>bin/myaso</tt> with the given command-line arguments and
  # returns its standard output as an array of lines with the trailing line
  # terminators removed.
  #
  # If the last argument is a Hash it is treated as options rather than as a
  # command-line argument; <tt>:stdin</tt> is written to the standard input
  # of the process. The result is memoized in #invoke_cache, so repeating a
  # call with the same arguments does not start the process again.
  #
  # Standard error of the process is not read.
  #
  def invoke(*argv)
    return invoke_cache[argv] if invoke_cache.key?(argv)

    arguments = argv.dup
    options = arguments.last.is_a?(Hash) ? arguments.pop : {}
    executable = File.expand_path('../../../bin/myaso', __FILE__)

    Open3.popen3(executable, *arguments) do |stdin, stdout, *_|
      stdin.puts options[:stdin] if options[:stdin]
      stdin.close

      # String#chomp (not chomp!) is required here: chomp! returns nil when
      # the line has no terminator, which turned an unterminated last line
      # of output into nil.
      invoke_cache[argv] = stdout.readlines.map(&:chomp)
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
# Specification of Myaso::Tagger#annotate running on top of a TnT model
# built from the test data files.
describe Myaso::Tagger do
  let(:ngrams) { File.expand_path('../data/test.123', __FILE__) }
  let(:lexicon) { File.expand_path('../data/test.lex', __FILE__) }
  let(:model) { Myaso::Tagger::TnT.new(ngrams, lexicon) }

  subject { Myaso::Tagger.new(model) }

  describe 'annotate(sentence)' do
    it 'should annotate one word sentences' do
      sentence = ['братишка']
      subject.annotate(sentence).must_equal(['e'])
    end

    it 'should annotate sentences with tags' do
      sentence = ['братишка', 'я', 'тебе', 'покушать', 'принес']
      subject.annotate(sentence).must_equal(['a', 'b', 'b', 'd', 'e'])
    end

    it 'should handle unknown words' do
      sentence = ['мир', 'прекрасен', ',', 'как', 'никогда']
      subject.annotate(sentence).must_equal(['d', 'd', 'd', 'd', 'd'])
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative 'spec_helper'
4
+
# Specification of the tagger model, exercised through Myaso::Tagger::TnT
# built from the test data files and compared with the YAML fixtures.
describe Myaso::Tagger::Model do
  let(:ngrams) { File.expand_path('../data/test.123', __FILE__) }
  let(:lexicon) { File.expand_path('../data/test.lex', __FILE__) }

  subject { Myaso::Tagger::TnT.new(ngrams, lexicon) }

  describe '#q(t1,t2,t3)' do
    it 'counts the quotient between trigram and bigram counts othewise' do
      subject.q('a', 'a', 'a').must_be_close_to(0.224, 0.001)
      subject.q('b', 'a', 'b').must_be_close_to(0.287, 0.001)
    end
  end

  describe '#e(w|t)' do
    it 'returns 0 if there is no such bunch word => tag' do
      [%w(братишка b), %w(проголодался c)].each do |word, tag|
        subject.e(word, tag).must_equal 0
      end
    end

    it 'counts the quotient between count(word, tag) and ngrams(tag)' do
      subject.e('братишка', 'a').must_equal 1 / 26.0
      subject.e('принес', 'e').must_equal 2 / 6.0
    end
  end

  describe '#learn!' do
    it 'should has the same ngrams as in the gold standard' do
      expected = Myaso::Fixtures::NGRAMS
      subject.ngrams.must_equal expected
    end

    it 'should has the same lexicon as in the gold standard' do
      expected = Myaso::Fixtures::LEXICON
      subject.lexicon.must_equal expected
    end

    it 'should has the same interpolations as in the gold standard' do
      expected = Myaso::Fixtures::INTERPOLATIONS
      subject.interpolations.must_equal expected
    end
  end

  describe '#start_symbol' do
    it 'should be SENT' do
      subject.start_symbol.must_equal('SENT')
    end
  end

  describe '#stop_symbol' do
    it 'should be SENT' do
      subject.stop_symbol.must_equal('SENT')
    end
  end

  describe '#conditional' do
    it 'should compute p(0|0) as 0' do
      subject.conditional(0, 0).must_equal(0.0)
    end

    it 'should compute p(1|0) as 0' do
      subject.conditional(1, 0).must_equal(0.0)
    end

    it 'should compute p(0|1) as 0' do
      subject.conditional(0, 1).must_equal(0.0)
    end

    it 'should compute p(3|2) as 1.5' do
      subject.conditional(3, 2).must_equal(1.5)
    end
  end
end