myaso 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/.travis.yml +10 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/Rakefile +21 -0
- data/bin/myaso +73 -0
- data/lib/myaso.rb +35 -0
- data/lib/myaso/lexicon.rb +70 -0
- data/lib/myaso/mystem.rb +187 -0
- data/lib/myaso/mystem/library.rb +59 -0
- data/lib/myaso/ngrams.rb +67 -0
- data/lib/myaso/pi_table.rb +36 -0
- data/lib/myaso/tagger.rb +94 -0
- data/lib/myaso/tagger/model.rb +68 -0
- data/lib/myaso/tagger/tnt.rb +183 -0
- data/lib/myaso/version.rb +9 -0
- data/myaso.gemspec +26 -0
- data/myaso.jpg +0 -0
- data/spec/bin_spec.rb +48 -0
- data/spec/data/test.123 +77 -0
- data/spec/data/test.lex +10 -0
- data/spec/fixtures/interpolations.yml +4 -0
- data/spec/fixtures/lexicon.yml +32 -0
- data/spec/fixtures/ngrams.yml +106 -0
- data/spec/lexicon_spec.rb +84 -0
- data/spec/mystem_spec.rb +81 -0
- data/spec/ngrams_spec.rb +97 -0
- data/spec/pi_table_spec.rb +53 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/support/fixtures.rb +34 -0
- data/spec/support/invoker.rb +29 -0
- data/spec/tagger_spec.rb +27 -0
- data/spec/tagger_tnt_spec.rb +73 -0
- metadata +137 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 37940cb5479932d889a3e0cf92fe74d4d706ed46
|
4
|
+
data.tar.gz: e468549cebbddf8e6cad46d72b90f375de35d41b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 82f29168750fe0b45192866c8d8f82e7eaba533d5fd8e6450f5b8ea9e106098a3255922357f354d0c7813793c2f06b02c51c1d67d6af74b8cb2c13800a90fefc
|
7
|
+
data.tar.gz: de8f484dc371381ef9e9ad1d9d558f7b21d1bcc2aeb73b96e4ad60b538995c14fd34266880ee4345ce8fbd4f041a1eb7e124adc8f2e622680d2e8743fe922f62
|
data/.gitignore
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
*.so
|
2
|
+
*.zip
|
3
|
+
*swp
|
4
|
+
*.~*
|
5
|
+
*.gem
|
6
|
+
*.rbc
|
7
|
+
.rbx
|
8
|
+
.bundle
|
9
|
+
.config
|
10
|
+
.yardoc
|
11
|
+
Gemfile.lock
|
12
|
+
InstalledFiles
|
13
|
+
_yardoc
|
14
|
+
coverage
|
15
|
+
doc/
|
16
|
+
lib/bundler/man
|
17
|
+
pkg
|
18
|
+
rdoc
|
19
|
+
spec/reports
|
20
|
+
test/tmp
|
21
|
+
test/version_tmp
|
22
|
+
tmp
|
23
|
+
.DS_Store
|
24
|
+
.ruby-version
|
25
|
+
.ruby-gemset
|
data/.travis.yml
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
sudo: false
|
2
|
+
language: ruby
|
3
|
+
bundler_args: --without development
|
4
|
+
rvm:
|
5
|
+
- ruby
|
6
|
+
- jruby
|
7
|
+
install:
|
8
|
+
- wget 'https://github.com/yandex/tomita-parser/releases/download/v1.0/libmystem_c_binding.so.linux_x64.zip'
|
9
|
+
- unzip 'libmystem_c_binding.so.linux_x64.zip'
|
10
|
+
- bundle
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2010-2019 Dmitry Ustalov
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,213 @@
|
|
1
|
+
# Myaso
|
2
|
+
|
3
|
+
Myaso [ˈmʲæ.sə] is a morphological analysis and synthesis library, written in Ruby.
|
4
|
+
|
5
|
+
[![Gem Version][badge_fury_badge]][badge_fury_link] [![Build Status][travis_ci_badge]][travis_ci_link] [![Code Climate][code_climate_badge]][code_climage_link]
|
6
|
+
|
7
|
+
![Myaso](myaso.jpg)
|
8
|
+
|
9
|
+
[badge_fury_badge]: https://badge.fury.io/rb/myaso.svg
|
10
|
+
[badge_fury_link]: https://badge.fury.io/rb/myaso
|
11
|
+
[travis_ci_badge]: https://travis-ci.org/dustalov/myaso.svg
|
12
|
+
[travis_ci_link]: https://travis-ci.org/dustalov/myaso
|
13
|
+
[code_climate_badge]: https://codeclimate.com/github/dustalov/myaso/badges/gpa.svg
|
14
|
+
[code_climage_link]: https://codeclimate.com/github/dustalov/myaso
|
15
|
+
|
16
|
+
## Installation
|
17
|
+
|
18
|
+
Add this line to your application's Gemfile:
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem 'myaso'
|
22
|
+
```
|
23
|
+
|
24
|
+
And then execute:
|
25
|
+
|
26
|
+
$ bundle
|
27
|
+
|
28
|
+
Or install it:
|
29
|
+
|
30
|
+
$ gem install myaso
|
31
|
+
|
32
|
+
## Usage
|
33
|
+
|
34
|
+
At the moment, Myaso has a pretty fast part-of-speech (POS) tagger built on hidden Markov models (HMMs). The tagging operation requires a trained statistical model.
|
35
|
+
|
36
|
+
Myaso supports trained models in the TnT format. One can be obtained from the resource by Serge Sharoff et al. called [Russian statistical taggers and parsers](http://corpus.leeds.ac.uk/mocky/).
|
37
|
+
|
38
|
+
### Analysis
|
39
|
+
|
40
|
+
Since Yandex has released the [Mystem](https://tech.yandex.ru/mystem/) analyzer in the form of a shared library, it is possible to use the analyzer through the foreign function interface.
|
41
|
+
|
42
|
+
Firstly, it is necessary to read and agree with the [mystem EULA](https://yandex.ru/legal/mystem/). Secondly, [download](https://github.com/yandex/tomita-parser/releases/tag/v1.0) and install the shared library for your operating system. Finally, use Myaso and enjoy the benefits.
|
43
|
+
|
44
|
+
#### Analysis API
|
45
|
+
|
46
|
+
Myaso uses the mystem library to process Russian words. That is quite simple.
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
pp Myaso::Mystem.analyze('котёночка')
|
50
|
+
=begin
|
51
|
+
[#<struct Myaso::Mystem::Lemma
|
52
|
+
lemma="котеночек",
|
53
|
+
form="котёночка",
|
54
|
+
quality=:dictionary,
|
55
|
+
msd=#<Myasorubka::MSD::Russian msd="Ncmsay">,
|
56
|
+
stem_grammemes=[136, 192, 201],
|
57
|
+
flex_grammemes=[168, 174, 166],
|
58
|
+
flex_length=6,
|
59
|
+
rule_id=1525>]
|
60
|
+
=end
|
61
|
+
```
|
62
|
+
|
63
|
+
Myaso works fine even when the given word is either ambiguous or does not appear in the mystem dictionary.
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
pp Myaso::Mystem.analyze('аудисты')
|
67
|
+
=begin
|
68
|
+
[#<struct Myaso::Mystem::Lemma
|
69
|
+
lemma="аудист",
|
70
|
+
form="аудисты",
|
71
|
+
quality=:bastard,
|
72
|
+
msd=#<Myasorubka::MSD::Russian msd="Ncmpny">,
|
73
|
+
stem_grammemes=[136, 192, 201],
|
74
|
+
flex_grammemes=[165, 175],
|
75
|
+
flex_length=1,
|
76
|
+
rule_id=25>,
|
77
|
+
#<struct Myaso::Mystem::Lemma
|
78
|
+
lemma="аудистый",
|
79
|
+
form="аудисты",
|
80
|
+
quality=:bastard,
|
81
|
+
msd=#<Myasorubka::MSD::Russian msd="A---p-s">,
|
82
|
+
stem_grammemes=[128],
|
83
|
+
flex_grammemes=[175, 183],
|
84
|
+
flex_length=1,
|
85
|
+
rule_id=65>]
|
86
|
+
=end
|
87
|
+
```
|
88
|
+
|
89
|
+
### Synthesis
|
90
|
+
|
91
|
+
Given the analyzed word, it is possible to retrieve all the possible forms. Having this information, one may use it to inflect a word. This is implemented using the above-mentioned mystem shared library.
|
92
|
+
|
93
|
+
#### Synthesis API
|
94
|
+
|
95
|
+
In the general case, all the possible word forms can be extracted from the specified word and its inflection rule.
|
96
|
+
|
97
|
+
```ruby
|
98
|
+
pp Myaso::Mystem.forms('человеком', 3890)
|
99
|
+
=begin
|
100
|
+
[#<struct Myaso::Mystem::Form
|
101
|
+
form="людей",
|
102
|
+
msd=#<Myasorubka::MSD::Russian msd="Ncmpay">,
|
103
|
+
stem_grammemes=[136, 192, 201],
|
104
|
+
flex_grammemes=[168, 175, 166]>,
|
105
|
+
...
|
106
|
+
#<struct Myaso::Mystem::Form
|
107
|
+
form="человеку",
|
108
|
+
msd=#<Myasorubka::MSD::Russian msd="Ncmsdy">,
|
109
|
+
stem_grammemes=[136, 192, 201],
|
110
|
+
flex_grammemes=[167, 174]>]
|
111
|
+
=end
|
112
|
+
```
|
113
|
+
|
114
|
+
There exists a convenient way of doing this, which requires a previously lemmatized word.
|
115
|
+
|
116
|
+
```ruby
|
117
|
+
lemmas = Myaso::Mystem.analyze('кот') # => [#<Myaso::Mystem::Lemma lemma="кот" msd="Ncmsny">]
|
118
|
+
pp lemmas[0].forms
|
119
|
+
=begin
|
120
|
+
[#<struct Myaso::Mystem::Form
|
121
|
+
form="кот",
|
122
|
+
msd=#<Myasorubka::MSD::Russian msd="Ncmsny">,
|
123
|
+
stem_grammemes=[136, 192, 201],
|
124
|
+
flex_grammemes=[165, 174]>,
|
125
|
+
...
|
126
|
+
#<struct Myaso::Mystem::Form
|
127
|
+
form="коты",
|
128
|
+
msd=#<Myasorubka::MSD::Russian msd="Ncmpny">,
|
129
|
+
stem_grammemes=[136, 192, 201],
|
130
|
+
flex_grammemes=[165, 175]>]
|
131
|
+
=end
|
132
|
+
```
|
133
|
+
|
134
|
+
Moreover, Myaso makes it possible to find exact matches of grammemes, but you have to be careful because computational linguistics is a hard field.
|
135
|
+
|
136
|
+
```ruby
|
137
|
+
lemmas = Myaso::Mystem.analyze('человек') # => [#<Myaso::Mystem::Lemma lemma="человек" msd="Ncmpay">]
|
138
|
+
pp lemmas[0].inflect(:number => :plural, :case => :dative)
|
139
|
+
=begin
|
140
|
+
[#<struct Myaso::Mystem::Form
|
141
|
+
form="людям",
|
142
|
+
msd=#<Myasorubka::MSD::Russian msd="Ncmpdy">,
|
143
|
+
stem_grammemes=[136, 192, 201],
|
144
|
+
flex_grammemes=[167, 175]>,
|
145
|
+
#<struct Myaso::Mystem::Form
|
146
|
+
form="человекам",
|
147
|
+
msd=#<Myasorubka::MSD::Russian msd="Ncmpdy">,
|
148
|
+
stem_grammemes=[136, 192, 201],
|
149
|
+
flex_grammemes=[167, 175]>]
|
150
|
+
=end
|
151
|
+
```
|
152
|
+
|
153
|
+
### Tagging
|
154
|
+
|
155
|
+
Myaso performs POS tagging using its own implementation of the Viterbi algorithm on HMMs. The output has the following format: `token<TAB>tag`.
|
156
|
+
|
157
|
+
Please remember that the tagger command-line interface accepts only tokenized texts — one token per line. For instance, the [Greeb](https://github.com/dustalov/greeb) tokenizer can help you. Do not be afraid to use another text tokenization or segmentation tool if necessary.
|
158
|
+
|
159
|
+
```
|
160
|
+
% echo 'Как поспал, проголодался наверное?' | greeb | myaso -n snyat-msd.123 -l snyat-msd.lex tagger
|
161
|
+
Как P-----r
|
162
|
+
поспал Vmis-sma
|
163
|
+
, ,
|
164
|
+
проголодался Vmis-sma
|
165
|
+
наверное R
|
166
|
+
? SENT
|
167
|
+
```
|
168
|
+
|
169
|
+
Unfortunately, the current implementation of the tagger has two significant drawbacks:
|
170
|
+
|
171
|
+
1. The tagger does not handle unknown words very well. Sorry.
|
172
|
+
2. Tagging itself is fast, but it requires a pretty slow training procedure, which runs only once.
|
173
|
+
|
174
|
+
#### Tagging API
|
175
|
+
|
176
|
+
It is possible to embed the POS tagging feature in your own application using the API.
|
177
|
+
|
178
|
+
```ruby
|
179
|
+
model = Myaso::Tagger::TnT.new('model.123', 'model.lex')
|
180
|
+
tagger = Myaso::Tagger.new(model)
|
181
|
+
pp tagger.annotate(%w(Как поспал , проголодался наверное ?))
|
182
|
+
=begin
|
183
|
+
["P-----r", "Vmis-sma", ",", "Vmis-sma", "R", "SENT"]
|
184
|
+
=end
|
185
|
+
```
|
186
|
+
|
187
|
+
It is possible to significantly speed up the initialization process by explicitly setting the interpolations vector. For instance, the TnT model from http://corpus.leeds.ac.uk/mocky/ has the following (approximated) linear interpolation coefficients: *k1 = 0.14*, *k2 = 0.30*, *k3 = 0.56*. In the example these values are provided precisely.
|
188
|
+
|
189
|
+
```ruby
|
190
|
+
interpolations = [0.14095796503456284, 0.3032174211273352, 0.555824613838102]
|
191
|
+
model = Myaso::Tagger::TnT.new('model.123', 'model.lex', interpolations)
|
192
|
+
tagger = Myaso::Tagger.new(model)
|
193
|
+
pp tagger.annotate(%w(Как поспал , проголодался наверное ?))
|
194
|
+
=begin
|
195
|
+
["P-----r", "Vmis-sma", ",", "Vmis-sma", "R", "SENT"]
|
196
|
+
=end
|
197
|
+
```
|
198
|
+
|
199
|
+
## Acknowledgement
|
200
|
+
|
201
|
+
This work is partially supported by the Ural Branch of the Russian Academy of Sciences, grant no. РЦП-12-П10.
|
202
|
+
|
203
|
+
## Contributing
|
204
|
+
|
205
|
+
1. Fork it;
|
206
|
+
2. Create your feature branch (`git checkout -b my-new-feature`);
|
207
|
+
3. Commit your changes (`git commit -am 'Added some feature'`);
|
208
|
+
4. Push to the branch (`git push origin my-new-feature`);
|
209
|
+
5. Create new Pull Request.
|
210
|
+
|
211
|
+
## Copyright
|
212
|
+
|
213
|
+
Copyright (c) 2010-2019 Dmitry Ustalov. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'rubygems/package_task'
|
5
|
+
require 'bundler/gem_tasks'
|
6
|
+
require 'rake/testtask'
|
7
|
+
require 'rdoc/task'
|
8
|
+
|
9
|
+
# The default task depends on the test task.
task default: :test

# Test task: picks up every file matching spec/**/*_spec.rb and runs
# verbosely.
Rake::TestTask.new do |t|
  t.pattern = 'spec/**/*_spec.rb'
  t.verbose = true
end

# Documentation task: renders the README, the change log, the license and
# the library sources as Markdown-flavoured RDoc into doc/rdoc.
RDoc::Task.new do |doc|
  doc.rdoc_dir = 'doc/rdoc'
  doc.main = 'README.md'
  doc.markup = 'markdown'
  doc.rdoc_files.include('README.md', 'CHANGES.md', 'LICENSE.txt', 'lib/**/*.rb')
end
|
data/bin/myaso
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'ostruct'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
# When running from a Git checkout (a `.git` entry sits two levels above this
# file), prefer the checkout's own `lib` directory over an installed gem.
#
# `File.exist?` is used because `File.exists?` was removed in Ruby 3.2.
if File.exist? File.expand_path('../../.git', __FILE__)
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
end
|
10
|
+
|
11
|
+
require 'myaso'
|
12
|
+
|
13
|
+
# Values collected from the command line.
options = OpenStruct.new

optparse = OptionParser.new do |opts|
  opts.banner = 'Usage: %s [options] command' % $PROGRAM_NAME

  opts.separator ''
  opts.separator 'Commands:'
  opts.separator ' tagger: run the HMM tagger'
  opts.separator ' console: start an IRB session'
  opts.separator ''
  opts.separator 'Options:'

  opts.on('-n', '--ngrams ngrams', 'Path to ngrams file for tagger') do |n|
    options.ngrams = n
  end

  opts.on('-l', '--lexicon lexicon', 'Path to lexicon file for tagger') do |l|
    options.lexicon = l
  end

  opts.on '-e', '--eval [code]', 'Evaluate the given line of code' do |e|
    options.eval = e
  end

  opts.on_tail '-h', '--help', 'Just display this help' do
    puts opts
    exit
  end

  opts.on_tail '-v', '--version', 'Just print the version information' do
    puts 'Myaso v%s' % Myaso::VERSION
    puts 'Copyright (c) 2010-2019 Dmitry Ustalov'
    exit
  end
end

# Strips the recognized options from ARGV, leaving the command name first.
optparse.parse!

# NOTE(review): this evaluates arbitrary Ruby supplied on the command line.
# That is the purpose of --eval, but the option must never be fed untrusted
# input.
eval(options.eval, binding, __FILE__, __LINE__) if options.eval

case ARGV.first
when 'tagger'
  # One token per input line; the trailing newline of each line is removed.
  sentence = STDIN.readlines.map(&:chomp)

  STDERR.puts 'Training the tagger, this procedure is not so fast.'
  model = Myaso::Tagger::TnT.new(options.ngrams, options.lexicon)
  tagger = Myaso::Tagger.new(model)
  tags = tagger.annotate(sentence)

  # Output format: token<TAB>tag.
  sentence.zip(tags).each do |word, tag|
    puts "%s\t%s" % [word, tag]
  end
when 'console'
  # Empty ARGV so that IRB does not try to interpret the command name.
  ARGV.clear
  include Myaso
  require 'irb'
  IRB.start
else
  # Unknown or missing command: show the usage and signal failure.
  puts optparse
  exit 1
end
|
data/lib/myaso.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'forwardable'
|
4
|
+
require 'ffi'
|
5
|
+
|
6
|
+
require 'myasorubka'
|
7
|
+
require 'myasorubka/msd/russian'
|
8
|
+
require 'myasorubka/mystem'
|
9
|
+
|
10
|
+
require 'myaso/version'
|
11
|
+
require 'myaso/pi_table'
|
12
|
+
require 'myaso/ngrams'
|
13
|
+
require 'myaso/lexicon'
|
14
|
+
require 'myaso/tagger'
|
15
|
+
require 'myaso/tagger/model'
|
16
|
+
require 'myaso/tagger/tnt'
|
17
|
+
require 'myaso/mystem'
|
18
|
+
require 'myaso/mystem/library'
|
19
|
+
|
20
|
+
# The UnknownWord exception is raised when Tagger considers an unknown
# word.
#
class Myaso::UnknownWord < RuntimeError
  # The word that was passed to the constructor.
  attr_reader :word

  # @private
  def initialize(word)
    @word = word
  end

  # @private
  def to_s
    # The word is wrapped into an Array so that String#% always treats it as
    # a single argument: a bare Array would be spread over the format
    # arguments, and an empty one would raise ArgumentError.
    'unknown word "%s"' % [word]
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# A pretty useful representation of a lexicon in the following form:
# `word_prefix -> word -> tags`.
#
class Myaso::Lexicon
  extend Forwardable
  include Enumerable

  attr_reader :table
  def_delegator :@table, :each, :each

  # An instance of a lexicon is initialized empty. The nested hashes are
  # created on assignment and every missing tag count reads as zero.
  #
  def initialize
    @table = Hash.new do |h, k|
      h[k] = Hash.new { |h_local, k_local| h_local[k_local] = Hash.new(0) }
    end
  end

  # Obtain the count of the specified word and tag. When the tag is
  # omitted, the count stored under the `nil` tag is returned. Unknown
  # words yield zero and leave the table untouched.
  #
  def [](word, tag = nil)
    entry = entry_for(word)
    entry ? entry[tag] : 0
  end

  # Assign the count to the specified word and tag. When the tag is
  # omitted, the count is stored under the `nil` tag. The cached global
  # tags are dropped because they may no longer be accurate.
  #
  def []=(word, tag = nil, count)
    @tags = nil
    table[prefix(word)][word][tag] = count
  end

  # Retrieve global tags or tags of the given word. Without a word, a Hash
  # mapping every tag to its total count is returned. With a word, an Array
  # of the non-nil tags of that word is returned, which is empty for an
  # unknown word. Asking for the tags of an unknown word does not add it to
  # the table.
  #
  def tags(word = nil)
    return lazy_aggregated_tags unless word
    entry = entry_for(word)
    entry ? entry.keys.compact : []
  end

  # Two lexicons are equal iff their tables are equal. An object that does
  # not expose a table is never equal to a lexicon.
  #
  def ==(other)
    other.respond_to?(:table) && table == other.table
  end

  protected

  # Perform lazy initialization of global tags. Counts stored under the
  # `nil` tag are skipped.
  #
  def lazy_aggregated_tags
    @tags ||= table.inject(Hash.new(0)) do |hash, (_, wts)|
      wts.each do |_word, tags|
        tags.each do |tag, count|
          next unless tag
          hash[tag] += count
        end
      end

      hash
    end
  end

  # Find the `tag -> count` hash of the word without triggering the default
  # procs of the table, so that lookups never insert anything. Returns nil
  # when the word is not present.
  #
  def entry_for(word)
    key = prefix(word)
    return unless table.include?(key)
    words = table[key]
    words[word] if words.include?(word)
  end

  # Extract the word prefix of three characters (the whole word when it is
  # shorter).
  #
  def prefix(word)
    word[0..2]
  end
end
|