myasorubka 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f19b580bd580d01ac69b901c274b066f936dfd39
4
+ data.tar.gz: 1d5dde45cbdb11897a595b0dcd42dd2dbd096980
5
+ SHA512:
6
+ metadata.gz: a330c97844b4860cfbdfd4ad5bcc950d67bdbb4ef02489b12afe7e84c16167a384ea6523c05e315bec4007b6070351a331d2d014cf8b2296310a012a11977840
7
+ data.tar.gz: bb98c6a3055464bea25b6ad27a20c66abc02e3c9847d71c3cde993b8a1c9673826633909608bdc81eba25c4645943a416c32f1607382e071a97501da1f398b03
@@ -0,0 +1,45 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## RUBINIUS
17
+ *.rbc
18
+
19
+ ## NETBEANS
20
+ nbproject
21
+
22
+ ## REDCAR
23
+ .redcar
24
+
25
+ ## RVM
26
+ .rvmrc
27
+ .ruby-version
28
+ .ruby-gemset
29
+
30
+ ## RUBINIUS
31
+ .rbx
32
+
33
+ ## BUNDLER
34
+ .bundle
35
+ Gemfile.lock
36
+
37
+ ## YARD
38
+ .yardoc
39
+ doc
40
+
41
+ ## PROJECT::GENERAL
42
+ coverage
43
+ pkg
44
+
45
+ ## PROJECT::SPECIFIC
@@ -0,0 +1,7 @@
1
+ branches:
2
+ only:
3
+ - master
4
+ rvm:
5
+ - 2.0.0
6
+ - jruby-19mode
7
+ - rbx-19mode
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2011-2013 Dmitry Ustalov
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,23 @@
1
+ Myasorubka
2
+ ==========
3
+
4
+ Myasorubka is a morphological data processor that supports
5
+ [AOT](http://aot.ru) and [MULTEXT-East](http://nl.ijs.si/ME/)
6
+ notations.
7
+
8
+ ## Contributing
9
+
10
+ 1. Fork it;
11
+ 2. Create your feature branch (`git checkout -b my-new-feature`);
12
+ 3. Commit your changes (`git commit -am 'Added some feature'`);
13
+ 4. Push to the branch (`git push origin my-new-feature`);
14
+ 5. Create a new Pull Request.
15
+
16
+ ## Build Status [<img src="https://secure.travis-ci.org/ustalov/myasorubka.png"/>](http://travis-ci.org/ustalov/myasorubka)
17
+
18
+ ## Dependency Status [<img src="https://gemnasium.com/ustalov/myasorubka.png"/>](https://gemnasium.com/ustalov/myasorubka)
19
+
20
+ ## Copyright
21
+ Copyright (c) 2011-2013 [Dmitry Ustalov]. See LICENSE for details.
22
+
23
+ [Dmitry Ustalov]: http://eveel.ru
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+
3
+ require 'bundler/gem_tasks'
4
+
5
+ require 'rake/testtask'
6
+ Rake::TestTask.new do |test|
7
+ test.pattern = 'spec/**/*_spec.rb'
8
+ test.verbose = true
9
+ end
10
+
11
+ task :default => :test
@@ -0,0 +1,7 @@
1
+ #!/bin/sh
2
+ rake clean aot \
3
+ mrd=morphs.mrd \
4
+ tab=rgramtab.tab \
5
+ encoding=CP1251 \
6
+ language=russian \
7
+ --trace
@@ -0,0 +1,4 @@
1
+ # encoding: utf-8
2
+
3
+ require 'myasorubka/version'
4
+ require 'myasorubka/msd'
@@ -0,0 +1,8 @@
1
+ # encoding: utf-8
2
+
3
+ # http://aot.ru/
4
+ module Myasorubka::AOT
5
+ require 'myasorubka/aot/dictionary'
6
+ require 'myasorubka/aot/gramtab'
7
+ require 'myasorubka/aot/tags'
8
+ end
@@ -0,0 +1,125 @@
1
+ # encoding: utf-8
2
+
3
+ # MRD file is a text file that contains a morphological dictionary of
4
+ # a natural language. MRD is an abbreviation of "morphological dictionary".
5
+ #
6
+ # All words in MRD file are written in UPPERCASE. One MRD file has the
7
+ # following sections: section of flexion and prefix models, section of
8
+ # accentual models, section of user sessions, section of prefix sets,
9
+ # section of lemmas.
10
+ #
11
+ class Myasorubka::AOT::Dictionary
12
+ attr_reader :lines, :language
13
+ attr_reader :rules_offset, :accents_offset, :logs_offset,
14
+ :prefixes_offset, :lemmas_offset
15
+
16
+ # The parser should be initialized by passing filename and language
17
+ # parameters.
18
+ #
19
+ def initialize(filename, language = nil, ee = nil, ie = Encoding.default_external)
20
+ encoding = { internal_encoding: ie, external_encoding: ee }
21
+ @lines, @language = File.readlines(filename, $/, encoding), language
22
+
23
+ @rules_offset = 0
24
+ @accents_offset = rules_offset + rules.length + 1
25
+ @logs_offset = accents_offset + accents.length + 1
26
+ @prefixes_offset = logs_offset + logs.length + 1
27
+ @lemmas_offset = prefixes_offset + prefixes.length + 1
28
+ end
29
+
30
+ # MRD file section handling helper class.
31
+ #
32
+ # Each section is a set of records, one per line. The number of
33
+ # all records of the section is written in the very beginning of
34
+   # the section on a separate line.
35
+ #
36
+ class Section
37
+ include Enumerable
38
+
39
+ attr_reader :lines, :offset, :length, :parser
40
+
41
+ # :nodoc:
42
+ def initialize(lines, offset, &block)
43
+ @lines, @offset = lines, offset
44
+ @length = lines[offset].strip.to_i
45
+ @parser = block || proc { |s| s }
46
+ end
47
+
48
+ # :nodoc:
49
+ def [] id
50
+ if id < 0 or id >= length
51
+ raise ArgumentError,
52
+ 'invalid id=%d when offset=%d and length=%d' %
53
+ [id, offset, length]
54
+ end
55
+
56
+ parser.call(lines[offset + 1 + id].strip)
57
+ end
58
+
59
+ # :nodoc:
60
+ def each(&block)
61
+ length.times { |id| block.call self[id] }
62
+ end
63
+ end
64
+
65
+ # Rules section accessor.
66
+ #
67
+ def rules
68
+ @rules ||= Section.new(lines, rules_offset) do |line|
69
+ line.split('%').map do |rule_line|
70
+ next unless rule_line && !rule_line.empty?
71
+
72
+ suffix, ancode, prefix = rule_line.split '*'
73
+
74
+ case language
75
+ when :russian then
76
+ suffix &&= suffix.tr 'Ёё', 'Ее'
77
+ prefix &&= prefix.tr 'Ёё', 'Ее'
78
+ end
79
+
80
+ [suffix, ancode[0..1], prefix]
81
+ end.compact
82
+ end
83
+ end
84
+
85
+ # Accents section accessor.
86
+ #
87
+ def accents
88
+ @accents ||= Section.new(lines, accents_offset)
89
+ end
90
+
91
+ # Logs section accessor.
92
+ #
93
+ def logs
94
+ @logs ||= Section.new(lines, logs_offset)
95
+ end
96
+
97
+ # Prefixes section accessor.
98
+ #
99
+ def prefixes
100
+ @prefixes ||= Section.new(lines, prefixes_offset)
101
+ end
102
+
103
+ # Lemmas section accessor.
104
+ #
105
+ def lemmas
106
+ @lemmas ||= Section.new(lines, lemmas_offset) do |line|
107
+ stem, rule_id, accent_id, session_id, ancode, prefix_id = line.split
108
+
109
+ case language
110
+ when :russian then
111
+ stem &&= stem.tr 'Ёё', 'Ее'
112
+ end
113
+
114
+ Array.new.tap do |result|
115
+ result <<
116
+ (stem == '#' ? nil : stem) <<
117
+ rule_id.to_i <<
118
+ accent_id.to_i <<
119
+ session_id.to_i <<
120
+ (ancode == '-' ? nil : ancode[0..1]) <<
121
+ (prefix_id == '-' ? nil : prefix_id.to_i)
122
+ end
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,32 @@
1
+ # encoding: utf-8
2
+
3
+ # Tab file contains all possible full morphological patterns for the
4
+ # words.
5
+ #
6
+ # One line in a Tab file looks as follows:
7
+ #
8
+ # <ancode> <useless_number> <pos> <grammemes>
9
+ #
10
+ # An ancode is an ID, which consists of two letters and which
11
+ # uniquely identifies a morphological pattern. A morphological pattern
12
+ # consists of:
13
+ #
14
+ # <pos> and <grammemes>
15
+ #
16
+ # A MRD file refers to a Tab file, which is language-dependent.
17
+ #
18
+ class Myasorubka::AOT::Gramtab
19
+ attr_reader :ancodes, :language
20
+
21
+ # :nodoc:
22
+ def initialize(filename, ee = nil, ie = Encoding.default_external)
23
+ @ancodes, @language, id = {}, language, -1
24
+ encoding = { internal_encoding: ie, external_encoding: ee }
25
+
26
+ File.readlines(filename, $/, encoding).each do |line, i|
27
+ next if line.empty? or line.start_with?('//')
28
+ ancode, _, pos, grammemes = line.split
29
+ ancodes[ancode] = { id: id += 1, pos: pos, grammemes: grammemes || '' }
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,326 @@
1
+ # encoding: utf-8
2
+
3
+ # AOT-to-MSD morphosyntactic descriptors translator.
4
+ #
5
+ class Myasorubka::AOT::Tags
6
+ def self.join(hash, msd)
7
+ msd.grammemes.inject(hash) { |h, (k, v)| h[k.to_s] = v.to_s; h }
8
+ end
9
+
10
+ # Russian language helpers.
11
+ #
12
+ module Russian
13
+ def self.gender(msd, grammemes)
14
+ msd[:gender] = if grammemes.include? 'мр'
15
+ :masculine
16
+ elsif grammemes.include? 'жр'
17
+ :feminine
18
+ elsif grammemes.include? 'ср'
19
+ :neuter
20
+ elsif grammemes.include? 'мр-жр'
21
+ :common
22
+ end
23
+ msd
24
+ end
25
+
26
+ def self.animate(msd, grammemes)
27
+ msd[:animate] = if grammemes.include? 'од'
28
+ :yes
29
+ elsif grammemes.include? 'но'
30
+ :no
31
+ end
32
+ msd
33
+ end
34
+
35
+ def self.number(msd, grammemes)
36
+ msd[:number] = if grammemes.include? 'ед'
37
+ :singular
38
+ elsif grammemes.include? 'мн'
39
+ :plural
40
+ end
41
+ msd
42
+ end
43
+
44
+ def self.case(msd, grammemes)
45
+ msd[:case] = if grammemes.include? 'им'
46
+ :nominative
47
+ elsif grammemes.include? 'рд'
48
+ :genitive
49
+ elsif grammemes.include? 'дт'
50
+ :dative
51
+ elsif grammemes.include? 'вн'
52
+ :accusative
53
+ elsif grammemes.include? 'тв'
54
+ :instrumental
55
+ elsif grammemes.include? 'пр'
56
+ :locative
57
+ elsif grammemes.include? 'зв'
58
+ :vocative
59
+ end
60
+ msd
61
+ end
62
+
63
+ def self.case2(msd, grammemes)
64
+ if grammemes.include? '2'
65
+ msd[:case2] = :partitive if :genitive == msd[:case]
66
+ msd[:case2] = :locative if :locative == msd[:case]
67
+ end
68
+ msd
69
+ end
70
+
71
+ def self.aspect(msd, grammemes)
72
+ msd[:aspect] = if grammemes.include? 'св'
73
+ :perfective
74
+ elsif grammemes.include? 'нс'
75
+ :progressive
76
+ end
77
+ msd
78
+ end
79
+
80
+ def self.voice(msd, grammemes)
81
+ msd[:voice] = if grammemes.include? 'дст'
82
+ :active
83
+ elsif grammemes.include? 'стр'
84
+ :passive
85
+ elsif :verb == msd[:pos]
86
+ :medial
87
+ end
88
+ msd
89
+ end
90
+
91
+ def self.tense(msd, grammemes)
92
+ msd[:tense] = if grammemes.include? 'нст'
93
+ :present
94
+ elsif grammemes.include? 'прш'
95
+ :past
96
+ elsif grammemes.include? 'буд'
97
+ :future
98
+ end
99
+ msd
100
+ end
101
+
102
+ def self.person(msd, grammemes)
103
+ msd[:person] = if grammemes.include? '1л'
104
+ :first
105
+ elsif grammemes.include? '2л'
106
+ :second
107
+ elsif grammemes.include? '3л'
108
+ :third
109
+ elsif grammemes.include? 'безл'
110
+ nil
111
+ end
112
+ msd
113
+ end
114
+
115
+ def self.definiteness(msd, grammemes)
116
+ msd[:definiteness] = if grammemes.include? 'кр'
117
+ :short_art
118
+ end
119
+ msd
120
+ end
121
+
122
+ def self.degree(msd, grammemes)
123
+ msd[:degree] = if grammemes.include? 'сравн'
124
+ :comparative
125
+ elsif grammemes.include? 'прев'
126
+ :superlative
127
+ end
128
+ msd
129
+ end
130
+ end
131
+
132
+ # Russian language.
133
+ #
134
+ def self.russian(pos_line, grammemes_line)
135
+ grammemes = grammemes_line.split(',').map do |grammeme|
136
+ UnicodeUtils.downcase(grammeme)
137
+ end
138
+
139
+ msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
140
+
141
+ if grammemes.include? 'aббр'
142
+ msd[:pos] = :abbreviation
143
+ msd[:syntactic_type] = if 'Н' == pos_line
144
+ :adverbial
145
+ else
146
+ :nominal
147
+ end
148
+ pos_line = 'АББР'
149
+ end
150
+
151
+ case UnicodeUtils.upcase(pos_line)
152
+ when 'С' then begin
153
+ msd[:pos] = :noun
154
+ msd[:type] = if (grammemes & [ 'имя', 'фам', 'отч', 'жарг', 'арх', 'проф', 'опч' ]).empty?
155
+ :common
156
+ else
157
+ :proper
158
+ end
159
+ [ :gender, :number, :case, :animate, :case2 ].each do |attribute|
160
+ Russian.send(attribute, msd, grammemes)
161
+ end
162
+ end
163
+ when 'П' then begin
164
+ msd[:pos] = :adjective
165
+ msd[:type] = if grammemes.include? 'кач'
166
+ :qualificative
167
+ elsif grammemes.include? 'притяж'
168
+ :possessive
169
+ end
170
+ msd[:degree] = :positive
171
+ msd[:definiteness] = :full_art
172
+ [ :degree, :gender, :number, :case, :definiteness ].each do |attribute|
173
+ Russian.send(attribute, msd, grammemes)
174
+ end
175
+ end
176
+ when 'МС' then begin
177
+ msd[:pos] = :pronoun
178
+ msd[:type] = :possessive if grammemes.include? 'притяж'
179
+ [ :person, :gender, :number, :case, :animate ].each do |attribute|
180
+ Russian.send(attribute, msd, grammemes)
181
+ end
182
+ end
183
+ when 'МС-ПРЕДК' then begin
184
+ msd[:pos] = :pronoun
185
+ msd[:type] = :possessive if grammemes.include? 'притяж'
186
+ msd[:case] = :genitive
187
+ [ :person, :gender, :number, :animate ].each do |attribute|
188
+ Russian.send(attribute, msd, grammemes)
189
+ end
190
+ end
191
+ when 'МС-П' then begin
192
+ msd[:pos] = :pronoun
193
+ msd[:type] = :possessive if grammemes.include? 'притяж'
194
+ msd[:syntactic_type] = :adjectival
195
+ [ :person, :gender, :number, :case, :animate ].each do |attribute|
196
+ Russian.send(attribute, msd, grammemes)
197
+ end
198
+ end
199
+ when 'Г' then begin
200
+ msd[:pos] = :verb
201
+ msd[:type] = :main
202
+ msd[:vform] = if grammemes.include? 'пвл'
203
+ :imperative
204
+ end
205
+ msd[:definiteness] = :full_art
206
+ [ :tense, :person, :number, :gender, :voice, :definiteness,
207
+ :aspect, :case ].each do |attribute|
208
+ Russian.send(attribute, msd, grammemes)
209
+ end
210
+ end
211
+ when 'ПРИЧАСТИЕ' then begin
212
+ msd[:pos] = :verb
213
+ msd[:vform] = :participle
214
+ msd[:definiteness] = :full_art
215
+ [ :tense, :person, :number, :gender, :voice, :definiteness,
216
+ :aspect, :case ].each do |attribute|
217
+ Russian.send(attribute, msd, grammemes)
218
+ end
219
+ end
220
+ when 'ДЕЕПРИЧАСТИЕ' then begin
221
+ msd[:pos] = :verb
222
+ msd[:vform] = :gerund
223
+ msd[:definiteness] = :full_art
224
+ [ :tense, :person, :number, :gender, :voice, :definiteness,
225
+ :aspect, :case ].each do |attribute|
226
+ Russian.send(attribute, msd, grammemes)
227
+ end
228
+ end
229
+ when 'ИНФИНИТИВ' then begin
230
+ msd[:pos] = :verb
231
+ msd[:vform] = :infinitive
232
+ msd[:definiteness] = :full_art
233
+ [ :tense, :person, :number, :gender, :voice, :definiteness,
234
+ :aspect, :case ].each do |attribute|
235
+ Russian.send(attribute, msd, grammemes)
236
+ end
237
+ end
238
+ when 'ЧИСЛ' then begin
239
+ msd[:pos] = :numeral
240
+ msd[:type] = :cardinal
241
+ [ :gender, :number, :case, :animate ].each do |attribute|
242
+ Russian.send(attribute, msd, grammemes)
243
+ end
244
+ end
245
+ when 'ЧИСЛ-П' then begin
246
+ msd[:pos] = :numeral
247
+ msd[:type] = :ordinal
248
+ [ :gender, :number, :case, :animate ].each do |attribute|
249
+ Russian.send(attribute, msd, grammemes)
250
+ end
251
+ end
252
+ when 'Н' then begin
253
+ msd[:pos] = :adverb
254
+ msd[:degree] = :positive
255
+ [ :degree ].each do |attribute|
256
+ Russian.send(attribute, msd, grammemes)
257
+ end
258
+ end
259
+ when 'ПРЕДК' then begin
260
+ msd[:pos] = :adverb
261
+ msd[:degree] = :positive
262
+ [ :degree ].each do |attribute|
263
+ Russian.send(attribute, msd, grammemes)
264
+ end
265
+ end
266
+ when 'ПРЕДЛ' then begin
267
+ msd[:pos] = :adposition
268
+ msd[:type] = :preposition
269
+ [ :case ].each do |attribute|
270
+ Russian.send(attribute, msd, grammemes)
271
+ end
272
+ end
273
+ when 'СОЮЗ' then begin
274
+ msd[:pos] = :conjunction
275
+ end
276
+ when 'МЕЖД' then begin
277
+ msd[:pos] = :interjection
278
+ end
279
+ when 'ЧАСТ' then begin
280
+ msd[:pos] = :particle
281
+ end
282
+ when 'ВВОДН' then begin
283
+ msd[:pos] = :adposition
284
+ end
285
+ when 'КР_ПРИЛ' then begin
286
+ msd[:pos] = :adjective
287
+ msd[:type] = if grammemes.include? 'кач'
288
+ :qualificative
289
+ elsif grammemes.include? 'притяж'
290
+ :possessive
291
+ end
292
+ msd[:degree] = :positive
293
+ [ :degree, :gender, :number, :case ].each do |attribute|
294
+ Russian.send(attribute, msd, grammemes)
295
+ end
296
+ msd[:definiteness] = :short_art
297
+ end
298
+ when 'КР_ПРИЧАСТИЕ' then begin
299
+ msd[:pos] = :verb
300
+ msd[:vform] = :participle
301
+ [ :tense, :person, :number, :gender, :voice,
302
+ :aspect, :case ].each do |attribute|
303
+ Russian.send(attribute, msd, grammemes)
304
+ end
305
+ msd[:definiteness] = :short_art
306
+ end
307
+ when 'АББР' then begin
308
+ [ :gender, :number, :case ].each do |attribute|
309
+ Russian.send(attribute, msd, grammemes)
310
+ end
311
+ end
312
+ when '*' then begin
313
+ msd[:pos] = :crutch
314
+ [ :gender, :animate, :number, :case, :case2, :aspect,
315
+ :voice, :tense, :person, :definiteness,
316
+ :degree ].each do |attribute|
317
+ Russian.send(attribute, msd, grammemes)
318
+ end
319
+ end
320
+ else
321
+ msd[:pos] = :residual
322
+ end
323
+
324
+ msd
325
+ end
326
+ end