myasorubka 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f19b580bd580d01ac69b901c274b066f936dfd39
4
+ data.tar.gz: 1d5dde45cbdb11897a595b0dcd42dd2dbd096980
5
+ SHA512:
6
+ metadata.gz: a330c97844b4860cfbdfd4ad5bcc950d67bdbb4ef02489b12afe7e84c16167a384ea6523c05e315bec4007b6070351a331d2d014cf8b2296310a012a11977840
7
+ data.tar.gz: bb98c6a3055464bea25b6ad27a20c66abc02e3c9847d71c3cde993b8a1c9673826633909608bdc81eba25c4645943a416c32f1607382e071a97501da1f398b03
@@ -0,0 +1,45 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## RUBINIUS
17
+ *.rbc
18
+
19
+ ## NETBEANS
20
+ nbproject
21
+
22
+ ## REDCAR
23
+ .redcar
24
+
25
+ ## RVM
26
+ .rvmrc
27
+ .ruby-version
28
+ .ruby-gemset
29
+
30
+ ## RUBINIUS
31
+ .rbx
32
+
33
+ ## BUNDLER
34
+ .bundle
35
+ Gemfile.lock
36
+
37
+ ## YARD
38
+ .yardoc
39
+ doc
40
+
41
+ ## PROJECT::GENERAL
42
+ coverage
43
+ pkg
44
+
45
+ ## PROJECT::SPECIFIC
@@ -0,0 +1,7 @@
1
+ branches:
2
+ only:
3
+ - master
4
+ rvm:
5
+ - 2.0.0
6
+ - jruby-19mode
7
+ - rbx-19mode
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2011-2013 Dmitry Ustalov
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,23 @@
1
+ Myasorubka
2
+ ==========
3
+
4
+ Myasorubka is a morphological data processor that supports
5
+ [AOT](http://aot.ru) and [MULTEXT-East](http://nl.ijs.si/ME/)
6
+ notations.
7
+
8
+ ## Contributing
9
+
10
+ 1. Fork it;
11
+ 2. Create your feature branch (`git checkout -b my-new-feature`);
12
+ 3. Commit your changes (`git commit -am 'Added some feature'`);
13
+ 4. Push to the branch (`git push origin my-new-feature`);
14
+ 5. Create new Pull Request.
15
+
16
+ ## Build Status [<img src="https://secure.travis-ci.org/ustalov/myasorubka.png"/>](http://travis-ci.org/ustalov/myasorubka)
17
+
18
+ ## Dependency Status [<img src="https://gemnasium.com/ustalov/myasorubka.png"/>](https://gemnasium.com/ustalov/myasorubka)
19
+
20
+ ## Copyright
21
+ Copyright (c) 2011-2013 [Dmitry Ustalov]. See LICENSE for details.
22
+
23
+ [Dmitry Ustalov]: http://eveel.ru
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+
3
+ require 'bundler/gem_tasks'
4
+
5
+ require 'rake/testtask'
6
# Collect every spec file under spec/ into a Rake test task.
Rake::TestTask.new do |spec_task|
  spec_task.verbose = true
  spec_task.pattern = 'spec/**/*_spec.rb'
end

# The default task depends on the test task.
task default: :test
@@ -0,0 +1,7 @@
1
+ #!/bin/sh
2
+ rake clean aot \
3
+ mrd=morphs.mrd \
4
+ tab=rgramtab.tab \
5
+ encoding=CP1251 \
6
+ language=russian \
7
+ --trace
@@ -0,0 +1,4 @@
1
+ # encoding: utf-8
2
+
3
+ require 'myasorubka/version'
4
+ require 'myasorubka/msd'
@@ -0,0 +1,8 @@
1
+ # encoding: utf-8
2
+
3
+ # http://aot.ru/
4
+ module Myasorubka::AOT
5
+ require 'myasorubka/aot/dictionary'
6
+ require 'myasorubka/aot/gramtab'
7
+ require 'myasorubka/aot/tags'
8
+ end
@@ -0,0 +1,125 @@
1
+ # encoding: utf-8
2
+
3
+ # MRD file is a text file that contains a morphological dictionary of
4
+ # a natural language. MRD is an abbreviation of "morphological dictionary".
5
+ #
6
+ # All words in MRD file are written in UPPERCASE. One MRD file has the
7
+ # following sections: section of flexion and prefix models, section of
8
+ # accentual models, section of user sessions, section of prefix sets,
9
+ # section of lemmas.
10
+ #
11
+ class Myasorubka::AOT::Dictionary
12
+ attr_reader :lines, :language
13
+ attr_reader :rules_offset, :accents_offset, :logs_offset,
14
+ :prefixes_offset, :lemmas_offset
15
+
16
+ # The parser should be initialized by passing filename and language
17
+ # parameters.
18
+ #
19
+ def initialize(filename, language = nil, ee = nil, ie = Encoding.default_external)
20
+ encoding = { internal_encoding: ie, external_encoding: ee }
21
+ @lines, @language = File.readlines(filename, $/, encoding), language
22
+
23
+ @rules_offset = 0
24
+ @accents_offset = rules_offset + rules.length + 1
25
+ @logs_offset = accents_offset + accents.length + 1
26
+ @prefixes_offset = logs_offset + logs.length + 1
27
+ @lemmas_offset = prefixes_offset + prefixes.length + 1
28
+ end
29
+
30
+ # MRD file section handling helper class.
31
+ #
32
+ # Each section is a set of records, one per line. The number of
33
+ # all records of the section is written in the very beginning of
34
+ # the section ata separate line.
35
+ #
36
+ class Section
37
+ include Enumerable
38
+
39
+ attr_reader :lines, :offset, :length, :parser
40
+
41
+ # :nodoc:
42
+ def initialize(lines, offset, &block)
43
+ @lines, @offset = lines, offset
44
+ @length = lines[offset].strip.to_i
45
+ @parser = block || proc { |s| s }
46
+ end
47
+
48
+ # :nodoc:
49
+ def [] id
50
+ if id < 0 or id >= length
51
+ raise ArgumentError,
52
+ 'invalid id=%d when offset=%d and length=%d' %
53
+ [id, offset, length]
54
+ end
55
+
56
+ parser.call(lines[offset + 1 + id].strip)
57
+ end
58
+
59
+ # :nodoc:
60
+ def each(&block)
61
+ length.times { |id| block.call self[id] }
62
+ end
63
+ end
64
+
65
+ # Rules section accessor.
66
+ #
67
+ def rules
68
+ @rules ||= Section.new(lines, rules_offset) do |line|
69
+ line.split('%').map do |rule_line|
70
+ next unless rule_line && !rule_line.empty?
71
+
72
+ suffix, ancode, prefix = rule_line.split '*'
73
+
74
+ case language
75
+ when :russian then
76
+ suffix &&= suffix.tr 'Ёё', 'Ее'
77
+ prefix &&= prefix.tr 'Ёё', 'Ее'
78
+ end
79
+
80
+ [suffix, ancode[0..1], prefix]
81
+ end.compact
82
+ end
83
+ end
84
+
85
+ # Accents section accessor.
86
+ #
87
+ def accents
88
+ @accents ||= Section.new(lines, accents_offset)
89
+ end
90
+
91
+ # Logs section accessor.
92
+ #
93
+ def logs
94
+ @logs ||= Section.new(lines, logs_offset)
95
+ end
96
+
97
+ # Prefixes section accessor.
98
+ #
99
+ def prefixes
100
+ @prefixes ||= Section.new(lines, prefixes_offset)
101
+ end
102
+
103
+ # Lemmas section accessor.
104
+ #
105
+ def lemmas
106
+ @lemmas ||= Section.new(lines, lemmas_offset) do |line|
107
+ stem, rule_id, accent_id, session_id, ancode, prefix_id = line.split
108
+
109
+ case language
110
+ when :russian then
111
+ stem &&= stem.tr 'Ёё', 'Ее'
112
+ end
113
+
114
+ Array.new.tap do |result|
115
+ result <<
116
+ (stem == '#' ? nil : stem) <<
117
+ rule_id.to_i <<
118
+ accent_id.to_i <<
119
+ session_id.to_i <<
120
+ (ancode == '-' ? nil : ancode[0..1]) <<
121
+ (prefix_id == '-' ? nil : prefix_id.to_i)
122
+ end
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,32 @@
1
+ # encoding: utf-8
2
+
3
+ # Tab file contains all possible full morphological patterns for the
4
+ # words.
5
+ #
6
+ # One line in a Tab file looks as follows:
7
+ #
8
+ # <ancode> <useless_number> <pos> <grammemes>
9
+ #
10
+ # An ancode is an ID, which consists of two letters and which
11
+ # uniquely identifies a morphological pattern. A morphological pattern
12
+ # consists of:
13
+ #
14
+ # <pos> and <grammemes>
15
+ #
16
+ # A MRD file refers to a Tab file, which is language-dependent.
17
+ #
18
class Myasorubka::AOT::Gramtab
  attr_reader :ancodes, :language

  # Reads the Tab file and fills the ancodes table.
  #
  # The file is read by File.readlines with +ee+ and +ie+ as the
  # external and the internal encoding. +language+ is optional and is
  # only stored for the reader of the same name.
  #
  # Every meaningful line is split on whitespace into an ancode, an
  # ignored field, a part of speech and grammemes. The table maps the
  # ancode to { id:, pos:, grammemes: }, where ids number the meaningful
  # lines from zero and missing grammemes become an empty string.
  #
  def initialize(filename, ee = nil, ie = Encoding.default_external, language = nil)
    @ancodes, @language, id = {}, language, -1

    # Keyword options instead of a positional Hash: Ruby 3 does not
    # treat a trailing positional Hash as options any more.
    rows = File.readlines(filename, $/, internal_encoding: ie, external_encoding: ee)

    rows.each do |row|
      # Lines keep their terminator after readlines, so they have to be
      # stripped before blank lines and comments can be recognised.
      row = row.strip
      next if row.empty? || row.start_with?('//')

      ancode, _, pos, grammemes = row.split
      ancodes[ancode] = { id: id += 1, pos: pos, grammemes: grammemes || '' }
    end
  end
end
@@ -0,0 +1,326 @@
1
+ # encoding: utf-8
2
+
3
+ # AOT-to-MSD morphosyntactic descriptors translator.
4
+ #
5
+ class Myasorubka::AOT::Tags
6
+ def self.join(hash, msd)
7
+ msd.grammemes.inject(hash) { |h, (k, v)| h[k.to_s] = v.to_s; h }
8
+ end
9
+
10
+ # Russian language helpers.
11
+ #
12
+ module Russian
13
+ def self.gender(msd, grammemes)
14
+ msd[:gender] = if grammemes.include? 'мр'
15
+ :masculine
16
+ elsif grammemes.include? 'жр'
17
+ :feminine
18
+ elsif grammemes.include? 'ср'
19
+ :neuter
20
+ elsif grammemes.include? 'мр-жр'
21
+ :common
22
+ end
23
+ msd
24
+ end
25
+
26
+ def self.animate(msd, grammemes)
27
+ msd[:animate] = if grammemes.include? 'од'
28
+ :yes
29
+ elsif grammemes.include? 'но'
30
+ :no
31
+ end
32
+ msd
33
+ end
34
+
35
+ def self.number(msd, grammemes)
36
+ msd[:number] = if grammemes.include? 'ед'
37
+ :singular
38
+ elsif grammemes.include? 'мн'
39
+ :plural
40
+ end
41
+ msd
42
+ end
43
+
44
+ def self.case(msd, grammemes)
45
+ msd[:case] = if grammemes.include? 'им'
46
+ :nominative
47
+ elsif grammemes.include? 'рд'
48
+ :genitive
49
+ elsif grammemes.include? 'дт'
50
+ :dative
51
+ elsif grammemes.include? 'вн'
52
+ :accusative
53
+ elsif grammemes.include? 'тв'
54
+ :instrumental
55
+ elsif grammemes.include? 'пр'
56
+ :locative
57
+ elsif grammemes.include? 'зв'
58
+ :vocative
59
+ end
60
+ msd
61
+ end
62
+
63
+ def self.case2(msd, grammemes)
64
+ if grammemes.include? '2'
65
+ msd[:case2] = :partitive if :genitive == msd[:case]
66
+ msd[:case2] = :locative if :locative == msd[:case]
67
+ end
68
+ msd
69
+ end
70
+
71
+ def self.aspect(msd, grammemes)
72
+ msd[:aspect] = if grammemes.include? 'св'
73
+ :perfective
74
+ elsif grammemes.include? 'нс'
75
+ :progressive
76
+ end
77
+ msd
78
+ end
79
+
80
+ def self.voice(msd, grammemes)
81
+ msd[:voice] = if grammemes.include? 'дст'
82
+ :active
83
+ elsif grammemes.include? 'стр'
84
+ :passive
85
+ elsif :verb == msd[:pos]
86
+ :medial
87
+ end
88
+ msd
89
+ end
90
+
91
+ def self.tense(msd, grammemes)
92
+ msd[:tense] = if grammemes.include? 'нст'
93
+ :present
94
+ elsif grammemes.include? 'прш'
95
+ :past
96
+ elsif grammemes.include? 'буд'
97
+ :future
98
+ end
99
+ msd
100
+ end
101
+
102
+ def self.person(msd, grammemes)
103
+ msd[:person] = if grammemes.include? '1л'
104
+ :first
105
+ elsif grammemes.include? '2л'
106
+ :second
107
+ elsif grammemes.include? '3л'
108
+ :third
109
+ elsif grammemes.include? 'безл'
110
+ nil
111
+ end
112
+ msd
113
+ end
114
+
115
+ def self.definiteness(msd, grammemes)
116
+ msd[:definiteness] = if grammemes.include? 'кр'
117
+ :short_art
118
+ end
119
+ msd
120
+ end
121
+
122
+ def self.degree(msd, grammemes)
123
+ msd[:degree] = if grammemes.include? 'сравн'
124
+ :comparative
125
+ elsif grammemes.include? 'прев'
126
+ :superlative
127
+ end
128
+ msd
129
+ end
130
+ end
131
+
132
+ # Russian language.
133
+ #
134
+ def self.russian(pos_line, grammemes_line)
135
+ grammemes = grammemes_line.split(',').map do |grammeme|
136
+ UnicodeUtils.downcase(grammeme)
137
+ end
138
+
139
+ msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
140
+
141
+ if grammemes.include? 'aббр'
142
+ msd[:pos] = :abbreviation
143
+ msd[:syntactic_type] = if 'Н' == pos_line
144
+ :adverbial
145
+ else
146
+ :nominal
147
+ end
148
+ pos_line = 'АББР'
149
+ end
150
+
151
+ case UnicodeUtils.upcase(pos_line)
152
+ when 'С' then begin
153
+ msd[:pos] = :noun
154
+ msd[:type] = if (grammemes & [ 'имя', 'фам', 'отч', 'жарг', 'арх', 'проф', 'опч' ]).empty?
155
+ :common
156
+ else
157
+ :proper
158
+ end
159
+ [ :gender, :number, :case, :animate, :case2 ].each do |attribute|
160
+ Russian.send(attribute, msd, grammemes)
161
+ end
162
+ end
163
+ when 'П' then begin
164
+ msd[:pos] = :adjective
165
+ msd[:type] = if grammemes.include? 'кач'
166
+ :qualificative
167
+ elsif grammemes.include? 'притяж'
168
+ :possessive
169
+ end
170
+ msd[:degree] = :positive
171
+ msd[:definiteness] = :full_art
172
+ [ :degree, :gender, :number, :case, :definiteness ].each do |attribute|
173
+ Russian.send(attribute, msd, grammemes)
174
+ end
175
+ end
176
+ when 'МС' then begin
177
+ msd[:pos] = :pronoun
178
+ msd[:type] = :possessive if grammemes.include? 'притяж'
179
+ [ :person, :gender, :number, :case, :animate ].each do |attribute|
180
+ Russian.send(attribute, msd, grammemes)
181
+ end
182
+ end
183
+ when 'МС-ПРЕДК' then begin
184
+ msd[:pos] = :pronoun
185
+ msd[:type] = :possessive if grammemes.include? 'притяж'
186
+ msd[:case] = :genitive
187
+ [ :person, :gender, :number, :animate ].each do |attribute|
188
+ Russian.send(attribute, msd, grammemes)
189
+ end
190
+ end
191
+ when 'МС-П' then begin
192
+ msd[:pos] = :pronoun
193
+ msd[:type] = :possessive if grammemes.include? 'притяж'
194
+ msd[:syntactic_type] = :adjectival
195
+ [ :person, :gender, :number, :case, :animate ].each do |attribute|
196
+ Russian.send(attribute, msd, grammemes)
197
+ end
198
+ end
199
+ when 'Г' then begin
200
+ msd[:pos] = :verb
201
+ msd[:type] = :main
202
+ msd[:vform] = if grammemes.include? 'пвл'
203
+ :imperative
204
+ end
205
+ msd[:definiteness] = :full_art
206
+ [ :tense, :person, :number, :gender, :voice, :definiteness,
207
+ :aspect, :case ].each do |attribute|
208
+ Russian.send(attribute, msd, grammemes)
209
+ end
210
+ end
211
+ when 'ПРИЧАСТИЕ' then begin
212
+ msd[:pos] = :verb
213
+ msd[:vform] = :participle
214
+ msd[:definiteness] = :full_art
215
+ [ :tense, :person, :number, :gender, :voice, :definiteness,
216
+ :aspect, :case ].each do |attribute|
217
+ Russian.send(attribute, msd, grammemes)
218
+ end
219
+ end
220
+ when 'ДЕЕПРИЧАСТИЕ' then begin
221
+ msd[:pos] = :verb
222
+ msd[:vform] = :gerund
223
+ msd[:definiteness] = :full_art
224
+ [ :tense, :person, :number, :gender, :voice, :definiteness,
225
+ :aspect, :case ].each do |attribute|
226
+ Russian.send(attribute, msd, grammemes)
227
+ end
228
+ end
229
+ when 'ИНФИНИТИВ' then begin
230
+ msd[:pos] = :verb
231
+ msd[:vform] = :infinitive
232
+ msd[:definiteness] = :full_art
233
+ [ :tense, :person, :number, :gender, :voice, :definiteness,
234
+ :aspect, :case ].each do |attribute|
235
+ Russian.send(attribute, msd, grammemes)
236
+ end
237
+ end
238
+ when 'ЧИСЛ' then begin
239
+ msd[:pos] = :numeral
240
+ msd[:type] = :cardinal
241
+ [ :gender, :number, :case, :animate ].each do |attribute|
242
+ Russian.send(attribute, msd, grammemes)
243
+ end
244
+ end
245
+ when 'ЧИСЛ-П' then begin
246
+ msd[:pos] = :numeral
247
+ msd[:type] = :ordinal
248
+ [ :gender, :number, :case, :animate ].each do |attribute|
249
+ Russian.send(attribute, msd, grammemes)
250
+ end
251
+ end
252
+ when 'Н' then begin
253
+ msd[:pos] = :adverb
254
+ msd[:degree] = :positive
255
+ [ :degree ].each do |attribute|
256
+ Russian.send(attribute, msd, grammemes)
257
+ end
258
+ end
259
+ when 'ПРЕДК' then begin
260
+ msd[:pos] = :adverb
261
+ msd[:degree] = :positive
262
+ [ :degree ].each do |attribute|
263
+ Russian.send(attribute, msd, grammemes)
264
+ end
265
+ end
266
+ when 'ПРЕДЛ' then begin
267
+ msd[:pos] = :adposition
268
+ msd[:type] = :preposition
269
+ [ :case ].each do |attribute|
270
+ Russian.send(attribute, msd, grammemes)
271
+ end
272
+ end
273
+ when 'СОЮЗ' then begin
274
+ msd[:pos] = :conjunction
275
+ end
276
+ when 'МЕЖД' then begin
277
+ msd[:pos] = :interjection
278
+ end
279
+ when 'ЧАСТ' then begin
280
+ msd[:pos] = :particle
281
+ end
282
+ when 'ВВОДН' then begin
283
+ msd[:pos] = :adposition
284
+ end
285
+ when 'КР_ПРИЛ' then begin
286
+ msd[:pos] = :adjective
287
+ msd[:type] = if grammemes.include? 'кач'
288
+ :qualificative
289
+ elsif grammemes.include? 'притяж'
290
+ :possessive
291
+ end
292
+ msd[:degree] = :positive
293
+ [ :degree, :gender, :number, :case ].each do |attribute|
294
+ Russian.send(attribute, msd, grammemes)
295
+ end
296
+ msd[:definiteness] = :short_art
297
+ end
298
+ when 'КР_ПРИЧАСТИЕ' then begin
299
+ msd[:pos] = :verb
300
+ msd[:vform] = :participle
301
+ [ :tense, :person, :number, :gender, :voice,
302
+ :aspect, :case ].each do |attribute|
303
+ Russian.send(attribute, msd, grammemes)
304
+ end
305
+ msd[:definiteness] = :short_art
306
+ end
307
+ when 'АББР' then begin
308
+ [ :gender, :number, :case ].each do |attribute|
309
+ Russian.send(attribute, msd, grammemes)
310
+ end
311
+ end
312
+ when '*' then begin
313
+ msd[:pos] = :crutch
314
+ [ :gender, :animate, :number, :case, :case2, :aspect,
315
+ :voice, :tense, :person, :definiteness,
316
+ :degree ].each do |attribute|
317
+ Russian.send(attribute, msd, grammemes)
318
+ end
319
+ end
320
+ else
321
+ msd[:pos] = :residual
322
+ end
323
+
324
+ msd
325
+ end
326
+ end