myasorubka 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +45 -0
- data/.travis.yml +7 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +23 -0
- data/Rakefile +11 -0
- data/aot-russian +7 -0
- data/lib/myasorubka.rb +4 -0
- data/lib/myasorubka/aot.rb +8 -0
- data/lib/myasorubka/aot/dictionary.rb +125 -0
- data/lib/myasorubka/aot/gramtab.rb +32 -0
- data/lib/myasorubka/aot/tags.rb +326 -0
- data/lib/myasorubka/msd.rb +218 -0
- data/lib/myasorubka/msd/english.rb +263 -0
- data/lib/myasorubka/msd/russian.rb +454 -0
- data/lib/myasorubka/version.rb +9 -0
- data/myasorubka.gemspec +28 -0
- data/spec/msd/russian.tsv +717 -0
- data/spec/msd/russian_spec.rb +24 -0
- data/spec/msd_spec.rb +145 -0
- data/spec/spec_helper.rb +17 -0
- metadata +126 -0
@@ -0,0 +1,218 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# MSD is a morphosyntactic descriptor model.
|
4
|
+
#
|
5
|
+
# This representation, with the concrete applications which
|
6
|
+
# display and exemplify the attributes and values and provide
|
7
|
+
# their internal constraints and relationships, makes the proposal
|
8
|
+
# self-explanatory. Other groups can easily test the
|
9
|
+
# specifications on their language, simply by following the method of
|
10
|
+
# the applications. The possibility of incorporating idiosyncratic
|
11
|
+
# classes and distinctions after the common core features makes the
|
12
|
+
# proposal relatively adaptable and flexible, without compromising
|
13
|
+
# compatibility.
|
14
|
+
#
|
15
|
+
# MSD implementation and documentation are based on MULTEXT-East
|
16
|
+
# Morphosyntactic Specifications, Version 4:
|
17
|
+
# http://nl.ijs.si/ME/V4/msd/html/msd.html
|
18
|
+
#
|
19
|
+
# You may use Myasorubka::MSD both as a parser and as a generator.
|
20
|
+
#
|
21
|
+
# ```ruby
|
22
|
+
# msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
|
23
|
+
# msd[:pos] = :noun
|
24
|
+
# msd[:type] = :common
|
25
|
+
# msd[:number] = :plural
|
26
|
+
# msd[:case] = :locative
|
27
|
+
# msd.to_s # => "Nc-pl"
|
28
|
+
# ```
|
29
|
+
#
|
30
|
+
# ```ruby
|
31
|
+
# msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian, 'Vmps-snpfel')
|
32
|
+
# msd[:pos] # => :verb
|
33
|
+
# msd[:tense] # => :past
|
34
|
+
# msd[:person] # => nil
|
35
|
+
# msd.grammemes # => {:type=>:main, :vform=>:participle, ...}
|
36
|
+
# ```
|
37
|
+
#
|
38
|
+
class Myasorubka::MSD
  # Character that marks an unspecified attribute position in an MSD
  # string.
  #
  EMPTY_DESCRIPTOR = '-'

  # An exception that is raised when Myasorubka::MSD is unable to
  # operate with the given morphosyntactic descriptor. Mostly
  # this problem is caused by inappropriate language selection
  # or just typos.
  #
  class InvalidDescriptor < RuntimeError; end

  attr_reader :pos, :grammemes, :language

  # Creates a new morphosyntactic descriptor model instance.
  # Please specify a `language` module with defined `CATEGORIES`.
  #
  # Optionally, an MSD string passed as the `msd` argument is parsed
  # to populate the part of speech and the grammemes.
  #
  # @param language [Module] a language module that defines `CATEGORIES`.
  # @param msd [String, nil] a String to initialize the new MSD from;
  #   `nil` and the empty String are not parsed.
  # @raise [ArgumentError] if `language` does not define `CATEGORIES`.
  # @raise [InvalidDescriptor] if `msd` cannot be parsed with `language`.
  #
  def initialize(language, msd = '')
    @language, @pos, @grammemes = language, nil, {}

    unless language.const_defined? 'CATEGORIES'
      raise ArgumentError,
        'given language has no morphosyntactic descriptions'
    end

    parse! msd if msd && !msd.empty?
  end

  # Retrieves the morphosyntactic descriptor corresponding
  # to the `key` object. The special key `:pos` returns the part of
  # speech; any other key is looked up among the grammemes.
  #
  # @param key [Symbol] a key to look at.
  # @return [Symbol, nil] a value of `key`, or `nil` if it is not set.
  #
  def [](key)
    return pos if :pos == key
    grammemes[key]
  end

  # Assigns the morphosyntactic descriptor given by
  # `value` to the key given by the `key` object. The special key
  # `:pos` sets the part of speech; any other key is stored among
  # the grammemes. No validation happens here; see #valid?.
  #
  # @param key [Symbol] a key to be set.
  # @param value [Symbol] a value to be assigned.
  # @return [Symbol] the assigned value.
  #
  def []=(key, value)
    return @pos = value if :pos == key
    grammemes[key] = value
  end

  # @private
  def inspect
    '#<%s msd=%s>' % [language.name, to_s.inspect]
  end

  # Orders descriptors by their String representation.
  #
  # @private
  def <=>(other)
    to_s <=> other.to_s
  end

  # Two descriptors are equal when their String representations are.
  #
  # @private
  def ==(other)
    to_s == other.to_s
  end

  # Generates Regexp from the MSD that is useful to perform
  # database queries. Every unspecified position matches any single
  # character, and any suffix is accepted after the known positions.
  #
  # ```ruby
  # msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian, 'Vm')
  # r = msd.to_regexp # => /^Vm.*$/
  # 'Vmp' =~ r # 0
  # 'Nc-pl' =~ r # nil
  # ```
  #
  # @return [Regexp] the correspondent regular expression.
  # @raise [InvalidDescriptor] if the MSD cannot be serialized.
  #
  def to_regexp
    # The `^`/`$` anchors are kept on purpose: they are part of the
    # documented output shown in the example above.
    Regexp.new([
      '^',
      to_s.gsub(EMPTY_DESCRIPTOR, '.'),
      '.*',
      '$'
    ].join)
  end

  # Merges grammemes that are stored in `hash` into the
  # MSD grammemes. Keys and values are converted with `to_sym`; the
  # `:pos` key sets the part of speech.
  #
  # @param hash [Hash<Symbol, Symbol>] a hash to be processed.
  # @return [MSD] self.
  #
  def merge!(hash)
    hash.each do |key, value|
      self[key.to_sym] = value.to_sym
    end

    self
  end

  # Serializes the descriptor: the category code followed by one value
  # code per attribute position, with EMPTY_DESCRIPTOR filling the gaps
  # before the last specified attribute. Returns an empty String when
  # no part of speech is set.
  #
  # @return [String] the MSD string.
  # @raise [InvalidDescriptor] if the part of speech, an attribute or
  #   an attribute value is unknown to the language.
  #
  # @private
  def to_s
    return '' unless pos

    unless category = language::CATEGORIES[pos]
      raise InvalidDescriptor, "category is nil"
    end

    msd = [category[:code]]

    attrs = category[:attrs]
    grammemes.each do |attr_name, value|
      next unless value

      attr_index = attrs.index { |name, _| name == attr_name }
      unless attr_index
        raise InvalidDescriptor, "no such attribute: '#{attr_name}' " \
          "of category '#{pos}'"
      end

      _, values = attrs[attr_index]

      # An attribute defined without a value table cannot hold a value;
      # report it as an invalid descriptor rather than failing on nil.
      unless values && (attr_value = values[value])
        raise InvalidDescriptor, "no such value: '#{value}' " \
          "for attribute '#{attr_name}' " \
          "of category '#{pos}'"
      end

      # Position 0 is the category code, so attributes are shifted by one.
      msd[attr_index + 1] = attr_value
    end

    msd.map { |e| e || EMPTY_DESCRIPTOR }.join
  end

  # Validates the MSD instance by trying to serialize it.
  #
  # @return [true, false] validation state of the MSD instance.
  #
  def valid?
    !!to_s
  rescue InvalidDescriptor
    false
  end

  protected

  # Parses `msd_line` and stores the part of speech and the grammemes
  # it encodes. The first character selects the category; every next
  # character is the value code of the attribute at the same position.
  #
  # @param msd_line [String] an MSD string to parse.
  # @raise [InvalidDescriptor] if the category code is unknown, the
  #   line is longer than the category attribute list, or a value
  #   code is unknown for its attribute.
  #
  # @private
  def parse!(msd_line)
    codes = msd_line.chars.to_a

    category_code = codes.shift

    @pos, category = language::CATEGORIES.find do |_, candidate|
      candidate[:code] == category_code
    end

    raise InvalidDescriptor, msd_line unless @pos

    attrs = category[:attrs]

    codes.each_with_index do |value_code, i|
      attr_name, values = attrs[i]
      raise InvalidDescriptor, msd_line unless attr_name

      # Positions named :blank and positions holding the empty
      # descriptor carry no grammeme.
      next if :blank == attr_name
      next if EMPTY_DESCRIPTOR == value_code

      attribute = values.find { |_, code| code == value_code }
      raise InvalidDescriptor, msd_line unless attribute

      self[attr_name] = attribute.first
    end
  end
end
|
@@ -0,0 +1,263 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# English Specifications by Nancy Ide, Greg Priest-Dorman,
|
4
|
+
# Tomaž Erjavec, Tamas Varadi.
|
5
|
+
#
|
6
|
+
# http://nl.ijs.si/ME/V4/msd/html/msd-en.html
|
7
|
+
#
|
8
|
+
# This specification was translated into the Ruby language
|
9
|
+
# by [Dmitry Ustalov](http://eveel.ru).
|
10
|
+
#
|
11
|
+
module Myasorubka::MSD::English
  # Every category below is a Hash with two entries: `:code`, the
  # one-character category code that starts an MSD string, and
  # `:attrs`, an ordered list of `[attribute name, {value name => value
  # code}]` pairs. The order of `:attrs` is the order of the positions
  # in the MSD string.

  # English Noun.
  #
  NOUN = {
    code: 'N',
    attrs: [
      [:type,   { common: 'c', proper: 'p' }],
      [:gender, { masculine: 'm', feminine: 'f', neuter: 'n' }],
      [:number, { singular: 's', plural: 'p' }]
    ]
  }

  # English Verb.
  #
  VERB = {
    code: 'V',
    attrs: [
      [:type,   { main: 'm', auxiliary: 'a', modal: 'o', base: 'b' }],
      [:vform,  { indicative: 'i', conditional: 'c', infinitive: 'n', participle: 'p' }],
      [:tense,  { present: 'p', past: 's' }],
      [:person, { first: '1', second: '2', third: '3' }],
      [:number, { singular: 's', plural: 'p' }]
    ]
  }

  # English Adjective.
  #
  ADJECTIVE = {
    code: 'A',
    attrs: [
      [:type,   { qualificative: 'f' }],
      [:degree, { positive: 'p', comparative: 'c', superlative: 's' }]
    ]
  }

  # English Pronoun.
  #
  PRONOUN = {
    code: 'P',
    attrs: [
      [:type, {
        personal: 'p', possessive: 's', interrogative: 'q', relative: 'r',
        reflexive: 'x', general: 'g', ex_there: 't'
      }],
      [:person,       { first: '1', second: '2', third: '3' }],
      [:gender,       { masculine: 'm', feminine: 'f', neuter: 'n' }],
      [:number,       { singular: 's', plural: 'p' }],
      [:case,         { nominative: 'n', accusative: 'a' }],
      [:owner_number, { singular: 's', plural: 'p' }],
      [:owner_gender, { masculine: 'm', feminine: 'f' }],
      [:wh_type,      { relative: 'r', question: 'q' }]
    ]
  }

  # English Determiner.
  #
  DETERMINER = {
    code: 'D',
    attrs: [
      [:type,         { demonstrative: 'd', indefinite: 'i', possessive: 's', general: 'g' }],
      [:person,       { first: '1', second: '2', third: '3' }],
      [:number,       { singular: 's', plural: 'p' }],
      [:owner_number, { singular: 's', plural: 'p' }],
      [:owner_gender, { masculine: 'm', feminine: 'f', neuter: 'n' }],
      [:wh_type,      { relative: 'r', question: 'q' }]
    ]
  }

  # English Adverb.
  #
  ADVERB = {
    code: 'R',
    attrs: [
      [:type,    { modifier: 'm', specifier: 's' }],
      [:degree,  { positive: 'p', comparative: 'c', superlative: 's' }],
      [:wh_type, { relative: 'r', question: 'q' }]
    ]
  }

  # English Adposition.
  #
  ADPOSITION = {
    code: 'S',
    attrs: [
      [:type, { preposition: 'p', postposition: 't' }]
    ]
  }

  # English Conjunction.
  #
  CONJUNCTION = {
    code: 'C',
    attrs: [
      [:type,       { coordinating: 'c', subordinating: 's' }],
      [:coord_type, { initial: 'i', non_initial: 'n' }]
    ]
  }

  # English Numeral.
  #
  NUMERAL = {
    code: 'M',
    attrs: [
      [:type, { cardinal: 'c', ordinal: 'o' }]
    ]
  }

  # English Interjection. It has no attributes.
  #
  INTERJECTION = { code: 'I', attrs: [] }

  # English Abbreviation. It has no attributes.
  #
  ABBREVIATION = { code: 'Y', attrs: [] }

  # English Residual. It has no attributes.
  #
  RESIDUAL = { code: 'X', attrs: [] }

  # Actual part-of-speech mapping: part-of-speech name to its
  # category description.
  #
  CATEGORIES = {
    noun:         NOUN,
    verb:         VERB,
    adjective:    ADJECTIVE,
    pronoun:      PRONOUN,
    determiner:   DETERMINER,
    adverb:       ADVERB,
    adposition:   ADPOSITION,
    conjunction:  CONJUNCTION,
    numeral:      NUMERAL,
    interjection: INTERJECTION,
    abbreviation: ABBREVIATION,
    residual:     RESIDUAL
  }
end
|