myasorubka 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,218 @@
1
+ # encoding: utf-8
2
+
3
+ # MSD is a morphosyntactic descriptor model.
4
+ #
5
+ # This representation, with the concrete applications which
6
+ # display and exemplify the attributes and values and provide
7
+ # their internal constraints and relationships, makes the proposal
8
+ # self-explanatory. Other groups can easily test the
9
+ # specifications on their language, simply by following the method of
10
+ # the applications. The possibility of incorporating idiosyncratic
11
+ # classes and distinctions after the common core features makes the
12
+ # proposal relatively adaptable and flexible, without compromising
13
+ # compatibility.
14
+ #
15
+ # MSD implementation and documentation are based on MULTEXT-East
16
+ # Morphosyntactic Specifications, Version 4:
17
+ # http://nl.ijs.si/ME/V4/msd/html/msd.html
18
+ #
19
+ # You may use Myasorubka::MSD both as a parser and as a generator.
20
+ #
21
+ # ```ruby
22
+ # msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
23
+ # msd[:pos] = :noun
24
+ # msd[:type] = :common
25
+ # msd[:number] = :plural
26
+ # msd[:case] = :locative
27
+ # msd.to_s # => "Nc-pl"
28
+ # ```
29
+ #
30
+ # ```ruby
31
+ # msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian, 'Vmps-snpfel')
32
+ # msd[:pos] # => :verb
33
+ # msd[:tense] # => :past
34
+ # msd[:person] # => nil
35
+ # msd.grammemes # => {:type=>:main, :vform=>:participle, ...}
36
+ # ```
37
+ #
38
class Myasorubka::MSD
  # Empty descriptor character. It fills positions of a descriptor
  # string for which no attribute value is set.
  #
  EMPTY_DESCRIPTOR = '-'

  # An exception that is raised when Myasorubka::MSD is unable to
  # operate with the given morphosyntactic descriptor. Mostly
  # this problem is caused by inappropriate language selection
  # or just typos.
  #
  class InvalidDescriptor < RuntimeError; end

  attr_reader :pos, :grammemes, :language

  # Creates a new morphosyntactic descriptor model instance.
  # Please specify a `language` module with defined
  # `CATEGORIES`.
  #
  # Optionally, you can parse an MSD string that is passed as
  # the `msd` argument. A `nil` or empty `msd` leaves the instance blank.
  #
  # @param language [Myasorubka::MSD::Language] a language to use.
  # @param msd [String] a String to initialize new MSD.
  # @raise [ArgumentError] if `language` does not define `CATEGORIES`.
  # @raise [InvalidDescriptor] if `msd` cannot be parsed.
  #
  def initialize(language, msd = '')
    @language, @pos, @grammemes = language, nil, {}

    unless language.const_defined? 'CATEGORIES'
      raise ArgumentError,
            'given language has no morphosyntactic descriptions'
    end

    parse! msd if msd && !msd.empty?
  end

  # Retrieves the morphosyntactic descriptor corresponding
  # to the `key` object. The special key `:pos` returns the part of
  # speech; any other key is looked up in the grammemes. Returns `nil`
  # when nothing is stored under `key`.
  #
  # @param key [Symbol] a key to look at.
  # @return [Symbol, nil] a value of `key`.
  #
  def [](key)
    return pos if :pos == key
    grammemes[key]
  end

  # Assigns the morphosyntactic descriptor given by
  # `value` with the key given by `key` object. The special key
  # `:pos` sets the part of speech instead of a grammeme.
  #
  # @param key [Symbol] a key to be set.
  # @param value [Symbol] a value to be assigned.
  # @return [Symbol] the assigned value.
  #
  def []=(key, value)
    return @pos = value if :pos == key
    grammemes[key] = value
  end

  # @private
  def inspect
    '#<%s msd=%s>' % [language.name, to_s.inspect]
  end

  # Compares string representations of the descriptors.
  #
  # @private
  def <=>(other)
    to_s <=> other.to_s
  end

  # Two descriptors are equal when their string representations match.
  #
  # @private
  def ==(other)
    to_s == other.to_s
  end

  # Generates Regexp from the MSD that is useful to perform
  # database queries. Every empty descriptor position matches any
  # single character, and any suffix is allowed after the known part.
  #
  # ```ruby
  # msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian, 'Vm')
  # r = msd.to_regexp # => /^Vm.*$/
  # 'Vmp' =~ r # 0
  # 'Nc-pl' =~ r # nil
  # ```
  #
  # @return [Regexp] the correspondent regular expression.
  #
  def to_regexp
    Regexp.new([
      '^',
      to_s.gsub(EMPTY_DESCRIPTOR, '.'),
      '.*',
      '$'
    ].join)
  end

  # Merges grammemes that are stored in `hash` into the
  # MSD grammemes. Keys and values are converted with `to_sym`;
  # a `:pos` key sets the part of speech.
  #
  # @param hash [Hash<Symbol, Symbol>] a hash to be processed.
  # @return [MSD] self.
  #
  def merge!(hash)
    hash.each do |key, value|
      self[key.to_sym] = value.to_sym
    end

    self
  end

  # Serializes the descriptor into its positional string form:
  # the category code followed by one value code per attribute, with
  # EMPTY_DESCRIPTOR in every unset position before the last set one.
  # Returns an empty string when no part of speech is set.
  #
  # @raise [InvalidDescriptor] if the part of speech, an attribute
  #   or an attribute value is unknown to the language.
  #
  # @private
  def to_s
    return '' unless pos

    category = language::CATEGORIES[pos]
    raise InvalidDescriptor, 'category is nil' unless category

    msd = [category[:code]]
    attrs = category[:attrs]

    grammemes.each do |attr_name, value|
      next unless value

      attr_index = attrs.index { |name, _| name == attr_name }
      unless attr_index
        raise InvalidDescriptor, "no such attribute: '#{attr_name}' " \
                                 "of category '#{pos}'"
      end

      _, values = attrs[attr_index]

      attr_value = values[value]
      unless attr_value
        raise InvalidDescriptor, "no such value: '#{value}' " \
                                 "for attribute '#{attr_name}' " \
                                 "of category '#{pos}'"
      end

      # Position 0 holds the category code, so attributes are shifted by one.
      msd[attr_index + 1] = attr_value
    end

    # Positions skipped above are nil; render them as empty descriptors.
    msd.map { |e| e || EMPTY_DESCRIPTOR }.join
  end

  # Validates the MSD instance by attempting to serialize it.
  #
  # @return [true, false] validation state of the MSD instance.
  #
  def valid?
    !!to_s
  rescue InvalidDescriptor
    false
  end

  protected

  # Parses the positional string `msd_line` and fills in the part of
  # speech and the grammemes.
  #
  # @param msd_line [String] a descriptor string to parse.
  # @raise [InvalidDescriptor] if the category code is unknown, the line
  #   has more positions than the category has attributes, or a value
  #   code is not defined for its attribute.
  #
  # @private
  def parse!(msd_line)
    # `to_a` is kept because String#chars returns an Enumerator on Ruby 1.9.
    codes = msd_line.chars.to_a

    category_code = codes.shift

    @pos, category = language::CATEGORIES.find do |_, candidate|
      candidate[:code] == category_code
    end

    raise InvalidDescriptor, msd_line unless @pos

    attrs = category[:attrs]

    codes.each_with_index do |value_code, i|
      attr_name, values = attrs[i]
      raise InvalidDescriptor, msd_line unless attr_name

      next if :blank == attr_name
      next if EMPTY_DESCRIPTOR == value_code

      attribute = values.find { |_, code| code == value_code }
      raise InvalidDescriptor, msd_line unless attribute

      self[attr_name] = attribute.first
    end
  end
end
@@ -0,0 +1,263 @@
1
+ # encoding: utf-8
2
+
3
+ # English Specifications by Nancy Ide, Greg Priest-Dorman,
4
+ # Tomaž Erjavec, Tamas Varadi.
5
+ #
6
+ # http://nl.ijs.si/ME/V4/msd/html/msd-en.html
7
+ #
8
+ # This specification was translated into the Ruby language
9
+ # by [Dmitry Ustalov](http://eveel.ru).
10
+ #
11
module Myasorubka::MSD::English
  # Each category below is a Hash with a one-character `:code` and an
  # ordered `:attrs` list of `[attribute_name, { value_name => code }]`
  # pairs. The order of `:attrs` defines the position of every attribute
  # in the MSD string, so it must not be changed.

  # English Noun.
  #
  NOUN = {
    code: 'N',
    attrs: [
      [ :type, {
        common: 'c',
        proper: 'p'
      } ],
      [ :gender, {
        masculine: 'm',
        feminine: 'f',
        neuter: 'n'
      } ],
      [ :number, {
        singular: 's',
        plural: 'p'
      } ]
    ]
  }

  # English Verb.
  #
  VERB = {
    code: 'V',
    attrs: [
      [ :type, {
        main: 'm',
        auxiliary: 'a',
        modal: 'o',
        base: 'b'
      } ],
      [ :vform, {
        indicative: 'i',
        conditional: 'c',
        infinitive: 'n',
        participle: 'p'
      } ],
      [ :tense, {
        present: 'p',
        past: 's'
      } ],
      [ :person, {
        first: '1',
        second: '2',
        third: '3'
      } ],
      [ :number, {
        singular: 's',
        plural: 'p'
      } ]
    ]
  }

  # English Adjective.
  #
  ADJECTIVE = {
    code: 'A',
    attrs: [
      [ :type, {
        qualificative: 'f'
      } ],
      [ :degree, {
        positive: 'p',
        comparative: 'c',
        superlative: 's'
      } ]
    ]
  }

  # English Pronoun.
  #
  PRONOUN = {
    code: 'P',
    attrs: [
      [ :type, {
        personal: 'p',
        possessive: 's',
        interrogative: 'q',
        relative: 'r',
        reflexive: 'x',
        general: 'g',
        ex_there: 't'
      } ],
      [ :person, {
        first: '1',
        second: '2',
        third: '3'
      } ],
      [ :gender, {
        masculine: 'm',
        feminine: 'f',
        neuter: 'n'
      } ],
      [ :number, {
        singular: 's',
        plural: 'p'
      } ],
      [ :case, {
        nominative: 'n',
        accusative: 'a'
      } ],
      [ :owner_number, {
        singular: 's',
        plural: 'p'
      } ],
      [ :owner_gender, {
        masculine: 'm',
        feminine: 'f'
      } ],
      [ :wh_type, {
        relative: 'r',
        question: 'q'
      } ]
    ]
  }

  # English Determiner.
  #
  DETERMINER = {
    code: 'D',
    attrs: [
      [ :type, {
        demonstrative: 'd',
        indefinite: 'i',
        possessive: 's',
        general: 'g'
      } ],
      [ :person, {
        first: '1',
        second: '2',
        third: '3'
      } ],
      [ :number, {
        singular: 's',
        plural: 'p'
      } ],
      [ :owner_number, {
        singular: 's',
        plural: 'p'
      } ],
      [ :owner_gender, {
        masculine: 'm',
        feminine: 'f',
        neuter: 'n'
      } ],
      [ :wh_type, {
        relative: 'r',
        question: 'q'
      } ]
    ]
  }

  # English Adverb.
  #
  ADVERB = {
    code: 'R',
    attrs: [
      [ :type, {
        modifier: 'm',
        specifier: 's'
      } ],
      [ :degree, {
        positive: 'p',
        comparative: 'c',
        superlative: 's'
      } ],
      [ :wh_type, {
        relative: 'r',
        question: 'q'
      } ]
    ]
  }

  # English Adposition.
  #
  ADPOSITION = {
    code: 'S',
    attrs: [
      [ :type, {
        preposition: 'p',
        postposition: 't'
      } ]
    ]
  }

  # English Conjunction.
  #
  CONJUNCTION = {
    code: 'C',
    attrs: [
      [ :type, {
        coordinating: 'c',
        subordinating: 's'
      } ],
      [ :coord_type, {
        initial: 'i',
        non_initial: 'n'
      } ]
    ]
  }

  # English Numeral.
  #
  NUMERAL = {
    code: 'M',
    attrs: [
      [ :type, {
        cardinal: 'c',
        ordinal: 'o'
      } ]
    ]
  }

  # English Interjection. It has no attributes.
  #
  INTERJECTION = {
    code: 'I',
    attrs: []
  }

  # English Abbreviation. It has no attributes.
  #
  ABBREVIATION = {
    code: 'Y',
    attrs: []
  }

  # English Residual. It has no attributes.
  #
  RESIDUAL = {
    code: 'X',
    attrs: []
  }

  # Actual part-of-speech mapping from the part-of-speech name
  # to its category definition.
  #
  CATEGORIES = {
    noun: NOUN,
    verb: VERB,
    adjective: ADJECTIVE,
    pronoun: PRONOUN,
    determiner: DETERMINER,
    adverb: ADVERB,
    adposition: ADPOSITION,
    conjunction: CONJUNCTION,
    numeral: NUMERAL,
    interjection: INTERJECTION,
    abbreviation: ABBREVIATION,
    residual: RESIDUAL
  }
end