myasorubka 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,218 @@
1
+ # encoding: utf-8
2
+
3
+ # MSD is a morphosyntactic descriptor model.
4
+ #
5
+ # This representation, with the concrete applications which
6
+ # display and exemplify the attributes and values and provide
7
+ # their internal constraints and relationships, makes the proposal
8
+ # self-explanatory. Other groups can easily test the
9
+ # specifications on their language, simply by following the method of
10
+ # the applications. The possibility of incorporating idiosyncratic
11
+ # classes and distinctions after the common core features makes the
12
+ # proposal relatively adaptable and flexible, without compromising
13
+ # compatibility.
14
+ #
15
+ # MSD implementation and documentation are based on MULTEXT-East
16
+ # Morphosyntactic Specifications, Version 4:
17
+ # http://nl.ijs.si/ME/V4/msd/html/msd.html
18
+ #
19
+ # You may use Myasorubka::MSD both as a parser and as a generator.
20
+ #
21
+ # ```ruby
22
+ # msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
23
+ # msd[:pos] = :noun
24
+ # msd[:type] = :common
25
+ # msd[:number] = :plural
26
+ # msd[:case] = :locative
27
+ # msd.to_s # => "Nc-pl"
28
+ # ```
29
+ #
30
+ # ```ruby
31
+ # msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian, 'Vmps-snpfel')
32
+ # msd[:pos] # => :verb
33
+ # msd[:tense] # => :past
34
+ # msd[:person] # => nil
35
+ # msd.grammemes # => {:type=>:main, :vform=>:participle, ...}
36
+ # ```
37
+ #
38
+ class Myasorubka::MSD
39
+ # Empty descriptor character.
40
+ #
41
+ EMPTY_DESCRIPTOR = '-'
42
+
43
+ # An exception that is raised when Myasorubka::MSD is unable to
44
+ # operate on the given morphosyntactic descriptor. Mostly
45
+ # this problem is caused by inappropriate language selection
46
+ # or just typos.
47
+ #
48
+ class InvalidDescriptor < RuntimeError; end
49
+
50
+ attr_reader :pos, :grammemes, :language
51
+
52
+ # Creates a new morphosyntactic descriptor model instance.
53
+ # Please specify a `language` module with defined
54
+ # `CATEGORIES`.
55
+ #
56
+ # Optionally, you can parse an MSD string that is passed as
57
+ # the `msd` argument.
58
+ #
59
+ # @param language [Myasorubka::MSD::Language] a language to use.
60
+ # @param msd [String] a String to initialize new MSD.
61
+ #
62
+ def initialize(language, msd = '')
63
+ @language, @pos, @grammemes = language, nil, {}
64
+
65
+ unless language.const_defined? 'CATEGORIES'
66
+ raise ArgumentError,
67
+ 'given language has no morphosyntactic descriptions'
68
+ end
69
+
70
+ parse! msd if msd && !msd.empty?
71
+ end
72
+
73
+ # Retrieves the morphosyntactic descriptor corresponding
74
+ # to the `key` object. If no such descriptor is set, returns `nil`.
75
+ #
76
+ # @param key [Symbol] a key to look at.
77
+ # @return [Symbol] a value of `key`.
78
+ #
79
+ def [] key
80
+ return pos if :pos == key
81
+ grammemes[key]
82
+ end
83
+
84
+ # Assigns the morphosyntactic descriptor given by
85
+ # `value` with the key given by `key` object.
86
+ #
87
+ # @param key [Symbol] a key to be set.
88
+ # @param value [Symbol] a value to be assigned.
89
+ # @return [Symbol] the assigned value.
90
+ #
91
+ def []= key, value
92
+ return @pos = value if :pos == key
93
+ grammemes[key] = value
94
+ end
95
+
96
+ # @private
97
+ def inspect
98
+ '#<%s msd=%s>' % [language.name, to_s.inspect]
99
+ end
100
+
101
+ # @private
102
+ def <=> other
103
+ to_s <=> other.to_s
104
+ end
105
+
106
+ # @private
107
+ def == other
108
+ to_s == other.to_s
109
+ end
110
+
111
+ # Generates a Regexp from the MSD that is useful for performing
112
+ # database queries.
113
+ #
114
+ # ```ruby
115
+ # msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian, 'Vm')
116
+ # r = msd.to_regexp # => /^Vm.*$/
117
+ # 'Vmp' =~ r # => 0
118
+ # 'Nc-pl' =~ r # => nil
119
+ # ```
120
+ #
121
+ # @return [Regexp] the corresponding regular expression.
122
+ #
123
+ def to_regexp
124
+ Regexp.new([
125
+ '^',
126
+ self.to_s.gsub(EMPTY_DESCRIPTOR, '.'),
127
+ '.*',
128
+ '$'
129
+ ].join)
130
+ end
131
+
132
+ # Merges grammemes that are stored in `hash` into the
133
+ # MSD grammemes.
134
+ #
135
+ # @param hash [Hash<Symbol, Symbol>] a hash to be processed.
136
+ # @return [MSD] self.
137
+ #
138
+ def merge! hash
139
+ hash.each do |key, value|
140
+ self[key.to_sym] = value.to_sym
141
+ end
142
+
143
+ self
144
+ end
145
+
146
+ # @private
147
+ def to_s
148
+ return '' unless pos
149
+
150
+ unless category = language::CATEGORIES[pos]
151
+ raise InvalidDescriptor, "category is nil"
152
+ end
153
+
154
+ msd = [category[:code]]
155
+
156
+ attrs = category[:attrs]
157
+ grammemes.each do |attr_name, value|
158
+ next unless value
159
+
160
+ attr_index = attrs.index { |name, *values| name == attr_name }
161
+ unless attr_index
162
+ raise InvalidDescriptor, "no such attribute: '#{attr_name}' " \
163
+ "of category '#{pos}'"
164
+ end
165
+
166
+ attr_name, values = attrs[attr_index]
167
+
168
+ unless attr_value = values[value]
169
+ raise InvalidDescriptor, "no such value: '#{value}'' " \
170
+ "for attribute '#{attr_name}' " \
171
+ "of category '#{pos}'"
172
+ end
173
+
174
+ msd[attr_index + 1] = attr_value
175
+ end
176
+
177
+ msd.map { |e| e || EMPTY_DESCRIPTOR }.join
178
+ end
179
+
180
+ # Validates the MSD instance.
181
+ #
182
+ # @return [true, false] validation state of the MSD instance.
183
+ #
184
+ def valid?
185
+ !!to_s
186
+ rescue InvalidDescriptor
187
+ false
188
+ end
189
+
190
+ protected
191
+ # @private
192
+ def parse! msd_line
193
+ msd = msd_line.chars.to_a
194
+
195
+ category_code = msd.shift
196
+
197
+ @pos, category = language::CATEGORIES.find do |name, category|
198
+ category[:code] == category_code
199
+ end
200
+
201
+ raise InvalidDescriptor, msd_line unless @pos
202
+
203
+ attrs = category[:attrs]
204
+
205
+ msd.each_with_index do |value_code, i|
206
+ attr_name, values = attrs[i]
207
+ raise InvalidDescriptor, msd_line unless attr_name
208
+
209
+ next if :blank == attr_name
210
+ next if EMPTY_DESCRIPTOR == value_code
211
+
212
+ attribute = values.find { |name, code| code == value_code }
213
+ raise InvalidDescriptor, msd_line unless attribute
214
+
215
+ self[attr_name] = attribute.first
216
+ end
217
+ end
218
+ end
@@ -0,0 +1,263 @@
1
+ # encoding: utf-8
2
+
3
+ # English Specifications by Nancy Ide, Greg Priest-Dorman,
4
+ # Tomaž Erjavec, Tamas Varadi.
5
+ #
6
+ # http://nl.ijs.si/ME/V4/msd/html/msd-en.html
7
+ #
8
+ # This specification was translated into the Ruby language
9
+ # by [Dmitry Ustalov](http://eveel.ru).
10
+ #
11
+ module Myasorubka::MSD::English
12
+ # English Noun.
13
+ #
14
+ NOUN = {
15
+ code: 'N',
16
+ attrs: [
17
+ [ :type, {
18
+ common: 'c',
19
+ proper: 'p'
20
+ } ],
21
+ [ :gender, {
22
+ masculine: 'm',
23
+ feminine: 'f',
24
+ neuter: 'n'
25
+ } ],
26
+ [ :number, {
27
+ singular: 's',
28
+ plural: 'p'
29
+ } ]
30
+ ]
31
+ }
32
+
33
+ # English Verb.
34
+ #
35
+ VERB = {
36
+ code: 'V',
37
+ attrs: [
38
+ [ :type, {
39
+ main: 'm',
40
+ auxiliary: 'a',
41
+ modal: 'o',
42
+ base: 'b'
43
+ } ],
44
+ [ :vform, {
45
+ indicative: 'i',
46
+ conditional: 'c',
47
+ infinitive: 'n',
48
+ participle: 'p'
49
+ } ],
50
+ [ :tense, {
51
+ present: 'p',
52
+ past: 's'
53
+ } ],
54
+ [ :person, {
55
+ first: '1',
56
+ second: '2',
57
+ third: '3'
58
+ } ],
59
+ [ :number, {
60
+ singular: 's',
61
+ plural: 'p'
62
+ } ]
63
+ ]
64
+ }
65
+
66
+ # English Adjective.
67
+ #
68
+ ADJECTIVE = {
69
+ code: 'A',
70
+ attrs: [
71
+ [ :type, {
72
+ qualificative: 'f'
73
+ } ],
74
+ [ :degree, {
75
+ positive: 'p',
76
+ comparative: 'c',
77
+ superlative: 's'
78
+ } ]
79
+ ]
80
+ }
81
+
82
+ # English Pronoun.
83
+ #
84
+ PRONOUN = {
85
+ code: 'P',
86
+ attrs: [
87
+ [ :type, {
88
+ personal: 'p',
89
+ possessive: 's',
90
+ interrogative: 'q',
91
+ relative: 'r',
92
+ reflexive: 'x',
93
+ general: 'g',
94
+ ex_there: 't'
95
+ } ],
96
+ [ :person, {
97
+ first: '1',
98
+ second: '2',
99
+ third: '3'
100
+ } ],
101
+ [ :gender, {
102
+ masculine: 'm',
103
+ feminine: 'f',
104
+ neuter: 'n'
105
+ } ],
106
+ [ :number, {
107
+ singular: 's',
108
+ plural: 'p'
109
+ } ],
110
+ [ :case, {
111
+ nominative: 'n',
112
+ accusative: 'a'
113
+ } ],
114
+ [ :owner_number, {
115
+ singular: 's',
116
+ plural: 'p'
117
+ } ],
118
+ [ :owner_gender, {
119
+ masculine: 'm',
120
+ feminine: 'f'
121
+ } ],
122
+ [ :wh_type, {
123
+ relative: 'r',
124
+ question: 'q'
125
+ } ],
126
+ ]
127
+ }
128
+
129
+ # English Determiner.
130
+ #
131
+ DETERMINER = {
132
+ code: 'D',
133
+ attrs: [
134
+ [ :type, {
135
+ demonstrative: 'd',
136
+ indefinite: 'i',
137
+ possessive: 's',
138
+ general: 'g'
139
+ } ],
140
+ [ :person, {
141
+ first: '1',
142
+ second: '2',
143
+ third: '3'
144
+ } ],
145
+ [ :number, {
146
+ singular: 's',
147
+ plural: 'p'
148
+ } ],
149
+ [ :owner_number, {
150
+ singular: 's',
151
+ plural: 'p'
152
+ } ],
153
+ [ :owner_gender, {
154
+ masculine: 'm',
155
+ feminine: 'f',
156
+ neuter: 'n'
157
+ } ],
158
+ [ :wh_type, {
159
+ relative: 'r',
160
+ question: 'q'
161
+ } ]
162
+ ]
163
+ }
164
+
165
+ # English Adverb.
166
+ #
167
+ ADVERB = {
168
+ code: 'R',
169
+ attrs: [
170
+ [ :type, {
171
+ modifier: 'm',
172
+ specifier: 's'
173
+ } ],
174
+ [ :degree, {
175
+ positive: 'p',
176
+ comparative: 'c',
177
+ superlative: 's'
178
+ } ],
179
+ [ :wh_type, {
180
+ relative: 'r',
181
+ question: 'q'
182
+ } ]
183
+ ]
184
+ }
185
+
186
+ # English Adposition.
187
+ #
188
+ ADPOSITION = {
189
+ code: 'S',
190
+ attrs: [
191
+ [ :type, {
192
+ preposition: 'p',
193
+ postposition: 't'
194
+ } ]
195
+ ]
196
+ }
197
+
198
+ # English Conjunction.
199
+ #
200
+ CONJUNCTION = {
201
+ code: 'C',
202
+ attrs: [
203
+ [ :type, {
204
+ coordinating: 'c',
205
+ subordinating: 's'
206
+ } ],
207
+ [ :coord_type, {
208
+ initial: 'i',
209
+ non_initial: 'n'
210
+ } ],
211
+ ]
212
+ }
213
+
214
+ # English Numeral.
215
+ #
216
+ NUMERAL = {
217
+ code: 'M',
218
+ attrs: [
219
+ [ :type, {
220
+ cardinal: 'c',
221
+ ordinal: 'o'
222
+ } ],
223
+ ]
224
+ }
225
+
226
+ # English Interjection.
227
+ #
228
+ INTERJECTION = {
229
+ code: 'I',
230
+ attrs: []
231
+ }
232
+
233
+ # English Abbreviation.
234
+ #
235
+ ABBREVIATION = {
236
+ code: 'Y',
237
+ attrs: []
238
+ }
239
+
240
+ # English Residual.
241
+ #
242
+ RESIDUAL = {
243
+ code: 'X',
244
+ attrs: []
245
+ }
246
+
247
+ # Actual part-of-speech mapping.
248
+ #
249
+ CATEGORIES = {
250
+ noun: NOUN,
251
+ verb: VERB,
252
+ adjective: ADJECTIVE,
253
+ pronoun: PRONOUN,
254
+ determiner: DETERMINER,
255
+ adverb: ADVERB,
256
+ adposition: ADPOSITION,
257
+ conjunction: CONJUNCTION,
258
+ numeral: NUMERAL,
259
+ interjection: INTERJECTION,
260
+ abbreviation: ABBREVIATION,
261
+ residual: RESIDUAL
262
+ }
263
+ end