myasorubka 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dd23e096eea3cce744fc37c905b46f6b8c051d65
4
- data.tar.gz: 059064ded607b2b4c21cbefd7a06411e4c123625
3
+ metadata.gz: 8e16d63b366b3abc035827745c49a22cdf28159a
4
+ data.tar.gz: b551b32694f999c898d3c898210a9ea5439e22c9
5
5
  SHA512:
6
- metadata.gz: 0fa734d01b8a3082b4cd62ef0025bd73395bce010828ebf923b4b8f597f278d5ea1b9df9642782e9db3115d62e8279a364bb3feba851e6b7f7a77d0c95c6d3c4
7
- data.tar.gz: b6cbb0c92d8a08db9ddd2ef89c147ff9b167bed9b167ca4afa50f12d0717d853f297874c4e28eeae96e7482ca365dc911d7871beb84170cb58a61ee27fb0945d
6
+ metadata.gz: 343b769967295d8fd6f1860fcd9f2c1b37e9d8f05b1aee00a54c460a242fe79190c9c993691dfbd236540b71473e2e17b013730cd0db37ca25b66cbff71f2926
7
+ data.tar.gz: 913cdc0cd36ef38e6ffdc4740678f6872107d0cb49d79710a73f7955a6656f7453c28b1f48a488be35a523b59764779c1e461eba94d32a0574fe6eb4ad25d17a
@@ -1,7 +1,7 @@
1
- branches:
2
- only:
3
- - master
1
+ sudo: false
2
+ language: ruby
3
+ bundler_args: --without development
4
4
  rvm:
5
- - 2.0.0
5
+ - ruby
6
+ - rbx
6
7
  - jruby-19mode
7
- - rbx-19mode
data/Gemfile CHANGED
@@ -1,3 +1,10 @@
1
+ # encoding: utf-8
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
5
  gemspec
6
+
7
+ group :test do
8
+ gem 'minitest', '~> 5.0'
9
+ gem 'rake'
10
+ end
@@ -1,4 +1,4 @@
1
- Copyright (c) 2011-2013 Dmitry Ustalov
1
+ Copyright (c) 2011-2015 Dmitry Ustalov
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -5,7 +5,17 @@ Myasorubka is a morphological data processor that supports
5
5
  [AOT](http://aot.ru) and [MULTEXT-East](http://nl.ijs.si/ME/)
6
6
  notations.
7
7
 
8
+ [![Gem Version][badge_fury_badge]][badge_fury_link] [![Build Status][travis_ci_badge]][travis_ci_link] [![Dependency Status][gemnasium_badge]][gemnasium_link]
9
+
10
+ [badge_fury_badge]: https://badge.fury.io/rb/myasorubka.svg
11
+ [badge_fury_link]: https://badge.fury.io/rb/myasorubka
12
+ [travis_ci_badge]: https://travis-ci.org/dustalov/myasorubka.svg
13
+ [travis_ci_link]: https://travis-ci.org/dustalov/myasorubka
14
+ [gemnasium_badge]: https://gemnasium.com/dustalov/myasorubka.svg
15
+ [gemnasium_link]: https://gemnasium.com/dustalov/myasorubka
16
+
8
17
  ## MULTEXT-East morphosyntactic descriptors
18
+
9
19
  It is possible to process the MULTEXT-East morphosyntactic descriptors
10
20
  (MSDs) in a convenient way.
11
21
 
@@ -51,7 +61,8 @@ Also, the `Myasorubka::MSD` class allows to write MSDs.
51
61
  ```
52
62
 
53
63
  ## AOT dictionaries
54
- Myasorubka provides a simple parsers for lexicon in the [AOT](http://aot.ru)
64
+
65
+ Myasorubka provides simple parsers for lexicon in the [AOT](http://aot.ru)
55
66
  format, both for gramtab and dictionary files.
56
67
 
57
68
  ```ruby
@@ -104,11 +115,8 @@ You can learn more about AOT lexicon from the
104
115
  4. Push to the branch (`git push origin my-new-feature`);
105
116
  5. Create new Pull Request.
106
117
 
107
- ## Build Status [<img src="https://secure.travis-ci.org/ustalov/myasorubka.png"/>](http://travis-ci.org/ustalov/myasorubka)
108
-
109
- ## Dependency Status [<img src="https://gemnasium.com/ustalov/myasorubka.png"/>](https://gemnasium.com/ustalov/myasorubka)
110
-
111
118
  ## Copyright
112
- Copyright (c) 2011-2013 [Dmitry Ustalov]. See LICENSE for details.
113
119
 
114
- [Dmitry Ustalov]: http://eveel.ru
120
+ Copyright (c) 2011-2015 [Dmitry Ustalov]. See LICENSE for details.
121
+
122
+ [Dmitry Ustalov]: https://ustalov.name/
data/Rakefile CHANGED
@@ -1,5 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
+ require 'rubygems/package_task'
3
4
  require 'bundler/gem_tasks'
4
5
 
5
6
  require 'rake/testtask'
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  require 'myasorubka/version'
4
+ require 'myasorubka/unicode'
4
5
  require 'myasorubka/msd'
@@ -133,7 +133,7 @@ class Myasorubka::AOT::Tags
133
133
  #
134
134
  def self.russian(pos_line, grammemes_line)
135
135
  grammemes = grammemes_line.split(',').map do |grammeme|
136
- UnicodeUtils.downcase(grammeme)
136
+ Myasorubka::Unicode.downcase(grammeme)
137
137
  end
138
138
 
139
139
  msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
@@ -148,7 +148,7 @@ class Myasorubka::AOT::Tags
148
148
  pos_line = 'АББР'
149
149
  end
150
150
 
151
- case UnicodeUtils.upcase(pos_line)
151
+ case Myasorubka::Unicode.upcase(pos_line)
152
152
  when 'С' then begin
153
153
  msd[:pos] = :noun
154
154
  msd[:type] = if (grammemes & [ 'имя', 'фам', 'отч', 'жарг', 'арх', 'проф', 'опч' ]).empty?
@@ -153,26 +153,23 @@ class Myasorubka::MSD
153
153
  raise InvalidDescriptor, "category is nil"
154
154
  end
155
155
 
156
+ attributes = category[:attrs]
156
157
  msd = [category[:code]]
157
158
 
158
- attrs = category[:attrs]
159
- grammemes.each do |attr_name, value|
159
+ grammemes.each do |attribute, value|
160
160
  next unless value
161
161
 
162
- attr_index = attrs.index { |name, *values| name == attr_name }
163
- unless attr_index
164
- raise InvalidDescriptor, 'no such attribute "%s" of category "%s"' %
165
- [attr_name, pos]
162
+ unless index = attributes.index { |name, _| name == attribute }
163
+ raise InvalidDescriptor, 'no such attribute "%s" of category "%s"' % [attribute, pos]
166
164
  end
167
165
 
168
- attr_name, values = attrs[attr_index]
166
+ _, values = attributes[index]
169
167
 
170
- unless attr_value = values[value]
171
- raise InvalidDescriptor, 'no such attribute "%s" ' \
172
- 'for attribute "%s" of category "%s"' % [value, attr_name, pos]
168
+ unless attribute_value = values[value]
169
+ raise InvalidDescriptor, 'no such value "%s" for attribute "%s" of category "%s"' % [value, attribute, pos]
173
170
  end
174
171
 
175
- msd[attr_index + 1] = attr_value
172
+ msd[index + 1] = attribute_value
176
173
  end
177
174
 
178
175
  msd.map { |e| e || EMPTY_DESCRIPTOR }.join
@@ -188,6 +185,31 @@ class Myasorubka::MSD
188
185
  false
189
186
  end
190
187
 
188
+ # Drop every attribute that is unknown to the category or whose value is not defined for it.
189
+ #
190
+ # @return [MSD] self.
191
+ #
192
+ def prune!
193
+ unless category = language::CATEGORIES[pos]
194
+ self.pos = nil
195
+ grammemes.clear
196
+ return self
197
+ end
198
+
199
+ attributes = category[:attrs]
200
+
201
+ grammemes.reject! do |attribute, value|
202
+ if index = attributes.index { |name, _| name == attribute }
203
+ _, values = attributes[index]
204
+ !values[value]
205
+ else
206
+ true
207
+ end
208
+ end
209
+
210
+ self
211
+ end
212
+
191
213
  protected
192
214
  # @private
193
215
  def parse! msd_line
@@ -5,9 +5,6 @@
5
5
  #
6
6
  # http://nl.ijs.si/ME/V4/msd/html/msd-en.html
7
7
  #
8
- # This specification was translated into the Ruby language
9
- # by [Dmitry Ustalov](http://eveel.ru).
10
- #
11
8
  module Myasorubka::MSD::English
12
9
  # English Noun.
13
10
  #
@@ -11,9 +11,6 @@
11
11
  #
12
12
  # http://nl.ijs.si/ME/V4/msd/html/msd-ru.html
13
13
  #
14
- # This specification was translated into the Ruby language
15
- # by [Dmitry Ustalov](http://eveel.ru).
16
- #
17
14
  module Myasorubka::MSD::Russian
18
15
  # Russian Noun.
19
16
  #
@@ -0,0 +1,135 @@
1
+ # https://tech.yandex.ru/mystem/
2
+ module Myasorubka::Mystem extend self
3
+ require 'myasorubka/mystem/binary'
4
+
5
+ # https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
6
+ GRAMMEMES = {
7
+ 'A' => :adjective,
8
+ 'ADV' => :adverb,
9
+ 'ADVPRO' => :adv_pronoun,
10
+ 'ANUM' => :adj_numeral,
11
+ 'APRO' => :adj_pronoun,
12
+ 'COM' => :composite,
13
+ 'CONJ' => :conjunction,
14
+ 'INTJ' => :interjunction,
15
+ 'NUM' => :numeral,
16
+ 'PART' => :particle,
17
+ 'PR' => :preposition,
18
+ 'S' => :substantive,
19
+ 'SPRO' => :subst_pronoun,
20
+ 'V' => :verb,
21
+ 'наст' => :present,
22
+ 'praes' => :present,
23
+ 'непрош' => :notpast,
24
+ 'inpraes' => :notpast,
25
+ 'прош' => :past,
26
+ 'praet' => :past,
27
+ 'им' => :nominative,
28
+ 'nom' => :nominative,
29
+ 'род' => :genitive,
30
+ 'gen' => :genitive,
31
+ 'дат' => :dative,
32
+ 'dat' => :dative,
33
+ 'вин' => :accusative,
34
+ 'acc' => :accusative,
35
+ 'твор' => :instrumental,
36
+ 'ins' => :instrumental,
37
+ 'пр' => :ablative,
38
+ 'abl' => :ablative,
39
+ 'парт' => :partitive,
40
+ 'part' => :partitive,
41
+ 'местн' => :locative,
42
+ 'loc' => :locative,
43
+ 'зват' => :vocative,
44
+ 'voc' => :vocative,
45
+ 'ед' => :singular,
46
+ 'sg' => :singular,
47
+ 'мн' => :plural,
48
+ 'pl' => :plural,
49
+ 'деепр' => :gerund,
50
+ 'ger' => :gerund,
51
+ 'инф' => :infinitive,
52
+ 'inf' => :infinitive,
53
+ 'прич' => :participle,
54
+ 'partcp' => :participle,
55
+ 'изъяв' => :indicative,
56
+ 'indic' => :indicative,
57
+ 'пов' => :imperative,
58
+ 'imper' => :imperative,
59
+ 'кр' => :short,
60
+ 'brev' => :short,
61
+ 'полн' => :full,
62
+ 'plen' => :full,
63
+ 'притяж' => :possessive,
64
+ 'poss' => :possessive,
65
+ 'прев' => :superlative,
66
+ 'supr' => :superlative,
67
+ 'срав' => :comparative,
68
+ 'comp' => :comparative,
69
+ '1-л' => :person1,
70
+ '1p' => :person1,
71
+ '2-л' => :person2,
72
+ '2p' => :person2,
73
+ '3-л' => :person3,
74
+ '3p' => :person3,
75
+ 'муж' => :masculine,
76
+ 'm' => :masculine,
77
+ 'жен' => :feminine,
78
+ 'f' => :feminine,
79
+ 'сред' => :neuter,
80
+ 'n' => :neuter,
81
+ 'несов' => :imperfect,
82
+ 'ipf' => :imperfect,
83
+ 'сов' => :perfect,
84
+ 'pf' => :perfect,
85
+ 'действ' => :active,
86
+ 'act' => :active,
87
+ 'страд' => :passive,
88
+ 'pass' => :passive,
89
+ 'од' => :animated,
90
+ 'anim' => :animated,
91
+ 'неод' => :inanimated,
92
+ 'inan' => :inanimated,
93
+ 'пе' => :transitive,
94
+ 'tran' => :transitive,
95
+ 'нп' => :intransitive,
96
+ 'intr' => :intransitive,
97
+ 'вводн' => :parenth,
98
+ 'parenth' => :parenth,
99
+ 'гео' => :geo,
100
+ 'geo' => :geo,
101
+ 'затр' => :awkward,
102
+ 'awkw' => :awkward,
103
+ 'имя' => :first_name,
104
+ 'persn' => :first_name,
105
+ 'искаж' => :distort,
106
+ 'dist' => :distort,
107
+ 'мж' => :mas_fem,
108
+ 'mf' => :mas_fem,
109
+ 'обсц' => :obscene,
110
+ 'obsc' => :obscene,
111
+ 'отч' => :patronymic,
112
+ 'patrn' => :patronymic,
113
+ 'прдк' => :praedic,
114
+ 'praed' => :praedic,
115
+ 'разг' => :informal,
116
+ 'inform' => :informal,
117
+ 'редк' => :rare,
118
+ 'rare' => :rare,
119
+ 'сокр' => :abbreviation,
120
+ 'abbr' => :abbreviation,
121
+ 'устар' => :obsolete,
122
+ 'obsol' => :obsolete,
123
+ 'фам' => :surname,
124
+ 'famn' => :surname
125
+ }.freeze
126
+
127
+ # Convert an array with mystem character-based grammemes into an MSD.
128
+ #
129
+ def to_msd(grammemes)
130
+ grammemes = grammemes.map { |g| GRAMMEMES[g] }
131
+ grammemes.compact!
132
+ grammemes.map! { |g| Myasorubka::Mystem::Binary::GRAMMEMES.key(g) }
133
+ Myasorubka::Mystem::Binary.to_msd(grammemes)
134
+ end
135
+ end
@@ -0,0 +1,187 @@
1
+ # A wrapper around mystem's internal binary format.
2
+ #
3
+ module Myasorubka::Mystem::Binary extend self
4
+ # https://github.com/yandex/tomita-parser/blob/master/src/library/lemmer/dictlib/yx_gram_enum.h
5
+ GRAMMEMES = {
6
+ 127 => :postposition,
7
+ 128 => :adjective,
8
+ 129 => :adverb,
9
+ 130 => :composite,
10
+ 131 => :conjunction,
11
+ 132 => :interjunction,
12
+ 133 => :numeral,
13
+ 134 => :particle,
14
+ 135 => :preposition,
15
+ 136 => :substantive,
16
+ 137 => :verb,
17
+ 138 => :adj_numeral,
18
+ 139 => :adj_pronoun,
19
+ 140 => :adv_pronoun,
20
+ 141 => :subst_pronoun,
21
+ 142 => :article,
22
+ 143 => :part_of_idiom,
23
+ 144 => :reserved,
24
+ 145 => :abbreviation,
25
+ 146 => :irregular_stem,
26
+ 147 => :informal,
27
+ 148 => :distort,
28
+ 149 => :contracted,
29
+ 150 => :obscene,
30
+ 151 => :rare,
31
+ 152 => :awkward,
32
+ 153 => :obsolete,
33
+ 154 => :subst_adjective,
34
+ 155 => :first_name,
35
+ 156 => :surname,
36
+ 157 => :patronymic,
37
+ 158 => :geo,
38
+ 159 => :proper,
39
+ 160 => :present,
40
+ 161 => :notpast,
41
+ 162 => :past,
42
+ 163 => :future,
43
+ 164 => :past2,
44
+ 165 => :nominative,
45
+ 166 => :genitive,
46
+ 167 => :dative,
47
+ 168 => :accusative,
48
+ 169 => :instrumental,
49
+ 170 => :ablative,
50
+ 171 => :partitive,
51
+ 172 => :locative,
52
+ 173 => :vocative,
53
+ 174 => :singular,
54
+ 175 => :plural,
55
+ 176 => :gerund,
56
+ 177 => :infinitive,
57
+ 178 => :participle,
58
+ 179 => :indicative,
59
+ 180 => :imperative,
60
+ 181 => :conditional,
61
+ 182 => :subjunctive,
62
+ 183 => :short,
63
+ 184 => :full,
64
+ 185 => :superlative,
65
+ 186 => :comparative,
66
+ 187 => :possessive,
67
+ 188 => :person1,
68
+ 189 => :person2,
69
+ 190 => :person3,
70
+ 191 => :feminine,
71
+ 192 => :masculine,
72
+ 193 => :neuter,
73
+ 194 => :mas_fem,
74
+ 195 => :perfect,
75
+ 196 => :imperfect,
76
+ 197 => :passive,
77
+ 198 => :active,
78
+ 199 => :reflexive,
79
+ 200 => :impersonal,
80
+ 201 => :animated,
81
+ 202 => :inanimated,
82
+ 203 => :praedic,
83
+ 204 => :parenth,
84
+ 205 => :transitive,
85
+ 206 => :intransitive,
86
+ 207 => :definite,
87
+ 208 => :indefinite,
88
+ 209 => :sim_conj,
89
+ 210 => :sub_conj,
90
+ 211 => :pronoun_conj,
91
+ 212 => :correlate_conj,
92
+ 213 => :aux_verb
93
+ }.freeze
94
+
95
+ # Convert an array with mystem grammeme codes into an MSD.
96
+ #
97
+ def to_msd(grammemes)
98
+ msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
99
+
100
+ grammemes.sort.each do |code|
101
+ case GRAMMEMES[code]
102
+ # Nomenus
103
+ when :postposition then msd[:pos] = :adposition
104
+ when :adjective then msd[:pos] = :adjective; msd[:type] = :qualificative; msd[:degree] = :positive
105
+ when :adverb then msd[:pos] = :adverb
106
+ when :conjunction then msd[:pos] = :conjunction
107
+ when :interjunction then msd[:pos] = :interjection
108
+ when :numeral then msd[:pos] = :numeral; msd[:type] = :cardinal
109
+ when :particle then msd[:pos] = :particle
110
+ when :preposition then msd[:pos] = :adposition; msd[:type] = :preposition
111
+ when :substantive then msd[:pos] = :noun; msd[:type] = :common
112
+ when :verb then msd[:pos] = :verb; msd[:type] = :main
113
+ when :adj_numeral then msd[:pos] = :numeral; msd[:type] = :ordinal
114
+ when :adj_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :adjectival
115
+ when :adv_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :adverbial
116
+ when :subst_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :nominal
117
+ when :abbreviation then msd[:pos] = :abbreviation
118
+ when :first_name then msd[:type] = :proper
119
+ when :surname then msd[:type] = :proper
120
+ when :patronymic then msd[:type] = :proper
121
+ when :geo then msd[:type] = :proper
122
+ when :proper then msd[:type] = :proper
123
+ # Tempus
124
+ when :present then msd[:tense] = :present
125
+ # TODO: how to handle :notpast tense?
126
+ when :past then msd[:tense] = :past
127
+ when :future then msd[:tense] = :future
128
+ when :past2 then msd[:tense] = :past
129
+ # Casus
130
+ when :nominative then msd[:case] = :nominative
131
+ when :genitive then msd[:case] = :genitive
132
+ when :dative then msd[:case] = :dative
133
+ when :accusative then msd[:case] = :accusative
134
+ when :instrumental then msd[:case] = :instrumental
135
+ when :ablative then msd[:case] = :genitive
136
+ when :partitive then msd[:case] = :genitive; msd[:case2] = :partitive
137
+ when :locative then msd[:case] = :genitive; msd[:case2] = :locative
138
+ when :vocative then msd[:case] = :vocative
139
+ # Numerus
140
+ when :singular then msd[:number] = :singular
141
+ when :plural then msd[:number] = :plural
142
+ # Modus
143
+ when :gerund then msd[:vform] = :gerund
144
+ when :infinitive then msd[:vform] = :infinitive
145
+ when :participle then msd[:vform] = :participle
146
+ when :indicative then msd[:vform] = :indicative
147
+ when :imperative then msd[:vform] = :imperative
148
+ when :conditional then msd[:vform] = :conditional
149
+ # Gradus
150
+ when :short then msd[:definiteness] = :short_art
151
+ when :full then msd[:definiteness] = :full_art
152
+ when :superlative then msd[:degree] = :superlative
153
+ when :comparative then msd[:degree] = :comparative
154
+ when :possessive then msd[:type] = :possessive
155
+ # Personae
156
+ when :person1 then msd[:person] = :first
157
+ when :person2 then msd[:person] = :second
158
+ when :person3 then msd[:person] = :third
159
+ # Gender
160
+ when :feminine then msd[:gender] = :feminine
161
+ when :masculine then msd[:gender] = :masculine
162
+ when :neuter then msd[:gender] = :neuter
163
+ when :mas_fem then msd[:gender] = :common
164
+ # Perfectum-Imperfectum
165
+ when :perfect then msd[:aspect] = :perfective
166
+ when :imperfect then msd[:aspect] = :progressive
167
+ # Voice
168
+ when :passive then msd[:voice] = :passive
169
+ when :active then msd[:voice] = :active
170
+ when :reflexive then msd[:type] = :reflexive
171
+ # Animated
172
+ when :animated then msd[:animate] = :yes
173
+ when :inanimated then msd[:animate] = :no
174
+ # Definiteness
175
+ when :definite then msd[:definiteness] = :full_art
176
+ when :indefinite then msd[:definiteness] = :short_art
177
+ # Conjunction and verb types
178
+ when :sim_conj then msd[:type] = :coordinating
179
+ when :sub_conj then msd[:type] = :subordinating
180
+ when :aux_verb then msd[:type] = :auxiliary
181
+ else
182
+ end
183
+ end
184
+
185
+ msd.prune!
186
+ end
187
+ end
@@ -0,0 +1,123 @@
1
+ # encoding: utf-8
2
+
3
+ # The Penn Treebank Project annotates naturally-occurring text for
4
+ # linguistic structure. Most notably, we produce skeletal parses
5
+ # showing rough syntactic and semantic information — a bank of
6
+ # linguistic trees.
7
+ #
8
+ # Treebanks are often created on top of a corpus that has already been
9
+ # annotated with part-of-speech tags. In turn, treebanks are sometimes
10
+ # enhanced with semantic or other linguistic information.
11
+ #
12
+ module Myasorubka::Treebank
13
+ extend self
14
+
15
+ # Convert the given tag from English Penn Treebank format to the English
16
+ # representation in the MULTEXT-East format.
17
+ #
18
+ def english(tag)
19
+ msd = Myasorubka::MSD.new(Myasorubka::MSD::English)
20
+
21
+ case tag
22
+ when 'CC' then
23
+ msd[:pos] = :conjunction
24
+ msd[:type] = :coordinating
25
+ when 'CD' then
26
+ msd[:pos] = :numeral
27
+ msd[:type] = :cardinal
28
+ when 'DT' then
29
+ msd[:pos] = :determiner
30
+ when 'IN' then
31
+ msd[:pos] = :conjunction
32
+ msd[:type] = :subordinating
33
+ when 'JJ' then
34
+ msd[:pos] = :adjective
35
+ when 'JJR' then
36
+ msd[:pos] = :adjective
37
+ msd[:degree] = :comparative
38
+ when 'JJS' then
39
+ msd[:pos] = :adjective
40
+ msd[:degree] = :superlative
41
+ when 'MD' then
42
+ msd[:pos] = :verb
43
+ msd[:type] = :modal
44
+ when 'NN' then
45
+ msd[:pos] = :noun
46
+ msd[:type] = :common
47
+ msd[:number] = :singular
48
+ when 'NNS'
49
+ msd[:pos] = :noun
50
+ msd[:type] = :common
51
+ msd[:number] = :plural
52
+ when 'NP'
53
+ msd[:pos] = :noun
54
+ msd[:type] = :proper
55
+ msd[:number] = :singular
56
+ when 'NPS'
57
+ msd[:pos] = :noun
58
+ msd[:type] = :proper
59
+ msd[:number] = :plural
60
+ when 'PDT' then
61
+ msd[:pos] = :determiner
62
+ when 'PP' then
63
+ msd[:pos] = :pronoun
64
+ msd[:type] = :personal
65
+ when 'PP$' then
66
+ msd[:pos] = :pronoun
67
+ msd[:type] = :possessive
68
+ when 'RB' then
69
+ msd[:pos] = :adverb
70
+ when 'RBR' then
71
+ msd[:pos] = :adverb
72
+ msd[:degree] = :comparative
73
+ when 'RBS' then
74
+ msd[:pos] = :adverb
75
+ msd[:degree] = :superlative
76
+ when 'TO' then
77
+ msd[:pos] = :determiner
78
+ when 'UH' then
79
+ msd[:pos] = :interjection
80
+ when 'VB' then
81
+ msd[:pos] = :verb
82
+ msd[:type] = :base
83
+ when 'VBD' then
84
+ msd[:pos] = :verb
85
+ msd[:type] = :base
86
+ msd[:tense] = :past
87
+ when 'VBG' then
88
+ msd[:pos] = :verb
89
+ msd[:type] = :base
90
+ msd[:vform] = :participle
91
+ msd[:tense] = :present
92
+ when 'VBN' then
93
+ msd[:pos] = :verb
94
+ msd[:type] = :base
95
+ msd[:vform] = :participle
96
+ msd[:tense] = :past
97
+ when 'VBP' then
98
+ msd[:pos] = :verb
99
+ msd[:type] = :base
100
+ msd[:tense] = :present
101
+ msd[:number] = :singular
102
+ when 'VBZ' then
103
+ msd[:pos] = :verb
104
+ msd[:type] = :base
105
+ msd[:tense] = :present
106
+ msd[:person] = :third
107
+ msd[:number] = :singular
108
+ when 'WDT' then
109
+ msd[:pos] = :determiner
110
+ when 'WP' then
111
+ msd[:pos] = :pronoun
112
+ when 'WP$' then
113
+ msd[:pos] = :pronoun
114
+ msd[:type] = :possessive
115
+ when 'WRB' then
116
+ msd[:pos] = :adverb
117
+ else
118
+ msd[:pos] = :residual
119
+ end
120
+
121
+ msd
122
+ end
123
+ end
@@ -0,0 +1,46 @@
1
+ # This module provides downcase and upcase methods designed for Russian.
2
+ # The original code was written by Andrew Kozlov for the Petrovich library.
3
+ #
4
+ # https://github.com/petrovich/petrovich-ruby/blob/df705075542979ab85e1f2bf9a2024b1c0813e1a/lib/petrovich/unicode.rb
5
+ #
6
+ module Myasorubka::Unicode extend self
7
+ # Russian capital letters.
8
+ #
9
+ RU_UPPERCASE = [
10
+ "\u0410", "\u0411", "\u0412", "\u0413", "\u0414", "\u0415", "\u0416", "\u0417",
11
+ "\u0418", "\u0419", "\u041A", "\u041B", "\u041C", "\u041D", "\u041E", "\u041F",
12
+ "\u0420", "\u0421", "\u0422", "\u0423", "\u0424", "\u0425", "\u0426", "\u0427",
13
+ "\u0428", "\u0429", "\u042A", "\u042B", "\u042C", "\u042D", "\u042E", "\u042F",
14
+ "\u0401" # Ё
15
+ ].join
16
+
17
+ # Russian small letters.
18
+ #
19
+ RU_LOWERCASE = [
20
+ "\u0430", "\u0431", "\u0432", "\u0433", "\u0434", "\u0435", "\u0436", "\u0437",
21
+ "\u0438", "\u0439", "\u043A", "\u043B", "\u043C", "\u043D", "\u043E", "\u043F",
22
+ "\u0440", "\u0441", "\u0442", "\u0443", "\u0444", "\u0445", "\u0446", "\u0447",
23
+ "\u0448", "\u0449", "\u044A", "\u044B", "\u044C", "\u044D", "\u044E", "\u044F",
24
+ "\u0451" # Ё
25
+ ].join
26
+
27
+ # Returns a copy of the given string having replaced
28
+ # capital Russian letters with small ones.
29
+ #
30
+ # @param string [String] a string.
31
+ # @return [String] a new string.
32
+ #
33
+ def downcase(string)
34
+ string.tr(RU_UPPERCASE, RU_LOWERCASE).tap(&:downcase!)
35
+ end
36
+
37
+ # Returns a copy of the given string having replaced
38
+ # small Russian letters with capital ones.
39
+ #
40
+ # @param string [String] a string.
41
+ # @return [String] a new string.
42
+ #
43
+ def upcase(string)
44
+ string.tr(RU_LOWERCASE, RU_UPPERCASE).tap(&:upcase!)
45
+ end
46
+ end
@@ -5,5 +5,5 @@
5
5
  module Myasorubka
6
6
  # Version of Myasorubka.
7
7
  #
8
- VERSION = '0.1.1'
8
+ VERSION = '0.2.0'
9
9
  end
@@ -12,17 +12,11 @@ Gem::Specification.new do |spec|
12
12
  spec.description = 'Myasorubka is a morphological data processor.'
13
13
  spec.summary = 'Myasorubka is a morphological data proceesor ' \
14
14
  'that supports AOT and MULTEXT-East notations.'
15
- spec.homepage = 'https://github.com/ustalov/myasorubka'
15
+ spec.homepage = 'https://github.com/dustalov/myasorubka'
16
16
  spec.license = 'MIT'
17
17
 
18
18
  spec.files = `git ls-files`.split($/)
19
19
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
20
20
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
21
21
  spec.require_paths = ['lib']
22
-
23
- spec.add_development_dependency 'bundler', '~> 1.3'
24
- spec.add_development_dependency 'minitest', '>= 2.11'
25
- spec.add_development_dependency 'rake'
26
-
27
- spec.add_dependency 'unicode_utils', '~> 1.4'
28
22
  end
@@ -83,6 +83,19 @@ module Myasorubka
83
83
  ('Vmp' =~ re).must_equal 0
84
84
  ('Nc-pl' =~ re).must_be_nil
85
85
  end
86
+
87
+ it 'can be pruned and became valid when the category is wrong' do
88
+ subject[:pos] = :zalupa
89
+ subject.prune!
90
+ subject.must_be :valid?
91
+ end
92
+
93
+ it 'can be pruned and became valid when an attribute is wrong' do
94
+ subject[:pos] = :verb
95
+ subject[:animate] = :yes
96
+ subject.prune!
97
+ subject.must_be :valid?
98
+ end
86
99
  end
87
100
 
88
101
  describe 'Generator' do
@@ -4,11 +4,9 @@ require 'rubygems'
4
4
 
5
5
  $:.unshift File.expand_path('../../lib', __FILE__)
6
6
 
7
- if RUBY_VERSION == '1.8'
8
- gem 'minitest'
9
- end
10
-
7
+ gem 'minitest'
11
8
  require 'minitest/autorun'
9
+ require 'minitest/hell'
12
10
 
13
11
  require 'myasorubka'
14
12
  require 'myasorubka/aot'
metadata CHANGED
@@ -1,71 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: myasorubka
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-05-07 00:00:00.000000000 Z
12
- dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: bundler
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ~>
18
- - !ruby/object:Gem::Version
19
- version: '1.3'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ~>
25
- - !ruby/object:Gem::Version
26
- version: '1.3'
27
- - !ruby/object:Gem::Dependency
28
- name: minitest
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - '>='
32
- - !ruby/object:Gem::Version
33
- version: '2.11'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - '>='
39
- - !ruby/object:Gem::Version
40
- version: '2.11'
41
- - !ruby/object:Gem::Dependency
42
- name: rake
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - '>='
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - '>='
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: unicode_utils
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ~>
60
- - !ruby/object:Gem::Version
61
- version: '1.4'
62
- type: :runtime
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ~>
67
- - !ruby/object:Gem::Version
68
- version: '1.4'
11
+ date: 2015-10-17 00:00:00.000000000 Z
12
+ dependencies: []
69
13
  description: Myasorubka is a morphological data processor.
70
14
  email:
71
15
  - dmitry@eveel.ru
@@ -73,13 +17,12 @@ executables: []
73
17
  extensions: []
74
18
  extra_rdoc_files: []
75
19
  files:
76
- - .gitignore
77
- - .travis.yml
20
+ - ".gitignore"
21
+ - ".travis.yml"
78
22
  - Gemfile
79
23
  - LICENSE.txt
80
24
  - README.md
81
25
  - Rakefile
82
- - aot-russian
83
26
  - lib/myasorubka.rb
84
27
  - lib/myasorubka/aot.rb
85
28
  - lib/myasorubka/aot/dictionary.rb
@@ -88,13 +31,17 @@ files:
88
31
  - lib/myasorubka/msd.rb
89
32
  - lib/myasorubka/msd/english.rb
90
33
  - lib/myasorubka/msd/russian.rb
34
+ - lib/myasorubka/mystem.rb
35
+ - lib/myasorubka/mystem/binary.rb
36
+ - lib/myasorubka/treebank.rb
37
+ - lib/myasorubka/unicode.rb
91
38
  - lib/myasorubka/version.rb
92
39
  - myasorubka.gemspec
93
40
  - spec/data/russian.tsv
94
41
  - spec/msd/russian_spec.rb
95
42
  - spec/msd_spec.rb
96
43
  - spec/spec_helper.rb
97
- homepage: https://github.com/ustalov/myasorubka
44
+ homepage: https://github.com/dustalov/myasorubka
98
45
  licenses:
99
46
  - MIT
100
47
  metadata: {}
@@ -104,17 +51,17 @@ require_paths:
104
51
  - lib
105
52
  required_ruby_version: !ruby/object:Gem::Requirement
106
53
  requirements:
107
- - - '>='
54
+ - - ">="
108
55
  - !ruby/object:Gem::Version
109
56
  version: '0'
110
57
  required_rubygems_version: !ruby/object:Gem::Requirement
111
58
  requirements:
112
- - - '>='
59
+ - - ">="
113
60
  - !ruby/object:Gem::Version
114
61
  version: '0'
115
62
  requirements: []
116
63
  rubyforge_project:
117
- rubygems_version: 2.0.3
64
+ rubygems_version: 2.4.8
118
65
  signing_key:
119
66
  specification_version: 4
120
67
  summary: Myasorubka is a morphological data proceesor that supports AOT and MULTEXT-East
@@ -1,7 +0,0 @@
1
- #!/bin/sh
2
- rake clean aot \
3
- mrd=morphs.mrd \
4
- tab=rgramtab.tab \
5
- encoding=CP1251 \
6
- language=russian \
7
- --trace