myasorubka 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dd23e096eea3cce744fc37c905b46f6b8c051d65
4
- data.tar.gz: 059064ded607b2b4c21cbefd7a06411e4c123625
3
+ metadata.gz: 8e16d63b366b3abc035827745c49a22cdf28159a
4
+ data.tar.gz: b551b32694f999c898d3c898210a9ea5439e22c9
5
5
  SHA512:
6
- metadata.gz: 0fa734d01b8a3082b4cd62ef0025bd73395bce010828ebf923b4b8f597f278d5ea1b9df9642782e9db3115d62e8279a364bb3feba851e6b7f7a77d0c95c6d3c4
7
- data.tar.gz: b6cbb0c92d8a08db9ddd2ef89c147ff9b167bed9b167ca4afa50f12d0717d853f297874c4e28eeae96e7482ca365dc911d7871beb84170cb58a61ee27fb0945d
6
+ metadata.gz: 343b769967295d8fd6f1860fcd9f2c1b37e9d8f05b1aee00a54c460a242fe79190c9c993691dfbd236540b71473e2e17b013730cd0db37ca25b66cbff71f2926
7
+ data.tar.gz: 913cdc0cd36ef38e6ffdc4740678f6872107d0cb49d79710a73f7955a6656f7453c28b1f48a488be35a523b59764779c1e461eba94d32a0574fe6eb4ad25d17a
@@ -1,7 +1,7 @@
1
- branches:
2
- only:
3
- - master
1
+ sudo: false
2
+ language: ruby
3
+ bundler_args: --without development
4
4
  rvm:
5
- - 2.0.0
5
+ - ruby
6
+ - rbx
6
7
  - jruby-19mode
7
- - rbx-19mode
data/Gemfile CHANGED
@@ -1,3 +1,10 @@
1
+ # encoding: utf-8
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
5
  gemspec
6
+
7
+ group :test do
8
+ gem 'minitest', '~> 5.0'
9
+ gem 'rake'
10
+ end
@@ -1,4 +1,4 @@
1
- Copyright (c) 2011-2013 Dmitry Ustalov
1
+ Copyright (c) 2011-2015 Dmitry Ustalov
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -5,7 +5,17 @@ Myasorubka is a morphological data processor that supports
5
5
  [AOT](http://aot.ru) and [MULTEXT-East](http://nl.ijs.si/ME/)
6
6
  notations.
7
7
 
8
+ [![Gem Version][badge_fury_badge]][badge_fury_link] [![Build Status][travis_ci_badge]][travis_ci_link] [![Dependency Status][gemnasium_badge]][gemnasium_link]
9
+
10
+ [badge_fury_badge]: https://badge.fury.io/rb/myasorubka.svg
11
+ [badge_fury_link]: https://badge.fury.io/rb/myasorubka
12
+ [travis_ci_badge]: https://travis-ci.org/dustalov/myasorubka.svg
13
+ [travis_ci_link]: https://travis-ci.org/dustalov/myasorubka
14
+ [gemnasium_badge]: https://gemnasium.com/dustalov/myasorubka.svg
15
+ [gemnasium_link]: https://gemnasium.com/dustalov/myasorubka
16
+
8
17
  ## MULTEXT-East morphosyntactic descriptors
18
+
9
19
  It is possible to process the MULTEXT-East morphosyntactic descriptors
10
20
  (MSDs) in a convenient way.
11
21
 
@@ -51,7 +61,8 @@ Also, the `Myasorubka::MSD` class allows to write MSDs.
51
61
  ```
52
62
 
53
63
  ## AOT dictionaries
54
- Myasorubka provides a simple parsers for lexicon in the [AOT](http://aot.ru)
64
+
65
+ Myasorubka provides simple parsers for lexicon in the [AOT](http://aot.ru)
55
66
  format, both for gramtab and dictionary files.
56
67
 
57
68
  ```ruby
@@ -104,11 +115,8 @@ You can learn more about AOT lexicon from the
104
115
  4. Push to the branch (`git push origin my-new-feature`);
105
116
  5. Create new Pull Request.
106
117
 
107
- ## Build Status [<img src="https://secure.travis-ci.org/ustalov/myasorubka.png"/>](http://travis-ci.org/ustalov/myasorubka)
108
-
109
- ## Dependency Status [<img src="https://gemnasium.com/ustalov/myasorubka.png"/>](https://gemnasium.com/ustalov/myasorubka)
110
-
111
118
  ## Copyright
112
- Copyright (c) 2011-2013 [Dmitry Ustalov]. See LICENSE for details.
113
119
 
114
- [Dmitry Ustalov]: http://eveel.ru
120
+ Copyright (c) 2011-2015 [Dmitry Ustalov]. See LICENSE for details.
121
+
122
+ [Dmitry Ustalov]: https://ustalov.name/
data/Rakefile CHANGED
@@ -1,5 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
+ require 'rubygems/package_task'
3
4
  require 'bundler/gem_tasks'
4
5
 
5
6
  require 'rake/testtask'
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  require 'myasorubka/version'
4
+ require 'myasorubka/unicode'
4
5
  require 'myasorubka/msd'
@@ -133,7 +133,7 @@ class Myasorubka::AOT::Tags
133
133
  #
134
134
  def self.russian(pos_line, grammemes_line)
135
135
  grammemes = grammemes_line.split(',').map do |grammeme|
136
- UnicodeUtils.downcase(grammeme)
136
+ Myasorubka::Unicode.downcase(grammeme)
137
137
  end
138
138
 
139
139
  msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
@@ -148,7 +148,7 @@ class Myasorubka::AOT::Tags
148
148
  pos_line = 'АББР'
149
149
  end
150
150
 
151
- case UnicodeUtils.upcase(pos_line)
151
+ case Myasorubka::Unicode.upcase(pos_line)
152
152
  when 'С' then begin
153
153
  msd[:pos] = :noun
154
154
  msd[:type] = if (grammemes & [ 'имя', 'фам', 'отч', 'жарг', 'арх', 'проф', 'опч' ]).empty?
@@ -153,26 +153,23 @@ class Myasorubka::MSD
153
153
  raise InvalidDescriptor, "category is nil"
154
154
  end
155
155
 
156
+ attributes = category[:attrs]
156
157
  msd = [category[:code]]
157
158
 
158
- attrs = category[:attrs]
159
- grammemes.each do |attr_name, value|
159
+ grammemes.each do |attribute, value|
160
160
  next unless value
161
161
 
162
- attr_index = attrs.index { |name, *values| name == attr_name }
163
- unless attr_index
164
- raise InvalidDescriptor, 'no such attribute "%s" of category "%s"' %
165
- [attr_name, pos]
162
+ unless index = attributes.index { |name, _| name == attribute }
163
+ raise InvalidDescriptor, 'no such attribute "%s" of category "%s"' % [attribute, pos]
166
164
  end
167
165
 
168
- attr_name, values = attrs[attr_index]
166
+ _, values = attributes[index]
169
167
 
170
- unless attr_value = values[value]
171
- raise InvalidDescriptor, 'no such attribute "%s" ' \
172
- 'for attribute "%s" of category "%s"' % [value, attr_name, pos]
168
+ unless attribute_value = values[value]
169
+ raise InvalidDescriptor, 'no such value "%s" for attribute "%s" of category "%s"' % [value, attribute, pos]
173
170
  end
174
171
 
175
- msd[attr_index + 1] = attr_value
172
+ msd[index + 1] = attribute_value
176
173
  end
177
174
 
178
175
  msd.map { |e| e || EMPTY_DESCRIPTOR }.join
@@ -188,6 +185,31 @@ class Myasorubka::MSD
188
185
  false
189
186
  end
190
187
 
188
+ # Drop every attribute that does not appear in the category or has an invalid value.
189
+ #
190
+ # @return [MSD] self.
191
+ #
192
+ def prune!
193
+ unless category = language::CATEGORIES[pos]
194
+ self.pos = nil
195
+ grammemes.clear
196
+ return self
197
+ end
198
+
199
+ attributes = category[:attrs]
200
+
201
+ grammemes.reject! do |attribute, value|
202
+ if index = attributes.index { |name, _| name == attribute }
203
+ _, values = attributes[index]
204
+ !values[value]
205
+ else
206
+ true
207
+ end
208
+ end
209
+
210
+ self
211
+ end
212
+
191
213
  protected
192
214
  # @private
193
215
  def parse! msd_line
@@ -5,9 +5,6 @@
5
5
  #
6
6
  # http://nl.ijs.si/ME/V4/msd/html/msd-en.html
7
7
  #
8
- # This specification was translated into the Ruby language
9
- # by [Dmitry Ustalov](http://eveel.ru).
10
- #
11
8
  module Myasorubka::MSD::English
12
9
  # English Noun.
13
10
  #
@@ -11,9 +11,6 @@
11
11
  #
12
12
  # http://nl.ijs.si/ME/V4/msd/html/msd-ru.html
13
13
  #
14
- # This specification was translated into the Ruby language
15
- # by [Dmitry Ustalov](http://eveel.ru).
16
- #
17
14
  module Myasorubka::MSD::Russian
18
15
  # Russian Noun.
19
16
  #
@@ -0,0 +1,135 @@
1
+ # https://tech.yandex.ru/mystem/
2
+ module Myasorubka::Mystem extend self
3
+ require 'myasorubka/mystem/binary'
4
+
5
+ # https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
6
+ GRAMMEMES = {
7
+ 'A' => :adjective,
8
+ 'ADV' => :adverb,
9
+ 'ADVPRO' => :adv_pronoun,
10
+ 'ANUM' => :adj_numeral,
11
+ 'APRO' => :adj_pronoun,
12
+ 'COM' => :composite,
13
+ 'CONJ' => :conjunction,
14
+ 'INTJ' => :interjunction,
15
+ 'NUM' => :numeral,
16
+ 'PART' => :particle,
17
+ 'PR' => :preposition,
18
+ 'S' => :substantive,
19
+ 'SPRO' => :subst_pronoun,
20
+ 'V' => :verb,
21
+ 'наст' => :present,
22
+ 'praes' => :present,
23
+ 'непрош' => :notpast,
24
+ 'inpraes' => :notpast,
25
+ 'прош' => :past,
26
+ 'praet' => :past,
27
+ 'им' => :nominative,
28
+ 'nom' => :nominative,
29
+ 'род' => :genitive,
30
+ 'gen' => :genitive,
31
+ 'дат' => :dative,
32
+ 'dat' => :dative,
33
+ 'вин' => :accusative,
34
+ 'acc' => :accusative,
35
+ 'твор' => :instrumental,
36
+ 'ins' => :instrumental,
37
+ 'пр' => :ablative,
38
+ 'abl' => :ablative,
39
+ 'парт' => :partitive,
40
+ 'part' => :partitive,
41
+ 'местн' => :locative,
42
+ 'loc' => :locative,
43
+ 'зват' => :vocative,
44
+ 'voc' => :vocative,
45
+ 'ед' => :singular,
46
+ 'sg' => :singular,
47
+ 'мн' => :plural,
48
+ 'pl' => :plural,
49
+ 'деепр' => :gerund,
50
+ 'ger' => :gerund,
51
+ 'инф' => :infinitive,
52
+ 'inf' => :infinitive,
53
+ 'прич' => :participle,
54
+ 'partcp' => :participle,
55
+ 'изъяв' => :indicative,
56
+ 'indic' => :indicative,
57
+ 'пов' => :imperative,
58
+ 'imper' => :imperative,
59
+ 'кр' => :short,
60
+ 'brev' => :short,
61
+ 'полн' => :full,
62
+ 'plen' => :full,
63
+ 'притяж' => :possessive,
64
+ 'poss' => :possessive,
65
+ 'прев' => :superlative,
66
+ 'supr' => :superlative,
67
+ 'срав' => :comparative,
68
+ 'comp' => :comparative,
69
+ '1-л' => :person1,
70
+ '1p' => :person1,
71
+ '2-л' => :person2,
72
+ '2p' => :person2,
73
+ '3-л' => :person3,
74
+ '3p' => :person3,
75
+ 'муж' => :masculine,
76
+ 'm' => :masculine,
77
+ 'жен' => :feminine,
78
+ 'f' => :feminine,
79
+ 'сред' => :neuter,
80
+ 'n' => :neuter,
81
+ 'несов' => :imperfect,
82
+ 'ipf' => :imperfect,
83
+ 'сов' => :perfect,
84
+ 'pf' => :perfect,
85
+ 'действ' => :active,
86
+ 'act' => :active,
87
+ 'страд' => :passive,
88
+ 'pass' => :passive,
89
+ 'од' => :animated,
90
+ 'anim' => :animated,
91
+ 'неод' => :inanimated,
92
+ 'inan' => :inanimated,
93
+ 'пе' => :transitive,
94
+ 'tran' => :transitive,
95
+ 'нп' => :intransitive,
96
+ 'intr' => :intransitive,
97
+ 'вводн' => :parenth,
98
+ 'parenth' => :parenth,
99
+ 'гео' => :geo,
100
+ 'geo' => :geo,
101
+ 'затр' => :awkward,
102
+ 'awkw' => :awkward,
103
+ 'имя' => :first_name,
104
+ 'persn' => :first_name,
105
+ 'искаж' => :distort,
106
+ 'dist' => :distort,
107
+ 'мж' => :mas_fem,
108
+ 'mf' => :mas_fem,
109
+ 'обсц' => :obscene,
110
+ 'obsc' => :obscene,
111
+ 'отч' => :patronymic,
112
+ 'patrn' => :patronymic,
113
+ 'прдк' => :praedic,
114
+ 'praed' => :praedic,
115
+ 'разг' => :informal,
116
+ 'inform' => :informal,
117
+ 'редк' => :rare,
118
+ 'rare' => :rare,
119
+ 'сокр' => :abbreviation,
120
+ 'abbr' => :abbreviation,
121
+ 'устар' => :obsolete,
122
+ 'obsol' => :obsolete,
123
+ 'фам' => :surname,
124
+ 'famn' => :surname
125
+ }.freeze
126
+
127
+ # Convert an array with mystem character-based grammemes into an MSD.
128
+ #
129
+ def to_msd(grammemes)
130
+ grammemes = grammemes.map { |g| GRAMMEMES[g] }
131
+ grammemes.compact!
132
+ grammemes.map! { |g| Myasorubka::Mystem::Binary::GRAMMEMES.key(g) }
133
+ Myasorubka::Mystem::Binary.to_msd(grammemes)
134
+ end
135
+ end
@@ -0,0 +1,187 @@
1
+ # A wrapper around mystem's internal binary format.
2
+ #
3
+ module Myasorubka::Mystem::Binary extend self
4
+ # https://github.com/yandex/tomita-parser/blob/master/src/library/lemmer/dictlib/yx_gram_enum.h
5
+ GRAMMEMES = {
6
+ 127 => :postposition,
7
+ 128 => :adjective,
8
+ 129 => :adverb,
9
+ 130 => :composite,
10
+ 131 => :conjunction,
11
+ 132 => :interjunction,
12
+ 133 => :numeral,
13
+ 134 => :particle,
14
+ 135 => :preposition,
15
+ 136 => :substantive,
16
+ 137 => :verb,
17
+ 138 => :adj_numeral,
18
+ 139 => :adj_pronoun,
19
+ 140 => :adv_pronoun,
20
+ 141 => :subst_pronoun,
21
+ 142 => :article,
22
+ 143 => :part_of_idiom,
23
+ 144 => :reserved,
24
+ 145 => :abbreviation,
25
+ 146 => :irregular_stem,
26
+ 147 => :informal,
27
+ 148 => :distort,
28
+ 149 => :contracted,
29
+ 150 => :obscene,
30
+ 151 => :rare,
31
+ 152 => :awkward,
32
+ 153 => :obsolete,
33
+ 154 => :subst_adjective,
34
+ 155 => :first_name,
35
+ 156 => :surname,
36
+ 157 => :patronymic,
37
+ 158 => :geo,
38
+ 159 => :proper,
39
+ 160 => :present,
40
+ 161 => :notpast,
41
+ 162 => :past,
42
+ 163 => :future,
43
+ 164 => :past2,
44
+ 165 => :nominative,
45
+ 166 => :genitive,
46
+ 167 => :dative,
47
+ 168 => :accusative,
48
+ 169 => :instrumental,
49
+ 170 => :ablative,
50
+ 171 => :partitive,
51
+ 172 => :locative,
52
+ 173 => :vocative,
53
+ 174 => :singular,
54
+ 175 => :plural,
55
+ 176 => :gerund,
56
+ 177 => :infinitive,
57
+ 178 => :participle,
58
+ 179 => :indicative,
59
+ 180 => :imperative,
60
+ 181 => :conditional,
61
+ 182 => :subjunctive,
62
+ 183 => :short,
63
+ 184 => :full,
64
+ 185 => :superlative,
65
+ 186 => :comparative,
66
+ 187 => :possessive,
67
+ 188 => :person1,
68
+ 189 => :person2,
69
+ 190 => :person3,
70
+ 191 => :feminine,
71
+ 192 => :masculine,
72
+ 193 => :neuter,
73
+ 194 => :mas_fem,
74
+ 195 => :perfect,
75
+ 196 => :imperfect,
76
+ 197 => :passive,
77
+ 198 => :active,
78
+ 199 => :reflexive,
79
+ 200 => :impersonal,
80
+ 201 => :animated,
81
+ 202 => :inanimated,
82
+ 203 => :praedic,
83
+ 204 => :parenth,
84
+ 205 => :transitive,
85
+ 206 => :intransitive,
86
+ 207 => :definite,
87
+ 208 => :indefinite,
88
+ 209 => :sim_conj,
89
+ 210 => :sub_conj,
90
+ 211 => :pronoun_conj,
91
+ 212 => :correlate_conj,
92
+ 213 => :aux_verb
93
+ }.freeze
94
+
95
+ # Convert an array with mystem grammeme codes into an MSD.
96
+ #
97
+ def to_msd(grammemes)
98
+ msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
99
+
100
+ grammemes.sort.each do |code|
101
+ case GRAMMEMES[code]
102
+ # Nomenus
103
+ when :postposition then msd[:pos] = :adposition
104
+ when :adjective then msd[:pos] = :adjective; msd[:type] = :qualificative; msd[:degree] = :positive
105
+ when :adverb then msd[:pos] = :adverb
106
+ when :conjunction then msd[:pos] = :conjunction
107
+ when :interjunction then msd[:pos] = :interjection
108
+ when :numeral then msd[:pos] = :numeral; msd[:type] = :cardinal
109
+ when :particle then msd[:pos] = :particle
110
+ when :preposition then msd[:pos] = :adposition; msd[:type] = :preposition
111
+ when :substantive then msd[:pos] = :noun; msd[:type] = :common
112
+ when :verb then msd[:pos] = :verb; msd[:type] = :main
113
+ when :adj_numeral then msd[:pos] = :numeral; msd[:type] = :ordinal
114
+ when :adj_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :adjectival
115
+ when :adv_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :adverbial
116
+ when :subst_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :nominal
117
+ when :abbreviation then msd[:pos] = :abbreviation
118
+ when :first_name then msd[:type] = :proper
119
+ when :surname then msd[:type] = :proper
120
+ when :patronymic then msd[:type] = :proper
121
+ when :geo then msd[:type] = :proper
122
+ when :proper then msd[:type] = :proper
123
+ # Tempus
124
+ when :present then msd[:tense] = :present
125
+ # TODO: how to handle :notpast tense?
126
+ when :past then msd[:tense] = :past
127
+ when :future then msd[:tense] = :future
128
+ when :past2 then msd[:tense] = :past
129
+ # Casus
130
+ when :nominative then msd[:case] = :nominative
131
+ when :genitive then msd[:case] = :genitive
132
+ when :dative then msd[:case] = :dative
133
+ when :accusative then msd[:case] = :accusative
134
+ when :instrumental then msd[:case] = :instrumental
135
+ when :ablative then msd[:case] = :genitive
136
+ when :partitive then msd[:case] = :genitive; msd[:case2] = :partitive
137
+ when :locative then msd[:case] = :genitive; msd[:case2] = :locative
138
+ when :vocative then msd[:case] = :vocative
139
+ # Numerus
140
+ when :singular then msd[:number] = :singular
141
+ when :plural then msd[:number] = :plural
142
+ # Modus
143
+ when :gerund then msd[:vform] = :gerund
144
+ when :infinitive then msd[:vform] = :infinitive
145
+ when :participle then msd[:vform] = :participle
146
+ when :indicative then msd[:vform] = :indicative
147
+ when :imperative then msd[:vform] = :imperative
148
+ when :conditional then msd[:vform] = :conditional
149
+ # Gradus
150
+ when :short then msd[:definiteness] = :short_art
151
+ when :full then msd[:definiteness] = :full_art
152
+ when :superlative then msd[:degree] = :superlative
153
+ when :comparative then msd[:degree] = :comparative
154
+ when :possessive then msd[:type] = :possessive
155
+ # Personae
156
+ when :person1 then msd[:person] = :first
157
+ when :person2 then msd[:person] = :second
158
+ when :person3 then msd[:person] = :third
159
+ # Gender
160
+ when :feminine then msd[:gender] = :feminine
161
+ when :masculine then msd[:gender] = :masculine
162
+ when :neuter then msd[:gender] = :neuter
163
+ when :mas_fem then msd[:gender] = :common
164
+ # Perfectum-Imperfectum
165
+ when :perfect then msd[:aspect] = :perfective
166
+ when :imperfect then msd[:aspect] = :progressive
167
+ # Voice
168
+ when :passive then msd[:voice] = :passive
169
+ when :active then msd[:voice] = :active
170
+ when :reflexive then msd[:type] = :reflexive
171
+ # Animated
172
+ when :animated then msd[:animate] = :yes
173
+ when :inanimated then msd[:animate] = :no
174
+ # Definiteness
175
+ when :definite then msd[:definiteness] = :full_art
176
+ when :indefinite then msd[:definiteness] = :short_art
177
+ # Conjunction and verb types
178
+ when :sim_conj then msd[:type] = :coordinating
179
+ when :sub_conj then msd[:type] = :subordinating
180
+ when :aux_verb then msd[:type] = :auxiliary
181
+ else
182
+ end
183
+ end
184
+
185
+ msd.prune!
186
+ end
187
+ end
@@ -0,0 +1,123 @@
1
+ # encoding: utf-8
2
+
3
+ # The Penn Treebank Project annotates naturally-occurring text for
4
+ # linguistic structure. Most notably, we produce skeletal parses
5
+ # showing rough syntactic and semantic information — a bank of
6
+ # linguistic trees.
7
+ #
8
+ # Treebanks are often created on top of a corpus that has already been
9
+ # annotated with part-of-speech tags. In turn, treebanks are sometimes
10
+ # enhanced with semantic or other linguistic information.
11
+ #
12
+ module Myasorubka::Treebank
13
+ extend self
14
+
15
+ # Convert the given tag from English Penn Treebank format to the English
16
+ # representation in the MULTEXT-East format.
17
+ #
18
+ def english(tag)
19
+ msd = Myasorubka::MSD.new(Myasorubka::MSD::English)
20
+
21
+ case tag
22
+ when 'CC' then
23
+ msd[:pos] = :conjunction
24
+ msd[:type] = :coordinating
25
+ when 'CD' then
26
+ msd[:pos] = :numeral
27
+ msd[:type] = :cardinal
28
+ when 'DT' then
29
+ msd[:pos] = :determiner
30
+ when 'IN' then
31
+ msd[:pos] = :conjunction
32
+ msd[:type] = :subordinating
33
+ when 'JJ' then
34
+ msd[:pos] = :adjective
35
+ when 'JJR' then
36
+ msd[:pos] = :adjective
37
+ msd[:degree] = :comparative
38
+ when 'JJS' then
39
+ msd[:pos] = :adjective
40
+ msd[:degree] = :superlative
41
+ when 'MD' then
42
+ msd[:pos] = :verb
43
+ msd[:type] = :modal
44
+ when 'NN' then
45
+ msd[:pos] = :noun
46
+ msd[:type] = :common
47
+ msd[:number] = :singular
48
+ when 'NNS'
49
+ msd[:pos] = :noun
50
+ msd[:type] = :common
51
+ msd[:number] = :plural
52
+ when 'NP'
53
+ msd[:pos] = :noun
54
+ msd[:type] = :proper
55
+ msd[:number] = :singular
56
+ when 'NPS'
57
+ msd[:pos] = :noun
58
+ msd[:type] = :proper
59
+ msd[:number] = :plural
60
+ when 'PDT' then
61
+ msd[:pos] = :determiner
62
+ when 'PP' then
63
+ msd[:pos] = :pronoun
64
+ msd[:type] = :personal
65
+ when 'PP$' then
66
+ msd[:pos] = :pronoun
67
+ msd[:type] = :possessive
68
+ when 'RB' then
69
+ msd[:pos] = :adverb
70
+ when 'RBR' then
71
+ msd[:pos] = :adverb
72
+ msd[:degree] = :comparative
73
+ when 'RBS' then
74
+ msd[:pos] = :adverb
75
+ msd[:degree] = :superlative
76
+ when 'TO' then
77
+ msd[:pos] = :determiner
78
+ when 'UH' then
79
+ msd[:pos] = :interjection
80
+ when 'VB' then
81
+ msd[:pos] = :verb
82
+ msd[:type] = :base
83
+ when 'VBD' then
84
+ msd[:pos] = :verb
85
+ msd[:type] = :base
86
+ msd[:tense] = :past
87
+ when 'VBG' then
88
+ msd[:pos] = :verb
89
+ msd[:type] = :base
90
+ msd[:vform] = :participle
91
+ msd[:tense] = :present
92
+ when 'VBN' then
93
+ msd[:pos] = :verb
94
+ msd[:type] = :base
95
+ msd[:vform] = :participle
96
+ msd[:tense] = :past
97
+ when 'VBP' then
98
+ msd[:pos] = :verb
99
+ msd[:type] = :base
100
+ msd[:tense] = :present
101
+ msd[:number] = :singular
102
+ when 'VBZ' then
103
+ msd[:pos] = :verb
104
+ msd[:type] = :base
105
+ msd[:tense] = :present
106
+ msd[:person] = :third
107
+ msd[:number] = :singular
108
+ when 'WDT' then
109
+ msd[:pos] = :determiner
110
+ when 'WP' then
111
+ msd[:pos] = :pronoun
112
+ when 'WP$' then
113
+ msd[:pos] = :pronoun
114
+ msd[:type] = :possessive
115
+ when 'WRB' then
116
+ msd[:pos] = :adverb
117
+ else
118
+ msd[:pos] = :residual
119
+ end
120
+
121
+ msd
122
+ end
123
+ end
@@ -0,0 +1,46 @@
1
+ # This module provides downcase and upcase methods designed for Russian.
2
+ # The original code was written by Andrew Kozlov for the Petrovich library.
3
+ #
4
+ # https://github.com/petrovich/petrovich-ruby/blob/df705075542979ab85e1f2bf9a2024b1c0813e1a/lib/petrovich/unicode.rb
5
+ #
6
+ module Myasorubka::Unicode extend self
7
+ # Russian capital letters.
8
+ #
9
+ RU_UPPERCASE = [
10
+ "\u0410", "\u0411", "\u0412", "\u0413", "\u0414", "\u0415", "\u0416", "\u0417",
11
+ "\u0418", "\u0419", "\u041A", "\u041B", "\u041C", "\u041D", "\u041E", "\u041F",
12
+ "\u0420", "\u0421", "\u0422", "\u0423", "\u0424", "\u0425", "\u0426", "\u0427",
13
+ "\u0428", "\u0429", "\u042A", "\u042B", "\u042C", "\u042D", "\u042E", "\u042F",
14
+ "\u0401" # Ё
15
+ ].join
16
+
17
+ # Russian small letters.
18
+ #
19
+ RU_LOWERCASE = [
20
+ "\u0430", "\u0431", "\u0432", "\u0433", "\u0434", "\u0435", "\u0436", "\u0437",
21
+ "\u0438", "\u0439", "\u043A", "\u043B", "\u043C", "\u043D", "\u043E", "\u043F",
22
+ "\u0440", "\u0441", "\u0442", "\u0443", "\u0444", "\u0445", "\u0446", "\u0447",
23
+ "\u0448", "\u0449", "\u044A", "\u044B", "\u044C", "\u044D", "\u044E", "\u044F",
24
+ "\u0451" # ё
25
+ ].join
26
+
27
+ # Returns a copy of the given string having replaced
28
+ # capital Russian letters with small ones.
29
+ #
30
+ # @param string [String] a string.
31
+ # @return [String] a new string.
32
+ #
33
+ def downcase(string)
34
+ string.tr(RU_UPPERCASE, RU_LOWERCASE).tap(&:downcase!)
35
+ end
36
+
37
+ # Returns a copy of the given string having replaced
38
+ # small Russian letters with capital ones.
39
+ #
40
+ # @param string [String] a string.
41
+ # @return [String] a new string.
42
+ #
43
+ def upcase(string)
44
+ string.tr(RU_LOWERCASE, RU_UPPERCASE).tap(&:upcase!)
45
+ end
46
+ end
@@ -5,5 +5,5 @@
5
5
  module Myasorubka
6
6
  # Version of Myasorubka.
7
7
  #
8
- VERSION = '0.1.1'
8
+ VERSION = '0.2.0'
9
9
  end
@@ -12,17 +12,11 @@ Gem::Specification.new do |spec|
12
12
  spec.description = 'Myasorubka is a morphological data processor.'
13
13
  spec.summary = 'Myasorubka is a morphological data proceesor ' \
14
14
  'that supports AOT and MULTEXT-East notations.'
15
- spec.homepage = 'https://github.com/ustalov/myasorubka'
15
+ spec.homepage = 'https://github.com/dustalov/myasorubka'
16
16
  spec.license = 'MIT'
17
17
 
18
18
  spec.files = `git ls-files`.split($/)
19
19
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
20
20
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
21
21
  spec.require_paths = ['lib']
22
-
23
- spec.add_development_dependency 'bundler', '~> 1.3'
24
- spec.add_development_dependency 'minitest', '>= 2.11'
25
- spec.add_development_dependency 'rake'
26
-
27
- spec.add_dependency 'unicode_utils', '~> 1.4'
28
22
  end
@@ -83,6 +83,19 @@ module Myasorubka
83
83
  ('Vmp' =~ re).must_equal 0
84
84
  ('Nc-pl' =~ re).must_be_nil
85
85
  end
86
+
87
+ it 'can be pruned and became valid when the category is wrong' do
88
+ subject[:pos] = :zalupa
89
+ subject.prune!
90
+ subject.must_be :valid?
91
+ end
92
+
93
+ it 'can be pruned and became valid when an attribute is wrong' do
94
+ subject[:pos] = :verb
95
+ subject[:animate] = :yes
96
+ subject.prune!
97
+ subject.must_be :valid?
98
+ end
86
99
  end
87
100
 
88
101
  describe 'Generator' do
@@ -4,11 +4,9 @@ require 'rubygems'
4
4
 
5
5
  $:.unshift File.expand_path('../../lib', __FILE__)
6
6
 
7
- if RUBY_VERSION == '1.8'
8
- gem 'minitest'
9
- end
10
-
7
+ gem 'minitest'
11
8
  require 'minitest/autorun'
9
+ require 'minitest/hell'
12
10
 
13
11
  require 'myasorubka'
14
12
  require 'myasorubka/aot'
metadata CHANGED
@@ -1,71 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: myasorubka
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-05-07 00:00:00.000000000 Z
12
- dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: bundler
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ~>
18
- - !ruby/object:Gem::Version
19
- version: '1.3'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ~>
25
- - !ruby/object:Gem::Version
26
- version: '1.3'
27
- - !ruby/object:Gem::Dependency
28
- name: minitest
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - '>='
32
- - !ruby/object:Gem::Version
33
- version: '2.11'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - '>='
39
- - !ruby/object:Gem::Version
40
- version: '2.11'
41
- - !ruby/object:Gem::Dependency
42
- name: rake
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - '>='
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - '>='
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: unicode_utils
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ~>
60
- - !ruby/object:Gem::Version
61
- version: '1.4'
62
- type: :runtime
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ~>
67
- - !ruby/object:Gem::Version
68
- version: '1.4'
11
+ date: 2015-10-17 00:00:00.000000000 Z
12
+ dependencies: []
69
13
  description: Myasorubka is a morphological data processor.
70
14
  email:
71
15
  - dmitry@eveel.ru
@@ -73,13 +17,12 @@ executables: []
73
17
  extensions: []
74
18
  extra_rdoc_files: []
75
19
  files:
76
- - .gitignore
77
- - .travis.yml
20
+ - ".gitignore"
21
+ - ".travis.yml"
78
22
  - Gemfile
79
23
  - LICENSE.txt
80
24
  - README.md
81
25
  - Rakefile
82
- - aot-russian
83
26
  - lib/myasorubka.rb
84
27
  - lib/myasorubka/aot.rb
85
28
  - lib/myasorubka/aot/dictionary.rb
@@ -88,13 +31,17 @@ files:
88
31
  - lib/myasorubka/msd.rb
89
32
  - lib/myasorubka/msd/english.rb
90
33
  - lib/myasorubka/msd/russian.rb
34
+ - lib/myasorubka/mystem.rb
35
+ - lib/myasorubka/mystem/binary.rb
36
+ - lib/myasorubka/treebank.rb
37
+ - lib/myasorubka/unicode.rb
91
38
  - lib/myasorubka/version.rb
92
39
  - myasorubka.gemspec
93
40
  - spec/data/russian.tsv
94
41
  - spec/msd/russian_spec.rb
95
42
  - spec/msd_spec.rb
96
43
  - spec/spec_helper.rb
97
- homepage: https://github.com/ustalov/myasorubka
44
+ homepage: https://github.com/dustalov/myasorubka
98
45
  licenses:
99
46
  - MIT
100
47
  metadata: {}
@@ -104,17 +51,17 @@ require_paths:
104
51
  - lib
105
52
  required_ruby_version: !ruby/object:Gem::Requirement
106
53
  requirements:
107
- - - '>='
54
+ - - ">="
108
55
  - !ruby/object:Gem::Version
109
56
  version: '0'
110
57
  required_rubygems_version: !ruby/object:Gem::Requirement
111
58
  requirements:
112
- - - '>='
59
+ - - ">="
113
60
  - !ruby/object:Gem::Version
114
61
  version: '0'
115
62
  requirements: []
116
63
  rubyforge_project:
117
- rubygems_version: 2.0.3
64
+ rubygems_version: 2.4.8
118
65
  signing_key:
119
66
  specification_version: 4
120
67
  summary: Myasorubka is a morphological data proceesor that supports AOT and MULTEXT-East
@@ -1,7 +0,0 @@
1
- #!/bin/sh
2
- rake clean aot \
3
- mrd=morphs.mrd \
4
- tab=rgramtab.tab \
5
- encoding=CP1251 \
6
- language=russian \
7
- --trace