myasorubka 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +5 -5
- data/Gemfile +7 -0
- data/LICENSE.txt +1 -1
- data/README.md +15 -7
- data/Rakefile +1 -0
- data/lib/myasorubka.rb +1 -0
- data/lib/myasorubka/aot/tags.rb +2 -2
- data/lib/myasorubka/msd.rb +33 -11
- data/lib/myasorubka/msd/english.rb +0 -3
- data/lib/myasorubka/msd/russian.rb +0 -3
- data/lib/myasorubka/mystem.rb +135 -0
- data/lib/myasorubka/mystem/binary.rb +187 -0
- data/lib/myasorubka/treebank.rb +123 -0
- data/lib/myasorubka/unicode.rb +46 -0
- data/lib/myasorubka/version.rb +1 -1
- data/myasorubka.gemspec +1 -7
- data/spec/msd_spec.rb +13 -0
- data/spec/spec_helper.rb +2 -4
- metadata +13 -66
- data/aot-russian +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8e16d63b366b3abc035827745c49a22cdf28159a
|
4
|
+
data.tar.gz: b551b32694f999c898d3c898210a9ea5439e22c9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 343b769967295d8fd6f1860fcd9f2c1b37e9d8f05b1aee00a54c460a242fe79190c9c993691dfbd236540b71473e2e17b013730cd0db37ca25b66cbff71f2926
|
7
|
+
data.tar.gz: 913cdc0cd36ef38e6ffdc4740678f6872107d0cb49d79710a73f7955a6656f7453c28b1f48a488be35a523b59764779c1e461eba94d32a0574fe6eb4ad25d17a
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -5,7 +5,17 @@ Myasorubka is a morphological data processor that supports
|
|
5
5
|
[AOT](http://aot.ru) and [MULTEXT-East](http://nl.ijs.si/ME/)
|
6
6
|
notations.
|
7
7
|
|
8
|
+
[![Gem Version][badge_fury_badge]][badge_fury_link] [![Build Status][travis_ci_badge]][travis_ci_link] [![Dependency Status][gemnasium_badge]][gemnasium_link]
|
9
|
+
|
10
|
+
[badge_fury_badge]: https://badge.fury.io/rb/myasorubka.svg
|
11
|
+
[badge_fury_link]: https://badge.fury.io/rb/myasorubka
|
12
|
+
[travis_ci_badge]: https://travis-ci.org/dustalov/myasorubka.svg
|
13
|
+
[travis_ci_link]: https://travis-ci.org/dustalov/myasorubka
|
14
|
+
[gemnasium_badge]: https://gemnasium.com/dustalov/myasorubka.svg
|
15
|
+
[gemnasium_link]: https://gemnasium.com/dustalov/myasorubka
|
16
|
+
|
8
17
|
## MULTEXT-East morphosyntactic descriptors
|
18
|
+
|
9
19
|
It is possible to process the MULTEXT-East morphosyntactic descriptors
|
10
20
|
(MSDs) in a convenient way.
|
11
21
|
|
@@ -51,7 +61,8 @@ Also, the `Myasorubka::MSD` class allows to write MSDs.
|
|
51
61
|
```
|
52
62
|
|
53
63
|
## AOT dictionaries
|
54
|
-
|
64
|
+
|
65
|
+
Myasorubka provides simple parsers for lexicon in the [AOT](http://aot.ru)
|
55
66
|
format, both for gramtab and dictionary files.
|
56
67
|
|
57
68
|
```ruby
|
@@ -104,11 +115,8 @@ You can learn more about AOT lexicon from the
|
|
104
115
|
4. Push to the branch (`git push origin my-new-feature`);
|
105
116
|
5. Create new Pull Request.
|
106
117
|
|
107
|
-
## Build Status [<img src="https://secure.travis-ci.org/ustalov/myasorubka.png"/>](http://travis-ci.org/ustalov/myasorubka)
|
108
|
-
|
109
|
-
## Dependency Status [<img src="https://gemnasium.com/ustalov/myasorubka.png"/>](https://gemnasium.com/ustalov/myasorubka)
|
110
|
-
|
111
118
|
## Copyright
|
112
|
-
Copyright (c) 2011-2013 [Dmitry Ustalov]. See LICENSE for details.
|
113
119
|
|
114
|
-
[Dmitry Ustalov]
|
120
|
+
Copyright (c) 2011-2015 [Dmitry Ustalov]. See LICENSE for details.
|
121
|
+
|
122
|
+
[Dmitry Ustalov]: https://ustalov.name/
|
data/Rakefile
CHANGED
data/lib/myasorubka.rb
CHANGED
data/lib/myasorubka/aot/tags.rb
CHANGED
@@ -133,7 +133,7 @@ class Myasorubka::AOT::Tags
|
|
133
133
|
#
|
134
134
|
def self.russian(pos_line, grammemes_line)
|
135
135
|
grammemes = grammemes_line.split(',').map do |grammeme|
|
136
|
-
|
136
|
+
Myasorubka::Unicode.downcase(grammeme)
|
137
137
|
end
|
138
138
|
|
139
139
|
msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
|
@@ -148,7 +148,7 @@ class Myasorubka::AOT::Tags
|
|
148
148
|
pos_line = 'АББР'
|
149
149
|
end
|
150
150
|
|
151
|
-
case
|
151
|
+
case Myasorubka::Unicode.upcase(pos_line)
|
152
152
|
when 'С' then begin
|
153
153
|
msd[:pos] = :noun
|
154
154
|
msd[:type] = if (grammemes & [ 'имя', 'фам', 'отч', 'жарг', 'арх', 'проф', 'опч' ]).empty?
|
data/lib/myasorubka/msd.rb
CHANGED
@@ -153,26 +153,23 @@ class Myasorubka::MSD
|
|
153
153
|
raise InvalidDescriptor, "category is nil"
|
154
154
|
end
|
155
155
|
|
156
|
+
attributes = category[:attrs]
|
156
157
|
msd = [category[:code]]
|
157
158
|
|
158
|
-
|
159
|
-
grammemes.each do |attr_name, value|
|
159
|
+
grammemes.each do |attribute, value|
|
160
160
|
next unless value
|
161
161
|
|
162
|
-
|
163
|
-
|
164
|
-
raise InvalidDescriptor, 'no such attribute "%s" of category "%s"' %
|
165
|
-
[attr_name, pos]
|
162
|
+
unless index = attributes.index { |name, _| name == attribute }
|
163
|
+
raise InvalidDescriptor, 'no such attribute "%s" of category "%s"' % [attribute, pos]
|
166
164
|
end
|
167
165
|
|
168
|
-
|
166
|
+
_, values = attributes[index]
|
169
167
|
|
170
|
-
unless
|
171
|
-
raise InvalidDescriptor, 'no such attribute "%s" '
|
172
|
-
'for attribute "%s" of category "%s"' % [value, attr_name, pos]
|
168
|
+
unless attribute_value = values[value]
|
169
|
+
raise InvalidDescriptor, 'no such value "%s" for attribute "%s" of category "%s"' % [value, attribute, pos]
|
173
170
|
end
|
174
171
|
|
175
|
-
msd[
|
172
|
+
msd[index + 1] = attribute_value
|
176
173
|
end
|
177
174
|
|
178
175
|
msd.map { |e| e || EMPTY_DESCRIPTOR }.join
|
@@ -188,6 +185,31 @@ class Myasorubka::MSD
|
|
188
185
|
false
|
189
186
|
end
|
190
187
|
|
188
|
+
# Drop every attribute that does not appear in the category.
|
189
|
+
#
|
190
|
+
# @return [MSD] self.
|
191
|
+
#
|
192
|
+
def prune!
|
193
|
+
unless category = language::CATEGORIES[pos]
|
194
|
+
self.pos = nil
|
195
|
+
grammemes.clear
|
196
|
+
return self
|
197
|
+
end
|
198
|
+
|
199
|
+
attributes = category[:attrs]
|
200
|
+
|
201
|
+
grammemes.reject! do |attribute, value|
|
202
|
+
if index = attributes.index { |name, _| name == attribute }
|
203
|
+
_, values = attributes[index]
|
204
|
+
!values[value]
|
205
|
+
else
|
206
|
+
true
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
self
|
211
|
+
end
|
212
|
+
|
191
213
|
protected
|
192
214
|
# @private
|
193
215
|
def parse! msd_line
|
@@ -0,0 +1,135 @@
|
|
1
|
+
# https://tech.yandex.ru/mystem/
|
2
|
+
module Myasorubka::Mystem extend self
|
3
|
+
require 'myasorubka/mystem/binary'
|
4
|
+
|
5
|
+
# https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
|
6
|
+
GRAMMEMES = {
|
7
|
+
'A' => :adjective,
|
8
|
+
'ADV' => :adverb,
|
9
|
+
'ADVPRO' => :adv_pronoun,
|
10
|
+
'ANUM' => :adj_numeral,
|
11
|
+
'APRO' => :adj_pronoun,
|
12
|
+
'COM' => :composite,
|
13
|
+
'CONJ' => :conjunction,
|
14
|
+
'INTJ' => :interjunction,
|
15
|
+
'NUM' => :numeral,
|
16
|
+
'PART' => :particle,
|
17
|
+
'PR' => :preposition,
|
18
|
+
'S' => :substantive,
|
19
|
+
'SPRO' => :subst_pronoun,
|
20
|
+
'V' => :verb,
|
21
|
+
'наст' => :present,
|
22
|
+
'praes' => :present,
|
23
|
+
'непрош' => :notpast,
|
24
|
+
'inpraes' => :notpast,
|
25
|
+
'прош' => :past,
|
26
|
+
'praet' => :past,
|
27
|
+
'им' => :nominative,
|
28
|
+
'nom' => :nominative,
|
29
|
+
'род' => :genitive,
|
30
|
+
'gen' => :genitive,
|
31
|
+
'дат' => :dative,
|
32
|
+
'dat' => :dative,
|
33
|
+
'вин' => :accusative,
|
34
|
+
'acc' => :accusative,
|
35
|
+
'твор' => :instrumental,
|
36
|
+
'ins' => :instrumental,
|
37
|
+
'пр' => :ablative,
|
38
|
+
'abl' => :ablative,
|
39
|
+
'парт' => :partitive,
|
40
|
+
'part' => :partitive,
|
41
|
+
'местн' => :locative,
|
42
|
+
'loc' => :locative,
|
43
|
+
'зват' => :vocative,
|
44
|
+
'voc' => :vocative,
|
45
|
+
'ед' => :singular,
|
46
|
+
'sg' => :singular,
|
47
|
+
'мн' => :plural,
|
48
|
+
'pl' => :plural,
|
49
|
+
'деепр' => :gerund,
|
50
|
+
'ger' => :gerund,
|
51
|
+
'инф' => :infinitive,
|
52
|
+
'inf' => :infinitive,
|
53
|
+
'прич' => :participle,
|
54
|
+
'partcp' => :participle,
|
55
|
+
'изъяв' => :indicative,
|
56
|
+
'indic' => :indicative,
|
57
|
+
'пов' => :imperative,
|
58
|
+
'imper' => :imperative,
|
59
|
+
'кр' => :short,
|
60
|
+
'brev' => :short,
|
61
|
+
'полн' => :full,
|
62
|
+
'plen' => :full,
|
63
|
+
'притяж' => :possessive,
|
64
|
+
'poss' => :possessive,
|
65
|
+
'прев' => :superlative,
|
66
|
+
'supr' => :superlative,
|
67
|
+
'срав' => :comparative,
|
68
|
+
'comp' => :comparative,
|
69
|
+
'1-л' => :person1,
|
70
|
+
'1p' => :person1,
|
71
|
+
'2-л' => :person2,
|
72
|
+
'2p' => :person2,
|
73
|
+
'3-л' => :person3,
|
74
|
+
'3p' => :person3,
|
75
|
+
'муж' => :masculine,
|
76
|
+
'm' => :masculine,
|
77
|
+
'жен' => :feminine,
|
78
|
+
'f' => :feminine,
|
79
|
+
'сред' => :neuter,
|
80
|
+
'n' => :neuter,
|
81
|
+
'несов' => :imperfect,
|
82
|
+
'ipf' => :imperfect,
|
83
|
+
'сов' => :perfect,
|
84
|
+
'pf' => :perfect,
|
85
|
+
'действ' => :active,
|
86
|
+
'act' => :active,
|
87
|
+
'страд' => :passive,
|
88
|
+
'pass' => :passive,
|
89
|
+
'од' => :animated,
|
90
|
+
'anim' => :animated,
|
91
|
+
'неод' => :inanimated,
|
92
|
+
'inan' => :inanimated,
|
93
|
+
'пе' => :transitive,
|
94
|
+
'tran' => :transitive,
|
95
|
+
'нп' => :intransitive,
|
96
|
+
'intr' => :intransitive,
|
97
|
+
'вводн' => :parenth,
|
98
|
+
'parenth' => :parenth,
|
99
|
+
'гео' => :geo,
|
100
|
+
'geo' => :geo,
|
101
|
+
'затр' => :awkward,
|
102
|
+
'awkw' => :awkward,
|
103
|
+
'имя' => :first_name,
|
104
|
+
'persn' => :first_name,
|
105
|
+
'искаж' => :distort,
|
106
|
+
'dist' => :distort,
|
107
|
+
'мж' => :mas_fem,
|
108
|
+
'mf' => :mas_fem,
|
109
|
+
'обсц' => :obscene,
|
110
|
+
'obsc' => :obscene,
|
111
|
+
'отч' => :patronymic,
|
112
|
+
'patrn' => :patronymic,
|
113
|
+
'прдк' => :praedic,
|
114
|
+
'praed' => :praedic,
|
115
|
+
'разг' => :informal,
|
116
|
+
'inform' => :informal,
|
117
|
+
'редк' => :rare,
|
118
|
+
'rare' => :rare,
|
119
|
+
'сокр' => :abbreviation,
|
120
|
+
'abbr' => :abbreviation,
|
121
|
+
'устар' => :obsolete,
|
122
|
+
'obsol' => :obsolete,
|
123
|
+
'фам' => :surname,
|
124
|
+
'famn' => :surname
|
125
|
+
}.freeze
|
126
|
+
|
127
|
+
# Convert an array with mystem character-based grammemes into an MSD.
|
128
|
+
#
|
129
|
+
def to_msd(grammemes)
|
130
|
+
grammemes = grammemes.map { |g| GRAMMEMES[g] }
|
131
|
+
grammemes.compact!
|
132
|
+
grammemes.map! { |g| Myasorubka::Mystem::Binary::GRAMMEMES.key(g) }
|
133
|
+
Myasorubka::Mystem::Binary.to_msd(grammemes)
|
134
|
+
end
|
135
|
+
end
|
@@ -0,0 +1,187 @@
|
|
1
|
+
# A wrapper around mystem's internal binary format.
|
2
|
+
#
|
3
|
+
module Myasorubka::Mystem::Binary extend self
|
4
|
+
# https://github.com/yandex/tomita-parser/blob/master/src/library/lemmer/dictlib/yx_gram_enum.h
|
5
|
+
GRAMMEMES = {
|
6
|
+
127 => :postposition,
|
7
|
+
128 => :adjective,
|
8
|
+
129 => :adverb,
|
9
|
+
130 => :composite,
|
10
|
+
131 => :conjunction,
|
11
|
+
132 => :interjunction,
|
12
|
+
133 => :numeral,
|
13
|
+
134 => :particle,
|
14
|
+
135 => :preposition,
|
15
|
+
136 => :substantive,
|
16
|
+
137 => :verb,
|
17
|
+
138 => :adj_numeral,
|
18
|
+
139 => :adj_pronoun,
|
19
|
+
140 => :adv_pronoun,
|
20
|
+
141 => :subst_pronoun,
|
21
|
+
142 => :article,
|
22
|
+
143 => :part_of_idiom,
|
23
|
+
144 => :reserved,
|
24
|
+
145 => :abbreviation,
|
25
|
+
146 => :irregular_stem,
|
26
|
+
147 => :informal,
|
27
|
+
148 => :distort,
|
28
|
+
149 => :contracted,
|
29
|
+
150 => :obscene,
|
30
|
+
151 => :rare,
|
31
|
+
152 => :awkward,
|
32
|
+
153 => :obsolete,
|
33
|
+
154 => :subst_adjective,
|
34
|
+
155 => :first_name,
|
35
|
+
156 => :surname,
|
36
|
+
157 => :patronymic,
|
37
|
+
158 => :geo,
|
38
|
+
159 => :proper,
|
39
|
+
160 => :present,
|
40
|
+
161 => :notpast,
|
41
|
+
162 => :past,
|
42
|
+
163 => :future,
|
43
|
+
164 => :past2,
|
44
|
+
165 => :nominative,
|
45
|
+
166 => :genitive,
|
46
|
+
167 => :dative,
|
47
|
+
168 => :accusative,
|
48
|
+
169 => :instrumental,
|
49
|
+
170 => :ablative,
|
50
|
+
171 => :partitive,
|
51
|
+
172 => :locative,
|
52
|
+
173 => :vocative,
|
53
|
+
174 => :singular,
|
54
|
+
175 => :plural,
|
55
|
+
176 => :gerund,
|
56
|
+
177 => :infinitive,
|
57
|
+
178 => :participle,
|
58
|
+
179 => :indicative,
|
59
|
+
180 => :imperative,
|
60
|
+
181 => :conditional,
|
61
|
+
182 => :subjunctive,
|
62
|
+
183 => :short,
|
63
|
+
184 => :full,
|
64
|
+
185 => :superlative,
|
65
|
+
186 => :comparative,
|
66
|
+
187 => :possessive,
|
67
|
+
188 => :person1,
|
68
|
+
189 => :person2,
|
69
|
+
190 => :person3,
|
70
|
+
191 => :feminine,
|
71
|
+
192 => :masculine,
|
72
|
+
193 => :neuter,
|
73
|
+
194 => :mas_fem,
|
74
|
+
195 => :perfect,
|
75
|
+
196 => :imperfect,
|
76
|
+
197 => :passive,
|
77
|
+
198 => :active,
|
78
|
+
199 => :reflexive,
|
79
|
+
200 => :impersonal,
|
80
|
+
201 => :animated,
|
81
|
+
202 => :inanimated,
|
82
|
+
203 => :praedic,
|
83
|
+
204 => :parenth,
|
84
|
+
205 => :transitive,
|
85
|
+
206 => :intransitive,
|
86
|
+
207 => :definite,
|
87
|
+
208 => :indefinite,
|
88
|
+
209 => :sim_conj,
|
89
|
+
210 => :sub_conj,
|
90
|
+
211 => :pronoun_conj,
|
91
|
+
212 => :correlate_conj,
|
92
|
+
213 => :aux_verb
|
93
|
+
}.freeze
|
94
|
+
|
95
|
+
# Convert an array with mystem grammeme codes into an MSD.
|
96
|
+
#
|
97
|
+
def to_msd(grammemes)
|
98
|
+
msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
|
99
|
+
|
100
|
+
grammemes.sort.each do |code|
|
101
|
+
case GRAMMEMES[code]
|
102
|
+
# Nomenus
|
103
|
+
when :postposition then msd[:pos] = :adposition
|
104
|
+
when :adjective then msd[:pos] = :adjective; msd[:type] = :qualificative; msd[:degree] = :positive
|
105
|
+
when :adverb then msd[:pos] = :adverb
|
106
|
+
when :conjunction then msd[:pos] = :conjunction
|
107
|
+
when :interjunction then msd[:pos] = :interjection
|
108
|
+
when :numeral then msd[:pos] = :numeral; msd[:type] = :cardinal
|
109
|
+
when :particle then msd[:pos] = :particle
|
110
|
+
when :preposition then msd[:pos] = :adposition; msd[:type] = :preposition
|
111
|
+
when :substantive then msd[:pos] = :noun; msd[:type] = :common
|
112
|
+
when :verb then msd[:pos] = :verb; msd[:type] = :main
|
113
|
+
when :adj_numeral then msd[:pos] = :numeral; msd[:type] = :ordinal
|
114
|
+
when :adj_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :adjectival
|
115
|
+
when :adv_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :adverbial
|
116
|
+
when :subst_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :nominal
|
117
|
+
when :abbreviation then msd[:pos] = :abbreviation
|
118
|
+
when :first_name then msd[:type] = :proper
|
119
|
+
when :surname then msd[:type] = :proper
|
120
|
+
when :patronymic then msd[:type] = :proper
|
121
|
+
when :geo then msd[:type] = :proper
|
122
|
+
when :proper then msd[:type] = :proper
|
123
|
+
# Tempus
|
124
|
+
when :present then msd[:tense] = :present
|
125
|
+
# TODO: how to handle :notpast tense?
|
126
|
+
when :past then msd[:tense] = :past
|
127
|
+
when :future then msd[:tense] = :future
|
128
|
+
when :past2 then msd[:tense] = :past
|
129
|
+
# Casus
|
130
|
+
when :nominative then msd[:case] = :nominative
|
131
|
+
when :genitive then msd[:case] = :genitive
|
132
|
+
when :dative then msd[:case] = :dative
|
133
|
+
when :accusative then msd[:case] = :accusative
|
134
|
+
when :instrumental then msd[:case] = :instrumental
|
135
|
+
when :ablative then msd[:case] = :genitive
|
136
|
+
when :partitive then msd[:case] = :genitive; msd[:case2] = :partitive
|
137
|
+
when :locative then msd[:case] = :genitive; msd[:case2] = :locative
|
138
|
+
when :vocative then msd[:case] = :vocative
|
139
|
+
# Numerus
|
140
|
+
when :singular then msd[:number] = :singular
|
141
|
+
when :plural then msd[:number] = :plural
|
142
|
+
# Modus
|
143
|
+
when :gerund then msd[:vform] = :gerund
|
144
|
+
when :infinitive then msd[:vform] = :infinitive
|
145
|
+
when :participle then msd[:vform] = :participle
|
146
|
+
when :indicative then msd[:vform] = :indicative
|
147
|
+
when :imperative then msd[:vform] = :imperative
|
148
|
+
when :conditional then msd[:vform] = :conditional
|
149
|
+
# Gradus
|
150
|
+
when :short then msd[:definiteness] = :short_art
|
151
|
+
when :full then msd[:definiteness] = :full_art
|
152
|
+
when :superlative then msd[:degree] = :superlative
|
153
|
+
when :comparative then msd[:degree] = :comparative
|
154
|
+
when :possessive then msd[:type] = :possessive
|
155
|
+
# Personae
|
156
|
+
when :person1 then msd[:person] = :first
|
157
|
+
when :person2 then msd[:person] = :second
|
158
|
+
when :person3 then msd[:person] = :third
|
159
|
+
# Gender
|
160
|
+
when :feminine then msd[:gender] = :feminine
|
161
|
+
when :masculine then msd[:gender] = :masculine
|
162
|
+
when :neuter then msd[:gender] = :neuter
|
163
|
+
when :mas_fem then msd[:gender] = :common
|
164
|
+
# Perfectum-Imperfectum
|
165
|
+
when :perfect then msd[:aspect] = :perfective
|
166
|
+
when :imperfect then msd[:aspect] = :progressive
|
167
|
+
# Voice
|
168
|
+
when :passive then msd[:voice] = :passive
|
169
|
+
when :active then msd[:voice] = :active
|
170
|
+
when :reflexive then msd[:type] = :reflexive
|
171
|
+
# Animated
|
172
|
+
when :animated then msd[:animate] = :yes
|
173
|
+
when :inanimated then msd[:animate] = :no
|
174
|
+
# Definiteness
|
175
|
+
when :definite then msd[:definiteness] = :full_art
|
176
|
+
when :indefinite then msd[:definiteness] = :short_art
|
177
|
+
# Conjunction and verb types
|
178
|
+
when :sim_conj then msd[:type] = :coordinating
|
179
|
+
when :sub_conj then msd[:type] = :subordinating
|
180
|
+
when :aux_verb then msd[:type] = :auxiliary
|
181
|
+
else
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
msd.prune!
|
186
|
+
end
|
187
|
+
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# The Penn Treebank Project annotates naturally-occurring text for
|
4
|
+
# linguistic structure. Most notably, we produce skeletal parses
|
5
|
+
# showing rough syntactic and semantic information — a bank of
|
6
|
+
# linguistic trees.
|
7
|
+
#
|
8
|
+
# Treebanks are often created on top of a corpus that has already been
|
9
|
+
# annotated with part-of-speech tags. In turn, treebanks are sometimes
|
10
|
+
# enhanced with semantic or other linguistic information.
|
11
|
+
#
|
12
|
+
module Myasorubka::Treebank
|
13
|
+
extend self
|
14
|
+
|
15
|
+
# Convert the given tag from English Penn Treebank format to the English
|
16
|
+
# representation in the MULTEXT-East format.
|
17
|
+
#
|
18
|
+
def english(tag)
|
19
|
+
msd = Myasorubka::MSD.new(Myasorubka::MSD::English)
|
20
|
+
|
21
|
+
case tag
|
22
|
+
when 'CC' then
|
23
|
+
msd[:pos] = :conjunction
|
24
|
+
msd[:type] = :coordinating
|
25
|
+
when 'CD' then
|
26
|
+
msd[:pos] = :numeral
|
27
|
+
msd[:type] = :cardinal
|
28
|
+
when 'DT' then
|
29
|
+
msd[:pos] = :determiner
|
30
|
+
when 'IN' then
|
31
|
+
msd[:pos] = :conjunction
|
32
|
+
msd[:type] = :subordinating
|
33
|
+
when 'JJ' then
|
34
|
+
msd[:pos] = :adjective
|
35
|
+
when 'JJR' then
|
36
|
+
msd[:pos] = :adjective
|
37
|
+
msd[:degree] = :comparative
|
38
|
+
when 'JJS' then
|
39
|
+
msd[:pos] = :adjective
|
40
|
+
msd[:degree] = :superlative
|
41
|
+
when 'MD' then
|
42
|
+
msd[:pos] = :verb
|
43
|
+
msd[:type] = :modal
|
44
|
+
when 'NN' then
|
45
|
+
msd[:pos] = :noun
|
46
|
+
msd[:type] = :common
|
47
|
+
msd[:number] = :singular
|
48
|
+
when 'NNS'
|
49
|
+
msd[:pos] = :noun
|
50
|
+
msd[:type] = :common
|
51
|
+
msd[:number] = :plural
|
52
|
+
when 'NP'
|
53
|
+
msd[:pos] = :noun
|
54
|
+
msd[:type] = :proper
|
55
|
+
msd[:number] = :singular
|
56
|
+
when 'NPS'
|
57
|
+
msd[:pos] = :noun
|
58
|
+
msd[:type] = :proper
|
59
|
+
msd[:number] = :plural
|
60
|
+
when 'PDT' then
|
61
|
+
msd[:pos] = :determiner
|
62
|
+
when 'PP' then
|
63
|
+
msd[:pos] = :pronoun
|
64
|
+
msd[:type] = :personal
|
65
|
+
when 'PP$' then
|
66
|
+
msd[:pos] = :pronoun
|
67
|
+
msd[:type] = :possessive
|
68
|
+
when 'RB' then
|
69
|
+
msd[:pos] = :adverb
|
70
|
+
when 'RBR' then
|
71
|
+
msd[:pos] = :adverb
|
72
|
+
msd[:degree] = :comparative
|
73
|
+
when 'RBS' then
|
74
|
+
msd[:pos] = :adverb
|
75
|
+
msd[:degree] = :superlative
|
76
|
+
when 'TO' then
|
77
|
+
msd[:pos] = :determiner
|
78
|
+
when 'UH' then
|
79
|
+
msd[:pos] = :interjection
|
80
|
+
when 'VB' then
|
81
|
+
msd[:pos] = :verb
|
82
|
+
msd[:type] = :base
|
83
|
+
when 'VBD' then
|
84
|
+
msd[:pos] = :verb
|
85
|
+
msd[:type] = :base
|
86
|
+
msd[:tense] = :past
|
87
|
+
when 'VBG' then
|
88
|
+
msd[:pos] = :verb
|
89
|
+
msd[:type] = :base
|
90
|
+
msd[:vform] = :participle
|
91
|
+
msd[:tense] = :present
|
92
|
+
when 'VBN' then
|
93
|
+
msd[:pos] = :verb
|
94
|
+
msd[:type] = :base
|
95
|
+
msd[:vform] = :participle
|
96
|
+
msd[:tense] = :past
|
97
|
+
when 'VBP' then
|
98
|
+
msd[:pos] = :verb
|
99
|
+
msd[:type] = :base
|
100
|
+
msd[:tense] = :present
|
101
|
+
msd[:number] = :singular
|
102
|
+
when 'VBZ' then
|
103
|
+
msd[:pos] = :verb
|
104
|
+
msd[:type] = :base
|
105
|
+
msd[:tense] = :present
|
106
|
+
msd[:person] = :third
|
107
|
+
msd[:number] = :singular
|
108
|
+
when 'WDT' then
|
109
|
+
msd[:pos] = :determiner
|
110
|
+
when 'WP' then
|
111
|
+
msd[:pos] = :pronoun
|
112
|
+
when 'WP$' then
|
113
|
+
msd[:pos] = :pronoun
|
114
|
+
msd[:type] = :possessive
|
115
|
+
when 'WRB' then
|
116
|
+
msd[:pos] = :adverb
|
117
|
+
else
|
118
|
+
msd[:pos] = :residual
|
119
|
+
end
|
120
|
+
|
121
|
+
msd
|
122
|
+
end
|
123
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# This module provides downcase and upcase methods designed for Russian.
|
2
|
+
# The original code is written by Andrew Kozlov for the Petrovich library.
|
3
|
+
#
|
4
|
+
# https://github.com/petrovich/petrovich-ruby/blob/df705075542979ab85e1f2bf9a2024b1c0813e1a/lib/petrovich/unicode.rb
|
5
|
+
#
|
6
|
+
module Myasorubka::Unicode extend self
|
7
|
+
# Russian capital letters.
|
8
|
+
#
|
9
|
+
RU_UPPERCASE = [
|
10
|
+
"\u0410", "\u0411", "\u0412", "\u0413", "\u0414", "\u0415", "\u0416", "\u0417",
|
11
|
+
"\u0418", "\u0419", "\u041A", "\u041B", "\u041C", "\u041D", "\u041E", "\u041F",
|
12
|
+
"\u0420", "\u0421", "\u0422", "\u0423", "\u0424", "\u0425", "\u0426", "\u0427",
|
13
|
+
"\u0428", "\u0429", "\u042A", "\u042B", "\u042C", "\u042D", "\u042E", "\u042F",
|
14
|
+
"\u0401" # Ё
|
15
|
+
].join
|
16
|
+
|
17
|
+
# Russian small letters.
|
18
|
+
#
|
19
|
+
RU_LOWERCASE = [
|
20
|
+
"\u0430", "\u0431", "\u0432", "\u0433", "\u0434", "\u0435", "\u0436", "\u0437",
|
21
|
+
"\u0438", "\u0439", "\u043A", "\u043B", "\u043C", "\u043D", "\u043E", "\u043F",
|
22
|
+
"\u0440", "\u0441", "\u0442", "\u0443", "\u0444", "\u0445", "\u0446", "\u0447",
|
23
|
+
"\u0448", "\u0449", "\u044A", "\u044B", "\u044C", "\u044D", "\u044E", "\u044F",
|
24
|
+
"\u0451" # ё
|
25
|
+
].join
|
26
|
+
|
27
|
+
# Returns a copy of the given string having replaced
|
28
|
+
# capital Russian letters with small ones.
|
29
|
+
#
|
30
|
+
# @param string [String] a string.
|
31
|
+
# @return [String] a new string.
|
32
|
+
#
|
33
|
+
def downcase(string)
|
34
|
+
string.tr(RU_UPPERCASE, RU_LOWERCASE).tap(&:downcase!)
|
35
|
+
end
|
36
|
+
|
37
|
+
# Returns a copy of the given string having replaced
|
38
|
+
# small Russian letters with capital ones.
|
39
|
+
#
|
40
|
+
# @param string [String] a string.
|
41
|
+
# @return [String] a new string.
|
42
|
+
#
|
43
|
+
def upcase(string)
|
44
|
+
string.tr(RU_LOWERCASE, RU_UPPERCASE).tap(&:upcase!)
|
45
|
+
end
|
46
|
+
end
|
data/lib/myasorubka/version.rb
CHANGED
data/myasorubka.gemspec
CHANGED
@@ -12,17 +12,11 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Myasorubka is a morphological data processor.'
|
13
13
|
spec.summary = 'Myasorubka is a morphological data proceesor ' \
|
14
14
|
'that supports AOT and MULTEXT-East notations.'
|
15
|
-
spec.homepage = 'https://github.com/
|
15
|
+
spec.homepage = 'https://github.com/dustalov/myasorubka'
|
16
16
|
spec.license = 'MIT'
|
17
17
|
|
18
18
|
spec.files = `git ls-files`.split($/)
|
19
19
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
20
20
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
21
21
|
spec.require_paths = ['lib']
|
22
|
-
|
23
|
-
spec.add_development_dependency 'bundler', '~> 1.3'
|
24
|
-
spec.add_development_dependency 'minitest', '>= 2.11'
|
25
|
-
spec.add_development_dependency 'rake'
|
26
|
-
|
27
|
-
spec.add_dependency 'unicode_utils', '~> 1.4'
|
28
22
|
end
|
data/spec/msd_spec.rb
CHANGED
@@ -83,6 +83,19 @@ module Myasorubka
|
|
83
83
|
('Vmp' =~ re).must_equal 0
|
84
84
|
('Nc-pl' =~ re).must_be_nil
|
85
85
|
end
|
86
|
+
|
87
|
+
it 'can be pruned and became valid when the category is wrong' do
|
88
|
+
subject[:pos] = :zalupa
|
89
|
+
subject.prune!
|
90
|
+
subject.must_be :valid?
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'can be pruned and became valid when an attribute is wrong' do
|
94
|
+
subject[:pos] = :verb
|
95
|
+
subject[:animate] = :yes
|
96
|
+
subject.prune!
|
97
|
+
subject.must_be :valid?
|
98
|
+
end
|
86
99
|
end
|
87
100
|
|
88
101
|
describe 'Generator' do
|
data/spec/spec_helper.rb
CHANGED
@@ -4,11 +4,9 @@ require 'rubygems'
|
|
4
4
|
|
5
5
|
$:.unshift File.expand_path('../../lib', __FILE__)
|
6
6
|
|
7
|
-
|
8
|
-
gem 'minitest'
|
9
|
-
end
|
10
|
-
|
7
|
+
gem 'minitest'
|
11
8
|
require 'minitest/autorun'
|
9
|
+
require 'minitest/hell'
|
12
10
|
|
13
11
|
require 'myasorubka'
|
14
12
|
require 'myasorubka/aot'
|
metadata
CHANGED
@@ -1,71 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: myasorubka
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.3'
|
20
|
-
type: :development
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ~>
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '1.3'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: minitest
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - '>='
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '2.11'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - '>='
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '2.11'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rake
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - '>='
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - '>='
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: unicode_utils
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ~>
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '1.4'
|
62
|
-
type: :runtime
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ~>
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '1.4'
|
11
|
+
date: 2015-10-17 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
69
13
|
description: Myasorubka is a morphological data processor.
|
70
14
|
email:
|
71
15
|
- dmitry@eveel.ru
|
@@ -73,13 +17,12 @@ executables: []
|
|
73
17
|
extensions: []
|
74
18
|
extra_rdoc_files: []
|
75
19
|
files:
|
76
|
-
- .gitignore
|
77
|
-
- .travis.yml
|
20
|
+
- ".gitignore"
|
21
|
+
- ".travis.yml"
|
78
22
|
- Gemfile
|
79
23
|
- LICENSE.txt
|
80
24
|
- README.md
|
81
25
|
- Rakefile
|
82
|
-
- aot-russian
|
83
26
|
- lib/myasorubka.rb
|
84
27
|
- lib/myasorubka/aot.rb
|
85
28
|
- lib/myasorubka/aot/dictionary.rb
|
@@ -88,13 +31,17 @@ files:
|
|
88
31
|
- lib/myasorubka/msd.rb
|
89
32
|
- lib/myasorubka/msd/english.rb
|
90
33
|
- lib/myasorubka/msd/russian.rb
|
34
|
+
- lib/myasorubka/mystem.rb
|
35
|
+
- lib/myasorubka/mystem/binary.rb
|
36
|
+
- lib/myasorubka/treebank.rb
|
37
|
+
- lib/myasorubka/unicode.rb
|
91
38
|
- lib/myasorubka/version.rb
|
92
39
|
- myasorubka.gemspec
|
93
40
|
- spec/data/russian.tsv
|
94
41
|
- spec/msd/russian_spec.rb
|
95
42
|
- spec/msd_spec.rb
|
96
43
|
- spec/spec_helper.rb
|
97
|
-
homepage: https://github.com/
|
44
|
+
homepage: https://github.com/dustalov/myasorubka
|
98
45
|
licenses:
|
99
46
|
- MIT
|
100
47
|
metadata: {}
|
@@ -104,17 +51,17 @@ require_paths:
|
|
104
51
|
- lib
|
105
52
|
required_ruby_version: !ruby/object:Gem::Requirement
|
106
53
|
requirements:
|
107
|
-
- -
|
54
|
+
- - ">="
|
108
55
|
- !ruby/object:Gem::Version
|
109
56
|
version: '0'
|
110
57
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
58
|
requirements:
|
112
|
-
- -
|
59
|
+
- - ">="
|
113
60
|
- !ruby/object:Gem::Version
|
114
61
|
version: '0'
|
115
62
|
requirements: []
|
116
63
|
rubyforge_project:
|
117
|
-
rubygems_version: 2.
|
64
|
+
rubygems_version: 2.4.8
|
118
65
|
signing_key:
|
119
66
|
specification_version: 4
|
120
67
|
summary: Myasorubka is a morphological data proceesor that supports AOT and MULTEXT-East
|