myasorubka 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +5 -5
- data/Gemfile +7 -0
- data/LICENSE.txt +1 -1
- data/README.md +15 -7
- data/Rakefile +1 -0
- data/lib/myasorubka.rb +1 -0
- data/lib/myasorubka/aot/tags.rb +2 -2
- data/lib/myasorubka/msd.rb +33 -11
- data/lib/myasorubka/msd/english.rb +0 -3
- data/lib/myasorubka/msd/russian.rb +0 -3
- data/lib/myasorubka/mystem.rb +135 -0
- data/lib/myasorubka/mystem/binary.rb +187 -0
- data/lib/myasorubka/treebank.rb +123 -0
- data/lib/myasorubka/unicode.rb +46 -0
- data/lib/myasorubka/version.rb +1 -1
- data/myasorubka.gemspec +1 -7
- data/spec/msd_spec.rb +13 -0
- data/spec/spec_helper.rb +2 -4
- metadata +13 -66
- data/aot-russian +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8e16d63b366b3abc035827745c49a22cdf28159a
|
4
|
+
data.tar.gz: b551b32694f999c898d3c898210a9ea5439e22c9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 343b769967295d8fd6f1860fcd9f2c1b37e9d8f05b1aee00a54c460a242fe79190c9c993691dfbd236540b71473e2e17b013730cd0db37ca25b66cbff71f2926
|
7
|
+
data.tar.gz: 913cdc0cd36ef38e6ffdc4740678f6872107d0cb49d79710a73f7955a6656f7453c28b1f48a488be35a523b59764779c1e461eba94d32a0574fe6eb4ad25d17a
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -5,7 +5,17 @@ Myasorubka is a morphological data processor that supports
|
|
5
5
|
[AOT](http://aot.ru) and [MULTEXT-East](http://nl.ijs.si/ME/)
|
6
6
|
notations.
|
7
7
|
|
8
|
+
[![Gem Version][badge_fury_badge]][badge_fury_link] [![Build Status][travis_ci_badge]][travis_ci_link] [![Dependency Status][gemnasium_badge]][gemnasium_link]
|
9
|
+
|
10
|
+
[badge_fury_badge]: https://badge.fury.io/rb/myasorubka.svg
|
11
|
+
[badge_fury_link]: https://badge.fury.io/rb/myasorubka
|
12
|
+
[travis_ci_badge]: https://travis-ci.org/dustalov/myasorubka.svg
|
13
|
+
[travis_ci_link]: https://travis-ci.org/dustalov/myasorubka
|
14
|
+
[gemnasium_badge]: https://gemnasium.com/dustalov/myasorubka.svg
|
15
|
+
[gemnasium_link]: https://gemnasium.com/dustalov/myasorubka
|
16
|
+
|
8
17
|
## MULTEXT-East morphosyntactic descriptors
|
18
|
+
|
9
19
|
It is possible to process the MULTEXT-East morphosyntactic descriptors
|
10
20
|
(MSDs) in a convenient way.
|
11
21
|
|
@@ -51,7 +61,8 @@ Also, the `Myasorubka::MSD` class allows to write MSDs.
|
|
51
61
|
```
|
52
62
|
|
53
63
|
## AOT dictionaries
|
54
|
-
|
64
|
+
|
65
|
+
Myasorubka provides simple parsers for lexicon in the [AOT](http://aot.ru)
|
55
66
|
format, both for gramtab and dictionary files.
|
56
67
|
|
57
68
|
```ruby
|
@@ -104,11 +115,8 @@ You can learn more about AOT lexicon from the
|
|
104
115
|
4. Push to the branch (`git push origin my-new-feature`);
|
105
116
|
5. Create new Pull Request.
|
106
117
|
|
107
|
-
## Build Status [<img src="https://secure.travis-ci.org/ustalov/myasorubka.png"/>](http://travis-ci.org/ustalov/myasorubka)
|
108
|
-
|
109
|
-
## Dependency Status [<img src="https://gemnasium.com/ustalov/myasorubka.png"/>](https://gemnasium.com/ustalov/myasorubka)
|
110
|
-
|
111
118
|
## Copyright
|
112
|
-
Copyright (c) 2011-2013 [Dmitry Ustalov]. See LICENSE for details.
|
113
119
|
|
114
|
-
[Dmitry Ustalov]
|
120
|
+
Copyright (c) 2011-2015 [Dmitry Ustalov]. See LICENSE for details.
|
121
|
+
|
122
|
+
[Dmitry Ustalov]: https://ustalov.name/
|
data/Rakefile
CHANGED
data/lib/myasorubka.rb
CHANGED
data/lib/myasorubka/aot/tags.rb
CHANGED
@@ -133,7 +133,7 @@ class Myasorubka::AOT::Tags
|
|
133
133
|
#
|
134
134
|
def self.russian(pos_line, grammemes_line)
|
135
135
|
grammemes = grammemes_line.split(',').map do |grammeme|
|
136
|
-
|
136
|
+
Myasorubka::Unicode.downcase(grammeme)
|
137
137
|
end
|
138
138
|
|
139
139
|
msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
|
@@ -148,7 +148,7 @@ class Myasorubka::AOT::Tags
|
|
148
148
|
pos_line = 'АББР'
|
149
149
|
end
|
150
150
|
|
151
|
-
case
|
151
|
+
case Myasorubka::Unicode.upcase(pos_line)
|
152
152
|
when 'С' then begin
|
153
153
|
msd[:pos] = :noun
|
154
154
|
msd[:type] = if (grammemes & [ 'имя', 'фам', 'отч', 'жарг', 'арх', 'проф', 'опч' ]).empty?
|
data/lib/myasorubka/msd.rb
CHANGED
@@ -153,26 +153,23 @@ class Myasorubka::MSD
|
|
153
153
|
raise InvalidDescriptor, "category is nil"
|
154
154
|
end
|
155
155
|
|
156
|
+
attributes = category[:attrs]
|
156
157
|
msd = [category[:code]]
|
157
158
|
|
158
|
-
|
159
|
-
grammemes.each do |attr_name, value|
|
159
|
+
grammemes.each do |attribute, value|
|
160
160
|
next unless value
|
161
161
|
|
162
|
-
|
163
|
-
|
164
|
-
raise InvalidDescriptor, 'no such attribute "%s" of category "%s"' %
|
165
|
-
[attr_name, pos]
|
162
|
+
unless index = attributes.index { |name, _| name == attribute }
|
163
|
+
raise InvalidDescriptor, 'no such attribute "%s" of category "%s"' % [attribute, pos]
|
166
164
|
end
|
167
165
|
|
168
|
-
|
166
|
+
_, values = attributes[index]
|
169
167
|
|
170
|
-
unless
|
171
|
-
raise InvalidDescriptor, 'no such attribute "%s" '
|
172
|
-
'for attribute "%s" of category "%s"' % [value, attr_name, pos]
|
168
|
+
unless attribute_value = values[value]
|
169
|
+
raise InvalidDescriptor, 'no such value "%s" for attribute "%s" of category "%s"' % [value, attribute, pos]
|
173
170
|
end
|
174
171
|
|
175
|
-
msd[
|
172
|
+
msd[index + 1] = attribute_value
|
176
173
|
end
|
177
174
|
|
178
175
|
msd.map { |e| e || EMPTY_DESCRIPTOR }.join
|
@@ -188,6 +185,31 @@ class Myasorubka::MSD
|
|
188
185
|
false
|
189
186
|
end
|
190
187
|
|
188
|
+
# Drop every attribute that does not appear in the category.
|
189
|
+
#
|
190
|
+
# @return [MSD] self.
|
191
|
+
#
|
192
|
+
def prune!
|
193
|
+
unless category = language::CATEGORIES[pos]
|
194
|
+
self.pos = nil
|
195
|
+
grammemes.clear
|
196
|
+
return self
|
197
|
+
end
|
198
|
+
|
199
|
+
attributes = category[:attrs]
|
200
|
+
|
201
|
+
grammemes.reject! do |attribute, value|
|
202
|
+
if index = attributes.index { |name, _| name == attribute }
|
203
|
+
_, values = attributes[index]
|
204
|
+
!values[value]
|
205
|
+
else
|
206
|
+
true
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
self
|
211
|
+
end
|
212
|
+
|
191
213
|
protected
|
192
214
|
# @private
|
193
215
|
def parse! msd_line
|
@@ -0,0 +1,135 @@
|
|
1
|
+
# https://tech.yandex.ru/mystem/
|
2
|
+
module Myasorubka::Mystem extend self
|
3
|
+
require 'myasorubka/mystem/binary'
|
4
|
+
|
5
|
+
# https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
|
6
|
+
GRAMMEMES = {
|
7
|
+
'A' => :adjective,
|
8
|
+
'ADV' => :adverb,
|
9
|
+
'ADVPRO' => :adv_pronoun,
|
10
|
+
'ANUM' => :adj_numeral,
|
11
|
+
'APRO' => :adj_pronoun,
|
12
|
+
'COM' => :composite,
|
13
|
+
'CONJ' => :conjunction,
|
14
|
+
'INTJ' => :interjunction,
|
15
|
+
'NUM' => :numeral,
|
16
|
+
'PART' => :particle,
|
17
|
+
'PR' => :preposition,
|
18
|
+
'S' => :substantive,
|
19
|
+
'SPRO' => :subst_pronoun,
|
20
|
+
'V' => :verb,
|
21
|
+
'наст' => :present,
|
22
|
+
'praes' => :present,
|
23
|
+
'непрош' => :notpast,
|
24
|
+
'inpraes' => :notpast,
|
25
|
+
'прош' => :past,
|
26
|
+
'praet' => :past,
|
27
|
+
'им' => :nominative,
|
28
|
+
'nom' => :nominative,
|
29
|
+
'род' => :genitive,
|
30
|
+
'gen' => :genitive,
|
31
|
+
'дат' => :dative,
|
32
|
+
'dat' => :dative,
|
33
|
+
'вин' => :accusative,
|
34
|
+
'acc' => :accusative,
|
35
|
+
'твор' => :instrumental,
|
36
|
+
'ins' => :instrumental,
|
37
|
+
'пр' => :ablative,
|
38
|
+
'abl' => :ablative,
|
39
|
+
'парт' => :partitive,
|
40
|
+
'part' => :partitive,
|
41
|
+
'местн' => :locative,
|
42
|
+
'loc' => :locative,
|
43
|
+
'зват' => :vocative,
|
44
|
+
'voc' => :vocative,
|
45
|
+
'ед' => :singular,
|
46
|
+
'sg' => :singular,
|
47
|
+
'мн' => :plural,
|
48
|
+
'pl' => :plural,
|
49
|
+
'деепр' => :gerund,
|
50
|
+
'ger' => :gerund,
|
51
|
+
'инф' => :infinitive,
|
52
|
+
'inf' => :infinitive,
|
53
|
+
'прич' => :participle,
|
54
|
+
'partcp' => :participle,
|
55
|
+
'изъяв' => :indicative,
|
56
|
+
'indic' => :indicative,
|
57
|
+
'пов' => :imperative,
|
58
|
+
'imper' => :imperative,
|
59
|
+
'кр' => :short,
|
60
|
+
'brev' => :short,
|
61
|
+
'полн' => :full,
|
62
|
+
'plen' => :full,
|
63
|
+
'притяж' => :possessive,
|
64
|
+
'poss' => :possessive,
|
65
|
+
'прев' => :superlative,
|
66
|
+
'supr' => :superlative,
|
67
|
+
'срав' => :comparative,
|
68
|
+
'comp' => :comparative,
|
69
|
+
'1-л' => :person1,
|
70
|
+
'1p' => :person1,
|
71
|
+
'2-л' => :person2,
|
72
|
+
'2p' => :person2,
|
73
|
+
'3-л' => :person3,
|
74
|
+
'3p' => :person3,
|
75
|
+
'муж' => :masculine,
|
76
|
+
'm' => :masculine,
|
77
|
+
'жен' => :feminine,
|
78
|
+
'f' => :feminine,
|
79
|
+
'сред' => :neuter,
|
80
|
+
'n' => :neuter,
|
81
|
+
'несов' => :imperfect,
|
82
|
+
'ipf' => :imperfect,
|
83
|
+
'сов' => :perfect,
|
84
|
+
'pf' => :perfect,
|
85
|
+
'действ' => :active,
|
86
|
+
'act' => :active,
|
87
|
+
'страд' => :passive,
|
88
|
+
'pass' => :passive,
|
89
|
+
'од' => :animated,
|
90
|
+
'anim' => :animated,
|
91
|
+
'неод' => :inanimated,
|
92
|
+
'inan' => :inanimated,
|
93
|
+
'пе' => :transitive,
|
94
|
+
'tran' => :transitive,
|
95
|
+
'нп' => :intransitive,
|
96
|
+
'intr' => :intransitive,
|
97
|
+
'вводн' => :parenth,
|
98
|
+
'parenth' => :parenth,
|
99
|
+
'гео' => :geo,
|
100
|
+
'geo' => :geo,
|
101
|
+
'затр' => :awkward,
|
102
|
+
'awkw' => :awkward,
|
103
|
+
'имя' => :first_name,
|
104
|
+
'persn' => :first_name,
|
105
|
+
'искаж' => :distort,
|
106
|
+
'dist' => :distort,
|
107
|
+
'мж' => :mas_fem,
|
108
|
+
'mf' => :mas_fem,
|
109
|
+
'обсц' => :obscene,
|
110
|
+
'obsc' => :obscene,
|
111
|
+
'отч' => :patronymic,
|
112
|
+
'patrn' => :patronymic,
|
113
|
+
'прдк' => :praedic,
|
114
|
+
'praed' => :praedic,
|
115
|
+
'разг' => :informal,
|
116
|
+
'inform' => :informal,
|
117
|
+
'редк' => :rare,
|
118
|
+
'rare' => :rare,
|
119
|
+
'сокр' => :abbreviation,
|
120
|
+
'abbr' => :abbreviation,
|
121
|
+
'устар' => :obsolete,
|
122
|
+
'obsol' => :obsolete,
|
123
|
+
'фам' => :surname,
|
124
|
+
'famn' => :surname
|
125
|
+
}.freeze
|
126
|
+
|
127
|
+
# Convert an array with mystem character-based grammemes into an MSD.
|
128
|
+
#
|
129
|
+
def to_msd(grammemes)
|
130
|
+
grammemes = grammemes.map { |g| GRAMMEMES[g] }
|
131
|
+
grammemes.compact!
|
132
|
+
grammemes.map! { |g| Myasorubka::Mystem::Binary::GRAMMEMES.key(g) }
|
133
|
+
Myasorubka::Mystem::Binary.to_msd(grammemes)
|
134
|
+
end
|
135
|
+
end
|
@@ -0,0 +1,187 @@
|
|
1
|
+
# A wrapper around mystem's internal binary format.
|
2
|
+
#
|
3
|
+
module Myasorubka::Mystem::Binary extend self
|
4
|
+
# https://github.com/yandex/tomita-parser/blob/master/src/library/lemmer/dictlib/yx_gram_enum.h
|
5
|
+
GRAMMEMES = {
|
6
|
+
127 => :postposition,
|
7
|
+
128 => :adjective,
|
8
|
+
129 => :adverb,
|
9
|
+
130 => :composite,
|
10
|
+
131 => :conjunction,
|
11
|
+
132 => :interjunction,
|
12
|
+
133 => :numeral,
|
13
|
+
134 => :particle,
|
14
|
+
135 => :preposition,
|
15
|
+
136 => :substantive,
|
16
|
+
137 => :verb,
|
17
|
+
138 => :adj_numeral,
|
18
|
+
139 => :adj_pronoun,
|
19
|
+
140 => :adv_pronoun,
|
20
|
+
141 => :subst_pronoun,
|
21
|
+
142 => :article,
|
22
|
+
143 => :part_of_idiom,
|
23
|
+
144 => :reserved,
|
24
|
+
145 => :abbreviation,
|
25
|
+
146 => :irregular_stem,
|
26
|
+
147 => :informal,
|
27
|
+
148 => :distort,
|
28
|
+
149 => :contracted,
|
29
|
+
150 => :obscene,
|
30
|
+
151 => :rare,
|
31
|
+
152 => :awkward,
|
32
|
+
153 => :obsolete,
|
33
|
+
154 => :subst_adjective,
|
34
|
+
155 => :first_name,
|
35
|
+
156 => :surname,
|
36
|
+
157 => :patronymic,
|
37
|
+
158 => :geo,
|
38
|
+
159 => :proper,
|
39
|
+
160 => :present,
|
40
|
+
161 => :notpast,
|
41
|
+
162 => :past,
|
42
|
+
163 => :future,
|
43
|
+
164 => :past2,
|
44
|
+
165 => :nominative,
|
45
|
+
166 => :genitive,
|
46
|
+
167 => :dative,
|
47
|
+
168 => :accusative,
|
48
|
+
169 => :instrumental,
|
49
|
+
170 => :ablative,
|
50
|
+
171 => :partitive,
|
51
|
+
172 => :locative,
|
52
|
+
173 => :vocative,
|
53
|
+
174 => :singular,
|
54
|
+
175 => :plural,
|
55
|
+
176 => :gerund,
|
56
|
+
177 => :infinitive,
|
57
|
+
178 => :participle,
|
58
|
+
179 => :indicative,
|
59
|
+
180 => :imperative,
|
60
|
+
181 => :conditional,
|
61
|
+
182 => :subjunctive,
|
62
|
+
183 => :short,
|
63
|
+
184 => :full,
|
64
|
+
185 => :superlative,
|
65
|
+
186 => :comparative,
|
66
|
+
187 => :possessive,
|
67
|
+
188 => :person1,
|
68
|
+
189 => :person2,
|
69
|
+
190 => :person3,
|
70
|
+
191 => :feminine,
|
71
|
+
192 => :masculine,
|
72
|
+
193 => :neuter,
|
73
|
+
194 => :mas_fem,
|
74
|
+
195 => :perfect,
|
75
|
+
196 => :imperfect,
|
76
|
+
197 => :passive,
|
77
|
+
198 => :active,
|
78
|
+
199 => :reflexive,
|
79
|
+
200 => :impersonal,
|
80
|
+
201 => :animated,
|
81
|
+
202 => :inanimated,
|
82
|
+
203 => :praedic,
|
83
|
+
204 => :parenth,
|
84
|
+
205 => :transitive,
|
85
|
+
206 => :intransitive,
|
86
|
+
207 => :definite,
|
87
|
+
208 => :indefinite,
|
88
|
+
209 => :sim_conj,
|
89
|
+
210 => :sub_conj,
|
90
|
+
211 => :pronoun_conj,
|
91
|
+
212 => :correlate_conj,
|
92
|
+
213 => :aux_verb
|
93
|
+
}.freeze
|
94
|
+
|
95
|
+
# Convert an array with mystem grammeme codes into an MSD.
|
96
|
+
#
|
97
|
+
def to_msd(grammemes)
|
98
|
+
msd = Myasorubka::MSD.new(Myasorubka::MSD::Russian)
|
99
|
+
|
100
|
+
grammemes.sort.each do |code|
|
101
|
+
case GRAMMEMES[code]
|
102
|
+
# Nomenus
|
103
|
+
when :postposition then msd[:pos] = :adposition
|
104
|
+
when :adjective then msd[:pos] = :adjective; msd[:type] = :qualificative; msd[:degree] = :positive
|
105
|
+
when :adverb then msd[:pos] = :adverb
|
106
|
+
when :conjunction then msd[:pos] = :conjunction
|
107
|
+
when :interjunction then msd[:pos] = :interjection
|
108
|
+
when :numeral then msd[:pos] = :numeral; msd[:type] = :cardinal
|
109
|
+
when :particle then msd[:pos] = :particle
|
110
|
+
when :preposition then msd[:pos] = :adposition; msd[:type] = :preposition
|
111
|
+
when :substantive then msd[:pos] = :noun; msd[:type] = :common
|
112
|
+
when :verb then msd[:pos] = :verb; msd[:type] = :main
|
113
|
+
when :adj_numeral then msd[:pos] = :numeral; msd[:type] = :ordinal
|
114
|
+
when :adj_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :adjectival
|
115
|
+
when :adv_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :adverbial
|
116
|
+
when :subst_pronoun then msd[:pos] = :pronoun; msd[:syntactic_type] = :nominal
|
117
|
+
when :abbreviation then msd[:pos] = :abbreviation
|
118
|
+
when :first_name then msd[:type] = :proper
|
119
|
+
when :surname then msd[:type] = :proper
|
120
|
+
when :patronymic then msd[:type] = :proper
|
121
|
+
when :geo then msd[:type] = :proper
|
122
|
+
when :proper then msd[:type] = :proper
|
123
|
+
# Tempus
|
124
|
+
when :present then msd[:tense] = :present
|
125
|
+
# TODO: how to handle :notpast tense?
|
126
|
+
when :past then msd[:tense] = :past
|
127
|
+
when :future then msd[:tense] = :future
|
128
|
+
when :past2 then msd[:tense] = :past
|
129
|
+
# Casus
|
130
|
+
when :nominative then msd[:case] = :nominative
|
131
|
+
when :genitive then msd[:case] = :genitive
|
132
|
+
when :dative then msd[:case] = :dative
|
133
|
+
when :accusative then msd[:case] = :accusative
|
134
|
+
when :instrumental then msd[:case] = :instrumental
|
135
|
+
when :ablative then msd[:case] = :genitive
|
136
|
+
when :partitive then msd[:case] = :genitive; msd[:case2] = :partitive
|
137
|
+
when :locative then msd[:case] = :genitive; msd[:case2] = :locative
|
138
|
+
when :vocative then msd[:case] = :vocative
|
139
|
+
# Numerus
|
140
|
+
when :singular then msd[:number] = :singular
|
141
|
+
when :plural then msd[:number] = :plural
|
142
|
+
# Modus
|
143
|
+
when :gerund then msd[:vform] = :gerund
|
144
|
+
when :infinitive then msd[:vform] = :infinitive
|
145
|
+
when :participle then msd[:vform] = :participle
|
146
|
+
when :indicative then msd[:vform] = :indicative
|
147
|
+
when :imperative then msd[:vform] = :imperative
|
148
|
+
when :conditional then msd[:vform] = :conditional
|
149
|
+
# Gradus
|
150
|
+
when :short then msd[:definiteness] = :short_art
|
151
|
+
when :full then msd[:definiteness] = :full_art
|
152
|
+
when :superlative then msd[:degree] = :superlative
|
153
|
+
when :comparative then msd[:degree] = :comparative
|
154
|
+
when :possessive then msd[:type] = :possessive
|
155
|
+
# Personae
|
156
|
+
when :person1 then msd[:person] = :first
|
157
|
+
when :person2 then msd[:person] = :second
|
158
|
+
when :person3 then msd[:person] = :third
|
159
|
+
# Gender
|
160
|
+
when :feminine then msd[:gender] = :feminine
|
161
|
+
when :masculine then msd[:gender] = :masculine
|
162
|
+
when :neuter then msd[:gender] = :neuter
|
163
|
+
when :mas_fem then msd[:gender] = :common
|
164
|
+
# Perfectum-Imperfectum
|
165
|
+
when :perfect then msd[:aspect] = :perfective
|
166
|
+
when :imperfect then msd[:aspect] = :progressive
|
167
|
+
# Voice
|
168
|
+
when :passive then msd[:voice] = :passive
|
169
|
+
when :active then msd[:voice] = :active
|
170
|
+
when :reflexive then msd[:type] = :reflexive
|
171
|
+
# Animated
|
172
|
+
when :animated then msd[:animate] = :yes
|
173
|
+
when :inanimated then msd[:animate] = :no
|
174
|
+
# Definiteness
|
175
|
+
when :definite then msd[:definiteness] = :full_art
|
176
|
+
when :indefinite then msd[:definiteness] = :short_art
|
177
|
+
# Conjunction and verb types
|
178
|
+
when :sim_conj then msd[:type] = :coordinating
|
179
|
+
when :sub_conj then msd[:type] = :subordinating
|
180
|
+
when :aux_verb then msd[:type] = :auxiliary
|
181
|
+
else
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
msd.prune!
|
186
|
+
end
|
187
|
+
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# The Penn Treebank Project annotates naturally occurring text for
|
4
|
+
# linguistic structure. Most notably, we produce skeletal parses
|
5
|
+
# showing rough syntactic and semantic information — a bank of
|
6
|
+
# linguistic trees.
|
7
|
+
#
|
8
|
+
# Treebanks are often created on top of a corpus that has already been
|
9
|
+
# annotated with part-of-speech tags. In turn, treebanks are sometimes
|
10
|
+
# enhanced with semantic or other linguistic information.
|
11
|
+
#
|
12
|
+
module Myasorubka::Treebank
|
13
|
+
extend self
|
14
|
+
|
15
|
+
# Convert the given tag from English Penn Treebank format to the English
|
16
|
+
# representation in the MULTEXT-East format.
|
17
|
+
#
|
18
|
+
def english(tag)
|
19
|
+
msd = Myasorubka::MSD.new(Myasorubka::MSD::English)
|
20
|
+
|
21
|
+
case tag
|
22
|
+
when 'CC' then
|
23
|
+
msd[:pos] = :conjunction
|
24
|
+
msd[:type] = :coordinating
|
25
|
+
when 'CD' then
|
26
|
+
msd[:pos] = :numeral
|
27
|
+
msd[:type] = :cardinal
|
28
|
+
when 'DT' then
|
29
|
+
msd[:pos] = :determiner
|
30
|
+
when 'IN' then
|
31
|
+
msd[:pos] = :conjunction
|
32
|
+
msd[:type] = :subordinating
|
33
|
+
when 'JJ' then
|
34
|
+
msd[:pos] = :adjective
|
35
|
+
when 'JJR' then
|
36
|
+
msd[:pos] = :adjective
|
37
|
+
msd[:degree] = :comparative
|
38
|
+
when 'JJS' then
|
39
|
+
msd[:pos] = :adjective
|
40
|
+
msd[:degree] = :superlative
|
41
|
+
when 'MD' then
|
42
|
+
msd[:pos] = :verb
|
43
|
+
msd[:type] = :modal
|
44
|
+
when 'NN' then
|
45
|
+
msd[:pos] = :noun
|
46
|
+
msd[:type] = :common
|
47
|
+
msd[:number] = :singular
|
48
|
+
when 'NNS'
|
49
|
+
msd[:pos] = :noun
|
50
|
+
msd[:type] = :common
|
51
|
+
msd[:number] = :plural
|
52
|
+
when 'NP'
|
53
|
+
msd[:pos] = :noun
|
54
|
+
msd[:type] = :proper
|
55
|
+
msd[:number] = :singular
|
56
|
+
when 'NPS'
|
57
|
+
msd[:pos] = :noun
|
58
|
+
msd[:type] = :proper
|
59
|
+
msd[:number] = :plural
|
60
|
+
when 'PDT' then
|
61
|
+
msd[:pos] = :determiner
|
62
|
+
when 'PP' then
|
63
|
+
msd[:pos] = :pronoun
|
64
|
+
msd[:type] = :personal
|
65
|
+
when 'PP$' then
|
66
|
+
msd[:pos] = :pronoun
|
67
|
+
msd[:type] = :possessive
|
68
|
+
when 'RB' then
|
69
|
+
msd[:pos] = :adverb
|
70
|
+
when 'RBR' then
|
71
|
+
msd[:pos] = :adverb
|
72
|
+
msd[:degree] = :comparative
|
73
|
+
when 'RBS' then
|
74
|
+
msd[:pos] = :adverb
|
75
|
+
msd[:degree] = :superlative
|
76
|
+
when 'TO' then
|
77
|
+
msd[:pos] = :determiner
|
78
|
+
when 'UH' then
|
79
|
+
msd[:pos] = :interjection
|
80
|
+
when 'VB' then
|
81
|
+
msd[:pos] = :verb
|
82
|
+
msd[:type] = :base
|
83
|
+
when 'VBD' then
|
84
|
+
msd[:pos] = :verb
|
85
|
+
msd[:type] = :base
|
86
|
+
msd[:tense] = :past
|
87
|
+
when 'VBG' then
|
88
|
+
msd[:pos] = :verb
|
89
|
+
msd[:type] = :base
|
90
|
+
msd[:vform] = :participle
|
91
|
+
msd[:tense] = :present
|
92
|
+
when 'VBN' then
|
93
|
+
msd[:pos] = :verb
|
94
|
+
msd[:type] = :base
|
95
|
+
msd[:vform] = :participle
|
96
|
+
msd[:tense] = :past
|
97
|
+
when 'VBP' then
|
98
|
+
msd[:pos] = :verb
|
99
|
+
msd[:type] = :base
|
100
|
+
msd[:tense] = :present
|
101
|
+
msd[:number] = :singular
|
102
|
+
when 'VBZ' then
|
103
|
+
msd[:pos] = :verb
|
104
|
+
msd[:type] = :base
|
105
|
+
msd[:tense] = :present
|
106
|
+
msd[:person] = :third
|
107
|
+
msd[:number] = :singular
|
108
|
+
when 'WDT' then
|
109
|
+
msd[:pos] = :determiner
|
110
|
+
when 'WP' then
|
111
|
+
msd[:pos] = :pronoun
|
112
|
+
when 'WP$' then
|
113
|
+
msd[:pos] = :pronoun
|
114
|
+
msd[:type] = :possessive
|
115
|
+
when 'WRB' then
|
116
|
+
msd[:pos] = :adverb
|
117
|
+
else
|
118
|
+
msd[:pos] = :residual
|
119
|
+
end
|
120
|
+
|
121
|
+
msd
|
122
|
+
end
|
123
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# This module provides downcase and upcase methods designed for Russian.
|
2
|
+
# The original code is written by Andrew Kozlov for the Petrovich library.
|
3
|
+
#
|
4
|
+
# https://github.com/petrovich/petrovich-ruby/blob/df705075542979ab85e1f2bf9a2024b1c0813e1a/lib/petrovich/unicode.rb
|
5
|
+
#
|
6
|
+
module Myasorubka::Unicode extend self
|
7
|
+
# Russian capital letters.
|
8
|
+
#
|
9
|
+
RU_UPPERCASE = [
|
10
|
+
"\u0410", "\u0411", "\u0412", "\u0413", "\u0414", "\u0415", "\u0416", "\u0417",
|
11
|
+
"\u0418", "\u0419", "\u041A", "\u041B", "\u041C", "\u041D", "\u041E", "\u041F",
|
12
|
+
"\u0420", "\u0421", "\u0422", "\u0423", "\u0424", "\u0425", "\u0426", "\u0427",
|
13
|
+
"\u0428", "\u0429", "\u042A", "\u042B", "\u042C", "\u042D", "\u042E", "\u042F",
|
14
|
+
"\u0401" # Ё
|
15
|
+
].join
|
16
|
+
|
17
|
+
# Russian small letters.
|
18
|
+
#
|
19
|
+
RU_LOWERCASE = [
|
20
|
+
"\u0430", "\u0431", "\u0432", "\u0433", "\u0434", "\u0435", "\u0436", "\u0437",
|
21
|
+
"\u0438", "\u0439", "\u043A", "\u043B", "\u043C", "\u043D", "\u043E", "\u043F",
|
22
|
+
"\u0440", "\u0441", "\u0442", "\u0443", "\u0444", "\u0445", "\u0446", "\u0447",
|
23
|
+
"\u0448", "\u0449", "\u044A", "\u044B", "\u044C", "\u044D", "\u044E", "\u044F",
|
24
|
+
"\u0451" # Ё
|
25
|
+
].join
|
26
|
+
|
27
|
+
# Returns a copy of the given string having replaced
|
28
|
+
# capital Russian letters with small ones.
|
29
|
+
#
|
30
|
+
# @param string [String] a string.
|
31
|
+
# @return [String] a new string.
|
32
|
+
#
|
33
|
+
def downcase(string)
|
34
|
+
string.tr(RU_UPPERCASE, RU_LOWERCASE).tap(&:downcase!)
|
35
|
+
end
|
36
|
+
|
37
|
+
# Returns a copy of the given string having replaced
|
38
|
+
# small Russian letters with capital ones.
|
39
|
+
#
|
40
|
+
# @param string [String] a string.
|
41
|
+
# @return [String] a new string.
|
42
|
+
#
|
43
|
+
def upcase(string)
|
44
|
+
string.tr(RU_LOWERCASE, RU_UPPERCASE).tap(&:upcase!)
|
45
|
+
end
|
46
|
+
end
|
data/lib/myasorubka/version.rb
CHANGED
data/myasorubka.gemspec
CHANGED
@@ -12,17 +12,11 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Myasorubka is a morphological data processor.'
|
13
13
|
spec.summary = 'Myasorubka is a morphological data proceesor ' \
|
14
14
|
'that supports AOT and MULTEXT-East notations.'
|
15
|
-
spec.homepage = 'https://github.com/
|
15
|
+
spec.homepage = 'https://github.com/dustalov/myasorubka'
|
16
16
|
spec.license = 'MIT'
|
17
17
|
|
18
18
|
spec.files = `git ls-files`.split($/)
|
19
19
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
20
20
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
21
21
|
spec.require_paths = ['lib']
|
22
|
-
|
23
|
-
spec.add_development_dependency 'bundler', '~> 1.3'
|
24
|
-
spec.add_development_dependency 'minitest', '>= 2.11'
|
25
|
-
spec.add_development_dependency 'rake'
|
26
|
-
|
27
|
-
spec.add_dependency 'unicode_utils', '~> 1.4'
|
28
22
|
end
|
data/spec/msd_spec.rb
CHANGED
@@ -83,6 +83,19 @@ module Myasorubka
|
|
83
83
|
('Vmp' =~ re).must_equal 0
|
84
84
|
('Nc-pl' =~ re).must_be_nil
|
85
85
|
end
|
86
|
+
|
87
|
+
it 'can be pruned and became valid when the category is wrong' do
|
88
|
+
subject[:pos] = :zalupa
|
89
|
+
subject.prune!
|
90
|
+
subject.must_be :valid?
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'can be pruned and became valid when an attribute is wrong' do
|
94
|
+
subject[:pos] = :verb
|
95
|
+
subject[:animate] = :yes
|
96
|
+
subject.prune!
|
97
|
+
subject.must_be :valid?
|
98
|
+
end
|
86
99
|
end
|
87
100
|
|
88
101
|
describe 'Generator' do
|
data/spec/spec_helper.rb
CHANGED
@@ -4,11 +4,9 @@ require 'rubygems'
|
|
4
4
|
|
5
5
|
$:.unshift File.expand_path('../../lib', __FILE__)
|
6
6
|
|
7
|
-
|
8
|
-
gem 'minitest'
|
9
|
-
end
|
10
|
-
|
7
|
+
gem 'minitest'
|
11
8
|
require 'minitest/autorun'
|
9
|
+
require 'minitest/hell'
|
12
10
|
|
13
11
|
require 'myasorubka'
|
14
12
|
require 'myasorubka/aot'
|
metadata
CHANGED
@@ -1,71 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: myasorubka
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.3'
|
20
|
-
type: :development
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ~>
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '1.3'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: minitest
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - '>='
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '2.11'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - '>='
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '2.11'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rake
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - '>='
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - '>='
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: unicode_utils
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ~>
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '1.4'
|
62
|
-
type: :runtime
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ~>
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '1.4'
|
11
|
+
date: 2015-10-17 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
69
13
|
description: Myasorubka is a morphological data processor.
|
70
14
|
email:
|
71
15
|
- dmitry@eveel.ru
|
@@ -73,13 +17,12 @@ executables: []
|
|
73
17
|
extensions: []
|
74
18
|
extra_rdoc_files: []
|
75
19
|
files:
|
76
|
-
- .gitignore
|
77
|
-
- .travis.yml
|
20
|
+
- ".gitignore"
|
21
|
+
- ".travis.yml"
|
78
22
|
- Gemfile
|
79
23
|
- LICENSE.txt
|
80
24
|
- README.md
|
81
25
|
- Rakefile
|
82
|
-
- aot-russian
|
83
26
|
- lib/myasorubka.rb
|
84
27
|
- lib/myasorubka/aot.rb
|
85
28
|
- lib/myasorubka/aot/dictionary.rb
|
@@ -88,13 +31,17 @@ files:
|
|
88
31
|
- lib/myasorubka/msd.rb
|
89
32
|
- lib/myasorubka/msd/english.rb
|
90
33
|
- lib/myasorubka/msd/russian.rb
|
34
|
+
- lib/myasorubka/mystem.rb
|
35
|
+
- lib/myasorubka/mystem/binary.rb
|
36
|
+
- lib/myasorubka/treebank.rb
|
37
|
+
- lib/myasorubka/unicode.rb
|
91
38
|
- lib/myasorubka/version.rb
|
92
39
|
- myasorubka.gemspec
|
93
40
|
- spec/data/russian.tsv
|
94
41
|
- spec/msd/russian_spec.rb
|
95
42
|
- spec/msd_spec.rb
|
96
43
|
- spec/spec_helper.rb
|
97
|
-
homepage: https://github.com/
|
44
|
+
homepage: https://github.com/dustalov/myasorubka
|
98
45
|
licenses:
|
99
46
|
- MIT
|
100
47
|
metadata: {}
|
@@ -104,17 +51,17 @@ require_paths:
|
|
104
51
|
- lib
|
105
52
|
required_ruby_version: !ruby/object:Gem::Requirement
|
106
53
|
requirements:
|
107
|
-
- -
|
54
|
+
- - ">="
|
108
55
|
- !ruby/object:Gem::Version
|
109
56
|
version: '0'
|
110
57
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
58
|
requirements:
|
112
|
-
- -
|
59
|
+
- - ">="
|
113
60
|
- !ruby/object:Gem::Version
|
114
61
|
version: '0'
|
115
62
|
requirements: []
|
116
63
|
rubyforge_project:
|
117
|
-
rubygems_version: 2.
|
64
|
+
rubygems_version: 2.4.8
|
118
65
|
signing_key:
|
119
66
|
specification_version: 4
|
120
67
|
summary: Myasorubka is a morphological data proceesor that supports AOT and MULTEXT-East
|