ru_translit 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +14 -0
- data/LICENSE +22 -0
- data/README.md +26 -0
- data/Rakefile +2 -0
- data/lib/detranslit_rules.rb +191 -0
- data/lib/detransliterator.rb +16 -0
- data/lib/ru_translit.rb +37 -0
- data/lib/translit_rules.rb +151 -0
- data/lib/transliteration/version.rb +3 -0
- data/lib/transliterator.rb +13 -0
- data/lib/word.rb +98 -0
- data/lib/word_part.rb +52 -0
- data/ru_translit.gemspec +21 -0
- data/spec/ru_translit_spec.rb +76 -0
- metadata +80 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
(The MIT License)
|
2
|
+
|
3
|
+
Copyright (c) 2011 Johannes Stein
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
19
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
20
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
21
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
22
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# ru_translit
|
2
|
+
|
3
|
+
A simple module with two methods for transliterating Russian Cyrillic words to Latin,
|
4
|
+
and for detransliterating transliterated words from Latin back to Cyrillic. At this point,
|
5
|
+
it does not follow any one specific transliteration ruleset, but uses a variation of German,
|
6
|
+
English and (simplified) scientific transliteration rules. It is meant to be a very pragmatic, catch-all
|
7
|
+
way of getting different real-world variants of how a given word might be transliterated or detransliterated.
|
8
|
+
Each of the two methods returns an array of words.
|
9
|
+
|
10
|
+
The detransliteration method has an optional second parameter that decides whether the returned
|
11
|
+
Cyrillic options should include those whose only difference from another option is that they include one
|
12
|
+
or more softeners, which are usually not transliterated. This parameter defaults to false.
|
13
|
+
|
14
|
+
What's special about this is that it takes the context of letters in the word into account in order to find out
|
15
|
+
whether certain options are to be included. Most other transliteration tools don't do this.
|
16
|
+
|
17
|
+
## Install
|
18
|
+
|
19
|
+
gem install ru_translit
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
require 'ru_translit'
|
24
|
+
RuTranslit.to_cyrillic 'vodka' # => ["водка"]
|
25
|
+
RuTranslit.to_latin 'водка' # => ["vodka", "wodka"]
|
26
|
+
RuTranslit.to_cyrillic 'vodka', true # => ["водка", "водкьа", "водька", "водькьа"]
|
data/Rakefile
ADDED
@@ -0,0 +1,191 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
class DetranslitRules
|
3
|
+
|
4
|
+
def self.rules
|
5
|
+
RULES
|
6
|
+
end
|
7
|
+
|
8
|
+
# arrays: [output, predecessor-rule, successor-rule, softener-if-0]
|
9
|
+
RULES = {
|
10
|
+
'a' => [
|
11
|
+
['а', /.*/, /.*/, 1]
|
12
|
+
],
|
13
|
+
'b' => [
|
14
|
+
['б', /.*/, /.*/, 1],
|
15
|
+
['бь', /.*/, /[a,e,i,ja,ya,ju,yu]|^$/, 0]
|
16
|
+
],
|
17
|
+
'c' => [
|
18
|
+
['ц', /.*/, /.*/, 1]
|
19
|
+
],
|
20
|
+
'd' => [
|
21
|
+
['д', /.*/, /.*/, 1],
|
22
|
+
['дь', /.*/, /[ja,e,m,b,k,ju,yu,s]|^$/, 0]
|
23
|
+
],
|
24
|
+
'e' => [
|
25
|
+
['е', /.*/, /.*/, 1]
|
26
|
+
],
|
27
|
+
'f' => [
|
28
|
+
['ф', /.*/, /.*/, 1],
|
29
|
+
['фь', /.*/, /[ja,ya,e,je,o]|^$/, 0]
|
30
|
+
],
|
31
|
+
'g' => [
|
32
|
+
['г', /.*/, /.*/, 1]
|
33
|
+
],
|
34
|
+
'h' => [
|
35
|
+
['х', /.*/, /.*/, 1]
|
36
|
+
],
|
37
|
+
'i' => [
|
38
|
+
['и', /.*/, /.*/, 1],
|
39
|
+
['ы', /.+/, /.*/, 1],
|
40
|
+
['ий', /.+/, /^$/, 1]
|
41
|
+
],
|
42
|
+
'j' => [
|
43
|
+
['й', /.*/, /.*/, 1]
|
44
|
+
],
|
45
|
+
'k' => [
|
46
|
+
['к', /.*/, /.*/, 1],
|
47
|
+
['кь', /.*/, /[ja,ya]/, 0] #
|
48
|
+
],
|
49
|
+
|
50
|
+
'l' => [
|
51
|
+
['л', /.*/, /.*/, 1]
|
52
|
+
],
|
53
|
+
'm' => [
|
54
|
+
['м', /.*/, /.*/, 1]#,
|
55
|
+
# ['мь', /[a,e,i,o,u,ju,yu,ja,ya]/, /[e,ja,ya,d,s,ju,yu]|^$/, 0]
|
56
|
+
# ['мь', /[e]/, /[e]/, 0]
|
57
|
+
],
|
58
|
+
'n' => [
|
59
|
+
['н',/.*/,/.*/, 1],
|
60
|
+
# ['нь', /[a,e,o,u,i,y,ja,ya,ju,yu,z,r,s,g]/, /[k,e,ja,sh,sch,o,i,g,-,ju,yu,zh,ch,tsch,b,d,s]|^$/, 0]
|
61
|
+
['нь', /[a,e,o,u,i,y,ja,ya]/, /[k,e,ja,sh,sch,o,i,g]|^$/, 0]
|
62
|
+
],
|
63
|
+
'o' => [
|
64
|
+
['о', /.*/, /.*/, 1]
|
65
|
+
],
|
66
|
+
'p' => [
|
67
|
+
['п', /.*/, /.*/, 1],
|
68
|
+
['пь', /.*/, /[ja,ya,e,ju,yu]|^$/, 0]
|
69
|
+
],
|
70
|
+
'q' => [
|
71
|
+
['', /.*/, /.*/, 1]
|
72
|
+
],
|
73
|
+
'r' => [
|
74
|
+
['р', /.*/, /.*/, 1],
|
75
|
+
# ['рь', /[a,o,e,u,y,i,b,ja,ju,t,p,kh,ch]/, /[e,k,ja,ya,m,c,-,i,b,tsch,ch,g]|^$/, 0]
|
76
|
+
['рь', /[a,o,e,u,y,i,b]/, /[e,k,ja,ya,m]|^$/, 0]
|
77
|
+
],
|
78
|
+
's' => [
|
79
|
+
['с', /.*/, /.*/, 1],
|
80
|
+
# ['сь', /.*/, /[m,e,k,b,ja,ya,ju,yu,o]|^$/, 0]
|
81
|
+
['сь', /[i,o,e]/, /[m,e]|^$/, 0]
|
82
|
+
],
|
83
|
+
't' => [
|
84
|
+
['т', /.*/, /.*/, 1],
|
85
|
+
# ['ть', /[a,i,s,ja,ya,u,e,y,o,z,r,l,f,n]/, /[s,e,ja,ya,ju,yu,i,-,b,m,f,k,d,u,v,w]|^$/, 0],
|
86
|
+
['ть', /[a,i,s,ja,ya,u,e,y,o,z,r]/, /[s,e,ja,ya,ju,yu,i,-]|^$/, 0],
|
87
|
+
],
|
88
|
+
'u' => [
|
89
|
+
['у', /.*/, /.*/, 1]
|
90
|
+
],
|
91
|
+
'v' => [
|
92
|
+
['в', /.*/, /.*/, 1],
|
93
|
+
['вь', /.*/, /[ju,yu,e,i]|^$/, 0]
|
94
|
+
],
|
95
|
+
'w' => [
|
96
|
+
['в', /.*/, /.*/, 1],
|
97
|
+
['вь', /.+/, /[ju,yu,e,i]|^$/, 0],
|
98
|
+
['вь', /.*/, /[ju,yu,e,i]/, 0]
|
99
|
+
],
|
100
|
+
'x' => [
|
101
|
+
['кс', /.*/, /.*/, 1]
|
102
|
+
],
|
103
|
+
'y' => [
|
104
|
+
['ы', /.*/, /.*/, 1]
|
105
|
+
],
|
106
|
+
'z' => [
|
107
|
+
['з', /.*/, /.*/, 1],
|
108
|
+
['зь', /.*/, /[ja,ya,b,e,m,k]|^$/, 0]
|
109
|
+
],
|
110
|
+
"'" => [
|
111
|
+
['ь', /.+/, /.*/, 0]
|
112
|
+
],
|
113
|
+
# english letter combinations
|
114
|
+
'ch' => [
|
115
|
+
['ч', /.*/, /.*/, 1],
|
116
|
+
['чь', /.*/, /[s,e,i,ja,ya,ju,yu,-]|^$/, 0]
|
117
|
+
],
|
118
|
+
'кн' => [
|
119
|
+
['х', /.*/, /.*/, 1]
|
120
|
+
],
|
121
|
+
'yu' => [
|
122
|
+
['ю', /.*/, /.*/, 1]
|
123
|
+
],
|
124
|
+
'ya' => [
|
125
|
+
['я', /.*/, /.*/, 1]
|
126
|
+
],
|
127
|
+
'yo' => [
|
128
|
+
['е', /.*/, /.*/, 1]
|
129
|
+
],
|
130
|
+
'sh' => [
|
131
|
+
['ш', /.*/, /.*/, 1],
|
132
|
+
['шь', /.*/, /[ja,ya,e,s]|^$/, 0]
|
133
|
+
],
|
134
|
+
'shch' => [
|
135
|
+
['щ', /.*/, /.*/, 1],
|
136
|
+
['щь', /.*/, /^$/, 0]
|
137
|
+
],
|
138
|
+
'ts' => [
|
139
|
+
['ц', /.*/, /.*/, 1]
|
140
|
+
],
|
141
|
+
'zh' => [
|
142
|
+
['ж', /.*/, /.*/, 1],
|
143
|
+
['жь', /.*/, /[e,i,ja,ya]|^$/, 0]
|
144
|
+
],
|
145
|
+
# german letter combinations
|
146
|
+
'sch' => [
|
147
|
+
['ш', /.*/, /.*/, 1], #de
|
148
|
+
['щ', /.*/, /.*/, 1], #en
|
149
|
+
['шь', /.*/, /^$/, 0]
|
150
|
+
#maschadov ?
|
151
|
+
],
|
152
|
+
'tsch' => [
|
153
|
+
['ч', /.*/, /.*/, 1],
|
154
|
+
['чь', /.*/, /[s,e,i,ja,ya,ju,yu,-]|^$/, 0]
|
155
|
+
],
|
156
|
+
'schtsch' => [
|
157
|
+
['щ', /.*/, /.*/, 1]
|
158
|
+
],
|
159
|
+
'ju' => [
|
160
|
+
['ю', /.*/, /.*/, 1]
|
161
|
+
],
|
162
|
+
'ja' => [
|
163
|
+
['я', /.*/, /.*/, 1]
|
164
|
+
],
|
165
|
+
# 'je' => [
|
166
|
+
# ['е', /.*/, /.*/, 1],
|
167
|
+
# ['же', /.*/, /.*/, 1],
|
168
|
+
# ],
|
169
|
+
'jo' => [
|
170
|
+
['е', /.*/, /.*/, 1]
|
171
|
+
],
|
172
|
+
'ä' => [
|
173
|
+
['я', /.*/, /.*/, 1]
|
174
|
+
],
|
175
|
+
'ö' => [
|
176
|
+
['ё', /.*/, /.*/, 1]
|
177
|
+
],
|
178
|
+
'ü' => [
|
179
|
+
['ю', /.*/, /.*/, 1]
|
180
|
+
],
|
181
|
+
'ß' => [
|
182
|
+
['с', /.*/, /.*/, 1]
|
183
|
+
],
|
184
|
+
'je' => [
|
185
|
+
['е', /.*/, /.*/, 1],
|
186
|
+
#['йе', /.*/, /.*/, 1],
|
187
|
+
['же', /^$/, /.*/, 1]
|
188
|
+
]
|
189
|
+
# TODO scientific letter combinations
|
190
|
+
}
|
191
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require_relative 'word'
|
3
|
+
|
4
|
+
#TODO check if a word is german or english, then don't detransliterate, right?
|
5
|
+
|
6
|
+
# Pass in a word and get some detransliteration suggestions based on our ruleset,
|
7
|
+
# which covers german and english transliterations. If softeners are to be included,
|
8
|
+
# the returned lists can get quite long.
|
9
|
+
# Produces Cyrillic spelling suggestions for a single Latin (transliterated) word.
class Detransliterator
  # Returns an array of Cyrillic options for the passed-in word (just one word at a time!).
  #
  # input             - the Latin word to detransliterate.
  # include_softeners - forwarded to Word.detransliterations_for; defaults to
  #                     false, the same default RuTranslit.to_cyrillic uses.
  def self.cyrillic_options(input, include_softeners = false)
    data = Word.detransliterations_for(input, include_softeners)
    # data[:options] holds one list of candidate outputs per word part;
    # Word.output_words combines them into whole words.
    Word.output_words(data[:options])
  end
end
|
data/lib/ru_translit.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'unicode_utils/downcase'
|
2
|
+
require 'transliterator'
|
3
|
+
require 'detransliterator'
|
4
|
+
|
5
|
+
# Transliteration as well as De-/Retransliteration between russian cyrillic and
|
6
|
+
# English, German and Scientific transliterations. Accounts for context-dependent
|
7
|
+
# transliteration rules.
|
8
|
+
# Current limitations:
|
9
|
+
# * Only one word per pass (technically, it should work for multiple words,
|
10
|
+
# but the number of variations returned likely grows beyond manageability).
|
11
|
+
# * Everything will be downcased.
|
12
|
+
# * No distinction between the different translit variants: Just one list with all possible options gets returned.
|
13
|
+
module RuTranslit
  class << self
    # De-transliterates a single Latin word to Cyrillic and returns an array of
    # possible Cyrillic strings. The term is downcased first. When
    # include_softeners is true, variants that differ only by softeners are
    # part of the returned array as well.
    def to_cyrillic(latin_term, include_softeners = false)
      lowered = UnicodeUtils.downcase(latin_term)
      Detransliterator.cyrillic_options(lowered, include_softeners)
    end

    # Transliterates a single Cyrillic word to Latin and returns an array of
    # possible Latin strings. The term is downcased first.
    def to_latin(cyrillic_term)
      lowered = UnicodeUtils.downcase(cyrillic_term)
      Transliterator.translit_options(lowered)
    end

    # Short and alternative names for the two main methods.
    alias_method :to_cy, :to_cyrillic
    alias_method :to_la, :to_latin
    alias_method :detransliterate, :to_cyrillic
    alias_method :transliterate, :to_latin
  end
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Rule table for transliterating Russian Cyrillic letters to Latin.
#
# Keys are single Cyrillic letters. Each key maps to a list of rules of the form
#
#   [output, predecessor-rule, successor-rule, softener-if-0]
#
# The predecessor/successor rules are regexps tested against the neighbouring
# input chunks, which are Cyrillic here, so context patterns must be written
# with Cyrillic letters. Trailing comments name the transliteration system an
# output belongs to: sc = scientific, de = German, en = English.
class TranslitRules

  # Returns the complete rule table (the RULES hash below).
  def self.rules
    RULES
  end

  # arrays: [output, predecessor-rule, successor-rule, softener-if-0]
  #source: mainly https://secure.wikimedia.org/wikipedia/de/wiki/Kyrillisch#Russisch
  RULES = {
    'а' => [
      ['a', /.*/, /.*/, 1]
    ],
    'б' => [
      ['b', /.*/, /.*/, 1]
    ],
    'в' => [
      ['v', /.*/, /.*/, 1], #sc, en
      ['w', /.*/, /.*/, 1] #de
    ],
    'г' => [
      ['g', /.*/, /.*/, 1],
      # Context letters are Cyrillic е/о: the neighbours being tested are
      # Cyrillic input, so Latin look-alikes would never match.
      ['w', /[ео]/, /о/, 1] #de, in genitive endings
    ],
    'д' => [
      ['d', /.*/, /.*/, 1]
    ],
    'е' => [
      ['e', /.*/, /.*/, 1],
      ['je', /^$|[аоуыэяёюиеь]/, /.*/, 1], #de, at the start of a word or after a vowel
      ['ye', /^$|[аоуыэяёюиеь]/, /.*/, 1], #en, same condition
      ['jo', /[р]/, /.*/, 1] #de, not official, for book_id 44460
    ],
    'ё' => [
      ['ë', /.*/, /.*/, 1], #sc
      # /^$/ lets the jo/yo variants apply at the start of a word as well,
      # where the predecessor is the empty string.
      ['jo', /^$|[^жчшщ]/, /.*/, 1], #de
      ['e', /[жчшщ]/, /.*/, 1], #de
      ['yo', /^$|[^жчшщ]/, /.*/, 1], #en
      ['o', /[жчшщ]/, /.*/, 1] #en
    ],
    'ж' => [
      #['ž', /.*/, /.*/, 1], #sc
      ['z', /.*/, /.*/, 1], #sc
      ['sch', /.*/, /.*/, 1], #de
      ['sh', /.*/, /.*/, 1], #de (ddr?)
      ['zh', /.*/, /.*/, 1], #en
    ],
    'з' => [
      ['z', /.*/, /.*/, 1], #sc, en
      ['s', /.*/, /.*/, 1], #de
    ],
    'и' => [
      ['i', /.*/, /.*/, 1], #sc, en, de
      ['ji', /ь/, /.*/, 1], #de
      ['yi', /ь/, /.*/, 1], #en
    ],
    'й' => [
      ['j', /.*/, /.*/, 1], #sc
      ['i', /.*/, /.*/, 1], #de, sometimes also 'j', but that's already covered with sc.
      ['y', /.*/, /.*/, 1], #en
    ],
    'к' => [
      ['k', /.*/, /.*/, 1], #sc, de, en
      #['x', /.*/, /c/, 1], #de?
    ],
    'л' => [
      ['l', /.*/, /.*/, 1], #sc, de, en
    ],
    'м' => [
      ['m', /.*/, /.*/, 1], #sc, de, en
    ],
    'н' => [
      ['n', /.*/, /.*/, 1], #sc, de, en
    ],
    'о' => [
      ['o', /.*/, /.*/, 1], #sc, de, en
      ['jo', /ь/, /.*/, 1], #de
      ['yo', /ь/, /.*/, 1], #en
    ],
    'п' => [
      ['p', /.*/, /.*/, 1], #sc, de, en
    ],
    'р' => [
      ['r', /.*/, /.*/, 1], #sc, de, en
    ],
    'с' => [
      ['s', /.*/, /.*/, 1], #sc, de, en
      ['ss', /[аоуыэяёюие]/, /[аоуыэяёюие]/, 1], #de, between vowels
    ],
    'т' => [
      ['t', /.*/, /.*/, 1], #sc, de, en
    ],
    'у' => [
      ['u', /.*/, /.*/, 1], #sc, de, en
    ],
    'ф' => [
      ['f', /.*/, /.*/, 1], #sc, de, en
    ],
    'х' => [
      ['ch', /.*/, /.*/, 1], #sc, de
      ['kh', /.*/, /.*/, 1], #en
      ['h', /.*/, /.*/, 1], #not officially, for author of book_id 37718
    ],
    'ц' => [
      ['c', /.*/, /.*/, 1], #sc
      ['z', /.*/, /.*/, 1], #de
      ['ts', /.*/, /.*/, 1], #en
    ],
    'ч' => [
      #['č', /.*/, /.*/, 1], #sc
      ['c', /.*/, /.*/, 1], #sc
      ['tsch', /.*/, /.*/, 1], #de
      ['ch', /.*/, /.*/, 1], #en
    ],
    'ш' => [
      #['š', /.*/, /.*/, 1], #sc
      ['s', /.*/, /.*/, 1], #sc
      ['sch', /.*/, /.*/, 1], #de
      ['sh', /.*/, /.*/, 1], #en
    ],
    'щ' => [
      #['šč', /.*/, /.*/, 1], #sc
      ['sc', /.*/, /.*/, 1], #sc
      ['schtsch', /.*/, /.*/, 1], #de
      #['stsch', /.*/, /.*/, 1], #de (ddr)
      ['shch', /.*/, /.*/, 1], #en
    ],
    'ъ' => [
      ['"', /.*/, /.*/, 1], #sc
      #hardener, generally not transcribed for de and en
    ],
    'ы' => [
      ['y', /.*/, /.*/, 1], #sc, de, en
    ],
    'ь' => [
      ["'", /.*/, /.*/, 1], #sc
    ],
    'э' => [
      ["ė", /.*/, /.*/, 1], #sc
      ["e", /.*/, /.*/, 1], #de, en
    ],
    'ю' => [
      ["ju", /.*/, /.*/, 1], #sc, de
      ["yu", /.*/, /.*/, 1], #en
    ],
    'я' => [
      ["ja", /.*/, /.*/, 1], #sc, de
      ["ya", /.*/, /.*/, 1], #en
    ]
  }
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require_relative 'word'
|
3
|
+
|
4
|
+
# Pass in a word and get some transliteration suggestions based on our ruleset,
|
5
|
+
# which covers german, english and scientific transliterations.
|
6
|
+
class Transliterator
  # Returns an array of Latin transliteration options for the passed-in
  # Cyrillic word (just one word at a time!).
  def self.translit_options(input)
    # :options holds one list of candidate outputs per word part;
    # Word.output_words combines them into whole words.
    per_part_options = Word.transliterations_for(input)[:options]
    Word.output_words(per_part_options)
  end
end
|
data/lib/word.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require_relative 'word_part'
|
3
|
+
|
4
|
+
# Splits a word into a chain of WordParts (longest rule match first) and
# combines the per-part output options into whole-word candidates.
class Word
  # Longest input chunk that is looked up in the rule tables; 7 accommodates
  # 'schtsch'. Kept as a constant inside the class: a class variable assigned
  # at toplevel raises a RuntimeError on Ruby 3.0 and later.
  MAX_CHECKED_STRING_LENGTH = 7

  # Builds the word-part chain for +word+ and collects the output options of
  # every part.
  #
  # word              - the input word.
  # include_softeners - passed to each part's #cyrillic_options.
  # is_detranslit     - true for Latin -> Cyrillic rules, false for Cyrillic -> Latin.
  #
  # Returns a hash with :input_structure (how the input was split into parts)
  # and :options (one array of candidate outputs per part). An empty word
  # yields empty arrays for both.
  def self.detransliterations_for(word, include_softeners, is_detranslit=true)
    last_part = build_word_part_structure(word, is_detranslit)
    first_part = add_link_to_succ_to_each_word_part(last_part)
    input_structure = word_input_structure(first_part)
    wps_options = [] # one list of possible outputs per word part (2d array)
    part = first_part
    until part.nil?
      wps_options << part.cyrillic_options(include_softeners)
      part = part.succ
    end
    { :input_structure => input_structure, :options => wps_options }
  end

  # Same as detransliterations_for, but using the transliteration rules
  # (Cyrillic -> Latin); softener-flagged rules are always included.
  def self.transliterations_for(word)
    detransliterations_for(word, true, false)
  end

  # Transforms the 2d options array into an array of whole words, one per
  # combination of part options. Empty option lists are ignored; the passed-in
  # array is left untouched.
  def self.output_words(part_options)
    usable = part_options.reject { |options| options.nil? || options.empty? }
    get_part_combinations('', usable, [])
  end

  # Internal helpers. They stay callable from outside for backward
  # compatibility: a bare `private` has no effect on `def self.` methods.

  # Builds a singly linked list of WordParts (each linked to its predecessor)
  # and returns the last item, or nil for an empty word.
  def self.build_word_part_structure(word, is_detranslit)
    prev_wp = nil # previous word part (nil for the first one)
    rest = word
    while rest && !rest.empty?
      longest = [rest.size, MAX_CHECKED_STRING_LENGTH].min
      # Try the longest chunk first, then shorten it one character at a time.
      length = longest.downto(1).find { |len| WordPart.has_rules?(rest[0, len], is_detranslit) }
      if length
        prev_wp = WordPart.new(rest[0, length], prev_wp, is_detranslit)
      else
        # No rule starts with the current character: pass it through unchanged.
        length = 1
        prev_wp = WordPart.new(rest[0, 1], prev_wp, is_detranslit)
        prev_wp.just_thru = true
      end
      rest = rest[length..-1]
    end
    prev_wp
  end

  # Walks the linked list backwards from +last+, sets each part's successor
  # link and returns the first part (nil when there are no parts at all).
  def self.add_link_to_succ_to_each_word_part(last)
    part = last
    until part.nil? || part.prev.nil?
      prev = part.prev
      prev.succ = part
      part = prev
    end
    part
  end

  # Just for debugging: returns an array of the input parts, i.e. how the
  # input has been split into word parts.
  def self.word_input_structure(first_part)
    ret = []
    part = first_part
    until part.nil?
      ret << part.input
      part = part.succ
    end
    ret
  end

  # Appends to +arr_out+ every word that can be built by prefixing +str+ to one
  # option from each list in +arr_in+ (in order), without duplicates.
  def self.get_part_combinations(str, arr_in, arr_out)
    if arr_in.empty?
      arr_out << str
      return arr_out
    end
    # Array#product enumerates the combinations in the same order as a
    # depth-first walk; `|` drops duplicates while keeping that order.
    arr_out | [str].product(*arr_in).map(&:join)
  end

end
|
data/lib/word_part.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require_relative 'translit_rules'
|
3
|
+
require_relative 'detranslit_rules'
|
4
|
+
|
5
|
+
# One chunk of an input word, linked to its neighbouring chunks so that
# context-dependent rules can be evaluated.
class WordPart
  attr_accessor :input, :prev, :succ, :static_output, :just_thru #may be more restricted than this

  # arrays: [output, predecessor-rule, successor-rule]
  DT_RULES = DetranslitRules.rules
  T_RULES = TranslitRules.rules

  # input      - the chunk of the word this part stands for.
  # prev       - the preceding WordPart, or nil for the first part.
  # detranslit - true to use the detransliteration rules (DT_RULES),
  #              false to use the transliteration rules (T_RULES).
  def initialize input, prev, detranslit=true
    @input = input
    @prev = prev
    @is_detranslit = detranslit
    # Per-instance defaults. Assigning these in the class body would only set
    # instance variables on the class object, never on the parts themselves.
    @succ = nil
    @just_thru = false
  end

  # Tells whether the selected rule table has an entry for +input+.
  def self.has_rules? input, is_detranslit
    (is_detranslit ? DT_RULES : T_RULES).has_key?(input)
  end

  # Returns the array of output strings whose rules match this part in its
  # current context; the array can be empty. Despite the name this is used for
  # both directions, depending on the rule table chosen at construction.
  # A part flagged just_thru returns its input unchanged, which accommodates
  # untranslatable characters.
  def cyrillic_options(include_softeners)
    return [input] if just_thru
    rules = @is_detranslit ? DT_RULES[input] : T_RULES[input]
    return [] unless rules
    rules.select { |rule| match?(rule, include_softeners) }.map { |rule| rule[0] }
  end

  private

  # rule is expected to be an array of the form
  # [output, prev-rule, succ-rule, priority (0 is for softeners)].
  # A rule matches when both neighbour inputs (empty string at the word
  # boundaries) match their regexps and the rule is either a regular one or
  # softeners were requested.
  def match?(rule, include_softeners)
    prev_input = prev.nil? ? '' : prev.input
    succ_input = succ.nil? ? '' : succ.input
    (prev_input =~ rule[1]) && (succ_input =~ rule[2]) && (rule[3]>0 || include_softeners)
  end

end
|
data/ru_translit.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "transliteration/version"
|
4
|
+
|
5
|
+
# Gem specification for ru_translit.
Gem::Specification.new do |s|
  s.name        = "ru_translit"
  s.version     = RuTranslit::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Johannes Stein"]
  s.email       = ["johannes@unsyn.com"]
  s.homepage    = "http://rubygems.org/gems/ru_translit"
  s.summary     = %q{Get a list of latin transliterations from a cyrillic word and vice versa.}
  s.description = %q{Transliterations and detransliterations, using English, German and scientific transliteration variants.}

  s.rubyforge_project = "ru_translit"

  # lib/ru_translit.rb requires 'unicode_utils/downcase', so the gem has to be
  # declared as a runtime dependency; otherwise a plain `gem install
  # ru_translit` leaves `require 'ru_translit'` failing with a LoadError.
  s.add_dependency "unicode_utils"

  # The file lists are taken from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'ru_translit'
|
3
|
+
|
4
|
+
# Behavioural examples for the public RuTranslit API.
describe RuTranslit do
  before(:each) do
    @la_str = 'something'
    @cy_str = 'что-то'
    @cy_str2 = 'биографиями'
    @nr = '123'
  end

  it "should respond to the two main methods for transliteration and detransliteration" do
    RuTranslit.should respond_to('to_latin')
    RuTranslit.should respond_to('to_cyrillic')
  end

  it "should respond to some short or alternative forms for the two main methods" do
    RuTranslit.should respond_to('to_la')
    RuTranslit.should respond_to('to_cy')
    RuTranslit.should respond_to('transliterate')
    RuTranslit.should respond_to('detransliterate')
  end

  it "should return an array containing only the input when given a lowercase latin string to transliterate" do
    RuTranslit.to_latin(@la_str).should be_an_instance_of(Array)
    RuTranslit.to_latin(@la_str).should have(1).items
    RuTranslit.to_latin(@la_str).first.should == @la_str
  end

  it "should return an array containing only the input when given a lowercase cyrillic string to detransliterate" do
    # All three expectations exercise to_cyrillic: this example is about
    # detransliteration, not transliteration.
    RuTranslit.to_cyrillic(@cy_str).should be_an_instance_of(Array)
    RuTranslit.to_cyrillic(@cy_str).should have_at_least(1).items
    RuTranslit.to_cyrillic(@cy_str).first.should == @cy_str
  end

  it "should leave numbers alone in both directions" do
    RuTranslit.to_latin(@nr).should have(1).item
    RuTranslit.to_latin(@nr).first.should == @nr
    RuTranslit.to_cyrillic(@nr).should have(1).item
    RuTranslit.to_cyrillic(@nr).first.should == @nr
  end

  it "should preserve latin word-parts while transliterating cyrillic word-parts" do
    RuTranslit.to_latin(@cy_str + @la_str + @cy_str).first.should =~ /#{@la_str}/
  end

  it "should return a list of several transliterations for the russian word 'биографиями'" do
    RuTranslit.to_latin(@cy_str2).should have_at_least(2).items
    RuTranslit.to_latin(@cy_str2).should have_at_most(4).items #making sure it doesn't explode for some reason
  end

  it "should return (among others) the original input if we do a 'round-trip' cy->la->cy, using the first result" do
    RuTranslit.to_cyrillic(RuTranslit.to_latin(@cy_str2).first).should include(@cy_str2)
  end

  it "should return (among others) the original input if we do a 'round-trip' la->cy->la, using the first result" do
    RuTranslit.to_latin(RuTranslit.to_cyrillic(@la_str).first).should include(@la_str)
  end

  it "should offer cyrillic alternatives for 'schtsch', at least one of which should be only one letter long" do
    RuTranslit.to_cyrillic('schtsch').should include('щ')
  end

  it "should return different transliteration results depending on context in some cases" do
    RuTranslit.to_latin('де').should include('de')
    RuTranslit.to_latin('де').should_not include('dje')
    RuTranslit.to_latin('е').should include('je')
  end

  it "should include softeners in the detransliteration results (only) if called with the appropriate argument" do
    RuTranslit.to_cyrillic('f', false).should_not include('фь')
    RuTranslit.to_cyrillic('f', true).should include('фь')
  end

end
|
76
|
+
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ru_translit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Johannes Stein
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-01-22 00:00:00 +01:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Transliterations and detransliterations, using English, German and scientific transliteration variants.
|
22
|
+
email:
|
23
|
+
- johannes@unsyn.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- .gitignore
|
32
|
+
- Gemfile
|
33
|
+
- Gemfile.lock
|
34
|
+
- LICENSE
|
35
|
+
- README.md
|
36
|
+
- Rakefile
|
37
|
+
- lib/detranslit_rules.rb
|
38
|
+
- lib/detransliterator.rb
|
39
|
+
- lib/ru_translit.rb
|
40
|
+
- lib/translit_rules.rb
|
41
|
+
- lib/transliteration/version.rb
|
42
|
+
- lib/transliterator.rb
|
43
|
+
- lib/word.rb
|
44
|
+
- lib/word_part.rb
|
45
|
+
- ru_translit.gemspec
|
46
|
+
- spec/ru_translit_spec.rb
|
47
|
+
has_rdoc: true
|
48
|
+
homepage: http://rubygems.org/gems/ru_translit
|
49
|
+
licenses: []
|
50
|
+
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options: []
|
53
|
+
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
segments:
|
70
|
+
- 0
|
71
|
+
version: "0"
|
72
|
+
requirements: []
|
73
|
+
|
74
|
+
rubyforge_project: ru_translit
|
75
|
+
rubygems_version: 1.3.7
|
76
|
+
signing_key:
|
77
|
+
specification_version: 3
|
78
|
+
summary: Get a list of latin transliterations from a cyrillic word and vice versa.
|
79
|
+
test_files: []
|
80
|
+
|