ru_translit 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +14 -0
- data/LICENSE +22 -0
- data/README.md +26 -0
- data/Rakefile +2 -0
- data/lib/detranslit_rules.rb +191 -0
- data/lib/detransliterator.rb +16 -0
- data/lib/ru_translit.rb +37 -0
- data/lib/translit_rules.rb +151 -0
- data/lib/transliteration/version.rb +3 -0
- data/lib/transliterator.rb +13 -0
- data/lib/word.rb +98 -0
- data/lib/word_part.rb +52 -0
- data/ru_translit.gemspec +21 -0
- data/spec/ru_translit_spec.rb +76 -0
- metadata +80 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
(The MIT License)
|
2
|
+
|
3
|
+
Copyright (c) 2011 Johannes Stein
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
19
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
20
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
21
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
22
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# ru_translit
|
2
|
+
|
3
|
+
A simple module with two module-level methods for transliterating Russian Cyrillic words to Latin,
|
4
|
+
and for detransliterating transliterated words from Latin back to Cyrillic. At this point,
|
5
|
+
it does not follow any one specific transliteration ruleset, but uses a variation of German,
|
6
|
+
English and (simplified) scientific transliteration rules. It is meant to be a very pragmatic, catch-all
|
7
|
+
way of getting different real-world variants of how a given word might be transliterated or detransliterated.
|
8
|
+
Each of the two methods returns an array of words.
|
9
|
+
|
10
|
+
The detransliteration method has an optional second parameter that decides whether the returned
|
11
|
+
Cyrillic options should include those whose only difference from another option is that they include one
|
12
|
+
or more softeners, which are usually not transliterated. This parameter defaults to false.
|
13
|
+
|
14
|
+
What's special about this is that it takes the context of letters in the word into account in order to find out
|
15
|
+
whether certain options are to be included. Most other transliteration tools don't do this.
|
16
|
+
|
17
|
+
## Install
|
18
|
+
|
19
|
+
gem install ru_translit
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
require 'ru_translit'
|
24
|
+
RuTranslit.to_cyrillic 'vodka' # => ["водка"]
|
25
|
+
RuTranslit.to_latin 'водка' # => ["vodka", "wodka"]
|
26
|
+
RuTranslit.to_cyrillic 'vodka', true # => ["водка", "водкьа", "водька", "водькьа"]
|
data/Rakefile
ADDED
@@ -0,0 +1,191 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Rule table for detransliteration (latin input -> cyrillic candidates).
class DetranslitRules

  # Returns the complete rule table (the RULES hash below).
  def self.rules
    RULES
  end

  # Keys are latin input chunks (one to seven characters long); values are
  # lists of candidate outputs. Each candidate is an array of the form
  #   [output, predecessor-rule, successor-rule, softener-if-0]
  # * output           - the cyrillic replacement for the chunk
  # * predecessor-rule - regexp tested against the input of the preceding word part
  #                      ('' at the beginning of the word)
  # * successor-rule   - regexp tested against the input of the following word part
  #                      ('' at the end of the word)
  # * softener-if-0    - 0 marks variants that only add a softener (ь); they are
  #                      offered only when softeners are explicitly requested
  #
  # NOTE: bracket expressions such as /[ja,ya]/ are regexp character classes, i.e.
  # they match any single listed character (the comma included), not the
  # two-letter sequences. The documented output depends on this, so they are
  # intentionally left as they are.
  RULES = {
    'a' => [
      ['а', /.*/, /.*/, 1]
    ],
    'b' => [
      ['б', /.*/, /.*/, 1],
      ['бь', /.*/, /[a,e,i,ja,ya,ju,yu]|^$/, 0]
    ],
    'c' => [
      ['ц', /.*/, /.*/, 1]
    ],
    'd' => [
      ['д', /.*/, /.*/, 1],
      ['дь', /.*/, /[ja,e,m,b,k,ju,yu,s]|^$/, 0]
    ],
    'e' => [
      ['е', /.*/, /.*/, 1]
    ],
    'f' => [
      ['ф', /.*/, /.*/, 1],
      ['фь', /.*/, /[ja,ya,e,je,o]|^$/, 0]
    ],
    'g' => [
      ['г', /.*/, /.*/, 1]
    ],
    'h' => [
      ['х', /.*/, /.*/, 1]
    ],
    'i' => [
      ['и', /.*/, /.*/, 1],
      ['ы', /.+/, /.*/, 1],
      ['ий', /.+/, /^$/, 1]
    ],
    'j' => [
      ['й', /.*/, /.*/, 1]
    ],
    'k' => [
      ['к', /.*/, /.*/, 1],
      ['кь', /.*/, /[ja,ya]/, 0]
    ],

    'l' => [
      ['л', /.*/, /.*/, 1]
    ],
    'm' => [
      ['м', /.*/, /.*/, 1]#,
      # ['мь', /[a,e,i,o,u,ju,yu,ja,ya]/, /[e,ja,ya,d,s,ju,yu]|^$/, 0]
      # ['мь', /[e]/, /[e]/, 0]
    ],
    'n' => [
      ['н', /.*/, /.*/, 1],
      # ['нь', /[a,e,o,u,i,y,ja,ya,ju,yu,z,r,s,g]/, /[k,e,ja,sh,sch,o,i,g,-,ju,yu,zh,ch,tsch,b,d,s]|^$/, 0]
      ['нь', /[a,e,o,u,i,y,ja,ya]/, /[k,e,ja,sh,sch,o,i,g]|^$/, 0]
    ],
    'o' => [
      ['о', /.*/, /.*/, 1]
    ],
    'p' => [
      ['п', /.*/, /.*/, 1],
      ['пь', /.*/, /[ja,ya,e,ju,yu]|^$/, 0]
    ],
    'q' => [
      ['', /.*/, /.*/, 1]
    ],
    'r' => [
      ['р', /.*/, /.*/, 1],
      # ['рь', /[a,o,e,u,y,i,b,ja,ju,t,p,kh,ch]/, /[e,k,ja,ya,m,c,-,i,b,tsch,ch,g]|^$/, 0]
      ['рь', /[a,o,e,u,y,i,b]/, /[e,k,ja,ya,m]|^$/, 0]
    ],
    's' => [
      ['с', /.*/, /.*/, 1],
      # ['сь', /.*/, /[m,e,k,b,ja,ya,ju,yu,o]|^$/, 0]
      ['сь', /[i,o,e]/, /[m,e]|^$/, 0]
    ],
    't' => [
      ['т', /.*/, /.*/, 1],
      # ['ть', /[a,i,s,ja,ya,u,e,y,o,z,r,l,f,n]/, /[s,e,ja,ya,ju,yu,i,-,b,m,f,k,d,u,v,w]|^$/, 0],
      ['ть', /[a,i,s,ja,ya,u,e,y,o,z,r]/, /[s,e,ja,ya,ju,yu,i,-]|^$/, 0],
    ],
    'u' => [
      ['у', /.*/, /.*/, 1]
    ],
    'v' => [
      ['в', /.*/, /.*/, 1],
      ['вь', /.*/, /[ju,yu,e,i]|^$/, 0]
    ],
    'w' => [
      ['в', /.*/, /.*/, 1],
      ['вь', /.+/, /[ju,yu,e,i]|^$/, 0],
      ['вь', /.*/, /[ju,yu,e,i]/, 0]
    ],
    'x' => [
      ['кс', /.*/, /.*/, 1]
    ],
    'y' => [
      ['ы', /.*/, /.*/, 1]
    ],
    'z' => [
      ['з', /.*/, /.*/, 1],
      ['зь', /.*/, /[ja,ya,b,e,m,k]|^$/, 0]
    ],
    "'" => [
      ['ь', /.+/, /.*/, 0]
    ],
    # english letter combinations
    'ch' => [
      ['ч', /.*/, /.*/, 1],
      ['чь', /.*/, /[s,e,i,ja,ya,ju,yu,-]|^$/, 0]
    ],
    # The key must be the LATIN letters k + h. It used to be typed with the
    # look-alike cyrillic letters, so 'kh' was never recognised as one chunk.
    'kh' => [
      ['х', /.*/, /.*/, 1]
    ],
    'yu' => [
      ['ю', /.*/, /.*/, 1]
    ],
    'ya' => [
      ['я', /.*/, /.*/, 1]
    ],
    'yo' => [
      ['е', /.*/, /.*/, 1]
    ],
    'sh' => [
      ['ш', /.*/, /.*/, 1],
      ['шь', /.*/, /[ja,ya,e,s]|^$/, 0]
    ],
    'shch' => [
      ['щ', /.*/, /.*/, 1],
      ['щь', /.*/, /^$/, 0]
    ],
    'ts' => [
      ['ц', /.*/, /.*/, 1]
    ],
    'zh' => [
      ['ж', /.*/, /.*/, 1],
      ['жь', /.*/, /[e,i,ja,ya]|^$/, 0]
    ],
    # german letter combinations
    'sch' => [
      ['ш', /.*/, /.*/, 1], #de
      ['щ', /.*/, /.*/, 1], #en
      ['шь', /.*/, /^$/, 0]
      #maschadov ?
    ],
    'tsch' => [
      ['ч', /.*/, /.*/, 1],
      ['чь', /.*/, /[s,e,i,ja,ya,ju,yu,-]|^$/, 0]
    ],
    'schtsch' => [
      ['щ', /.*/, /.*/, 1]
    ],
    'ju' => [
      ['ю', /.*/, /.*/, 1]
    ],
    'ja' => [
      ['я', /.*/, /.*/, 1]
    ],
    # 'je' => [
    #   ['е', /.*/, /.*/, 1],
    #   ['же', /.*/, /.*/, 1],
    # ],
    'jo' => [
      ['е', /.*/, /.*/, 1]
    ],
    'ä' => [
      ['я', /.*/, /.*/, 1]
    ],
    'ö' => [
      ['ё', /.*/, /.*/, 1]
    ],
    'ü' => [
      ['ю', /.*/, /.*/, 1]
    ],
    'ß' => [
      ['с', /.*/, /.*/, 1]
    ],
    'je' => [
      ['е', /.*/, /.*/, 1],
      #['йе', /.*/, /.*/, 1],
      ['же', /^$/, /.*/, 1] # only offered at the beginning of a word
    ]
    # TODO scientific letter combinations
  }
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require_relative 'word'
|
3
|
+
|
4
|
+
#TODO check if a word is german or english, then don't detransliterate, right?
|
5
|
+
|
6
|
+
# Turns one latin (transliterated) word into its possible cyrillic spellings,
# based on the detransliteration ruleset (German and English conventions).
# With softeners included the resulting list can become quite long.
class Detransliterator
  # Returns an array of cyrillic options for +input+ (one word at a time!).
  # +include_softeners+ decides whether softener-only variants are part of the result.
  def self.cyrillic_options(input, include_softeners)
    part_options = Word.detransliterations_for(input, include_softeners)[:options]
    Word.output_words(part_options)
  end
end
|
data/lib/ru_translit.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'unicode_utils/downcase'
|
2
|
+
require 'transliterator'
|
3
|
+
require 'detransliterator'
|
4
|
+
|
5
|
+
# Transliteration and de-/retransliteration between Russian cyrillic and the
# English, German and scientific latin spellings, taking context-dependent
# rules into account.
# Current limitations:
# * Only one word per pass (several words should technically work, but the
#   number of returned variations quickly becomes unmanageable).
# * Everything is downcased.
# * The translit variants are not kept apart: a single list with all possible
#   options is returned.
module RuTranslit

  # De-transliterates a single latin word. Returns an array of possible cyrillic
  # strings. When include_softeners is true, variants that differ only in the
  # placement of softeners are returned as well. Mainly German and English
  # transliteration variants are considered.
  def self.to_cyrillic(latin_term, include_softeners=false)
    # the regular String#downcase would usually do here, UnicodeUtils does not hurt
    downcased = UnicodeUtils.downcase(latin_term)
    Detransliterator.cyrillic_options(downcased, include_softeners)
  end

  # Transliterates a single cyrillic word. Returns an array of possible latin
  # strings. Mainly English, German and scientific (mostly without diacritics)
  # transliteration variants are considered.
  def self.to_latin(cyrillic_term)
    # the regular String#downcase would usually do here, UnicodeUtils does not hurt
    downcased = UnicodeUtils.downcase(cyrillic_term)
    Transliterator.translit_options(downcased)
  end

  # Short and alternative names for the two main methods.
  class << self
    alias_method :to_cy, :to_cyrillic
    alias_method :to_la, :to_latin
    alias_method :detransliterate, :to_cyrillic
    alias_method :transliterate, :to_latin
  end
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Rule table for transliteration (cyrillic input -> latin candidates).
class TranslitRules

  # Returns the complete rule table (the RULES hash below).
  def self.rules
    RULES
  end

  # Keys are single cyrillic letters; values are lists of candidate outputs.
  # Each candidate is an array of the form
  #   [output, predecessor-rule, successor-rule, softener-if-0]
  # where the two regexps are tested against the (cyrillic) input of the
  # neighbouring word parts ('' at the word boundaries). The last element is
  # 1 for every rule in this table.
  # Trailing comments name the convention: sc = scientific, de = German, en = English.
  #source: mainly https://secure.wikimedia.org/wikipedia/de/wiki/Kyrillisch#Russisch
  RULES = {
    'а' => [
      ['a', /.*/, /.*/, 1]
    ],
    'б' => [
      ['b', /.*/, /.*/, 1]
    ],
    'в' => [
      ['v', /.*/, /.*/, 1], #sc, en
      ['w', /.*/, /.*/, 1] #de
    ],
    'г' => [
      ['g', /.*/, /.*/, 1],
      # de, genitive endings (-ego/-ogo are written -ewo/-owo). The context is
      # cyrillic, so the classes must contain the CYRILLIC letters ie (U+0435)
      # and o (U+043E); escapes are used to rule out latin look-alikes.
      ['w', /[\u0435\u043E]/, /\u043E/, 1]
    ],
    'д' => [
      ['d', /.*/, /.*/, 1]
    ],
    'е' => [
      ['e', /.*/, /.*/, 1],
      ['je', /^$|[аоуыэяёюиеь]/, /.*/, 1], #de, at the beginning of a word or after a vowel
      ['ye', /^$|[аоуыэяёюиеь]/, /.*/, 1], #en, same condition
      ['jo', /[р]/, /.*/, 1] #de, not official, for book_id 44460
    ],
    'ё' => [
      ['ë', /.*/, /.*/, 1], #sc
      ['jo', /[^жчшщ]/, /.*/, 1], #de
      ['e', /[жчшщ]/, /.*/, 1], #de
      ['yo', /[^жчшщ]/, /.*/, 1], #en
      ['o', /[жчшщ]/, /.*/, 1] #en
    ],
    'ж' => [
      #['ž', /.*/, /.*/, 1], #sc
      ['z', /.*/, /.*/, 1], #sc
      ['sch', /.*/, /.*/, 1], #de
      ['sh', /.*/, /.*/, 1], #de (ddr?)
      ['zh', /.*/, /.*/, 1], #en
    ],
    'з' => [
      ['z', /.*/, /.*/, 1], #sc, en
      ['s', /.*/, /.*/, 1], #de
    ],
    'и' => [
      ['i', /.*/, /.*/, 1], #sc, en, de
      ['ji', /ь/, /.*/, 1], #de
      ['yi', /ь/, /.*/, 1], #en
    ],
    'й' => [
      ['j', /.*/, /.*/, 1], #sc
      ['i', /.*/, /.*/, 1], #de, sometimes also 'j', but that's already covered with sc.
      ['y', /.*/, /.*/, 1], #en
    ],
    'к' => [
      ['k', /.*/, /.*/, 1], #sc, de, en
      #['x', /.*/, /c/, 1], #de?
    ],
    'л' => [
      ['l', /.*/, /.*/, 1], #sc, de, en
    ],
    'м' => [
      ['m', /.*/, /.*/, 1], #sc, de, en
    ],
    'н' => [
      ['n', /.*/, /.*/, 1], #sc, de, en
    ],
    'о' => [
      ['o', /.*/, /.*/, 1], #sc, de, en
      ['jo', /ь/, /.*/, 1], #de
      ['yo', /ь/, /.*/, 1], #en
    ],
    'п' => [
      ['p', /.*/, /.*/, 1], #sc, de, en
    ],
    'р' => [
      ['r', /.*/, /.*/, 1], #sc, de, en
    ],
    'с' => [
      ['s', /.*/, /.*/, 1], #sc, de, en
      ['ss', /[аоуыэяёюие]/, /[аоуыэяёюие]/, 1], #de, between vowels
    ],
    'т' => [
      ['t', /.*/, /.*/, 1], #sc, de, en
    ],
    'у' => [
      ['u', /.*/, /.*/, 1], #sc, de, en
    ],
    'ф' => [
      ['f', /.*/, /.*/, 1], #sc, de, en
    ],
    'х' => [
      ['ch', /.*/, /.*/, 1], #sc, de
      ['kh', /.*/, /.*/, 1], #en
      ['h', /.*/, /.*/, 1], #not officially, for author of book_id 37718
    ],
    'ц' => [
      ['c', /.*/, /.*/, 1], #sc
      ['z', /.*/, /.*/, 1], #de
      ['ts', /.*/, /.*/, 1], #en
    ],
    'ч' => [
      #['č', /.*/, /.*/, 1], #sc
      ['c', /.*/, /.*/, 1], #sc
      ['tsch', /.*/, /.*/, 1], #de
      ['ch', /.*/, /.*/, 1], #en
    ],
    'ш' => [
      #['š', /.*/, /.*/, 1], #sc
      ['s', /.*/, /.*/, 1], #sc
      ['sch', /.*/, /.*/, 1], #de
      ['sh', /.*/, /.*/, 1], #en
    ],
    'щ' => [
      #['šč', /.*/, /.*/, 1], #sc
      ['sc', /.*/, /.*/, 1], #sc
      ['schtsch', /.*/, /.*/, 1], #de
      #['stsch', /.*/, /.*/, 1], #de (ddr)
      ['shch', /.*/, /.*/, 1], #en
    ],
    'ъ' => [
      ['"', /.*/, /.*/, 1], #sc
      #hardener, generally not transcribed for de and en
    ],
    'ы' => [
      ['y', /.*/, /.*/, 1], #sc, de, en
    ],
    'ь' => [
      ["'", /.*/, /.*/, 1], #sc
    ],
    'э' => [
      ["ė", /.*/, /.*/, 1], #sc
      ["e", /.*/, /.*/, 1], #de, en
    ],
    'ю' => [
      ["ju", /.*/, /.*/, 1], #sc, de
      ["yu", /.*/, /.*/, 1], #en
    ],
    'я' => [
      ["ja", /.*/, /.*/, 1], #sc, de
      ["ya", /.*/, /.*/, 1], #en
    ]
  }
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require_relative 'word'
|
3
|
+
|
4
|
+
# Turns one cyrillic word into its possible latin spellings, based on the
# transliteration ruleset (German, English and scientific conventions).
class Transliterator
  # Returns an array of latin options for +input+ (one word at a time!).
  def self.translit_options(input)
    part_options = Word.transliterations_for(input)[:options]
    Word.output_words(part_options)
  end
end
|
data/lib/word.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require_relative 'word_part'
|
3
|
+
|
4
|
+
# Splits a word into a doubly linked chain of WordParts and collects the
# output options of every part.
class Word
  # Longest input chunk that is looked up in the rule tables
  # (7 characters accommodate 'schtsch').
  # This is a constant inside the class on purpose: a class variable assigned at
  # the top level of a file raises a RuntimeError on Ruby >= 3.0.
  MAX_CHECKED_STRING_LENGTH = 7

  # Splits +word+ into word parts and asks every part for its output options.
  #
  # word              - the (downcased) input word
  # include_softeners - passed on to WordPart#cyrillic_options
  # is_detranslit     - true: latin -> cyrillic rules, false: cyrillic -> latin rules
  #
  # Returns a hash with
  #   :input_structure - array of the input chunks the word was split into (debugging aid)
  #   :options         - array of option arrays, one per word part (2d array)
  # An empty word yields empty arrays for both keys.
  def self.detransliterations_for(word, include_softeners, is_detranslit=true)
    last_part = build_word_part_structure(word, is_detranslit)
    first_part = add_link_to_succ_to_each_word_part(last_part)

    wps_options = []
    part = first_part
    until part.nil?
      wps_options << part.cyrillic_options(include_softeners)
      part = part.succ
    end

    { :input_structure => word_input_structure(first_part), :options => wps_options }
  end

  # Same as detransliterations_for, but using the transliteration rules
  # (cyrillic -> latin). Softener-flagged rules are always included.
  def self.transliterations_for(word)
    detransliterations_for(word, true, false)
  end

  # Transforms the 2d options array into an array of complete words (every
  # combination of one option per part). Parts without any option are skipped.
  # The passed-in array is left untouched.
  def self.output_words(part_options)
    usable_options = part_options.reject { |options| options.empty? }
    get_part_combinations('', usable_options, [])
  end

  # The helpers below are implementation details. They were never effectively
  # private (a bare `private` does not apply to `def self.` methods) and are
  # kept public for backward compatibility.

  # Builds the chain of word parts (each part knows its predecessor) and
  # returns the LAST part, or nil for an empty or nil word.
  # At every position the longest chunk (at most MAX_CHECKED_STRING_LENGTH
  # characters) that has rules wins; a character without any rule becomes a
  # pass-through part of its own.
  def self.build_word_part_structure(word, is_detranslit)
    prev_wp = nil
    rest = word
    while rest && rest.size > 0
      longest = [rest.size, MAX_CHECKED_STRING_LENGTH].min
      length = longest.downto(1).find { |len| WordPart.has_rules?(rest[0, len], is_detranslit) }

      wp = WordPart.new(rest[0, length || 1], prev_wp, is_detranslit)
      wp.just_thru = true if length.nil? # no rule for this character: keep it as it is

      prev_wp = wp
      rest = rest[(length || 1)..-1]
    end
    prev_wp
  end

  # Walks the chain backwards from +last+, gives every part a link to its
  # successor and returns the FIRST part (nil when there is no part at all).
  def self.add_link_to_succ_to_each_word_part(last)
    return nil if last.nil?

    part = last
    until part.prev.nil?
      part.prev.succ = part
      part = part.prev
    end
    part
  end

  # Just for debugging: returns the input chunks of all parts, i.e. how the
  # input has been split into word parts.
  def self.word_input_structure(first_part)
    inputs = []
    part = first_part
    until part.nil?
      inputs << part.input
      part = part.succ
    end
    inputs
  end

  # Builds all words that can be formed by taking one option from every array
  # in +arr_in+ (in order), each prefixed with +str+, and returns them merged
  # into +arr_out+ without duplicates. With an empty +arr_in+ the only word is
  # +str+ itself. Neither argument is modified.
  def self.get_part_combinations(str, arr_in, arr_out)
    first_options, *other_options = arr_in
    words =
      if first_options.nil?
        [str]
      else
        first_options.product(*other_options).map { |combination| str + combination.join }
      end
    arr_out | words
  end

end
|
data/lib/word_part.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require_relative 'translit_rules'
|
3
|
+
require_relative 'detranslit_rules'
|
4
|
+
|
5
|
+
# One chunk of a word (a letter or a letter combination) together with links
# to its neighbouring chunks. Knows which outputs the rule tables allow for it
# in its context.
class WordPart
  attr_accessor :input, :prev, :succ, :static_output, :just_thru #may be more restricted than this

  # rule arrays: [output, predecessor-rule, successor-rule, softener-if-0]
  DT_RULES = DetranslitRules.rules
  T_RULES = TranslitRules.rules

  # input      - the chunk of the original word this part stands for
  # prev       - the preceding WordPart (nil for the first part)
  # detranslit - true: use the detransliteration rules, false: the transliteration rules
  #
  # Per-instance defaults are set here; assigning @vars in the class body would
  # only set variables on the class object itself, not on its instances.
  def initialize(input, prev, detranslit=true)
    @input = input
    @prev = prev
    @succ = nil
    @just_thru = false
    @is_detranslit = detranslit
  end

  # Tells whether the selected rule table has an entry for +input+.
  def self.has_rules?(input, is_detranslit)
    table = is_detranslit ? DT_RULES : T_RULES
    table.has_key?(input)
  end

  # Returns the array of output strings whose rules match this part's context.
  # (Despite the name this is also used for the latin output when transliterating.)
  # The array is empty when no rule matches; a pass-through part (just_thru)
  # returns its own input unchanged.
  def cyrillic_options(include_softeners)
    return [input] if just_thru #set for characters that cannot be translated

    rules = @is_detranslit ? DT_RULES[input] : T_RULES[input]
    return [] unless rules

    rules.select { |rule| match?(rule, include_softeners) }.map { |rule| rule[0] }
  end

  private

  # rule is expected to be an array of the form
  # [output, prev-rule, succ-rule, priority (0 is for softeners)].
  # It matches when both neighbour regexps match the neighbours' input ('' at a
  # word boundary) and the rule is either not a softener rule or softeners were
  # requested. Returns a truthy/falsy value.
  def match?(rule, include_softeners)
    prev_input = prev.nil? ? '' : prev.input
    succ_input = succ.nil? ? '' : succ.input
    (prev_input =~ rule[1]) && (succ_input =~ rule[2]) && (rule[3] > 0 || include_softeners)
  end

end
|
data/ru_translit.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "transliteration/version"
|
4
|
+
|
5
|
+
# Gem specification for ru_translit. The file lists are taken from git, so
# the gem has to be built from within a git checkout.
Gem::Specification.new do |spec|
  # identity
  spec.name              = "ru_translit"
  spec.version           = RuTranslit::VERSION
  spec.platform          = Gem::Platform::RUBY
  spec.rubyforge_project = "ru_translit"

  # author and project information
  spec.authors     = ["Johannes Stein"]
  spec.email       = ["johannes@unsyn.com"]
  spec.homepage    = "http://rubygems.org/gems/ru_translit"
  spec.summary     = "Get a list of latin transliterations from a cyrillic word and vice versa."
  spec.description = "Transliterations and detransliterations, using English, German and scientific transliteration variants."

  # contents: everything tracked by git
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'ru_translit'
|
3
|
+
|
4
|
+
# Behavioural specs for the public RuTranslit interface.
describe RuTranslit do
  before(:each) do
    @la_str = 'something'
    @cy_str = 'что-то'
    @cy_str2 = 'биографиями'
    @nr = '123'
  end

  it "should respond to the two main methods for transliteration and detransliteration" do
    RuTranslit.should respond_to('to_latin')
    RuTranslit.should respond_to('to_cyrillic')
  end

  it "should respond to some short or alternative forms for the two main methods" do
    RuTranslit.should respond_to('to_la')
    RuTranslit.should respond_to('to_cy')
    RuTranslit.should respond_to('transliterate')
    RuTranslit.should respond_to('detransliterate')
  end

  it "should return an array containing only the input when given a lowercase latin string to transliterate" do
    RuTranslit.to_latin(@la_str).should be_an_instance_of(Array)
    RuTranslit.to_latin(@la_str).should have(1).items
    RuTranslit.to_latin(@la_str).first.should == @la_str
  end

  # This example is about DEtransliteration, so every expectation has to go
  # through to_cyrillic (the first two used to call to_latin by mistake).
  it "should return an array containing only the input when given a lowercase cyrillic string to detransliterate" do
    RuTranslit.to_cyrillic(@cy_str).should be_an_instance_of(Array)
    RuTranslit.to_cyrillic(@cy_str).should have(1).items
    RuTranslit.to_cyrillic(@cy_str).first.should == @cy_str
  end

  it "should leave numbers alone in both directions" do
    RuTranslit.to_latin(@nr).should have(1).item
    RuTranslit.to_latin(@nr).first.should == @nr
    RuTranslit.to_cyrillic(@nr).should have(1).item
    RuTranslit.to_cyrillic(@nr).first.should == @nr
  end

  it "should preserve latin word-parts while transliterating cyrillic word-parts" do
    RuTranslit.to_latin(@cy_str + @la_str + @cy_str).first.should =~ /#{@la_str}/
  end

  it "should return a list of several transliterations for the russian word 'биографиями'" do
    RuTranslit.to_latin(@cy_str2).should have_at_least(2).items
    RuTranslit.to_latin(@cy_str2).should have_at_most(4).items #making sure it doesn't explode for some reason
  end

  it "should return (among others) the original input if we do a 'round-trip' cy->la->cy, using the first result" do
    RuTranslit.to_cyrillic(RuTranslit.to_latin(@cy_str2).first).should include(@cy_str2)
  end

  it "should return (among others) the original input if we do a 'round-trip' la->cy->la, using the first result" do
    RuTranslit.to_latin(RuTranslit.to_cyrillic(@la_str).first).should include(@la_str)
  end

  it "should offer cyrillic alternatives for 'schtsch', at least one of which should be only one letter long" do
    RuTranslit.to_cyrillic('schtsch').should include('щ')
  end

  it "should return different transliteration results depending on context in some cases" do
    RuTranslit.to_latin('де').should include('de')
    RuTranslit.to_latin('де').should_not include('dje')
    RuTranslit.to_latin('е').should include('je')
  end

  it "should include softeners in the detransliteration results (only) if called with the appropriate argument" do
    RuTranslit.to_cyrillic('f', false).should_not include('фь')
    RuTranslit.to_cyrillic('f', true).should include('фь')
  end

end
|
76
|
+
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ru_translit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Johannes Stein
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-01-22 00:00:00 +01:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Transliterations and detransliterations, using English, German and scientific transliteration variants.
|
22
|
+
email:
|
23
|
+
- johannes@unsyn.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- .gitignore
|
32
|
+
- Gemfile
|
33
|
+
- Gemfile.lock
|
34
|
+
- LICENSE
|
35
|
+
- README.md
|
36
|
+
- Rakefile
|
37
|
+
- lib/detranslit_rules.rb
|
38
|
+
- lib/detransliterator.rb
|
39
|
+
- lib/ru_translit.rb
|
40
|
+
- lib/translit_rules.rb
|
41
|
+
- lib/transliteration/version.rb
|
42
|
+
- lib/transliterator.rb
|
43
|
+
- lib/word.rb
|
44
|
+
- lib/word_part.rb
|
45
|
+
- ru_translit.gemspec
|
46
|
+
- spec/ru_translit_spec.rb
|
47
|
+
has_rdoc: true
|
48
|
+
homepage: http://rubygems.org/gems/ru_translit
|
49
|
+
licenses: []
|
50
|
+
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options: []
|
53
|
+
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
segments:
|
70
|
+
- 0
|
71
|
+
version: "0"
|
72
|
+
requirements: []
|
73
|
+
|
74
|
+
rubyforge_project: ru_translit
|
75
|
+
rubygems_version: 1.3.7
|
76
|
+
signing_key:
|
77
|
+
specification_version: 3
|
78
|
+
summary: Get a list of latin transliterations from a cyrillic word and vice versa.
|
79
|
+
test_files: []
|
80
|
+
|