ru_translit 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
4
+ nbproject/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS rather than plain HTTP.
source "https://rubygems.org"

# Specify your gem's dependencies in ru_translit.gemspec
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,14 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ ru_translit (0.0.2)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+
10
+ PLATFORMS
11
+ ruby
12
+
13
+ DEPENDENCIES
14
+ ru_translit!
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ (The MIT License)
2
+
3
+ Copyright (c) 2011 Johannes Stein
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # ru_translit
2
+
3
+ A simple module with two module-level methods for transliterating Russian Cyrillic words to Latin,
4
+ and for detransliterating transliterated words from latin back to cyrillic. At this point,
5
+ it does not follow any one specific transliteration ruleset, but uses a variation of German,
6
+ English and (simplified) scientific transliteration rules. It is meant to be a very pragmatic, catch-all
7
+ way of getting different real-world variants of how a given word might be transliterated or detransliterated.
8
+ Each of the two methods returns an array of words.
9
+
10
+ The detransliteration method has an optional second parameter that decides whether the returned
11
+ cyrillic options should include those whose only difference to another option is that it includes one
12
+ or more softeners, which are usually not transliterated. This parameter defaults to false.
13
+
14
+ What's special about this is that it takes the context of letters in the word into account in order to find out
15
+ whether certain options are to be included. Most other transliteration tools don't do this.
16
+
17
+ ## Install
18
+
19
+ gem install ru_translit
20
+
21
+ ## Usage
22
+
23
+ require 'ru_translit'
24
+ RuTranslit.to_cyrillic 'vodka' # => ["водка"]
25
+ RuTranslit.to_latin 'водка' # => ["vodka", "wodka"]
26
+ RuTranslit.to_cyrillic 'vodka', true # => ["водка", "водкьа", "водька", "водькьа"]
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
# Load Bundler and register the rake tasks provided by its gem helper.
require "bundler"

Bundler::GemHelper.install_tasks
@@ -0,0 +1,191 @@
1
+ # coding: utf-8
2
# Rule table used to turn Latin transliterations back into Russian Cyrillic.
class DetranslitRules

  # Returns the complete rule table (the RULES hash below).
  def self.rules
    RULES
  end

  # Maps a Latin input chunk to the list of rules that may apply to it.
  # Each rule is an array:
  #   [output, predecessor-rule, successor-rule, softener-if-0]
  # * output           - the Cyrillic replacement for the chunk
  # * predecessor-rule - regexp tested (with =~) against the previous chunk's
  #                      input, or '' when there is no previous chunk
  # * successor-rule   - regexp tested (with =~) against the next chunk's
  #                      input, or '' when there is no next chunk
  # * softener-if-0    - 0 marks a variant that only adds a softener (ь);
  #                      such rules apply only when softeners are requested
  #
  # NOTE(review): bracket expressions such as [ja,ya] are regexp character
  # classes, so they match any single listed character (including the comma),
  # not the two-letter sequences. The documented results (e.g. the README's
  # 'vodka' example) depend on this, so the patterns are kept unchanged.
  RULES = {
    'a' => [
      ['а', /.*/, /.*/, 1]
    ],
    'b' => [
      ['б', /.*/, /.*/, 1],
      ['бь', /.*/, /[a,e,i,ja,ya,ju,yu]|^$/, 0]
    ],
    'c' => [
      ['ц', /.*/, /.*/, 1]
    ],
    'd' => [
      ['д', /.*/, /.*/, 1],
      ['дь', /.*/, /[ja,e,m,b,k,ju,yu,s]|^$/, 0]
    ],
    'e' => [
      ['е', /.*/, /.*/, 1]
    ],
    'f' => [
      ['ф', /.*/, /.*/, 1],
      ['фь', /.*/, /[ja,ya,e,je,o]|^$/, 0]
    ],
    'g' => [
      ['г', /.*/, /.*/, 1]
    ],
    'h' => [
      ['х', /.*/, /.*/, 1]
    ],
    'i' => [
      ['и', /.*/, /.*/, 1],
      ['ы', /.+/, /.*/, 1],  # not at the start of a word
      ['ий', /.+/, /^$/, 1]  # only as the last chunk of a longer word
    ],
    'j' => [
      ['й', /.*/, /.*/, 1]
    ],
    'k' => [
      ['к', /.*/, /.*/, 1],
      ['кь', /.*/, /[ja,ya]/, 0]
    ],

    'l' => [
      ['л', /.*/, /.*/, 1]
    ],
    'm' => [
      ['м', /.*/, /.*/, 1]#,
      # ['мь', /[a,e,i,o,u,ju,yu,ja,ya]/, /[e,ja,ya,d,s,ju,yu]|^$/, 0]
      # ['мь', /[e]/, /[e]/, 0]
    ],
    'n' => [
      ['н', /.*/, /.*/, 1],
      # ['нь', /[a,e,o,u,i,y,ja,ya,ju,yu,z,r,s,g]/, /[k,e,ja,sh,sch,o,i,g,-,ju,yu,zh,ch,tsch,b,d,s]|^$/, 0]
      ['нь', /[a,e,o,u,i,y,ja,ya]/, /[k,e,ja,sh,sch,o,i,g]|^$/, 0]
    ],
    'o' => [
      ['о', /.*/, /.*/, 1]
    ],
    'p' => [
      ['п', /.*/, /.*/, 1],
      ['пь', /.*/, /[ja,ya,e,ju,yu]|^$/, 0]
    ],
    # 'q' has no Cyrillic counterpart here; it maps to the empty string.
    'q' => [
      ['', /.*/, /.*/, 1]
    ],
    'r' => [
      ['р', /.*/, /.*/, 1],
      # ['рь', /[a,o,e,u,y,i,b,ja,ju,t,p,kh,ch]/, /[e,k,ja,ya,m,c,-,i,b,tsch,ch,g]|^$/, 0]
      ['рь', /[a,o,e,u,y,i,b]/, /[e,k,ja,ya,m]|^$/, 0]
    ],
    's' => [
      ['с', /.*/, /.*/, 1],
      # ['сь', /.*/, /[m,e,k,b,ja,ya,ju,yu,o]|^$/, 0]
      ['сь', /[i,o,e]/, /[m,e]|^$/, 0]
    ],
    't' => [
      ['т', /.*/, /.*/, 1],
      # ['ть', /[a,i,s,ja,ya,u,e,y,o,z,r,l,f,n]/, /[s,e,ja,ya,ju,yu,i,-,b,m,f,k,d,u,v,w]|^$/, 0],
      ['ть', /[a,i,s,ja,ya,u,e,y,o,z,r]/, /[s,e,ja,ya,ju,yu,i,-]|^$/, 0],
    ],
    'u' => [
      ['у', /.*/, /.*/, 1]
    ],
    'v' => [
      ['в', /.*/, /.*/, 1],
      ['вь', /.*/, /[ju,yu,e,i]|^$/, 0]
    ],
    'w' => [
      ['в', /.*/, /.*/, 1],
      ['вь', /.+/, /[ju,yu,e,i]|^$/, 0],
      ['вь', /.*/, /[ju,yu,e,i]/, 0]
    ],
    'x' => [
      ['кс', /.*/, /.*/, 1]
    ],
    'y' => [
      ['ы', /.*/, /.*/, 1]
    ],
    'z' => [
      ['з', /.*/, /.*/, 1],
      ['зь', /.*/, /[ja,ya,b,e,m,k]|^$/, 0]
    ],
    # An apostrophe is an explicit softener; it needs a preceding chunk.
    "'" => [
      ['ь', /.+/, /.*/, 0]
    ],
    # english letter combinations
    'ch' => [
      ['ч', /.*/, /.*/, 1],
      ['чь', /.*/, /[s,e,i,ja,ya,ju,yu,-]|^$/, 0]
    ],
    # Latin 'kh'. This key used to be spelled with the look-alike Cyrillic
    # letters 'кн', so Latin "kh" was never recognised and Cyrillic "кн" in
    # the input was rewritten to 'х'.
    'kh' => [
      ['х', /.*/, /.*/, 1]
    ],
    'yu' => [
      ['ю', /.*/, /.*/, 1]
    ],
    'ya' => [
      ['я', /.*/, /.*/, 1]
    ],
    'yo' => [
      ['е', /.*/, /.*/, 1]
    ],
    'sh' => [
      ['ш', /.*/, /.*/, 1],
      ['шь', /.*/, /[ja,ya,e,s]|^$/, 0]
    ],
    'shch' => [
      ['щ', /.*/, /.*/, 1],
      ['щь', /.*/, /^$/, 0]
    ],
    'ts' => [
      ['ц', /.*/, /.*/, 1]
    ],
    'zh' => [
      ['ж', /.*/, /.*/, 1],
      ['жь', /.*/, /[e,i,ja,ya]|^$/, 0]
    ],
    # german letter combinations
    'sch' => [
      ['ш', /.*/, /.*/, 1], #de
      ['щ', /.*/, /.*/, 1], #en
      ['шь', /.*/, /^$/, 0]
      #maschadov ?
    ],
    'tsch' => [
      ['ч', /.*/, /.*/, 1],
      ['чь', /.*/, /[s,e,i,ja,ya,ju,yu,-]|^$/, 0]
    ],
    'schtsch' => [
      ['щ', /.*/, /.*/, 1]
    ],
    'ju' => [
      ['ю', /.*/, /.*/, 1]
    ],
    'ja' => [
      ['я', /.*/, /.*/, 1]
    ],
    # 'je' => [
    #   ['е', /.*/, /.*/, 1],
    #   ['же', /.*/, /.*/, 1],
    # ],
    'jo' => [
      ['е', /.*/, /.*/, 1]
    ],
    'ä' => [
      ['я', /.*/, /.*/, 1]
    ],
    'ö' => [
      ['ё', /.*/, /.*/, 1]
    ],
    'ü' => [
      ['ю', /.*/, /.*/, 1]
    ],
    'ß' => [
      ['с', /.*/, /.*/, 1]
    ],
    'je' => [
      ['е', /.*/, /.*/, 1],
      #['йе', /.*/, /.*/, 1],
      ['же', /^$/, /.*/, 1]  # only at the start of a word
    ]
    # TODO scientific letter combinations
  }
end
@@ -0,0 +1,16 @@
1
+ # coding: utf-8
2
+ require_relative 'word'
3
+
4
+ #TODO check if a word is german or english, then don't detransliterate, right?
5
+
6
+ # Pass in a word and get some detransliteration suggestions based on our ruleset,
7
+ # which covers german and english transliterations. If softeners are to be included,
8
+ # the returned lists can get quite long.
9
class Detransliterator
  class << self
    # Builds the list of Cyrillic candidates for one Latin word.
    # +input+ is the word to detransliterate (one word at a time);
    # +include_softeners+ is passed through to Word.detransliterations_for.
    # Returns whatever Word.output_words produces for the collected options.
    def cyrillic_options(input, include_softeners)
      Word.output_words(Word.detransliterations_for(input, include_softeners)[:options])
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'unicode_utils/downcase'
2
+ require 'transliterator'
3
+ require 'detransliterator'
4
+
5
+ # Transliteration as well as De-/Retransliteration between russian cyrillic and
6
+ # English, German and Scientific transliterations. Accounts for context-dependent
7
+ # transliteration rules.
8
+ # Current limitations:
9
+ # * Only one word per pass (technically, it should work for multiple words,
10
+ # but the number of variations returned likely grows beyond manageability).
11
+ # * Everything will be downcased.
12
+ # * No distinction between the different translit variants: Just one list with all possible options gets returned.
13
module RuTranslit
  class << self
    # De-transliterates a single Latin word to Cyrillic and returns an array
    # of possible Cyrillic strings. The term is downcased first. When
    # include_softeners is true, variants that differ only in the placement of
    # softeners are included as well.
    def to_cyrillic(latin_term, include_softeners = false)
      Detransliterator.cyrillic_options(UnicodeUtils.downcase(latin_term), include_softeners)
    end

    # Transliterates a single Cyrillic word to Latin and returns an array of
    # possible Latin strings. The term is downcased first.
    def to_latin(cyrillic_term)
      Transliterator.translit_options(UnicodeUtils.downcase(cyrillic_term))
    end

    # Short and alternative names for the two main methods.
    alias_method :to_cy, :to_cyrillic
    alias_method :to_la, :to_latin
    alias_method :detransliterate, :to_cyrillic
    alias_method :transliterate, :to_latin
  end
end
@@ -0,0 +1,151 @@
1
+ # coding: utf-8
2
# Rule table used to transliterate Russian Cyrillic into Latin.
class TranslitRules

  # Returns the complete rule table (the RULES hash below).
  def self.rules
    RULES
  end

  # Maps a Cyrillic letter to the list of rules that may apply to it.
  # Each rule is an array:
  #   [output, predecessor-rule, successor-rule, softener-if-0]
  # * output           - the Latin replacement
  # * predecessor-rule - regexp tested (with =~) against the previous chunk's
  #                      input, or '' when there is no previous chunk
  # * successor-rule   - regexp tested (with =~) against the next chunk's
  #                      input, or '' when there is no next chunk
  # * softener-if-0    - always 1 in this table
  # Trailing tags name the convention: sc = scientific, de = German,
  # en = English.
  #source: mainly https://secure.wikimedia.org/wikipedia/de/wiki/Kyrillisch#Russisch
  RULES = {
    'а' => [
      ['a', /.*/, /.*/, 1]
    ],
    'б' => [
      ['b', /.*/, /.*/, 1]
    ],
    'в' => [
      ['v', /.*/, /.*/, 1], #sc, en
      ['w', /.*/, /.*/, 1]  #de
    ],
    'г' => [
      ['g', /.*/, /.*/, 1],
      # de, genitive ending (-ого/-его). The context patterns must use the
      # Cyrillic letters е/о: they are compared with the neighbouring
      # Cyrillic input, so Latin look-alikes can never match.
      ['w', /[ео]/, /о/, 1]
    ],
    'д' => [
      ['d', /.*/, /.*/, 1]
    ],
    'е' => [
      ['e', /.*/, /.*/, 1],
      ['je', /^$|[аоуыэяёюиеь]/, /.*/, 1], #de, at the start of a word or after a vowel / soft sign
      ['ye', /^$|[аоуыэяёюиеь]/, /.*/, 1], #en, same context
      ['jo', /[р]/, /.*/, 1] #de, not official, for book_id 44460
    ],
    'ё' => [
      ['ë', /.*/, /.*/, 1], #sc
      # /^$|.../ lets the jo/yo variants apply at the start of a word too;
      # a bare negated class cannot match the empty predecessor.
      ['jo', /^$|[^жчшщ]/, /.*/, 1], #de
      ['e', /[жчшщ]/, /.*/, 1], #de
      ['yo', /^$|[^жчшщ]/, /.*/, 1], #en
      ['o', /[жчшщ]/, /.*/, 1] #en
    ],
    'ж' => [
      #['ž', /.*/, /.*/, 1], #sc
      ['z', /.*/, /.*/, 1], #sc
      ['sch', /.*/, /.*/, 1], #de
      ['sh', /.*/, /.*/, 1], #de (ddr?)
      ['zh', /.*/, /.*/, 1], #en
    ],
    'з' => [
      ['z', /.*/, /.*/, 1], #sc, en
      ['s', /.*/, /.*/, 1], #de
    ],
    'и' => [
      ['i', /.*/, /.*/, 1], #sc, en, de
      ['ji', /ь/, /.*/, 1], #de
      ['yi', /ь/, /.*/, 1], #en
    ],
    'й' => [
      ['j', /.*/, /.*/, 1], #sc
      ['i', /.*/, /.*/, 1], #de, sometimes also 'j', but that's already covered with sc.
      ['y', /.*/, /.*/, 1], #en
    ],
    'к' => [
      ['k', /.*/, /.*/, 1], #sc, de, en
      #['x', /.*/, /c/, 1], #de?
    ],
    'л' => [
      ['l', /.*/, /.*/, 1], #sc, de, en
    ],
    'м' => [
      ['m', /.*/, /.*/, 1], #sc, de, en
    ],
    'н' => [
      ['n', /.*/, /.*/, 1], #sc, de, en
    ],
    'о' => [
      ['o', /.*/, /.*/, 1], #sc, de, en
      ['jo', /ь/, /.*/, 1], #de
      ['yo', /ь/, /.*/, 1], #en
    ],
    'п' => [
      ['p', /.*/, /.*/, 1], #sc, de, en
    ],
    'р' => [
      ['r', /.*/, /.*/, 1], #sc, de, en
    ],
    'с' => [
      ['s', /.*/, /.*/, 1], #sc, de, en
      ['ss', /[аоуыэяёюие]/, /[аоуыэяёюие]/, 1], #de, between vowels
    ],
    'т' => [
      ['t', /.*/, /.*/, 1], #sc, de, en
    ],
    'у' => [
      ['u', /.*/, /.*/, 1], #sc, de, en
    ],
    'ф' => [
      ['f', /.*/, /.*/, 1], #sc, de, en
    ],
    'х' => [
      ['ch', /.*/, /.*/, 1], #sc, de
      ['kh', /.*/, /.*/, 1], #en
      ['h', /.*/, /.*/, 1], #not officially, for author of book_id 37718
    ],
    'ц' => [
      ['c', /.*/, /.*/, 1], #sc
      ['z', /.*/, /.*/, 1], #de
      ['ts', /.*/, /.*/, 1], #en
    ],
    'ч' => [
      #['č', /.*/, /.*/, 1], #sc
      ['c', /.*/, /.*/, 1], #sc
      ['tsch', /.*/, /.*/, 1], #de
      ['ch', /.*/, /.*/, 1], #en
    ],
    'ш' => [
      #['š', /.*/, /.*/, 1], #sc
      ['s', /.*/, /.*/, 1], #sc
      ['sch', /.*/, /.*/, 1], #de
      ['sh', /.*/, /.*/, 1], #en
    ],
    'щ' => [
      #['šč', /.*/, /.*/, 1], #sc
      ['sc', /.*/, /.*/, 1], #sc
      ['schtsch', /.*/, /.*/, 1], #de
      #['stsch', /.*/, /.*/, 1], #de (ddr)
      ['shch', /.*/, /.*/, 1], #en
    ],
    'ъ' => [
      ['"', /.*/, /.*/, 1], #sc
      #hardener, generally not transcribed for de and en
    ],
    'ы' => [
      ['y', /.*/, /.*/, 1], #sc, de, en
    ],
    'ь' => [
      ["'", /.*/, /.*/, 1], #sc
    ],
    'э' => [
      ["ė", /.*/, /.*/, 1], #sc
      ["e", /.*/, /.*/, 1], #de, en
    ],
    'ю' => [
      ["ju", /.*/, /.*/, 1], #sc, de
      ["yu", /.*/, /.*/, 1], #en
    ],
    'я' => [
      ["ja", /.*/, /.*/, 1], #sc, de
      ["ya", /.*/, /.*/, 1], #en
    ]
  }
end
@@ -0,0 +1,3 @@
1
module RuTranslit
  # Version string of the gem.
  VERSION = '0.0.3'
end
@@ -0,0 +1,13 @@
1
+ # coding: utf-8
2
+ require_relative 'word'
3
+
4
+ # Pass in a word and get some transliteration suggestions based on our ruleset,
5
+ # which covers german, english and scientific transliterations.
6
class Transliterator
  class << self
    # Builds the list of Latin candidates for one Cyrillic word (one word at a
    # time). Returns whatever Word.output_words produces for the options
    # collected by Word.transliterations_for.
    def translit_options(input)
      Word.output_words(Word.transliterations_for(input)[:options])
    end
  end
end
data/lib/word.rb ADDED
@@ -0,0 +1,98 @@
1
+ # coding: utf-8
2
+ require_relative 'word_part'
3
+
4
# Splits an input word into a chain of WordPart objects and combines the
# options of all parts into complete output words.
class Word
  # Longest chunk (in characters) that is looked up in the rule tables;
  # 7 accommodates 'schtsch'. This is a constant inside the class because a
  # class variable assigned at the top level raises a RuntimeError on
  # Ruby 3.0 and later.
  MAX_CHECKED_STRING_LENGTH = 7

  # Splits +word+ into parts and collects the output options of every part.
  #
  # word              - the input string; an empty string yields empty lists
  # include_softeners - passed to WordPart#cyrillic_options for every part
  # is_detranslit     - passed to WordPart; true for Latin -> Cyrillic,
  #                     false for Cyrillic -> Latin
  #
  # Returns a hash with :input_structure (the input chunks, in order) and
  # :options (one array of candidate outputs per chunk, in order).
  def self.detransliterations_for(word, include_softeners, is_detranslit = true)
    last_part = build_word_part_structure(word, is_detranslit)
    # Every part must know its successor before options are computed.
    first_part = add_link_to_succ_to_each_word_part(last_part)
    input_structure = word_input_structure(first_part)
    wps_options = [] # one list of possible outputs per word part (2d array)
    part = first_part
    until part.nil?
      wps_options << part.cyrillic_options(include_softeners)
      part = part.succ
    end
    { :input_structure => input_structure, :options => wps_options }
  end

  # Same as detransliterations_for, with softeners included and the
  # is_detranslit flag set to false.
  def self.transliterations_for(word)
    detransliterations_for(word, true, false)
  end

  # Transforms the 2d options array into an array of complete words.
  # Parts without any option are skipped. The argument is not modified.
  def self.output_words(part_options)
    get_part_combinations('', part_options.reject(&:empty?), [])
  end

  # The methods below are internal helpers. They are class methods, so a bare
  # `private` would not hide them; they remain callable from outside.

  # Builds a singly linked list of WordParts (each linked to its predecessor)
  # and returns the last item, or nil for an empty or nil word.
  # At every position the longest chunk (up to MAX_CHECKED_STRING_LENGTH
  # characters) that has rules is taken; a character without any rule becomes
  # a pass-through part of its own.
  def self.build_word_part_structure(word, is_detranslit)
    prev_wp = nil # previous word part (nil for the first one)
    rest = word
    while rest && rest.size > 0
      longest = [rest.size, MAX_CHECKED_STRING_LENGTH].min
      len = longest.downto(1).find { |l| WordPart.has_rules?(rest[0, l], is_detranslit) }
      wp = WordPart.new(rest[0, len || 1], prev_wp, is_detranslit)
      wp.just_thru = true if len.nil? # no rule for this character: keep it as is
      prev_wp = wp
      rest = rest[(len || 1)..-1]
    end
    prev_wp
  end

  # Walks the linked list backwards from +last+ and sets the successor link on
  # every part. Returns the first part, or nil when +last+ is nil.
  def self.add_link_to_succ_to_each_word_part(last)
    part = last
    until part.nil? || part.prev.nil?
      before = part.prev
      before.succ = part
      part = before
    end
    part
  end

  # Just for debugging: returns the input chunks in order, showing how the
  # input has been split into word parts.
  def self.word_input_structure(first_part)
    ret = []
    part = first_part
    until part.nil?
      ret << part.input
      part = part.succ
    end
    ret
  end

  # Goes through the 2d array of word-part options and builds the list of
  # words from all possible combinations.
  #
  # str     - the prefix built so far
  # arr_in  - the option arrays still to be appended
  # arr_out - the results collected so far (appended to when arr_in is empty)
  #
  # Returns the array of combined words without duplicates.
  def self.get_part_combinations(str, arr_in, arr_out)
    if arr_in.empty?
      arr_out << str
      return arr_out
    end
    head = arr_in.first
    tail = arr_in[1..-1]
    head.each do |w|
      # str + w always creates a new string, so results never alias the
      # strings stored in the rule tables.
      arr_out |= get_part_combinations(str + w, tail, arr_out)
    end
    arr_out
  end

end
data/lib/word_part.rb ADDED
@@ -0,0 +1,52 @@
1
+ # coding: utf-8
2
+ require_relative 'translit_rules'
3
+ require_relative 'detranslit_rules'
4
+
5
# One chunk of an input word, linked to its neighbouring chunks so that rules
# can take the context into account.
class WordPart
  attr_accessor :input, :prev, :succ, :static_output, :just_thru #may be more restricted than this

  # Rule tables; every rule is an array
  # [output, predecessor-rule, successor-rule, softener-if-0].
  DT_RULES = DetranslitRules.rules
  T_RULES = TranslitRules.rules

  # input      - the chunk of the input word this part stands for
  # prev       - the preceding WordPart, or nil for the first part
  # detranslit - true to use the detransliteration rules, false to use the
  #              transliteration rules
  #
  # Defaults are set here, per instance. Assigning instance variables in the
  # class body would only set them on the class object itself.
  def initialize(input, prev, detranslit = true)
    @input = input
    @prev = prev
    @succ = nil
    @static_output = nil
    @just_thru = false
    @is_detranslit = detranslit
  end

  # Returns true when the rule table selected by +is_detranslit+ has an entry
  # for +input+.
  def self.has_rules?(input, is_detranslit)
    (is_detranslit ? DT_RULES : T_RULES).has_key?(input)
  end

  # Returns the array of possible outputs for this part. Despite the name,
  # the outputs come from the transliteration table when the part was created
  # with detranslit=false. The array can be empty. When just_thru is set (no
  # rule exists for the input) the input itself is returned unchanged.
  def cyrillic_options(include_softeners)
    return [input] if just_thru
    rules = (@is_detranslit ? DT_RULES : T_RULES)[input] || []
    rules.select { |rule| match?(rule, include_softeners) }.map { |rule| rule[0] }
  end

  private

  # Checks one rule of the form [output, prev-rule, succ-rule, priority]
  # against the neighbouring parts; a missing neighbour counts as ''.
  # Rules with priority 0 (softeners) only match when include_softeners is set.
  def match?(rule, include_softeners)
    prev_input = prev.nil? ? '' : prev.input
    succ_input = succ.nil? ? '' : succ.input
    (prev_input =~ rule[1]) && (succ_input =~ rule[2]) && (rule[3] > 0 || include_softeners)
  end

end
@@ -0,0 +1,21 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "transliteration/version"
4
+
5
# Gem specification for ru_translit. The file lists are taken from git, so
# building the gem requires a git checkout.
Gem::Specification.new do |spec|
  spec.name        = 'ru_translit'
  spec.version     = RuTranslit::VERSION
  spec.platform    = Gem::Platform::RUBY
  spec.authors     = ['Johannes Stein']
  spec.email       = ['johannes@unsyn.com']
  spec.homepage    = 'http://rubygems.org/gems/ru_translit'
  spec.summary     = 'Get a list of latin transliterations from a cyrillic word and vice versa.'
  spec.description = 'Transliterations and detransliterations, using English, German and scientific transliteration variants.'

  spec.rubyforge_project = 'ru_translit'

  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ['lib']
end
@@ -0,0 +1,76 @@
1
+ #coding: utf-8
2
+ require 'ru_translit'
3
+
4
# Behavioural examples for the public RuTranslit API.
describe RuTranslit do
  before(:each) do
    @la_str = 'something'
    @cy_str = 'что-то'
    @cy_str2 = 'биографиями'
    @nr = '123'
  end

  it "should respond to the two main methods for transliteration and detransliteration" do
    RuTranslit.should respond_to('to_latin')
    RuTranslit.should respond_to('to_cyrillic')
  end

  it "should respond to some short or alternative forms for the two main methods" do
    RuTranslit.should respond_to('to_la')
    RuTranslit.should respond_to('to_cy')
    RuTranslit.should respond_to('transliterate')
    RuTranslit.should respond_to('detransliterate')
  end

  it "should return an array containing only the input when given a lowercase latin string to transliterate" do
    RuTranslit.to_latin(@la_str).should be_an_instance_of(Array)
    RuTranslit.to_latin(@la_str).should have(1).items
    RuTranslit.to_latin(@la_str).first.should == @la_str
  end

  # This example is about detransliteration, so every expectation has to go
  # through to_cyrillic (the first two used to call to_latin).
  it "should return an array containing only the input when given a lowercase cyrillic string to detransliterate" do
    RuTranslit.to_cyrillic(@cy_str).should be_an_instance_of(Array)
    RuTranslit.to_cyrillic(@cy_str).should have(1).items
    RuTranslit.to_cyrillic(@cy_str).first.should == @cy_str
  end

  it "should leave numbers alone in both directions" do
    RuTranslit.to_latin(@nr).should have(1).item
    RuTranslit.to_latin(@nr).first.should == @nr
    RuTranslit.to_cyrillic(@nr).should have(1).item
    RuTranslit.to_cyrillic(@nr).first.should == @nr
  end

  it "should preserve latin word-parts while transliterating cyrillic word-parts" do
    RuTranslit.to_latin(@cy_str + @la_str + @cy_str).first.should =~ /#{@la_str}/
  end

  it "should return a list of several transliterations for the russian word 'биографиями'" do
    RuTranslit.to_latin(@cy_str2).should have_at_least(2).items
    RuTranslit.to_latin(@cy_str2).should have_at_most(4).items #making sure it doesn't explode for some reason
  end

  it "should return (among others) the original input if we do a 'round-trip' cy->la->cy, using the first result" do
    RuTranslit.to_cyrillic(RuTranslit.to_latin(@cy_str2).first).should include(@cy_str2)
  end

  it "should return (among others) the original input if we do a 'round-trip' la->cy->la, using the first result" do
    RuTranslit.to_latin(RuTranslit.to_cyrillic(@la_str).first).should include(@la_str)
  end

  it "should offer cyrillic alternatives for 'schtsch', at least one of which should be only one letter long" do
    RuTranslit.to_cyrillic('schtsch').should include('щ')
  end

  it "should return different transliteration results depending on context in some cases" do
    RuTranslit.to_latin('де').should include('de')
    RuTranslit.to_latin('де').should_not include('dje')
    RuTranslit.to_latin('е').should include('je')
  end

  it "should include softeners in the detransliteration results (only) if called with the appropriate argument" do
    RuTranslit.to_cyrillic('f', false).should_not include('фь')
    RuTranslit.to_cyrillic('f', true).should include('фь')
  end

end
76
+
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ru_translit
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 3
9
+ version: 0.0.3
10
+ platform: ruby
11
+ authors:
12
+ - Johannes Stein
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-01-22 00:00:00 +01:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Transliterations and detransliterations, using English, German and scientific transliteration variants.
22
+ email:
23
+ - johannes@unsyn.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - .gitignore
32
+ - Gemfile
33
+ - Gemfile.lock
34
+ - LICENSE
35
+ - README.md
36
+ - Rakefile
37
+ - lib/detranslit_rules.rb
38
+ - lib/detransliterator.rb
39
+ - lib/ru_translit.rb
40
+ - lib/translit_rules.rb
41
+ - lib/transliteration/version.rb
42
+ - lib/transliterator.rb
43
+ - lib/word.rb
44
+ - lib/word_part.rb
45
+ - ru_translit.gemspec
46
+ - spec/ru_translit_spec.rb
47
+ has_rdoc: true
48
+ homepage: http://rubygems.org/gems/ru_translit
49
+ licenses: []
50
+
51
+ post_install_message:
52
+ rdoc_options: []
53
+
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ segments:
70
+ - 0
71
+ version: "0"
72
+ requirements: []
73
+
74
+ rubyforge_project: ru_translit
75
+ rubygems_version: 1.3.7
76
+ signing_key:
77
+ specification_version: 3
78
+ summary: Get a list of latin transliterations from a cyrillic word and vice versa.
79
+ test_files: []
80
+