ru_translit 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
4
+ nbproject/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be altered in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in ru_translit.gemspec
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,14 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ ru_translit (0.0.2)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+
10
+ PLATFORMS
11
+ ruby
12
+
13
+ DEPENDENCIES
14
+ ru_translit!
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ (The MIT License)
2
+
3
+ Copyright (c) 2011 Johannes Stein
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # ru_translit
2
+
3
+ A simple module with two module-level methods for transliterating Russian Cyrillic words to Latin,
4
+ and for detransliterating transliterated words from Latin back to Cyrillic. At this point,
5
+ it does not follow any one specific transliteration ruleset, but uses a variation of German,
6
+ English and (simplified) scientific transliteration rules. It is meant to be a very pragmatic, catch-all
7
+ way of getting different real-world variants of how a given word might be transliterated or detransliterated.
8
+ Each of the two methods returns an array of words.
9
+
10
+ The detransliteration method has an optional second parameter that decides whether the returned
11
+ Cyrillic options should include those that differ from another option only in that they include one
12
+ or more softeners, which are usually not transliterated. This parameter defaults to false.
13
+
14
+ What's special about this is that it takes the context of letters in the word into account in order to find out
15
+ whether certain options are to be included. Most other transliteration tools don't do this.
16
+
17
+ ## Install
18
+
19
+ gem install ru_translit
20
+
21
+ ## Usage
22
+
23
+ require 'ru_translit'
24
+ RuTranslit.to_cyrillic 'vodka' # => ["водка"]
25
+ RuTranslit.to_latin 'водка' # => ["vodka", "wodka"]
26
+ RuTranslit.to_cyrillic 'vodka', true # => ["водка", "водкьа", "водька", "водькьа"]
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler' # load Bundler so that Bundler::GemHelper is defined
2
+ Bundler::GemHelper.install_tasks # register the rake tasks provided by Bundler's gem helper
@@ -0,0 +1,191 @@
1
+ # coding: utf-8
2
# Rule table for de-transliteration: maps lowercase Latin letters and letter
# combinations (English and German conventions) to candidate Cyrillic outputs.
#
# NOTE(review): patterns written like /[ja,ya]/ are character classes, so they
# match any single listed character (including ',') rather than the
# multi-letter sequences. They are kept verbatim because tightening them would
# change the lists of returned options.
class DetranslitRules

  # Returns the rule table (the RULES hash itself, not a copy).
  def self.rules
    RULES
  end

  # Keys are input chunks; each value is a list of rules of the form
  #   [output, predecessor-rule, successor-rule, softener-if-0]
  # output           - Cyrillic string emitted when the rule applies
  # predecessor-rule - regexp matched against the preceding chunk ('' at word start)
  # successor-rule   - regexp matched against the following chunk ('' at word end)
  # softener-if-0    - 0 marks a variant that only adds a soft sign; 1 otherwise
  RULES = {
    'a' => [
      ['а', /.*/, /.*/, 1]
    ],
    'b' => [
      ['б', /.*/, /.*/, 1],
      ['бь', /.*/, /[a,e,i,ja,ya,ju,yu]|^$/, 0]
    ],
    'c' => [
      ['ц', /.*/, /.*/, 1]
    ],
    'd' => [
      ['д', /.*/, /.*/, 1],
      ['дь', /.*/, /[ja,e,m,b,k,ju,yu,s]|^$/, 0]
    ],
    'e' => [
      ['е', /.*/, /.*/, 1]
    ],
    'f' => [
      ['ф', /.*/, /.*/, 1],
      ['фь', /.*/, /[ja,ya,e,je,o]|^$/, 0]
    ],
    'g' => [
      ['г', /.*/, /.*/, 1]
    ],
    'h' => [
      ['х', /.*/, /.*/, 1]
    ],
    'i' => [
      ['и', /.*/, /.*/, 1],
      ['ы', /.+/, /.*/, 1],
      ['ий', /.+/, /^$/, 1]
    ],
    'j' => [
      ['й', /.*/, /.*/, 1]
    ],
    'k' => [
      ['к', /.*/, /.*/, 1],
      ['кь', /.*/, /[ja,ya]/, 0] #
    ],

    'l' => [
      ['л', /.*/, /.*/, 1]
    ],
    'm' => [
      ['м', /.*/, /.*/, 1]#,
      # ['мь', /[a,e,i,o,u,ju,yu,ja,ya]/, /[e,ja,ya,d,s,ju,yu]|^$/, 0]
      # ['мь', /[e]/, /[e]/, 0]
    ],
    'n' => [
      ['н',/.*/,/.*/, 1],
      # ['нь', /[a,e,o,u,i,y,ja,ya,ju,yu,z,r,s,g]/, /[k,e,ja,sh,sch,o,i,g,-,ju,yu,zh,ch,tsch,b,d,s]|^$/, 0]
      ['нь', /[a,e,o,u,i,y,ja,ya]/, /[k,e,ja,sh,sch,o,i,g]|^$/, 0]
    ],
    'o' => [
      ['о', /.*/, /.*/, 1]
    ],
    'p' => [
      ['п', /.*/, /.*/, 1],
      ['пь', /.*/, /[ja,ya,e,ju,yu]|^$/, 0]
    ],
    'q' => [
      ['', /.*/, /.*/, 1]
    ],
    'r' => [
      ['р', /.*/, /.*/, 1],
      # ['рь', /[a,o,e,u,y,i,b,ja,ju,t,p,kh,ch]/, /[e,k,ja,ya,m,c,-,i,b,tsch,ch,g]|^$/, 0]
      ['рь', /[a,o,e,u,y,i,b]/, /[e,k,ja,ya,m]|^$/, 0]
    ],
    's' => [
      ['с', /.*/, /.*/, 1],
      # ['сь', /.*/, /[m,e,k,b,ja,ya,ju,yu,o]|^$/, 0]
      ['сь', /[i,o,e]/, /[m,e]|^$/, 0]
    ],
    't' => [
      ['т', /.*/, /.*/, 1],
      # ['ть', /[a,i,s,ja,ya,u,e,y,o,z,r,l,f,n]/, /[s,e,ja,ya,ju,yu,i,-,b,m,f,k,d,u,v,w]|^$/, 0],
      ['ть', /[a,i,s,ja,ya,u,e,y,o,z,r]/, /[s,e,ja,ya,ju,yu,i,-]|^$/, 0],
    ],
    'u' => [
      ['у', /.*/, /.*/, 1]
    ],
    'v' => [
      ['в', /.*/, /.*/, 1],
      ['вь', /.*/, /[ju,yu,e,i]|^$/, 0]
    ],
    'w' => [
      ['в', /.*/, /.*/, 1],
      ['вь', /.+/, /[ju,yu,e,i]|^$/, 0],
      ['вь', /.*/, /[ju,yu,e,i]/, 0]
    ],
    'x' => [
      ['кс', /.*/, /.*/, 1]
    ],
    'y' => [
      ['ы', /.*/, /.*/, 1]
    ],
    'z' => [
      ['з', /.*/, /.*/, 1],
      ['зь', /.*/, /[ja,ya,b,e,m,k]|^$/, 0]
    ],
    "'" => [
      ['ь', /.+/, /.*/, 0]
    ],
    # english letter combinations
    'ch' => [
      ['ч', /.*/, /.*/, 1],
      ['чь', /.*/, /[s,e,i,ja,ya,ju,yu,-]|^$/, 0]
    ],
    # This key used to be spelled with the Cyrillic letters "кн", which look
    # like Latin "kh" but can never occur in Latin input; it must be Latin.
    'kh' => [
      ['х', /.*/, /.*/, 1]
    ],
    'yu' => [
      ['ю', /.*/, /.*/, 1]
    ],
    'ya' => [
      ['я', /.*/, /.*/, 1]
    ],
    'yo' => [
      ['е', /.*/, /.*/, 1]
    ],
    'sh' => [
      ['ш', /.*/, /.*/, 1],
      ['шь', /.*/, /[ja,ya,e,s]|^$/, 0]
    ],
    'shch' => [
      ['щ', /.*/, /.*/, 1],
      ['щь', /.*/, /^$/, 0]
    ],
    'ts' => [
      ['ц', /.*/, /.*/, 1]
    ],
    'zh' => [
      ['ж', /.*/, /.*/, 1],
      ['жь', /.*/, /[e,i,ja,ya]|^$/, 0]
    ],
    # german letter combinations
    'sch' => [
      ['ш', /.*/, /.*/, 1], #de
      ['щ', /.*/, /.*/, 1], #en
      ['шь', /.*/, /^$/, 0]
      #maschadov ?
    ],
    'tsch' => [
      ['ч', /.*/, /.*/, 1],
      ['чь', /.*/, /[s,e,i,ja,ya,ju,yu,-]|^$/, 0]
    ],
    'schtsch' => [
      ['щ', /.*/, /.*/, 1]
    ],
    'ju' => [
      ['ю', /.*/, /.*/, 1]
    ],
    'ja' => [
      ['я', /.*/, /.*/, 1]
    ],
    # 'je' => [
    #   ['е', /.*/, /.*/, 1],
    #   ['же', /.*/, /.*/, 1],
    # ],
    'jo' => [
      ['е', /.*/, /.*/, 1]
    ],
    'ä' => [
      ['я', /.*/, /.*/, 1]
    ],
    'ö' => [
      ['ё', /.*/, /.*/, 1]
    ],
    'ü' => [
      ['ю', /.*/, /.*/, 1]
    ],
    'ß' => [
      ['с', /.*/, /.*/, 1]
    ],
    'je' => [
      ['е', /.*/, /.*/, 1],
      #['йе', /.*/, /.*/, 1],
      ['же', /^$/, /.*/, 1] # only at the very start of a word
    ]
    # TODO scientific letter combinations
  }
end
@@ -0,0 +1,16 @@
1
+ # coding: utf-8
2
+ require_relative 'word'
3
+
4
+ #TODO check if a word is german or english, then don't detransliterate, right?
5
+
6
+ # Pass in a word and get some detransliteration suggestions based on our ruleset,
7
+ # which covers german and english transliterations. If softeners are to be included,
8
+ # the returned lists can get quite long.
9
class Detransliterator
  # Produces the Cyrillic candidates for one Latin word (a single word per call).
  #
  # input             - the Latin word to de-transliterate
  # include_softeners - passed through to Word.detransliterations_for
  #
  # Returns whatever Word.output_words builds from the per-part options.
  def self.cyrillic_options(input, include_softeners)
    per_part_options = Word.detransliterations_for(input, include_softeners)[:options]
    Word.output_words(per_part_options)
  end
end
@@ -0,0 +1,37 @@
1
+ require 'unicode_utils/downcase'
2
+ require 'transliterator'
3
+ require 'detransliterator'
4
+
5
+ # Transliteration as well as De-/Retransliteration between russian cyrillic and
6
+ # English, German and Scientific transliterations. Accounts for context-dependent
7
+ # transliteration rules.
8
+ # Current limitations:
9
+ # * Only one word per pass (technically, it should work for multiple words,
10
+ # but the number of variations returned likely grows beyond manageability).
11
+ # * Everything will be downcased.
12
+ # * No distinction between the different translit variants: Just one list with all possible options gets returned.
13
module RuTranslit
  class << self
    # De-transliterates a single Latin word to Cyrillic.
    #
    # latin_term        - the word; it is downcased before processing
    # include_softeners - when true, variants that differ only in the placement
    #                     of softeners are returned as well (default: false)
    #
    # Returns an array of possible Cyrillic strings. Mainly German and English
    # transliteration variants are considered.
    def to_cyrillic(latin_term, include_softeners = false)
      # The plain String#downcase would usually do here; UnicodeUtils is used to be safe.
      Detransliterator.cyrillic_options(UnicodeUtils.downcase(latin_term), include_softeners)
    end

    # Transliterates a single Cyrillic word to Latin.
    #
    # cyrillic_term - the word; it is downcased before processing
    #
    # Returns an array of possible Latin strings. Mainly English, German and
    # scientific (mostly without diacritics) variants are considered.
    def to_latin(cyrillic_term)
      # The plain String#downcase would usually do here; UnicodeUtils is used to be safe.
      Transliterator.translit_options(UnicodeUtils.downcase(cyrillic_term))
    end

    # Short and alternative names for the two main methods.
    alias_method :to_cy, :to_cyrillic
    alias_method :to_la, :to_latin
    alias_method :detransliterate, :to_cyrillic
    alias_method :transliterate, :to_latin
  end
end
@@ -0,0 +1,151 @@
1
+ # coding: utf-8
2
# Rule table for transliteration: maps each lowercase Russian Cyrillic letter
# to its candidate Latin renderings.
# Rule tags: sc = scientific, de = German, en = English.
class TranslitRules

  # Hands out the RULES table itself (no copy is made).
  def self.rules
    RULES
  end

  # Patterns shared by several rules below.
  any             = /.*/                    # no restriction on the neighbour
  start_or_vowel  = /^$|[аоуыэяёюиеь]/      # word start, or after a vowel / soft sign
  after_soft_sign = /ь/
  vowel           = /[аоуыэяёюие]/
  sibilant        = /[жчшщ]/
  non_sibilant    = /[^жчшщ]/

  # rule arrays: [output, predecessor-rule, successor-rule, softener-if-0]
  # source: mainly https://secure.wikimedia.org/wikipedia/de/wiki/Kyrillisch#Russisch
  RULES = {
    'а' => [['a', any, any, 1]],
    'б' => [['b', any, any, 1]],
    'в' => [
      ['v', any, any, 1], # sc, en
      ['w', any, any, 1]  # de
    ],
    'г' => [
      ['g', any, any, 1],
      # de, in genitive endings.
      # NOTE(review): neighbours are Cyrillic in this direction; verify that
      # these two patterns really contain Cyrillic letters.
      ['w', /[eo]/, /o/, 1]
    ],
    'д' => [['d', any, any, 1]],
    'е' => [
      ['e', any, any, 1],
      ['je', start_or_vowel, any, 1], # de, at the start of a word or after a vowel
      ['ye', start_or_vowel, any, 1], # en, same condition
      ['jo', /[р]/, any, 1]           # de, not official, for book_id 44460
    ],
    'ё' => [
      ['ë', any, any, 1],           # sc
      ['jo', non_sibilant, any, 1], # de
      ['e', sibilant, any, 1],      # de
      ['yo', non_sibilant, any, 1], # en
      ['o', sibilant, any, 1]       # en
    ],
    'ж' => [
      #['ž', /.*/, /.*/, 1], #sc
      ['z', any, any, 1],   # sc
      ['sch', any, any, 1], # de
      ['sh', any, any, 1],  # de (ddr?)
      ['zh', any, any, 1]   # en
    ],
    'з' => [
      ['z', any, any, 1], # sc, en
      ['s', any, any, 1]  # de
    ],
    'и' => [
      ['i', any, any, 1],              # sc, en, de
      ['ji', after_soft_sign, any, 1], # de
      ['yi', after_soft_sign, any, 1]  # en
    ],
    'й' => [
      ['j', any, any, 1], # sc
      ['i', any, any, 1], # de, sometimes also 'j', but that's already covered with sc.
      ['y', any, any, 1]  # en
    ],
    'к' => [
      ['k', any, any, 1] # sc, de, en
      #['x', /.*/, /c/, 1], #de?
    ],
    'л' => [['l', any, any, 1]], # sc, de, en
    'м' => [['m', any, any, 1]], # sc, de, en
    'н' => [['n', any, any, 1]], # sc, de, en
    'о' => [
      ['o', any, any, 1],              # sc, de, en
      ['jo', after_soft_sign, any, 1], # de
      ['yo', after_soft_sign, any, 1]  # en
    ],
    'п' => [['p', any, any, 1]], # sc, de, en
    'р' => [['r', any, any, 1]], # sc, de, en
    'с' => [
      ['s', any, any, 1],     # sc, de, en
      ['ss', vowel, vowel, 1] # de, between vowels
    ],
    'т' => [['t', any, any, 1]], # sc, de, en
    'у' => [['u', any, any, 1]], # sc, de, en
    'ф' => [['f', any, any, 1]], # sc, de, en
    'х' => [
      ['ch', any, any, 1], # sc, de
      ['kh', any, any, 1], # en
      ['h', any, any, 1]   # not officially, for author of book_id 37718
    ],
    'ц' => [
      ['c', any, any, 1], # sc
      ['z', any, any, 1], # de
      ['ts', any, any, 1] # en
    ],
    'ч' => [
      #['č', /.*/, /.*/, 1], #sc
      ['c', any, any, 1],    # sc
      ['tsch', any, any, 1], # de
      ['ch', any, any, 1]    # en
    ],
    'ш' => [
      #['š', /.*/, /.*/, 1], #sc
      ['s', any, any, 1],   # sc
      ['sch', any, any, 1], # de
      ['sh', any, any, 1]   # en
    ],
    'щ' => [
      #['šč', /.*/, /.*/, 1], #sc
      ['sc', any, any, 1],      # sc
      ['schtsch', any, any, 1], # de
      #['stsch', /.*/, /.*/, 1], #de (ddr)
      ['shch', any, any, 1]     # en
    ],
    'ъ' => [
      ['"', any, any, 1] # sc
      # hardener, generally not transcribed for de and en
    ],
    'ы' => [['y', any, any, 1]], # sc, de, en
    'ь' => [["'", any, any, 1]], # sc
    'э' => [
      ["ė", any, any, 1], # sc
      ["e", any, any, 1]  # de, en
    ],
    'ю' => [
      ["ju", any, any, 1], # sc, de
      ["yu", any, any, 1]  # en
    ],
    'я' => [
      ["ja", any, any, 1], # sc, de
      ["ya", any, any, 1]  # en
    ]
  }
end
@@ -0,0 +1,3 @@
1
# Version holder for the ru_translit gem.
module RuTranslit
  # Release number of the gem as a string.
  VERSION = '0.0.3'
end
@@ -0,0 +1,13 @@
1
+ # coding: utf-8
2
+ require_relative 'word'
3
+
4
+ # Pass in a word and get some transliteration suggestions based on our ruleset,
5
+ # which covers german, english and scientific transliterations.
6
class Transliterator
  # Produces the Latin candidates for one Cyrillic word (a single word per call).
  #
  # input - the Cyrillic word to transliterate
  #
  # Returns whatever Word.output_words builds from the per-part options.
  def self.translit_options(input)
    per_part_options = Word.transliterations_for(input)[:options]
    Word.output_words(per_part_options)
  end
end
data/lib/word.rb ADDED
@@ -0,0 +1,98 @@
1
+ # coding: utf-8
2
+ require_relative 'word_part'
3
+
4
# Splits an input word into parts that the rule tables know about and combines
# the per-part options into complete output words.
class Word
  # Longest chunk that is looked up in the rule tables; 7 accommodates
  # 'schtsch'. This used to be a class variable assigned at the top level of
  # the file, which raises RuntimeError on Ruby 3.0 and later.
  MAX_CHECKED_STRING_LENGTH = 7

  # Builds the word-part structure for +word+ and collects the options of
  # every part.
  #
  # word              - the (already downcased) input word; nil or '' yields
  #                     empty lists instead of raising
  # include_softeners - passed to each part's #cyrillic_options
  # is_detranslit     - true for Latin -> Cyrillic, false for Cyrillic -> Latin
  #
  # Returns a Hash with :input_structure (the input split into parts) and
  # :options (one array of output candidates per part).
  def self.detransliterations_for(word, include_softeners, is_detranslit=true)
    last_part = build_word_part_structure(word, is_detranslit)
    first_part = add_link_to_succ_to_each_word_part(last_part)
    input_structure = word_input_structure(first_part)
    wps_options = [] # filled with one list of possible outputs per part (2d array)
    part = first_part
    until part.nil?
      wps_options << part.cyrillic_options(include_softeners)
      part = part.succ
    end
    {:input_structure => input_structure, :options => wps_options}
  end

  # Same as detransliterations_for, but in the Cyrillic -> Latin direction and
  # always including every rule.
  def self.transliterations_for(word)
    detransliterations_for(word, true, false)
  end

  # Transforms the 2d options array into an array of complete words.
  # Empty option lists are removed from +part_options+ in place (as before).
  def self.output_words(part_options)
    part_options.reject!(&:empty?)
    get_part_combinations('', part_options, [])
  end

  # The methods below are internal helpers. The former bare `private` had no
  # effect on `def self.` methods, so they have always been callable from
  # outside; that is kept for compatibility.

  # Builds a singly linked list of WordParts (each pointing to its
  # predecessor) and returns the last one, or nil for an empty word.
  def self.build_word_part_structure(word, is_detranslit)
    return nil if word.nil? || word.empty?

    # number of trailing characters currently excluded from the lookup chunk
    rest_size = [word.size - MAX_CHECKED_STRING_LENGTH, 0].max
    prev_wp = nil # previous word part (nil for the first one)
    while word && !word.empty?
      chunk = word[0..(-1 - rest_size)]
      if WordPart.has_rules?(chunk, is_detranslit)
        # rules exist for this chunk: make it a part and continue behind it
        prev_wp = WordPart.new(chunk, prev_wp, is_detranslit)
        offset = word.size - rest_size
        word = word[offset..-1]
        rest_size = [word.size - MAX_CHECKED_STRING_LENGTH, 0].max
      elsif chunk.size > 0
        # no rules found, so try a chunk that is one character shorter
        rest_size += 1
      else
        # no rules even for the first character: keep it unchanged
        offset = word.size - rest_size + 1
        wp = WordPart.new(word[0..offset - 1], prev_wp, is_detranslit)
        wp.just_thru = true
        prev_wp = wp
        word = word[offset..-1]
        rest_size = [word.size - MAX_CHECKED_STRING_LENGTH, 0].max
      end
    end
    prev_wp
  end

  # Walks the list backwards from +last+, sets each part's successor link and
  # returns the first part (nil when there are no parts).
  def self.add_link_to_succ_to_each_word_part(last)
    return nil if last.nil?

    part = last
    until part.prev.nil?
      prev = part.prev
      prev.succ = part
      part = prev
    end
    part
  end

  # Debugging aid: returns the inputs of all parts, i.e. how the word was split.
  def self.word_input_structure(first_part)
    ret = []
    part = first_part
    until part.nil?
      ret << part.input
      part = part.succ
    end
    ret
  end

  # Goes through the 2d array of part options and builds the list of words
  # resulting from all possible combinations, without duplicates.
  #
  # str     - the prefix assembled so far
  # arr_in  - option lists of the parts still to be appended
  # arr_out - accumulator of finished words
  def self.get_part_combinations(str, arr_in, arr_out)
    if arr_in.empty?
      arr_out << str
      return arr_out
    end
    head, *tail = arr_in
    head.each do |w|
      # str + w always builds a new string, so rule-table strings are never
      # handed out to callers directly
      arr_out |= get_part_combinations(str + w, tail, arr_out)
    end
    arr_out
  end
end
data/lib/word_part.rb ADDED
@@ -0,0 +1,52 @@
1
+ # coding: utf-8
2
+ require_relative 'translit_rules'
3
+ require_relative 'detranslit_rules'
4
+
5
# One chunk of an input word, linked to its neighbouring chunks so that rules
# can take the surrounding context into account.
class WordPart
  attr_accessor :input, :prev, :succ, :static_output, :just_thru #may be more restricted than this

  # rule arrays: [output, predecessor-rule, successor-rule, softener-if-0]
  DT_RULES = DetranslitRules.rules
  T_RULES = TranslitRules.rules

  # input      - the chunk of the word this part stands for
  # prev       - the preceding WordPart, or nil for the first part
  # detranslit - true to use the de-transliteration rules, false to use the
  #              transliteration rules
  def initialize input, prev, detranslit=true
    @input = input
    @prev = prev
    @is_detranslit = detranslit
    # Per-instance defaults. They used to be assigned in the class body, where
    # they only set unused class-level variables.
    @succ = nil
    @just_thru = false
  end

  # True when the rule table for the given direction has an entry for +input+.
  def self.has_rules? input, is_detranslit
    (is_detranslit ? DT_RULES : T_RULES).key?(input)
  end

  # Returns the outputs of all rules that match this part in its context, in
  # table order; the array may be empty. Despite the name, the outputs are
  # Latin when the part was created with detranslit=false.
  #
  # include_softeners - when false, rules flagged with 0 are skipped
  def cyrillic_options(include_softeners)
    return [input] if just_thru #this can be set to accommodate untranslatable characters.

    rules = (@is_detranslit ? DT_RULES : T_RULES)[input] || []
    rules.select { |rule| match?(rule, include_softeners) }.map(&:first)
  end

  private

  # rule is expected to be an array of the form
  # [output, prev-rule, succ-rule, priority (0 is for softeners)].
  # A missing neighbour is matched as the empty string.
  def match?(rule, include_softeners)
    prev_input = prev.nil? ? '' : prev.input
    succ_input = succ.nil? ? '' : succ.input
    (prev_input =~ rule[1]) && (succ_input =~ rule[2]) && (rule[3] > 0 || include_softeners)
  end
end
@@ -0,0 +1,21 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "transliteration/version"
4
+
5
# Gem specification for ru_translit.
Gem::Specification.new do |s|
  s.name        = "ru_translit"
  s.version     = RuTranslit::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Johannes Stein"]
  s.email       = ["johannes@unsyn.com"]
  s.homepage    = "http://rubygems.org/gems/ru_translit"
  s.summary     = %q{Get a list of latin transliterations from a cyrillic word and vice versa.}
  s.description = %q{Transliterations and detransliterations, using English, German and scientific transliteration variants.}
  # The bundled LICENSE file is the MIT license.
  s.licenses    = ["MIT"]

  # rubyforge_project is deprecated; only set it where RubyGems still has it.
  s.rubyforge_project = "ru_translit" if s.respond_to?(:rubyforge_project=)

  # lib/ru_translit.rb requires 'unicode_utils/downcase', so the gem must be
  # installed together with this one.
  s.add_dependency "unicode_utils"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,76 @@
1
+ #coding: utf-8
2
+ require 'ru_translit'
3
+
4
# Behavioural examples for the public RuTranslit API.
describe RuTranslit do
  before(:each) do
    @la_str = 'something'
    @cy_str = 'что-то'
    @cy_str2 = 'биографиями'
    @nr = '123'
  end

  it "should respond to the two main methods for transliteration and detransliteration" do
    RuTranslit.should respond_to('to_latin')
    RuTranslit.should respond_to('to_cyrillic')
  end

  it "should respond to some short or alternative forms for the two main methods" do
    RuTranslit.should respond_to('to_la')
    RuTranslit.should respond_to('to_cy')
    RuTranslit.should respond_to('transliterate')
    RuTranslit.should respond_to('detransliterate')
  end

  it "should return an array containing only the input when given a lowercase latin string to transliterate" do
    RuTranslit.to_latin(@la_str).should be_an_instance_of(Array)
    RuTranslit.to_latin(@la_str).should have(1).items
    RuTranslit.to_latin(@la_str).first.should == @la_str
  end

  # All three expectations exercise to_cyrillic; the first two used to call
  # to_latin and therefore did not test detransliteration at all.
  it "should return an array containing only the input when given a lowercase cyrillic string to detransliterate" do
    RuTranslit.to_cyrillic(@cy_str).should be_an_instance_of(Array)
    RuTranslit.to_cyrillic(@cy_str).should have_at_least(1).items
    RuTranslit.to_cyrillic(@cy_str).first.should == @cy_str
  end

  it "should leave numbers alone in both directions" do
    RuTranslit.to_latin(@nr).should have(1).item
    RuTranslit.to_latin(@nr).first.should == @nr
    RuTranslit.to_cyrillic(@nr).should have(1).item
    RuTranslit.to_cyrillic(@nr).first.should == @nr
  end

  it "should preserve latin word-parts while transliterating cyrillic word-parts" do
    RuTranslit.to_latin(@cy_str + @la_str + @cy_str).first.should =~ /#{@la_str}/
  end

  it "should return a list of several transliterations for the russian word 'биографиями'" do
    RuTranslit.to_latin(@cy_str2).should have_at_least(2).items
    RuTranslit.to_latin(@cy_str2).should have_at_most(4).items #making sure it doesn't explode for some reason
  end

  it "should return (among others) the original input if we do a 'round-trip' cy->la->cy, using the first result" do
    RuTranslit.to_cyrillic(RuTranslit.to_latin(@cy_str2).first).should include(@cy_str2)
  end

  it "should return (among others) the original input if we do a 'round-trip' la->cy->la, using the first result" do
    RuTranslit.to_latin(RuTranslit.to_cyrillic(@la_str).first).should include(@la_str)
  end

  it "should offer cyrillic alternatives for 'schtsch', at least one of which should be only one letter long" do
    RuTranslit.to_cyrillic('schtsch').should include('щ')
  end

  it "should return different transliteration results depending on context in some cases" do
    RuTranslit.to_latin('де').should include('de')
    RuTranslit.to_latin('де').should_not include('dje')
    RuTranslit.to_latin('е').should include('je')
  end

  it "should include softeners in the detransliteration results (only) if called with the appropriate argument" do
    RuTranslit.to_cyrillic('f', false).should_not include('фь')
    RuTranslit.to_cyrillic('f', true).should include('фь')
  end

end
76
+
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ru_translit
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 3
9
+ version: 0.0.3
10
+ platform: ruby
11
+ authors:
12
+ - Johannes Stein
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-01-22 00:00:00 +01:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Transliterations and detransliterations, using English, German and scientific transliteration variants.
22
+ email:
23
+ - johannes@unsyn.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - .gitignore
32
+ - Gemfile
33
+ - Gemfile.lock
34
+ - LICENSE
35
+ - README.md
36
+ - Rakefile
37
+ - lib/detranslit_rules.rb
38
+ - lib/detransliterator.rb
39
+ - lib/ru_translit.rb
40
+ - lib/translit_rules.rb
41
+ - lib/transliteration/version.rb
42
+ - lib/transliterator.rb
43
+ - lib/word.rb
44
+ - lib/word_part.rb
45
+ - ru_translit.gemspec
46
+ - spec/ru_translit_spec.rb
47
+ has_rdoc: true
48
+ homepage: http://rubygems.org/gems/ru_translit
49
+ licenses: []
50
+
51
+ post_install_message:
52
+ rdoc_options: []
53
+
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ segments:
70
+ - 0
71
+ version: "0"
72
+ requirements: []
73
+
74
+ rubyforge_project: ru_translit
75
+ rubygems_version: 1.3.7
76
+ signing_key:
77
+ specification_version: 3
78
+ summary: Get a list of latin transliterations from a cyrillic word and vice versa.
79
+ test_files: []
80
+